misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use Locale::PO;
  12 use TmplTokenizer;
  13 use VerboseWarnings;
  14
  15 use vars qw( $convert_from );
  16 use vars qw( $files_from $directory $output $sort );
  17 use vars qw( $extract_all_p );
  18 use vars qw( $pedantic_p );
  19 use vars qw( %text %translation );
  20 use vars qw( $charset_in $charset_out );
  21
  22 ###############################################################################
  23
  24 sub negligible_p ($) {
  25     my($t) = @_;                                # a string
  26     # Don't emit pure whitespace, pure numbers, pure punctuation,
  27     # single letters, or TMPL_VAR's.
  28     # Punctuation should arguably be translated. But without context
  29     # they are untranslatable.
  30     return !$extract_all_p && (
  31                TmplTokenizer::blank_p($t)               # blank or TMPL_VAR
  32             || $t =~ /^\d+$/                    # purely digits
  33             || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
  34             || $t =~ /^[A-Za-z]$/               # single letters
  35         );
  36 }
  37
  38 ###############################################################################
  39
  40 sub remember ($$) {
  41     my($token, $string) = @_;
  42     $text{$string} = [] unless defined $text{$string};
  43     push @{$text{$string}}, $token;
  44 }
  45
  46 ###############################################################################
  47
  48 sub string_list () {
  49     my @t = keys %text;
  50     # The real gettext tools seems to sort case sensitively; I don't know why
  51     @t = sort { $a cmp $b } @t if $sort eq 's';
  52     @t = sort {
  53             my @aa = sort { $a->pathname cmp $b->pathname
  54                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  55             my @bb = sort { $a->pathname cmp $b->pathname
  56                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  57             $aa[0]->pathname cmp $bb[0]->pathname
  58                     || $aa[0]->line_number <=> $bb[0]->line_number;
  59         } @t if $sort eq 'F';
  60     return @t;
  61 }
  62
  63 ###############################################################################
  64
  65 sub text_extract (*) {
  66     my($h) = @_;
  67     for (;;) {
  68         my $s = TmplTokenizer::next_token $h;
  69     last unless defined $s;
  70         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  71         if ($kind eq TmplTokenType::TEXT) {
  72             #$t = TmplTokenizer::trim $t;
  73             remember( $s, $t ) if $t =~ /\S/s;
  74         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  75             #$t = TmplTokenizer::trim $t;
  76             remember( $s, $s->form ) if $s->form =~ /\S/s;
  77         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  78             # value [tag=input], meta
  79             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  80             for my $a ('alt', 'content', 'title', 'value') {
  81                 if ($attr->{$a}) {
  82                     next if $a eq 'content' && $tag ne 'meta';
  83                     next if $a eq 'value' && ($tag ne 'input'
  84                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
  85                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
  86                     $val = TmplTokenizer::trim $val;
  87                     remember( $s, $val ) if $val =~ /\S/s;
  88                 }
  89             }
  90         }
  91     }
  92 }
  93
  94 ###############################################################################
  95
  96 sub generate_strings_list () {
  97     # Emit all extracted strings.
  98     for my $t (string_list) {
  99         printf OUTPUT "%s\n", $t unless negligible_p($t);
 100     }
 101 }
 102
 103 ###############################################################################
 104
 105 sub generate_po_file () {
 106     # We don't emit the Plural-Forms header; it's meaningless for us
 107     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 108     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 109     print OUTPUT <<EOF;
 110 # SOME DESCRIPTIVE TITLE.
 111 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 112 # This file is distributed under the same license as the PACKAGE package.
 113 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 114 #
 115 #, fuzzy
 116 msgid ""
 117 msgstr ""
 118 "Project-Id-Version: PACKAGE VERSION\\n"
 119 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
 120 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
 121 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 122 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 123 "MIME-Version: 1.0\\n"
 124 "Content-Type: text/plain; charset=$pot_charset\\n"
 125 "Content-Transfer-Encoding: 8bit\\n"
 126
 127 EOF
 128     my $directory_re = quotemeta("$directory/");
 129     for my $t (string_list) {
 130         next if negligible_p($t);
 131         my $cformat_p;
 132         for my $token (@{$text{$t}}) {
 133             my $pathname = $token->pathname;
 134             $pathname =~ s/^$directory_re//os;
 135             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 136                     if defined $pathname && defined $token->line_number;
 137             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 138         }
 139         printf OUTPUT "#, c-format\n" if $cformat_p;
 140         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 141                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 142         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 143                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 144     }
 145 }
 146
 147 ###############################################################################
 148
 149 sub convert_translation_file () {
 150     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 151     VerboseWarnings::set_input_file_name $convert_from;
 152     while (<INPUT>) {
 153         chomp;
 154         my($msgid, $msgstr) = split(/\t/);
 155         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 156                 unless defined $msgstr;
 157
 158         # Fixup some of the bad strings
 159         $msgid =~ s/^SELECTED>//;
 160
 161         # Create dummy token
 162         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 163         remember( $token, $msgid );
 164         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 165         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 166
 167         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 168             my $candidate = TmplTokenizer::charset_canon $2;
 169             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 170                     if defined $charset_in && $charset_in ne $candidate;
 171             $charset_in = $candidate;
 172         }
 173         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 174             my $candidate = TmplTokenizer::charset_canon $2;
 175             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 176                     if defined $charset_out && $charset_out ne $candidate;
 177             $charset_out = $candidate;
 178         }
 179     }
 180     # The following assumption is correct; that's what HTML::Template assumes
 181     if (!defined $charset_in) {
 182         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 183         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 184     }
 185 }
 186
 187 ###############################################################################
 188
 189 sub usage ($) {
 190     my($exitcode) = @_;
 191     my $h = $exitcode? *STDERR: *STDOUT;
 192     print $h <<EOF;
 193 Usage: $0 [OPTIONS]
 194 Extract translatable strings from given HTML::Template input files.
 195
 196 Input file location:
 197   -f, --files-from=FILE          Get list of input files from FILE
 198   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 199
 200 Output file location:
 201   -o, --output=FILE              Write output to specified file
 202
 203 HTML::Template options:
 204   -a, --extract-all              Extract all strings
 205       --pedantic-warnings        Issue warnings even for detected problems
 206                                  which are likely to be harmless
 207
 208 Output details:
 209   -s, --sort-output              generate sorted output
 210   -F, --sort-by-file             sort output by file location
 211
 212 Informative output:
 213       --help                     Display this help and exit
 214 EOF
 215     exit($exitcode);
 216 }
 217
 218 ###############################################################################
 219
 220 sub usage_error (;$) {
 221     print STDERR "$_[0]\n" if @_;
 222     print STDERR "Try `$0 --help' for more information.\n";
 223     exit(-1);
 224 }
 225
 226 ###############################################################################
 227
 228 Getopt::Long::config qw( bundling no_auto_abbrev );
 229 GetOptions(
 230     'a|extract-all'                     => \$extract_all_p,
 231     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 232     'convert-from=s'                    => \$convert_from,
 233     'D|directory=s'                     => \$directory,
 234     'f|files-from=s'                    => \$files_from,
 235     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 236     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 237     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 238     'output|o=s'                        => \$output,
 239     's|sort-output'                     => sub { $sort = 's' },
 240     'F|sort-by-file'                    => sub { $sort = 'F' },
 241     'help'                              => sub { usage(0) },
 242 ) || usage_error;
 243
 244 VerboseWarnings::set_application_name $0;
 245 VerboseWarnings::set_pedantic_mode $pedantic_p;
 246
 247 usage_error('Missing mandatory option -f')
 248         unless defined $files_from || defined $convert_from;
 249 $directory = '.' unless defined $directory;
 250
 251 usage_error('You cannot specify both --convert-from and --files-from')
 252         if defined $convert_from && defined $files_from;
 253
 254 if (defined $output && $output ne '-') {
 255     open(OUTPUT, ">$output") || die "$output: $!\n";
 256 } else {
 257     open(OUTPUT, ">&STDOUT");
 258 }
 259
 260 if (defined $files_from) {
 261     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 262     while (<INPUT>) {
 263         chomp;
 264         my $h = TmplTokenizer->new( "$directory/$_" );
 265         $h->set_allow_cformat( 1 );
 266         VerboseWarnings::set_input_file_name "$directory/$_";
 267         text_extract( $h );
 268     }
 269     close INPUT;
 270 } else {
 271     convert_translation_file;
 272 }
 273 generate_po_file;
 274
 275 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 276         if TmplTokenizer::syntaxerror_p;
 277
 278
 279 exit(-1) if TmplTokenizer::fatal_p;
 280
 281 ###############################################################################
 282
 283 =head1 DESCRIPTION
 284
 285 This is an experimental script based on the modularized
 286 text-extract2.pl script.  It has behaviour similar to
 287 xgettext(1), and generates gettext-compatible output files.
 288
 289 A gettext-like format provides the following advantages:
 290
 291 =over
 292
 293 =item -
 294
 295 (Future goal)
 296 Translation to non-English-like languages with different word
 297 order:  gettext's c-format strings can theoretically be
 298 emulated if we are able to do some analysis on the .tmpl input
 299 and treat <TMPL_VAR> in a way similar to %s.
 300
 301 =item -
 302
 303 Context for the extracted strings:  the gettext format provides
 304 the filenames and line numbers where each string can be found.
 305 The translator can read the source file and see the context,
 306 in case the string by itself can mean several different things.
 307
 308 =item -
 309
 310 Place for the translator to add comments about the translations.
 311
 312 =item -
 313
 314 Gettext-compatible tools, if any, might be usable if we adopt
 315 the gettext format.
 316
 317 =back
 318
 319 Right now it does about the same thing as text-extract2.pl but
 320 generates gettext-style output; however, because it is scanner-
 321 instead of parser-based, it is able to address the 4 weaknesses
 322 listed in translator_doc.txt.  Ultimately, the goal is to make
 323 this able to do some kind of simple analysis on the input to
 324 produce gettext-style output with c-format strings, in order to
 325 facilitate translation to languages with a different word order
 326 than English.
 327
 328 When the above is finished, the generated po file may contain
 329 some HTML tags in addition to %s strings.
 330
 331 If you want to generate GNOME-style POTFILES.in files, such
 332 files (passed to -f) can be generated thus:
 333
 334         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 335                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 336         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 337                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 338
 339 This is, however, quite pointless, because the "create" and
 340 "update" actions have already been implemented in tmpl_process3.pl.
 341
 342 =head1 SEE ALSO
 343
 344 tmpl_process.pl,
 345 xgettext(1),
 346 Locale::PO(3),
 347 translator_doc.txt
 348
 349 =head1 BUGS
 350
 351 There probably are some. Bugs related to scanning of <INPUT>
 352 tags seem to be especially likely to be present.
 353
 354 Its diagnostics are probably too verbose.
 355
 356 When a <TMPL_VAR> within a JavaScript-related attribute is
 357 detected, the script currently displays no warnings at all.
 358 It might be good to display some kind of warning.
 359
 360 Its sort order (-s option) seems to be different than the real
 361 xgettext(1)'s sort option. This will result in translation
 362 strings inside the generated PO file spuriously moving about
 363 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 364
 365 =cut