misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use warnings;
  11 use Getopt::Long;
  12 use POSIX;
  13 use Locale::PO;
  14 use TmplTokenizer;
  15 use VerboseWarnings;
  16
  17 use vars qw( $convert_from );
  18 use vars qw( $files_from $directory $output $sort );
  19 use vars qw( $extract_all_p );
  20 use vars qw( $pedantic_p );
  21 use vars qw( %text %translation );
  22 use vars qw( $charset_in $charset_out );
  23 use vars qw( $disable_fuzzy_p );
  24 use vars qw( $verbose_p );
  25 use vars qw( $po_mode_p );
  26
  27 ###############################################################################
  28
  29 sub string_negligible_p ($) {
  30     my($t) = @_;                                # a string
  31     # Don't emit pure whitespace, pure numbers, pure punctuation,
  32     # single letters, or TMPL_VAR's.
  33     # Punctuation should arguably be translated. But without context
  34     # they are untranslatable. Note that $t is a string, not a token object.
  35     return !$extract_all_p && (
  36                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  37             || $t =~ /^\d+$/                    # purely digits
  38             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  39             || $t =~ /^[A-Za-z]$/               # single letters
  40         )
  41 }
  42
  43 sub token_negligible_p( $ ) {
  44     my($x) = @_;
  45     my $t = $x->type;
  46     return !$extract_all_p && (
  47             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  48             $t == TmplTokenType::DIRECTIVE? 1:
  49             $t == TmplTokenType::TEXT_PARAMETRIZED
  50                 && join( '', map { my $t = $_->type;
  51                         $t == TmplTokenType::DIRECTIVE?
  52                                 '1': $t == TmplTokenType::TAG?
  53                                         '': token_negligible_p( $_ )?
  54                                         '': '1' } @{$x->children} ) eq '' );
  55 }
  56
  57 ###############################################################################
  58
  59 sub remember ($$) {
  60     my($token, $string) = @_;
  61     # If we determine that the string is negligible, don't bother to remember
  62     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  63         my $key = TmplTokenizer::string_canon( $string );
  64         $text{$key} = [] unless defined $text{$key};
  65         push @{$text{$key}}, $token;
  66     }
  67 }
  68
  69 ###############################################################################
  70
  71 sub string_list () {
  72     my @t = keys %text;
  73     # The real gettext tools seems to sort case sensitively; I don't know why
  74     @t = sort { $a cmp $b } @t if $sort eq 's';
  75     @t = sort {
  76             my @aa = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  78             my @bb = sort { $a->pathname cmp $b->pathname
  79                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  80             $aa[0]->pathname cmp $bb[0]->pathname
  81                     || $aa[0]->line_number <=> $bb[0]->line_number;
  82         } @t if $sort eq 'F';
  83     return @t;
  84 }
  85
  86 ###############################################################################
  87
  88 sub text_extract (*) {
  89     my($h) = @_;
  90     for (;;) {
  91         my $s = TmplTokenizer::next_token $h;
  92     last unless defined $s;
  93         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  94         if ($kind eq TmplTokenType::TEXT) {
  95             remember( $s, $t ) if $t =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  97             remember( $s, $s->form ) if $s->form =~ /\S/s;
  98         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  99             # value [tag=input], meta
 100             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 101             for my $a ('alt', 'content', 'title', 'value') {
 102                 if ($attr->{$a}) {
 103                     next if $a eq 'content' && $tag ne 'meta';
 104                     next if $a eq 'value' && ($tag ne 'input'
 105                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|submit)$/)); # FIXME
 106                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 107                     $val = TmplTokenizer::trim $val;
 108                     remember( $s, $val ) if $val =~ /\S/s;
 109                 }
 110             }
 111         } elsif ($s->has_js_data) {
 112             for my $t (@{$s->js_data}) {
 113                 remember( $s, $t->[3] ) if $t->[0]; # FIXME
 114             }
 115         }
 116     }
 117 }
 118
 119 ###############################################################################
 120
 121 sub generate_strings_list () {
 122     # Emit all extracted strings.
 123     for my $t (string_list) {
 124         printf OUTPUT "%s\n", $t;
 125     }
 126 }
 127
 128 ###############################################################################
 129
 130 sub generate_po_file () {
 131     # We don't emit the Plural-Forms header; it's meaningless for us
 132     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 133     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 134     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 135     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 136     my $time_pot = $time;
 137     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 138     print OUTPUT <<EOF;
 139 # SOME DESCRIPTIVE TITLE.
 140 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 141 # This file is distributed under the same license as the PACKAGE package.
 142 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 143 #
 144 EOF
 145     print OUTPUT <<EOF unless $disable_fuzzy_p;
 146 #, fuzzy
 147 EOF
 148     print OUTPUT <<EOF;
 149 msgid ""
 150 msgstr ""
 151 "Project-Id-Version: PACKAGE VERSION\\n"
 152 "POT-Creation-Date: $time_pot\\n"
 153 "PO-Revision-Date: $time_po\\n"
 154 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 155 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 156 "MIME-Version: 1.0\\n"
 157 "Content-Type: text/plain; charset=$pot_charset\\n"
 158 "Content-Transfer-Encoding: 8bit\\n"
 159
 160 EOF
 161     my $directory_re = quotemeta("$directory/");
 162     for my $t (string_list) {
 163         if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
 164             my($token, $n) = ($text{$t}->[0], 0);
 165             printf OUTPUT "#. For the first occurrence,\n"
 166                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 167             for my $param ($token->parameters_and_fields) {
 168                 $n += 1;
 169                 my $type = $param->type;
 170                 my $subtype = ($type == TmplTokenType::TAG
 171                         && $param->string =~ /^<input\b/is?
 172                                 $param->attributes->{'type'}->[1]: undef);
 173                 my $fmt = TmplTokenizer::_formalize( $param );
 174                 $fmt =~ s/^%/%$n\$/;
 175                 if ($type == TmplTokenType::DIRECTIVE) {
 176                     $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
 177                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 178                             $2: undef;
 179                     printf OUTPUT "#. %s: %s\n", $fmt,
 180                         "$type" . (defined $name? " name=$name": '');
 181                 } else {
 182                     my $name = $param->attributes->{'name'};
 183                     my $value = $param->attributes->{'value'}
 184                             unless $subtype =~ /^(?:text)$/;
 185                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 186                             . (defined $name?  " name=$name->[1]": '')
 187                             . (defined $value? " value=$value->[1]": '');
 188                 }
 189             }
 190         } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
 191             my($token) = ($text{$t}->[0]);
 192             printf OUTPUT "#. For the first occurrence,\n"
 193                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 194             if ($token->string =~ /^<meta\b/is) {
 195                 my $type = $token->attributes->{'http-equiv'}->[1];
 196                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 197             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 198                 my $tag = uc($1);
 199                 my $type = (lc($tag) eq 'input'?
 200                         $token->attributes->{'type'}: undef);
 201                 my $name = $token->attributes->{'name'};
 202                 printf OUTPUT "#. %s\n", $tag
 203                     . (defined $type? " type=$type->[1]": '')
 204                     . (defined $name? " name=$name->[1]": '');
 205             }
 206         } elsif ($text{$t}->[0]->has_js_data) {
 207             printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
 208             printf OUTPUT "#. SCRIPT\n";
 209         }
 210         my $cformat_p;
 211         for my $token (@{$text{$t}}) {
 212             my $pathname = $token->pathname;
 213             $pathname =~ s/^$directory_re//os;
 214             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 215                     if defined $pathname && defined $token->line_number;
 216             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 217         }
 218         printf OUTPUT "#, c-format\n" if $cformat_p;
 219         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 220                 TmplTokenizer::string_canon
 221                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 222         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 223                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 224     }
 225 }
 226
 227 ###############################################################################
 228
 229 sub convert_translation_file () {
 230     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 231     VerboseWarnings::set_input_file_name $convert_from;
 232     while (<INPUT>) {
 233         chomp;
 234         my($msgid, $msgstr) = split(/\t/);
 235         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 236                 unless defined $msgstr;
 237
 238         # Fixup some of the bad strings
 239         $msgid =~ s/^SELECTED>//;
 240
 241         # Create dummy token
 242         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 243         remember( $token, $msgid );
 244         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 245         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 246
 247         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 248             my $candidate = TmplTokenizer::charset_canon $2;
 249             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 250                     if defined $charset_in && $charset_in ne $candidate;
 251             $charset_in = $candidate;
 252         }
 253         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 254             my $candidate = TmplTokenizer::charset_canon $2;
 255             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 256                     if defined $charset_out && $charset_out ne $candidate;
 257             $charset_out = $candidate;
 258         }
 259     }
 260     # The following assumption is correct; that's what HTML::Template assumes
 261     if (!defined $charset_in) {
 262         $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
 263         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 264     }
 265 }
 266
 267 ###############################################################################
 268
 269 sub usage ($) {
 270     my($exitcode) = @_;
 271     my $h = $exitcode? *STDERR: *STDOUT;
 272     print $h <<EOF;
 273 Usage: $0 [OPTIONS]
 274 Extract translatable strings from given HTML::Template input files.
 275
 276 Input file location:
 277   -f, --files-from=FILE          Get list of input files from FILE
 278   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 279
 280 Output file location:
 281   -o, --output=FILE              Write output to specified file
 282
 283 HTML::Template options:
 284   -a, --extract-all              Extract all strings
 285       --pedantic-warnings        Issue warnings even for detected problems
 286                                  which are likely to be harmless
 287
 288 Output details:
 289   -s, --sort-output              generate sorted output
 290   -F, --sort-by-file             sort output by file location
 291   -v, --verbose                  explain what is being done
 292
 293 Informative output:
 294       --help                     Display this help and exit
 295
 296 Try `perldoc $0' for perhaps more information.
 297 EOF
 298     exit($exitcode);
 299 }
 300
 301 ###############################################################################
 302
 303 sub usage_error (;$) {
 304     print STDERR "$_[0]\n" if @_;
 305     print STDERR "Try `$0 --help' for more information.\n";
 306     exit(-1);
 307 }
 308
 309 ###############################################################################
 310
 311 Getopt::Long::config qw( bundling no_auto_abbrev );
 312 GetOptions(
 313     'a|extract-all'                     => \$extract_all_p,
 314     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 315     'convert-from=s'                    => \$convert_from,
 316     'D|directory=s'                     => \$directory,
 317     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 318     'f|files-from=s'                    => \$files_from,
 319     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 320     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 321     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 322     'output|o=s'                        => \$output,
 323     'po-mode'                           => \$po_mode_p,         # INTERNAL
 324     's|sort-output'                     => sub { $sort = 's' },
 325     'F|sort-by-file'                    => sub { $sort = 'F' },
 326     'v|verbose'                         => \$verbose_p,
 327     'help'                              => sub { usage(0) },
 328 ) || usage_error;
 329
 330 VerboseWarnings::set_application_name $0;
 331 VerboseWarnings::set_pedantic_mode $pedantic_p;
 332
 333 usage_error('Missing mandatory option -f')
 334         unless defined $files_from || defined $convert_from;
 335 $directory = '.' unless defined $directory;
 336
 337 usage_error('You cannot specify both --convert-from and --files-from')
 338         if defined $convert_from && defined $files_from;
 339
 340 if (defined $output && $output ne '-') {
 341     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 342     open(OUTPUT, ">$output") || die "$output: $!\n";
 343 } else {
 344     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 345     open(OUTPUT, ">&STDOUT");
 346 }
 347
 348 if (defined $files_from) {
 349     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 350     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 351     while (<INPUT>) {
 352         chomp;
 353         my $input = /^\//? $_: "$directory/$_";
 354         my $h = TmplTokenizer->new( $input );
 355         $h->set_allow_cformat( 1 );
 356         VerboseWarnings::set_input_file_name $input;
 357         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 358         text_extract( $h );
 359     }
 360     close INPUT;
 361 } else {
 362     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 363     convert_translation_file;
 364 }
 365 generate_po_file;
 366
 367 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 368         if TmplTokenizer::syntaxerror_p;
 369
 370
 371 exit(-1) if TmplTokenizer::fatal_p;
 372
 373 ###############################################################################
 374
 375 =head1 DESCRIPTION
 376
 377 This is an experimental script based on the modularized
 378 text-extract2.pl script.  It has behaviour similar to
 379 xgettext(1), and generates gettext-compatible output files.
 380
 381 A gettext-like format provides the following advantages:
 382
 383 =over
 384
 385 =item -
 386
 387 Translation to non-English-like languages with different word
 388 order:  gettext's c-format strings can theoretically be
 389 emulated if we are able to do some analysis on the .tmpl input
 390 and treat <TMPL_VAR> in a way similar to %s.
 391
 392 =item -
 393
 394 Context for the extracted strings:  the gettext format provides
 395 the filenames and line numbers where each string can be found.
 396 The translator can read the source file and see the context,
 397 in case the string by itself can mean several different things.
 398
 399 =item -
 400
 401 Place for the translator to add comments about the translations.
 402
 403 =item -
 404
 405 Gettext-compatible tools, if any, might be usable if we adopt
 406 the gettext format.
 407
 408 =back
 409
 410 This script has already been in use for over a year and should
 411 be reasonable stable. Nevertheless, it is still somewhat
 412 experimental and there are still some issues.
 413
 414 Please refer to the explanation in tmpl_process3 for further
 415 details.
 416
 417 If you want to generate GNOME-style POTFILES.in files, such
 418 files (passed to -f) can be generated thus:
 419
 420         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 421                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 422         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 423                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 424
 425 This is, however, quite pointless, because the "create" and
 426 "update" actions have already been implemented in tmpl_process3.pl.
 427
 428 =head2 Strings inside JavaScript
 429
 430 In the SCRIPT elements, the script will attempt to scan for
 431 _("I<string literal>") patterns, and extract the I<string literal>
 432 as a translatable string.
 433
 434 Note that the C-like _(...) notation is required.
 435
 436 The JavaScript must actually define a _ function
 437 so that the code remains correct JavaScript.
 438 A suitable definition of such a function can be
 439
 440         function _(s) { return s } // dummy function for gettext
 441
 442 =head1 SEE ALSO
 443
 444 tmpl_process3.pl,
 445 xgettext(1),
 446 Locale::PO(3),
 447 translator_doc.txt
 448
 449 =head1 BUGS
 450
 451 There probably are some. Bugs related to scanning of <INPUT>
 452 tags seem to be especially likely to be present.
 453
 454 Its diagnostics are probably too verbose.
 455
 456 When a <TMPL_VAR> within a JavaScript-related attribute is
 457 detected, the script currently displays no warnings at all.
 458 It might be good to display some kind of warning.
 459
 460 Its sort order (-s option) seems to be different than the real
 461 xgettext(1)'s sort option. This will result in translation
 462 strings inside the generated PO file spuriously moving about
 463 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 464
 465 If a Javascript string has leading spaces, it will
 466 generate strings with spurious leading spaces,
 467 leading to failure to match the strings when actually generating
 468 translated files.
 469
 470 =cut