misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use POSIX;
  12 use Locale::PO;
  13 use TmplTokenizer;
  14 use VerboseWarnings;
  15
  16 use vars qw( $convert_from );
  17 use vars qw( $files_from $directory $output $sort );
  18 use vars qw( $extract_all_p );
  19 use vars qw( $pedantic_p );
  20 use vars qw( %text %translation );
  21 use vars qw( $charset_in $charset_out );
  22 use vars qw( $disable_fuzzy_p );
  23 use vars qw( $verbose_p );
  24 use vars qw( $po_mode_p );
  25
  26 ###############################################################################
  27
  28 sub string_negligible_p ($) {
  29     my($t) = @_;                                # a string
  30     # Don't emit pure whitespace, pure numbers, pure punctuation,
  31     # single letters, or TMPL_VAR's.
  32     # Punctuation should arguably be translated. But without context
  33     # they are untranslatable. Note that $t is a string, not a token object.
  34     return !$extract_all_p && (
  35                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  36             || $t =~ /^\d+$/                    # purely digits
  37             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  38             || $t =~ /^[A-Za-z]$/               # single letters
  39         )
  40 }
  41
  42 sub token_negligible_p( $ ) {
  43     my($x) = @_;
  44     my $t = $x->type;
  45     return !$extract_all_p && (
  46             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  47             $t == TmplTokenType::DIRECTIVE? 1:
  48             $t == TmplTokenType::TEXT_PARAMETRIZED
  49                 && join( '', map { my $t = $_->type;
  50                         $t == TmplTokenType::DIRECTIVE?
  51                                 '1': $t == TmplTokenType::TAG?
  52                                         '': token_negligible_p( $_ )?
  53                                         '': '1' } @{$x->children} ) eq '' );
  54 }
  55
  56 ###############################################################################
  57
  58 sub remember ($$) {
  59     my($token, $string) = @_;
  60     # If we determine that the string is negligible, don't bother to remember
  61     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  62         my $key = TmplTokenizer::string_canon( $string );
  63         $text{$key} = [] unless defined $text{$key};
  64         push @{$text{$key}}, $token;
  65     }
  66 }
  67
  68 ###############################################################################
  69
  70 sub string_list () {
  71     my @t = keys %text;
  72     # The real gettext tools seems to sort case sensitively; I don't know why
  73     @t = sort { $a cmp $b } @t if $sort eq 's';
  74     @t = sort {
  75             my @aa = sort { $a->pathname cmp $b->pathname
  76                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  77             my @bb = sort { $a->pathname cmp $b->pathname
  78                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  79             $aa[0]->pathname cmp $bb[0]->pathname
  80                     || $aa[0]->line_number <=> $bb[0]->line_number;
  81         } @t if $sort eq 'F';
  82     return @t;
  83 }
  84
  85 ###############################################################################
  86
  87 sub text_extract (*) {
  88     my($h) = @_;
  89     for (;;) {
  90         my $s = TmplTokenizer::next_token $h;
  91     last unless defined $s;
  92         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  93         if ($kind eq TmplTokenType::TEXT) {
  94             remember( $s, $t ) if $t =~ /\S/s;
  95         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  96             remember( $s, $s->form ) if $s->form =~ /\S/s;
  97         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  98             # value [tag=input], meta
  99             my $tag = lc($1) if $t =~ /^<(\S+)/s;
 100             for my $a ('alt', 'content', 'title', 'value') {
 101                 if ($attr->{$a}) {
 102                     next if $a eq 'content' && $tag ne 'meta';
 103                     next if $a eq 'value' && ($tag ne 'input'
 104                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 105                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 106                     $val = TmplTokenizer::trim $val;
 107                     remember( $s, $val ) if $val =~ /\S/s;
 108                 }
 109             }
 110         }
 111     }
 112 }
 113
 114 ###############################################################################
 115
 116 sub generate_strings_list () {
 117     # Emit all extracted strings.
 118     for my $t (string_list) {
 119         printf OUTPUT "%s\n", $t;
 120     }
 121 }
 122
 123 ###############################################################################
 124
 125 sub generate_po_file () {
 126     # We don't emit the Plural-Forms header; it's meaningless for us
 127     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 128     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 129     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 130     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 131     my $time_pot = $time;
 132     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 133     print OUTPUT <<EOF;
 134 # SOME DESCRIPTIVE TITLE.
 135 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 136 # This file is distributed under the same license as the PACKAGE package.
 137 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 138 #
 139 EOF
 140     print OUTPUT <<EOF unless $disable_fuzzy_p;
 141 #, fuzzy
 142 EOF
 143     print OUTPUT <<EOF;
 144 msgid ""
 145 msgstr ""
 146 "Project-Id-Version: PACKAGE VERSION\\n"
 147 "POT-Creation-Date: $time_pot\\n"
 148 "PO-Revision-Date: $time_po\\n"
 149 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 150 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 151 "MIME-Version: 1.0\\n"
 152 "Content-Type: text/plain; charset=$pot_charset\\n"
 153 "Content-Transfer-Encoding: 8bit\\n"
 154
 155 EOF
 156     my $directory_re = quotemeta("$directory/");
 157     for my $t (string_list) {
 158         if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
 159             my($token, $n) = ($text{$t}->[0], 0);
 160             printf OUTPUT "#. For the first occurrence,\n"
 161                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 162             for my $param ($token->parameters_and_fields) {
 163                 $n += 1;
 164                 my $type = $param->type;
 165                 my $subtype = ($type == TmplTokenType::TAG
 166                         && $param->string =~ /^<input\b/is?
 167                                 $param->attributes->{'type'}->[1]: undef);
 168                 my $fmt = TmplTokenizer::_formalize( $param );
 169                 $fmt =~ s/^%/%$n\$/;
 170                 if ($type == TmplTokenType::DIRECTIVE) {
 171                     $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
 172                     my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
 173                             $2: undef;
 174                     printf OUTPUT "#. %s: %s\n", $fmt,
 175                         "$type" . (defined $name? " name=$name": '');
 176                 } else {
 177                     my $name = $param->attributes->{'name'};
 178                     my $value = $param->attributes->{'value'}
 179                             unless $subtype =~ /^(?:text)$/;
 180                     printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
 181                             . (defined $name?  " name=$name->[1]": '')
 182                             . (defined $value? " value=$value->[1]": '');
 183                 }
 184             }
 185         } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
 186             my($token) = ($text{$t}->[0]);
 187             printf OUTPUT "#. For the first occurrence,\n"
 188                     if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
 189             if ($token->string =~ /^<meta\b/is) {
 190                 my $type = $token->attributes->{'http-equiv'}->[1];
 191                 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
 192             } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
 193                 my $tag = uc($1);
 194                 my $type = (lc($tag) eq 'input'?
 195                         $token->attributes->{'type'}: undef);
 196                 my $name = $token->attributes->{'name'};
 197                 printf OUTPUT "#. %s\n", $tag
 198                     . (defined $type? " type=$type->[1]": '')
 199                     . (defined $name? " name=$name->[1]": '');
 200             }
 201         }
 202         my $cformat_p;
 203         for my $token (@{$text{$t}}) {
 204             my $pathname = $token->pathname;
 205             $pathname =~ s/^$directory_re//os;
 206             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 207                     if defined $pathname && defined $token->line_number;
 208             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 209         }
 210         printf OUTPUT "#, c-format\n" if $cformat_p;
 211         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 212                 TmplTokenizer::string_canon
 213                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 214         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 215                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 216     }
 217 }
 218
 219 ###############################################################################
 220
 221 sub convert_translation_file () {
 222     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 223     VerboseWarnings::set_input_file_name $convert_from;
 224     while (<INPUT>) {
 225         chomp;
 226         my($msgid, $msgstr) = split(/\t/);
 227         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 228                 unless defined $msgstr;
 229
 230         # Fixup some of the bad strings
 231         $msgid =~ s/^SELECTED>//;
 232
 233         # Create dummy token
 234         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 235         remember( $token, $msgid );
 236         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 237         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 238
 239         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 240             my $candidate = TmplTokenizer::charset_canon $2;
 241             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 242                     if defined $charset_in && $charset_in ne $candidate;
 243             $charset_in = $candidate;
 244         }
 245         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 246             my $candidate = TmplTokenizer::charset_canon $2;
 247             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 248                     if defined $charset_out && $charset_out ne $candidate;
 249             $charset_out = $candidate;
 250         }
 251     }
 252     # The following assumption is correct; that's what HTML::Template assumes
 253     if (!defined $charset_in) {
 254         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 255         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 256     }
 257 }
 258
 259 ###############################################################################
 260
 261 sub usage ($) {
 262     my($exitcode) = @_;
 263     my $h = $exitcode? *STDERR: *STDOUT;
 264     print $h <<EOF;
 265 Usage: $0 [OPTIONS]
 266 Extract translatable strings from given HTML::Template input files.
 267
 268 Input file location:
 269   -f, --files-from=FILE          Get list of input files from FILE
 270   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 271
 272 Output file location:
 273   -o, --output=FILE              Write output to specified file
 274
 275 HTML::Template options:
 276   -a, --extract-all              Extract all strings
 277       --pedantic-warnings        Issue warnings even for detected problems
 278                                  which are likely to be harmless
 279
 280 Output details:
 281   -s, --sort-output              generate sorted output
 282   -F, --sort-by-file             sort output by file location
 283   -v, --verbose                  explain what is being done
 284
 285 Informative output:
 286       --help                     Display this help and exit
 287
 288 Try `perldoc $0' for perhaps more information.
 289 EOF
 290     exit($exitcode);
 291 }
 292
 293 ###############################################################################
 294
 295 sub usage_error (;$) {
 296     print STDERR "$_[0]\n" if @_;
 297     print STDERR "Try `$0 --help' for more information.\n";
 298     exit(-1);
 299 }
 300
 301 ###############################################################################
 302
 303 Getopt::Long::config qw( bundling no_auto_abbrev );
 304 GetOptions(
 305     'a|extract-all'                     => \$extract_all_p,
 306     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 307     'convert-from=s'                    => \$convert_from,
 308     'D|directory=s'                     => \$directory,
 309     'disable-fuzzy'                     => \$disable_fuzzy_p,   # INTERNAL
 310     'f|files-from=s'                    => \$files_from,
 311     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 312     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 313     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 314     'output|o=s'                        => \$output,
 315     'po-mode'                           => \$po_mode_p,         # INTERNAL
 316     's|sort-output'                     => sub { $sort = 's' },
 317     'F|sort-by-file'                    => sub { $sort = 'F' },
 318     'v|verbose'                         => \$verbose_p,
 319     'help'                              => sub { usage(0) },
 320 ) || usage_error;
 321
 322 VerboseWarnings::set_application_name $0;
 323 VerboseWarnings::set_pedantic_mode $pedantic_p;
 324
 325 usage_error('Missing mandatory option -f')
 326         unless defined $files_from || defined $convert_from;
 327 $directory = '.' unless defined $directory;
 328
 329 usage_error('You cannot specify both --convert-from and --files-from')
 330         if defined $convert_from && defined $files_from;
 331
 332 if (defined $output && $output ne '-') {
 333     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 334     open(OUTPUT, ">$output") || die "$output: $!\n";
 335 } else {
 336     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 337     open(OUTPUT, ">&STDOUT");
 338 }
 339
 340 if (defined $files_from) {
 341     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 342     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 343     while (<INPUT>) {
 344         chomp;
 345         my $input = /^\//? $_: "$directory/$_";
 346         my $h = TmplTokenizer->new( $input );
 347         $h->set_allow_cformat( 1 );
 348         VerboseWarnings::set_input_file_name $input;
 349         print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
 350         text_extract( $h );
 351     }
 352     close INPUT;
 353 } else {
 354     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 355     convert_translation_file;
 356 }
 357 generate_po_file;
 358
 359 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 360         if TmplTokenizer::syntaxerror_p;
 361
 362
 363 exit(-1) if TmplTokenizer::fatal_p;
 364
 365 ###############################################################################
 366
 367 =head1 DESCRIPTION
 368
 369 This is an experimental script based on the modularized
 370 text-extract2.pl script.  It has behaviour similar to
 371 xgettext(1), and generates gettext-compatible output files.
 372
 373 A gettext-like format provides the following advantages:
 374
 375 =over
 376
 377 =item -
 378
 379 (Future goal)
 380 Translation to non-English-like languages with different word
 381 order:  gettext's c-format strings can theoretically be
 382 emulated if we are able to do some analysis on the .tmpl input
 383 and treat <TMPL_VAR> in a way similar to %s.
 384
 385 =item -
 386
 387 Context for the extracted strings:  the gettext format provides
 388 the filenames and line numbers where each string can be found.
 389 The translator can read the source file and see the context,
 390 in case the string by itself can mean several different things.
 391
 392 =item -
 393
 394 Place for the translator to add comments about the translations.
 395
 396 =item -
 397
 398 Gettext-compatible tools, if any, might be usable if we adopt
 399 the gettext format.
 400
 401 =back
 402
 403 Note that this script is experimental and should still be
 404 considered unstable.
 405
 406 Please refer to the explanation in tmpl_process3 for further
 407 details.
 408
 409 If you want to generate GNOME-style POTFILES.in files, such
 410 files (passed to -f) can be generated thus:
 411
 412         (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
 413                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 414         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
 415                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 416
 417 This is, however, quite pointless, because the "create" and
 418 "update" actions have already been implemented in tmpl_process3.pl.
 419
 420 =head1 SEE ALSO
 421
 422 tmpl_process.pl,
 423 xgettext(1),
 424 Locale::PO(3),
 425 translator_doc.txt
 426
 427 =head1 BUGS
 428
 429 There probably are some. Bugs related to scanning of <INPUT>
 430 tags seem to be especially likely to be present.
 431
 432 Its diagnostics are probably too verbose.
 433
 434 When a <TMPL_VAR> within a JavaScript-related attribute is
 435 detected, the script currently displays no warnings at all.
 436 It might be good to display some kind of warning.
 437
 438 Its sort order (-s option) seems to be different than the real
 439 xgettext(1)'s sort option. This will result in translation
 440 strings inside the generated PO file spuriously moving about
 441 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 442
 443 =cut