misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use POSIX;
  12 use Locale::PO;
  13 use TmplTokenizer;
  14 use VerboseWarnings;
  15
  16 use vars qw( $convert_from );
  17 use vars qw( $files_from $directory $output $sort );
  18 use vars qw( $extract_all_p );
  19 use vars qw( $pedantic_p );
  20 use vars qw( %text %translation );
  21 use vars qw( $charset_in $charset_out );
  22 use vars qw( $verbose_p );
  23 use vars qw( $po_mode_p );
  24
  25 ###############################################################################
  26
  27 sub string_negligible_p ($) {
  28     my($t) = @_;                                # a string
  29     # Don't emit pure whitespace, pure numbers, pure punctuation,
  30     # single letters, or TMPL_VAR's.
  31     # Punctuation should arguably be translated. But without context
  32     # they are untranslatable. Note that $t is a string, not a token object.
  33     return !$extract_all_p && (
  34                TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
  35             || $t =~ /^\d+$/                    # purely digits
  36             || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
  37             || $t =~ /^[A-Za-z]$/               # single letters
  38         )
  39 }
  40
  41 sub token_negligible_p( $ ) {
  42     my($x) = @_;
  43     my $t = $x->type;
  44     return !$extract_all_p && (
  45             $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
  46             $t == TmplTokenType::DIRECTIVE? 1:
  47             $t == TmplTokenType::TEXT_PARAMETRIZED
  48                 && join( '', map { my $t = $_->type;
  49                         $t == TmplTokenType::DIRECTIVE?
  50                                 '1': $t == TmplTokenType::TAG?
  51                                         '': token_negligible_p( $_ )?
  52                                         '': '1' } @{$x->children} ) eq '' );
  53 }
  54
  55 ###############################################################################
  56
  57 sub remember ($$) {
  58     my($token, $string) = @_;
  59     # If we determine that the string is negligible, don't bother to remember
  60     unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
  61         my $key = TmplTokenizer::string_canon( $string );
  62         $text{$key} = [] unless defined $text{$key};
  63         push @{$text{$key}}, $token;
  64     }
  65 }
  66
  67 ###############################################################################
  68
  69 sub string_list () {
  70     my @t = keys %text;
  71     # The real gettext tools seems to sort case sensitively; I don't know why
  72     @t = sort { $a cmp $b } @t if $sort eq 's';
  73     @t = sort {
  74             my @aa = sort { $a->pathname cmp $b->pathname
  75                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  76             my @bb = sort { $a->pathname cmp $b->pathname
  77                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  78             $aa[0]->pathname cmp $bb[0]->pathname
  79                     || $aa[0]->line_number <=> $bb[0]->line_number;
  80         } @t if $sort eq 'F';
  81     return @t;
  82 }
  83
  84 ###############################################################################
  85
  86 sub text_extract (*) {
  87     my($h) = @_;
  88     for (;;) {
  89         my $s = TmplTokenizer::next_token $h;
  90     last unless defined $s;
  91         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  92         if ($kind eq TmplTokenType::TEXT) {
  93             remember( $s, $t ) if $t =~ /\S/s;
  94         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  95             remember( $s, $s->form ) if $s->form =~ /\S/s;
  96         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  97             # value [tag=input], meta
  98             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  99             for my $a ('alt', 'content', 'title', 'value') {
 100                 if ($attr->{$a}) {
 101                     next if $a eq 'content' && $tag ne 'meta';
 102                     next if $a eq 'value' && ($tag ne 'input'
 103                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
 104                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
 105                     $val = TmplTokenizer::trim $val;
 106                     remember( $s, $val ) if $val =~ /\S/s;
 107                 }
 108             }
 109         }
 110     }
 111 }
 112
 113 ###############################################################################
 114
 115 sub generate_strings_list () {
 116     # Emit all extracted strings.
 117     for my $t (string_list) {
 118         printf OUTPUT "%s\n", $t # unless negligible_p($t);
 119     }
 120 }
 121
 122 ###############################################################################
 123
 124 sub generate_po_file () {
 125     # We don't emit the Plural-Forms header; it's meaningless for us
 126     my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
 127     $pot_charset = TmplTokenizer::charset_canon $pot_charset;
 128     # Time stamps aren't exactly right semantically. I don't know how to fix it.
 129     my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
 130     my $time_pot = $time;
 131     my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
 132     print OUTPUT <<EOF;
 133 # SOME DESCRIPTIVE TITLE.
 134 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 135 # This file is distributed under the same license as the PACKAGE package.
 136 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
 137 #
 138 #, fuzzy
 139 msgid ""
 140 msgstr ""
 141 "Project-Id-Version: PACKAGE VERSION\\n"
 142 "POT-Creation-Date: $time_pot\\n"
 143 "PO-Revision-Date: $time_po\\n"
 144 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 145 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 146 "MIME-Version: 1.0\\n"
 147 "Content-Type: text/plain; charset=$pot_charset\\n"
 148 "Content-Transfer-Encoding: 8bit\\n"
 149
 150 EOF
 151     my $directory_re = quotemeta("$directory/");
 152     for my $t (string_list) {
 153         #next if negligible_p($t);
 154         my $cformat_p;
 155         for my $token (@{$text{$t}}) {
 156             my $pathname = $token->pathname;
 157             $pathname =~ s/^$directory_re//os;
 158             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
 159                     if defined $pathname && defined $token->line_number;
 160             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 161         }
 162         printf OUTPUT "#, c-format\n" if $cformat_p;
 163         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
 164                 TmplTokenizer::string_canon
 165                 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 166         printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 167                 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
 168     }
 169 }
 170
 171 ###############################################################################
 172
 173 sub convert_translation_file () {
 174     open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
 175     VerboseWarnings::set_input_file_name $convert_from;
 176     while (<INPUT>) {
 177         chomp;
 178         my($msgid, $msgstr) = split(/\t/);
 179         die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
 180                 unless defined $msgstr;
 181
 182         # Fixup some of the bad strings
 183         $msgid =~ s/^SELECTED>//;
 184
 185         # Create dummy token
 186         my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
 187         remember( $token, $msgid );
 188         $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
 189         $translation{$msgid} = $msgstr unless $msgstr eq '*****';
 190
 191         if ($msgid  =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 192             my $candidate = TmplTokenizer::charset_canon $2;
 193             die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
 194                     if defined $charset_in && $charset_in ne $candidate;
 195             $charset_in = $candidate;
 196         }
 197         if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
 198             my $candidate = TmplTokenizer::charset_canon $2;
 199             die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
 200                     if defined $charset_out && $charset_out ne $candidate;
 201             $charset_out = $candidate;
 202         }
 203     }
 204     # The following assumption is correct; that's what HTML::Template assumes
 205     if (!defined $charset_in) {
 206         $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
 207         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
 208     }
 209 }
 210
 211 ###############################################################################
 212
 213 sub usage ($) {
 214     my($exitcode) = @_;
 215     my $h = $exitcode? *STDERR: *STDOUT;
 216     print $h <<EOF;
 217 Usage: $0 [OPTIONS]
 218 Extract translatable strings from given HTML::Template input files.
 219
 220 Input file location:
 221   -f, --files-from=FILE          Get list of input files from FILE
 222   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 223
 224 Output file location:
 225   -o, --output=FILE              Write output to specified file
 226
 227 HTML::Template options:
 228   -a, --extract-all              Extract all strings
 229       --pedantic-warnings        Issue warnings even for detected problems
 230                                  which are likely to be harmless
 231
 232 Output details:
 233   -s, --sort-output              generate sorted output
 234   -F, --sort-by-file             sort output by file location
 235   -v, --verbose                  explain what is being done
 236
 237 Informative output:
 238       --help                     Display this help and exit
 239 EOF
 240     exit($exitcode);
 241 }
 242
 243 ###############################################################################
 244
 245 sub usage_error (;$) {
 246     print STDERR "$_[0]\n" if @_;
 247     print STDERR "Try `$0 --help' for more information.\n";
 248     exit(-1);
 249 }
 250
 251 ###############################################################################
 252
 253 Getopt::Long::config qw( bundling no_auto_abbrev );
 254 GetOptions(
 255     'a|extract-all'                     => \$extract_all_p,
 256     'charset=s' => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
 257     'convert-from=s'                    => \$convert_from,
 258     'D|directory=s'                     => \$directory,
 259     'f|files-from=s'                    => \$files_from,
 260     'I|input-charset=s'                 => \$charset_in,        # INTERNAL
 261     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 262     'O|output-charset=s'                => \$charset_out,       # INTERNAL
 263     'output|o=s'                        => \$output,
 264     'po-mode'                           => \$po_mode_p,         # INTERNAL
 265     's|sort-output'                     => sub { $sort = 's' },
 266     'F|sort-by-file'                    => sub { $sort = 'F' },
 267     'v|verbose'                         => \$verbose_p,
 268     'help'                              => sub { usage(0) },
 269 ) || usage_error;
 270
 271 VerboseWarnings::set_application_name $0;
 272 VerboseWarnings::set_pedantic_mode $pedantic_p;
 273
 274 usage_error('Missing mandatory option -f')
 275         unless defined $files_from || defined $convert_from;
 276 $directory = '.' unless defined $directory;
 277
 278 usage_error('You cannot specify both --convert-from and --files-from')
 279         if defined $convert_from && defined $files_from;
 280
 281 if (defined $output && $output ne '-') {
 282     print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
 283     open(OUTPUT, ">$output") || die "$output: $!\n";
 284 } else {
 285     print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
 286     open(OUTPUT, ">&STDOUT");
 287 }
 288
 289 if (defined $files_from) {
 290     print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
 291     open(INPUT, "<$files_from") || die "$files_from: $!\n";
 292     while (<INPUT>) {
 293         chomp;
 294         my $h = TmplTokenizer->new( "$directory/$_" );
 295         $h->set_allow_cformat( 1 );
 296         VerboseWarnings::set_input_file_name "$directory/$_";
 297         print STDERR "$0: Processing file \"$directory/$_\"\n" if $verbose_p;
 298         text_extract( $h );
 299     }
 300     close INPUT;
 301 } else {
 302     print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
 303     convert_translation_file;
 304 }
 305 generate_po_file;
 306
 307 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 308         if TmplTokenizer::syntaxerror_p;
 309
 310
 311 exit(-1) if TmplTokenizer::fatal_p;
 312
 313 ###############################################################################
 314
 315 =head1 DESCRIPTION
 316
 317 This is an experimental script based on the modularized
 318 text-extract2.pl script.  It has behaviour similar to
 319 xgettext(1), and generates gettext-compatible output files.
 320
 321 A gettext-like format provides the following advantages:
 322
 323 =over
 324
 325 =item -
 326
 327 (Future goal)
 328 Translation to non-English-like languages with different word
 329 order:  gettext's c-format strings can theoretically be
 330 emulated if we are able to do some analysis on the .tmpl input
 331 and treat <TMPL_VAR> in a way similar to %s.
 332
 333 =item -
 334
 335 Context for the extracted strings:  the gettext format provides
 336 the filenames and line numbers where each string can be found.
 337 The translator can read the source file and see the context,
 338 in case the string by itself can mean several different things.
 339
 340 =item -
 341
 342 Place for the translator to add comments about the translations.
 343
 344 =item -
 345
 346 Gettext-compatible tools, if any, might be usable if we adopt
 347 the gettext format.
 348
 349 =back
 350
 351 Note that this script is experimental and should still be
 352 considered unstable.
 353
 354 Please refer to the explanation in tmpl_process3 for further
 355 details.
 356
 357 If you want to generate GNOME-style POTFILES.in files, such
 358 files (passed to -f) can be generated thus:
 359
 360         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 361                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 362         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 363                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 364
 365 This is, however, quite pointless, because the "create" and
 366 "update" actions have already been implemented in tmpl_process3.pl.
 367
 368 =head1 SEE ALSO
 369
 370 tmpl_process.pl,
 371 xgettext(1),
 372 Locale::PO(3),
 373 translator_doc.txt
 374
 375 =head1 BUGS
 376
 377 There probably are some. Bugs related to scanning of <INPUT>
 378 tags seem to be especially likely to be present.
 379
 380 Its diagnostics are probably too verbose.
 381
 382 When a <TMPL_VAR> within a JavaScript-related attribute is
 383 detected, the script currently displays no warnings at all.
 384 It might be good to display some kind of warning.
 385
 386 Its sort order (-s option) seems to be different than the real
 387 xgettext(1)'s sort option. This will result in translation
 388 strings inside the generated PO file spuriously moving about
 389 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 390
 391 =cut