misc/translator/xgettext.pl

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
   6
   7 =cut
   8
   9 use strict;
  10 use Getopt::Long;
  11 use Locale::PO;
  12 use TmplTokenizer;
  13 use VerboseWarnings;
  14
  15 use vars qw( $files_from $directory $output $sort );
  16 use vars qw( $pedantic_p );
  17 use vars qw( %text );
  18
  19 ###############################################################################
  20
  21 sub remember ($$) {
  22     my($token, $string) = @_;
  23     $text{$string} = [] unless defined $text{$string};
  24     push @{$text{$string}}, $token;
  25 }
  26
  27 ###############################################################################
  28
  29 sub string_list () {
  30     my @t = keys %text;
  31     # The real gettext tools seems to sort case sensitively; I don't know why
  32     @t = sort { $a cmp $b } @t if $sort eq 's';
  33     @t = sort {
  34             my @aa = sort { $a->pathname cmp $b->pathname
  35                     || $a->line_number <=> $b->line_number } @{$text{$a}};
  36             my @bb = sort { $a->pathname cmp $b->pathname
  37                     || $a->line_number <=> $b->line_number } @{$text{$b}};
  38             $aa[0]->pathname cmp $bb[0]->pathname
  39                     || $aa[0]->line_number <=> $bb[0]->line_number;
  40         } @t if $sort eq 'F';
  41     return @t;
  42 }
  43
  44 ###############################################################################
  45
  46 sub text_extract (*) {
  47     my($h) = @_;
  48     for (;;) {
  49         my $s = TmplTokenizer::next_token $h;
  50     last unless defined $s;
  51         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  52         if ($kind eq TmplTokenType::TEXT) {
  53             #$t = TmplTokenizer::trim $t;
  54             remember( $s, $t ) if $t =~ /\S/s;
  55         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
  56             #$t = TmplTokenizer::trim $t;
  57             remember( $s, $s->form ) if $s->form =~ /\S/s;
  58         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
  59             # value [tag=input], meta
  60             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  61             for my $a ('alt', 'content', 'title', 'value') {
  62                 if ($attr->{$a}) {
  63                     next if $a eq 'content' && $tag ne 'meta';
  64                     next if $a eq 'value' && ($tag ne 'input'
  65                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
  66                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
  67                     $val = TmplTokenizer::trim $val;
  68                     remember( $s, $val ) if $val =~ /\S/s;
  69                 }
  70             }
  71         }
  72     }
  73 }
  74
  75 ###############################################################################
  76
  77 sub generate_strings_list () {
  78     # Emit all extracted strings.
  79     # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
  80     for my $t (string_list) {
  81         printf OUTPUT "%s\n", $t
  82             unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
  83     }
  84 }
  85
  86 ###############################################################################
  87
  88 sub generate_po_file () {
  89     # We don't emit the Plural-Forms header; it's meaningless for us
  90     print OUTPUT <<EOF;
  91 # SOME DESCRIPTIVE TITLE.
  92 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
  93 # This file is distributed under the same license as the PACKAGE package.
  94 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
  95 #
  96 #, fuzzy
  97 msgid ""
  98 msgstr ""
  99 "Project-Id-Version: PACKAGE VERSION\\n"
 100 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
 101 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
 102 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
 103 "Language-Team: LANGUAGE <LL\@li.org>\\n"
 104 "MIME-Version: 1.0\\n"
 105 "Content-Type: text/plain; charset=CHARSET\\n"
 106 "Content-Transfer-Encoding: 8bit\\n"
 107
 108 EOF
 109     my $directory_re = quotemeta("$directory/");
 110     for my $t (string_list) {
 111         next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
 112         my $cformat_p;
 113         for my $token (@{$text{$t}}) {
 114             my $pathname = $token->pathname;
 115             $pathname =~ s/^$directory_re//os;
 116             printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number;
 117             $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
 118         }
 119         printf OUTPUT "#, c-format\n" if $cformat_p;
 120         printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t );
 121         printf OUTPUT "msgstr \"\"\n\n";
 122     }
 123 }
 124
 125 ###############################################################################
 126
 127 sub usage ($) {
 128     my($exitcode) = @_;
 129     my $h = $exitcode? *STDERR: *STDOUT;
 130     print $h <<EOF;
 131 Usage: $0 [OPTIONS]
 132 Extract translatable strings from given HTML::Template input files.
 133
 134 Input file location:
 135   -f, --files-from=FILE          Get list of input files from FILE
 136   -D, --directory=DIRECTORY      Add DIRECTORY to list for input files search
 137
 138 Output file location:
 139   -o, --output=FILE              Write output to specified file
 140
 141 HTML::Template options:
 142       --pedantic-warnings        Issue warnings even for detected problems
 143                                  which are likely to be harmless
 144
 145 Output details:
 146   -s, --sort-output              generate sorted output
 147   -F, --sort-by-file             sort output by file location
 148
 149 Informative output:
 150       --help                     Display this help and exit
 151 EOF
 152     exit($exitcode);
 153 }
 154
 155 ###############################################################################
 156
 157 sub usage_error (;$) {
 158     print STDERR "$_[0]\n" if @_;
 159     print STDERR "Try `$0 --help' for more information.\n";
 160     exit(-1);
 161 }
 162
 163 ###############################################################################
 164
 165 Getopt::Long::config qw( bundling no_auto_abbrev );
 166 GetOptions(
 167     'D|directory=s'                     => \$directory,
 168     'f|files-from=s'                    => \$files_from,
 169     'pedantic-warnings|pedantic'        => sub { $pedantic_p = 1 },
 170     'output|o=s'                        => \$output,
 171     's|sort-output'                     => sub { $sort = 's' },
 172     'F|sort-by-file'                    => sub { $sort = 'F' },
 173     'help'                              => sub { usage(0) },
 174 ) || usage_error;
 175
 176 VerboseWarnings::set_application_name $0;
 177 VerboseWarnings::set_pedantic_mode $pedantic_p;
 178
 179 usage_error('Missing mandatory option -f') unless defined $files_from;
 180 $directory = '.' unless defined $directory;
 181
 182 if (defined $output && $output ne '-') {
 183     open(OUTPUT, ">$output") || die "$output: $!\n";
 184 } else {
 185     open(OUTPUT, ">&STDOUT");
 186 }
 187
 188 open(INPUT, "<$files_from") || die "$files_from: $!\n";
 189 while (<INPUT>) {
 190     chomp;
 191     my $h = TmplTokenizer->new( "$directory/$_" );
 192     $h->set_allow_cformat( 1 );
 193     VerboseWarnings::set_input_file_name "$directory/$_";
 194     text_extract( $h );
 195 }
 196 close INPUT;
 197 generate_po_file;
 198
 199 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 200         if TmplTokenizer::syntaxerror_p;
 201
 202
 203 exit(-1) if TmplTokenizer::fatal_p;
 204
 205 ###############################################################################
 206
 207 =head1 DESCRIPTION
 208
 209 This is an experimental script based on the modularized
 210 text-extract2.pl script.  It has behaviour similar to
 211 xgettext(1), and generates gettext-compatible output files.
 212
 213 A gettext-like format provides the following advantages:
 214
 215 =over
 216
 217 =item -
 218
 219 (Future goal)
 220 Translation to non-English-like languages with different word
 221 order:  gettext's c-format strings can theoretically be
 222 emulated if we are able to do some analysis on the .tmpl input
 223 and treat <TMPL_VAR> in a way similar to %s.
 224
 225 =item -
 226
 227 Context for the extracted strings:  the gettext format provides
 228 the filenames and line numbers where each string can be found.
 229 The translator can read the source file and see the context,
 230 in case the string by itself can mean several different things.
 231
 232 =item -
 233
 234 Place for the translator to add comments about the translations.
 235
 236 =item -
 237
 238 Gettext-compatible tools, if any, might be usable if we adopt
 239 the gettext format.
 240
 241 =back
 242
 243 Right now it does about the same thing as text-extract2.pl but
 244 generates gettext-style output; however, because it is scanner-
 245 instead of parser-based, it is able to address the 4 weaknesses
 246 listed in translator_doc.txt.  Ultimately, the goal is to make
 247 this able to do some kind of simple analysis on the input to
 248 produce gettext-style output with c-format strings, in order to
 249 facilitate translation to languages with a different word order
 250 than English.
 251
 252 When the above is finished, the generated po file may contain
 253 some HTML tags in addition to %s strings.
 254
 255 If you want to generate GNOME-style POTFILES.in files, such
 256 files (passed to -f) can be generated thus:
 257
 258         (cd ../.. && find koha-tmpl/opac-tmpl/default/en
 259                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
 260         (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
 261                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
 262
 263 This is, however, quite pointless, because the "create" and
 264 "update" actions have already been implemented in tmpl_process3.pl.
 265
 266 =head1 SEE ALSO
 267
 268 tmpl_process.pl,
 269 xgettext(1),
 270 Locale::PO(3),
 271 translator_doc.txt
 272
 273 =head1 BUGS
 274
 275 There probably are some. Bugs related to scanning of <INPUT>
 276 tags seem to be especially likely to be present.
 277
 278 Its diagnostics are probably too verbose.
 279
 280 When a <TMPL_VAR> within a JavaScript-related attribute is
 281 detected, the script currently displays no warnings at all.
 282 It might be good to display some kind of warning.
 283
 284 Its sort order (-s option) seems to be different than the real
 285 xgettext(1)'s sort option. This will result in translation
 286 strings inside the generated PO file spuriously moving about
 287 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
 288
 289 =cut