5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
16 use vars qw( $convert_from );
17 use vars qw( $files_from $directory $output $sort );
18 use vars qw( $extract_all_p );
19 use vars qw( $pedantic_p );
20 use vars qw( %text %translation );
21 use vars qw( $charset_in $charset_out );
22 use vars qw( $disable_fuzzy_p );
23 use vars qw( $verbose_p );
24 use vars qw( $po_mode_p );
26 ###############################################################################
28 sub string_negligible_p ($) {
29 my($t) = @_; # a string
30 # Don't emit pure whitespace, pure numbers, pure punctuation,
31 # single letters, or TMPL_VAR's.
32 # Punctuation should arguably be translated. But without context
33 # they are untranslatable. Note that $t is a string, not a token object.
34 return !$extract_all_p && (
35 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
36 || $t =~ /^\d+$/ # purely digits
37 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
38 || $t =~ /^[A-Za-z]$/ # single letters
42 sub token_negligible_p( $ ) {
45 return !$extract_all_p && (
46 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
47 $t == TmplTokenType::DIRECTIVE? 1:
48 $t == TmplTokenType::TEXT_PARAMETRIZED
49 && join( '', map { my $t = $_->type;
50 $t == TmplTokenType::DIRECTIVE?
51 '1': $t == TmplTokenType::TAG?
52 '': token_negligible_p( $_ )?
53 '': '1' } @{$x->children} ) eq '' );
56 ###############################################################################
59 my($token, $string) = @_;
60 # If we determine that the string is negligible, don't bother to remember
61 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
62 my $key = TmplTokenizer::string_canon( $string );
63 $text{$key} = [] unless defined $text{$key};
64 push @{$text{$key}}, $token;
68 ###############################################################################
72 # The real gettext tools seems to sort case sensitively; I don't know why
73 @t = sort { $a cmp $b } @t if $sort eq 's';
75 my @aa = sort { $a->pathname cmp $b->pathname
76 || $a->line_number <=> $b->line_number } @{$text{$a}};
77 my @bb = sort { $a->pathname cmp $b->pathname
78 || $a->line_number <=> $b->line_number } @{$text{$b}};
79 $aa[0]->pathname cmp $bb[0]->pathname
80 || $aa[0]->line_number <=> $bb[0]->line_number;
85 ###############################################################################
87 sub text_extract (*) {
90 my $s = TmplTokenizer::next_token $h;
91 last unless defined $s;
92 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
93 if ($kind eq TmplTokenType::TEXT) {
94 remember( $s, $t ) if $t =~ /\S/s;
95 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
96 remember( $s, $s->form ) if $s->form =~ /\S/s;
97 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
98 # value [tag=input], meta
99 my $tag = lc($1) if $t =~ /^<(\S+)/s;
100 for my $a ('alt', 'content', 'title', 'value') {
102 next if $a eq 'content' && $tag ne 'meta';
103 next if $a eq 'value' && ($tag ne 'input'
104 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
105 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
106 $val = TmplTokenizer::trim $val;
107 remember( $s, $val ) if $val =~ /\S/s;
114 ###############################################################################
116 sub generate_strings_list () {
117 # Emit all extracted strings.
118 for my $t (string_list) {
119 printf OUTPUT "%s\n", $t;
123 ###############################################################################
125 sub generate_po_file () {
126 # We don't emit the Plural-Forms header; it's meaningless for us
127 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
128 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
129 # Time stamps aren't exactly right semantically. I don't know how to fix it.
130 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
131 my $time_pot = $time;
132 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
134 # SOME DESCRIPTIVE TITLE.
135 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
136 # This file is distributed under the same license as the PACKAGE package.
137 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
140 print OUTPUT <<EOF unless $disable_fuzzy_p;
146 "Project-Id-Version: PACKAGE VERSION\\n"
147 "POT-Creation-Date: $time_pot\\n"
148 "PO-Revision-Date: $time_po\\n"
149 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
150 "Language-Team: LANGUAGE <LL\@li.org>\\n"
151 "MIME-Version: 1.0\\n"
152 "Content-Type: text/plain; charset=$pot_charset\\n"
153 "Content-Transfer-Encoding: 8bit\\n"
156 my $directory_re = quotemeta("$directory/");
157 for my $t (string_list) {
158 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
159 my($token, $n) = ($text{$t}->[0], 0);
160 printf OUTPUT "#. For the first occurrence,\n"
161 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
162 for my $param ($token->parameters_and_fields) {
164 my $type = $param->type;
165 my $subtype = ($type == TmplTokenType::TAG
166 && $param->string =~ /^<input\b/is?
167 $param->attributes->{'type'}->[1]: undef);
168 my $fmt = TmplTokenizer::_formalize( $param );
170 if ($type == TmplTokenType::DIRECTIVE) {
171 $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
172 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
174 printf OUTPUT "#. %s: %s\n", $fmt,
175 "$type" . (defined $name? " name=$name": '');
177 my $name = $param->attributes->{'name'};
178 my $value = $param->attributes->{'value'}
179 unless $subtype =~ /^(?:text)$/;
180 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
181 . (defined $name? " name=$name->[1]": '')
182 . (defined $value? " value=$value->[1]": '');
185 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
186 my($token) = ($text{$t}->[0]);
187 printf OUTPUT "#. For the first occurrence,\n"
188 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
189 if ($token->string =~ /^<meta\b/is) {
190 my $type = $token->attributes->{'http-equiv'}->[1];
191 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
192 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
194 my $type = (lc($tag) eq 'input'?
195 $token->attributes->{'type'}: undef);
196 my $name = $token->attributes->{'name'};
197 printf OUTPUT "#. %s\n", $tag
198 . (defined $type? " type=$type->[1]": '')
199 . (defined $name? " name=$name->[1]": '');
203 for my $token (@{$text{$t}}) {
204 my $pathname = $token->pathname;
205 $pathname =~ s/^$directory_re//os;
206 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
207 if defined $pathname && defined $token->line_number;
208 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
210 printf OUTPUT "#, c-format\n" if $cformat_p;
211 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
212 TmplTokenizer::string_canon
213 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
214 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
215 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
219 ###############################################################################
221 sub convert_translation_file () {
222 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
223 VerboseWarnings::set_input_file_name $convert_from;
226 my($msgid, $msgstr) = split(/\t/);
227 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
228 unless defined $msgstr;
230 # Fixup some of the bad strings
231 $msgid =~ s/^SELECTED>//;
234 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
235 remember( $token, $msgid );
236 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
237 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
239 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
240 my $candidate = TmplTokenizer::charset_canon $2;
241 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
242 if defined $charset_in && $charset_in ne $candidate;
243 $charset_in = $candidate;
245 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
246 my $candidate = TmplTokenizer::charset_canon $2;
247 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
248 if defined $charset_out && $charset_out ne $candidate;
249 $charset_out = $candidate;
252 # The following assumption is correct; that's what HTML::Template assumes
253 if (!defined $charset_in) {
254 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
255 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
259 ###############################################################################
263 my $h = $exitcode? *STDERR: *STDOUT;
266 Extract translatable strings from given HTML::Template input files.
269 -f, --files-from=FILE Get list of input files from FILE
270 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
272 Output file location:
273 -o, --output=FILE Write output to specified file
275 HTML::Template options:
276 -a, --extract-all Extract all strings
277 --pedantic-warnings Issue warnings even for detected problems
278 which are likely to be harmless
281 -s, --sort-output generate sorted output
282 -F, --sort-by-file sort output by file location
283 -v, --verbose explain what is being done
286 --help Display this help and exit
288 Try `perldoc $0' for perhaps more information.
293 ###############################################################################
295 sub usage_error (;$) {
296 print STDERR "$_[0]\n" if @_;
297 print STDERR "Try `$0 --help' for more information.\n";
301 ###############################################################################
303 Getopt::Long::config qw( bundling no_auto_abbrev );
305 'a|extract-all' => \$extract_all_p,
306 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
307 'convert-from=s' => \$convert_from,
308 'D|directory=s' => \$directory,
309 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
310 'f|files-from=s' => \$files_from,
311 'I|input-charset=s' => \$charset_in, # INTERNAL
312 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
313 'O|output-charset=s' => \$charset_out, # INTERNAL
314 'output|o=s' => \$output,
315 'po-mode' => \$po_mode_p, # INTERNAL
316 's|sort-output' => sub { $sort = 's' },
317 'F|sort-by-file' => sub { $sort = 'F' },
318 'v|verbose' => \$verbose_p,
319 'help' => sub { usage(0) },
322 VerboseWarnings::set_application_name $0;
323 VerboseWarnings::set_pedantic_mode $pedantic_p;
325 usage_error('Missing mandatory option -f')
326 unless defined $files_from || defined $convert_from;
327 $directory = '.' unless defined $directory;
329 usage_error('You cannot specify both --convert-from and --files-from')
330 if defined $convert_from && defined $files_from;
332 if (defined $output && $output ne '-') {
333 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
334 open(OUTPUT, ">$output") || die "$output: $!\n";
336 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
337 open(OUTPUT, ">&STDOUT");
340 if (defined $files_from) {
341 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
342 open(INPUT, "<$files_from") || die "$files_from: $!\n";
345 my $input = /^\//? $_: "$directory/$_";
346 my $h = TmplTokenizer->new( $input );
347 $h->set_allow_cformat( 1 );
348 VerboseWarnings::set_input_file_name $input;
349 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
354 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
355 convert_translation_file;
359 warn "This input will not work with Mozilla standards-compliant mode\n", undef
360 if TmplTokenizer::syntaxerror_p;
363 exit(-1) if TmplTokenizer::fatal_p;
365 ###############################################################################
369 This is an experimental script based on the modularized
370 text-extract2.pl script. It has behaviour similar to
371 xgettext(1), and generates gettext-compatible output files.
373 A gettext-like format provides the following advantages:
380 Translation to non-English-like languages with different word
381 order: gettext's c-format strings can theoretically be
382 emulated if we are able to do some analysis on the .tmpl input
383 and treat <TMPL_VAR> in a way similar to %s.
387 Context for the extracted strings: the gettext format provides
388 the filenames and line numbers where each string can be found.
389 The translator can read the source file and see the context,
390 in case the string by itself can mean several different things.
394 Place for the translator to add comments about the translations.
398 Gettext-compatible tools, if any, might be usable if we adopt
403 Note that this script is experimental and should still be
406 Please refer to the explanation in tmpl_process3 for further
409 If you want to generate GNOME-style POTFILES.in files, such
410 files (passed to -f) can be generated thus:
412 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
413 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
414 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
415 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
417 This is, however, quite pointless, because the "create" and
418 "update" actions have already been implemented in tmpl_process3.pl.
429 There probably are some. Bugs related to scanning of <INPUT>
430 tags seem to be especially likely to be present.
432 Its diagnostics are probably too verbose.
434 When a <TMPL_VAR> within a JavaScript-related attribute is
435 detected, the script currently displays no warnings at all.
436 It might be good to display some kind of warning.
438 Its sort order (-s option) seems to be different than the real
439 xgettext(1)'s sort option. This will result in translation
440 strings inside the generated PO file spuriously moving about
441 when tmpl_process3.pl calls msgmerge(1) to update the PO file.