5 xgettext.pl - xgettext(1)-like interface for .tmpl string extraction
15 use vars qw( $convert_from );
16 use vars qw( $files_from $directory $output $sort );
17 use vars qw( $extract_all_p );
18 use vars qw( $pedantic_p );
19 use vars qw( %text %translation );
20 use vars qw( $charset_in $charset_out );
22 ###############################################################################
# negligible_p($t): predicate deciding whether the string $t should be
# left out of the generated output.
#
# Always false when $extract_all_p is set (bound to -a/--extract-all).
# Otherwise true when any of these holds:
#   - TmplTokenizer::blank_p($t) is true,
#   - $t consists only of digits,
#   - $t consists only of characters from the punctuation set below,
#   - $t is a single ASCII letter.
#
# NOTE(review): the patterns anchor with `$`, which also matches before a
# trailing newline, so e.g. "123\n" counts as "purely digits" -- confirm
# this is intended.
# NOTE(review): the closing lines of this sub are not visible in this
# excerpt.
24 sub negligible_p ($) {
25 my($t) = @_; # a string
26 # Don't emit pure whitespace, pure numbers, pure punctuation,
27 # single letters, or TMPL_VAR's.
28 # Punctuation should arguably be translated. But without context
29 # they are untranslatable.
30 return !$extract_all_p && (
31 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
32 || $t =~ /^\d+$/ # purely digits
33 || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
34 || $t =~ /^[A-Za-z]$/ # single letters
38 ###############################################################################
# Presumably the body of remember($token, $string); the sub header is not
# visible in this excerpt -- the name is inferred from the call sites.
#
# Records that $token is one occurrence of the string $string: %text maps
# each string to an array reference of the tokens where it was seen. The
# array is created the first time a string is encountered.
41 my($token, $string) = @_;
42 $text{$string} = [] unless defined $text{$string};
43 push @{$text{$string}}, $token;
46 ###############################################################################
# Presumably part of string_list(); the sub header and the line that
# initialises @t are not visible in this excerpt.
#
# With $sort eq 's' the strings in @t are ordered by a plain string
# comparison (cmp), i.e. case-sensitively.
# NOTE(review): $sort is only assigned by the -s / -F option handlers, so
# this comparison sees undef when neither option is given -- confirm that
# is acceptable under warnings.
50 # The real gettext tools seem to sort case sensitively; I don't know why
51 @t = sort { $a cmp $b } @t if $sort eq 's';
# The following lines form a comparison of two strings by location
# (the enclosing statement is not visible here). For each of the two
# strings, its tokens are sorted by pathname and then by line number, and
# the first token of each sorted list -- the earliest occurrence -- is
# compared the same way.
# In `@{$text{$a}}` / `@{$text{$b}}` the $a and $b are evaluated as the
# list argument of the inner sort, outside the inner comparison block.
53 my @aa = sort { $a->pathname cmp $b->pathname
54 || $a->line_number <=> $b->line_number } @{$text{$a}};
55 my @bb = sort { $a->pathname cmp $b->pathname
56 || $a->line_number <=> $b->line_number } @{$text{$b}};
57 $aa[0]->pathname cmp $bb[0]->pathname
58 || $aa[0]->line_number <=> $bb[0]->line_number;
63 ###############################################################################
# text_extract(HANDLE): pulls tokens from the tokenizer handle $h and
# records the translatable strings found via remember().
#
# For each token returned by TmplTokenizer::next_token:
#   - TEXT tokens are remembered under their string when it contains at
#     least one non-whitespace character;
#   - TEXT_PARAMETRIZED tokens are remembered under $s->form, with the
#     same non-whitespace condition;
#   - TAG tokens with a non-empty attribute hash have their alt, content,
#     title and value attributes examined (see the filters below).
# Processing stops when next_token returns undef.
#
# NOTE(review): the lines that assign $h, open the token loop, and close
# the blocks are not visible in this excerpt.
65 sub text_extract (*) {
68 my $s = TmplTokenizer::next_token $h;
69 last unless defined $s;
70 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
71 if ($kind eq TmplTokenType::TEXT) {
72 #$t = TmplTokenizer::trim $t;
73 remember( $s, $t ) if $t =~ /\S/s;
74 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
75 #$t = TmplTokenizer::trim $t;
76 remember( $s, $s->form ) if $s->form =~ /\S/s;
77 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
78 # value [tag=input], meta
# The tag name is whatever follows the opening '<' up to the first
# whitespace, lower-cased.
# NOTE(review): `my ... if ...` is documented in perlsyn as having
# undefined behaviour when the condition is false; $tag may then be undef
# (or stale) in the `ne` comparisons below.
79 my $tag = lc($1) if $t =~ /^<(\S+)/s;
80 for my $a ('alt', 'content', 'title', 'value') {
# 'content' is only considered on <meta>; 'value' is only considered on
# <input>, and not when the type attribute matches hidden or radio.
82 next if $a eq 'content' && $tag ne 'meta';
83 next if $a eq 'value' && ($tag ne 'input'
84 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
# Each attribute entry is unpacked as a four-element list; only $val is
# used here. It is trimmed and remembered when not blank.
85 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
86 $val = TmplTokenizer::trim $val;
87 remember( $s, $val ) if $val =~ /\S/s;
94 ###############################################################################
# generate_strings_list(): writes the strings returned by string_list to
# the OUTPUT filehandle, one per line, skipping every string for which
# negligible_p() is true. Takes no arguments.
96 sub generate_strings_list () {
97 # Emit all extracted strings.
98 for my $t (string_list) {
99 printf OUTPUT "%s\n", $t unless negligible_p($t);
103 ###############################################################################
# generate_po_file(): writes a gettext PO template to the OUTPUT
# filehandle. Takes no arguments.
#
# Output consists of a fixed header (title/copyright placeholders and
# the header entry fields shown below) followed by one entry per string
# from string_list that is not negligible_p():
#   - a "#: path:line" reference for each token recorded for the string,
#     with a leading "$directory/" stripped from the path;
#   - "#, c-format" when $cformat_p has been set for the entry;
#   - the msgid, charset-converted from $charset_in to $charset_out and
#     quoted with TmplTokenizer::quote_po;
#   - the msgstr taken from %translation when defined, else "".
#
# The charset in the Content-Type header is $charset_out when defined,
# otherwise the placeholder 'CHARSET', passed through
# TmplTokenizer::charset_canon.
#
# NOTE(review): POT-Creation-Date below is a hard-coded literal
# (2004-02-05), not the time of the run.
# NOTE(review): the statement that opens the header text, its terminator,
# the initialisation of $cformat_p and the closing braces are not visible
# in this excerpt.
105 sub generate_po_file () {
106 # We don't emit the Plural-Forms header; it's meaningless for us
107 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
108 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
110 # SOME DESCRIPTIVE TITLE.
111 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
112 # This file is distributed under the same license as the PACKAGE package.
113 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
118 "Project-Id-Version: PACKAGE VERSION\\n"
119 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
120 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
121 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
122 "Language-Team: LANGUAGE <LL\@li.org>\\n"
123 "MIME-Version: 1.0\\n"
124 "Content-Type: text/plain; charset=$pot_charset\\n"
125 "Content-Transfer-Encoding: 8bit\\n"
128 my $directory_re = quotemeta("$directory/");
129 for my $t (string_list) {
130 next if negligible_p($t);
132 for my $token (@{$text{$t}}) {
133 my $pathname = $token->pathname;
# NOTE(review): the substitution runs before the `defined $pathname`
# test two lines down; an undef pathname would be substituted on first.
# The /o flag compiles the interpolated pattern only once.
134 $pathname =~ s/^$directory_re//os;
135 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
136 if defined $pathname && defined $token->line_number;
# NOTE(review): the token type is compared with numeric `==` here, while
# text_extract compares the same constants with `eq` -- confirm which
# comparison the TmplTokenType values require.
137 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
139 printf OUTPUT "#, c-format\n" if $cformat_p;
# Only the msgid goes through charset_convert; the msgstr below is quoted
# as stored in %translation.
140 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
141 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
142 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
143 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
147 ###############################################################################
# convert_translation_file(): reads the file named by $convert_from
# (bound to --convert-from) and loads its contents into %text and
# %translation. Takes no arguments; dies if the file cannot be opened.
#
# Each record is split on tabs into a msgid and a msgstr; a record with
# no tab is fatal. For every record:
#   - a leading "SELECTED>" is stripped from the msgid;
#   - a token of type TmplTokenType::UNKNOWN is created for the msgid and
#     passed to remember();
#   - a leading "LIMIT;" or "LIMITED;" is stripped from the msgstr;
#   - the msgstr is stored in %translation unless it is exactly '*****';
#   - a "charset=..." found in the msgid sets $charset_in, and one found
#     in the msgstr sets $charset_out; a value that disagrees with one
#     already set is fatal.
# If $charset_in is still undefined afterwards, both charsets default to
# the canonical form of 'iso8859-1' and a warning is printed.
#
# NOTE(review): the loop header that reads INPUT and the closing braces
# are not visible in this excerpt.
# NOTE(review): the file is opened with the two-argument form of open on
# a bareword handle; the mode is taken from the interpolated string.
149 sub convert_translation_file () {
150 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
151 VerboseWarnings::set_input_file_name $convert_from;
154 my($msgid, $msgstr) = split(/\t/);
155 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
156 unless defined $msgstr;
158 # Fixup some of the bad strings
159 $msgid =~ s/^SELECTED>//;
162 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
163 remember( $token, $msgid );
164 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
165 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
167 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
168 my $candidate = TmplTokenizer::charset_canon $2;
169 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
170 if defined $charset_in && $charset_in ne $candidate;
171 $charset_in = $candidate;
# NOTE(review): this branch inspects $msgstr and $charset_out, yet the
# error text below still says "msgid" -- it looks copied from the branch
# above and probably should say "msgstr".
173 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
174 my $candidate = TmplTokenizer::charset_canon $2;
175 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
176 if defined $charset_out && $charset_out ne $candidate;
177 $charset_out = $candidate;
180 # The following assumption is correct; that's what HTML::Template assumes
181 if (!defined $charset_in) {
182 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
183 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
187 ###############################################################################
# Presumably the body of usage($exitcode); the sub header, the statement
# that prints the help text and the final exit are not visible in this
# excerpt -- the name is inferred from the `usage(0)` call in the option
# table.
#
# The help text goes to STDERR when $exitcode is true (non-zero) and to
# STDOUT otherwise. The lines after the handle selection are the help
# text itself and are emitted verbatim.
191 my $h = $exitcode? *STDERR: *STDOUT;
194 Extract translatable strings from given HTML::Template input files.
197 -f, --files-from=FILE Get list of input files from FILE
198 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
200 Output file location:
201 -o, --output=FILE Write output to specified file
203 HTML::Template options:
204 -a, --extract-all Extract all strings
205 --pedantic-warnings Issue warnings even for detected problems
206 which are likely to be harmless
209 -s, --sort-output generate sorted output
210 -F, --sort-by-file sort output by file location
213 --help Display this help and exit
218 ###############################################################################
# usage_error([$message]): reports a command-line usage problem on
# STDERR. When an argument is given it is printed first, followed by a
# hint to run the program (named via $0) with --help.
# NOTE(review): the remaining lines of this sub (presumably an exit) are
# not visible in this excerpt.
220 sub usage_error (;$) {
221 print STDERR "$_[0]\n" if @_;
222 print STDERR "Try `$0 --help' for more information.\n";
226 ###############################################################################
# Main program: parse the command line, validate the option combination,
# open the output, then either tokenize the listed template files or
# convert an existing translation file.
# NOTE(review): several lines of this section (the GetOptions call
# itself, the loop over the file list, else-branches and closers, and the
# output generation step) are not visible in this excerpt.
#
# Option parsing uses bundling and disables automatic abbreviation. The
# entries below bind each option to its global variable or handler;
# --charset sets both the input and the output charset at once.
228 Getopt::Long::config qw( bundling no_auto_abbrev );
230 'a|extract-all' => \$extract_all_p,
231 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
232 'convert-from=s' => \$convert_from,
233 'D|directory=s' => \$directory,
234 'f|files-from=s' => \$files_from,
235 'I|input-charset=s' => \$charset_in, # INTERNAL
236 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
237 'O|output-charset=s' => \$charset_out, # INTERNAL
238 'output|o=s' => \$output,
239 's|sort-output' => sub { $sort = 's' },
240 'F|sort-by-file' => sub { $sort = 'F' },
241 'help' => sub { usage(0) },
244 VerboseWarnings::set_application_name $0;
245 VerboseWarnings::set_pedantic_mode $pedantic_p;
# Exactly one of --files-from and --convert-from must be given; the
# search directory defaults to the current directory.
247 usage_error('Missing mandatory option -f')
248 unless defined $files_from || defined $convert_from;
249 $directory = '.' unless defined $directory;
251 usage_error('You cannot specify both --convert-from and --files-from')
252 if defined $convert_from && defined $files_from;
# Output goes to the named file, or to a duplicate of STDOUT when no
# output file is given or it is '-'.
# NOTE(review): the result of the STDOUT dup is not checked, and both
# opens use the two-argument form with an interpolated file name.
254 if (defined $output && $output ne '-') {
255 open(OUTPUT, ">$output") || die "$output: $!\n";
257 open(OUTPUT, ">&STDOUT");
# --files-from mode: each name read from the list file is resolved
# relative to $directory and tokenized with c-format detection enabled.
260 if (defined $files_from) {
261 open(INPUT, "<$files_from") || die "$files_from: $!\n";
264 my $h = TmplTokenizer->new( "$directory/$_" );
265 $h->set_allow_cformat( 1 );
266 VerboseWarnings::set_input_file_name "$directory/$_";
271 convert_translation_file;
# NOTE(review): a literal `undef` is passed as a second argument to warn
# below; it looks unintended.
275 warn "This input will not work with Mozilla standards-compliant mode\n", undef
276 if TmplTokenizer::syntaxerror_p;
279 exit(-1) if TmplTokenizer::fatal_p;
281 ###############################################################################
285 This is an experimental script based on the modularized
286 text-extract2.pl script. It has behaviour similar to
287 xgettext(1), and generates gettext-compatible output files.
289 A gettext-like format provides the following advantages:
296 Translation to non-English-like languages with different word
297 order: gettext's c-format strings can theoretically be
298 emulated if we are able to do some analysis on the .tmpl input
299 and treat <TMPL_VAR> in a way similar to %s.
303 Context for the extracted strings: the gettext format provides
304 the filenames and line numbers where each string can be found.
305 The translator can read the source file and see the context,
306 in case the string by itself can mean several different things.
310 Place for the translator to add comments about the translations.
314 Gettext-compatible tools, if any, might be usable if we adopt
319 Right now it does about the same thing as text-extract2.pl but
320 generates gettext-style output; however, because it is scanner-
321 instead of parser-based, it is able to address the 4 weaknesses
322 listed in translator_doc.txt. Ultimately, the goal is to make
323 this able to do some kind of simple analysis on the input to
324 produce gettext-style output with c-format strings, in order to
325 facilitate translation to languages with a different word order
328 When the above is finished, the generated po file may contain
329 some HTML tags in addition to %s strings.
331 If you want to generate GNOME-style POTFILES.in files, such
332 files (passed to -f) can be generated thus:
334 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
335 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
336 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
337 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
339 This is, however, quite pointless, because the "create" and
340 "update" actions have already been implemented in tmpl_process3.pl.
351 There probably are some. Bugs related to scanning of <INPUT>
352 tags seem to be especially likely to be present.
354 Its diagnostics are probably too verbose.
356 When a <TMPL_VAR> within a JavaScript-related attribute is
357 detected, the script currently displays no warnings at all.
358 It might be good to display some kind of warning.
360 Its sort order (-s option) seems to differ from the order produced by
361 the real xgettext(1)'s sort option. This will result in translation
362 strings inside the generated PO file spuriously moving about
363 when tmpl_process3.pl calls msgmerge(1) to update the PO file.