5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
16 use vars qw( $convert_from );
17 use vars qw( $files_from $directory $output $sort );
18 use vars qw( $extract_all_p );
19 use vars qw( $pedantic_p );
20 use vars qw( %text %translation );
21 use vars qw( $charset_in $charset_out );
22 use vars qw( $verbose_p );
23 use vars qw( $po_mode_p );
25 ###############################################################################
27 sub string_negligible_p ($) {
28 my($t) = @_; # a string
29 # Don't emit pure whitespace, pure numbers, pure punctuation,
30 # single letters, or TMPL_VAR's.
31 # Punctuation should arguably be translated. But without context
32 # they are untranslatable. Note that $t is a string, not a token object.
33 return !$extract_all_p && (
34 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
35 || $t =~ /^\d+$/ # purely digits
36 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
37 || $t =~ /^[A-Za-z]$/ # single letters
41 sub token_negligible_p( $ ) {
44 return !$extract_all_p && (
45 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
46 $t == TmplTokenType::DIRECTIVE? 1:
47 $t == TmplTokenType::TEXT_PARAMETRIZED
48 && join( '', map { my $t = $_->type;
49 $t == TmplTokenType::DIRECTIVE?
50 '1': $t == TmplTokenType::TAG?
51 '': token_negligible_p( $_ )?
52 '': '1' } @{$x->children} ) eq '' );
55 ###############################################################################
58 my($token, $string) = @_;
59 # If we determine that the string is negligible, don't bother to remember
60 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
61 my $key = TmplTokenizer::string_canon( $string );
62 $text{$key} = [] unless defined $text{$key};
63 push @{$text{$key}}, $token;
67 ###############################################################################
71 # The real gettext tools seems to sort case sensitively; I don't know why
72 @t = sort { $a cmp $b } @t if $sort eq 's';
74 my @aa = sort { $a->pathname cmp $b->pathname
75 || $a->line_number <=> $b->line_number } @{$text{$a}};
76 my @bb = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$b}};
78 $aa[0]->pathname cmp $bb[0]->pathname
79 || $aa[0]->line_number <=> $bb[0]->line_number;
84 ###############################################################################
86 sub text_extract (*) {
89 my $s = TmplTokenizer::next_token $h;
90 last unless defined $s;
91 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
92 if ($kind eq TmplTokenType::TEXT) {
93 remember( $s, $t ) if $t =~ /\S/s;
94 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
95 remember( $s, $s->form ) if $s->form =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
97 # value [tag=input], meta
98 my $tag = lc($1) if $t =~ /^<(\S+)/s;
99 for my $a ('alt', 'content', 'title', 'value') {
101 next if $a eq 'content' && $tag ne 'meta';
102 next if $a eq 'value' && ($tag ne 'input'
103 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
104 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
105 $val = TmplTokenizer::trim $val;
106 remember( $s, $val ) if $val =~ /\S/s;
113 ###############################################################################
115 sub generate_strings_list () {
116 # Emit all extracted strings.
117 for my $t (string_list) {
118 printf OUTPUT "%s\n", $t # unless negligible_p($t);
122 ###############################################################################
124 sub generate_po_file () {
125 # We don't emit the Plural-Forms header; it's meaningless for us
126 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
127 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
128 # Time stamps aren't exactly right semantically. I don't know how to fix it.
129 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
130 my $time_pot = $time;
131 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
133 # SOME DESCRIPTIVE TITLE.
134 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
135 # This file is distributed under the same license as the PACKAGE package.
136 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
141 "Project-Id-Version: PACKAGE VERSION\\n"
142 "POT-Creation-Date: $time_pot\\n"
143 "PO-Revision-Date: $time_po\\n"
144 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
145 "Language-Team: LANGUAGE <LL\@li.org>\\n"
146 "MIME-Version: 1.0\\n"
147 "Content-Type: text/plain; charset=$pot_charset\\n"
148 "Content-Transfer-Encoding: 8bit\\n"
151 my $directory_re = quotemeta("$directory/");
152 for my $t (string_list) {
153 #next if negligible_p($t);
155 for my $token (@{$text{$t}}) {
156 my $pathname = $token->pathname;
157 $pathname =~ s/^$directory_re//os;
158 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
159 if defined $pathname && defined $token->line_number;
160 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
162 printf OUTPUT "#, c-format\n" if $cformat_p;
163 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
164 TmplTokenizer::string_canon
165 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
166 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
167 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
171 ###############################################################################
173 sub convert_translation_file () {
174 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
175 VerboseWarnings::set_input_file_name $convert_from;
178 my($msgid, $msgstr) = split(/\t/);
179 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
180 unless defined $msgstr;
182 # Fixup some of the bad strings
183 $msgid =~ s/^SELECTED>//;
186 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
187 remember( $token, $msgid );
188 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
189 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
191 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
192 my $candidate = TmplTokenizer::charset_canon $2;
193 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
194 if defined $charset_in && $charset_in ne $candidate;
195 $charset_in = $candidate;
197 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
198 my $candidate = TmplTokenizer::charset_canon $2;
199 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
200 if defined $charset_out && $charset_out ne $candidate;
201 $charset_out = $candidate;
204 # The following assumption is correct; that's what HTML::Template assumes
205 if (!defined $charset_in) {
206 $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
207 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
211 ###############################################################################
215 my $h = $exitcode? *STDERR: *STDOUT;
218 Extract translatable strings from given HTML::Template input files.
221 -f, --files-from=FILE Get list of input files from FILE
222 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
224 Output file location:
225 -o, --output=FILE Write output to specified file
227 HTML::Template options:
228 -a, --extract-all Extract all strings
229 --pedantic-warnings Issue warnings even for detected problems
230 which are likely to be harmless
233 -s, --sort-output generate sorted output
234 -F, --sort-by-file sort output by file location
235 -v, --verbose explain what is being done
238 --help Display this help and exit
243 ###############################################################################
245 sub usage_error (;$) {
246 print STDERR "$_[0]\n" if @_;
247 print STDERR "Try `$0 --help' for more information.\n";
251 ###############################################################################
253 Getopt::Long::config qw( bundling no_auto_abbrev );
255 'a|extract-all' => \$extract_all_p,
256 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
257 'convert-from=s' => \$convert_from,
258 'D|directory=s' => \$directory,
259 'f|files-from=s' => \$files_from,
260 'I|input-charset=s' => \$charset_in, # INTERNAL
261 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
262 'O|output-charset=s' => \$charset_out, # INTERNAL
263 'output|o=s' => \$output,
264 'po-mode' => \$po_mode_p, # INTERNAL
265 's|sort-output' => sub { $sort = 's' },
266 'F|sort-by-file' => sub { $sort = 'F' },
267 'v|verbose' => \$verbose_p,
268 'help' => sub { usage(0) },
271 VerboseWarnings::set_application_name $0;
272 VerboseWarnings::set_pedantic_mode $pedantic_p;
274 usage_error('Missing mandatory option -f')
275 unless defined $files_from || defined $convert_from;
276 $directory = '.' unless defined $directory;
278 usage_error('You cannot specify both --convert-from and --files-from')
279 if defined $convert_from && defined $files_from;
281 if (defined $output && $output ne '-') {
282 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
283 open(OUTPUT, ">$output") || die "$output: $!\n";
285 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
286 open(OUTPUT, ">&STDOUT");
289 if (defined $files_from) {
290 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
291 open(INPUT, "<$files_from") || die "$files_from: $!\n";
294 my $h = TmplTokenizer->new( "$directory/$_" );
295 $h->set_allow_cformat( 1 );
296 VerboseWarnings::set_input_file_name "$directory/$_";
297 print STDERR "$0: Processing file \"$directory/$_\"\n" if $verbose_p;
302 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
303 convert_translation_file;
307 warn "This input will not work with Mozilla standards-compliant mode\n", undef
308 if TmplTokenizer::syntaxerror_p;
311 exit(-1) if TmplTokenizer::fatal_p;
313 ###############################################################################
317 This is an experimental script based on the modularized
318 text-extract2.pl script. It has behaviour similar to
319 xgettext(1), and generates gettext-compatible output files.
321 A gettext-like format provides the following advantages:
328 Translation to non-English-like languages with different word
329 order: gettext's c-format strings can theoretically be
330 emulated if we are able to do some analysis on the .tmpl input
331 and treat <TMPL_VAR> in a way similar to %s.
335 Context for the extracted strings: the gettext format provides
336 the filenames and line numbers where each string can be found.
337 The translator can read the source file and see the context,
338 in case the string by itself can mean several different things.
342 Place for the translator to add comments about the translations.
346 Gettext-compatible tools, if any, might be usable if we adopt
351 Note that this script is experimental and should still be
354 Please refer to the explanation in tmpl_process3 for further
357 If you want to generate GNOME-style POTFILES.in files, such
358 files (passed to -f) can be generated thus:
360 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
361 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
362 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
363 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
365 This is, however, quite pointless, because the "create" and
366 "update" actions have already been implemented in tmpl_process3.pl.
377 There probably are some. Bugs related to scanning of <INPUT>
378 tags seem to be especially likely to be present.
380 Its diagnostics are probably too verbose.
382 When a <TMPL_VAR> within a JavaScript-related attribute is
383 detected, the script currently displays no warnings at all.
384 It might be good to display some kind of warning.
386 Its sort order (-s option) seems to be different than the real
387 xgettext(1)'s sort option. This will result in translation
388 strings inside the generated PO file spuriously moving about
389 when tmpl_process3.pl calls msgmerge(1) to update the PO file.