5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
15 use vars qw( $files_from $directory $output $sort );
16 use vars qw( $pedantic_p );
19 ###############################################################################
# Record one occurrence of a translatable string.  %text maps each string
# to an array ref holding every token in which that string was found.
22 my($token, $string) = @_;
# Start an empty occurrence list the first time this string is seen.
23 $text{$string} = [] unless defined $text{$string};
24 push @{$text{$string}}, $token;
27 ###############################################################################
31 # The real gettext tools seem to sort case-sensitively; I don't know why
# Mode 's': plain string comparison of the extracted strings themselves.
32 @t = sort { $a cmp $b } @t if $sort eq 's';
# The statements below compare two extracted strings ($a and $b) by the
# earliest place each one occurs: the tokens recorded for each string are
# ordered by pathname, then by line number, and the first token of each
# ordered list decides the result.
# NOTE(review): presumably this is the comparator used when $sort is 'F'
# (sort by file location); the enclosing sort call is not visible here.
34 my @aa = sort { $a->pathname cmp $b->pathname
35 || $a->line_number <=> $b->line_number } @{$text{$a}};
36 my @bb = sort { $a->pathname cmp $b->pathname
37 || $a->line_number <=> $b->line_number } @{$text{$b}};
38 $aa[0]->pathname cmp $bb[0]->pathname
39 || $aa[0]->line_number <=> $bb[0]->line_number;
44 ###############################################################################
# Walk the token stream of one template and remember() each translatable
# string found in it.  Tokens are fetched with TmplTokenizer::next_token
# until it returns undef.  Three kinds of token contribute strings:
#   - TEXT: the token's string, if it contains any non-whitespace;
#   - TEXT_PARAMETRIZED: the token's form, under the same condition;
#   - TAG with a non-empty attribute hash: selected attribute values.
46 sub text_extract (*) {
49 my $s = TmplTokenizer::next_token $h;
50 last unless defined $s;
51 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
52 if ($kind eq TmplTokenType::TEXT) {
53 #$t = TmplTokenizer::trim $t;
54 remember( $s, $t ) if $t =~ /\S/s;
55 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
56 #$t = TmplTokenizer::trim $t;
57 remember( $s, $s->form ) if $s->form =~ /\S/s;
58 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
59 # value [tag=input], meta
# NOTE(review): "my ... if ..." declares a lexical under a statement
# modifier; perlsyn documents the outcome as undefined when the condition
# is false.  Declaring $tag first and assigning it conditionally avoids that.
60 my $tag = lc($1) if $t =~ /^<(\S+)/s;
# Only these four attributes are candidates for translation.
61 for my $a ('alt', 'content', 'title', 'value') {
# "content" is considered only on <meta> tags.
63 next if $a eq 'content' && $tag ne 'meta';
# "value" is considered only on <input> tags, and not when the tag has a
# "type" attribute whose second field is "hidden" or "radio".
64 next if $a eq 'value' && ($tag ne 'input'
65 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
# Judging by the variable names, each attribute entry unpacks to
# (key, value, original value, order); only the value is used here, and it
# is trimmed and remembered when it contains any non-whitespace.
66 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
67 $val = TmplTokenizer::trim $val;
68 remember( $s, $val ) if $val =~ /\S/s;
75 ###############################################################################
# Plain-list output mode: print each extracted string on its own line to
# the OUTPUT filehandle, in the order returned by string_list.  Takes no
# arguments.
77 sub generate_strings_list () {
78 # Emit all extracted strings.
79 # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
80 for my $t (string_list) {
# A string is skipped when TmplTokenizer::blank_p reports it as blank or
# when it consists solely of digits.
81 printf OUTPUT "%s\n", $t
82 unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
86 ###############################################################################
# Write every extracted string to OUTPUT as a PO template entry.  For each
# string kept, this prints one "#: file:line" reference per occurrence, a
# "#, c-format" flag when any occurrence is a TEXT_PARAMETRIZED token, the
# quoted msgid, and an empty msgstr.  Strings that TmplTokenizer::blank_p
# reports as blank, and digit-only strings, are skipped.
# NOTE(review): the POT-Creation-Date header line below is a fixed literal
# timestamp, not the time of the run.
88 sub generate_po_file () {
89 # We don't emit the Plural-Forms header; it's meaningless for us
91 # SOME DESCRIPTIVE TITLE.
92 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
93 # This file is distributed under the same license as the PACKAGE package.
94 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
99 "Project-Id-Version: PACKAGE VERSION\\n"
100 "POT-Creation-Date: 2004-02-05 20:55-0500\\n"
101 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
102 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
103 "Language-Team: LANGUAGE <LL\@li.org>\\n"
104 "MIME-Version: 1.0\\n"
105 "Content-Type: text/plain; charset=CHARSET\\n"
106 "Content-Transfer-Encoding: 8bit\\n"
109 my $directory_re = quotemeta("$directory/");
110 for my $t (string_list) {
111 next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
113 for my $token (@{$text{$t}}) {
114 my $pathname = $token->pathname;
# Report paths relative to $directory by stripping a leading "$directory/".
115 $pathname =~ s/^$directory_re//os;
116 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number;
# NOTE(review): token types are compared numerically (==) here, whereas
# text_extract compares them with eq; confirm both forms agree for
# TmplTokenType values.
117 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
119 printf OUTPUT "#, c-format\n" if $cformat_p;
120 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t );
121 printf OUTPUT "msgstr \"\"\n\n";
125 ###############################################################################
# Select the handle for the help text: STDERR when $exitcode is true (an
# error exit), STDOUT otherwise.
129 my $h = $exitcode? *STDERR: *STDOUT;
132 Extract translatable strings from given HTML::Template input files.
135 -f, --files-from=FILE Get list of input files from FILE
136 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
138 Output file location:
139 -o, --output=FILE Write output to specified file
141 HTML::Template options:
142 --pedantic-warnings Issue warnings even for detected problems
143 which are likely to be harmless
146 -s, --sort-output generate sorted output
147 -F, --sort-by-file sort output by file location
150 --help Display this help and exit
155 ###############################################################################
# Report a command-line usage problem on STDERR: the caller's message, if
# one was passed, followed by a hint to run the program with --help.
# The prototype allows at most one, optional, scalar argument.
157 sub usage_error (;$) {
158 print STDERR "$_[0]\n" if @_;
159 print STDERR "Try `$0 --help' for more information.\n";
163 ###############################################################################
# Main program.  Option parsing is configured so that single-letter options
# may be bundled and long options may not be abbreviated.
165 Getopt::Long::config qw( bundling no_auto_abbrev );
167 'D|directory=s' => \$directory,
168 'f|files-from=s' => \$files_from,
169 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
170 'output|o=s' => \$output,
# -s and -F both record a one-letter sort mode in $sort.
171 's|sort-output' => sub { $sort = 's' },
172 'F|sort-by-file' => sub { $sort = 'F' },
173 'help' => sub { usage(0) },
176 VerboseWarnings::set_application_name $0;
177 VerboseWarnings::set_pedantic_mode $pedantic_p;
# -f is required; the search directory defaults to the current directory.
179 usage_error('Missing mandatory option -f') unless defined $files_from;
180 $directory = '.' unless defined $directory;
# Write to the named file unless no output was given or it is "-"; the
# other open makes OUTPUT a duplicate of STDOUT.
# NOTE(review): these are two-argument opens with the file name
# interpolated into the mode string, so a name carrying mode characters or
# surrounding whitespace is interpreted by open; the three-argument form
# avoids this.  The dup of STDOUT is also not checked for failure.
182 if (defined $output && $output ne '-') {
183 open(OUTPUT, ">$output") || die "$output: $!\n";
185 open(OUTPUT, ">&STDOUT");
# NOTE(review): same two-argument form for the list of input files.
188 open(INPUT, "<$files_from") || die "$files_from: $!\n";
# Each template is located relative to $directory.  Presumably $_ holds one
# file name read from INPUT; the reading loop is not visible here.
191 my $h = TmplTokenizer->new( "$directory/$_" );
192 $h->set_allow_cformat( 1 );
193 VerboseWarnings::set_input_file_name "$directory/$_";
# NOTE(review): the trailing undef in warn's argument list would draw an
# "uninitialized value" warning when warnings are enabled; its purpose is
# unclear.
199 warn "This input will not work with Mozilla standards-compliant mode\n", undef
200 if TmplTokenizer::syntaxerror_p;
# Exit with a failure status if the tokenizer reported a fatal error.
203 exit(-1) if TmplTokenizer::fatal_p;
205 ###############################################################################
209 This is an experimental script based on the modularized
210 text-extract2.pl script. It has behaviour similar to
211 xgettext(1), and generates gettext-compatible output files.
213 A gettext-like format provides the following advantages:
220 Translation to non-English-like languages with different word
221 order: gettext's c-format strings can theoretically be
222 emulated if we are able to do some analysis on the .tmpl input
223 and treat <TMPL_VAR> in a way similar to %s.
227 Context for the extracted strings: the gettext format provides
228 the filenames and line numbers where each string can be found.
229 The translator can read the source file and see the context,
230 in case the string by itself can mean several different things.
234 Place for the translator to add comments about the translations.
238 Gettext-compatible tools, if any, might be usable if we adopt
243 Right now it does about the same thing as text-extract2.pl but
244 generates gettext-style output; however, because it is scanner-based
245 rather than parser-based, it is able to address the 4 weaknesses
246 listed in translator_doc.txt. Ultimately, the goal is to make
247 this able to do some kind of simple analysis on the input to
248 produce gettext-style output with c-format strings, in order to
249 facilitate translation to languages with a different word order
252 When the above is finished, the generated po file may contain
253 some HTML tags in addition to %s strings.
255 If you want to generate GNOME-style POTFILES.in files, such
256 files (passed to -f) can be generated thus:
258 (cd ../.. && find koha-tmpl/opac-tmpl/default/en
259 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
260 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
261 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
263 This is, however, quite pointless, because the "create" and
264 "update" actions have already been implemented in tmpl_process3.pl.
275 There probably are some. Bugs related to scanning of <INPUT>
276 tags seem especially likely.
278 Its diagnostics are probably too verbose.
280 When a <TMPL_VAR> within a JavaScript-related attribute is
281 detected, the script currently displays no warnings at all.
282 It might be good to display some kind of warning.
284 Its sort order (-s option) seems to differ from the sort order of the real
285 xgettext(1). This will result in translation
286 strings inside the generated PO file spuriously moving about
287 when tmpl_process3.pl calls msgmerge(1) to update the PO file.