=cut
use strict;
+use warnings;
use Getopt::Long;
+use POSIX;
use Locale::PO;
use TmplTokenizer;
use VerboseWarnings;
+use vars qw( $convert_from );
use vars qw( $files_from $directory $output $sort );
+use vars qw( $extract_all_p );
use vars qw( $pedantic_p );
-use vars qw( %text );
+use vars qw( %text %translation );
+use vars qw( $charset_in $charset_out );
+use vars qw( $disable_fuzzy_p );
+use vars qw( $verbose_p );
+use vars qw( $po_mode_p );
+
+###############################################################################
+
+sub string_negligible_p ($) {
+ my($t) = @_; # a string
+ # Don't emit pure whitespace, pure numbers, pure punctuation,
+ # single letters, or TMPL_VAR's.
+ # Punctuation should arguably be translated. But without context
+ # they are untranslatable. Note that $t is a string, not a token object.
+ return !$extract_all_p && (
+ TmplTokenizer::blank_p($t) # blank or TMPL_VAR
+ || $t =~ /^\d+$/ # purely digits
+ || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
+ || $t =~ /^[A-Za-z]$/ # single letters
+ )
+}
+
+sub token_negligible_p( $ ) {
+ my($x) = @_;
+ my $t = $x->type;
+ return !$extract_all_p && (
+ $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
+ $t == TmplTokenType::DIRECTIVE? 1:
+ $t == TmplTokenType::TEXT_PARAMETRIZED
+ && join( '', map { my $t = $_->type;
+ $t == TmplTokenType::DIRECTIVE?
+ '1': $t == TmplTokenType::TAG?
+ '': token_negligible_p( $_ )?
+ '': '1' } @{$x->children} ) eq '' );
+}
###############################################################################
sub remember ($$) {
my($token, $string) = @_;
- $text{$string} = [] unless defined $text{$string};
- push @{$text{$string}}, $token;
+ # If we determine that the string is negligible, don't bother to remember
+ unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
+ my $key = TmplTokenizer::string_canon( $string );
+ $text{$key} = [] unless defined $text{$key};
+ push @{$text{$key}}, $token;
+ }
}
###############################################################################
last unless defined $s;
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
if ($kind eq TmplTokenType::TEXT) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $t ) if $t =~ /\S/s;
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $s->form ) if $s->form =~ /\S/s;
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
# value [tag=input], meta
if ($attr->{$a}) {
next if $a eq 'content' && $tag ne 'meta';
next if $a eq 'value' && ($tag ne 'input'
- || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
+ || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|submit)$/)); # FIXME
my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
$val = TmplTokenizer::trim $val;
remember( $s, $val ) if $val =~ /\S/s;
}
}
+ } elsif ($s->has_js_data) {
+ for my $t (@{$s->js_data}) {
+ remember( $s, $t->[3] ) if $t->[0]; # FIXME
+ }
}
}
}
sub generate_strings_list () {
# Emit all extracted strings.
- # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
for my $t (string_list) {
- printf OUTPUT "%s\n", $t
- unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
+ printf OUTPUT "%s\n", $t;
}
}
sub generate_po_file () {
# We don't emit the Plural-Forms header; it's meaningless for us
+ my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
+ $pot_charset = TmplTokenizer::charset_canon $pot_charset;
+ # Time stamps aren't exactly right semantically. I don't know how to fix it.
+ my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
+ my $time_pot = $time;
+ my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
print OUTPUT <<EOF;
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
# This file is distributed under the same license as the PACKAGE package.
# FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
#
+EOF
+ print OUTPUT <<EOF unless $disable_fuzzy_p;
#, fuzzy
+EOF
+ print OUTPUT <<EOF;
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
-"POT-Creation-Date: 2004-02-05 20:55-0500\\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+"POT-Creation-Date: $time_pot\\n"
+"PO-Revision-Date: $time_po\\n"
"Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL\@li.org>\\n"
"MIME-Version: 1.0\\n"
-"Content-Type: text/plain; charset=CHARSET\\n"
+"Content-Type: text/plain; charset=$pot_charset\\n"
"Content-Transfer-Encoding: 8bit\\n"
EOF
my $directory_re = quotemeta("$directory/");
for my $t (string_list) {
- next if TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
+ if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
+ my($token, $n) = ($text{$t}->[0], 0);
+ printf OUTPUT "#. For the first occurrence,\n"
+ if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
+ for my $param ($token->parameters_and_fields) {
+ $n += 1;
+ my $type = $param->type;
+ my $subtype = ($type == TmplTokenType::TAG
+ && $param->string =~ /^<input\b/is?
+ $param->attributes->{'type'}->[1]: undef);
+ my $fmt = TmplTokenizer::_formalize( $param );
+ $fmt =~ s/^%/%$n\$/;
+ if ($type == TmplTokenType::DIRECTIVE) {
+ $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
+ my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
+ $2: undef;
+ printf OUTPUT "#. %s: %s\n", $fmt,
+ "$type" . (defined $name? " name=$name": '');
+ } else {
+ my $name = $param->attributes->{'name'};
+ my $value = $param->attributes->{'value'}
+ unless $subtype =~ /^(?:text)$/;
+ printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
+ . (defined $name? " name=$name->[1]": '')
+ . (defined $value? " value=$value->[1]": '');
+ }
+ }
+ } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
+ my($token) = ($text{$t}->[0]);
+ printf OUTPUT "#. For the first occurrence,\n"
+ if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
+ if ($token->string =~ /^<meta\b/is) {
+ my $type = $token->attributes->{'http-equiv'}->[1];
+ print OUTPUT "#. META http-equiv=$type\n" if defined $type;
+ } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
+ my $tag = uc($1);
+ my $type = (lc($tag) eq 'input'?
+ $token->attributes->{'type'}: undef);
+ my $name = $token->attributes->{'name'};
+ printf OUTPUT "#. %s\n", $tag
+ . (defined $type? " type=$type->[1]": '')
+ . (defined $name? " name=$name->[1]": '');
+ }
+ } elsif ($text{$t}->[0]->has_js_data) {
+ printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
+ printf OUTPUT "#. SCRIPT\n";
+ }
my $cformat_p;
for my $token (@{$text{$t}}) {
my $pathname = $token->pathname;
$pathname =~ s/^$directory_re//os;
- printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number;
+ printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
+ if defined $pathname && defined $token->line_number;
$cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
}
printf OUTPUT "#, c-format\n" if $cformat_p;
- printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po( $t );
- printf OUTPUT "msgstr \"\"\n\n";
+ printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
+ TmplTokenizer::string_canon
+ TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
+ printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
+ TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
+ }
+}
+
+###############################################################################
+
+sub convert_translation_file () {
+ open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
+ VerboseWarnings::set_input_file_name $convert_from;
+ while (<INPUT>) {
+ chomp;
+ my($msgid, $msgstr) = split(/\t/);
+ die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
+ unless defined $msgstr;
+
+ # Fixup some of the bad strings
+ $msgid =~ s/^SELECTED>//;
+
+ # Create dummy token
+ my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
+ remember( $token, $msgid );
+ $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
+ $translation{$msgid} = $msgstr unless $msgstr eq '*****';
+
+ if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
+ my $candidate = TmplTokenizer::charset_canon $2;
+ die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
+ if defined $charset_in && $charset_in ne $candidate;
+ $charset_in = $candidate;
+ }
+ if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
+ my $candidate = TmplTokenizer::charset_canon $2;
+ die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
+ if defined $charset_out && $charset_out ne $candidate;
+ $charset_out = $candidate;
+ }
+ }
+ # The following assumption is correct; that's what HTML::Template assumes
+ if (!defined $charset_in) {
+ $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
+ warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
}
}
-o, --output=FILE Write output to specified file
HTML::Template options:
+ -a, --extract-all Extract all strings
--pedantic-warnings Issue warnings even for detected problems
- which are likely to be harmless
+ which are likely to be harmless
Output details:
-s, --sort-output generate sorted output
-F, --sort-by-file sort output by file location
+ -v, --verbose explain what is being done
Informative output:
--help Display this help and exit
+
+Try `perldoc $0' for perhaps more information.
EOF
exit($exitcode);
}
Getopt::Long::config qw( bundling no_auto_abbrev );
GetOptions(
+ 'a|extract-all' => \$extract_all_p,
+ 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
+ 'convert-from=s' => \$convert_from,
'D|directory=s' => \$directory,
+ 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
'f|files-from=s' => \$files_from,
+ 'I|input-charset=s' => \$charset_in, # INTERNAL
'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
+ 'O|output-charset=s' => \$charset_out, # INTERNAL
'output|o=s' => \$output,
+ 'po-mode' => \$po_mode_p, # INTERNAL
's|sort-output' => sub { $sort = 's' },
'F|sort-by-file' => sub { $sort = 'F' },
+ 'v|verbose' => \$verbose_p,
'help' => sub { usage(0) },
) || usage_error;
VerboseWarnings::set_application_name $0;
VerboseWarnings::set_pedantic_mode $pedantic_p;
-usage_error('Missing mandatory option -f') unless defined $files_from;
+usage_error('Missing mandatory option -f')
+ unless defined $files_from || defined $convert_from;
$directory = '.' unless defined $directory;
+usage_error('You cannot specify both --convert-from and --files-from')
+ if defined $convert_from && defined $files_from;
+
if (defined $output && $output ne '-') {
+ print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
open(OUTPUT, ">$output") || die "$output: $!\n";
} else {
+ print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
open(OUTPUT, ">&STDOUT");
}
-open(INPUT, "<$files_from") || die "$files_from: $!\n";
-while (<INPUT>) {
- chomp;
- my $h = TmplTokenizer->new( "$directory/$_" );
- $h->set_allow_cformat( 1 );
- VerboseWarnings::set_input_file_name "$directory/$_";
- text_extract( $h );
+if (defined $files_from) {
+ print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
+ open(INPUT, "<$files_from") || die "$files_from: $!\n";
+ while (<INPUT>) {
+ chomp;
+ my $input = /^\//? $_: "$directory/$_";
+ my $h = TmplTokenizer->new( $input );
+ $h->set_allow_cformat( 1 );
+ VerboseWarnings::set_input_file_name $input;
+ print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
+ text_extract( $h );
+ }
+ close INPUT;
+} else {
+ print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
+ convert_translation_file;
}
-close INPUT;
generate_po_file;
warn "This input will not work with Mozilla standards-compliant mode\n", undef
=item -
-(Future goal)
Translation to non-English-like languages with different word
order: gettext's c-format strings can theoretically be
emulated if we are able to do some analysis on the .tmpl input
=back
-Right now it does about the same thing as text-extract2.pl but
-generates gettext-style output; however, because it is scanner-
-instead of parser-based, it is able to address the 4 weaknesses
-listed in translator_doc.txt. Ultimately, the goal is to make
-this able to do some kind of simple analysis on the input to
-produce gettext-style output with c-format strings, in order to
-facilitate translation to languages with a different word order
-than English.
+This script has already been in use for over a year and should
+be reasonable stable. Nevertheless, it is still somewhat
+experimental and there are still some issues.
-When the above is finished, the generated po file may contain
-some HTML tags in addition to %s strings.
+Please refer to the explanation in tmpl_process3 for further
+details.
If you want to generate GNOME-style POTFILES.in files, such
files (passed to -f) can be generated thus:
- (cd ../.. && find koha-tmpl/opac-tmpl/default/en
+ (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
-name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
- (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
+ (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
-name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
This is, however, quite pointless, because the "create" and
"update" actions have already been implemented in tmpl_process3.pl.
+=head2 Strings inside JavaScript
+
+In the SCRIPT elements, the script will attempt to scan for
+_("I<string literal>") patterns, and extract the I<string literal>
+as a translatable string.
+
+Note that the C-like _(...) notation is required.
+
+The JavaScript must actually define a _ function
+so that the code remains correct JavaScript.
+A suitable definition of such a function can be
+
+ function _(s) { return s } // dummy function for gettext
+
=head1 SEE ALSO
-tmpl_process.pl,
+tmpl_process3.pl,
xgettext(1),
Locale::PO(3),
translator_doc.txt
strings inside the generated PO file spuriously moving about
when tmpl_process3.pl calls msgmerge(1) to update the PO file.
+If a Javascript string has leading spaces, it will
+generate strings with spurious leading spaces,
+leading to failure to match the strings when actually generating
+translated files.
+
=cut