X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=misc%2Ftranslator%2Fxgettext.pl;h=0343ed29c23103a9cdaff9e538ae8395b04eb23e;hb=cd81bdc6a0b0250d97cbb5c49bcb90290d29e34a;hp=abe315e5d7f1e3856ef342eeb1c37e9040a0ff7d;hpb=2ea6fdec156419fbbdd5f64da927d0b2b45e751c;p=koha.git diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl index abe315e5d7..0343ed29c2 100755 --- a/misc/translator/xgettext.pl +++ b/misc/translator/xgettext.pl @@ -2,10 +2,13 @@ =head1 NAME -xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction +xgettext.pl - xgettext(1)-like interface for .tt strings extraction =cut +use FindBin; +use lib $FindBin::Bin; + use strict; use warnings; use Getopt::Long; @@ -24,9 +27,11 @@ use vars qw( $disable_fuzzy_p ); use vars qw( $verbose_p ); use vars qw( $po_mode_p ); +our $OUTPUT; + ############################################################################### -sub string_negligible_p ($) { +sub string_negligible_p { my($t) = @_; # a string # Don't emit pure whitespace, pure numbers, pure punctuation, # single letters, or TMPL_VAR's. @@ -37,38 +42,46 @@ sub string_negligible_p ($) { || $t =~ /^\d+$/ # purely digits || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context || $t =~ /^[A-Za-z]$/ # single letters + || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ... + || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities ) } -sub token_negligible_p( $ ) { - my($x) = @_; +sub token_negligible_p { + my ($x) = @_; my $t = $x->type; return !$extract_all_p && ( - $t == TmplTokenType::TEXT? string_negligible_p( $x->string ): - $t == TmplTokenType::DIRECTIVE? 1: - $t == TmplTokenType::TEXT_PARAMETRIZED - && join( '', map { my $t = $_->type; - $t == TmplTokenType::DIRECTIVE? - '1': $t == TmplTokenType::TAG? - '': token_negligible_p( $_ )? - '': '1' } @{$x->children} ) eq '' ); + $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string ) + : $t == C4::TmplTokenType::DIRECTIVE() ? 1 + : $t == C4::TmplTokenType::TEXT_PARAMETRIZED() + && join( + '', + map { + my $t = $_->type; + $t == C4::TmplTokenType::DIRECTIVE() ? '1' + : $t == C4::TmplTokenType::TAG() ? '' + : token_negligible_p($_) ? '' + : '1' + } @{ $x->children } + ) eq '' + ); } ############################################################################### -sub remember ($$) { +sub remember { my($token, $string) = @_; # If we determine that the string is negligible, don't bother to remember unless (string_negligible_p( $string ) || token_negligible_p( $token )) { - my $key = TmplTokenizer::string_canon( $string ); - $text{$key} = [] unless defined $text{$key}; - push @{$text{$key}}, $token; + my $key = TmplTokenizer::string_canon( $string ); + $text{$key} = [] unless defined $text{$key}; + push @{$text{$key}}, $token; } } ############################################################################### -sub string_list () { +sub string_list { my @t = keys %text; # The real gettext tools seems to sort case sensitively; I don't know why @t = sort { $a cmp $b } @t if $sort eq 's'; @@ -83,52 +96,57 @@ sub string_list () { return @t; } -############################################################################### + ############################################################################### -sub text_extract (*) { +sub text_extract { my($h) = @_; for (;;) { - my $s = TmplTokenizer::next_token $h; - last unless defined $s; - my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); - if ($kind eq TmplTokenType::TEXT) { - remember( $s, $t ) if $t =~ /\S/s; - } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) { - remember( $s, $s->form ) if $s->form =~ /\S/s; - } elsif ($kind eq TmplTokenType::TAG && %$attr) { - # value [tag=input], meta - my $tag = lc($1) if $t =~ /^<(\S+)/s; - for my $a ('alt', 'content', 'title', 'value','label') { - if ($attr->{$a}) { - next if $a eq 'label' && $tag ne 'optgroup'; - next if $a eq 'content' && $tag ne 'meta'; - next if $a eq 'value' && ($tag ne 'input' - || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME - my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME - $val = TmplTokenizer::trim $val; - remember( $s, $val ) if $val =~ /\S/s; - } + my $s = TmplTokenizer::next_token $h; + last unless defined $s; + my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); + if ($kind eq C4::TmplTokenType::TEXT) { + if ($t =~ /\S/s && $t !~ /has_js_data) { - for my $t (@{$s->js_data}) { - remember( $s, $t->[3] ) if $t->[0]; # FIXME + } elsif ($kind eq C4::TmplTokenType::TEXT_PARAMETRIZED) { + if ($s->form =~ /\S/s && $s->form !~ /form ); } - } + } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) { + # value [tag=input], meta + my $tag; + $tag = lc($1) if $t =~ /^<(\S+)/s; + for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder') { + if ($attr->{$a}) { + next if $a eq 'label' && $tag ne 'optgroup'; + next if $a eq 'content' && $tag ne 'meta'; + next if $a eq 'value' && ($tag ne 'input' + || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME + my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME + $val = TmplTokenizer::trim $val; + remember( $s, $val ) if $val =~ /\S/s; + } + } + } elsif ($s->has_js_data) { + for my $t (@{$s->js_data}) { + remember( $s, $t->[3] ) if $t->[0]; # FIXME + } + } } } ############################################################################### -sub generate_strings_list () { +sub generate_strings_list { # Emit all extracted strings. for my $t (string_list) { - printf OUTPUT "%s\n", $t; + printf $OUTPUT "%s\n", $t; } } ############################################################################### -sub generate_po_file () { +sub generate_po_file { # We don't emit the Plural-Forms header; it's meaningless for us my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET'); $pot_charset = TmplTokenizer::charset_canon $pot_charset; @@ -136,17 +154,17 @@ sub generate_po_file () { my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time)); my $time_pot = $time; my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE'; - print OUTPUT <, YEAR. # EOF - print OUTPUT <[0]->type == TmplTokenType::TEXT_PARAMETRIZED) { + if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) { my($token, $n) = ($text{$t}->[0], 0); - printf OUTPUT "#. For the first occurrence,\n" + printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1 && $token->parameters_and_fields > 0; for my $param ($token->parameters_and_fields) { $n += 1; my $type = $param->type; - my $subtype = ($type == TmplTokenType::TAG + my $subtype = ($type == C4::TmplTokenType::TAG && $param->string =~ /^attributes->{'type'}->[1]: undef); my $fmt = TmplTokenizer::_formalize( $param ); $fmt =~ s/^%/%$n\$/; - if ($type == TmplTokenType::DIRECTIVE) { - $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR'; + if ($type == C4::TmplTokenType::DIRECTIVE) { +# $type = "Template::Toolkit Directive"; + $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR'; my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is? $2: undef; - printf OUTPUT "#. %s: %s\n", $fmt, + printf $OUTPUT "#. %s: %s\n", $fmt, "$type" . (defined $name? " name=$name": ''); } else { my $name = $param->attributes->{'name'}; - my $value = $param->attributes->{'value'} + my $value; + $value = $param->attributes->{'value'} unless $subtype =~ /^(?:text)$/; - printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype" + printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype" . (defined $name? " name=$name->[1]": '') . (defined $value? " value=$value->[1]": ''); } } - } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) { + } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) { my($token) = ($text{$t}->[0]); - printf OUTPUT "#. For the first occurrence,\n" + printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1 && $token->parameters_and_fields > 0; if ($token->string =~ /^attributes->{'http-equiv'}->[1]; - print OUTPUT "#. META http-equiv=$type\n" if defined $type; + print $OUTPUT "#. META http-equiv=$type\n" if defined $type; } elsif ($token->string =~ /^<([a-z0-9]+)/is) { my $tag = uc($1); my $type = (lc($tag) eq 'input'? $token->attributes->{'type'}: undef); my $name = $token->attributes->{'name'}; - printf OUTPUT "#. %s\n", $tag + printf $OUTPUT "#. %s\n", $tag . (defined $type? " type=$type->[1]": '') . (defined $name? " name=$name->[1]": ''); } } elsif ($text{$t}->[0]->has_js_data) { - printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1; - printf OUTPUT "#. SCRIPT\n"; + printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1; + printf $OUTPUT "#. SCRIPT\n"; } my $cformat_p; for my $token (@{$text{$t}}) { my $pathname = $token->pathname; $pathname =~ s/^$directory_re//os; $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/; - printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number + printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number if defined $pathname && defined $token->line_number; - $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED; + $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED; } - printf OUTPUT "#, c-format\n" if $cformat_p; - printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po + printf $OUTPUT "#, c-format\n" if $cformat_p; + printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po TmplTokenizer::string_canon TmplTokenizer::charset_convert $t, $charset_in, $charset_out; - printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}? + printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}? TmplTokenizer::quote_po( $translation{$t} ): "\"\""); } } ############################################################################### -sub convert_translation_file () { - open(INPUT, "<$convert_from") || die "$convert_from: $!\n"; +sub convert_translation_file { + open(my $INPUT, '<', $convert_from) || die "$convert_from: $!\n"; VerboseWarnings::set_input_file_name $convert_from; - while () { + while (<$INPUT>) { chomp; my($msgid, $msgstr) = split(/\t/); die "$convert_from: $.: Malformed tmpl_process input (no tab)\n" @@ -241,7 +261,7 @@ sub convert_translation_file () { $msgid =~ s/^SELECTED>//; # Create dummy token - my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef ); + my $token = TmplToken->new( $msgid, C4::TmplTokenType::UNKNOWN, undef, undef ); remember( $token, $msgid ); $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3 $translation{$msgid} = $msgstr unless $msgstr eq '*****'; @@ -268,7 +288,7 @@ sub convert_translation_file () { ############################################################################### -sub usage ($) { +sub usage { my($exitcode) = @_; my $h = $exitcode? *STDERR: *STDOUT; print $h <$output") || die "$output: $!\n"; + open($OUTPUT, '>', $output) || die "$output: $!\n"; } else { print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p; - open(OUTPUT, ">&STDOUT"); + open($OUTPUT, ">&STDOUT"); } if (defined $files_from) { print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p; - open(INPUT, "<$files_from") || die "$files_from: $!\n"; - while () { + open(my $INPUT, '<', $files_from) || die "$files_from: $!\n"; + while (<$INPUT>) { chomp; my $input = /^\//? $_: "$directory/$_"; my $h = TmplTokenizer->new( $input ); @@ -359,7 +379,7 @@ if (defined $files_from) { print STDERR "$0: Processing file \"$input\"\n" if $verbose_p; text_extract( $h ); } - close INPUT; + close $INPUT; } else { print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p; convert_translation_file; @@ -376,8 +396,7 @@ exit(-1) if TmplTokenizer::fatal_p; =head1 DESCRIPTION -This is an experimental script based on the modularized -text-extract2.pl script. It has behaviour similar to +This script has behaviour similar to xgettext(1), and generates gettext-compatible output files. A gettext-like format provides the following advantages: @@ -388,7 +407,7 @@ A gettext-like format provides the following advantages: Translation to non-English-like languages with different word order: gettext's c-format strings can theoretically be -emulated if we are able to do some analysis on the .tmpl input +emulated if we are able to do some analysis on the .tt input and treat in a way similar to %s. =item - @@ -419,10 +438,10 @@ details. If you want to generate GNOME-style POTFILES.in files, such files (passed to -f) can be generated thus: - (cd ../.. && find koha-tmpl/opac-tmpl/default/en \ - -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in - (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \ - -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in + (cd ../.. && find koha-tmpl/opac-tmpl/default/en \ + -name \*.inc -o -name \*.tt) > opac/POTFILES.in + (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \ + -name \*.inc -o -name \*.tt) > intranet/POTFILES.in This is, however, quite pointless, because the "create" and "update" actions have already been implemented in tmpl_process3.pl.