misc/translator/text-extract2.pl

   1 #!/usr/bin/perl
   2
   3 # Test filter partially based on Ambrose's hideous subst.pl code
   4 # The idea is that the .tt files are not valid HTML, and as a result
   5 # HTML::Parse would be completely confused by these templates.
   6 # This is just a simple scanner (not a parser) & should give better results.
   7
   8 # This script is meant to be a drop-in replacement of text-extract.pl
   9
  10 # A grander plan: Code could be written to detect template variables and
  11 # construct gettext-c-format-string-like meta-strings (e.g., "Results %s
  12 # through %s of %s records" that will be more likely to be translatable
  13 # to languages where word order is very unlike English word order.
  14 # --> This will be relatively major rework, and requires corresponding
  15 # rework in tmpl_process.pl
  16
  17 use FindBin;
  18 use lib $FindBin::Bin;
  19
  20 use strict;
  21 #use warnings; FIXME - Bug 2505
  22 use Getopt::Long;
  23 use TmplTokenizer;
  24 use VerboseWarnings;
  25
  26 use vars qw( $input );
  27 use vars qw( $debug_dump_only_p );
  28 use vars qw( $pedantic_p );
  29 use vars qw( $allow_cformat_p ); # FOR TESTING PURPOSES ONLY!!
  30
  31 ###############################################################################
  32
  33 sub underline ($) { # for testing only
  34     my($s) = @_;
  35     join('', map {/[\0-\37]/? $_: "$_\b$_"} split(//, $s));
  36 }
  37
  38 sub debug_dump ($) { # for testing only
  39     my($h) = @_;
  40     print "re_tag_compat is /", TmplTokenizer::re_tag(1), "/\n";
  41     for (;;) {
  42         my $s = TmplTokenizer::next_token $h;
  43     last unless defined $s;
  44         printf "%s\n", ('-' x 79);
  45         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  46         printf "%s [line %d]:\n", $kind->to_string, $s->line_number;
  47         printf "%4dH%s\n", length($t), underline($t);
  48     if ($kind == TmplTokenType::TAG() && %$attr) {
  49             printf "Attributes:\n";
  50             for my $a (keys %$attr) {
  51                 my($key, $val, $val_orig, $order) = @{$attr->{$a}};
  52                 printf "%s = %dH%s -- %s\n", $a, length $val, underline $val,
  53                 $val_orig;
  54             }
  55         }
  56     if ($kind == TmplTokenType::TEXT_PARAMETRIZED()) {
  57             printf "Form (c-format string):\n";
  58             printf "%dH%s\n", length $s->form, underline $s->form;
  59             printf "Parameters:\n";
  60             my $i = 1;
  61             for my $a ($s->parameters) {
  62                 my $t = $a->string;
  63                 printf "%%%d\$s = %dH%s\n", $i, length $t, underline $t;
  64                 $i += 1;
  65             }
  66         }
  67         if ($s->has_js_data) {
  68             printf "JavaScript translatable strings:\n";
  69             for my $t (@{$s->js_data}) {
  70                 printf "%dH%s\n", length $t->[3], underline $t->[3] if $t->[0]; # FIXME
  71             }
  72         }
  73     }
  74 }
  75
  76 ###############################################################################
  77
  78 sub text_extract ($) {
  79     my($h) = @_;
  80     my %text = ();
  81     for (;;) {
  82         my $s = TmplTokenizer::next_token $h;
  83     last unless defined $s;
  84         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
  85     if ($kind == TmplTokenType::TEXT()) {
  86             $t = TmplTokenizer::trim $t;
  87             $text{$t} = 1 if $t =~ /\S/s;
  88     } elsif ($kind == TmplTokenType::TAG() && %$attr) {
  89             # value [tag=input], meta
  90             my $tag = lc($1) if $t =~ /^<(\S+)/s;
  91             for my $a ('alt', 'content', 'title', 'value') {
  92                 if ($attr->{$a}) {
  93                     next if $a eq 'content' && $tag ne 'meta';
  94                     next if $a eq 'value' && ($tag ne 'input'
  95                         || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
  96                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
  97                     $val = TmplTokenizer::trim $val;
  98                     $text{$val} = 1 if $val =~ /\S/s;
  99                 }
 100             }
 101         } elsif ($s->has_js_data) {
 102             for my $t (@{$s->js_data}) {
 103                 remember( $s, $t->[3] ) if $t->[0]; # FIXME
 104             }
 105         }
 106     }
 107     # Emit all extracted strings.
 108     # Don't emit pure whitespace, pure numbers, or TMPL_VAR's.
 109     for my $t (keys %text) {
 110         printf "%s\n", $t
 111             unless TmplTokenizer::blank_p($t) || $t =~ /^\d+$/;
 112     }
 113 }
 114
 115 ###############################################################################
 116
 117 sub usage ($) {
 118     my($exitcode) = @_;
 119     my $h = $exitcode? *STDERR: *STDOUT;
 120     print $h <<EOF;
 121 Usage: $0 [OPTIONS]
 122 Extract strings from HTML file.
 123
 124       --debug-dump-only     Do not extract strings; but display scanned tokens
 125   -f, --file=FILE           Extract from the specified FILE
 126       --pedantic-warnings   Issue warnings even for detected problems which
 127                             are likely to be harmless
 128       --help                Display this help and exit
 129 EOF
 130     exit($exitcode);
 131 }
 132
 133 ###############################################################################
 134
 135 sub usage_error (;$) {
 136     print STDERR "$_[0]\n" if @_;
 137     print STDERR "Try `$0 --help' for more information.\n";
 138     exit(-1);
 139 }
 140
 141 ###############################################################################
 142
 143 GetOptions(
 144     'enable-cformat'    => \$allow_cformat_p,
 145     'f|file=s'          => \$input,
 146     'debug-dump-only'   => \$debug_dump_only_p,
 147     'pedantic-warnings' => sub { $pedantic_p = 1 },
 148     'help'              => sub { usage(0) },
 149 ) || usage_error;
 150
 151 VerboseWarnings::set_application_name $0;
 152 VerboseWarnings::set_input_file_name $input;
 153 VerboseWarnings::set_pedantic_mode $pedantic_p;
 154
 155 usage_error('Missing mandatory option -f') unless defined $input;
 156
 157 my $h = TmplTokenizer->new( $input );
 158 $h->set_allow_cformat( 1 ) if $allow_cformat_p;
 159 if ($debug_dump_only_p) {
 160     debug_dump( $h );
 161 } else {
 162     text_extract( $h );
 163 }
 164
 165 warn "This input will not work with Mozilla standards-compliant mode\n", undef
 166         if TmplTokenizer::syntaxerror_p;
 167
 168 close INPUT;
 169
 170 exit(-1) if TmplTokenizer::fatal_p;