Preliminary support for "analysis" of strings with <a> tags.

author acli <acli>

Sun, 22 Feb 2004 21:34:40 +0000 (21:34 +0000)

committer acli <acli>

Sun, 22 Feb 2004 21:34:40 +0000 (21:34 +0000)
author acli <acli>
Sun, 22 Feb 2004 21:34:40 +0000 (21:34 +0000)
committer acli <acli>
Sun, 22 Feb 2004 21:34:40 +0000 (21:34 +0000)
diff --git a/misc/translator/TmplToken.pm b/misc/translator/TmplToken.pm

index 854a70a..87d3607 100644 (file)
--- a/misc/translator/TmplToken.pm
+++ b/misc/translator/TmplToken.pm
@@ -87,6 +87,12 @@ sub parameters {
      return map { $_->type == TmplTokenType::DIRECTIVE? $_: ()} @{$this->{'_kids'}};
  }
  
+# only meaningful for TEXT_PARAMETRIZED tokens
+sub anchors {
+    my $this = shift;
+    return map { $_->type == TmplTokenType::TAG && $_->string =~ /^<a\b/is? $_: ()} @{$this->{'_kids'}};
+}
+
  # only meaningful for TEXT_PARAMETRIZED tokens
  sub form {
      my $this = shift;
diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm

index 608d141..ce66ffe 100644 (file)
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@@ -420,13 +420,24 @@ sub _next_token_intermediate {
      return $it;
  }
  
-sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
+sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
      my($t) = @_;
-    return $t->type == TmplTokenType::TEXT
+    return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
         || ($t->type == TmplTokenType::DIRECTIVE
                 && $t->string =~ /^(?:$re_tmpl_var)$/os)
         || ($t->type == TmplTokenType::TAG
-               && ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
+               && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
+               || ($t->string =~ /^<input/i
+                   && $t->attributes->{'type'} =~ /^(?:text)$/i)))
+}
+
+sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
+    my($t) = @_;
+    return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/s))
+       || ($t->type == TmplTokenType::DIRECTIVE
+               && $t->string =~ /^(?:$re_tmpl_var)$/os)
+       || ($t->type == TmplTokenType::TAG
+               && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
                 || ($t->string =~ /^<input/i
                     && $t->attributes->{'type'} =~ /^(?:text)$/i)))
  }
@@ -439,7 +450,10 @@ sub _quote_cformat ($) {
  
  sub _formalize ($) {
      my($t) = @_;
-    return $t->type == TmplTokenType::DIRECTIVE? '%s': _quote_cformat($t->string);
+    return $t->type == TmplTokenType::DIRECTIVE? '%s':
+          $t->type == TmplTokenType::TAG?
+                  ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
+              _quote_cformat($t->string);
  }
  
  sub _optimize {
@@ -452,6 +466,7 @@ sub _optimize {
                     push @{$this->{_queue}}, pop @structure;
                 }
             };
+    &$undo_trailing_blanks;
      # FIXME: If the last token is a close tag but there are no tags
      # FIXME: before it, drop the close tag back into the queue. This
      # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
@@ -514,46 +529,56 @@ sub next_token {
         $it = $this->_next_token_intermediate($h);
         if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
             && ($it->type == TmplTokenType::TEXT?
-               !blank_p( $it->string ): _token_groupable_p( $it ))) {
+               !blank_p( $it->string ): _token_groupable1_p( $it ))) {
             my @structure = ( $it );
-           my($n_trailing_spaces, $next) = (0, undef);
-           my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
+           my @tags = ();
+           my $next = undef;
+           my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
             if ($it->type == TmplTokenType::TEXT) {
                 $nonblank_text_p = 1 if !blank_p( $it->string );
             } elsif ($it->type == TmplTokenType::DIRECTIVE) {
                 $parametrized_p = 1;
+           } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
+               push @tags, lc($1);
+               $with_anchor_p = 1 if lc($1) eq 'a';
             }
-           for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
+           # We hate | and || in msgid strings, so we try to avoid them
+           for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
                 $next = $this->_next_token_intermediate($h);
                 push @structure, $next; # for consistency (with initialization)
-           last unless defined $next && _token_groupable_p( $next );
+           last unless defined $next && _token_groupable2_p( $next );
+           last if $quit_next_p;
                 if ($next->type == TmplTokenType::TEXT) {
-                   if (blank_p( $next->string )) {
-                       $n_trailing_spaces += 1;
-                   } else {
-                       ($n_trailing_spaces, $nonblank_text_p) = (0, 1);
-                   }
+                   $nonblank_text_p = 1 if !blank_p( $next->string );
+                   $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
                 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
-                   $n_trailing_spaces = 0;
                     $parametrized_p = 1;
-               } else {
-                   $n_trailing_spaces = 0;
+               } elsif ($next->type == TmplTokenType::TAG) {
+                   if ($next->string =~ /^<([A-Z0-9]+)/is) {
+                       push @tags, lc($1);
+                       $with_anchor_p = 1 if lc($1) eq 'a';
+                   } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
+                       my $close = lc($1);
+                       $quit_p = 1 unless @tags && $close eq $tags[$#tags];
+                       $quit_next_p = 1 if $close =~ /^h\d$/;
+                       pop @tags;
+                   }
                 }
+           last if $quit_p;
             }
             # Undo the last token
             push @{$this->{_queue}}, pop @structure;
-           # Undo trailing blank tokens
-           for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
-               push @{$this->{_queue}}, pop @structure;
-           }
+           # Simply it a bit more
             @structure = $this->_optimize( @structure );
             if (@structure < 2) {
                 # Nothing to do
                 ;
-           } elsif ($nonblank_text_p && $parametrized_p) {
+           } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
                 # Create the corresponding c-format string
                 my $string = join('', map { $_->string } @structure);
                 my $form = join('', map { _formalize $_ } @structure);
+               my $a_counter = 0;
+               $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
                 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
                 $it->set_form( $form );
                 $it->set_children( @structure );
@@ -604,10 +629,10 @@ sub quote_po ($) {
  }
  
  # Some functions that shouldn't be here... should be moved out some time
-sub parametrize ($@) {
-    my($fmt, @params) = @_;
+sub parametrize ($$$) {
+    my($fmt_0, $params, $anchors) = @_;
      my $it = '';
-    for (my $n = 0; length $fmt;) {
+    for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
         if ($fmt =~ /^[^%]+/) {
             $fmt = $';
             $it .= $&;
@@ -619,7 +644,7 @@ sub parametrize ($@) {
             my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
             $fmt = $';
             if (!defined $width && !defined $prec) {
-               $it .= $params[$i]
+               $it .= $params->[$i]
             } elsif (defined $width && defined $prec && !$width && !$prec) {
                 ;
             } else {
@@ -633,6 +658,22 @@ sub parametrize ($@) {
             die "Completely confused parametrizing: $fmt\n";#XXX
         }
      }
+    for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
+       if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
+           $fmt = $';
+           $it .= $&;
+       } elsif ($fmt =~ /^<a(\d+)>/is) {
+           $n += 1;
+           my $i  = $1;
+           $fmt = $';
+           my $anchor = $anchors->[$i - 1];
+           warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
+                   unless defined $anchor;
+           $it .= $anchor->string;
+       } else {
+           die "Completely confused decoding anchors: $fmt\n";#XXX
+       }
+    }
      return $it;
  }
  
diff --git a/misc/translator/tmpl_process3.pl b/misc/translator/tmpl_process3.pl

index ae383fe..a7637cb 100755 (executable)
--- a/misc/translator/tmpl_process3.pl
+++ b/misc/translator/tmpl_process3.pl
@@ -88,10 +88,10 @@ sub text_replace (**) {
             print $output $pre, find_translation($trimmed), $post;
         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
             my $fmt = find_translation($s->form);
-           print $output TmplTokenizer::parametrize($fmt, map {
+           print $output TmplTokenizer::parametrize($fmt, [ map {
                 my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
                 $kind == TmplTokenType::TAG && %$attr?
-                   text_replace_tag($t, $attr): $t } $s->parameters);
+                   text_replace_tag($t, $attr): $t } $s->parameters ], [ $s->anchors ]);
         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
             print $output text_replace_tag($t, $attr);
         } elsif (defined $t) {
@@ -297,7 +297,8 @@ exit 0;
  =head1 DESCRIPTION
  
  This is an experimental version of the tmpl_process.pl script,
-using standard gettext-style PO files.
+using standard gettext-style PO files.  Note that the behaviour
+of this script should still be considered unstable.
  
  Currently, the create, update, and install actions have all been
  reimplemented and seem to work.
@@ -315,8 +316,14 @@ file as c-format strings with %s.
  
  The --help option has not been implemented yet.
  
-There are probably some real bugs too, since this has not been
-tested very much.
+If an extracted string contain actual text (versus tags or
+TMPL_VAR directives), the strings are extracted verbatim,
+resulting in unwieldy things like multiple spaces, tabs,
+and/or newlines which are semantically indistinguishable
+from single blanks. If the template writer changes the
+spacing just a little bit, the new formatting would be
+considered new strings. This is arguably wrong, and in any
+case counter-productive.
  
  xgettext.pl must be present in the current directory; the
  msgmerge(1) command must also be present in the search path.
@@ -331,6 +338,9 @@ generate GNU PO files properly; a couple of workarounds have
  been written in TmplTokenizer and more is likely to be needed
  (e.g., to get rid of the "Strange line" warning for #~).
  
+There are probably some other bugs too, since this has not been
+tested very much.
+
  =head1 SEE ALSO
  
  xgettext.pl,
diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl

index 3d5d11b..11a4f15 100755 (executable)
--- a/misc/translator/xgettext.pl
+++ b/misc/translator/xgettext.pl
@@ -21,26 +21,43 @@ use vars qw( $charset_in $charset_out );
  
  ###############################################################################
  
-sub negligible_p ($) {
+sub string_negligible_p ($) {
      my($t) = @_;                               # a string
      # Don't emit pure whitespace, pure numbers, pure punctuation,
      # single letters, or TMPL_VAR's.
      # Punctuation should arguably be translated. But without context
-    # they are untranslatable.
+    # they are untranslatable. Note that $t is a string, not a token object.
      return !$extract_all_p && (
-              TmplTokenizer::blank_p($t)               # blank or TMPL_VAR
+              TmplTokenizer::blank_p($t)       # blank or TMPL_VAR
             || $t =~ /^\d+$/                    # purely digits
-           || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
+           || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
             || $t =~ /^[A-Za-z]$/               # single letters
-       );
+       )
+}
+
+sub token_negligible_p( $ ) {
+    my($x) = @_;
+    my $t = $x->type;
+    return !$extract_all_p && (
+           $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
+           $t == TmplTokenType::DIRECTIVE? 1:
+           $t == TmplTokenType::TEXT_PARAMETRIZED
+               && join( '', map { my $t = $_->type;
+                       $t == TmplTokenType::DIRECTIVE?
+                               '1': $t == TmplTokenType::TAG?
+                                       '': token_negligible_p( $_ )?
+                                       '': '1' } @{$x->children} ) eq '' );
  }
  
  ###############################################################################
  
  sub remember ($$) {
      my($token, $string) = @_;
-    $text{$string} = [] unless defined $text{$string};
-    push @{$text{$string}}, $token;
+    # If we determine that the string is negligible, don't bother to remember
+    unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
+       $text{$string} = [] unless defined $text{$string};
+       push @{$text{$string}}, $token;
+    }
  }
  
  ###############################################################################
@@ -69,10 +86,8 @@ sub text_extract (*) {
      last unless defined $s;
         my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
         if ($kind eq TmplTokenType::TEXT) {
-           #$t = TmplTokenizer::trim $t;
             remember( $s, $t ) if $t =~ /\S/s;
         } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
-           #$t = TmplTokenizer::trim $t;
             remember( $s, $s->form ) if $s->form =~ /\S/s;
         } elsif ($kind eq TmplTokenType::TAG && %$attr) {
             # value [tag=input], meta
@@ -96,7 +111,7 @@ sub text_extract (*) {
  sub generate_strings_list () {
      # Emit all extracted strings.
      for my $t (string_list) {
-       printf OUTPUT "%s\n", $t unless negligible_p($t);
+       printf OUTPUT "%s\n", $t # unless negligible_p($t);
      }
  }
  
@@ -127,7 +142,7 @@ msgstr ""
  EOF
      my $directory_re = quotemeta("$directory/");
      for my $t (string_list) {
-       next if negligible_p($t);
+       #next if negligible_p($t);
         my $cformat_p;
         for my $token (@{$text{$t}}) {
             my $pathname = $token->pathname;
@@ -316,17 +331,11 @@ the gettext format.
  
  =back
  
-Right now it does about the same thing as text-extract2.pl but
-generates gettext-style output; however, because it is scanner-
-instead of parser-based, it is able to address the 4 weaknesses
-listed in translator_doc.txt.  Ultimately, the goal is to make
-this able to do some kind of simple analysis on the input to
-produce gettext-style output with c-format strings, in order to
-facilitate translation to languages with a different word order
-than English.
-
-When the above is finished, the generated po file may contain
-some HTML tags in addition to %s strings.
+Note that this script is experimental and should still be
+considered unstable.
+
+Please refer to the explanation in tmpl_process3 for further
+details.
  
  If you want to generate GNOME-style POTFILES.in files, such
  files (passed to -f) can be generated thus:
author	acli <acli>
	Sun, 22 Feb 2004 21:34:40 +0000 (21:34 +0000)
committer	acli <acli>
	Sun, 22 Feb 2004 21:34:40 +0000 (21:34 +0000)
misc/translator/TmplToken.pm		patch \| blob \| history
misc/translator/TmplTokenizer.pm		patch \| blob \| history
misc/translator/tmpl_process3.pl		patch \| blob \| history
misc/translator/xgettext.pl		patch \| blob \| history