Bug 5915 : C4::AuthoritiesMarc AddAuthority

[koha.git] / misc / translator / TmplTokenizer.pm
diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm

index bb37eaa..5c4f28b 100644 (file)
--- a/misc/translator/TmplTokenizer.pm
+++ b/misc/translator/TmplTokenizer.pm
@@ -1,6 +1,7 @@
  package TmplTokenizer;
  
  use strict;
+#use warnings; FIXME - Bug 2505
  use TmplTokenType;
  use TmplToken;
  use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
@@ -22,11 +23,6 @@ it might be better to create a customized scanner
  to scan the template files for tokens.
  This module is a simple-minded attempt at such a scanner.
  
-=head1 HISTORY
-
-This tokenizer is mostly based
-on Ambrose's hideous Perl script known as subst.pl.
-
  =cut
  
  ###############################################################################
@@ -38,12 +34,13 @@ $VERSION = 0.02;
  
  use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
  use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
+use vars qw( $pedantic_error_markup_in_pcdata_p );
  
  ###############################################################################
  
  # Hideous stuff
  use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
-use vars qw( $re_directive_control $re_tmpl_endif_endloop );
+use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
  BEGIN {
      # $re_directive must not do any backreferences
      $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
@@ -51,6 +48,7 @@ BEGIN {
      $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
      $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
      # TMPL_VAR ESCAPE=1/HTML/URL
+    $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
      $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
      # Any control flow directive
      $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
@@ -67,7 +65,7 @@ sub re_tag ($) {
     my $etag = $compat? '>': '<>\/';
     # This is no longer similar to the original regexp in subst.pl :-(
     # Note that we don't want <> in compat mode; Mozilla knows about <
-   q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
+   q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
     . $re_directive
     . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
  }
@@ -96,24 +94,20 @@ sub LINENUM_START   () {'lc_0'}
  sub LINENUM            () {'lc'}
  sub CDATA_MODE_P       () {'cdata-mode-p'}
  sub CDATA_CLOSE                () {'cdata-close'}
+sub PCDATA_MODE_P      () {'pcdata-mode-p'}    # additional submode for CDATA
+sub JS_MODE_P          () {'js-mode-p'}        # cdata-mode-p must also be true
  
-sub new {
-    my $this = shift;
-    my($input) = @_;
-    my $class = ref($this) || $this;
-    my $self = {};
-    bless $self, $class;
-
-    my $handle = sprintf('TMPLTOKENIZER%d', $serial);
-    $serial += 1;
+sub ALLOW_CFORMAT_P    () {'allow-cformat-p'}
  
-    no strict;
-    open($handle, "<$input") || die "$input: $!\n";
-    use strict;
-    $self->{+FILENAME} = $input;
-    $self->{+HANDLE} = $handle;
-    $self->{+READAHEAD} = [];
-    return $self;
+sub new {
+    shift;
+    my ($filename) = @_;
+    open my $handle,$filename or die "can't open $filename";
+    bless {
+           filename => $filename
+           , handle => $handle
+           , readahead => []
+    } , __PACKAGE__;
  }
  
  ###############################################################################
@@ -122,12 +116,12 @@ sub new {
  
  sub filename {
      my $this = shift;
-    return $this->{+FILENAME};
+    return $this->{filename};
  }
  
  sub _handle {
      my $this = shift;
-    return $this->{+HANDLE};
+    return $this->{handle};
  }
  
  sub fatal_p {
@@ -142,12 +136,12 @@ sub syntaxerror_p {
  
  sub has_readahead_p {
      my $this = shift;
-    return @{$this->{+READAHEAD}};
+    return @{$this->{readahead}};
  }
  
  sub _peek_readahead {
      my $this = shift;
-    return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
+    return $this->{readahead}->[$#{$this->{readahead}}];
  }
  
  sub line_number_start {
@@ -165,11 +159,26 @@ sub cdata_mode_p {
      return $this->{+CDATA_MODE_P};
  }
  
+sub pcdata_mode_p {
+    my $this = shift;
+    return $this->{+PCDATA_MODE_P};
+}
+
+sub js_mode_p {
+    my $this = shift;
+    return $this->{+JS_MODE_P};
+}
+
  sub cdata_close {
      my $this = shift;
      return $this->{+CDATA_CLOSE};
  }
  
+sub allow_cformat_p {
+    my $this = shift;
+    return $this->{+ALLOW_CFORMAT_P};
+}
+
  # Simple setters
  
  sub _set_fatal {
@@ -186,24 +195,24 @@ sub _set_syntaxerror {
  
  sub _push_readahead {
      my $this = shift;
-    push @{$this->{+READAHEAD}}, $_[0];
+    push @{$this->{readahead}}, $_[0];
      return $this;
  }
  
  sub _pop_readahead {
      my $this = shift;
-    return pop @{$this->{+READAHEAD}};
+    return pop @{$this->{readahead}};
  }
  
  sub _append_readahead {
      my $this = shift;
-    $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
+    $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
      return $this;
  }
  
  sub _set_readahead {
      my $this = shift;
-    $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
+    $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
      return $this;
  }
  
@@ -225,20 +234,132 @@ sub _set_cdata_mode {
      return $this;
  }
  
+sub _set_pcdata_mode {
+    my $this = shift;
+    $this->{+PCDATA_MODE_P} = $_[0];
+    return $this;
+}
+
+sub _set_js_mode {
+    my $this = shift;
+    $this->{+JS_MODE_P} = $_[0];
+    return $this;
+}
+
  sub _set_cdata_close {
      my $this = shift;
      $this->{+CDATA_CLOSE} = $_[0];
      return $this;
  }
  
+sub set_allow_cformat {
+    my $this = shift;
+    $this->{+ALLOW_CFORMAT_P} = $_[0];
+    return $this;
+}
+
+###############################################################################
+
+use vars qw( $js_EscapeSequence );
+BEGIN {
+    # Perl quoting is really screwed up, but this common subexp is way too long
+    $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
+}
+sub parenleft  () { '(' }
+sub parenright () { ')' }
+
+sub split_js ($) {
+    my ($s0) = @_;
+    my @it = ();
+    while (length $s0) {
+       if ($s0 =~ /^\s+/s) {                           # whitespace
+           push @it, $&;
+           $s0 = $';
+       } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
+           push @it, $&;
+           $s0 = $';
+       } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) {   # C-style comment
+           push @it, $&;
+           $s0 = $';
+       # Keyword or identifier, ECMA-262 p.13 (section 7.5)
+       } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) {     # IdentifierName
+           push @it, $&;
+           $s0 = $';
+       # Punctuator, ECMA-262 p.13 (section 7.6)
+       } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
+           push @it, $&;
+           $s0 = $';
+       # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
+       } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
+           push @it, $&;
+           $s0 = $';
+       # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+       } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
+           push @it, $&;
+           $s0 = $';
+       # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+       } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
+           push @it, $&;
+           $s0 = $';
+       # StringLiteral, ECMA-262 p.17 (section 7.7.4)
+       # XXX SourceCharacter doesn't seem to be defined (?)
+       } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
+           push @it, $&;
+           $s0 = $';
+       } elsif ($s0 =~ /^./) {                         # UNKNOWN TOKEN !!!
+           push @it, $&;
+           $s0 = $';
+       }
+    }
+    return @it;
+}
+
+sub STATE_UNDERSCORE     () { 1 }
+sub STATE_PARENLEFT      () { 2 }
+sub STATE_STRING_LITERAL () { 3 }
+
+# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
+# XXX A scanner is one thing; a parser another thing.
+sub identify_js_translatables (@) {
+    my @input = @_;
+    my @output = ();
+    # We mark a JavaScript translatable string as in C, i.e., _("literal")
+    # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
+    for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
+       my $reset_state_p = 0;
+       push @output, [0, $input[$i]];
+       if ($input[$i] !~ /\S/s) {
+           ;
+       } elsif ($state == 0) {
+           $state = STATE_UNDERSCORE if $input[$i] eq '_';
+       } elsif ($state == STATE_UNDERSCORE) {
+           $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
+       } elsif ($state == STATE_PARENLEFT) {
+           if ($input[$i] =~ /^(['"])(.*)\1$/s) {
+               ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
+           } else {
+               $state = 0;
+           }
+       } elsif ($state == STATE_STRING_LITERAL) {
+           if ($input[$i] eq parenright) {
+               $output[$j] = [1, $output[$j]->[1], $q, $s];
+           }
+           $state = 0;
+       } else {
+           die "identify_js_translatables internal error: Unknown state $state"
+       }
+    }
+    return \@output;
+}
+
  ###############################################################################
  
  sub _extract_attributes ($;$) {
      my $this = shift;
      my($s, $lc) = @_;
      my %attr;
-    $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
-           || $s =~ /^<\S+(.*)\S$/s;   # SGML-style tags
+    $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s    # XML-style self-closing tags
+           || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s;      # SGML-style tags
  
      for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
         my($key, $val, $val_orig, $rest)
@@ -274,7 +395,13 @@ sub _extract_attributes ($;$) {
             error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
             $this->_set_fatal( 1 );
         } else {
-           warn_normal "Strange attribute syntax: $s\n", $lc;
+           # There's something wrong with the attribute syntax.
+           # We might be able to deduce a likely cause by looking more.
+           if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
+               warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
+           } else {
+               warn_normal "Strange attribute syntax: $s\n", $lc;
+           }
         }
      }
      return \%attr;
@@ -305,34 +432,55 @@ sub _next_token_internal {
         ($kind, $it) = (TmplTokenType::TEXT, $&);
         $this->_set_readahead( $' );
      # FIXME the following (the [<\s] part) is an unreliable HACK :-(
-    } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])+/s) { # non-space normal text
+    } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) {       # non-space normal text
         ($kind, $it) = (TmplTokenType::TEXT, $&);
         $this->_set_readahead( $' );
         warn_normal "Unescaped < in $it\n", $this->line_number_start
                 if !$this->cdata_mode_p && $it =~ /</s;
      } else {                           # tag/declaration/processing instruction
         my $ok_p = 0;
+       my $bad_comment_p = 0;
         for (my $cdata_close = $this->cdata_close;;) {
             if ($this->cdata_mode_p) {
-               if ($this->_peek_readahead =~ /^$cdata_close/) {
+               my $next = $this->_pop_readahead;
+               if ($next =~ /^$cdata_close/is) {
                     ($kind, $it) = (TmplTokenType::TAG, $&);
-                   $this->_set_readahead( $' );
+                   $this->_push_readahead( $' );
+                   $ok_p = 1;
+               } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
+                   ($kind, $it) = (TmplTokenType::TEXT, $1);
+                   $this->_push_readahead( "$2$'" );
                     $ok_p = 1;
                 } else {
-                   ($kind, $it) = (TmplTokenType::TEXT, $this->_pop_readahead);
+                   ($kind, $it) = (TmplTokenType::TEXT, $next);
                     $ok_p = 1;
                 }
             } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
-               ($kind, $it) = (TmplTokenType::TAG, "$1>");
-               $this->_set_readahead( $3 );
-               $ok_p = 1;
-               warn_normal "SGML \"closed start tag\" notation: $1<\n", $this->line_number_start if $2 eq '';
-           } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
+               # If we detect a "closed start tag" but we know that the
+               # following token looks like a TMPL_VAR, don't stop
+               my($head, $tail, $post) = ($1, $2, $3);
+               if ($tail eq '' && $post =~ $re_tmpl_var) {
+                   # Don't bother to show the warning if we're too confused
+                   # FIXME. There's no method for _closed_start_tag_warning
+                   if (!defined $this->{'_closed_start_tag_warning'}
+                       || ($this->{'_closed_start_tag_warning'}->[0] eq $head
+                       && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
+                   warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
+                           if split(/\n/, $head) < 10;
+                   }
+                   $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
+               } else {
+                   ($kind, $it) = (TmplTokenType::TAG, "$head>");
+                   $this->_set_readahead( $post );
+                   $ok_p = 1;
+                   warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '' 
+                and $head ne '<!DOCTYPE stylesheet ['; # another bit of temporary ugliness for bug 4472
+               }
+           } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
                 ($kind, $it) = (TmplTokenType::COMMENT, $&);
                 $this->_set_readahead( $' );
                 $ok_p = 1;
-               warn_normal "Syntax error in comment: $&\n", $this->line_number_start;
-               $this->_set_syntaxerror( 1 );
+               $bad_comment_p = 1;
             }
         last if $ok_p;
             my $next = scalar <$h>;
@@ -347,13 +495,17 @@ sub _next_token_internal {
             $kind = TmplTokenType::DECL;
             $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
             if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
-               warn_normal "Apache #include directive found instead of HTML::Template directive <TMPL_INCLUDE>", $this->line_number_start;
+               warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
             }
         } elsif ($it =~ /^<\?/) {
             $kind = TmplTokenType::PI;
         }
         if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
             $kind = TmplTokenType::DIRECTIVE;
+       } elsif ($bad_comment_p) {
+           warn_normal sprintf("Syntax error in comment: %s\n", $it),
+                   $this->line_number_start;
+           $this->_set_syntaxerror( 1 );
         }
         if (!$ok_p && $eof_p) {
             ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
@@ -361,48 +513,375 @@ sub _next_token_internal {
             $this->_set_syntaxerror( 1 );
         }
      }
-    warn_normal "Unrecognizable token found: $it\n", $this->line_number_start
-           if $kind eq TmplTokenType::UNKNOWN;
+    warn_normal "Unrecognizable token found: "
+           . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
+           . "\n", $this->line_number_start
+       if $kind == TmplTokenType::UNKNOWN;
      return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
  }
  
-sub next_token {
+sub _next_token_intermediate {
      my $this = shift;
      my $h = $this->_handle;
      my $it;
      if (!$this->cdata_mode_p) {
         $it = $this->_next_token_internal($h);
-       if (defined $it && $it->type eq TmplTokenType::TAG) {
-           if ($it->string =~ /^<(script|style|textarea)\b/i) {
+       if (defined $it && $it->type == TmplTokenType::TAG) {
+           if ($it->string =~ /^<(script|style|textarea)\b/is ||
+            ($this->filename =~ /(opensearch)|(opac-showreviews-rss)/ && $it->string =~ /^<(description)\b/) # FIXME special case to handle
+                                                                                    # a CDATA in opac-opensearch.tmpl and opac-showreviews-rss.tmpl
+           ) {
                 $this->_set_cdata_mode( 1 );
                 $this->_set_cdata_close( "</$1\\s*>" );
+               $this->_set_pcdata_mode( 0 );
+               $this->_set_js_mode( lc($1) eq 'script' );
+#          } elsif ($it->string =~ /^<(title)\b/is) {
+#              $this->_set_cdata_mode( 1 );
+#              $this->_set_cdata_close( "</$1\\s*>" );
+#              $this->_set_pcdata_mode( 1 );
             }
             $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
         }
      } else {
+       my $eof_p = 0;
         for ($it = '', my $cdata_close = $this->cdata_close;;) {
             my $next = $this->_next_token_internal($h);
-       last if !defined $next;
-           if (defined $next && $next->string =~ /$cdata_close/i) {
+           $eof_p = !defined $next;
+       last if $eof_p;
+           if (defined $next && $next->string =~ /$cdata_close/is) {
                 $this->_push_readahead( $next ); # push entire TmplToken object
                 $this->_set_cdata_mode( 0 );
             }
         last unless $this->cdata_mode_p;
             $it .= $next->string;
         }
-       $it = TmplToken->new( $it, TmplTokenType::CDATA, $this->line_number );
-       $this->_set_cdata_close, undef;
+       if ($eof_p) {
+           $it = undef;
+           error_normal "Unexpected end of file while looking for "
+                   . $this->cdata_close
+                   . "\n", $this->line_number_start;
+           $this->_set_fatal( 1 );
+           $this->_set_syntaxerror( 1 );
+       }
+       if ($this->pcdata_mode_p) {
+           my $check = $it;
+           $check =~ s/$re_directive//gos;
+           warn_pedantic "Markup found in PCDATA\n", $this->line_number,
+                           \$pedantic_error_markup_in_pcdata_p
+                   if $check =~ /$re_tag_compat/s;
+       }
+       # PCDATA should be treated as text, not CDATA
+       # Actually it should be treated as TEXT_PARAMETRIZED :-(
+       $it = TmplToken->new( $it,
+                       ($this->pcdata_mode_p?
+                           TmplTokenType::TEXT: TmplTokenType::CDATA),
+                       $this->line_number, $this->filename )
+               if defined $it;
+       if ($this->js_mode_p) {
+           my $s0 = $it->string;
+           my @head = ();
+           my @tail = ();
+           if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
+               push @head, $1;
+               push @tail, $3;
+               $s0 = $2;
+           }
+           push @head, split_js $s0;
+           $it->set_js_data( identify_js_translatables(@head, @tail) );
+       }
+       $this->_set_pcdata_mode, 0;
+       $this->_set_cdata_close, undef unless !defined $it;
+    }
+    return $it;
+}
+
+sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
+    my($t) = @_;
+    return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
+       || ($t->type == TmplTokenType::DIRECTIVE
+               && $t->string =~ /^(?:$re_tmpl_var)$/os)
+       || ($t->type == TmplTokenType::TAG
+               && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
+               || ($t->string =~ /^<input\b/is
+                   && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
+                   ))
+}
+
+sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
+    my($t) = @_;
+    return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
+       || ($t->type == TmplTokenType::DIRECTIVE
+               && $t->string =~ /^(?:$re_tmpl_var)$/os)
+       || ($t->type == TmplTokenType::TAG
+               && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
+               || ($t->string =~ /^<input\b/is
+                   && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
+}
+
+sub _quote_cformat ($) {
+    my($s) = @_;
+    $s =~ s/%/%%/g;
+    return $s;
+}
+
+sub string_canon ($) {
+    my($s) = @_;
+    if (1) { # FIXME
+       # Fold all whitespace into single blanks
+       $s =~ s/\s+/ /gs;
+    }
+    return $s;
+}
+
+sub _formalize_string_cformat ($) {
+    my($s) = @_;
+    return _quote_cformat string_canon $s;
+}
+
+sub _formalize ($) {
+    my($t) = @_;
+    return $t->type == TmplTokenType::DIRECTIVE? '%s':
+          $t->type == TmplTokenType::TEXT?
+                  _formalize_string_cformat($t->string):
+          $t->type == TmplTokenType::TAG?
+                  ($t->string =~ /^<a\b/is? '<a>':
+                   $t->string =~ /^<input\b/is? (
+                           lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
+                           '%p'):
+                   _quote_cformat($t->string)):
+              _quote_cformat($t->string);
+}
+
+sub _optimize {
+    my $this = shift;
+    my @structure = @_;
+    my $undo_trailing_blanks = sub {
+               for (my $i = $#structure; $i >= 0; $i -= 1) {
+               last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
+                   # Queue element structure: [reanalysis-p, token]
+                   push @{$this->{_queue}}, [1, pop @structure];
+               }
+           };
+    &$undo_trailing_blanks;
+    while (@structure >= 2) {
+       my $something_done_p = 0;
+       # FIXME: If the last token is a close tag but there are no tags
+       # FIXME: before it, drop the close tag back into the queue. This
+       # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
+       if (@structure >= 2
+               && $structure[$#structure]->type == TmplTokenType::TAG
+               && $structure[$#structure]->string =~ /^<\//s) {
+           my $has_other_tags_p = 0;
+           for (my $i = 0; $i < $#structure; $i += 1) {
+               $has_other_tags_p = 1
+                       if $structure[$i]->type == TmplTokenType::TAG;
+           last if $has_other_tags_p;
+           }
+           if (!$has_other_tags_p) {
+               push @{$this->{_queue}}, [0, pop @structure]
+               &$undo_trailing_blanks;
+               $something_done_p = 1;
+           }
+       }
+       # FIXME: Do the same ugly hack for the last token being a ( or [
+       if (@structure >= 2
+               && $structure[$#structure]->type == TmplTokenType::TEXT
+               && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
+           push @{$this->{_queue}}, [1, pop @structure];
+           &$undo_trailing_blanks;
+           $something_done_p = 1;
+       }
+       # FIXME: If the first token is an open tag, but there is no
+       # FIXME: corresponding close tag, "drop the open tag", i.e.,
+       # FIXME: requeue everything for reanalysis, except the frist tag. :-(
+       if (@structure >= 2
+               && $structure[0]->type == TmplTokenType::TAG
+               && $structure[0]->string =~ /^<([a-z0-9]+)/is
+               && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
+       ) {
+           my $tag_open_count = 1;
+           for (my $i = 1; $i <= $#structure; $i += 1) {
+               if ($structure[$i]->type == TmplTokenType::TAG) {
+                   if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
+                       $tag_open_count += ($1? -1: +1);
+                   }
+               }
+           }
+           if ($tag_open_count > 0) {
+               for (my $i = $#structure; $i; $i -= 1) {
+                   push @{$this->{_queue}}, [1, pop @structure];
+               }
+               $something_done_p = 1;
+           }
+       }
+       # FIXME: If the first token is an open tag, the last token is the
+       # FIXME: corresponding close tag, and there are no other close tags 
+       # FIXME: inbetween, requeue the tokens from the second token on,
+       # FIXME: flagged as ok for re-analysis
+       if (@structure >= 3
+               && $structure[0]->type == TmplTokenType::TAG
+               && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
+               && $structure[$#structure]->type == TmplTokenType::TAG
+               && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
+           my $has_other_open_or_close_tags_p = 0;
+           for (my $i = 1; $i < $#structure; $i += 1) {
+               $has_other_open_or_close_tags_p = 1
+                       if $structure[$i]->type == TmplTokenType::TAG
+                       && $structure[$i]->string =~ /^<\/?$tag\b/is;
+           last if $has_other_open_or_close_tags_p;
+           }
+           if (!$has_other_open_or_close_tags_p) {
+               for (my $i = $#structure; $i; $i -= 1) {
+                   push @{$this->{_queue}}, [1, pop @structure];
+               }
+               $something_done_p = 1;
+           }
+       }
+    last if !$something_done_p;
+    }
+    return @structure;
+}
+
+sub looks_plausibly_like_groupable_text_p (@) {
+    my @structure = @_;
+    # The text would look plausibly groupable if all open tags are also closed.
+    my @tags = ();
+    my $error_p = 0;
+    for (my $i = 0; $i <= $#structure; $i += 1) {
+       if ($structure[$i]->type == TmplTokenType::TAG) {
+           my $form = $structure[$i]->string;
+           if ($form =~ /^<([A-Z0-9]+)/is) {
+               my $tag = lc($1);
+               if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
+                   push @tags, $tag;
+               }
+           } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
+               if (@tags && lc($1) eq $tags[$#tags]) {
+                   pop @tags;
+               } else {
+                   $error_p = 1;
+               }
+           }
+       } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
+           $error_p = 1;
+       }
+    last if $error_p;
+    }
+    return !$error_p && !@tags;
+}
+
+sub next_token {
+    my $this = shift;
+    my $h = $this->_handle;
+    my $it;
+    $this->{_queue} = [] unless defined $this->{_queue};
+
+    # Elements in the queue are ordered pairs. The first in the ordered pair
+    # specifies whether we are allowed to reanalysis; the second is the token.
+    if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
+       $it = (pop @{$this->{_queue}})->[1];
+    } else {
+       if (@{$this->{_queue}}) {
+           $it = (pop @{$this->{_queue}})->[1];
+       } else {
+           $it = $this->_next_token_intermediate($h);
+       }
+       if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
+           && ($it->type == TmplTokenType::TEXT?
+               !blank_p( $it->string ): _token_groupable1_p( $it ))) {
+           my @structure = ( $it );
+           my @tags = ();
+           my $next = undef;
+           my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
+           if ($it->type == TmplTokenType::TEXT) {
+               $nonblank_text_p = 1 if !blank_p( $it->string );
+           } elsif ($it->type == TmplTokenType::DIRECTIVE) {
+               $parametrized_p = 1;
+           } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
+               my $tag = lc($1);
+               push @tags, $tag if $tag !~ /^(?:br|input)$/i;
+               $with_anchor_p = 1 if $tag eq 'a';
+               $with_input_p = 1 if $tag eq 'input';
+           }
+           # We hate | and || in msgid strings, so we try to avoid them
+           for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
+               if (@{$this->{_queue}}) {
+                   $next = (pop @{$this->{_queue}})->[1];
+               } else {
+                   $next = $this->_next_token_intermediate($h);
+               }
+               push @structure, $next; # for consistency (with initialization)
+           last unless defined $next && _token_groupable2_p( $next );
+           last if $quit_next_p;
+               if ($next->type == TmplTokenType::TEXT) {
+                   $nonblank_text_p = 1 if !blank_p( $next->string );
+                   $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
+               } elsif ($next->type == TmplTokenType::DIRECTIVE) {
+                   $parametrized_p = 1;
+               } elsif ($next->type == TmplTokenType::TAG) {
+                   if ($next->string =~ /^<([A-Z0-9]+)/is) {
+                       my $tag = lc($1);
+                       push @tags, $tag if $tag !~ /^(?:br|input)$/i;
+                       $with_anchor_p = 1 if $tag eq 'a';
+                       $with_input_p = 1 if $tag eq 'input';
+                   } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
+                       my $close = lc($1);
+                       $quit_p = 1 unless @tags && $close eq $tags[$#tags];
+                       $quit_next_p = 1 if $close =~ /^h\d$/;
+                       pop @tags;
+                   }
+               }
+           last if $quit_p;
+           }
+           # Undo the last token, allowing reanalysis
+           push @{$this->{_queue}}, [1, pop @structure];
+           # Simply it a bit more
+           @structure = $this->_optimize( @structure );
+           if (@structure < 2) {
+               # Nothing to do
+               ;
+           } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
+               # Create the corresponding c-format string
+               my $string = join('', map { $_->string } @structure);
+               my $form = join('', map { _formalize $_ } @structure);
+               my($a_counter, $input_counter) = (0, 0);
+               $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
+               $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
+               $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
+                       $it->line_number, $it->pathname);
+               $it->set_form( $form );
+               $it->set_children( @structure );
+           } elsif ($nonblank_text_p
+                   && looks_plausibly_like_groupable_text_p( @structure )
+                   && $structure[$#structure]->type == TmplTokenType::TEXT) {
+               # Combine the strings
+               my $string = join('', map { $_->string } @structure);
+               $it = TmplToken->new($string, TmplTokenType::TEXT,
+                       $it->line_number, $it->pathname);;
+           } else {
+               # Requeue the tokens thus seen for re-emitting, allow reanalysis
+               for (;;) {
+                   push @{$this->{_queue}}, [1, pop @structure];
+               last if !@structure;
+               }
+               $it = (pop @{$this->{_queue}})->[1];
+           }
+       }
+    }
+    if (defined $it && $it->type == TmplTokenType::TEXT) {
+       my $form = string_canon $it->string;
+       $it->set_form( $form );
      }
      return $it;
  }
  
  ###############################################################################
  
-# Other easy functions
+# Other simple functions (These are not methods)
  
  sub blank_p ($) {
      my($s) = @_;
-    return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var)*$/os;
+    return $s =~ /^(?:\s|\&nbsp$re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
  }
  
  sub trim ($) {
@@ -414,26 +893,255 @@ sub trim ($) {
      return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
  }
  
+sub quote_po ($) {
+    my($s) = @_;
+    # Locale::PO->quote is buggy, it doesn't quote newlines :-/
+    $s =~ s/([\\"])/\\\1/gs;
+    $s =~ s/\n/\\n/g;
+    #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
+    return "\"$s\"";
+}
+
+# Some functions that shouldn't be here... should be moved out some time
+sub parametrize ($$$$) {
+    my($fmt_0, $cformat_p, $t, $f) = @_;
+    my $it = '';
+    if ($cformat_p) {
+       my @params = $t->parameters_and_fields;
+       for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
+           if ($fmt =~ /^[^%]+/) {
+               $fmt = $';
+               $it .= $&;
+           } elsif ($fmt =~ /^%%/) {
+               $fmt = $';
+               $it .= '%';
+           } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
+               $n += 1;
+               my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
+               $fmt = $';
+               if (defined $width && defined $prec && !$width && !$prec) {
+                   ;
+               } elsif (defined $params[$i - 1]) {
+                   my $param = $params[$i - 1];
+                   warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
+                           . $param->type->to_string . "\n", undef
+                           if $param->type != TmplTokenType::DIRECTIVE;
+                   warn_normal "$fmt_0: $&: Unsupported "
+                               . "field width or precision\n", undef
+                           if defined $width || defined $prec;
+                   warn_normal "$fmt_0: $&: Parameter $i not known", undef
+                           unless defined $param;
+                   $it .= defined $f? &$f( $param ): $param->string;
+               }
+           } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
+               $n += 1;
+               my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
+               $fmt = $';
+
+               my $param = $params[$i - 1];
+               if (!defined $param) {
+                   warn_normal "$fmt_0: $&: Parameter $i not known", undef;
+               } else {
+                   if ($param->type == TmplTokenType::TAG
+                           && $param->string =~ /^<input\b/is) {
+                       my $type = defined $param->attributes?
+                               lc($param->attributes->{'type'}->[1]): undef;
+                       if ($conv eq 'S') {
+                           warn_normal "$fmt_0: $&: Expected type=text, "
+                                       . "but found type=$type", undef
+                                   unless $type eq 'text';
+                       } elsif ($conv eq 'p') {
+                           warn_normal "$fmt_0: $&: Expected type=radio, "
+                                       . "but found type=$type", undef
+                                   unless $type eq 'radio';
+                       }
+                   } else {
+                       warn_normal "$&: Expected an INPUT, but found a "
+                               . $param->type->to_string . "\n", undef
+                   }
+                   warn_normal "$fmt_0: $&: Unsupported "
+                               . "field width or precision\n", undef
+                           if defined $width || defined $prec;
+                   $it .= defined $f? &$f( $param ): $param->string;
+               }
+           } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
+               $fmt = $';
+               $it .= $&;
+               die "$&: Unknown or unsupported format specification\n"; #XXX
+           } else {
+               die "$&: Completely confused parametrizing\n";#XXX
+           }
+       }
+    }
+    my @anchors = $t->anchors;
+    for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
+       if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
+           $fmt = $';
+           $it .= $&;
+       } elsif ($fmt =~ /^<a(\d+)>/is) {
+           $n += 1;
+           my $i  = $1;
+           $fmt = $';
+           my $anchor = $anchors[$i - 1];
+           warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
+                   unless defined $anchor;
+           $it .= $anchor->string;
+       } else {
+           die "Completely confused decoding anchors: $fmt\n";#XXX
+       }
+    }
+    return $it;
+}
+
+sub charset_canon ($) {
+    my($charset) = @_;
+    $charset = uc($charset);
+    $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
+    $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
+    return $charset;
+}
+
+use vars qw( @latin1_utf8 );
+@latin1_utf8 = (
+    "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
+    "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
+    "\302\214", "\302\215",   undef,      undef,    "\302\220", "\302\221",
+    "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
+    "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
+    "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
+    "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
+    "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
+    "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
+    "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
+    "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
+    "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
+    "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
+    "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
+    "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
+    "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
+    "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
+    "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
+    "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
+    "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
+    "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
+    "\303\276", "\303\277" );
+
+sub charset_convert ($$$) {
+    my($s, $charset_in, $charset_out) = @_;
+    if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
+       ;
+    } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
+       $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
+    } elsif ($charset_in ne $charset_out) {
+       VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
+    }
+    return $s;
+}
+
  ###############################################################################
  
-=head1 FUTURE PLANS
+=pod
+
+In addition to the basic scanning, this class will also perform
+the following:
+
+=over
+
+=item -
+
+Emulation of c-format strings (see below)
+
+=item -
+
+Display of warnings for certain things that affects either the
+ability of this class to yield correct output, or things that
+are known to cause the original template to cause trouble.
+
+=item -
+
+Automatic correction of some of the things warned about
+(e.g., SGML "closed start tag" notation).
+
+=back
+
+=head2 c-format strings emulation
+
+Because English word order is not universal, a simple extraction
+of translatable strings may yield some strings like "Accounts for"
+or ambiguous strings like "in". This makes the resulting strings
+difficult to translate, but does not affect all languages alike.
+For example, Chinese (with a somewhat different word order) would
+be hit harder, but French would be relatively unaffected.
+
+To overcome this problem, the scanner can be configured to detect
+patterns with <TMPL_VAR> directives (as well as certain HTML tags),
+and try to construct a larger pattern that will appear in the PO
+file as c-format strings with %s placeholders. This additional
+step allows the translator to deal with cases where word order
+is different (replacing %s with %1$s, %2$s, etc.), or when certain
+words will require certain inflectional suffixes in sentences.
+
+Because this is an incompatible change, this mode must be explicitly
+turned on using the set_cformat(1) method call.
  
-Code could be written to detect template variables and
-construct gettext-c-format-string-like meta-strings (e.g., "Results %s
-through %s of %s records" that will be more likely to be translatable
-to languages where word order is very unlike English word order.
-This will be relatively major rework, requiring corresponding
-rework in tmpl_process.pl
+=head2 The flag characters
  
-Gettext-style line number references would also be very helpful in
-disambiguating the strings. Ultimately, we should generate and work
-with gettext-style po files, so that translators are able to use
-tools designed for gettext.
+The character % is followed by zero or more of the following flags:
  
-An example of a string untranslatable to Chinese is "Accounts for";
-"Accounts for %s", however, would be translatable. Short words like
-"in" would also be untranslatable, not only to Chinese, but also to
-languages requiring declension of nouns.
+=over
+
+=item #
+
+The value comes from HTML <INPUT> elements.
+This abuse of the flag character is somewhat reasonable,
+since TMPL_VAR and INPUT are both variables, but of different kinds.
+
+=back
+
+=head2 The field width and precision
+
+An optional 0.0 can be specified for %s to specify
+that the <TMPL_VAR> should be suppressed.
+
+=head2 The conversion specifier
+
+=over
+
+=item p
+
+Specifies any input field that is neither text nor hidden
+(which currently mean radio buttons).
+The p conversion specifier is chosen because this does not
+evoke any certain sensible data type.
+
+=item S
+
+Specifies a text input field (<INPUT TYPE=TEXT>).
+This use of the S conversion specifier is somewhat reasonable,
+since text input fields contain values of undeterminable type,
+which can be treated as strings.
+
+=item s
+
+Specifies a <TMPL_VAR>.
+This use of the o conversion specifier is somewhat reasonable,
+since <TMPL_VAR> denotes values of undeterminable type, which
+can be treated as strings.
+
+=back
+
+=head1 BUGS
+
+There is no code to save the tag name anywhere in the scanned token.
+
+The use of <AI<i>> to stand for the I<i>th anchor
+is not very well thought out.
+Some abuse of c-format specifies might have been more appropriate.
+
+=head1 HISTORY
+
+This tokenizer is mostly based
+on Ambrose's hideous Perl script known as subst.pl.
  
  =cut