+ if ($eof_p) {
+ $it = undef;
+ error_normal "Unexpected end of file while looking for "
+ . $this->cdata_close
+ . "\n", $this->line_number_start;
+ $this->_set_fatal( 1 );
+ $this->_set_syntaxerror( 1 );
+ }
+ if ($this->pcdata_mode_p) {
+ my $check = $it;
+ $check =~ s/$re_directive//gos;
+ warn_pedantic "Markup found in PCDATA\n", $this->line_number,
+ \$pedantic_error_markup_in_pcdata_p
+ if $check =~ /$re_tag_compat/s;
+ }
+ # PCDATA should be treated as text, not CDATA
+ # Actually it should be treated as TEXT_PARAMETRIZED :-(
+ $it = TmplToken->new( $it,
+ ($this->pcdata_mode_p?
+ TmplTokenType::TEXT: TmplTokenType::CDATA),
+ $this->line_number )
+ if defined $it;
+ $this->_set_pcdata_mode, 0;
+ $this->_set_cdata_close, undef unless !defined $it;
+ }
+ return $it;
+}
+
+sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
+ my($t) = @_;
+ return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
+ || ($t->type == TmplTokenType::DIRECTIVE
+ && $t->string =~ /^(?:$re_tmpl_var)$/os)
+ || ($t->type == TmplTokenType::TAG
+ && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
+ || ($t->string =~ /^<input\b/is
+ && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
+ ))
+}
+
+sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
+ my($t) = @_;
+ return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
+ || ($t->type == TmplTokenType::DIRECTIVE
+ && $t->string =~ /^(?:$re_tmpl_var)$/os)
+ || ($t->type == TmplTokenType::TAG
+ && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
+ || ($t->string =~ /^<input\b/is
+ && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
+}
+
+sub _quote_cformat ($) {
+ my($s) = @_;
+ $s =~ s/%/%%/g;
+ return $s;
+}
+
+sub string_canon ($) {
+ my($s) = @_;
+ if (1) { # FIXME
+ # Fold all whitespace into single blanks
+ $s =~ s/\s+/ /gs;
+ }
+ return $s;
+}
+
+sub _formalize_string_cformat ($) {
+ my($s) = @_;
+ return _quote_cformat string_canon $s;
+}
+
+sub _formalize ($) {
+ my($t) = @_;
+ return $t->type == TmplTokenType::DIRECTIVE? '%s':
+ $t->type == TmplTokenType::TEXT?
+ _formalize_string_cformat($t->string):
+ $t->type == TmplTokenType::TAG?
+ ($t->string =~ /^<a\b/is? '<a>':
+ $t->string =~ /^<input\b/is? (
+ lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
+ '%p'):
+ _quote_cformat($t->string)):
+ _quote_cformat($t->string);
+}
+
+sub _optimize {
+ my $this = shift;
+ my @structure = @_;
+ my $undo_trailing_blanks = sub {
+ for (my $i = $#structure; $i >= 0; $i -= 1) {
+ last if $structure[$i]->type != TmplTokenType::TEXT;
+ last if !blank_p($structure[$i]->string);
+ # Queue element structure: [reanalysis-p, token]
+ push @{$this->{_queue}}, [1, pop @structure];
+ }
+ };
+ &$undo_trailing_blanks;
+ while (@structure >= 2) {
+ my $something_done_p = 0;
+ # FIXME: If the last token is a close tag but there are no tags
+ # FIXME: before it, drop the close tag back into the queue. This
+ # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
+ if (@structure >= 2
+ && $structure[$#structure]->type == TmplTokenType::TAG
+ && $structure[$#structure]->string =~ /^<\//s) {
+ my $has_other_tags_p = 0;
+ for (my $i = 0; $i < $#structure; $i += 1) {
+ $has_other_tags_p = 1
+ if $structure[$i]->type == TmplTokenType::TAG;
+ last if $has_other_tags_p;
+ }
+ if (!$has_other_tags_p) {
+ push @{$this->{_queue}}, [0, pop @structure]
+ &$undo_trailing_blanks;
+ $something_done_p = 1;
+ }
+ }
+ # FIXME: Do the same ugly hack for the last token being a ( or [
+ if (@structure >= 2
+ && $structure[$#structure]->type == TmplTokenType::TEXT
+ && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
+ push @{$this->{_queue}}, [1, pop @structure];
+ &$undo_trailing_blanks;
+ $something_done_p = 1;
+ }
+ # FIXME: If the first token is an open tag, the last token is the
+ # FIXME: corresponding close tag, and there are no other close tags
+ # FIXME: inbetween, requeue the tokens from the second token on,
+ # FIXME: flagged as ok for re-analysis
+ if (@structure >= 3
+ && $structure[0]->type == TmplTokenType::TAG
+ && $structure[0]->string =~ /^<([a-z0-9])/is && (my $tag = $1)
+ && $structure[$#structure]->type == TmplTokenType::TAG
+ && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
+ my $has_other_open_or_close_tags_p = 0;
+ for (my $i = 1; $i < $#structure; $i += 1) {
+ $has_other_open_or_close_tags_p = 1
+ if $structure[$i]->type == TmplTokenType::TAG
+ && $structure[$i]->string =~ /^<\/?$tag\b/is;
+ last if $has_other_open_or_close_tags_p;
+ }
+ if (!$has_other_open_or_close_tags_p) {
+ for (my $i = $#structure; $i; $i -= 1) {
+ push @{$this->{_queue}}, [1, pop @structure];
+ }
+ $something_done_p = 1;
+ }
+ }
+ last if !$something_done_p;
+ }
+ return @structure;
+}
+
+sub looks_plausibly_like_groupable_text_p (@) {
+ my @structure = @_;
+ # The text would look plausibly groupable if all open tags are also closed.
+ my @tags = ();
+ my $error_p = 0;
+ for (my $i = 0; $i <= $#structure; $i += 1) {
+ if ($structure[$i]->type == TmplTokenType::TAG) {
+ if ($structure[$i]->string =~ /^<([A-Z0-9]+)/is) {
+ my $tag = lc($1);
+ push @tags, $tag unless $tag =~ /^<(?:input)/is
+ || $tag =~ /\/>$/is;
+ } elsif ($structure[$i]->string =~ /^<\/([A-Z0-9]+)/is) {
+ if (@tags && lc($1) eq $tags[$#tags]) {
+ pop @tags;
+ } else {
+ $error_p = 1;
+ }
+ }
+ } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
+ $error_p = 1;
+ }
+ last if $error_p;
+ }
+ return !$error_p && !@tags;
+}
+
+sub next_token {
+ my $this = shift;
+ my $h = $this->_handle;
+ my $it;
+ $this->{_queue} = [] unless defined $this->{_queue};
+
+ # Elements in the queue are ordered pairs. The first in the ordered pair
+ # specifies whether we are allowed to reanalysis; the second is the token.
+ if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
+ $it = (pop @{$this->{_queue}})->[1];
+ } else {
+ if (@{$this->{_queue}}) {
+ $it = (pop @{$this->{_queue}})->[1];
+ } else {
+ $it = $this->_next_token_intermediate($h);
+ }
+ if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
+ && ($it->type == TmplTokenType::TEXT?
+ !blank_p( $it->string ): _token_groupable1_p( $it ))) {
+ my @structure = ( $it );
+ my @tags = ();
+ my $next = undef;
+ my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
+ if ($it->type == TmplTokenType::TEXT) {
+ $nonblank_text_p = 1 if !blank_p( $it->string );
+ } elsif ($it->type == TmplTokenType::DIRECTIVE) {
+ $parametrized_p = 1;
+ } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
+ push @tags, lc($1);
+ $with_anchor_p = 1 if lc($1) eq 'a';
+ $with_input_p = 1 if lc($1) eq 'input';
+ }
+ # We hate | and || in msgid strings, so we try to avoid them
+ for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
+ if (@{$this->{_queue}}) {
+ $next = (pop @{$this->{_queue}})->[1];
+ } else {
+ $next = $this->_next_token_intermediate($h);
+ }
+ push @structure, $next; # for consistency (with initialization)
+ last unless defined $next && _token_groupable2_p( $next );
+ last if $quit_next_p;
+ if ($next->type == TmplTokenType::TEXT) {
+ $nonblank_text_p = 1 if !blank_p( $next->string );
+ $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
+ } elsif ($next->type == TmplTokenType::DIRECTIVE) {
+ $parametrized_p = 1;
+ } elsif ($next->type == TmplTokenType::TAG) {
+ if ($next->string =~ /^<([A-Z0-9]+)/is) {
+ my $candidate = lc($1);
+ push @tags, $candidate
+ unless $candidate =~ /^(?:input)$/is;
+ $with_anchor_p = 1 if lc($1) eq 'a';
+ $with_input_p = 1 if lc($1) eq 'input';
+ } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
+ my $close = lc($1);
+ $quit_p = 1 unless @tags && $close eq $tags[$#tags];
+ $quit_next_p = 1 if $close =~ /^h\d$/;
+ pop @tags;
+ }
+ }
+ last if $quit_p;
+ }
+ # Undo the last token, allowing reanalysis
+ push @{$this->{_queue}}, [1, pop @structure];
+ # Simply it a bit more
+ @structure = $this->_optimize( @structure );
+ if (@structure < 2) {
+ # Nothing to do
+ ;
+ } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
+ # Create the corresponding c-format string
+ my $string = join('', map { $_->string } @structure);
+ my $form = join('', map { _formalize $_ } @structure);
+ my($a_counter, $input_counter) = (0, 0);
+ $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
+ $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
+ $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
+ $it->line_number, $it->pathname);
+ $it->set_form( $form );
+ $it->set_children( @structure );
+ } elsif ($nonblank_text_p
+ && looks_plausibly_like_groupable_text_p( @structure )
+ && $structure[$#structure]->type == TmplTokenType::TEXT) {
+ # Combine the strings
+ my $string = join('', map { $_->string } @structure);
+ $it = TmplToken->new($string, TmplTokenType::TEXT,
+ $it->line_number, $it->pathname);;
+ } else {
+ # Requeue the tokens thus seen for re-emitting, allow reanalysis
+ for (;;) {
+ push @{$this->{_queue}}, [1, pop @structure];
+ last if !@structure;
+ }
+ $it = (pop @{$this->{_queue}})->[1];
+ }
+ }
+ }
+ if (defined $it && $it->type == TmplTokenType::TEXT) {
+ my $form = string_canon $it->string;
+ $it->set_form( $form );