package TmplTokenizer;
use strict;
+#use warnings; FIXME - Bug 2505
use TmplTokenType;
use TmplToken;
use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
+use vars qw( $pedantic_error_markup_in_pcdata_p );
###############################################################################
# Hideous stuff
use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
-use vars qw( $re_directive_control $re_tmpl_endif_endloop );
+use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
BEGIN {
# $re_directive must not do any backreferences
$re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
$re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
$re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
# TMPL_VAR ESCAPE=1/HTML/URL
+ $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
$re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
# Any control flow directive
$re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
my $etag = $compat? '>': '<>\/';
# This is no longer similar to the original regexp in subst.pl :-(
# Note that we don't want <> in compat mode; Mozilla knows about <
- q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:[^-]|-[^-])*--|(?:}
+ q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
. $re_directive
. q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
}
sub LINENUM () {'lc'}
sub CDATA_MODE_P () {'cdata-mode-p'}
sub CDATA_CLOSE () {'cdata-close'}
+sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
+sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
sub new {
- my $this = shift;
- my($input) = @_;
- my $class = ref($this) || $this;
- my $self = {};
- bless $self, $class;
-
- my $handle = sprintf('TMPLTOKENIZER%d', $serial);
- $serial += 1;
-
- no strict;
- open($handle, "<$input") || die "$input: $!\n";
- use strict;
- $self->{+FILENAME} = $input;
- $self->{+HANDLE} = $handle;
- $self->{+READAHEAD} = [];
- return $self;
+ shift;
+ my ($filename) = @_;
+ open my $handle,$filename or die "can't open $filename";
+ bless {
+ filename => $filename
+ , handle => $handle
+ , readahead => []
+ } , __PACKAGE__;
}
###############################################################################
sub filename {
my $this = shift;
- return $this->{+FILENAME};
+ return $this->{filename};
}
sub _handle {
my $this = shift;
- return $this->{+HANDLE};
+ return $this->{handle};
}
sub fatal_p {
sub has_readahead_p {
my $this = shift;
- return @{$this->{+READAHEAD}};
+ return @{$this->{readahead}};
}
sub _peek_readahead {
my $this = shift;
- return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
+ return $this->{readahead}->[$#{$this->{readahead}}];
}
sub line_number_start {
return $this->{+CDATA_MODE_P};
}
+sub pcdata_mode_p {
+ my $this = shift;
+ return $this->{+PCDATA_MODE_P};
+}
+
+sub js_mode_p {
+ my $this = shift;
+ return $this->{+JS_MODE_P};
+}
+
sub cdata_close {
my $this = shift;
return $this->{+CDATA_CLOSE};
sub _push_readahead {
my $this = shift;
- push @{$this->{+READAHEAD}}, $_[0];
+ push @{$this->{readahead}}, $_[0];
return $this;
}
sub _pop_readahead {
my $this = shift;
- return pop @{$this->{+READAHEAD}};
+ return pop @{$this->{readahead}};
}
sub _append_readahead {
my $this = shift;
- $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
+ $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
return $this;
}
sub _set_readahead {
my $this = shift;
- $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
+ $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
return $this;
}
return $this;
}
+sub _set_pcdata_mode {
+ my $this = shift;
+ $this->{+PCDATA_MODE_P} = $_[0];
+ return $this;
+}
+
+sub _set_js_mode {
+ my $this = shift;
+ $this->{+JS_MODE_P} = $_[0];
+ return $this;
+}
+
sub _set_cdata_close {
my $this = shift;
$this->{+CDATA_CLOSE} = $_[0];
###############################################################################
+use vars qw( $js_EscapeSequence );
+BEGIN {
+ # Perl quoting is really screwed up, but this common subexp is way too long
+ $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
+}
+sub parenleft () { '(' }
+sub parenright () { ')' }
+
+sub split_js ($) {
+ my ($s0) = @_;
+ my @it = ();
+ while (length $s0) {
+ if ($s0 =~ /^\s+/s) { # whitespace
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
+ push @it, $&;
+ $s0 = $';
+ # Keyword or identifier, ECMA-262 p.13 (section 7.5)
+ } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
+ push @it, $&;
+ $s0 = $';
+ # Punctuator, ECMA-262 p.13 (section 7.6)
+ } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
+ push @it, $&;
+ $s0 = $';
+ # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
+ } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
+ push @it, $&;
+ $s0 = $';
+ # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+ } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
+ push @it, $&;
+ $s0 = $';
+ # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+ } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
+ push @it, $&;
+ $s0 = $';
+ # StringLiteral, ECMA-262 p.17 (section 7.7.4)
+ # XXX SourceCharacter doesn't seem to be defined (?)
+ } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
+ push @it, $&;
+ $s0 = $';
+ }
+ }
+ return @it;
+}
+
+sub STATE_UNDERSCORE () { 1 }
+sub STATE_PARENLEFT () { 2 }
+sub STATE_STRING_LITERAL () { 3 }
+
+# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
+# XXX A scanner is one thing; a parser another thing.
+sub identify_js_translatables (@) {
+ my @input = @_;
+ my @output = ();
+ # We mark a JavaScript translatable string as in C, i.e., _("literal")
+ # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
+ for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
+ my $reset_state_p = 0;
+ push @output, [0, $input[$i]];
+ if ($input[$i] !~ /\S/s) {
+ ;
+ } elsif ($state == 0) {
+ $state = STATE_UNDERSCORE if $input[$i] eq '_';
+ } elsif ($state == STATE_UNDERSCORE) {
+ $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
+ } elsif ($state == STATE_PARENLEFT) {
+ if ($input[$i] =~ /^(['"])(.*)\1$/s) {
+ ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
+ } else {
+ $state = 0;
+ }
+ } elsif ($state == STATE_STRING_LITERAL) {
+ if ($input[$i] eq parenright) {
+ $output[$j] = [1, $output[$j]->[1], $q, $s];
+ }
+ $state = 0;
+ } else {
+ die "identify_js_translatables internal error: Unknown state $state"
+ }
+ }
+ return \@output;
+}
+
+###############################################################################
+
sub _extract_attributes ($;$) {
my $this = shift;
my($s, $lc) = @_;
my %attr;
- $s = $1 if $s =~ /^<\S+(.*)\/\S$/s # XML-style self-closing tags
- || $s =~ /^<\S+(.*)\S$/s; # SGML-style tags
+ $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags
+ || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags
for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
my($key, $val, $val_orig, $rest)
error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
$this->_set_fatal( 1 );
} else {
- warn_normal "Strange attribute syntax: $s\n", $lc;
+ # There's something wrong with the attribute syntax.
+ # We might be able to deduce a likely cause by looking more.
+ if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
+ warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
+ } else {
+ warn_normal "Strange attribute syntax: $s\n", $lc;
+ }
}
}
return \%attr;
if !$this->cdata_mode_p && $it =~ /</s;
} else { # tag/declaration/processing instruction
my $ok_p = 0;
+ my $bad_comment_p = 0;
for (my $cdata_close = $this->cdata_close;;) {
if ($this->cdata_mode_p) {
my $next = $this->_pop_readahead;
- if ($next =~ /^$cdata_close/) {
+ if ($next =~ /^$cdata_close/is) {
($kind, $it) = (TmplTokenType::TAG, $&);
$this->_push_readahead( $' );
$ok_p = 1;
- } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/) {
+ } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
($kind, $it) = (TmplTokenType::TEXT, $1);
$this->_push_readahead( "$2$'" );
$ok_p = 1;
my($head, $tail, $post) = ($1, $2, $3);
if ($tail eq '' && $post =~ $re_tmpl_var) {
# Don't bother to show the warning if we're too confused
+ # FIXME. There's no method for _closed_start_tag_warning
+ if (!defined $this->{'_closed_start_tag_warning'}
+ || ($this->{'_closed_start_tag_warning'}->[0] eq $head
+ && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
if split(/\n/, $head) < 10;
+ }
+ $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
} else {
($kind, $it) = (TmplTokenType::TAG, "$head>");
$this->_set_readahead( $post );
$ok_p = 1;
- warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
+ warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq ''
+ and $head ne '<!DOCTYPE stylesheet ['; # another bit of temporary ugliness for bug 4472
}
- } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->).)*-->/s) {
+ } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
($kind, $it) = (TmplTokenType::COMMENT, $&);
$this->_set_readahead( $' );
$ok_p = 1;
- warn_normal "Syntax error in comment: $&\n", $this->line_number_start;
- $this->_set_syntaxerror( 1 );
+ $bad_comment_p = 1;
}
last if $ok_p;
my $next = scalar <$h>;
}
if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
$kind = TmplTokenType::DIRECTIVE;
+ } elsif ($bad_comment_p) {
+ warn_normal sprintf("Syntax error in comment: %s\n", $it),
+ $this->line_number_start;
+ $this->_set_syntaxerror( 1 );
}
if (!$ok_p && $eof_p) {
($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
if (!$this->cdata_mode_p) {
$it = $this->_next_token_internal($h);
if (defined $it && $it->type == TmplTokenType::TAG) {
- if ($it->string =~ /^<(script|style|textarea)\b/i) {
+ if ($it->string =~ /^<(script|style|textarea)\b/is ||
+ ($this->filename =~ /(opensearch)|(opac-showreviews-rss)/ && $it->string =~ /^<(description)\b/) # FIXME special case to handle
+ # a CDATA in opac-opensearch.tmpl and opac-showreviews-rss.tmpl
+ ) {
$this->_set_cdata_mode( 1 );
$this->_set_cdata_close( "</$1\\s*>" );
+ $this->_set_pcdata_mode( 0 );
+ $this->_set_js_mode( lc($1) eq 'script' );
+# } elsif ($it->string =~ /^<(title)\b/is) {
+# $this->_set_cdata_mode( 1 );
+# $this->_set_cdata_close( "</$1\\s*>" );
+# $this->_set_pcdata_mode( 1 );
}
$it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
}
} else {
+ my $eof_p = 0;
for ($it = '', my $cdata_close = $this->cdata_close;;) {
my $next = $this->_next_token_internal($h);
- last if !defined $next;
- if (defined $next && $next->string =~ /$cdata_close/i) {
+ $eof_p = !defined $next;
+ last if $eof_p;
+ if (defined $next && $next->string =~ /$cdata_close/is) {
$this->_push_readahead( $next ); # push entire TmplToken object
$this->_set_cdata_mode( 0 );
}
last unless $this->cdata_mode_p;
$it .= $next->string;
}
- $it = TmplToken->new( $it, TmplTokenType::CDATA, $this->line_number );
- $this->_set_cdata_close, undef;
+ if ($eof_p) {
+ $it = undef;
+ error_normal "Unexpected end of file while looking for "
+ . $this->cdata_close
+ . "\n", $this->line_number_start;
+ $this->_set_fatal( 1 );
+ $this->_set_syntaxerror( 1 );
+ }
+ if ($this->pcdata_mode_p) {
+ my $check = $it;
+ $check =~ s/$re_directive//gos;
+ warn_pedantic "Markup found in PCDATA\n", $this->line_number,
+ \$pedantic_error_markup_in_pcdata_p
+ if $check =~ /$re_tag_compat/s;
+ }
+ # PCDATA should be treated as text, not CDATA
+ # Actually it should be treated as TEXT_PARAMETRIZED :-(
+ $it = TmplToken->new( $it,
+ ($this->pcdata_mode_p?
+ TmplTokenType::TEXT: TmplTokenType::CDATA),
+ $this->line_number, $this->filename )
+ if defined $it;
+ if ($this->js_mode_p) {
+ my $s0 = $it->string;
+ my @head = ();
+ my @tail = ();
+ if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
+ push @head, $1;
+ push @tail, $3;
+ $s0 = $2;
+ }
+ push @head, split_js $s0;
+ $it->set_js_data( identify_js_translatables(@head, @tail) );
+ }
+ $this->_set_pcdata_mode, 0;
+ $this->_set_cdata_close, undef unless !defined $it;
}
return $it;
}
sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
my($t) = @_;
- return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
+ return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
|| ($t->type == TmplTokenType::DIRECTIVE
&& $t->string =~ /^(?:$re_tmpl_var)$/os)
|| ($t->type == TmplTokenType::TAG
- && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
- || ($t->string =~ /^<input/i
- && $t->attributes->{'type'} =~ /^(?:text)$/i)))
+ && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
+ || ($t->string =~ /^<input\b/is
+ && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
+ ))
}
sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
my($t) = @_;
- return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/s))
+ return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
|| ($t->type == TmplTokenType::DIRECTIVE
&& $t->string =~ /^(?:$re_tmpl_var)$/os)
|| ($t->type == TmplTokenType::TAG
&& ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
- || ($t->string =~ /^<input/i
- && $t->attributes->{'type'} =~ /^(?:text)$/i)))
+ || ($t->string =~ /^<input\b/is
+ && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
}
sub _quote_cformat ($) {
$t->type == TmplTokenType::TEXT?
_formalize_string_cformat($t->string):
$t->type == TmplTokenType::TAG?
- ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
+ ($t->string =~ /^<a\b/is? '<a>':
+ $t->string =~ /^<input\b/is? (
+ lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
+ '%p'):
+ _quote_cformat($t->string)):
_quote_cformat($t->string);
}
my @structure = @_;
my $undo_trailing_blanks = sub {
for (my $i = $#structure; $i >= 0; $i -= 1) {
- last if $structure[$i]->type != TmplTokenType::TEXT;
- last if !blank_p($structure[$i]->string);
- push @{$this->{_queue}}, pop @structure;
+ last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
+ # Queue element structure: [reanalysis-p, token]
+ push @{$this->{_queue}}, [1, pop @structure];
}
};
&$undo_trailing_blanks;
- # FIXME: If the last token is a close tag but there are no tags
- # FIXME: before it, drop the close tag back into the queue. This
- # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
- if (@structure >= 2
- && $structure[$#structure]->type == TmplTokenType::TAG
- && $structure[$#structure]->string =~ /^<\//s) {
- my $has_other_tags_p = 0;
- for (my $i = 0; $i < $#structure; $i += 1) {
- $has_other_tags_p = 1 if $structure[$i]->type == TmplTokenType::TAG;
- last if $has_other_tags_p;
+ while (@structure >= 2) {
+ my $something_done_p = 0;
+ # FIXME: If the last token is a close tag but there are no tags
+ # FIXME: before it, drop the close tag back into the queue. This
+ # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
+ if (@structure >= 2
+ && $structure[$#structure]->type == TmplTokenType::TAG
+ && $structure[$#structure]->string =~ /^<\//s) {
+ my $has_other_tags_p = 0;
+ for (my $i = 0; $i < $#structure; $i += 1) {
+ $has_other_tags_p = 1
+ if $structure[$i]->type == TmplTokenType::TAG;
+ last if $has_other_tags_p;
+ }
+ if (!$has_other_tags_p) {
+ push @{$this->{_queue}}, [0, pop @structure]
+ &$undo_trailing_blanks;
+ $something_done_p = 1;
+ }
}
- push @{$this->{_queue}}, pop @structure unless $has_other_tags_p;
- &$undo_trailing_blanks;
- }
- # FIXME: Do the same ugly hack for the last token being a ( or [
- if (@structure >= 2
- && $structure[$#structure]->type == TmplTokenType::TEXT
- && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
- push @{$this->{_queue}}, pop @structure;
- &$undo_trailing_blanks;
+ # FIXME: Do the same ugly hack for the last token being a ( or [
+ if (@structure >= 2
+ && $structure[$#structure]->type == TmplTokenType::TEXT
+ && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
+ push @{$this->{_queue}}, [1, pop @structure];
+ &$undo_trailing_blanks;
+ $something_done_p = 1;
+ }
+ # FIXME: If the first token is an open tag, but there is no
+ # FIXME: corresponding close tag, "drop the open tag", i.e.,
+ # FIXME: requeue everything for reanalysis, except the frist tag. :-(
+ if (@structure >= 2
+ && $structure[0]->type == TmplTokenType::TAG
+ && $structure[0]->string =~ /^<([a-z0-9]+)/is
+ && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
+ ) {
+ my $tag_open_count = 1;
+ for (my $i = 1; $i <= $#structure; $i += 1) {
+ if ($structure[$i]->type == TmplTokenType::TAG) {
+ if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
+ $tag_open_count += ($1? -1: +1);
+ }
+ }
+ }
+ if ($tag_open_count > 0) {
+ for (my $i = $#structure; $i; $i -= 1) {
+ push @{$this->{_queue}}, [1, pop @structure];
+ }
+ $something_done_p = 1;
+ }
+ }
+ # FIXME: If the first token is an open tag, the last token is the
+ # FIXME: corresponding close tag, and there are no other close tags
+ # FIXME: inbetween, requeue the tokens from the second token on,
+ # FIXME: flagged as ok for re-analysis
+ if (@structure >= 3
+ && $structure[0]->type == TmplTokenType::TAG
+ && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
+ && $structure[$#structure]->type == TmplTokenType::TAG
+ && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
+ my $has_other_open_or_close_tags_p = 0;
+ for (my $i = 1; $i < $#structure; $i += 1) {
+ $has_other_open_or_close_tags_p = 1
+ if $structure[$i]->type == TmplTokenType::TAG
+ && $structure[$i]->string =~ /^<\/?$tag\b/is;
+ last if $has_other_open_or_close_tags_p;
+ }
+ if (!$has_other_open_or_close_tags_p) {
+ for (my $i = $#structure; $i; $i -= 1) {
+ push @{$this->{_queue}}, [1, pop @structure];
+ }
+ $something_done_p = 1;
+ }
+ }
+ last if !$something_done_p;
}
return @structure;
}
my $error_p = 0;
for (my $i = 0; $i <= $#structure; $i += 1) {
if ($structure[$i]->type == TmplTokenType::TAG) {
- if ($structure[$i]->string =~ /^<([A-Z0-9]+)/i) {
- push @tags, lc($1);
- } elsif ($structure[$i]->string =~ /^<\/([A-Z0-9]+)/i) {
+ my $form = $structure[$i]->string;
+ if ($form =~ /^<([A-Z0-9]+)/is) {
+ my $tag = lc($1);
+ if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
+ push @tags, $tag;
+ }
+ } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
if (@tags && lc($1) eq $tags[$#tags]) {
pop @tags;
} else {
my $it;
$this->{_queue} = [] unless defined $this->{_queue};
- # Don't reparse anything in the queue. We can put a parametrized token
- # there if we need to, however.
- if (@{$this->{_queue}}) {
- $it = pop @{$this->{_queue}};
+ # Elements in the queue are ordered pairs. The first in the ordered pair
+ # specifies whether we are allowed to reanalysis; the second is the token.
+ if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
+ $it = (pop @{$this->{_queue}})->[1];
} else {
- $it = $this->_next_token_intermediate($h);
+ if (@{$this->{_queue}}) {
+ $it = (pop @{$this->{_queue}})->[1];
+ } else {
+ $it = $this->_next_token_intermediate($h);
+ }
if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
&& ($it->type == TmplTokenType::TEXT?
!blank_p( $it->string ): _token_groupable1_p( $it ))) {
my @structure = ( $it );
my @tags = ();
my $next = undef;
- my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
+ my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
if ($it->type == TmplTokenType::TEXT) {
$nonblank_text_p = 1 if !blank_p( $it->string );
} elsif ($it->type == TmplTokenType::DIRECTIVE) {
$parametrized_p = 1;
} elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
- push @tags, lc($1);
- $with_anchor_p = 1 if lc($1) eq 'a';
+ my $tag = lc($1);
+ push @tags, $tag if $tag !~ /^(?:br|input)$/i;
+ $with_anchor_p = 1 if $tag eq 'a';
+ $with_input_p = 1 if $tag eq 'input';
}
# We hate | and || in msgid strings, so we try to avoid them
for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
- $next = $this->_next_token_intermediate($h);
+ if (@{$this->{_queue}}) {
+ $next = (pop @{$this->{_queue}})->[1];
+ } else {
+ $next = $this->_next_token_intermediate($h);
+ }
push @structure, $next; # for consistency (with initialization)
last unless defined $next && _token_groupable2_p( $next );
last if $quit_next_p;
$parametrized_p = 1;
} elsif ($next->type == TmplTokenType::TAG) {
if ($next->string =~ /^<([A-Z0-9]+)/is) {
- push @tags, lc($1);
- $with_anchor_p = 1 if lc($1) eq 'a';
+ my $tag = lc($1);
+ push @tags, $tag if $tag !~ /^(?:br|input)$/i;
+ $with_anchor_p = 1 if $tag eq 'a';
+ $with_input_p = 1 if $tag eq 'input';
} elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
my $close = lc($1);
$quit_p = 1 unless @tags && $close eq $tags[$#tags];
}
last if $quit_p;
}
- # Undo the last token
- push @{$this->{_queue}}, pop @structure;
+ # Undo the last token, allowing reanalysis
+ push @{$this->{_queue}}, [1, pop @structure];
# Simply it a bit more
@structure = $this->_optimize( @structure );
if (@structure < 2) {
# Nothing to do
;
- } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
+ } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
# Create the corresponding c-format string
my $string = join('', map { $_->string } @structure);
my $form = join('', map { _formalize $_ } @structure);
- my $a_counter = 0;
+ my($a_counter, $input_counter) = (0, 0);
$form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
- $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
+ $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
+ $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
+ $it->line_number, $it->pathname);
$it->set_form( $form );
$it->set_children( @structure );
} elsif ($nonblank_text_p
- && looks_plausibly_like_groupable_text_p( @structure )
- && $structure[$#structure]->type == TmplTokenType::TEXT) {
+ && looks_plausibly_like_groupable_text_p( @structure )
+ && $structure[$#structure]->type == TmplTokenType::TEXT) {
# Combine the strings
my $string = join('', map { $_->string } @structure);
- $it = TmplToken->new($string, TmplTokenType::TEXT, $it->line_number, $it->pathname);;
+ $it = TmplToken->new($string, TmplTokenType::TEXT,
+ $it->line_number, $it->pathname);;
} else {
- # Requeue the tokens thus seen for re-emitting
+ # Requeue the tokens thus seen for re-emitting, allow reanalysis
for (;;) {
- push @{$this->{_queue}}, pop @structure;
+ push @{$this->{_queue}}, [1, pop @structure];
last if !@structure;
}
- $it = pop @{$this->{_queue}};
+ $it = (pop @{$this->{_queue}})->[1];
}
}
}
sub blank_p ($) {
my($s) = @_;
- return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
+ return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
}
sub trim ($) {
}
# Some functions that shouldn't be here... should be moved out some time
-sub parametrize ($$$) {
- my($fmt_0, $params, $anchors) = @_;
+sub parametrize ($$$$) {
+ my($fmt_0, $cformat_p, $t, $f) = @_;
my $it = '';
- for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
- if ($fmt =~ /^[^%]+/) {
- $fmt = $';
- $it .= $&;
- } elsif ($fmt =~ /^%%/) {
- $fmt = $';
- $it .= '%';
- } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/) {
- $n += 1;
- my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
- $fmt = $';
- if (!defined $width && !defined $prec) {
- $it .= $params->[$i]
- } elsif (defined $width && defined $prec && !$width && !$prec) {
- ;
+ if ($cformat_p) {
+ my @params = $t->parameters_and_fields;
+ for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
+ if ($fmt =~ /^[^%]+/) {
+ $fmt = $';
+ $it .= $&;
+ } elsif ($fmt =~ /^%%/) {
+ $fmt = $';
+ $it .= '%';
+ } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
+ $n += 1;
+ my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
+ $fmt = $';
+ if (defined $width && defined $prec && !$width && !$prec) {
+ ;
+ } elsif (defined $params[$i - 1]) {
+ my $param = $params[$i - 1];
+ warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
+ . $param->type->to_string . "\n", undef
+ if $param->type != TmplTokenType::DIRECTIVE;
+ warn_normal "$fmt_0: $&: Unsupported "
+ . "field width or precision\n", undef
+ if defined $width || defined $prec;
+ warn_normal "$fmt_0: $&: Parameter $i not known", undef
+ unless defined $param;
+ $it .= defined $f? &$f( $param ): $param->string;
+ }
+ } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
+ $n += 1;
+ my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
+ $fmt = $';
+
+ my $param = $params[$i - 1];
+ if (!defined $param) {
+ warn_normal "$fmt_0: $&: Parameter $i not known", undef;
+ } else {
+ if ($param->type == TmplTokenType::TAG
+ && $param->string =~ /^<input\b/is) {
+ my $type = defined $param->attributes?
+ lc($param->attributes->{'type'}->[1]): undef;
+ if ($conv eq 'S') {
+ warn_normal "$fmt_0: $&: Expected type=text, "
+ . "but found type=$type", undef
+ unless $type eq 'text';
+ } elsif ($conv eq 'p') {
+ warn_normal "$fmt_0: $&: Expected type=radio, "
+ . "but found type=$type", undef
+ unless $type eq 'radio';
+ }
+ } else {
+ warn_normal "$&: Expected an INPUT, but found a "
+ . $param->type->to_string . "\n", undef
+ }
+ warn_normal "$fmt_0: $&: Unsupported "
+ . "field width or precision\n", undef
+ if defined $width || defined $prec;
+ $it .= defined $f? &$f( $param ): $param->string;
+ }
+ } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
+ $fmt = $';
+ $it .= $&;
+ die "$&: Unknown or unsupported format specification\n"; #XXX
} else {
- die "Unsupported precision specification in format: $&\n"; #XXX
+ die "$&: Completely confused parametrizing\n";#XXX
}
- } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
- $fmt = $';
- $it .= $&;
- die "Unknown or unsupported format specification: $&\n"; #XXX
- } else {
- die "Completely confused parametrizing: $fmt\n";#XXX
}
}
+ my @anchors = $t->anchors;
for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
$fmt = $';
$n += 1;
my $i = $1;
$fmt = $';
- my $anchor = $anchors->[$i - 1];
+ my $anchor = $anchors[$i - 1];
warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
unless defined $anchor;
$it .= $anchor->string;
Because this is an incompatible change, this mode must be explicitly
turned on using the set_cformat(1) method call.
+=head2 The flag characters
+
+The character % is followed by zero or more of the following flags:
+
+=over
+
+=item #
+
+The value comes from HTML <INPUT> elements.
+This abuse of the flag character is somewhat reasonable,
+since TMPL_VAR and INPUT are both variables, but of different kinds.
+
+=back
+
+=head2 The field width and precision
+
+An optional 0.0 can be specified for %s to specify
+that the <TMPL_VAR> should be suppressed.
+
+=head2 The conversion specifier
+
+=over
+
+=item p
+
+Specifies any input field that is neither text nor hidden
+(which currently mean radio buttons).
+The p conversion specifier is chosen because this does not
+evoke any certain sensible data type.
+
+=item S
+
+Specifies a text input field (<INPUT TYPE=TEXT>).
+This use of the S conversion specifier is somewhat reasonable,
+since text input fields contain values of undeterminable type,
+which can be treated as strings.
+
+=item s
+
+Specifies a <TMPL_VAR>.
+This use of the o conversion specifier is somewhat reasonable,
+since <TMPL_VAR> denotes values of undeterminable type, which
+can be treated as strings.
+
+=back
+
+=head1 BUGS
+
+There is no code to save the tag name anywhere in the scanned token.
+
+The use of <AI<i>> to stand for the I<i>th anchor
+is not very well thought out.
+Some abuse of c-format specifies might have been more appropriate.
+
=head1 HISTORY
This tokenizer is mostly based