6 use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
9 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
11 ###############################################################################
15 TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
19 Because .tmpl files contains HTML::Template directives
20 that tend to confuse real parsers (e.g., HTML::Parse),
21 it might be better to create a customized scanner
22 to scan the template files for tokens.
23 This module is a simple-minded attempt at such a scanner.
27 ###############################################################################
34 use vars qw( $pedantic_attribute_error_in_nonpedantic_mode_p );
35 use vars qw( $pedantic_tmpl_var_use_in_nonpedantic_mode_p );
36 use vars qw( $pedantic_error_markup_in_pcdata_p );
38 ###############################################################################
41 use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
42 use vars qw( $re_directive_control $re_tmpl_endif_endloop );
44 # $re_directive must not do any backreferences
45 $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
46 # TMPL_VAR or TMPL_INCLUDE
47 $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
48 $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
49 # TMPL_VAR ESCAPE=1/HTML/URL
50 $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
51 # Any control flow directive
52 $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
53 # /LOOP or /IF or /UNLESS
54 $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
57 # Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
58 # Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
59 use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
60 use vars qw( $re_tag_strict $re_tag_compat @re_tag );
63 my $etag = $compat? '>': '<>\/';
64 # This is no longer similar to the original regexp in subst.pl :-(
65 # Note that we don't want <> in compat mode; Mozilla knows about <
66 q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
68 . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
71 $re_comment = '(?:--(?:[^-]|-[^-])*--)';
72 $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
73 $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
74 $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
75 @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
78 # End of the hideous stuff
80 use vars qw( $serial );
82 ###############################################################################
84 sub FATAL_P () {'fatal-p'}
85 sub SYNTAXERROR_P () {'syntaxerror-p'}
87 sub FILENAME () {'input'}
88 sub HANDLE () {'handle'}
90 sub READAHEAD () {'readahead'}
91 sub LINENUM_START () {'lc_0'}
93 sub CDATA_MODE_P () {'cdata-mode-p'}
94 sub CDATA_CLOSE () {'cdata-close'}
95 sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
96 sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
98 sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
103 open my $handle,$filename or die "can't open $filename";
105 filename => $filename
111 ###############################################################################
117 return $this->{filename};
122 return $this->{handle};
127 return $this->{+FATAL_P};
132 return $this->{+SYNTAXERROR_P};
135 sub has_readahead_p {
137 return @{$this->{readahead}};
140 sub _peek_readahead {
142 return $this->{readahead}->[$#{$this->{readahead}}];
145 sub line_number_start {
147 return $this->{+LINENUM_START};
152 return $this->{+LINENUM};
157 return $this->{+CDATA_MODE_P};
162 return $this->{+PCDATA_MODE_P};
167 return $this->{+JS_MODE_P};
172 return $this->{+CDATA_CLOSE};
175 sub allow_cformat_p {
177 return $this->{+ALLOW_CFORMAT_P};
184 $this->{+FATAL_P} = $_[0];
188 sub _set_syntaxerror {
190 $this->{+SYNTAXERROR_P} = $_[0];
194 sub _push_readahead {
196 push @{$this->{readahead}}, $_[0];
202 return pop @{$this->{readahead}};
205 sub _append_readahead {
207 $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
213 $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
217 sub _increment_line_number {
219 $this->{+LINENUM} += 1;
223 sub _set_line_number_start {
225 $this->{+LINENUM_START} = $_[0];
229 sub _set_cdata_mode {
231 $this->{+CDATA_MODE_P} = $_[0];
235 sub _set_pcdata_mode {
237 $this->{+PCDATA_MODE_P} = $_[0];
243 $this->{+JS_MODE_P} = $_[0];
247 sub _set_cdata_close {
249 $this->{+CDATA_CLOSE} = $_[0];
253 sub set_allow_cformat {
255 $this->{+ALLOW_CFORMAT_P} = $_[0];
259 ###############################################################################
261 use vars qw( $js_EscapeSequence );
263 # Perl quoting is really screwed up, but this common subexp is way too long
264 $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
266 sub parenleft () { '(' }
267 sub parenright () { ')' }
273 if ($s0 =~ /^\s+/s) { # whitespace
276 } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
279 } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
282 # Keyword or identifier, ECMA-262 p.13 (section 7.5)
283 } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
286 # Punctuator, ECMA-262 p.13 (section 7.6)
287 } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
290 # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
291 } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
294 # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
295 } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
298 # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
299 } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
302 # StringLiteral, ECMA-262 p.17 (section 7.7.4)
303 # XXX SourceCharacter doesn't seem to be defined (?)
304 } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
307 } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
315 sub STATE_UNDERSCORE () { 1 }
316 sub STATE_PARENLEFT () { 2 }
317 sub STATE_STRING_LITERAL () { 3 }
319 # XXX This is a crazy hack. I don't want to write an ECMAScript parser.
320 # XXX A scanner is one thing; a parser another thing.
321 sub identify_js_translatables (@) {
324 # We mark a JavaScript translatable string as in C, i.e., _("literal")
325 # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
326 for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
327 my $reset_state_p = 0;
328 push @output, [0, $input[$i]];
329 if ($input[$i] !~ /\S/s) {
331 } elsif ($state == 0) {
332 $state = STATE_UNDERSCORE if $input[$i] eq '_';
333 } elsif ($state == STATE_UNDERSCORE) {
334 $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
335 } elsif ($state == STATE_PARENLEFT) {
336 if ($input[$i] =~ /^(['"])(.*)\1$/s) {
337 ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
341 } elsif ($state == STATE_STRING_LITERAL) {
342 if ($input[$i] eq parenright) {
343 $output[$j] = [1, $output[$j]->[1], $q, $s];
347 die "identify_js_translatables internal error: Unknown state $state"
353 ###############################################################################
355 sub _extract_attributes ($;$) {
359 $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags
360 || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags
362 for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
363 my($key, $val, $val_orig, $rest)
364 = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
366 $attr{+lc($key)} = [$key, $val, $val_orig, $i];
368 if ($val =~ /$re_tmpl_include/os) {
369 warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
370 } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
371 # XXX: we probably should not warn if key is "onclick" etc
372 # XXX: there's just no reasonable thing to suggest
373 my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
374 undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
376 "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
378 $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
379 if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
380 } elsif ($val_orig !~ /^['"]/) {
381 my $t = $val; $t =~ s/$re_directive_control//os;
383 "Unquoted attribute contains character(s) that should be quoted"
385 $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
386 if $t =~ /[^-\.A-Za-z0-9]/s;
389 my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
390 if ($s2 =~ /\S/s) { # should never happen
391 if ($s =~ /^([^\n]*)\n/s) { # this is even worse
392 error_normal("Completely confused while extracting attributes: $1", $lc);
393 error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
394 $this->_set_fatal( 1 );
396 # There's something wrong with the attribute syntax.
397 # We might be able to deduce a likely cause by looking more.
398 if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
399 warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
401 warn_normal "Strange attribute syntax: $s\n", $lc;
408 sub _next_token_internal {
413 $this->_pop_readahead if $this->has_readahead_p
414 && !ref $this->_peek_readahead
415 && !length $this->_peek_readahead;
416 if (!$this->has_readahead_p) {
417 my $next = scalar <$h>;
418 $eof_p = !defined $next;
420 $this->_increment_line_number;
421 $this->_push_readahead( $next );
424 $this->_set_line_number_start( $this->line_number ); # remember 1st line num
425 if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
426 ($it, $kind) = ($this->_pop_readahead, undef);
427 } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
429 } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
430 ($kind, $it) = (TmplTokenType::TEXT, $&);
431 $this->_set_readahead( $' );
432 # FIXME the following (the [<\s] part) is an unreliable HACK :-(
433 } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
434 ($kind, $it) = (TmplTokenType::TEXT, $&);
435 $this->_set_readahead( $' );
436 warn_normal "Unescaped < in $it\n", $this->line_number_start
437 if !$this->cdata_mode_p && $it =~ /</s;
438 } else { # tag/declaration/processing instruction
440 my $bad_comment_p = 0;
441 for (my $cdata_close = $this->cdata_close;;) {
442 if ($this->cdata_mode_p) {
443 my $next = $this->_pop_readahead;
444 if ($next =~ /^$cdata_close/is) {
445 ($kind, $it) = (TmplTokenType::TAG, $&);
446 $this->_push_readahead( $' );
448 } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
449 ($kind, $it) = (TmplTokenType::TEXT, $1);
450 $this->_push_readahead( "$2$'" );
453 ($kind, $it) = (TmplTokenType::TEXT, $next);
456 } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
457 # If we detect a "closed start tag" but we know that the
458 # following token looks like a TMPL_VAR, don't stop
459 my($head, $tail, $post) = ($1, $2, $3);
460 if ($tail eq '' && $post =~ $re_tmpl_var) {
461 # Don't bother to show the warning if we're too confused
462 # FIXME. There's no method for _closed_start_tag_warning
463 if (!defined $this->{'_closed_start_tag_warning'}
464 || ($this->{'_closed_start_tag_warning'}->[0] eq $head
465 && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
466 warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
467 if split(/\n/, $head) < 10;
469 $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
471 ($kind, $it) = (TmplTokenType::TAG, "$head>");
472 $this->_set_readahead( $post );
474 warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '';
476 } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
477 ($kind, $it) = (TmplTokenType::COMMENT, $&);
478 $this->_set_readahead( $' );
483 my $next = scalar <$h>;
484 $eof_p = !defined $next;
486 $this->_increment_line_number;
487 $this->_append_readahead( $next );
489 if ($kind ne TmplTokenType::TAG) {
491 } elsif ($it =~ /^<!/) {
492 $kind = TmplTokenType::DECL;
493 $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
494 if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
495 warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
497 } elsif ($it =~ /^<\?/) {
498 $kind = TmplTokenType::PI;
500 if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
501 $kind = TmplTokenType::DIRECTIVE;
502 } elsif ($bad_comment_p) {
503 warn_normal sprintf("Syntax error in comment: %s\n", $it),
504 $this->line_number_start;
505 $this->_set_syntaxerror( 1 );
507 if (!$ok_p && $eof_p) {
508 ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
509 $this->_set_readahead, undef;
510 $this->_set_syntaxerror( 1 );
513 warn_normal "Unrecognizable token found: "
514 . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
515 . "\n", $this->line_number_start
516 if $kind == TmplTokenType::UNKNOWN;
517 return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
520 sub _next_token_intermediate {
522 my $h = $this->_handle;
524 if (!$this->cdata_mode_p) {
525 $it = $this->_next_token_internal($h);
526 if (defined $it && $it->type == TmplTokenType::TAG) {
527 if ($it->string =~ /^<(script|style|textarea)\b/is) {
528 $this->_set_cdata_mode( 1 );
529 $this->_set_cdata_close( "</$1\\s*>" );
530 $this->_set_pcdata_mode( 0 );
531 $this->_set_js_mode( lc($1) eq 'script' );
532 # } elsif ($it->string =~ /^<(title)\b/is) {
533 # $this->_set_cdata_mode( 1 );
534 # $this->_set_cdata_close( "</$1\\s*>" );
535 # $this->_set_pcdata_mode( 1 );
537 $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
541 for ($it = '', my $cdata_close = $this->cdata_close;;) {
542 my $next = $this->_next_token_internal($h);
543 $eof_p = !defined $next;
545 if (defined $next && $next->string =~ /$cdata_close/is) {
546 $this->_push_readahead( $next ); # push entire TmplToken object
547 $this->_set_cdata_mode( 0 );
549 last unless $this->cdata_mode_p;
550 $it .= $next->string;
554 error_normal "Unexpected end of file while looking for "
556 . "\n", $this->line_number_start;
557 $this->_set_fatal( 1 );
558 $this->_set_syntaxerror( 1 );
560 if ($this->pcdata_mode_p) {
562 $check =~ s/$re_directive//gos;
563 warn_pedantic "Markup found in PCDATA\n", $this->line_number,
564 \$pedantic_error_markup_in_pcdata_p
565 if $check =~ /$re_tag_compat/s;
567 # PCDATA should be treated as text, not CDATA
568 # Actually it should be treated as TEXT_PARAMETRIZED :-(
569 $it = TmplToken->new( $it,
570 ($this->pcdata_mode_p?
571 TmplTokenType::TEXT: TmplTokenType::CDATA),
572 $this->line_number, $this->filename )
574 if ($this->js_mode_p) {
575 my $s0 = $it->string;
578 if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
583 push @head, split_js $s0;
584 $it->set_js_data( identify_js_translatables(@head, @tail) );
586 $this->_set_pcdata_mode, 0;
587 $this->_set_cdata_close, undef unless !defined $it;
592 sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
594 return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
595 || ($t->type == TmplTokenType::DIRECTIVE
596 && $t->string =~ /^(?:$re_tmpl_var)$/os)
597 || ($t->type == TmplTokenType::TAG
598 && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
599 || ($t->string =~ /^<input\b/is
600 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
604 sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
606 return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
607 || ($t->type == TmplTokenType::DIRECTIVE
608 && $t->string =~ /^(?:$re_tmpl_var)$/os)
609 || ($t->type == TmplTokenType::TAG
610 && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
611 || ($t->string =~ /^<input\b/is
612 && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
615 sub _quote_cformat ($) {
621 sub string_canon ($) {
624 # Fold all whitespace into single blanks
630 sub _formalize_string_cformat ($) {
632 return _quote_cformat string_canon $s;
637 return $t->type == TmplTokenType::DIRECTIVE? '%s':
638 $t->type == TmplTokenType::TEXT?
639 _formalize_string_cformat($t->string):
640 $t->type == TmplTokenType::TAG?
641 ($t->string =~ /^<a\b/is? '<a>':
642 $t->string =~ /^<input\b/is? (
643 lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
645 _quote_cformat($t->string)):
646 _quote_cformat($t->string);
652 my $undo_trailing_blanks = sub {
653 for (my $i = $#structure; $i >= 0; $i -= 1) {
654 last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
655 # Queue element structure: [reanalysis-p, token]
656 push @{$this->{_queue}}, [1, pop @structure];
659 &$undo_trailing_blanks;
660 while (@structure >= 2) {
661 my $something_done_p = 0;
662 # FIXME: If the last token is a close tag but there are no tags
663 # FIXME: before it, drop the close tag back into the queue. This
664 # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
666 && $structure[$#structure]->type == TmplTokenType::TAG
667 && $structure[$#structure]->string =~ /^<\//s) {
668 my $has_other_tags_p = 0;
669 for (my $i = 0; $i < $#structure; $i += 1) {
670 $has_other_tags_p = 1
671 if $structure[$i]->type == TmplTokenType::TAG;
672 last if $has_other_tags_p;
674 if (!$has_other_tags_p) {
675 push @{$this->{_queue}}, [0, pop @structure]
676 &$undo_trailing_blanks;
677 $something_done_p = 1;
680 # FIXME: Do the same ugly hack for the last token being a ( or [
682 && $structure[$#structure]->type == TmplTokenType::TEXT
683 && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
684 push @{$this->{_queue}}, [1, pop @structure];
685 &$undo_trailing_blanks;
686 $something_done_p = 1;
688 # FIXME: If the first token is an open tag, but there is no
689 # FIXME: corresponding close tag, "drop the open tag", i.e.,
690 # FIXME: requeue everything for reanalysis, except the frist tag. :-(
692 && $structure[0]->type == TmplTokenType::TAG
693 && $structure[0]->string =~ /^<([a-z0-9]+)/is
694 && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
696 my $tag_open_count = 1;
697 for (my $i = 1; $i <= $#structure; $i += 1) {
698 if ($structure[$i]->type == TmplTokenType::TAG) {
699 if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
700 $tag_open_count += ($1? -1: +1);
704 if ($tag_open_count > 0) {
705 for (my $i = $#structure; $i; $i -= 1) {
706 push @{$this->{_queue}}, [1, pop @structure];
708 $something_done_p = 1;
711 # FIXME: If the first token is an open tag, the last token is the
712 # FIXME: corresponding close tag, and there are no other close tags
713 # FIXME: inbetween, requeue the tokens from the second token on,
714 # FIXME: flagged as ok for re-analysis
716 && $structure[0]->type == TmplTokenType::TAG
717 && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
718 && $structure[$#structure]->type == TmplTokenType::TAG
719 && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
720 my $has_other_open_or_close_tags_p = 0;
721 for (my $i = 1; $i < $#structure; $i += 1) {
722 $has_other_open_or_close_tags_p = 1
723 if $structure[$i]->type == TmplTokenType::TAG
724 && $structure[$i]->string =~ /^<\/?$tag\b/is;
725 last if $has_other_open_or_close_tags_p;
727 if (!$has_other_open_or_close_tags_p) {
728 for (my $i = $#structure; $i; $i -= 1) {
729 push @{$this->{_queue}}, [1, pop @structure];
731 $something_done_p = 1;
734 last if !$something_done_p;
739 sub looks_plausibly_like_groupable_text_p (@) {
741 # The text would look plausibly groupable if all open tags are also closed.
744 for (my $i = 0; $i <= $#structure; $i += 1) {
745 if ($structure[$i]->type == TmplTokenType::TAG) {
746 my $form = $structure[$i]->string;
747 if ($form =~ /^<([A-Z0-9]+)/is) {
749 if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
752 } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
753 if (@tags && lc($1) eq $tags[$#tags]) {
759 } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
764 return !$error_p && !@tags;
769 my $h = $this->_handle;
771 $this->{_queue} = [] unless defined $this->{_queue};
773 # Elements in the queue are ordered pairs. The first in the ordered pair
774 # specifies whether we are allowed to reanalysis; the second is the token.
775 if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
776 $it = (pop @{$this->{_queue}})->[1];
778 if (@{$this->{_queue}}) {
779 $it = (pop @{$this->{_queue}})->[1];
781 $it = $this->_next_token_intermediate($h);
783 if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
784 && ($it->type == TmplTokenType::TEXT?
785 !blank_p( $it->string ): _token_groupable1_p( $it ))) {
786 my @structure = ( $it );
789 my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
790 if ($it->type == TmplTokenType::TEXT) {
791 $nonblank_text_p = 1 if !blank_p( $it->string );
792 } elsif ($it->type == TmplTokenType::DIRECTIVE) {
794 } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
796 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
797 $with_anchor_p = 1 if $tag eq 'a';
798 $with_input_p = 1 if $tag eq 'input';
800 # We hate | and || in msgid strings, so we try to avoid them
801 for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
802 if (@{$this->{_queue}}) {
803 $next = (pop @{$this->{_queue}})->[1];
805 $next = $this->_next_token_intermediate($h);
807 push @structure, $next; # for consistency (with initialization)
808 last unless defined $next && _token_groupable2_p( $next );
809 last if $quit_next_p;
810 if ($next->type == TmplTokenType::TEXT) {
811 $nonblank_text_p = 1 if !blank_p( $next->string );
812 $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
813 } elsif ($next->type == TmplTokenType::DIRECTIVE) {
815 } elsif ($next->type == TmplTokenType::TAG) {
816 if ($next->string =~ /^<([A-Z0-9]+)/is) {
818 push @tags, $tag if $tag !~ /^(?:br|input)$/i;
819 $with_anchor_p = 1 if $tag eq 'a';
820 $with_input_p = 1 if $tag eq 'input';
821 } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
823 $quit_p = 1 unless @tags && $close eq $tags[$#tags];
824 $quit_next_p = 1 if $close =~ /^h\d$/;
830 # Undo the last token, allowing reanalysis
831 push @{$this->{_queue}}, [1, pop @structure];
832 # Simply it a bit more
833 @structure = $this->_optimize( @structure );
834 if (@structure < 2) {
837 } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
838 # Create the corresponding c-format string
839 my $string = join('', map { $_->string } @structure);
840 my $form = join('', map { _formalize $_ } @structure);
841 my($a_counter, $input_counter) = (0, 0);
842 $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
843 $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
844 $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
845 $it->line_number, $it->pathname);
846 $it->set_form( $form );
847 $it->set_children( @structure );
848 } elsif ($nonblank_text_p
849 && looks_plausibly_like_groupable_text_p( @structure )
850 && $structure[$#structure]->type == TmplTokenType::TEXT) {
851 # Combine the strings
852 my $string = join('', map { $_->string } @structure);
853 $it = TmplToken->new($string, TmplTokenType::TEXT,
854 $it->line_number, $it->pathname);;
856 # Requeue the tokens thus seen for re-emitting, allow reanalysis
858 push @{$this->{_queue}}, [1, pop @structure];
861 $it = (pop @{$this->{_queue}})->[1];
865 if (defined $it && $it->type == TmplTokenType::TEXT) {
866 my $form = string_canon $it->string;
867 $it->set_form( $form );
872 ###############################################################################
874 # Other simple functions (These are not methods)
878 return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
885 $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
886 $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
887 return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
892 # Locale::PO->quote is buggy, it doesn't quote newlines :-/
893 $s =~ s/([\\"])/\\\1/gs;
895 #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
899 # Some functions that shouldn't be here... should be moved out some time
900 sub parametrize ($$$$) {
901 my($fmt_0, $cformat_p, $t, $f) = @_;
904 my @params = $t->parameters_and_fields;
905 for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
906 if ($fmt =~ /^[^%]+/) {
909 } elsif ($fmt =~ /^%%/) {
912 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?s/s) {
914 my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
916 if (defined $width && defined $prec && !$width && !$prec) {
918 } elsif (defined $params[$i - 1]) {
919 my $param = $params[$i - 1];
920 warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
921 . $param->type->to_string . "\n", undef
922 if $param->type != TmplTokenType::DIRECTIVE;
923 warn_normal "$fmt_0: $&: Unsupported "
924 . "field width or precision\n", undef
925 if defined $width || defined $prec;
926 warn_normal "$fmt_0: $&: Parameter $i not known", undef
927 unless defined $param;
928 $it .= defined $f? &$f( $param ): $param->string;
930 } elsif ($fmt =~ /^%(?:(\d+)\$)?(?:(\d+)(?:\.(\d+))?)?([pS])/s) {
932 my($i, $width, $prec, $conv) = ((defined $1? $1: $n), $2, $3, $4);
935 my $param = $params[$i - 1];
936 if (!defined $param) {
937 warn_normal "$fmt_0: $&: Parameter $i not known", undef;
939 if ($param->type == TmplTokenType::TAG
940 && $param->string =~ /^<input\b/is) {
941 my $type = defined $param->attributes?
942 lc($param->attributes->{'type'}->[1]): undef;
944 warn_normal "$fmt_0: $&: Expected type=text, "
945 . "but found type=$type", undef
946 unless $type eq 'text';
947 } elsif ($conv eq 'p') {
948 warn_normal "$fmt_0: $&: Expected type=radio, "
949 . "but found type=$type", undef
950 unless $type eq 'radio';
953 warn_normal "$&: Expected an INPUT, but found a "
954 . $param->type->to_string . "\n", undef
956 warn_normal "$fmt_0: $&: Unsupported "
957 . "field width or precision\n", undef
958 if defined $width || defined $prec;
959 $it .= defined $f? &$f( $param ): $param->string;
961 } elsif ($fmt =~ /^%[^%a-zA-Z]*[a-zA-Z]/) {
964 die "$&: Unknown or unsupported format specification\n"; #XXX
966 die "$&: Completely confused parametrizing\n";#XXX
970 my @anchors = $t->anchors;
971 for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
972 if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
975 } elsif ($fmt =~ /^<a(\d+)>/is) {
979 my $anchor = $anchors[$i - 1];
980 warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
981 unless defined $anchor;
982 $it .= $anchor->string;
984 die "Completely confused decoding anchors: $fmt\n";#XXX
990 sub charset_canon ($) {
992 $charset = uc($charset);
993 $charset = "$1-$2" if $charset =~ /^(ISO|UTF)(\d.*)/i;
994 $charset = 'Big5' if $charset eq 'BIG5'; # "Big5" must be in mixed case
998 use vars qw( @latin1_utf8 );
1000 "\302\200", "\302\201", "\302\202", "\302\203", "\302\204", "\302\205",
1001 "\302\206", "\302\207", "\302\210", "\302\211", "\302\212", "\302\213",
1002 "\302\214", "\302\215", undef, undef, "\302\220", "\302\221",
1003 "\302\222", "\302\223", "\302\224", "\302\225", "\302\226", "\302\227",
1004 "\302\230", "\302\231", "\302\232", "\302\233", "\302\234", "\302\235",
1005 "\302\236", "\302\237", "\302\240", "\302\241", "\302\242", "\302\243",
1006 "\302\244", "\302\245", "\302\246", "\302\247", "\302\250", "\302\251",
1007 "\302\252", "\302\253", "\302\254", "\302\255", "\302\256", "\302\257",
1008 "\302\260", "\302\261", "\302\262", "\302\263", "\302\264", "\302\265",
1009 "\302\266", "\302\267", "\302\270", "\302\271", "\302\272", "\302\273",
1010 "\302\274", "\302\275", "\302\276", "\302\277", "\303\200", "\303\201",
1011 "\303\202", "\303\203", "\303\204", "\303\205", "\303\206", "\303\207",
1012 "\303\210", "\303\211", "\303\212", "\303\213", "\303\214", "\303\215",
1013 "\303\216", "\303\217", "\303\220", "\303\221", "\303\222", "\303\223",
1014 "\303\224", "\303\225", "\303\226", "\303\227", "\303\230", "\303\231",
1015 "\303\232", "\303\233", "\303\234", "\303\235", "\303\236", "\303\237",
1016 "\303\240", "\303\241", "\303\242", "\303\243", "\303\244", "\303\245",
1017 "\303\246", "\303\247", "\303\250", "\303\251", "\303\252", "\303\253",
1018 "\303\254", "\303\255", "\303\256", "\303\257", "\303\260", "\303\261",
1019 "\303\262", "\303\263", "\303\264", "\303\265", "\303\266", "\303\267",
1020 "\303\270", "\303\271", "\303\272", "\303\273", "\303\274", "\303\275",
1021 "\303\276", "\303\277" );
1023 sub charset_convert ($$$) {
1024 my($s, $charset_in, $charset_out) = @_;
1025 if ($s !~ /[\200-\377]/s) { # FIXME: don't worry about iso2022 for now
1027 } elsif ($charset_in eq 'ISO-8859-1' && $charset_out eq 'UTF-8') {
1028 $s =~ s/[\200-\377]/ $latin1_utf8[ord($&) - 128] /egs;
1029 } elsif ($charset_in ne $charset_out) {
1030 VerboseWarnings::warn_normal "conversion from $charset_in to $charset_out is not supported\n", undef;
1035 ###############################################################################
1039 In addition to the basic scanning, this class will also perform
1046 Emulation of c-format strings (see below)
1050 Display of warnings for certain things that affects either the
1051 ability of this class to yield correct output, or things that
1052 are known to cause the original template to cause trouble.
1056 Automatic correction of some of the things warned about
1057 (e.g., SGML "closed start tag" notation).
1061 =head2 c-format strings emulation
1063 Because English word order is not universal, a simple extraction
1064 of translatable strings may yield some strings like "Accounts for"
1065 or ambiguous strings like "in". This makes the resulting strings
1066 difficult to translate, but does not affect all languages alike.
1067 For example, Chinese (with a somewhat different word order) would
1068 be hit harder, but French would be relatively unaffected.
1070 To overcome this problem, the scanner can be configured to detect
1071 patterns with <TMPL_VAR> directives (as well as certain HTML tags),
1072 and try to construct a larger pattern that will appear in the PO
1073 file as c-format strings with %s placeholders. This additional
1074 step allows the translator to deal with cases where word order
1075 is different (replacing %s with %1$s, %2$s, etc.), or when certain
1076 words will require certain inflectional suffixes in sentences.
1078 Because this is an incompatible change, this mode must be explicitly
1079 turned on using the set_cformat(1) method call.
1081 =head2 The flag characters
1083 The character % is followed by zero or more of the following flags:
1089 The value comes from HTML <INPUT> elements.
1090 This abuse of the flag character is somewhat reasonable,
1091 since TMPL_VAR and INPUT are both variables, but of different kinds.
1095 =head2 The field width and precision
1097 An optional 0.0 can be specified for %s to specify
1098 that the <TMPL_VAR> should be suppressed.
1100 =head2 The conversion specifier
1106 Specifies any input field that is neither text nor hidden
1107 (which currently mean radio buttons).
1108 The p conversion specifier is chosen because this does not
1109 evoke any certain sensible data type.
1113 Specifies a text input field (<INPUT TYPE=TEXT>).
1114 This use of the S conversion specifier is somewhat reasonable,
1115 since text input fields contain values of undeterminable type,
1116 which can be treated as strings.
1120 Specifies a <TMPL_VAR>.
1121 This use of the o conversion specifier is somewhat reasonable,
1122 since <TMPL_VAR> denotes values of undeterminable type, which
1123 can be treated as strings.
1129 There is no code to save the tag name anywhere in the scanned token.
1131 The use of <AI<i>> to stand for the I<i>th anchor
1132 is not very well thought out.
1133 Some abuse of c-format specifies might have been more appropriate.
1137 This tokenizer is mostly based
1138 on Ambrose's hideous Perl script known as subst.pl.