X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=misc%2Ftranslator%2FTmplTokenizer.pm;h=6129f8d37e5a81407aed6fc5ffd2b1dd68fbdf14;hb=4f4946f8dfc8511a02af2cdebf96ea59be57fafc;hp=5c4f28ba5ce79282ca49b7c73c341b495c5df739;hpb=f5e65148ace51060f739fce4c26b772d09f35ba6;p=koha.git diff --git a/misc/translator/TmplTokenizer.pm b/misc/translator/TmplTokenizer.pm index 5c4f28ba5c..6129f8d37e 100644 --- a/misc/translator/TmplTokenizer.pm +++ b/misc/translator/TmplTokenizer.pm @@ -2,8 +2,9 @@ package TmplTokenizer; use strict; #use warnings; FIXME - Bug 2505 -use TmplTokenType; -use TmplToken; +use C4::TmplTokenType; +use C4::TmplToken; +use C4::TTParser; use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic ); require Exporter; @@ -13,15 +14,11 @@ use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); =head1 NAME -TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files +TmplTokenizer.pm - Simple-minded wrapper class for TTParser =head1 DESCRIPTION -Because .tmpl files contains HTML::Template directives -that tend to confuse real parsers (e.g., HTML::Parse), -it might be better to create a customized scanner -to scan the template files for tokens. -This module is a simple-minded attempt at such a scanner. +A wrapper for the functionality found in TTParser to allow an easier transition to Template Toolkit =cut @@ -39,44 +36,12 @@ use vars qw( $pedantic_error_markup_in_pcdata_p ); ############################################################################### # Hideous stuff -use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include ); -use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl); +use vars qw( $re_xsl $re_end_entity $re_tmpl_var); BEGIN { - # $re_directive must not do any backreferences - $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # TMPL_VAR or TMPL_INCLUDE - $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # TMPL_VAR ESCAPE=1/HTML/URL + $re_tmpl_var = q{\[%\s*[get|set|default]?\s*[\w\.]+\s*[|.*?]?\s*%\]}; $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>}; - $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # Any control flow directive - $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; - # /LOOP or /IF or /UNLESS - $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>}; -} - -# Hideous stuff from subst.pl, slightly modified to use the above hideous stuff -# Note: The $re_tag's set $1 (), and $3 (rest of string) -use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag ); -use vars qw( $re_tag_strict $re_tag_compat @re_tag ); -sub re_tag ($) { - my($compat) = @_; - my $etag = $compat? '>': '<>\/'; - # This is no longer similar to the original regexp in subst.pl :-( - # Note that we don't want <> in compat mode; Mozilla knows about < - q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:} - . $re_directive - . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)}; -} -BEGIN { - $re_comment = '(?:--(?:[^-]|-[^-])*--)'; - $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML $re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace - $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag - @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1)); } - # End of the hideous stuff use vars qw( $serial ); @@ -87,14 +52,14 @@ sub FATAL_P () {'fatal-p'} sub SYNTAXERROR_P () {'syntaxerror-p'} sub FILENAME () {'input'} -sub HANDLE () {'handle'} +#sub HANDLE () {'handle'} -sub READAHEAD () {'readahead'} +#sub READAHEAD () {'readahead'} sub LINENUM_START () {'lc_0'} sub LINENUM () {'lc'} sub CDATA_MODE_P () {'cdata-mode-p'} sub CDATA_CLOSE () {'cdata-close'} -sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA +#sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true sub ALLOW_CFORMAT_P () {'allow-cformat-p'} @@ -102,11 +67,14 @@ sub ALLOW_CFORMAT_P () {'allow-cformat-p'} sub new { shift; my ($filename) = @_; - open my $handle,$filename or die "can't open $filename"; + #open my $handle,$filename or die "can't open $filename"; + my $parser = C4::TTParser->new; + $parser->build_tokens( $filename ); bless { - filename => $filename - , handle => $handle - , readahead => [] + filename => $filename, + _parser => $parser +# , handle => $handle +# , readahead => [] } , __PACKAGE__; } @@ -119,49 +87,16 @@ sub filename { return $this->{filename}; } -sub _handle { - my $this = shift; - return $this->{handle}; -} - sub fatal_p { my $this = shift; return $this->{+FATAL_P}; } +# work around, currently not implemented sub syntaxerror_p { - my $this = shift; - return $this->{+SYNTAXERROR_P}; -} - -sub has_readahead_p { - my $this = shift; - return @{$this->{readahead}}; -} - -sub _peek_readahead { - my $this = shift; - return $this->{readahead}->[$#{$this->{readahead}}]; -} - -sub line_number_start { - my $this = shift; - return $this->{+LINENUM_START}; -} - -sub line_number { - my $this = shift; - return $this->{+LINENUM}; -} - -sub cdata_mode_p { - my $this = shift; - return $this->{+CDATA_MODE_P}; -} - -sub pcdata_mode_p { - my $this = shift; - return $this->{+PCDATA_MODE_P}; +# my $this = shift; +# return $this->{+SYNTAXERROR_P}; + return 0; } sub js_mode_p { @@ -169,11 +104,6 @@ sub js_mode_p { return $this->{+JS_MODE_P}; } -sub cdata_close { - my $this = shift; - return $this->{+CDATA_CLOSE}; -} - sub allow_cformat_p { my $this = shift; return $this->{+ALLOW_CFORMAT_P}; @@ -187,71 +117,13 @@ sub _set_fatal { return $this; } -sub _set_syntaxerror { - my $this = shift; - $this->{+SYNTAXERROR_P} = $_[0]; - return $this; -} - -sub _push_readahead { - my $this = shift; - push @{$this->{readahead}}, $_[0]; - return $this; -} - -sub _pop_readahead { - my $this = shift; - return pop @{$this->{readahead}}; -} - -sub _append_readahead { - my $this = shift; - $this->{readahead}->[$#{$this->{readahead}}] .= $_[0]; - return $this; -} - -sub _set_readahead { - my $this = shift; - $this->{readahead}->[$#{$this->{readahead}}] = $_[0]; - return $this; -} - -sub _increment_line_number { - my $this = shift; - $this->{+LINENUM} += 1; - return $this; -} - -sub _set_line_number_start { - my $this = shift; - $this->{+LINENUM_START} = $_[0]; - return $this; -} - -sub _set_cdata_mode { - my $this = shift; - $this->{+CDATA_MODE_P} = $_[0]; - return $this; -} - -sub _set_pcdata_mode { - my $this = shift; - $this->{+PCDATA_MODE_P} = $_[0]; - return $this; -} - sub _set_js_mode { my $this = shift; $this->{+JS_MODE_P} = $_[0]; return $this; } -sub _set_cdata_close { - my $this = shift; - $this->{+CDATA_CLOSE} = $_[0]; - return $this; -} - +#used in xgettext, tmpl_process3 and text-extract2 sub set_allow_cformat { my $this = shift; $this->{+ALLOW_CFORMAT_P} = $_[0]; @@ -268,48 +140,48 @@ BEGIN { sub parenleft () { '(' } sub parenright () { ')' } -sub split_js ($) { +sub _split_js ($) { my ($s0) = @_; my @it = (); while (length $s0) { - if ($s0 =~ /^\s+/s) { # whitespace - push @it, $&; - $s0 = $'; - } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment - push @it, $&; - $s0 = $'; - } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment - push @it, $&; - $s0 = $'; - # Keyword or identifier, ECMA-262 p.13 (section 7.5) - } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName - push @it, $&; - $s0 = $'; - # Punctuator, ECMA-262 p.13 (section 7.6) - } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { - push @it, $&; - $s0 = $'; - # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec - } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { - push @it, $&; - $s0 = $'; - # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) - } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { - push @it, $&; - $s0 = $'; - # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) - } elsif ($s0 =~ /^0[\da-fA-F]+/s) { - push @it, $&; - $s0 = $'; - # StringLiteral, ECMA-262 p.17 (section 7.7.4) - # XXX SourceCharacter doesn't seem to be defined (?) - } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { - push @it, $&; - $s0 = $'; - } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! - push @it, $&; - $s0 = $'; - } + if ($s0 =~ /^\s+/s) { # whitespace + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment + push @it, $&; + $s0 = $'; + # Keyword or identifier, ECMA-262 p.13 (section 7.5) + } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName + push @it, $&; + $s0 = $'; + # Punctuator, ECMA-262 p.13 (section 7.6) + } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) { + push @it, $&; + $s0 = $'; + # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec + } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) { + push @it, $&; + $s0 = $'; + # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3) + } elsif ($s0 =~ /^0[\da-fA-F]+/s) { + push @it, $&; + $s0 = $'; + # StringLiteral, ECMA-262 p.17 (section 7.7.4) + # XXX SourceCharacter doesn't seem to be defined (?) + } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) { + push @it, $&; + $s0 = $'; + } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!! + push @it, $&; + $s0 = $'; + } } return @it; } @@ -320,589 +192,175 @@ sub STATE_STRING_LITERAL () { 3 } # XXX This is a crazy hack. I don't want to write an ECMAScript parser. # XXX A scanner is one thing; a parser another thing. -sub identify_js_translatables (@) { +sub _identify_js_translatables (@) { my @input = @_; my @output = (); # We mark a JavaScript translatable string as in C, i.e., _("literal") # For simplicity, we ONLY look for "_" "(" StringLiteral ")" for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) { - my $reset_state_p = 0; - push @output, [0, $input[$i]]; - if ($input[$i] !~ /\S/s) { - ; - } elsif ($state == 0) { - $state = STATE_UNDERSCORE if $input[$i] eq '_'; - } elsif ($state == STATE_UNDERSCORE) { - $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; - } elsif ($state == STATE_PARENLEFT) { - if ($input[$i] =~ /^(['"])(.*)\1$/s) { - ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); - } else { - $state = 0; - } - } elsif ($state == STATE_STRING_LITERAL) { - if ($input[$i] eq parenright) { - $output[$j] = [1, $output[$j]->[1], $q, $s]; - } - $state = 0; - } else { - die "identify_js_translatables internal error: Unknown state $state" - } +# warn $input[$i]; + my $reset_state_p = 0; + push @output, [0, $input[$i]]; + if ($input[$i] !~ /\S/s) { + ; + } elsif ($state == 0) { + $state = STATE_UNDERSCORE if $input[$i] eq '_'; + } elsif ($state == STATE_UNDERSCORE) { + $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0; + } elsif ($state == STATE_PARENLEFT) { + if ($input[$i] =~ /^(['"])(.*)\1$/s) { + ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2); + } else { + $state = 0; + } + } elsif ($state == STATE_STRING_LITERAL) { + if ($input[$i] eq parenright) { + $output[$j] = [1, $output[$j]->[1], $q, $s]; + } + $state = 0; + } else { + die "identify_js_translatables internal error: Unknown state $state" + } } +# use Data::Dumper; +# warn Dumper \@output; return \@output; } ############################################################################### -sub _extract_attributes ($;$) { - my $this = shift; - my($s, $lc) = @_; - my %attr; - $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags - || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags - - for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) { - my($key, $val, $val_orig, $rest) - = ($1, (defined $3? $3: defined $4? $4: $5), $2, $'); - $i += 1; - $attr{+lc($key)} = [$key, $val, $val_orig, $i]; - $s = $rest; - if ($val =~ /$re_tmpl_include/os) { - warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc; - } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) { - # XXX: we probably should not warn if key is "onclick" etc - # XXX: there's just no reasonable thing to suggest - my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML'); - undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i; - warn_pedantic - "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\"" - . ": $val_orig", - $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p - if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p); - } elsif ($val_orig !~ /^['"]/) { - my $t = $val; $t =~ s/$re_directive_control//os; - warn_pedantic - "Unquoted attribute contains character(s) that should be quoted" - . ": $val_orig", - $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p - if $t =~ /[^-\.A-Za-z0-9]/s; - } - } - my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check - if ($s2 =~ /\S/s) { # should never happen - if ($s =~ /^([^\n]*)\n/s) { # this is even worse - error_normal("Completely confused while extracting attributes: $1", $lc); - error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef); - $this->_set_fatal( 1 ); - } else { - # There's something wrong with the attribute syntax. - # We might be able to deduce a likely cause by looking more. - if ($s =~ /^[a-z0-9]/is && "" =~ /^$re_tag_compat$/s) { - warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc; - } else { - warn_normal "Strange attribute syntax: $s\n", $lc; - } - } - } - return \%attr; -} - -sub _next_token_internal { - my $this = shift; - my($h) = @_; - my($it, $kind); - my $eof_p = 0; - $this->_pop_readahead if $this->has_readahead_p - && !ref $this->_peek_readahead - && !length $this->_peek_readahead; - if (!$this->has_readahead_p) { - my $next = scalar <$h>; - $eof_p = !defined $next; - if (!$eof_p) { - $this->_increment_line_number; - $this->_push_readahead( $next ); - } - } - $this->_set_line_number_start( $this->line_number ); # remember 1st line num - if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj. - ($it, $kind) = ($this->_pop_readahead, undef); - } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do - ; - } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace - ($kind, $it) = (TmplTokenType::TEXT, $&); - $this->_set_readahead( $' ); - # FIXME the following (the [<\s] part) is an unreliable HACK :-( - } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text - ($kind, $it) = (TmplTokenType::TEXT, $&); - $this->_set_readahead( $' ); - warn_normal "Unescaped < in $it\n", $this->line_number_start - if !$this->cdata_mode_p && $it =~ /cdata_close;;) { - if ($this->cdata_mode_p) { - my $next = $this->_pop_readahead; - if ($next =~ /^$cdata_close/is) { - ($kind, $it) = (TmplTokenType::TAG, $&); - $this->_push_readahead( $' ); - $ok_p = 1; - } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) { - ($kind, $it) = (TmplTokenType::TEXT, $1); - $this->_push_readahead( "$2$'" ); - $ok_p = 1; - } else { - ($kind, $it) = (TmplTokenType::TEXT, $next); - $ok_p = 1; - } - } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) { - # If we detect a "closed start tag" but we know that the - # following token looks like a TMPL_VAR, don't stop - my($head, $tail, $post) = ($1, $2, $3); - if ($tail eq '' && $post =~ $re_tmpl_var) { - # Don't bother to show the warning if we're too confused - # FIXME. There's no method for _closed_start_tag_warning - if (!defined $this->{'_closed_start_tag_warning'} - || ($this->{'_closed_start_tag_warning'}->[0] eq $head - && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) { - warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number - if split(/\n/, $head) < 10; - } - $this->{'_closed_start_tag_warning'} = [$head, $this->line_number]; - } else { - ($kind, $it) = (TmplTokenType::TAG, "$head>"); - $this->_set_readahead( $post ); - $ok_p = 1; - warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq '' - and $head ne '_peek_readahead =~ /^)$re_directive*.)*-->/os) { - ($kind, $it) = (TmplTokenType::COMMENT, $&); - $this->_set_readahead( $' ); - $ok_p = 1; - $bad_comment_p = 1; - } - last if $ok_p; - my $next = scalar <$h>; - $eof_p = !defined $next; - last if $eof_p; - $this->_increment_line_number; - $this->_append_readahead( $next ); - } - if ($kind ne TmplTokenType::TAG) { - ; - } elsif ($it =~ /^).)*-->/; - if ($kind == TmplTokenType::COMMENT && $it =~ /^