return map { $_->type == TmplTokenType::DIRECTIVE? $_: ()} @{$this->{'_kids'}};
}
+# only meaningful for TEXT_PARAMETRIZED tokens
+sub anchors {
+ my $this = shift;
+ return map { $_->type == TmplTokenType::TAG && $_->string =~ /^<a\b/is? $_: ()} @{$this->{'_kids'}};
+}
+
# only meaningful for TEXT_PARAMETRIZED tokens
sub form {
my $this = shift;
return $it;
}
-sub _token_groupable_p ($) { # groupable into a TEXT_PARAMETRIZED token
+sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
my($t) = @_;
- return $t->type == TmplTokenType::TEXT
+ return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/s)
|| ($t->type == TmplTokenType::DIRECTIVE
&& $t->string =~ /^(?:$re_tmpl_var)$/os)
|| ($t->type == TmplTokenType::TAG
- && ($t->string =~ /^<\/?(?:b|em|h[123456]|i|u)\b/is
+ && ($t->string =~ /^<(?:b|em|h[123456]|i|u)\b/is
+ || ($t->string =~ /^<input/i
+ && $t->attributes->{'type'} =~ /^(?:text)$/i)))
+}
+
+sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
+ my($t) = @_;
+ return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/s))
+ || ($t->type == TmplTokenType::DIRECTIVE
+ && $t->string =~ /^(?:$re_tmpl_var)$/os)
+ || ($t->type == TmplTokenType::TAG
+ && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
|| ($t->string =~ /^<input/i
&& $t->attributes->{'type'} =~ /^(?:text)$/i)))
}
sub _formalize ($) {
my($t) = @_;
- return $t->type == TmplTokenType::DIRECTIVE? '%s': _quote_cformat($t->string);
+ return $t->type == TmplTokenType::DIRECTIVE? '%s':
+ $t->type == TmplTokenType::TAG?
+ ($t->string =~ /^<a\b/is? '<a>': _quote_cformat($t->string)):
+ _quote_cformat($t->string);
}
sub _optimize {
push @{$this->{_queue}}, pop @structure;
}
};
+ &$undo_trailing_blanks;
# FIXME: If the last token is a close tag but there are no tags
# FIXME: before it, drop the close tag back into the queue. This
# FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
$it = $this->_next_token_intermediate($h);
if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
&& ($it->type == TmplTokenType::TEXT?
- !blank_p( $it->string ): _token_groupable_p( $it ))) {
+ !blank_p( $it->string ): _token_groupable1_p( $it ))) {
my @structure = ( $it );
- my($n_trailing_spaces, $next) = (0, undef);
- my($nonblank_text_p, $parametrized_p, $next) = (0, 0);
+ my @tags = ();
+ my $next = undef;
+ my($nonblank_text_p, $parametrized_p, $with_anchor_p) = (0, 0, 0);
if ($it->type == TmplTokenType::TEXT) {
$nonblank_text_p = 1 if !blank_p( $it->string );
} elsif ($it->type == TmplTokenType::DIRECTIVE) {
$parametrized_p = 1;
+ } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
+ push @tags, lc($1);
+ $with_anchor_p = 1 if lc($1) eq 'a';
}
- for (my $i = 1, $n_trailing_spaces = 0;; $i += 1) {
+ # We hate | and || in msgid strings, so we try to avoid them
+ for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
$next = $this->_next_token_intermediate($h);
push @structure, $next; # for consistency (with initialization)
- last unless defined $next && _token_groupable_p( $next );
+ last unless defined $next && _token_groupable2_p( $next );
+ last if $quit_next_p;
if ($next->type == TmplTokenType::TEXT) {
- if (blank_p( $next->string )) {
- $n_trailing_spaces += 1;
- } else {
- ($n_trailing_spaces, $nonblank_text_p) = (0, 1);
- }
+ $nonblank_text_p = 1 if !blank_p( $next->string );
+ $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
} elsif ($next->type == TmplTokenType::DIRECTIVE) {
- $n_trailing_spaces = 0;
$parametrized_p = 1;
- } else {
- $n_trailing_spaces = 0;
+ } elsif ($next->type == TmplTokenType::TAG) {
+ if ($next->string =~ /^<([A-Z0-9]+)/is) {
+ push @tags, lc($1);
+ $with_anchor_p = 1 if lc($1) eq 'a';
+ } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
+ my $close = lc($1);
+ $quit_p = 1 unless @tags && $close eq $tags[$#tags];
+ $quit_next_p = 1 if $close =~ /^h\d$/;
+ pop @tags;
+ }
}
+ last if $quit_p;
}
# Undo the last token
push @{$this->{_queue}}, pop @structure;
- # Undo trailing blank tokens
- for (my $i = 0; $i < $n_trailing_spaces; $i += 1) {
- push @{$this->{_queue}}, pop @structure;
- }
+ # Simply it a bit more
@structure = $this->_optimize( @structure );
if (@structure < 2) {
# Nothing to do
;
- } elsif ($nonblank_text_p && $parametrized_p) {
+ } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p)) {
# Create the corresponding c-format string
my $string = join('', map { $_->string } @structure);
my $form = join('', map { _formalize $_ } @structure);
+ my $a_counter = 0;
+ $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
$it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED, $it->line_number, $it->pathname);
$it->set_form( $form );
$it->set_children( @structure );
}
# Some functions that shouldn't be here... should be moved out some time
-sub parametrize ($@) {
- my($fmt, @params) = @_;
+sub parametrize ($$$) {
+ my($fmt_0, $params, $anchors) = @_;
my $it = '';
- for (my $n = 0; length $fmt;) {
+ for (my $n = 0, my $fmt = $fmt_0; length $fmt;) {
if ($fmt =~ /^[^%]+/) {
$fmt = $';
$it .= $&;
my($i, $width, $prec) = ((defined $1? $1: $n), $2, $3);
$fmt = $';
if (!defined $width && !defined $prec) {
- $it .= $params[$i]
+ $it .= $params->[$i]
} elsif (defined $width && defined $prec && !$width && !$prec) {
;
} else {
die "Completely confused parametrizing: $fmt\n";#XXX
}
}
+ for (my $n = 0, my $fmt = $it, $it = ''; length $fmt;) {
+ if ($fmt =~ /^(?:(?!<a\d+>).)+/is) {
+ $fmt = $';
+ $it .= $&;
+ } elsif ($fmt =~ /^<a(\d+)>/is) {
+ $n += 1;
+ my $i = $1;
+ $fmt = $';
+ my $anchor = $anchors->[$i - 1];
+ warn_normal "$&: Anchor $1 not found for msgid \"$fmt_0\"", undef #FIXME
+ unless defined $anchor;
+ $it .= $anchor->string;
+ } else {
+ die "Completely confused decoding anchors: $fmt\n";#XXX
+ }
+ }
return $it;
}
print $output $pre, find_translation($trimmed), $post;
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
my $fmt = find_translation($s->form);
- print $output TmplTokenizer::parametrize($fmt, map {
+ print $output TmplTokenizer::parametrize($fmt, [ map {
my($kind, $t, $attr) = ($_->type, $_->string, $_->attributes);
$kind == TmplTokenType::TAG && %$attr?
- text_replace_tag($t, $attr): $t } $s->parameters);
+ text_replace_tag($t, $attr): $t } $s->parameters ], [ $s->anchors ]);
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
print $output text_replace_tag($t, $attr);
} elsif (defined $t) {
=head1 DESCRIPTION
This is an experimental version of the tmpl_process.pl script,
-using standard gettext-style PO files.
+using standard gettext-style PO files. Note that the behaviour
+of this script should still be considered unstable.
Currently, the create, update, and install actions have all been
reimplemented and seem to work.
The --help option has not been implemented yet.
-There are probably some real bugs too, since this has not been
-tested very much.
+If an extracted string contain actual text (versus tags or
+TMPL_VAR directives), the strings are extracted verbatim,
+resulting in unwieldy things like multiple spaces, tabs,
+and/or newlines which are semantically indistinguishable
+from single blanks. If the template writer changes the
+spacing just a little bit, the new formatting would be
+considered new strings. This is arguably wrong, and in any
+case counter-productive.
xgettext.pl must be present in the current directory; the
msgmerge(1) command must also be present in the search path.
been written in TmplTokenizer and more is likely to be needed
(e.g., to get rid of the "Strange line" warning for #~).
+There are probably some other bugs too, since this has not been
+tested very much.
+
=head1 SEE ALSO
xgettext.pl,
###############################################################################
-sub negligible_p ($) {
+sub string_negligible_p ($) {
my($t) = @_; # a string
# Don't emit pure whitespace, pure numbers, pure punctuation,
# single letters, or TMPL_VAR's.
# Punctuation should arguably be translated. But without context
- # they are untranslatable.
+ # they are untranslatable. Note that $t is a string, not a token object.
return !$extract_all_p && (
- TmplTokenizer::blank_p($t) # blank or TMPL_VAR
+ TmplTokenizer::blank_p($t) # blank or TMPL_VAR
|| $t =~ /^\d+$/ # purely digits
- || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
+ || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
|| $t =~ /^[A-Za-z]$/ # single letters
- );
+ )
+}
+
+sub token_negligible_p( $ ) {
+ my($x) = @_;
+ my $t = $x->type;
+ return !$extract_all_p && (
+ $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
+ $t == TmplTokenType::DIRECTIVE? 1:
+ $t == TmplTokenType::TEXT_PARAMETRIZED
+ && join( '', map { my $t = $_->type;
+ $t == TmplTokenType::DIRECTIVE?
+ '1': $t == TmplTokenType::TAG?
+ '': token_negligible_p( $_ )?
+ '': '1' } @{$x->children} ) eq '' );
}
###############################################################################
sub remember ($$) {
my($token, $string) = @_;
- $text{$string} = [] unless defined $text{$string};
- push @{$text{$string}}, $token;
+ # If we determine that the string is negligible, don't bother to remember
+ unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
+ $text{$string} = [] unless defined $text{$string};
+ push @{$text{$string}}, $token;
+ }
}
###############################################################################
last unless defined $s;
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
if ($kind eq TmplTokenType::TEXT) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $t ) if $t =~ /\S/s;
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $s->form ) if $s->form =~ /\S/s;
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
# value [tag=input], meta
sub generate_strings_list () {
# Emit all extracted strings.
for my $t (string_list) {
- printf OUTPUT "%s\n", $t unless negligible_p($t);
+ printf OUTPUT "%s\n", $t # unless negligible_p($t);
}
}
EOF
my $directory_re = quotemeta("$directory/");
for my $t (string_list) {
- next if negligible_p($t);
+ #next if negligible_p($t);
my $cformat_p;
for my $token (@{$text{$t}}) {
my $pathname = $token->pathname;
=back
-Right now it does about the same thing as text-extract2.pl but
-generates gettext-style output; however, because it is scanner-
-instead of parser-based, it is able to address the 4 weaknesses
-listed in translator_doc.txt. Ultimately, the goal is to make
-this able to do some kind of simple analysis on the input to
-produce gettext-style output with c-format strings, in order to
-facilitate translation to languages with a different word order
-than English.
-
-When the above is finished, the generated po file may contain
-some HTML tags in addition to %s strings.
+Note that this script is experimental and should still be
+considered unstable.
+
+Please refer to the explanation in tmpl_process3 for further
+details.
If you want to generate GNOME-style POTFILES.in files, such
files (passed to -f) can be generated thus: