###############################################################################
-sub negligible_p ($) {
+sub string_negligible_p ($) {
# Decide whether a plain string is not worth extracting for translation.
# Takes one argument, the string itself. The result is always false when
# $extract_all_p is set; otherwise it is true if any of the checks in the
# return expression below matches.
# The added version of the punctuation check also accepts '+', '!' and '?'.
my($t) = @_; # a string
# Don't emit pure whitespace, pure numbers, pure punctuation,
# single letters, or TMPL_VAR's.
# Punctuation should arguably be translated. But without context
- # they are untranslatable.
+ # they are untranslatable. Note that $t is a string, not a token object.
return !$extract_all_p && (
- TmplTokenizer::blank_p($t) # blank or TMPL_VAR
+ TmplTokenizer::blank_p($t) # blank or TMPL_VAR
|| $t =~ /^\d+$/ # purely digits
- || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/ # pure punctuation w/o context
+ || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
|| $t =~ /^[A-Za-z]$/ # single letters
- );
+ )
+}
+
+sub token_negligible_p( $ ) {
# Decide whether a token object is not worth extracting for translation.
# Takes one argument, an object answering ->type, ->string and ->children.
# The result is always false when $extract_all_p is set. Otherwise:
#   TEXT              -> whatever string_negligible_p says about its string
#   DIRECTIVE         -> negligible
#   TEXT_PARAMETRIZED -> negligible only if the joined per-child flags are
#                        empty, i.e. no child is a DIRECTIVE and every
#                        child that is not a TAG is itself negligible
#                        (checked recursively)
#   any other type    -> not negligible (the final && is false)
# Note that a DIRECTIVE child inside a TEXT_PARAMETRIZED token counts as
# significant, unlike a stand-alone DIRECTIVE token.
# NOTE(review): the type is compared with numeric == here, while other code
# in this file compares the same TmplTokenType constants with eq -- confirm
# which comparison matches how the constants are defined.
+ my($x) = @_;
+ my $t = $x->type;
+ return !$extract_all_p && (
+ $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
+ $t == TmplTokenType::DIRECTIVE? 1:
+ $t == TmplTokenType::TEXT_PARAMETRIZED
# The inner $t below is a new lexical that shadows the outer $t; it holds
# the type of the child currently being mapped.
+ && join( '', map { my $t = $_->type;
+ $t == TmplTokenType::DIRECTIVE?
+ '1': $t == TmplTokenType::TAG?
+ '': token_negligible_p( $_ )?
+ '': '1' } @{$x->children} ) eq '' );
}
###############################################################################
sub remember ($$) {
# Record that $token produced the translatable text $string.
# %text maps a string key to an array ref of the tokens seen for that key.
# The removed lines stored every token under the raw $string; the added
# lines skip the entry when either the string or the token is negligible,
# and use TmplTokenizer::string_canon( $string ) as the key instead.
my($token, $string) = @_;
- $text{$string} = [] unless defined $text{$string};
- push @{$text{$string}}, $token;
+ # If we determine that the string is negligible, don't bother to remember
+ unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
+ my $key = TmplTokenizer::string_canon( $string );
+ $text{$key} = [] unless defined $text{$key};
+ push @{$text{$key}}, $token;
+ }
}
###############################################################################
last unless defined $s;
my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
if ($kind eq TmplTokenType::TEXT) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $t ) if $t =~ /\S/s;
} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
- #$t = TmplTokenizer::trim $t;
remember( $s, $s->form ) if $s->form =~ /\S/s;
} elsif ($kind eq TmplTokenType::TAG && %$attr) {
# value [tag=input], meta
sub generate_strings_list () {
# Emit all extracted strings.
# Writes each element returned by string_list to the OUTPUT filehandle,
# one per line.
# The added line comments out the negligible_p filter together with the
# statement's semicolon; this still parses because the printf is the last
# statement in the loop body, but a terminator must be restored if another
# statement is ever added after it.
# NOTE(review): presumably the filter is no longer needed here because
# remember() now drops negligible strings before they are stored -- confirm
# that string_list only returns strings recorded by remember().
for my $t (string_list) {
- printf OUTPUT "%s\n", $t unless negligible_p($t);
+ printf OUTPUT "%s\n", $t # unless negligible_p($t);
}
}
EOF
my $directory_re = quotemeta("$directory/");
for my $t (string_list) {
- next if negligible_p($t);
+ #next if negligible_p($t);
my $cformat_p;
for my $token (@{$text{$t}}) {
my $pathname = $token->pathname;
}
printf OUTPUT "#, c-format\n" if $cformat_p;
printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
+ TmplTokenizer::string_canon
TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
=back
-Right now it does about the same thing as text-extract2.pl but
-generates gettext-style output; however, because it is scanner-
-instead of parser-based, it is able to address the 4 weaknesses
-listed in translator_doc.txt. Ultimately, the goal is to make
-this able to do some kind of simple analysis on the input to
-produce gettext-style output with c-format strings, in order to
-facilitate translation to languages with a different word order
-than English.
-
-When the above is finished, the generated po file may contain
-some HTML tags in addition to %s strings.
+Note that this script is experimental and should still be
+considered unstable.
+
+Please refer to the explanation in tmpl_process3 for further
+details.
If you want to generate GNOME-style POTFILES.in files, such
files (passed to -f) can be generated thus: