X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=misc%2Ftranslator%2Fxgettext.pl;h=2da236360f38ca0f502a4d16a5e622723f432962;hb=09ab9d4769a978769d9e0e17c8d83aaa6876aa13;hp=3d5d11b042f814f959b4778ca574923ccbe26811;hpb=fb1cfd3dd395ba68d2344449f44357263ef62d04;p=koha.git

diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl
index 3d5d11b042..2da236360f 100755
--- a/misc/translator/xgettext.pl
+++ b/misc/translator/xgettext.pl
@@ -21,26 +21,44 @@ use vars qw( $charset_in $charset_out );
 
 ###############################################################################
 
-sub negligible_p ($) {
+sub string_negligible_p ($) {
     my($t) = @_;				# a string
     # Don't emit pure whitespace, pure numbers, pure punctuation,
     # single letters, or TMPL_VAR's.
     # Punctuation should arguably be translated. But without context
-    # they are untranslatable.
+    # they are untranslatable. Note that $t is a string, not a token object.
     return !$extract_all_p && (
-    	       TmplTokenizer::blank_p($t)		# blank or TMPL_VAR
+    	       TmplTokenizer::blank_p($t)	# blank or TMPL_VAR
 	    || $t =~ /^\d+$/			# purely digits
-	    || $t =~ /^[-\.,:;'"%\(\)\[\]\|]+$/	# pure punctuation w/o context
+	    || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
 	    || $t =~ /^[A-Za-z]$/		# single letters
-	);
+	)
+}
+
+sub token_negligible_p( $ ) {
+    my($x) = @_;
+    my $t = $x->type;
+    return !$extract_all_p && (
+	    $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
+	    $t == TmplTokenType::DIRECTIVE? 1:
+	    $t == TmplTokenType::TEXT_PARAMETRIZED
+		&& join( '', map { my $t = $_->type;
+			$t == TmplTokenType::DIRECTIVE?
+				'1': $t == TmplTokenType::TAG?
+					'': token_negligible_p( $_ )?
+					'': '1' } @{$x->children} ) eq '' );
 }
 
 ###############################################################################
 
 sub remember ($$) {
     my($token, $string) = @_;
-    $text{$string} = [] unless defined $text{$string};
-    push @{$text{$string}}, $token;
+    # If we determine that the string is negligible, don't bother to remember
+    unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
+	my $key = TmplTokenizer::string_canon( $string );
+	$text{$key} = [] unless defined $text{$key};
+	push @{$text{$key}}, $token;
+    }
 }
 
 ###############################################################################
@@ -69,10 +87,8 @@ sub text_extract (*) {
     last unless defined $s;
 	my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
 	if ($kind eq TmplTokenType::TEXT) {
-	    #$t = TmplTokenizer::trim $t;
 	    remember( $s, $t ) if $t =~ /\S/s;
 	} elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
-	    #$t = TmplTokenizer::trim $t;
 	    remember( $s, $s->form ) if $s->form =~ /\S/s;
 	} elsif ($kind eq TmplTokenType::TAG && %$attr) {
 	    # value [tag=input], meta
@@ -96,7 +112,7 @@ sub text_extract (*) {
 sub generate_strings_list () {
     # Emit all extracted strings.
     for my $t (string_list) {
-	printf OUTPUT "%s\n", $t unless negligible_p($t);
+	printf OUTPUT "%s\n", $t # unless negligible_p($t);
     }
 }
 
@@ -127,7 +143,7 @@ msgstr ""
 EOF
     my $directory_re = quotemeta("$directory/");
     for my $t (string_list) {
-	next if negligible_p($t);
+	#next if negligible_p($t);
 	my $cformat_p;
 	for my $token (@{$text{$t}}) {
 	    my $pathname = $token->pathname;
@@ -138,6 +154,7 @@ EOF
 	}
 	printf OUTPUT "#, c-format\n" if $cformat_p;
 	printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
+		TmplTokenizer::string_canon
 		TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
 	printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
 		TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
@@ -316,17 +333,11 @@ the gettext format.
 
 =back
 
-Right now it does about the same thing as text-extract2.pl but
-generates gettext-style output; however, because it is scanner-
-instead of parser-based, it is able to address the 4 weaknesses
-listed in translator_doc.txt.  Ultimately, the goal is to make
-this able to do some kind of simple analysis on the input to
-produce gettext-style output with c-format strings, in order to
-facilitate translation to languages with a different word order
-than English.
-
-When the above is finished, the generated po file may contain
-some HTML tags in addition to %s strings.
+Note that this script is experimental and should still be
+considered unstable.
+
+Please refer to the explanation in tmpl_process3 for further
+details.
 
 If you want to generate GNOME-style POTFILES.in files, such
 files (passed to -f) can be generated thus: