Adding xsl strings to po

[koha.git] / misc / translator / xgettext.pl
diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl

index 2da2363..996b952 100755 (executable)
--- a/misc/translator/xgettext.pl
+++ b/misc/translator/xgettext.pl
@@ -7,7 +7,9 @@ xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
  =cut
  
  use strict;
+use warnings;
  use Getopt::Long;
+use POSIX;
  use Locale::PO;
  use TmplTokenizer;
  use VerboseWarnings;
@@ -18,6 +20,9 @@ use vars qw( $extract_all_p );
  use vars qw( $pedantic_p );
  use vars qw( %text %translation );
  use vars qw( $charset_in $charset_out );
+use vars qw( $disable_fuzzy_p );
+use vars qw( $verbose_p );
+use vars qw( $po_mode_p );
  
  ###############################################################################
  
@@ -97,12 +102,16 @@ sub text_extract (*) {
                 if ($attr->{$a}) {
                     next if $a eq 'content' && $tag ne 'meta';
                     next if $a eq 'value' && ($tag ne 'input'
-                       || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio)$/)); # FIXME
+                       || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|submit)$/)); # FIXME
                     my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
                     $val = TmplTokenizer::trim $val;
                     remember( $s, $val ) if $val =~ /\S/s;
                 }
             }
+       } elsif ($s->has_js_data) {
+           for my $t (@{$s->js_data}) {
+               remember( $s, $t->[3] ) if $t->[0]; # FIXME
+           }
         }
      }
  }
@@ -112,7 +121,7 @@ sub text_extract (*) {
  sub generate_strings_list () {
      # Emit all extracted strings.
      for my $t (string_list) {
-       printf OUTPUT "%s\n", $t # unless negligible_p($t);
+       printf OUTPUT "%s\n", $t;
      }
  }
  
@@ -122,18 +131,26 @@ sub generate_po_file () {
      # We don't emit the Plural-Forms header; it's meaningless for us
      my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
      $pot_charset = TmplTokenizer::charset_canon $pot_charset;
+    # Time stamps aren't exactly right semantically. I don't know how to fix it.
+    my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
+    my $time_pot = $time;
+    my $time_po  = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
      print OUTPUT <<EOF;
  # SOME DESCRIPTIVE TITLE.
  # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
  # This file is distributed under the same license as the PACKAGE package.
  # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
  #
+EOF
+    print OUTPUT <<EOF unless $disable_fuzzy_p;
  #, fuzzy
+EOF
+    print OUTPUT <<EOF;
  msgid ""
  msgstr ""
  "Project-Id-Version: PACKAGE VERSION\\n"
-"POT-Creation-Date: 2004-02-05 20:55-0500\\n"
-"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+"POT-Creation-Date: $time_pot\\n"
+"PO-Revision-Date: $time_po\\n"
  "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
  "Language-Team: LANGUAGE <LL\@li.org>\\n"
  "MIME-Version: 1.0\\n"
@@ -143,7 +160,53 @@ msgstr ""
  EOF
      my $directory_re = quotemeta("$directory/");
      for my $t (string_list) {
-       #next if negligible_p($t);
+       if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
+           my($token, $n) = ($text{$t}->[0], 0);
+           printf OUTPUT "#. For the first occurrence,\n"
+                   if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
+           for my $param ($token->parameters_and_fields) {
+               $n += 1;
+               my $type = $param->type;
+               my $subtype = ($type == TmplTokenType::TAG
+                       && $param->string =~ /^<input\b/is?
+                               $param->attributes->{'type'}->[1]: undef);
+               my $fmt = TmplTokenizer::_formalize( $param );
+               $fmt =~ s/^%/%$n\$/;
+               if ($type == TmplTokenType::DIRECTIVE) {
+                   $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
+                   my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
+                           $2: undef;
+                   printf OUTPUT "#. %s: %s\n", $fmt,
+                       "$type" . (defined $name? " name=$name": '');
+               } else {
+                   my $name = $param->attributes->{'name'};
+                   my $value = $param->attributes->{'value'}
+                           unless $subtype =~ /^(?:text)$/;
+                   printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
+                           . (defined $name?  " name=$name->[1]": '')
+                           . (defined $value? " value=$value->[1]": '');
+               }
+           }
+       } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
+           my($token) = ($text{$t}->[0]);
+           printf OUTPUT "#. For the first occurrence,\n"
+                   if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
+           if ($token->string =~ /^<meta\b/is) {
+               my $type = $token->attributes->{'http-equiv'}->[1];
+               print OUTPUT "#. META http-equiv=$type\n" if defined $type;
+           } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
+               my $tag = uc($1);
+               my $type = (lc($tag) eq 'input'?
+                       $token->attributes->{'type'}: undef);
+               my $name = $token->attributes->{'name'};
+               printf OUTPUT "#. %s\n", $tag
+                   . (defined $type? " type=$type->[1]": '')
+                   . (defined $name? " name=$name->[1]": '');
+           }
+       } elsif ($text{$t}->[0]->has_js_data) {
+           printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
+           printf OUTPUT "#. SCRIPT\n";
+       }
         my $cformat_p;
         for my $token (@{$text{$t}}) {
             my $pathname = $token->pathname;
@@ -196,7 +259,7 @@ sub convert_translation_file () {
      }
      # The following assumption is correct; that's what HTML::Template assumes
      if (!defined $charset_in) {
-       $charset_in = $charset_out = TmplTokenizer::charset_canon 'iso8859-1';
+       $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
         warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
      }
  }
@@ -220,14 +283,17 @@ Output file location:
  HTML::Template options:
    -a, --extract-all              Extract all strings
        --pedantic-warnings        Issue warnings even for detected problems
-                                which are likely to be harmless
+                                 which are likely to be harmless
  
  Output details:
    -s, --sort-output              generate sorted output
    -F, --sort-by-file             sort output by file location
+  -v, --verbose                  explain what is being done
  
  Informative output:
        --help                     Display this help and exit
+
+Try `perldoc $0' for perhaps more information.
  EOF
      exit($exitcode);
  }
@@ -248,13 +314,16 @@ GetOptions(
      'charset=s'        => sub { $charset_in = $charset_out = $_[1] },  # INTERNAL
      'convert-from=s'                   => \$convert_from,
      'D|directory=s'                    => \$directory,
+    'disable-fuzzy'                    => \$disable_fuzzy_p,   # INTERNAL
      'f|files-from=s'                   => \$files_from,
      'I|input-charset=s'                        => \$charset_in,        # INTERNAL
      'pedantic-warnings|pedantic'       => sub { $pedantic_p = 1 },
      'O|output-charset=s'               => \$charset_out,       # INTERNAL
      'output|o=s'                       => \$output,
+    'po-mode'                          => \$po_mode_p,         # INTERNAL
      's|sort-output'                    => sub { $sort = 's' },
      'F|sort-by-file'                   => sub { $sort = 'F' },
+    'v|verbose'                                => \$verbose_p,
      'help'                             => sub { usage(0) },
  ) || usage_error;
  
@@ -269,22 +338,28 @@ usage_error('You cannot specify both --convert-from and --files-from')
         if defined $convert_from && defined $files_from;
  
  if (defined $output && $output ne '-') {
+    print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
      open(OUTPUT, ">$output") || die "$output: $!\n";
  } else {
+    print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
      open(OUTPUT, ">&STDOUT");
  }
  
  if (defined $files_from) {
+    print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
      open(INPUT, "<$files_from") || die "$files_from: $!\n";
      while (<INPUT>) {
         chomp;
-       my $h = TmplTokenizer->new( "$directory/$_" );
+       my $input = /^\//? $_: "$directory/$_";
+       my $h = TmplTokenizer->new( $input );
         $h->set_allow_cformat( 1 );
-       VerboseWarnings::set_input_file_name "$directory/$_";
+       VerboseWarnings::set_input_file_name $input;
+       print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
         text_extract( $h );
      }
      close INPUT;
  } else {
+    print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
      convert_translation_file;
  }
  generate_po_file;
@@ -309,7 +384,6 @@ A gettext-like format provides the following advantages:
  
  =item -
  
-(Future goal)
  Translation to non-English-like languages with different word
  order:  gettext's c-format strings can theoretically be
  emulated if we are able to do some analysis on the .tmpl input
@@ -333,8 +407,9 @@ the gettext format.
  
  =back
  
-Note that this script is experimental and should still be
-considered unstable.
+This script has already been in use for over a year and should
+be reasonable stable. Nevertheless, it is still somewhat
+experimental and there are still some issues.
  
  Please refer to the explanation in tmpl_process3 for further
  details.
@@ -342,17 +417,31 @@ details.
  If you want to generate GNOME-style POTFILES.in files, such
  files (passed to -f) can be generated thus:
  
-       (cd ../.. && find koha-tmpl/opac-tmpl/default/en
+       (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
                 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
-       (cd ../.. && find koha-tmpl/intranet-tmpl/default/en
+       (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
                 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
  
  This is, however, quite pointless, because the "create" and
  "update" actions have already been implemented in tmpl_process3.pl.
  
+=head2 Strings inside JavaScript
+
+In the SCRIPT elements, the script will attempt to scan for
+_("I<string literal>") patterns, and extract the I<string literal>
+as a translatable string.
+
+Note that the C-like _(...) notation is required.
+
+The JavaScript must actually define a _ function
+so that the code remains correct JavaScript.
+A suitable definition of such a function can be
+
+       function _(s) { return s } // dummy function for gettext
+
  =head1 SEE ALSO
  
-tmpl_process.pl,
+tmpl_process3.pl,
  xgettext(1),
  Locale::PO(3),
  translator_doc.txt
@@ -373,4 +462,9 @@ xgettext(1)'s sort option. This will result in translation
  strings inside the generated PO file spuriously moving about
  when tmpl_process3.pl calls msgmerge(1) to update the PO file.
  
+If a Javascript string has leading spaces, it will
+generate strings with spurious leading spaces,
+leading to failure to match the strings when actually generating
+translated files.
+
  =cut