+
+ # begin real work: go field by field
+ foreach my $field (@sorted_tags) {
+
+ $field=x($field);
+ $field_usage{$field}++;
+
+ my $swish_data = "";
+ my $swish_exact_data = "";
+ my $display_data = "";
+ my @index_data;
+ my $line_delimiter;
+
+ my ($swish,$display);
+
+ my $tag = $type2tag{$type} || die "can't find which tag to use for type $type";
+
+ # is this field page-by-page?
+ my $iterate_by_page = $config->{indexer}->{$field}->{iterate_by_page};
+ push @page_fields,$field if ($iterate_by_page);
+ my %page_max = ();
+ # line delimiter for page-by-page fields; falls back to '<br/>' when not configured
+ my $page_line_delimiter = $config->{indexer}->{$field}->{page_line_delimiter} || '<br/>';
+ $cache->{index_delimiter}->{$field} = $config->{indexer}->{$field}->{index_delimiter};
+
+ my $format_name = $config->{indexer}->{$field}->{format_name};
+ my $format_delimiter = $config->{indexer}->{$field}->{format_delimiter};
+ if ($format_name && $format_delimiter) {
+ $cache->{format}->{$field}->{format_name} = $format_name;
+ $cache->{format}->{$field}->{format_delimiter} = $format_delimiter;
+ }
+
+ foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) {
+
+ $x = unroll_x($x);
+
+ my $format = x($x->{content});
+ my $delimiter = x($x->{delimiter}) || ' ';
+
+ my $repeat_off = 0; # init repeatable offset
+
+ my ($s,$se,$d,$i,$il) = init_visible_type($x->{type});
+
+ # what will separate last line from this one?
+ if ($display_data && $x->{append}) {
+ $line_delimiter = $delimiter;
+ } elsif ($display_data) {
+ $line_delimiter = '<br/>';
+ }
+
+ # init vars so that we go into while...
+ ($swish,$display) = (1,1);
+
+ # placeholder for all repeatable entries for index
+
+ # mkformat($tag, $data) - apply a named <format> template to $data.
+ #
+ # $tag  - hash reference describing the tag. Its format_name selects
+ #         $config->{format}->{NAME}->{content}, which is used as an
+ #         sprintf template; its optional format_delimiter is the pattern
+ #         on which $data is split into template arguments.
+ # $data - string to format.
+ #
+ # Returns nothing when $data is false; returns $data unchanged when the
+ # tag has no format_name, or when the number of pieces does not match
+ # the number of %s placeholders; otherwise returns the formatted string.
+ # Dies when $tag is missing or when the named format is not defined.
+ # All configuration values are passed through x(), defined elsewhere.
+ sub mkformat($$) {
+     my $x = shift || die "mkformat needs tag reference";
+     my $data = shift || return;
+
+     # tags without format_name are passed through untouched
+     my $format_name = x($x->{format_name}) || return $data;
+     my $fmt = x($config->{format}->{$format_name}->{content}) || die "<format name=\"$format_name\"> is not defined!";
+
+     # without a delimiter the whole value fills a single placeholder
+     my $format_delimiter = x($x->{format_delimiter});
+     my @parts = $format_delimiter ? split(/$format_delimiter/,$data) : ($data);
+
+     # count %s placeholders without rewriting the template
+     my $nr = () = $fmt =~ m/%s/g;
+
+     # piece count and placeholder count disagree: leave data as it was
+     return $data if (scalar(@parts) != $nr);
+
+     return sprintf($fmt,@parts);
+ }
+
+ # while because of repeatable fields
+ while ($swish || $display) {
+ my $page = $repeat_off;
+ $page_max{$field} = $page if ($iterate_by_page && $page > ($page_max{$field} || 0));
+ ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp);
+ if ($repeat_off > 1000) {
+ print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n";
+ last;
+ }
+
+ # is this field a lookup?
+ if ($display && $x->{lookup}) {
+ my $null = "<!-- null -->";
+ if ($use_lhash_cache) {
+ if (!defined($cache->{lhash}->{$display})) {
+ my $new_display = $lhash{$display};
+ if (defined($new_display)) {
+#print STDERR "lookup cache store '$display' = '$new_display'\n";
+ $display = $new_display;
+ $cache->{lhash}->{$display} = $new_display;
+ } else {
+# print STDERR "WARNING: lookup for '$display' didn't find anything.\n";
+ $display = "";
+ $cache->{lhash}->{$display} = $null;
+ }
+ } else {
+ $display = $cache->{lhash}->{$display};
+ }
+ } else {
+ $display = $lhash{$display} || $null;
+ }
+ }
+
+ # filter="name" ; filter this field through
+ # filter/[name].pm
+ my $filter = $x->{filter};
+ if ($filter && !$cache->{filter_loaded}->{$filter}) {
+ require "filter/".$filter.".pm";
+ $cache->{filter_loaded}->{$filter}++;
+ }
+ # type="swish" ; field for swish
+ if ($swish) {
+ my $tmp = $swish;
+ if ($filter && ($s || $se)) {
+ no strict 'refs';
+ $tmp = join(" ",&$filter($tmp)) if ($s || $se);
+ }
+
+ $swish_data .= $tmp if ($s && $tmp);
+ $swish_exact_data .= "xxbxx $tmp xxexx " if ($tmp && $tmp ne "" && $se);
+ }
+
+ # type="display" ; field for display
+ if ($d && $display) {
+ my $ldel = $delimiter;
+ if ($line_delimiter && $display_data) {
+ $ldel = $line_delimiter;
+ }
+ if ($filter) {
+ no strict 'refs';
+ my @arr;
+ foreach my $tmp (&$filter($display)) {
+ my $tmp2 = mkformat($x,$tmp);
+ push @arr,$tmp2 if ($tmp2);
+ }
+ $display_data .= $ldel if ($display_data && @arr);
+ $display_data .= join($delimiter,@arr);
+ } else {
+ $display_data .= $ldel if ($display_data);
+ my $tmp = mkformat($x,$display);
+ $display_data .= $tmp if ($tmp);
+ }
+ }
+
+ # type="index" ; insert into index
+ my $idisplay;
+ if ($i && $display) {
+ $idisplay = $display;
+ if ($filter) {
+ no strict 'refs';
+ $idisplay = &$filter($idisplay);
+ }
+ push @index_data, $idisplay if ($idisplay && !$iterate_by_page);
+ }
+
+ # store fields in lookup
+ if ($il && $display) {
+ if (lc($x->{type}) eq "lookup_key") {
+ if ($lookup_key) {
+ print STDERR "WARNING: try to redefine lookup_key (keys shouldn't be repeatable fields!)";
+ } else {
+ if ($filter) {
+ no strict 'refs';
+ $lookup_key = &$filter($display);
+ } else {
+ $lookup_key = $display;
+ }
+ }
+ } elsif (lc($x->{type}) eq "lookup_val") {
+ if ($lookup_key) {
+ if ($filter) {
+ no strict 'refs';
+ $lhash{$lookup_key} = &$filter($display);
+ } else {
+ $lhash{$lookup_key} = $display;
+ }
+ } else {
+ print STDERR "WARNING: no lookup_key defined for '$display'?";
+ }
+ }
+
+ }
+
+ # store data for page-by-page repeatable fields
+ if ($iterate_by_page) {
+ # iterate_fld($cache, $what, $field, $page, $data, $append)
+ #
+ # Store $data for one page of a page-by-page field in
+ # $cache->{$what}->{$field}->[$page]. The first value for a page is
+ # stored as-is; later values are appended after a delimiter, which is
+ # a single space when $append is true and the field's page line
+ # delimiter otherwise. Does nothing when $data is false.
+ sub iterate_fld($$$$$$) {
+     my ($cache,$what,$field,$page,$data,$append) = @_;
+     return if (!$data);
+
+     # Look the delimiter up by field name instead of reading the
+     # enclosing loop's $page_line_delimiter: a named sub nested in
+     # another sub binds an outer "my" variable only once, so it would
+     # keep seeing the value of the first field ever processed.
+     # NOTE(review): assumes $config is visible here the same way
+     # mkformat uses it -- confirm.
+     my $ldel = $config->{indexer}->{$field}->{page_line_delimiter} || '<br/>';
+     $ldel = " " if ($append);
+
+     if (! $cache->{$what}->{$field}->[$page]) {
+         $cache->{$what}->{$field}->[$page] = $data;
+     } else {
+         $cache->{$what}->{$field}->[$page] .= $ldel.$data;
+     }
+ }
+
+ if ($display_data) {
+ iterate_fld($cache,'display_data',$field,$page,$display_data,$x->{append});
+ }
+ $display_data = "";
+ if ($swish_data) {
+ iterate_fld($cache,'swish_data',$field,$page,$swish_data,$x->{append});
+ $swish_data = "";
+ }
+ if ($swish_exact_data) {
+ iterate_fld($cache,'swish_exact_data',$field,$page,$swish_exact_data,$x->{append});
+ $swish_exact_data = "";
+ }
+
+ if ($idisplay) {
+ my $ldel=$page_line_delimiter;
+ my @index_data;
+ if ($cache->{index_data}->{$field}->[$page]) {
+
+ @index_data = @{$cache->{index_data}->{$field}->[$page]};
+ }
+ if ($x->{append}) {
+ if (@index_data) {
+ $index_data[$#index_data] .= $idisplay;
+ } else {
+ push @index_data, $idisplay;
+ }
+ } else {
+ push @index_data, $idisplay;
+ }
+ $idisplay = "";
+ @{$cache->{index_data}->{$field}->[$page]} = @index_data;
+ }
+ }
+ }
+
+ if (! $iterate_by_page) {
+ my $idel = $x->{index_delimiter};
+ # fill data in index
+ foreach my $tmp (@index_data) {
+ my $i = $d = $tmp;
+ if ($idel && $tmp =~ m/$idel/) {
+ ($i,$d) = split(/$idel/,$tmp);
+ }
+ $index->insert($field, $i, $d, $path);
+ }
+ @index_data = ();
+ }
+ }
+
+ # now try to parse variables from configuration file
+ foreach my $x (@{$config->{indexer}->{$field}->{'config'}}) {
+
+ $x = unroll_x($x);
+
+ my $delimiter = x($x->{delimiter}) || ' ';
+ my $val = $cfg->val($database, x($x->{content}));
+
+ # FIXME index_lookup is not supported!
+ my ($s,$se,$d,$i,$il) = init_visible_type($x->{type});
+
+ if ($val) {
+ $display_data .= $delimiter.$val if ($d);
+ $swish_data .= " ".$val if ($s);
+ $index->insert($field, $val, $path) if ($i);
+ }
+
+ if ($iterate_by_page) {
+ # FIXME data from config tag will appear just
+ # on first page!!!
+ my $page = 0;
+ if ($display_data) {
+ $cache->{display_data}->{$field}->[$page] = $display_data;
+ $display_data = "";
+ }
+ if ($swish_data) {
+ $cache->{swish_data}->{$field}->[$page] = $swish_data;
+ $swish_data = "";
+ }
+ if ($swish_exact_data) {
+ $cache->{swish_exact_data}->{$field}->[$page] = $swish_exact_data;
+ $swish_exact_data = "";
+ }
+ }
+ }
+
+ # save data page-by-page
+ foreach my $field (@page_fields) {
+ my $nr_pages = $page_max{$field} || next;
+#print STDERR "field '$field' iterate over ",($nr_pages || 0)," pages...\n";
+#print STDERR Dumper($cache->{display_data});
+ for (my $page=0; $page <= $nr_pages; $page++) {
+ my $display_data;
+ if ($cache->{format}->{$field}) {
+ my $tmp = mkformat($cache->{format}->{$field},$cache->{display_data}->{$field}->[$page]);
+ $display_data=$tmp if ($tmp);
+ } else {
+ $display_data = $cache->{display_data}->{$field}->[$page];
+ }
+ if ($display_data) { # default
+ if ($field eq "headline") {
+ $xml .= xmlify("headline", $display_data);
+ } else {
+
+ # fallback to empty field name if needed
+ $html .= get_field_name($config,$field,$field_usage{$field}) || '';
+ $html .= "#-#".$display_data."###\n";
+ }
+ }
+
+ my $swish_data = $cache->{swish_data}->{$field}->[$page];
+ if ($swish_data) {
+ # remove extra spaces
+ $swish_data =~ s/ +/ /g;
+ $swish_data =~ s/ +$//g;
+
+ $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
+ }
+
+ my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page];
+ if ($swish_exact_data) {
+ $swish_exact_data =~ s/ +/ /g;
+ $swish_exact_data =~ s/ +$//g;
+
+ # add delimiters before and after word.
+ # That is required to produce exact match
+ $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
+ }
+
+ my $idel = $cache->{index_delimiter}->{$field};
+ foreach my $tmp (@{$cache->{index_data}->{$field}->[$page]}) {
+ my $i = $tmp;
+ my $d = $tmp;
+ if ($idel && $tmp =~ m/$idel/) {
+ ($i,$d) = split(/$idel/,$tmp);
+ }
+ $index->insert($field, $i, $d, $path);
+#print STDERR "index [$idel] $field: $i --> $d [$path]\n";
+ }
+ }
+
+ }
+
+ if (! $iterate_by_page) {
+ if ($display_data) {
+ if ($field eq "headline") {
+ $xml .= xmlify("headline", $display_data);
+ } else {
+
+ # fallback to empty field name if needed
+ $html .= get_field_name($config,$field,$field_usage{$field}) || '';
+ $html .= "#-#".$display_data."###\n";
+ }
+ }
+ if ($swish_data) {
+ # remove extra spaces
+ $swish_data =~ s/ +/ /g;
+ $swish_data =~ s/ +$//g;
+
+ $xml .= xmlify($field."_swish", my_unac_string($codepage,$swish_data));
+ }
+
+ if ($swish_exact_data) {
+ $swish_exact_data =~ s/ +/ /g;
+ $swish_exact_data =~ s/ +$//g;
+
+ # add delimiters before and after word.
+ # That is required to produce exact match
+ $xml .= xmlify($field."_swish_exact", my_unac_string($codepage,$swish_exact_data));
+ }
+ }
+ }
+
+ # dump formatted output in <html>
+ if ($html) {
+ #$xml .= xmlify("html",$html);
+ $xml .= "<html><![CDATA[ $html ]]></html>";
+ }
+
+ if ($xml) {
+ $xml .= $add_xml if ($add_xml);
+ return "<xml>\n$xml</xml>\n";
+ } else {
+ return;
+ }
+}
+
+##########################################################################
+
+# read configuration for this script
+# NOTE(review): Config::IniFiles->new returns undef when the file cannot
+# be read or parsed; $cfg is not checked here -- confirm later code copes.
+my $cfg = Config::IniFiles->new( -file => $config_file );
+
+# read global.conf configuration; fail early with the parser's own
+# messages, because $cfg_global is dereferenced right below
+my $cfg_global = Config::IniFiles->new( -file => 'global.conf' )
+    or die "can't read global.conf: ", join("; ", @Config::IniFiles::errors), "\n";
+
+# open index; connection parameters come from the [global] section
+$index = index_DBI->new(
+    $cfg_global->val('global', 'dbi_dbd'),
+    $cfg_global->val('global', 'dbi_dsn'),
+    $cfg_global->val('global', 'dbi_user'),
+    $cfg_global->val('global', 'dbi_passwd') || '',
+    );
+
+# show_progress setting from the [global] section
+my $show_progress = $cfg_global->val('global', 'show_progress');
+
+my $my_unac_filter = $cfg_global->val('global', 'my_unac_filter');
+if ($my_unac_filter) {
+ print STDERR "using $my_unac_filter to filter characters for search\n";
+ require $my_unac_filter;
+} else {
+ print STDERR "### fallback to default my_unac_string!\n";
+ eval q{
+ sub main::my_unac_string($$) {
+ my ($charset, $string) = (@_);
+ return $string;