use Data::Dumper;
use XML::Simple;
use Text::Unaccent 1.02; # 1.01 won't compile on my platform,
-require Unicode::Map8;
-use DBI;
+use Text::Iconv;
-my $config=XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1);
-my $dbh = DBI->connect("DBI:Pg:dbname=webpac","","") || die $DBI::errstr; # FIX
-# FIX; select relname from pg_class where relname like 'index_%' ;
-$dbh->begin_work || die $dbh->errstr();
-$dbh->do("delete from index_author") || die $dbh->errstr();
-$dbh->do("delete from index_title") || die $dbh->errstr();
+$|=1;
+
+my $config;
+
+$config=XMLin(undef, forcearray => [ 'isis' ], forcecontent => 1);
+
+use index_DBI; # there is no other, right now ;-)
+
+my $index = new index_DBI(); # open index
my %opts;
#print Dumper($config->{indexer});
#print "-" x 70,"\n";
-# how to convert isis code page to UTF8?
-my $isis_map = Unicode::Map8->new($config->{isis_codepage}) || die;
+Text::Iconv->raise_error(1); # Conversion errors raise exceptions
+
+my $isis_codepage = Text::Iconv->new($config->{isis_codepage},'UTF8');
+my $index_codepage = Text::Iconv->new($config->{isis_codepage},$config->{index_codepage});
+my $cludge_codepage = Text::Iconv->new('UTF8','ISO8859-1');
sub isis2xml {
+ use xmlify;
+
my $row = shift @_;
my $xml;
- $xml->{db_dir} = [ $db_dir ]; # FIX remove?
-
- sub isis_sf {
- my $row = shift @_;
- my $isis_id = shift @_;
- my $subfield = shift @_;
- if ($row->{$isis_id}->[0]) {
- my $sf = OpenIsis::subfields($row->{$isis_id}->[0]);
- if (! defined $subfield || length($subfield) == 0) {
- # subfield list undef, empty or no defined subfields for this record
- my $all_sf = $row->{$isis_id}->[0];
- $all_sf =~ s/\^./ /g; nuke definirions
- return $all_sf;
- } elsif ($sf->{$subfield}) {
- return $sf->{$subfield};
- }
- }
- }
+ $xml .= xmlify('db_dir',$db_dir); # FIX remove?
+
+ use parse_format;
+
foreach my $field (keys %{$config->{indexer}}) {
- my $display_data = "";
my $swish_data = "";
+ my $display_data = "";
my $index_data = "";
foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {
- my $display_tmp = "";
- my $swish_tmp = "";
- my $index_tmp = "";
-
my $format = $x->{content};
- my $s = 1; # swish only
- my $d = 1; # display only
- my $i = 0; # index only
+ my ($s,$d,$i) = (1,1,0); # swish, display default
$s = 0 if (lc($x->{type}) eq "display");
$d = 0 if (lc($x->{type}) eq "swish");
($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index");
#print STDERR "## s: $s d: $d i: $i ## $format ##\n";
- # parse format
- my $prefix = "";
- if ($format =~ s/^([^\d]+)//) {
- $prefix = $1;
- }
- while ($format) {
- if ($format =~ s/^(\d\d\d)(\w?)//) {
- my $isis_tmp = isis_sf($row,$1,$2);
- if ($isis_tmp) {
-# $display_tmp .= $prefix . "/$1/$2/".$isis_tmp if ($d);
- $display_tmp .= $prefix . $isis_tmp if ($d);
- $swish_tmp .= $isis_tmp." " if ($s);
- $index_tmp .= $prefix . $isis_tmp if ($i);
-#print STDERR " $isis_tmp <--\n";
- }
- $prefix = "";
- } elsif ($format =~ s/^([^\d]+)//) {
- $prefix = $1;
- } else {
- print STDERR "WARNING: unparsed format '$format'\n";
- last;
- };
- }
- # add suffix
- $display_tmp .= $prefix if ($display_tmp);
- $index_tmp .= $prefix if ($index_tmp);
-# $display_data .= $display_tmp if ($display_tmp ne "");
-# $swish_data .= $swish_tmp if ($swish_tmp ne "");
- $display_data .= $display_tmp;
- $swish_data .= $swish_tmp;
- $index_data .= $index_tmp;
+ $format = $cludge_codepage->convert($format);
+ my ($swish,$display) = parse_format($format,$row);
+#print STDERR "s: $swish\nd: $display\n" if ($swish);
+
+#print STDERR "swish: $swish<-- display: $display<--\n";
+ # FIX: this is an ugly, UGLY kludge: OpenIsis returns
+ # UTF8 encoding of strings, but as if the source charset
+ # were ISO8859-1 and not some other. This breaks our
+ # isis character encoding, so we first convert it
+ # back to ISO8859-1 (which can actually be a different
+ # encoding in isis); see $cludge_codepage
+
+ $swish_data .= $swish if ($s && $swish);
+ $display_data .= $display if ($d && $display);
+ $index_data .= $display if ($i && $display);
+ }
+#print STDERR "s_d: $swish_data\nd_d: $display_data\n" if ($swish_data);
+ if ($display_data) {
+ $display_data = $isis_codepage->convert($display_data)."##" || $display_data;
+ $xml .= xmlify($field."_display", $display_data);
+ }
+ if ($swish_data) {
+ my $i = Text::Iconv->new($config->{isis_codepage},'ISO8859-2');
+ $swish_data = $i->convert($swish_data);
+ $xml .= xmlify($field."_swish",unac_string('ISO8859-2',$swish_data));
+ #$swish_data = $isis_codepage->convert($swish_data)."##" || $swish_data;
+ #$xml .= xmlify($field."_swish",unac_string($config->{isis_codepage},$swish_data));
}
-#print "--display:$display_data\n--swish:$swish_data\n";
- #$xml->{$field."_display"} = $isis_map->tou($display_data)->utf8 if ($display_data);
- #$xml->{$field."_swish"} = unac_string($config->{isis_codepage},$swish_data) if ($swish_data);
- $xml->{$field."_display" } = [ $isis_map->tou($display_data)->utf8 ] if ($display_data);
- $xml->{$field."_swish"} = [ unac_string($config->{isis_codepage},$swish_data) ] if ($swish_data);
# index
if ($index_data && $index_data ne "") {
- my $sql = "select $field from index_$field where upper($field)=upper(?)";
- my $sth = $dbh->prepare($sql) || die $dbh->errstr();
- $sth->execute($index_data) || die "SQL: $sql; ".$dbh->errstr();
-#print STDERR "--->$index_data<---\n";
- if (! $sth->fetchrow_hashref) {
- my $sql = "insert into index_$field values (?)";
- my $sth = $dbh->prepare($sql) || die $dbh->errstr();
-#print STDERR "$sql: $index_data<!----\n";
- $sth->execute($index_data) || die "SQL: $sql; ".$dbh->errstr();
- }
+ $index_data = $index_codepage->convert($index_data) || $index_data;
+ $index->insert($field, $index_data, $db_dir);
}
}
if ($xml) {
- return XMLout($xml, rootname => 'xml', keeproot => 0, noattr => 0 );
+#print STDERR "x: $xml\n";
+ return "<xml>\n$xml</xml>\n";
} else {
return;
}
print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
+ my $path = $isis_db;
+ $path =~ s#$config->{isis_data}/*##g;
+
my $last_p = 0;
-# { my $row_id = 1;
+# { my $row_id = 4514;
# FIX
for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
my $row = OpenIsis::read( $db, $row_id );
if ($row && $row->{mfn}) {
-
+#print STDERR "mfn: ",$row->{mfn},"\n";
# output current process indicator
my $p = int($row->{mfn} * 100 / $max_rowid);
if ($p != $last_p) {
}
if (my $xml = isis2xml($row)) {
- my $path = $isis_db;
- $path =~ s#$config->{isis_data}/*##g;
- my $out = "Path-Name: $path#".$row->{mfn}."\n";
- $out .= "Content-Length: ".(length($xml)+1)."\n";
- $out .= "Document-Type: XML\n\n$xml\n";
- print $out;
+#print STDERR "--ret-->$xml\n";
+ print "Path-Name: $path#".int($row->{mfn})."\n";
+ print "Content-Length: ".(length($xml)+1)."\n";
+ print "Document-Type: XML\n\n$xml\n";
}
}
}
print STDERR "\n";
}
-$dbh->commit || die $dbh->errstr();
+# call this to commit index
+$index->close;
1;
__END__