From 6a631ecdcd789163702283df7d9a58e6fd0a8869 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sun, 27 Feb 2005 23:07:35 +0000 Subject: [PATCH] Experimental support for dBase .dbf files. Usage like this in all2xml.conf: [hda] dbf_file=/data/drustvene/hda/ISO.DBF type=dbf dbf_codepage=cp852 dbf_mapping=<<_END_OF_MAP_ ID_BROJ 001 ISBN_BROJ 010 SKUPINA1 200 SKUPINA2 205 SKUPINA4 210 SKUPINA5 215 SKUPINA6 225 SKUPINA7 300 ANOTACIJA 330 PREDMET1 610 PREDMET2 610 PREDMET3 510 UDK 675 REDALICA 700 SIGNATURA 990 _END_OF_MAP_ dbf type will use tag in import_xml and dbf_codepage will override codepage specified in import_xml file. Small code refactoring. git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@678 13eb9ef6-21d5-0310-b721-a9d68796d827 --- all2xml.pl | 85 +++++++++++++++++++++++++++++++++++++++++++++++-- hash_sf.pm | 28 ++++++++++++++++ isis_sf.pm | 28 ---------------- parse_format.pm | 6 +++- to_hash.pm | 39 +++++++++++++++++++++++ 5 files changed, 154 insertions(+), 32 deletions(-) create mode 100644 hash_sf.pm delete mode 100644 isis_sf.pm create mode 100644 to_hash.pm diff --git a/all2xml.pl b/all2xml.pl index 1c5123b..8f6438f 100755 --- a/all2xml.pl +++ b/all2xml.pl @@ -11,6 +11,7 @@ use Encode; #use GDBM_File; use Fcntl; # for O_RDWR use TDB_File; +use Carp; $|=1; @@ -62,7 +63,8 @@ my %type2tag = ( 'isis' => 'isis', 'excel' => 'column', 'marc' => 'marc', - 'feed' => 'feed' + 'feed' => 'feed', + 'dbf' => 'isis', # special case, re-use isis import_xml ); my $cache; # for cacheing @@ -112,6 +114,10 @@ sub data2xml { $cache->{tags_by_order} = \@sorted_tags; } + if (! @sorted_tags) { + print STDERR "WARNING: no tags for this type found in import_xml file!\n"; + } + # lookup key my $lookup_key; @@ -662,13 +668,20 @@ foreach my $database ($cfg->Sections) { print STDERR "opening lookup file '$lookup_file'\n"; } -print STDERR "reading ./import_xml/$type.xml\n"; + my $import_xml_file = "./import_xml/$type.xml"; + + if (! 
-r $import_xml_file) { + print STDERR "ERROR: file $import_xml_file not readable skipping!\n"; + next; + } + + print STDERR "reading $import_xml_file\n"; # extract just type basic my $type_base = $type; $type_base =~ s/_.+$//g; - $config=XMLin("./import_xml/$type.xml", ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 ); + $config=XMLin($import_xml_file, ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 ); # helper for progress bar sub fmt_time { @@ -927,6 +940,72 @@ print STDERR "using: $type...\n"; } # close lookup untie %lhash if (%lhash); + + } elsif ($type_base eq "dbf") { + + my $dbf_file = $cfg -> val($database, 'dbf_file') || die "$database doesn't have 'dbf_file' defined!"; + my $dbf_codepage = $cfg -> val($database, 'dbf_codepage') || die "$database doesn't have 'dbf_codepage' defined!"; + my $dbf_mapping = $cfg -> val($database, 'dbf_mapping') || die "$database doesn't have 'dbf_mapping' defined!"; + + $import2cp = Text::Iconv->new($dbf_codepage,$codepage); + require XBase; + my $db = new XBase $dbf_file; + + if (! $db) { + print STDERR "ERROR: can't read DBF database: $dbf_file, skipping...\n"; + next; + } + + my $max_rowid = $db->last_record; + + print STDERR "Reading database: $dbf_file [$max_rowid rows]\n"; + + my %dbf2iso; + foreach my $m (split(/[\n\r]+/,$dbf_mapping)) { + my ($col,$fld) = split(/\s+/,$m,2); + $dbf2iso{$col} = $fld; + } + +#print STDERR "## dbf2iso: ",Dumper(\%dbf2iso),"\n## /dbf2iso\n"; + + # bad, bad... + require "to_hash.pm"; + + foreach my $row_id (0 .. 
$max_rowid) { + my $dbf_row = $db->get_record_as_hash($row_id); + if ($dbf_row) { + +#print STDERR "## dbf_row: ",Dumper($dbf_row),"\n## /dbf_row\n"; + # apply mapping from config file + # all unspecified records will get _ in + # front of them - _DELETE will be __DELETE + my $rec; + map { + my $new_fld = $dbf2iso{$_} || '_'.$_; + my $data = $dbf_row->{$_}; + push @{ $rec->{$new_fld} }, $data if ($data && $data !~ /^(?:\s+|\$a\.|)$/); + } keys %{$dbf_row}; +#print STDERR "## rec: ",Dumper($rec),"\n## /rec\n"; + my $row = to_hash($row_id+1, $rec); + + $row->{mfn} = $row_id+1; + $row->{record} = $rec; + +#print STDERR "## row: ",Dumper($row),"\n## /row\n"; + progress($row->{mfn}, $max_rowid); + + my $swishpath = $path."#".int($row->{mfn}); + + if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) { + $xml = $cp2utf->convert($xml); + use bytes; # as opposed to chars + print "Path-Name: $swishpath\n"; + print "Content-Length: ".(length($xml)+1)."\n"; + print "Document-Type: XML\n\n$xml\n"; + } + } + } + print STDERR "\n"; } } diff --git a/hash_sf.pm b/hash_sf.pm new file mode 100644 index 0000000..8af05d3 --- /dev/null +++ b/hash_sf.pm @@ -0,0 +1,28 @@ +# +# hash_sf($row_data,'field'[,'subfield']) +# +# e.g. hash_sf($row,'700','a') +# +sub hash_sf { + my $row = shift @_; + my $field = shift @_; + my $subfield = shift @_; + + my $i = shift @_ || 0; + + my $out; + + if ($row->{$field}->[$i]) { + if (! $subfield) { + # subfield list undef, empty or no defined subfields for this record + my $all_sf = $row->{record}->{$field}->[$i] || confess "can't find field $field:$i",Dumper($row); + $all_sf =~ s/[\^\$]./ /g; # nuke definitions + return $all_sf; + } + my $sf = $row->{$field}->[$i]->{$subfield}; + return $sf if ($sf); + } +} + +1; + diff --git a/isis_sf.pm b/isis_sf.pm deleted file mode 100644 index f118318..0000000 --- a/isis_sf.pm +++ /dev/null @@ -1,28 +0,0 @@ -# -# isis_sf($isis_row,'isis_field'[,'subfield']) -# -# e.g. 
isis_sf($row,'700','a') -# -sub isis_sf { - my $row = shift @_; - my $isis_id = shift @_; - my $subfield = shift @_; - - my $i = shift @_ || 0; - - my $out; - - if ($row->{$isis_id}->[$i]) { - if (! $subfield) { - # subfield list undef, empty or no defined subfields for this record - my $all_sf = $row->{record}->{$isis_id}->[$i]; - $all_sf =~ s/\^./ /g; # nuke definitions - return $all_sf; - } - my $sf = $row->{$isis_id}->[$i]->{$subfield}; - return $sf if ($sf); - } -} - -1; - diff --git a/parse_format.pm b/parse_format.pm index e348dde..12f7d96 100644 --- a/parse_format.pm +++ b/parse_format.pm @@ -10,13 +10,17 @@ sub parse_format { my $i = shift || 0; # isis repeatable number my $codepage = shift || die "parse_format must be called with codepage!"; if ($type eq "isis") { - return parse_iso_format($format,$row,$i,$codepage,'isis_sf'); + return parse_iso_format($format,$row,$i,$codepage,'hash_sf'); } elsif ($type eq "excel") { return parse_excel_format($format,$row,$i,$codepage); } elsif ($type eq "marc") { return parse_iso_format($format,$row,$i,$codepage,'marc_sf'); } elsif ($type eq "feed") { return parse_feed_format($format,$row,$i,$codepage); + } elsif ($type eq "dbf") { + return parse_iso_format($format,$row,$i,$codepage,'hash_sf'); + } else { + confess "FATAL: unknown type '$type'"; } } diff --git a/to_hash.pm b/to_hash.pm new file mode 100644 index 0000000..a2813b1 --- /dev/null +++ b/to_hash.pm @@ -0,0 +1,39 @@ +# This is slight modification of to_hash as included in Biblio::Isis +# +# This is incredibly bad (duplicate code), but it would have to wait for +# WebPac v2 to be fixed by extracting all file imported in sane OO way. + + +sub to_hash { + my $mfn = shift || confess "need mfn!"; + my $row = shift || confess "need data"; + + # init record to include MFN as field 000 + my $rec = { '000' => [ $mfn ] }; + + foreach my $k (keys %{$row}) { + foreach my $l (@{$row->{$k}}) { + + my $val; + + # has identifiers? 
+ #($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/); + + # has subfields? + if ($l =~ m/[\^\$]/) { + foreach my $t (split(/[\^\$]/,$l)) { + next if (! $t); + $val->{substr($t,0,1)} = substr($t,1); + } + } else { + $val = $l; + } + + push @{$rec->{$k}}, $val; + } + } + + return $rec; +} + +1; -- 2.20.1