Experimental support for dBase .dbf files. Usage like this in all2xml.conf:
authorDobrica Pavlinusic <dpavlin@rot13.org>
Sun, 27 Feb 2005 23:07:35 +0000 (23:07 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Sun, 27 Feb 2005 23:07:35 +0000 (23:07 +0000)
[hda]
       dbf_file=/data/drustvene/hda/ISO.DBF
       type=dbf
       dbf_codepage=cp852
       dbf_mapping=<<_END_OF_MAP_
ID_BROJ                001
ISBN_BROJ      010
SKUPINA1       200
SKUPINA2       205
SKUPINA4       210
SKUPINA5       215
SKUPINA6       225
SKUPINA7       300
ANOTACIJA      330
PREDMET1       610
PREDMET2       610
PREDMET3       510
UDK            675
REDALICA       700
SIGNATURA      990
_END_OF_MAP_

dbf type will use <isis> tag in import_xml and dbf_codepage will
override codepage specified in import_xml file.

Small code refactoring.

git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@678 13eb9ef6-21d5-0310-b721-a9d68796d827

all2xml.pl
hash_sf.pm [new file with mode: 0644]
isis_sf.pm [deleted file]
parse_format.pm
to_hash.pm [new file with mode: 0644]

index 1c5123b..8f6438f 100755 (executable)
@@ -11,6 +11,7 @@ use Encode;
 #use GDBM_File;
 use Fcntl;     # for O_RDWR
 use TDB_File;
+use Carp;
 
 $|=1;
 
@@ -62,7 +63,8 @@ my %type2tag = (
        'isis' => 'isis',
        'excel' => 'column',
        'marc' => 'marc',
-       'feed' => 'feed'
+       'feed' => 'feed',
+       'dbf' => 'isis',        # special case, re-use isis import_xml
 );
 
 my $cache;     # for cacheing
@@ -112,6 +114,10 @@ sub data2xml {
                $cache->{tags_by_order} = \@sorted_tags;
        }
 
+       if (! @sorted_tags) {
+               print STDERR "WARNING: no tags for this type found in import_xml file!\n";
+       }
+
        # lookup key
        my $lookup_key;
 
@@ -662,13 +668,20 @@ foreach my $database ($cfg->Sections) {
                print STDERR "opening lookup file '$lookup_file'\n";
        }
 
-print STDERR "reading ./import_xml/$type.xml\n";
+       my $import_xml_file = "./import_xml/$type.xml";
+
+       if (! -r $import_xml_file) {
+               print STDERR "ERROR: file $import_xml_file not readable skipping!\n";
+               next;
+       }
+
+       print STDERR "reading $import_xml_file\n";
 
        # extract just type basic
        my $type_base = $type;
        $type_base =~ s/_.+$//g;
 
-       $config=XMLin("./import_xml/$type.xml", ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 );
+       $config=XMLin($import_xml_file, ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 );
 
        # helper for progress bar
        sub fmt_time {
@@ -927,6 +940,72 @@ print STDERR "using: $type...\n";
                }
                # close lookup
                untie %lhash if (%lhash);
+
+       } elsif ($type_base eq "dbf") {
+
+               my $dbf_file = $cfg -> val($database, 'dbf_file') || die "$database doesn't have 'dbf_file' defined!";
+               my $dbf_codepage = $cfg -> val($database, 'dbf_codepage') || die "$database doesn't have 'dbf_codepage' defined!";
+               my $dbf_mapping = $cfg -> val($database, 'dbf_mapping') || die "$database doesn't have 'dbf_mapping' defined!";
+
+               $import2cp = Text::Iconv->new($dbf_codepage,$codepage);
+               require XBase;
+               my $db = new XBase $dbf_file;
+
+               if (! $db) {
+                       print STDERR "ERROR: can't read DBF database: $dbf_file, skipping...\n";
+                       next;
+               }
+
+               my $max_rowid = $db->last_record;
+
+               print STDERR "Reading database: $dbf_file [$max_rowid rows]\n";
+
+               my %dbf2iso;
+               foreach my $m (split(/[\n\r]+/,$dbf_mapping)) {
+                       my ($col,$fld) = split(/\s+/,$m,2);
+                       $dbf2iso{$col} = $fld;
+               }
+
+#print STDERR "## dbf2iso: ",Dumper(\%dbf2iso),"\n## /dbf2iso\n";
+
+               # bad, bad...
+               require "to_hash.pm";
+
+               foreach my $row_id (0 .. $max_rowid) {
+                       my $dbf_row = $db->get_record_as_hash($row_id);
+                       if ($dbf_row) {
+
+#print STDERR "## dbf_row: ",Dumper($dbf_row),"\n## /dbf_row\n";
+                               # apply mapping from config file
+                               # all unspecified records will get _ in
+                               # front of them - _DELETE will be __DELETE
+                               my $rec;
+                               map { 
+                                       my $new_fld = $dbf2iso{$_} || '_'.$_;
+                                       my $data = $dbf_row->{$_};
+                                       push @{ $rec->{$new_fld} }, $data if ($data && $data !~ /^(?:\s+|\$a\.|)$/);
+                               } keys %{$dbf_row};
+#print STDERR "## rec: ",Dumper($rec),"\n## /rec\n";
+                               my $row = to_hash($row_id+1, $rec);
+
+                               $row->{mfn} = $row_id+1;
+                               $row->{record} = $rec;
+
+#print STDERR "## row: ",Dumper($row),"\n## /row\n";
+                               progress($row->{mfn}, $max_rowid);
+
+                               my $swishpath = $path."#".int($row->{mfn});
+
+                               if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) {
+                                       $xml = $cp2utf->convert($xml);
+                                       use bytes;      # as opposed to chars
+                                       print "Path-Name: $swishpath\n";
+                                       print "Content-Length: ".(length($xml)+1)."\n";
+                                       print "Document-Type: XML\n\n$xml\n";
+                               }
+                       }
+               }
+               print STDERR "\n";
        }
 }
 
diff --git a/hash_sf.pm b/hash_sf.pm
new file mode 100644 (file)
index 0000000..8af05d3
--- /dev/null
@@ -0,0 +1,28 @@
+#
+# hash_sf($row_data,'field'[,'subfield'])
+#
+# e.g. hash_sf($row,'700','a')
+#
+sub hash_sf {
+       my $row = shift @_;
+       my $field = shift @_;
+       my $subfield = shift @_;
+
+       my $i = shift @_ || 0;
+
+       my $out;
+
+       if ($row->{$field}->[$i]) {
+               if (! $subfield) {
+                       # subfield list undef, empty or no defined subfields for this record
+                       my $all_sf = $row->{record}->{$field}->[$i] || confess "can't find field $field:$i",Dumper($row);
+                       $all_sf =~ s/[\^\$]./ /g;   # nuke definitions
+                       return $all_sf; 
+               }
+               my $sf = $row->{$field}->[$i]->{$subfield};
+               return $sf if ($sf);
+       }
+}
+
+1;
+
diff --git a/isis_sf.pm b/isis_sf.pm
deleted file mode 100644 (file)
index f118318..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#
-# isis_sf($isis_row,'isis_field'[,'subfield'])
-#
-# e.g. isis_sf($row,'700','a')
-#
-sub isis_sf {
-       my $row = shift @_;
-       my $isis_id = shift @_;
-       my $subfield = shift @_;
-
-       my $i = shift @_ || 0;
-
-       my $out;
-
-       if ($row->{$isis_id}->[$i]) {
-               if (! $subfield) {
-                       # subfield list undef, empty or no defined subfields for this record
-                       my $all_sf = $row->{record}->{$isis_id}->[$i];
-                       $all_sf =~ s/\^./ /g;   # nuke definitions
-                       return $all_sf; 
-               }
-               my $sf = $row->{$isis_id}->[$i]->{$subfield};
-               return $sf if ($sf);
-       }
-}
-
-1;
-
index e348dde..12f7d96 100644 (file)
@@ -10,13 +10,17 @@ sub parse_format {
        my $i = shift || 0;     # isis repeatable number
        my $codepage = shift || die "parse_format must be called with codepage!";
        if ($type eq "isis") {
-               return parse_iso_format($format,$row,$i,$codepage,'isis_sf');
+               return parse_iso_format($format,$row,$i,$codepage,'hash_sf');
        } elsif ($type eq "excel") {
                return parse_excel_format($format,$row,$i,$codepage);
        } elsif ($type eq "marc") {
                return parse_iso_format($format,$row,$i,$codepage,'marc_sf');
        } elsif ($type eq "feed") {
                return parse_feed_format($format,$row,$i,$codepage);
+       } elsif ($type eq "dbf") {
+               return parse_iso_format($format,$row,$i,$codepage,'hash_sf');
+       } else {
+               confess "FATAL: unknown type '$type'";
        }
 }
 
diff --git a/to_hash.pm b/to_hash.pm
new file mode 100644 (file)
index 0000000..a2813b1
--- /dev/null
@@ -0,0 +1,39 @@
+# This is slight modification of to_hash as included in Biblio::Isis
+#
+# This is incredibly bad (duplicate code), but it would have to wait for
+# WebPac v2 to be fixed by extracting all file imported in sane OO way.
+
+
+sub to_hash {
+       my $mfn = shift || confess "need mfn!";
+       my $row = shift || confess "need data";
+
+       # init record to include MFN as field 000
+       my $rec = { '000' => [ $mfn ] };
+
+       foreach my $k (keys %{$row}) {
+               foreach my $l (@{$row->{$k}}) {
+
+                       my $val;
+
+                       # has identifiers?
+                       #($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/);
+
+                       # has subfields?
+                       if ($l =~ m/[\^\$]/) {
+                               foreach my $t (split(/[\^\$]/,$l)) {
+                                       next if (! $t);
+                                       $val->{substr($t,0,1)} = substr($t,1);
+                               }
+                       } else {
+                               $val = $l;
+                       }
+
+                       push @{$rec->{$k}}, $val;
+               }
+       }
+
+       return $rec;
+}
+
+1;