From 6a631ecdcd789163702283df7d9a58e6fd0a8869 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sun, 27 Feb 2005 23:07:35 +0000 Subject: [PATCH] Experimental support for dBase .dbf files. Usage like this in all2xml.conf: [hda] dbf_file=/data/drustvene/hda/ISO.DBF type=dbf dbf_codepage=cp852 dbf_mapping=<<_END_OF_MAP_ ID_BROJ 001 ISBN_BROJ 010 SKUPINA1 200 SKUPINA2 205 SKUPINA4 210 SKUPINA5 215 SKUPINA6 225 SKUPINA7 300 ANOTACIJA 330 PREDMET1 610 PREDMET2 610 PREDMET3 510 UDK 675 REDALICA 700 SIGNATURA 990 _END_OF_MAP_ dbf type will use tag in import_xml and dbf_codepage will override codepage specified in import_xml file. Small code refactoring. git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@678 13eb9ef6-21d5-0310-b721-a9d68796d827 --- all2xml.pl | 85 +++++++++++++++++++++++++++++++++++++++++++++++-- hash_sf.pm | 28 ++++++++++++++++ isis_sf.pm | 28 ---------------- parse_format.pm | 6 +++- to_hash.pm | 39 +++++++++++++++++++++++ 5 files changed, 154 insertions(+), 32 deletions(-) create mode 100644 hash_sf.pm delete mode 100644 isis_sf.pm create mode 100644 to_hash.pm diff --git a/all2xml.pl b/all2xml.pl index 1c5123b..8f6438f 100755 --- a/all2xml.pl +++ b/all2xml.pl @@ -11,6 +11,7 @@ use Encode; #use GDBM_File; use Fcntl; # for O_RDWR use TDB_File; +use Carp; $|=1; @@ -62,7 +63,8 @@ my %type2tag = ( 'isis' => 'isis', 'excel' => 'column', 'marc' => 'marc', - 'feed' => 'feed' + 'feed' => 'feed', + 'dbf' => 'isis', # special case, re-use isis import_xml ); my $cache; # for cacheing @@ -112,6 +114,10 @@ sub data2xml { $cache->{tags_by_order} = \@sorted_tags; } + if (! @sorted_tags) { + print STDERR "WARNING: no tags for this type found in import_xml file!\n"; + } + # lookup key my $lookup_key; @@ -662,13 +668,20 @@ foreach my $database ($cfg->Sections) { print STDERR "opening lookup file '$lookup_file'\n"; } -print STDERR "reading ./import_xml/$type.xml\n"; + my $import_xml_file = "./import_xml/$type.xml"; + + if (! 
-r $import_xml_file) { + print STDERR "ERROR: file $import_xml_file not readable skipping!\n"; + next; + } + + print STDERR "reading $import_xml_file\n"; # extract just type basic my $type_base = $type; $type_base =~ s/_.+$//g; - $config=XMLin("./import_xml/$type.xml", ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 ); + $config=XMLin($import_xml_file, ForceArray => [ $type2tag{$type_base}, 'config', 'format' ], ForceContent => 1 ); # helper for progress bar sub fmt_time { @@ -927,6 +940,72 @@ print STDERR "using: $type...\n"; } # close lookup untie %lhash if (%lhash); + + } elsif ($type_base eq "dbf") { + + my $dbf_file = $cfg -> val($database, 'dbf_file') || die "$database doesn't have 'dbf_file' defined!"; + my $dbf_codepage = $cfg -> val($database, 'dbf_codepage') || die "$database doesn't have 'dbf_codepage' defined!"; + my $dbf_mapping = $cfg -> val($database, 'dbf_mapping') || die "$database doesn't have 'dbf_mapping' defined!"; + + $import2cp = Text::Iconv->new($dbf_codepage,$codepage); + require XBase; + my $db = new XBase $dbf_file; + + if (! $db) { + print STDERR "ERROR: can't read DBF database: $dbf_file, skipping...\n"; + next; + } + + my $max_rowid = $db->last_record; + + print STDERR "Reading database: $dbf_file [$max_rowid rows]\n"; + + my %dbf2iso; + foreach my $m (split(/[\n\r]+/,$dbf_mapping)) { + my ($col,$fld) = split(/\s+/,$m,2); + $dbf2iso{$col} = $fld; + } + +#print STDERR "## dbf2iso: ",Dumper(\%dbf2iso),"\n## /dbf2iso\n"; + + # bad, bad... + require "to_hash.pm"; + + foreach my $row_id (0 .. 
$max_rowid) { + my $dbf_row = $db->get_record_as_hash($row_id); + if ($dbf_row) { + +#print STDERR "## dbf_row: ",Dumper($dbf_row),"\n## /dbf_row\n"; + # apply mapping from config file + # all unspecified records will get _ in + # front of them - _DELETE will be __DELETE + my $rec; + map { + my $new_fld = $dbf2iso{$_} || '_'.$_; + my $data = $dbf_row->{$_}; + push @{ $rec->{$new_fld} }, $data if ($data && $data !~ /^(?:\s+|\$a\.|)$/); + } keys %{$dbf_row}; +#print STDERR "## rec: ",Dumper($rec),"\n## /rec\n"; + my $row = to_hash($row_id+1, $rec); + + $row->{mfn} = $row_id+1; + $row->{record} = $rec; + +#print STDERR "## row: ",Dumper($row),"\n## /row\n"; + progress($row->{mfn}, $max_rowid); + + my $swishpath = $path."#".int($row->{mfn}); + + if (my $xml = data2xml($type_base,$row,$add_xml,$cfg,$database)) { + $xml = $cp2utf->convert($xml); + use bytes; # as opposed to chars + print "Path-Name: $swishpath\n"; + print "Content-Length: ".(length($xml)+1)."\n"; + print "Document-Type: XML\n\n$xml\n"; + } + } + } + print STDERR "\n"; } } diff --git a/hash_sf.pm b/hash_sf.pm new file mode 100644 index 0000000..8af05d3 --- /dev/null +++ b/hash_sf.pm @@ -0,0 +1,28 @@ +# +# hash_sf($row_data,'field'[,'subfield']) +# +# e.g. hash_sf($row,'700','a') +# +sub hash_sf { + my $row = shift @_; + my $field = shift @_; + my $subfield = shift @_; + + my $i = shift @_ || 0; + + my $out; + + if ($row->{$field}->[$i]) { + if (! $subfield) { + # subfield list undef, empty or no defined subfields for this record + my $all_sf = $row->{record}->{$field}->[$i] || confess "can't find field $field:$i",Dumper($row); + $all_sf =~ s/[\^\$]./ /g; # nuke definitions + return $all_sf; + } + my $sf = $row->{$field}->[$i]->{$subfield}; + return $sf if ($sf); + } +} + +1; + diff --git a/isis_sf.pm b/isis_sf.pm deleted file mode 100644 index f118318..0000000 --- a/isis_sf.pm +++ /dev/null @@ -1,28 +0,0 @@ -# -# isis_sf($isis_row,'isis_field'[,'subfield']) -# -# e.g. 
isis_sf($row,'700','a') -# -sub isis_sf { - my $row = shift @_; - my $isis_id = shift @_; - my $subfield = shift @_; - - my $i = shift @_ || 0; - - my $out; - - if ($row->{$isis_id}->[$i]) { - if (! $subfield) { - # subfield list undef, empty or no defined subfields for this record - my $all_sf = $row->{record}->{$isis_id}->[$i]; - $all_sf =~ s/\^./ /g; # nuke definitions - return $all_sf; - } - my $sf = $row->{$isis_id}->[$i]->{$subfield}; - return $sf if ($sf); - } -} - -1; - diff --git a/parse_format.pm b/parse_format.pm index e348dde..12f7d96 100644 --- a/parse_format.pm +++ b/parse_format.pm @@ -10,13 +10,17 @@ sub parse_format { my $i = shift || 0; # isis repeatable number my $codepage = shift || die "parse_format must be called with codepage!"; if ($type eq "isis") { - return parse_iso_format($format,$row,$i,$codepage,'isis_sf'); + return parse_iso_format($format,$row,$i,$codepage,'hash_sf'); } elsif ($type eq "excel") { return parse_excel_format($format,$row,$i,$codepage); } elsif ($type eq "marc") { return parse_iso_format($format,$row,$i,$codepage,'marc_sf'); } elsif ($type eq "feed") { return parse_feed_format($format,$row,$i,$codepage); + } elsif ($type eq "dbf") { + return parse_iso_format($format,$row,$i,$codepage,'hash_sf'); + } else { + confess "FATAL: unknown type '$type'"; } } diff --git a/to_hash.pm b/to_hash.pm new file mode 100644 index 0000000..a2813b1 --- /dev/null +++ b/to_hash.pm @@ -0,0 +1,39 @@ +# This is slight modification of to_hash as included in Biblio::Isis +# +# This is incredibly bad (duplicate code), but it would have to wait for +# WebPac v2 to be fixed by extracting all file imported in sane OO way. + + +sub to_hash { + my $mfn = shift || confess "need mfn!"; + my $row = shift || confess "need data"; + + # init record to include MFN as field 000 + my $rec = { '000' => [ $mfn ] }; + + foreach my $k (keys %{$row}) { + foreach my $l (@{$row->{$k}}) { + + my $val; + + # has identifiers? 
+ #($val->{'i1'},$val->{'i2'}) = ($1,$2) if ($l =~ s/^([01 #])([01 #])\^/\^/); + + # has subfields? + if ($l =~ m/[\^\$]/) { + foreach my $t (split(/[\^\$]/,$l)) { + next if (! $t); + $val->{substr($t,0,1)} = substr($t,1); + } + } else { + $val = $l; + } + + push @{$rec->{$k}}, $val; + } + } + + return $rec; +} + +1; -- 2.20.1