added Microsoft Excel file import
authorDobrica Pavlinusic <dpavlin@rot13.org>
Mon, 23 Jun 2003 20:20:32 +0000 (20:20 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Mon, 23 Jun 2003 20:20:32 +0000 (20:20 +0000)
git-svn-id: file:///home/dpavlin/private/svn/webpac/trunk@54 13eb9ef6-21d5-0310-b721-a9d68796d827

all2xml.conf
all2xml.pl
import_xml/excel.xml [new file with mode: 0644]
parse_format.pm
swish_isis.conf

index 3499d3d..fe82c8b 100644 (file)
@@ -1,37 +1,36 @@
-# global configuration file
-
-# global options
-[global]
-       # define database to use for indexes
-       dbi_dbd=Pg
-       dbi_dsn=dbname=webpac
-       dbi_user=dpavlin
-       dbi_passwd=xx
-
 # describe your databases here
-[ps-libri]
-       # directory for isis databases
-       isis_db=/data/isis_data/ps/LIBRI/LIBRI
-       # this will use xml file in import_xml/ dir
-       type=isis
-       # this is arbirary xml added to each record (e.g. Library identifier
-       # or something like that
-       xml=<type>Knjiga</type>
 
-#[ge-libri]
-#      isis_db=/data/isis_data/ge/LIBRI/LIBRI
+### ISIS ###
+#
+#[ps-libri]
+#      # directory for isis databases
+#      isis_db=/data/isis_data/ps/LIBRI/LIBRI
+#      # this will use xml file in import_xml/ dir
 #      type=isis
+#      # this is arbirary xml added to each record (e.g. Library identifier
+#      # or something like that
 #      xml=<type>Knjiga</type>
+#
+##[ge-libri]
+##     isis_db=/data/isis_data/ge/LIBRI/LIBRI
+##     type=isis
+##     xml=<type>Knjiga</type>
+#
+#
+#[ps-peri]
+#      isis_db=/data/isis_data/ps/PERI/PERI
+#      type=isis
+#      xml=<type>Èasopis</type>
+#
+#[kk-arti]
+#      isis_db=/data/isis_data/kk/ARTI/ARTI
+#      type=isis
+#      xml=<type>Èlanak</type>
+#
+### M$ Excel ###
 
-
-[ps-peri]
-       isis_db=/data/isis_data/ps/PERI/PERI
-       type=isis
+[ffzg-journals]
+       type=excel
+       excel_file=/data/isis_data/excel/tekuci_FF.xls
        xml=<type>Èasopis</type>
 
-[kk-arti]
-       isis_db=/data/isis_data/kk/ARTI/ARTI
-       type=isis
-       xml=<type>Èlanak</type>
-
-
index 8fc8177..32381b3 100755 (executable)
@@ -45,16 +45,24 @@ sub x {
        return $utf2cp->convert($_[0]);
 }
 
-# decode isis import codepage
-my $isis2cp;
+# decode isis/excel or other import codepage
+my $import2cp;
 
 # outgoing xml must be in UTF-8
 my $cp2utf = Text::Iconv->new($codepage,'UTF-8');
 
-sub isis2xml {
+# mapping between data type and tag which specify
+# format in XML file
+my %type2tag = (
+       'isis' => 'isis',
+       'excel' => 'column'
+);
+
+sub data2xml {
 
        use xmlify;
 
+       my $type = shift @_;
        my $row = shift @_;
        my $add_xml = shift @_;
 
@@ -87,12 +95,13 @@ sub isis2xml {
 
                my ($swish,$display);
 
-               foreach my $x (@{$config->{indexer}->{$field}->{isis}}) {
+               my $tag = $type2tag{$type} || die "can't find which tag to use for type $type";
+               foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) {
 
                        my $format = x($x->{content});
                        my $delimiter = x($x->{delimiter}) || ' ';
 
-                       my $isis_i = 0;         # isis repeatable offset
+                       my $repeat_off = 0;             # repeatable offset
 
                        my ($s,$d,$i) = (1,1,0);        # swish, display default
                        $s = 0 if (lc($x->{type}) eq "display");
@@ -109,9 +118,8 @@ sub isis2xml {
                        # init vars so that we go into while...
                        ($swish,$display) = (1,1);
 
-                       while ($swish || $display) {
-                               ($swish,$display) = parse_format($format,$row,$isis_i++,$isis2cp);
-
+                       if ($swish || $display) {
+                               ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp);
                                # filter="name" ; filter this field through
                                # filter/[name].pm
                                my $filter = $x->{filter};
@@ -211,60 +219,123 @@ sub isis2xml {
 
 ##########################################################################
 
+# read configuration for this script
 my $cfg = new Config::IniFiles( -file => $config_file );
 
+# read global.conf configuration
+my $cfg_global = new Config::IniFiles( -file => 'global.conf' );
+
 # open index
 $index = new index_DBI(
-               $cfg->val('global', 'dbi_dbd'),
-               $cfg->val('global', 'dbi_dsn'),
-               $cfg->val('global', 'dbi_user'),
-               $cfg->val('global', 'dbi_passwd') || '',
+               $cfg_global->val('global', 'dbi_dbd'),
+               $cfg_global->val('global', 'dbi_dsn'),
+               $cfg_global->val('global', 'dbi_user'),
+               $cfg_global->val('global', 'dbi_passwd') || '',
        );
 
-# delete [global] section to leave just databases sections
-$cfg->DeleteSection('global');
-
 foreach my $database ($cfg->Sections) {
 
-       my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";
-       my $type = $cfg -> val($database, 'type') || die "$database doesn't have 'type' defined";
+       my $type = lc($cfg -> val($database, 'type')) || die "$database doesn't have 'type' defined";
        my $add_xml = $cfg -> val($database, 'xml');    # optional
 
-       $config=XMLin("./import_xml/$type.xml", forcearray => [ 'isis' ], forcecontent => 1);
+print STDERR "reading ./import_xml/$type.xml\n";
 
-       $isis2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
+       $config=XMLin("./import_xml/$type.xml", forcearray => [ $type2tag{$type} ], forcecontent => 1);
 
-       my $db = OpenIsis::open( $isis_db );
-       if (0) {
-#      # FIX
-#      if (! $db ) {
-               print STDERR "WARNING: can't open '$isis_db'\n";
-               next ;
+       # output current progress indicator
+       my $last_p = 0;
+       sub progress {
+               #return if (! $opts{q});        # FIXME
+               my $current = shift;
+               my $total = shift;
+               my $p = int($current * 100 / $total);
+               if ($p != $last_p) {
+                       printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$current,$total,"=" x ($p/2).">", $p );
+                       $last_p = $p;
+               }
        }
 
-       my $max_rowid = OpenIsis::maxRowid( $db );
+       # now read database
+print STDERR "using: $type...\n";
 
-       print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
+       if ($type eq "isis") {
+               my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";
 
-       my $path = $database;                   # was $isis_db
+               $import2cp = Text::Iconv->new($config->{isis_codepage},$codepage);
+               my $db = OpenIsis::open( $isis_db );
 
-       my $last_p = 0;
+               my $max_rowid = OpenIsis::maxRowid( $db );
+
+               print STDERR "Reading database: $isis_db [$max_rowid rows]\n";
+
+               my $path = $database;
+
+               for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
+                       my $row = OpenIsis::read( $db, $row_id );
+                       if ($row && $row->{mfn}) {
+       
+                               progress($row->{mfn}, $max_rowid);
+
+                               my $swishpath = $path."#".int($row->{mfn});
+
+                               if (my $xml = data2xml($type,$row,$add_xml)) {
+                                       $xml = $cp2utf->convert($xml);
+                                       use bytes;      # as opposed to chars
+                                       print "Path-Name: $swishpath\n";
+                                       print "Content-Length: ".(length($xml)+1)."\n";
+                                       print "Document-Type: XML\n\n$xml\n";
+                               }
+                       }
+               }
+               print STDERR "\n";
+
+       } elsif ($type eq "excel") {
+               use Spreadsheet::ParseExcel;
+               use Spreadsheet::ParseExcel::Utility qw(int2col);
+               
+               $import2cp = Text::Iconv->new($config->{excel_codepage},$codepage);
+               my $excel_file = $cfg -> val($database, 'excel_file') || die "$database doesn't have 'excel_file' defined!";
 
-#      { my $row_id = 4514;
-# FIX
-       for (my $row_id = 1; $row_id <= $max_rowid; $row_id++ ) {
-               my $row = OpenIsis::read( $db, $row_id );
-               if ($row && $row->{mfn}) {
-                       # output current process indicator
-                       my $p = int($row->{mfn} * 100 / $max_rowid);
-                       if ($p != $last_p) {
-                               printf STDERR ("%5d / %5d [%-51s] %-2d %% \r",$row->{mfn},$max_rowid,"=" x ($p/2).">", $p ) if (! $opts{q});
-                               $last_p = $p;
+               my $sheet = x($config->{sheet}) || die "no sheet in $type.xml";
+               my $start_row = x($config->{start_row}) || die "no start_row in $type.xml";
+
+               my $oBook = Spreadsheet::ParseExcel::Workbook->Parse($excel_file) || die "can't open Excel file '$excel_file'";
+
+               my $sheet_nr = 0;
+               foreach my $oWks (@{$oBook->{Worksheet}}) {
+                       #print STDERR "-- SHEET $sheet_nr:", $oWks->{Name}, "\n";
+                       last if ($oWks->{Name} eq $sheet);
+                       $sheet_nr++;
+               }
+
+               my $oWorksheet = $oBook->{Worksheet}[$sheet_nr];
+       
+               print STDERR "using sheet: ",$oWorksheet->{Name},"\n";
+               defined ($oWorksheet) || die "can't find sheet '$sheet' in $excel_file";
+               my $end_row = x($config->{end_row}) || $oWorksheet->{MaxRow};
+
+               for(my $iR = $oWorksheet->{MinRow} ; defined $end_row && $iR <= $end_row ; $iR++) {
+                       my $row;
+                       for(my $iC = $oWorksheet->{MinCol} ; defined $oWorksheet->{MaxCol} && $iC <= $oWorksheet->{MaxCol} ; $iC++) {
+                               my $cell = $oWorksheet->{Cells}[$iR][$iC];
+                               if ($cell) {
+                                       $row->{int2col($iC)} = $cell->Value;
+                               }
                        }
 
-                       my $swishpath = $path."#".int($row->{mfn});
+                       progress($iR, $end_row);
+
+#                      print "row[$iR/$end_row] ";
+#                      foreach (keys %{$row}) {
+#                              print "$_: ",$row->{$_},"\t";
+#                      }
+#                      print "\n";
+
+                       my $swishpath = $database."#".$iR;
+
+                       next if (! $row);
 
-                       if (my $xml = isis2xml($row,$add_xml)) {
+                       if (my $xml = data2xml($type,$row,$add_xml)) {
                                $xml = $cp2utf->convert($xml);
                                use bytes;      # as opposed to chars
                                print "Path-Name: $swishpath\n";
@@ -273,7 +344,6 @@ foreach my $database ($cfg->Sections) {
                        }
                }
        }
-       print STDERR "\n";
 }
 
 # call this to commit index
diff --git a/import_xml/excel.xml b/import_xml/excel.xml
new file mode 100644 (file)
index 0000000..08617f9
--- /dev/null
@@ -0,0 +1,159 @@
+<?xml version="1.0" encoding="ISO-8859-2"?>
+<!--
+       parametars for M$ Excel import from spreadsheet
+
+       column letters must be in UPPER CASE
+       end_row configuration option is optional, but you should use it,
+               because it will use Excel MaxRow property which is
+               often set at 31999
+-->
+<config
+       excel_codepage="windows-1250"
+       sheet="Sheet1"
+       start_row="2"
+       end_row="500"
+>
+
+<!--
+       <isis type="swish|display|index">_pre_000x_sep_000x_sep_000x_post_</isis>
+-->
+    <indexer>
+       <headline>
+               <column type="display">|A|</column>
+       </headline>
+
+       <IdentificationNumbers name="ISN">
+               <column type="swish" filter="isn_swish">|B|</column>
+               <column type="index">|B|</column>
+       </IdentificationNumbers>
+
+<!--
+       <ISBN name="ISBN" order="10">
+               <column type="display">10</column>
+       </ISBN>
+
+-->
+       <ISSN name="ISSN" order="11">
+               <column type="display">|B|</column>
+       </ISSN>
+
+<!--
+       <MFN name="MFN">
+               <column>mfn</column>
+               <column type="index">mfn</column>
+       </MFN>
+               
+       <PersonalName name="Autor" order="15">
+               <column type="swish">700 701 702 710 711 503 200f 200g</column>
+               <column type="display">700a, 700b</column>
+               <column type="display">710a. 710b (710d ; 710f ; 710e)</column>
+               <column type="display">503a, 503b</column>
+               <column type="index">700a, 700b</column>
+               <column type="index">701a, 701b</column>
+               <column type="index">702a, 702b</column>        
+               <column type="index">710a</column>
+               <column type="index">503a, 503b</column>
+       </PersonalName>
+       
+-->
+       <TitleAndResponsibility name="Naslov" order="20">
+               <column>|A|</column>
+               <column type="index">|A|</column>
+       </TitleAndResponsibility>
+
+<!--
+       <EditionStatement name="Izdanje" order="25">
+               <column type="display">205a</column>
+       </EditionStatement>
+       
+       <Date name="Godina izdavanja" order="30">
+               <column type="swish">210d</column>
+       </Date>
+
+       <Publisher name="Izdavaè" order="35">
+               <column type="swish">210a 210c</column>
+               <column type="index">210c, 210a</column>
+       </Publisher>
+
+       <Publication name="Impresum" order="40">
+               <column type="display">210a : 210c, 210d</column>
+       </Publication>
+       
+       <PhisicalDescription name="Materijalni opis" order="45">
+               <column type="swish">215a : 215c ; 215d</column>
+               <column type="display">215a : 215c ; 215d</column>
+       </PhisicalDescription>
+       
+       <Language name="Jezik" order="50">
+               <column type="swish" filter="lang_swish">101</column>
+               <column type="display" filter="lang_display">101</column>
+               <column type="index" filter="lang_display">101</column>
+       </Language>
+       
+       <Series name="Nakladnièka cjelina" order="55">
+               <column>225a : 225e ; 225v. 225h, 225i ; 225w</column>
+               <column type="index">225a</column>
+       </Series>
+       
+       <Note name="Napomena" order="60">
+               <column delimiter=". -">300</column>
+               <column append="1" delimiter=". - ">320</column>
+               <column append="1" delimiter=". - ">327</column>
+               <column append="1" delimiter=". - ">329</column>
+               <column type="index">300</column>
+               <column type="index">320</column>
+               <column type="index">327</column>
+               <column type="index">329</column>
+       </Note>
+               
+       <UncontrolledTerms name="Tema" order="65">
+               <column delimiter=" * " format="upper">610 </column>    
+               <column type="index">610</column>
+       </UncontrolledTerms>
+       
+       <contents name="Sadr¾aj ¹ðè澩ÐÈÆ®" order="70">
+               <column>330 ¹ðè澩ÐÈÆ®</column>
+               <column type="index">330</column>
+       </contents>
+
+       <title_vol name="Podaci o svescima" order="75">
+               <column type="display">230v : 230a. - 250. - 260. ISBN 290</column>
+               <column type="display">231v : 231a. - 251. - 261. ISBN 291</column>
+               <column type="display">232v : 232a. - 252. - 262. ISBN 292</column>
+               <column type="display">233v : 233a. - 253. - 263. ISBN 293</column>
+       </title_vol>
+
+       <UDC name="UDK" order="80">
+               <column>675a</column>
+               <column type="index">675a</column>
+       </UDC>
+
+       <OtherClass name="APA CC" order="81">
+               <column>686</column>
+               <column type="index">686</column>
+       </OtherClass>
+-->
+
+       <library name="Dostupno u" order="85">
+               <column>|C|</column>
+       </library>
+
+<!--
+       <shelfNumber name="Signatura" order="90">
+               <column>990</column>
+               <column type="index">990</column>
+       </shelfNumber>
+
+       <invNo name="Inventarni broj" order="95">
+               <column delimiter=", ">991</column>
+               <column type="index">991</column>
+       </invNo>
+-->
+
+       <type name="Tip graðe" type="display" order="100">
+               <!-- added via xml tag in .conf -->
+       </type>
+
+
+    </indexer>
+</config>
index e92958a..5f5705a 100644 (file)
@@ -1,15 +1,31 @@
 #-------------------------------------------------------------
 #
-# parse_format('format',$isis_row);
+# parse_format(...)
 #
 
-use isis_sf;
 
 sub parse_format {
+       my $type = shift || die "parset_format must be called with type!";
        my $format = shift || die "parse_format must be called with format!";
        my $row = shift || die "parse_format must be called with row!";
        my $i = shift || 0;     # isis repeatable number
        my $codepage = shift || die "parse_format must be called with codepage!";
+       if ($type eq "isis") {
+               return parse_isis_format($format,$row,$i,$codepage);
+       } elsif ($type eq "excel") {
+               return parse_excel_format($format,$row,$i,$codepage);
+       }
+}
+
+#-------------------------------------------------------------
+
+sub parse_isis_format {
+       use isis_sf;
+
+       my $format = shift;
+       my $row = shift;
+       my $i = shift;
+       my $codepage = shift;
 
        my $out;
        my $out_swish;
@@ -66,4 +82,49 @@ sub parse_format {
 }
 
 #-------------------------------------------------------------
+
+sub parse_excel_format {
+       my $format = shift;
+       my $row = shift;
+       my $i = shift;
+       my $codepage = shift;
+
+       my $out;
+       my $out_swish;
+
+       my $prefix = "";
+       if ($format =~ s/^([^A-Z\|]{1,3})//) {
+               $prefix = $1;
+       }
+
+       my $display;
+       my $swish;
+
+       while ($format && length($format) > 0) {
+#print STDERR "\n#### $format #";
+               if ($format =~ s/^\|([A-Z]{1,2})\|//) {
+#print STDERR "--$1-> $format -[",length($format),"] ";
+                       if ($row->{$1}) {
+                               my $tmp = $row->{$1};
+                               $tmp = $codepage->convert($tmp) if ($codepage);
+                               $display .= $prefix . $tmp;
+                               $swish .= $tmp." ";
+#print STDERR " == $tmp";
+                       }
+                       $prefix = "";
+               } elsif ($format =~ s/^([^A-Z\|]+)(\|[A-Z]{1,2}\|)/$2/) {
+                       $prefix .= $1 if ($display);
+               } else {
+                       print STDERR "unparsed format: $format\n";
+                       $prefix .= $format;
+                       $format = "";
+               }
+#print STDERR " display: $display swish: $swish [format: $format]";
+       }
+       # add suffix
+       $display .= $prefix if ($display);
+
+       return ($swish,$display);
+}
+
 1;
index 544261a..7f282d2 100644 (file)
@@ -56,7 +56,8 @@ IndexAdmin "dpavlin@rot13.org"
 # Enable alternate search syntax.
 # This means a search query can contain ``+'' and ``-'' as syntax parameter.
 
-EnableAltSearchSyntax yes
+# this directive is removed in swish 2.4
+#EnableAltSearchSyntax yes
 
 
 #