clean up code a bit
[koha-eprints] / tsv2eprints.pl
index 7535069..7c9ea89 100755 (executable)
@@ -18,24 +18,39 @@ use utf8;
 use Encode;
 use Data::Dump qw(dump);
 use Storable;
-
-=for eprints-api
+use LWP::Simple;
 
 use EPrints;
-my $institution = 'Grafički fakultet';
 
 my $ep = EPrints->new();
-my $repo = $ep->repository( 'grf' );
-$repo->{config}->{enable_file_imports} = 1;
-$repo->{config}->{enable_web_imports} = 1;
-
+my $repo = $ep->repository( 'snz' );
+#$repo->{config}->{enable_file_imports} = 1;
+#$repo->{config}->{enable_web_imports} = 1;
 my $dataset = $repo->dataset( 'eprint' );
 my $list = $dataset->search;
 my $count = $list->count;
 warn "# found [$count] eprints\n";
 
-my $eprint = $dataset->dataobj( 21 );
-warn dump( $eprint->get_value('institution'), $institution );
+warn ref( $list );
+
+#warn "# ids = ",dump( $list->ids );
+
+my $info = { count => 0 };
+$list->map( sub {
+       my( $session, $dataset, $eprint, $info ) = @_;
+
+       my $biblionumber = $eprint->get_value('biblionumber');
+
+       $info->{biblionumber}->{$biblionumber}++;
+       $info->{count}++;
+
+}, $info );
+warn dump( $info );
+
+=for update
+
+#my $eprint = $dataset->dataobj( 21 );
+#warn dump( $eprint->get_value('institution'), $institution );
 
 warn dump( $eprint );
 
@@ -49,8 +64,6 @@ $repo->terminate();
 
 =cut
 
-our $eprintid = 1;
-
 my $files;
 
 my $mkp_path = "/mnt/share/MKP/ELEKTRONIČKI DOKUMENTI/EL.DOKUMENTI PO BIBLIOBROJU/";
@@ -79,201 +92,51 @@ warn "# got ", scalar keys %$files, " files\n";
 store $files, "$koha_path.biblionumber.file";
 
 my $stat;
-my $this_id = '';
-my $item;
-
-binmode STDOUT, ":utf8";
-
-sub dump_item {
-       my $item = shift || return;
-
-       my $f200 = $item->{200}->[0] || die "no 200 in ",dump($item);
-
-       if ( $f200 =~ s/\s*;\s*([^;]+?)$//i ) {
-               $item->{mentor} = $1;
-               $item->{mentor} =~ s/^\s*voditelji?\s*(?:rada)\s*//i;
-       } else {
-               warn "MISSING ; voditelj [$f200]\n";
-       }
-
-       if ( $f200 =~ s{\s*/\s*([^/]+?)$}{} ) {
-               $item->{autor} = $1;
-       } else {
-               warn "MISSING / autor [$f200]\n";
-       }
-
-       if ( $f200 =~ s{\s*:\s*([^:]+?)$}{} ) {
-               $item->{tip} = lc($1);
-       } else {
-               warn "MISSING : tip [$f200]\n";
-       }
-
-       $item->{title} = $f200;
-
-       if ( exists $item->{991} ) {
-               my $file_id = $item->{991}->[0];
-               if ( exists $files->{ $file_id } ) {
-                       $item->{full_path} = delete $files->{ $file_id };
-               } elsif ( $file_id =~ s/(\w)0*(\d)/$1$2/ ) {
-                       if ( exists $files->{ $file_id } ) {
-                               $item->{full_path} = delete $files->{ $file_id };
-                       }
-               }
-       }
-
-       if ( ! exists $item->{full_path} ) {
-               my $file_id = ucfirst( $item->{300}->[0] . ' ' . $item->{700}->[0] );
-               $file_id =~ s/[\.\,]//g;
-               if ( exists $files->{ $file_id } ) {
-                       $item->{full_path} = delete $files->{ $file_id };
-               }
-       }
-
-       warn "MISSING file for $eprintid\n" unless exists $item->{full_path};
-
-       warn "# item ",dump($item);
-
-       my $eprint = {
-               eprintid => $eprintid++
-       };
-
-       $eprint->{filename} = $1 if $item->{full_path} =~ m{/([^/]+)$};
-       $eprint->{full_path} = $item->{full_path};
-
-       $eprint->{date} = $1 if $item->{210}->[0] =~ m/\$d(\d+)/;
-       $eprint->{pages} = $1 if $item->{215}->[0] =~ m/^(\d+)/;
-
-       ( $eprint->{creators_family}, $eprint->{creators_given} ) = split(/,\s*/, $item->{700}->[0] );
-
-       $eprint->{title} = $item->{title};
-
-       $eprint->{keywords} = join(", ", @{ $item->{610} }) if exists $item->{610};
-
-       if ( exists $item->{700}->[1] ) {
-               ( $eprint->{thesis_mentor_family}, $eprint->{thesis_mentor_given} ) = split(/,\s*/, $item->{700}->[1] );
-       } elsif ( $item->{mentor} ) {
-               ( $eprint->{thesis_mentor_given}, $eprint->{thesis_mentor_family} ) = split(/\s+/, $item->{mentor} );
-       }
-
-       $eprint->{thesis_mentor_family} =~ s/(\S+)\s*-\s*(\S+)/$1-$2/; # fix spaces between dash in double surname
-
-       $eprint->{thesis_callnumber} = $item->{990}->[0];
-       $eprint->{thesis_invnumber} =  $item->{991}->[0]; # FIXME?
-
-       # fallback za radove bez datuma na godinu
-       if ( ! $eprint->{thesis_date} && $item->{990}->[0] =~ m{/(\d\d\d\d)/} ) {
-               $eprint->{thesis_date} = $1;
-       }
-
-       if ( ! $eprint->{date} ) {
-               $eprint->{date} = $eprint->{thesis_date};
-       }
-
-       warn "# eprint ",dump($eprint);
-
-       print qq|
-
-  <eprint>
-    <eprintid>$eprint->{eprintid}</eprintid>
-
-       |;
-
-       if ( $eprint->{full_path} ) {
-               print qq|
-
-    <documents>
-      <document>
-
-        <files>
-          <file>
-            <datasetid>document</datasetid>
-            <filename>$eprint->{filename}</filename>
-            <mime_type>application/pdf</mime_type>
-            <url>file://$eprint->{full_path}</url>
-          </file>
-        </files>
-        <mime_type>application/pdf</mime_type>
-        <format>application/pdf</format>
-        <language>hr</language>
-        <security>validuser</security>
-        <main>$eprint->{filename}</main>
-      </document>
-    </documents>
-
-               |;
-       }
-       print qq|
-
-    <eprint_status>archive</eprint_status>
-    <type>$eprint->{type}</type>
-    <metadata_visibility>show</metadata_visibility>
-    <creators>
-      <item>
-        <name>
-          <family>$eprint->{creators_family}</family>
-          <given>$eprint->{creators_given}</given>
-        </name>
-      </item>
-    </creators>
-    <title>$eprint->{title}</title>
-    <ispublished>unpub</ispublished>
-    <subjects>
-      <item>2.06</item>
-    </subjects>
-    <full_text_status>restricted</full_text_status>
-    <keywords>$eprint->{keywords}</keywords>
-    <date>$eprint->{date}</date>
-    <date_type>completed</date_type>
-    <pages>$eprint->{pages}</pages>
-    <institution>Grafički fakultet</institution>
-<!--
-    <department>strojevi</department>
--->
-    <thesis_date>$eprint->{thesis_date}</thesis_date>
-    <thesis_callnumber>$eprint->{thesis_callnumber}</thesis_callnumber>
-    <thesis_invnumber>$eprint->{thesis_invnumber}</thesis_invnumber>
-    <thesis_mentor>
-      <name>
-        <family>$eprint->{thesis_mentor_family}</family>
-        <given>$eprint->{thesis_mentor_given}</given>
-      </name>
-    </thesis_mentor>
-  </eprint>
-
-       |;
-
-}
-
-print qq{<?xml version="1.0" encoding="utf-8" ?>
-<eprints>
-};
-
 
 open(my $tsv_fh,  '<:encoding(UTF-8)', "$koha_path.tsv");
 open(my $marc_fh, '<', "$koha_path.marc");
 open(my $import_fh, '>', "$koha_path.import.marc");
 
 my $last_offset = 0;
+my @cols;
 
 while(<$tsv_fh>) {
+       chomp;
        my $line = $_;
        $line =~ s/[\n\r]+$//;
 
-       my ($offset, $biblionumber, $title) = split(/\t/,$_,3);
+       if ( ! @cols && $line =~ m/#(.+)/ ) {
+               @cols = split(/\t/, $1);
+               next;
+       }
 
-       warn "# $offset $biblionumber $title\n";
+       my @v = split(/\t/, $line, $#cols + 1);
+       my %row;
+       @row{@cols} = @v;
+#warn "## row = ",dump( \%row );
 
-       exit if $ENV{LAST} && $eprintid >= $ENV{LAST};
+       my $offset = $row{offset} // die "no offset";
+       my $biblionumber = $row{biblionumber} || die "no biblionumber";
 
-       my $item;
+#      warn "# ", join(' ', map { $row{$_} } qw(biblionumber title)), "\n";
 
-       if ( $item->{full_path} = $files->{$biblionumber} ) {
+       if ( delete $files->{$biblionumber} ) {
                $stat->{file}++;
 
-               seek $marc_fh, $last_offset, 0;
-               read $marc_fh, my $marc, $offset - $last_offset;
-               print $import_fh $marc;
-               warn "# marc $biblionumber $title\n";
+               if ( $info->{biblionumber}->{$biblionumber} ) {
+                       $stat->{existing}++;
+                       warn "EXISTING $biblionumber found in eprints\n";
+               } else {
+
+                       $stat->{new}++;
+
+                       seek $marc_fh, $last_offset, 0;
+                       read $marc_fh, my $marc, $offset - $last_offset;
+                       print $import_fh $marc;
+                       warn "# NEW ", join(' ', map { $row{$_} } qw(biblionumber title)), "\n";
+#                      warn "# NEW $biblionumber\n";
+
+               }
 
        } else {
                $stat->{missing}++;
@@ -283,11 +146,23 @@ while(<$tsv_fh>) {
 
 }
 
-print qq{
-</eprints>
-};
+warn "# files left ", dump($files);
 
+foreach my $biblionumber ( keys %$files ) {
 
-warn "# files left ", dump($files);
+       if ( $info->{biblionumber}->{$biblionumber} ) {
+               $stat->{existing}++;
+               warn "EXISTING $biblionumber found in eprints\n";
+               next;
+       }
+
+       if ( my $marc = get("https://koha.ffzg.hr/cgi-bin/koha/opac-export.pl?op=export&bib=$biblionumber&format=utf8") ) {
+               print $import_fh $marc;
+               warn "## marc $biblionumber from koha!";
+               $stat->{koha}++;
+       } else {
+               warn "ERROR: can't fetch $biblionumber from koha";
+       }
+}
 
 warn "# stat ", dump($stat);