first version of snz repository import script

author dpavlin <dpavlin@rot13.org>

Wed, 20 May 2015 21:59:29 +0000 (23:59 +0200)

committer dpavlin <dpavlin@rot13.org>

Wed, 20 May 2015 22:00:00 +0000 (00:00 +0200)
author dpavlin <dpavlin@rot13.org>
Wed, 20 May 2015 21:59:29 +0000 (23:59 +0200)
committer dpavlin <dpavlin@rot13.org>
Wed, 20 May 2015 22:00:00 +0000 (00:00 +0200)
diff --git a/EPrints/Plugin/Import/MARC.pm b/EPrints/Plugin/Import/MARC.pm

new file mode 100644 (file)

index 0000000..9693f62
--- /dev/null
+++ b/EPrints/Plugin/Import/MARC.pm
@@ -0,0 +1,175 @@
+package EPrints::Plugin::Import::MARC;
+
+=head1 NAME
+
+EPrints::Plugin::Import::MARC -- imports MARC records
+
+=head1 DESCRIPTION
+
+This plugin allows you to import MARC and MARC XML records into GNU EPrints.
+
+=head1 CONFIGURATION
+
+Configuration can be changed in cfg.d/marc.pl. The web server needs to be restarted after any configuration change.
+
+=head1 COPYRIGHT AND LICENSE
+
+(C) 2008 Jose Miguel Parrella Romero <bureado@cpan.org>
+(C) 2013 Dobrica Pavlinušić <dpavlin@rot13.org>
+This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
+
+=cut
+
+use Data::Dump qw(dump);
+
+use Encode;
+use strict;
+
+our @ISA = qw/EPrints::Plugin::Import/;
+
+# Constructor: registers the plugin as "MARC", visible to "all", producing list/eprint.
+# When MARC::Record or MARC::File::USMARC cannot be loaded the plugin is hidden and {error} is set.
+sub new
+{
+       my( $class, %params ) = @_;
+       my $self = $class->SUPER::new( %params );
+
+       $self->{name} = "MARC";
+       $self->{visible} = "all";
+       $self->{produce} = [ 'list/eprint' ];
+
+       # "&&", not "and": "and" binds looser than "=", so $rc used to reflect only the first require
+       my $rc = EPrints::Utils::require_if_exists("MARC::Record") && EPrints::Utils::require_if_exists("MARC::File::USMARC");
+       unless( $rc ) 
+       {
+               $self->{visible} = "";
+               $self->{error} = "Failed to load required modules.";
+       }
+       return $self;
+}
+
+# Import MARC records read from the filehandle $opts{fh} into $opts{dataset}.
+# Returns an EPrints::List with the ids of the created objects, or undef if the MARC input can't be opened.
+# NOTE(review): unlike input_file, this path does not invoke the "dataobj_callback" -- confirm intended.
+sub input_fh
+{
+       my( $plugin, %opts ) = @_;
+
+       my @ids;
+       my $file = MARC::File::USMARC->in( $opts{fh} );
+       unless( defined $file )  # in() returns undef on failure; calling ->next on it would die
+       {
+               $plugin->error("Failed to open MARC input");
+               return undef;
+       }
+
+       while ( my $marc = $file->next() ) {
+               my $epdata = $plugin->convert_input( $marc );
+               next unless( defined $epdata );
+
+               my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
+               push @ids, $dataobj->get_id if defined $dataobj;
+       }
+
+       return EPrints::List->new( dataset => $opts{dataset}, session => $plugin->{session}, ids => \@ids );
+}
+
+# Import MARC records from the file $opts{filename} into $opts{dataset}.
+# Every created data object is handed to the optional "dataobj_callback" of the "marc" config.
+# Returns an EPrints::List of the created ids, or undef for STDIN ('-') or a file that can't be opened.
+sub input_file
+{
+       my( $plugin, %opts ) = @_;
+
+       if( $opts{filename} eq '-' )
+       {
+               $plugin->error("Does not support input from STDIN");
+               return undef;
+       }
+
+       my @ids;
+       my $file = MARC::File::USMARC->in( $opts{filename} );
+       unless( defined $file )  # in() returns undef when the file can't be opened
+       {
+               $plugin->error("Failed to open MARC file $opts{filename}");
+               return undef;
+       }
+
+       # loop-invariant: look the callback up once instead of once per record
+       my $code = $plugin->{session}->get_repository->get_conf( "marc" )->{dataobj_callback};
+
+       while ( my $marc = $file->next() ) {
+               my $epdata = $plugin->convert_input( $marc );
+               next unless( defined $epdata );
+               my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
+               next unless( defined $dataobj );
+               $code->($dataobj) if $code;  # its return value was never used
+               push @ids, $dataobj->get_id;
+       }
+
+       return EPrints::List->new( dataset => $opts{dataset}, session => $plugin->{session}, ids => \@ids );
+}
+
+our $debug;
+
+# Convert one MARC::Record into an epdata hashref, driven by the "marc" repository config:
+# {marc2ep} maps tag+subfield keys to eprint field names, {importSubjects} enables 650$a subjects and
+# {epdata_callback}, if set, post-processes the result. Returns undef when no value was stored.
+sub convert_input 
+{
+       my ( $plugin, $marc ) = @_;
+       my $epdata; # to be returned; stays undef until the first value is stored
+
+       # Taken from cfg.d/marc.pl
+       my $conf = $plugin->{session}->get_repository->get_conf( "marc" );
+       my %mappings = %{ $conf->{marc2ep} };
+
+       my $dataset = $plugin->{session}->get_dataset('archive');
+
+       foreach my $marc_field ( $marc->fields() ) {        # each field of the record
+               my $t = $marc_field->tag();
+               my @list = grep ( /^\Q$t\E/, keys %mappings );  # lookup for mappings
+               foreach my $i ( sort @list ) {
+                       my ( $s ) = $i =~ /^\Q$t\E(.)/;     # mapped subfield; undef for a tag-only key
+                       my $ts = defined $s ? $t . $s : $t; # complete tag+subfield
+                       my $ep_field = $mappings{$ts};      # own name: no longer shadows the MARC field
+                       my $metafield = $ep_field && $dataset->get_field($ep_field);
+                       unless ( $metafield ) {             # skip instead of dying on an undef metafield
+                               $plugin->error("no mapping for $ts");
+                               next;
+                       }
+                       my $value = $marc_field->as_string($s);
+
+                       if ($metafield->get_property('multiple')) {
+                               warn "# multiple $ep_field ",dump( $metafield ) if ! $debug->{$ep_field}++;
+                               $epdata->{$ep_field} = [ { name => $value } ];
+                       } else {
+                               $epdata->{$ep_field} = $value; # bye!
+                       }
+               }
+       }
+
+       # Authors
+       my $field = $marc->field('100');
+       if ( defined $field ) {
+               foreach my $i ( $field->subfield('a') ) {
+                       my %name;
+                       @name{qw( family given )} = split ( /,\s*/, $i, 2 ); # drop space after comma; split once only
+                       push @{ $epdata->{creators_name} }, \%name;
+               }
+       }
+
+       # Subjects: a record may hold several 650 fields, so walk all of them (list context), not just the first
+       if ( $conf->{importSubjects} ) {
+               foreach my $subject_field ( $marc->field('650') ) {
+                       push @{ $epdata->{subjects} }, $subject_field->subfield('a');
+               }
+       }
+
+       # Callback
+       if ( my $code = $conf->{epdata_callback} ) {
+               $epdata = $code->($epdata);
+       }
+       return $epdata;
+}
+
+1;
diff --git a/mkp-share.sh b/mkp-share.sh

deleted file mode 100755 (executable)

index f250cca..0000000
--- a/mkp-share.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh -xe
-
-cd "/mnt/share/MKP/ELEKTRONIČKI DOKUMENTI/EL.DOKUMENTI PO BIBLIOBROJU/do18052015/"
-find . -name '[0-9]*.pdf' -print | sed 's,./,,' | tee pdf-files.list
diff --git a/snz-rebuild.sh b/snz-rebuild.sh

new file mode 100755 (executable)

index 0000000..06fb8ec
--- /dev/null
+++ b/snz-rebuild.sh
@@ -0,0 +1,8 @@
+#!/bin/sh -xe
+# Rebuild the snz repository as user eprints: epadmin update, erase all eprints, re-import /tmp/koha_ffzg.marc with the MARC plugin, regenerate views.
+#cd "/mnt/share/MKP/ELEKTRONIČKI DOKUMENTI/EL.DOKUMENTI PO BIBLIOBROJU/do18052015/"
+#find . -name '[0-9]*.pdf' -print | sed 's,./,,' | tee pdf-files.list
+sudo -u eprints /usr/share/eprints3/bin/epadmin update snz
+sudo -u eprints /usr/share/eprints3/bin/epadmin --force erase_eprints snz
+sudo -u eprints /usr/share/eprints3/bin/import --update --force snz eprint MARC /tmp/koha_ffzg.marc
+sudo -u eprints /usr/share/eprints3/bin/generate_views snz
diff --git a/tsv2eprints.pl b/tsv2eprints.pl

new file mode 100755 (executable)

index 0000000..173007d
--- /dev/null
+++ b/tsv2eprints.pl
@@ -0,0 +1,282 @@
+#!/usr/bin/perl -I /usr/share/eprints3/perl_lib
+use warnings;
+use strict;
+use autodie;
+use utf8;
+
+# export single record to get structure:
+# sudo -u eprints /usr/share/eprints3/bin/export snz archive XMLFiles 20 > /tmp/20.xml
+
+# Import procedure:
+
+# 3. import with:
+# sudo -u eprints /usr/share/eprints3/bin/import --verbose --migration --enable-file-imports --update --enable-import-fields grf eprint XML /tmp/xml
+#
+# 4. re-run view generation
+# sudo -u eprints /usr/share/eprints3/bin/generate_views grf --verbose
+
+use Encode;
+use Data::Dump qw(dump);
+use Storable;
+
+=for eprints-api
+
+use EPrints;
+my $institution = 'Grafički fakultet';
+
+my $ep = EPrints->new();
+my $repo = $ep->repository( 'grf' );
+$repo->{config}->{enable_file_imports} = 1;
+$repo->{config}->{enable_web_imports} = 1;
+
+my $dataset = $repo->dataset( 'eprint' );
+my $list = $dataset->search;
+my $count = $list->count;
+warn "# found [$count] eprints\n";
+
+my $eprint = $dataset->dataobj( 21 );
+warn dump( $eprint->get_value('institution'), $institution );
+
+warn dump( $eprint );
+
+if ( $eprint->get_value( 'institution' ) ne $institution ) {
+       $eprint->set_value( 'institution' => $institution );
+       $eprint->save_revision();
+       $eprint->commit();
+}
+
+$repo->terminate();
+
+=cut
+
+our $eprintid = 1;     # next eprintid handed out by dump_item
+
+my $files = {};        # file id (leading digits of the basename, else the whole basename) => full path
+
+my $mkp_path = "/mnt/share/MKP/ELEKTRONIČKI DOKUMENTI/EL.DOKUMENTI PO BIBLIOBROJU/do18052015/";
+my $koha_path = "/tmp/koha_ffzg";
+
+# index every PDF found below $mkp_path by its file id
+open(my $fh, '-|:encoding(UTF-8)', 'find "' . $mkp_path . '" -iname "*.pdf"');
+while(my $full_path = <$fh>) {
+       chomp $full_path;
+
+       # "my $x = ... if COND" has undefined behaviour in Perl: match in list context and skip on failure
+       my ( $file ) = $full_path =~ m{/([^/]+)\.pdf\z}i;
+       unless ( defined $file ) {
+               warn "# SKIP $full_path\n";
+               next;
+       }
+
+       my $file_id = $file =~ m/^(\d+)/ ? $1 : $file;
+
+       warn "# $file_id\t$full_path\n";
+       $files->{ $file_id } = $full_path;
+}
+
+warn "# got ", scalar keys %$files, " files\n";
+
+store $files, "$koha_path.biblionumber.file";
+
+my $stat;              # counters (file / missing) filled by the TSV loop below
+my $this_id = '';
+my $item;
+
+binmode STDOUT, ":utf8";
+
+# Print one <eprint> XML element to STDOUT for $item, a hashref of tag => [ values ]; dies when tag 200 is missing.
+# Field 200 is split into title / tip / autor / mentor; the PDF is looked up in $files by 991, then by "300 700".
+# All values are XML-escaped before printing. Increments the global $eprintid.
+# NOTE(review): $eprint->{type} is never set (only $item->{tip} is parsed), so <type> is emitted empty -- confirm.
+sub dump_item {
+       my $item = shift || return;
+       my $f200 = $item->{200}->[0] || die "no 200 in ",dump($item);
+
+       if ( $f200 =~ s/\s*;\s*([^;]+?)$//i ) {
+               $item->{mentor} = $1;
+               $item->{mentor} =~ s/^\s*voditelji?\s*(?:rada)\s*//i; # NOTE(review): "rada" is mandatory here -- was (?:rada)? intended?
+       } else {
+               warn "MISSING ; voditelj [$f200]\n";
+       }
+
+       if ( $f200 =~ s{\s*/\s*([^/]+?)$}{} ) {
+               $item->{autor} = $1;
+       } else {
+               warn "MISSING / autor [$f200]\n";
+       }
+
+       if ( $f200 =~ s{\s*:\s*([^:]+?)$}{} ) {
+               $item->{tip} = lc($1);
+       } else {
+               warn "MISSING : tip [$f200]\n";
+       }
+
+       $item->{title} = $f200;
+
+       if ( exists $item->{991} ) {
+               my $file_id = $item->{991}->[0];
+               if ( exists $files->{ $file_id } ) {
+                       $item->{full_path} = delete $files->{ $file_id };
+               } elsif ( $file_id =~ s/(\w)0*(\d)/$1$2/ ) {
+                       if ( exists $files->{ $file_id } ) {
+                               $item->{full_path} = delete $files->{ $file_id };
+                       }
+               }
+       }
+
+       if ( ! exists $item->{full_path} ) {
+               my $file_id = ucfirst( $item->{300}->[0] . ' ' . $item->{700}->[0] );
+               $file_id =~ s/[\.\,]//g;
+               if ( exists $files->{ $file_id } ) {
+                       $item->{full_path} = delete $files->{ $file_id };
+               }
+       }
+
+       warn "MISSING file for $eprintid\n" unless exists $item->{full_path};
+       warn "# item ",dump($item);
+
+       my $eprint = {
+               eprintid => $eprintid++
+       };
+
+       $eprint->{filename} = $1 if defined $item->{full_path} && $item->{full_path} =~ m{/([^/]+)$};
+       $eprint->{full_path} = $item->{full_path};
+       $eprint->{date} = $1 if $item->{210}->[0] =~ m/\$d(\d+)/;
+       $eprint->{pages} = $1 if $item->{215}->[0] =~ m/^(\d+)/;
+       ( $eprint->{creators_family}, $eprint->{creators_given} ) = split(/,\s*/, $item->{700}->[0] );
+       $eprint->{title} = $item->{title};
+       $eprint->{keywords} = join(", ", @{ $item->{610} }) if exists $item->{610};
+
+       if ( exists $item->{700}->[1] ) {
+               ( $eprint->{thesis_mentor_family}, $eprint->{thesis_mentor_given} ) = split(/,\s*/, $item->{700}->[1] );
+       } elsif ( $item->{mentor} ) {
+               ( $eprint->{thesis_mentor_given}, $eprint->{thesis_mentor_family} ) = split(/\s+/, $item->{mentor} );
+       }
+
+       $eprint->{thesis_mentor_family} =~ s/(\S+)\s*-\s*(\S+)/$1-$2/ if defined $eprint->{thesis_mentor_family}; # fix spaces between dash in double surname
+
+       $eprint->{thesis_callnumber} = $item->{990}->[0];
+       $eprint->{thesis_invnumber} =  $item->{991}->[0]; # FIXME?
+
+       # fallback for works without a date: use the year found in 990
+       if ( ! $eprint->{thesis_date} && $item->{990}->[0] =~ m{/(\d\d\d\d)/} ) {
+               $eprint->{thesis_date} = $1;
+       }
+
+       if ( ! $eprint->{date} ) {
+               $eprint->{date} = $eprint->{thesis_date};
+       }
+
+       warn "# eprint ",dump($eprint);
+       # escape XML special characters in place (values/grep/for all alias the hash values); & must go first
+       s/&/&amp;/g, s/</&lt;/g, s/>/&gt;/g for grep { defined } values %$eprint;
+
+       print qq|
+
+  <eprint>
+    <eprintid>$eprint->{eprintid}</eprintid>
+
+       |;
+
+       if ( $eprint->{full_path} ) {
+               print qq|
+
+    <documents>
+      <document>
+
+        <files>
+          <file>
+            <datasetid>document</datasetid>
+            <filename>$eprint->{filename}</filename>
+            <mime_type>application/pdf</mime_type>
+            <url>file://$eprint->{full_path}</url>
+          </file>
+        </files>
+        <mime_type>application/pdf</mime_type>
+        <format>application/pdf</format>
+        <language>hr</language>
+        <security>validuser</security>
+        <main>$eprint->{filename}</main>
+      </document>
+    </documents>
+
+               |;
+       }
+       print qq|
+
+    <eprint_status>archive</eprint_status>
+    <type>$eprint->{type}</type>
+    <metadata_visibility>show</metadata_visibility>
+    <creators>
+      <item>
+        <name>
+          <family>$eprint->{creators_family}</family>
+          <given>$eprint->{creators_given}</given>
+        </name>
+      </item>
+    </creators>
+    <title>$eprint->{title}</title>
+    <ispublished>unpub</ispublished>
+    <subjects>
+      <item>2.06</item>
+    </subjects>
+    <full_text_status>restricted</full_text_status>
+    <keywords>$eprint->{keywords}</keywords>
+    <date>$eprint->{date}</date>
+    <date_type>completed</date_type>
+    <pages>$eprint->{pages}</pages>
+    <institution>Grafički fakultet</institution>
+<!--
+    <department>strojevi</department>
+-->
+    <thesis_date>$eprint->{thesis_date}</thesis_date>
+    <thesis_callnumber>$eprint->{thesis_callnumber}</thesis_callnumber>
+    <thesis_invnumber>$eprint->{thesis_invnumber}</thesis_invnumber>
+    <thesis_mentor>
+      <name>
+        <family>$eprint->{thesis_mentor_family}</family>
+        <given>$eprint->{thesis_mentor_given}</given>
+      </name>
+    </thesis_mentor>
+  </eprint>
+
+       |;
+
+}
+
+print qq{<?xml version="1.0" encoding="utf-8" ?>
+<eprints>
+};
+
+# NOTE(review): $tsv_marc is opened (so a missing .marc file is fatal under autodie) but never read here
+open(my $tsv_fh,   '<:encoding(UTF-8)', "$koha_path.tsv");
+open(my $tsv_marc, '<:encoding(UTF-8)', "$koha_path.marc");
+
+# each TSV line is split on tabs into offset, biblionumber and title
+while(my $line = <$tsv_fh>) {
+       $line =~ s/[\n\r]+$//;
+
+       # split the stripped $line (the original split $_), so $title does not keep the line ending
+       my ($offset, $biblionumber, $title) = split(/\t/,$line,3);
+       next unless defined $biblionumber;      # skip blank or malformed lines
+       warn "# $offset $biblionumber $title\n";
+
+       # "last" instead of "exit", so the closing </eprints> below is still printed
+       last if $ENV{LAST} && $eprintid >= $ENV{LAST};
+
+       my $item;
+       if ( $item->{full_path} = $files->{$biblionumber} ) {
+               $stat->{file}++;
+       } else {
+               $stat->{missing}++;
+       }
+}
+
+print qq{
+</eprints>
+};
+
+
+warn "# files left ", dump($files);
+
+warn "# stat ", dump($stat);
author	dpavlin <dpavlin@rot13.org>
	Wed, 20 May 2015 21:59:29 +0000 (23:59 +0200)
committer	dpavlin <dpavlin@rot13.org>
	Wed, 20 May 2015 22:00:00 +0000 (00:00 +0200)
EPrints/Plugin/Import/MARC.pm	[new file with mode: 0644]	patch \| blob
mkp-share.sh	[deleted file]	patch \| blob \| history
snz-rebuild.sh	[new file with mode: 0755]	patch \| blob
tsv2eprints.pl	[new file with mode: 0755]	patch \| blob