From 283cdd22b65cb891275e417b1226c3dd13f6bd02 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sun, 17 Oct 2010 00:53:07 +0200 Subject: [PATCH] eprints-dev: /home/dpavlin/mtoolkit/export3data.pl [commit] --- mtoolkit/export3data.pl | 830 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 830 insertions(+) create mode 100755 mtoolkit/export3data.pl diff --git a/mtoolkit/export3data.pl b/mtoolkit/export3data.pl new file mode 100755 index 0000000..0e64000 --- /dev/null +++ b/mtoolkit/export3data.pl @@ -0,0 +1,830 @@ +#!/usr/bin/perl -w -I/data/eprints2/perl_lib + +# map eprints 2 formats to proper mime-types +# these will need configuring in eprints 3 +our %FORMAT_MAPPING = qw( + html text/html + pdf application/pdf + ps application/postscript + ascii text/plain + msword application/mssword + image image + latex latex + powerpoint application/vnd.ms-powerpoint + coverimage coverimage + other other +); + +=pod + +=head1 NAME + +B - export data from an eprints 2 repository in eprints 3 xml format + +=head1 SYNOPSIS + +B [B] I I [B] + +=head1 DESCRIPTION + +This tool will attempt to export data from an eprints 2 repository in a format suitable for import into an appropriately configured eprints 3 repository. This is probably a good place to make alterations to your metadata layout (but you will have to customise this script). + +This script will not allow you to export records that contain badly encoded records (because they'd just fail on import anyway). + +This script requires Perl IO, which is only in Perl 5.8 onwards. It is anticipated that you would copy your existing eprints 2 installation to a new server, parallel to your eprints 3 installation, before executing this script. + +=head1 ARGUMENTS + +=over 8 + +=item I + +The ID of the EPrint archive to export from. + +=item I + +The dataset to export. + +=back + +=head1 OPTIONS + +=over 8 + +=item B<--inline> + +Base-64 encode documents and include them in the XML output. + +=item B<--verbose> + +Be more verbose about what's going on (repeat for more verbosity). + +=item B<--skiplog> + +Specify a file to write eprint ids to that are in badly encoded UTF8. You will need to fix these eprints by hand. + +=back + +=cut + +use Carp; + +use Pod::Usage; + +our %AVAILABLE; + +if( $^V lt v5.8.0 ) +{ + print STDERR "Warning! You are using a Perl older than 5.8. The unicode-checking features of this utility require 5.8. You can continue, but you may have trouble importing the resulting XML file.\n"; +} + +use_module( "Encode", "Encode module is not available - this is required to check output is correctly formatted UTF-8" ); + +# $SIG{__DIE__} = $SIG{__WARN__} = sub { Carp::confess(@_) }; + +use EPrints::EPrint; +use EPrints::Session; +use EPrints::Subject; + +use Getopt::Long; + +use strict; +use warnings; + +our( $opt_help, $opt_skiplog, $opt_inline ); +our $opt_verbose = 0; + +GetOptions( + 'help' => \$opt_help, + 'verbose+' => \$opt_verbose, + 'skiplog=s' => \$opt_skiplog, + 'inline' => \$opt_inline, +) or pod2usage( 2 ); +pod2usage( 1 ) if $opt_help; +pod2usage( 2 ) if scalar @ARGV < 2; + +if( $opt_inline and !use_module( "PerlIO::via::Base64", "PerlIO::via::Base64 is required to inline file content" ) ) +{ + exit -1; +} + +my $SKIPLOG; +if( $opt_skiplog ) +{ + open($SKIPLOG, ">", $opt_skiplog) + or die "Unable to open $opt_skiplog for writing: $!"; +} + +# We can optionally only export a given set of items (very useful for +# debugging) +our @IDS = splice(@ARGV,2); + +############################################################################## +# End of Command-Line Arguments +############################################################################## + +# Global variables/constants +our $TOTAL = -1; +our $DONE = 0; +our $XMLNS = 'http://eprints.org/ep3/data/3.0'; +our $UTF8_QUOTE = pack('U',0x201d); # Opening quote +if( $AVAILABLE{ Encode } ) +{ + Encode::_utf8_off($UTF8_QUOTE); +} + +# Lets connect to eprints +my $session = new EPrints::Session( 1 , $ARGV[0] ); +exit( 1 ) unless( defined $session ); + +my $archive = $session->get_archive; + +my $fh = *STDOUT; + +binmode($fh, ":utf8") if $^V gt v5.7.0; + +if( $ARGV[1] eq "subjects" ) +{ + export_subjects(); +} +elsif( $ARGV[1] eq "eprints" ) +{ + export_eprints(); +} +elsif( $ARGV[1] eq "users" ) +{ + export_users(); +} +else +{ + print "Unknown dataset: $ARGV[1]. (users/eprints/subjects)\n"; +} + + +$session->terminate(); +exit; + + +sub export_eprints +{ + print $fh "\n"; + print $fh "\n\n"; + if( @IDS ) + { + $TOTAL = @IDS; + foreach my $id (@IDS) + { + my $item = EPrints::EPrint->new( $session, $id ); + if( !$item ) + { + die "$id does not exist\n"; + } + my $dataset = $item->get_dataset(); + print STDERR "Reading eprint $id from dataset ".$dataset->{id}."\n" if $opt_verbose > 1; + export_eprint( $session, $dataset, $item ); + } + } + else + { + my @datasets = qw( inbox buffer archive deletion ); + $TOTAL = 0; + foreach my $dsid ( @datasets ) + { + my $dataset = $archive->get_dataset( $dsid ); + $TOTAL += $dataset->count( $session ); + } + foreach my $dsid ( @datasets ) + { + print STDERR "Dataset: $dsid\n" if $opt_verbose; + my $dataset = $archive->get_dataset( $dsid ); + $dataset->map( $session, \&export_eprint ); + } + } + print $fh "\n"; +} + +sub export_users +{ + print $fh "\n"; + print $fh "\n\n"; + my $dataset = $archive->get_dataset( 'user' ); + if( @IDS ) + { + $TOTAL = @IDS; + foreach my $id (@IDS) + { + my $item = EPrints::User->new( $session, $id ); + if( !$item ) + { + die "$id does not exist\n"; + } + print STDERR "Reading user $id from dataset ".$dataset->{id}."\n" if $opt_verbose > 1; + export_user( $session, $dataset, $item ); + } + } + else + { + $dataset->map( $session, \&export_user ); + } + print $fh "\n"; +} + +sub export_subjects +{ + print $fh "\n"; + print $fh "\n\n"; + my $dataset = $archive->get_dataset( 'subject' ); + $dataset->map( $session, \&export_subject ); + print $fh "\n"; +} + + +sub export_subject +{ + my( $session, $dataset, $item ) = @_; + + my $subject = $session->make_element( 'subject', xmlns => $XMLNS ); + + foreach my $field ( $dataset->get_fields ) + { + my $name = $field->get_name; + next if $name eq "ancestors"; + my $value = $item->get_value( $name ); + next unless EPrints::Utils::is_set $value; + $subject->appendChild(export_value( $session, $field, $value )); + } + print $fh $subject->toString . "\n\n"; +} + + +sub export_user +{ + my( $session, $dataset, $item ) = @_; + + my $user = $session->make_element( 'user', xmlns => $XMLNS ); + + my $sql = "SELECT `password` FROM `users` WHERE `userid`=".$item->get_id; + ( $item->{data}->{password} ) = $session->get_db->{dbh}->selectrow_array( $sql ); + foreach my $field ( $dataset->get_fields ) + { + my $name = $field->get_name; + my $value = $item->get_value( $name ); + next unless EPrints::Utils::is_set $value; + $user->appendChild( export_value( $session, $field, $value ) ); + } + + print $fh $user->toString . "\n\n"; +} + +sub export_value +{ + my( $session, $field, $value ) = @_; + + my $name = $field->get_name; + + $name = 'creators' if $name eq 'authors'; + $name = 'pagerange' if $name eq 'pages' && $field->get_type eq "pagerange"; + + my $dom = $session->make_element( $name ); + + if( $field->get_property( "multilang" ) ) + { + if( $field->get_property( "multiple" ) ) + { + die "multiple+multilang fields not currently supported."; + } + + foreach my $langid ( keys %{$value} ) + { + my $item = $session->make_element( 'item' ); + $dom->appendChild( $item ); + + my $el_name = $session->make_element( 'name' ); + $item->appendChild( $el_name ); + $el_name->appendChild( rv($session, $field, $value->{$langid}) ); + + my $el_lang = $session->make_element( 'lang' ); + $item->appendChild( $el_lang ); + $el_lang->appendChild( $session->make_text( $langid ) ); + } + return $dom; + } + + + if( !$field->get_property( "multiple" ) ) + { + $dom->appendChild( rv($session, $field, $value) ); + return $dom; + } + + foreach my $v ( @{$value} ) + { + next unless EPrints::Utils::is_set($v); + $dom->appendChild( my $item = $session->make_element( 'item' ) ); + if( $field->get_property( "hasid" ) ) + { + if( EPrints::Utils::is_set($v->{id}) ) + { + my $tag = $session->make_element( 'id' ); + $item->appendChild( $tag ); + $tag->appendChild( $session->make_text( $v->{id} ) ); + } + if( EPrints::Utils::is_set($v->{main}) ) + { + my $tag = $session->make_element( 'name' ); + $item->appendChild( $tag ); + $tag->appendChild( rv( $session, $field, $v->{main} ) ); + } + } + else + { + $item->appendChild( rv( $session, $field, $v ) ); + } + } + return $dom; +} + +sub export_hashref +{ + my( $session, $value ) = @_; + + my $dom = $session->make_doc_fragment(); + + if( ref($value) eq 'HASH' ) + { + foreach my $key (keys %$value) + { + if( defined($value->{$key}) and $value->{$key} ne '' ) + { + my $el = $session->make_element( $key ); + $dom->appendChild( $el ); + $el->appendChild( export_hashref( $session, $value->{$key} ) ); + } + } + } + elsif( defined($value) ) + { + $dom->appendChild( $session->make_text( $value ) ); + } + + return $dom; +} + +sub export_dataobj +{ + my( $session, $name, $value ) = @_; + + my $dom = $session->make_element( $name ); + + if( ref($value) eq 'ARRAY' ) + { + foreach my $v ( @$value ) + { + my $item = $session->make_element( 'item' ); + $dom->appendChild( $item ); + if( ref($v) eq 'HASH' ) + { + foreach my $key (keys %$v) + { + my $el = $session->make_element( $key ); + $item->appendChild( $el ); + $el->appendChild( export_hashref($session, $v->{$key}) ); + } + } + else + { + $item->appendChild( $session->make_text( $v ) ); + } + } + } + elsif( defined( $value ) ) + { + $dom->appendChild( $session->make_text( $value ) ); + } + + return $dom; +} + +sub export_eprint +{ + my( $session, $dataset, $item ) = @_; + + $DONE++; + + print STDERR int(100*$DONE/$TOTAL) . " \% " . $item->get_id() . " \r" if $opt_verbose; + + my $eprint = $session->make_element( 'eprint', xmlns => $XMLNS ); + + $eprint->appendChild( $session->make_element( 'eprint_status' )) + ->appendChild( $session->make_text( $dataset->id )); + + foreach my $field ( $dataset->get_fields ) + { + my $name = $field->get_name; + next if $name eq "fileinfo"; + next if $name eq "date_issue"; + next if $name eq "date_effective"; + next if $name eq "date_sub"; + next if $name eq "dir"; + next if $name eq "month"; + next if $name eq "year"; + my $value = $item->get_value( $name ); + next unless EPrints::Utils::is_set $value; + + print STDERR "Adding field: $name\n" if $opt_verbose > 1; + + $eprint->appendChild( export_value( $session, $field, $value ) ); + } + + print STDERR "Processing date fields\n" if $opt_verbose > 1; + + my $date = ""; + my $date_type = ""; + if( $dataset->has_field( "year" ) && $item->is_set( "year" ) ) + { + $date = $item->get_value( "year" ); + if( $dataset->has_field( "month" ) && $item->is_set( "month" ) ) + { + my $month_num = { + jan=>"01", feb=>"02", mar=>"03", apr=>"04", + may=>"05", jun=>"06", jul=>"07", aug=>"08", + sep=>"09", "oct"=>"10", nov=>"11", dec=>"12", + january=>"01", + february=>"02", + march=>"03", + april=>"04", + may=>"05", + june=>"06", + july=>"07", + august=>"08", + september=>"09", + october=>"10", + november=>"11", + december=>"12", + }->{$item->get_value( "month" )}; + if( !defined $month_num ) + { + print STDERR "Warning: unknown month code: '".$item->get_value( "month" )."'\n"; + } + $date .= "-".$month_num; + } + $date_type = "published"; + } + if( $dataset->has_field( "date_sub" ) && $item->is_set( "date_sub" ) ) + { + $date = $item->get_value( "date_sub" ); + $date_type = "submitted"; + } + if( $dataset->has_field( "date_issue" ) && $item->is_set( "date_issue" ) ) + { + $date = $item->get_value( "date_issue" ); + $date_type = "published"; + } + if( $date eq "" && $dataset->has_field( "date_effective" ) && $item->is_set( "date_effective" ) ) + { + $date = $item->get_value( "date_effective" ); + $date_type = "published"; + } + $eprint->appendChild( $session->make_element( 'date' ) ) + ->appendChild( $session->make_text( $date ) ); + $eprint->appendChild( $session->make_element( 'date_type' ) ) + ->appendChild( $session->make_text( $date_type ) ); + + print STDERR "Processing documents\n" if $opt_verbose > 1; + + my $documents = $eprint->appendChild( $session->make_element( 'documents' ) ); + + my @docs = $item->get_all_documents; + + print STDERR "Got ".@docs." documents\n" if $opt_verbose > 2; + + foreach my $doc ( @docs ) + { + my $document = $documents->appendChild( $session->make_element( 'document' ) ); + my $docid = $doc->get_id; + $docid=~m/^(\d+)-(\d+)$/; + my $pos = $2+0; + + print STDERR "Processing document $pos\n" if $opt_verbose > 2; + + $document->appendChild( $session->make_element( 'eprintid' ) ) + ->appendChild($session->make_text($doc->get_value( 'eprintid' ))); + + my $format = $doc->get_value( 'format' ) || 'other'; + if( exists $FORMAT_MAPPING{$format} ) + { + $format = $FORMAT_MAPPING{$format}; + } + $document->appendChild( $session->make_element( 'format' ) ) + ->appendChild($session->make_text($format)); + + $document->appendChild( $session->make_element( 'language' ) ) + ->appendChild($session->make_text($doc->get_value( 'language' )||'')); + my $security = $doc->get_value( "security" ) || "public"; + $document->appendChild( $session->make_element( 'security' ) ) + ->appendChild($session->make_text($security)); + $document->appendChild( $session->make_element( 'main' ) ) + ->appendChild($session->make_text($doc->get_value( 'main' )||'')); + $document->appendChild( $session->make_element( 'pos' ) ) + ->appendChild($session->make_text($pos)); + + my $files = $document->appendChild( $session->make_element( 'files' ) ); + + my %filenames = $doc->files; + print STDERR "Contains ".scalar(keys(%filenames))." files\n" if $opt_verbose > 2; + + # No files in this document, destroy it (something odd happened) + if( scalar(keys %filenames) == 0 ) + { + $documents->removeChild( $document ); + } + else + { + foreach my $filename ( keys %filenames ) + { + my $file = $files->appendChild( $session->make_element( 'file' ) ); + + $file->appendChild($session->make_element( 'filename' )) + ->appendChild($session->make_text( $filename )); + my $fullpath = $doc->local_path."/".$filename; + $file->appendChild($session->make_element( 'data', + 'href' => "file://" . $fullpath )); + } + } + } + + # In eprints.soton we have multiple isbns, which are a compound of isbn and + # cover. There are some legacy records with a single isbn which we'll + # resurrect if isbns isn't set + +# print STDERR "Processing ISBNs\n" if $opt_verbose > 1; + +# if( $dataset->has_field( "isbns" ) and $item->is_set( "isbns" ) ) +# { +# my $values = $item->get_value( "isbns" ); +# if( defined $values ) +# { +# for( @$values ) +# { +# $_ = { +# isbn => $_->{main}, +# cover => ((defined($_->{id}) and $_->{id} ne '') ? $_->{id} : 'unspecified'), +# }; +# } +# $eprint->appendChild( export_dataobj( $session, "isbns" , $values ) ); +# } +# } +# elsif( $item->is_set( "isbn" ) ) +# { +# my $value = $item->get_value( "isbn" ); +# $value = { +# isbn => $value, +# cover => 'unspecified' +# }; +# $eprint->appendChild( export_dataobj( $session, "isbns", [$value] )); +# } + + # In eprints 3 issns will be flagged as electronic or paper (another + # compound field) + +# print STDERR "Processing ISSN\n" if $opt_verbose > 1; + +# if( $dataset->has_field( "issn" ) and $item->is_set( "issn" ) ) +# { +# my $value = $item->get_value( "issn" ); +# $eprint->appendChild( export_dataobj( $session, "issns" , [ { issn => $value, cover => 'unspecified' } ] ) ); +# } + + # More fields being turned into compounds + +# print STDERR "Processing exhibition_eventlocdate\n" if $opt_verbose > 1; + +# if( $dataset->has_field( "exhibition_eventlocdate" ) and $item->is_set( "exhibition_eventlocdate" ) ) +# { +# my $values = $item->get_value( "exhibition_eventlocdate" ); +# if( defined $values ) +# { +# for(@$values) +# { +# my( $date, $venue ) = split /\|/, $_, 2; +# $_ = { +# venue => $venue, +# date => $date, +# }; +# } + +# $eprint->appendChild( export_dataobj( $session, "venue_date", $values ) ); +# } +# } + + # In eprints.soton we store the staff id for all RAE-returnable fields (or, + # if not a member of staff, 'internal', 'external' or 'unknown'). In + # eprints 3 this is obviously a compound field, whereas in 2 it was two + # fields that were kept synchronised. + # (We didn't use the id part in eprints 2, because we don't want users to be + # able to directly edit the staff id bit) + +# foreach my $namefield (qw( creators editors exhibitors )) +# { +# print STDERR "Processing $namefield\n" if $opt_verbose > 1; + +# if( $dataset->has_field( $namefield ) and $item->is_set( $namefield ) ) +# { +# my $names = $item->get_value( $namefield ); +# my $staffids = $item->get_value( $namefield."_empid" ) || []; + +# Ignore the id +# for(@$names) +# { +# $_ = { +# name => $_->{main}, +# staffid => 'unknown', +# }; +# } + +# for(my $i = 0; $i < @$staffids; $i++) +# { +# if( $staffids->[$i] ne '' ) +# { +# $names->[$i]->{staffid} = $staffids->[$i]; +# } +# } + +# $eprint->appendChild( export_dataobj( $session, $namefield, $names )); +# } +# } + + # Check that our output is valid utf8, otherwise we'll have trouble parsing + # it (and import is much, much slower than export) + # You might want to modify this to automatically replace bad chars with a + # '?' or similar, but it's probably better to manually inspect and fix + # problems. + + my $xml = $eprint->toString(); + $xml =~ s/\xe2\x80\x3f/$UTF8_QUOTE/sg; # Fix word's bespoke quote for Unicode +# $xml =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g; + my $error; + unless( check_utf8($xml, \$error) ) + { + if( defined($SKIPLOG) ) + { + print $SKIPLOG $item->{dataset}->{id} . "\t" . $item->get_id . "\t$error\n"; + } + else + { + print STDERR "Fix invalid utf8 in eprint " . $item->get_id . " (or use the skiplog argument to log all unexportable eprints): $error\n"; + exit; + } + return; + } + + # inject the base64-encoded files + if( $opt_inline ) + { + print STDERR "Injecting base64 encoded files\n" if $opt_verbose > 1; + # locate the fields + my( $pre, @files ) = split /(]+(?:>\s*<\/\s*data\s*>|\/>))/, $xml; + @files = grep { length($_) } @files; # remove the tween bits + my $post = pop @files; + + print $fh $pre; + foreach my $data (@files) + { + ($data) = EPrints::XML::parse_xml_string( $data )->getElementsByTagName( 'data' ); + print $fh ""; + my $url = $data->getAttribute( 'href' ); + $url =~ s/^file:\/\///; + write_base64_file( $fh, $url ); + print $fh "\n"; + } + print $fh $post if defined $post; + } + else + { + print $fh $xml . "\n"; + } + + print STDERR "Done Processing Eprint: " . $item->get_id . "\n" if $opt_verbose > 1; +} + +# Handle name fields correctly (should this include id???) + +sub rv +{ + my( $session, $field, $value ) = @_; + + my $dom = $session->make_doc_fragment; + + if( $field->is_type( "name" ) ) + { + foreach my $p ( qw/ family given lineage honourific / ) + { + next if !EPrints::Utils::is_set( $value->{$p} ); + my $tag = $session->make_element( $p ); + $dom->appendChild( $tag ); + $tag->appendChild( $session->make_text( $value->{$p} ) ); + } + } + else + { + $dom->appendChild( $session->make_text( $value ) ); + } + + return $dom; +} + +# write a $filename to $out in base64 encoding + +sub write_base64_file +{ + my( $out, $filename ) = @_; + + binmode($out, ":via(Base64)"); + open(my $fh, "<", $filename) or die "Unable to open $filename: $!\n"; + binmode($fh); + while(read($fh, my $buffer, 4096)) + { + print $out $buffer; + } + close($fh); + binmode($out, ":pop"); +} + +# fill $error with the locations of bad chars in $bytes +# returns true if the string is ok + +sub check_utf8 +{ + my( $bytes, $error ) = @_; + + return 1 unless $AVAILABLE{ Encode }; + + my $max_errors = 10; + $$error = ''; + + do { + my $str = Encode::decode("utf8", $bytes, eval 'Encode::FB_QUIET'); + if( length($bytes) ) + { + $str =~ s/^.+(.{40})$/... $1/s; + $$error .= "Bad char '$str'<--HERE!!! "; + while( length($bytes) and ord(substr($bytes, 0, 1)) > 0x80 ) + { + substr($bytes, 0, 1) = ''; + } + } + } while( length($bytes) and $max_errors-- ); + + return length($$error) == 0; +} + +sub use_module +{ + my( $name, $msg ) = @_; + + eval "use $name;"; + + if( $@ ) + { + print STDERR "$msg\n"; + return $AVAILABLE{$name} = 0; + } + else + { + return $AVAILABLE{$name} = 1; + } +} + +__DATA__ + 1 + 11 + buffer + 1 + disk0/00/00/00/01 + 2006-12-18 17:11:56 + 2006-12-18 17:11:56 + release + show + http://files3.eprints.org/style/images/fileicons/html.png;http://files3.eprints.org/1/1/versions.txt + Other + + + 1 + 3 + 1 + 1 + html + en + validuser +
versions.txt
+ + + versions.txt + 3515 + http://files3.eprints.org/1/1/versions.txt + + +
+
+ + -- 2.20.1