1 #!/usr/bin/perl -w -I/data/eprints2/perl_lib
3 # map eprints 2 formats to proper mime-types
4 # these will need configuring in eprints 3
5 our %FORMAT_MAPPING = qw(
8 ps application/postscript
10 msword application/mssword
13 powerpoint application/vnd.ms-powerpoint
22 B<export3data.pl> - export data from an eprints 2 repository in eprints 3 xml format
26 B<export3data.pl> [B<options>] I<archive> I<eprints|users|subjects> [B<list of ids>]
30 This tool will attempt to export data from an eprints 2 repository in a format suitable for import into an appropriately configured eprints 3 repository. This is probably a good place to make alterations to your metadata layout (but you will have to customise this script).
32 This script will not allow you to export records that contain badly encoded records (because they'd just fail on import anyway).
34 This script requires Perl IO, which is only in Perl 5.8 onwards. It is anticipated that you would copy your existing eprints 2 installation to a new server, parallel to your eprints 3 installation, before executing this script.
42 The ID of the EPrint archive to export from.
44 =item I<eprints|users|subjects>
46 The dataset to export.
56 Base-64 encode documents and include them in the XML output.
60 Be more verbose about what's going on (repeat for more verbosity).
64 Specify a file to write eprint ids to that are in badly encoded UTF8. You will need to fix these eprints by hand.
78 print STDERR "Warning! You are using a Perl older than 5.8. The unicode-checking features of this utility require 5.8. You can continue, but you may have trouble importing the resulting XML file.\n";
81 use_module( "Encode", "Encode module is not available - this is required to check output is correctly formatted UTF-8" );
83 # $SIG{__DIE__} = $SIG{__WARN__} = sub { Carp::confess(@_) };
94 our( $opt_help, $opt_skiplog, $opt_inline );
99 'verbose+' => \$opt_verbose,
100 'skiplog=s' => \$opt_skiplog,
101 'inline' => \$opt_inline,
103 pod2usage( 1 ) if $opt_help;
104 pod2usage( 2 ) if scalar @ARGV < 2;
106 if( $opt_inline and !use_module( "PerlIO::via::Base64", "PerlIO::via::Base64 is required to inline file content" ) )
114 open($SKIPLOG, ">", $opt_skiplog)
115 or die "Unable to open $opt_skiplog for writing: $!";
118 # We can optionally only export a given set of items (very useful for
120 our @IDS = splice(@ARGV,2);
122 ##############################################################################
123 # End of Command-Line Arguments
124 ##############################################################################
126 # Global variables/constants
129 our $XMLNS = 'http://eprints.org/ep3/data/3.0';
130 our $UTF8_QUOTE = pack('U',0x201d); # Opening quote
131 if( $AVAILABLE{ Encode } )
133 Encode::_utf8_off($UTF8_QUOTE);
136 # Lets connect to eprints
137 my $session = new EPrints::Session( 1 , $ARGV[0] );
138 exit( 1 ) unless( defined $session );
140 my $archive = $session->get_archive;
144 binmode($fh, ":utf8") if $^V gt v5.7.0;
146 if( $ARGV[1] eq "subjects" )
150 elsif( $ARGV[1] eq "eprints" )
154 elsif( $ARGV[1] eq "users" )
160 print "Unknown dataset: $ARGV[1]. (users/eprints/subjects)\n";
164 $session->terminate();
170 print $fh "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n";
171 print $fh "<eprints>\n\n";
175 foreach my $id (@IDS)
177 my $item = EPrints::EPrint->new( $session, $id );
180 die "$id does not exist\n";
182 my $dataset = $item->get_dataset();
183 print STDERR "Reading eprint $id from dataset ".$dataset->{id}."\n" if $opt_verbose > 1;
184 export_eprint( $session, $dataset, $item );
189 my @datasets = qw( inbox buffer archive deletion );
191 foreach my $dsid ( @datasets )
193 my $dataset = $archive->get_dataset( $dsid );
194 $TOTAL += $dataset->count( $session );
196 foreach my $dsid ( @datasets )
198 print STDERR "Dataset: $dsid\n" if $opt_verbose;
199 my $dataset = $archive->get_dataset( $dsid );
200 $dataset->map( $session, \&export_eprint );
203 print $fh "</eprints>\n";
208 print $fh "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n";
209 print $fh "<users>\n\n";
210 my $dataset = $archive->get_dataset( 'user' );
214 foreach my $id (@IDS)
216 my $item = EPrints::User->new( $session, $id );
219 die "$id does not exist\n";
221 print STDERR "Reading user $id from dataset ".$dataset->{id}."\n" if $opt_verbose > 1;
222 export_user( $session, $dataset, $item );
227 $dataset->map( $session, \&export_user );
229 print $fh "</users>\n";
234 print $fh "<?xml version=\"1.0\" encoding=\"utf-8\" ?>\n";
235 print $fh "<subjects>\n\n";
236 my $dataset = $archive->get_dataset( 'subject' );
237 $dataset->map( $session, \&export_subject );
238 print $fh "</subjects>\n";
244 my( $session, $dataset, $item ) = @_;
246 my $subject = $session->make_element( 'subject', xmlns => $XMLNS );
248 foreach my $field ( $dataset->get_fields )
250 my $name = $field->get_name;
251 next if $name eq "ancestors";
252 my $value = $item->get_value( $name );
253 next unless EPrints::Utils::is_set $value;
254 $subject->appendChild(export_value( $session, $field, $value ));
256 print $fh $subject->toString . "\n\n";
262 my( $session, $dataset, $item ) = @_;
264 my $user = $session->make_element( 'user', xmlns => $XMLNS );
266 my $sql = "SELECT `password` FROM `users` WHERE `userid`=".$item->get_id;
267 ( $item->{data}->{password} ) = $session->get_db->{dbh}->selectrow_array( $sql );
268 foreach my $field ( $dataset->get_fields )
270 my $name = $field->get_name;
271 my $value = $item->get_value( $name );
272 next unless EPrints::Utils::is_set $value;
273 $user->appendChild( export_value( $session, $field, $value ) );
276 print $fh $user->toString . "\n\n";
281 my( $session, $field, $value ) = @_;
283 my $name = $field->get_name;
285 $name = 'creators' if $name eq 'authors';
286 $name = 'pagerange' if $name eq 'pages' && $field->get_type eq "pagerange";
288 my $dom = $session->make_element( $name );
290 if( $field->get_property( "multilang" ) )
292 if( $field->get_property( "multiple" ) )
294 die "multiple+multilang fields not currently supported.";
297 foreach my $langid ( keys %{$value} )
299 my $item = $session->make_element( 'item' );
300 $dom->appendChild( $item );
302 my $el_name = $session->make_element( 'name' );
303 $item->appendChild( $el_name );
304 $el_name->appendChild( rv($session, $field, $value->{$langid}) );
306 my $el_lang = $session->make_element( 'lang' );
307 $item->appendChild( $el_lang );
308 $el_lang->appendChild( $session->make_text( $langid ) );
314 if( !$field->get_property( "multiple" ) )
316 $dom->appendChild( rv($session, $field, $value) );
320 foreach my $v ( @{$value} )
322 next unless EPrints::Utils::is_set($v);
323 $dom->appendChild( my $item = $session->make_element( 'item' ) );
324 if( $field->get_property( "hasid" ) )
326 if( EPrints::Utils::is_set($v->{id}) )
328 my $tag = $session->make_element( 'id' );
329 $item->appendChild( $tag );
330 $tag->appendChild( $session->make_text( $v->{id} ) );
332 if( EPrints::Utils::is_set($v->{main}) )
334 my $tag = $session->make_element( 'name' );
335 $item->appendChild( $tag );
336 $tag->appendChild( rv( $session, $field, $v->{main} ) );
341 $item->appendChild( rv( $session, $field, $v ) );
349 my( $session, $value ) = @_;
351 my $dom = $session->make_doc_fragment();
353 if( ref($value) eq 'HASH' )
355 foreach my $key (keys %$value)
357 if( defined($value->{$key}) and $value->{$key} ne '' )
359 my $el = $session->make_element( $key );
360 $dom->appendChild( $el );
361 $el->appendChild( export_hashref( $session, $value->{$key} ) );
365 elsif( defined($value) )
367 $dom->appendChild( $session->make_text( $value ) );
375 my( $session, $name, $value ) = @_;
377 my $dom = $session->make_element( $name );
379 if( ref($value) eq 'ARRAY' )
381 foreach my $v ( @$value )
383 my $item = $session->make_element( 'item' );
384 $dom->appendChild( $item );
385 if( ref($v) eq 'HASH' )
387 foreach my $key (keys %$v)
389 my $el = $session->make_element( $key );
390 $item->appendChild( $el );
391 $el->appendChild( export_hashref($session, $v->{$key}) );
396 $item->appendChild( $session->make_text( $v ) );
400 elsif( defined( $value ) )
402 $dom->appendChild( $session->make_text( $value ) );
410 my( $session, $dataset, $item ) = @_;
414 print STDERR int(100*$DONE/$TOTAL) . " \% " . $item->get_id() . " \r" if $opt_verbose;
416 my $eprint = $session->make_element( 'eprint', xmlns => $XMLNS );
418 $eprint->appendChild( $session->make_element( 'eprint_status' ))
419 ->appendChild( $session->make_text( $dataset->id ));
421 foreach my $field ( $dataset->get_fields )
423 my $name = $field->get_name;
424 next if $name eq "fileinfo";
425 next if $name eq "date_issue";
426 next if $name eq "date_effective";
427 next if $name eq "date_sub";
428 next if $name eq "dir";
429 next if $name eq "month";
430 next if $name eq "year";
431 my $value = $item->get_value( $name );
432 next unless EPrints::Utils::is_set $value;
434 print STDERR "Adding field: $name\n" if $opt_verbose > 1;
436 $eprint->appendChild( export_value( $session, $field, $value ) );
439 print STDERR "Processing date fields\n" if $opt_verbose > 1;
443 if( $dataset->has_field( "year" ) && $item->is_set( "year" ) )
445 $date = $item->get_value( "year" );
446 if( $dataset->has_field( "month" ) && $item->is_set( "month" ) )
449 jan=>"01", feb=>"02", mar=>"03", apr=>"04",
450 may=>"05", jun=>"06", jul=>"07", aug=>"08",
451 sep=>"09", "oct"=>"10", nov=>"11", dec=>"12",
464 }->{$item->get_value( "month" )};
465 if( !defined $month_num )
467 print STDERR "Warning: unknown month code: '".$item->get_value( "month" )."'\n";
469 $date .= "-".$month_num;
471 $date_type = "published";
473 if( $dataset->has_field( "date_sub" ) && $item->is_set( "date_sub" ) )
475 $date = $item->get_value( "date_sub" );
476 $date_type = "submitted";
478 if( $dataset->has_field( "date_issue" ) && $item->is_set( "date_issue" ) )
480 $date = $item->get_value( "date_issue" );
481 $date_type = "published";
483 if( $date eq "" && $dataset->has_field( "date_effective" ) && $item->is_set( "date_effective" ) )
485 $date = $item->get_value( "date_effective" );
486 $date_type = "published";
488 $eprint->appendChild( $session->make_element( 'date' ) )
489 ->appendChild( $session->make_text( $date ) );
490 $eprint->appendChild( $session->make_element( 'date_type' ) )
491 ->appendChild( $session->make_text( $date_type ) );
493 print STDERR "Processing documents\n" if $opt_verbose > 1;
495 my $documents = $eprint->appendChild( $session->make_element( 'documents' ) );
497 my @docs = $item->get_all_documents;
499 print STDERR "Got ".@docs." documents\n" if $opt_verbose > 2;
501 foreach my $doc ( @docs )
503 my $document = $documents->appendChild( $session->make_element( 'document' ) );
504 my $docid = $doc->get_id;
505 $docid=~m/^(\d+)-(\d+)$/;
508 print STDERR "Processing document $pos\n" if $opt_verbose > 2;
510 $document->appendChild( $session->make_element( 'eprintid' ) )
511 ->appendChild($session->make_text($doc->get_value( 'eprintid' )));
513 my $format = $doc->get_value( 'format' ) || 'other';
514 if( exists $FORMAT_MAPPING{$format} )
516 $format = $FORMAT_MAPPING{$format};
518 $document->appendChild( $session->make_element( 'format' ) )
519 ->appendChild($session->make_text($format));
521 $document->appendChild( $session->make_element( 'language' ) )
522 ->appendChild($session->make_text($doc->get_value( 'language' )||''));
523 my $security = $doc->get_value( "security" ) || "public";
524 $document->appendChild( $session->make_element( 'security' ) )
525 ->appendChild($session->make_text($security));
526 $document->appendChild( $session->make_element( 'main' ) )
527 ->appendChild($session->make_text($doc->get_value( 'main' )||''));
528 $document->appendChild( $session->make_element( 'pos' ) )
529 ->appendChild($session->make_text($pos));
531 my $files = $document->appendChild( $session->make_element( 'files' ) );
533 my %filenames = $doc->files;
534 print STDERR "Contains ".scalar(keys(%filenames))." files\n" if $opt_verbose > 2;
536 # No files in this document, destroy it (something odd happened)
537 if( scalar(keys %filenames) == 0 )
539 $documents->removeChild( $document );
543 foreach my $filename ( keys %filenames )
545 my $file = $files->appendChild( $session->make_element( 'file' ) );
547 $file->appendChild($session->make_element( 'filename' ))
548 ->appendChild($session->make_text( $filename ));
549 my $fullpath = $doc->local_path."/".$filename;
550 $file->appendChild($session->make_element( 'data',
551 'href' => "file://" . $fullpath ));
556 # In eprints.soton we have multiple isbns, which are a compound of isbn and
557 # cover. There are some legacy records with a single isbn which we'll
558 # resurrect if isbns isn't set
560 # print STDERR "Processing ISBNs\n" if $opt_verbose > 1;
562 # if( $dataset->has_field( "isbns" ) and $item->is_set( "isbns" ) )
564 # my $values = $item->get_value( "isbns" );
565 # if( defined $values )
570 # isbn => $_->{main},
571 # cover => ((defined($_->{id}) and $_->{id} ne '') ? $_->{id} : 'unspecified'),
574 # $eprint->appendChild( export_dataobj( $session, "isbns" , $values ) );
577 # elsif( $item->is_set( "isbn" ) )
579 # my $value = $item->get_value( "isbn" );
582 # cover => 'unspecified'
584 # $eprint->appendChild( export_dataobj( $session, "isbns", [$value] ));
587 # In eprints 3 issns will be flagged as electronic or paper (another
590 # print STDERR "Processing ISSN\n" if $opt_verbose > 1;
592 # if( $dataset->has_field( "issn" ) and $item->is_set( "issn" ) )
594 # my $value = $item->get_value( "issn" );
595 # $eprint->appendChild( export_dataobj( $session, "issns" , [ { issn => $value, cover => 'unspecified' } ] ) );
598 # More fields being turned into compounds
600 # print STDERR "Processing exhibition_eventlocdate\n" if $opt_verbose > 1;
602 # if( $dataset->has_field( "exhibition_eventlocdate" ) and $item->is_set( "exhibition_eventlocdate" ) )
604 # my $values = $item->get_value( "exhibition_eventlocdate" );
605 # if( defined $values )
609 # my( $date, $venue ) = split /\|/, $_, 2;
616 # $eprint->appendChild( export_dataobj( $session, "venue_date", $values ) );
620 # In eprints.soton we store the staff id for all RAE-returnable fields (or,
621 # if not a member of staff, 'internal', 'external' or 'unknown'). In
622 # eprints 3 this is obviously a compound field, whereas in 2 it was two
623 # fields that were kept synchronised.
624 # (We didn't use the id part in eprints 2, because we don't want users to be
625 # able to directly edit the staff id bit)
627 # foreach my $namefield (qw( creators editors exhibitors ))
629 # print STDERR "Processing $namefield\n" if $opt_verbose > 1;
631 # if( $dataset->has_field( $namefield ) and $item->is_set( $namefield ) )
633 # my $names = $item->get_value( $namefield );
634 # my $staffids = $item->get_value( $namefield."_empid" ) || [];
640 # name => $_->{main},
641 # staffid => 'unknown',
645 # for(my $i = 0; $i < @$staffids; $i++)
647 # if( $staffids->[$i] ne '' )
649 # $names->[$i]->{staffid} = $staffids->[$i];
653 # $eprint->appendChild( export_dataobj( $session, $namefield, $names ));
657 # Check that our output is valid utf8, otherwise we'll have trouble parsing
658 # it (and import is much, much slower than export)
659 # You might want to modify this to automatically replace bad chars with a
660 # '?' or similar, but it's probably better to manually inspect and fix
663 my $xml = $eprint->toString();
664 $xml =~ s/\xe2\x80\x3f/$UTF8_QUOTE/sg; # Fix word's bespoke quote for Unicode
665 # $xml =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;
667 unless( check_utf8($xml, \$error) )
669 if( defined($SKIPLOG) )
671 print $SKIPLOG $item->{dataset}->{id} . "\t" . $item->get_id . "\t$error\n";
675 print STDERR "Fix invalid utf8 in eprint " . $item->get_id . " (or use the skiplog argument to log all unexportable eprints): $error\n";
681 # inject the base64-encoded files
684 print STDERR "Injecting base64 encoded files\n" if $opt_verbose > 1;
685 # locate the <data></data> fields
686 my( $pre, @files ) = split /(<data[^>]+(?:>\s*<\/\s*data\s*>|\/>))/, $xml;
687 @files = grep { length($_) } @files; # remove the tween bits
688 my $post = pop @files;
691 foreach my $data (@files)
693 ($data) = EPrints::XML::parse_xml_string( $data )->getElementsByTagName( 'data' );
694 print $fh "<data encoding=\"base64\">";
695 my $url = $data->getAttribute( 'href' );
696 $url =~ s/^file:\/\///;
697 write_base64_file( $fh, $url );
698 print $fh "</data>\n";
700 print $fh $post if defined $post;
704 print $fh $xml . "\n";
707 print STDERR "Done Processing Eprint: " . $item->get_id . "\n" if $opt_verbose > 1;
710 # Handle name fields correctly (should this include id???)
714 my( $session, $field, $value ) = @_;
716 my $dom = $session->make_doc_fragment;
718 if( $field->is_type( "name" ) )
720 foreach my $p ( qw/ family given lineage honourific / )
722 next if !EPrints::Utils::is_set( $value->{$p} );
723 my $tag = $session->make_element( $p );
724 $dom->appendChild( $tag );
725 $tag->appendChild( $session->make_text( $value->{$p} ) );
730 $dom->appendChild( $session->make_text( $value ) );
736 # write a $filename to $out in base64 encoding
738 sub write_base64_file
740 my( $out, $filename ) = @_;
742 binmode($out, ":via(Base64)");
743 open(my $fh, "<", $filename) or die "Unable to open $filename: $!\n";
745 while(read($fh, my $buffer, 4096))
750 binmode($out, ":pop");
753 # fill $error with the locations of bad chars in $bytes
754 # returns true if the string is ok
758 my( $bytes, $error ) = @_;
760 return 1 unless $AVAILABLE{ Encode };
766 my $str = Encode::decode("utf8", $bytes, eval 'Encode::FB_QUIET');
769 $str =~ s/^.+(.{40})$/... $1/s;
770 $$error .= "Bad char '$str'<--HERE!!! ";
771 while( length($bytes) and ord(substr($bytes, 0, 1)) > 0x80 )
773 substr($bytes, 0, 1) = '';
776 } while( length($bytes) and $max_errors-- );
778 return length($$error) == 0;
783 my( $name, $msg ) = @_;
789 print STDERR "$msg\n";
790 return $AVAILABLE{$name} = 0;
794 return $AVAILABLE{$name} = 1;
799 <eprintid>1</eprintid>
800 <rev_number>11</rev_number>
801 <eprint_status>buffer</eprint_status>
803 <dir>disk0/00/00/00/01</dir>
804 <lastmod>2006-12-18 17:11:56</lastmod>
805 <status_changed>2006-12-18 17:11:56</status_changed>
807 <metadata_visibility>show</metadata_visibility>
808 <fileinfo>http://files3.eprints.org/style/images/fileicons/html.png;http://files3.eprints.org/1/1/versions.txt</fileinfo>
809 <license>Other</license>
811 <document xmlns="http://eprints.org/ep3/data/3.0">
813 <rev_number>3</rev_number>
814 <eprintid>1</eprintid>
816 <format>html</format>
817 <language>en</language>
818 <security>validuser</security>
819 <main>versions.txt</main>
822 <filename>versions.txt</filename>
823 <filesize>3515</filesize>
824 <url>http://files3.eprints.org/1/1/versions.txt</url>