tsv2xp-xml.pl

   1 #!/usr/bin/perl
   2
   3 # ./tsv2xp-xml.pl 2>/dev/null > ep-diplomski.xml
   4 # sudo -u eprints /usr/share/eprints3/bin/import --verbose --migration ffzg eprint XML ep-diplomski.xml
   5
   6 use warnings;
   7 use strict;
   8
   9 use File::Slurp;
  10 use Data::Dump qw(dump);
  11
  12 my $tsv_file = ( glob 'items.*' )[0];
  13
  14 our $eprintsid = 700;
  15
  16 my $xml = read_file 'ep-xml.xml';
  17 my @files = read_file "files.txt";
  18
  19 my $file2path;
  20 foreach my $full ( @files ) {
  21         chomp $full;
  22         my $file = $1 if $full =~ m{/([^/]+)$};
  23         $file =~ s/\.\w+$//;
  24         $file2path->{ lc $file } = $full;
  25 }
  26 #warn "# file2path ",dump($file2path);
  27
  28 print qq{<?xml version="1.0" encoding="utf-8" ?>
  29 <eprints>
  30 };
  31
  32 my @header;
  33 my $header2col;
  34 my $col = 0;
  35
  36 our @v;
  37 sub interpolate {
  38         my $f = shift;
  39         my $optional = $1 if $f =~ s{(\?)$}{};
  40         my $i = $header2col->{$f};
  41         die "no $f in ", dump( $header2col ) if not defined $i and not $optional;
  42         my $v = $v[$i];
  43         warn "# $f $i = $v\n";
  44         return $v;
  45 }
  46
  47
  48 open(my $tsv, '<', $tsv_file) || die "$tsv_file: $!";
  49 while(<$tsv>) {
  50         chomp;
  51         if ( m/#(.+)/ ) {
  52                 @header = split(/\t/, $1);
  53                 warn "# header ",dump( @header );
  54                 my $i = 0;
  55                 $header2col->{$_} = $col++ foreach @header;
  56                 warn "# header2col ",dump( $header2col );
  57                 next;
  58         }
  59
  60         @v = map { s/\\N//g; $_ } split(/\t/, $_);
  61         warn "# v = ", dump(@v);
  62
  63         my $file = interpolate 'IME FILE-a';
  64         $file =~ s/ //g;
  65         my $full_path;
  66         if ( my $full = $file2path->{ lc $file } ) {
  67                 $full_path = $full;
  68                 warn "# file $file -> $full_path\n";
  69                 $file .= $1 if $full_path =~ m/(\.\w+)$/;
  70         }
  71
  72         my $subject;
  73         if ( $file =~ m/(\w+)/ ) {
  74                 $_ = $1;
  75                 $subject =
  76                         m/bib/i    ? 'IZBIB' :
  77                         m/in[fo]/i ? 'IZDHI' :
  78                         m/muz/i    ? 'IZMUZ' :
  79                         m/arh/i    ? 'IZARH' :
  80                         die "unknown subject: $2";
  81         }
  82
  83         my $c = $col;
  84         $header2col->{'eprintsid'} = $c; $v[$c++] = $eprintsid++;
  85         $header2col->{'file'} = $c; $v[$c++] = $file;
  86         $header2col->{'full_path'} = $c; $v[$c++] = $full_path;
  87         $header2col->{'subject'} = $c; $v[$c++] = $subject;
  88
  89         my $eprints = $xml;
  90         while ( $eprints =~ s/<!-- "(.+?)" -->/interpolate($1)/seg ) {
  91                 warn "# replaced $1\n";
  92         }
  93
  94         $eprints =~ s{<documents>.+</documents>}{<!-- no documents -->}s if ! $full_path;
  95         $eprints =~ s{<subjects>.+</subjects>}{<!-- no subjects -->}s if ! $subject;
  96
  97         print $eprints;
  98 }
  99
 100 print qq{
 101 </eprints>
 102 };
 103