file import works
[koha-eprints] / tsv2eprints.pl
1 #!/usr/bin/perl -I /usr/share/eprints3/perl_lib
2 use warnings;
3 use strict;
4 use autodie;
5 use utf8;
6
7 # export single record to get structure:
8 # sudo -u eprints /usr/share/eprints3/bin/export snz archive XMLFiles 20 > /tmp/20.xml
9
10 # Import procedure:
11
12 # 3. import with:
13 # sudo -u eprints /usr/share/eprints3/bin/import --verbose --migration --enable-file-imports --update --enable-import-fields grf eprint XML /tmp/xml
14 #
15 # 4. re-run view generation
16 # sudo -u eprints /usr/share/eprints3/bin/generate_views grf --verbose
17
18 use Encode;
19 use Data::Dump qw(dump);
20 use Storable;
21
22 =for eprints-api
23
24 use EPrints;
25 my $institution = 'Grafički fakultet';
26
27 my $ep = EPrints->new();
28 my $repo = $ep->repository( 'grf' );
29 $repo->{config}->{enable_file_imports} = 1;
30 $repo->{config}->{enable_web_imports} = 1;
31
32 my $dataset = $repo->dataset( 'eprint' );
33 my $list = $dataset->search;
34 my $count = $list->count;
35 warn "# found [$count] eprints\n";
36
37 my $eprint = $dataset->dataobj( 21 );
38 warn dump( $eprint->get_value('institution'), $institution );
39
40 warn dump( $eprint );
41
42 if ( $eprint->get_value( 'institution' ) ne $institution ) {
43         $eprint->set_value( 'institution' => $institution );
44         $eprint->save_revision();
45         $eprint->commit();
46 }
47
48 $repo->terminate();
49
50 =cut
51
# Next eprint id to hand out; incremented by dump_item for every record emitted.
our $eprintid = 1;

# Index of discovered PDF files: file id => full path.
# The file id is the leading run of digits of the file name when it has one,
# otherwise the whole file name without the .pdf extension.
my $files = {};

my $mkp_path = "/mnt/share/MKP/ELEKTRONIČKI DOKUMENTI/EL.DOKUMENTI PO BIBLIOBROJU/do18052015/";
my $koha_path = "/tmp/koha_ffzg";

# List-form pipe open: find is executed directly, without a shell, so spaces
# and shell metacharacters in the path need no quoting.
open(my $fh, '-|', 'find', $mkp_path, '-iname', '*.pdf');
binmode $fh, ':encoding(UTF-8)';
while(my $full_path = <$fh>) {
        chomp $full_path;

        # Anchor at the end of the path so that a directory whose name merely
        # contains ".pdf" is not mistaken for the file name.
        next unless $full_path =~ m{/([^/]+)\.pdf\z}i;
        my $file = $1;

        my $file_id = $file =~ m/^(\d+)/ ? $1 : $file;

        warn "# $file_id\t$full_path\n";
        # A later file with the same id replaces an earlier one.
        $files->{ $file_id } = $full_path;

}

warn "# got ", scalar keys %$files, " files\n";

# Persist the index next to the Koha export.
store $files, "$koha_path.biblionumber.file";

my $stat;
my $this_id = '';
my $item;

binmode STDOUT, ":utf8";
86
# Escape the characters that are significant in XML character data.
# Undefined input becomes the empty string, so an absent field produces an
# empty element instead of an "uninitialized value" warning.
sub _xml_escape {
        my ($text) = @_;
        return '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        return $text;
}

# Return repetition $index (default 0) of field $tag from $item, or nothing
# when the field is absent. Unlike $item->{$tag}->[0] this never autovivifies
# an empty field into $item. Call in scalar context.
sub _field_value {
        my ($item, $tag, $index) = @_;
        my $values = $item->{$tag};
        return unless ref($values) eq 'ARRAY';
        return $values->[ $index || 0 ];
}

# Remove the entry for $file_id from the file-level $files index and return
# its path; return nothing when there is no such entry.
sub _claim_file {
        my ($file_id) = @_;
        return unless defined $file_id && exists $files->{ $file_id };
        return delete $files->{ $file_id };
}

# Print one <eprint> XML element for a parsed record.
#
# $item is a hash reference keyed by field tag (200, 210, 215, 300, 610, 700,
# 990, 991), each value an array reference of strings. Field 200 is required;
# the sub dies without it. Returns early when called without an item.
#
# Side effects: adds mentor, autor, tip, title and (when a PDF is matched)
# full_path to $item, removes the matched entry from $files, increments the
# global $eprintid, writes XML to STDOUT and diagnostics to STDERR.
sub dump_item {
        my $item = shift || return;

        my $f200 = _field_value($item, 200) || die "no 200 in ",dump($item);

        # Field 200 is peeled apart from the right:
        #   title : tip / autor ; mentor
        if ( $f200 =~ s/\s*;\s*([^;]+?)$//i ) {
                $item->{mentor} = $1;
                # NOTE(review): "rada" is mandatory in this pattern, so a bare
                # "voditelj" prefix is left in place - confirm that is intended.
                $item->{mentor} =~ s/^\s*voditelji?\s*(?:rada)\s*//i;
        } else {
                warn "MISSING ; voditelj [$f200]\n";
        }

        if ( $f200 =~ s{\s*/\s*([^/]+?)$}{} ) {
                $item->{autor} = $1;
        } else {
                warn "MISSING / autor [$f200]\n";
        }

        if ( $f200 =~ s{\s*:\s*([^:]+?)$}{} ) {
                $item->{tip} = lc($1);
        } else {
                warn "MISSING : tip [$f200]\n";
        }

        $item->{title} = $f200;

        # Locate the PDF: first by field 991 as-is, then by 991 with the zeros
        # between a word character and the following digit removed.
        my $file_id = _field_value($item, 991);
        if ( defined $file_id ) {
                my $full_path = _claim_file($file_id);
                if ( ! defined $full_path && $file_id =~ s/(\w)0*(\d)/$1$2/ ) {
                        $full_path = _claim_file($file_id);
                }
                $item->{full_path} = $full_path if defined $full_path;
        }

        # Last resort: a file named "<field 300> <field 700>" without dots and commas.
        if ( ! exists $item->{full_path} ) {
                my $name_id = ucfirst( ( _field_value($item, 300) // '' ) . ' ' . ( _field_value($item, 700) // '' ) );
                $name_id =~ s/[\.\,]//g;
                my $full_path = _claim_file($name_id);
                $item->{full_path} = $full_path if defined $full_path;
        }

        warn "MISSING file for $eprintid\n" unless exists $item->{full_path};

        warn "# item ",dump($item);

        my $eprint = {
                eprintid => $eprintid++
        };

        if ( defined $item->{full_path} && $item->{full_path} =~ m{/([^/]+)$} ) {
                $eprint->{filename} = $1;
        }
        $eprint->{full_path} = $item->{full_path};

        my $f210 = _field_value($item, 210);
        $eprint->{date} = $1 if defined $f210 && $f210 =~ m/\$d(\d+)/;

        my $f215 = _field_value($item, 215);
        $eprint->{pages} = $1 if defined $f215 && $f215 =~ m/^(\d+)/;

        # First 700 is the author as "family, given".
        my $creator = _field_value($item, 700);
        if ( defined $creator ) {
                ( $eprint->{creators_family}, $eprint->{creators_given} ) = split(/,\s*/, $creator );
        }

        $eprint->{title} = $item->{title};

        $eprint->{keywords} = join(", ", @{ $item->{610} }) if exists $item->{610};

        # Mentor: second 700 ("family, given") wins over the name taken from
        # field 200, which is written "given family".
        my $mentor_700 = _field_value($item, 700, 1);
        if ( defined $mentor_700 ) {
                ( $eprint->{thesis_mentor_family}, $eprint->{thesis_mentor_given} ) = split(/,\s*/, $mentor_700 );
        } elsif ( $item->{mentor} ) {
                # NOTE(review): only the first two words are kept, so titles or
                # multi-word names in field 200 are truncated - confirm.
                ( $eprint->{thesis_mentor_given}, $eprint->{thesis_mentor_family} ) = split(/\s+/, $item->{mentor} );
        }

        # fix spaces between dash in double surname
        if ( defined $eprint->{thesis_mentor_family} ) {
                $eprint->{thesis_mentor_family} =~ s/(\S+)\s*-\s*(\S+)/$1-$2/;
        }

        my $f990 = _field_value($item, 990);
        $eprint->{thesis_callnumber} = $f990;
        $eprint->{thesis_invnumber} = _field_value($item, 991); # FIXME?

        # Fall back to the year embedded in the call number for records without a date.
        if ( ! $eprint->{thesis_date} && defined $f990 && $f990 =~ m{/(\d\d\d\d)/} ) {
                $eprint->{thesis_date} = $1;
        }

        if ( ! $eprint->{date} ) {
                $eprint->{date} = $eprint->{thesis_date};
        }

        warn "# eprint ",dump($eprint);

        # NOTE(review): $eprint->{type} is never assigned in this sub, so <type>
        # is emitted empty; $item->{tip} holds the lowercased type text from
        # field 200 - confirm which EPrints type value is wanted here.
        my %xml = map { $_ => _xml_escape( $eprint->{$_} ) } qw(
                eprintid filename full_path type
                creators_family creators_given title keywords date pages
                thesis_date thesis_callnumber thesis_invnumber
                thesis_mentor_family thesis_mentor_given
        );

        print qq|

  <eprint>
    <eprintid>$xml{eprintid}</eprintid>

        |;

        # The <documents> section is only emitted when a PDF was matched.
        if ( $eprint->{full_path} ) {
                print qq|

    <documents>
      <document>

        <files>
          <file>
            <datasetid>document</datasetid>
            <filename>$xml{filename}</filename>
            <mime_type>application/pdf</mime_type>
            <url>file://$xml{full_path}</url>
          </file>
        </files>
        <mime_type>application/pdf</mime_type>
        <format>application/pdf</format>
        <language>hr</language>
        <security>validuser</security>
        <main>$xml{filename}</main>
      </document>
    </documents>

                |;
        }
        print qq|

    <eprint_status>archive</eprint_status>
    <type>$xml{type}</type>
    <metadata_visibility>show</metadata_visibility>
    <creators>
      <item>
        <name>
          <family>$xml{creators_family}</family>
          <given>$xml{creators_given}</given>
        </name>
      </item>
    </creators>
    <title>$xml{title}</title>
    <ispublished>unpub</ispublished>
    <subjects>
      <item>2.06</item>
    </subjects>
    <full_text_status>restricted</full_text_status>
    <keywords>$xml{keywords}</keywords>
    <date>$xml{date}</date>
    <date_type>completed</date_type>
    <pages>$xml{pages}</pages>
    <institution>Grafički fakultet</institution>
<!--
    <department>strojevi</department>
-->
    <thesis_date>$xml{thesis_date}</thesis_date>
    <thesis_callnumber>$xml{thesis_callnumber}</thesis_callnumber>
    <thesis_invnumber>$xml{thesis_invnumber}</thesis_invnumber>
    <thesis_mentor>
      <name>
        <family>$xml{thesis_mentor_family}</family>
        <given>$xml{thesis_mentor_given}</given>
      </name>
    </thesis_mentor>
  </eprint>

        |;

}
246
# Open the XML envelope; the matching </eprints> is printed after the loop.
print qq{<?xml version="1.0" encoding="utf-8" ?>
<eprints>
};


open(my $tsv_fh,   '<:encoding(UTF-8)', "$koha_path.tsv");
# NOTE(review): the .marc file is opened (so a missing file is fatal under
# autodie) but nothing below reads from it yet.
open(my $tsv_marc, '<:encoding(UTF-8)', "$koha_path.marc");

# One record per line: offset <TAB> biblionumber <TAB> title
while(my $line = <$tsv_fh>) {
        $line =~ s/[\n\r]+$//;

        # Split the stripped copy so that $title does not keep the line ending.
        my ($offset, $biblionumber, $title) = split(/\t/,$line,3);

        if ( ! defined $biblionumber ) {
                warn "# skipping malformed line $.: [$line]\n";
                next;
        }
        $title = '' unless defined $title;

        warn "# $offset $biblionumber $title\n";

        # Leave the loop instead of exiting, so the closing </eprints> tag and
        # the final statistics are still written when LAST cuts the run short.
        last if $ENV{LAST} && $eprintid >= $ENV{LAST};

        # NOTE(review): $item is only used for the file lookup here; it is
        # never handed to dump_item in this loop.
        my $item;

        if ( $item->{full_path} = $files->{$biblionumber} ) {
                $stat->{file}++;
        } else {
                $stat->{missing}++;
        }

}

close($tsv_marc);
close($tsv_fh);

print qq{
</eprints>
};


warn "# files left ", dump($files);

warn "# stat ", dump($stat);