basic parser for Aleph html at NSK
authordpavlin <dpavlin@ae73d1a6-5fa4-44a9-8f13-f281fb455051>
Fri, 22 Oct 2010 20:25:51 +0000 (20:25 +0000)
committerdpavlin <dpavlin@ae73d1a6-5fa4-44a9-8f13-f281fb455051>
Fri, 22 Oct 2010 20:25:51 +0000 (20:25 +0000)
git-svn-id: svn+ssh://llin.lib/home/dpavlin/private/svn/Z3950-HTML-Scraper@11 ae73d1a6-5fa4-44a9-8f13-f281fb455051

Aleph.pm [new file with mode: 0644]
t/2-Aleph.t [new file with mode: 0755]

diff --git a/Aleph.pm b/Aleph.pm
new file mode 100644 (file)
index 0000000..953fda7
--- /dev/null
+++ b/Aleph.pm
@@ -0,0 +1,164 @@
+package Aleph;
+
+use warnings;
+use strict;
+
+use WWW::Mechanize;
+use MARC::Record;
+use Data::Dump qw/dump/;
+
+binmode STDOUT, ':utf8';
+
+our $mech = WWW::Mechanize->new();
+our $hits;
+
+sub diag {
+       print "# ", @_, $/;
+}
+
+# Koha Z39.50 query:
+#
+# Bib-1 @and @and @and @and @and @and @and @or
+# @attr 1=8 isbn-issn 
+# @attr 1=7 isbn-issn 
+# @attr 1=4 title 
+# @attr 1=1003 author 
+# @attr 1=16 dewey 
+# @attr 1=21 subject-holding 
+# @attr 1=12 control-no 
+# @attr 1=1007 standard-id 
+# @attr 1=1016 any
+
+# LCC - Klasifikacija Kongresne knjižnice 
+# LCN - Signatura Kongresne knjižnice
+# DDC - Deweyjeva klasifikacija 
+# TIT - Naslovi 
+# AUT - Autori 
+# IMP - Impresum
+# SUB - Predmetnice
+# SRS - Nakladnička cjelina 
+# LOC - Lokacija 
+# WRD - Riječi 
+# WTI - Riječi u polju naslova 
+# WAU - Riječi u polju autora 
+# WPE - Riječi u polju individualnog autora 
+# WCO - Riječi u polju korporativnog autora 
+# WME - Riječi u polju sastanka 
+# WUT - Riječi u polju jedinstvenog naslova 
+# WPL - Riječi u polju mjesta izdavanja 
+# WPU - Riječi u polju nakladnika 
+# WSU - Riječi u polju predmetnica 
+# WSM - Riječi u predmetnicama MeSH-a 
+# WST - Riječi u polju status
+# WGA - Riječi u geografskim odrednicama 
+# WYR - Godina izdavanja
+
+our $usemap = {
+#      8               => '',
+#      7               => '',
+       4               => 'WTI',
+       1003    => 'WTI',
+       16              => 'CU',
+       21              => 'SU',
+#      12              => '',
+#      1007    => '',
+#      1016    => '',
+
+};
+
+sub usemap {
+       my $f = shift || die;
+       $usemap->{$f};
+}
+
+sub search {
+       my ( $self, $query ) = @_;
+
+       die "need query" unless defined $query;
+
+       my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000);
+       # fake JavaScript code on page which creates random session
+
+diag "get $url";
+
+       $mech->get( $url );
+
+diag "advanced search";
+
+       $mech->follow_link( url_regex => qr/find-c/ );
+
+diag "submit search $query";
+
+       $mech->submit_form(
+               fields => {
+                       'ccl_term' => $query,
+               },
+       );
+
+       $hits = 0;
+       if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) {
+               $hits = $1;
+               $hits = $2 if $2 && $2 < $1; # correct for max. results
+       } else {
+               diag "get't find results in ", $mech->content;
+               return;
+       }
+
+diag "got $hits results, get first one";
+
+       $mech->follow_link( url_regex => qr/set_entry=000001/ );
+
+diag "in MARC format";
+
+       $mech->follow_link( url_regex => qr/format=001/ );
+}
+
+
+sub next_marc {
+       my ($self,$format) = @_;
+
+print $mech->content;
+
+       if ( $mech->content =~ m{Zapis\s+(\d+)}s ) {
+
+               my $nr = $1;
+
+diag "parse $nr";
+
+               my $html = $mech->content;
+               my $hash;
+               $html =~ s|<tr>\s*<td class=td1 id=bold[^>]*>(.+?)</td>\s*<td class=td1>(.+?)</td>|$hash->{$1} = "$2";|ges;
+               diag dump($hash);
+
+               my $id = $hash->{SYS} || die "no SYS";
+
+die;
+
+               my $marc = MARC::Record->new;
+
+#              $marc->add_fields( $f, $i1, $i2, @{ $out->{$f} } );
+
+               my $path = "marc/$id.$format";
+
+               open(my $out, '>:utf8', $path);
+               print $out $marc->as_usmarc;
+               close($out);
+
+               diag "created $path ", -s $path, " bytes";
+
+               diag $marc->as_formatted;
+
+               $nr++;
+
+               die if $nr == 3; # FIXME
+
+               $mech->follow_link( url_regex => qr/set_entry=0*$nr/ );
+
+               return $marc->as_usmarc;
+       } else {
+               die "can't fetch COMARC format from ", $mech->content;
+       }
+
+}
+
+1;
diff --git a/t/2-Aleph.t b/t/2-Aleph.t
new file mode 100755 (executable)
index 0000000..42e29bf
--- /dev/null
@@ -0,0 +1,12 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use Test::More tests => 3;
+
+use_ok 'Aleph';
+
+ok( my $search = Aleph->search( 'WTI=linux' ), 'search' );
+
+ok( my $marc = Aleph->next_marc, 'next_marc' );