From 039fc7e9b9d0e7b0efff760bc058bb1b0982a32d Mon Sep 17 00:00:00 2001
From: Dobrica Pavlinusic
Date: Mon, 23 Apr 2012 14:12:08 +0200
Subject: [PATCH] vuFind scraper which fetch marc records directly

---
Review notes (not part of the commit message):
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy of this patch. Leading whitespace in the server.pl context lines
   is a best guess; use "git apply --ignore-whitespace" if those hunks
   are rejected.
 * vuFind.pm differs from the original commit: search resets the record
   queue and has its diagnostic message spelled correctly, next_marc
   stops cleanly when the queue is empty and no longer relies on
   "my $id = $1 if ...". The blob id on the vuFind.pm index line was not
   recomputed.

 server.pl    |   2 ++
 t/4-vuFind.t |  22 +++++++++++++
 t/yaz/vuFind |   5 +++
 vuFind.pm    | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 155 insertions(+)
 create mode 100755 t/4-vuFind.t
 create mode 100644 t/yaz/vuFind
 create mode 100644 vuFind.pm

diff --git a/server.pl b/server.pl
index 8d7494e..40c1ff0 100755
--- a/server.pl
+++ b/server.pl
@@ -9,6 +9,7 @@ use Data::Dumper;
 use COBISS;
 use Aleph;
 use GoogleBooks;
+use vuFind;
 
 use Encode;
 
@@ -19,6 +20,7 @@ my $databases = {
 	'NSK10' => 'Aleph',
 	'ZAG01' => 'Aleph',
 	'GOOGLEBOOKS' => 'GoogleBooks',
+	'HATHITRUST' => 'vuFind',
 };
 
 my $max_records = 3; # XXX configure this
diff --git a/t/4-vuFind.t b/t/4-vuFind.t
new file mode 100755
index 0000000..b5d5809
--- /dev/null
+++ b/t/4-vuFind.t
@@ -0,0 +1,22 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use Test::More tests => 14;
+
+my $search = join(' ', @ARGV) || 'croatia';
+
+use_ok 'vuFind';
+
+ok( my $o = vuFind->new(), 'new' );
+
+ok( my $hits = $o->search( $search ), "search: $search" );
+like $hits, qr/^\d+$/, "hits: $hits";
+
+foreach ( 1 .. 10 ) {
+
+	ok( my $marc = $o->next_marc, "next_marc $_" );
+	diag $marc;
+
+}
diff --git a/t/yaz/vuFind b/t/yaz/vuFind
new file mode 100644
index 0000000..61b005a
--- /dev/null
+++ b/t/yaz/vuFind
@@ -0,0 +1,5 @@
+open localhost:9999/HATHITRUST
+find "Croatia"
+show 1+3
+
+quit
diff --git a/vuFind.pm b/vuFind.pm
new file mode 100644
index 0000000..b05b73f
--- /dev/null
+++ b/vuFind.pm
@@ -0,0 +1,126 @@
+package vuFind;
+
+use warnings;
+use strict;
+
+use MARC::Record;
+use Data::Dump qw/dump/;
+use JSON::XS;
+
+use base 'Scraper';
+
+my $debug = $ENV{DEBUG} || 0;
+
+# Print a diagnostic message on STDERR, prefixed with "# ".
+sub diag {
+	warn "# ", @_, $/;
+}
+
+# Koha Z39.50 query:
+#
+# Bib-1 @and @and @and @and @and @and @and @or
+# @attr 1=4 title
+# @attr 1=7 isbn
+# @attr 1=8 issn
+# @attr 1=1003 author
+# @attr 1=16 dewey
+# @attr 1=21 subject-holding
+# @attr 1=12 control-no
+# @attr 1=1007 standard-id
+# @attr 1=1016 any
+
+# Returns a hash reference mapping the Bib-1 use attributes listed above
+# to search type names (presumably values for the catalog's type=
+# parameter); attributes without a mapping stay commented out.
+sub usemap {{
+	4 => 'title',
+	7 => 'isn',
+	8 => 'isn',
+	1003 => 'author',
+#	16 => '',
+	21 => 'subject',
+#	12 => '',
+#	1007 => '',
+	1016 => 'all',
+}};
+
+# Run a search against the HathiTrust catalog.
+#
+# $query is required (dies with "need query" otherwise) and is appended
+# verbatim to the search URL.
+#
+# Returns the number of hits reported by the result page (also kept in
+# $self->{hits}) and queues the record links found on that page in
+# $self->{records} for next_marc. Returns nothing if the hit count
+# can't be found in the response.
+sub search {
+	my ( $self, $query ) = @_;
+
+	die "need query" unless defined $query;
+
+	# forget records queued by a previous search so that next_marc
+	# never returns results which belong to an older query
+	$self->{records} = [];
+
+	# http://catalog.hathitrust.org/Search/Home?lookfor=croatia%20AND%20zagreb&type=title
+	my $url = 'http://catalog.hathitrust.org/Search/Home?lookfor=' . $query;
+
+diag "get $url";
+
+	$self->mech->get( $url );
+
+	my $hits = 0;
+
+	if ( $self->mech->content =~ m{of\s*(\d+)\s*Results for}s ) {
+		$hits = $1;
+	} else {
+		diag "can't find results in ", $self->mech->content;
+		return;
+	}
+
+diag "got $hits results";
+
+	foreach my $link ( $self->mech->find_all_links( url_regex => qr{/Record/\d+} ) ) {
+		push @{ $self->{records} }, $link->url;
+	}
+
+	return $self->{hits} = $hits;
+}
+
+
+# Fetch the next record queued by search.
+#
+# $format is accepted but not used here (it only gets a 'marc' default).
+#
+# Downloads the record as "<record url>.mrc", hands the response body to
+# save_marc under the name "<id>.marc" and returns the numeric record id.
+# Returns nothing when no queued records are left or when the queued URL
+# carries no record id.
+sub next_marc {
+	my ($self,$format) = @_;
+
+	$format ||= 'marc';
+
+	my $url = shift @{ $self->{records} };
+
+	# all records queued by search are consumed
+	return unless defined $url;
+
+	# "my $id = $1 if ..." has undefined behaviour (perlsyn) and may
+	# leak the id from the previous call, so capture in list context
+	my ( $id ) = $url =~ m{Record/(\d+)};
+
+	unless ( defined $id ) {
+		diag "can't find record id in $url";
+		return;
+	}
+
+	$self->mech->get( $url . '.mrc' );
+
+	$self->save_marc( "$id.marc", $self->mech->content );
+
+	return $id;
+
+}
+
+1;
-- 
2.20.1