From: dpavlin Date: Fri, 22 Oct 2010 20:25:51 +0000 (+0000) Subject: basic parser for Aleph html at NSK X-Git-Url: http://git.rot13.org/?p=Biblio-Z3950.git;a=commitdiff_plain;h=a7a3aa7288f84129fa55dbb7d1fa1cea101fa4cc basic parser for Aleph html at NSK git-svn-id: svn+ssh://llin.lib/home/dpavlin/private/svn/Z3950-HTML-Scraper@11 ae73d1a6-5fa4-44a9-8f13-f281fb455051 --- diff --git a/Aleph.pm b/Aleph.pm new file mode 100644 index 0000000..953fda7 --- /dev/null +++ b/Aleph.pm @@ -0,0 +1,164 @@ +package Aleph; + +use warnings; +use strict; + +use WWW::Mechanize; +use MARC::Record; +use Data::Dump qw/dump/; + +binmode STDOUT, ':utf8'; + +our $mech = WWW::Mechanize->new(); +our $hits; + +sub diag { + print "# ", @_, $/; +} + +# Koha Z39.50 query: +# +# Bib-1 @and @and @and @and @and @and @and @or +# @attr 1=8 isbn-issn +# @attr 1=7 isbn-issn +# @attr 1=4 title +# @attr 1=1003 author +# @attr 1=16 dewey +# @attr 1=21 subject-holding +# @attr 1=12 control-no +# @attr 1=1007 standard-id +# @attr 1=1016 any + +# LCC - Klasifikacija Kongresne knjižnice +# LCN - Signatura Kongresne knjižnice +# DDC - Deweyjeva klasifikacija +# TIT - Naslovi +# AUT - Autori +# IMP - Impresum +# SUB - Predmetnice +# SRS - Nakladnička cjelina +# LOC - Lokacija +# WRD - Riječi +# WTI - Riječi u polju naslova +# WAU - Riječi u polju autora +# WPE - Riječi u polju individualnog autora +# WCO - Riječi u polju korporativnog autora +# WME - Riječi u polju sastanka +# WUT - Riječi u polju jedinstvenog naslova +# WPL - Riječi u polju mjesta izdavanja +# WPU - Riječi u polju nakladnika +# WSU - Riječi u polju predmetnica +# WSM - Riječi u predmetnicama MeSH-a +# WST - Riječi u polju status +# WGA - Riječi u geografskim odrednicama +# WYR - Godina izdavanja + +our $usemap = { +# 8 => '', +# 7 => '', + 4 => 'WTI', + 1003 => 'WTI', + 16 => 'CU', + 21 => 'SU', +# 12 => '', +# 1007 => '', +# 1016 => '', + +}; + +sub usemap { + my $f = shift || die; + $usemap->{$f}; +} + +sub search { + my ( $self, $query ) = @_; + + die "need query" unless defined $query; + + my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000); + # fake JavaScript code on page which creates random session + +diag "get $url"; + + $mech->get( $url ); + +diag "advanced search"; + + $mech->follow_link( url_regex => qr/find-c/ ); + +diag "submit search $query"; + + $mech->submit_form( + fields => { + 'ccl_term' => $query, + }, + ); + + $hits = 0; + if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) { + $hits = $1; + $hits = $2 if $2 && $2 < $1; # correct for max. results + } else { + diag "get't find results in ", $mech->content; + return; + } + +diag "got $hits results, get first one"; + + $mech->follow_link( url_regex => qr/set_entry=000001/ ); + +diag "in MARC format"; + + $mech->follow_link( url_regex => qr/format=001/ ); +} + + +sub next_marc { + my ($self,$format) = @_; + +print $mech->content; + + if ( $mech->content =~ m{Zapis\s+(\d+)}s ) { + + my $nr = $1; + +diag "parse $nr"; + + my $html = $mech->content; + my $hash; + $html =~ s|\s*]*>(.+?)\s*(.+?)|$hash->{$1} = "$2";|ges; + diag dump($hash); + + my $id = $hash->{SYS} || die "no SYS"; + +die; + + my $marc = MARC::Record->new; + +# $marc->add_fields( $f, $i1, $i2, @{ $out->{$f} } ); + + my $path = "marc/$id.$format"; + + open(my $out, '>:utf8', $path); + print $out $marc->as_usmarc; + close($out); + + diag "created $path ", -s $path, " bytes"; + + diag $marc->as_formatted; + + $nr++; + + die if $nr == 3; # FIXME + + $mech->follow_link( url_regex => qr/set_entry=0*$nr/ ); + + return $marc->as_usmarc; + } else { + die "can't fetch COMARC format from ", $mech->content; + } + +} + +1; diff --git a/t/2-Aleph.t b/t/2-Aleph.t new file mode 100755 index 0000000..42e29bf --- /dev/null +++ b/t/2-Aleph.t @@ -0,0 +1,12 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use Test::More tests => 3; + +use_ok 'Aleph'; + +ok( my $search = Aleph->search( 'WTI=linux' ), 'search' ); + +ok( my $marc = Aleph->next_marc, 'next_marc' );