X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=Aleph.pm;h=6a3cf3cc2bde539ad19b130ac57ab54a9b4d462d;hb=e0c3654e9e83473ac0b91857e2bf2987c88fb563;hp=f89aaf7cb9dc2e5c054babaa2166be0c08cf7db9;hpb=1e7d14d016aadb14fea11906d2f41ec2472716c8;p=Biblio-Z3950.git diff --git a/Aleph.pm b/Aleph.pm index f89aaf7..6a3cf3c 100644 --- a/Aleph.pm +++ b/Aleph.pm @@ -3,17 +3,13 @@ package Aleph; use warnings; use strict; -use WWW::Mechanize; use MARC::Record; use Data::Dump qw/dump/; -binmode STDOUT, ':utf8'; - -our $mech = WWW::Mechanize->new(); -our $hits; +use base 'Scraper'; sub diag { - print "# ", @_, $/; + warn "# ", @_, $/; } # Koha Z39.50 query: @@ -53,9 +49,7 @@ sub diag { # WGA - Riječi u geografskim odrednicama # WYR - Godina izdavanja -our $usemap = { -# 8 => '', -# 7 => '', +sub usemap {{ 4 => 'WTI', 1003 => 'WTI', 16 => 'CU', @@ -63,40 +57,42 @@ our $usemap = { # 12 => '', # 1007 => '', # 1016 => '', +}}; -}; - -sub usemap { - my $f = shift || die; - $usemap->{$f}; -} +our $session_id; sub search { my ( $self, $query ) = @_; die "need query" unless defined $query; - my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000); + $session_id ||= int rand(1000000000); + # FIXME allocate session just once + my $url = 'http://161.53.240.197:8991/F?RN=' . $session_id; # fake JavaScript code on page which creates random session diag "get $url"; + my $mech = $self->{mech} || die "no mech?"; $mech->get( $url ); diag "advanced search"; $mech->follow_link( url_regex => qr/find-c/ ); -diag "submit search $query"; +diag "submit search [$query] on ", $self->{database}; + + $self->save_content; $mech->submit_form( fields => { 'ccl_term' => $query, + 'local_base' => $self->{database}, }, ); - $hits = 0; - if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) { + my $hits = 0; + if ( $mech->content =~ m{ukupno\s+(\d+).*do\s+(\d+)}s ) { $hits = $1; $hits = $2 if $2 && $2 < $1; # correct for max. results } else { @@ -111,60 +107,74 @@ diag "got $hits results, get first one"; diag "in MARC format"; $mech->follow_link( url_regex => qr/format=001/ ); + + return $hits; } +our ( $hash, $marc ); + sub next_marc { my ($self,$format) = @_; -print $mech->content; + $format ||= 'marc'; + + my $mech = $self->{mech} || die "no mech?"; + +#warn "## ", $mech->content; if ( $mech->content =~ m{Zapis\s+(\d+)}s ) { my $nr = $1; -diag "parse $nr"; +warn "parse $nr"; - my $marc = MARC::Record->new; + $marc = MARC::Record->new; + $hash = {}; my $html = $mech->content; - my $hash; sub field { my ( $f, $v ) = @_; $v =~ s/\Q \E/ /gs; -warn "# $f\t$v\n"; +warn "## $f\t$v\n"; $hash->{$f} = $v; + + if ( $f eq 'LDR' ) { + $marc->leader( $v ); + return; + } + + if ( $f =~ m/\D/ ) { + warn "$f not numeric!"; + return; + } + + if ( $v !~ s/^\|// ) { # no subfields + $marc->add_fields( $f, $v ); +warn "## ++ ", dump( $f, $v ); + return; + } + my ($i1,$i2) = (' ',' '); ($i1,$i2) = ($2,$3) if $f =~ s/^(...)(.)?(.)?/$1/; my @sf = split(/\|/, $v); - shift @sf; @sf = map { s/^(\w)\s+//; { $1 => $_ } } @sf; -diag "sf = ", dump(@sf); - $marc->add_fields( $f, $i1, $i2, @sf ) if $f =~ m/^\d+$/; +#warn "## sf = ", dump(@sf); + $marc->add_fields( $f, $i1, $i2, @sf ); +warn "## ++ ", dump( $f, $i1, $i2, @sf ); } $html =~ s|\s*]*>(.+?)\s*(.+?)|field($1,$2)|ges; - diag dump($hash); + diag "# hash ",dump($hash); + diag "# marc ", $marc->as_formatted; my $id = $hash->{SYS} || die "no SYS"; - - - my $path = "marc/$id.$format"; - - open(my $out, '>:utf8', $path); - print $out $marc->as_usmarc; - close($out); - - diag "created $path ", -s $path, " bytes"; - - diag $marc->as_formatted; + $self->save_marc( $id, $marc->as_usmarc ); $nr++; - die if $nr == 3; # FIXME - $mech->follow_link( url_regex => qr/set_entry=0*$nr/ ); return $marc->as_usmarc;