use warnings;
use strict;
-use WWW::Mechanize;
use MARC::Record;
use Data::Dump qw/dump/;
-binmode STDOUT, ':utf8';
-
-our $mech = WWW::Mechanize->new();
-our $hits;
+use base 'Scraper';
sub diag {
- print "# ", @_, $/;
+ warn "# ", @_, $/;
}
# Koha Z39.50 query:
#
# Bib-1 @and @and @and @and @and @and @and @or
-# @attr 1=8 isbn-issn
-# @attr 1=7 isbn-issn
# @attr 1=4 title
+# @attr 1=7 isbn
+# @attr 1=8 issn
# @attr 1=1003 author
# @attr 1=16 dewey
# @attr 1=21 subject-holding
# WGA - Riječi u geografskim odrednicama (words in geographic headings)
# WYR - Godina izdavanja (year of publication)
-our $usemap = {
-# 8 => '',
-# 7 => '',
- 4 => 'WTI',
- 1003 => 'WTI',
- 16 => 'CU',
- 21 => 'SU',
-# 12 => '',
+sub usemap {{
+ 4 => 'WTI=',
+ 7 => 'ISBN=',
+ 8 => 'ISSN=',
+ 1003 => 'AUT=',
+ 16 => 'DDC=',
+ 21 => 'SUB=',
+ 12 => 'LCN=',
# 1007 => '',
-# 1016 => '',
-
-};
+ 1016 => 'WRD=',
+}};
-sub usemap {
- my $f = shift || die;
- $usemap->{$f};
-}
+our $session_id;
sub search {
my ( $self, $query ) = @_;
die "need query" unless defined $query;
- my $url = 'http://161.53.240.197:8991/F?RN=' . rand(1000000000);
+ $session_id ||= int rand(1000000000);
+ # FIXME allocate session just once
+ my $url = 'http://katalog.nsk.hr/F?RN=' . $session_id;
# emulate the JavaScript code on the page, which creates a random session number
diag "get $url";
+ my $mech = $self->{mech} || die "no mech?";
$mech->get( $url );
diag "advanced search";
$mech->follow_link( url_regex => qr/find-c/ );
-diag "submit search $query";
+ my $database = $self->{database};
+
+ if ( $mech->content =~ m{Requested library is unavailable at the moment} ) {
+ warn "ERROR: default database not available, try to swith to $database\n";
+ $self->save_content;
+ $mech->follow_link( url_regex => qr/local_base=$database/i );
+ diag "re-try advanced search";
+ $mech->follow_link( url_regex => qr/find-c/ );
+ }
+
+diag "submit search [$query] on ", $self->{database};
+
$mech->submit_form(
fields => {
'ccl_term' => $query,
+ 'local_base' => $self->{database},
},
);
- $hits = 0;
- if ( $mech->content =~ m{ukupno\s+(\d+).*(do\s+(\d+))}s ) {
+ my $hits = 0;
+ if ( $mech->content =~ m{ukupno\s+(\d+).*do\s+(\d+)}s ) { # FIXME Many results in Croatian
$hits = $1;
$hits = $2 if $2 && $2 < $1; # correct for max. results
+ } elsif ( $mech->content =~ m{(\d+)\s+od\s+(\d+)}s ) { # FIXME single result in Croatian
+ $hits = $2;
} else {
diag "get't find results in ", $mech->content;
return;
diag "in MARC format";
$mech->follow_link( url_regex => qr/format=001/ );
+
+ return $self->{hits} = $hits;
}
+our ( $hash, $marc );
+
sub next_marc {
my ($self,$format) = @_;
-print $mech->content;
+ $format ||= 'marc';
- if ( $mech->content =~ m{Zapis\s+(\d+)}s ) {
+ my $mech = $self->{mech} || die "no mech?";
- my $nr = $1;
+#warn "## ", $mech->content;
-diag "parse $nr";
-
- my $html = $mech->content;
- my $hash;
- $html =~ s|<tr>\s*<td class=td1 id=bold[^>]*>(.+?)</td>\s*<td class=td1>(.+?)</td>|$hash->{$1} = "$2";|ges;
- diag dump($hash);
-
- my $id = $hash->{SYS} || die "no SYS";
-
-die;
-
- my $marc = MARC::Record->new;
+ if ( $mech->content =~ m{Zapis\s+(\d+)}s ) {
-# $marc->add_fields( $f, $i1, $i2, @{ $out->{$f} } );
+ my $nr = $1;
- my $path = "marc/$id.$format";
+warn "parse $nr";
- open(my $out, '>:utf8', $path);
- print $out $marc->as_usmarc;
- close($out);
+ $marc = MARC::Record->new;
+ $marc->encoding('utf-8');
+ $hash = {};
- diag "created $path ", -s $path, " bytes";
+ my $html = $mech->content;
- diag $marc->as_formatted;
+ sub field {
+ my ( $f, $v ) = @_;
+ $v =~ s/\Q \E/ /gs;
+ $v =~ s/\s+$//gs;
+warn "## $f\t$v\n";
+ $hash->{$f} = $v;
+
+ if ( $f eq 'LDR' ) {
+ $marc->leader( $v );
+ return;
+ }
+
+ if ( $f =~ m/\D/ ) {
+ warn "$f not numeric!";
+ return;
+ }
+
+ if ( $v !~ s/^\|// ) { # no subfields
+ $marc->add_fields( $f, $v );
+warn "## ++ ", dump( $f, $v );
+ return;
+ }
+
+ my ($i1,$i2) = (' ',' ');
+ ($i1,$i2) = ($2,$3) if $f =~ s/^(...)(.)?(.)?/$1/;
+ my @sf = split(/\|/, $v);
+ @sf = map { s/^(\w)\s+//; { $1 => $_ } } @sf;
+#warn "## sf = ", dump(@sf);
+ $marc->add_fields( $f, $i1, $i2, @sf );
+warn "## ++ ", dump( $f, $i1, $i2, @sf );
+ }
+
+ $html =~ s|<tr>\s*<td class=td1 id=bold[^>]*>(.+?)</td>\s*<td class=td1>(.+?)</td>|field($1,$2)|ges;
+ diag "# hash ",dump($hash);
+ diag "# marc ", $marc->as_formatted;
- $nr++;
+ my $id = $hash->{SYS} || die "no SYS";
- die if $nr == 3; # FIXME
+ $self->save_marc( "$id.marc", $marc->as_usmarc );
- $mech->follow_link( url_regex => qr/set_entry=0*$nr/ );
+ if ( $nr < $self->{hits} ) {
+ $nr++;
+ diag "follow link to next record $nr";
+ $mech->follow_link( url_regex => qr/set_entry=0*$nr/ );
+ }
- return $marc->as_usmarc;
+ return $id;
} else {
die "can't fetch COMARC format from ", $mech->content;
}