ignore ssl certificate errors
[Biblio-Z3950.git] / COBISS.pm
index ce1eea0..81314ea 100644 (file)
--- a/COBISS.pm
+++ b/COBISS.pm
@@ -3,19 +3,10 @@ package COBISS;
 use warnings;
 use strict;
 
-use WWW::Mechanize;
 use MARC::Record;
 use Data::Dump qw/dump/;
 
-binmode STDOUT, ':utf8';
-
-sub new {
-    my ( $class ) = @_;
-    my $self = {};
-    bless $self, $class;
-    return $self;
-}
-
+use base 'Scraper';
 
 my $cobiss_marc21 = {
        '010' => { a => [ '020', 'a' ] },
@@ -46,9 +37,9 @@ sub diag {
 # Koha Z39.50 query:
 #
 # Bib-1 @and @and @and @and @and @and @and @or
-# @attr 1=8 isbn-issn 
-# @attr 1=7 isbn-issn 
 # @attr 1=4 title 
+# @attr 1=7 isbn
+# @attr 1=8 issn 
 # @attr 1=1003 author 
 # @attr 1=16 dewey 
 # @attr 1=21 subject-holding 
@@ -56,11 +47,41 @@ sub diag {
 # @attr 1=1007 standard-id 
 # @attr 1=1016 any
 
+
+# AU=  Autor - osoba
+# CB=  Autor - korporacija
+# CL=  Zbirka
+# CP=  Mesto sast./dod. nazivu korp.
+# PP=  Mesto izdanja
+# PU=  Izdavač
+# PY=  Godina izdanja
+# P2=  Zaključna godina izdanja
+# TI=  Naslov
+# TO=  Naslov originala
+# BN=  ISBN
+# SN=  ISSN uz članak
+# SP=  ISSN
+# PN=  Predmetna odrednica - lično ime
+# CS=  Predm. odred. - naziv korporacije
+# DU=  Slobodno oblikovane predm. odred.
+# SU=  Predmetne odrednice - sve
+# AC=  Kod za vrstu autorstva
+# CC=  Kod za vrstu sadržaja
+# CO=  Zemlja/regija izdavanja
+# FC=  Šifra organizacije
+# LA=  Jezik teksta
+# LC=  Kod za književni oblik
+# LO=  Jezik izvornog dela
+# TA=  Kod za predviđene korisnike
+# TD=  Tipologija dok./dela
+# UC=  UDK za pretraživanje
+# KW=  Ključne reči
+
 sub usemap {{
-       8               => 'BN',        # FIXME check
-       7               => 'SN',        # FIXME check
+       7               => 'BN',        # FIXME check
+       8               => 'SP',        # FIXME check
        4               => 'TI',
-       1003    => 'TI',
+       1003    => 'AU',
        16              => 'CU',
        21              => 'SU',
 #      12              => '',
@@ -74,11 +95,13 @@ sub search {
 
        die "need query" unless defined $query;
 
-       my $url = 'http://cobiss.izum.si/scripts/cobiss?ukaz=GETID&lani=en';
+#      my $url = 'http://cobiss.izum.si/scripts/cobiss?ukaz=GETID&lani=en';
+       my $url = 'http://www.cobiss.ba/scripts/cobiss?ukaz=GETID&lani=en';
 
 diag "get $url";
 
-       my $mech = $self->{mech} = WWW::Mechanize->new();
+       my $mech = $self->{mech} || die "no mech?";
+
        my $hits;
        $mech->get( $url );
 
@@ -100,7 +123,7 @@ diag "submit search $query";
 
        $hits = 0;
        if ( $mech->content =~ m{hits:\s*<b>\s*(\d+)\s*</b>}s ) {
-               $hits = $1;
+               $self->{hits} = $hits = $1;
        } else {
                diag "get't find results in ", $mech->content;
                return;
@@ -131,81 +154,77 @@ sub next_marc {
 
        if ( $mech->content =~ m{<pre>\s*(.+?(\d+)\.\s+ID=(\d+).+?)\s*</pre>}s ) {
 
-               my $comarc = $1;
+               my $markup = $1;
                my $nr = $2;
                my $id = $3;
 
-diag "fetch_marc $nr [$id] $format";
+diag "fetch $nr [$id] $format";
 
-               $comarc =~ s{</?b>}{}gs;
-               $comarc =~ s{<font[^>]*>}{<s>}gs;
-               $comarc =~ s{</font>}{<e>}gs;
+               $markup =~ s{</?b>}{}gs;
+               $markup =~ s{<font[^>]*>}{<s>}gs;
+               $markup =~ s{</font>}{<e>}gs;
 
-               open(my $out, '>:utf8', "comarc/$id");
-               print $out $comarc;
-               close($out);
+               $markup =~ s/[\r\n]+\s{5}//gs; # join continuation lines
 
-               print $comarc;
+               $self->save_marc( "$id.xml", $markup );
 
                my $marc = MARC::Record->new;
+               my $comarc = MARC::Record->new;
 
-               foreach my $line ( split(/[\r\n]+/, $comarc) ) {
+               foreach my $line ( split(/[\r\n]+/, $markup) ) {
 
                        if ( $line !~ s{^(\d\d\d)([01 ])([01 ])}{} ) {
                                diag "SKIP: $line";
                        } else {
-                               our @f = ( $1, $2, $3 );
                                $line .= "<eol>";
 
-                               if ( $format eq 'unimarc' ) {
-
-                                       diag dump(@f), "line: $line";
-                                       sub sf_uni {
-                                               warn "sf ",dump(@_);
-                                               push @f, @_;
-                                       }
-                                       $line =~ s{<s>(\w)<e>([^<]+)\s*}{sf_uni($1, $2)}ges;
-                                       diag "f:", dump(@f), " left: |$line|";
-                                       $marc->add_fields( @f );
+                               my ( $f, $i1, $i2 ) = ( $1, $2, $3 );
 
-                               } elsif ( $format eq 'usmarc' ) {
+                               our $marc_map = undef;
+                               our $comarc_map = undef;
+                               our $ignored = undef;
 
-                                       my ( $f, $i1, $i2 ) = @f;
+                               sub sf_parse {
+                                       my ($f,$sf,$v) = @_;
 
-                                       our $out = {};
+                                       $v =~ s/\s+$//;
 
-                                       sub sf_us {
-                                               my ($f,$sf,$v) = @_;
-                                               if ( my $m = $cobiss_marc21->{$f}->{$sf} ) {
-                                                       push @{ $out->{ $m->[0] } }, ( $m->[1], $v );
-                                               }
-                                               return;
+                                       push @{ $comarc_map->{ $f } }, ( $sf, $v );
+                                       if ( my $m = $cobiss_marc21->{$f}->{$sf} ) {
+                                               push @{ $marc_map->{ $m->[0] } }, ( $m->[1], $v );
+                                       } else {
+                                               $ignored->{$f}++;
                                        }
-                                       $line =~ s{<s>(\w)<e>([^<]+)\s*}{sf_us($f,$1, $2)}ges;
+                                       return ''; # fix warning
+                               }
+                               my $l = $line;
+                               $l =~ s{<s>(\w)<e>([^<]+)}{sf_parse($f,$1, $2)}ges;
 
-                                       diag "converted marc21 ",dump( $out );
+                               diag "[$format] $line -> ",dump( $comarc_map, $marc_map ) if $comarc_map;
 
-                                       foreach my $f ( keys %$out ) {
-                                               $marc->add_fields( $f, $i1, $i2, @{ $out->{$f} } );
-                                       }
+                               foreach my $f ( keys %$comarc_map ) {
+                                       $comarc->add_fields( $f, $i1, $i2, @{ $comarc_map->{$f} } );
+                               }
+
+                               foreach my $f ( keys %$marc_map ) {
+                                       $marc->add_fields( $f, $i1, $i2, @{ $marc_map->{$f} } );
                                }
                        }
                }
 
-               my $path = "marc/$id.$format";
-
-               open($out, '>:utf8', $path);
-               print $out $marc->as_usmarc;
-               close($out);
-
-               diag "created $path ", -s $path, " bytes";
-
+               $self->save_marc( "$id.marc", $marc->as_usmarc );
+               $self->save_marc( "$id.unimarc", $comarc->as_usmarc );
                diag $marc->as_formatted;
 
-               $nr++;
-               $mech->follow_link( url_regex => qr/rec=$nr/ );
+               if ( $nr < $self->{hits} ) {
+                       warn "# fetch next result";
+                       $nr++;
+                       $mech->follow_link( url_regex => qr/rec=$nr/ );
+               } else {
+                       warn "# no more results";
+               }
 
-               return $marc->as_usmarc;
+               return $id;
        } else {
                die "can't fetch COMARC format from ", $mech->content;
        }