From eb4809ffe89cc83a68a1da0c4cb654b81d47e35e Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Thu, 26 Apr 2012 20:18:03 +0200 Subject: [PATCH] Digital Public Library of America http://dp.la/dev/wiki/Item_API --- DPLA.pm | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++++ server.pl | 2 + t/5-DPLA.t | 22 ++++++++ 3 files changed, 187 insertions(+) create mode 100644 DPLA.pm create mode 100755 t/5-DPLA.t diff --git a/DPLA.pm b/DPLA.pm new file mode 100644 index 0000000..31808d6 --- /dev/null +++ b/DPLA.pm @@ -0,0 +1,163 @@ +package DPLA; + +use warnings; +use strict; + +use MARC::Record; +use Data::Dump qw/dump/; +use JSON::XS; +use Encode; + +use base 'Scraper'; + +my $debug = $ENV{DEBUG} || 0; + +sub diag { + warn "# ", @_, $/; +} + +# http://dp.la/dev/wiki/Item_API +# +# Base Fields: Mapping to a set of common terms +# Field name Field description +# dpla.keyword Almost all of a record's fields get copied to this field +# dpla.title The title and/or subtitle of the item. Exact matching. +# dpla.title_keyword The title and/or subtitle of the item. Keyword matching. +# dpla.creator The creator(s), contributor(s), editor(s), etc. of the item. Exact matching +# dpla.creator_keyword The creator(s), contributor(s), editor(s), etc. of the item. Keyword matching +# dpla.date The item's date of publication. +# dpla.description The item's description. This often includes the item's Table of Contents. Exact matching. +# dpla.description_keyword The item's description. This often includes the item's Table of Contents. Keyword matching. +# dpla.subject A catchall for subject information. LCSH, Dewey, and other tag related fields are copied to this field. Exact matching. +# dpla.subject_keyword A catchall for subject information. LCSH, Dewey, and other tag related fields are copied to this field. Keyword matching. +# dpla.publisher The name of the publisher. Exact matching. +# dpla.language The primary language of the item. Exact matching. +# dpla.isbn The item's ISBN. Exact matching. +# dpla.oclc The item's OCLC identifier. Exact matching. +# dpla.lccn The item's LCCN. Exact matching. +# dpla.call_num The item's call number. Exact matching. +# dpla.content_link A link to the item's content. Exact matching. +# dpla.contributor The contributing partner. Exact matching. +# dpla.resource_type The resource's type. Common values include item and collection. Exact matching. + +# Koha Z39.50 query: +# +# Bib-1 @and @and @and @and @and @and @and @or +# @attr 1=4 title +# @attr 1=7 isbn +# @attr 1=8 issn +# @attr 1=1003 author +# @attr 1=16 dewey +# @attr 1=21 subject-holding +# @attr 1=12 control-no +# @attr 1=1007 standard-id +# @attr 1=1016 any + +sub usemap {{ + 4 => 'dpla.title', + 7 => 'dpla.isbn', + 8 => 'dpla.keyword', # XXX fake + 1003 => 'dpla.creator_keyword', +# 16 => '', + 21 => 'dpla.subject', +# 12 => '', +# 1007 => '', + 1016 => 'dpla.keyword', + + RPN => { + And => '&', + Or => '&', # FIXME sigh, not really supported? + }, + prefix_term => sub { + my ( $prefix, $term ) = @_; + return 'filter=' . $prefix . ':' . $term; + } +}}; + +sub search { + my ( $self, $query ) = @_; + + die "need query" unless defined $query; + + my $url = 'http://api.dp.la/v0.03/item/?' . $query; + +diag "get $url"; + + my $mech = $self->mech; + + $mech->get( $url ); + + my $json = decode_json $mech->content; + diag "# json = ", dump($json) if $debug; + + my $hits = 0; + + if ( exists $json->{num_found} ) { + $hits = $json->{num_found}; + } else { + diag "get't find num_found in ", $mech->content; + return; + } + +diag "got $hits results"; + + $self->{_json} = $json; + + return $self->{hits} = $hits; +} + +sub next_marc { + my ($self,$format) = @_; + + $format ||= 'marc'; + + my $item = shift @{ $self->{_json}->{docs} }; + + my $marc = MARC::Record->new; + $marc->encoding('utf-8'); + + my $fields; # empty marc + + foreach my $key ( sort keys %$item ) { + my $v = $item->{$key}; + warn "# item ",dump( $key, $v ) if $debug; + if ( $key =~ m/^(\d\d\d)(\w)$/ ) { + my ($f,$sf) = ($1,$2); + + # XXX do magic and unroll into proper MARC record + + $v = [ $v ] unless ref $v eq 'ARRAY'; + + if ( $fields ) { + if ( $fields->[0]->[0] ne $f ) { + $marc->add_fields( @$fields ); + warn "# add_fields ",dump($fields) if $debug; + $fields = undef; + } + } + foreach my $i ( 0 .. $#$v ) { + $fields->[$i]->[0] = $f; + $fields->[$i]->[1] = ' '; + $fields->[$i]->[2] = ' '; + push @{ $fields->[$i] }, $sf, $v->[$i]; + } + + } else { + warn "# IGNORED: $key ", dump($item->{$key}), "\n"; + } + } + + $marc->add_fields( @$fields ); + + diag "# marc ", $marc->as_formatted; + + my $id = $item->{'dpla.id'} || warn "no dpla.id in ",dump($item); + $self->save_marc( "$id.marc", $marc->as_usmarc ); + + $self->mech->back; # return to search results for next page + + return $id; + +} + +1; diff --git a/server.pl b/server.pl index ea711c5..8262c2a 100755 --- a/server.pl +++ b/server.pl @@ -10,6 +10,7 @@ use Data::Dumper; use Aleph; use GoogleBooks; use vuFind; +use DPLA; use Encode; @@ -21,6 +22,7 @@ my $databases = { 'ZAG01' => 'Aleph', 'GOOGLEBOOKS' => 'GoogleBooks', 'HATHITRUST' => 'vuFind', + 'DPLA' => 'DPLA', }; my $max_records = 3; # XXX configure this diff --git a/t/5-DPLA.t b/t/5-DPLA.t new file mode 100755 index 0000000..da10def --- /dev/null +++ b/t/5-DPLA.t @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +use Test::More tests => 29; + +my $search = join(' ', @ARGV) || 'krleža'; + +use_ok 'DPLA'; + +ok( my $o = DPLA->new(), 'new' ); + +ok( my $hits = $o->search( $o->usemap->{prefix_term}->( 'dpla.keyword' => $search ) ), "search: $search" ); +like $hits, qr/^\d+$/, "hits: $hits"; + +foreach ( 1 .. 25 ) { # > 20 to hit next page + + ok( my $marc = $o->next_marc, "next_marc $_" ); + diag $marc; + +} -- 2.20.1