download results from ovid page by page
author Dobrica Pavlinusic <dpavlin@rot13.org>
Sat, 19 Sep 2009 22:24:13 +0000 (22:24 +0000)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Sat, 19 Sep 2009 22:24:13 +0000 (22:24 +0000)
git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1293 07558da8-63fa-0310-ba24-9fe276d99e06

bin/ovid-download-results.pl [new file with mode: 0755]

diff --git a/bin/ovid-download-results.pl b/bin/ovid-download-results.pl
new file mode 100755 (executable)
index 0000000..b5b7f21
--- /dev/null
@@ -0,0 +1,97 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# Entry page of the Ovid web interface that the session starts from.
+my $url = 'http://ovidsp.ovid.com/ovidweb.cgi?T=JS&NEWS=n&MODE=ovid&PAGE=main&D=psyh';
+
+# Value typed into the multi-field search form for the 'lo' field.
+our $location = 'Croatia';
+# Page size sent along with the "Next Page" submit; the site offers
+# 5 10 25 50 100
+my $results_per_page = 100;
+
+use WWW::Mechanize;
+use Data::Dump qw(dump);
+use File::Path;
+
+# autocheck makes every failed request die instead of returning quietly.
+# cookie_jar is explicitly overridden with undef -- presumably to run
+# without cookies; TODO confirm this is what the site needs.
+our $mech = WWW::Mechanize->new(
+       autocheck => 1,
+       cookie_jar => undef,
+);
+
+# Counter of saved responses; save_mech uses it to number output files.
+our $step = 0;
+
+# Start every run from an empty output directory.
+my $dir = '/tmp/ovid';
+rmtree $dir if -e $dir;
+# Fail right here if the directory can't be created, rather than later
+# with an unrelated "can't save content" error.
+mkdir $dir or die "can't create $dir: $!";
+
+# Save the current response held by $mech to disk and log what was saved.
+#
+# Arguments:
+#   $path - optional base file name without extension; when omitted (or
+#           false) "$dir/NNNN" is used, NNNN being the incremented $step.
+#
+# Side effects:
+#   - increments the global $step
+#   - writes the response body to "$path.html" when the content type
+#     mentions html, otherwise to "$path.txt"
+#   - writes WWW::Mechanize's dump_all() output to "$dir/NNNN.dump.txt"
+#
+# Dies if the dump file can't be opened or closed.
+sub save_mech {
+       my $path = shift;
+       $step++;
+       my $base_path = sprintf('%s/%04d', $dir, $step);
+       $path ||= $base_path;
+
+       # Use the public accessor rather than reaching into $mech->{ct};
+       # treat a missing content type as "not html".
+       my $ct = $mech->ct;
+       $ct = '' unless defined $ct;
+       $path .= $ct =~ m{html}i ? '.html' : '.txt';
+
+       $mech->save_content( $path );
+       warn "# [$step] $path ", -s $path, " ", $ct, "\n";
+
+       # The dump is always named after the step, even when the caller
+       # supplied its own $path for the content file.
+       my $dump_path = "$base_path.dump.txt";
+       open(my $dump, '>', $dump_path) or die "can't open $dump_path: $!";
+       $mech->dump_all($dump);
+       # Buffered write errors only show up at close time.
+       close($dump) or die "can't close $dump_path: $!";
+}
+
+# Step 1: fetch the start page and keep a copy of it.
+warn "# get $url";
+$mech->get($url);
+save_mech();
+
+# Step 2: follow the link whose URL matches "multifield".
+warn "# multifield";
+$mech->follow_link(url_regex => qr/multifield/);
+save_mech();
+
+# Step 3: fill in the first row of the multi-field form (field selector
+# 'lo', text box set to $location) and submit it.
+warn "# search lo:$location";
+my %search_fields = (
+       'fields001'  => 'lo',
+       'textBox001' => $location,
+);
+$mech->submit_form(
+       form_name => 'sfmultifield',
+       fields    => \%search_fields,
+);
+save_mech();
+
+# Walk the result pages: export the records of the current page, go back
+# to the result list, then submit "Next Page". The loop ends cleanly when
+# the page no longer offers a "Next Page" button.
+while (1) {
+
+       # $mech->form_id( 'nav-results' ); # XXX not supported by older WWW::Mechanize
+       $mech->form_number(3);
+
+       my @records = $mech->find_all_inputs(
+               type => 'radio',
+               name => 'cmRecordSelect',
+       );
+       # Without this guard an empty list would be autovivified below and
+       # reported with the unrelated "All on this page" message.
+       die "no cmRecordSelect radio input found in form 3" unless @records;
+
+       #warn '## records ', dump @records;
+       # Value of the second choice of the cmRecordSelect radio group.
+       my $range = $records[0]->{menu}->[1]->{value} || die "All on this page";
+
+       warn "submit_form save $range";
+       $mech->submit_form(
+               fields => {
+                       'cmRecordSelect' => $range,
+                       'cmFields'       => 'ALL',
+                       'cmFormat'       => 'export',
+                       'saveStrategy'   => 'on',
+                       'jumpstartLink'  => 'on',
+               },
+               button => 'submit:cmsave|1',
+       );
+       # NOTE(review): "$dir.$range" puts this file next to $dir (e.g.
+       # /tmp/ovid.<range>.txt), not inside it -- confirm whether
+       # "$dir/$range" was intended.
+       save_mech "$dir.$range";
+
+       # Return to the result list that the export was started from.
+       $mech->back;
+
+       my @next_page = $mech->find_all_submits( value_regex => qr/Next Page/i );
+       #warn "## next page ", dump @next_page;
+
+       # No "Next Page" submit means there is nothing left to fetch. Stop
+       # here instead of dying on $next_page[-1] of an empty array.
+       unless (@next_page) {
+               warn "# no next page button, done\n";
+               last;
+       }
+
+       # Take the last matching submit button on the page.
+       my $button = $next_page[-1]->{name} || die "next page button?";
+       warn "submit_form next page $button\n";
+       $mech->submit_form(
+               fields => {
+                       results_per_page => $results_per_page, # FIXME doesn't work?
+               },
+               button => $button,
+       );
+       save_mech;
+
+}
+