X-Git-Url: http://git.rot13.org/?a=blobdiff_plain;f=bin%2Fisi-download-results.pl;h=6bb37d4c4081385b5089a141fc5dce0bf0d9d16c;hb=27a60b583075e44cc866c9c3a8ab65e2143a4a6b;hp=b5992642107c129fbcc64e3c2d82e06e209c6836;hpb=1cfc969d9fa0655aceaea370a068625dfc6dd5c8;p=webpac2 diff --git a/bin/isi-download-results.pl b/bin/isi-download-results.pl index b599264..6bb37d4 100755 --- a/bin/isi-download-results.pl +++ b/bin/isi-download-results.pl @@ -3,29 +3,36 @@ use warnings; use strict; +use WWW::Mechanize; +use Data::Dump qw(dump); +use File::Path; +use Text::Unaccent; + # Advanced search syntax: # http://images.isiknowledge.com/WOK46/help/WOS/h_advanced_examples.html our $q = 'AD=Croatia'; -my $range_size = 500; +$q = 'AU=BRATKO, D'; +#$q = 'AD=(croat* OR hrvat*)'; + +my $range_size = 100; my $overlap = 3; # between previous and this range -my $skip_results = 1; + +my $results = 1; +my $citations = 0; +my $cited_reference = 0; # html tables +my $citing_articles = 0; # as many files as cited articles + +my $cites_by_year = 0; my $max_cites = 5000; # ISI limit to get cites -if ( 0 ) { - $q = 'TS=psychology AND AD=Croatia'; - $range_size = 50; - $overlap = 0; - $max_cites = 50; -} +$q = unac_string( 'utf-8', join(' ', @ARGV) ) if @ARGV; -use WWW::Mechanize; -use Data::Dump qw(dump); -use File::Path; +warn "QUERY: $q\n"; our $mech = WWW::Mechanize->new( - autocheck => 1, + autocheck => 0, # it dies in reference download with it! cookie_jar => undef, ); @@ -33,8 +40,8 @@ our $step = 0; our @ranges; my $dir = '/tmp/isi/'; -rmtree $dir if -e $dir; -mkdir $dir; +#rmtree $dir if -e $dir; +mkdir $dir unless -d $dir; sub save_mech { my $path = shift; @@ -49,7 +56,7 @@ sub save_mech { } warn "# get session"; -$mech->get( 'http://isiknowledge.com/?DestApp=WOS' ); +$mech->get( 'http://www.webofknowledge.com/?DestApp=WOS' ); save_mech; sub search { @@ -87,24 +94,28 @@ sub get_results { my $to = $from + $range_size; warn "# submit_form results $from - $to\n"; + save_mech; $mech->submit_form( - form_name => 'summary_output_form', + form_name => 'output_form', fields => { - record_select_type => 'range', + 'value(record_select_type)' => 'range', + markFrom => $from, + markTo => $to, + mark_from => $from, mark_to => $to, mark_id => 'WOS', - qo_fields => 'fullrecord', - citedref => 'citedref', + fields_selection => 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS', + filters => 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS CITREF', + fullrec_fields_option => 'CITREF', - save_options => 'plain_text', + save_options => 'fieldtagged', + format => 'saveToFile', - fields => 'Full', - format => 'save', }, - button => 'save', + button => 'saveToFile', ); save_mech; @@ -114,19 +125,25 @@ sub get_results { last; } + if ( $mech->content !~ m{Please wait while your request is processed} ) { + warn "WARNING: expecting processing request"; + } + my $path = "/tmp/isi.$q.$from-$to"; $path .= '.' . $desc if $desc; warn "save $from - $to into $path\n"; - $mech->follow_link( url_regex => qr/save_file/ ); + $mech->submit_form( + form_name => 'etsForm', + ); save_mech $path; $from += $range_size - $overlap; $mech->back; $mech->back; - #save_mech; + save_mech; } } @@ -170,10 +187,17 @@ sub years { my @y = sort keys %$years; + @ranges = (); + + if ( $cites_by_year ) { + push @ranges, [ $_ ] foreach @y; + warn "# cites_by_year ranges ", dump @ranges; + return; + } + my $y = shift @y; my $size = $years->{$y}; - @ranges = (); my $cites_range; $cites_range = [$y] if $y; @@ -200,21 +224,118 @@ sub years { return $years; } -search; -years; -get_results unless $skip_results; +our $page = 1; +sub next_page { + $page++; + warn "next_page $page\n"; + + $mech->submit_form( + form_name => 'summary_navigation', + fields => { + 'page' => $page, + }, + ); + + save_mech; + + $mech->form_name( 'summary_navigation' ); + my $is_next_page = $mech->value('page') == $page; + warn "no next_page" unless $is_next_page; + return $is_next_page; +} + +if ( $results ) { + search; + years; + get_results; +} + +if ( $citations ) { -citations; + citations; + years unless @ranges; -do { - my $part; - if ( @ranges ) { - $part .= $ranges[0]->[0] . '.'; - search; - citations; + do { + my $part; + if ( @ranges ) { + $part .= $ranges[0]->[0] . '.'; + search; + citations; + } + $part .= 'citing'; + get_results $part; + } while ( @ranges ); + +} + + + +if ( $q =~ m{CA=(.+)} && $cited_reference ) { + + my $CA = $1; + + warn "# citated reference search"; + $mech->follow_link( url_regex => qr/CitedReferenceSearch/ ); + save_mech; + + + $mech->submit_form( + form_name => 'WOS_CitedReferenceSearch_input_form', + fields => { + 'value(input1)' => $CA, + }, + ); + + my $page = 1; + my $records = $1 if $mech->content =~ m/(\d+)\s+records/; + warn "# found $records records\n"; + my $last_span = 'fake'; + + while (1) { + save_mech "/tmp/isi.$q.citedref.$page"; + + last unless next_page(); + + if ( $mech->content =~ m/(\d+\s*-\s*(\d+))/ ) { + warn "span: $1\n"; + last if $2 == $records; + last if $1 == $last_span; + $last_span = $1; + } elsif ( $page > 5 ) { + warn "ARTIFICALLY LIMITED TO 5 PAGES WITHOUT VALID SPAN!"; + last; + } + + } + +} + +if ( $q =~ m{CA=(.+)} && $citing_articles ) { + + search; + + my $orig_q = $q; + my $nr = 0; + + do { + + foreach my $link ( $mech->find_all_links( url_regex => qr/CitingArticles.do/ ) ) { + $nr++; + warn "link $nr\n"; + $mech->get( $link->url ); + save_mech; + $q = $orig_q . '.citing_article.' . $nr; + get_results; + $mech->back; + $mech->back; + + #last if $nr > 3; # FIXME only for development } - $part .= 'citing'; - get_results $part; -} while ( @ranges ); + } while next_page; + + $q = $orig_q; +} + +warn "OVER\n";