refactor into small mini-DSL at bottom of code
author Dobrica Pavlinusic <dpavlin@rot13.org>
Sat, 19 Sep 2009 12:43:03 +0000 (12:43 +0000)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Sat, 19 Sep 2009 12:43:03 +0000 (12:43 +0000)
and added report about year breakdown of results

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1290 07558da8-63fa-0310-ba24-9fe276d99e06

bin/isi-download-results.pl

index 8ea5c32..3fa0040 100755 (executable)
@@ -23,32 +23,39 @@ our $step = 0;
 sub save_mech {
        my ( $mech, $path ) = @_;
        $step++;
-       $path ||= sprintf('/tmp/isi.%02d.%s', $step, $mech->{ct} =~ m{html}i ? 'html' : 'txt' );
+       mkdir '/tmp/isi/' unless -e '/tmp/isi';
+       my $base_path = sprintf('/tmp/isi/%04d', $step);
+       $path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' );
        $mech->save_content( $path );
-       warn "# [$step] $path ", -s $path, " ", $mech->ct;
-       $mech->dump_all if $dump;
+       warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";
+       open(my $dump, '>', "$base_path.dump.txt");
+       $mech->dump_all($dump);
 }
 
 warn "# get session";
 $mech->get( 'http://isiknowledge.com/?DestApp=WOS' );
 save_mech $mech;
 
-warn "# advanced serach";
-$mech->follow_link( url_regex => qr/AdvancedSearch/ );
-save_mech $mech;
+sub search {
+       my $q = shift;
 
-warn "# cookie_jar ", dump $mech->cookie_jar;
+       warn "# advanced serach";
+       $mech->follow_link( url_regex => qr/AdvancedSearch/ );
+       save_mech $mech;
 
-$mech->submit_form(
-       fields => {
-               'value(input1)' => $q,
-       }
-);
-save_mech $mech;
+       warn "# cookie_jar ", dump $mech->cookie_jar;
 
-warn "# summary";
-$mech->follow_link( url_regex => qr/summary/ );
-save_mech $mech;
+       $mech->submit_form(
+               fields => {
+                       'value(input1)' => $q,
+               }
+       );
+       save_mech $mech;
+
+       warn "# summary";
+       $mech->follow_link( url_regex => qr/summary/ );
+       save_mech $mech;
+}
 
 sub get_results {
        my $q = shift;
@@ -80,10 +87,10 @@ sub get_results {
 
                if ( $mech->content =~ m{invalid API call} ) {
                        $mech->back;
-                       return;
+                       last;
                }
 
-               warn "# save_file $from - $to [$q]";
+               warn "range $from - $to [$q]\n";
                $mech->follow_link( url_regex => qr/save_file/ );
                save_mech $mech => "/tmp/isi.$q.$from-$to.txt";
 
@@ -93,19 +100,46 @@ sub get_results {
                $mech->back;
                #save_mech $mech;
 
-       } # while
+       }
 
 }
 
-get_results $q;
 
-save_mech $mech;
-warn "# citations";
-$mech->follow_link( url_regex => qr/search_mode=CitationReport/ );
-save_mech $mech;
+sub citations {
+       save_mech $mech;
+       warn "# citation report";
+       $mech->follow_link( url_regex => qr/search_mode=CitationReport/ );
+       save_mech $mech;
 
-$mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ );
-save_mech $mech;
+       warn "view citing articles";
+       $mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ );
+       save_mech $mech;
+}
+
+sub years {
+       my $years_url = $mech->find_link( text_regex => qr/more options/ )->url_abs;
+       warn "## $years_url";
+       $years_url =~ s{ra_name=\w+}{ra_name=PublicationYear} || die "ra_name";
+       warn "# refine years (hidden by javascript)";
+       warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
+       $mech->get( $years_url );
+       save_mech $mech;
+
+       my $html = $mech->content;
+       my @years;
+       while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)</label.+?value="PublicationYear_}{} ) {
+               push @years, [ $1 => $2 ];
+       }
+       warn "# years ",dump @years;
+       $mech->back;
+       return @years;
+}
+
+search $q;
+years;
+get_results $q;
 
+citations;
+years;
 get_results $q . '.citing';