From 19485fab478b2300a68769cf14375a42a93ece97 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Sat, 19 Sep 2009 15:05:12 +0000 Subject: [PATCH 1/1] return years in hash git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@1291 07558da8-63fa-0310-ba24-9fe276d99e06 --- bin/isi-download-results.pl | 43 +++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/bin/isi-download-results.pl b/bin/isi-download-results.pl index 3fa0040..8e0f918 100755 --- a/bin/isi-download-results.pl +++ b/bin/isi-download-results.pl @@ -3,7 +3,7 @@ use warnings; use strict; -my $q = 'AD=Croatia'; +our $q = 'AD=Croatia'; my $range_size = 500; my $dump = @ARGV ? 1 : 0; @@ -11,7 +11,8 @@ my $dump = @ARGV ? 1 : 0; $q = 'TS=psychology AND AD=Croatia'; use WWW::Mechanize; -use Data::Dump qw/dump/; +use Data::Dump qw(dump); +use File::Path; our $mech = WWW::Mechanize->new( autocheck => 1, @@ -20,11 +21,14 @@ our $mech = WWW::Mechanize->new( our $step = 0; +my $dir = '/tmp/isi/'; +rmtree $dir if -e $dir; +mkdir $dir; + sub save_mech { my ( $mech, $path ) = @_; $step++; - mkdir '/tmp/isi/' unless -e '/tmp/isi'; - my $base_path = sprintf('/tmp/isi/%04d', $step); + my $base_path = sprintf('%s/%04d', $dir,$step); $path ||= $base_path . ( $mech->{ct} =~ m{html}i ? '.html' : '.txt' ); $mech->save_content( $path ); warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n"; @@ -37,8 +41,6 @@ $mech->get( 'http://isiknowledge.com/?DestApp=WOS' ); save_mech $mech; sub search { - my $q = shift; - warn "# advanced serach"; $mech->follow_link( url_regex => qr/AdvancedSearch/ ); save_mech $mech; @@ -58,7 +60,6 @@ sub search { } sub get_results { - my $q = shift; my $from = 1; while ( 1 ) { @@ -117,29 +118,39 @@ sub citations { } sub years { - my $years_url = $mech->find_link( text_regex => qr/more options/ )->url_abs; + my $years_url = $mech->find_link( text_regex => qr/more options/ ); + if ( ! $years_url ) { + warn "W: can't find years\n"; + return; + } + $years_url = $years_url->url_abs; warn "## $years_url"; - $years_url =~ s{ra_name=\w+}{ra_name=PublicationYear} || die "ra_name"; + if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) { + warn "W: no ra_name\n"; + return; + } warn "# refine years (hidden by javascript)"; warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n"; $mech->get( $years_url ); save_mech $mech; my $html = $mech->content; - my @years; + my $years; while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\) $2 ]; + $years->{$1} = $2; } - warn "# years ",dump @years; + warn "# years ",dump $years; $mech->back; - return @years; + return $years; } -search $q; +search; years; -get_results $q; +get_results; + citations; years; -get_results $q . '.citing'; +$q .= '.citing'; +get_results; -- 2.20.1