refactor into small mini-DSL at bottom of code
[webpac2] / bin / isi-download-results.pl
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
# Number of records requested from the server per export batch.
my $range_size = 500;

# Flag set to 1 when any command-line argument is given, 0 otherwise.
my $dump = 0;
$dump = 1 if @ARGV;

# Search query; the second assignment is the one that takes effect.
my $q = 'AD=Croatia';
$q = 'TS=psychology AND AD=Croatia';
12
13 use WWW::Mechanize;
14 use Data::Dump qw/dump/;
15
# Shared browser object used by all subs below: autocheck is enabled and
# cookie_jar is explicitly passed as undef.
my %mech_options = (
        autocheck  => 1,
        cookie_jar => undef,
);
our $mech = WWW::Mechanize->new( %mech_options );

# Counter of pages saved so far; save_mech increments it to number its files.
our $step = 0;
22
# Save the current response of $mech to disk, together with a dump of the
# page (via dump_all), so that every step of the session can be inspected.
#
#   $mech - WWW::Mechanize object whose current content is saved
#   $path - optional file name for the content; when omitted it defaults to
#           /tmp/isi/NNNN.html or /tmp/isi/NNNN.txt depending on whether
#           the content type mentions html
#
# Increments the global $step on every call and uses it as NNNN.
# Dies if the directory or the dump file can not be created or written.
sub save_mech {
        my ( $mech, $path ) = @_;
        $step++;

        my $dir = '/tmp/isi';
        # the second -d test keeps a lost mkdir race from being fatal
        -d $dir or mkdir $dir or -d $dir or die "can't mkdir $dir: $!";

        my $base_path = sprintf( '%s/%04d', $dir, $step );

        # use the public accessor instead of reaching into the object's hash
        my $ct = $mech->ct;
        $ct = '' unless defined $ct;

        $path ||= $base_path . ( $ct =~ m{html}i ? '.html' : '.txt' );
        $mech->save_content( $path );

        # -s is false for an empty file, so report 0 in that case
        my $size = ( -s $path ) || 0;
        warn "# [$step] $path $size $ct\n";

        my $dump_path = "$base_path.dump.txt";
        open( my $dump, '>', $dump_path ) or die "can't open $dump_path: $!";
        $mech->dump_all( $dump );
        # buffered write errors only surface on close
        close( $dump ) or die "can't close $dump_path: $!";

        return;
}
34
35 warn "# get session";
36 $mech->get( 'http://isiknowledge.com/?DestApp=WOS' );
37 save_mech $mech;
38
# Run a query through the advanced search form and open its results summary.
#
#   $q (first argument) - query string placed in the 'value(input1)' field
#
# Works on the global $mech and saves every page it visits with save_mech.
sub search {
        my ($query) = @_;

        warn "# advanced serach";
        $mech->follow_link( url_regex => qr/AdvancedSearch/ );
        save_mech( $mech );

        my $jar_dump = dump( $mech->cookie_jar );
        warn "# cookie_jar ", $jar_dump;

        my %form_fields = ( 'value(input1)' => $query );
        $mech->submit_form( fields => \%form_fields );
        save_mech( $mech );

        warn "# summary";
        $mech->follow_link( url_regex => qr/summary/ );
        return save_mech( $mech );
}
59
# Download all records of the current results summary in batches of
# $range_size and store each batch as /tmp/isi.$q.FROM-TO.txt.
#
#   $q (first argument) - label used in the output file names and in the
#                         progress message
#
# Works on the global $mech, which must be positioned on a page containing
# the form named summary_output_form. Stops when the server answers with
# "invalid API call", which is taken as the end of the result set.
sub get_results {
        my $q = shift;
        my $from = 1;

        while ( 1 ) {

                # mark_from/mark_to are treated as an inclusive range, so a
                # batch of $range_size records ends at $from + $range_size - 1;
                # without the -1 every batch held one record too many and
                # repeated the first record of the next batch
                my $to = $from + $range_size - 1;

                $mech->submit_form(
                        form_name => 'summary_output_form',
                        fields => {
                                record_select_type => 'range',
                                mark_from => $from,
                                mark_to => $to,
                                mark_id => 'WOS',

                                qo_fields => 'fullrecord',
                                citedref => 'citedref',

                                save_options => 'plain_text',

                                fields => 'Full',
                                format => 'save',
                        },
                        button => 'save',
                );
                save_mech( $mech );

                if ( $mech->content =~ m{invalid API call} ) {
                        # return to the summary page before leaving the loop
                        $mech->back;
                        last;
                }

                warn "range $from - $to [$q]\n";
                $mech->follow_link( url_regex => qr/save_file/ );
                save_mech( $mech, "/tmp/isi.$q.$from-$to.txt" );

                $from += $range_size;

                # two pages were visited (export form result and saved file),
                # so go back twice to reach the summary form again
                $mech->back;
                $mech->back;

        }

}
106
107
# Navigate from the current page to the citation report and from there to
# the list of citing articles, saving every page along the way.
# Works on the global $mech.
sub citations {
        my $report_link_re = qr/search_mode=CitationReport/;
        my $citing_link_re = qr/search_mode=TotalCitingArticles/;

        save_mech( $mech );

        warn "# citation report";
        $mech->follow_link( url_regex => $report_link_re );
        save_mech( $mech );

        warn "view citing articles";
        $mech->follow_link( url_regex => $citing_link_re );
        return save_mech( $mech );
}
118
# Fetch the "refine by publication year" page for the current result set
# and extract the per-year record counts from it.
#
# Takes no arguments and works on the global $mech. The URL is derived from
# the "more options" link by forcing ra_name=PublicationYear.
#
# Returns a list of [ year, count ] array references, in page order.
# Dies if the "more options" link is missing or its URL has no ra_name
# parameter. Goes back one page before returning.
sub years {
        # find_link returns undef when nothing matches; fail with a clear
        # message instead of calling url_abs on undef
        my $more_link = $mech->find_link( text_regex => qr/more options/ )
                or die "can't find 'more options' link";
        my $years_url = $more_link->url_abs;
        warn "## $years_url";
        $years_url =~ s{ra_name=\w+}{ra_name=PublicationYear} || die "ra_name";
        warn "# refine years (hidden by javascript)";
        # debugging aid: a sample URL printed next to the generated one
        warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
        $mech->get( $years_url );
        save_mech( $mech );

        my $html = $mech->content;
        my @years;
        # each pass removes the first remaining "YYYY (count)" label from the
        # local copy of the page, so the loop ends when none are left
        while ( $html =~ s{>(\d\d\d\d)\s\((\d+)\)</label.+?value="PublicationYear_}{} ) {
                push @years, [ $1 => $2 ];
        }
        warn "# years ",dump @years;
        $mech->back;
        return @years;
}
137
# Pass 1: run the query, list publication years, download the records.
search( $q );
years();
get_results( $q );

# Pass 2: switch to the citing articles and download those under a
# '.citing' label.
citations();
years();
get_results( $q . '.citing' );
145