fix download for new page url
[webpac2] / bin / isi-download-results.pl
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 use WWW::Mechanize;
7 use Data::Dump qw(dump);
8 use File::Path;
9 use Text::Unaccent;
10
# Advanced search syntax:
# http://images.isiknowledge.com/WOK46/help/WOS/h_advanced_examples.html

# Search query. The second assignment overrides the first; both are replaced
# by the command-line arguments further down when any are given.
our $q = 'AD=Croatia';
$q = 'AU=BRATKO, D';
#$q = 'AD=(croat* OR hrvat*)';

# Size of one download window and how many records consecutive windows share.
my $range_size = 100;
my $overlap    = 3; # between previous and this range

# Switches selecting which parts of the script run.
my $results = 1;
my $citations = 0;
my $cited_reference = 0; # html tables
my $citing_articles = 0; # as many files as cited articles

# When true, years() builds one range per publication year.
my $cites_by_year = 0;

my $max_cites = 5000; # ISI limit to get cites

# Command-line arguments, joined with spaces and stripped of accents,
# replace the built-in query.
$q = unac_string( 'utf-8', join(' ', @ARGV) ) if @ARGV;

warn "QUERY: $q\n";

our $mech = WWW::Mechanize->new(
    autocheck => 0, # it dies in reference download with it!
    cookie_jar => undef,
);

# Counter used by save_mech to number the saved pages.
our $step = 0;
# Queue of publication-year groups (array refs) filled by years() and
# consumed by search().
our @ranges;

my $dir = '/tmp/isi/';
#rmtree $dir if -e $dir;
# Fail right away with a clear message if the output directory can't be
# created, instead of failing later while saving the first page.
unless ( -d $dir ) {
    mkdir $dir or die "can't create $dir: $!\n";
}
# Save the content of the current $mech response and a debug dump of the page.
#
# Arguments:
#   $path - optional base name (without extension) for the saved content;
#           when false, the numbered name "$dir/NNNN" of this step is used.
#
# Each call increments the global $step. The extension is ".html" when the
# response content type mentions "html" and ".txt" otherwise. The dump_all()
# output is always written to "$dir/NNNN.dump.txt", whatever $path is.
sub save_mech {
    my $path = shift;
    $step++;
    my $base_path = sprintf('%s/%04d', $dir,$step);
    $path ||= $base_path;

    # Use the public accessor rather than the object's internal hash key.
    my $content_type = $mech->ct;
    $content_type = '' unless defined $content_type;

    $path .= $content_type =~ m{html}i ? '.html' : '.txt';
    $mech->save_content( $path );

    # -s yields no usable number for an empty or missing file; report 0 then.
    my $size = ( -s $path ) || 0;
    warn "# [$step] $path ", $size, " ", $content_type, "\n";

    # The dump is only a debugging aid, so a failure to write it is reported
    # but does not stop the download.
    my $dump_path = "$base_path.dump.txt";
    if ( open(my $dump, '>', $dump_path) ) {
        $mech->dump_all($dump);
        close($dump) or warn "W: can't close $dump_path: $!\n";
    }
    else {
        warn "W: can't write $dump_path: $!\n";
    }

    return;
}
57
# First request of the run (logged as "get session"); the fetched page is
# stored by save_mech as the first numbered step.
warn "# get session";
$mech->get('http://www.webofknowledge.com/?DestApp=WOS');
save_mech();
61
# Run the query in $q through the advanced search form and open the summary.
#
# If @ranges is not empty, its first element (an array ref of years) is
# removed and appended to the query as "AND (PY=... OR PY=...)".
# Every page visited is stored with save_mech.
sub search {
    warn "# advanced serach";
    $mech->follow_link( url_regex => qr/AdvancedSearch/ );
    save_mech();

    warn "# cookie_jar ", dump( $mech->cookie_jar );

    my $query = $q;
    if ( @ranges ) {
        my $years    = shift @ranges;
        my $py_terms = join( ' OR ', map { "PY=$_" } @{$years} );
        $query .= ' AND (' . $py_terms . ')';
    }

    warn "# submit_form search: $query\n";
    $mech->submit_form( fields => { 'value(input1)' => $query } );
    save_mech();

    warn "# summary";
    $mech->follow_link( url_regex => qr/summary/ );
    save_mech();
}
87
# Export the records of the current result page window by window.
#
# Arguments:
#   $desc - optional suffix appended to the name of every saved file.
#
# Each pass submits "output_form" for the records $from..$to in field-tagged
# "saveToFile" format, then submits "etsForm" and stores the response under
# "/tmp/isi.$q.$from-$to[.$desc]" (save_mech adds the extension). The loop
# ends when a response contains "invalid API call".
sub get_results {
    my $desc = shift;
    my $from = 1;

    while ( 1 ) {

        # markFrom/markTo are treated as inclusive bounds here (TODO confirm
        # against the service), so a window of $range_size records ends at
        # $from + $range_size - 1. Without the "- 1" every window held one
        # record too many and neighbours overlapped by $overlap + 1.
        my $to = $from + $range_size - 1;

        warn "# submit_form results $from - $to\n";
        save_mech();

        $mech->submit_form(
            form_name => 'output_form',
            fields => {
                'value(record_select_type)' => 'range',
                markFrom => $from,
                markTo => $to,

                mark_from => $from,
                mark_to => $to,
                mark_id => 'WOS',

                fields_selection => 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
                filters => 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS CITREF',
                fullrec_fields_option => 'CITREF',

                save_options => 'fieldtagged',
                format => 'saveToFile',

            },
            button => 'saveToFile',
        );
        save_mech();

        # This response is the only exit from the loop; presumably it is what
        # the service sends once $from is past the last record.
        if ( $mech->content =~ m{invalid API call} ) {
            $mech->back;
            last;
        }

        if ( $mech->content !~ m{Please wait while your request is processed} ) {
            warn "WARNING: expecting processing request";
        }

        my $path = "/tmp/isi.$q.$from-$to";
        $path .= '.' . $desc if $desc;

        warn "save $from - $to into $path\n";
        $mech->submit_form(
            form_name => 'etsForm',
        );
        save_mech($path);

        # The next window starts $overlap records before the end of this one.
        $from += $range_size - $overlap;

        # Two steps back: past the download and past the "please wait" page.
        $mech->back;
        $mech->back;
        save_mech();
    }
}
149
150
# Follow the citation report link and then the "total citing articles" link
# from the current page, saving each page that is reached.
sub citations {
    my @hops = (
        [ "# citation report",    qr/search_mode=CitationReport/ ],
        [ "view citing articles", qr/search_mode=TotalCitingArticles/ ],
    );

    foreach my $hop ( @hops ) {
        my ( $message, $url_pattern ) = @{$hop};
        warn $message;
        $mech->follow_link( url_regex => $url_pattern );
        save_mech();
    }
}
160
# Read the per-year record counts of the current result set and rebuild
# @ranges from them.
#
# The first link containing "ra_name=" is rewritten to ask for the
# PublicationYear refinement, fetched, and parsed for "<year> (<count>)"
# labels. @ranges is then filled with array refs of years:
#   - one year per range when $cites_by_year is set;
#   - otherwise consecutive years are grouped, and a new group is started
#     when adding the next year would push the running count over $max_cites.
# If everything fits in a single group, @ranges is left empty so that callers
# download the whole result set without a year filter.
#
# Returns the hash ref { year => count } on the grouping path, and nothing
# when the link is missing or when $cites_by_year is set.
sub years {
    my $years_url = $mech->find_link( url_regex => qr/ra_name=/ );
    if ( ! $years_url ) {
        warn "W: can't find ra_name link\n";
        return;
    }
    $years_url = $years_url->url_abs;
    warn "## $years_url";
    if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
        warn "W: no ra_name in $years_url\n";
        return;
    }
    warn "# refine years (hidden by javascript)";
    # example of the resulting URL:
    # http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore
    $mech->get( $years_url );
    save_mech();

    # Consume the labels one at a time; counts may contain thousands commas.
    my $html = $mech->content;
    my $years;
    while ( $html =~ s{<label.+?PublicationYear.+?>(\d{4})\s\(([\d,]+)\)</label>}{} ) {
        my ( $year, $count ) = ( $1, $2 );
        $count =~ s{,}{}g;
        $years->{$year} = $count;
    }
    warn "# years ",dump( $years );
    $mech->back;

    my @y = sort keys %$years;

    @ranges = ();

    if ( $cites_by_year ) {
        push @ranges, [ $_ ] foreach @y;
        warn "# cites_by_year ranges ", dump( @ranges );
        return;
    }

    # Nothing was parsed: there is nothing to group, and using an undefined
    # year as a hash key below would only produce warnings.
    if ( ! @y ) {
        warn "W: no publication years found\n";
        return $years;
    }

    my $y = shift @y;
    my $size = $years->{$y};

    my $cites_range;
    $cites_range = [$y] if $y;

    foreach my $y ( @y ) {
        if ( $size + $years->{$y} > $max_cites ) {
            push @ranges, $cites_range;
            warn "# cites_range $size years ",dump( $cites_range ),$/;

            $cites_range = [];
            $size = 0;
        }
        $size += $years->{$y};
        push @$cites_range, $y;
    }

    if ( $cites_range ) {
        push @ranges, $cites_range;
        warn "# cites_range $size years ",dump( $cites_range ), " FINAL\n"
    }

    warn '# ranges ', dump( @ranges );
    # A single range covers every year, so no year filter is needed. The old
    # test "$#ranges == 1" matched two ranges, which threw away a real split.
    @ranges = () if @ranges == 1; # just take all

    return $years;
}
226
227
# Number of the summary page currently shown; advanced by next_page.
our $page = 1;

# Ask the "summary_navigation" form for the following page and save the
# response. Returns true when the form on the returned page reports the
# requested page number, false otherwise.
sub next_page {
    my $nav_form = 'summary_navigation';

    $page += 1;
    warn "next_page $page\n";

    $mech->submit_form(
        form_name => $nav_form,
        fields    => { 'page' => $page },
    );
    save_mech();

    # Re-select the form on the new page to read back which page we got.
    $mech->form_name( $nav_form );
    my $landed = $mech->value('page') == $page;
    if ( ! $landed ) {
        warn "no next_page";
    }
    return $landed;
}
247
# Plain result download: search, collect the year statistics, export.
if ( $results ) {
    search();
    years();
    get_results();
}

# Citing-article download. When years() produced year ranges, the search and
# the citation report are repeated once per range (search() consumes one
# range per call) and the files are tagged with the first year of the range.
if ( $citations ) {

    citations();
    years() unless @ranges;

    while ( 1 ) {
        my @label;
        if ( @ranges ) {
            # Remember the first year before search() shifts the range off.
            push @label, $ranges[0]->[0];
            search();
            citations();
        }
        get_results( join( '.', @label, 'citing' ) );

        last unless @ranges;
    }

}
271
272
273
# Cited reference search: runs only for queries containing "CA=..." and when
# $cited_reference is enabled. Each result page is saved as
# "/tmp/isi.$q.citedref.<page>" (save_mech adds the extension).
if ( $q =~ m{CA=(.+)} && $cited_reference ) {

    my $CA = $1;

    warn "# citated reference search";
    $mech->follow_link( url_regex => qr/CitedReferenceSearch/ );
    save_mech();


    $mech->submit_form(
        form_name => 'WOS_CitedReferenceSearch_input_form',
        fields => {
            'value(input1)' => $CA,
        },
    );

    # Use the file-level $page that next_page() increments. A lexical copy
    # here would stay at 1, so every page would be written to the same file
    # and the page limit below could never trigger.
    $page = 1;

    # List assignment instead of "my ... if ...", whose behaviour is not
    # defined; $records stays undef when the page shows no record count.
    my ( $records ) = $mech->content =~ m/(\d+)\s+records/;
    warn "# found ", ( defined $records ? $records : '?' ), " records\n";
    my $last_span = 'fake';

    while (1) {
        save_mech("/tmp/isi.$q.citedref.$page");

        last unless next_page();

        if ( $mech->content =~ m/(\d+\s*-\s*(\d+))/ ) {
            my ( $span, $span_end ) = ( $1, $2 );
            warn "span: $span\n";
            # Stop on the last page, or when the page did not advance.
            last if defined $records && $span_end == $records;
            # Spans are strings like "11 - 20"; compare them as strings.
            last if $span eq $last_span;
            $last_span = $span;
        } elsif ( $page > 5 ) {
            warn "ARTIFICALLY LIMITED TO 5 PAGES WITHOUT VALID SPAN!";
            last;
        }

    }

}
313
# Citing articles: runs only for queries containing "CA=..." and when
# $citing_articles is enabled. For every "CitingArticles.do" link on each
# summary page, the link is opened and its records exported with get_results.
# $q is temporarily extended with ".citing_article.<n>" because get_results
# builds its file names from $q.
if ( $q =~ m{CA=(.+)} && $citing_articles ) {

    search();

    my $base_query = $q;
    my $link_no    = 0;

    while ( 1 ) {

        my @citing_links = $mech->find_all_links( url_regex => qr/CitingArticles.do/ );

        foreach my $link ( @citing_links ) {
            $link_no++;
            warn "link $link_no\n";

            $mech->get( $link->url );
            save_mech();

            $q = $base_query . '.citing_article.' . $link_no;
            get_results();

            $mech->back;
            $mech->back;

            #last if $link_no > 3; # FIXME only for development
        }

        last unless next_page();
    }

    # Restore the query that the loop above kept rewriting.
    $q = $base_query;
}

warn "OVER\n";