b768fe0b840a4f9aabddcf82e3502a83450af7a4
[webpac2] / bin / isi-download-results.pl
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 use WWW::Mechanize;
7 use Data::Dump qw(dump);
8 use File::Path;
9 use Text::Unaccent;
10
# Advanced search syntax:
# http://images.isiknowledge.com/WOK46/help/WOS/h_advanced_examples.html

# Default query, used when no command-line arguments are given.
# Another example of a valid query: 'AD=Croatia'
our $q = 'CA=BRATKO, D';

# get_results exports records in windows: each window is $range_size
# wide and starts $overlap records before the end of the previous one.
my $range_size = 500;
my $overlap    = 3; # between previous and this range

# Switches selecting which download stages run.
my $results = 0;
my $citations = 0;
my $cited_reference = 0; # html tables
my $citing_articles = 1; # as many files as cited articles

# When true, years() produces one range per publication year.
my $cites_by_year = 0;

my $max_cites = 5000; # ISI limit to get cites

# Command-line arguments, joined with spaces and stripped of accents,
# replace the default query.
$q = unac_string( 'utf-8', join(' ', @ARGV) ) if @ARGV;

warn "QUERY: $q\n";

our $mech = WWW::Mechanize->new(
        autocheck => 0, # it dies in reference download with it!
        cookie_jar => undef,
);

our $step = 0;  # number of pages stored so far by save_mech
our @ranges;    # groups of publication years: filled by years, consumed by search

my $dir = '/tmp/isi/';
#rmtree $dir if -e $dir;

# save_mech writes every page below $dir, so stop right away with a
# clear message when the directory can't be created
unless ( -d $dir ) {
        mkdir $dir or die "can't create $dir: $!\n";
}
44
# Store the page currently held by $mech on disk, together with the
# output of dump_all for debugging.
#
# Arguments:
#   $path - optional output path without extension; when omitted the
#           page goes to "$dir/NNNN", NNNN being the step counter
#
# Side effects: increments the global $step, writes the content file
# (".html" when the content type mentions html, ".txt" otherwise) and,
# when it can be opened, "$dir/NNNN.dump.txt".
sub save_mech {
        my $path = shift;
        $step++;
        my $base_path = sprintf('%s/%04d', $dir,$step);
        $path ||= $base_path;

        # use the public accessor and guard against an undefined content type
        my $ct = $mech->ct;
        $ct = '' unless defined $ct;

        $path .= $ct =~ m{html}i ? '.html' : '.txt';
        $mech->save_content( $path );
        warn "# [$step] $path ", ( ( -s $path ) || 0 ), " $ct\n";

        # the dump is only a debugging aid, so a failure here is reported
        # but does not stop the download
        my $dump_path = "$base_path.dump.txt";
        if ( open(my $dump, '>', $dump_path) ) {
                $mech->dump_all($dump);
                close($dump) or warn "W: can't close $dump_path: $!\n";
        } else {
                warn "W: can't open $dump_path: $!\n";
        }
}
56
# Fetch the Web of Science entry page; all later navigation starts from
# the page loaded here.
warn "# get session";
$mech->get( 'http://isiknowledge.com/?DestApp=WOS' );
save_mech();

# autocheck is off, so a failed request has to be detected by hand;
# the response was saved above for inspection before giving up
die "can't get session: HTTP status ", $mech->status, "\n" unless $mech->success;
60
# Run an advanced search for the global query $q and open its summary
# page.  When year groups are pending in @ranges, the first one is
# removed from the list and ANDed to the query as a list of PY= terms.
# Every page visited on the way is stored with save_mech.
sub search {
        warn "# advanced serach";
        $mech->follow_link( url_regex => qr/AdvancedSearch/ );
        save_mech();

        warn "# cookie_jar ", dump( $mech->cookie_jar );

        # query parts, glued together with AND below
        my @terms = ( $q );
        if ( @ranges ) {
                my $group = shift @ranges;
                my @py = map { "PY=$_" } @$group;
                push @terms, '(' . join(' OR ', @py) . ')';
        }
        my $query = join(' AND ', @terms);

        warn "# submit_form search: $query\n";
        $mech->submit_form( fields => { 'value(input1)' => $query } );
        save_mech();

        warn "# summary";
        $mech->follow_link( url_regex => qr/summary/ );
        save_mech();
}
86
# Export the records reachable from the current summary page into
# files, one window of records at a time.
#
# Arguments:
#   $desc - optional suffix appended to the output file names
#
# Every window is requested through the summary_output_form form and
# its save_file link is stored as "/tmp/isi.$q.FROM-TO[.$desc]" plus the
# extension chosen by save_mech.  The loop ends when the response
# contains "invalid API call" or when the request itself fails.
sub get_results {
        my $desc = shift;
        my $from = 1;

        while ( 1 ) {

                # NOTE(review): if the site treats mark_from/mark_to as
                # inclusive, a window holds $range_size + 1 records - confirm
                my $to = $from + $range_size;

                warn "# submit_form results $from - $to\n";

                $mech->submit_form(
                        form_name => 'summary_output_form',
                        fields => {
                                record_select_type => 'range',
                                mark_from => $from,
                                mark_to => $to,
                                mark_id => 'WOS',

                                qo_fields => 'fullrecord',
                                citedref => 'citedref',

                                save_options => 'plain_text',

                                fields => 'Full',
                                format => 'save',
                        },
                        button => 'save',
                );
                save_mech();

                if ( $mech->content =~ m{invalid API call} ) {
                        $mech->back;
                        last;
                }

                # autocheck is off: without this check a failing request
                # would keep the loop spinning forever
                unless ( $mech->success ) {
                        warn "W: request for $from - $to failed with status ", $mech->status, "\n";
                        $mech->back;
                        last;
                }

                if ( $mech->content =~ m{Please wait while your request is processed} ) {
                        warn "WARNING: processing request";
                }

                my $path = "/tmp/isi.$q.$from-$to";
                $path .= '.' . $desc if $desc;

                warn "save $from - $to into $path\n";

                # follow_link returns nothing when the link is missing; in
                # that case no page was loaded, so there is nothing to save
                # and one step less to go back
                if ( $mech->follow_link( url_regex => qr/save_file/ ) ) {
                        save_mech( $path );
                        $mech->back;    # undo the follow_link
                } else {
                        warn "W: no save_file link for $from - $to, nothing saved\n";
                }

                $from += $range_size - $overlap;

                $mech->back;            # undo the submit_form
                #save_mech;
        }
}
142
143
# Move from the current page to the citation report and from there to
# the list of citing articles, storing both pages with save_mech.
sub citations {
        # message to print and link to follow, in navigation order
        my @hops = (
                [ "# citation report",    qr/search_mode=CitationReport/ ],
                [ "view citing articles", qr/search_mode=TotalCitingArticles/ ],
        );

        foreach my $hop ( @hops ) {
                my ( $message, $link_re ) = @$hop;
                warn $message;
                $mech->follow_link( url_regex => $link_re );
                save_mech();
        }
}
153
# Read the record count per publication year from the "refine" page and
# fill the global @ranges with groups of years.
#
# With $cites_by_year set, every year becomes its own group.  Otherwise
# years are packed, in sorted order, into groups whose summed record
# count stays within $max_cites (a single year above the limit still
# gets a group of its own).  When everything fits into one group,
# @ranges is left empty so that no year filter is needed at all.
#
# Returns a hash reference (year => count) after packing the groups;
# returns nothing when the refine link is missing, when no year could be
# parsed or when $cites_by_year is set.
sub years {
        my $link = $mech->find_link( url_regex => qr/ra_name=/ );
        if ( ! $link ) {
                warn "W: can't find ra_name link\n";
                return;
        }
        my $years_url = $link->url_abs;
        warn "## $years_url";
        if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
                warn "W: no ra_name in $years_url\n";
                return;
        }
        warn "# refine years (hidden by javascript)";
#       warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
        $mech->get( $years_url );
        save_mech();

        # pick "YYYY (count)" labels out of the page one by one; every
        # match is cut out of $html so that the next one can be found
        my $html = $mech->content;
        my $years = {};
        while ( $html =~ s{<label.+?PublicationYear.+?>(\d{4})\s\(([\d,]+)\)</label>}{} ) {
                my ( $year, $count ) = ( $1, $2 );
                $count =~ s{,}{}g;      # drop thousands separators
                $years->{$year} = $count;
        }
        warn "# years ",dump $years;
        $mech->back;

        my @y = sort keys %$years;

        @ranges = ();

        # nothing parsed: leave @ranges empty instead of building a
        # group around an undefined year
        if ( ! @y ) {
                warn "W: no publication years found\n";
                return;
        }

        if ( $cites_by_year ) {
                push @ranges, [ $_ ] foreach @y;
                warn "# cites_by_year ranges ", dump @ranges;
                return;
        }

        my $cites_range = [];
        my $size = 0;

        foreach my $y ( @y ) {
                # close the current group as soon as this year would push it over the limit
                if ( @$cites_range && $size + $years->{$y} > $max_cites ) {
                        push @ranges, $cites_range;
                        warn "# cites_range $size years ",dump( $cites_range ),"\n";

                        $cites_range = [];
                        $size = 0;
                }
                $size += $years->{$y};
                push @$cites_range, $y;
        }

        push @ranges, $cites_range;
        warn "# cites_range $size years ",dump( $cites_range ), " FINAL\n";

        warn '# ranges ', dump @ranges;

        # a single group holds every year, so no filter is needed
        # (the former test, $#ranges == 1, was true for two groups)
        @ranges = () if @ranges == 1; # just take all

        return $years;
}
219
220
# Number of the summary page requested last; advanced by next_page.
our $page = 1;

# Ask the summary_navigation form for the following page and store the
# response with save_mech.
#
# Returns a true value when the page number found in the form of the
# response equals the requested one, a false value (after a warning)
# otherwise.
sub next_page {
        my $wanted = ++$page;
        warn "next_page $wanted\n";

        $mech->submit_form(
                form_name => 'summary_navigation',
                fields => { page => $wanted },
        );
        save_mech();

        # read back the page number the response carries in its form
        $mech->form_name( 'summary_navigation' );
        my $shown = $mech->value('page');

        my $advanced = $shown == $wanted;
        if ( ! $advanced ) {
                warn "no next_page";
        }
        return $advanced;
}
240
# Stage: download the records matching the query.
if ( $results ) {
        search();
        years();
        get_results();
}

# Stage: download the articles citing the records matching the query,
# split by groups of publication years when years() produced any.
if ( $citations ) {

        citations();
        years() if ! @ranges;

        # do/while: the body has to run once even without year groups
        do {
                my $prefix = '';
                if ( @ranges ) {
                        # first year of the group names the output;
                        # search() then takes the group off @ranges
                        $prefix = $ranges[0]->[0] . '.';
                        search();
                        citations();
                }
                get_results( $prefix . 'citing' );
        } while ( @ranges );

}
264
265
266
# Stage: cited reference search.  Runs only for queries containing a
# CA= term; the text after "CA=" is submitted to the cited reference
# form and the result pages are stored one by one.
if ( $q =~ m{CA=(.+)} && $cited_reference ) {

        my $CA = $1;

        warn "# citated reference search";
        $mech->follow_link( url_regex => qr/CitedReferenceSearch/ );
        save_mech();

        $mech->submit_form(
                form_name => 'WOS_CitedReferenceSearch_input_form',
                fields => { 'value(input1)' => $CA },
        );

        # next_page() advances the global $page; restart that counter
        # rather than shadowing it with a lexical that would stay at 1
        # (same file name for every page, page limit below never reached)
        $page = 1;

        # total number of records, undefined when the page doesn't tell
        my $records;
        $records = $1 if $mech->content =~ m/(\d+)\s+records/;
        warn "# found ", ( defined $records ? $records : 'unknown number of' ), " records\n";
        my $last_span = 'fake';

        while (1) {
                save_mech( "/tmp/isi.$q.citedref.$page" );

                last unless next_page();

                # NOTE: the page loaded by next_page() was already stored
                # by it under its step number, also when the loop ends here
                if ( $mech->content =~ m/(\d+\s*-\s*(\d+))/ ) {
                        my ( $span, $span_end ) = ( $1, $2 );
                        warn "span: $span\n";
                        last if defined $records && $span_end == $records;
                        # spans are strings like "11 - 20": compare them as such
                        last if $span eq $last_span;
                        $last_span = $span;
                } elsif ( $page > 5 ) {
                        warn "ARTIFICALLY LIMITED TO 5 PAGES WITHOUT VALID SPAN!";
                        last;
                }

        }

}
306
# Stage: for every record of the search result, download the articles
# citing it.  Runs only for queries containing a CA= term.
if ( $q =~ m{CA=(.+)} && $citing_articles ) {

        search();

        # the search above starts a new result list, while next_page()
        # counts from the global $page, which an earlier stage may have
        # advanced
        $page = 1;

        my $orig_q = $q;
        my $nr = 0;

        do {

                foreach my $link ( $mech->find_all_links( url_regex => qr/CitingArticles\.do/ ) ) {
                        $nr++;
                        warn "link $nr\n";
                        $mech->get( $link->url );
                        save_mech();

                        # get_results builds its file names from the global $q,
                        # so tag it with the running number of this record
                        $q = $orig_q . '.citing_article.' . $nr;
                        get_results();
                        $mech->back;
                        $mech->back;

                        #last if $nr > 3; # FIXME only for development
                }

        } while next_page();

        $q = $orig_q;
}

warn "OVER\n";