open modify file with utf-8 encoding
[webpac2] / bin / isi-download-results.pl
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5
6 use WWW::Mechanize;
7 use Data::Dump qw(dump);
8 use File::Path;
9 use Text::Unaccent;
10
11 # Advanced search syntax:
12 # http://images.isiknowledge.com/WOK46/help/WOS/h_advanced_examples.html
13
# Query in ISI advanced search syntax.  Other queries used previously:
#   'AD=Croatia'
#   'AU=BRATKO, D'
#   'AD=(croat* OR hrvat*)'
our $q = 'CA=BRATKO, D';

# Size of one exported chunk of records and how chunks overlap.
my $range_size = 100;
my $overlap    = 3;    # between previous and this range

# Switches selecting which parts of the script run.
my ( $results, $citations ) = ( 0, 0 );
my $cited_reference = 1;    # html tables
my $citing_articles = 1;    # as many files as cited articles
my $cites_by_year   = 0;

my $max_cites = 5000;       # ISI limit to get cites

# Command-line arguments, joined by spaces and passed through unac_string,
# replace the built-in query.
if (@ARGV) {
    $q = unac_string( 'utf-8', join( ' ', @ARGV ) );
}

warn "QUERY: $q\n";
34
# Browser object shared by all steps below.
our $mech = WWW::Mechanize->new(
    autocheck  => 0,        # it dies in reference download with it!
    cookie_jar => undef,
);

our $step = 0;    # number of pages saved so far, incremented by save_mech
our @ranges;      # pending lists of publication years, consumed by search

# Directory receiving the numbered per-step page copies and dumps.
my $dir = '/tmp/isi/';
#rmtree $dir if -e $dir;
unless ( -d $dir ) {
    # Fail early with a clear message instead of failing later on the
    # first attempt to save a page.
    mkdir $dir or die "can't create $dir: $!\n";
}
46
# Save the page currently held by $mech, plus a debugging dump of it.
#
# Arguments:
#   $path - optional base path (without extension) for the page content;
#           defaults to the numbered per-step path "$dir/NNNN".
#
# The extension is ".html" when the content type mentions html, ".txt"
# otherwise.  The dump always goes to "$dir/NNNN.dump.txt".  Increments
# the global $step counter on every call.  Returns nothing.
sub save_mech {
    my $path = shift;
    $step++;
    my $base_path = sprintf( '%s/%04d', $dir, $step );
    $path ||= $base_path;

    # Use the public accessor rather than the object's internal hash and
    # tolerate a missing content type.
    my $ct = $mech->ct;
    $ct = '' unless defined $ct;

    $path .= $ct =~ m{html}i ? '.html' : '.txt';
    $mech->save_content($path);

    # -s yields nothing for an empty or missing file; report 0 then.
    my $size = ( -s $path ) || 0;
    warn "# [$step] $path ", $size, " ", $ct, "\n";

    my $dump_path = "$base_path.dump.txt";
    if ( open( my $dump, '>', $dump_path ) ) {
        $mech->dump_all($dump);
        close($dump) or warn "W: can't close $dump_path: $!\n";
    }
    else {
        # The dump is only a debugging aid, so keep going without it.
        warn "W: can't write $dump_path: $!\n";
    }

    return;
}
58
# Open the entry page; every later step navigates from here.
warn "# get session";
$mech->get( 'http://www.webofknowledge.com/?DestApp=WOS' );
save_mech;

# autocheck is disabled, so a failed request has to be detected by hand.
# The response is saved first so it can still be inspected.
die "can't get session, HTTP status ", $mech->status, "\n"
    unless $mech->success;
62
# Run the query in $q through the advanced search form and open the
# result summary.  When @ranges is not empty, its first element (a list
# of publication years) is removed and added to the query as a
# "PY=... OR PY=..." restriction.  Each page visited is saved.
sub search {
    warn "# advanced serach";
    $mech->follow_link( url_regex => qr/AdvancedSearch/ );
    save_mech();

    warn "# cookie_jar ", dump( $mech->cookie_jar );

    # Query terms are combined with AND: the base query first, then the
    # optional restriction to publication years.
    my @terms = ($q);
    if (@ranges) {
        my $years = shift @ranges;
        push @terms, '(' . join( ' OR ', map { "PY=$_" } @$years ) . ')';
    }
    my $query = join( ' AND ', @terms );

    warn "# submit_form search: $query\n";
    $mech->submit_form( fields => { 'value(input1)' => $query } );
    save_mech();

    warn "# summary";
    $mech->follow_link( url_regex => qr/summary/ );
    save_mech();
}
88
# Export the records of the current result list in consecutive chunks.
#
# Arguments:
#   $desc - optional suffix appended to the name of every saved file.
#
# Each chunk covers $range_size records and starts $overlap records before
# the end of the previous one.  A chunk is requested through the
# 'output_form' form, then fetched by submitting 'etsForm' and saved as
# "/tmp/isi.$q.FROM-TO[.$desc]".  The loop stops when the response
# contains "invalid API call" -- presumably the answer for a range past
# the last record; verify against the site.
sub get_results {
    my $desc = shift;
    my $from = 1;

    # Record fields requested in the export; 'filters' carries the same
    # list plus CITREF.
    my $fields = 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS';

    while ( 1 ) {

        # Inclusive range of exactly $range_size records.  Without the
        # "- 1" every chunk held one record too many and neighbouring
        # chunks overlapped by $overlap + 1 records.
        my $to = $from + $range_size - 1;

        warn "# submit_form results $from - $to\n";
        save_mech;

        $mech->submit_form(
            form_name => 'output_form',
            fields => {
                'value(record_select_type)' => 'range',
                markFrom => $from,
                markTo => $to,

                mark_from => $from,
                mark_to => $to,
                mark_id => 'WOS',

                fields_selection => $fields,
                filters => "$fields CITREF",
                fullrec_fields_option => 'CITREF',

                save_options => 'fieldtagged',
                format => 'saveToFile',
            },
            button => 'saveToFile',
        );
        save_mech;

        if ( $mech->content =~ m{invalid API call} ) {
            # Undo the failed submit so the caller is back on the page
            # it started from.
            $mech->back;
            last;
        }

        if ( $mech->content !~ m{Please wait while your request is processed} ) {
            warn "WARNING: expecting processing request";
        }

        my $path = "/tmp/isi.$q.$from-$to";
        $path .= '.' . $desc if $desc;

        warn "save $from - $to into $path\n";
        $mech->submit_form(
            form_name => 'etsForm',
        );
        save_mech $path;

        $from += $range_size - $overlap;

        # Two submits were made for this chunk, so step back twice.
        $mech->back;
        $mech->back;
        save_mech;
    }
}
150
151
# From the current page follow the citation report link and then the
# link to all citing articles, saving each page reached.
sub citations {
    my @hops = (
        [ "# citation report",    qr/search_mode=CitationReport/ ],
        [ "view citing articles", qr/search_mode=TotalCitingArticles/ ],
    );

    foreach my $hop (@hops) {
        my ( $message, $url_re ) = @$hop;
        warn $message;
        $mech->follow_link( url_regex => $url_re );
        save_mech();
    }
}
161
# Read the number of records per publication year for the current result
# list and fill @ranges with lists of years to query separately.
#
# With $cites_by_year set, every year becomes its own range.  Otherwise
# consecutive years are grouped; a new group is started whenever adding
# the next year would push the group above $max_cites records.  When all
# years fit into one group, @ranges is left empty, so no year restriction
# is needed.
#
# Returns a hash reference year => record count, or nothing when the
# refine link can't be found or rewritten.
sub years {
    my $years_link = $mech->find_link( url_regex => qr/ra_name=/ );
    if ( ! $years_link ) {
        warn "W: can't find ra_name link\n";
        return;
    }
    my $years_url = $years_link->url_abs;
    warn "## $years_url";
    if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
        warn "W: no ra_name in $years_url\n";
        return;
    }
    warn "# refine years (hidden by javascript)";
    # resulting URL looks like
    # http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=...&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore
    $mech->get( $years_url );
    save_mech;

    # Collect "YYYY (count)" labels; each match is removed from the copy
    # of the page so the loop advances.
    my $html  = $mech->content;
    my $years = {};
    while ( $html =~ s{<label.+?PublicationYear.+?>(\d{4})\s\(([\d,]+)\)</label>}{} ) {
        my ( $year, $count ) = ( $1, $2 );
        $count =~ s{,}{}g;    # thousands separators
        $years->{$year} = $count;
    }
    warn "# years ", dump( $years );
    $mech->back;

    my @y = sort { $a <=> $b } keys %$years;

    @ranges = ();

    if ( $cites_by_year ) {
        push @ranges, [ $_ ] foreach @y;
        warn "# cites_by_year ranges ", dump( @ranges );
        return $years;
    }

    my $cites_range = [];
    my $size        = 0;

    foreach my $y ( @y ) {
        # Never close an empty group: a single year is kept on its own
        # even when it is above the limit.
        if ( @$cites_range && $size + $years->{$y} > $max_cites ) {
            push @ranges, $cites_range;
            warn "# cites_range $size years ", dump( $cites_range ), $/;

            $cites_range = [];
            $size = 0;
        }
        $size += $years->{$y};
        push @$cites_range, $y;
    }

    if ( @$cites_range ) {
        push @ranges, $cites_range;
        warn "# cites_range $size years ", dump( $cites_range ), " FINAL\n";
    }

    warn '# ranges ', dump( @ranges );

    # A single range covers everything, so just take all.  The former
    # test "$#ranges == 1" matched TWO ranges and dropped the split in
    # exactly the case that needs it.
    @ranges = () if @ranges == 1;

    return $years;
}
227
228
# Number of the summary page most recently requested by next_page.
our $page = 1;

# Request the following page of the result summary through the
# 'summary_navigation' form and save it.  Returns true when the form on
# the returned page reports the requested page number, false otherwise.
sub next_page {
    $page += 1;
    warn "next_page $page\n";

    $mech->submit_form(
        form_name => 'summary_navigation',
        fields    => { 'page' => $page },
    );
    save_mech();

    # Select the navigation form of the new page to read its page field.
    $mech->form_name('summary_navigation');
    my $is_next_page = $mech->value('page') == $page;
    if ( !$is_next_page ) {
        warn "no next_page";
    }
    return $is_next_page;
}
248
# Plain search results.
if ($results) {
    search();
    years();
    get_results();
}

# Articles citing the search results, optionally split by year ranges.
if ($citations) {

    citations();
    years() unless @ranges;

    do {
        my $part = '';
        if (@ranges) {
            # Name the files after the first year of the range; search()
            # removes that range from @ranges.
            $part = $ranges[0][0] . '.';
            search();
            citations();
        }
        get_results( $part . 'citing' );
    } while (@ranges);

}
272
273
274
# Cited reference search: only for "CA=" queries.  The value after "CA="
# is submitted to the cited reference form and every result page is saved
# as "/tmp/isi.$q.citedref.PAGE".
if ( $q =~ m{CA=(.+)} && $cited_reference ) {

    my $CA = $1;

    warn "# citated reference search";
    $mech->follow_link( url_regex => qr/CitedReferenceSearch/ );
    save_mech;

    $mech->submit_form(
        form_name => 'WOS_CitedReferenceSearch_input_form',
        fields => {
            'value(input1)' => $CA,
        },
    );

    # Local page counter used for file names; next_page() keeps its own.
    my $page = 1;

    # Plain declaration instead of "my ... if ...", whose behavior is
    # undefined when the condition is false.
    my $records;
    if ( $mech->content =~ m/(\d+)\s+records/ ) {
        $records = $1;
        warn "# found $records records\n";
    }
    else {
        warn "W: can't find number of records\n";
    }

    my $last_span = 'fake';

    while (1) {
        save_mech "/tmp/isi.$q.citedref.$page";

        last unless next_page();
        $page++;

        if ( $mech->content =~ m/(\d+\s*-\s*(\d+))/ ) {
            my ( $span, $span_end ) = ( $1, $2 );
            warn "span: $span\n";

            # Spans are strings like "11 - 20", so compare them as
            # strings; the same span twice means the page did not
            # advance.
            last if $span eq $last_span;

            if ( defined $records && $span_end == $records ) {
                # This is the last page: save it before leaving,
                # otherwise it never gets its citedref file.
                save_mech "/tmp/isi.$q.citedref.$page";
                last;
            }

            $last_span = $span;
        } elsif ( $page > 5 ) {
            warn "ARTIFICALLY LIMITED TO 5 PAGES WITHOUT VALID SPAN!";
            last;
        }

    }

}
315
# Citing articles: only for "CA=" queries.  For every "CitingArticles.do"
# link on every page of the search summary, the linked list of citing
# articles is exported with get_results.
if ( $q =~ m{CA=(.+)} && $citing_articles ) {

    search;

    # The search above produced a fresh result list, so paging has to
    # start over; next_page() may already have advanced the global
    # counter during the cited reference search.
    $page = 1;

    my $orig_q = $q;
    my $nr = 0;

    do {

        foreach my $link ( $mech->find_all_links( url_regex => qr/CitingArticles\.do/ ) ) {
            $nr++;
            warn "link $nr\n";

            # Absolute URL: it stays valid whatever page the back()
            # calls below leave the browser on.
            $mech->get( $link->url_abs );
            save_mech;

            # get_results builds its file names from the global $q.
            $q = $orig_q . '.citing_article.' . $nr;
            get_results;

            # NOTE(review): get_results appears to return on the page it
            # started from, so two back() calls may step past the
            # summary page -- confirm against a live session.
            $mech->back;
            $mech->back;

            #last if $nr > 3; # FIXME only for development
        }

    } while next_page;

    $q = $orig_q;
}

warn "OVER\n";