7 use Data::Dump qw(dump);
# Search configuration: default query plus flags that tune the export steps.
11 # Advanced search syntax:
12 # http://images.isiknowledge.com/WOK46/help/WOS/h_advanced_examples.html
# Default query, used when no command-line arguments are supplied.
14 our $q = 'AD=Croatia';
17 #$q = 'AD=(croat* OR hrvat*)';
# Number of records shared by consecutive export ranges; the export loop
# advances by the range size minus this value.
20 my $overlap = 3; # between previous and this range
# Flags checked (together with a CA= term in the query) before the cited
# reference and citing article sections run.
24 my $cited_reference = 1; # html tables
25 my $citing_articles = 1; # as many files as cited articles
# When true, every publication year becomes its own range instead of
# grouping years up to $max_cites.
27 my $cites_by_year = 0;
# Upper bound on the summed per-year counts placed into one range.
29 my $max_cites = 5000; # ISI limit to get cites
# Command-line arguments, joined with single spaces and passed through
# unac_string, replace the default query.
# NOTE(review): unac_string is presumably Text::Unaccent's accent stripper —
# its import is not visible here; confirm.
31 $q = unac_string( 'utf-8', join(' ', @ARGV) ) if @ARGV;
# Shared WWW::Mechanize browser object used by every step of the script.
# NOTE(review): the remaining constructor arguments and the closing paren
# are on lines not visible in this excerpt.
35 our $mech = WWW::Mechanize->new(
36 autocheck => 0, # it dies in reference download with it!
# Directory for the numbered per-step page dumps; created on first run.
43 my $dir = '/tmp/isi/';
44 #rmtree $dir if -e $dir;
# NOTE(review): the mkdir result is not checked; a failure only surfaces
# later, when a file is written into $dir.
45 mkdir $dir unless -d $dir;
# Saves the current $mech response to disk and writes a companion dump.
# NOTE(review): this looks like the body of save_mech (called elsewhere in
# the file); the sub header and the declarations of $step and $path are on
# lines not visible here.
#
# Files written:
#   <path>.html or <path>.txt    - response content (extension by content type)
#   <dir>/<step>.dump.txt        - output of $mech->dump_all
50 my $base_path = sprintf('%s/%04d', $dir,$step);
# Use the public ct accessor (as the warn below does) instead of reaching
# into the object's hash.
52 $path .= $mech->ct =~ m{html}i ? '.html' : '.txt';
53 $mech->save_content( $path );
54 warn "# [$step] $path ", -s $path, " ", $mech->ct, "\n";
# Fail loudly if the dump file cannot be created; an unchecked open would
# hand dump_all an unopened handle and silently lose the dump.
55 open(my $dump, '>', "$base_path.dump.txt") or die "can't open $base_path.dump.txt: $!";
56 $mech->dump_all($dump);
# Open Web of Science, switch to the advanced search form and submit the query.
60 $mech->get( 'http://www.webofknowledge.com/?DestApp=WOS' );
# NOTE(review): "serach" in the log message below is a typo for "search".
64 warn "# advanced serach";
65 $mech->follow_link( url_regex => qr/AdvancedSearch/ );
68 warn "# cookie_jar ", dump $mech->cookie_jar;
# Consumes the next entry of @ranges (an array ref of years) and appends it
# to the query as an OR-ed list of PY=<year> terms.
# NOTE(review): whether this line is guarded by a check on @ranges is not
# visible here.
73 $q_this .= ' AND (' . join(' OR ', map { "PY=$_" } @{ shift @ranges } ) . ')';
76 warn "# submit_form search: $q_this\n";
# Search form field that carries the query text.
79 'value(input1)' => $q_this,
# Follow the link to the result summary page.
85 $mech->follow_link( url_regex => qr/summary/ );
# Export one range of result records ($from .. $to) as a field-tagged file.
# NOTE(review): the enclosing loop and the declarations of $from,
# $range_size and $desc are on lines not visible here.
95 my $to = $from + $range_size;
97 warn "# submit_form results $from - $to\n";
# Fields of the 'output_form' submission: select records by range, request
# the listed fields and ask for a field-tagged file.
101 form_name => 'output_form',
103 'value(record_select_type)' => 'range',
111 fields_selection => 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS',
# Same list as fields_selection plus CITREF.
112 filters => 'ACCESSION_NUM FUNDING SUBJECT_CATEGORY JCR_CATEGORY LANG IDS PAGEC SABBR CITREFC ISSN PUBINFO KEYWORDS CITTIMES ADDRS CONFERENCE_SPONSORS DOCTYPE ABSTRACT CONFERENCE_INFO SOURCE TITLE AUTHORS CITREF',
113 fullrec_fields_option => 'CITREF',
115 save_options => 'fieldtagged',
116 format => 'saveToFile',
119 button => 'saveToFile',
# Check the response for the service's error text.
124 if ( $mech->content =~ m{invalid API call} ) {
# Otherwise an interim "please wait" page is expected; anything else is
# only reported as a warning.
129 if ( $mech->content !~ m{Please wait while your request is processed} ) {
130 warn "WARNING: expecting processing request";
# Output path is built from the raw query text and the record range.
# NOTE(review): $q can come from @ARGV and is interpolated unsanitised into
# the file name; a query containing '/' would change the target directory.
134 my $path = "/tmp/isi.$q.$from-$to";
135 $path .= '.' . $desc if $desc;
137 warn "save $from - $to into $path\n";
# Second form submission, part of the save step.
139 form_name => 'etsForm',
# Advance to the next range, re-reading $overlap records from this one.
143 $from += $range_size - $overlap;
# Navigate from the current page to the citation report, then to the list
# of all citing articles.
153 warn "# citation report";
154 $mech->follow_link( url_regex => qr/search_mode=CitationReport/ );
157 warn "view citing articles";
158 $mech->follow_link( url_regex => qr/search_mode=TotalCitingArticles/ );
# Collect the number of citing records per publication year into $years
# (year => count), using the "refine by publication year" page.
# NOTE(review): the declaration of $years and the code following each
# warning below are on lines not visible here.
163 my $years_url = $mech->find_link( url_regex => qr/ra_name=/ );
164 if ( ! $years_url ) {
165 warn "W: can't find ra_name link\n";
# From here on $years_url holds the absolute URL rather than the link object.
168 $years_url = $years_url->url_abs;
169 warn "## $years_url";
# Rewrite the ra_name parameter of the found link so it requests the
# PublicationYear refinement; warn if the URL has no such parameter.
170 if ( $years_url !~ s{ra_name=\w+}{ra_name=PublicationYear} ) {
171 warn "W: no ra_name in $years_url\n";
174 warn "# refine years (hidden by javascript)";
175 # warn "http://apps.isiknowledge.com/RAMore.do?product=WOS&search_mode=TotalCitingArticles&SID=T1o6bChdN9PGP1LN1Nh&qid=3&ra_mode=more&ra_name=PublicationYear&db_id=WOS&viewType=raMore\n$years_url\n";
176 $mech->get( $years_url );
179 my $html = $mech->content;
# Each pass removes one matching <label> from $html and captures a
# four-digit year and its count; the loop ends when no label is left.
181 while ( $html =~ s{<label.+?PublicationYear.+?>(\d{4})\s\(([\d,]+)\)</label>}{} ) {
182 my ( $year, $count ) = ( $1, $2 );
# NOTE(review): the count pattern accepts commas and these counts are added
# numerically when ranges are built — confirm the commas are stripped on
# the line missing from this excerpt.
184 $years->{$year} = $count;
186 warn "# years ",dump $years;
# Group publication years into @ranges so that the summed count of each
# range stays within $max_cites.
# The default string sort is sufficient here: every key is a four-digit year.
189 my @y = sort keys %$years;
# Per-year mode: every year becomes a single-element range.
193 if ( $cites_by_year ) {
194 push @ranges, [ $_ ] foreach @y;
195 warn "# cites_by_year ranges ", dump @ranges;
# Grouping mode: start with one year, then append years while the running
# total fits.
# NOTE(review): where $y comes from and the declaration of $cites_range are
# on lines not visible here.
200 my $size = $years->{$y};
203 $cites_range = [$y] if $y;
205 foreach my $y ( @y ) {
# Adding this year would exceed the limit: close the current range.
# NOTE(review): the reset of $size and $cites_range presumably happens on
# lines missing from this excerpt.
206 if ( $size + $years->{$y} > $max_cites ) {
207 push @ranges, $cites_range;
208 warn "# cites_range $size years ",dump( $cites_range ),$/;
213 $size += $years->{$y};
214 push @$cites_range, $y;
# Flush the last, partially filled range.
217 if ( $cites_range ) {
218 push @ranges, $cites_range;
219 warn "# cites_range $size years ",dump( $cites_range ), " FINAL\n"
222 warn '# ranges ', dump @ranges;
# NOTE(review): $#ranges == 1 is true when @ranges holds TWO ranges; if
# "just take all" is meant for a single range, the test should be
# $#ranges == 0 (or @ranges == 1). Confirm against the callers that check
# for an empty @ranges.
223 @ranges = () if $#ranges == 1; # just take all
# Move to result page $page and report whether the move succeeded.
# NOTE(review): this looks like the body of next_page (called elsewhere in
# the file); the sub header, the update of $page and the rest of the form
# submission are on lines not visible here.
# Returns a true value when the 'page' field of the summary_navigation form
# equals $page after the submission, a false value otherwise.
232 warn "next_page $page\n";
235 form_name => 'summary_navigation',
# Re-select the navigation form on the new page and read back its page number.
243 $mech->form_name( 'summary_navigation' );
244 my $is_next_page = $mech->value('page') == $page;
245 warn "no next_page" unless $is_next_page;
246 return $is_next_page;
# Call years when @ranges is currently empty.
# NOTE(review): years is presumably the sub that fills @ranges from the
# per-year counts — its header is not visible here.
258 years unless @ranges;
# Append the first year of the next pending range, followed by a dot, to $part.
# NOTE(review): the declaration of $part and any guard on @ranges are on
# lines not visible here.
263 $part .= $ranges[0]->[0] . '.';
# Cited reference search: runs only when the query contains a CA= term and
# $cited_reference is enabled. Each result page is saved as
# /tmp/isi.<query>.citedref.<page> until the last page is reached.
# NOTE(review): the declaration of $CA, the rest of the form submission and
# the paging loop header are on lines not visible here.
275 if ( $q =~ m{CA=(.+)} && $cited_reference ) {
# NOTE(review): "citated" in the log message below is a typo for "cited".
279 warn "# citated reference search";
280 $mech->follow_link( url_regex => qr/CitedReferenceSearch/ );
285 form_name => 'WOS_CitedReferenceSearch_input_form',
287 'value(input1)' => $CA,
# Capture the record count with a list-context match. The former
# "my $records = $1 if ..." form is a conditional "my", whose behaviour
# perlsyn documents as undefined. $records stays undef when the page
# carries no "<n> records" text.
292 my ($records) = $mech->content =~ m/(\d+)\s+records/;
293 warn "# found $records records\n";
# Sentinel that can never equal a real "<from> - <to>" span.
294 my $last_span = 'fake';
297 save_mech "/tmp/isi.$q.citedref.$page";
299 last unless next_page();
# $1 is the whole "<from> - <to>" span text, $2 its upper bound.
302 if ( $mech->content =~ m/(\d+\s*-\s*(\d+))/ ) {
# Stop when the span ends at the total record count ...
304 last if $2 == $records;
# ... or when the span did not change since the previous page. Spans are
# strings, so compare with eq; == would compare only their leading numbers
# and warn on the non-numeric sentinel.
305 last if $1 eq $last_span;
# Safety net for pages without a recognisable span.
307 } elsif ( $page > 5 ) {
308 warn "ARTIFICALLY LIMITED TO 5 PAGES WITHOUT VALID SPAN!";
# Citing articles: runs only when the query contains a CA= term and
# $citing_articles is enabled. Every CitingArticles.do link on the current
# page is visited in turn.
# NOTE(review): the declarations of $orig_q and $nr and the rest of the
# loop body are on lines not visible here.
316 if ( $q =~ m{CA=(.+)} && $citing_articles ) {
325 foreach my $link ( $mech->find_all_links( url_regex => qr/CitingArticles.do/ ) ) {
328 $mech->get( $link->url );
# $q is rewritten per link so that file names derived from $q carry the
# citing-article number.
330 $q = $orig_q . '.citing_article.' . $nr;
335 #last if $nr > 3; # FIXME only for development