git.rot13.org Git - webpac/blob - feeds/sciencedirect.pl

   1 #!/usr/bin/perl -w
   2
   3 # This script will fatch list of articles on which you have access
   4 # (using IP authorisation) from ScienceDirect
   5
   6 use LWP::UserAgent;
   7 use HTML::TreeBuilder;
   8 use strict;
   9
  10 my $debug=1;
  11
  12 my $base_url = 'http://www.sciencedirect.com';
  13 my $url = $base_url . '/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
  14
  15 $debug++ if (lc($ARGV[0]) eq "-d");
  16
  17 my $ua = new LWP::UserAgent;
  18 $ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
  19 $ua->timeout(60);
  20 #$ua->env_proxy();
  21 #$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');
  22
  23 print STDERR "getting '$url'...\n" if ($debug);
  24 my $req = HTTP::Request->new(GET => $url);
  25
  26 my @out;
  27
  28 my $res = $ua->request($req);
  29 if ($res->is_success) {
  30         print STDERR "parsing html...\n" if ($debug);
  31         my $tree = HTML::TreeBuilder->new;
  32 #       $tree->parse_file("list.html");   # !
  33         $tree->parse($res->content);
  34
  35         foreach my $tr ($tree->look_down('_tag', 'tr')) {
  36                 my $link;
  37                 if ($link = $tr->look_down('_tag','a')) {
  38                         if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) {
  39                                 print "0: ",$link->as_text."\n";
  40                                 print "7: ",$base_url.$link->attr('href')."\n";
  41                                 print "\n";
  42                         }
  43                 }
  44         }
  45
  46         $tree->delete; # clear memory!
  47
  48 } else {
  49     warn "can't fetch web page from '$url'";
  50 }