<config> tags (which use values from all2xml.conf) are now properly handled
[webpac] / feeds / sciencedirect.pl
#!/usr/bin/perl -w

# Fetch the ScienceDirect "subscribed journals" list page (access is granted
# by IP authorisation) and print every journal link found on it to STDOUT.
#
# Output format, one record per journal, records separated by a blank line:
#
#   0: <link text>
#   7: <absolute URL of the journal>
#
# Usage: sciencedirect.pl [-d]
#
# Progress messages go to STDERR. If the page cannot be fetched, a warning
# that includes the HTTP status line is issued and nothing is printed.

use strict;
use warnings;

use LWP::UserAgent;
use HTML::TreeBuilder;

# NOTE: debugging is on by default, so the progress messages below are always
# printed; -d only raises the level further and currently has no extra effect.
my $debug = 1;

my $base_url = 'http://www.sciencedirect.com';
my $url = $base_url . '/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';

# Check that an argument exists first: lc(undef) would raise an
# "uninitialized value" warning when the script is run without arguments.
$debug++ if (@ARGV && lc($ARGV[0]) eq '-d');

my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);
my $res = $ua->request($req);

if ($res->is_success) {
    print STDERR "parsing html...\n" if ($debug);
    my $tree = HTML::TreeBuilder->new;
#   $tree->parse_file("list.html");   # !
    $tree->parse($res->content);
    # parse() only feeds a chunk to the parser; eof() signals the end of the
    # document so that the tree is completed before it is searched.
    $tree->eof;

    # Only the first <a> inside each table row is considered.
    foreach my $tr ($tree->look_down('_tag', 'tr')) {
        my $link = $tr->look_down('_tag', 'a') or next;

        # Anchors without an href (e.g. <a name="...">) are skipped instead
        # of being matched as undef.
        my $href = $link->attr('href');
        next unless defined $href && $href =~ m{/science\?_ob=JournalURL};

        print "0: ", $link->as_text, "\n";
        print "7: ", $base_url . $href, "\n";
        print "\n";
    }

    $tree->delete; # clear memory!
}
else {
    # Include the HTTP status line so the reason for the failure is visible.
    warn "can't fetch web page from '$url': " . $res->status_line;
}