feeds/sciencedirect2.pl
#!/usr/bin/perl -w

# This script will fetch the list of journals to which you have access
# (using IP authorisation) from ScienceDirect
#
# This version requires CSV dumps from ScienceDirect for holdings data
# and categories, but can output much more data about each record

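# A note on the dump layout, as assumed by the parsing code below:
# in the holdings report the journal title is the first column; in the
# category report the title is the second column, and columns 4, 6, 8
# and 10 carry up to four assigned categories.
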
use LWP::UserAgent;
use HTML::TreeBuilder;
use Text::CSV;
use Text::Unaccent;
use strict;

my $debug=1;

# location of the ScienceDirect CSV dump files
my $csv_dir="/data/isis_data/sciencedirect";
my $j_holdings="sd_JournalHoldingsRpt.txt";
my $j_category="sd_Journal_Category.txt";

# URL of the list of subscribed journals
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
my $html_codepage="iso-8859-1";

my $csv = Text::CSV->new();
my $journal;		# holdings data, keyed on normalized journal title
my $c_wo_h = 0;		# categories without a holding record
my $c_nr = 0;		# number of categories assigned

my $j_basic = 0;	# journals with basic data only
my $j_detailed = 0;	# journals with full CSV details

print STDERR "unrolling $j_holdings\n";

sub nuc {
	# normalize a title for use as a hash key: strip accents,
	# turn non-word characters into spaces, collapse repeated
	# spaces and upper-case the result
	my $s=shift @_ || return "";
	$s=unac_string($html_codepage,$s);
	$s=~s/[^\w]/ /g;
	$s=~s/  +/ /g;
	return uc($s);
}

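# for example (an illustrative title, not one from the dumps):
#
#   nuc("Théorie & Applications")   returns   "THEORIE APPLICATIONS"
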
open(H, '<', "$csv_dir/$j_holdings") || die "can't open $csv_dir/$j_holdings: $!";
my $line = <H>;		# skip header line
while(<H>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input;
	my @data = $csv->fields;
	my $key = nuc($data[0]);	# journal title is in the first column
	push @data,"";			# for categories later...
	$journal->{$key} = \@data;
}
close(H);

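# at this point $journal holds one raw CSV row per subscribed title,
# keyed by the normalized title, e.g. (hypothetical values):
#
#   $journal->{'JOURNAL OF ALGEBRA'} = [ 'Journal of Algebra', ..., '' ]
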
print STDERR "unrolling $j_category\n";

open(C, '<', "$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!";
$line = <C>;		# skip header line
while(<C>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input;
	my @data = $csv->fields;
	my $key = nuc($data[1]);	# journal title is in the second column
	if (! $journal->{$key}) {
		# category row for a journal without a holdings record
		$c_wo_h++;
		next;
	}

	# append the four category columns to the journal's record
	foreach my $i (4, 6, 8, 10) {
		push @{$journal->{$key}},$data[$i] || "";
		if ($data[$i]) {
			$c_nr++;
		}
	}
}
close(C);

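# a matched record now ends with the empty placeholder followed by four
# category fields (empty strings where no category was assigned)
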
print STDERR "$c_nr categories assigned, $c_wo_h categories without holdings\n";

$debug++ if ($ARGV[0] && lc($ARGV[0]) eq "-d");

my $ua = LWP::UserAgent->new;
$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
$ua->timeout(60);
# uncomment one of these if you need to go through a proxy:
#$ua->env_proxy();
#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

print STDERR "getting '$url'...\n" if ($debug);
my $req = HTTP::Request->new(GET => $url);

my $res = $ua->request($req);
if ($res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
#	$tree->parse_file("list.html");		# for off-line testing
	$tree->parse($res->content);

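	# the subscribed-journals page is an HTML table: each journal sits
	# in its own <tr>, and its <a> href points at _ob=JournalURL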
	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		my $link;
		if ($link = $tr->look_down('_tag','a')) {
			if (($link->attr('href') || '') =~ m{/science\?_ob=JournalURL}) {
				my $j=nuc($link->as_text);
				if ($journal->{$j}) {
					# full CSV record: print every field, numbered
					my $i=0;
					foreach my $line (@{$journal->{$j}}) {
						print $i++,": $line\n";
					}
					$j_detailed++;
				} else {
					# no CSV data: emit just title (field 0) and URL (field 7)
					print "0: ",$link->as_text,"\n";
					print "7: http://www.sciencedirect.com",$link->attr('href'),"\n";
					$j_basic++;
					print STDERR "can't find details for $j\n" if ($debug);
				}

				print "\n";
			}
		}
	}

	$tree->delete;	# clear memory!

} else {
	warn "can't fetch web page from '$url': ".$res->status_line;
}

print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed detailed\n";
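
# Typical invocation (an assumed example): records go to STDOUT and the
# progress messages above go to STDERR; pass -d as the first argument
# for extra debugging output:
#
#   ./sciencedirect2.pl > sciencedirect.txt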