fixed parsing of ScienceDirect HTML with more than one <a href=> per <tr>
[webpac] / feeds / sciencedirect2.pl
#!/usr/bin/perl -w

# This script will fetch the list of journals to which you have access
# (using IP authorisation) from ScienceDirect.
#
# This version requires CSV dumps from ScienceDirect for holdings data
# and categories, but can output much more data about each record.

use LWP::UserAgent;
use HTML::TreeBuilder;
require Text::CSV;
use Text::Unaccent;
use strict;

my $debug=1;

my $file;

# uncomment the following line if you want to use a file instead of an http connection
#$file="list.html";

# configure ScienceDirect CSV files location
my $csv_dir="/data/isis_data/sciencedirect";
my $j_holdings="sd_JournalHoldingsRpt.txt";
my $j_category="sd_Journal_Category.txt";

# URL to list of subscribed journals
my $url = 'http://www.sciencedirect.com/science?_ob=JournalListURL&_type=subscribed&_stype=title&subjColl=all&_auth=y&_update=y&_frameSeg=M&_title=all&_acct=C000050661&_version=1&_urlVersion=0&_userid=1034703&md5=6d4b6e263318a1d7d2a3b523d861f920';
my $html_codepage="iso-8859-1";

my $csv = Text::CSV->new();
my $journal;
my $c_wo_h = 0;		# categories without a holdings record
my $c_nr = 0;		# number of categories assigned

my $j_basic = 0;
my $j_detailed = 0;

print STDERR "unrolling $j_holdings\n";

sub nuc {
	# normalize a string for use as a hash key: strip accents,
	# collapse non-word characters to single spaces and upper-case it
	my $s=shift @_ || return "";
	$s=unac_string($html_codepage,$s);
	$s=~s/[^\w]/ /g;
	$s=~s/  +/ /g;
	return uc($s);
}

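# A hypothetical example (not taken from the feed) of what nuc() produces:
# nuc("Économie & Statistique") strips the accent via unac_string(), turns
# the "&" into whitespace and returns "ECONOMIE STATISTIQUE", so the same
# title should match whether it comes from the CSV dump or the HTML page.
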
open(H,"$csv_dir/$j_holdings") || die "can't open $csv_dir/$j_holdings: $!";
my $line = <H>;		# skip header line
while(<H>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input;
	my @data = $csv->fields;
	my $key = nuc($data[0]);
	push @data,"";			# for categories later...
	$journal->{$key} = \@data;
}
close(H);

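# At this point $journal maps a normalized title to its holdings fields plus
# the empty placeholder, e.g. (hypothetical entry, layout as in the dump):
#   $journal->{"JOURNAL OF HYPOTHETICAL RESEARCH"} =
#	[ "Journal of Hypothetical Research", ..., "" ];
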
print STDERR "unrolling $j_category\n";

open(C,"$csv_dir/$j_category") || die "can't open $csv_dir/$j_category: $!";
$line = <C>;		# skip header line
while(<C>) {
	chomp;
	$csv->parse($_) || warn "can't parse '$_': ".$csv->error_input;
	my @data = $csv->fields;
	my $key = nuc($data[1]);
	if (! $journal->{$key}) {
		$c_wo_h++;
		next;
	}

	foreach my $i (4, 6, 8, 10) {
		push @{$journal->{$key}},$data[$i] || "";
		if ($data[$i]) {
			$c_nr++;
		}
	}
}
close(C);

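# After the merge a matched record carries up to four more fields at its
# tail; columns 4/6/8/10 of the category dump are assumed to hold the
# category labels. A hypothetical merged record:
#   [ "Journal of Hypothetical Research", ..., "", "Chemistry", "Physics", "", "" ];
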
print STDERR "$c_nr categories assigned, $c_wo_h categories without holdings\n";

$debug++ if ($ARGV[0] && lc($ARGV[0]) eq "-d");


my $res;
if (! $file) {
	my $ua = LWP::UserAgent->new;
	$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
	$ua->timeout(60);
	#$ua->env_proxy();
	#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');

	print STDERR "getting '$url'...\n" if ($debug);
	my $req = HTTP::Request->new(GET => $url);

	$res = $ua->request($req);
} elsif (! -e $file) {
	die "can't find feed file '$file'";
}

if ($file || $res->is_success) {
	print STDERR "parsing html...\n" if ($debug);
	my $tree = HTML::TreeBuilder->new;
	if ($file) {
		$tree->parse_file($file);
	} else {
		$tree->parse($res->content);
	}

	foreach my $tr ($tree->look_down('_tag', 'tr')) {
		# a single <tr> can hold more than one journal link,
		# so examine every <a> inside it
		foreach my $link ($tr->look_down('_tag','a')) {
			if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) {
				my $j=nuc($link->as_text);
				if ($journal->{$j}) {
					my $i=0;
					foreach my $line (@{$journal->{$j}}) {
						print $i++,": $line\n";
					}
					$j_detailed++;
				} else {
					print "0: ",$link->as_text."\n";
					print "7: http://www.sciencedirect.com",$link->attr('href')."\n";
					$j_basic++;
					print STDERR "can't find details for $j\n" if ($debug);
				}

				print "\n";
			}
		}
	}

	$tree->delete; # clear memory!

} else {
	warn "can't fetch web page from '$url': ".$res->status_line;
}

print STDERR "Processed ",($j_basic+$j_detailed)," journals, $j_basic with basic data and $j_detailed with detailed data\n";

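# A sketch of the resulting STDOUT stream (hypothetical journal; numbered
# "field: value" lines per record, records separated by blank lines, which
# the webpac feed import presumably consumes). A basic record, where only
# the title (field 0) and URL (field 7) are known, looks like:
#
#   0: Journal of Hypothetical Research
#   7: http://www.sciencedirect.com/science?_ob=JournalURL&...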