Fixed parsing of ScienceDirect HTML that has more than one <a href=> per <tr>

author Dobrica Pavlinusic <dpavlin@rot13.org>

Thu, 10 Jun 2004 22:05:38 +0000 (22:05 +0000)

committer Dobrica Pavlinusic <dpavlin@rot13.org>

Thu, 10 Jun 2004 22:05:38 +0000 (22:05 +0000)
author Dobrica Pavlinusic <dpavlin@rot13.org>
Thu, 10 Jun 2004 22:05:38 +0000 (22:05 +0000)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Thu, 10 Jun 2004 22:05:38 +0000 (22:05 +0000)
diff --git a/feeds/sciencedirect2.pl b/feeds/sciencedirect2.pl

index 430defa..0c8ba91 100755 (executable)
--- a/feeds/sciencedirect2.pl
+++ b/feeds/sciencedirect2.pl
@@ -14,6 +14,11 @@ use strict;
  
  my $debug=1;
  
+my $file;
+
+# uncomment following line if you want to use file instead of http connection
+#$file="list.html";
+
  # configure ScienceDirect CSV files location
  my $csv_dir="/data/isis_data/sciencedirect";
  my $j_holdings="sd_JournalHoldingsRpt.txt";
@@ -81,27 +86,35 @@ print STDERR "$c_nr categories assigned, $c_wo_h categories with holdings\n";
  
  $debug++ if (lc($ARGV[0]) eq "-d");
  
-my $ua = new LWP::UserAgent;
-$ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
-$ua->timeout(60);
-#$ua->env_proxy();
-#$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');
  
-print STDERR "getting '$url'...\n" if ($debug);
-my $req = HTTP::Request->new(GET => $url);
+my $res;
+if (! $file) {
+       my $ua = new LWP::UserAgent;
+       $ua->agent("Mjesec educational harvester -- contact mglavica\@ffzg.hr 0.0");
+       $ua->timeout(60);
+       #$ua->env_proxy();
+       #$ua->proxy(['http', 'ftp'], 'http://proxy.carnet.hr:8001/');
  
-my @out;
+       print STDERR "getting '$url'...\n" if ($debug);
+       my $req = HTTP::Request->new(GET => $url);
  
-my $res = $ua->request($req);
-if ($res->is_success) {
+       $res = $ua->request($req);
+} elsif (! -e $file) {
+       die "can't find feed file '$file'";
+}
+
+if ($file || $res->is_success) {
         print STDERR "parsing html...\n" if ($debug);
         my $tree = HTML::TreeBuilder->new;
-#      $tree->parse_file("list.html");   # !
-       $tree->parse($res->content);
+       if ($file) {
+               $tree->parse_file("list.html");
+       } else {
+               $tree->parse($res->content);
+       }
  
         foreach my $tr ($tree->look_down('_tag', 'tr')) {
                 my $link;
-               if ($link = $tr->look_down('_tag','a')) {
+               foreach my $link ($tr->look_down('_tag','a')) {
                         if ($link->attr('href') =~ m{/science\?_ob=JournalURL}) {
                                 my $j=nuc($link->as_text);
                                 if ($journal->{$j}) {
author	Dobrica Pavlinusic <dpavlin@rot13.org>
	Thu, 10 Jun 2004 22:05:38 +0000 (22:05 +0000)
committer	Dobrica Pavlinusic <dpavlin@rot13.org>
	Thu, 10 Jun 2004 22:05:38 +0000 (22:05 +0000)