BookReaderIA/datanode/flipbook_search_br.php

   1 <?php
   2 /*
   3 Copyright(c)2008 Internet Archive. Software license AGPL version 3.
   4
   5 This file is part of BookReader.
   6
   7     BookReader is free software: you can redistribute it and/or modify
   8     it under the terms of the GNU Affero General Public License as published by
   9     the Free Software Foundation, either version 3 of the License, or
  10     (at your option) any later version.
  11
  12     BookReader is distributed in the hope that it will be useful,
  13     but WITHOUT ANY WARRANTY; without even the implied warranty of
  14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15     GNU Affero General Public License for more details.
  16
  17     You should have received a copy of the GNU Affero General Public License
  18     along with BookReader.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 // FIXME: TODO: Change path above to production installation location of perl
  22 // when we deploy, Brad Neuberg, bkn3@columbia.edu
  23 //
  24 // From:
  25 //   search.cgi v0.4
  26 //   by Ralf Muehlen
  27 //
  28 // slight alterations by Brad Neuberg, bkn3@columbia.edu
  29 //
  30 // ported from perl to php by tracey, Oct 2005
  31
  32 //fixxx require_once '/petabox/setup.inc';
  33 //I think this fixxx refers to the need to set DOCUMENT_ROOT below -- mang
  34
  // Select the petabox installation that matches the request path: the two
  // "/~user" prefixes are served out of that user's home directory, and
  // anything else uses the system-wide install.  DOCUMENT_ROOT is assigned
  // before setup.inc is loaded in every case.
  switch (true) {
      case strpos($_SERVER["REQUEST_URI"], "/~mang") === 0:
          // Serving out of home dir
          $_SERVER['DOCUMENT_ROOT']='/home/mang/petabox/www/sf';
          require_once '/home/mang/petabox/setup.inc';
          break;

      case strpos($_SERVER["REQUEST_URI"], "/~testflip") === 0:
          // Serving out of home dir
          $_SERVER['DOCUMENT_ROOT']='/home/testflip/petabox/www/sf';
          require_once '/home/testflip/petabox/setup.inc';
          break;

      default:
          $_SERVER['DOCUMENT_ROOT']='/petabox/www/sf';
          require_once '/petabox/setup.inc';
          break;
  }
  45
  ini_set("memory_limit","200M"); // XML can be big, esp. brittanica (100MB)

  /////// SETUP /////////////////////////////////////////////////////////////

  $debug_level = 0; // 0=least, 3=most debugging info

  $num_pre  = 3; // context words before search term
  $num_post = 9; // context words after  search term

  // defaults for testing (when no args given)
  $url ='http://ia300202.us.archive.org/0/items/englishbookbindings00davenuoft/englishbookbindings00davenuoft_djvuxml.xml';

  $term = "history";
  $format = "XML";
  $callback = false;

  /////// SETUP /////////////////////////////////////////////////////////////

  /**
   * Validate a JSONP callback name taken from the request.
   *
   * The callback is later echoed verbatim as the function name of a
   * JavaScript call, so anything other than a dotted identifier (for example
   * "br.BRSearchCallback") would let the requester inject script.
   *
   * @param mixed $raw  Raw request value.
   * @return string|false  The unchanged name when it is a dotted JavaScript
   *                       identifier, otherwise false (no callback).
   */
  function flipbook_sanitize_callback($raw)
  {
    if (!is_string($raw))
      return false;

    $identifier = '[A-Za-z_$][A-Za-z0-9_$]*';
    // The D modifier stops "$" from also matching before a trailing newline.
    if (preg_match('/^'.$identifier.'(?:\.'.$identifier.')*$/D', $raw))
      return $raw;

    return false;
  }

  // Request parameters override the defaults above.  Only plain string values
  // are accepted: an array-valued parameter (e.g. "term[]=x") would break the
  // string functions these values are passed to further down.
  //
  // NOTE(review): $url is still handed to is_readable()/file_get_contents()
  // without any restriction on which file it names -- confirm that this
  // script is only reachable by trusted callers.
  if (isset($_GET['url']) && is_string($_GET['url']))
    $url =        $_GET['url'];
  if (isset($_GET['term']) && is_string($_GET['term']))
    $term =       $_GET['term'];
  if (isset($_GET['format']) && is_string($_GET['format']))
    $format =     $_GET['format'];
  if (isset($_GET['callback']))
    $callback =   flipbook_sanitize_callback($_GET['callback']);

  //$url='http://homeserver.hq.archive.org/metafetch/thespy00cooparch_djvu.xml';
  //$url='http://homeserver.hq.archive.org/metafetch/oldchristmas00irviarch_djvu.xml';
  //$url = 'http://homeserver.hq.archive.org/metafetch/intlepisode00jamearch_djvu.xml';


  if ($format == "XML")
  {
    // This is kinda weird (confession!) but allows existing calls to "fatal()"
    // to throw an exception instead of dumping HTML to stdout/browser!
    $GLOBALS['fatal-exceptions']=1;
  }
  // Main request handler.  Loads the DjVu XML named by $url, builds a short
  // context snippet for every page that contains one of the search terms, and
  // prints the result as HTML, XML, or a JSONP call to $callback.  When the
  // 'fatal-exceptions' flag is set, fatal() throws instead of printing HTML,
  // and the catch at the bottom reports the failure via XML::resultMessage().
  try
  {
    // Per-page results, all keyed by the page's 1-based position in the DjVu
    // XML.  pageFiles records on which page each match was found;
    // pageWidths/pageHeights are sent to the client so that it can scale the
    // page images for search (Brad Neuberg, bkn3@columbia.edu).
    $pages =       array();
    $pageFiles =   array();
    $pageHeights = array();
    $pageWidths =  array();

    $time0 = microtime(true);
    $timestamp = date('Y-m-d H:i:s');
    $pid = posix_getpid();
    debug_msg("Invoked at ".$time0."=$timestamp under UID ".posix_getuid(),2);

    //////////////////////////////////////

    debug_msg("query: ".$_SERVER['QUERY_STRING'],3);

    $term = preg_replace('/[^A-Za-z0-9 ]/', ' ', $term); // legal characters
    // Split on runs of spaces and drop empty pieces.  An empty term would turn
    // into the regex "//", which matches every page and every word.
    $terms = preg_split('/ +/', $term, -1, PREG_SPLIT_NO_EMPTY);
    debug_msg("url,term,format: $url,".var_export($terms,true).",$format",3);

    if ($format == "HTML")
    {
      echo "<html><head><title>Search</title></head> <body> Searching <p>";
      $tag_pre  = '<b style="color:black;background-color:#A0FFFF">';
      $tag_post = '</b>';
    }
    else if ($format == "XML")
    {
      if (false === $callback) {
        header('Content-type: text/xml');
      } else {
        // A header line needs its field name; the bare media type was not a
        // valid header.
        header('Content-type: application/x-javascript');
      }
      // A hit closes the running <CONTEXT>, emits the <WORD>, and reopens it.
      $tag_pre  = '</CONTEXT>';
      $tag_post = '<CONTEXT>';
    }
    else
    {
      fatal("Unknown format request. ");
    }

    // Patterns that cut a snippet down to $num_pre words before the hit and
    // $num_post words after it.  The tags are quoted because the HTML closing
    // tag contains "/", which otherwise terminates the second pattern early.
    $trim_before = '|.*((\W\w*){'.$num_pre.'}'.preg_quote($tag_pre, '|').')|';
    $trim_after  = '/('.preg_quote($tag_post, '/').'(\w*\W){'.$num_post.'}).*/';

    // Ensure file is readable
    checkPrivs($url);

    // This looks like where we load the djvu.xml - $$$ and rapidly exhaust
    // memory for large books such as OED
    if (!($document = file_get_contents($url)))
      fatal("could not load $url");

    // Elapsed times are kept in milliseconds, matching the "ms" label used
    // when they are printed.
    $fetched_at = microtime(true);
    $time1 = round(($fetched_at - $time0) * 1000, 1);


    //// Pass 1 - build up page* arrays with xml fragments corresponding to matches
    $pagenumber=0;
    foreach (explode('</OBJECT>', $document) as $page)
    {
      $pagenumber++;

      if (!matches_terms($page, $terms))
        continue;

      // Ensure that we aren't matching in the end of the overall XML document
      // -- the OBJECT tag must start in the chunk we were handed.  (traceyj)
      if (strpos($page, '<OBJECT ') === false)
        continue;

      // extract the page value so that we know what page we are on,
      // Brad Neuberg, bkn3@columbia.edu
      if (!preg_match('|<PARAM name="PAGE" value="([^"]*)"\s*\/>|', $page, $match))
        fatal("page value not set on page number $pagenumber in $page!");
      $pageFile = $match[1];

      // extract the page width and height, Brad Neuberg, bkn3@columbia.edu
      if (!preg_match('/width="([^"]*)"/', $page, $match))
        fatal("page width not set!");
      $pageWidth = $match[1];

      if (!preg_match('/height="([^"]*)"/', $page, $match))
        fatal("page height not set!");
      $pageHeight = $match[1];

      $page_new='';
      foreach (explode('</WORD>',$page) as $token)
      {
        // A term can also match inside markup (a tag or attribute) in a chunk
        // that holds no <WORD>; such a chunk is treated as plain context
        // instead of producing an empty, malformed <WORD> element.
        $parts = matches_terms($token, $terms)
          ? explode('<WORD ', $token)
          : array();

        if (isset($parts[1]))
        {
          $token = " $tag_pre<WORD {$parts[1]}</WORD>$tag_post ";
        }
        else
        {
          $token = preg_replace('/<[^<]*>/','', $token); //mark-up
          // Numeric character references (non-ascii chars).  This used to be
          // a character class, which also deleted every digit, "&", "#", "+"
          // and ";" from the context.
          $token = preg_replace('/&#\d+;/', ' ', $token);
          $token = preg_replace('/\s+/', ' ', $token);   //white space
        }

        $page_new .= $token;
      }

      $page_new = preg_replace($trim_before, "$1", $page_new);
      $page_new = preg_replace($trim_after,  "$1", $page_new);

      // added to keep track of the page we are on
      // Brad Neuberg, bkn3@columbia.edu
      $pageFiles  [$pagenumber] = $pageFile;
      $pages      [$pagenumber] = $page_new;
      // added to keep track of page widths and heights
      $pageWidths [$pagenumber] = $pageWidth;
      $pageHeights[$pagenumber] = $pageHeight;
    }

    // Time spent in pass 1 only.  Subtracting the previous duration from the
    // current timestamp, as was done before, does not yield an elapsed time.
    $processed_at = microtime(true);
    $time2 = round(($processed_at - $fetched_at) * 1000, 1);

    //// Pass 2 - generate output from previously built arrays

    if ($format == "HTML")
    {
      echo "Found ".count($pages)." pages containing $tag_pre";
      print_r($terms);
      echo "$tag_post.<br>\n";
      foreach ($pages as $index => $page)
      {
        // The heading names the page; the snippet itself follows below.
        echo "<h4>Page $index:</h4>\n";
        print_r($page);
        echo "<p><br><p>\n";
      }
      $time3 = round((microtime(true) - $processed_at) * 1000, 1);
      echo $tag_pre . "Fetched document in $time1 ms.$tag_post<p>\n";
      echo $tag_pre . "Processed document in $time2 ms.$tag_post<p>\n";
      echo $tag_pre . "Printed document in $time3 ms.$tag_post<p>\n";
      echo "</body></html>\n";
    }
    else if ($format == "XML")
    {
      $xml = "";
      $xml .= '<?xml version="1.0" encoding="utf-8"?>'."\n";
      // Added to prevent Internet Explorer from adding default XML stylesheet,
      // which messes up processing, Brad Neuberg, bkn3@columbia.edu
      $xml .= '<?xml-stylesheet type="text/css" href="blank.css"?>'."\n";//fixxx
      $xml .= '<SEARCH>';

      foreach ($pages as $index => $page)
      {
        $xml .= "<PAGE file=\"{$pageFiles[$index]}\" width=\"{$pageWidths[$index]}\" height=\"{$pageHeights[$index]}\">\n";
        $xml .= "<CONTEXT>\n";
        $xml .= $page;
        $xml .= "</CONTEXT>\n";
        $xml .= "</PAGE>\n";
      }
      $xml .= "</SEARCH>\n";

      if (false === $callback) {
        // The XML contains the page numbers from the DJVU XML.  We must remap
        // them to flipbook indices since the flipbook indices are
        // monotonically increasing generated from the pages with
        // addToAccessFormats true (maybe)
        $fsm = FlipSearchMap::buildSearchMap($url);
        echo $fsm->remapSearch($xml);
      } else {
        // The XML travels inside a single-quoted JavaScript string literal:
        // double any backslash first, then drop line breaks and encode quotes,
        // so that no character can end or corrupt the literal.
        $js = str_replace('\\', '\\\\', $xml);
        $js = preg_replace(array('/[\r\n]/', "/'/"), array('', '&#39;'), $js);

        // We don't have FlipSearchMap remap since we have our own mapping
        // between scandata.xml leaf numbers and BR indices that happens in
        // BRSearchCallback
        echo "$callback('".$js."');";
      }
      //echo $xml;
    }

    //////
    debug_msg("Done and exiting!",2);
    exit;
    //////
  }
  catch (Exception $e)
  {
    // an internal method call invoked "fatal()"...
    XML::resultMessage('error','internal_error', $e->getMessage());
  }
 284
 285
 286
 287
 /**
  * Report whether any of the search terms occurs in the text, ignoring case.
  *
  * Terms are matched literally.  Empty terms are skipped: an empty term would
  * otherwise build the regex "//", which matches any text at all, so a stray
  * space in the query made every page and every word count as a hit.
  *
  * Both parameters are taken by reference, so callers must pass variables.
  * Neither is modified.
  *
  * @param string $text   Search space.
  * @param array  $terms  Search terms.
  * @return bool  True as soon as one non-empty term is found, else false.
  */
 function matches_terms(&$text, // search space
                        &$terms)// array of search terms
 {
   foreach ($terms as $term)
   {
     $term = (string)$term;
     if ($term === '')
       continue;

     // Quote the term so regex metacharacters and the "/" delimiter in it
     // cannot change or break the pattern.
     if (preg_match('/'.preg_quote($term, '/').'/i', $text))
       return true;
   }
   return false;
 }
 298
 299
 /**
  * Print a diagnostic line if $level is within the configured verbosity.
  *
  * Reads the globals $debug_level (verbosity threshold), $pid (shown in the
  * line) and $format.  For the "XML" format the line is wrapped in an XML
  * comment; for any other format it is printed followed by a <br/>.
  *
  * @param string $msg    Text to report.
  * @param int    $level  Verbosity of this message; it is printed only when
  *                       $level <= $debug_level.
  */
 function debug_msg($msg, $level)
 {
   global $debug_level, $pid, $format;

   if (!($level <= $debug_level))
     return;

   $line = ($format == "XML")
     ? "<!-- FILL  ($pid):$level: $msg -->\n"
     : "FILL  ($pid):$level: $msg<br/>\n";

   echo $line;
 }
 313
 /**
  * Stop the request with "403 Forbidden" unless $filename is readable.
  *
  * Returns normally (no value) when is_readable() accepts the file; otherwise
  * sends the 403 status line and ends the script.
  *
  * @param string $filename  File to test.
  */
 function checkPrivs($filename) {
     if (is_readable($filename)) {
         return;
     }

     header('HTTP/1.1 403 Forbidden');
     exit(0);
 }
 320
 321
 322 ?>