. */

// FIXME: TODO: Change path above to production installation location of perl
// when we deploy, Brad Neuberg, bkn3@columbia.edu
//
// From:
//   search.cgi v0.4
//   by Ralf Muehlen
//
// slight alterations by Brad Neuberg, bkn3@columbia.edu
//
// ported from perl to php by tracey, Oct 2005

//fixxx require_once '/petabox/setup.inc';
//I think this fixxx refers to the need to set DOCUMENT_ROOT below -- mang

// Select the petabox installation from the request path.  Requests whose URI
// starts with a developer prefix are served out of that developer's home
// directory; everything else uses the shared installation.  DOCUMENT_ROOT is
// assigned before setup.inc is pulled in, in every case.
switch (true) {
  case strpos($_SERVER["REQUEST_URI"], "/~mang") === 0:
    // Serving out of home dir
    $_SERVER['DOCUMENT_ROOT']='/home/mang/petabox/www/sf';
    require_once '/home/mang/petabox/setup.inc';
    break;

  case strpos($_SERVER["REQUEST_URI"], "/~testflip") === 0:
    // Serving out of home dir
    $_SERVER['DOCUMENT_ROOT']='/home/testflip/petabox/www/sf';
    require_once '/home/testflip/petabox/setup.inc';
    break;

  default:
    $_SERVER['DOCUMENT_ROOT']='/petabox/www/sf';
    require_once '/petabox/setup.inc';
    break;
}

// XML can be big, esp. brittanica (100MB)
ini_set("memory_limit","200M");


/////// SETUP /////////////////////////////////////////////////////////////

// Debug verbosity: 0=least, 3=most debugging info
$debug_level = 0;

// Size of the context window kept around each hit:
// words before the search term, then words after it.
$num_pre  = 3;
$num_post = 9;

// defaults for testing (when no args given)
$url      = 'http://ia300202.us.archive.org/0/items/englishbookbindings00davenuoft/englishbookbindings00davenuoft_djvuxml.xml';
$term     = 'history';
$format   = 'XML';
$callback = false;

/////// SETUP /////////////////////////////////////////////////////////////

// fixxx prolly should escapesystemcall() these...
// ---------------------------------------------------------------------------
// Request handling.
//
// Query parameters (each overrides the default assigned earlier in the file):
//   url      - location of the DjVu XML document to search
//   term     - search words; anything outside [A-Za-z0-9 ] is turned into a space
//   format   - "XML" or "HTML"
//   callback - optional JavaScript function name; when given, the XML result
//              is emitted as a call to that function (JSONP style)
//
// NOTE(review): $url is taken from the query string and handed to
// is_readable()/file_get_contents() unchanged.  Whether callers are trusted
// cannot be told from this file -- consider restricting it to known item
// locations.
//
// NOTE(review): in the copy of this file under review, several string
// literals that presumably held XML/HTML markup are empty or truncated.
// Those spans are reproduced verbatim and flagged individually; compare them
// against version control before deploying.
// ---------------------------------------------------------------------------
if (isset($_GET['url']))      $url      = $_GET['url'];
if (isset($_GET['term']))     $term     = $_GET['term'];
if (isset($_GET['format']))   $format   = $_GET['format'];
if (isset($_GET['callback'])) $callback = $_GET['callback'];

//$url='http://homeserver.hq.archive.org/metafetch/thespy00cooparch_djvu.xml';
//$url='http://homeserver.hq.archive.org/metafetch/oldchristmas00irviarch_djvu.xml';
//$url = 'http://homeserver.hq.archive.org/metafetch/intlepisode00jamearch_djvu.xml';

if ($format == "XML") {
  // This is kinda weird (confession!) but allows existing calls to "fatal()"
  // to throw an exception instead of dumping HTML to stdout/browser!
  $GLOBALS['fatal-exceptions']=1;
}

try {
  // pageFiles was added to keep track of on what page each search match was
  // found, Brad Neuberg, bkn3@columbia.edu
  // pageHeights and pageWidths was added to track the size of each page so that
  // we can send it over to the client; this is necessary for scaling the images
  // for search, Brad Neuberg, bkn3@columbia.edu
  $pages       = array();
  $pageFiles   = array();
  $pageHeights = array();
  $pageWidths  = array();

  $time0     = microtime(true);
  $timestamp = date('Y-m-d H:i:s');
  $pid       = posix_getpid();
  debug_msg("Invoked at ".$time0."=$timestamp under UID ".posix_getuid(),2);

  //////////////////////////////////////
  debug_msg("query: ".$_SERVER['QUERY_STRING'],3);

  $term = preg_replace('/[^A-Za-z0-9 ]/', ' ', $term); // legal characters
  // Split on runs of spaces and drop empty pieces.  A plain explode(' ') turns
  // "foo, bar" (sanitised to "foo  bar") into array('foo', '', 'bar'), and an
  // empty term becomes the regex //i, which matches every page and every token.
  $terms = preg_split('/ +/', $term, -1, PREG_SPLIT_NO_EMPTY);
  debug_msg("url,term,format: $url,".var_export($terms,true).",$format",3);

  if ($format == "HTML") {
    // NOTE(review): the page-header markup and the highlight tags below look
    // stripped in the reviewed copy.
    echo "Search Searching

";
    $tag_pre = '';
    $tag_post = '';
  } else if ($format == "XML") {
    if (false === $callback) {
      header('Content-type: text/xml');
    } else {
      // The callback name is echoed into executable JavaScript further down,
      // so only accept something shaped like a (possibly dotted or indexed)
      // identifier.  fatal() throws here because fatal-exceptions is set for
      // the XML format above.
      if (!is_string($callback) ||
          !preg_match('/^[A-Za-z_$][A-Za-z0-9_$.\[\]]*$/', $callback))
        fatal("Invalid callback name. ");
      // header() needs the field name; the bare media type that used to be
      // passed here is not a valid header line.
      header('Content-type: application/x-javascript');
    }
    // NOTE(review): both tag literals are empty in the reviewed copy; the
    // context-trimming regexes in Pass 1 depend on them.
    $tag_pre = '';
    $tag_post = '';
  } else {
    fatal("Unknown format request. ");
  }

  // Ensure file is readable
  checkPrivs($url);

  // This looks like where we load the djvu.xml - $$$ and rapidly exhaust memory for large books such as OED
  if (!($document = file_get_contents($url))) fatal("could not load $url");
  // Keep absolute timestamps for each phase so the durations below are real
  // differences, and scale to milliseconds to match the "ms" labels printed
  // in the HTML report.
  $fetched_at = microtime(true);
  $time1 = ($fetched_at - $time0) * 1000;

  //// Pass 1 - build up page* arrays with xml fragments corresponding to matches
  $pagenumber=0;
  // NOTE(review): the delimiter literal is empty in the reviewed copy (a
  // closing page tag was presumably here); explode() rejects an empty
  // delimiter.
  foreach (explode('', $document) as $page) {
    $pagenumber++;
    // NOTE(review): this condition is damaged in the reviewed copy.  strstr()
    // is shown with four arguments, and the opening brace of the `if` plus the
    // start of the page-value preg_match appear to be missing -- hence the
    // unmatched closing brace after this loop.  Left verbatim; restore from
    // version control.
    if (matches_terms($page, $terms) &&
        // 2nd clause here is to ensure that we aren't matching in the end
        // of the overall XML document -- thus we ensure that OBJECT tag starts
        // in the chunk we just were handed. (traceyj)
        strstr($page, '|', $page, $match))
      fatal("page value not set on page number $pagenumber in $page!");
    $pageFile = $match[1];

    // extract the page width and height, Brad Neuberg, bkn3@columbia.edu
    if (!preg_match('/width="([^"]*)"/', $page, $match))
      fatal("page width not set!");
    $pageWidth = $match[1];

    if (!preg_match('/height="([^"]*)"/', $page, $match))
      fatal("page height not set!");
    $pageHeight = $match[1];

    $page_new='';
    // NOTE(review): the token delimiter and the mark-up handling on the
    // explode() line inside the `if` are also damaged in the reviewed copy;
    // left verbatim.
    foreach (explode('',$page) as $token) {
      if (matches_terms($token, $terms)) {
        list($junk, $keep) = explode('/','', $token); //mark-up
        $token = preg_replace('/[\&\#\d+;]/', ' ', $token);//non-ascii chars
        $token = preg_replace('/\s+/', ' ', $token); //white space
      }
      $page_new .= $token;
    }
    // Trim the page text down to $num_pre words before the opening highlight
    // tag and $num_post words after the closing one.
    $page_new = preg_replace('|.*((\W\w*){'.$num_pre.'}'.$tag_pre.')|',"$1",$page_new);
    $page_new = preg_replace('/('.$tag_post.'(\w*\W){'.$num_post.'}).*/',"$1",$page_new);

    // added to keep track of the page we are on
    // Brad Neuberg, bkn3@columbia.edu
    $pageFiles  [$pagenumber] = $pageFile;
    $pages      [$pagenumber] = $page_new;
    // added to keep track of page widths and heights
    $pageWidths [$pagenumber] = $pageWidth;
    $pageHeights[$pagenumber] = $pageHeight;
  }
  } // NOTE(review): unmatched in the reviewed copy -- see the damaged `if` above.

  $processed_at = microtime(true);
  $time2 = ($processed_at - $fetched_at) * 1000;

  //// Pass 2 - generate output from previously built arrays
  if ($format == "HTML") {
    echo "Found ".count($pages)." pages containing $tag_pre";
    print_r($terms);
    echo "$tag_post.
\n";
    foreach ($pages as $index => $page) {
      // NOTE(review): the heading interpolates $page (the matched text), not
      // $index (the page number); that looks unintended -- confirm.
      echo "

Page $page:

\n";
      print_r($page);
      echo "


\n";
    }
    $time3 = (microtime(true) - $processed_at) * 1000;
    echo $tag_pre . "Fetched document in $time1 ms.$tag_post

\n";
    echo $tag_pre . "Processed document in $time2 ms.$tag_post

\n";
    echo $tag_pre . "Printed document in $time3 ms.$tag_post

\n";
    echo "\n";
  } else if ($format == "XML") {
    // NOTE(review): the XML declaration, stylesheet instruction and the
    // per-page element markup are empty strings in the reviewed copy; left
    // verbatim.
    $xml = "";
    $xml .= ''."\n";
    // Added to prevent Internet Explorer from adding default XML stylesheet,
    // which messes up processing, Brad Neuberg, bkn3@columbia.edu
    $xml .= ''."\n";//fixxx
    $xml .= '';
    foreach ($pages as $index => $page) {
      $xml .= "\n";
      $xml .= "\n";
      $xml .= $page;
      $xml .= "\n";
      $xml .= "\n";
    }
    $xml .= "\n";

    if (false === $callback) {
      // The XML contains the page numbers from the DJVU XML. We must remap them to flipbook indices
      // since the flipbook indices are monotonically increasing generated from the pages with
      // addToAccessFormats true (maybe)
      $fsm = FlipSearchMap::buildSearchMap($url);
      echo $fsm->remapSearch($xml);
    } else {
      // The payload is emitted inside a single-quoted JavaScript string, so
      // newlines are removed and single quotes are escaped.
      $patterns[0] = '/\n/';
      $patterns[1] = "/\'/";
      $replac[0] = '';
      // NOTE(review): this literal was unreadable in the reviewed copy (it
      // appeared as three bare quotes).  A numeric character reference is used
      // because the payload is XML; confirm against version control.
      $replac[1] = '&#39;';
      // We don't have FlipSearchMap remap since we have our own mapping between
      // scandata.xml leaf numbers and BR indices that happens in BRSearchCallback
      echo "$callback('". preg_replace($patterns, $replac, $xml)."');";
    }
    //echo $xml;
  }

  //////
  debug_msg("Done and exiting!",2);
  exit;
  //////
} catch (Exception $e) {
  // an internal method call invoked "fatal()"...
  XML::resultMessage('error','internal_error', $e->getMessage());
}


/**
 * Report whether any of the search terms occurs in the text.
 *
 * Matching is case-insensitive and stops at the first hit.  Empty terms are
 * skipped, because an empty pattern would match any text.  Terms are matched
 * literally (regex metacharacters are quoted).
 *
 * @param string $text  search space (taken by reference; not modified)
 * @param array  $terms array of search terms (taken by reference; not modified)
 * @return bool true if at least one non-empty term is found, false otherwise
 */
function matches_terms(&$text,  // search space
                       &$terms) // array of search terms
{
  foreach ($terms as $term) {
    if ($term === '' || $term === null)
      continue;
    if (preg_match('/' . preg_quote($term, '/') . '/i', $text))
      return true;
  }
  return false;
}


/**
 * Echo a debugging message if its level is enabled.
 *
 * Reads the globals $debug_level (threshold), $pid and $format.  Nothing is
 * printed when $level is greater than $debug_level.
 *
 * @param string $msg   message text
 * @param int    $level verbosity level of this message
 */
function debug_msg($msg, $level)
{
  global $debug_level;
  global $pid;
  global $format;

  if ($level <= $debug_level) {
    // NOTE(review): for the XML format only a newline is echoed in the
    // reviewed copy; a comment-style wrapper around the message may have been
    // lost.  Left verbatim.
    if ($format == "XML")
      echo "\n";
    else
      echo "FILL ($pid):$level: $msg
\n";
  }
}


/**
 * Stop the request with "403 Forbidden" unless the file is readable.
 *
 * NOTE(review): is_readable() relies on the stream wrapper supporting stat();
 * confirm this behaves as intended for the http:// default URL.
 *
 * @param string $filename path or URL to check
 */
function checkPrivs($filename)
{
  if (!is_readable($filename)) {
    header('HTTP/1.1 403 Forbidden');
    exit(0);
  }
}
?>