3 Copyright(c)2008 Internet Archive. Software license AGPL version 3.
5 This file is part of BookReader.
7 BookReader is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Affero General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
12 BookReader is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU Affero General Public License for more details.
17 You should have received a copy of the GNU Affero General Public License
18 along with BookReader. If not, see <http://www.gnu.org/licenses/>.
21 // FIXME: TODO: Change path above to production installation location of perl
22 // when we deploy, Brad Neuberg, bkn3@columbia.edu
28 // slight alterations by Brad Neuberg, bkn3@columbia.edu
30 // ported from perl to php by tracey, Oct 2005
32 //fixxx require_once '/petabox/setup.inc';
33 //I think this fixxx refers to the need to set DOCUMENT_ROOT below -- mang
// Choose which petabox checkout to load from the request path: requests under
// /~mang or /~testflip are served from that developer's home directory.
// NOTE(review): no "else" line is visible in this listing before the default
// assignments further down -- confirm the branch structure in the full file.
35 if (strpos($_SERVER["REQUEST_URI"], "/~mang") === 0) { // Serving out of home dir
36 $_SERVER['DOCUMENT_ROOT']='/home/mang/petabox/www/sf';
37 require_once '/home/mang/petabox/setup.inc';
38 } else if (strpos($_SERVER["REQUEST_URI"], "/~testflip") === 0) { // Serving out of home dir
39 $_SERVER['DOCUMENT_ROOT']='/home/testflip/petabox/www/sf';
40 require_once '/home/testflip/petabox/setup.inc';
// Default (non-home-directory) petabox location.
42 $_SERVER['DOCUMENT_ROOT']='/petabox/www/sf';
43 require_once '/petabox/setup.inc';
// The document is later read into memory as a single string with
// file_get_contents(), so the PHP memory limit is raised up front.
46 ini_set("memory_limit","200M"); // XML can be big, esp. Britannica (100MB)
48 /////// SETUP /////////////////////////////////////////////////////////////
// Tunables: debug verbosity and how many words of context to keep on either
// side of a highlighted match.
50 $debug_level = 0; // 0=least, 3=most debugging info
52 $num_pre = 3; // context words before search term
53 $num_post = 9; // context words after search term
55 // defaults for testing (when no args given)
56 $url ='http://ia300202.us.archive.org/0/items/englishbookbindings00davenuoft/englishbookbindings00davenuoft_djvuxml.xml';
62 /////// SETUP /////////////////////////////////////////////////////////////
// Query-string parameters override the defaults. Everything read from $_GET is
// untrusted client input.
// NOTE(review): the statement that follows the 'url' test is not visible in
// this listing.
65 // fixxx prolly should escapesystemcall() these...
66 if (isset($_GET['url']))
68 if (isset($_GET['term']))
69 $term = $_GET['term'];
70 if (isset($_GET['format']))
71 $format = $_GET['format'];
72 if (isset($_GET['callback']))
73 $callback = $_GET['callback'];
// Alternative test documents, kept for reference.
75 //$url='http://homeserver.hq.archive.org/metafetch/thespy00cooparch_djvu.xml';
76 //$url='http://homeserver.hq.archive.org/metafetch/oldchristmas00irviarch_djvu.xml';
77 //$url = 'http://homeserver.hq.archive.org/metafetch/intlepisode00jamearch_djvu.xml';
82 // This is kinda weird (confession!) but allows existing calls to "fatal()"
83 // to throw an exception instead of dumping HTML to stdout/browser!
84 $GLOBALS['fatal-exceptions']=1;
90 // pageFiles was added to keep track of on what page each search match was
91 // found, Brad Neuberg, bkn3@columbia.edu
92 // pageHeights and pageWidths was added to track the size of each page so that
93 // we can send it over to the client; this is necessary for scaling the images
94 // for search, Brad Neuberg, bkn3@columbia.edu
// Per-page dimension tables; pass 1 fills them keyed by page number and the
// XML output reads them back by the same key.
97 $pageHeights = array();
98 $pageWidths = array();
// $time0 is the start time as a float number of seconds; the later timing
// figures are computed relative to it.
101 $time0 = microtime(true);
102 $timestamp = date('Y-m-d H:i:s');
103 $pid = posix_getpid();
104 debug_msg("Invoked at ".$time0."=$timestamp under UID ".posix_getuid(),2);
108 //////////////////////////////////////
// Log the raw query string at the most verbose debug level.
110 debug_msg("query: ".$_SERVER['QUERY_STRING'],3);
// Reduce the search term to letters, digits and spaces, then split it into
// individual words. Empty pieces (produced by repeated, leading or trailing
// spaces -- common once illegal characters have been turned into spaces) are
// dropped: an empty term would become the pattern "//i" in matches_terms(),
// which matches every page and every word.
113 $term = preg_replace('/[^A-Za-z0-9 ]/', ' ', $term); // legal characters
114 $terms = preg_split('/ +/', $term, -1, PREG_SPLIT_NO_EMPTY);
115 debug_msg("url,term,format: $url,".var_export($terms,true).",$format",3);
// Per-format setup: emit the response preamble / Content-type header and pick
// the markers ($tag_pre / $tag_post) that will surround each matched word.
118 if ($format == "HTML")
120 echo "<html><head><title>Search</title></head> <body> Searching <p>";
121 $tag_pre = '<b style="color:black;background-color:#A0FFFF">';
124 else if ($format == "XML")
// Without a callback the response is plain XML; with one it is a JavaScript
// call wrapping the XML, so it is labelled as script.
126 if (false === $callback) {
127 header('Content-type: text/xml');
// header() needs the full "Name: value" line; the bare MIME type that used to
// be passed here did not set a Content-type at all.
129 header('Content-type: application/x-javascript');
// In XML mode a match closes the running CONTEXT element before the matched
// WORD and opens a new one after it.
131 $tag_pre = '</CONTEXT>';
132 $tag_post = '<CONTEXT>';
// Any other format value is rejected.
136 fatal("Unknown format request. ");
139 // Ensure file is readable
// NOTE(review): $url may come straight from $_GET and is handed to
// file_get_contents() unchanged, so a client can make the server read any
// path or URL PHP is allowed to open -- consider restricting it to expected
// hosts/schemes.
142 // This looks like where we load the djvu.xml - $$$ and rapidly exhaust memory for large books such as OED
143 if (!($document = file_get_contents($url)))
144 fatal("could not load $url");
// Seconds elapsed from script start ($time0) to the end of the fetch.
148 $time1 = microtime(true) - $time0;
151 //// Pass 1 - build up page* arrays with xml fragments corresponding to matches
// Split the document on the closing OBJECT tag, giving one chunk per page
// object (plus whatever trails the last one), and examine each chunk in turn.
153 foreach (explode('</OBJECT>', $document) as $page)
156 if (matches_terms($page, $terms) &&
157 // 2nd clause here is to ensure that we aren't matching in the end
158 // of the overall XML document -- thus we ensure that OBJECT tag starts
159 // in the chunk we just were handed. (traceyj)
160 strstr($page, '<OBJECT '))
162 // extract the page value so that we know what page we are on,
163 // Brad Neuberg, bkn3@columbia.edu
164 if (!preg_match('|<PARAM name="PAGE" value="([^"]*)"\s*\/>|', $page, $match))
165 fatal("page value not set on page number $pagenumber in $page!");
166 $pageFile = $match[1];
// preg_match() stops at the first hit, so the first width="..." and
// height="..." attributes found anywhere in the chunk are the ones used.
168 // extract the page width and height, Brad Neuberg, bkn3@columbia.edu
169 if (!preg_match('/width="([^"]*)"/', $page, $match))
170 fatal("page width not set!");
171 $pageWidth = $match[1];
173 if (!preg_match('/height="([^"]*)"/', $page, $match))
174 fatal("page height not set!");
175 $pageHeight = $match[1];
// Walk the page one WORD element at a time. A word that matches a search term
// is rebuilt with the highlight markers around it; the statements further
// down strip other text to plain words.
// NOTE(review): the lines separating the matching branch from the clean-up
// statements are not visible in this listing.
178 foreach (explode('</WORD>',$page) as $token)
180 if (matches_terms($token, $terms))
// Discard anything before the opening WORD tag and re-close the element,
// which explode() stripped of its end tag.
182 list($junk, $keep) = explode('<WORD ',$token);
183 $token = " $tag_pre<WORD $keep</WORD>$tag_post ";
187 $token = preg_replace('/<[^<]*>/','', $token); //mark-up
// Blank out numeric character references such as "&#8212;". This used to be
// the character class [\&\#\d+;], which instead erased every digit, '&', '#',
// '+' and ';' wherever it appeared in the text.
188 $token = preg_replace('/&#\d+;/', ' ', $token);//non-ascii chars
189 $token = preg_replace('/\s+/', ' ', $token); //white space
// Trim the page text around the highlight markers: the first pattern keeps
// $num_pre word-like runs in front of a $tag_pre marker, the second keeps
// $num_post after a $tag_post marker.
// The markers are literal strings, so they are passed through preg_quote()
// (with the delimiter in use) before being spliced into the pattern; a marker
// containing "/" would otherwise terminate the second pattern early.
// NOTE(review): the left-hand side of these two calls is on lines not visible
// in this listing.
198 preg_replace('|.*((\W\w*){'.$num_pre.'}'.preg_quote($tag_pre, '|').')|',"$1",$page_new);
200 preg_replace('/('.preg_quote($tag_post, '/').'(\w*\W){'.$num_post.'}).*/',"$1",$page_new);
// Store this page's results. All four arrays use the same $pagenumber key so
// that the output pass can look up file, width and height by the index it
// gets from iterating $pages.
203 // added to keep track of the page we are on
204 // Brad Neuberg, bkn3@columbia.edu
205 $pageFiles [$pagenumber] = $pageFile;
206 $pages [$pagenumber] = $page_new;
207 // added to keep track of page widths and heights
208 $pageWidths [$pagenumber] = $pageWidth;
209 $pageHeights[$pagenumber] = $pageHeight;
// Seconds spent in pass 1: total elapsed time minus the fetch phase ($time1).
// ($time1 is itself a duration, so subtracting it alone from the current
// timestamp would not yield a duration.)
214 $time2 = microtime(true) - $time0 - $time1;
216 //// Pass 2 - generate output from previously built arrays
219 if ($format == "HTML")
221 echo "Found ".count($pages)." pages containing $tag_pre";
223 echo "$tag_post.<br>\n";
// NOTE(review): the heading prints $page (the page's context text) rather
// than $index -- confirm which was intended.
224 foreach ($pages as $index => $page)
226 echo "<h4>Page $page:</h4>\n";
// Seconds spent printing: total elapsed time minus the two earlier phases.
230 $time3 = microtime(true) - $time0 - $time1 - $time2;
// microtime(true) works in seconds, so the figures are labelled accordingly.
231 echo $tag_pre . "Fetched document in $time1 s.$tag_post<p>\n";
232 echo $tag_pre . "Processed document in $time2 s.$tag_post<p>\n";
233 echo $tag_pre . "Printed document in $time3 s.$tag_post<p>\n";
234 echo "</body></html>\n";
// XML output: accumulate the whole response in $xml so it can be
// post-processed before being sent.
// NOTE(review): the initialisation of $xml, the opening SEARCH element and
// the statement that appends each page's text are not visible in this listing.
236 else if ($format == "XML")
239 $xml .= '<?xml version="1.0" encoding="utf-8"?>'."\n";
240 // Added to prevent Internet Explorer from adding default XML stylesheet,
241 // which messes up processing, Brad Neuberg, bkn3@columbia.edu
242 $xml .= '<?xml-stylesheet type="text/css" href="blank.css"?>'."\n";//fixxx
// One PAGE element per matching page, carrying the file name and dimensions
// recorded in pass 1 under the same index.
// NOTE(review): the attribute values are interpolated without XML escaping.
245 foreach ($pages as $index => $page)
247 $xml .= "<PAGE file=\"{$pageFiles[$index]}\" width=\"{$pageWidths[$index]}\" height=\"{$pageHeights[$index]}\">\n";
248 $xml .= "<CONTEXT>\n";
250 $xml .= "</CONTEXT>\n";
253 $xml .= "</SEARCH>\n";
// No callback requested: send the XML itself, after FlipSearchMap has
// rewritten it for the flipbook client.
255 if (false === $callback) {
256 // The XML contains the page numbers from the DJVU XML. We must remap them to flipbook indices
257 // since the flipbook indices are monotonically increasing generated from the pages with
258 // addToAccessFormats true (maybe)
259 $fsm = FlipSearchMap::buildSearchMap($url);
260 echo $fsm->remapSearch($xml);
// Callback (JSONP-style) output: the XML is sent as a single-quoted
// JavaScript string argument, so newlines are removed and single quotes are
// replaced by the character reference &#39; to keep the literal intact.
// preg_replace() pairs the two arrays by position, so both entries of $replac
// must be present and in the same order as $patterns.
262 $patterns[0] = '/\n/';
263 $patterns[1] = "/\'/";
264 $replac[0] = '';
265 $replac[1] = '&#39;';
267 // We don't have FlipSearchMap remap since we have our own mapping between
268 // scandata.xml leaf numbers and BR indices that happens in BRSearchCallback
// $callback is client-supplied and is emitted as script, so it is reduced to
// characters that can appear in a JavaScript function reference.
269 echo preg_replace('/[^A-Za-z0-9_.$\[\]]/', '', $callback)."('". preg_replace($patterns, $replac, $xml)."');";
275 debug_msg("Done and exiting!",2);
// Error path: report the exception's message to the client as an
// 'internal_error' result.
// NOTE(review): the try/catch lines that bind $e are not visible in this
// listing; presumably this handles the exception fatal() throws when
// $GLOBALS['fatal-exceptions'] is set -- confirm.
281 // an internal method call invoked "fatal()"...
282 XML::resultMessage('error','internal_error', $e->getMessage());
// Test $text against the search terms, case-insensitively, using each term as
// a regular expression. Both arguments are taken by reference.
// Each term is interpolated into the pattern unescaped; the main script
// restricts the term to letters, digits and spaces before calling this.
// NOTE(review): the return statements are not visible in this listing, so
// whether one matching term or all of them is required should be confirmed.
288 function matches_terms(&$text, // search space
289 &$terms)// array of search terms
291 foreach ($terms as $term)
293 if (preg_match("/$term/i", $text))
// Print a debug message when its level does not exceed $debug_level.
//   $msg   - text to print
//   $level - verbosity level of this message (the script uses 2 and 3)
// In XML mode the message is wrapped in an XML comment; otherwise it is
// printed as a line of HTML. The process id ($pid) is included in both forms.
// NOTE(review): $debug_level, $format and $pid are script-level variables;
// the lines that bring them into function scope are not visible here.
300 function debug_msg($msg, $level)
305 if ($level <= $debug_level)
307 if ($format == "XML")
308 echo "<!-- FILL ($pid):$level: $msg -->\n";
310 echo "FILL ($pid):$level: $msg<br/>\n";
314 function checkPrivs($filename) {
315 if (!is_readable($filename)) {
316 header('HTTP/1.1 403 Forbidden');