4 Builds metadata about a book on the Internet Archive in json(p) format so that the book
5 can be accessed by other software including the Internet Archive BookReader.
7 Michael Ang <http://github.com/mangtronix>
9 Copyright (c) 2008-2010 Internet Archive. Software license AGPL version 3.
11 This file is part of BookReader.
13 BookReader is free software: you can redistribute it and/or modify
14 it under the terms of the GNU Affero General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 BookReader is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU Affero General Public License for more details.
23 You should have received a copy of the GNU Affero General Public License
24 along with BookReader. If not, see <http://www.gnu.org/licenses/>.
27 class BookReaderMeta {
// Maps element names in _meta.xml (keys) to the response keys they are copied
// to (values). An element is only copied when it is present in _meta.xml.
// NOTE(review): the numbered listing this block was rebuilt from has gaps
// inside this array, so the original file may list more fields -- confirm
// before treating this list as complete.
public $metaFields = array(
    'publisher' => 'publisher',
    'language' => 'language',
    'contributor' => 'contributor',
    'collection' => 'collection',
    'page-progression' => 'pageProgression',
);

// Values, keyed by response key, used when _meta.xml does not supply the field.
public $metaDefaults = array(
    'pageProgression' => 'lr',
);

// Stash spot for callback data: lets named callback functions, which cannot
// capture local variables, read a value set by the caller.
public static $cbData = null;
// Builds the metadata array for one book (to be encoded as JSON).
//
// $id        - item identifier; must be the last component of $itemPath
// $itemPath  - item directory, of the form /<digits>/items/<id>
// $subPrefix - prefix of the book inside the item; may be empty, in which
//              case findImageStack() chooses one
// $server    - host name used when building image and preview URLs
//
// Returns an associative array holding, in this order: the fields copied from
// _meta.xml, numPages, titleLeaf/titleIndex (when a title page exists), url,
// pageWidths, pageHeights, pageNums, itemId, subPrefix, itemPath, zip, server,
// imageFormat, archiveFormat, leafNums, previewImage, titleImage (when a title
// page exists) and coverIndices/coverImages (when cover pages exist).
//
// Every failure is reported through BRFatal(), which throws an Exception.
function buildMetadata($id, $itemPath, $subPrefix, $server) {
    $response = array();

    if ("" == $id) {
        $this->BRFatal("No identifier specified!");
    }

    if ("" == $itemPath) {
        $this->BRFatal("No itemPath specified!");
    }

    if ("" == $server) {
        $this->BRFatal("No server specified!");
    }

    // The identifier is quoted so that characters such as "." only match
    // themselves; the D modifier stops "$" from accepting a trailing newline.
    if (!preg_match('|^/\d+/items/' . preg_quote($id, '|') . '$|D', $itemPath)) {
        $this->BRFatal("Bad id!");
    }

    // List of the item's files; used to locate the image stack
    $filesDataFile = "$itemPath/{$id}_files.xml";
    if (!file_exists($filesDataFile)) {
        $this->BRFatal("File metadata not found!");
    }
    $filesData = simplexml_load_file($filesDataFile);
    if ($filesData === false) {
        $this->BRFatal("Could not parse file metadata!");
    }

    $imageStackInfo = $this->findImageStack($subPrefix, $filesData);
    if ($imageStackInfo['imageFormat'] == 'unknown') {
        $this->BRFatal('Couldn\'t find image stack');
    }

    // Update subPrefix -> may have been autodetected
    $subPrefix = $imageStackInfo['subPrefix'];
    $subItemPath = $itemPath . '/' . $subPrefix;

    $imageFormat = $imageStackInfo['imageFormat'];
    $archiveFormat = $imageStackInfo['archiveFormat'];
    $imageStackFile = $itemPath . "/" . $imageStackInfo['imageStackFile'];

    // An unknown image format was already rejected above
    if ("unknown" == $archiveFormat) {
        $this->BRFatal("Unknown archive format");
    }

    // Scan data is looked for in three places, in order of preference
    $scanData = false;
    $scanDataFile = "{$subItemPath}_scandata.xml";
    $scanDataZip = "$itemPath/scandata.zip";
    $legacyScanDataFile = "$itemPath/scandata.xml";
    if (file_exists($scanDataFile)) {
        $this->checkPrivs($scanDataFile);
        $scanData = simplexml_load_file($scanDataFile);
    } else if (file_exists($scanDataZip)) {
        $this->checkPrivs($scanDataZip);
        $cmd = 'unzip -p ' . escapeshellarg($scanDataZip) . ' scandata.xml';
        $output = array();
        $retval = 0;
        exec($cmd, $output, $retval);
        if ($retval != 0) {
            $this->BRFatal("Could not unzip ScanData!");
        }

        $dump = join("\n", $output);
        $scanData = simplexml_load_string($dump);
    } else if (file_exists($legacyScanDataFile)) {
        // For e.g. Scribe v.0 books!
        // Checked for readability like the two locations above
        $this->checkPrivs($legacyScanDataFile);
        $scanData = simplexml_load_file($legacyScanDataFile);
    } else {
        $this->BRFatal("ScanData file not found!");
    }
    if ($scanData === false) {
        $this->BRFatal("Could not parse ScanData!");
    }

    $metaDataFile = "$itemPath/{$id}_meta.xml";
    if (!file_exists($metaDataFile)) {
        $this->BRFatal("MetaData file not found!");
    }
    $metaData = simplexml_load_file($metaDataFile);
    if ($metaData === false) {
        $this->BRFatal("Could not parse MetaData!");
    }

    /* Find pages by type */
    $titleLeaf = '';
    $coverLeafs = array();
    foreach ($scanData->pageData->page as $page) {
        $pageType = (string) $page->pageType;
        if (("Title Page" == $pageType) || ("Title" == $pageType)) {
            // Only the first title page is kept
            if ('' == $titleLeaf) {
                $titleLeaf = "{$page['leafNum']}";
            }
        }

        if (('Cover' == $pageType) || ('Cover Page' == $pageType)) {
            array_push($coverLeafs, intval($page['leafNum']));
        }
    }

    // These arrays map accessible page index numbers to width, height, scanned leaf numbers
    // and page number strings (NB: these may not be unique)
    $pageWidths = array();
    $pageHeights = array();
    $leafNums = array();
    $pageNums = array();
    foreach ($scanData->pageData->page as $page) {
        if ($this->shouldAddPage($page)) {
            $pageWidths[] = intval($page->cropBox->w);
            $pageHeights[] = intval($page->cropBox->h);
            $leafNums[] = intval($page['leafNum']);
            $pageNums[] = (string) $page->pageNumber;
        }
    }

    // Load some values from meta.xml, falling back to the defaults
    foreach ($this->metaFields as $srcName => $destName) {
        if ($metaData->{$srcName}) {
            $response[$destName] = (string) $metaData->{$srcName};
        } else if (array_key_exists($destName, $this->metaDefaults)) {
            $response[$destName] = $this->metaDefaults[$destName];
        }
    }

    $response['numPages'] = count($pageNums); // $$$ renamed
    if ('' != $titleLeaf) {
        $response['titleLeaf'] = $titleLeaf; // $$$ change to titleIndex - do leaf mapping here
        $titleIndex = $this->indexForLeaf($titleLeaf, $leafNums);
        if ($titleIndex !== NULL) {
            $response['titleIndex'] = intval($titleIndex);
        }
    }
    $response['url'] = "http://www.archive.org/details/$id";
    $response['pageWidths'] = $pageWidths;
    $response['pageHeights'] = $pageHeights;
    $response['pageNums'] = $pageNums;

    // Internet Archive specific
    $response['itemId'] = $id; // $$$ renamed
    $response['subPrefix'] = $subPrefix; // $$$ renamed
    $response['itemPath'] = $itemPath;
    $response['zip'] = $imageStackFile;
    $response['server'] = $server;
    $response['imageFormat'] = $imageFormat;
    $response['archiveFormat'] = $archiveFormat;
    $response['leafNums'] = $leafNums;
    // previewURL() reads itemId, subPrefix, itemPath and server set just above
    $response['previewImage'] = $this->previewURL('preview', $response);

    // URL to title image
    if ('' != $titleLeaf) {
        $response['titleImage'] = $this->previewURL('title', $response);
    }

    if (count($coverLeafs) > 0) {
        $coverIndices = array();
        $coverImages = array();
        foreach ($coverLeafs as $leafNum) {
            // The index is NULL for a cover leaf that is not among the access pages
            array_push($coverIndices, $this->indexForLeaf($leafNum, $leafNums));
            // $$$ TODO use preview API once it supports multiple covers
            array_push($coverImages, $this->imageURL($leafNum, $response));
        }

        $response['coverIndices'] = $coverIndices;
        $response['coverImages'] = $coverImages;
    }

    return $response;
}
// Sends $metadata to the client as JSON, or as JSONP when the request carries
// a "callback" parameter.
//
// $metadata - value to encode with json_encode()
//
// Throws an Exception (via BRFatal) when the callback is not a plain
// identifier accepted by isValidCallback().
function emitResponse($metadata) {
    // The parameter is optional; do not read a missing array key
    $callback = isset($_REQUEST['callback']) ? $_REQUEST['callback'] : '';

    $contentType = 'application/json'; // default
    if ($callback) {
        // callback[]=x arrives as an array and can never be a valid identifier
        if (!is_string($callback) || !$this->isValidCallback($callback)) {
            $this->BRFatal("Invalid callback");
        }
        $contentType = 'text/javascript'; // JSONP is not JSON
    }

    header('Content-type: ' . $contentType . ';charset=UTF-8');
    header('Access-Control-Allow-Origin: *'); // allow cross-origin requests

    if ($callback) {
        print $callback . '( ';
    }
    print json_encode($metadata);
    if ($callback) {
        print ' )';
    }
}
// Reports an unrecoverable metadata error by throwing an Exception whose
// message is $string prefixed with "Metadata error: ".
function BRFatal($string) {
    // $$$ TODO log error
    $message = "Metadata error: $string";
    throw new Exception($message);
}
// Returns true if a page should be added, judging by its addToAccessFormats
// element: only an explicit "false" (any case, surrounding whitespace
// ignored) excludes the page. A page without that element is added.
function shouldAddPage($page) {
    if (!isset($page->addToAccessFormats)) {
        // No assertion either way, so the page is added
        return true;
    }

    $flag = strtolower(trim($page->addToAccessFormats));
    return "false" != $flag;
}
// Picks the image stack (archive of page images) to serve for a book.
//
// $subPrefix - sub-item prefix to look for; when empty, the asciibetically
//              first prefix among the stacks found is used
// $filesData - loaded <id>_files.xml; its <file> elements are examined
//
// Returns array('imageFormat' =>, 'archiveFormat' =>, 'imageStackFile' =>,
// 'subPrefix' =>) for the best match. When nothing matches, returns the first
// three keys set to 'unknown' and no 'subPrefix' key.
//
// Steps:
// - Find potential image stacks, regardless of subPrefix
// - If not given subPrefix sort based on potential subPrefix and assign based on asciibetical first
// - Filter results by subPrefix
// - Sort based on image format
// - Take the best match
function findImageStack($subPrefix, $filesData) {
    $notFound = array('imageFormat' => 'unknown', 'archiveFormat' => 'unknown', 'imageStackFile' => 'unknown');

    // The order of the image formats determines which will be returned first
    $imageFormats = array('JP2' => 'jp2', 'TIFF' => 'tif', 'JPEG' => 'jpg');
    $imageFormatOrder = array_values($imageFormats);
    $archiveFormats = array('ZIP' => 'zip', 'Tar' => 'tar');
    $imageGroup = implode('|', array_keys($imageFormats));
    $archiveGroup = implode('|', array_keys($archiveFormats));
    // $$$ Currently only return processed images
    // Group 1 is "Processed", group 2 the image format, group 3 the archive format
    $imageStackRegex = "/Single Page (Processed) ({$imageGroup}) ({$archiveGroup})/";

    $imageStacks = array();
    foreach ($filesData->file as $file) {
        if (preg_match($imageStackRegex, (string) $file->format, $matches) !== 1) {
            continue;
        }
        $imageFormat = $imageFormats[$matches[2]];
        $archiveFormat = $archiveFormats[$matches[3]];
        $imageStackFile = (string) $file['name'];

        // The part of the file name before _<format>.<archive> is the subPrefix
        if (preg_match("#(.*)_{$imageFormat}\.{$archiveFormat}#", $imageStackFile, $nameMatches) !== 1) {
            // stack filename not regular
            continue;
        }

        $imageStacks[] = array(
            'imageFormat' => $imageFormat,
            'archiveFormat' => $archiveFormat,
            'imageStackFile' => $imageStackFile,
            'subPrefix' => $nameMatches[1],
        );
    }

    if (count($imageStacks) == 0) {
        return $notFound;
    }

    // Closures are used for the callbacks: a named function declared here
    // would be redeclared, fatally, the second time this method runs.
    if (!$subPrefix) {
        usort($imageStacks, function ($imageStackA, $imageStackB) {
            return strcmp($imageStackA['subPrefix'], $imageStackB['subPrefix']);
        });
        $subPrefix = $imageStacks[0]['subPrefix'];
    }

    $wantedPrefix = (string) $subPrefix;
    $candidates = array();
    foreach ($imageStacks as $imageStack) {
        if ($imageStack['subPrefix'] === $wantedPrefix) {
            $candidates[] = $imageStack;
        }
    }

    // Preferred formats (earlier in $imageFormatOrder) sort to the front
    usort($candidates, function ($imageStackA, $imageStackB) use ($imageFormatOrder) {
        $indexA = array_search($imageStackA['imageFormat'], $imageFormatOrder, true);
        $indexB = array_search($imageStackB['imageFormat'], $imageFormatOrder, true);
        if ($indexA == $indexB) {
            return 0;
        }
        return ($indexA < $indexB) ? -1 : 1;
    });

    if (count($candidates) > 0) {
        return $candidates[0];
    }
    return $notFound;
}
// Returns true when $identifier is acceptable as a JSONP callback name: a
// string starting with a letter, "_" or "$", followed only by letters,
// digits, "_" or "$".
function isValidCallback($identifier) {
    if (!is_string($identifier)) {
        return false;
    }
    // \z rather than $: "$" would also accept a name ending in a newline
    $pattern = '/^[a-zA-Z_$][a-zA-Z0-9_$]*\z/';
    return preg_match($pattern, $identifier) === 1;
}
// Returns the key in $leafNums whose value equals $leafNum (loose
// comparison), or NULL when $leafNum is not in the array.
function indexForLeaf($leafNum, $leafNums) {
    $position = array_search($leafNum, $leafNums);
    return (FALSE === $position) ? NULL : $position;
}
// Returns the entry of $leafNums stored at key $index.
function leafForIndex($index, $leafNums) {
    // $$$ todo change to instance variables
    $leafNum = $leafNums[$index];
    return $leafNum;
}
// Builds the URL of the image for one leaf.
//
// $leafNum  - leaf number of the page
// $metadata - array providing 'server', 'zip', 'subPrefix' and 'imageFormat'
// $scale    - appended as &scale= unless null
// $rotate   - appended as &rotate= unless null
//
// "Under the hood", non-public, dynamically changing (achtung!) image URLs currently look like:
// http://{server}/BookReader/BookReaderImages.php?zip={zipPath}&file={filePath}&scale={scale}&rotate={rotate}
// e.g. http://ia311213.us.archive.org/BookReader/BookReaderImages.php?zip=/0/items/coloritsapplicat00andriala/coloritsapplicat00andriala_jp2.zip&file=coloritsapplicat00andriala_jp2/coloritsapplicat00andriala_0009.jp2&scale=8&rotate=0
function imageURL($leafNum, $metadata, $scale = null, $rotate = null) {
    $filePath = $this->imageFilePath($leafNum, $metadata['subPrefix'], $metadata['imageFormat']);

    $parts = array(
        'http://',
        $metadata['server'],
        '/BookReader/BookReaderImages.php?zip=',
        $metadata['zip'],
        '&file=',
        $filePath,
    );

    if ($scale !== null) {
        $parts[] = '&scale=';
        $parts[] = $scale;
    }
    if ($rotate !== null) {
        $parts[] = '&rotate=';
        $parts[] = $rotate;
    }

    return implode('', $parts);
}
// $$$ move inside BookReaderPreview
// Builds the BookReaderPreview.php URL for a named page.
//
// $page     - name of the page wanted; the callers visible in this file pass
//             'preview' and 'title'
// $metadata - array providing 'itemId', 'subPrefix', 'itemPath' and 'server'
function previewURL($page, $metadata) {
    $base = 'http://' . $metadata['server'] . '/BookReader/BookReaderPreview.php?';

    $query = array();
    $query['id'] = $metadata['itemId'];
    $query['subPrefix'] = $metadata['subPrefix'];
    $query['itemPath'] = $metadata['itemPath'];
    $query['server'] = $metadata['server'];
    // NOTE(review): the 'page' entry was rebuilt from the $page parameter
    // having no other use; confirm the key name against BookReaderPreview.php
    $query['page'] = $page;

    return $base . http_build_query($query, '', '&');
}
// Returns the path of a leaf's image inside the image stack archive:
// {name}_{format}/{name}_{leaf}.{format}, where {name} is the last path
// component of $subPrefix and {leaf} is $leafNum zero-padded to four digits.
function imageFilePath($leafNum, $subPrefix, $format) {
    $almostIdentifier = basename($subPrefix);
    $directory = $almostIdentifier . '_' . $format;
    $fileName = sprintf("%s_%04d.%s", $almostIdentifier, intval($leafNum), $format);
    return $directory . '/' . $fileName;
}
// Parse date from _meta.xml to integer: returns the first run of digits in
// $dateFromMetaXML as an int, or null when it contains no digits.
function parseYear($dateFromMetaXML) {
    $matches = array();
    $found = preg_match('|(\d+)|', $dateFromMetaXML, $matches);
    if (!$found) {
        return null;
    }
    return (int)$matches[1];
}
// Builds the metadata for the book described by $requestEnv and emits it.
//
// $requestEnv - array providing 'itemId', 'itemPath', 'subPrefix' and 'server'
function processRequest($requestEnv) {
    $id = $requestEnv['itemId']; // $$$ renamed
    $itemPath = $requestEnv['itemPath'];
    $subPrefix = $requestEnv['subPrefix']; // $$$ renamed
    $server = $requestEnv['server'];

    // Check if we're on a dev vhost and point to JSIA in the user's public_html on the datanode
    // $$$ TODO consolidate this logic
    $requestUri = $_SERVER["REQUEST_URI"];
    foreach (array('testflip', 'rkumar', 'mang') as $host) {
        $homePrefix = '/~' . $host;
        // Serving out of home dir when the request URI starts with /~<host>
        if (strncmp($requestUri, $homePrefix, strlen($homePrefix)) === 0) {
            $server .= ':80/' . $host;
        }
    }

    $metadata = $this->buildMetadata($id, $itemPath, $subPrefix, $server);
    $this->emitResponse($metadata);
}
432 function checkPrivs($filename) {
433 if (!is_readable($filename)) {
434 header('HTTP/1.1 403 Forbidden');