BookReaderIA/datanode/BookReaderMeta.inc.php

   1 <?
   2 /*
   3
   4 Builds metadata about a book on the Internet Archive in json(p) format so that the book
   5 can be accessed by other software including the Internet Archive BookReader.
   6
   7 Michael Ang <http://github.com/mangtronix>
   8
   9 Copyright (c) 2008-2010 Internet Archive. Software license AGPL version 3.
  10
  11 This file is part of BookReader.
  12
  13     BookReader is free software: you can redistribute it and/or modify
  14     it under the terms of the GNU Affero General Public License as published by
  15     the Free Software Foundation, either version 3 of the License, or
  16     (at your option) any later version.
  17
  18     BookReader is distributed in the hope that it will be useful,
  19     but WITHOUT ANY WARRANTY; without even the implied warranty of
  20     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21     GNU Affero General Public License for more details.
  22
  23     You should have received a copy of the GNU Affero General Public License
  24     along with BookReader.  If not, see <http://www.gnu.org/licenses/>.
  25 */
  26
  27 class BookReaderMeta {
  28
  29     // Fields from _meta.xml to add to response (if present)
  30     var $metaFields = array(
  31         'title' => 'title',
  32         'author' => 'author',
  33         'publisher' => 'publisher',
  34         'date' => 'date',
  35         'language' => 'language',
  36         'contributor' => 'contributor',
  37         'collection' => 'collection',
  38         'page-progression' => 'pageProgression',
  39     );
  40
  41     var $metaDefaults = array(
  42         'pageProgression' => 'lr',
  43     );
  44
  45     // Builds metadata object (to be encoded as JSON)
  46     function buildMetadata($id, $itemPath, $bookId, $server) {
  47
  48         $response = array();
  49
  50         if (! $bookId) {
  51             $bookId = $id;
  52         }
  53         $subItemPath = $itemPath . '/' . $bookId;
  54
  55         if ("" == $id) {
  56             $this->BRFatal("No identifier specified!");
  57         }
  58
  59         if ("" == $itemPath) {
  60             $this->BRFatal("No itemPath specified!");
  61         }
  62
  63         if ("" == $server) {
  64             $this->BRFatal("No server specified!");
  65         }
  66
  67         if (!preg_match("|^/\d+/items/{$id}$|", $itemPath)) {
  68             $this->BRFatal("Bad id!");
  69         }
  70
  71         // XXX check here that subitem is okay
  72
  73         $filesDataFile = "$itemPath/${id}_files.xml";
  74
  75         if (file_exists($filesDataFile)) {
  76             $filesData = simplexml_load_file("$itemPath/${id}_files.xml");
  77         } else {
  78             $this->BRfatal("File metadata not found!");
  79         }
  80
  81         $imageStackInfo = $this->findImageStack($bookId, $filesData);
  82         if ($imageStackInfo['imageFormat'] == 'unknown') {
  83             $this->BRfatal('Couldn\'t find image stack');
  84         }
  85
  86         $imageFormat = $imageStackInfo['imageFormat'];
  87         $archiveFormat = $imageStackInfo['archiveFormat'];
  88         $imageStackFile = $itemPath . "/" . $imageStackInfo['imageStackFile'];
  89
  90         if ("unknown" == $imageFormat) {
  91           $this->BRfatal("Unknown image format");
  92         }
  93
  94         if ("unknown" == $archiveFormat) {
  95           $this->BRfatal("Unknown archive format");
  96         }
  97
  98
  99         $scanDataFile = "${subItemPath}_scandata.xml";
 100         $scanDataZip  = "$itemPath/scandata.zip";
 101         if (file_exists($scanDataFile)) {
 102             $this->checkPrivs($scanDataFile);
 103             $scanData = simplexml_load_file($scanDataFile);
 104         } else if (file_exists($scanDataZip)) {
 105             $this->checkPrivs($scanDataZip);
 106             $cmd  = 'unzip -p ' . escapeshellarg($scanDataZip) . ' scandata.xml';
 107             exec($cmd, $output, $retval);
 108             if ($retval != 0) {
 109                 $this->BRFatal("Could not unzip ScanData!");
 110             }
 111
 112             $dump = join("\n", $output);
 113             $scanData = simplexml_load_string($dump);
 114         } else if (file_exists("$itemPath/scandata.xml")) {
 115             // For e.g. Scribe v.0 books!
 116             $scanData = simplexml_load_file("$itemPath/scandata.xml");
 117         } else {
 118             $this->BRFatal("ScanData file not found!");
 119         }
 120
 121         $metaDataFile = "$itemPath/{$id}_meta.xml";
 122         if (!file_exists($metaDataFile)) {
 123             $this->BRFatal("MetaData file not found!");
 124         }
 125
 126
 127         $metaData = simplexml_load_file($metaDataFile);
 128
 129         /* Find pages by type */
 130         $titleLeaf = '';
 131         $coverLeafs = array();
 132         foreach ($scanData->pageData->page as $page) {
 133             if (("Title Page" == $page->pageType) || ("Title" == $page->pageType)) {
 134                 if ('' == $titleLeaf) {
 135                     // not already set
 136                     $titleLeaf = "{$page['leafNum']}";
 137                 }
 138             }
 139
 140             if (('Cover' == $page->pageType) || ('Cover Page' == $page->pageType)) {
 141                 array_push($coverLeafs, $page['leafNum']);
 142             }
 143         }
 144
 145         // These arrays map accessible page index numbers to width, height, scanned leaf numbers
 146         // and page number strings (NB: these may not be unique)
 147         $pageWidths = array();
 148         $pageHeights = array();
 149         $leafNums = array();
 150         $i=0;
 151         $totalHeight = 0;
 152         foreach ($scanData->pageData->page as $page) {
 153             if ($this->shouldAddPage($page)) {
 154                 $pageWidths[$i] = intval($page->cropBox->w);
 155                 $pageHeights[$i] = intval($page->cropBox->h);
 156                 $totalHeight += intval($page->cropBox->h/4) + 10;
 157                 $leafNums[$i] = intval($page['leafNum']);
 158                 $pageNums[$i] = $page->pageNumber . '';
 159                 $i++;
 160             }
 161         }
 162
 163         # Load some values from meta.xml
 164         foreach ($this->metaFields as $srcName => $destName) {
 165             if ($metaData->{$srcName}) {
 166                 $response[$destName] = $metaData->{$srcName} . '';
 167             } else {
 168                 if (array_key_exists($destName, $this->metaDefaults)) {
 169                     $response[$destName] = $this->metaDefaults[$destName];
 170                 }
 171             }
 172         }
 173
 174         // General metadata
 175         $response['numPages'] = count($pageNums); // $$$ renamed
 176         if ('' != $titleLeaf) {
 177             $response['titleLeaf'] = $titleLeaf; // $$$ change to titleIndex - do leaf mapping here
 178             $titleIndex = $this->indexForLeaf($titleLeaf, $leafNums);
 179             if ($titleIndex !== NULL) {
 180                 $response['titleIndex'] = intval($titleIndex);
 181             }
 182         }
 183         $response['url'] = "http://www.archive.org/details/$id";
 184         $response['pageWidths'] = $pageWidths;
 185         $response['pageHeights'] = $pageHeights;
 186         $response['pageNums'] = $pageNums;
 187
 188         // Internet Archive specific
 189         $response['itemId'] = $id; // $$$ renamed
 190         $response['bookId'] = $bookId;  // $$$ renamed
 191         $response['itemPath'] = $itemPath;
 192         $response['zip'] = $imageStackFile;
 193         $response['server'] = $server;
 194         $response['imageFormat'] = $imageFormat;
 195         $response['archiveFormat'] = $archiveFormat;
 196         $response['leafNums'] = $leafNums;
 197         $response['previewImage'] = $this->previewURL('preview', $response);
 198
 199         // URL to title image
 200         if ('' != $titleLeaf) {
 201             $response['titleImage'] = $this->previewURL('title', $response);
 202         }
 203
 204         if (count($coverLeafs) > 0) {
 205             $coverIndices = array();
 206             $coverImages = array();
 207             foreach ($coverLeafs as $key => $leafNum) {
 208                 array_push($coverIndices, $this->indexForLeaf($leafNum, $leafNums));
 209                 // $$$ TODO use preview API once it supports multiple covers
 210                 array_push($coverImages, $this->imageUrl($leafNum, $response));
 211             }
 212
 213             $response['coverIndices'] = $coverIndices;
 214             $response['coverImages'] = $coverImages;
 215         }
 216
 217         return $response;
 218     }
 219
 220     function emitResponse($metadata) {
 221         $callback = $_REQUEST['callback'];
 222
 223         $contentType = 'application/json'; // default
 224         if ($callback) {
 225             if (! $this->isValidCallback($callback) ) {
 226                 $this->BRfatal("Invalid callback");
 227             }
 228             $contentType = 'text/javascript'; // JSONP is not JSON
 229         }
 230
 231         header('Content-type: ' . $contentType . ';charset=UTF-8');
 232         header('Access-Control-Allow-Origin: *'); // allow cross-origin requests
 233
 234         if ($callback) {
 235             print $callback . '( ';
 236         }
 237         print json_encode($metadata);
 238         if ($callback) {
 239             print ' );';
 240         }
 241     }
 242
 243     function BRFatal($string) {
 244         // $$$ TODO log error
 245         throw new Exception("Metadata error: $string");
 246         //echo "alert('$string');\n";
 247         //die(-1);
 248     }
 249
 250     // Returns true if a page should be added based on it's information in
 251     // the metadata
 252     function shouldAddPage($page) {
 253         // Return false only if the page is marked addToAccessFormats false.
 254         // If there is no assertion we assume it should be added.
 255         if (isset($page->addToAccessFormats)) {
 256             if ("false" == strtolower(trim($page->addToAccessFormats))) {
 257                 return false;
 258             }
 259         }
 260
 261         return true;
 262     }
 263
 264     // Returns { 'imageFormat' => , 'archiveFormat' => '} given a sub-item prefix and loaded xml data
 265     function findImageStack($subPrefix, $filesData) {
 266
 267         // $$$ The order of the image formats determines which will be returned first
 268         $imageFormats = array('JP2' => 'jp2', 'TIFF' => 'tif', 'JPEG' => 'jpg');
 269         $archiveFormats = array('ZIP' => 'zip', 'Tar' => 'tar');
 270         $imageGroup = implode('|', array_keys($imageFormats));
 271         $archiveGroup = implode('|', array_keys($archiveFormats));
 272         // $$$ Currently only return processed images
 273         $imageStackRegex = "/Single Page (Processed) (${imageGroup}) (${archiveGroup})/";
 274
 275         foreach ($filesData->file as $file) {
 276             if (strpos($file['name'], $subPrefix) === 0) { // subprefix matches beginning
 277                 if (preg_match($imageStackRegex, $file->format, $matches)) {
 278
 279                     // Make sure we have a regular image stack
 280                     $imageFormat = $imageFormats[$matches[2]];
 281                     if (strpos($file['name'], $subPrefix . '_' . $imageFormat) === 0) {
 282                         return array('imageFormat' => $imageFormat,
 283                                      'archiveFormat' => $archiveFormats[$matches[3]],
 284                                      'imageStackFile' => $file['name']);
 285                     }
 286                 }
 287             }
 288         }
 289
 290         return array('imageFormat' => 'unknown', 'archiveFormat' => 'unknown', 'imageStackFile' => 'unknown');
 291     }
 292
 293     function isValidCallback($identifier) {
 294         $pattern = '/^[a-zA-Z_$][a-zA-Z0-9_$]*$/';
 295         return preg_match($pattern, $identifier) == 1;
 296     }
 297
 298     function indexForLeaf($leafNum, $leafNums) {
 299         $key = array_search($leafNum, $leafNums);
 300         if ($key === FALSE) {
 301             return NULL;
 302         } else {
 303             return $key;
 304         }
 305     }
 306
 307     function leafForIndex($index, $leafNums) {
 308         return $leafNums[$index]; // $$$ todo change to instance variables
 309     }
 310
 311     function imageURL($leafNum, $metadata, $scale = null, $rotate = null) {
 312         // "Under the hood", non-public, dynamically changing (achtung!) image URLs currently look like:
 313         // http://{server}/BookReader/BookReaderImages.php?zip={zipPath}&file={filePath}&scale={scale}&rotate={rotate}
 314         // e.g. http://ia311213.us.archive.org/BookReader/BookReaderImages.php?zip=/0/items/coloritsapplicat00andriala/coloritsapplicat00andriala_jp2.zip&file=coloritsapplicat00andriala_jp2/coloritsapplicat00andriala_0009.jp2&scale=8&rotate=0
 315
 316
 317         $filePath = $this->imageFilePath($leafNum, $metadata['bookId'], $metadata['imageFormat']);
 318         $url = 'http://' . $metadata['server'] . '/BookReader/BookReaderImages.php?zip=' . $metadata['zip'] . '&file=' . $filePath;
 319
 320         if ($scale !== null) {
 321             $url .= '&scale=' . $scale;
 322         }
 323         if ($rotate !== null) {
 324             $url .= '&rotate=' . $rotate;
 325         }
 326
 327         return $url;
 328     }
 329
 330     // $$$ move inside BookReaderPreview
 331     function previewURL($page, $metadata) {
 332         $query = array(
 333             'id' => $metadata['itemId'],
 334             'bookId' => $metadata['bookId'],
 335             'itemPath' => $metadata['itemPath'],
 336             'server' => $metadata['server'],
 337             'page' => $page,
 338         );
 339
 340         return 'http://' . $metadata['server'] . '/BookReader/BookReaderPreview.php?' . http_build_query($query, '', '&');
 341     }
 342
 343     function imageFilePath($leafNum, $bookId, $format) {
 344         return sprintf("%s_%s/%s_%04d.%s", $bookId, $format, $bookId, intval($leafNum), $format);
 345     }
 346
 347     // Parse date from _meta.xml to integer
 348     function parseYear($dateFromMetaXML) {
 349         // grab the first run of digits
 350         if (preg_match('|(\d+)|', $dateFromMetaXML, $matches)) {
 351             return (int)$matches[1];
 352         }
 353         return null;
 354     }
 355
 356     function processRequest($requestEnv) {
 357         $id = $requestEnv['itemId']; // $$$ renamed
 358         $itemPath = $requestEnv['itemPath'];
 359         $bookId = $requestEnv['bookId']; // $$$ renamed
 360         $server = $requestEnv['server'];
 361
 362         // Check if we're on a dev vhost and point to JSIA in the user's public_html on the datanode
 363         // $$$ TODO consolidate this logic
 364         if (strpos($_SERVER["REQUEST_URI"], "/~mang") === 0) { // Serving out of home dir
 365             $server .= ':80/~mang';
 366         } else if (strpos($_SERVER["REQUEST_URI"], "/~testflip") === 0) { // Serving out of home dir
 367             $server .= ':80/~testflip';
 368         }
 369
 370         $this->emitResponse( $this->buildMetadata($id, $itemPath, $bookId, $server) );
 371     }
 372
 373     function checkPrivs($filename) {
 374         if (!is_readable($filename)) {
 375             header('HTTP/1.1 403 Forbidden');
 376             exit(0);
 377         }
 378     }
 379
 380 }
 381
 382 ?>