4 Copyright(c) 2008-2010 Internet Archive. Software license AGPL version 3.
6 This file is part of BookReader. The full source code can be found at GitHub:
7 http://github.com/openlibrary/bookreader
9 The canonical short name of an image type is the same as in the MIME type.
10 For example both .jpeg and .jpg are considered to have type "jpeg" since
11 the MIME type is "image/jpeg".
13 BookReader is free software: you can redistribute it and/or modify
14 it under the terms of the GNU Affero General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 BookReader is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU Affero General Public License for more details.
23 You should have received a copy of the GNU Affero General Public License
24 along with BookReader. If not, see <http://www.gnu.org/licenses/>.
27 require_once("BookReaderMeta.inc.php");
29 class BookReaderImages
31 public static $MIMES = array('gif' => 'image/gif',
33 'jpg' => 'image/jpeg',
34 'jpeg' => 'image/jpeg',
36 'tif' => 'image/tiff',
37 'tiff' => 'image/tiff');
39 public static $EXTENSIONS = array('gif' => 'gif',
47 // Width when generating thumbnails
48 public static $imageSizes = array(
55 // Keys in the image permalink urls, e.g. http://www.archive.org/download/itemid/page/cover_{keyval}_{keyval}.jpg
56 public static $imageUrlKeys = array(
65 // Paths to command-line tools
66 var $exiftool = '/petabox/sw/books/exiftool/exiftool';
67 var $kduExpand = '/petabox/sw/bin/kdu_expand';
70 * Serve an image request that requires looking up the book metadata
74 * - Parse the requested page (e.g. cover_t.jpg, n5_r4.jpg) to determine which page type,
75 * size and format (etc) is being requested
76 * - Determine the leaf number corresponding to the page
77 * - Determine scaling values
78 * - Serve image request now that all information has been gathered
/*
 * Serve an image request that requires looking up the book metadata.
 *
 * Code path:
 *   - Build the book metadata from the 'id', 'itemPath', 'subPrefix' and
 *     'server' request parameters
 *   - Parse the requested page (e.g. cover_t.jpg, n5_r4.jpg) to determine
 *     which page type and options are being requested
 *   - Determine the image index, then the leaf number, for that page
 *   - Hand the assembled request to serveRequest()
 *
 * @param array $requestEnv Not read: the request environment is rebuilt
 *                          here from the metadata and the parsed page.
 * @throws Exception (via BRfatal) when the metadata cannot be built or the
 *                   requested page cannot be resolved.
 */
function serveLookupRequest($requestEnv) {
    // Read request parameters defensively so a missing key is null, not a notice
    $id        = isset($_REQUEST['id']) ? $_REQUEST['id'] : null;
    $itemPath  = isset($_REQUEST['itemPath']) ? $_REQUEST['itemPath'] : null;
    $subPrefix = isset($_REQUEST['subPrefix']) ? $_REQUEST['subPrefix'] : null;
    $server    = isset($_REQUEST['server']) ? $_REQUEST['server'] : null;
    $page      = isset($_REQUEST['page']) ? $_REQUEST['page'] : null;

    $brm = new BookReaderMeta();
    try {
        $metadata = $brm->buildMetadata($id, $itemPath, $subPrefix, $server);
    } catch (Exception $e) {
        // getMessage is a method; reading it as a property yields an empty message
        $this->BRfatal($e->getMessage());
    }

    // Index of image to return
    $imageIndex = null;

    // deal with subPrefix: the book id is its last path component
    if ($subPrefix) {
        // explode() replaces split(), which was removed in PHP 7
        $parts = explode('/', $subPrefix);
        $bookId = $parts[count($parts) - 1];
    } else {
        $bookId = $id;
    }

    $pageInfo = $this->parsePageRequest($page, $bookId);

    $basePage = $pageInfo['type'];

    switch ($basePage) {
        case 'title':
            if (! array_key_exists('titleIndex', $metadata)) {
                $this->BRfatal("No title page asserted in book");
            }
            $imageIndex = $metadata['titleIndex'];
            break;

        case 'cover':
            if (! array_key_exists('coverIndices', $metadata)) {
                $this->BRfatal("No cover asserted in book");
            }
            $imageIndex = $metadata['coverIndices'][0]; // $$$ TODO add support for other covers
            break;

        case 'preview':
            // Preference is:
            //   Cover page if book was published >= 1950
            //   Title page
            //   Cover page
            //   Page 0
            if ( array_key_exists('date', $metadata) && array_key_exists('coverIndices', $metadata) ) {
                if ($brm->parseYear($metadata['date']) >= 1950) {
                    $imageIndex = $metadata['coverIndices'][0];
                    break;
                }
            }
            if (array_key_exists('titleIndex', $metadata)) {
                $imageIndex = $metadata['titleIndex'];
                break;
            }
            if (array_key_exists('coverIndices', $metadata)) {
                $imageIndex = $metadata['coverIndices'][0];
                break;
            }

            // First page
            $imageIndex = 0;
            break;

        case 'n':
            // Accessible index page
            $imageIndex = intval($pageInfo['value']);
            break;

        case 'page':
            // Named page
            $index = array_search($pageInfo['value'], $metadata['pageNums']);
            if ($index === FALSE) {
                // Not found
                $this->BRfatal("Page not found");
            }
            $imageIndex = $index;
            break;

        default:
            // Shouldn't be possible
            $this->BRfatal("Unrecognized page type requested");
            break;
    }

    $leaf = $brm->leafForIndex($imageIndex, $metadata['leafNums']);

    $requestEnv = array(
        'zip' => $metadata['zip'],
        'file' => $brm->imageFilePath($leaf, $metadata['subPrefix'], $metadata['imageFormat']),
        'ext' => 'jpg',
    );

    // remove non-passthrough keys from pageInfo
    unset($pageInfo['type']);
    unset($pageInfo['value']);

    // add pageinfo to request; on a key clash the values set above win
    $requestEnv = array_merge($pageInfo, $requestEnv);

    // Return image data - will check privs
    $this->serveRequest($requestEnv);
}
192 * Returns a page image when all parameters such as the image stack location are
197 * Get info about requested image (input)
198 * Get info about requested output format
199 * Determine processing parameters
202 * Clean up temporary files
/*
 * Returns a page image when all parameters such as the image stack location
 * are passed in.
 *
 * Approach:
 *   - Validate the request (callback, archive exists and is readable)
 *   - Get info about the requested image (input)
 *   - Determine the power-of-two reduction and rotation
 *   - Build the shell pipeline: unarchive | decompress | compress
 *   - Stream the result; for jp2 sources retry once with a reduction
 *     limited by the 'Clevels' record if the first attempt fails
 *
 * Keys read from $requestEnv: 'zip', 'file', and optionally 'ext',
 * 'callback', 'rotate', 'height', 'width', 'scale'.
 *
 * @param array $requestEnv request parameters as listed above
 * @throws Exception (via BRfatal) on an invalid callback, a missing image
 *                   stack, or when image processing fails.
 */
function serveRequest($requestEnv) {
    // Process some of the request parameters
    $zipPath = $requestEnv['zip'];
    $file    = $requestEnv['file'];
    $ext     = !empty($requestEnv['ext']) ? $requestEnv['ext'] : 'jpg';

    if (isset($requestEnv['callback'])) {
        // validate callback is valid JS identifier (only)
        $callback = $requestEnv['callback'];
        $identifierPatt = '/^[[:alpha:]$_]([[:alnum:]$_])*$/';
        if (! preg_match($identifierPatt, $callback)) {
            $this->BRfatal('Invalid callback');
        }
    } else {
        $callback = null;
    }

    if ( !file_exists($zipPath) ) {
        $this->BRfatal('Image stack does not exist at ' . $zipPath);
    }
    // Make sure the image stack is readable - return 403 if not
    $this->checkPrivs($zipPath);

    // Get the image size and depth
    $imageInfo = $this->getImageInfo($zipPath, $file);

    // Output json if requested
    if ('json' == $ext) {
        // $$$ we should determine the output size first based on requested scale
        $this->outputJSON($imageInfo, $callback); // $$$ move to BookReaderRequest
        exit;
    }

    // Unfortunately kakadu requires us to know a priori if the
    // output file should be .ppm or .pgm.  By decompressing to
    // .bmp kakadu will write a file we can consistently turn into
    // .pnm.  Really kakadu should support .pnm as the file output
    // extension and automatically write ppm or pgm format as
    // appropriate.
    $this->decompressToBmp = true; // $$$ shouldn't be necessary if we use file info to determine output format
    if ($this->decompressToBmp) {
        $stdoutLink = '/tmp/stdout.bmp';
    } else {
        $stdoutLink = '/tmp/stdout.ppm';
    }

    $fileExt = strtolower(pathinfo($file, PATHINFO_EXTENSION));

    // Rotate is currently only supported for jp2 since it does not add server load.
    // Compare strictly as strings: a loose match would accept values such as
    // " 90" or "9e1" that then fail the strict "0" test used for passthrough.
    $allowedRotations = array("0", "90", "180", "270");
    $rotate = (isset($requestEnv['rotate']) && is_scalar($requestEnv['rotate']))
        ? (string) $requestEnv['rotate']
        : "0";
    if ( !in_array($rotate, $allowedRotations, true) ) {
        $rotate = "0";
    }

    // Image conversion options
    $pngOptions = '';
    $jpegOptions = '-quality 75';

    // The pbmreduce reduction factor produces an image with dimension 1/n
    // The kakadu reduction factor produces an image with dimension 1/(2^n)
    if (isset($requestEnv['height'])) {
        $powReduce = $this->nearestPow2Reduce($requestEnv['height'], $imageInfo['height']);
        $scale = pow(2, $powReduce);
    } else if (isset($requestEnv['width'])) {
        $powReduce = $this->nearestPow2Reduce($requestEnv['width'], $imageInfo['width']);
        $scale = pow(2, $powReduce);
    } else {
        // $$$ could be cleaner
        // Provide next smaller power of two reduction
        $scale = !empty($requestEnv['scale']) ? $requestEnv['scale'] : 1;

        if (is_scalar($scale) && array_key_exists($scale, self::$imageSizes)) {
            // Named size: fit the longer side of the source to that size
            $dimension = ($imageInfo['width'] > $imageInfo['height']) ? 'width' : 'height';
            // $imageSizes is static, so it must be read through self::
            $powReduce = $this->nearestPow2Reduce(self::$imageSizes[$scale], $imageInfo[$dimension]);
        } else {
            $powReduce = $this->nearestPow2ForScale($scale);
        }
        $scale = pow(2, $powReduce);
    }

    // Override depending on source image format
    // $$$ consider doing a 302 here instead, to make better use of the browser cache
    // Limit scaling for 1-bit images.  See https://bugs.edge.launchpad.net/bookreader/+bug/486011
    if (1 == $imageInfo['bits']) {
        if ($scale > 1) {
            $scale /= 2;
            $powReduce -= 1;

            // Hard limit so there are some black pixels to use!
            if ($scale > 4) {
                $scale = 4;
                $powReduce = 2;
            }
        }
    }

    if (!file_exists($stdoutLink)) {
        system('ln -s /dev/stdout ' . $stdoutLink);
    }

    putenv('LD_LIBRARY_PATH=/petabox/sw/lib/kakadu');

    $unzipCmd = $this->getUnarchiveCommand($zipPath, $file);

    $decompressCmd = $this->getDecompressCmd($imageInfo['type'], $powReduce, $rotate, $scale, $stdoutLink);

    // Non-integer scaling is currently disabled on the cluster

    switch ($ext) {
        case 'png':
            $compressCmd = ' | pnmtopng ' . $pngOptions;
            break;

        case 'jpeg':
        case 'jpg':
        default:
            $compressCmd = ' | pnmtojpeg ' . $jpegOptions;
            $ext = 'jpeg'; // for matching below
            break;
    }

    if (($ext == $fileExt) && ($scale == 1) && ($rotate === "0")) {
        // Just pass through original data if same format and size
        $cmd = $unzipCmd;
    } else {
        $cmd = $unzipCmd . $decompressCmd . $compressCmd;
    }

    $filenameForClient = $this->filenameForClient($file, $ext);

    $headers = array('Content-type: '. self::$MIMES[$ext],
                     'Cache-Control: max-age=15552000',
                     'Content-disposition: inline; filename=' . $filenameForClient);

    $errorMessage = '';
    if (! $this->passthruIfSuccessful($headers, $cmd, $errorMessage)) { // $$$ move to BookReaderRequest
        // $$$ automated reporting
        trigger_error('BookReader Processing Error: ' . $cmd . ' -- ' . $errorMessage, E_USER_WARNING);

        // Try some content-specific recovery
        $recovered = false;
        if ($imageInfo['type'] == 'jp2') {
            $records = $this->getJp2Records($zipPath, $file);
            // Retry with a reduction capped by the 'Clevels' record, or with no
            // reduction at all. A missing record must not produce an empty
            // "-reduce" argument.
            if (isset($records['Clevels']) && ($powReduce > intval($records['Clevels']))) {
                $powReduce = intval($records['Clevels']);
            } else {
                $powReduce = 0;
            }

            $cmd = $unzipCmd . $this->getDecompressCmd($imageInfo['type'], $powReduce, $rotate, $scale, $stdoutLink) . $compressCmd;
            if ($this->passthruIfSuccessful($headers, $cmd, $errorMessage)) { // $$$ move to BookReaderRequest
                $recovered = true;
            } else {
                trigger_error('BookReader fallback image processing also failed: ' . $errorMessage, E_USER_WARNING);
            }
        }

        if (! $recovered) {
            $this->BRfatal('Problem processing image - command failed');
        }
    }

    // NOTE(review): getDecompressCmd() creates a temporary file for tiff sources
    // but keeps its name in a local variable, so this method cannot remove it.
}
/*
 * Builds the shell command that writes one member of an archive to stdout.
 *
 * The archive type is taken from the (case-insensitive) suffix after the
 * last '.' of $archivePath: 'zip' uses unzip, 'tar' uses 7z. Both path
 * arguments are shell-escaped.
 *
 * @param string $archivePath path of the zip/tar image stack
 * @param string $file        name of the member inside the archive
 * @return string shell command fragment
 * @throws Exception (via BRfatal) when the path has no suffix or the suffix
 *                   is not a supported archive format.
 */
function getUnarchiveCommand($archivePath, $file)
{
    $matches = array();
    $hasSuffix = preg_match('/\.([^\.]+)$/', strtolower($archivePath), $matches);
    if (!$hasSuffix) {
        $this->BRfatal('Bad image stack path');
    }

    switch ($matches[1]) {
        case 'zip':
            return 'unzip -p '
                . escapeshellarg($archivePath)
                . ' ' . escapeshellarg($file);

        case 'tar':
            return ' ( 7z e -so '
                . escapeshellarg($archivePath)
                . ' ' . escapeshellarg($file) . ' 2>/dev/null ) ';

        default:
            $this->BRfatal('Incompatible archive format');
    }

    // Safety net; every path above either returns or throws
    $this->BRfatal('Bad image stack path or archive format');
}
423 * Returns the image type associated with the file extension.
/*
 * Looks up the image type for a file extension in self::$EXTENSIONS.
 *
 * @param string $extension file extension used as the lookup key
 * @return string the mapped image type
 * @throws Exception (via BRfatal) when the extension is not in the table.
 */
function imageExtensionToType($extension)
{
    if (!array_key_exists($extension, self::$EXTENSIONS)) {
        $this->BRfatal('Unknown image extension');
    }

    return self::$EXTENSIONS[$extension];
}
436 * Get the image information. The returned associative array fields will
437 * vary depending on the image type. The basic keys are width, height, type
/*
 * Get the image information for one member of an image stack.
 *
 * Delegates to getImageInfoFromExif() for every image type; that method
 * returns the keys width, height, bits and type.
 *
 * @param string $zipPath path of the image stack archive
 * @param string $file    name of the image inside the archive
 * @return array image information
 */
function getImageInfo($zipPath, $file)
{
    // The EXIF lookup is fast, so it is used regardless of file extension.
    // The extension-based dispatch that followed the return was unreachable.
    $imageInfo = $this->getImageInfoFromExif($zipPath, $file);

    return $imageInfo;
}
// Get the records of a JP2 as returned by kdu_expand
/*
 * Runs kdu_expand with -record on one archive member and parses its output.
 *
 * Each output line of the form "name=value" becomes one entry of the
 * result; lines without '=' are ignored. Only the first '=' splits the line.
 *
 * @param string $zipPath path of the image stack archive
 * @param string $file    name of the JP2 inside the archive
 * @return array map of record name to record value (strings)
 */
function getJp2Records($zipPath, $file)
{
    $cmd = $this->getUnarchiveCommand($zipPath, $file)
        . ' | ' . $this->kduExpand
        . ' -no_seek -quiet -i /dev/stdin -record /dev/stdout';
    exec($cmd, $output);

    $records = array();
    foreach ($output as $line) {
        $pair = explode("=", $line, 2);
        if (count($pair) == 2) {
            list($name, $value) = $pair;
            $records[$name] = $value;
        }
    }

    return $records;
}
481 * Get the image width, height and depth using the EXIF information.
/*
 * Get the image width, height and depth using the EXIF information.
 *
 * Pipes the archive member through exiftool and parses its "Tag: value"
 * output lines. The bit depth tag depends on the reported file type:
 * BitsPerComponent (jp2), BitsPerSample (tiff), BitDepth (png); jpeg is
 * fixed at 8. A tag that exiftool did not report is treated as 0.
 *
 * @param string $zipPath path of the image stack archive
 * @param string $file    name of the image inside the archive
 * @return array with keys 'width', 'height', 'bits' (ints) and 'type'
 *               (lower-cased FileType string)
 * @throws Exception (via BRfatal) when the file type is missing or unsupported.
 */
function getImageInfoFromExif($zipPath, $file)
{
    // We look for all the possible tags of interest then act on the
    // ones presumed present based on the file type
    $tagsToGet = ' -ImageWidth -ImageHeight -FileType'   // all formats
        . ' -BitsPerComponent -ColorSpace'               // jp2
        . ' -BitDepth'                                   // png
        . ' -BitsPerSample';                             // tiff

    $cmd = $this->getUnarchiveCommand($zipPath, $file)
        . ' | '. $this->exiftool . ' -S -fast' . $tagsToGet . ' -';

    $output = array(); // exec() appends, so start from a known-empty array
    exec($cmd, $output);

    $tags = array();
    foreach ($output as $line) {
        // Limit to 2 parts so a value that itself contains ": " stays intact
        $keyValue = explode(": ", $line, 2);
        if (count($keyValue) < 2) {
            // not a "Tag: value" line
            continue;
        }
        $tags[$keyValue[0]] = $keyValue[1];
    }

    $width  = isset($tags["ImageWidth"]) ? intval($tags["ImageWidth"]) : 0;
    $height = isset($tags["ImageHeight"]) ? intval($tags["ImageHeight"]) : 0;
    $type   = isset($tags["FileType"]) ? strtolower($tags["FileType"]) : '';

    switch ($type) {
        case "jp2":
            $bits = isset($tags["BitsPerComponent"]) ? intval($tags["BitsPerComponent"]) : 0;
            break;
        case "tiff":
            $bits = isset($tags["BitsPerSample"]) ? intval($tags["BitsPerSample"]) : 0;
            break;
        case "jpeg":
            $bits = 8;
            break;
        case "png":
            $bits = isset($tags["BitDepth"]) ? intval($tags["BitDepth"]) : 0;
            break;
        default:
            $this->BRfatal("Unsupported image type $type for file $file in $zipPath");
            break;
    }

    $retval = array('width' => $width, 'height' => $height,
        'bits' => $bits, 'type' => $type);

    return $retval;
}
533 * Output JSON given the imageInfo associative array
/*
 * Output JSON given the imageInfo associative array.
 *
 * Sends a text/plain Content-type header and echoes the JSON encoding of
 * $imageInfo. When $callback is truthy the JSON is wrapped as
 * "callback(json);".
 *
 * @param array       $imageInfo data to encode
 * @param string|null $callback  optional wrapper function name
 */
function outputJSON($imageInfo, $callback)
{
    header('Content-type: text/plain');

    $payload = json_encode($imageInfo);
    echo $callback ? $callback . '(' . $payload . ');' : $payload;
}
/*
 * Builds the shell pipeline fragment that turns the unarchived image data
 * into pnm, including any size reduction.
 *
 *   jp2  - kdu_expand with -reduce/-rotate, writing to $stdoutLink
 *          (followed by bmptopnm when $this->decompressToBmp is set)
 *   tiff - written to a temporary file first, then tifftopnm + reduceCommand
 *   jpeg - jpegtopnm + reduceCommand
 *   png  - pngtopnm + reduceCommand
 *
 * Numeric arguments are forced to integers and file names are shell-escaped
 * before being placed in the command.
 *
 * @param string $imageType  'jp2', 'tiff', 'jpeg' or 'png'
 * @param int    $powReduce  power-of-two reduction passed to kdu_expand
 * @param string $rotate     rotation in degrees passed to kdu_expand
 * @param int    $scale      reduction factor passed to reduceCommand()
 * @param string $stdoutLink path kdu_expand writes its output to
 * @return string shell command fragment, starting with a pipe or redirect
 * @throws Exception (via BRfatal) for an unknown image type or when the
 *                   temporary file cannot be created.
 */
function getDecompressCmd($imageType, $powReduce, $rotate, $scale, $stdoutLink) {

    switch ($imageType) {
        case 'jp2':
            $decompressCmd =
                " | " . $this->kduExpand
                . " -no_seek -quiet -reduce " . intval($powReduce)
                . " -rotate " . intval($rotate)
                . " -i /dev/stdin -o " . escapeshellarg($stdoutLink);
            if (!empty($this->decompressToBmp)) {
                // We suppress output since bmptopnm always outputs on stderr
                $decompressCmd .= ' | (bmptopnm 2>/dev/null)';
            }
            break;

        case 'tiff':
            // We need to create a temporary file for tifftopnm since it cannot
            // work on a pipe (the file must be seekable).
            // We use the BookReaderTiff prefix to give a hint in case things don't
            // get cleaned up. NOTE(review): nothing in this method removes the file.
            $tempFile = tempnam("/tmp", "BookReaderTiff");
            if (false === $tempFile) {
                $this->BRfatal('Could not create temporary file');
            }
            $quotedTempFile = escapeshellarg($tempFile);

            // $$$ look at bit depth when reducing
            $decompressCmd =
                ' > ' . $quotedTempFile . ' ; tifftopnm ' . $quotedTempFile . ' 2>/dev/null' . $this->reduceCommand($scale);
            break;

        case 'jpeg':
            $decompressCmd = ' | ( jpegtopnm 2>/dev/null ) ' . $this->reduceCommand($scale);
            break;

        case 'png':
            $decompressCmd = ' | ( pngtopnm 2>/dev/null ) ' . $this->reduceCommand($scale);
            break;

        default:
            $this->BRfatal('Unknown image type: ' . $imageType);
            break;
    }

    return $decompressCmd;
}
584 // If the command has its initial output on stdout the headers will be emitted followed
585 // by the stdout output. If initial output is on stderr an error message will be
589 // true - if command emits stdout and has zero exit code
590 // false - command has initial output on stderr or non-zero exit code
591 // &$errorMessage - error string if there was an error
593 // $$$ Tested with our command-line image processing. May be deadlocks for
/*
 * Runs a shell command and streams its stdout to the client only if the
 * command's first output arrives on stdout.
 *
 * If the first output is on stdout (and only stdout), the given headers are
 * emitted followed by everything the command writes to stdout. Otherwise
 * the command's stderr is collected as the error message and nothing is
 * sent. A non-zero exit code always makes the call fail, even after output
 * has been streamed.
 *
 * $$$ No timeout is applied while waiting for the first output.
 *
 * @param array  $headers      header lines to send before the output
 * @param string $cmd          shell command to run
 * @param string $errorMessage (out) error text; '' when there was no error
 * @return bool true if the command emitted stdout first and exited with 0
 */
function passthruIfSuccessful($headers, $cmd, &$errorMessage)
{
    $retVal = false;
    $errorMessage = '';

    $descriptorspec = array(
        0 => array("pipe", "r"),  // stdin is a pipe that the child will read from
        1 => array("pipe", "w"),  // stdout is a pipe that the child will write to
        2 => array("pipe", "w"),  // stderr is a pipe to write to
    );

    $cwd = NULL;
    $env = NULL;

    $process = proc_open($cmd, $descriptorspec, $pipes, $cwd, $env);

    if (!is_resource($process)) {
        // Report the failure instead of returning false with an empty message
        $errorMessage = 'Could not start command';
        return false;
    }

    // $pipes now looks like this:
    // 0 => writeable handle connected to child stdin
    // 1 => readable handle connected to child stdout
    // 2 => readable handle connected to child stderr
    $stdin  = $pipes[0];
    $stdout = $pipes[1];
    $stderr = $pipes[2];

    // check whether we get input first on stdout or stderr
    $read = array($stdout, $stderr);
    $write = NULL;
    $except = NULL;
    $numChanged = stream_select($read, $write, $except, NULL); // $$$ no timeout

    if (false === $numChanged) {
        // select failed: $read is not meaningful, so do not inspect it and
        // do not block on stderr
        $errorMessage = 'Select failed';
    } else if ((1 == $numChanged) && ($read[0] == $stdout)) {
        // Got output first on stdout (only)
        // $$$ make sure we get all stdout
        $output = fopen('php://output', 'w');
        foreach ($headers as $header) {
            header($header);
        }
        stream_copy_to_stream($stdout, $output);
        fclose($output); // okay since tied to special php://output
        $retVal = true;
    } else {
        // Got output on stderr
        // $$$ make sure we get all stderr
        $errorMessage = stream_get_contents($stderr);
    }

    fclose($stderr);
    fclose($stdout);
    fclose($stdin);

    // It is important that you close any pipes before calling
    // proc_close in order to avoid a deadlock
    $cmdRet = proc_close($process);
    if (0 != $cmdRet) {
        $retVal = false;
        $errorMessage .= "Command failed with result code " . $cmdRet;
    }

    return $retVal;
}
/*
 * Aborts processing by throwing an Exception.
 *
 * @param string $string description of the problem; the exception message
 *                       is this text prefixed with "Image error: "
 * @throws Exception always
 */
function BRfatal($string) {
    $message = "Image error: $string";

    throw new Exception($message);
}
668 // Returns true if using a power node
/*
 * Returns true if using a power node.
 *
 * The check is: the count of lspci lines mentioning "Realtek" is not "0",
 * or failing that, /proc/cpuinfo mentions "AMD".
 *
 * @return bool
 */
function onPowerNode() {
    exec("lspci | fgrep -c Realtek", $lines, $exitCode);
    if ("0" != $lines[0]) {
        return true;
    }

    // egrep -q exits with 0 when a match was found
    exec("egrep -q AMD /proc/cpuinfo", $lines, $exitCode);

    return ($exitCode == 0);
}
/*
 * Builds the pnmscale pipeline fragment for an integer reduction.
 *
 * @param int $scale reduction factor; 1 means no reduction
 * @return string '' when $scale is 1, otherwise a "| pnmscale ..." fragment
 *                (with -nomix unless onPowerNode() returns true)
 */
function reduceCommand($scale) {
    if ($scale == 1) {
        return '';
    }

    $pnmscale = $this->onPowerNode()
        ? ' | pnmscale -reduce '
        : ' | pnmscale -nomix -reduce ';

    return $pnmscale . $scale . ' 2>/dev/null ';
}
/*
 * Ends the request with "403 Forbidden" unless the file is readable.
 *
 * @param string $filename path to check with is_readable()
 */
function checkPrivs($filename) {
    if (is_readable($filename)) {
        return;
    }

    header('HTTP/1.1 403 Forbidden');
    exit(0);
}
701 // Given file path (inside archive) and output file extension, return a filename
702 // suitable for Content-disposition header
/*
 * Given file path (inside archive) and output file extension, return a
 * filename suitable for the Content-disposition header.
 *
 * The result is the path's file name without directory or extension,
 * followed by '.' and $ext; 'jpeg' is shortened to 'jpg'.
 *
 * @param string $filePath path of the image inside the archive
 * @param string $ext      output extension
 * @return string
 */
function filenameForClient($filePath, $ext) {
    $suffix = ('jpeg' == $ext) ? 'jpg' : $ext;

    return pathinfo($filePath, PATHINFO_FILENAME) . '.' . $suffix;
}
711 // Returns the nearest power of 2 reduction factor that results in a larger image
/*
 * Returns the nearest power of 2 reduction factor that results in a larger
 * image.
 *
 * @param mixed $desiredDimension requested size in pixels (may come straight
 *                                from the request)
 * @param mixed $sourceDimension  size of the source image in pixels
 * @return int exponent n such that reducing by 2^n keeps the image at least
 *             as large as requested; 0 when no valid size was requested
 */
function nearestPow2Reduce($desiredDimension, $sourceDimension) {
    $desired = floatval($desiredDimension);
    if ($desired <= 0) {
        // Zero, negative or non-numeric request: no reduction, and no
        // division by zero (a DivisionByZeroError on PHP 8)
        return 0;
    }

    $ratio = floatval($sourceDimension) / $desired;
    return $this->nearestPow2ForScale($ratio);
}
717 // Returns nearest power of 2 reduction factor that results in a larger image
/*
 * Returns nearest power of 2 reduction factor that results in a larger image.
 *
 * The scale is truncated to an integer; the result is the largest n with
 * 2^n <= scale, or 0 for any scale of 1 or less.
 *
 * @param mixed $scale requested downscaling factor
 * @return int power-of-two exponent
 */
function nearestPow2ForScale($scale) {
    $remaining = intval($scale);
    if ($remaining <= 1) {
        return 0;
    }

    // Count how many times the value can be halved before reaching 1,
    // e.g. 5 (binary 101) -> 2
    $exponent = 0;
    while ($remaining > 1) {
        $remaining >>= 1;
        $exponent++;
    }

    return $exponent;
}
728 * Parses a page request like "page5_r2.jpg" or "cover_t.jpg" to corresponding
729 * page type, size, reduce, and format
731 function parsePageRequest($pageRequest, $bookPrefix) {
733 // Will hold parsed results
737 $pageRequest = strtolower($pageRequest);
739 // Pull off extension
740 if (preg_match('#(.*)\.([^.]+)$#', $pageRequest, $matches) === 1) {
741 $pageRequest = $matches[1];
742 $extension = $matches[2];
743 if ($extension == 'jpeg') {
749 $pageInfo['extension'] = $extension;
752 $parts = explode('_', $pageRequest);
754 // Remove book prefix if it was included (historical)
755 if ($parts[0] == $bookPrefix) {
759 if (count($parts) === 0) {
760 $this->BRfatal('No page type specified');
762 $page = array_shift($parts);
768 'preview' => 'single',
772 // Look for known page types
773 foreach ( $pageTypes as $pageName => $kind ) {
774 if ( preg_match('#^(' . $pageName . ')(.*)#', $page, $matches) === 1 ) {
775 $pageInfo['type'] = $matches[1];
778 $pageInfo['value'] = $matches[2];
781 $pageInfo['value'] = intval($matches[2]);
789 if ( !array_key_exists('type', $pageInfo) ) {
790 $this->BRfatal('Unrecognized page type');
793 // Look for other known parts
794 foreach ($parts as $part) {
795 if ( in_array($part, self::$imageSizes) ) {
796 $pageInfo['size'] = $part;
800 // Key must be alpha, value must start with digit and contain digits, alpha, ',' or '.'
801 // Should prevent injection of strange values into the redirect to datanode
802 if ( preg_match('#^([a-z]+)(\d[a-z0-9,.]*)#', $part, $matches) === 0) {
808 $value = $matches[2];
810 if ( array_key_exists($key, self::$imageUrlKeys) ) {
811 $pageInfo[self::$imageUrlKeys[$key]] = $value;
815 // If we hit here, was unrecognized (no action)