4 Copyright(c) 2008-2010 Internet Archive. Software license AGPL version 3.
6 This file is part of BookReader. The full source code can be found at GitHub:
7 http://github.com/openlibrary/bookreader
9 The canonical short name of an image type is the same as in the MIME type.
10 For example both .jpeg and .jpg are considered to have type "jpeg" since
11 the MIME type is "image/jpeg".
13 BookReader is free software: you can redistribute it and/or modify
14 it under the terms of the GNU Affero General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 BookReader is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU Affero General Public License for more details.
23 You should have received a copy of the GNU Affero General Public License
24 along with BookReader. If not, see <http://www.gnu.org/licenses/>.
27 require_once("BookReaderMeta.inc.php");
// Image-serving class for BookReader. The members below are lookup tables and
// tool paths used by the methods further down in this class.
29 class BookReaderImages
// Maps an image-type key to its MIME type; serveRequest() uses it to build the
// Content-type header.
31 public static $MIMES = array('gif' => 'image/gif',
33 'jpg' => 'image/jpeg',
34 'jpeg' => 'image/jpeg',
36 'tif' => 'image/tiff',
37 'tiff' => 'image/tiff');
// Maps a file extension to its canonical image type; read by imageExtensionToType().
39 public static $EXTENSIONS = array('gif' => 'gif',
47 // Width when generating thumbnails
// Keyed by named size; serveRequest() compares the value against the source width
// or height, and parsePageRequest() uses the keys to recognise a size token.
48 public static $imageSizes = array(
55 // Keys in the image permalink urls, e.g. http://www.archive.org/download/itemid/page/cover_{keyval}_{keyval}.jpg
// Maps the short key found in a URL part to the request parameter name it sets
// (see parsePageRequest()).
56 public static $imageUrlKeys = array(
57 //'r' => 'reduce', // pow of 2 reduction
58 's' => 'scale', // $$$ scale is downscaling factor in BookReaderImages but most people call this "reduce"
66 // Paths to command-line tools
67 var $exiftool = '/petabox/sw/books/exiftool/exiftool';
68 var $kduExpand = '/petabox/sw/bin/kdu_expand';
70 // Name of temporary files, to be cleaned at exit
// getDecompressCmd() appends to this list when it creates a temporary TIFF file.
71 var $tempFiles = array();
74 * Serve an image request that requires looking up the book metadata
78 * - Parse the requested page (e.g. cover_t.jpg, n5_r4.jpg) to determine which page type,
79 * size and format (etc) is being requested
80 * - Determine the leaf number corresponding to the page
81 * - Determine scaling values
82 * - Serve image request now that all information has been gathered
// Serves a page image identified by book id and page name: builds the book
// metadata, parses the page request, resolves it to a leaf and archive member,
// then hands the completed $requestEnv to serveRequest().
// Reads 'id', 'itemPath', 'subPrefix', 'server' and 'page' directly from $_REQUEST.
85 function serveLookupRequest($requestEnv) {
86 $brm = new BookReaderMeta();
88 $metadata = $brm->buildMetadata($_REQUEST['id'], $_REQUEST['itemPath'], $_REQUEST['subPrefix'], $_REQUEST['server']);
89 } catch (Exception $e) {
// NOTE(review): getMessage is read as a property here (no call parentheses), so
// the exception text is probably not what reaches BRfatal -- likely should be
// $e->getMessage(). TODO confirm.
90 $this->BRfatal($e->getMessage);
93 $page = $_REQUEST['page'];
95 // Index of image to return
98 // deal with subPrefix
99 if ($_REQUEST['subPrefix']) {
// The book id is taken as the last '/'-separated component of subPrefix.
// NOTE(review): split() was removed in PHP 7; explode('/', ...) is equivalent
// for a literal delimiter -- confirm the target PHP version.
100 $parts = split('/', $_REQUEST['subPrefix']);
101 $bookId = $parts[count($parts) - 1 ];
103 $bookId = $_REQUEST['id'];
// $pageInfo carries at least 'type' and 'value'; the remaining keys are passed
// through to serveRequest() further down.
106 $pageInfo = $this->parsePageRequest($page, $bookId);
108 $basePage = $pageInfo['type'];
114 if (! array_key_exists('titleIndex', $metadata)) {
115 $this->BRfatal("No title page asserted in book");
117 $imageIndex = $metadata['titleIndex'];
120 /* Old 'cover' behaviour where it would show cover 0 if it exists or return 404.
121 Could be re-added as cover0, cover1, etc
123 if (! array_key_exists('coverIndices', $metadata)) {
124 $this->BRfatal("No cover asserted in book");
126 $imageIndex = $metadata['coverIndices'][0]; // $$$ TODO add support for other covers
131 case 'cover': // Show our best guess if cover is requested
133 // Cover page if book was published >= 1950
138 if ( array_key_exists('date', $metadata) && array_key_exists('coverIndices', $metadata) ) {
139 if ($brm->parseYear($metadata['date']) >= 1950) {
140 $imageIndex = $metadata['coverIndices'][0];
144 if (array_key_exists('titleIndex', $metadata)) {
145 $imageIndex = $metadata['titleIndex'];
148 if (array_key_exists('coverIndices', $metadata)) {
149 $imageIndex = $metadata['coverIndices'][0];
158 // Accessible index page
159 $imageIndex = intval($pageInfo['value']);
// Look the asserted page number up in the metadata page-number list. The search
// is non-strict; the === FALSE test below keeps a hit at index 0 from being
// treated as "not found".
164 $index = array_search($pageInfo['value'], $metadata['pageNums']);
165 if ($index === FALSE) {
167 $this->BRfatal("Page not found");
171 $imageIndex = $index;
175 // Leaf explicitly specified
176 $leaf = $pageInfo['value'];
180 // Shouldn't be possible
181 $this->BRfatal("Unrecognized page type requested");
// Only the explicit-leaf branch assigns $leaf directly; presumably it is
// initialised to null in lines not shown here -- TODO confirm.
186 if (is_null($leaf)) {
187 // Leaf was not explicitly set -- look it up
188 $leaf = $brm->leafForIndex($imageIndex, $metadata['leafNums']);
192 'zip' => $metadata['zip'],
193 'file' => $brm->imageFilePath($leaf, $metadata['subPrefix'], $metadata['imageFormat']),
197 // remove non-passthrough keys from pageInfo
198 unset($pageInfo['type']);
199 unset($pageInfo['value']);
201 // add pageinfo to request
// array_merge gives precedence to the later array: a key present in both keeps
// the $requestEnv value.
202 $requestEnv = array_merge($pageInfo, $requestEnv);
204 // Return image data - will check privs
205 $this->serveRequest($requestEnv);
210 * Returns a page image when all parameters such as the image stack location are
215 * Get info about requested image (input)
216 * Get info about requested output format
217 * Determine processing parameters
220 * Clean up temporary files
// Serves one image from an archive: validates the request, reads the source
// image info, works out the power-of-2 reduction, builds a shell pipeline
// (unarchive | decompress | compress) and streams its output to the client.
// Reads 'zip', 'file', 'ext', 'callback', 'rotate', 'height', 'width', 'size'
// and 'scale' from $requestEnv.
222 function serveRequest($requestEnv) {
223 // Process some of the request parameters
224 $zipPath = $requestEnv['zip'];
225 $file = $requestEnv['file'];
227 $ext = $requestEnv['ext'];
232 if (isset($requestEnv['callback'])) {
233 // validate callback is valid JS identifier (only)
234 $callback = $requestEnv['callback'];
// Accepts a leading letter, '$' or '_' followed by letters, digits, '$' or '_'.
235 $identifierPatt = '/^[[:alpha:]$_]([[:alnum:]$_])*$/';
236 if (! preg_match($identifierPatt, $callback)) {
237 $this->BRfatal('Invalid callback');
243 if ( !file_exists($zipPath) ) {
244 $this->BRfatal('Image stack does not exist at ' . $zipPath);
246 // Make sure the image stack is readable - return 403 if not
247 $this->checkPrivs($zipPath);
250 // Get the image size and depth
251 $imageInfo = $this->getImageInfo($zipPath, $file);
253 // Output json if requested
254 if ('json' == $ext) {
255 // $$$ we should determine the output size first based on requested scale
256 $this->outputJSON($imageInfo, $callback); // $$$ move to BookReaderRequest
260 // Unfortunately kakadu requires us to know a priori if the
261 // output file should be .ppm or .pgm. By decompressing to
262 // .bmp kakadu will write a file we can consistently turn into
263 // .pnm. Really kakadu should support .pnm as the file output
264 // extension and automatically write ppm or pgm format as
// The flag is set to true unconditionally, so the '/tmp/stdout.ppm' assignment
// below is presumably in an else branch that never runs -- TODO confirm.
// getDecompressCmd() also reads this property.
266 $this->decompressToBmp = true; // $$$ shouldn't be necessary if we use file info to determine output format
267 if ($this->decompressToBmp) {
268 $stdoutLink = '/tmp/stdout.bmp';
270 $stdoutLink = '/tmp/stdout.ppm';
273 $fileExt = strtolower(pathinfo($file, PATHINFO_EXTENSION));
275 // Rotate is currently only supported for jp2 since it does not add server load
276 $allowedRotations = array("0", "90", "180", "270");
// NOTE(review): 'rotate' is read without an isset() guard and in_array() is
// non-strict here; the handling of a rejected value is in lines not shown.
277 $rotate = $requestEnv['rotate'];
278 if ( !in_array($rotate, $allowedRotations) ) {
282 // Image conversion options
284 $jpegOptions = '-quality 75';
286 // The pbmreduce reduction factor produces an image with dimension 1/n
287 // The kakadu reduction factor produces an image with dimension 1/(2^n)
289 // We interpret the requested size and scale, look at image format, and determine the
290 // actual scaling to be returned to the client. We generally return the largest
291 // power of 2 reduction that is larger than the requested size in order to reduce
292 // image processing load on our cluster. The client should then scale to their final
295 // Set scale from height or width if set
// $powReduce is the exponent n; $scale is the matching linear factor 2^n.
296 if (isset($requestEnv['height'])) {
297 $powReduce = $this->nearestPow2Reduce($requestEnv['height'], $imageInfo['height']);
298 $scale = pow(2, $powReduce);
299 } else if (isset($requestEnv['width'])) {
300 $powReduce = $this->nearestPow2Reduce($requestEnv['width'], $imageInfo['width']);
301 $scale = pow(2, $powReduce);
304 // Set scale from named size (e.g. 'large') if set
// NOTE(review): 'size' (and 'scale' below) are read without isset() guards.
305 $size = $requestEnv['size'];
306 if ( $size && array_key_exists($size, self::$imageSizes)) {
// The aspect ratio picks which source dimension is compared against the named
// size; the comparison itself is in lines not shown.
307 $srcRatio = floatval($imageInfo['width']) / floatval($imageInfo['height']);
310 $dimension = 'width';
312 $dimension = 'height';
314 $powReduce = $this->nearestPow2Reduce(self::$imageSizes[$size], $imageInfo[$dimension]);
315 $scale = pow(2, $powReduce);
318 // No named size - use explicit scale, if given
319 $scale = $requestEnv['scale'];
323 $powReduce = $this->nearestPow2ForScale($scale);
324 // ensure integer scale
325 $scale = pow(2, $powReduce);
329 // Override depending on source image format
330 // $$$ consider doing a 302 here instead, to make better use of the browser cache
331 // Limit scaling for 1-bit images. See https://bugs.edge.launchpad.net/bookreader/+bug/486011
332 if (1 == $imageInfo['bits']) {
337 // Hard limit so there are some black pixels to use!
// $stdoutLink is a symlink to /dev/stdout; getDecompressCmd() passes it to
// kdu_expand as the -o output file. It is created only if missing.
345 if (!file_exists($stdoutLink))
347 system('ln -s /dev/stdout ' . $stdoutLink);
350 putenv('LD_LIBRARY_PATH=/petabox/sw/lib/kakadu');
352 $unzipCmd = $this->getUnarchiveCommand($zipPath, $file);
354 $decompressCmd = $this->getDecompressCmd($imageInfo['type'], $powReduce, $rotate, $scale, $stdoutLink);
356 // Non-integer scaling is currently disabled on the cluster
357 // if (isset($_REQUEST['height'])) {
358 // $cmd .= " | pnmscale -height {$_REQUEST['height']} ";
363 $compressCmd = ' | pnmtopng ' . $pngOptions;
369 $compressCmd = ' | pnmtojpeg ' . $jpegOptions;
370 $ext = 'jpeg'; // for matching below
// Passthrough requires matching extension, no reduction, and rotate exactly "0"
// (strict string comparison).
375 if (($ext == $fileExt) && ($scale == 1) && ($rotate === "0")) {
376 // Just pass through original data if same format and size
379 $cmd = $unzipCmd . $decompressCmd . $compressCmd;
384 $filenameForClient = $this->filenameForClient($file, $ext);
386 $headers = array('Content-type: '. self::$MIMES[$ext],
387 'Cache-Control: max-age=15552000',
388 'Content-disposition: inline; filename=' . $filenameForClient);
392 if (! $this->passthruIfSuccessful($headers, $cmd, $errorMessage)) { // $$$ move to BookReaderRequest
393 // $$$ automated reporting
394 trigger_error('BookReader Processing Error: ' . $cmd . ' -- ' . $errorMessage, E_USER_WARNING);
396 // Try some content-specific recovery
// For jp2, clamp the reduction to the Clevels value reported by kdu_expand and
// retry once.
398 if ($imageInfo['type'] == 'jp2') {
399 $records = $this->getJp2Records($zipPath, $file);
400 if ($powReduce > intval($records['Clevels'])) {
401 $powReduce = $records['Clevels'];
// NOTE(review): $reduce is assigned here, but the retry command below passes
// $scale, which is not updated in the visible lines -- confirm $reduce is used
// in elided code or whether $scale was intended.
402 $reduce = pow(2, $powReduce);
408 $cmd = $unzipCmd . $this->getDecompressCmd($imageInfo['type'], $powReduce, $rotate, $scale, $stdoutLink) . $compressCmd;
409 if ($this->passthruIfSuccessful($headers, $cmd, $errorMessage)) { // $$$ move to BookReaderRequest
413 trigger_error('BookReader fallback image processing also failed: ' . $errorMessage, E_USER_WARNING);
418 $this->BRfatal('Problem processing image - command failed');
// Returns the shell command fragment that writes one archive member to stdout.
// The archive type is chosen from the lower-cased suffix of $archivePath
// ('zip' or 'tar'); both arguments are passed through escapeshellarg().
// Calls BRfatal() when the path has no suffix or the suffix is unsupported.
425 function getUnarchiveCommand($archivePath, $file)
427 $lowerPath = strtolower($archivePath);
// Captures everything after the last '.' in the path.
428 if (preg_match('/\.([^\.]+)$/', $lowerPath, $matches)) {
429 $suffix = $matches[1];
431 if ($suffix == 'zip') {
// The command prefix for the zip branch is on a line not shown here.
433 . escapeshellarg($archivePath)
434 . ' ' . escapeshellarg($file);
435 } else if ($suffix == 'tar') {
// 7z extracts to stdout (-so); its stderr is discarded.
436 return ' ( 7z e -so '
437 . escapeshellarg($archivePath)
438 . ' ' . escapeshellarg($file) . ' 2>/dev/null ) ';
440 $this->BRfatal('Incompatible archive format');
444 $this->BRfatal('Bad image stack path');
447 $this->BRfatal('Bad image stack path or archive format');
452 * Returns the image type associated with the file extension.
// Looks $extension up in self::$EXTENSIONS and returns the mapped image type;
// calls BRfatal('Unknown image extension') when the key is absent.
// The lookup is exact, so callers are expected to pass the key form used in
// the table (the visible caller lower-cases it first).
454 function imageExtensionToType($extension)
457 if (array_key_exists($extension, self::$EXTENSIONS)) {
458 return self::$EXTENSIONS[$extension];
460 $this->BRfatal('Unknown image extension');
465 * Get the image information. The returned associative array fields will
466 * vary depending on the image type. The basic keys are width, height, type
// Returns the image info array for $file inside $zipPath by delegating to
// getImageInfoFromExif().
469 function getImageInfo($zipPath, $file)
471 return $this->getImageInfoFromExif($zipPath, $file); // this is fast
// NOTE(review): the statements below follow the return above and so appear to
// be unreachable. They also call imageExtensionToType / getImageInfoFromJp2 /
// getImageInfoFromExif as bare functions rather than via $this->, which would
// fail if this path were ever re-enabled -- TODO confirm against the elided lines.
474 $fileExt = strtolower(pathinfo($file, PATHINFO_EXTENSION));
475 $type = imageExtensionToType($fileExt);
479 return getImageInfoFromJp2($zipPath, $file);
482 return getImageInfoFromExif($zipPath, $file);
487 // Get the records of a JP2 as returned by kdu_expand
// Runs kdu_expand with -record on the archive member and collects its
// "key=value" output lines into the $records array (key => value).
// serveRequest() reads the 'Clevels' entry from the result.
488 function getJp2Records($zipPath, $file)
// Pipeline: unarchive to stdout | kdu_expand reading /dev/stdin and writing the
// record to /dev/stdout. The line that executes $cmd is not shown here.
491 $cmd = $this->getUnarchiveCommand($zipPath, $file)
492 . ' | ' . $this->kduExpand
493 . ' -no_seek -quiet -i /dev/stdin -record /dev/stdout';
497 foreach ($output as $line) {
// Split on the first '=' only, so values may themselves contain '='.
498 $elems = explode("=", $line, 2);
499 if (1 == count($elems)) {
500 // delimiter not found
503 $records[$elems[0]] = $elems[1];
510 * Get the image width, height and depth using the EXIF information.
// Pipes the archive member into exiftool and builds an array with the keys
// 'width', 'height', 'bits' and 'type' from the reported tags.
// Calls BRfatal() for a file type it does not handle.
512 function getImageInfoFromExif($zipPath, $file)
515 // We look for all the possible tags of interest then act on the
516 // ones presumed present based on the file type
517 $tagsToGet = ' -ImageWidth -ImageHeight -FileType' // all formats
518 . ' -BitsPerComponent -ColorSpace' // jp2
519 . ' -BitDepth' // png
520 . ' -BitsPerSample'; // tiff
// exiftool reads the image from stdin (trailing '-'). The line that executes
// $cmd is not shown here.
522 $cmd = $this->getUnarchiveCommand($zipPath, $file)
523 . ' | '. $this->exiftool . ' -S -fast' . $tagsToGet . ' -';
527 foreach ($output as $line) {
// NOTE(review): explode() has no limit here, so a value that itself contains
// ": " keeps only its first segment, and a line without ": " leaves index 1
// undefined -- confirm exiftool output never does either for these tags.
528 $keyValue = explode(": ", $line);
529 $tags[$keyValue[0]] = $keyValue[1];
532 $width = intval($tags["ImageWidth"]);
533 $height = intval($tags["ImageHeight"]);
534 $type = strtolower($tags["FileType"]);
// The bit depth comes from a different tag per file type; the selecting
// switch/case lines are not shown here.
538 $bits = intval($tags["BitsPerComponent"]);
541 $bits = intval($tags["BitsPerSample"]);
547 $bits = intval($tags["BitDepth"]);
550 $this->BRfatal("Unsupported image type $type for file $file in $zipPath");
555 $retval = Array('width' => $width, 'height' => $height,
556 'bits' => $bits, 'type' => $type);
562 * Output JSON given the imageInfo associative array
// Emits $imageInfo as JSON with a 'Content-type: text/plain' header. The
// callback form wraps the JSON as "callback(json);"; the condition guarding
// that wrap and the statement that prints the result are in lines not shown.
// serveRequest() validates $callback as a JS identifier before calling this.
564 function outputJSON($imageInfo, $callback)
566 header('Content-type: text/plain');
567 $jsonOutput = json_encode($imageInfo);
569 $jsonOutput = $callback . '(' . $jsonOutput . ');';
// Returns the shell fragment that converts the unarchived image on stdin into
// PNM on stdout, chosen by $imageType. The jp2 branch reduces and rotates
// inside kdu_expand using $powReduce and $rotate; the other branches append
// reduceCommand($scale). Calls BRfatal() for an unknown type.
574 function getDecompressCmd($imageType, $powReduce, $rotate, $scale, $stdoutLink) {
576 switch ($imageType) {
// kdu_expand writes to $stdoutLink, which serveRequest() sets up as a symlink
// to /dev/stdout.
579 " | " . $this->kduExpand . " -no_seek -quiet -reduce $powReduce -rotate $rotate -i /dev/stdin -o " . $stdoutLink;
// decompressToBmp is a property assigned in serveRequest().
580 if ($this->decompressToBmp) {
581 // We suppress output since bmptopnm always outputs on stderr
582 $decompressCmd .= ' | (bmptopnm 2>/dev/null)';
587 // We need to create a temporary file for tifftopnm since it cannot
588 // work on a pipe (the file must be seekable).
589 // We use the BookReaderTiff prefix to give a hint in case things don't
591 $tempFile = tempnam("/tmp", "BookReaderTiff");
// Recorded so the temp file can be removed by the cleanup code later.
592 array_push($this->tempFiles, $tempFile);
594 // $$$ look at bit depth when reducing
596 ' > ' . $tempFile . ' ; tifftopnm ' . $tempFile . ' 2>/dev/null' . $this->reduceCommand($scale);
600 $decompressCmd = ' | ( jpegtopnm 2>/dev/null ) ' . $this->reduceCommand($scale);
604 $decompressCmd = ' | ( pngtopnm 2>/dev/null ) ' . $this->reduceCommand($scale);
608 $this->BRfatal('Unknown image type: ' . $imageType);
611 return $decompressCmd;
614 // If the command has its initial output on stdout the headers will be emitted followed
615 // by the stdout output. If initial output is on stderr an error message will be
619 // true - if command emits stdout and has zero exit code
620 // false - command has initial output on stderr or non-zero exit code
621 // &$errorMessage - error string if there was an error
623 // $$$ Tested with our command-line image processing. May be deadlocks for
// Runs $cmd via proc_open and waits for the first readable data on the child's
// stdout or stderr. When stdout alone is ready, the child's stdout is copied to
// php://output; otherwise stderr is read into $errorMessage.
// $errorMessage is passed by reference and is also appended to with the exit
// code (the condition guarding that append is in lines not shown).
625 function passthruIfSuccessful($headers, $cmd, &$errorMessage)
630 $descriptorspec = array(
631 0 => array("pipe", "r"), // stdin is a pipe that the child will read from
632 1 => array("pipe", "w"), // stdout is a pipe that the child will write to
633 2 => array("pipe", "w"), // stderr is a pipe to write to
639 $process = proc_open($cmd, $descriptorspec, $pipes, $cwd, $env);
641 if (is_resource($process)) {
642 // $pipes now looks like this:
643 // 0 => writeable handle connected to child stdin
644 // 1 => readable handle connected to child stdout
645 // 2 => readable handle connected to child stderr
651 // check whether we get input first on stdout or stderr
652 $read = array($stdout, $stderr);
// A NULL timeout makes stream_select block until a stream becomes readable;
// it rewrites $read to contain only the ready streams.
655 $numChanged = stream_select($read, $write, $except, NULL); // $$$ no timeout
656 if (false === $numChanged) {
658 $errorMessage = 'Select failed';
// Success path only when exactly one stream is ready and it is stdout.
661 if ($read[0] == $stdout && (1 == $numChanged)) {
662 // Got output first on stdout (only)
663 // $$$ make sure we get all stdout
664 $output = fopen('php://output', 'w');
// The loop body that handles each header is on a line not shown here.
665 foreach($headers as $header) {
668 stream_copy_to_stream($pipes[1], $output);
669 fclose($output); // okay since tied to special php://output
672 // Got output on stderr
673 // $$$ make sure we get all stderr
674 $errorMessage = stream_get_contents($stderr);
683 // It is important that you close any pipes before calling
684 // proc_close in order to avoid a deadlock
685 $cmdRet = proc_close($process);
688 $errorMessage .= "Command failed with result code " . $cmdRet;
// Error exit used throughout this class: throws an Exception whose message is
// $string prefixed with "Image error: ". Callers in this class do not return
// after calling it, so they rely on the throw to stop execution.
694 function BRfatal($string) {
696 throw new Exception("Image error: $string");
699 // Returns true if using a power node
// Probes the host by shelling out: counts lspci lines containing "Realtek" and
// greps /proc/cpuinfo for "AMD". How the two results combine into the return
// value is in lines not shown here.
700 function onPowerNode() {
// fgrep -c prints the match count, so $output[0] is "0" when nothing matches.
701 exec("lspci | fgrep -c Realtek", $output, $return);
702 if ("0" != $output[0]) {
// NOTE(review): exec() appends to an existing $output array; $return holds the
// exit status of egrep -q (presumably what the elided lines test).
705 exec("egrep -q AMD /proc/cpuinfo", $output, $return);
// Returns the pnmscale pipeline fragment that reduces the image by $scale.
// On a power node plain "pnmscale -reduce" is used; otherwise "-nomix" is
// added. stderr is discarded in both forms.
// NOTE(review): $scale is concatenated into the shell command unquoted; the
// visible callers pass a value computed with pow() -- confirm no caller passes
// raw request input.
713 function reduceCommand($scale) {
715 if ($this->onPowerNode()) {
716 return ' | pnmscale -reduce ' . $scale . ' 2>/dev/null ';
718 return ' | pnmscale -nomix -reduce ' . $scale . ' 2>/dev/null ';
// Sends an HTTP 403 header when $filename is not readable by this process.
// What happens after the header is sent (e.g. exiting) is in lines not shown.
725 function checkPrivs($filename) {
726 // $$$ we assume here that requests for the title, cover or preview
727 // come in via BookReaderPreview.php which will be re-run with
728 // privileges after we return the 403
729 if (!is_readable($filename)) {
730 header('HTTP/1.1 403 Forbidden');
735 // Given file path (inside archive) and output file extension, return a filename
736 // suitable for Content-disposition header
// Builds "<basename without extension>.<ext>" from $filePath and $ext.
// The 'jpeg' case is special-cased in a line not shown here (presumably to
// change the extension used -- TODO confirm).
737 function filenameForClient($filePath, $ext) {
738 $pathParts = pathinfo($filePath);
739 if ('jpeg' == $ext) {
// pathinfo's 'filename' entry is the base name with the extension removed.
742 return $pathParts['filename'] . '.' . $ext;
745 // Returns the nearest power of 2 reduction factor that results in a larger image
// Converts a desired output dimension into a power-of-2 reduction exponent by
// taking the source/desired ratio and passing it to nearestPow2ForScale().
// NOTE(review): a $desiredDimension of 0 (or a non-numeric string) makes the
// divisor 0.0 -- confirm callers validate the requested height/width.
746 function nearestPow2Reduce($desiredDimension, $sourceDimension) {
747 $ratio = floatval($sourceDimension) / floatval($desiredDimension);
748 return $this->nearestPow2ForScale($ratio);
751 // Returns nearest power of 2 reduction factor that results in a larger image
// Returns the exponent of the largest power of 2 that does not exceed the
// integer part of $scale, computed as (number of binary digits - 1);
// e.g. 5 -> '101' -> 2.
// Lines between the intval() and the decbin() call are not shown; presumably
// they guard small or non-positive values -- TODO confirm.
752 function nearestPow2ForScale($scale) {
// intval() truncates any fractional part of the ratio.
753 $scale = intval($scale);
757 $binStr = decbin($scale); // convert to binary string. e.g. 5 -> '101'
758 return strlen($binStr) - 1;
762 * Parses a page request like "page5_r2.jpg" or "cover_t.jpg" to corresponding
763 * page type, size, reduce, and format
// Parses a page request string into a $pageInfo array. The request is
// lower-cased, its extension is stripped, and the remainder is split on '_'.
// The first part (after an optional book prefix) gives 'type' and 'value';
// later parts set 'size' or a key mapped through self::$imageUrlKeys.
// Calls BRfatal() when no page type is present or it is not recognised.
765 function parsePageRequest($pageRequest, $bookPrefix) {
767 // Will hold parsed results
771 $pageRequest = strtolower($pageRequest);
773 // Pull off extension
// Splits at the last '.'; matches[1] is the name, matches[2] the extension.
774 if (preg_match('#(.*)\.([^.]+)$#', $pageRequest, $matches) === 1) {
775 $pageRequest = $matches[1];
776 $extension = $matches[2];
777 if ($extension == 'jpeg') {
783 $pageInfo['extension'] = $extension;
786 $parts = explode('_', $pageRequest);
788 // Remove book prefix if it was included (historical)
// $pageRequest was lower-cased above, so this only matches when $bookPrefix is
// itself lower-case.
789 if ($parts[0] == $bookPrefix) {
793 if (count($parts) === 0) {
794 $this->BRfatal('No page type specified');
796 $page = array_shift($parts);
802 'preview' => 'single',
807 // Look for known page types
808 foreach ( $pageTypes as $pageName => $kind ) {
// Prefix match: the page-type name is captured in matches[1] and whatever
// follows it in matches[2].
809 if ( preg_match('#^(' . $pageName . ')(.*)#', $page, $matches) === 1 ) {
810 $pageInfo['type'] = $matches[1];
// 'value' is stored either as the raw string or as an integer; the condition
// choosing between the two is in lines not shown.
813 $pageInfo['value'] = $matches[2];
816 $pageInfo['value'] = intval($matches[2]);
824 if ( !array_key_exists('type', $pageInfo) ) {
825 $this->BRfatal('Unrecognized page type');
828 // Look for other known parts
829 foreach ($parts as $part) {
830 if ( array_key_exists($part, self::$imageSizes) ) {
831 $pageInfo['size'] = $part;
835 // Key must be alpha, value must start with digit and contain digits, alpha, ',' or '.'
836 // Should prevent injection of strange values into the redirect to datanode
// NOTE(review): the pattern is anchored at the start only, so characters after
// the matched value are ignored rather than causing a rejection.
837 if ( preg_match('#^([a-z]+)(\d[a-z0-9,.]*)#', $part, $matches) === 0) {
843 $value = $matches[2];
845 if ( array_key_exists($key, self::$imageUrlKeys) ) {
846 $pageInfo[self::$imageUrlKeys[$key]] = $value;
850 // If we hit here, was unrecognized (no action)
856 // Clean up temporary files and resources
// NOTE(review): the enclosing function header and the loop body are not visible
// here. The loop visits every path recorded in $this->tempFiles (presumably
// removing each one -- TODO confirm); the list is then reset to empty.
858 foreach($this->tempFiles as $tempFile) {
861 $this->tempFiles = array();