4 Copyright(c) 2008-2010 Internet Archive. Software license AGPL version 3.
6 This file is part of BookReader. The full source code can be found at GitHub:
7 http://github.com/openlibrary/bookreader
9 The canonical short name of an image type is the same as in the MIME type.
10 For example both .jpeg and .jpg are considered to have type "jpeg" since
11 the MIME type is "image/jpeg".
13 BookReader is free software: you can redistribute it and/or modify
14 it under the terms of the GNU Affero General Public License as published by
15 the Free Software Foundation, either version 3 of the License, or
16 (at your option) any later version.
18 BookReader is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU Affero General Public License for more details.
23 You should have received a copy of the GNU Affero General Public License
24 along with BookReader. If not, see <http://www.gnu.org/licenses/>.
27 require_once("BookReaderMeta.inc.php");
29 class BookReaderImages
31 public static $MIMES = array('gif' => 'image/gif',
33 'jpg' => 'image/jpeg',
34 'jpeg' => 'image/jpeg',
36 'tif' => 'image/tiff',
37 'tiff' => 'image/tiff');
39 public static $EXTENSIONS = array('gif' => 'gif',
47 // Width when generating thumbnails
48 public static $imageSizes = array(
55 // Keys in the image permalink urls, e.g. http://www.archive.org/download/itemid/page/cover_{keyval}_{keyval}.jpg
56 public static $imageUrlKeys = array(
66 // Paths to command-line tools
67 var $exiftool = '/petabox/sw/books/exiftool/exiftool';
68 var $kduExpand = '/petabox/sw/bin/kdu_expand';
70 // Name of temporary files, to be cleaned at exit
71 var $tempFiles = array();
/*
 * Serve an image request that requires looking up the book metadata.
 *
 * Code path:
 *   - Get the book metadata
 *   - Parse the requested page (e.g. cover_t.jpg, n5_r4.jpg) to determine which page type,
 *     size and format (etc) is being requested
 *   - Determine the leaf number corresponding to the page
 *   - Determine scaling values
 *   - Serve image request now that all information has been gathered
 *
 * Reads 'id', 'itemPath', 'subPrefix', 'server' and 'page' from $_REQUEST.
 *
 * $requestEnv - accepted for interface compatibility; it is rebuilt from the
 *               metadata lookup before being handed to serveRequest().
 *
 * Throws Exception (through BRfatal) when the metadata cannot be built or the
 * requested page cannot be resolved.
 */
function serveLookupRequest($requestEnv) {
    $id        = isset($_REQUEST['id']) ? $_REQUEST['id'] : null;
    $itemPath  = isset($_REQUEST['itemPath']) ? $_REQUEST['itemPath'] : null;
    $subPrefix = isset($_REQUEST['subPrefix']) ? $_REQUEST['subPrefix'] : null;
    $server    = isset($_REQUEST['server']) ? $_REQUEST['server'] : null;
    $page      = isset($_REQUEST['page']) ? $_REQUEST['page'] : '';

    $brm = new BookReaderMeta();
    try {
        $metadata = $brm->buildMetadata($id, $itemPath, $subPrefix, $server);
    } catch (Exception $e) {
        // getMessage is a method; without the parentheses the message was lost.
        $this->BRfatal($e->getMessage());
    }

    // Index of image to return
    $imageIndex = null;

    // deal with subPrefix: the book id is the last path component of the prefix
    if ($subPrefix) {
        // explode() replaces split(), which no longer exists in PHP 7+.
        $parts = explode('/', $subPrefix);
        $bookId = $parts[count($parts) - 1];
    } else {
        $bookId = $id;
    }

    $pageInfo = $this->parsePageRequest($page, $bookId);

    $basePage = $pageInfo['type'];

    switch ($basePage) {
        case 'title':
            if (! array_key_exists('titleIndex', $metadata)) {
                $this->BRfatal("No title page asserted in book");
            }
            $imageIndex = $metadata['titleIndex'];
            break;

        case 'cover':
            if (! array_key_exists('coverIndices', $metadata)) {
                $this->BRfatal("No cover asserted in book");
            }
            $imageIndex = $metadata['coverIndices'][0]; // $$$ TODO add support for other covers
            break;

        case 'preview':
            // Preference is:
            //   Cover page if book was published >= 1950
            //   Title page
            //   Cover page
            //   Page 0
            $hasCover = array_key_exists('coverIndices', $metadata);
            $hasTitle = array_key_exists('titleIndex', $metadata);

            if ($hasCover
                && array_key_exists('date', $metadata)
                && $brm->parseYear($metadata['date']) >= 1950) {
                $imageIndex = $metadata['coverIndices'][0];
            } else if ($hasTitle) {
                $imageIndex = $metadata['titleIndex'];
            } else if ($hasCover) {
                $imageIndex = $metadata['coverIndices'][0];
            } else {
                // NOTE(review): fallback to the first image is inferred - confirm.
                $imageIndex = 0;
            }
            break;

        case 'n':
            // Accessible index page
            $imageIndex = intval($pageInfo['value']);
            break;

        case 'page':
            // Named page
            $index = array_search($pageInfo['value'], $metadata['pageNums']);
            if ($index === FALSE) {
                $this->BRfatal("Page not found");
            }
            $imageIndex = $index;
            break;

        default:
            // Shouldn't be possible
            $this->BRfatal("Unrecognized page type requested");
    }

    $leaf = $brm->leafForIndex($imageIndex, $metadata['leafNums']);

    $requestEnv = array(
        'zip' => $metadata['zip'],
        'file' => $brm->imageFilePath($leaf, $metadata['subPrefix'], $metadata['imageFormat']),
        // NOTE(review): output format for lookup requests assumed to be jpg - confirm.
        'ext' => 'jpg',
    );

    // remove non-passthrough keys from pageInfo
    unset($pageInfo['type']);
    unset($pageInfo['value']);

    // add pageinfo to request (values computed here win over parsed ones)
    $requestEnv = array_merge($pageInfo, $requestEnv);

    // Return image data - will check privs
    $this->serveRequest($requestEnv);
}
/*
 * Returns a page image when all parameters such as the image stack location are
 * passed in.
 *
 * Approach:
 *
 * Get info about requested image (input)
 * Get info about requested output format
 * Determine processing parameters
 * Process image
 * Return image data
 * Clean up temporary files
 *
 * $requestEnv keys read here: 'zip', 'file' (required); 'ext', 'callback',
 * 'rotate', 'height', 'width', 'size', 'scale' (optional).
 *
 * Throws Exception (through BRfatal) on invalid input or processing failure.
 */
function serveRequest($requestEnv) {
    // Process some of the request parameters
    $zipPath = $requestEnv['zip'];
    $file = $requestEnv['file'];

    // Default to jpeg when no output extension was supplied
    $ext = !empty($requestEnv['ext']) ? $requestEnv['ext'] : 'jpeg';

    $callback = null;
    if (isset($requestEnv['callback'])) {
        // validate callback is valid JS identifier (only)
        $callback = $requestEnv['callback'];
        // D modifier: '$' must match the very end, not before a trailing newline
        $identifierPatt = '/^[[:alpha:]$_]([[:alnum:]$_])*$/D';
        if (! preg_match($identifierPatt, $callback)) {
            $this->BRfatal('Invalid callback');
        }
    }

    if ( !file_exists($zipPath) ) {
        $this->BRfatal('Image stack does not exist at ' . $zipPath);
    }
    // Make sure the image stack is readable - return 403 if not
    $this->checkPrivs($zipPath);

    // Get the image size and depth
    $imageInfo = $this->getImageInfo($zipPath, $file);

    // Output json if requested
    if ('json' == $ext) {
        // $$$ we should determine the output size first based on requested scale
        $this->outputJSON($imageInfo, $callback); // $$$ move to BookReaderRequest
        // NOTE(review): request ends after the JSON response - confirm exit vs return.
        exit;
    }

    // Unfortunately kakadu requires us to know a priori if the
    // output file should be .ppm or .pgm.  By decompressing to
    // .bmp kakadu will write a file we can consistently turn into
    // .pnm.  Really kakadu should support .pnm as the file output
    // extension and automatically write ppm or pgm format as
    // appropriate
    $this->decompressToBmp = true; // $$$ shouldn't be necessary if we use file info to determine output format
    if ($this->decompressToBmp) {
        $stdoutLink = '/tmp/stdout.bmp';
    } else {
        $stdoutLink = '/tmp/stdout.ppm';
    }

    $fileExt = strtolower(pathinfo($file, PATHINFO_EXTENSION));

    // Rotate is currently only supported for jp2 since it does not add server load
    $allowedRotations = array("0", "90", "180", "270");
    $rotate = isset($requestEnv['rotate']) ? (string) $requestEnv['rotate'] : "0";
    if ( !in_array($rotate, $allowedRotations, true) ) {
        $rotate = "0";
    }

    // Image conversion options
    // NOTE(review): no extra pnmtopng options assumed - confirm.
    $pngOptions = '';
    $jpegOptions = '-quality 75';

    // The pbmreduce reduction factor produces an image with dimension 1/n
    // The kakadu reduction factor produces an image with dimension 1/(2^n)

    // We interpret the requested size and scale, look at image format, and determine the
    // actual scaling to be returned to the client.  We generally return the largest
    // power of 2 reduction that is larger than the requested size in order to reduce
    // image processing load on our cluster.  The client should then scale to their final
    // desired size.

    $size = isset($requestEnv['size']) ? $requestEnv['size'] : null;

    if (isset($requestEnv['height'])) {
        // Set scale from height
        $powReduce = $this->nearestPow2Reduce($requestEnv['height'], $imageInfo['height']);
    } else if (isset($requestEnv['width'])) {
        // Set scale from width
        $powReduce = $this->nearestPow2Reduce($requestEnv['width'], $imageInfo['width']);
    } else if ( $size && array_key_exists($size, self::$imageSizes)) {
        // Set scale from named size (e.g. 'large'), measured along the longer side
        $srcRatio = floatval($imageInfo['width']) / floatval($imageInfo['height']);
        $dimension = ($srcRatio > 1) ? 'width' : 'height';
        $powReduce = $this->nearestPow2Reduce(self::$imageSizes[$size], $imageInfo[$dimension]);
    } else {
        // No named size - use explicit scale, if given
        $scale = !empty($requestEnv['scale']) ? $requestEnv['scale'] : 1;
        $powReduce = $this->nearestPow2ForScale($scale);
    }
    // ensure integer scale
    $scale = pow(2, $powReduce);

    // Override depending on source image format
    // $$$ consider doing a 302 here instead, to make better use of the browser cache
    // Limit scaling for 1-bit images.  See https://bugs.edge.launchpad.net/bookreader/+bug/486011
    // NOTE(review): the halving and the cap of 4 below are reconstructed - confirm values.
    if (1 == $imageInfo['bits'] && $scale > 1) {
        $scale /= 2;
        $powReduce -= 1;

        // Hard limit so there are some black pixels to use!
        if ($scale > 4) {
            $scale = 4;
            $powReduce = 2;
        }
    }

    if (!file_exists($stdoutLink)) {
        system('ln -s /dev/stdout ' . $stdoutLink);
    }

    putenv('LD_LIBRARY_PATH=/petabox/sw/lib/kakadu');

    $unzipCmd = $this->getUnarchiveCommand($zipPath, $file);

    $decompressCmd = $this->getDecompressCmd($imageInfo['type'], $powReduce, $rotate, $scale, $stdoutLink);

    // Non-integer scaling is currently disabled on the cluster

    switch ($ext) {
        case 'png':
            $compressCmd = ' | pnmtopng ' . $pngOptions;
            break;

        case 'jpeg':
        case 'jpg':
        default:
            $compressCmd = ' | pnmtojpeg ' . $jpegOptions;
            $ext = 'jpeg'; // for matching below
            break;
    }

    if (($ext == $fileExt) && ($scale == 1) && ($rotate === "0")) {
        // Just pass through original data if same format and size
        $cmd = $unzipCmd;
    } else {
        $cmd = $unzipCmd . $decompressCmd . $compressCmd;
    }

    $filenameForClient = $this->filenameForClient($file, $ext);

    $headers = array('Content-type: '. self::$MIMES[$ext],
                     'Cache-Control: max-age=15552000',
                     'Content-disposition: inline; filename=' . $filenameForClient);

    $errorMessage = '';
    if (! $this->passthruIfSuccessful($headers, $cmd, $errorMessage)) { // $$$ move to BookReaderRequest
        // $$$ automated reporting
        trigger_error('BookReader Processing Error: ' . $cmd . ' -- ' . $errorMessage, E_USER_WARNING);

        // Try some content-specific recovery
        $recovered = false;
        if ($imageInfo['type'] == 'jp2') {
            // The JP2 may have fewer resolution levels than we asked to discard;
            // retry with a reduction the file can actually satisfy.
            $records = $this->getJp2Records($zipPath, $file);
            $levels = isset($records['Clevels']) ? intval($records['Clevels']) : 0;
            if ($powReduce > $levels) {
                $powReduce = $levels;
            } else {
                $powReduce = 0;
            }

            $cmd = $unzipCmd . $this->getDecompressCmd($imageInfo['type'], $powReduce, $rotate, $scale, $stdoutLink) . $compressCmd;
            if ($this->passthruIfSuccessful($headers, $cmd, $errorMessage)) { // $$$ move to BookReaderRequest
                $recovered = true;
            } else {
                $this->cleanup();
                trigger_error('BookReader fallback image processing also failed: ' . $errorMessage, E_USER_WARNING);
            }
        }

        if (! $recovered) {
            $this->BRfatal('Problem processing image - command failed');
        }
    }

    // Remove temporary files created while building the decompress command
    $this->cleanup();
}
/*
 * Build the shell command that writes one archive member to stdout.
 *
 * $archivePath - path of the image stack; the suffix (case-insensitive)
 *                selects the tool: .zip -> unzip, .tar -> 7z
 * $file        - path of the member inside the archive
 *
 * Returns the command string with both paths shell-escaped.
 * Throws Exception (through BRfatal) for a missing or unsupported suffix.
 */
function getUnarchiveCommand($archivePath, $file)
{
    $lowerPath = strtolower($archivePath);
    if (! preg_match('/\.([^\.]+)$/', $lowerPath, $matches)) {
        $this->BRfatal('Bad image stack path');
    }

    $suffix = $matches[1];
    if ($suffix == 'zip') {
        return 'unzip -p '
            . escapeshellarg($archivePath)
            . ' ' . escapeshellarg($file);
    }
    if ($suffix == 'tar') {
        return ' ( 7z e -so '
            . escapeshellarg($archivePath)
            . ' ' . escapeshellarg($file) . ' 2>/dev/null ) ';
    }

    $this->BRfatal('Incompatible archive format');
}
/*
 * Returns the image type associated with the file extension.
 *
 * $extension - file extension without the dot; matched case-insensitively
 *              against the keys of self::$EXTENSIONS
 *
 * Throws Exception (through BRfatal) when the extension is not known.
 */
function imageExtensionToType($extension)
{
    $extension = strtolower($extension);
    if (! array_key_exists($extension, self::$EXTENSIONS)) {
        $this->BRfatal('Unknown image extension');
    }
    return self::$EXTENSIONS[$extension];
}
/*
 * Get the image information.  The returned associative array fields will
 * vary depending on the image type.  The basic keys are width, height, type
 * and bits.
 *
 * $zipPath - path of the image stack archive
 * $file    - path of the image inside the archive
 *
 * Currently always delegates to getImageInfoFromExif(); the unreachable
 * per-type dispatch that used to follow the return has been removed (it
 * called functions that do not exist in the global scope).
 */
function getImageInfo($zipPath, $file)
{
    return $this->getImageInfoFromExif($zipPath, $file); // this is fast
}
// Get the records of a JP2 as returned by kdu_expand.
//
// Pipes the archive member into kdu_expand with "-record /dev/stdout" and
// parses each "key=value" output line.
//
// Returns an associative array of record name => value (both strings);
// lines without an '=' are ignored.
function getJp2Records($zipPath, $file)
{
    $cmd = $this->getUnarchiveCommand($zipPath, $file)
        . ' | ' . $this->kduExpand
        . ' -no_seek -quiet -i /dev/stdin -record /dev/stdout';

    $output = array();
    exec($cmd, $output);

    $records = array();
    foreach ($output as $line) {
        // Limit of 2 keeps any '=' inside the value intact
        $elems = explode("=", $line, 2);
        if (1 == count($elems)) {
            // delimiter not found
            continue;
        }
        $records[$elems[0]] = $elems[1];
    }

    return $records;
}
/*
 * Get the image width, height and depth using the EXIF information.
 *
 * $zipPath - path of the image stack archive
 * $file    - path of the image inside the archive
 *
 * Returns array('width' => int, 'height' => int, 'bits' => int, 'type' => string)
 * where type is the lower-cased exiftool FileType.
 *
 * Throws Exception (through BRfatal) when exiftool reports no size/type or
 * the type is not one of jp2, tiff, jpeg, png.
 */
function getImageInfoFromExif($zipPath, $file)
{
    // We look for all the possible tags of interest then act on the
    // ones presumed present based on the file type
    $tagsToGet = ' -ImageWidth -ImageHeight -FileType'        // all formats
                 . ' -BitsPerComponent -ColorSpace'          // jp2
                 . ' -BitDepth'                              // png
                 . ' -BitsPerSample';                        // tiff

    $cmd = $this->getUnarchiveCommand($zipPath, $file)
        . ' | '. $this->exiftool . ' -S -fast' . $tagsToGet . ' -';

    $output = array();
    exec($cmd, $output);

    $tags = array();
    foreach ($output as $line) {
        // Limit of 2 so a ': ' inside the value does not truncate it
        $keyValue = explode(": ", $line, 2);
        if (count($keyValue) < 2) {
            // Not a "Tag: value" line
            continue;
        }
        $tags[$keyValue[0]] = $keyValue[1];
    }

    foreach (array('ImageWidth', 'ImageHeight', 'FileType') as $requiredTag) {
        if (! isset($tags[$requiredTag])) {
            $this->BRfatal("Unable to read image information for file $file in $zipPath");
        }
    }

    $width = intval($tags["ImageWidth"]);
    $height = intval($tags["ImageHeight"]);
    $type = strtolower($tags["FileType"]);

    switch ($type) {
        case "jp2":
            $bits = isset($tags["BitsPerComponent"]) ? intval($tags["BitsPerComponent"]) : 0;
            break;
        case "tiff":
            $bits = isset($tags["BitsPerSample"]) ? intval($tags["BitsPerSample"]) : 0;
            break;
        case "jpeg":
            // NOTE(review): JPEG assumed to be 8 bits per sample - confirm.
            $bits = 8;
            break;
        case "png":
            $bits = isset($tags["BitDepth"]) ? intval($tags["BitDepth"]) : 0;
            break;
        default:
            $this->BRfatal("Unsupported image type $type for file $file in $zipPath");
    }

    return array('width' => $width, 'height' => $height,
        'bits' => $bits, 'type' => $type);
}
/*
 * Output JSON given the imageInfo associative array.
 *
 * $imageInfo - data to encode
 * $callback  - optional JSONP callback name; when non-empty the JSON is
 *              wrapped as callback(...); - the caller is responsible for
 *              validating the name
 */
function outputJSON($imageInfo, $callback)
{
    header('Content-type: text/plain');
    $jsonOutput = json_encode($imageInfo);
    if ($callback) {
        $jsonOutput = $callback . '(' . $jsonOutput . ');';
    }
    echo $jsonOutput;
}
/*
 * Build the shell pipeline fragment that turns the extracted image (arriving
 * on stdin) into PNM data on stdout.
 *
 * $imageType  - 'jp2', 'tiff', 'jpeg' or 'png'
 * $powReduce  - power-of-2 reduction, used by kdu_expand (jp2 only)
 * $rotate     - rotation in degrees, used by kdu_expand (jp2 only)
 * $scale      - integer reduction factor for the pnmscale-based formats
 * $stdoutLink - path kdu_expand writes to (a link to /dev/stdout)
 *
 * Throws Exception (through BRfatal) for an unknown type or when no
 * temporary file can be created.
 */
function getDecompressCmd($imageType, $powReduce, $rotate, $scale, $stdoutLink) {
    switch ($imageType) {
        case 'jp2':
            // Numeric arguments are cast so nothing unexpected reaches the shell
            $decompressCmd =
                " | " . $this->kduExpand . " -no_seek -quiet -reduce " . intval($powReduce)
                . " -rotate " . intval($rotate) . " -i /dev/stdin -o " . escapeshellarg($stdoutLink);
            if ($this->decompressToBmp) {
                // We suppress output since bmptopnm always outputs on stderr
                $decompressCmd .= ' | (bmptopnm 2>/dev/null)';
            }
            break;

        case 'tiff':
            // We need to create a temporary file for tifftopnm since it cannot
            // work on a pipe (the file must be seekable).
            // We use the BookReaderTiff prefix to give a hint in case things don't
            // get cleaned up.
            $tempFile = tempnam("/tmp", "BookReaderTiff");
            if (false === $tempFile) {
                $this->BRfatal('Unable to create temporary file');
            }
            array_push($this->tempFiles, $tempFile);

            // $$$ look at bit depth when reducing
            $quotedTemp = escapeshellarg($tempFile);
            $decompressCmd =
                ' > ' . $quotedTemp . ' ; tifftopnm ' . $quotedTemp . ' 2>/dev/null' . $this->reduceCommand($scale);
            break;

        case 'jpeg':
            $decompressCmd = ' | ( jpegtopnm 2>/dev/null ) ' . $this->reduceCommand($scale);
            break;

        case 'png':
            $decompressCmd = ' | ( pngtopnm 2>/dev/null ) ' . $this->reduceCommand($scale);
            break;

        default:
            $this->BRfatal('Unknown image type: ' . $imageType);
    }
    return $decompressCmd;
}
// If the command has its initial output on stdout the headers will be emitted followed
// by the stdout output.  If initial output is on stderr an error message will be
// returned.
//
// Parameters:
//   $headers - list of HTTP header lines to send before the command output
//   $cmd     - shell command to run
//
// Returns:
//   true - if command emits stdout and has zero exit code
//   false - command could not be started, has initial output on stderr or
//           a non-zero exit code
//   &$errorMessage - error string if there was an error
//
// $$$ Tested with our command-line image processing.  May be deadlocks for
//     other cases.
function passthruIfSuccessful($headers, $cmd, &$errorMessage)
{
    $retVal = false;
    $errorMessage = '';

    $descriptorspec = array(
       0 => array("pipe", "r"),  // stdin is a pipe that the child will read from
       1 => array("pipe", "w"),  // stdout is a pipe that the child will write to
       2 => array("pipe", "w"),  // stderr is a pipe to write to
    );

    $cwd = NULL;
    $env = NULL;

    $process = proc_open($cmd, $descriptorspec, $pipes, $cwd, $env);

    if (! is_resource($process)) {
        $errorMessage = 'Unable to start command';
        return false;
    }

    // $pipes now looks like this:
    // 0 => writeable handle connected to child stdin
    // 1 => readable handle connected to child stdout
    // 2 => readable handle connected to child stderr
    $stdin = $pipes[0];
    $stdout = $pipes[1];
    $stderr = $pipes[2];

    // check whether we get input first on stdout or stderr
    $read = array($stdout, $stderr);
    $write = NULL;
    $except = NULL;
    $numChanged = stream_select($read, $write, $except, NULL); // $$$ no timeout

    if (false === $numChanged) {
        // select failed - do not inspect $read, its contents are not meaningful
        $errorMessage = 'Select failed';
    } else if ((1 == $numChanged) && isset($read[0]) && ($read[0] == $stdout)) {
        // Got output first on stdout (only)
        // $$$ make sure we get all stdout
        $output = fopen('php://output', 'w');
        foreach ($headers as $header) {
            header($header);
        }
        stream_copy_to_stream($stdout, $output);
        fclose($output); // okay since tied to special php://output
        $retVal = true;
    } else {
        // Got output on stderr
        // $$$ make sure we get all stderr
        $errorMessage = stream_get_contents($stderr);
    }

    fclose($stderr);
    fclose($stdout);
    fclose($stdin);

    // It is important that you close any pipes before calling
    // proc_close in order to avoid a deadlock
    $cmdRet = proc_close($process);
    if (0 != $cmdRet) {
        $retVal = false;
        $errorMessage .= "Command failed with result code " . $cmdRet;
    }

    return $retVal;
}
// Abort the current request: remove any temporary files, then throw.
//
// $string - human-readable reason, appended to "Image error: "
//
// Always throws Exception; never returns.
function BRfatal($string) {
    // NOTE(review): cleanup() is assumed to be this class's temp-file removal
    // routine - confirm it is defined before relying on this call.
    $this->cleanup();
    throw new Exception("Image error: $string");
}
// Returns true if using a power node.
//
// A host counts as a power node when lspci lists at least one Realtek
// device, or failing that when /proc/cpuinfo mentions AMD.
function onPowerNode() {
    $lspciOutput = array();
    exec("lspci | fgrep -c Realtek", $lspciOutput, $return);
    // Guard the index: with no lspci output at all the old loose comparison
    // against a missing element reported every host as a power node.
    if (isset($lspciOutput[0]) && intval($lspciOutput[0]) > 0) {
        return true;
    }

    $cpuOutput = array();
    exec("egrep -q AMD /proc/cpuinfo", $cpuOutput, $return);
    // egrep -q exits with 0 when a match was found
    return ($return == 0);
}
// Build the pnmscale pipeline fragment for an integer reduction factor.
//
// $scale - reduction factor; values of 1 or less need no scaling
//
// Returns '' when no scaling is needed, otherwise ' | pnmscale ... ' with
// -nomix added on hosts that are not power nodes.
function reduceCommand($scale) {
    // Cast so that only an integer can reach the shell command
    $scale = intval($scale);
    if ($scale <= 1) {
        return '';
    }

    if ($this->onPowerNode()) {
        return ' | pnmscale -reduce ' . $scale . ' 2>/dev/null ';
    }
    return ' | pnmscale -nomix -reduce ' . $scale . ' 2>/dev/null ';
}
// Make sure the given file is readable by this process.
//
// When it is not, a 403 response is sent and the script terminates; the
// function only returns for readable files.
function checkPrivs($filename) {
    if (!is_readable($filename)) {
        header('HTTP/1.1 403 Forbidden');
        exit(0);
    }
}
// Given file path (inside archive) and output file extension, return a filename
// suitable for Content-disposition header.
//
// The directory part and the original extension are dropped; 'jpeg' is
// shortened to the conventional 'jpg'.
function filenameForClient($filePath, $ext) {
    if ('jpeg' == $ext) {
        $ext = 'jpg';
    }
    return pathinfo($filePath, PATHINFO_FILENAME) . '.' . $ext;
}
// Returns the nearest power of 2 reduction factor that results in a larger image.
//
// $desiredDimension - requested size in pixels along one axis
// $sourceDimension  - size of the source image along the same axis
//
// Returns the exponent n such that reducing by 2^n keeps the image at least
// as large as requested; 0 (no reduction) when the request is not positive.
function nearestPow2Reduce($desiredDimension, $sourceDimension) {
    $desired = floatval($desiredDimension);
    if ($desired <= 0) {
        // A request such as "h0" would otherwise divide by zero
        return 0;
    }
    $ratio = floatval($sourceDimension) / $desired;
    return $this->nearestPow2ForScale($ratio);
}
// Returns nearest power of 2 reduction factor that results in a larger image.
//
// $scale - requested down-scaling factor (truncated to an integer)
//
// Returns floor(log2(scale)) for scale > 1, otherwise 0.
function nearestPow2ForScale($scale) {
    $scale = intval($scale);
    if ($scale <= 1) {
        return 0;
    }
    $binStr = decbin($scale); // convert to binary string. e.g. 5 -> '101'
    return strlen($binStr) - 1;
}
/*
 * Parses a page request like "page5_r2.jpg" or "cover_t.jpg" to corresponding
 * page type, size, reduce, and format.
 *
 * $pageRequest - requested page name, optionally prefixed by the book id
 * $bookPrefix  - book id to strip from the front of the request (historical)
 *
 * Returns an array with 'extension' and 'type', plus 'value' for page types
 * that carry one, 'size' for a named size, and any recognised key/value
 * parts mapped through self::$imageUrlKeys.
 *
 * Throws Exception (through BRfatal) when no known page type is found.
 */
function parsePageRequest($pageRequest, $bookPrefix) {
    // Will hold parsed results
    $pageInfo = array();

    // Normalize
    $pageRequest = strtolower($pageRequest);

    // Pull off extension
    // NOTE(review): 'jpeg' -> 'jpg' and the 'jpg' default are reconstructed - confirm.
    $extension = 'jpg';
    if (preg_match('#(.*)\.([^.]+)$#', $pageRequest, $matches) === 1) {
        $pageRequest = $matches[1];
        $extension = $matches[2];
        if ($extension == 'jpeg') {
            $extension = 'jpg';
        }
    }
    $pageInfo['extension'] = $extension;

    // Split parts out
    $parts = explode('_', $pageRequest);

    // Remove book prefix if it was included (historical)
    if ($parts[0] == $bookPrefix) {
        array_shift($parts);
    }

    if (count($parts) === 0) {
        $this->BRfatal('No page type specified');
    }
    $page = array_shift($parts);

    // Known page types and the kind of value that follows the type name.
    // NOTE(review): only the 'preview' entry is certain; the others are
    // inferred from the page types handled by serveLookupRequest - confirm.
    $pageTypes = array(
        'page' => 'str',
        'n' => 'num',
        'preview' => 'single',
        'cover' => 'single',
        'title' => 'single',
    );

    // Look for known page types
    foreach ( $pageTypes as $pageName => $kind ) {
        if ( preg_match('#^(' . $pageName . ')(.*)#', $page, $matches) !== 1 ) {
            continue;
        }
        $pageInfo['type'] = $matches[1];
        if ('str' == $kind) {
            $pageInfo['value'] = $matches[2];
        } else if ('num' == $kind) {
            $pageInfo['value'] = intval($matches[2]);
        }
        // 'single' page types carry no value
    }

    if ( !array_key_exists('type', $pageInfo) ) {
        $this->BRfatal('Unrecognized page type');
    }

    // Look for other known parts
    foreach ($parts as $part) {
        if ( array_key_exists($part, self::$imageSizes) ) {
            $pageInfo['size'] = $part;
            continue;
        }

        // Key must be alpha, value must start with digit and contain digits, alpha, ',' or '.'
        // Should prevent injection of strange values into the redirect to datanode
        if ( preg_match('#^([a-z]+)(\d[a-z0-9,.]*)#', $part, $matches) !== 1 ) {
            // Not recognized
            continue;
        }

        $key = $matches[1];
        $value = $matches[2];

        if ( array_key_exists($key, self::$imageUrlKeys) ) {
            $pageInfo[self::$imageUrlKeys[$key]] = $value;
        }
        // Otherwise the part was unrecognized (no action)
    }

    return $pageInfo;
}
// Clean up temporary files and resources.
//
// Deletes every file recorded in $this->tempFiles and empties the list, so
// calling it more than once is harmless.
// NOTE(review): the method name is reconstructed - confirm against callers.
function cleanup() {
    foreach ($this->tempFiles as $tempFile) {
        // The file may already be gone; only remove what is still there
        if (file_exists($tempFile)) {
            unlink($tempFile);
        }
    }
    $this->tempFiles = array();
}