plack/lib/Plack/App/BookReader.pm

   1 package Plack::App::BookReader;
   2 use parent qw(Plack::App::File);
   3 use strict;
   4 use warnings;
   5 use Plack::Util;
   6 use HTTP::Date;
   7 use Plack::MIME;
   8 use DirHandle;
   9 use URI::Escape;
  10 use Plack::Request;
  11 use Data::Dump qw(dump);
  12 use File::Path qw(make_path remove_tree);
  13 use Graphics::Magick;
  14 use File::Slurp;
  15 use JSON;
  16 use autodie;
  17 use Time::HiRes qw(time);
  18 use Encode;
  19
  20 sub make_basedir {
  21         my $path = shift;
  22         return if -e $path;
  23         $path =~ s{/[^/]+$}{} || die "no dir/file in $path";
  24         warn "# make_basedir $path\n";
  25         -e $path ? 0 : File::Path::make_path $path;
  26 }
  27
  28 # Stolen from rack/directory.rb
  29 my $dir_file = "<tr><td class='name'><a href='%s'>%s</a></td><td class='size'>%s</td><td class='type'>%s</td><td class='mtime'>%s</td></tr>";
  30 my $dir_page = <<PAGE;
  31 <html><head>
  32   <title>%s</title>
  33   <meta http-equiv="content-type" content="text/html; charset=utf-8" />
  34   <style type='text/css'>
  35 table { width:100%%; }
  36 .name { text-align:left; }
  37 .size, .mtime { text-align:right; }
  38 .type { width:11em; }
  39 .mtime { width:15em; }
  40   </style>
  41 </head><body>
  42 <h1>%s</h1>
  43 <hr />
  44 <table>
  45   <tr>
  46     <th class='name'>Name</th>
  47     <th class='size'>Size</th>
  48     <th class='type'>Type</th>
  49     <th class='mtime'>Last Modified</th>
  50   </tr>
  51 %s
  52 </table>
  53 <hr />
  54 <code>%s</code>
  55 </body></html>
  56 PAGE
  57
  58 my $reader_page = <<'PAGE';
  59 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
  60 <html>
  61 <head>
  62     <title>%s</title>
  63
  64     <link rel="stylesheet" type="text/css" href="/BookReader/BookReader.css"/>
  65     <script type="text/javascript" src="http://archive.org/includes/jquery-1.4.2.min.js"></script>
  66     <script type="text/javascript" src="http://archive.org/bookreader/jquery-ui-1.8.5.custom.min.js"></script>
  67
  68     <script type="text/javascript" src="http://archive.org/bookreader/dragscrollable.js"></script>
  69     <script type="text/javascript" src="http://archive.org/bookreader/jquery.colorbox-min.js"></script>
  70     <script type="text/javascript" src="http://archive.org/bookreader/jquery.ui.ipad.js"></script>
  71     <script type="text/javascript" src="http://archive.org/bookreader/jquery.bt.min.js"></script>
  72
  73     <script type="text/javascript" src="/BookReader/BookReader.js"></script>
  74
  75 <style type="text/css">
  76
  77 /* Hide print and embed functionality */
  78 #BRtoolbar .embed, .print {
  79     display: none;
  80 }
  81
  82 </style>
  83
  84 <script type="text/javascript">
  85 $(document).ready( function() {
  86
  87 //
  88 // This file shows the minimum you need to provide to BookReader to display a book
  89 //
  90 // Copyright(c)2008-2009 Internet Archive. Software license AGPL version 3.
  91
  92 // Create the BookReader object
  93 var br = new BookReader();
  94
  95 var pages = %s;
  96
  97 // Return the width of a given page.  Here we assume all images are 800 pixels wide
  98 br.getPageWidth = function(index) {
  99         if ( ! pages[index] ) return;
 100     return parseInt( pages[index][1] );
 101 }
 102
 103 // Return the height of a given page.  Here we assume all images are 1200 pixels high
 104 br.getPageHeight = function(index) {
 105         if ( ! pages[index] ) return;
 106     return parseInt( pages[index][2] );
 107 }
 108
 109 // We load the images from archive.org -- you can modify this function to retrieve images
 110 // using a different URL structure
 111 br.getPageURI = function(index, reduce, rotate) {
 112         if ( ! pages[index] ) return;
 113     // reduce and rotate are ignored in this simple implementation, but we
 114     // could e.g. look at reduce and load images from a different directory
 115     // or pass the information to an image server
 116         var r = 1 << ( Math.ceil(reduce).toString(2).length - 1 ); // reduce to nearest higher pow 2
 117         var url = pages[index][0] + '?reduce='+r;
 118         console.debug('getPageURI', index, reduce, r, rotate, url);
 119     return url;
 120 }
 121
 122 // Return which side, left or right, that a given page should be displayed on
 123 br.getPageSide = function(index) {
 124     if (0 == (index & 0x1)) {
 125         return 'R';
 126     } else {
 127         return 'L';
 128     }
 129 }
 130
 131 // This function returns the left and right indices for the user-visible
 132 // spread that contains the given index.  The return values may be
 133 // null if there is no facing page or the index is invalid.
 134 br.getSpreadIndices = function(pindex) {
 135     var spreadIndices = [null, null];
 136     if ('rl' == this.pageProgression) {
 137         // Right to Left
 138         if (this.getPageSide(pindex) == 'R') {
 139             spreadIndices[1] = pindex;
 140             spreadIndices[0] = pindex + 1;
 141         } else {
 142             // Given index was LHS
 143             spreadIndices[0] = pindex;
 144             spreadIndices[1] = pindex - 1;
 145         }
 146     } else {
 147         // Left to right
 148         if (this.getPageSide(pindex) == 'L') {
 149             spreadIndices[0] = pindex;
 150             spreadIndices[1] = pindex + 1;
 151         } else {
 152             // Given index was RHS
 153             spreadIndices[1] = pindex;
 154             spreadIndices[0] = pindex - 1;
 155         }
 156     }
 157
 158     return spreadIndices;
 159 }
 160
 161 // For a given "accessible page index" return the page number in the book.
 162 //
 163 // For example, index 5 might correspond to "Page 1" if there is front matter such
 164 // as a title page and table of contents.
 165 br.getPageNum = function(index) {
 166     return index+1;
 167 }
 168
 169 // Total number of leafs
 170 br.numLeafs = pages.length;
 171
 172 // Book title and the URL used for the book title link
 173 br.bookTitle= '%s';
 174 br.bookUrl  = '%s';
 175
 176 // Override the path used to find UI images
 177 br.imagesBaseURL = '/BookReader/images/';
 178
 179 br.getEmbedCode = function(frameWidth, frameHeight, viewParams) {
 180     return "Embed code not supported in bookreader demo.";
 181 }
 182
 183 // Let's go!
 184 br.init();
 185
 186 // read-aloud and search need backend compenents and are not supported in the demo
 187 $('#BRtoolbar').find('.read').hide();
 188 $('#textSrch').hide();
 189 $('#btnSrch').hide();
 190
 191 } );
 192 </script>
 193
 194 </head>
 195 <body style="background-color: ##939598;">
 196
 197 <div id="BookReader">
 198     Internet Archive BookReader<br/>
 199
 200     <noscript>
 201     <p>
 202         The BookReader requires JavaScript to be enabled. Please check that your browser supports JavaScript and that it is enabled in the browser settings.
 203     </p>
 204     </noscript>
 205 </div>
 206
 207
 208 </body>
 209 </html>
 210 PAGE
 211
 212 sub should_handle {
 213     my($self, $file) = @_;
 214     return -d $file || -f $file;
 215 }
 216
 217 sub return_dir_redirect {
 218     my ($self, $env) = @_;
 219     my $uri = Plack::Request->new($env)->uri;
 220     return [ 301,
 221         [
 222             'Location' => $uri . '/',
 223             'Content-Type' => 'text/plain',
 224             'Content-Length' => 8,
 225         ],
 226         [ 'Redirect' ],
 227     ];
 228 }
 229
 230 sub convert { gm('convert',@_) }
 231 sub montage { gm('montage',@_) }
 232
 233 sub gm {
 234         my $command = shift;
 235         warn "# $command ",dump(@_);
 236         my $t = time();
 237         system 'gm', $command, @_;
 238         $t = time() - $t;
 239         warn sprintf("## $command %d bytes in %.2f s %s\n", -s $_[-1], $t, $_[-1]);
 240 }
 241
 242 sub longest_common_prefix {
 243            my $prefix = shift;
 244         for (@_) {
 245                 chop $prefix while (! /^\Q$prefix\E/i);
 246         }
 247         warn "# longest_common_prefix [$prefix]\n";
 248         return $prefix;
 249 }
 250
 251 sub sort_pages {
 252         my $prefix = longest_common_prefix @_;
 253         sort {
 254                         my ( $an,$bn ) = ( $a,$b );
 255                         $an =~ s/^\Q$prefix\E//i; $an =~ s/\D+//g;
 256                         $bn =~ s/^\Q$prefix\E//i; $bn =~ s/\D+//g;
 257                         warn "## sort [$a] $an <=> $bn [$b]\n";
 258                         $an <=> $bn;
 259         } @_;
 260 }
 261
 262 sub convert_pdf_page {
 263         my ($pdf, $page, $path) = @_;
 264         my $t = time();
 265
 266         make_path $path;
 267
 268         warn "# pdfimages $page $pdf -> $path/\n";
 269         system 'pdfimages', '-f', $page, '-l', $page, '-q', '-j', '-p', $pdf, "$path/p";
 270
 271         my @parts = ();
 272         # glob split on spaces!
 273         opendir(my $dh, $path);
 274         while (readdir($dh)) {
 275                 my $full = "$path/$_";
 276                 warn "## readdir $full\n";
 277                 next unless -f $full; # skip . ..
 278                 push @parts, $_;
 279         }
 280         closedir $dh;
 281
 282         die "can't find images for $pdf in $path" unless $#parts >= 0;
 283
 284         @parts = sort_pages @parts;
 285
 286         my $image = "$path.jpg";
 287
 288         if ( $#parts == 0 ) { # single image
 289                         my $part = "$path/$parts[0]";
 290                         convert( $part => $image );
 291         } else {
 292                         my @full = map { "$path/$_" } @parts;
 293                         montage( @full, '-tile', '1x'.scalar(@full), '-geometry', '+1+1' => $image );
 294         }
 295
 296         die "$image: $!" unless -r $image;
 297
 298         remove_tree $path;
 299
 300         $t = time() - $t;
 301         warn sprintf("## page: %d in %.2f s for %s\n", $page, $t, $image);
 302         return $image;
 303 }
 304
 305 sub render_pdf_page {
 306         my ( $pdf, $page, $path ) = @_;
 307         my $t = time();
 308
 309         warn "# pdftocairo $pdf\n";
 310         system('pdftocairo', '-jpeg', '-f', $page, '-l', $page, $pdf, $path);
 311
 312         my $image = sprintf( '%s-%03d.jpg', $path, $page );
 313
 314         die "can't find $image: $!" unless -r $image;
 315
 316         $t = time() - $t;
 317         warn sprintf("## page: %d in %.2f s for %s\n", $page, $t, $image);
 318         return $image;
 319 }
 320
 321 sub serve_path {
 322     my($self, $env, $path, $fullpath) = @_;
 323
 324         my $req = Plack::Request->new($env);
 325
 326     my $dir_url = $env->{SCRIPT_NAME} . $env->{PATH_INFO};
 327         my @files = ();
 328         my @page_files;
 329
 330         if ( -f $path && $path =~ s{/([^/]+\.pdf)$}{} ) {
 331                 push @page_files, $1;
 332                 warn "# single pdf: $path / $1\n";
 333     } elsif (-f $path ) {
 334
 335                 if ( my $reduce = $req->param('reduce') ) {
 336                         $reduce = int($reduce); # BookReader javascript somethimes returns float
 337                         warn "# reduce $reduce $path\n";
 338
 339                         my $cache_path = "cache/$dir_url.reduce.$reduce.jpg";
 340                         if ( $reduce <= 1 && $path =~ m/\.jpe?g$/ ) {
 341                                 $cache_path = $path;
 342                         } elsif ( ! -e $cache_path ) {
 343                                 make_basedir $cache_path;
 344                                 convert( '-scale', ( 100 / $reduce ) .'%', $path => $cache_path );
 345                         }
 346
 347                 return $self->SUPER::serve_path($env, $cache_path, $fullpath);
 348
 349                 }
 350
 351         return $self->SUPER::serve_path($env, $path, $fullpath);
 352      } elsif ( -d $path ) {
 353
 354                 if ($dir_url !~ m{/$}) {
 355                         return $self->return_dir_redirect($env);
 356                 }
 357
 358                 my $dh = DirHandle->new($path);
 359                 my @children;
 360                 while (defined(my $ent = $dh->read)) {
 361                         next if $ent eq '.';
 362                         push @children, $ent;
 363                 }
 364
 365                 for my $basename (sort { $a cmp $b } @children) {
 366                         push @page_files, $basename if $basename =~ m/\d+\D?\.(jpg|gif|pdf)$/;
 367                         my $file = "$path/$basename";
 368                         my $url = $dir_url . $basename;
 369
 370                         my $is_dir = -d $file;
 371                         my @stat = stat _;
 372
 373
 374                         $url = join '/', map {uri_escape($_)} split m{/}, $url;
 375
 376                         if ($is_dir) {
 377                                 $basename .= "/";
 378                                 $url      .= "/";
 379                         }
 380
 381                         my $mime_type = $is_dir ? 'directory' : ( Plack::MIME->mime_type($file) || 'text/plain' );
 382                         push @files, [ $url, $basename, $stat[7], $mime_type, HTTP::Date::time2str($stat[9]) ];
 383                 }
 384
 385         } else {
 386                 die "Unsupported format: $path";
 387         }
 388
 389         if ( @page_files ) {
 390                 @page_files = sort_pages @page_files;
 391                 warn "# page_files = ",dump( @page_files );
 392         }
 393
 394     my $dir  = Plack::Util::encode_html( $env->{PATH_INFO} );
 395         my $page = 'empty';
 396
 397         if ( $req->param('bookreader') ) {
 398
 399                 my $pages; # []:
 400                 my $pages_path = "meta/$dir_url/bookreader.json";
 401                 if ( -e $pages_path ) {
 402                         $pages = decode_json read_file $pages_path;
 403                 } else {
 404                         foreach my $page ( @page_files ) {
 405                                 my $image = Graphics::Magick->new;
 406                                 if ( $page =~ m/\.pdf$/ ) {
 407                                         die "$path/$page: $!" unless -r "$path/$page";
 408
 409                                         my $info = `pdfinfo "$path/$page"`;
 410                                         warn "# pdfinfo $path/$page\n$info\n";
 411                                         my $pdf_pages = $1 if ( $info =~ m/Pages:\s*(\d+)/s );
 412                                         die "can't find number of pages for $path/$page in:\n$pdf_pages\n" unless $pdf_pages;
 413
 414                                         my $cache_path = "cache/$dir_url/$page";
 415                                         my $txt = "$cache_path.txt";
 416                                         make_basedir $txt;
 417                                         system('pdftotext', "$path/$page", $txt);
 418                                         warn "# pdftotext $txt ", -s $txt, " bytes\n";
 419
 420                                         my $is_bitmap = -s $txt == $pdf_pages;
 421
 422                                         $pdf_pages = $ENV{MAX_PAGES} if defined $ENV{MAX_PAGES} && $pdf_pages > $ENV{MAX_PAGES}; # FIXME
 423
 424                                         warn "DIAG: bitmap:$is_bitmap pdf_pages:$pdf_pages\n";
 425
 426                                         foreach my $nr ( 1 .. $pdf_pages ) {
 427                                                 my $page_url = $is_bitmap
 428                                                         ? convert_pdf_page( "$path/$page", $nr, "$cache_path.$nr" )
 429                                                         : render_pdf_page(  "$path/$page", $nr, "$cache_path" )
 430                                                 ;
 431                                                 warn "## ping $page_url\n";
 432                                                 my ( $w, $h, $size, $format ) = $image->ping($page_url);
 433                                                 warn "## image size $w*$h $size $format $page_url\n";
 434                                                 my $url = decode('utf-8',"/$page_url");
 435                                                 push @$pages, [ $url, $w, $h ] if $w && $h;
 436                                         }
 437
 438                                 } else {
 439                                         die "$path/$page: $!" unless -r "$path/$page";
 440                                         my ( $w, $h, $size, $format ) = $image->ping("$path/$page");
 441                                         warn "# image size $w*$h $size $format $path/$page\n";
 442                                         my $url = decode('utf-8',"$dir_url/$page");
 443                                         push @$pages, [ $url, $w, $h ] if $w && $h;
 444                                 }
 445                         }
 446                         make_basedir $pages_path;
 447                         write_file $pages_path, => encode_json( $pages );
 448                         warn "# created $pages_path ", -s $pages_path, " bytes\n";
 449                 }
 450                 warn "# pages = ",dump($pages);
 451                 $page = sprintf $reader_page, $dir, encode_json( $pages ), $dir, $dir =~ m/\/$/ ? '..' : '.';
 452
 453         } else {
 454
 455                 my $files = join "\n", map {
 456                         my $f = $_;
 457                         sprintf $dir_file, map Plack::Util::encode_html($_), @$f;
 458                 } @files;
 459
 460                 $page = sprintf $dir_page, $dir, $dir, $files,
 461                         @page_files ? '<form><input type=submit name=bookreader value="Read"></form>' . dump( [ @page_files ] ) : '';
 462
 463         }
 464
 465     return [ 200, ['Content-Type' => 'text/html; charset=utf-8'], [ $page ] ];
 466 }
 467
 468 1;
 469
 470 __END__
 471
 472 =head1 NAME
 473
 474 Plack::App::BookReader - Internet Archive Book Reader with directory index
 475
 476 =head1 SYNOPSIS
 477
 478   # app.psgi
 479   use Plack::App::BookReader;
 480   my $app = Plack::App::BookReader->new({ root => "/path/to/htdocs" })->to_app;
 481
 482 =head1 DESCRIPTION
 483
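This is a static file server PSGI application with a directory index a la Apache's mod_autoindex. Directories containing page images (jpg, gif) or PDF files, as well as single PDF files, can additionally be viewed in the Internet Archive BookReader by adding the C<bookreader> query parameter to the URL.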
 485
 486 =head1 CONFIGURATION
 487
 488 =over 4
 489
 490 =item root
 491
 492 Document root directory. Defaults to the current directory.
 493
 494 =back
 495
 496 =head1 AUTHOR
 497
 498 Dobrica Pavlinusic
Tatsuhiko Miyagawa (based on L<Plack::App::Directory>)
 500
 501 =head1 SEE ALSO
 502
 503 L<Plack::App::File>
 504
 505 =cut
 506