git.rot13.org Git - webpac2/blob - lib/WebPAC/Input.pm

   1 package WebPAC::Input;
   2
   3 use warnings;
   4 use strict;
   5
   6 use WebPAC::Common;
   7 use base qw/WebPAC::Common/;
   8 use Text::Iconv;
   9 use Data::Dumper;
  10
  11 =head1 NAME
  12
  13 WebPAC::Input - read different file formats into WebPAC
  14
  15 =head1 VERSION
  16
  17 Version 0.04
  18
  19 =cut
  20
  21 our $VERSION = '0.04';
  22
  23 =head1 SYNOPSIS
  24
  25 This module implements input as database which have fixed and known
  26 I<size> while indexing and single unique numeric identifier for database
  27 position ranging from 1 to I<size>.
  28
  29 Simply, something that is indexed by unmber from 1 .. I<size>.
  30
  31 Examples of such databases are CDS/ISIS files, MARC files, lines in
  32 text file, and so on.
  33
  34 Specific file formats are implemented using low-level interface modules,
  35 located in C<WebPAC::Input::*> namespace which export C<open_db>,
  36 C<fetch_rec> and optional C<init> functions.
  37
  38 Perhaps a little code snippet.
  39
  40     use WebPAC::Input;
  41
  42     my $db = WebPAC::Input->new(
  43         module => 'WebPAC::Input::ISIS',
  44                 config => $config,
  45                 lookup => $lookup_obj,
  46                 low_mem => 1,
  47     );
  48
  49     $db->open('/path/to/database');
  50         print "database size: ",$db->size,"\n";
  51         while (my $rec = $db->fetch) {
  52                 # do something with $rec
  53         }
  54
  55
  56
  57 =head1 FUNCTIONS
  58
  59 =head2 new
  60
  61 Create new input database object.
  62
  63   my $db = new WebPAC::Input(
  64         module => 'WebPAC::Input::MARC',
  65         code_page => 'ISO-8859-2',
  66         low_mem => 1,
  67         recode => 'char pairs',
  68   );
  69
  70 C<module> is low-level file format module. See L<WebPAC::Input::Isis> and
  71 L<WebPAC::Input::MARC>.
  72
  73 Optional parametar C<code_page> specify application code page (which will be
  74 used internally). This should probably be your terminal encoding, and by
  75 default, it C<ISO-8859-2>.
  76
  77 Default is not to use C<low_mem> options (see L<MEMORY USAGE> below).
  78
  79 This function will also call low-level C<init> if it exists with same
  80 parametars.
  81
  82 =cut
  83
  84 sub new {
  85         my $class = shift;
  86         my $self = {@_};
  87         bless($self, $class);
  88
  89         my $log = $self->_get_logger;
  90
  91         $log->logconfess("specify low-level file format module") unless ($self->{module});
  92         my $module = $self->{module};
  93         $module =~ s#::#/#g;
  94         $module .= '.pm';
  95         $log->debug("require low-level module $self->{module} from $module");
  96
  97         require $module;
  98         #eval $self->{module} .'->import';
  99
 100         # check if required subclasses are implemented
 101         foreach my $subclass (qw/open_db fetch_rec init/) {
 102                 my $n = $self->{module} . '::' . $subclass;
 103                 if (! defined &{ $n }) {
 104                         my $missing = "missing $subclass in $self->{module}";
 105                         $self->{$subclass} = sub { $log->logwarn($missing) };
 106                 } else {
 107                         $self->{$subclass} = \&{ $n };
 108                 }
 109         }
 110
 111         if ($self->{init}) {
 112                 $log->debug("calling init");
 113                 $self->{init}->($self, @_);
 114         }
 115
 116         $self->{'code_page'} ||= 'ISO-8859-2';
 117
 118         # running with low_mem flag? well, use DBM::Deep then.
 119         if ($self->{'low_mem'}) {
 120                 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
 121
 122                 my $db_file = "data.db";
 123
 124                 if (-e $db_file) {
 125                         unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
 126                         $log->debug("removed '$db_file' from last run");
 127                 }
 128
 129                 require DBM::Deep;
 130
 131                 my $db = new DBM::Deep $db_file;
 132
 133                 $log->logdie("DBM::Deep error: $!") unless ($db);
 134
 135                 if ($db->error()) {
 136                         $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
 137                 } else {
 138                         $log->debug("using file '$db_file' for DBM::Deep");
 139                 }
 140
 141                 $self->{'db'} = $db;
 142         }
 143
 144         $self ? return $self : return undef;
 145 }
 146
 147 =head2 open
 148
 149 This function will read whole database in memory and produce lookups.
 150
 151  $input->open(
 152         path => '/path/to/database/file',
 153         code_page => '852',
 154         limit => 500,
 155         offset => 6000,
 156         lookup => $lookup_obj,
 157  );
 158
 159 By default, C<code_page> is assumed to be C<852>.
 160
 161 C<offset> is optional parametar to position at some offset before reading from database.
 162
 163 C<limit> is optional parametar to read just C<limit> records from database
 164
 165 Returns size of database, regardless of C<offset> and C<limit>
 166 parametars, see also C<size>.
 167
 168 =cut
 169
 170 sub open {
 171         my $self = shift;
 172         my $arg = {@_};
 173
 174         my $log = $self->_get_logger();
 175
 176         $log->logcroak("need path") if (! $arg->{'path'});
 177         my $code_page = $arg->{'code_page'} || '852';
 178
 179         # store data in object
 180         $self->{'input_code_page'} = $code_page;
 181         foreach my $v (qw/path offset limit/) {
 182                 $self->{$v} = $arg->{$v} if ($arg->{$v});
 183         }
 184
 185         # create Text::Iconv object
 186         $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
 187
 188         my $filter_ref;
 189
 190         if ($self->{recode}) {
 191                 my @r = split(/\s/, $self->{recode});
 192                 if ($#r % 2 != 1) {
 193                         $log->logwarn("recode needs even number of elements (some number of valid pairs)");
 194                 } else {
 195                         my $recode;
 196                         while (@r) {
 197                                 my $from = shift @r;
 198                                 my $to = shift @r;
 199                                 $recode->{$from} = $to;
 200                         }
 201
 202                         my $regex = join '|' => keys %{ $recode };
 203
 204                         $log->debug("using recode regex: $regex");
 205
 206                         $filter_ref = sub {
 207                                 my $t = shift;
 208                                 $t =~ s/($regex)/$recode->{$1}/g;
 209                                 return $t;
 210                         };
 211
 212                 }
 213
 214         }
 215
 216         my ($db, $size) = $self->{open_db}->( $self,
 217                 path => $arg->{path},
 218                 filter => $filter_ref,
 219         );
 220
 221         unless ($db) {
 222                 $log->logwarn("can't open database $arg->{path}, skipping...");
 223                 return;
 224         }
 225
 226         unless ($size) {
 227                 $log->logwarn("no records in database $arg->{path}, skipping...");
 228                 return;
 229         }
 230
 231         my $from_rec = 1;
 232         my $to_rec = $size;
 233
 234         if (my $s = $self->{offset}) {
 235                 $log->info("skipping to MFN $s");
 236                 $from_rec = $s;
 237         } else {
 238                 $self->{offset} = $from_rec;
 239         }
 240
 241         if ($self->{limit}) {
 242                 $log->debug("limiting to ",$self->{limit}," records");
 243                 $to_rec = $from_rec + $self->{limit} - 1;
 244                 $to_rec = $size if ($to_rec > $size);
 245         }
 246
 247         # store size for later
 248         $self->{size} = ($to_rec - $from_rec) ? ($to_rec - $from_rec + 1) : 0;
 249
 250         $log->info("processing $self->{size}/$size records [$from_rec-$to_rec] convert $code_page -> $self->{code_page}");
 251
 252         # read database
 253         for (my $pos = $from_rec; $pos <= $to_rec; $pos++) {
 254
 255                 $log->debug("position: $pos\n");
 256
 257                 my $rec = $self->{fetch_rec}->($self, $db, $pos );
 258
 259                 $log->debug(sub { Dumper($rec) });
 260
 261                 if (! $rec) {
 262                         $log->warn("record $pos empty? skipping...");
 263                         next;
 264                 }
 265
 266                 # store
 267                 if ($self->{low_mem}) {
 268                         $self->{db}->put($pos, $rec);
 269                 } else {
 270                         $self->{data}->{$pos} = $rec;
 271                 }
 272
 273                 # create lookup
 274                 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
 275
 276                 $self->progress_bar($pos,$to_rec);
 277
 278         }
 279
 280         $self->{pos} = -1;
 281         $self->{last_pcnt} = 0;
 282
 283         # store max mfn and return it.
 284         $self->{max_pos} = $to_rec;
 285         $log->debug("max_pos: $to_rec");
 286
 287         return $size;
 288 }
 289
 290 =head2 fetch
 291
 292 Fetch next record from database. It will also displays progress bar.
 293
 294  my $rec = $isis->fetch;
 295
 296 Record from this function should probably go to C<data_structure> for
 297 normalisation.
 298
 299 =cut
 300
 301 sub fetch {
 302         my $self = shift;
 303
 304         my $log = $self->_get_logger();
 305
 306         $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
 307
 308         if ($self->{pos} == -1) {
 309                 $self->{pos} = $self->{offset};
 310         } else {
 311                 $self->{pos}++;
 312         }
 313
 314         my $mfn = $self->{pos};
 315
 316         if ($mfn > $self->{max_pos}) {
 317                 $self->{pos} = $self->{max_pos};
 318                 $log->debug("at EOF");
 319                 return;
 320         }
 321
 322         $self->progress_bar($mfn,$self->{max_pos});
 323
 324         my $rec;
 325
 326         if ($self->{low_mem}) {
 327                 $rec = $self->{db}->get($mfn);
 328         } else {
 329                 $rec = $self->{data}->{$mfn};
 330         }
 331
 332         $rec ||= 0E0;
 333 }
 334
 335 =head2 pos
 336
 337 Returns current record number (MFN).
 338
 339  print $isis->pos;
 340
 341 First record in database has position 1.
 342
 343 =cut
 344
 345 sub pos {
 346         my $self = shift;
 347         return $self->{pos};
 348 }
 349
 350
 351 =head2 size
 352
 353 Returns number of records in database
 354
 355  print $isis->size;
 356
 357 Result from this function can be used to loop through all records
 358
 359  foreach my $mfn ( 1 ... $isis->size ) { ... }
 360
 361 because it takes into account C<offset> and C<limit>.
 362
 363 =cut
 364
 365 sub size {
 366         my $self = shift;
 367         return $self->{size};
 368 }
 369
 370 =head2 seek
 371
 372 Seek to specified MFN in file.
 373
 374  $isis->seek(42);
 375
 376 First record in database has position 1.
 377
 378 =cut
 379
 380 sub seek {
 381         my $self = shift;
 382         my $pos = shift || return;
 383
 384         my $log = $self->_get_logger();
 385
 386         if ($pos < 1) {
 387                 $log->warn("seek before first record");
 388                 $pos = 1;
 389         } elsif ($pos > $self->{max_pos}) {
 390                 $log->warn("seek beyond last record");
 391                 $pos = $self->{max_pos};
 392         }
 393
 394         return $self->{pos} = (($pos - 1) || -1);
 395 }
 396
 397
 398 =head1 MEMORY USAGE
 399
 400 C<low_mem> options is double-edged sword. If enabled, WebPAC
 401 will run on memory constraint machines (which doesn't have enough
 402 physical RAM to create memory structure for whole source database).
 403
 404 If your machine has 512Mb or more of RAM and database is around 10000 records,
 405 memory shouldn't be an issue. If you don't have enough physical RAM, you
 406 might consider using virtual memory (if your operating system is handling it
 407 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
 408 parsed structure of ISIS database (this is what C<low_mem> option does).
 409
 410 Hitting swap at end of reading source database is probably o.k. However,
 411 hitting swap before 90% will dramatically decrease performance and you will
 412 be better off with C<low_mem> and using rest of availble memory for
 413 operating system disk cache (Linux is particuallary good about this).
 414 However, every access to database record will require disk access, so
 415 generation phase will be slower 10-100 times.
 416
 417 Parsed structures are essential - you just have option to trade RAM memory
 418 (which is fast) for disk space (which is slow). Be sure to have planty of
 419 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
 420
 421 However, when WebPAC is running on desktop machines (or laptops :-), it's
 422 highly undesireable for system to start swapping. Using C<low_mem> option can
 423 reduce WecPAC memory usage to around 64Mb for same database with lookup
 424 fields and sorted indexes which stay in RAM. Performance will suffer, but
 425 memory usage will really be minimal. It might be also more confortable to
 426 run WebPAC reniced on those machines.
 427
 428
 429 =head1 AUTHOR
 430
 431 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 432
 433 =head1 COPYRIGHT & LICENSE
 434
 435 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 436
 437 This program is free software; you can redistribute it and/or modify it
 438 under the same terms as Perl itself.
 439
 440 =cut
 441
 442 1; # End of WebPAC::Input