9 use base qw/WebPAC::Common/;
14 WebPAC::Input - read different file formats into WebPAC
22 our $VERSION = '0.03';
26 This module implements input as a database which has a fixed and known
27 I<size> while indexing and a single unique numeric identifier for database
28 position ranging from 1 to I<size>.
30 Simply, something that is indexed by number from 1 .. I<size>.
32 Examples of such databases are CDS/ISIS files, MARC files, lines in
35 Specific file formats are implemented using low-level interface modules,
36 located in C<WebPAC::Input::*> namespace which export C<open_db>,
37 C<fetch_rec> and optional C<init> functions.
39 Perhaps a little code snippet.
43 my $db = WebPAC::Input->new(
44 module => 'WebPAC::Input::ISIS',
46 lookup => $lookup_obj,
50 $db->open('/path/to/database');
51 print "database size: ",$db->size,"\n";
52 while (my $rec = $db->fetch) {
61 Create new input database object.
63 my $db = new WebPAC::Input(
64 module => 'WebPAC::Input::MARC',
65 code_page => 'ISO-8859-2',
69 C<module> is the low-level file format module. See L<WebPAC::Input::ISIS> and
70 L<WebPAC::Input::MARC>.
72 Optional parameter C<code_page> specifies the application code page (which will be
73 used internally). This should probably be your terminal encoding, and by
74 default it is C<ISO-8859-2>.
76 Default is not to use the C<low_mem> option (see L<MEMORY USAGE> below).
78 This function will also call low-level C<init> if it exists with same
88 my $log = $self->_get_logger;
90 $log->logconfess("specify low-level file format module") unless ($self->{module});
91 my $module = $self->{module};
94 $log->debug("require low-level module $self->{module} from $module");
97 #eval $self->{module} .'->import';
99 # check if required subclasses are implemented
100 foreach my $subclass (qw/open_db fetch_rec init/) {
101 my $n = $self->{module} . '::' . $subclass;
102 if (! defined &{ $n }) {
103 my $missing = "missing $subclass in $self->{module}";
104 $log->logwarn($missing);
105 $self->{$subclass} = sub { warn "$missing\n" };
107 $self->{$subclass} = \&{ $n };
112 $log->debug("calling init");
113 $self->{init}->($self, @_);
116 $self->{'code_page'} ||= 'ISO-8859-2';
118 # running with low_mem flag? well, use DBM::Deep then.
119 if ($self->{'low_mem'}) {
120 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
122 my $db_file = "data.db";
125 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
126 $log->debug("removed '$db_file' from last run");
131 my $db = new DBM::Deep $db_file;
133 $log->logdie("DBM::Deep error: $!") unless ($db);
136 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
138 $log->debug("using file '$db_file' for DBM::Deep");
144 $self ? return $self : return undef;
149 This function will read the whole database into memory and produce lookups.
152 path => '/path/to/database/file',
156 lookup => $lookup_obj,
159 By default, C<code_page> is assumed to be C<852>.
161 C<offset> is an optional parameter to position at some offset before reading from the database.
163 C<limit> is an optional parameter to read just C<limit> records from the database
165 Returns size of database, regardless of C<offset> and C<limit>
166 parameters, see also C<size>.
174 my $log = $self->_get_logger();
176 $log->logcroak("need path") if (! $arg->{'path'});
177 my $code_page = $arg->{'code_page'} || '852';
179 # store data in object
180 $self->{'input_code_page'} = $code_page;
181 foreach my $v (qw/path offset limit/) {
182 $self->{$v} = $arg->{$v} if ($arg->{$v});
185 # create Text::Iconv object
186 $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
188 my ($db, $size) = $self->{open_db}->( $self,
189 path => $arg->{path},
193 $log->logwarn("can't open database $arg->{path}, skipping...");
198 $log->logwarn("no records in database $arg->{path}, skipping...");
205 if (my $s = $self->{offset}) {
206 $log->info("skipping to MFN $s");
209 $self->{offset} = $offset;
212 if ($self->{limit}) {
213 $log->info("limiting to ",$self->{limit}," records");
214 $limit = $offset + $self->{limit} - 1;
215 $limit = $size if ($limit > $size);
218 # store size for later
219 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
221 $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
224 for (my $pos = $offset; $pos <= $limit; $pos++) {
226 $log->debug("position: $pos\n");
228 my $rec = $self->{fetch_rec}->($self, $db, $pos );
231 $log->warn("record $pos empty? skipping...");
236 if ($self->{low_mem}) {
237 $self->{db}->put($pos, $rec);
239 $self->{data}->{$pos} = $rec;
243 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
245 $self->progress_bar($pos,$limit);
250 $self->{last_pcnt} = 0;
252 # store max mfn and return it.
253 $self->{max_pos} = $limit;
254 $log->debug("max_pos: $limit");
261 Fetch next record from database. It will also display a progress bar.
263 my $rec = $isis->fetch;
265 Record from this function should probably go to C<data_structure> for
273 my $log = $self->_get_logger();
275 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
277 if ($self->{pos} == -1) {
278 $self->{pos} = $self->{offset};
283 my $mfn = $self->{pos};
285 if ($mfn > $self->{max_pos}) {
286 $self->{pos} = $self->{max_pos};
287 $log->debug("at EOF");
291 $self->progress_bar($mfn,$self->{max_pos});
295 if ($self->{low_mem}) {
296 $rec = $self->{db}->get($mfn);
298 $rec = $self->{data}->{$mfn};
306 Returns current record number (MFN).
310 First record in database has position 1.
322 Returns number of records in database
326 Result from this function can be used to loop through all records
328 foreach my $mfn ( 1 ... $isis->size ) { ... }
330 because it takes into account C<offset> and C<limit>.
336 return $self->{size};
341 Seek to specified MFN in file.
345 First record in database has position 1.
351 my $pos = shift || return;
353 my $log = $self->_get_logger();
356 $log->warn("seek before first record");
358 } elsif ($pos > $self->{max_pos}) {
359 $log->warn("seek beyond last record");
360 $pos = $self->{max_pos};
363 return $self->{pos} = (($pos - 1) || -1);
369 The C<low_mem> option is a double-edged sword. If enabled, WebPAC
370 will run on memory-constrained machines (which don't have enough
371 physical RAM to create the memory structure for the whole source database).
373 If your machine has 512Mb or more of RAM and database is around 10000 records,
374 memory shouldn't be an issue. If you don't have enough physical RAM, you
375 might consider using virtual memory (if your operating system is handling it
376 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
377 parsed structure of ISIS database (this is what C<low_mem> option does).
379 Hitting swap at end of reading source database is probably o.k. However,
380 hitting swap before 90% will dramatically decrease performance and you will
381 be better off with C<low_mem> and using rest of available memory for
382 operating system disk cache (Linux is particularly good about this).
383 However, every access to database record will require disk access, so
384 generation phase will be slower 10-100 times.
386 Parsed structures are essential - you just have option to trade RAM memory
387 (which is fast) for disk space (which is slow). Be sure to have plenty of
388 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
390 However, when WebPAC is running on desktop machines (or laptops :-), it's
391 highly undesirable for system to start swapping. Using C<low_mem> option can
392 reduce WebPAC memory usage to around 64Mb for same database with lookup
393 fields and sorted indexes which stay in RAM. Performance will suffer, but
394 memory usage will really be minimal. It might be also more comfortable to
395 run WebPAC reniced on those machines.
400 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
402 =head1 COPYRIGHT & LICENSE
404 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
406 This program is free software; you can redistribute it and/or modify it
407 under the same terms as Perl itself.
411 1; # End of WebPAC::Input