7 use base qw/WebPAC::Common/;
12 WebPAC::Input - read different file formats into WebPAC
20 our $VERSION = '0.03';
24 This module implements input as a database which has a fixed and known
25 I<size> while indexing and a single unique numeric identifier for database
26 position ranging from 1 to I<size>.
28 Simply, something that is indexed by number from 1 .. I<size>.
30 Examples of such databases are CDS/ISIS files, MARC files, lines in
33 Specific file formats are implemented using low-level interface modules,
34 located in C<WebPAC::Input::*> namespace which export C<open_db>,
35 C<fetch_rec> and optional C<init> functions.
37 Perhaps a little code snippet.
41 my $db = WebPAC::Input->new(
42 module => 'WebPAC::Input::ISIS',
44 lookup => $lookup_obj,
48 $db->open('/path/to/database');
49 print "database size: ",$db->size,"\n";
50 while (my $rec = $db->fetch) {
59 Create new input database object.
61 my $db = new WebPAC::Input(
62 module => 'WebPAC::Input::MARC',
63 code_page => 'ISO-8859-2',
67 C<module> is the low-level file format module. See L<WebPAC::Input::ISIS> and
68 L<WebPAC::Input::MARC>.
70 Optional parameter C<code_page> specifies the application code page (which will be
71 used internally). This should probably be your terminal encoding, and by
72 default, it is C<ISO-8859-2>.
74 Default is not to use the C<low_mem> option (see L<MEMORY USAGE> below).
76 This function will also call low-level C<init> if it exists with same
86 my $log = $self->_get_logger;
88 $log->logconfess("specify low-level file format module") unless ($self->{module});
89 my $module = $self->{module};
92 $log->debug("require low-level module $self->{module} from $module");
95 #eval $self->{module} .'->import';
97 # check if required subclasses are implemented
98 foreach my $subclass (qw/open_db fetch_rec init/) {
99 my $n = $self->{module} . '::' . $subclass;
100 if (! defined &{ $n }) {
101 my $missing = "missing $subclass in $self->{module}";
102 $self->{$subclass} = sub { $log->logwarn($missing) };
104 $self->{$subclass} = \&{ $n };
109 $log->debug("calling init");
110 $self->{init}->($self, @_);
113 $self->{'code_page'} ||= 'ISO-8859-2';
115 # running with low_mem flag? well, use DBM::Deep then.
116 if ($self->{'low_mem'}) {
117 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
119 my $db_file = "data.db";
122 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
123 $log->debug("removed '$db_file' from last run");
128 my $db = new DBM::Deep $db_file;
130 $log->logdie("DBM::Deep error: $!") unless ($db);
133 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
135 $log->debug("using file '$db_file' for DBM::Deep");
141 $self ? return $self : return undef;
146 This function will read whole database in memory and produce lookups.
149 path => '/path/to/database/file',
153 lookup => $lookup_obj,
156 By default, C<code_page> is assumed to be C<852>.
158 C<offset> is an optional parameter to position at some offset before reading from the database.
160 C<limit> is an optional parameter to read just C<limit> records from the database
162 Returns size of database, regardless of C<offset> and C<limit>
163 parameters, see also C<size>.
171 my $log = $self->_get_logger();
173 $log->logcroak("need path") if (! $arg->{'path'});
174 my $code_page = $arg->{'code_page'} || '852';
176 # store data in object
177 $self->{'input_code_page'} = $code_page;
178 foreach my $v (qw/path offset limit/) {
179 $self->{$v} = $arg->{$v} if ($arg->{$v});
182 # create Text::Iconv object
183 $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
185 my ($db, $size) = $self->{open_db}->( $self,
186 path => $arg->{path},
190 $log->logwarn("can't open database $arg->{path}, skipping...");
195 $log->logwarn("no records in database $arg->{path}, skipping...");
202 if (my $s = $self->{offset}) {
203 $log->info("skipping to MFN $s");
206 $self->{offset} = $offset;
209 if ($self->{limit}) {
210 $log->debug("limiting to ",$self->{limit}," records");
211 $limit = $offset + $self->{limit} - 1;
212 $limit = $size if ($limit > $size);
215 # store size for later
216 $self->{size} = ($limit - $offset) ? ($limit - $offset + 1) : 0;
218 $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
221 for (my $pos = $offset; $pos <= $limit; $pos++) {
223 $log->debug("position: $pos\n");
225 my $rec = $self->{fetch_rec}->($self, $db, $pos );
228 $log->warn("record $pos empty? skipping...");
233 if ($self->{low_mem}) {
234 $self->{db}->put($pos, $rec);
236 $self->{data}->{$pos} = $rec;
240 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
242 $self->progress_bar($pos,$limit);
247 $self->{last_pcnt} = 0;
249 # store max mfn and return it.
250 $self->{max_pos} = $limit;
251 $log->debug("max_pos: $limit");
258 Fetch next record from database. It will also display a progress bar.
260 my $rec = $isis->fetch;
262 Record from this function should probably go to C<data_structure> for
270 my $log = $self->_get_logger();
272 $log->logconfess("it seems that you didn't load database!") unless ($self->{pos});
274 if ($self->{pos} == -1) {
275 $self->{pos} = $self->{offset};
280 my $mfn = $self->{pos};
282 if ($mfn > $self->{max_pos}) {
283 $self->{pos} = $self->{max_pos};
284 $log->debug("at EOF");
288 $self->progress_bar($mfn,$self->{max_pos});
292 if ($self->{low_mem}) {
293 $rec = $self->{db}->get($mfn);
295 $rec = $self->{data}->{$mfn};
303 Returns current record number (MFN).
307 First record in database has position 1.
319 Returns number of records in database
323 Result from this function can be used to loop through all records
325 foreach my $mfn ( 1 ... $isis->size ) { ... }
327 because it takes into account C<offset> and C<limit>.
333 return $self->{size};
338 Seek to specified MFN in file.
342 First record in database has position 1.
348 my $pos = shift || return;
350 my $log = $self->_get_logger();
353 $log->warn("seek before first record");
355 } elsif ($pos > $self->{max_pos}) {
356 $log->warn("seek beyond last record");
357 $pos = $self->{max_pos};
360 return $self->{pos} = (($pos - 1) || -1);
366 The C<low_mem> option is a double-edged sword. If enabled, WebPAC
367 will run on memory-constrained machines (which don't have enough
368 physical RAM to create memory structure for whole source database).
370 If your machine has 512Mb or more of RAM and database is around 10000 records,
371 memory shouldn't be an issue. If you don't have enough physical RAM, you
372 might consider using virtual memory (if your operating system is handling it
373 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
374 parsed structure of ISIS database (this is what C<low_mem> option does).
376 Hitting swap at end of reading source database is probably o.k. However,
377 hitting swap before 90% will dramatically decrease performance and you will
378 be better off with C<low_mem> and using rest of available memory for
379 operating system disk cache (Linux is particularly good about this).
380 However, every access to database record will require disk access, so
381 generation phase will be slower 10-100 times.
383 Parsed structures are essential - you just have option to trade RAM memory
384 (which is fast) for disk space (which is slow). Be sure to have plenty of
385 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
387 However, when WebPAC is running on desktop machines (or laptops :-), it's
388 highly undesirable for system to start swapping. Using C<low_mem> option can
389 reduce WebPAC memory usage to around 64Mb for same database with lookup
390 fields and sorted indexes which stay in RAM. Performance will suffer, but
391 memory usage will really be minimal. It might be also more comfortable to
392 run WebPAC reniced on those machines.
397 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
399 =head1 COPYRIGHT & LICENSE
401 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
403 This program is free software; you can redistribute it and/or modify it
404 under the same terms as Perl itself.
408 1; # End of WebPAC::Input