9 use base qw/WebPAC::Common/;
14 WebPAC::Input - core module for input file format
22 our $VERSION = '0.02';
26 This module is used as base class for all database specific modules
27 (basically, files which have one handle, fixed size while indexing and some
28 kind of numeric identifier which goes from 1 to filesize).
30 Perhaps a little code snippet.
34 my $db = WebPAC::Input->new(
37 lookup => $lookup_obj,
41 $db->open('/path/to/database');
42 print "database size: ",$db->size,"\n";
43 while (my $row = $db->fetch) {
51 Create new input database object.
53 my $db = new WebPAC::Input(
55 code_page => 'ISO-8859-2',
59 Optional parameter C<code_page> specifies the application code page (which will be
60 used internally). This should probably be your terminal encoding, and by
61 default, it is C<ISO-8859-2>.
63 Default is not to use the C<low_mem> option (see L<MEMORY USAGE> below).
65 This function will also call low-level C<init> if it exists with same
75 my $log = $self->_get_logger;
77 # check if required subclasses are implemented
78 foreach my $subclass (qw/open_db fetch_rec/) {
79 $log->logdie("missing implementation of $subclass") unless ($self->SUPER::can($subclass));
82 if ($self->can('init')) {
83 $log->debug("calling init");
87 $self->{'code_page'} ||= 'ISO-8859-2';
89 # running with low_mem flag? well, use DBM::Deep then.
90 if ($self->{'low_mem'}) {
91 $log->info("running with low_mem which impacts performance (<32 Mb memory usage)");
93 my $db_file = "data.db";
96 unlink $db_file or $log->logdie("can't remove '$db_file' from last run");
97 $log->debug("removed '$db_file' from last run");
102 my $db = new DBM::Deep $db_file;
104 $log->logdie("DBM::Deep error: $!") unless ($db);
107 $log->logdie("can't open '$db_file' under low_mem: ",$db->error());
109 $log->debug("using file '$db_file' for DBM::Deep");
115 $self ? return $self : return undef;
120 This function will read whole database in memory and produce lookups.
123 path => '/path/to/database/file',
127 lookup => $lookup_obj,
130 By default, C<code_page> is assumed to be C<852>.
132 If optional parameter C<start_mfn> is set, this will be the first MFN to read
133 from database (so you can skip beginning of your database if you need to).
135 If optional parameter C<limit_mfn> is set, it will read just 500 records
136 from database in example above.
138 Returns size of database, regardless of C<start_mfn> and C<limit_mfn>
139 parameters, see also C<< $isis->size >>.
147 my $log = $self->_get_logger();
149 $log->logcroak("need path") if (! $arg->{'path'});
150 my $code_page = $arg->{'code_page'} || '852';
152 # store data in object
153 $self->{'code_page'} = $code_page;
154 foreach my $v (qw/path start_mfn limit_mfn/) {
155 $self->{$v} = $arg->{$v} if ($arg->{$v});
158 # create Text::Iconv object
159 $self->{iconv} = Text::Iconv->new($code_page,$self->{'code_page'});
161 my ($db, $size) = $self->open_db(
162 path => $arg->{path},
166 $log->logwarn("can't open database $arg->{path}, skipping...");
171 $log->logwarn("no records in database $arg->{path}, skipping...");
178 if (my $s = $self->{start_mfn}) {
179 $log->info("skipping to MFN $s");
182 $self->{start_mfn} = $startmfn;
185 if ($self->{limit_mfn}) {
186 $log->info("limiting to ",$self->{limit_mfn}," records");
187 $maxmfn = $startmfn + $self->{limit_mfn} - 1;
188 $maxmfn = $size if ($maxmfn > $size);
191 # store size for later
192 $self->{size} = ($maxmfn - $startmfn) ? ($maxmfn - $startmfn + 1) : 0;
194 $log->info("processing $self->{size} records in $code_page, convert to $self->{code_page}");
197 for (my $mfn = $startmfn; $mfn <= $maxmfn; $mfn++) {
199 $log->debug("mfn: $mfn\n");
201 my $rec = $self->fetch_rec( $db, $mfn );
204 $log->warn("record $mfn empty? skipping...");
209 if ($self->{'low_mem'}) {
210 $self->{'db'}->put($mfn, $rec);
212 $self->{'data'}->{$mfn} = $rec;
216 $self->{'lookup'}->add( $rec ) if ($rec && $self->{'lookup'});
218 $self->progress_bar($mfn,$maxmfn);
222 $self->{'current_mfn'} = -1;
223 $self->{'last_pcnt'} = 0;
225 $log->debug("max mfn: $maxmfn");
227 # store max mfn and return it.
228 $self->{'max_mfn'} = $maxmfn;
235 Fetch next record from database. It will also display a progress bar.
237 my $rec = $isis->fetch;
239 Record from this function should probably go to C<data_structure> for
247 my $log = $self->_get_logger();
249 $log->logconfess("it seems that you didn't load database!") unless ($self->{'current_mfn'});
251 if ($self->{'current_mfn'} == -1) {
252 $self->{'current_mfn'} = $self->{'start_mfn'};
254 $self->{'current_mfn'}++;
257 my $mfn = $self->{'current_mfn'};
259 if ($mfn > $self->{'max_mfn'}) {
260 $self->{'current_mfn'} = $self->{'max_mfn'};
261 $log->debug("at EOF");
265 $self->progress_bar($mfn,$self->{'max_mfn'});
269 if ($self->{'low_mem'}) {
270 $rec = $self->{'db'}->get($mfn);
272 $rec = $self->{'data'}->{$mfn};
280 Returns current record number (MFN).
284 First record in database has position 1.
290 return $self->{'current_mfn'};
296 Returns number of records in database
300 Result from this function can be used to loop through all records
302 foreach my $mfn ( 1 ... $isis->size ) { ... }
304 because it takes into account C<start_mfn> and C<limit_mfn>.
310 return $self->{'size'};
315 Seek to specified MFN in file.
319 First record in database has position 1.
325 my $pos = shift || return;
327 my $log = $self->_get_logger();
330 $log->warn("seek before first record");
332 } elsif ($pos > $self->{'max_mfn'}) {
333 $log->warn("seek beyond last record");
334 $pos = $self->{'max_mfn'};
337 return $self->{'current_mfn'} = (($pos - 1) || -1);
343 The C<low_mem> option is a double-edged sword. If enabled, WebPAC
344 will run on memory-constrained machines (which don't have enough
345 physical RAM to create memory structure for whole source database).
347 If your machine has 512Mb or more of RAM and database is around 10000 records,
348 memory shouldn't be an issue. If you don't have enough physical RAM, you
349 might consider using virtual memory (if your operating system is handling it
350 well, like on FreeBSD or Linux) instead of dropping to L<DBM::Deep> to handle
351 parsed structure of ISIS database (this is what C<low_mem> option does).
353 Hitting swap at end of reading source database is probably o.k. However,
354 hitting swap before 90% will dramatically decrease performance and you will
355 be better off with C<low_mem> and using rest of available memory for
356 operating system disk cache (Linux is particularly good about this).
357 However, every access to database record will require disk access, so
358 generation phase will be slower 10-100 times.
360 Parsed structures are essential - you just have option to trade RAM memory
361 (which is fast) for disk space (which is slow). Be sure to have plenty of
362 disk space if you are using C<low_mem> and thus L<DBM::Deep>.
364 However, when WebPAC is running on desktop machines (or laptops :-), it's
365 highly undesirable for system to start swapping. Using C<low_mem> option can
366 reduce WebPAC memory usage to around 64Mb for same database with lookup
367 fields and sorted indexes which stay in RAM. Performance will suffer, but
368 memory usage will really be minimal. It might be also more comfortable to
369 run WebPAC reniced on those machines.
374 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
376 =head1 COPYRIGHT & LICENSE
378 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
380 This program is free software; you can redistribute it and/or modify it
381 under the same terms as Perl itself.
385 1; # End of WebPAC::Input