git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2
   3 use warnings;
   4 use strict;
   5 use base 'WebPAC::Common';
   6 use Data::Dumper;
   7
   8 =head1 NAME
   9
  10 WebPAC::Normalize - data mungling for normalisation
  11
  12 =head1 VERSION
  13
  14 Version 0.02
  15
  16 =cut
  17
  18 our $VERSION = '0.02';
  19
  20 =head1 SYNOPSIS
  21
  22 This package contains code that mungle data to produce normalized format.
  23
  24 It contains several assumptions:
  25
  26 =over
  27
  28 =item *
  29
  30 format of fields is defined using C<v123^a> notation for repeatable fields
  31 or C<s123^a> for single (or first) value, where C<123> is field number and
  32 C<a> is subfield.
  33
  34 =item *
  35
  36 source data records (C<$rec>) have unique identifiers in field C<000>
  37
  38 =item *
  39
  40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
  41 perl code that is evaluated before producing output (value of field will be
  42 interpolated before that)
  43
  44 =item *
  45
  46 optional C<filter{filter_name}> at B<begining of format> will apply perl
  47 code defined as code ref on format after field substitution to producing
  48 output
  49
  50 =item *
  51
  52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
  53
  54 =item *
  55
  56 at end, optional C<format>s rules are resolved. Format rules are similar to
  57 C<sprintf> and can also contain C<lookup{...}> which is performed after
  58 values are inserted in format.
  59
  60 =back
  61
  62 This also describes order in which transformations are applied (eval,
  63 filter, lookup, format) which is important to undestand when deciding how to
  64 solve your data mungling and normalisation process.
  65
  66
  67
  68
  69 =head1 FUNCTIONS
  70
  71 =head2 new
  72
  73 Create new normalisation object
  74
  75   my $n = new WebPAC::Normalize::Something(
  76         filter => {
  77                 'filter_name_1' => sub {
  78                         # filter code
  79                         return length($_);
  80                 }, ...
  81         },
  82         db => $db_obj,
  83         lookup_regex => $lookup->regex,
  84         lookup => $lookup_obj,
  85   );
  86
  87 Parametar C<filter> defines user supplied snippets of perl code which can
  88 be use with C<filter{...}> notation.
  89
  90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
  91 in structures. If you pass this parametar, you must also pass C<lookup>
  92 which is C<WebPAC::Lookup> object.
  93
  94 =cut
  95
  96 sub new {
  97         my $class = shift;
  98         my $self = {@_};
  99         bless($self, $class);
 100
 101         my $r = $self->{'lookup_regex'} ? 1 : 0;
 102         my $l = $self->{'lookup'} ? 1 : 0;
 103
 104         my $log = $self->_get_logger();
 105
 106         # those two must be in pair
 107         if ( ($r & $l) != ($r || $l) ) {
 108                 my $log = $self->_get_logger();
 109                 $log->logdie("lookup_regex and lookup must be in pair");
 110         }
 111
 112         $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
 113
 114         $self ? return $self : return undef;
 115 }
 116
 117
 118 =head2 data_structure
 119
 120 Create in-memory data structure which represents normalized layout from
 121 C<conf/normalize/*.xml>.
 122
 123 This structures are used to produce output.
 124
 125  my $ds = $webpac->data_structure($rec);
 126
 127 B<Note: historical oddity follows>
 128
 129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
 130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
 131 C<< <headline> >> tag.
 132
 133 =cut
 134
 135 sub data_structure {
 136         my $self = shift;
 137
 138         my $log = $self->_get_logger();
 139
 140         my $rec = shift;
 141         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 142
 143         my $cache_file;
 144
 145         if ($self->{'db'}) {
 146                 my $ds = $self->{'db'}->load_ds($rec);
 147                 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
 148                 return $ds if ($ds);
 149                 $log->debug("cache miss, creating");
 150         }
 151
 152         undef $self->{'currnet_filename'};
 153         undef $self->{'headline'};
 154
 155         my @sorted_tags;
 156         if ($self->{tags_by_order}) {
 157                 @sorted_tags = @{$self->{tags_by_order}};
 158         } else {
 159                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
 160                 $self->{tags_by_order} = \@sorted_tags;
 161         }
 162
 163         my $ds;
 164
 165         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
 166
 167         foreach my $field (@sorted_tags) {
 168
 169                 my $row;
 170
 171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 172
 173                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
 174                         my $format;
 175
 176                         $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
 177                         $format = $tag->{'value'} || $tag->{'content'};
 178
 179                         $log->debug("format: $format");
 180
 181                         my @v;
 182                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 183                                 @v = $self->fill_in_to_arr($rec,$format);
 184                         } else {
 185                                 @v = $self->parse_to_arr($rec,$format);
 186                         }
 187                         next if (! @v);
 188
 189                         if ($tag->{'sort'}) {
 190                                 @v = $self->sort_arr(@v);
 191                         }
 192
 193                         # use format?
 194                         if ($tag->{'format_name'}) {
 195                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 196                         }
 197
 198                         if ($field eq 'filename') {
 199                                 $self->{'current_filename'} = join('',@v);
 200                                 $log->debug("filename: ",$self->{'current_filename'});
 201                         } elsif ($field eq 'headline') {
 202                                 $self->{'headline'} .= join('',@v);
 203                                 $log->debug("headline: ",$self->{'headline'});
 204                                 next; # don't return headline in data_structure!
 205                         }
 206
 207                         # delimiter will join repeatable fields
 208                         if ($tag->{'delimiter'}) {
 209                                 @v = ( join($tag->{'delimiter'}, @v) );
 210                         }
 211
 212                         # default types
 213                         my @types = qw(display search);
 214                         # override by type attribute
 215                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
 216
 217                         foreach my $type (@types) {
 218                                 # append to previous line?
 219                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
 220                                 if ($tag->{'append'}) {
 221
 222                                         # I will delimit appended part with
 223                                         # delimiter (or ,)
 224                                         my $d = $tag->{'delimiter'};
 225                                         # default delimiter
 226                                         $d ||= " ";
 227
 228                                         my $last = pop @{$row->{$type}};
 229                                         $d = "" if (! $last);
 230                                         $last .= $d . join($d, @v);
 231                                         push @{$row->{$type}}, $last;
 232
 233                                 } else {
 234                                         push @{$row->{$type}}, @v;
 235                                 }
 236                         }
 237
 238
 239                 }
 240
 241                 if ($row) {
 242                         $row->{'tag'} = $field;
 243
 244                         # TODO: name_sigular, name_plural
 245                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 246                         my $row_name = $name ? $self->_x($name) : $field;
 247
 248                         # post-sort all values in field
 249                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 250                                 $log->warn("sort at field tag not implemented");
 251                         }
 252
 253                         $ds->{$row_name} = $row;
 254
 255                         $log->debug("row $field: ",sub { Dumper($row) });
 256                 }
 257
 258         }
 259
 260         $log->logdie("there is no current_filename defined! Do you have filename tag in conf/normalize/?.xml") unless ($self->{'current_filename'});
 261
 262         $self->{'db'}->save_ds(
 263                 ds => $ds,
 264                 current_filename => $self->{'current_filename'},
 265                 headline => $self->{'headline'},
 266         ) if ($self->{'db'});
 267
 268         $log->debug("ds: ", sub { Dumper($ds) });
 269
 270         $log->logconfess("data structure returned is not array any more!") if wantarray;
 271
 272         return $ds;
 273
 274 }
 275
 276 =head2 parse
 277
 278 Perform smart parsing of string, skipping delimiters for fields which aren't
 279 defined. It can also eval code in format starting with C<eval{...}> and
 280 return output or nothing depending on eval code.
 281
 282  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 283
 284 =cut
 285
 286 sub parse {
 287         my $self = shift;
 288
 289         my ($rec, $format_utf8, $i) = @_;
 290
 291         return if (! $format_utf8);
 292
 293         my $log = $self->_get_logger();
 294
 295         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 296
 297         $i = 0 if (! $i);
 298
 299         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
 300
 301         my @out;
 302
 303         $log->debug("format: $format");
 304
 305         my $eval_code;
 306         # remove eval{...} from beginning
 307         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 308
 309         my $filter_name;
 310         # remove filter{...} from beginning
 311         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 312
 313         my $prefix;
 314         my $all_found=0;
 315
 316         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
 317
 318                 my $del = $1 || '';
 319                 $prefix ||= $del if ($all_found == 0);
 320
 321                 # repeatable index
 322                 my $r = $i;
 323                 $r = 0 if (lc("$2") eq 's');
 324
 325                 my $found = 0;
 326                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
 327
 328                 if ($found) {
 329                         push @out, $del;
 330                         push @out, $tmp;
 331                         $all_found += $found;
 332                 }
 333         }
 334
 335         return if (! $all_found);
 336
 337         my $out = join('',@out);
 338
 339         if ($out) {
 340                 # add rest of format (suffix)
 341                 $out .= $format;
 342
 343                 # add prefix if not there
 344                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
 345
 346                 $log->debug("result: $out");
 347         }
 348
 349         if ($eval_code) {
 350                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
 351                 $log->debug("about to eval{$eval} format: $out");
 352                 return if (! $self->_eval($eval));
 353         }
 354
 355         if ($filter_name && $self->{'filter'}->{$filter_name}) {
 356                 $log->debug("about to filter{$filter_name} format: $out");
 357                 $out = $self->{'filter'}->{$filter_name}->($out);
 358                 return unless(defined($out));
 359                 $log->debug("filter result: $out");
 360         }
 361
 362         return $out;
 363 }
 364
 365 =head2 parse_to_arr
 366
 367 Similar to C<parse>, but returns array of all repeatable fields
 368
 369  my @arr = $webpac->parse_to_arr($rec,'v250^a');
 370
 371 =cut
 372
 373 sub parse_to_arr {
 374         my $self = shift;
 375
 376         my ($rec, $format_utf8) = @_;
 377
 378         my $log = $self->_get_logger();
 379
 380         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 381         return if (! $format_utf8);
 382
 383         my $i = 0;
 384         my @arr;
 385
 386         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
 387                 push @arr, $v;
 388         }
 389
 390         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 391
 392         return @arr;
 393 }
 394
 395
 396 =head2 fill_in
 397
 398 Workhourse of all: takes record from in-memory structure of database and
 399 strings with placeholders and returns string or array of with substituted
 400 values from record.
 401
 402  my $text = $webpac->fill_in($rec,'v250^a');
 403
 404 Optional argument is ordinal number for repeatable fields. By default,
 405 it's assume to be first repeatable field (fields are perl array, so first
 406 element is 0).
 407 Following example will read second value from repeatable field.
 408
 409  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
 410
 411 This function B<does not> perform parsing of format to inteligenty skip
 412 delimiters before fields which aren't used.
 413
 414 This method will automatically decode UTF-8 string to local code page
 415 if needed.
 416
 417 =cut
 418
 419 sub fill_in {
 420         my $self = shift;
 421
 422         my $log = $self->_get_logger();
 423
 424         my $rec = shift || $log->logconfess("need data record");
 425         my $format = shift || $log->logconfess("need format to parse");
 426         # iteration (for repeatable fields)
 427         my $i = shift || 0;
 428
 429         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
 430
 431         # FIXME remove for speedup?
 432         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 433
 434         if (utf8::is_utf8($format)) {
 435                 $format = $self->_x($format);
 436         }
 437
 438         my $found = 0;
 439
 440         my $eval_code;
 441         # remove eval{...} from beginning
 442         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 443
 444         my $filter_name;
 445         # remove filter{...} from beginning
 446         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 447
 448         # do actual replacement of placeholders
 449         # repeatable fields
 450         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
 451         # non-repeatable fields
 452         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
 453
 454         if ($found) {
 455                 $log->debug("format: $format");
 456                 if ($eval_code) {
 457                         my $eval = $self->fill_in($rec,$eval_code,$i);
 458                         return if (! $self->_eval($eval));
 459                 }
 460                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
 461                         $log->debug("filter '$filter_name' for $format");
 462                         $format = $self->{'filter'}->{$filter_name}->($format);
 463                         return unless(defined($format));
 464                         $log->debug("filter result: $format");
 465                 }
 466                 # do we have lookups?
 467                 if ($self->{'lookup'}) {
 468                         if ($self->{'lookup'}->can('lookup')) {
 469                                 return $self->{'lookup'}->lookup($format);
 470                         } else {
 471                                 $log->warn("Have lookup object but can't invoke lookup method");
 472                         }
 473                 } else {
 474                         return $format;
 475                 }
 476         } else {
 477                 return;
 478         }
 479 }
 480
 481
 482 =head2 fill_in_to_arr
 483
 484 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
 485 for fields which have lookups, so they shouldn't be parsed but rather
 486 C<fill_id>ed.
 487
 488  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
 489
 490 =cut
 491
 492 sub fill_in_to_arr {
 493         my $self = shift;
 494
 495         my ($rec, $format_utf8) = @_;
 496
 497         my $log = $self->_get_logger();
 498
 499         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 500         return if (! $format_utf8);
 501
 502         my $i = 0;
 503         my @arr;
 504
 505         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
 506                 push @arr, @v;
 507         }
 508
 509         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 510
 511         return @arr;
 512 }
 513
 514
 515 =head2 get_data
 516
 517 Returns value from record.
 518
 519  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
 520
 521 Arguments are:
 522 record reference C<$rec>,
 523 field C<$f>,
 524 optional subfiled C<$sf>,
 525 index for repeatable values C<$i>.
 526
 527 Optinal variable C<$found> will be incremeted if there
 528 is field.
 529
 530 Returns value or empty string.
 531
 532 =cut
 533
 534 sub get_data {
 535         my $self = shift;
 536
 537         my ($rec,$f,$sf,$i,$found) = @_;
 538
 539         if ($$rec->{$f}) {
 540                 return '' if (! $$rec->{$f}->[$i]);
 541                 no strict 'refs';
 542                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
 543                         $$found++ if (defined($$found));
 544                         return $$rec->{$f}->[$i]->{$sf};
 545                 } elsif (! $sf && $$rec->{$f}->[$i]) {
 546                         $$found++ if (defined($$found));
 547                         # it still might have subfield, just
 548                         # not specified, so we'll dump all
 549                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
 550                                 my $out;
 551                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
 552                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
 553                                 }
 554                                 return $out;
 555                         } else {
 556                                 return $$rec->{$f}->[$i];
 557                         }
 558                 } else {
 559                         return '';
 560                 }
 561         } else {
 562                 return '';
 563         }
 564 }
 565
 566
 567 =head2 apply_format
 568
 569 Apply format specified in tag with C<format_name="name"> and
 570 C<format_delimiter=";;">.
 571
 572  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
 573
 574 Formats can contain C<lookup{...}> if you need them.
 575
 576 =cut
 577
 578 sub apply_format {
 579         my $self = shift;
 580
 581         my ($name,$delimiter,$data) = @_;
 582
 583         my $log = $self->_get_logger();
 584
 585         if (! $self->{'import_xml'}->{'format'}->{$name}) {
 586                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
 587                 return $data;
 588         }
 589
 590         $log->warn("no delimiter for format $name") if (! $delimiter);
 591
 592         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
 593
 594         my @data = split(/\Q$delimiter\E/, $data);
 595
 596         my $out = sprintf($format, @data);
 597         $log->debug("using format $name [$format] on $data to produce: $out");
 598
 599         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
 600                 return $self->{'lookup'}->lookup($out);
 601         } else {
 602                 return $out;
 603         }
 604
 605 }
 606
 607 =head2 sort_arr
 608
 609 Sort array ignoring case and html in data
 610
 611  my @sorted = $webpac->sort_arr(@unsorted);
 612
 613 =cut
 614
 615 sub sort_arr {
 616         my $self = shift;
 617
 618         my $log = $self->_get_logger();
 619
 620         # FIXME add Schwartzian Transformation?
 621
 622         my @sorted = sort {
 623                 $a =~ s#<[^>]+/*>##;
 624                 $b =~ s#<[^>]+/*>##;
 625                 lc($b) cmp lc($a)
 626         } @_;
 627         $log->debug("sorted values: ",sub { join(", ",@sorted) });
 628
 629         return @sorted;
 630 }
 631
 632
 633 =head1 INTERNAL METHODS
 634
 635 =head2 _sort_by_order
 636
 637 Sort xml tags data structure accoding to C<order=""> attribute.
 638
 639 =cut
 640
 641 sub _sort_by_order {
 642         my $self = shift;
 643
 644         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
 645                 $self->{'import_xml'}->{'indexer'}->{$a};
 646         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
 647                 $self->{'import_xml'}->{'indexer'}->{$b};
 648
 649         return $va <=> $vb;
 650 }
 651
 652 =head2 _x
 653
 654 Convert strings from C<conf/normalize/*.xml> encoding into application
 655 specific encoding (optinally specified using C<code_page> to C<new>
 656 constructor).
 657
 658  my $text = $n->_x('normalize text string');
 659
 660 This is a stub so that other modules doesn't have to implement it.
 661
 662 =cut
 663
 664 sub _x {
 665         my $self = shift;
 666         return shift;
 667 }
 668
 669
 670 =head1 AUTHOR
 671
 672 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 673
 674 =head1 COPYRIGHT & LICENSE
 675
 676 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 677
 678 This program is free software; you can redistribute it and/or modify it
 679 under the same terms as Perl itself.
 680
 681 =cut
 682
 683 1; # End of WebPAC::DB