git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2
   3 use warnings;
   4 use strict;
   5 use base 'WebPAC::Common';
   6 use Data::Dumper;
   7
   8 =head1 NAME
   9
  10 WebPAC::Normalize - data mungling for normalisation
  11
  12 =head1 VERSION
  13
  14 Version 0.06
  15
  16 =cut
  17
  18 our $VERSION = '0.06';
  19
  20 =head1 SYNOPSIS
  21
  22 This package contains code that mungle data to produce normalized format.
  23
  24 It contains several assumptions:
  25
  26 =over
  27
  28 =item *
  29
  30 format of fields is defined using C<v123^a> notation for repeatable fields
  31 or C<s123^a> for single (or first) value, where C<123> is field number and
  32 C<a> is subfield.
  33
  34 =item *
  35
  36 source data records (C<$rec>) have unique identifiers in field C<000>
  37
  38 =item *
  39
  40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
  41 perl code that is evaluated before producing output (value of field will be
  42 interpolated before that)
  43
  44 =item *
  45
  46 optional C<filter{filter_name}> at B<begining of format> will apply perl
  47 code defined as code ref on format after field substitution to producing
  48 output
  49
  50 There is one built-in filter called C<regex> which can be use like this:
  51
  52   filter{regex(s/foo/bar/)}
  53
  54 =item *
  55
  56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
  57
  58 =item *
  59
  60 at end, optional C<format>s rules are resolved. Format rules are similar to
  61 C<sprintf> and can also contain C<lookup{...}> which is performed after
  62 values are inserted in format.
  63
  64 =back
  65
  66 This also describes order in which transformations are applied (eval,
  67 filter, lookup, format) which is important to undestand when deciding how to
  68 solve your data mungling and normalisation process.
  69
  70
  71
  72
  73 =head1 FUNCTIONS
  74
  75 =head2 new
  76
  77 Create new normalisation object
  78
  79   my $n = new WebPAC::Normalize::Something(
  80         filter => {
  81                 'filter_name_1' => sub {
  82                         # filter code
  83                         return length($_);
  84                 }, ...
  85         },
  86         db => $db_obj,
  87         lookup_regex => $lookup->regex,
  88         lookup => $lookup_obj,
  89         prefix => 'foobar',
  90   );
  91
  92 Parametar C<filter> defines user supplied snippets of perl code which can
  93 be use with C<filter{...}> notation.
  94
  95 C<prefix> is used to form filename for database record (to support multiple
  96 source files which are joined in one database).
  97
  98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
  99 in structures. If you pass this parametar, you must also pass C<lookup>
 100 which is C<WebPAC::Lookup> object.
 101
 102 =cut
 103
 104 sub new {
 105         my $class = shift;
 106         my $self = {@_};
 107         bless($self, $class);
 108
 109         my $r = $self->{'lookup_regex'} ? 1 : 0;
 110         my $l = $self->{'lookup'} ? 1 : 0;
 111
 112         my $log = $self->_get_logger();
 113
 114         # those two must be in pair
 115         if ( ($r & $l) != ($r || $l) ) {
 116                 my $log = $self->_get_logger();
 117                 $log->logdie("lookup_regex and lookup must be in pair");
 118         }
 119
 120         $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
 121
 122         $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
 123
 124         $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
 125
 126         if ($self->{filter} && ! $self->{filter}->{regex}) {
 127                 $log->debug("adding built-in filter regex");
 128                 $self->{filter}->{regex} = sub {
 129                         my ($val, $regex) = @_;
 130                         eval "\$val =~ $regex";
 131                         return $val;
 132                 };
 133         }
 134
 135         $self ? return $self : return undef;
 136 }
 137
 138
 139 =head2 data_structure
 140
 141 Create in-memory data structure which represents normalized layout from
 142 C<conf/normalize/*.xml>.
 143
 144 This structures are used to produce output.
 145
 146  my $ds = $webpac->data_structure($rec);
 147
 148 =cut
 149
 150 sub data_structure {
 151         my $self = shift;
 152
 153         my $log = $self->_get_logger();
 154
 155         my $rec = shift;
 156         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 157
 158         $log->debug("data_structure rec = ", sub { Dumper($rec) });
 159
 160         $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
 161
 162         my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
 163
 164         my $cache_file;
 165
 166         if ($self->{'db'}) {
 167                 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
 168                 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
 169                 return $ds if ($ds);
 170                 $log->debug("cache miss, creating");
 171         }
 172
 173         undef $self->{'currnet_filename'};
 174         undef $self->{'headline'};
 175
 176         my @sorted_tags;
 177         if ($self->{tags_by_order}) {
 178                 @sorted_tags = @{$self->{tags_by_order}};
 179         } else {
 180                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
 181                 $self->{tags_by_order} = \@sorted_tags;
 182         }
 183
 184         my $ds;
 185
 186         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
 187
 188         foreach my $field (@sorted_tags) {
 189
 190                 my $row;
 191
 192 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 193
 194                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
 195                         my $format;
 196
 197                         $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
 198                         $format = $tag->{'value'} || $tag->{'content'};
 199
 200                         $log->debug("format: $format");
 201
 202                         my @v;
 203                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 204                                 @v = $self->fill_in_to_arr($rec,$format);
 205                         } else {
 206                                 @v = $self->parse_to_arr($rec,$format);
 207                         }
 208                         next if (! @v);
 209
 210                         if ($tag->{'sort'}) {
 211                                 @v = $self->sort_arr(@v);
 212                         }
 213
 214                         # use format?
 215                         if ($tag->{'format_name'}) {
 216                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 217                         }
 218
 219                         # delimiter will join repeatable fields
 220                         if ($tag->{'delimiter'}) {
 221                                 @v = ( join($tag->{'delimiter'}, @v) );
 222                         }
 223
 224                         # default types
 225                         my @types = qw(display search);
 226                         # override by type attribute
 227                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
 228
 229                         foreach my $type (@types) {
 230                                 # append to previous line?
 231                                 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
 232                                 if ($tag->{'append'}) {
 233
 234                                         # I will delimit appended part with
 235                                         # delimiter (or ,)
 236                                         my $d = $tag->{'delimiter'};
 237                                         # default delimiter
 238                                         $d ||= " ";
 239
 240                                         my $last = pop @{$row->{$type}};
 241                                         $d = "" if (! $last);
 242                                         $last .= $d . join($d, @v);
 243                                         push @{$row->{$type}}, $last;
 244
 245                                 } else {
 246                                         push @{$row->{$type}}, @v;
 247                                 }
 248                         }
 249
 250
 251                 }
 252
 253                 if ($row) {
 254                         $row->{'tag'} = $field;
 255
 256                         # TODO: name_sigular, name_plural
 257                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 258                         my $row_name = $name ? $self->_x($name) : $field;
 259
 260                         # post-sort all values in field
 261                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 262                                 $log->warn("sort at field tag not implemented");
 263                         }
 264
 265                         $ds->{$row_name} = $row;
 266
 267                         $log->debug("row $field: ",sub { Dumper($row) });
 268                 }
 269
 270         }
 271
 272         $self->{'db'}->save_ds(
 273                 id => $id,
 274                 ds => $ds,
 275                 prefix => $self->{prefix},
 276         ) if ($self->{'db'});
 277
 278         $log->debug("ds: ", sub { Dumper($ds) });
 279
 280         $log->logconfess("data structure returned is not array any more!") if wantarray;
 281
 282         return $ds;
 283
 284 }
 285
 286 =head2 parse
 287
 288 Perform smart parsing of string, skipping delimiters for fields which aren't
 289 defined. It can also eval code in format starting with C<eval{...}> and
 290 return output or nothing depending on eval code.
 291
 292  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 293
 294 Filters are implemented here. While simple form of filters looks like this:
 295
 296   filter{name_of_filter}
 297
 298 but, filters can also have variable number of parametars like this:
 299
 300   filter{name_of_filter(param,param,param)}
 301
 302 =cut
 303
 304 my $warn_once;
 305
 306 sub parse {
 307         my $self = shift;
 308
 309         my ($rec, $format_utf8, $i) = @_;
 310
 311         return if (! $format_utf8);
 312
 313         my $log = $self->_get_logger();
 314
 315         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 316
 317         $i = 0 if (! $i);
 318
 319         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
 320
 321         my @out;
 322
 323         $log->debug("format: $format");
 324
 325         my $eval_code;
 326         # remove eval{...} from beginning
 327         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 328
 329         my $filter_name;
 330         # remove filter{...} from beginning
 331         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 332
 333         my $prefix;
 334         my $all_found=0;
 335
 336         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
 337
 338                 my $del = $1 || '';
 339                 $prefix ||= $del if ($all_found == 0);
 340
 341                 # repeatable index
 342                 my $r = $i;
 343                 $r = 0 if (lc("$2") eq 's');
 344
 345                 my $found = 0;
 346                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
 347
 348                 if ($found) {
 349                         push @out, $del;
 350                         push @out, $tmp;
 351                         $all_found += $found;
 352                 }
 353         }
 354
 355         return if (! $all_found);
 356
 357         my $out = join('',@out);
 358
 359         if ($out) {
 360                 # add rest of format (suffix)
 361                 $out .= $format;
 362
 363                 # add prefix if not there
 364                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
 365
 366                 $log->debug("result: $out");
 367         }
 368
 369         if ($eval_code) {
 370                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
 371                 $log->debug("about to eval{$eval} format: $out");
 372                 return if (! $self->_eval($eval));
 373         }
 374
 375         if ($filter_name) {
 376                 my @filter_args;
 377                 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
 378                         @filter_args = split(/,/, $2);
 379                 }
 380                 if ($self->{'filter'}->{$filter_name}) {
 381                         $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
 382                         unshift @filter_args, $out;
 383                         $out = $self->{'filter'}->{$filter_name}->(@filter_args);
 384                         return unless(defined($out));
 385                         $log->debug("filter result: $out");
 386                 } elsif (! $warn_once->{$filter_name}) {
 387                         $log->warn("trying to use undefined filter $filter_name");
 388                         $warn_once->{$filter_name}++;
 389                 }
 390         }
 391
 392         return $out;
 393 }
 394
 395 =head2 parse_to_arr
 396
 397 Similar to C<parse>, but returns array of all repeatable fields
 398
 399  my @arr = $webpac->parse_to_arr($rec,'v250^a');
 400
 401 =cut
 402
 403 sub parse_to_arr {
 404         my $self = shift;
 405
 406         my ($rec, $format_utf8) = @_;
 407
 408         my $log = $self->_get_logger();
 409
 410         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 411         return if (! $format_utf8);
 412
 413         my $i = 0;
 414         my @arr;
 415
 416         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
 417                 push @arr, $v;
 418         }
 419
 420         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 421
 422         return @arr;
 423 }
 424
 425
 426 =head2 fill_in
 427
 428 Workhourse of all: takes record from in-memory structure of database and
 429 strings with placeholders and returns string or array of with substituted
 430 values from record.
 431
 432  my $text = $webpac->fill_in($rec,'v250^a');
 433
 434 Optional argument is ordinal number for repeatable fields. By default,
 435 it's assume to be first repeatable field (fields are perl array, so first
 436 element is 0).
 437 Following example will read second value from repeatable field.
 438
 439  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
 440
 441 This function B<does not> perform parsing of format to inteligenty skip
 442 delimiters before fields which aren't used.
 443
 444 This method will automatically decode UTF-8 string to local code page
 445 if needed.
 446
 447 =cut
 448
 449 sub fill_in {
 450         my $self = shift;
 451
 452         my $log = $self->_get_logger();
 453
 454         my $rec = shift || $log->logconfess("need data record");
 455         my $format = shift || $log->logconfess("need format to parse");
 456         # iteration (for repeatable fields)
 457         my $i = shift || 0;
 458
 459         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
 460
 461         # FIXME remove for speedup?
 462         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 463
 464         if (utf8::is_utf8($format)) {
 465                 $format = $self->_x($format);
 466         }
 467
 468         my $found = 0;
 469
 470         my $eval_code;
 471         # remove eval{...} from beginning
 472         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 473
 474         my $filter_name;
 475         # remove filter{...} from beginning
 476         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 477
 478         # do actual replacement of placeholders
 479         # repeatable fields
 480         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
 481         # non-repeatable fields
 482         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
 483
 484         if ($found) {
 485                 $log->debug("format: $format");
 486                 if ($eval_code) {
 487                         my $eval = $self->fill_in($rec,$eval_code,$i);
 488                         return if (! $self->_eval($eval));
 489                 }
 490                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
 491                         $log->debug("filter '$filter_name' for $format");
 492                         $format = $self->{'filter'}->{$filter_name}->($format);
 493                         return unless(defined($format));
 494                         $log->debug("filter result: $format");
 495                 }
 496                 # do we have lookups?
 497                 if ($self->{'lookup'}) {
 498                         if ($self->{'lookup'}->can('lookup')) {
 499                                 my @lookup = $self->{lookup}->lookup($format);
 500                                 $log->debug("lookup $format", join(", ", @lookup));
 501                                 return @lookup;
 502                         } else {
 503                                 $log->warn("Have lookup object but can't invoke lookup method");
 504                         }
 505                 } else {
 506                         return $format;
 507                 }
 508         } else {
 509                 return;
 510         }
 511 }
 512
 513
 514 =head2 fill_in_to_arr
 515
 516 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
 517 for fields which have lookups, so they shouldn't be parsed but rather
 518 C<fill_id>ed.
 519
 520  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
 521
 522 =cut
 523
 524 sub fill_in_to_arr {
 525         my $self = shift;
 526
 527         my ($rec, $format_utf8) = @_;
 528
 529         my $log = $self->_get_logger();
 530
 531         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 532         return if (! $format_utf8);
 533
 534         my $i = 0;
 535         my @arr;
 536
 537         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
 538                 push @arr, @v;
 539         }
 540
 541         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 542
 543         return @arr;
 544 }
 545
 546
 547 =head2 get_data
 548
 549 Returns value from record.
 550
 551  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
 552
 553 Arguments are:
 554 record reference C<$rec>,
 555 field C<$f>,
 556 optional subfiled C<$sf>,
 557 index for repeatable values C<$i>.
 558
 559 Optinal variable C<$found> will be incremeted if there
 560 is field.
 561
 562 Returns value or empty string.
 563
 564 =cut
 565
 566 sub get_data {
 567         my $self = shift;
 568
 569         my ($rec,$f,$sf,$i,$found) = @_;
 570
 571         if ($$rec->{$f}) {
 572                 return '' if (! $$rec->{$f}->[$i]);
 573                 no strict 'refs';
 574                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
 575                         $$found++ if (defined($$found));
 576                         return $$rec->{$f}->[$i]->{$sf};
 577                 } elsif (! $sf && $$rec->{$f}->[$i]) {
 578                         $$found++ if (defined($$found));
 579                         # it still might have subfield, just
 580                         # not specified, so we'll dump all
 581                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
 582                                 my $out;
 583                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
 584                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
 585                                 }
 586                                 return $out;
 587                         } else {
 588                                 return $$rec->{$f}->[$i];
 589                         }
 590                 } else {
 591                         return '';
 592                 }
 593         } else {
 594                 return '';
 595         }
 596 }
 597
 598
 599 =head2 apply_format
 600
 601 Apply format specified in tag with C<format_name="name"> and
 602 C<format_delimiter=";;">.
 603
 604  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
 605
 606 Formats can contain C<lookup{...}> if you need them.
 607
 608 =cut
 609
 610 sub apply_format {
 611         my $self = shift;
 612
 613         my ($name,$delimiter,$data) = @_;
 614
 615         my $log = $self->_get_logger();
 616
 617         if (! $self->{'import_xml'}->{'format'}->{$name}) {
 618                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
 619                 return $data;
 620         }
 621
 622         $log->warn("no delimiter for format $name") if (! $delimiter);
 623
 624         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
 625
 626         my @data = split(/\Q$delimiter\E/, $data);
 627
 628         my $out = sprintf($format, @data);
 629         $log->debug("using format $name [$format] on $data to produce: $out");
 630
 631         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
 632                 return $self->{'lookup'}->lookup($out);
 633         } else {
 634                 return $out;
 635         }
 636
 637 }
 638
 639 =head2 sort_arr
 640
 641 Sort array ignoring case and html in data
 642
 643  my @sorted = $webpac->sort_arr(@unsorted);
 644
 645 =cut
 646
 647 sub sort_arr {
 648         my $self = shift;
 649
 650         my $log = $self->_get_logger();
 651
 652         # FIXME add Schwartzian Transformation?
 653
 654         my @sorted = sort {
 655                 $a =~ s#<[^>]+/*>##;
 656                 $b =~ s#<[^>]+/*>##;
 657                 lc($b) cmp lc($a)
 658         } @_;
 659         $log->debug("sorted values: ",sub { join(", ",@sorted) });
 660
 661         return @sorted;
 662 }
 663
 664
 665 =head1 INTERNAL METHODS
 666
 667 =head2 _sort_by_order
 668
 669 Sort xml tags data structure accoding to C<order=""> attribute.
 670
 671 =cut
 672
 673 sub _sort_by_order {
 674         my $self = shift;
 675
 676         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
 677                 $self->{'import_xml'}->{'indexer'}->{$a};
 678         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
 679                 $self->{'import_xml'}->{'indexer'}->{$b};
 680
 681         return $va <=> $vb;
 682 }
 683
 684 =head2 _x
 685
 686 Convert strings from C<conf/normalize/*.xml> encoding into application
 687 specific encoding (optinally specified using C<code_page> to C<new>
 688 constructor).
 689
 690  my $text = $n->_x('normalize text string');
 691
 692 This is a stub so that other modules doesn't have to implement it.
 693
 694 =cut
 695
 696 sub _x {
 697         my $self = shift;
 698         return shift;
 699 }
 700
 701
 702 =head1 AUTHOR
 703
 704 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 705
 706 =head1 COPYRIGHT & LICENSE
 707
 708 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 709
 710 This program is free software; you can redistribute it and/or modify it
 711 under the same terms as Perl itself.
 712
 713 =cut
 714
 715 1; # End of WebPAC::Normalize