git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2
   3 use warnings;
   4 use strict;
   5 use base 'WebPAC::Common';
   6 use Data::Dumper;
   7
   8 =head1 NAME
   9
  10 WebPAC::Normalize - data mungling for normalisation
  11
  12 =head1 VERSION
  13
  14 Version 0.06
  15
  16 =cut
  17
  18 our $VERSION = '0.06';
  19
  20 =head1 SYNOPSIS
  21
  22 This package contains code that mungle data to produce normalized format.
  23
  24 It contains several assumptions:
  25
  26 =over
  27
  28 =item *
  29
  30 format of fields is defined using C<v123^a> notation for repeatable fields
  31 or C<s123^a> for single (or first) value, where C<123> is field number and
  32 C<a> is subfield.
  33
  34 =item *
  35
  36 source data records (C<$rec>) have unique identifiers in field C<000>
  37
  38 =item *
  39
  40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
  41 perl code that is evaluated before producing output (value of field will be
  42 interpolated before that)
  43
  44 =item *
  45
  46 optional C<filter{filter_name}> at B<begining of format> will apply perl
  47 code defined as code ref on format after field substitution to producing
  48 output
  49
  50 There is one built-in filter called C<regex> which can be use like this:
  51
  52   filter{regex(s/foo/bar/)}
  53
  54 =item *
  55
  56 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
  57
  58 =item *
  59
  60 at end, optional C<format>s rules are resolved. Format rules are similar to
  61 C<sprintf> and can also contain C<lookup{...}> which is performed after
  62 values are inserted in format.
  63
  64 =back
  65
  66 This also describes order in which transformations are applied (eval,
  67 filter, lookup, format) which is important to undestand when deciding how to
  68 solve your data mungling and normalisation process.
  69
  70
  71
  72
  73 =head1 FUNCTIONS
  74
  75 =head2 new
  76
  77 Create new normalisation object
  78
  79   my $n = new WebPAC::Normalize::Something(
  80         filter => {
  81                 'filter_name_1' => sub {
  82                         # filter code
  83                         return length($_);
  84                 }, ...
  85         },
  86         db => $db_obj,
  87         lookup_regex => $lookup->regex,
  88         lookup => $lookup_obj,
  89         prefix => 'foobar',
  90   );
  91
  92 Parametar C<filter> defines user supplied snippets of perl code which can
  93 be use with C<filter{...}> notation.
  94
  95 C<prefix> is used to form filename for database record (to support multiple
  96 source files which are joined in one database).
  97
  98 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
  99 in structures. If you pass this parametar, you must also pass C<lookup>
 100 which is C<WebPAC::Lookup> object.
 101
 102 =cut
 103
 104 sub new {
 105         my $class = shift;
 106         my $self = {@_};
 107         bless($self, $class);
 108
 109         my $r = $self->{'lookup_regex'} ? 1 : 0;
 110         my $l = $self->{'lookup'} ? 1 : 0;
 111
 112         my $log = $self->_get_logger();
 113
 114         # those two must be in pair
 115         if ( ($r & $l) != ($r || $l) ) {
 116                 my $log = $self->_get_logger();
 117                 $log->logdie("lookup_regex and lookup must be in pair");
 118         }
 119
 120         $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
 121
 122         $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
 123
 124         $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
 125
 126         if ($self->{filter} && ! $self->{filter}->{regex}) {
 127                 $log->debug("adding built-in filter regex");
 128                 $self->{filter}->{regex} = sub {
 129                         my ($val, $regex) = @_;
 130                         eval "\$val =~ $regex";
 131                         return $val;
 132                 };
 133         }
 134
 135         $self ? return $self : return undef;
 136 }
 137
 138
 139 =head2 data_structure
 140
 141 Create in-memory data structure which represents normalized layout from
 142 C<conf/normalize/*.xml>.
 143
 144 This structures are used to produce output.
 145
 146  my $ds = $webpac->data_structure($rec);
 147
 148 =cut
 149
 150 sub data_structure {
 151         my $self = shift;
 152
 153         my $log = $self->_get_logger();
 154
 155         my $rec = shift;
 156         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 157
 158         $log->debug("data_structure rec = ", sub { Dumper($rec) });
 159
 160         $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
 161
 162         my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
 163
 164         my $cache_file;
 165
 166         if ($self->{'db'}) {
 167                 my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
 168                 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
 169                 return $ds if ($ds);
 170                 $log->debug("cache miss, creating");
 171         }
 172
 173         undef $self->{'currnet_filename'};
 174         undef $self->{'headline'};
 175
 176         my @sorted_tags;
 177         if ($self->{tags_by_order}) {
 178                 @sorted_tags = @{$self->{tags_by_order}};
 179         } else {
 180                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
 181                 $self->{tags_by_order} = \@sorted_tags;
 182         }
 183
 184         my $ds;
 185
 186         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
 187
 188         foreach my $field (@sorted_tags) {
 189
 190                 my $row;
 191
 192 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 193
 194                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
 195                         my $format;
 196
 197                         $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
 198                         $format = $tag->{'value'} || $tag->{'content'};
 199
 200                         $log->debug("format: $format");
 201
 202                         my @v;
 203                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 204                                 @v = $self->fill_in_to_arr($rec,$format);
 205                         } else {
 206                                 @v = $self->parse_to_arr($rec,$format);
 207                         }
 208                         next if (! @v);
 209
 210                         if ($tag->{'sort'}) {
 211                                 @v = $self->sort_arr(@v);
 212                         }
 213
 214                         # use format?
 215                         if ($tag->{'format_name'}) {
 216                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 217                         }
 218
 219                         # delimiter will join repeatable fields
 220                         if ($tag->{'delimiter'}) {
 221                                 @v = ( join($tag->{'delimiter'}, @v) );
 222                         }
 223
 224                         # default types
 225                         my @types = qw(display search);
 226                         # override by type attribute
 227                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
 228
 229                         foreach my $type (@types) {
 230                                 # append to previous line?
 231                                 $log->debug("type: $type ",sub { join(" ",@v) }, " ", $row->{'append'} || 'no append');
 232                                 if ($tag->{'append'}) {
 233
 234                                         # I will delimit appended part with
 235                                         # delimiter (or ,)
 236                                         my $d = $tag->{'delimiter'};
 237                                         # default delimiter
 238                                         $d ||= " ";
 239
 240                                         my $last = pop @{$row->{$type}};
 241                                         $d = "" if (! $last);
 242                                         $last .= $d . join($d, @v);
 243                                         push @{$row->{$type}}, $last;
 244
 245                                 } else {
 246                                         push @{$row->{$type}}, @v;
 247                                 }
 248                         }
 249
 250
 251                 }
 252
 253                 if ($row) {
 254                         $row->{'tag'} = $field;
 255
 256                         # TODO: name_sigular, name_plural
 257                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 258                         my $row_name = $name ? $self->_x($name) : $field;
 259
 260                         # post-sort all values in field
 261                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 262                                 $log->warn("sort at field tag not implemented");
 263                         }
 264
 265                         $ds->{$row_name} = $row;
 266
 267                         $log->debug("row $field: ",sub { Dumper($row) });
 268                 }
 269
 270         }
 271
 272         $self->{'db'}->save_ds(
 273                 id => $id,
 274                 ds => $ds,
 275                 prefix => $self->{prefix},
 276         ) if ($self->{'db'});
 277
 278         $log->debug("ds: ", sub { Dumper($ds) });
 279
 280         $log->logconfess("data structure returned is not array any more!") if wantarray;
 281
 282         return $ds;
 283
 284 }
 285
 286 =head2 parse
 287
 288 Perform smart parsing of string, skipping delimiters for fields which aren't
 289 defined. It can also eval code in format starting with C<eval{...}> and
 290 return output or nothing depending on eval code.
 291
 292  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 293
 294 Filters are implemented here. While simple form of filters looks like this:
 295
 296   filter{name_of_filter}
 297
 298 but, filters can also have variable number of parametars like this:
 299
 300   filter{name_of_filter(param,param,param)}
 301
 302 =cut
 303
 304 sub parse {
 305         my $self = shift;
 306
 307         my ($rec, $format_utf8, $i) = @_;
 308
 309         return if (! $format_utf8);
 310
 311         my $log = $self->_get_logger();
 312
 313         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 314
 315         $i = 0 if (! $i);
 316
 317         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
 318
 319         my @out;
 320
 321         $log->debug("format: $format");
 322
 323         my $eval_code;
 324         # remove eval{...} from beginning
 325         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 326
 327         my $filter_name;
 328         # remove filter{...} from beginning
 329         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 330
 331         my $prefix;
 332         my $all_found=0;
 333
 334         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
 335
 336                 my $del = $1 || '';
 337                 $prefix ||= $del if ($all_found == 0);
 338
 339                 # repeatable index
 340                 my $r = $i;
 341                 $r = 0 if (lc("$2") eq 's');
 342
 343                 my $found = 0;
 344                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
 345
 346                 if ($found) {
 347                         push @out, $del;
 348                         push @out, $tmp;
 349                         $all_found += $found;
 350                 }
 351         }
 352
 353         return if (! $all_found);
 354
 355         my $out = join('',@out);
 356
 357         if ($out) {
 358                 # add rest of format (suffix)
 359                 $out .= $format;
 360
 361                 # add prefix if not there
 362                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
 363
 364                 $log->debug("result: $out");
 365         }
 366
 367         if ($eval_code) {
 368                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
 369                 $log->debug("about to eval{$eval} format: $out");
 370                 return if (! $self->_eval($eval));
 371         }
 372
 373         if ($filter_name) {
 374                 my @filter_args;
 375                 if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
 376                         @filter_args = split(/,/, $2);
 377                 }
 378                 if ($self->{'filter'}->{$filter_name}) {
 379                         $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
 380                         unshift @filter_args, $out;
 381                         $out = $self->{'filter'}->{$filter_name}->(@filter_args);
 382                         return unless(defined($out));
 383                         $log->debug("filter result: $out");
 384                 } else {
 385                         $log->warn("trying to use undefined filter $filter_name");
 386                 }
 387         }
 388
 389         return $out;
 390 }
 391
 392 =head2 parse_to_arr
 393
 394 Similar to C<parse>, but returns array of all repeatable fields
 395
 396  my @arr = $webpac->parse_to_arr($rec,'v250^a');
 397
 398 =cut
 399
 400 sub parse_to_arr {
 401         my $self = shift;
 402
 403         my ($rec, $format_utf8) = @_;
 404
 405         my $log = $self->_get_logger();
 406
 407         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 408         return if (! $format_utf8);
 409
 410         my $i = 0;
 411         my @arr;
 412
 413         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
 414                 push @arr, $v;
 415         }
 416
 417         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 418
 419         return @arr;
 420 }
 421
 422
 423 =head2 fill_in
 424
 425 Workhourse of all: takes record from in-memory structure of database and
 426 strings with placeholders and returns string or array of with substituted
 427 values from record.
 428
 429  my $text = $webpac->fill_in($rec,'v250^a');
 430
 431 Optional argument is ordinal number for repeatable fields. By default,
 432 it's assume to be first repeatable field (fields are perl array, so first
 433 element is 0).
 434 Following example will read second value from repeatable field.
 435
 436  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
 437
 438 This function B<does not> perform parsing of format to inteligenty skip
 439 delimiters before fields which aren't used.
 440
 441 This method will automatically decode UTF-8 string to local code page
 442 if needed.
 443
 444 =cut
 445
 446 sub fill_in {
 447         my $self = shift;
 448
 449         my $log = $self->_get_logger();
 450
 451         my $rec = shift || $log->logconfess("need data record");
 452         my $format = shift || $log->logconfess("need format to parse");
 453         # iteration (for repeatable fields)
 454         my $i = shift || 0;
 455
 456         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
 457
 458         # FIXME remove for speedup?
 459         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 460
 461         if (utf8::is_utf8($format)) {
 462                 $format = $self->_x($format);
 463         }
 464
 465         my $found = 0;
 466
 467         my $eval_code;
 468         # remove eval{...} from beginning
 469         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 470
 471         my $filter_name;
 472         # remove filter{...} from beginning
 473         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 474
 475         # do actual replacement of placeholders
 476         # repeatable fields
 477         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
 478         # non-repeatable fields
 479         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
 480
 481         if ($found) {
 482                 $log->debug("format: $format");
 483                 if ($eval_code) {
 484                         my $eval = $self->fill_in($rec,$eval_code,$i);
 485                         return if (! $self->_eval($eval));
 486                 }
 487                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
 488                         $log->debug("filter '$filter_name' for $format");
 489                         $format = $self->{'filter'}->{$filter_name}->($format);
 490                         return unless(defined($format));
 491                         $log->debug("filter result: $format");
 492                 }
 493                 # do we have lookups?
 494                 if ($self->{'lookup'}) {
 495                         if ($self->{'lookup'}->can('lookup')) {
 496                                 my @lookup = $self->{lookup}->lookup($format);
 497                                 $log->debug("lookup $format", join(", ", @lookup));
 498                                 return @lookup;
 499                         } else {
 500                                 $log->warn("Have lookup object but can't invoke lookup method");
 501                         }
 502                 } else {
 503                         return $format;
 504                 }
 505         } else {
 506                 return;
 507         }
 508 }
 509
 510
 511 =head2 fill_in_to_arr
 512
 513 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
 514 for fields which have lookups, so they shouldn't be parsed but rather
 515 C<fill_id>ed.
 516
 517  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
 518
 519 =cut
 520
 521 sub fill_in_to_arr {
 522         my $self = shift;
 523
 524         my ($rec, $format_utf8) = @_;
 525
 526         my $log = $self->_get_logger();
 527
 528         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 529         return if (! $format_utf8);
 530
 531         my $i = 0;
 532         my @arr;
 533
 534         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
 535                 push @arr, @v;
 536         }
 537
 538         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 539
 540         return @arr;
 541 }
 542
 543
 544 =head2 get_data
 545
 546 Returns value from record.
 547
 548  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
 549
 550 Arguments are:
 551 record reference C<$rec>,
 552 field C<$f>,
 553 optional subfiled C<$sf>,
 554 index for repeatable values C<$i>.
 555
 556 Optinal variable C<$found> will be incremeted if there
 557 is field.
 558
 559 Returns value or empty string.
 560
 561 =cut
 562
 563 sub get_data {
 564         my $self = shift;
 565
 566         my ($rec,$f,$sf,$i,$found) = @_;
 567
 568         if ($$rec->{$f}) {
 569                 return '' if (! $$rec->{$f}->[$i]);
 570                 no strict 'refs';
 571                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
 572                         $$found++ if (defined($$found));
 573                         return $$rec->{$f}->[$i]->{$sf};
 574                 } elsif (! $sf && $$rec->{$f}->[$i]) {
 575                         $$found++ if (defined($$found));
 576                         # it still might have subfield, just
 577                         # not specified, so we'll dump all
 578                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
 579                                 my $out;
 580                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
 581                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
 582                                 }
 583                                 return $out;
 584                         } else {
 585                                 return $$rec->{$f}->[$i];
 586                         }
 587                 } else {
 588                         return '';
 589                 }
 590         } else {
 591                 return '';
 592         }
 593 }
 594
 595
 596 =head2 apply_format
 597
 598 Apply format specified in tag with C<format_name="name"> and
 599 C<format_delimiter=";;">.
 600
 601  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
 602
 603 Formats can contain C<lookup{...}> if you need them.
 604
 605 =cut
 606
 607 sub apply_format {
 608         my $self = shift;
 609
 610         my ($name,$delimiter,$data) = @_;
 611
 612         my $log = $self->_get_logger();
 613
 614         if (! $self->{'import_xml'}->{'format'}->{$name}) {
 615                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
 616                 return $data;
 617         }
 618
 619         $log->warn("no delimiter for format $name") if (! $delimiter);
 620
 621         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
 622
 623         my @data = split(/\Q$delimiter\E/, $data);
 624
 625         my $out = sprintf($format, @data);
 626         $log->debug("using format $name [$format] on $data to produce: $out");
 627
 628         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
 629                 return $self->{'lookup'}->lookup($out);
 630         } else {
 631                 return $out;
 632         }
 633
 634 }
 635
 636 =head2 sort_arr
 637
 638 Sort array ignoring case and html in data
 639
 640  my @sorted = $webpac->sort_arr(@unsorted);
 641
 642 =cut
 643
 644 sub sort_arr {
 645         my $self = shift;
 646
 647         my $log = $self->_get_logger();
 648
 649         # FIXME add Schwartzian Transformation?
 650
 651         my @sorted = sort {
 652                 $a =~ s#<[^>]+/*>##;
 653                 $b =~ s#<[^>]+/*>##;
 654                 lc($b) cmp lc($a)
 655         } @_;
 656         $log->debug("sorted values: ",sub { join(", ",@sorted) });
 657
 658         return @sorted;
 659 }
 660
 661
 662 =head1 INTERNAL METHODS
 663
 664 =head2 _sort_by_order
 665
 666 Sort xml tags data structure accoding to C<order=""> attribute.
 667
 668 =cut
 669
 670 sub _sort_by_order {
 671         my $self = shift;
 672
 673         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
 674                 $self->{'import_xml'}->{'indexer'}->{$a};
 675         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
 676                 $self->{'import_xml'}->{'indexer'}->{$b};
 677
 678         return $va <=> $vb;
 679 }
 680
 681 =head2 _x
 682
 683 Convert strings from C<conf/normalize/*.xml> encoding into application
 684 specific encoding (optinally specified using C<code_page> to C<new>
 685 constructor).
 686
 687  my $text = $n->_x('normalize text string');
 688
 689 This is a stub so that other modules doesn't have to implement it.
 690
 691 =cut
 692
 693 sub _x {
 694         my $self = shift;
 695         return shift;
 696 }
 697
 698
 699 =head1 AUTHOR
 700
 701 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 702
 703 =head1 COPYRIGHT & LICENSE
 704
 705 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 706
 707 This program is free software; you can redistribute it and/or modify it
 708 under the same terms as Perl itself.
 709
 710 =cut
 711
 712 1; # End of WebPAC::Normalize