git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2
   3 use warnings;
   4 use strict;
   5 use base 'WebPAC::Common';
   6 use Data::Dumper;
   7
   8 =head1 NAME
   9
  10 WebPAC::Normalize - data mungling for normalisation
  11
  12 =head1 VERSION
  13
  14 Version 0.02
  15
  16 =cut
  17
  18 our $VERSION = '0.02';
  19
  20 =head1 SYNOPSIS
  21
  22 This package contains code that mungle data to produce normalized format.
  23
  24 It contains several assumptions:
  25
  26 =over
  27
  28 =item *
  29
  30 format of fields is defined using C<v123^a> notation for repeatable fields
  31 or C<s123^a> for single (or first) value, where C<123> is field number and
  32 C<a> is subfield.
  33
  34 =item *
  35
  36 source data records (C<$rec>) have unique identifiers in field C<000>
  37
  38 =item *
  39
  40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
  41 perl code that is evaluated before producing output (value of field will be
  42 interpolated before that)
  43
  44 =item *
  45
  46 optional C<filter{filter_name}> at B<begining of format> will apply perl
  47 code defined as code ref on format after field substitution to producing
  48 output
  49
  50 =item *
  51
  52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
  53
  54 =item *
  55
  56 at end, optional C<format>s rules are resolved. Format rules are similar to
  57 C<sprintf> and can also contain C<lookup{...}> which is performed after
  58 values are inserted in format.
  59
  60 =back
  61
  62 This also describes order in which transformations are applied (eval,
  63 filter, lookup, format) which is important to undestand when deciding how to
  64 solve your data mungling and normalisation process.
  65
  66
  67
  68
  69 =head1 FUNCTIONS
  70
  71 =head2 new
  72
  73 Create new normalisation object
  74
  75   my $n = new WebPAC::Normalize::Something(
  76         filter => {
  77                 'filter_name_1' => sub {
  78                         # filter code
  79                         return length($_);
  80                 }, ...
  81         },
  82         db => $db_obj,
  83         lookup_regex => $lookup->regex,
  84         lookup => $lookup_obj,
  85   );
  86
  87 Parametar C<filter> defines user supplied snippets of perl code which can
  88 be use with C<filter{...}> notation.
  89
  90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
  91 in structures. If you pass this parametar, you must also pass C<lookup>
  92 which is C<WebPAC::Lookup> object.
  93
  94 =cut
  95
  96 sub new {
  97         my $class = shift;
  98         my $self = {@_};
  99         bless($self, $class);
 100
 101         my $r = $self->{'lookup_regex'} ? 1 : 0;
 102         my $l = $self->{'lookup'} ? 1 : 0;
 103
 104         my $log = $self->_get_logger();
 105
 106         # those two must be in pair
 107         if ( ($r & $l) != ($r || $l) ) {
 108                 my $log = $self->_get_logger();
 109                 $log->logdie("lookup_regex and lookup must be in pair");
 110         }
 111
 112         $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
 113
 114         $self ? return $self : return undef;
 115 }
 116
 117
 118 =head2 data_structure
 119
 120 Create in-memory data structure which represents normalized layout from
 121 C<conf/normalize/*.xml>.
 122
 123 This structures are used to produce output.
 124
 125  my $ds = $webpac->data_structure($rec);
 126
 127 =cut
 128
 129 sub data_structure {
 130         my $self = shift;
 131
 132         my $log = $self->_get_logger();
 133
 134         my $rec = shift;
 135         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 136
 137         $log->debug("data_structure rec = ", sub { Dumper($rec) });
 138
 139         $log->logdie("need unique ID (mfn) in field 000 of record ", sub { Dumper($rec) } ) unless (defined($rec->{'000'}));
 140
 141         my $mfn = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
 142
 143         my $cache_file;
 144
 145         if ($self->{'db'}) {
 146                 my $ds = $self->{'db'}->load_ds( $mfn );
 147                 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
 148                 return $ds if ($ds);
 149                 $log->debug("cache miss, creating");
 150         }
 151
 152         undef $self->{'currnet_filename'};
 153         undef $self->{'headline'};
 154
 155         my @sorted_tags;
 156         if ($self->{tags_by_order}) {
 157                 @sorted_tags = @{$self->{tags_by_order}};
 158         } else {
 159                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
 160                 $self->{tags_by_order} = \@sorted_tags;
 161         }
 162
 163         my $ds;
 164
 165         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
 166
 167         foreach my $field (@sorted_tags) {
 168
 169                 my $row;
 170
 171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 172
 173                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
 174                         my $format;
 175
 176                         $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
 177                         $format = $tag->{'value'} || $tag->{'content'};
 178
 179                         $log->debug("format: $format");
 180
 181                         my @v;
 182                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 183                                 @v = $self->fill_in_to_arr($rec,$format);
 184                         } else {
 185                                 @v = $self->parse_to_arr($rec,$format);
 186                         }
 187                         next if (! @v);
 188
 189                         if ($tag->{'sort'}) {
 190                                 @v = $self->sort_arr(@v);
 191                         }
 192
 193                         # use format?
 194                         if ($tag->{'format_name'}) {
 195                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 196                         }
 197
 198                         # delimiter will join repeatable fields
 199                         if ($tag->{'delimiter'}) {
 200                                 @v = ( join($tag->{'delimiter'}, @v) );
 201                         }
 202
 203                         # default types
 204                         my @types = qw(display search);
 205                         # override by type attribute
 206                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
 207
 208                         foreach my $type (@types) {
 209                                 # append to previous line?
 210                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
 211                                 if ($tag->{'append'}) {
 212
 213                                         # I will delimit appended part with
 214                                         # delimiter (or ,)
 215                                         my $d = $tag->{'delimiter'};
 216                                         # default delimiter
 217                                         $d ||= " ";
 218
 219                                         my $last = pop @{$row->{$type}};
 220                                         $d = "" if (! $last);
 221                                         $last .= $d . join($d, @v);
 222                                         push @{$row->{$type}}, $last;
 223
 224                                 } else {
 225                                         push @{$row->{$type}}, @v;
 226                                 }
 227                         }
 228
 229
 230                 }
 231
 232                 if ($row) {
 233                         $row->{'tag'} = $field;
 234
 235                         # TODO: name_sigular, name_plural
 236                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 237                         my $row_name = $name ? $self->_x($name) : $field;
 238
 239                         # post-sort all values in field
 240                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 241                                 $log->warn("sort at field tag not implemented");
 242                         }
 243
 244                         $ds->{$row_name} = $row;
 245
 246                         $log->debug("row $field: ",sub { Dumper($row) });
 247                 }
 248
 249         }
 250
 251         $self->{'db'}->save_ds(
 252                 id => $mfn,
 253                 ds => $ds,
 254         ) if ($self->{'db'});
 255
 256         $log->debug("ds: ", sub { Dumper($ds) });
 257
 258         $log->logconfess("data structure returned is not array any more!") if wantarray;
 259
 260         return $ds;
 261
 262 }
 263
 264 =head2 parse
 265
 266 Perform smart parsing of string, skipping delimiters for fields which aren't
 267 defined. It can also eval code in format starting with C<eval{...}> and
 268 return output or nothing depending on eval code.
 269
 270  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 271
 272 =cut
 273
 274 sub parse {
 275         my $self = shift;
 276
 277         my ($rec, $format_utf8, $i) = @_;
 278
 279         return if (! $format_utf8);
 280
 281         my $log = $self->_get_logger();
 282
 283         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 284
 285         $i = 0 if (! $i);
 286
 287         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
 288
 289         my @out;
 290
 291         $log->debug("format: $format");
 292
 293         my $eval_code;
 294         # remove eval{...} from beginning
 295         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 296
 297         my $filter_name;
 298         # remove filter{...} from beginning
 299         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 300
 301         my $prefix;
 302         my $all_found=0;
 303
 304         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
 305
 306                 my $del = $1 || '';
 307                 $prefix ||= $del if ($all_found == 0);
 308
 309                 # repeatable index
 310                 my $r = $i;
 311                 $r = 0 if (lc("$2") eq 's');
 312
 313                 my $found = 0;
 314                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
 315
 316                 if ($found) {
 317                         push @out, $del;
 318                         push @out, $tmp;
 319                         $all_found += $found;
 320                 }
 321         }
 322
 323         return if (! $all_found);
 324
 325         my $out = join('',@out);
 326
 327         if ($out) {
 328                 # add rest of format (suffix)
 329                 $out .= $format;
 330
 331                 # add prefix if not there
 332                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
 333
 334                 $log->debug("result: $out");
 335         }
 336
 337         if ($eval_code) {
 338                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
 339                 $log->debug("about to eval{$eval} format: $out");
 340                 return if (! $self->_eval($eval));
 341         }
 342
 343         if ($filter_name && $self->{'filter'}->{$filter_name}) {
 344                 $log->debug("about to filter{$filter_name} format: $out");
 345                 $out = $self->{'filter'}->{$filter_name}->($out);
 346                 return unless(defined($out));
 347                 $log->debug("filter result: $out");
 348         }
 349
 350         return $out;
 351 }
 352
 353 =head2 parse_to_arr
 354
 355 Similar to C<parse>, but returns array of all repeatable fields
 356
 357  my @arr = $webpac->parse_to_arr($rec,'v250^a');
 358
 359 =cut
 360
 361 sub parse_to_arr {
 362         my $self = shift;
 363
 364         my ($rec, $format_utf8) = @_;
 365
 366         my $log = $self->_get_logger();
 367
 368         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 369         return if (! $format_utf8);
 370
 371         my $i = 0;
 372         my @arr;
 373
 374         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
 375                 push @arr, $v;
 376         }
 377
 378         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 379
 380         return @arr;
 381 }
 382
 383
 384 =head2 fill_in
 385
 386 Workhourse of all: takes record from in-memory structure of database and
 387 strings with placeholders and returns string or array of with substituted
 388 values from record.
 389
 390  my $text = $webpac->fill_in($rec,'v250^a');
 391
 392 Optional argument is ordinal number for repeatable fields. By default,
 393 it's assume to be first repeatable field (fields are perl array, so first
 394 element is 0).
 395 Following example will read second value from repeatable field.
 396
 397  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
 398
 399 This function B<does not> perform parsing of format to inteligenty skip
 400 delimiters before fields which aren't used.
 401
 402 This method will automatically decode UTF-8 string to local code page
 403 if needed.
 404
 405 =cut
 406
 407 sub fill_in {
 408         my $self = shift;
 409
 410         my $log = $self->_get_logger();
 411
 412         my $rec = shift || $log->logconfess("need data record");
 413         my $format = shift || $log->logconfess("need format to parse");
 414         # iteration (for repeatable fields)
 415         my $i = shift || 0;
 416
 417         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
 418
 419         # FIXME remove for speedup?
 420         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 421
 422         if (utf8::is_utf8($format)) {
 423                 $format = $self->_x($format);
 424         }
 425
 426         my $found = 0;
 427
 428         my $eval_code;
 429         # remove eval{...} from beginning
 430         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 431
 432         my $filter_name;
 433         # remove filter{...} from beginning
 434         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 435
 436         # do actual replacement of placeholders
 437         # repeatable fields
 438         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
 439         # non-repeatable fields
 440         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
 441
 442         if ($found) {
 443                 $log->debug("format: $format");
 444                 if ($eval_code) {
 445                         my $eval = $self->fill_in($rec,$eval_code,$i);
 446                         return if (! $self->_eval($eval));
 447                 }
 448                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
 449                         $log->debug("filter '$filter_name' for $format");
 450                         $format = $self->{'filter'}->{$filter_name}->($format);
 451                         return unless(defined($format));
 452                         $log->debug("filter result: $format");
 453                 }
 454                 # do we have lookups?
 455                 if ($self->{'lookup'}) {
 456                         if ($self->{'lookup'}->can('lookup')) {
 457                                 return $self->{'lookup'}->lookup($format);
 458                         } else {
 459                                 $log->warn("Have lookup object but can't invoke lookup method");
 460                         }
 461                 } else {
 462                         return $format;
 463                 }
 464         } else {
 465                 return;
 466         }
 467 }
 468
 469
 470 =head2 fill_in_to_arr
 471
 472 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
 473 for fields which have lookups, so they shouldn't be parsed but rather
 474 C<fill_id>ed.
 475
 476  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
 477
 478 =cut
 479
 480 sub fill_in_to_arr {
 481         my $self = shift;
 482
 483         my ($rec, $format_utf8) = @_;
 484
 485         my $log = $self->_get_logger();
 486
 487         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 488         return if (! $format_utf8);
 489
 490         my $i = 0;
 491         my @arr;
 492
 493         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
 494                 push @arr, @v;
 495         }
 496
 497         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 498
 499         return @arr;
 500 }
 501
 502
 503 =head2 get_data
 504
 505 Returns value from record.
 506
 507  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
 508
 509 Arguments are:
 510 record reference C<$rec>,
 511 field C<$f>,
 512 optional subfiled C<$sf>,
 513 index for repeatable values C<$i>.
 514
 515 Optinal variable C<$found> will be incremeted if there
 516 is field.
 517
 518 Returns value or empty string.
 519
 520 =cut
 521
 522 sub get_data {
 523         my $self = shift;
 524
 525         my ($rec,$f,$sf,$i,$found) = @_;
 526
 527         if ($$rec->{$f}) {
 528                 return '' if (! $$rec->{$f}->[$i]);
 529                 no strict 'refs';
 530                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
 531                         $$found++ if (defined($$found));
 532                         return $$rec->{$f}->[$i]->{$sf};
 533                 } elsif (! $sf && $$rec->{$f}->[$i]) {
 534                         $$found++ if (defined($$found));
 535                         # it still might have subfield, just
 536                         # not specified, so we'll dump all
 537                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
 538                                 my $out;
 539                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
 540                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
 541                                 }
 542                                 return $out;
 543                         } else {
 544                                 return $$rec->{$f}->[$i];
 545                         }
 546                 } else {
 547                         return '';
 548                 }
 549         } else {
 550                 return '';
 551         }
 552 }
 553
 554
 555 =head2 apply_format
 556
 557 Apply format specified in tag with C<format_name="name"> and
 558 C<format_delimiter=";;">.
 559
 560  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
 561
 562 Formats can contain C<lookup{...}> if you need them.
 563
 564 =cut
 565
 566 sub apply_format {
 567         my $self = shift;
 568
 569         my ($name,$delimiter,$data) = @_;
 570
 571         my $log = $self->_get_logger();
 572
 573         if (! $self->{'import_xml'}->{'format'}->{$name}) {
 574                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
 575                 return $data;
 576         }
 577
 578         $log->warn("no delimiter for format $name") if (! $delimiter);
 579
 580         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
 581
 582         my @data = split(/\Q$delimiter\E/, $data);
 583
 584         my $out = sprintf($format, @data);
 585         $log->debug("using format $name [$format] on $data to produce: $out");
 586
 587         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
 588                 return $self->{'lookup'}->lookup($out);
 589         } else {
 590                 return $out;
 591         }
 592
 593 }
 594
 595 =head2 sort_arr
 596
 597 Sort array ignoring case and html in data
 598
 599  my @sorted = $webpac->sort_arr(@unsorted);
 600
 601 =cut
 602
 603 sub sort_arr {
 604         my $self = shift;
 605
 606         my $log = $self->_get_logger();
 607
 608         # FIXME add Schwartzian Transformation?
 609
 610         my @sorted = sort {
 611                 $a =~ s#<[^>]+/*>##;
 612                 $b =~ s#<[^>]+/*>##;
 613                 lc($b) cmp lc($a)
 614         } @_;
 615         $log->debug("sorted values: ",sub { join(", ",@sorted) });
 616
 617         return @sorted;
 618 }
 619
 620
 621 =head1 INTERNAL METHODS
 622
 623 =head2 _sort_by_order
 624
 625 Sort xml tags data structure accoding to C<order=""> attribute.
 626
 627 =cut
 628
 629 sub _sort_by_order {
 630         my $self = shift;
 631
 632         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
 633                 $self->{'import_xml'}->{'indexer'}->{$a};
 634         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
 635                 $self->{'import_xml'}->{'indexer'}->{$b};
 636
 637         return $va <=> $vb;
 638 }
 639
 640 =head2 _x
 641
 642 Convert strings from C<conf/normalize/*.xml> encoding into application
 643 specific encoding (optinally specified using C<code_page> to C<new>
 644 constructor).
 645
 646  my $text = $n->_x('normalize text string');
 647
 648 This is a stub so that other modules doesn't have to implement it.
 649
 650 =cut
 651
 652 sub _x {
 653         my $self = shift;
 654         return shift;
 655 }
 656
 657
 658 =head1 AUTHOR
 659
 660 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 661
 662 =head1 COPYRIGHT & LICENSE
 663
 664 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 665
 666 This program is free software; you can redistribute it and/or modify it
 667 under the same terms as Perl itself.
 668
 669 =cut
 670
 671 1; # End of WebPAC::DB