git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2
   3 use warnings;
   4 use strict;
   5 use base 'WebPAC::Common';
   6 use Data::Dumper;
   7
   8 =head1 NAME
   9
  10 WebPAC::Normalize - data mungling for normalisation
  11
  12 =head1 VERSION
  13
  14 Version 0.01
  15
  16 =cut
  17
  18 our $VERSION = '0.01';
  19
  20 =head1 SYNOPSIS
  21
  22 This package contains code that mungle data to produce normalized format.
  23
  24 It contains several assumptions:
  25
  26 =over
  27
  28 =item *
  29
  30 format of fields is defined using C<v123^a> notation for repeatable fields
  31 or C<s123^a> for single (or first) value, where C<123> is field number and
  32 C<a> is subfield.
  33
  34 =item *
  35
  36 source data records (C<$rec>) have unique identifiers in field C<000>
  37
  38 =item *
  39
  40 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
  41 perl code that is evaluated before producing output (value of field will be
  42 interpolated before that)
  43
  44 =item *
  45
  46 optional C<filter{filter_name}> at B<begining of format> will apply perl
  47 code defined as code ref on format after field substitution to producing
  48 output
  49
  50 =item *
  51
  52 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
  53
  54 =item *
  55
  56 at end, optional C<format>s rules are resolved. Format rules are similar to
  57 C<sprintf> and can also contain C<lookup{...}> which is performed after
  58 values are inserted in format.
  59
  60 =back
  61
  62 This also describes order in which transformations are applied (eval,
  63 filter, lookup, format) which is important to undestand when deciding how to
  64 solve your data mungling and normalisation process.
  65
  66
  67
  68
  69 =head1 FUNCTIONS
  70
  71 =head2 new
  72
  73 Create new normalisation object
  74
  75   my $n = new WebPAC::Normalize::Something(
  76         filter => {
  77                 'filter_name_1' => sub {
  78                         # filter code
  79                         return length($_);
  80                 }, ...
  81         },
  82         db => $db_obj,
  83         lookup_regex => $lookup->regex,
  84         lookup => $lookup_obj,
  85   );
  86
  87 Parametar C<filter> defines user supplied snippets of perl code which can
  88 be use with C<filter{...}> notation.
  89
  90 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
  91 in structures. If you pass this parametar, you must also pass C<lookup>
  92 which is C<WebPAC::Lookup> object.
  93
  94 =cut
  95
  96 sub new {
  97         my $class = shift;
  98         my $self = {@_};
  99         bless($self, $class);
 100
 101         my $r = $self->{'lookup_regex'} ? 1 : 0;
 102         my $l = $self->{'lookup'} ? 1 : 0;
 103
 104         my $log = $self->_get_logger();
 105
 106         # those two must be in pair
 107         if ( ($r & $l) != ($r || $l) ) {
 108                 my $log = $self->_get_logger();
 109                 $log->logdie("lookup_regex and lookup must be in pair");
 110         }
 111
 112         $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
 113
 114         $self ? return $self : return undef;
 115 }
 116
 117
 118 =head2 data_structure
 119
 120 Create in-memory data structure which represents normalized layout from
 121 C<conf/normalize/*.xml>.
 122
 123 This structures are used to produce output.
 124
 125  my @ds = $webpac->data_structure($rec);
 126
 127 B<Note: historical oddity follows>
 128
 129 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
 130 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
 131 C<< <headline> >> tag.
 132
 133 =cut
 134
 135 sub data_structure {
 136         my $self = shift;
 137
 138         my $log = $self->_get_logger();
 139
 140         my $rec = shift;
 141         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 142
 143         my $cache_file;
 144
 145         if ($self->{'db'}) {
 146                 my @ds = $self->{'db'}->load_ds($rec);
 147                 $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper(@ds) });
 148                 return @ds if ($#ds > 0);
 149                 $log->debug("cache miss, creating");
 150         }
 151
 152         undef $self->{'currnet_filename'};
 153         undef $self->{'headline'};
 154
 155         my @sorted_tags;
 156         if ($self->{tags_by_order}) {
 157                 @sorted_tags = @{$self->{tags_by_order}};
 158         } else {
 159                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
 160                 $self->{tags_by_order} = \@sorted_tags;
 161         }
 162
 163         my @ds;
 164
 165         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
 166
 167         foreach my $field (@sorted_tags) {
 168
 169                 my $row;
 170
 171 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 172
 173                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
 174                         my $format;
 175
 176                         $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
 177                         $format = $tag->{'value'} || $tag->{'content'};
 178
 179                         $log->debug("format: $format");
 180
 181                         my @v;
 182                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 183                                 @v = $self->fill_in_to_arr($rec,$format);
 184                         } else {
 185                                 @v = $self->parse_to_arr($rec,$format);
 186                         }
 187                         next if (! @v);
 188
 189                         if ($tag->{'sort'}) {
 190                                 @v = $self->sort_arr(@v);
 191                         }
 192
 193                         # use format?
 194                         if ($tag->{'format_name'}) {
 195                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 196                         }
 197
 198                         if ($field eq 'filename') {
 199                                 $self->{'current_filename'} = join('',@v);
 200                                 $log->debug("filename: ",$self->{'current_filename'});
 201                         } elsif ($field eq 'headline') {
 202                                 $self->{'headline'} .= join('',@v);
 203                                 $log->debug("headline: ",$self->{'headline'});
 204                                 next; # don't return headline in data_structure!
 205                         }
 206
 207                         # delimiter will join repeatable fields
 208                         if ($tag->{'delimiter'}) {
 209                                 @v = ( join($tag->{'delimiter'}, @v) );
 210                         }
 211
 212                         # default types
 213                         my @types = qw(display swish);
 214                         # override by type attribute
 215                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
 216
 217                         foreach my $type (@types) {
 218                                 # append to previous line?
 219                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
 220                                 if ($tag->{'append'}) {
 221
 222                                         # I will delimit appended part with
 223                                         # delimiter (or ,)
 224                                         my $d = $tag->{'delimiter'};
 225                                         # default delimiter
 226                                         $d ||= " ";
 227
 228                                         my $last = pop @{$row->{$type}};
 229                                         $d = "" if (! $last);
 230                                         $last .= $d . join($d, @v);
 231                                         push @{$row->{$type}}, $last;
 232
 233                                 } else {
 234                                         push @{$row->{$type}}, @v;
 235                                 }
 236                         }
 237
 238
 239                 }
 240
 241                 if ($row) {
 242                         $row->{'tag'} = $field;
 243
 244                         # TODO: name_sigular, name_plural
 245                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 246                         $row->{'name'} = $name ? $self->_x($name) : $field;
 247
 248                         # post-sort all values in field
 249                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 250                                 $log->warn("sort at field tag not implemented");
 251                         }
 252
 253                         push @ds, $row;
 254
 255                         $log->debug("row $field: ",sub { Dumper($row) });
 256                 }
 257
 258         }
 259
 260         $self->{'db'}->save_ds(
 261                 ds => \@ds,
 262                 current_filename => $self->{'current_filename'},
 263                 headline => $self->{'headline'},
 264         ) if ($self->{'db'});
 265
 266         $log->debug("ds: ", sub { Dumper(@ds) });
 267
 268         return @ds;
 269
 270 }
 271
 272 =head2 parse
 273
 274 Perform smart parsing of string, skipping delimiters for fields which aren't
 275 defined. It can also eval code in format starting with C<eval{...}> and
 276 return output or nothing depending on eval code.
 277
 278  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 279
 280 =cut
 281
 282 sub parse {
 283         my $self = shift;
 284
 285         my ($rec, $format_utf8, $i) = @_;
 286
 287         return if (! $format_utf8);
 288
 289         my $log = $self->_get_logger();
 290
 291         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 292
 293         $i = 0 if (! $i);
 294
 295         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
 296
 297         my @out;
 298
 299         $log->debug("format: $format");
 300
 301         my $eval_code;
 302         # remove eval{...} from beginning
 303         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 304
 305         my $filter_name;
 306         # remove filter{...} from beginning
 307         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 308
 309         my $prefix;
 310         my $all_found=0;
 311
 312         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
 313
 314                 my $del = $1 || '';
 315                 $prefix ||= $del if ($all_found == 0);
 316
 317                 # repeatable index
 318                 my $r = $i;
 319                 $r = 0 if (lc("$2") eq 's');
 320
 321                 my $found = 0;
 322                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
 323
 324                 if ($found) {
 325                         push @out, $del;
 326                         push @out, $tmp;
 327                         $all_found += $found;
 328                 }
 329         }
 330
 331         return if (! $all_found);
 332
 333         my $out = join('',@out);
 334
 335         if ($out) {
 336                 # add rest of format (suffix)
 337                 $out .= $format;
 338
 339                 # add prefix if not there
 340                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
 341
 342                 $log->debug("result: $out");
 343         }
 344
 345         if ($eval_code) {
 346                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
 347                 $log->debug("about to eval{$eval} format: $out");
 348                 return if (! $self->_eval($eval));
 349         }
 350
 351         if ($filter_name && $self->{'filter'}->{$filter_name}) {
 352                 $log->debug("about to filter{$filter_name} format: $out");
 353                 $out = $self->{'filter'}->{$filter_name}->($out);
 354                 return unless(defined($out));
 355                 $log->debug("filter result: $out");
 356         }
 357
 358         return $out;
 359 }
 360
 361 =head2 parse_to_arr
 362
 363 Similar to C<parse>, but returns array of all repeatable fields
 364
 365  my @arr = $webpac->parse_to_arr($rec,'v250^a');
 366
 367 =cut
 368
 369 sub parse_to_arr {
 370         my $self = shift;
 371
 372         my ($rec, $format_utf8) = @_;
 373
 374         my $log = $self->_get_logger();
 375
 376         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 377         return if (! $format_utf8);
 378
 379         my $i = 0;
 380         my @arr;
 381
 382         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
 383                 push @arr, $v;
 384         }
 385
 386         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 387
 388         return @arr;
 389 }
 390
 391
 392 =head2 fill_in
 393
 394 Workhourse of all: takes record from in-memory structure of database and
 395 strings with placeholders and returns string or array of with substituted
 396 values from record.
 397
 398  my $text = $webpac->fill_in($rec,'v250^a');
 399
 400 Optional argument is ordinal number for repeatable fields. By default,
 401 it's assume to be first repeatable field (fields are perl array, so first
 402 element is 0).
 403 Following example will read second value from repeatable field.
 404
 405  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
 406
 407 This function B<does not> perform parsing of format to inteligenty skip
 408 delimiters before fields which aren't used.
 409
 410 This method will automatically decode UTF-8 string to local code page
 411 if needed.
 412
 413 =cut
 414
 415 sub fill_in {
 416         my $self = shift;
 417
 418         my $log = $self->_get_logger();
 419
 420         my $rec = shift || $log->logconfess("need data record");
 421         my $format = shift || $log->logconfess("need format to parse");
 422         # iteration (for repeatable fields)
 423         my $i = shift || 0;
 424
 425         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
 426
 427         # FIXME remove for speedup?
 428         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 429
 430         if (utf8::is_utf8($format)) {
 431                 $format = $self->_x($format);
 432         }
 433
 434         my $found = 0;
 435
 436         my $eval_code;
 437         # remove eval{...} from beginning
 438         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 439
 440         my $filter_name;
 441         # remove filter{...} from beginning
 442         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 443
 444         # do actual replacement of placeholders
 445         # repeatable fields
 446         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
 447         # non-repeatable fields
 448         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
 449
 450         if ($found) {
 451                 $log->debug("format: $format");
 452                 if ($eval_code) {
 453                         my $eval = $self->fill_in($rec,$eval_code,$i);
 454                         return if (! $self->_eval($eval));
 455                 }
 456                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
 457                         $log->debug("filter '$filter_name' for $format");
 458                         $format = $self->{'filter'}->{$filter_name}->($format);
 459                         return unless(defined($format));
 460                         $log->debug("filter result: $format");
 461                 }
 462                 # do we have lookups?
 463                 if ($self->{'lookup'}) {
 464                         if ($self->{'lookup'}->can('lookup')) {
 465                                 return $self->{'lookup'}->lookup($format);
 466                         } else {
 467                                 $log->warn("Have lookup object but can't invoke lookup method");
 468                         }
 469                 } else {
 470                         return $format;
 471                 }
 472         } else {
 473                 return;
 474         }
 475 }
 476
 477
 478 =head2 fill_in_to_arr
 479
 480 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
 481 for fields which have lookups, so they shouldn't be parsed but rather
 482 C<fill_id>ed.
 483
 484  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
 485
 486 =cut
 487
 488 sub fill_in_to_arr {
 489         my $self = shift;
 490
 491         my ($rec, $format_utf8) = @_;
 492
 493         my $log = $self->_get_logger();
 494
 495         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 496         return if (! $format_utf8);
 497
 498         my $i = 0;
 499         my @arr;
 500
 501         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
 502                 push @arr, @v;
 503         }
 504
 505         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 506
 507         return @arr;
 508 }
 509
 510
 511 =head2 get_data
 512
 513 Returns value from record.
 514
 515  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
 516
 517 Arguments are:
 518 record reference C<$rec>,
 519 field C<$f>,
 520 optional subfiled C<$sf>,
 521 index for repeatable values C<$i>.
 522
 523 Optinal variable C<$found> will be incremeted if there
 524 is field.
 525
 526 Returns value or empty string.
 527
 528 =cut
 529
 530 sub get_data {
 531         my $self = shift;
 532
 533         my ($rec,$f,$sf,$i,$found) = @_;
 534
 535         if ($$rec->{$f}) {
 536                 return '' if (! $$rec->{$f}->[$i]);
 537                 no strict 'refs';
 538                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
 539                         $$found++ if (defined($$found));
 540                         return $$rec->{$f}->[$i]->{$sf};
 541                 } elsif ($$rec->{$f}->[$i]) {
 542                         $$found++ if (defined($$found));
 543                         # it still might have subfield, just
 544                         # not specified, so we'll dump all
 545                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
 546                                 my $out;
 547                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
 548                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
 549                                 }
 550                                 return $out;
 551                         } else {
 552                                 return $$rec->{$f}->[$i];
 553                         }
 554                 }
 555         } else {
 556                 return '';
 557         }
 558 }
 559
 560
 561 =head2 apply_format
 562
 563 Apply format specified in tag with C<format_name="name"> and
 564 C<format_delimiter=";;">.
 565
 566  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
 567
 568 Formats can contain C<lookup{...}> if you need them.
 569
 570 =cut
 571
 572 sub apply_format {
 573         my $self = shift;
 574
 575         my ($name,$delimiter,$data) = @_;
 576
 577         my $log = $self->_get_logger();
 578
 579         if (! $self->{'import_xml'}->{'format'}->{$name}) {
 580                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
 581                 return $data;
 582         }
 583
 584         $log->warn("no delimiter for format $name") if (! $delimiter);
 585
 586         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
 587
 588         my @data = split(/\Q$delimiter\E/, $data);
 589
 590         my $out = sprintf($format, @data);
 591         $log->debug("using format $name [$format] on $data to produce: $out");
 592
 593         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
 594                 return $self->{'lookup'}->lookup($out);
 595         } else {
 596                 return $out;
 597         }
 598
 599 }
 600
 601 =head2 sort_arr
 602
 603 Sort array ignoring case and html in data
 604
 605  my @sorted = $webpac->sort_arr(@unsorted);
 606
 607 =cut
 608
 609 sub sort_arr {
 610         my $self = shift;
 611
 612         my $log = $self->_get_logger();
 613
 614         # FIXME add Schwartzian Transformation?
 615
 616         my @sorted = sort {
 617                 $a =~ s#<[^>]+/*>##;
 618                 $b =~ s#<[^>]+/*>##;
 619                 lc($b) cmp lc($a)
 620         } @_;
 621         $log->debug("sorted values: ",sub { join(", ",@sorted) });
 622
 623         return @sorted;
 624 }
 625
 626
 627 =head1 INTERNAL METHODS
 628
 629 =head2 _sort_by_order
 630
 631 Sort xml tags data structure accoding to C<order=""> attribute.
 632
 633 =cut
 634
 635 sub _sort_by_order {
 636         my $self = shift;
 637
 638         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
 639                 $self->{'import_xml'}->{'indexer'}->{$a};
 640         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
 641                 $self->{'import_xml'}->{'indexer'}->{$b};
 642
 643         return $va <=> $vb;
 644 }
 645
 646 =head2 _x
 647
 648 Convert strings from C<conf/normalize/*.xml> encoding into application
 649 specific encoding (optinally specified using C<code_page> to C<new>
 650 constructor).
 651
 652  my $text = $n->_x('normalize text string');
 653
 654 This is a stub so that other modules doesn't have to implement it.
 655
 656 =cut
 657
 658 sub _x {
 659         my $self = shift;
 660         return shift;
 661 }
 662
 663
 664 =head1 AUTHOR
 665
 666 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 667
 668 =head1 COPYRIGHT & LICENSE
 669
 670 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 671
 672 This program is free software; you can redistribute it and/or modify it
 673 under the same terms as Perl itself.
 674
 675 =cut
 676
 677 1; # End of WebPAC::DB