git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2
   3 use warnings;
   4 use strict;
   5 use Data::Dumper;
   6
   7 =head1 NAME
   8
   9 WebPAC::Normalize - data mungling for normalisation
  10
  11 =head1 VERSION
  12
  13 Version 0.01
  14
  15 =cut
  16
  17 our $VERSION = '0.01';
  18
  19 =head1 SYNOPSIS
  20
  21 This package contains code that mungle data to produce normalized format.
  22
  23 It contains several assumptions:
  24
  25 =over
  26
  27 =item *
  28
  29 format of fields is defined using C<v123^a> notation for repeatable fields
  30 or C<s123^a> for single (or first) value, where C<123> is field number and
  31 C<a> is subfield.
  32
  33 =item *
  34
  35 source data records (C<$rec>) have unique identifiers in field C<000>
  36
  37 =item *
  38
  39 optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
  40 perl code that is evaluated before producing output (value of field will be
  41 interpolated before that)
  42
  43 =item *
  44
  45 optional C<filter{filter_name}> at B<begining of format> will apply perl
  46 code defined as code ref on format after field substitution to producing
  47 output
  48
  49 =item *
  50
  51 optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
  52
  53 =item *
  54
  55 at end, optional C<format>s rules are resolved. Format rules are similar to
  56 C<sprintf> and can also contain C<lookup{...}> which is performed after
  57 values are inserted in format.
  58
  59 =back
  60
  61 This also describes order in which transformations are applied (eval,
  62 filter, lookup, format) which is important to undestand when deciding how to
  63 solve your data mungling and normalisation process.
  64
  65
  66
  67
  68 =head1 FUNCTIONS
  69
  70 =head2 new
  71
  72 Create new normalisation object
  73
  74   my $n = new WebPAC::Normalize::Something(
  75         filter => {
  76                 'filter_name_1' => sub {
  77                         # filter code
  78                         return length($_);
  79                 }, ...
  80         },
  81         db => $webpac_db_obj,
  82         lookup_regex => $lookup->regex,
  83   );
  84
  85 Parametar C<filter> defines user supplied snippets of perl code which can
  86 be use with C<filter{...}> notation.
  87
  88 Recommended parametar C<lookup_regex> is used to enable parsing of lookups
  89 in structures.
  90
  91 =cut
  92
  93 sub new {
  94         my $class = shift;
  95         my $self = {@_};
  96         bless($self, $class);
  97
  98         $self ? return $self : return undef;
  99 }
 100
 101
 102 =head2 data_structure
 103
 104 Create in-memory data structure which represents normalized layout from
 105 C<conf/normalize/*.xml>.
 106
 107 This structures are used to produce output.
 108
 109  my @ds = $webpac->data_structure($rec);
 110
 111 B<Note: historical oddity follows>
 112
 113 This method will also set C<< $webpac->{'currnet_filename'} >> if there is
 114 C<< <filename> >> tag and C<< $webpac->{'headline'} >> if there is
 115 C<< <headline> >> tag.
 116
 117 =cut
 118
 119 sub data_structure {
 120         my $self = shift;
 121
 122         my $log = $self->_get_logger();
 123
 124         my $rec = shift;
 125         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 126
 127         my $cache_file;
 128
 129         if ($self->{'db'}) {
 130                 my @ds = $self->{'db'}->get_ds($rec);
 131                 return @ds if (@ds);
 132         }
 133
 134         undef $self->{'currnet_filename'};
 135         undef $self->{'headline'};
 136
 137         my @sorted_tags;
 138         if ($self->{tags_by_order}) {
 139                 @sorted_tags = @{$self->{tags_by_order}};
 140         } else {
 141                 @sorted_tags = sort { $self->_sort_by_order } keys %{$self->{'import_xml'}->{'indexer'}};
 142                 $self->{tags_by_order} = \@sorted_tags;
 143         }
 144
 145         my @ds;
 146
 147         $log->debug("tags: ",sub { join(", ",@sorted_tags) });
 148
 149         foreach my $field (@sorted_tags) {
 150
 151                 my $row;
 152
 153 #print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
 154
 155                 foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
 156                         my $format = $tag->{'value'} || $tag->{'content'};
 157
 158                         $log->debug("format: $format");
 159
 160                         my @v;
 161                         if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
 162                                 @v = $self->fill_in_to_arr($rec,$format);
 163                         } else {
 164                                 @v = $self->parse_to_arr($rec,$format);
 165                         }
 166                         next if (! @v);
 167
 168                         if ($tag->{'sort'}) {
 169                                 @v = $self->sort_arr(@v);
 170                         }
 171
 172                         # use format?
 173                         if ($tag->{'format_name'}) {
 174                                 @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
 175                         }
 176
 177                         if ($field eq 'filename') {
 178                                 $self->{'current_filename'} = join('',@v);
 179                                 $log->debug("filename: ",$self->{'current_filename'});
 180                         } elsif ($field eq 'headline') {
 181                                 $self->{'headline'} .= join('',@v);
 182                                 $log->debug("headline: ",$self->{'headline'});
 183                                 next; # don't return headline in data_structure!
 184                         }
 185
 186                         # delimiter will join repeatable fields
 187                         if ($tag->{'delimiter'}) {
 188                                 @v = ( join($tag->{'delimiter'}, @v) );
 189                         }
 190
 191                         # default types
 192                         my @types = qw(display swish);
 193                         # override by type attribute
 194                         @types = ( $tag->{'type'} ) if ($tag->{'type'});
 195
 196                         foreach my $type (@types) {
 197                                 # append to previous line?
 198                                 $log->debug("type: $type ",sub { join(" ",@v) }, $row->{'append'} || 'no append');
 199                                 if ($tag->{'append'}) {
 200
 201                                         # I will delimit appended part with
 202                                         # delimiter (or ,)
 203                                         my $d = $tag->{'delimiter'};
 204                                         # default delimiter
 205                                         $d ||= " ";
 206
 207                                         my $last = pop @{$row->{$type}};
 208                                         $d = "" if (! $last);
 209                                         $last .= $d . join($d, @v);
 210                                         push @{$row->{$type}}, $last;
 211
 212                                 } else {
 213                                         push @{$row->{$type}}, @v;
 214                                 }
 215                         }
 216
 217
 218                 }
 219
 220                 if ($row) {
 221                         $row->{'tag'} = $field;
 222
 223                         # TODO: name_sigular, name_plural
 224                         my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
 225                         $row->{'name'} = $name ? $self->_x($name) : $field;
 226
 227                         # post-sort all values in field
 228                         if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
 229                                 $log->warn("sort at field tag not implemented");
 230                         }
 231
 232                         push @ds, $row;
 233
 234                         $log->debug("row $field: ",sub { Dumper($row) });
 235                 }
 236
 237         }
 238
 239         $self->{'db'}->put_gs(
 240                 ds => \@ds,
 241                 current_filename => $self->{'current_filename'},
 242                 headline => $self->{'headline'},
 243         ) if ($self->{'db'});
 244
 245         return @ds;
 246
 247 }
 248
 249 =head2 parse
 250
 251 Perform smart parsing of string, skipping delimiters for fields which aren't
 252 defined. It can also eval code in format starting with C<eval{...}> and
 253 return output or nothing depending on eval code.
 254
 255  my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
 256
 257 =cut
 258
 259 sub parse {
 260         my $self = shift;
 261
 262         my ($rec, $format_utf8, $i) = @_;
 263
 264         return if (! $format_utf8);
 265
 266         my $log = $self->_get_logger();
 267
 268         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 269
 270         $i = 0 if (! $i);
 271
 272         my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
 273
 274         my @out;
 275
 276         $log->debug("format: $format");
 277
 278         my $eval_code;
 279         # remove eval{...} from beginning
 280         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 281
 282         my $filter_name;
 283         # remove filter{...} from beginning
 284         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 285
 286         my $prefix;
 287         my $all_found=0;
 288
 289         while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
 290
 291                 my $del = $1 || '';
 292                 $prefix ||= $del if ($all_found == 0);
 293
 294                 # repeatable index
 295                 my $r = $i;
 296                 $r = 0 if (lc("$2") eq 's');
 297
 298                 my $found = 0;
 299                 my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found);
 300
 301                 if ($found) {
 302                         push @out, $del;
 303                         push @out, $tmp;
 304                         $all_found += $found;
 305                 }
 306         }
 307
 308         return if (! $all_found);
 309
 310         my $out = join('',@out);
 311
 312         if ($out) {
 313                 # add rest of format (suffix)
 314                 $out .= $format;
 315
 316                 # add prefix if not there
 317                 $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
 318
 319                 $log->debug("result: $out");
 320         }
 321
 322         if ($eval_code) {
 323                 my $eval = $self->fill_in($rec,$eval_code,$i) || return;
 324                 $log->debug("about to eval{$eval} format: $out");
 325                 return if (! $self->_eval($eval));
 326         }
 327
 328         if ($filter_name && $self->{'filter'}->{$filter_name}) {
 329                 $log->debug("about to filter{$filter_name} format: $out");
 330                 $out = $self->{'filter'}->{$filter_name}->($out);
 331                 return unless(defined($out));
 332                 $log->debug("filter result: $out");
 333         }
 334
 335         return $out;
 336 }
 337
 338 =head2 parse_to_arr
 339
 340 Similar to C<parse>, but returns array of all repeatable fields
 341
 342  my @arr = $webpac->parse_to_arr($rec,'v250^a');
 343
 344 =cut
 345
 346 sub parse_to_arr {
 347         my $self = shift;
 348
 349         my ($rec, $format_utf8) = @_;
 350
 351         my $log = $self->_get_logger();
 352
 353         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 354         return if (! $format_utf8);
 355
 356         my $i = 0;
 357         my @arr;
 358
 359         while (my $v = $self->parse($rec,$format_utf8,$i++)) {
 360                 push @arr, $v;
 361         }
 362
 363         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 364
 365         return @arr;
 366 }
 367
 368
 369 =head2 fill_in
 370
 371 Workhourse of all: takes record from in-memory structure of database and
 372 strings with placeholders and returns string or array of with substituted
 373 values from record.
 374
 375  my $text = $webpac->fill_in($rec,'v250^a');
 376
 377 Optional argument is ordinal number for repeatable fields. By default,
 378 it's assume to be first repeatable field (fields are perl array, so first
 379 element is 0).
 380 Following example will read second value from repeatable field.
 381
 382  my $text = $webpac->fill_in($rec,'Title: v250^a',1);
 383
 384 This function B<does not> perform parsing of format to inteligenty skip
 385 delimiters before fields which aren't used.
 386
 387 This method will automatically decode UTF-8 string to local code page
 388 if needed.
 389
 390 =cut
 391
 392 sub fill_in {
 393         my $self = shift;
 394
 395         my $log = $self->_get_logger();
 396
 397         my $rec = shift || $log->logconfess("need data record");
 398         my $format = shift || $log->logconfess("need format to parse");
 399         # iteration (for repeatable fields)
 400         my $i = shift || 0;
 401
 402         $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
 403
 404         # FIXME remove for speedup?
 405         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 406
 407         if (utf8::is_utf8($format)) {
 408                 $format = $self->_x($format);
 409         }
 410
 411         my $found = 0;
 412
 413         my $eval_code;
 414         # remove eval{...} from beginning
 415         $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
 416
 417         my $filter_name;
 418         # remove filter{...} from beginning
 419         $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
 420
 421         # do actual replacement of placeholders
 422         # repeatable fields
 423         $format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found)/ges;
 424         # non-repeatable fields
 425         $format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found)/ges;
 426
 427         if ($found) {
 428                 $log->debug("format: $format");
 429                 if ($eval_code) {
 430                         my $eval = $self->fill_in($rec,$eval_code,$i);
 431                         return if (! $self->_eval($eval));
 432                 }
 433                 if ($filter_name && $self->{'filter'}->{$filter_name}) {
 434                         $log->debug("filter '$filter_name' for $format");
 435                         $format = $self->{'filter'}->{$filter_name}->($format);
 436                         return unless(defined($format));
 437                         $log->debug("filter result: $format");
 438                 }
 439                 # do we have lookups?
 440                 if ($self->{'lookup'}) {
 441                         return $self->lookup($format);
 442                 } else {
 443                         return $format;
 444                 }
 445         } else {
 446                 return;
 447         }
 448 }
 449
 450
 451 =head2 fill_in_to_arr
 452
 453 Similar to C<fill_in>, but returns array of all repeatable fields. Usable
 454 for fields which have lookups, so they shouldn't be parsed but rather
 455 C<fill_id>ed.
 456
 457  my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]');
 458
 459 =cut
 460
 461 sub fill_in_to_arr {
 462         my $self = shift;
 463
 464         my ($rec, $format_utf8) = @_;
 465
 466         my $log = $self->_get_logger();
 467
 468         $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
 469         return if (! $format_utf8);
 470
 471         my $i = 0;
 472         my @arr;
 473
 474         while (my @v = $self->fill_in($rec,$format_utf8,$i++)) {
 475                 push @arr, @v;
 476         }
 477
 478         $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
 479
 480         return @arr;
 481 }
 482
 483
 484 =head2 get_data
 485
 486 Returns value from record.
 487
 488  my $text = $self->get_data(\$rec,$f,$sf,$i,\$found);
 489
 490 Arguments are:
 491 record reference C<$rec>,
 492 field C<$f>,
 493 optional subfiled C<$sf>,
 494 index for repeatable values C<$i>.
 495
 496 Optinal variable C<$found> will be incremeted if there
 497 is field.
 498
 499 Returns value or empty string.
 500
 501 =cut
 502
 503 sub get_data {
 504         my $self = shift;
 505
 506         my ($rec,$f,$sf,$i,$found) = @_;
 507
 508         if ($$rec->{$f}) {
 509                 return '' if (! $$rec->{$f}->[$i]);
 510                 no strict 'refs';
 511                 if ($sf && $$rec->{$f}->[$i]->{$sf}) {
 512                         $$found++ if (defined($$found));
 513                         return $$rec->{$f}->[$i]->{$sf};
 514                 } elsif ($$rec->{$f}->[$i]) {
 515                         $$found++ if (defined($$found));
 516                         # it still might have subfield, just
 517                         # not specified, so we'll dump all
 518                         if ($$rec->{$f}->[$i] =~ /HASH/o) {
 519                                 my $out;
 520                                 foreach my $k (keys %{$$rec->{$f}->[$i]}) {
 521                                         $out .= $$rec->{$f}->[$i]->{$k}." ";
 522                                 }
 523                                 return $out;
 524                         } else {
 525                                 return $$rec->{$f}->[$i];
 526                         }
 527                 }
 528         } else {
 529                 return '';
 530         }
 531 }
 532
 533
 534 =head2 apply_format
 535
 536 Apply format specified in tag with C<format_name="name"> and
 537 C<format_delimiter=";;">.
 538
 539  my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
 540
 541 Formats can contain C<lookup{...}> if you need them.
 542
 543 =cut
 544
 545 sub apply_format {
 546         my $self = shift;
 547
 548         my ($name,$delimiter,$data) = @_;
 549
 550         my $log = $self->_get_logger();
 551
 552         if (! $self->{'import_xml'}->{'format'}->{$name}) {
 553                 $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
 554                 return $data;
 555         }
 556
 557         $log->warn("no delimiter for format $name") if (! $delimiter);
 558
 559         my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
 560
 561         my @data = split(/\Q$delimiter\E/, $data);
 562
 563         my $out = sprintf($format, @data);
 564         $log->debug("using format $name [$format] on $data to produce: $out");
 565
 566         if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
 567                 return $self->lookup($out);
 568         } else {
 569                 return $out;
 570         }
 571
 572 }
 573
 574 =head2 sort_arr
 575
 576 Sort array ignoring case and html in data
 577
 578  my @sorted = $webpac->sort_arr(@unsorted);
 579
 580 =cut
 581
 582 sub sort_arr {
 583         my $self = shift;
 584
 585         my $log = $self->_get_logger();
 586
 587         # FIXME add Schwartzian Transformation?
 588
 589         my @sorted = sort {
 590                 $a =~ s#<[^>]+/*>##;
 591                 $b =~ s#<[^>]+/*>##;
 592                 lc($b) cmp lc($a)
 593         } @_;
 594         $log->debug("sorted values: ",sub { join(", ",@sorted) });
 595
 596         return @sorted;
 597 }
 598
 599
 600 =head1 INTERNAL METHODS
 601
 602 =head2 _sort_by_order
 603
 604 Sort xml tags data structure accoding to C<order=""> attribute.
 605
 606 =cut
 607
 608 sub _sort_by_order {
 609         my $self = shift;
 610
 611         my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
 612                 $self->{'import_xml'}->{'indexer'}->{$a};
 613         my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
 614                 $self->{'import_xml'}->{'indexer'}->{$b};
 615
 616         return $va <=> $vb;
 617 }
 618
 619 =head2 _x
 620
 621 Convert strings from C<conf/normalize/*.xml> encoding into application
 622 specific encoding (optinally specified using C<code_page> to C<new>
 623 constructor).
 624
 625  my $text = $n->_x('normalize text string');
 626
 627 This is a stub so that other modules doesn't have to implement it.
 628
 629 =cut
 630
 631 sub _x {
 632         my $self = shift;
 633         return shift;
 634 }
 635
 636
 637 =head1 AUTHOR
 638
 639 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 640
 641 =head1 COPYRIGHT & LICENSE
 642
 643 Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
 644
 645 This program is free software; you can redistribute it and/or modify it
 646 under the same terms as Perl itself.
 647
 648 =cut
 649
 650 1; # End of WebPAC::DB