git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2 use Exporter 'import';
   3 @EXPORT = qw/
   4         _set_rec _set_lookup
   5         _get_ds _clean_ds
   6         _debug
   7
   8         tag search display
   9         marc marc_indicators marc_repeatable_subfield
  10         marc_compose marc_leader
  11
  12         rec1 rec2 rec
  13         regex prefix suffix surround
  14         first lookup join_with
  15
  16         split_rec_on
  17 /;
  18
  19 use warnings;
  20 use strict;
  21
  22 #use base qw/WebPAC::Common/;
  23 use Data::Dump qw/dump/;
  24 use Encode qw/from_to/;
  25
  26 # debugging warn(s)
  27 my $debug = 0;
  28
  29
  30 =head1 NAME
  31
  32 WebPAC::Normalize - describe normalisaton rules using sets
  33
  34 =head1 VERSION
  35
  36 Version 0.09
  37
  38 =cut
  39
  40 our $VERSION = '0.09';
  41
  42 =head1 SYNOPSIS
  43
  44 This module uses C<conf/normalize/*.pl> files to perform normalisation
  45 from input records using perl functions which are specialized for set
  46 processing.
  47
  48 Sets are implemented as arrays, and normalisation file is valid perl, which
  49 means that you check it's validity before running WebPAC using
  50 C<perl -c normalize.pl>.
  51
  52 Normalisation can generate multiple output normalized data. For now, supported output
  53 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
  54 C<marc>.
  55
  56 =head1 FUNCTIONS
  57
  58 Functions which start with C<_> are private and used by WebPAC internally.
  59 All other functions are available for use within normalisation rules.
  60
  61 =head2 data_structure
  62
  63 Return data structure
  64
  65   my $ds = WebPAC::Normalize::data_structure(
  66         lookup => $lookup->lookup_hash,
  67         row => $row,
  68         rules => $normalize_pl_config,
  69         marc_encoding => 'utf-8',
  70   );
  71
  72 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
  73 other are optional.
  74
  75 This function will B<die> if normalizastion can't be evaled.
  76
  77 Since this function isn't exported you have to call it with
  78 C<WebPAC::Normalize::data_structure>.
  79
  80 =cut
  81
  82 sub data_structure {
  83         my $arg = {@_};
  84
  85         die "need row argument" unless ($arg->{row});
  86         die "need normalisation argument" unless ($arg->{rules});
  87
  88         no strict 'subs';
  89         _set_lookup( $arg->{lookup} );
  90         _set_rec( $arg->{row} );
  91         _clean_ds( %{ $arg } );
  92         eval "$arg->{rules}";
  93         die "error evaling $arg->{rules}: $@\n" if ($@);
  94
  95         return _get_ds();
  96 }
  97
  98 =head2 _set_rec
  99
 100 Set current record hash
 101
 102   _set_rec( $rec );
 103
 104 =cut
 105
 106 my $rec;
 107
 108 sub _set_rec {
 109         $rec = shift or die "no record hash";
 110 }
 111
 112 =head2 _get_ds
 113
 114 Return hash formatted as data structure
 115
 116   my $ds = _get_ds();
 117
 118 =cut
 119
 120 my ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
 121
 122 sub _get_ds {
 123         return $out;
 124 }
 125
 126 =head2 _clean_ds
 127
 128 Clean data structure hash for next record
 129
 130   _clean_ds();
 131
 132 =cut
 133
 134 sub _clean_ds {
 135         my $a = {@_};
 136         ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
 137         $marc_encoding = $a->{marc_encoding};
 138 }
 139
 140 =head2 _set_lookup
 141
 142 Set current lookup hash
 143
 144   _set_lookup( $lookup );
 145
 146 =cut
 147
 148 my $lookup;
 149
 150 sub _set_lookup {
 151         $lookup = shift;
 152 }
 153
 154 =head2 _get_marc_fields
 155
 156 Get all fields defined by calls to C<marc>
 157
 158         $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
 159
 160 We are using I<magic> which detect repeatable fields only from
 161 sequence of field/subfield data generated by normalization.
 162
 163 Repeatable field is created when there is second occurence of same subfield or
 164 if any of indicators are different.
 165
 166 This is sane for most cases. Something like:
 167
 168   900a-1 900b-1 900c-1
 169   900a-2 900b-2
 170   900a-3
 171
 172 will be created from any combination of:
 173
 174   900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
 175
 176 and following rules:
 177
 178   marc('900','a', rec('200','a') );
 179   marc('900','b', rec('200','b') );
 180   marc('900','c', rec('200','c') );
 181
 182 which might not be what you have in mind. If you need repeatable subfield,
 183 define it using C<marc_repeatable_subfield> like this:
 184
 185 ....
 186
 187 =cut
 188
 189 sub _get_marc_fields {
 190
 191         return if (! $marc_record || ref($marc_record) ne 'ARRAY' || $#{ $marc_record } < 0);
 192
 193         # first, sort all existing fields
 194         # XXX might not be needed, but modern perl might randomize elements in hash
 195         my @sorted_marc_record = sort {
 196                 $a->[0] . $a->[3] cmp $b->[0] . $b->[3]
 197         } @{ $marc_record };
 198
 199         @sorted_marc_record = @{ $marc_record };        ### FIXME disable sorting
 200
 201         # output marc fields
 202         my @m;
 203
 204         # count unique field-subfields (used for offset when walking to next subfield)
 205         my $u;
 206         map { $u->{ $_->[0] . $_->[3]  }++ } @sorted_marc_record;
 207
 208         if ($debug) {
 209                 warn "## marc_repeatable_subfield ", dump( $marc_repeatable_subfield ), $/;
 210                 warn "## marc_record ", dump( $marc_record ), $/;
 211                 warn "## sorted_marc_record ", dump( \@sorted_marc_record ), $/;
 212                 warn "## subfield count ", dump( $u ), $/;
 213         }
 214
 215         my $len = $#sorted_marc_record;
 216         my $visited;
 217         my $i = 0;
 218         my $field;
 219
 220         foreach ( 0 .. $len ) {
 221
 222                 # find next element which isn't visited
 223                 while ($visited->{$i}) {
 224                         $i = ($i + 1) % ($len + 1);
 225                 }
 226
 227                 # mark it visited
 228                 $visited->{$i}++;
 229
 230                 my $row = $sorted_marc_record[$i];
 231
 232                 # field and subfield which is key for
 233                 # marc_repeatable_subfield and u
 234                 my $fsf = $row->[0] . $row->[3];
 235
 236                 if ($debug > 1) {
 237
 238                         print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
 239                         print "### this [$i]: ", dump( $row ),$/;
 240                         print "### sf: ", $row->[3], " vs ", $field->[3],
 241                                 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
 242                                 if ($#$field >= 0);
 243
 244                 }
 245
 246                 # if field exists
 247                 if ( $#$field >= 0 ) {
 248                         if (
 249                                 $row->[0] ne $field->[0] ||             # field
 250                                 $row->[1] ne $field->[1] ||             # i1
 251                                 $row->[2] ne $field->[2]                # i2
 252                         ) {
 253                                 push @m, $field;
 254                                 warn "## saved/1 ", dump( $field ),$/ if ($debug);
 255                                 $field = $row;
 256
 257                         } elsif (
 258                                 ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
 259                                 ||
 260                                 ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
 261                                         ! $marc_repeatable_subfield->{ $fsf }
 262                                 )
 263                         ) {
 264                                 push @m, $field;
 265                                 warn "## saved/2 ", dump( $field ),$/ if ($debug);
 266                                 $field = $row;
 267
 268                         } else {
 269                                 # append new subfields to existing field
 270                                 push @$field, ( $row->[3], $row->[4] );
 271                         }
 272                 } else {
 273                         # insert first field
 274                         $field = $row;
 275                 }
 276
 277                 if (! $marc_repeatable_subfield->{ $fsf }) {
 278                         # make step to next subfield
 279                         $i = ($i + $u->{ $fsf } ) % ($len + 1);
 280                 }
 281         }
 282
 283         if ($#$field >= 0) {
 284                 push @m, $field;
 285                 warn "## saved/3 ", dump( $field ),$/ if ($debug);
 286         }
 287
 288         return @m;
 289 }
 290
 291 =head2 _debug
 292
 293 Change level of debug warnings
 294
 295   _debug( 2 );
 296
 297 =cut
 298
 299 sub _debug {
 300         my $l = shift;
 301         return $debug unless defined($l);
 302         warn "debug level $l" if ($l > 0);
 303         $debug = $l;
 304 }
 305
 306 =head1 Functions to create C<data_structure>
 307
 308 Those functions generally have to first in your normalization file.
 309
 310 =head2 tag
 311
 312 Define new tag for I<search> and I<display>.
 313
 314   tag('Title', rec('200','a') );
 315
 316
 317 =cut
 318
 319 sub tag {
 320         my $name = shift or die "tag needs name as first argument";
 321         my @o = grep { defined($_) && $_ ne '' } @_;
 322         return unless (@o);
 323         $out->{$name}->{tag} = $name;
 324         $out->{$name}->{search} = \@o;
 325         $out->{$name}->{display} = \@o;
 326 }
 327
 328 =head2 display
 329
 330 Define tag just for I<display>
 331
 332   @v = display('Title', rec('200','a') );
 333
 334 =cut
 335
 336 sub display {
 337         my $name = shift or die "display needs name as first argument";
 338         my @o = grep { defined($_) && $_ ne '' } @_;
 339         return unless (@o);
 340         $out->{$name}->{tag} = $name;
 341         $out->{$name}->{display} = \@o;
 342 }
 343
 344 =head2 search
 345
 346 Prepare values just for I<search>
 347
 348   @v = search('Title', rec('200','a') );
 349
 350 =cut
 351
 352 sub search {
 353         my $name = shift or die "search needs name as first argument";
 354         my @o = grep { defined($_) && $_ ne '' } @_;
 355         return unless (@o);
 356         $out->{$name}->{tag} = $name;
 357         $out->{$name}->{search} = \@o;
 358 }
 359
 360 =head2 marc_leader
 361
 362 Setup fields within MARC leader or get leader
 363
 364   marc_leader('05','c');
 365   my $leader = marc_leader();
 366
 367 =cut
 368
 369 sub marc_leader {
 370         my ($offset,$value) = @_;
 371
 372         if ($offset) {
 373                 $out->{' leader'}->{ $offset } = $value;
 374         } else {
 375                 return $out->{' leader'};
 376         }
 377 }
 378
 379 =head2 marc
 380
 381 Save value for MARC field
 382
 383   marc('900','a', rec('200','a') );
 384
 385 =cut
 386
 387 sub marc {
 388         my $f = shift or die "marc needs field";
 389         die "marc field must be numer" unless ($f =~ /^\d+$/);
 390
 391         my $sf = shift or die "marc needs subfield";
 392
 393         foreach (@_) {
 394                 my $v = $_;             # make var read-write for Encode
 395                 next unless (defined($v) && $v !~ /^\s*$/);
 396                 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
 397                 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 398                 push @{ $marc_record }, [ $f, $i1, $i2, $sf => $v ];
 399         }
 400 }
 401
 402 =head2 marc_repeatable_subfield
 403
 404 Save values for MARC repetable subfield
 405
 406   marc_repeatable_subfield('910', 'z', rec('909') );
 407
 408 =cut
 409
 410 sub marc_repeatable_subfield {
 411         my ($f,$sf) = @_;
 412         die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
 413         $marc_repeatable_subfield->{ $f . $sf }++;
 414         marc(@_);
 415 }
 416
 417 =head2 marc_indicators
 418
 419 Set both indicators for MARC field
 420
 421   marc_indicators('900', ' ', 1);
 422
 423 Any indicator value other than C<0-9> will be treated as undefined.
 424
 425 =cut
 426
 427 sub marc_indicators {
 428         my $f = shift || die "marc_indicators need field!\n";
 429         my ($i1,$i2) = @_;
 430         die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
 431         die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
 432
 433         $i1 = ' ' if ($i1 !~ /^\d$/);
 434         $i2 = ' ' if ($i2 !~ /^\d$/);
 435         @{ $marc_indicators->{$f} } = ($i1,$i2);
 436 }
 437
 438 =head2 marc_compose
 439
 440 Save values for each MARC subfield explicitly
 441
 442   marc_compose('900',
 443         'a', rec('200','a')
 444         'b', rec('201','a')
 445         'a', rec('200','b')
 446         'c', rec('200','c')
 447   );
 448
 449 =cut
 450
 451 sub marc_compose {
 452         my $f = shift or die "marc_compose needs field";
 453         die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
 454
 455         my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 456         my $m = [ $f, $i1, $i2 ];
 457
 458         while (@_) {
 459                 my $sf = shift or die "marc_compose $f needs subfield";
 460                 my $v = shift or die "marc_compose $f needs value for subfield $sf";
 461
 462                 next unless (defined($v) && $v !~ /^\s*$/);
 463                 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
 464                 push @$m, ( $sf, $v );
 465                 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ) if ($debug > 1);
 466         }
 467
 468         warn "## marc_compose(d) ", dump( $m ) if ($debug > 1);
 469
 470         push @{ $marc_record }, $m;
 471 }
 472
 473
 474 =head1 Functions to extract data from input
 475
 476 This function should be used inside functions to create C<data_structure> described
 477 above.
 478
 479 =head2 rec1
 480
 481 Return all values in some field
 482
 483   @v = rec1('200')
 484
 485 TODO: order of values is probably same as in source data, need to investigate that
 486
 487 =cut
 488
 489 sub rec1 {
 490         my $f = shift;
 491         return unless (defined($rec) && defined($rec->{$f}));
 492         if (ref($rec->{$f}) eq 'ARRAY') {
 493                 return map {
 494                         if (ref($_) eq 'HASH') {
 495                                 values %{$_};
 496                         } else {
 497                                 $_;
 498                         }
 499                 } @{ $rec->{$f} };
 500         } elsif( defined($rec->{$f}) ) {
 501                 return $rec->{$f};
 502         }
 503 }
 504
 505 =head2 rec2
 506
 507 Return all values in specific field and subfield
 508
 509   @v = rec2('200','a')
 510
 511 =cut
 512
 513 sub rec2 {
 514         my $f = shift;
 515         return unless (defined($rec && $rec->{$f}));
 516         my $sf = shift;
 517         return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
 518 }
 519
 520 =head2 rec
 521
 522 syntaxtic sugar for
 523
 524   @v = rec('200')
 525   @v = rec('200','a')
 526
 527 =cut
 528
 529 sub rec {
 530         if ($#_ == 0) {
 531                 return rec1(@_);
 532         } elsif ($#_ == 1) {
 533                 return rec2(@_);
 534         }
 535 }
 536
 537 =head2 regex
 538
 539 Apply regex to some or all values
 540
 541   @v = regex( 's/foo/bar/g', @v );
 542
 543 =cut
 544
 545 sub regex {
 546         my $r = shift;
 547         my @out;
 548         #warn "r: $r\n", dump(\@_);
 549         foreach my $t (@_) {
 550                 next unless ($t);
 551                 eval "\$t =~ $r";
 552                 push @out, $t if ($t && $t ne '');
 553         }
 554         return @out;
 555 }
 556
 557 =head2 prefix
 558
 559 Prefix all values with a string
 560
 561   @v = prefix( 'my_', @v );
 562
 563 =cut
 564
 565 sub prefix {
 566         my $p = shift or die "prefix needs string as first argument";
 567         return map { $p . $_ } grep { defined($_) } @_;
 568 }
 569
 570 =head2 suffix
 571
 572 suffix all values with a string
 573
 574   @v = suffix( '_my', @v );
 575
 576 =cut
 577
 578 sub suffix {
 579         my $s = shift or die "suffix needs string as first argument";
 580         return map { $_ . $s } grep { defined($_) } @_;
 581 }
 582
 583 =head2 surround
 584
 585 surround all values with a two strings
 586
 587   @v = surround( 'prefix_', '_suffix', @v );
 588
 589 =cut
 590
 591 sub surround {
 592         my $p = shift or die "surround need prefix as first argument";
 593         my $s = shift or die "surround needs suffix as second argument";
 594         return map { $p . $_ . $s } grep { defined($_) } @_;
 595 }
 596
 597 =head2 first
 598
 599 Return first element
 600
 601   $v = first( @v );
 602
 603 =cut
 604
 605 sub first {
 606         my $r = shift;
 607         return $r;
 608 }
 609
 610 =head2 lookup
 611
 612 Consult lookup hashes for some value
 613
 614   @v = lookup( $v );
 615   @v = lookup( @v );
 616
 617 =cut
 618
 619 sub lookup {
 620         my $k = shift or return;
 621         return unless (defined($lookup->{$k}));
 622         if (ref($lookup->{$k}) eq 'ARRAY') {
 623                 return @{ $lookup->{$k} };
 624         } else {
 625                 return $lookup->{$k};
 626         }
 627 }
 628
 629 =head2 join_with
 630
 631 Joins walues with some delimiter
 632
 633   $v = join_with(", ", @v);
 634
 635 =cut
 636
 637 sub join_with {
 638         my $d = shift;
 639         return join($d, grep { defined($_) && $_ ne '' } @_);
 640 }
 641
 642 =head2 split_rec_on
 643
 644 Split record subfield on some regex and take one of parts out
 645
 646   $a_before_semi_column =
 647         split_rec_on('200','a', /\s*;\s*/, $part);
 648
 649 C<$part> is optional number of element. First element is
 650 B<1>, not 0!
 651
 652 If there is no C<$part> parameter or C<$part> is 0, this function will
 653 return all values produced by splitting.
 654
 655 =cut
 656
 657 sub split_rec_on {
 658         die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
 659
 660         my ($fld, $sf, $regex, $part) = @_;
 661         warn "### regex ", ref($regex), $regex if ($debug > 2);
 662
 663         my @r = rec( $fld, $sf );
 664         my $v = shift @r;
 665         warn "### first rec($fld,$sf) = ",dump($v) if ($debug > 2);
 666
 667         my @s = split( $regex, $v );
 668         warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s) if ($debug > 1);
 669         if ($part > 0) {
 670                 return $s[ $part - 1 ];
 671         } else {
 672                 return @s;
 673         }
 674 }
 675
 676 # END
 677 1;