git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2 use Exporter 'import';
   3 @EXPORT = qw/
   4         _set_rec _set_lookup
   5         _set_load_row
   6         _get_ds _clean_ds
   7         _debug
   8         _pack_subfields_hash
   9
  10         tag search display
  11         marc marc_indicators marc_repeatable_subfield
  12         marc_compose marc_leader
  13         marc_duplicate marc_remove
  14         marc_original_order
  15
  16         rec1 rec2 rec
  17         regex prefix suffix surround
  18         first lookup join_with
  19         save_into_lookup
  20
  21         split_rec_on
  22 /;
  23
  24 use warnings;
  25 use strict;
  26
  27 #use base qw/WebPAC::Common/;
  28 use Data::Dump qw/dump/;
  29 use Storable qw/dclone/;
  30 use Carp qw/confess/;
  31
  32 # debugging warn(s)
  33 my $debug = 0;
  34
  35
  36 =head1 NAME
  37
  38 WebPAC::Normalize - describe normalisaton rules using sets
  39
  40 =head1 VERSION
  41
  42 Version 0.23
  43
  44 =cut
  45
  46 our $VERSION = '0.23';
  47
  48 =head1 SYNOPSIS
  49
  50 This module uses C<conf/normalize/*.pl> files to perform normalisation
  51 from input records using perl functions which are specialized for set
  52 processing.
  53
  54 Sets are implemented as arrays, and normalisation file is valid perl, which
  55 means that you check it's validity before running WebPAC using
  56 C<perl -c normalize.pl>.
  57
  58 Normalisation can generate multiple output normalized data. For now, supported output
  59 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
  60 C<marc>.
  61
  62 =head1 FUNCTIONS
  63
  64 Functions which start with C<_> are private and used by WebPAC internally.
  65 All other functions are available for use within normalisation rules.
  66
  67 =head2 data_structure
  68
  69 Return data structure
  70
  71   my $ds = WebPAC::Normalize::data_structure(
  72         lookup => $lookup_hash,
  73         row => $row,
  74         rules => $normalize_pl_config,
  75         marc_encoding => 'utf-8',
  76         config => $config,
  77         load_row_coderef => sub {
  78                 my ($database,$input,$mfn) = shift;
  79                 $store->load_row( database => $database, input => $input, id => $mfn );
  80         },
  81   );
  82
  83 Options C<row>, C<rules> and C<log> are mandatory while all
  84 other are optional.
  85
  86 C<load_row_coderef> is closure only used when executing lookups, so they will
  87 die if it's not defined.
  88
  89 This function will B<die> if normalizastion can't be evaled.
  90
  91 Since this function isn't exported you have to call it with
  92 C<WebPAC::Normalize::data_structure>.
  93
  94 =cut
  95
  96 my $load_row_coderef;
  97
  98 sub data_structure {
  99         my $arg = {@_};
 100
 101         die "need row argument" unless ($arg->{row});
 102         die "need normalisation argument" unless ($arg->{rules});
 103
 104         no strict 'subs';
 105         _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
 106         _set_rec( $arg->{row} );
 107         _set_config( $arg->{config} ) if defined($arg->{config});
 108         _clean_ds( %{ $arg } );
 109         $load_row_coderef = $arg->{load_row_coderef};
 110
 111         eval "$arg->{rules}";
 112         die "error evaling $arg->{rules}: $@\n" if ($@);
 113
 114         return _get_ds();
 115 }
 116
 117 =head2 _set_rec
 118
 119 Set current record hash
 120
 121   _set_rec( $rec );
 122
 123 =cut
 124
 125 my $rec;
 126
 127 sub _set_rec {
 128         $rec = shift or die "no record hash";
 129 }
 130
 131 =head2 _set_config
 132
 133 Set current config hash
 134
 135   _set_config( $config );
 136
 137 Magic keys are:
 138
 139 =over 4
 140
 141 =item _
 142
 143 Code of current database
 144
 145 =item _mfn
 146
 147 Current MFN
 148
 149 =back
 150
 151 =cut
 152
 153 my $config;
 154
 155 sub _set_config {
 156         $config = shift;
 157 }
 158
 159 =head2 _get_ds
 160
 161 Return hash formatted as data structure
 162
 163   my $ds = _get_ds();
 164
 165 =cut
 166
 167 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader);
 168 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
 169
 170 sub _get_ds {
 171         return $out;
 172 }
 173
 174 =head2 _clean_ds
 175
 176 Clean data structure hash for next record
 177
 178   _clean_ds();
 179
 180 =cut
 181
 182 sub _clean_ds {
 183         my $a = {@_};
 184         ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $leader) = ();
 185         ($marc_record_offset, $marc_fetch_offset) = (0,0);
 186         $marc_encoding = $a->{marc_encoding};
 187 }
 188
 189 =head2 _set_lookup
 190
 191 Set current lookup hash
 192
 193   _set_lookup( $lookup );
 194
 195 =cut
 196
 197 my $lookup;
 198
 199 sub _set_lookup {
 200         $lookup = shift;
 201 }
 202
 203 =head2 _get_lookup
 204
 205 Get current lookup hash
 206
 207   my $lookup = _get_lookup();
 208
 209 =cut
 210
 211 sub _get_lookup {
 212         return $lookup;
 213 }
 214
 215 =head2 _set_load_row
 216
 217 Setup code reference which will return L<data_structure> from
 218 L<WebPAC::Store>
 219
 220   _set_load_row(sub {
 221                 my ($database,$input,$mfn) = @_;
 222                 $store->load_row( database => $database, input => $input, id => $mfn );
 223   });
 224
 225 =cut
 226
 227 sub _set_load_row {
 228         my $coderef = shift;
 229         confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
 230
 231         $load_row_coderef = $coderef;
 232 }
 233
 234 =head2 _get_marc_fields
 235
 236 Get all fields defined by calls to C<marc>
 237
 238         $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
 239
 240 We are using I<magic> which detect repeatable fields only from
 241 sequence of field/subfield data generated by normalization.
 242
 243 Repeatable field is created when there is second occurence of same subfield or
 244 if any of indicators are different.
 245
 246 This is sane for most cases. Something like:
 247
 248   900a-1 900b-1 900c-1
 249   900a-2 900b-2
 250   900a-3
 251
 252 will be created from any combination of:
 253
 254   900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
 255
 256 and following rules:
 257
 258   marc('900','a', rec('200','a') );
 259   marc('900','b', rec('200','b') );
 260   marc('900','c', rec('200','c') );
 261
 262 which might not be what you have in mind. If you need repeatable subfield,
 263 define it using C<marc_repeatable_subfield> like this:
 264
 265   marc_repeatable_subfield('900','a');
 266   marc('900','a', rec('200','a') );
 267   marc('900','b', rec('200','b') );
 268   marc('900','c', rec('200','c') );
 269
 270 will create:
 271
 272   900a-1 900a-2 900a-3 900b-1 900c-1
 273   900b-2
 274
 275 There is also support for returning next or specific using:
 276
 277   while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
 278         # do something with $mf
 279   }
 280
 281 will always return fields from next MARC record or
 282
 283   my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
 284
 285 will return 42th copy record (if it exists).
 286
 287 =cut
 288
 289 sub _get_marc_fields {
 290
 291         my $arg = {@_};
 292         warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
 293         my $offset = $marc_fetch_offset;
 294         if ($arg->{offset}) {
 295                 $offset = $arg->{offset};
 296         } elsif($arg->{fetch_next}) {
 297                 $marc_fetch_offset++;
 298         }
 299
 300         return if (! $marc_record || ref($marc_record) ne 'ARRAY');
 301
 302         warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
 303
 304         my $marc_rec = $marc_record->[ $offset ];
 305
 306         warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
 307
 308         return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
 309
 310         # first, sort all existing fields
 311         # XXX might not be needed, but modern perl might randomize elements in hash
 312         my @sorted_marc_record = sort {
 313                 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
 314         } @{ $marc_rec };
 315
 316         @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
 317
 318         # output marc fields
 319         my @m;
 320
 321         # count unique field-subfields (used for offset when walking to next subfield)
 322         my $u;
 323         map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;
 324
 325         if ($debug) {
 326                 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
 327                 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
 328                 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
 329                 warn "## subfield count = ", dump( $u ), $/;
 330         }
 331
 332         my $len = $#sorted_marc_record;
 333         my $visited;
 334         my $i = 0;
 335         my $field;
 336
 337         foreach ( 0 .. $len ) {
 338
 339                 # find next element which isn't visited
 340                 while ($visited->{$i}) {
 341                         $i = ($i + 1) % ($len + 1);
 342                 }
 343
 344                 # mark it visited
 345                 $visited->{$i}++;
 346
 347                 my $row = dclone( $sorted_marc_record[$i] );
 348
 349                 # field and subfield which is key for
 350                 # marc_repeatable_subfield and u
 351                 my $fsf = $row->[0] . ( $row->[3] || '' );
 352
 353                 if ($debug > 1) {
 354
 355                         print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
 356                         print "### this [$i]: ", dump( $row ),$/;
 357                         print "### sf: ", $row->[3], " vs ", $field->[3],
 358                                 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
 359                                 if ($#$field >= 0);
 360
 361                 }
 362
 363                 # if field exists
 364                 if ( $#$field >= 0 ) {
 365                         if (
 366                                 $row->[0] ne $field->[0] ||             # field
 367                                 $row->[1] ne $field->[1] ||             # i1
 368                                 $row->[2] ne $field->[2]                # i2
 369                         ) {
 370                                 push @m, $field;
 371                                 warn "## saved/1 ", dump( $field ),$/ if ($debug);
 372                                 $field = $row;
 373
 374                         } elsif (
 375                                 ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
 376                                 ||
 377                                 ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
 378                                         ! $marc_repeatable_subfield->{ $fsf }
 379                                 )
 380                         ) {
 381                                 push @m, $field;
 382                                 warn "## saved/2 ", dump( $field ),$/ if ($debug);
 383                                 $field = $row;
 384
 385                         } else {
 386                                 # append new subfields to existing field
 387                                 push @$field, ( $row->[3], $row->[4] );
 388                         }
 389                 } else {
 390                         # insert first field
 391                         $field = $row;
 392                 }
 393
 394                 if (! $marc_repeatable_subfield->{ $fsf }) {
 395                         # make step to next subfield
 396                         $i = ($i + $u->{ $fsf } ) % ($len + 1);
 397                 }
 398         }
 399
 400         if ($#$field >= 0) {
 401                 push @m, $field;
 402                 warn "## saved/3 ", dump( $field ),$/ if ($debug);
 403         }
 404
 405         return \@m;
 406 }
 407
 408 =head2 _debug
 409
 410 Change level of debug warnings
 411
 412   _debug( 2 );
 413
 414 =cut
 415
 416 sub _debug {
 417         my $l = shift;
 418         return $debug unless defined($l);
 419         warn "debug level $l",$/ if ($l > 0);
 420         $debug = $l;
 421 }
 422
 423 =head1 Functions to create C<data_structure>
 424
 425 Those functions generally have to first in your normalization file.
 426
 427 =head2 tag
 428
 429 Define new tag for I<search> and I<display>.
 430
 431   tag('Title', rec('200','a') );
 432
 433
 434 =cut
 435
 436 sub tag {
 437         my $name = shift or die "tag needs name as first argument";
 438         my @o = grep { defined($_) && $_ ne '' } @_;
 439         return unless (@o);
 440         $out->{$name}->{tag} = $name;
 441         $out->{$name}->{search} = \@o;
 442         $out->{$name}->{display} = \@o;
 443 }
 444
 445 =head2 display
 446
 447 Define tag just for I<display>
 448
 449   @v = display('Title', rec('200','a') );
 450
 451 =cut
 452
 453 sub display {
 454         my $name = shift or die "display needs name as first argument";
 455         my @o = grep { defined($_) && $_ ne '' } @_;
 456         return unless (@o);
 457         $out->{$name}->{tag} = $name;
 458         $out->{$name}->{display} = \@o;
 459 }
 460
 461 =head2 search
 462
 463 Prepare values just for I<search>
 464
 465   @v = search('Title', rec('200','a') );
 466
 467 =cut
 468
 469 sub search {
 470         my $name = shift or die "search needs name as first argument";
 471         my @o = grep { defined($_) && $_ ne '' } @_;
 472         return unless (@o);
 473         $out->{$name}->{tag} = $name;
 474         $out->{$name}->{search} = \@o;
 475 }
 476
 477 =head2 marc_leader
 478
 479 Setup fields within MARC leader or get leader
 480
 481   marc_leader('05','c');
 482   my $leader = marc_leader();
 483
 484 =cut
 485
 486 sub marc_leader {
 487         my ($offset,$value) = @_;
 488
 489         if ($offset) {
 490                 $leader->{ $offset } = $value;
 491         } else {
 492                 return $leader;
 493         }
 494 }
 495
 496 =head2 marc
 497
 498 Save value for MARC field
 499
 500   marc('900','a', rec('200','a') );
 501   marc('001', rec('000') );
 502
 503 =cut
 504
 505 sub marc {
 506         my $f = shift or die "marc needs field";
 507         die "marc field must be numer" unless ($f =~ /^\d+$/);
 508
 509         my $sf;
 510         if ($f >= 10) {
 511                 $sf = shift or die "marc needs subfield";
 512         }
 513
 514         foreach (@_) {
 515                 my $v = $_;             # make var read-write for Encode
 516                 next unless (defined($v) && $v !~ /^\s*$/);
 517                 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 518                 if (defined $sf) {
 519                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
 520                 } else {
 521                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
 522                 }
 523         }
 524 }
 525
 526 =head2 marc_repeatable_subfield
 527
 528 Save values for MARC repetable subfield
 529
 530   marc_repeatable_subfield('910', 'z', rec('909') );
 531
 532 =cut
 533
 534 sub marc_repeatable_subfield {
 535         my ($f,$sf) = @_;
 536         die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
 537         $marc_repeatable_subfield->{ $f . $sf }++;
 538         marc(@_);
 539 }
 540
 541 =head2 marc_indicators
 542
 543 Set both indicators for MARC field
 544
 545   marc_indicators('900', ' ', 1);
 546
 547 Any indicator value other than C<0-9> will be treated as undefined.
 548
 549 =cut
 550
 551 sub marc_indicators {
 552         my $f = shift || die "marc_indicators need field!\n";
 553         my ($i1,$i2) = @_;
 554         die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
 555         die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
 556
 557         $i1 = ' ' if ($i1 !~ /^\d$/);
 558         $i2 = ' ' if ($i2 !~ /^\d$/);
 559         @{ $marc_indicators->{$f} } = ($i1,$i2);
 560 }
 561
 562 =head2 marc_compose
 563
 564 Save values for each MARC subfield explicitly
 565
 566   marc_compose('900',
 567         'a', rec('200','a')
 568         'b', rec('201','a')
 569         'a', rec('200','b')
 570         'c', rec('200','c')
 571   );
 572
 573 If you specify C<+> for subfield, value will be appended
 574 to previous defined subfield.
 575
 576 =cut
 577
 578 sub marc_compose {
 579         my $f = shift or die "marc_compose needs field";
 580         die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
 581
 582         my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 583         my $m = [ $f, $i1, $i2 ];
 584
 585         warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
 586
 587         if ($#_ % 2 != 1) {
 588                 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
 589         }
 590
 591         while (@_) {
 592                 my $sf = shift;
 593                 my $v = shift;
 594
 595                 next unless (defined($v) && $v !~ /^\s*$/);
 596                 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
 597                 if ($sf ne '+') {
 598                         push @$m, ( $sf, $v );
 599                 } else {
 600                         $m->[ $#$m ] .= $v;
 601                 }
 602         }
 603
 604         warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
 605
 606         push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
 607 }
 608
 609 =head2 marc_duplicate
 610
 611 Generate copy of current MARC record and continue working on copy
 612
 613   marc_duplicate();
 614
 615 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
 616 C<< _get_marc_fields( offset => 42 ) >>.
 617
 618 =cut
 619
 620 sub marc_duplicate {
 621          my $m = $marc_record->[ -1 ];
 622          die "can't duplicate record which isn't defined" unless ($m);
 623          push @{ $marc_record }, dclone( $m );
 624          warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
 625          $marc_record_offset = $#{ $marc_record };
 626          warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
 627 }
 628
 629 =head2 marc_remove
 630
 631 Remove some field or subfield from MARC record.
 632
 633   marc_remove('200');
 634   marc_remove('200','a');
 635
 636 This will erase field C<200> or C<200^a> from current MARC record.
 637
 638 This is useful after calling C<marc_duplicate> or on it's own (but, you
 639 should probably just remove that subfield definition if you are not
 640 using C<marc_duplicate>).
 641
 642 FIXME: support fields < 10.
 643
 644 =cut
 645
 646 sub marc_remove {
 647         my ($f, $sf) = @_;
 648
 649         die "marc_remove needs record number" unless defined($f);
 650
 651         my $marc = $marc_record->[ $marc_record_offset ];
 652
 653         warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
 654
 655         my $i = 0;
 656         foreach ( 0 .. $#{ $marc } ) {
 657                 last unless (defined $marc->[$i]);
 658                 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
 659                 if ($marc->[$i]->[0] eq $f) {
 660                         if (! defined $sf) {
 661                                 # remove whole field
 662                                 splice @$marc, $i, 1;
 663                                 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
 664                                 $i--;
 665                         } else {
 666                                 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
 667                                         my $o = ($j * 2) + 3;
 668                                         if ($marc->[$i]->[$o] eq $sf) {
 669                                                 # remove subfield
 670                                                 splice @{$marc->[$i]}, $o, 2;
 671                                                 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
 672                                                 # is record now empty?
 673                                                 if ($#{ $marc->[$i] } == 2) {
 674                                                         splice @$marc, $i, 1;
 675                                                         warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
 676                                                         $i--;
 677                                                 };
 678                                         }
 679                                 }
 680                         }
 681                 }
 682                 $i++;
 683         }
 684
 685         warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
 686
 687         $marc_record->[ $marc_record_offset ] = $marc;
 688
 689         warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
 690 }
 691
 692 =head2 marc_original_order
 693
 694 Copy all subfields preserving original order to marc field.
 695
 696   marc_original_order( marc_field_number, original_input_field_number );
 697
 698 Please note that field numbers are consistent with other commands (marc
 699 field number first), but somewhat counter-intuitive (destination and then
 700 source).
 701
 702 You might want to use this command if you are just renaming subfields or
 703 using pre-processing modify_record in C<config.yml> and don't need any
 704 post-processing or want to preserve order of original subfields.
 705
 706
 707 =cut
 708
 709 sub marc_original_order {
 710
 711         my ($to, $from) = @_;
 712         die "marc_original_order needs from and to fields\n" unless ($from && $to);
 713
 714         return unless defined($rec->{$from});
 715
 716         my $r = $rec->{$from};
 717         die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
 718
 719         my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
 720         warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
 721
 722         foreach my $d (@$r) {
 723
 724                 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
 725                         warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
 726                         next;
 727                 }
 728
 729                 my @sfs = @{ $d->{subfields} };
 730
 731                 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
 732
 733                 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
 734
 735                 my $m = [ $to, $i1, $i2 ];
 736
 737                 while (my $sf = shift @sfs) {
 738
 739                         warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
 740                         my $offset = shift @sfs;
 741                         die "corrupted sufields specification for field $from\n" unless defined($offset);
 742
 743                         my $v;
 744                         if (ref($d->{$sf}) eq 'ARRAY') {
 745                                 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
 746                         } elsif ($offset == 0) {
 747                                 $v = $d->{$sf};
 748                         } else {
 749                                 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
 750                         }
 751                         push @$m, ( $sf, $v ) if (defined($v));
 752                 }
 753
 754                 if ($#{$m} > 2) {
 755                         push @{ $marc_record->[ $marc_record_offset ] }, $m;
 756                 }
 757         }
 758
 759         warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
 760 }
 761
 762
 763 =head1 Functions to extract data from input
 764
 765 This function should be used inside functions to create C<data_structure> described
 766 above.
 767
 768 =head2 _pack_subfields_hash
 769
 770  @subfields = _pack_subfields_hash( $h );
 771  $subfields = _pack_subfields_hash( $h, 1 );
 772
 773 Return each subfield value in array or pack them all together and return scalar
 774 with subfields (denoted by C<^>) and values.
 775
 776 =cut
 777
 778 sub _pack_subfields_hash {
 779
 780         warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
 781
 782         my ($h,$include_subfields) = @_;
 783
 784         if ( defined($h->{subfields}) ) {
 785                 my $sfs = delete $h->{subfields} || die "no subfields?";
 786                 my @out;
 787                 while (@$sfs) {
 788                         my $sf = shift @$sfs;
 789                         push @out, '^' . $sf if ($include_subfields);
 790                         my $o = shift @$sfs;
 791                         if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
 792                                 # single element subfields are not arrays
 793 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 794
 795                                 push @out, $h->{$sf};
 796                         } else {
 797 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 798                                 push @out, $h->{$sf}->[$o];
 799                         }
 800                 }
 801                 if ($include_subfields) {
 802                         return join('', @out);
 803                 } else {
 804                         return @out;
 805                 }
 806         } else {
 807                 if ($include_subfields) {
 808                         my $out = '';
 809                         foreach my $sf (sort keys %$h) {
 810                                 if (ref($h->{$sf}) eq 'ARRAY') {
 811                                         $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
 812                                 } else {
 813                                         $out .= '^' . $sf . $h->{$sf};
 814                                 }
 815                         }
 816                         return $out;
 817                 } else {
 818                         # FIXME this should probably be in alphabetical order instead of hash order
 819                         values %{$h};
 820                 }
 821         }
 822 }
 823
 824 =head2 rec1
 825
 826 Return all values in some field
 827
 828   @v = rec1('200')
 829
 830 TODO: order of values is probably same as in source data, need to investigate that
 831
 832 =cut
 833
 834 sub rec1 {
 835         my $f = shift;
 836         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 837         return unless (defined($rec) && defined($rec->{$f}));
 838         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 839         if (ref($rec->{$f}) eq 'ARRAY') {
 840                 my @out;
 841                 foreach my $h ( @{ $rec->{$f} } ) {
 842                         if (ref($h) eq 'HASH') {
 843                                 push @out, ( _pack_subfields_hash( $h ) );
 844                         } else {
 845                                 push @out, $h;
 846                         }
 847                 }
 848                 return @out;
 849         } elsif( defined($rec->{$f}) ) {
 850                 return $rec->{$f};
 851         }
 852 }
 853
 854 =head2 rec2
 855
 856 Return all values in specific field and subfield
 857
 858   @v = rec2('200','a')
 859
 860 =cut
 861
 862 sub rec2 {
 863         my $f = shift;
 864         return unless (defined($rec && $rec->{$f}));
 865         my $sf = shift;
 866         warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 867         return map {
 868                 if (ref($_->{$sf}) eq 'ARRAY') {
 869                         @{ $_->{$sf} };
 870                 } else {
 871                         $_->{$sf};
 872                 }
 873         } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
 874 }
 875
 876 =head2 rec
 877
 878 syntaxtic sugar for
 879
 880   @v = rec('200')
 881   @v = rec('200','a')
 882
 883 If rec() returns just single value, it will
 884 return scalar, not array.
 885
 886 =cut
 887
 888 sub rec {
 889         my @out;
 890         if ($#_ == 0) {
 891                 @out = rec1(@_);
 892         } elsif ($#_ == 1) {
 893                 @out = rec2(@_);
 894         }
 895         if ($#out == 0 && ! wantarray) {
 896                 return $out[0];
 897         } elsif (@out) {
 898                 return @out;
 899         } else {
 900                 return '';
 901         }
 902 }
 903
 904 =head2 regex
 905
 906 Apply regex to some or all values
 907
 908   @v = regex( 's/foo/bar/g', @v );
 909
 910 =cut
 911
 912 sub regex {
 913         my $r = shift;
 914         my @out;
 915         #warn "r: $r\n", dump(\@_);
 916         foreach my $t (@_) {
 917                 next unless ($t);
 918                 eval "\$t =~ $r";
 919                 push @out, $t if ($t && $t ne '');
 920         }
 921         return @out;
 922 }
 923
 924 =head2 prefix
 925
 926 Prefix all values with a string
 927
 928   @v = prefix( 'my_', @v );
 929
 930 =cut
 931
 932 sub prefix {
 933         my $p = shift or return;
 934         return map { $p . $_ } grep { defined($_) } @_;
 935 }
 936
 937 =head2 suffix
 938
 939 suffix all values with a string
 940
 941   @v = suffix( '_my', @v );
 942
 943 =cut
 944
 945 sub suffix {
 946         my $s = shift or die "suffix needs string as first argument";
 947         return map { $_ . $s } grep { defined($_) } @_;
 948 }
 949
 950 =head2 surround
 951
 952 surround all values with a two strings
 953
 954   @v = surround( 'prefix_', '_suffix', @v );
 955
 956 =cut
 957
 958 sub surround {
 959         my $p = shift or die "surround need prefix as first argument";
 960         my $s = shift or die "surround needs suffix as second argument";
 961         return map { $p . $_ . $s } grep { defined($_) } @_;
 962 }
 963
 964 =head2 first
 965
 966 Return first element
 967
 968   $v = first( @v );
 969
 970 =cut
 971
 972 sub first {
 973         my $r = shift;
 974         return $r;
 975 }
 976
 977 =head2 lookup
 978
 979 Consult lookup hashes for some value
 980
 981   @v = lookup(
 982         sub {
 983                 'ffkk/peri/mfn'.rec('000')
 984         },
 985         'ffkk','peri','200-a-200-e',
 986         sub {
 987                 first(rec(200,'a')).' '.first(rec('200','e'))
 988         }
 989   );
 990
 991 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
 992 normal lookup definition in C<conf/lookup/something.pl> which looks like:
 993
 994   lookup(
 995         # which results to return from record recorded in lookup
 996         sub { 'ffkk/peri/mfn' . rec('000') },
 997         # from which database and input
 998         'ffkk','peri',
 999         # such that following values match
1000         sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1001         # if this part is missing, we will try to match same fields
1002         # from lookup record and current one, or you can override
1003         # which records to use from current record using
1004         sub { rec('900','x') . ' ' . rec('900','y') },
1005   )
1006
1007 You can think about this lookup as SQL (if that helps):
1008
1009   select
1010         sub { what }
1011   from
1012         database, input
1013   where
1014     sub { filter from lookuped record }
1015   having
1016     sub { optional filter on current record }
1017
1018 Easy as pie, right?
1019
1020 =cut
1021
1022 sub lookup {
1023         my ($what, $database, $input, $key, $having) = @_;
1024
1025         confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1026
1027         warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1028         return unless (defined($lookup->{$database}->{$input}->{$key}));
1029
1030         confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1031
1032         my $mfns;
1033         my @having = $having->();
1034
1035         warn "## having = ", dump( @having ) if ($debug > 2);
1036
1037         foreach my $h ( @having ) {
1038                 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1039                         warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1040                         $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1041                 }
1042         }
1043
1044         return unless ($mfns);
1045
1046         my @mfns = sort keys %$mfns;
1047
1048         warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1049
1050         my $old_rec = $rec;
1051         my @out;
1052
1053         foreach my $mfn (@mfns) {
1054                 $rec = $load_row_coderef->( $database, $input, $mfn );
1055
1056                 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1057
1058                 my @vals = $what->();
1059
1060                 push @out, ( @vals );
1061
1062                 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1063         }
1064
1065 #       if (ref($lookup->{$k}) eq 'ARRAY') {
1066 #               return @{ $lookup->{$k} };
1067 #       } else {
1068 #               return $lookup->{$k};
1069 #       }
1070
1071         $rec = $old_rec;
1072
1073         warn "## lookup returns = ", dump(@out), $/ if ($debug);
1074
1075         if ($#out == 0) {
1076                 return $out[0];
1077         } else {
1078                 return @out;
1079         }
1080 }
1081
1082 =head2 save_into_lookup
1083
1084 Save value into lookup. It associates current database, input
1085 and specific keys with one or more values which will be
1086 associated over MFN.
1087
1088 MFN will be extracted from first occurence current of field 000
1089 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1090
1091   my $nr = save_into_lookup($database,$input,$key,sub {
1092         # code which produce one or more values
1093   });
1094
1095 It returns number of items saved.
1096
1097 This function shouldn't be called directly, it's called from code created by
1098 L<WebPAC::Parser>.
1099
1100 =cut
1101
1102 sub save_into_lookup {
1103         my ($database,$input,$key,$coderef) = @_;
1104         die "save_into_lookup needs database" unless defined($database);
1105         die "save_into_lookup needs input" unless defined($input);
1106         die "save_into_lookup needs key" unless defined($key);
1107         die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1108
1109         warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1110
1111         my $mfn =
1112                 defined($rec->{'000'}->[0])     ?       $rec->{'000'}->[0]      :
1113                 defined($config->{_mfn})        ?       $config->{_mfn}         :
1114                                                                                 die "mfn not defined or zero";
1115
1116         my $nr = 0;
1117
1118         foreach my $v ( $coderef->() ) {
1119                 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1120                 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1121                 $nr++;
1122         }
1123
1124         return $nr;
1125 }
1126
1127 =head2 config
1128
1129 Consult config values stored in C<config.yml>
1130
1131   # return database code (key under databases in yaml)
1132   $database_code = config();    # use _ from hash
1133   $database_name = config('name');
1134   $database_input_name = config('input name');
1135   $tag = config('input normalize tag');
1136
1137 Up to three levels are supported.
1138
1139 =cut
1140
1141 sub config {
1142         return unless ($config);
1143
1144         my $p = shift;
1145
1146         $p ||= '';
1147
1148         my $v;
1149
1150         warn "### getting config($p)\n" if ($debug > 1);
1151
1152         my @p = split(/\s+/,$p);
1153         if ($#p < 0) {
1154                 $v = $config->{ '_' };  # special, database code
1155         } else {
1156
1157                 my $c = dclone( $config );
1158
1159                 foreach my $k (@p) {
1160                         warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1161                         if (ref($c) eq 'ARRAY') {
1162                                 $c = shift @$c;
1163                                 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1164                                 last;
1165                         }
1166
1167                         if (! defined($c->{$k}) ) {
1168                                 $c = undef;
1169                                 last;
1170                         } else {
1171                                 $c = $c->{$k};
1172                         }
1173                 }
1174                 $v = $c if ($c);
1175
1176         }
1177
1178         warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1179         warn "config( '$p' ) is empty\n" if (! $v);
1180
1181         return $v;
1182 }
1183
1184 =head2 id
1185
1186 Returns unique id of this record
1187
1188   $id = id();
1189
1190 Returns C<42/2> for 2nd occurence of MFN 42.
1191
1192 =cut
1193
1194 sub id {
1195         my $mfn = $config->{_mfn} || die "no _mfn in config data";
1196         return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1197 }
1198
1199 =head2 join_with
1200
1201 Joins walues with some delimiter
1202
1203   $v = join_with(", ", @v);
1204
1205 =cut
1206
1207 sub join_with {
1208         my $d = shift;
1209         warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1210         my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1211         return '' unless defined($v);
1212         return $v;
1213 }
1214
1215 =head2 split_rec_on
1216
1217 Split record subfield on some regex and take one of parts out
1218
1219   $a_before_semi_column =
1220         split_rec_on('200','a', /\s*;\s*/, $part);
1221
1222 C<$part> is optional number of element. First element is
1223 B<1>, not 0!
1224
1225 If there is no C<$part> parameter or C<$part> is 0, this function will
1226 return all values produced by splitting.
1227
1228 =cut
1229
1230 sub split_rec_on {
1231         die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1232
1233         my ($fld, $sf, $regex, $part) = @_;
1234         warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1235
1236         my @r = rec( $fld, $sf );
1237         my $v = shift @r;
1238         warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1239
1240         return '' if ( ! defined($v) || $v =~ /^\s*$/);
1241
1242         my @s = split( $regex, $v );
1243         warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1244         if ($part && $part > 0) {
1245                 return $s[ $part - 1 ];
1246         } else {
1247                 return @s;
1248         }
1249 }
1250
1251 # END
1252 1;