git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2 use Exporter 'import';
   3 @EXPORT = qw/
   4         _set_rec _set_lookup
   5         _set_load_row
   6         _get_ds _clean_ds
   7         _debug
   8         _pack_subfields_hash
   9
  10         tag search display
  11         marc marc_indicators marc_repeatable_subfield
  12         marc_compose marc_leader
  13         marc_duplicate marc_remove
  14         marc_original_order
  15
  16         rec1 rec2 rec
  17         regex prefix suffix surround
  18         first lookup join_with
  19         save_into_lookup
  20
  21         split_rec_on
  22
  23         get set
  24         count
  25 /;
  26
  27 use warnings;
  28 use strict;
  29
  30 #use base qw/WebPAC::Common/;
  31 use Data::Dump qw/dump/;
  32 use Storable qw/dclone/;
  33 use Carp qw/confess/;
  34
  35 # debugging warn(s)
  36 my $debug = 0;
  37
  38
  39 =head1 NAME
  40
  41 WebPAC::Normalize - describe normalisaton rules using sets
  42
  43 =head1 VERSION
  44
  45 Version 0.26
  46
  47 =cut
  48
  49 our $VERSION = '0.26';
  50
  51 =head1 SYNOPSIS
  52
  53 This module uses C<conf/normalize/*.pl> files to perform normalisation
  54 from input records using perl functions which are specialized for set
  55 processing.
  56
  57 Sets are implemented as arrays, and normalisation file is valid perl, which
  58 means that you check it's validity before running WebPAC using
  59 C<perl -c normalize.pl>.
  60
  61 Normalisation can generate multiple output normalized data. For now, supported output
  62 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
  63 C<marc>.
  64
  65 =head1 FUNCTIONS
  66
  67 Functions which start with C<_> are private and used by WebPAC internally.
  68 All other functions are available for use within normalisation rules.
  69
  70 =head2 data_structure
  71
  72 Return data structure
  73
  74   my $ds = WebPAC::Normalize::data_structure(
  75         lookup => $lookup_hash,
  76         row => $row,
  77         rules => $normalize_pl_config,
  78         marc_encoding => 'utf-8',
  79         config => $config,
  80         load_row_coderef => sub {
  81                 my ($database,$input,$mfn) = shift;
  82                 $store->load_row( database => $database, input => $input, id => $mfn );
  83         },
  84   );
  85
  86 Options C<row>, C<rules> and C<log> are mandatory while all
  87 other are optional.
  88
  89 C<load_row_coderef> is closure only used when executing lookups, so they will
  90 die if it's not defined.
  91
  92 This function will B<die> if normalizastion can't be evaled.
  93
  94 Since this function isn't exported you have to call it with
  95 C<WebPAC::Normalize::data_structure>.
  96
  97 =cut
  98
  99 my $load_row_coderef;
 100
 101 sub data_structure {
 102         my $arg = {@_};
 103
 104         die "need row argument" unless ($arg->{row});
 105         die "need normalisation argument" unless ($arg->{rules});
 106
 107         no strict 'subs';
 108         _set_lookup( $arg->{lookup} ) if defined($arg->{lookup});
 109         _set_rec( $arg->{row} );
 110         _set_config( $arg->{config} ) if defined($arg->{config});
 111         _clean_ds( %{ $arg } );
 112         $load_row_coderef = $arg->{load_row_coderef};
 113
 114         eval "$arg->{rules}";
 115         die "error evaling $arg->{rules}: $@\n" if ($@);
 116
 117         return _get_ds();
 118 }
 119
 120 =head2 _set_rec
 121
 122 Set current record hash
 123
 124   _set_rec( $rec );
 125
 126 =cut
 127
 128 my $rec;
 129
 130 sub _set_rec {
 131         $rec = shift or die "no record hash";
 132 }
 133
 134 =head2 _set_config
 135
 136 Set current config hash
 137
 138   _set_config( $config );
 139
 140 Magic keys are:
 141
 142 =over 4
 143
 144 =item _
 145
 146 Code of current database
 147
 148 =item _mfn
 149
 150 Current MFN
 151
 152 =back
 153
 154 =cut
 155
 156 my $config;
 157
 158 sub _set_config {
 159         $config = shift;
 160 }
 161
 162 =head2 _get_ds
 163
 164 Return hash formatted as data structure
 165
 166   my $ds = _get_ds();
 167
 168 =cut
 169
 170 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader);
 171 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
 172
 173 sub _get_ds {
 174         return $out;
 175 }
 176
 177 =head2 _clean_ds
 178
 179 Clean data structure hash for next record
 180
 181   _clean_ds();
 182
 183 =cut
 184
 185 sub _clean_ds {
 186         my $a = {@_};
 187         ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators, $marc_leader) = ();
 188         ($marc_record_offset, $marc_fetch_offset) = (0,0);
 189         $marc_encoding = $a->{marc_encoding};
 190 }
 191
 192 =head2 _set_lookup
 193
 194 Set current lookup hash
 195
 196   _set_lookup( $lookup );
 197
 198 =cut
 199
 200 my $lookup;
 201
 202 sub _set_lookup {
 203         $lookup = shift;
 204 }
 205
 206 =head2 _get_lookup
 207
 208 Get current lookup hash
 209
 210   my $lookup = _get_lookup();
 211
 212 =cut
 213
 214 sub _get_lookup {
 215         return $lookup;
 216 }
 217
 218 =head2 _set_load_row
 219
 220 Setup code reference which will return L<data_structure> from
 221 L<WebPAC::Store>
 222
 223   _set_load_row(sub {
 224                 my ($database,$input,$mfn) = @_;
 225                 $store->load_row( database => $database, input => $input, id => $mfn );
 226   });
 227
 228 =cut
 229
 230 sub _set_load_row {
 231         my $coderef = shift;
 232         confess "argument isn't CODE" unless ref($coderef) eq 'CODE';
 233
 234         $load_row_coderef = $coderef;
 235 }
 236
 237 =head2 _get_marc_fields
 238
 239 Get all fields defined by calls to C<marc>
 240
 241         $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
 242
 243 We are using I<magic> which detect repeatable fields only from
 244 sequence of field/subfield data generated by normalization.
 245
 246 Repeatable field is created when there is second occurence of same subfield or
 247 if any of indicators are different.
 248
 249 This is sane for most cases. Something like:
 250
 251   900a-1 900b-1 900c-1
 252   900a-2 900b-2
 253   900a-3
 254
 255 will be created from any combination of:
 256
 257   900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
 258
 259 and following rules:
 260
 261   marc('900','a', rec('200','a') );
 262   marc('900','b', rec('200','b') );
 263   marc('900','c', rec('200','c') );
 264
 265 which might not be what you have in mind. If you need repeatable subfield,
 266 define it using C<marc_repeatable_subfield> like this:
 267
 268   marc_repeatable_subfield('900','a');
 269   marc('900','a', rec('200','a') );
 270   marc('900','b', rec('200','b') );
 271   marc('900','c', rec('200','c') );
 272
 273 will create:
 274
 275   900a-1 900a-2 900a-3 900b-1 900c-1
 276   900b-2
 277
 278 There is also support for returning next or specific using:
 279
 280   while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
 281         # do something with $mf
 282   }
 283
 284 will always return fields from next MARC record or
 285
 286   my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
 287
 288 will return 42th copy record (if it exists).
 289
 290 =cut
 291
 292 sub _get_marc_fields {
 293
 294         my $arg = {@_};
 295         warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
 296         my $offset = $marc_fetch_offset;
 297         if ($arg->{offset}) {
 298                 $offset = $arg->{offset};
 299         } elsif($arg->{fetch_next}) {
 300                 $marc_fetch_offset++;
 301         }
 302
 303         return if (! $marc_record || ref($marc_record) ne 'ARRAY');
 304
 305         warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
 306
 307         my $marc_rec = $marc_record->[ $offset ];
 308
 309         warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
 310
 311         return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
 312
 313         # first, sort all existing fields
 314         # XXX might not be needed, but modern perl might randomize elements in hash
 315         my @sorted_marc_record = sort {
 316                 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
 317         } @{ $marc_rec };
 318
 319         @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
 320
 321         # output marc fields
 322         my @m;
 323
 324         # count unique field-subfields (used for offset when walking to next subfield)
 325         my $u;
 326         map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;
 327
 328         if ($debug) {
 329                 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
 330                 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
 331                 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
 332                 warn "## subfield count = ", dump( $u ), $/;
 333         }
 334
 335         my $len = $#sorted_marc_record;
 336         my $visited;
 337         my $i = 0;
 338         my $field;
 339
 340         foreach ( 0 .. $len ) {
 341
 342                 # find next element which isn't visited
 343                 while ($visited->{$i}) {
 344                         $i = ($i + 1) % ($len + 1);
 345                 }
 346
 347                 # mark it visited
 348                 $visited->{$i}++;
 349
 350                 my $row = dclone( $sorted_marc_record[$i] );
 351
 352                 # field and subfield which is key for
 353                 # marc_repeatable_subfield and u
 354                 my $fsf = $row->[0] . ( $row->[3] || '' );
 355
 356                 if ($debug > 1) {
 357
 358                         print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
 359                         print "### this [$i]: ", dump( $row ),$/;
 360                         print "### sf: ", $row->[3], " vs ", $field->[3],
 361                                 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
 362                                 if ($#$field >= 0);
 363
 364                 }
 365
 366                 # if field exists
 367                 if ( $#$field >= 0 ) {
 368                         if (
 369                                 $row->[0] ne $field->[0] ||             # field
 370                                 $row->[1] ne $field->[1] ||             # i1
 371                                 $row->[2] ne $field->[2]                # i2
 372                         ) {
 373                                 push @m, $field;
 374                                 warn "## saved/1 ", dump( $field ),$/ if ($debug);
 375                                 $field = $row;
 376
 377                         } elsif (
 378                                 ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
 379                                 ||
 380                                 ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
 381                                         ! $marc_repeatable_subfield->{ $fsf }
 382                                 )
 383                         ) {
 384                                 push @m, $field;
 385                                 warn "## saved/2 ", dump( $field ),$/ if ($debug);
 386                                 $field = $row;
 387
 388                         } else {
 389                                 # append new subfields to existing field
 390                                 push @$field, ( $row->[3], $row->[4] );
 391                         }
 392                 } else {
 393                         # insert first field
 394                         $field = $row;
 395                 }
 396
 397                 if (! $marc_repeatable_subfield->{ $fsf }) {
 398                         # make step to next subfield
 399                         $i = ($i + $u->{ $fsf } ) % ($len + 1);
 400                 }
 401         }
 402
 403         if ($#$field >= 0) {
 404                 push @m, $field;
 405                 warn "## saved/3 ", dump( $field ),$/ if ($debug);
 406         }
 407
 408         return \@m;
 409 }
 410
 411 =head2 _debug
 412
 413 Change level of debug warnings
 414
 415   _debug( 2 );
 416
 417 =cut
 418
 419 sub _debug {
 420         my $l = shift;
 421         return $debug unless defined($l);
 422         warn "debug level $l",$/ if ($l > 0);
 423         $debug = $l;
 424 }
 425
 426 =head1 Functions to create C<data_structure>
 427
 428 Those functions generally have to first in your normalization file.
 429
 430 =head2 tag
 431
 432 Define new tag for I<search> and I<display>.
 433
 434   tag('Title', rec('200','a') );
 435
 436
 437 =cut
 438
 439 sub tag {
 440         my $name = shift or die "tag needs name as first argument";
 441         my @o = grep { defined($_) && $_ ne '' } @_;
 442         return unless (@o);
 443         $out->{$name}->{tag} = $name;
 444         $out->{$name}->{search} = \@o;
 445         $out->{$name}->{display} = \@o;
 446 }
 447
 448 =head2 display
 449
 450 Define tag just for I<display>
 451
 452   @v = display('Title', rec('200','a') );
 453
 454 =cut
 455
 456 sub display {
 457         my $name = shift or die "display needs name as first argument";
 458         my @o = grep { defined($_) && $_ ne '' } @_;
 459         return unless (@o);
 460         $out->{$name}->{tag} = $name;
 461         $out->{$name}->{display} = \@o;
 462 }
 463
 464 =head2 search
 465
 466 Prepare values just for I<search>
 467
 468   @v = search('Title', rec('200','a') );
 469
 470 =cut
 471
 472 sub search {
 473         my $name = shift or die "search needs name as first argument";
 474         my @o = grep { defined($_) && $_ ne '' } @_;
 475         return unless (@o);
 476         $out->{$name}->{tag} = $name;
 477         $out->{$name}->{search} = \@o;
 478 }
 479
 480 =head2 marc_leader
 481
 482 Setup fields within MARC leader or get leader
 483
 484   marc_leader('05','c');
 485   my $leader = marc_leader();
 486
 487 =cut
 488
 489 sub marc_leader {
 490         my ($offset,$value) = @_;
 491
 492         if ($offset) {
 493                 $marc_leader->{ $offset } = $value;
 494         } else {
 495                 return $marc_leader;
 496         }
 497 }
 498
 499 =head2 marc
 500
 501 Save value for MARC field
 502
 503   marc('900','a', rec('200','a') );
 504   marc('001', rec('000') );
 505
 506 =cut
 507
 508 sub marc {
 509         my $f = shift or die "marc needs field";
 510         die "marc field must be numer" unless ($f =~ /^\d+$/);
 511
 512         my $sf;
 513         if ($f >= 10) {
 514                 $sf = shift or die "marc needs subfield";
 515         }
 516
 517         foreach (@_) {
 518                 my $v = $_;             # make var read-write for Encode
 519                 next unless (defined($v) && $v !~ /^\s*$/);
 520                 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 521                 if (defined $sf) {
 522                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
 523                 } else {
 524                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
 525                 }
 526         }
 527 }
 528
 529 =head2 marc_repeatable_subfield
 530
 531 Save values for MARC repetable subfield
 532
 533   marc_repeatable_subfield('910', 'z', rec('909') );
 534
 535 =cut
 536
 537 sub marc_repeatable_subfield {
 538         my ($f,$sf) = @_;
 539         die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
 540         $marc_repeatable_subfield->{ $f . $sf }++;
 541         marc(@_);
 542 }
 543
 544 =head2 marc_indicators
 545
 546 Set both indicators for MARC field
 547
 548   marc_indicators('900', ' ', 1);
 549
 550 Any indicator value other than C<0-9> will be treated as undefined.
 551
 552 =cut
 553
 554 sub marc_indicators {
 555         my $f = shift || die "marc_indicators need field!\n";
 556         my ($i1,$i2) = @_;
 557         die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
 558         die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
 559
 560         $i1 = ' ' if ($i1 !~ /^\d$/);
 561         $i2 = ' ' if ($i2 !~ /^\d$/);
 562         @{ $marc_indicators->{$f} } = ($i1,$i2);
 563 }
 564
 565 =head2 marc_compose
 566
 567 Save values for each MARC subfield explicitly
 568
 569   marc_compose('900',
 570         'a', rec('200','a')
 571         'b', rec('201','a')
 572         'a', rec('200','b')
 573         'c', rec('200','c')
 574   );
 575
 576 If you specify C<+> for subfield, value will be appended
 577 to previous defined subfield.
 578
 579 =cut
 580
 581 sub marc_compose {
 582         my $f = shift or die "marc_compose needs field";
 583         die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
 584
 585         my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 586         my $m = [ $f, $i1, $i2 ];
 587
 588         warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
 589
 590         if ($#_ % 2 != 1) {
 591                 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
 592         }
 593
 594         while (@_) {
 595                 my $sf = shift;
 596                 my $v = shift;
 597
 598                 next unless (defined($v) && $v !~ /^\s*$/);
 599                 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
 600                 if ($sf ne '+') {
 601                         push @$m, ( $sf, $v );
 602                 } else {
 603                         $m->[ $#$m ] .= $v;
 604                 }
 605         }
 606
 607         warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
 608
 609         push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
 610 }
 611
 612 =head2 marc_duplicate
 613
 614 Generate copy of current MARC record and continue working on copy
 615
 616   marc_duplicate();
 617
 618 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
 619 C<< _get_marc_fields( offset => 42 ) >>.
 620
 621 =cut
 622
 623 sub marc_duplicate {
 624          my $m = $marc_record->[ -1 ];
 625          die "can't duplicate record which isn't defined" unless ($m);
 626          push @{ $marc_record }, dclone( $m );
 627          warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
 628          $marc_record_offset = $#{ $marc_record };
 629          warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
 630 }
 631
 632 =head2 marc_remove
 633
 634 Remove some field or subfield from MARC record.
 635
 636   marc_remove('200');
 637   marc_remove('200','a');
 638
 639 This will erase field C<200> or C<200^a> from current MARC record.
 640
 641   marc_remove('*');
 642
 643 Will remove all fields in current MARC record.
 644
 645 This is useful after calling C<marc_duplicate> or on it's own (but, you
 646 should probably just remove that subfield definition if you are not
 647 using C<marc_duplicate>).
 648
 649 FIXME: support fields < 10.
 650
 651 =cut
 652
 653 sub marc_remove {
 654         my ($f, $sf) = @_;
 655
 656         die "marc_remove needs record number" unless defined($f);
 657
 658         my $marc = $marc_record->[ $marc_record_offset ];
 659
 660         warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
 661
 662         if ($f eq '*') {
 663
 664                 delete( $marc_record->[ $marc_record_offset ] );
 665                 warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
 666
 667         } else {
 668
 669                 my $i = 0;
 670                 foreach ( 0 .. $#{ $marc } ) {
 671                         last unless (defined $marc->[$i]);
 672                         warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
 673                         if ($marc->[$i]->[0] eq $f) {
 674                                 if (! defined $sf) {
 675                                         # remove whole field
 676                                         splice @$marc, $i, 1;
 677                                         warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
 678                                         $i--;
 679                                 } else {
 680                                         foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
 681                                                 my $o = ($j * 2) + 3;
 682                                                 if ($marc->[$i]->[$o] eq $sf) {
 683                                                         # remove subfield
 684                                                         splice @{$marc->[$i]}, $o, 2;
 685                                                         warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
 686                                                         # is record now empty?
 687                                                         if ($#{ $marc->[$i] } == 2) {
 688                                                                 splice @$marc, $i, 1;
 689                                                                 warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
 690                                                                 $i--;
 691                                                         };
 692                                                 }
 693                                         }
 694                                 }
 695                         }
 696                         $i++;
 697                 }
 698
 699                 warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
 700
 701                 $marc_record->[ $marc_record_offset ] = $marc;
 702         }
 703
 704         warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
 705 }
 706
 707 =head2 marc_original_order
 708
 709 Copy all subfields preserving original order to marc field.
 710
 711   marc_original_order( marc_field_number, original_input_field_number );
 712
 713 Please note that field numbers are consistent with other commands (marc
 714 field number first), but somewhat counter-intuitive (destination and then
 715 source).
 716
 717 You might want to use this command if you are just renaming subfields or
 718 using pre-processing modify_record in C<config.yml> and don't need any
 719 post-processing or want to preserve order of original subfields.
 720
 721
 722 =cut
 723
 724 sub marc_original_order {
 725
 726         my ($to, $from) = @_;
 727         die "marc_original_order needs from and to fields\n" unless ($from && $to);
 728
 729         return unless defined($rec->{$from});
 730
 731         my $r = $rec->{$from};
 732         die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
 733
 734         my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
 735         warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
 736
 737         foreach my $d (@$r) {
 738
 739                 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
 740                         warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
 741                         next;
 742                 }
 743
 744                 my @sfs = @{ $d->{subfields} };
 745
 746                 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
 747
 748                 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
 749
 750                 my $m = [ $to, $i1, $i2 ];
 751
 752                 while (my $sf = shift @sfs) {
 753
 754                         warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
 755                         my $offset = shift @sfs;
 756                         die "corrupted sufields specification for field $from\n" unless defined($offset);
 757
 758                         my $v;
 759                         if (ref($d->{$sf}) eq 'ARRAY') {
 760                                 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
 761                         } elsif ($offset == 0) {
 762                                 $v = $d->{$sf};
 763                         } else {
 764                                 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
 765                         }
 766                         push @$m, ( $sf, $v ) if (defined($v));
 767                 }
 768
 769                 if ($#{$m} > 2) {
 770                         push @{ $marc_record->[ $marc_record_offset ] }, $m;
 771                 }
 772         }
 773
 774         warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
 775 }
 776
 777
 778 =head1 Functions to extract data from input
 779
 780 This function should be used inside functions to create C<data_structure> described
 781 above.
 782
 783 =head2 _pack_subfields_hash
 784
 785  @subfields = _pack_subfields_hash( $h );
 786  $subfields = _pack_subfields_hash( $h, 1 );
 787
 788 Return each subfield value in array or pack them all together and return scalar
 789 with subfields (denoted by C<^>) and values.
 790
 791 =cut
 792
 793 sub _pack_subfields_hash {
 794
 795         warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
 796
 797         my ($h,$include_subfields) = @_;
 798
 799         if ( defined($h->{subfields}) ) {
 800                 my $sfs = delete $h->{subfields} || die "no subfields?";
 801                 my @out;
 802                 while (@$sfs) {
 803                         my $sf = shift @$sfs;
 804                         push @out, '^' . $sf if ($include_subfields);
 805                         my $o = shift @$sfs;
 806                         if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
 807                                 # single element subfields are not arrays
 808 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 809
 810                                 push @out, $h->{$sf};
 811                         } else {
 812 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 813                                 push @out, $h->{$sf}->[$o];
 814                         }
 815                 }
 816                 if ($include_subfields) {
 817                         return join('', @out);
 818                 } else {
 819                         return @out;
 820                 }
 821         } else {
 822                 if ($include_subfields) {
 823                         my $out = '';
 824                         foreach my $sf (sort keys %$h) {
 825                                 if (ref($h->{$sf}) eq 'ARRAY') {
 826                                         $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
 827                                 } else {
 828                                         $out .= '^' . $sf . $h->{$sf};
 829                                 }
 830                         }
 831                         return $out;
 832                 } else {
 833                         # FIXME this should probably be in alphabetical order instead of hash order
 834                         values %{$h};
 835                 }
 836         }
 837 }
 838
 839 =head2 rec1
 840
 841 Return all values in some field
 842
 843   @v = rec1('200')
 844
 845 TODO: order of values is probably same as in source data, need to investigate that
 846
 847 =cut
 848
 849 sub rec1 {
 850         my $f = shift;
 851         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 852         return unless (defined($rec) && defined($rec->{$f}));
 853         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 854         if (ref($rec->{$f}) eq 'ARRAY') {
 855                 my @out;
 856                 foreach my $h ( @{ $rec->{$f} } ) {
 857                         if (ref($h) eq 'HASH') {
 858                                 push @out, ( _pack_subfields_hash( $h ) );
 859                         } else {
 860                                 push @out, $h;
 861                         }
 862                 }
 863                 return @out;
 864         } elsif( defined($rec->{$f}) ) {
 865                 return $rec->{$f};
 866         }
 867 }
 868
 869 =head2 rec2
 870
 871 Return all values in specific field and subfield
 872
 873   @v = rec2('200','a')
 874
 875 =cut
 876
 877 sub rec2 {
 878         my $f = shift;
 879         return unless (defined($rec && $rec->{$f}));
 880         my $sf = shift;
 881         warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 882         return map {
 883                 if (ref($_->{$sf}) eq 'ARRAY') {
 884                         @{ $_->{$sf} };
 885                 } else {
 886                         $_->{$sf};
 887                 }
 888         } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
 889 }
 890
 891 =head2 rec
 892
 893 syntaxtic sugar for
 894
 895   @v = rec('200')
 896   @v = rec('200','a')
 897
 898 If rec() returns just single value, it will
 899 return scalar, not array.
 900
 901 =cut
 902
 903 sub rec {
 904         my @out;
 905         if ($#_ == 0) {
 906                 @out = rec1(@_);
 907         } elsif ($#_ == 1) {
 908                 @out = rec2(@_);
 909         }
 910         if ($#out == 0 && ! wantarray) {
 911                 return $out[0];
 912         } elsif (@out) {
 913                 return @out;
 914         } else {
 915                 return '';
 916         }
 917 }
 918
 919 =head2 regex
 920
 921 Apply regex to some or all values
 922
 923   @v = regex( 's/foo/bar/g', @v );
 924
 925 =cut
 926
 927 sub regex {
 928         my $r = shift;
 929         my @out;
 930         #warn "r: $r\n", dump(\@_);
 931         foreach my $t (@_) {
 932                 next unless ($t);
 933                 eval "\$t =~ $r";
 934                 push @out, $t if ($t && $t ne '');
 935         }
 936         return @out;
 937 }
 938
 939 =head2 prefix
 940
 941 Prefix all values with a string
 942
 943   @v = prefix( 'my_', @v );
 944
 945 =cut
 946
 947 sub prefix {
 948         my $p = shift or return;
 949         return map { $p . $_ } grep { defined($_) } @_;
 950 }
 951
 952 =head2 suffix
 953
 954 suffix all values with a string
 955
 956   @v = suffix( '_my', @v );
 957
 958 =cut
 959
 960 sub suffix {
 961         my $s = shift or die "suffix needs string as first argument";
 962         return map { $_ . $s } grep { defined($_) } @_;
 963 }
 964
 965 =head2 surround
 966
 967 surround all values with a two strings
 968
 969   @v = surround( 'prefix_', '_suffix', @v );
 970
 971 =cut
 972
 973 sub surround {
 974         my $p = shift or die "surround need prefix as first argument";
 975         my $s = shift or die "surround needs suffix as second argument";
 976         return map { $p . $_ . $s } grep { defined($_) } @_;
 977 }
 978
 979 =head2 first
 980
 981 Return first element
 982
 983   $v = first( @v );
 984
 985 =cut
 986
 987 sub first {
 988         my $r = shift;
 989         return $r;
 990 }
 991
 992 =head2 lookup
 993
 994 Consult lookup hashes for some value
 995
 996   @v = lookup(
 997         sub {
 998                 'ffkk/peri/mfn'.rec('000')
 999         },
1000         'ffkk','peri','200-a-200-e',
1001         sub {
1002                 first(rec(200,'a')).' '.first(rec('200','e'))
1003         }
1004   );
1005
1006 Code like above will be B<automatically generated> using L<WebPAC::Parse> from
1007 normal lookup definition in C<conf/lookup/something.pl> which looks like:
1008
1009   lookup(
1010         # which results to return from record recorded in lookup
1011         sub { 'ffkk/peri/mfn' . rec('000') },
1012         # from which database and input
1013         'ffkk','peri',
1014         # such that following values match
1015         sub { first(rec(200,'a')) . ' ' . first(rec('200','e')) },
1016         # if this part is missing, we will try to match same fields
1017         # from lookup record and current one, or you can override
1018         # which records to use from current record using
1019         sub { rec('900','x') . ' ' . rec('900','y') },
1020   )
1021
1022 You can think about this lookup as SQL (if that helps):
1023
1024   select
1025         sub { what }
1026   from
1027         database, input
1028   where
1029     sub { filter from lookuped record }
1030   having
1031     sub { optional filter on current record }
1032
1033 Easy as pie, right?
1034
1035 =cut
1036
1037 sub lookup {
1038         my ($what, $database, $input, $key, $having) = @_;
1039
1040         confess "lookup needs 5 arguments: what, database, input, key, having\n" unless ($#_ == 4);
1041
1042         warn "## lookup ($database, $input, $key)", $/ if ($debug > 1);
1043         return unless (defined($lookup->{$database}->{$input}->{$key}));
1044
1045         confess "lookup really need load_row_coderef added to data_structure\n" unless ($load_row_coderef);
1046
1047         my $mfns;
1048         my @having = $having->();
1049
1050         warn "## having = ", dump( @having ) if ($debug > 2);
1051
1052         foreach my $h ( @having ) {
1053                 if (defined($lookup->{$database}->{$input}->{$key}->{$h})) {
1054                         warn "lookup for $database/$input/$key/$h return ",dump($lookup->{$database}->{$input}->{$key}->{$h}),"\n" if ($debug);
1055                         $mfns->{$_}++ foreach keys %{ $lookup->{$database}->{$input}->{$key}->{$h} };
1056                 }
1057         }
1058
1059         return unless ($mfns);
1060
1061         my @mfns = sort keys %$mfns;
1062
1063         warn "# lookup loading $database/$input/$key mfn ", join(",",@mfns)," having ",dump(@having),"\n" if ($debug);
1064
1065         my $old_rec = $rec;
1066         my @out;
1067
1068         foreach my $mfn (@mfns) {
1069                 $rec = $load_row_coderef->( $database, $input, $mfn );
1070
1071                 warn "got $database/$input/$mfn = ", dump($rec), $/ if ($debug);
1072
1073                 my @vals = $what->();
1074
1075                 push @out, ( @vals );
1076
1077                 warn "lookup for mfn $mfn returned ", dump(@vals), $/ if ($debug);
1078         }
1079
1080 #       if (ref($lookup->{$k}) eq 'ARRAY') {
1081 #               return @{ $lookup->{$k} };
1082 #       } else {
1083 #               return $lookup->{$k};
1084 #       }
1085
1086         $rec = $old_rec;
1087
1088         warn "## lookup returns = ", dump(@out), $/ if ($debug);
1089
1090         if ($#out == 0) {
1091                 return $out[0];
1092         } else {
1093                 return @out;
1094         }
1095 }
1096
1097 =head2 save_into_lookup
1098
1099 Save value into lookup. It associates current database, input
1100 and specific keys with one or more values which will be
1101 associated over MFN.
1102
1103 MFN will be extracted from first occurence current of field 000
1104 in current record, or if it doesn't exist from L<_set_config> C<_mfn>.
1105
1106   my $nr = save_into_lookup($database,$input,$key,sub {
1107         # code which produce one or more values
1108   });
1109
1110 It returns number of items saved.
1111
1112 This function shouldn't be called directly, it's called from code created by
1113 L<WebPAC::Parser>.
1114
1115 =cut
1116
1117 sub save_into_lookup {
1118         my ($database,$input,$key,$coderef) = @_;
1119         die "save_into_lookup needs database" unless defined($database);
1120         die "save_into_lookup needs input" unless defined($input);
1121         die "save_into_lookup needs key" unless defined($key);
1122         die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
1123
1124         warn "## save_into_lookup rec = ", dump($rec), " config = ", dump($config), $/ if ($debug > 2);
1125
1126         my $mfn =
1127                 defined($rec->{'000'}->[0])     ?       $rec->{'000'}->[0]      :
1128                 defined($config->{_mfn})        ?       $config->{_mfn}         :
1129                                                                                 die "mfn not defined or zero";
1130
1131         my $nr = 0;
1132
1133         foreach my $v ( $coderef->() ) {
1134                 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
1135                 warn "# saved lookup $database/$input/$key [$v] $mfn\n" if ($debug > 1);
1136                 $nr++;
1137         }
1138
1139         return $nr;
1140 }
1141
1142 =head2 config
1143
1144 Consult config values stored in C<config.yml>
1145
1146   # return database code (key under databases in yaml)
1147   $database_code = config();    # use _ from hash
1148   $database_name = config('name');
1149   $database_input_name = config('input name');
1150   $tag = config('input normalize tag');
1151
1152 Up to three levels are supported.
1153
1154 =cut
1155
1156 sub config {
1157         return unless ($config);
1158
1159         my $p = shift;
1160
1161         $p ||= '';
1162
1163         my $v;
1164
1165         warn "### getting config($p)\n" if ($debug > 1);
1166
1167         my @p = split(/\s+/,$p);
1168         if ($#p < 0) {
1169                 $v = $config->{ '_' };  # special, database code
1170         } else {
1171
1172                 my $c = dclone( $config );
1173
1174                 foreach my $k (@p) {
1175                         warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1176                         if (ref($c) eq 'ARRAY') {
1177                                 $c = shift @$c;
1178                                 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1179                                 last;
1180                         }
1181
1182                         if (! defined($c->{$k}) ) {
1183                                 $c = undef;
1184                                 last;
1185                         } else {
1186                                 $c = $c->{$k};
1187                         }
1188                 }
1189                 $v = $c if ($c);
1190
1191         }
1192
1193         warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1194         warn "config( '$p' ) is empty\n" if (! $v);
1195
1196         return $v;
1197 }
1198
1199 =head2 id
1200
1201 Returns unique id of this record
1202
1203   $id = id();
1204
1205 Returns C<42/2> for 2nd occurence of MFN 42.
1206
1207 =cut
1208
1209 sub id {
1210         my $mfn = $config->{_mfn} || die "no _mfn in config data";
1211         return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1212 }
1213
1214 =head2 join_with
1215
1216 Joins walues with some delimiter
1217
1218   $v = join_with(", ", @v);
1219
1220 =cut
1221
1222 sub join_with {
1223         my $d = shift;
1224         warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1225         my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1226         return '' unless defined($v);
1227         return $v;
1228 }
1229
1230 =head2 split_rec_on
1231
1232 Split record subfield on some regex and take one of parts out
1233
1234   $a_before_semi_column =
1235         split_rec_on('200','a', /\s*;\s*/, $part);
1236
1237 C<$part> is optional number of element. First element is
1238 B<1>, not 0!
1239
1240 If there is no C<$part> parameter or C<$part> is 0, this function will
1241 return all values produced by splitting.
1242
1243 =cut
1244
1245 sub split_rec_on {
1246         die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1247
1248         my ($fld, $sf, $regex, $part) = @_;
1249         warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1250
1251         my @r = rec( $fld, $sf );
1252         my $v = shift @r;
1253         warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1254
1255         return '' if ( ! defined($v) || $v =~ /^\s*$/);
1256
1257         my @s = split( $regex, $v );
1258         warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1259         if ($part && $part > 0) {
1260                 return $s[ $part - 1 ];
1261         } else {
1262                 return @s;
1263         }
1264 }
1265
1266 my $hash;
1267
1268 =head2 set
1269
1270   set( key => 'value' );
1271
1272 =cut
1273
1274 sub set {
1275         my ($k,$v) = @_;
1276         warn "## set ( $k => ", dump($v), " )", $/ if ( $debug );
1277         $hash->{$k} = $v;
1278 };
1279
1280 =head2 get
1281
1282   get( 'key' );
1283
1284 =cut
1285
1286 sub get {
1287         my $k = shift || return;
1288         my $v = $hash->{$k};
1289         warn "## get $k = ", dump( $v ), $/ if ( $debug );
1290         return $v;
1291 }
1292
1293 =head2 count
1294
1295   if ( count( @result ) == 1 ) {
1296         # do something if only 1 result is there
1297   }
1298
1299 =cut
1300
1301 sub count {
1302         warn "## count ",dump(@_),$/ if ( $debug );
1303         return @_ . '';
1304 }
1305
1306 # END
1307 1;