git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2 use Exporter 'import';
   3 @EXPORT = qw/
   4         _set_rec _set_lookup
   5         _get_ds _clean_ds
   6         _debug
   7         _pack_subfields_hash
   8
   9         tag search display
  10         marc marc_indicators marc_repeatable_subfield
  11         marc_compose marc_leader
  12         marc_duplicate marc_remove
  13         marc_original_order
  14
  15         rec1 rec2 rec
  16         regex prefix suffix surround
  17         first lookup join_with
  18         save_into_lookup
  19
  20         split_rec_on
  21 /;
  22
  23 use warnings;
  24 use strict;
  25
  26 #use base qw/WebPAC::Common/;
  27 use Data::Dump qw/dump/;
  28 use Storable qw/dclone/;
  29
  30 # debugging warn(s)
  31 my $debug = 0;
  32
  33
  34 =head1 NAME
  35
  36 WebPAC::Normalize - describe normalisaton rules using sets
  37
  38 =head1 VERSION
  39
  40 Version 0.20
  41
  42 =cut
  43
  44 our $VERSION = '0.20';
  45
  46 =head1 SYNOPSIS
  47
  48 This module uses C<conf/normalize/*.pl> files to perform normalisation
  49 from input records using perl functions which are specialized for set
  50 processing.
  51
  52 Sets are implemented as arrays, and normalisation file is valid perl, which
  53 means that you check it's validity before running WebPAC using
  54 C<perl -c normalize.pl>.
  55
  56 Normalisation can generate multiple output normalized data. For now, supported output
  57 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
  58 C<marc>.
  59
  60 =head1 FUNCTIONS
  61
  62 Functions which start with C<_> are private and used by WebPAC internally.
  63 All other functions are available for use within normalisation rules.
  64
  65 =head2 data_structure
  66
  67 Return data structure
  68
  69   my $ds = WebPAC::Normalize::data_structure(
  70         lookup => $lookup_variable,
  71         row => $row,
  72         rules => $normalize_pl_config,
  73         marc_encoding => 'utf-8',
  74         config => $config,
  75   );
  76
  77 Options C<row>, C<rules> and C<log> are mandatory while all
  78 other are optional.
  79
  80 This function will B<die> if normalizastion can't be evaled.
  81
  82 Since this function isn't exported you have to call it with
  83 C<WebPAC::Normalize::data_structure>.
  84
  85 =cut
  86
  87 sub data_structure {
  88         my $arg = {@_};
  89
  90         die "need row argument" unless ($arg->{row});
  91         die "need normalisation argument" unless ($arg->{rules});
  92
  93         no strict 'subs';
  94         _set_lookup( $arg->{lookup} ) if (defined( $arg->{lookup} ));
  95         _set_rec( $arg->{row} );
  96         _set_config( $arg->{config} ) if (defined( $arg->{config} ));
  97         _clean_ds( %{ $arg } );
  98         eval "$arg->{rules}";
  99         die "error evaling $arg->{rules}: $@\n" if ($@);
 100
 101         return _get_ds();
 102 }
 103
 104 =head2 _set_rec
 105
 106 Set current record hash
 107
 108   _set_rec( $rec );
 109
 110 =cut
 111
 112 my $rec;
 113
 114 sub _set_rec {
 115         $rec = shift or die "no record hash";
 116 }
 117
 118 =head2 _set_config
 119
 120 Set current config hash
 121
 122   _set_config( $config );
 123
 124 Magic keys are:
 125
 126 =over 4
 127
 128 =item _
 129
 130 Code of current database
 131
 132 =item _mfn
 133
 134 Current MFN
 135
 136 =back
 137
 138 =cut
 139
 140 my $config;
 141
 142 sub _set_config {
 143         $config = shift;
 144 }
 145
 146 =head2 _get_ds
 147
 148 Return hash formatted as data structure
 149
 150   my $ds = _get_ds();
 151
 152 =cut
 153
 154 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
 155 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
 156
 157 sub _get_ds {
 158         return $out;
 159 }
 160
 161 =head2 _clean_ds
 162
 163 Clean data structure hash for next record
 164
 165   _clean_ds();
 166
 167 =cut
 168
 169 sub _clean_ds {
 170         my $a = {@_};
 171         ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
 172         ($marc_record_offset, $marc_fetch_offset) = (0,0);
 173         $marc_encoding = $a->{marc_encoding};
 174 }
 175
 176 =head2 _set_lookup
 177
 178 Set current lookup hash
 179
 180   _set_lookup( $lookup );
 181
 182 =cut
 183
 184 my $lookup;
 185
 186 sub _set_lookup {
 187         $lookup = shift;
 188 }
 189
 190 =head2 _get_lookup
 191
 192 Get current lookup hash
 193
 194   my $lookup = _get_lookup();
 195
 196 =cut
 197
 198 sub _get_lookup {
 199         return $lookup;
 200 }
 201
 202 =head2 _get_marc_fields
 203
 204 Get all fields defined by calls to C<marc>
 205
 206         $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
 207
 208 We are using I<magic> which detect repeatable fields only from
 209 sequence of field/subfield data generated by normalization.
 210
 211 Repeatable field is created when there is second occurence of same subfield or
 212 if any of indicators are different.
 213
 214 This is sane for most cases. Something like:
 215
 216   900a-1 900b-1 900c-1
 217   900a-2 900b-2
 218   900a-3
 219
 220 will be created from any combination of:
 221
 222   900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
 223
 224 and following rules:
 225
 226   marc('900','a', rec('200','a') );
 227   marc('900','b', rec('200','b') );
 228   marc('900','c', rec('200','c') );
 229
 230 which might not be what you have in mind. If you need repeatable subfield,
 231 define it using C<marc_repeatable_subfield> like this:
 232
 233   marc_repeatable_subfield('900','a');
 234   marc('900','a', rec('200','a') );
 235   marc('900','b', rec('200','b') );
 236   marc('900','c', rec('200','c') );
 237
 238 will create:
 239
 240   900a-1 900a-2 900a-3 900b-1 900c-1
 241   900b-2
 242
 243 There is also support for returning next or specific using:
 244
 245   while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
 246         # do something with $mf
 247   }
 248
 249 will always return fields from next MARC record or
 250
 251   my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
 252
 253 will return 42th copy record (if it exists).
 254
 255 =cut
 256
 257 sub _get_marc_fields {
 258
 259         my $arg = {@_};
 260         warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
 261         my $offset = $marc_fetch_offset;
 262         if ($arg->{offset}) {
 263                 $offset = $arg->{offset};
 264         } elsif($arg->{fetch_next}) {
 265                 $marc_fetch_offset++;
 266         }
 267
 268         return if (! $marc_record || ref($marc_record) ne 'ARRAY');
 269
 270         warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
 271
 272         my $marc_rec = $marc_record->[ $offset ];
 273
 274         warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
 275
 276         return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
 277
 278         # first, sort all existing fields
 279         # XXX might not be needed, but modern perl might randomize elements in hash
 280         my @sorted_marc_record = sort {
 281                 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
 282         } @{ $marc_rec };
 283
 284         @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
 285
 286         # output marc fields
 287         my @m;
 288
 289         # count unique field-subfields (used for offset when walking to next subfield)
 290         my $u;
 291         map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;
 292
 293         if ($debug) {
 294                 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
 295                 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
 296                 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
 297                 warn "## subfield count = ", dump( $u ), $/;
 298         }
 299
 300         my $len = $#sorted_marc_record;
 301         my $visited;
 302         my $i = 0;
 303         my $field;
 304
 305         foreach ( 0 .. $len ) {
 306
 307                 # find next element which isn't visited
 308                 while ($visited->{$i}) {
 309                         $i = ($i + 1) % ($len + 1);
 310                 }
 311
 312                 # mark it visited
 313                 $visited->{$i}++;
 314
 315                 my $row = dclone( $sorted_marc_record[$i] );
 316
 317                 # field and subfield which is key for
 318                 # marc_repeatable_subfield and u
 319                 my $fsf = $row->[0] . ( $row->[3] || '' );
 320
 321                 if ($debug > 1) {
 322
 323                         print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
 324                         print "### this [$i]: ", dump( $row ),$/;
 325                         print "### sf: ", $row->[3], " vs ", $field->[3],
 326                                 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
 327                                 if ($#$field >= 0);
 328
 329                 }
 330
 331                 # if field exists
 332                 if ( $#$field >= 0 ) {
 333                         if (
 334                                 $row->[0] ne $field->[0] ||             # field
 335                                 $row->[1] ne $field->[1] ||             # i1
 336                                 $row->[2] ne $field->[2]                # i2
 337                         ) {
 338                                 push @m, $field;
 339                                 warn "## saved/1 ", dump( $field ),$/ if ($debug);
 340                                 $field = $row;
 341
 342                         } elsif (
 343                                 ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
 344                                 ||
 345                                 ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
 346                                         ! $marc_repeatable_subfield->{ $fsf }
 347                                 )
 348                         ) {
 349                                 push @m, $field;
 350                                 warn "## saved/2 ", dump( $field ),$/ if ($debug);
 351                                 $field = $row;
 352
 353                         } else {
 354                                 # append new subfields to existing field
 355                                 push @$field, ( $row->[3], $row->[4] );
 356                         }
 357                 } else {
 358                         # insert first field
 359                         $field = $row;
 360                 }
 361
 362                 if (! $marc_repeatable_subfield->{ $fsf }) {
 363                         # make step to next subfield
 364                         $i = ($i + $u->{ $fsf } ) % ($len + 1);
 365                 }
 366         }
 367
 368         if ($#$field >= 0) {
 369                 push @m, $field;
 370                 warn "## saved/3 ", dump( $field ),$/ if ($debug);
 371         }
 372
 373         return \@m;
 374 }
 375
 376 =head2 _debug
 377
 378 Change level of debug warnings
 379
 380   _debug( 2 );
 381
 382 =cut
 383
 384 sub _debug {
 385         my $l = shift;
 386         return $debug unless defined($l);
 387         warn "debug level $l",$/ if ($l > 0);
 388         $debug = $l;
 389 }
 390
 391 =head1 Functions to create C<data_structure>
 392
 393 Those functions generally have to first in your normalization file.
 394
 395 =head2 tag
 396
 397 Define new tag for I<search> and I<display>.
 398
 399   tag('Title', rec('200','a') );
 400
 401
 402 =cut
 403
 404 sub tag {
 405         my $name = shift or die "tag needs name as first argument";
 406         my @o = grep { defined($_) && $_ ne '' } @_;
 407         return unless (@o);
 408         $out->{$name}->{tag} = $name;
 409         $out->{$name}->{search} = \@o;
 410         $out->{$name}->{display} = \@o;
 411 }
 412
 413 =head2 display
 414
 415 Define tag just for I<display>
 416
 417   @v = display('Title', rec('200','a') );
 418
 419 =cut
 420
 421 sub display {
 422         my $name = shift or die "display needs name as first argument";
 423         my @o = grep { defined($_) && $_ ne '' } @_;
 424         return unless (@o);
 425         $out->{$name}->{tag} = $name;
 426         $out->{$name}->{display} = \@o;
 427 }
 428
 429 =head2 search
 430
 431 Prepare values just for I<search>
 432
 433   @v = search('Title', rec('200','a') );
 434
 435 =cut
 436
 437 sub search {
 438         my $name = shift or die "search needs name as first argument";
 439         my @o = grep { defined($_) && $_ ne '' } @_;
 440         return unless (@o);
 441         $out->{$name}->{tag} = $name;
 442         $out->{$name}->{search} = \@o;
 443 }
 444
 445 =head2 marc_leader
 446
 447 Setup fields within MARC leader or get leader
 448
 449   marc_leader('05','c');
 450   my $leader = marc_leader();
 451
 452 =cut
 453
 454 sub marc_leader {
 455         my ($offset,$value) = @_;
 456
 457         if ($offset) {
 458                 $out->{' leader'}->{ $offset } = $value;
 459         } else {
 460                 return $out->{' leader'};
 461         }
 462 }
 463
 464 =head2 marc
 465
 466 Save value for MARC field
 467
 468   marc('900','a', rec('200','a') );
 469   marc('001', rec('000') );
 470
 471 =cut
 472
 473 sub marc {
 474         my $f = shift or die "marc needs field";
 475         die "marc field must be numer" unless ($f =~ /^\d+$/);
 476
 477         my $sf;
 478         if ($f >= 10) {
 479                 $sf = shift or die "marc needs subfield";
 480         }
 481
 482         foreach (@_) {
 483                 my $v = $_;             # make var read-write for Encode
 484                 next unless (defined($v) && $v !~ /^\s*$/);
 485                 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 486                 if (defined $sf) {
 487                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
 488                 } else {
 489                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
 490                 }
 491         }
 492 }
 493
 494 =head2 marc_repeatable_subfield
 495
 496 Save values for MARC repetable subfield
 497
 498   marc_repeatable_subfield('910', 'z', rec('909') );
 499
 500 =cut
 501
 502 sub marc_repeatable_subfield {
 503         my ($f,$sf) = @_;
 504         die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
 505         $marc_repeatable_subfield->{ $f . $sf }++;
 506         marc(@_);
 507 }
 508
 509 =head2 marc_indicators
 510
 511 Set both indicators for MARC field
 512
 513   marc_indicators('900', ' ', 1);
 514
 515 Any indicator value other than C<0-9> will be treated as undefined.
 516
 517 =cut
 518
 519 sub marc_indicators {
 520         my $f = shift || die "marc_indicators need field!\n";
 521         my ($i1,$i2) = @_;
 522         die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
 523         die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
 524
 525         $i1 = ' ' if ($i1 !~ /^\d$/);
 526         $i2 = ' ' if ($i2 !~ /^\d$/);
 527         @{ $marc_indicators->{$f} } = ($i1,$i2);
 528 }
 529
 530 =head2 marc_compose
 531
 532 Save values for each MARC subfield explicitly
 533
 534   marc_compose('900',
 535         'a', rec('200','a')
 536         'b', rec('201','a')
 537         'a', rec('200','b')
 538         'c', rec('200','c')
 539   );
 540
 541 If you specify C<+> for subfield, value will be appended
 542 to previous defined subfield.
 543
 544 =cut
 545
 546 sub marc_compose {
 547         my $f = shift or die "marc_compose needs field";
 548         die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
 549
 550         my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 551         my $m = [ $f, $i1, $i2 ];
 552
 553         warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
 554
 555         if ($#_ % 2 != 1) {
 556                 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
 557         }
 558
 559         while (@_) {
 560                 my $sf = shift;
 561                 my $v = shift;
 562
 563                 next unless (defined($v) && $v !~ /^\s*$/);
 564                 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
 565                 if ($sf ne '+') {
 566                         push @$m, ( $sf, $v );
 567                 } else {
 568                         $m->[ $#$m ] .= $v;
 569                 }
 570         }
 571
 572         warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
 573
 574         push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
 575 }
 576
 577 =head2 marc_duplicate
 578
 579 Generate copy of current MARC record and continue working on copy
 580
 581   marc_duplicate();
 582
 583 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
 584 C<< _get_marc_fields( offset => 42 ) >>.
 585
 586 =cut
 587
 588 sub marc_duplicate {
 589          my $m = $marc_record->[ -1 ];
 590          die "can't duplicate record which isn't defined" unless ($m);
 591          push @{ $marc_record }, dclone( $m );
 592          warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
 593          $marc_record_offset = $#{ $marc_record };
 594          warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
 595 }
 596
 597 =head2 marc_remove
 598
 599 Remove some field or subfield from MARC record.
 600
 601   marc_remove('200');
 602   marc_remove('200','a');
 603
 604 This will erase field C<200> or C<200^a> from current MARC record.
 605
 606 This is useful after calling C<marc_duplicate> or on it's own (but, you
 607 should probably just remove that subfield definition if you are not
 608 using C<marc_duplicate>).
 609
 610 FIXME: support fields < 10.
 611
 612 =cut
 613
 614 sub marc_remove {
 615         my ($f, $sf) = @_;
 616
 617         die "marc_remove needs record number" unless defined($f);
 618
 619         my $marc = $marc_record->[ $marc_record_offset ];
 620
 621         warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
 622
 623         my $i = 0;
 624         foreach ( 0 .. $#{ $marc } ) {
 625                 last unless (defined $marc->[$i]);
 626                 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
 627                 if ($marc->[$i]->[0] eq $f) {
 628                         if (! defined $sf) {
 629                                 # remove whole field
 630                                 splice @$marc, $i, 1;
 631                                 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
 632                                 $i--;
 633                         } else {
 634                                 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
 635                                         my $o = ($j * 2) + 3;
 636                                         if ($marc->[$i]->[$o] eq $sf) {
 637                                                 # remove subfield
 638                                                 splice @{$marc->[$i]}, $o, 2;
 639                                                 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
 640                                                 # is record now empty?
 641                                                 if ($#{ $marc->[$i] } == 2) {
 642                                                         splice @$marc, $i, 1;
 643                                                         warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
 644                                                         $i--;
 645                                                 };
 646                                         }
 647                                 }
 648                         }
 649                 }
 650                 $i++;
 651         }
 652
 653         warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
 654
 655         $marc_record->[ $marc_record_offset ] = $marc;
 656
 657         warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
 658 }
 659
 660 =head2 marc_original_order
 661
 662 Copy all subfields preserving original order to marc field.
 663
 664   marc_original_order( marc_field_number, original_input_field_number );
 665
 666 Please note that field numbers are consistent with other commands (marc
 667 field number first), but somewhat counter-intuitive (destination and then
 668 source).
 669
 670 You might want to use this command if you are just renaming subfields or
 671 using pre-processing modify_record in C<config.yml> and don't need any
 672 post-processing or want to preserve order of original subfields.
 673
 674
 675 =cut
 676
 677 sub marc_original_order {
 678
 679         my ($to, $from) = @_;
 680         die "marc_original_order needs from and to fields\n" unless ($from && $to);
 681
 682         return unless defined($rec->{$from});
 683
 684         my $r = $rec->{$from};
 685         die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
 686
 687         my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
 688         warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
 689
 690         foreach my $d (@$r) {
 691
 692                 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
 693                         warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
 694                         next;
 695                 }
 696
 697                 my @sfs = @{ $d->{subfields} };
 698
 699                 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
 700
 701                 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
 702
 703                 my $m = [ $to, $i1, $i2 ];
 704
 705                 while (my $sf = shift @sfs) {
 706
 707                         warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
 708                         my $offset = shift @sfs;
 709                         die "corrupted sufields specification for field $from\n" unless defined($offset);
 710
 711                         my $v;
 712                         if (ref($d->{$sf}) eq 'ARRAY') {
 713                                 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
 714                         } elsif ($offset == 0) {
 715                                 $v = $d->{$sf};
 716                         } else {
 717                                 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
 718                         }
 719                         push @$m, ( $sf, $v ) if (defined($v));
 720                 }
 721
 722                 if ($#{$m} > 2) {
 723                         push @{ $marc_record->[ $marc_record_offset ] }, $m;
 724                 }
 725         }
 726
 727         warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
 728 }
 729
 730
 731 =head1 Functions to extract data from input
 732
 733 This function should be used inside functions to create C<data_structure> described
 734 above.
 735
 736 =head2 _pack_subfields_hash
 737
 738  @subfields = _pack_subfields_hash( $h );
 739  $subfields = _pack_subfields_hash( $h, 1 );
 740
 741 Return each subfield value in array or pack them all together and return scalar
 742 with subfields (denoted by C<^>) and values.
 743
 744 =cut
 745
 746 sub _pack_subfields_hash {
 747
 748         warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
 749
 750         my ($h,$include_subfields) = @_;
 751
 752         if ( defined($h->{subfields}) ) {
 753                 my $sfs = delete $h->{subfields} || die "no subfields?";
 754                 my @out;
 755                 while (@$sfs) {
 756                         my $sf = shift @$sfs;
 757                         push @out, '^' . $sf if ($include_subfields);
 758                         my $o = shift @$sfs;
 759                         if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
 760                                 # single element subfields are not arrays
 761 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 762
 763                                 push @out, $h->{$sf};
 764                         } else {
 765 #warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 766                                 push @out, $h->{$sf}->[$o];
 767                         }
 768                 }
 769                 if ($include_subfields) {
 770                         return join('', @out);
 771                 } else {
 772                         return @out;
 773                 }
 774         } else {
 775                 if ($include_subfields) {
 776                         my $out = '';
 777                         foreach my $sf (sort keys %$h) {
 778                                 if (ref($h->{$sf}) eq 'ARRAY') {
 779                                         $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
 780                                 } else {
 781                                         $out .= '^' . $sf . $h->{$sf};
 782                                 }
 783                         }
 784                         return $out;
 785                 } else {
 786                         # FIXME this should probably be in alphabetical order instead of hash order
 787                         values %{$h};
 788                 }
 789         }
 790 }
 791
 792 =head2 rec1
 793
 794 Return all values in some field
 795
 796   @v = rec1('200')
 797
 798 TODO: order of values is probably same as in source data, need to investigate that
 799
 800 =cut
 801
 802 sub rec1 {
 803         my $f = shift;
 804         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 805         return unless (defined($rec) && defined($rec->{$f}));
 806         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 807         if (ref($rec->{$f}) eq 'ARRAY') {
 808                 my @out;
 809                 foreach my $h ( @{ $rec->{$f} } ) {
 810                         if (ref($h) eq 'HASH') {
 811                                 push @out, ( _pack_subfields_hash( $h ) );
 812                         } else {
 813                                 push @out, $h;
 814                         }
 815                 }
 816                 return @out;
 817         } elsif( defined($rec->{$f}) ) {
 818                 return $rec->{$f};
 819         }
 820 }
 821
 822 =head2 rec2
 823
 824 Return all values in specific field and subfield
 825
 826   @v = rec2('200','a')
 827
 828 =cut
 829
 830 sub rec2 {
 831         my $f = shift;
 832         return unless (defined($rec && $rec->{$f}));
 833         my $sf = shift;
 834         warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 835         return map {
 836                 if (ref($_->{$sf}) eq 'ARRAY') {
 837                         @{ $_->{$sf} };
 838                 } else {
 839                         $_->{$sf};
 840                 }
 841         } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
 842 }
 843
 844 =head2 rec
 845
 846 syntaxtic sugar for
 847
 848   @v = rec('200')
 849   @v = rec('200','a')
 850
 851 =cut
 852
 853 sub rec {
 854         my @out;
 855         if ($#_ == 0) {
 856                 @out = rec1(@_);
 857         } elsif ($#_ == 1) {
 858                 @out = rec2(@_);
 859         }
 860         if (@out) {
 861                 return @out;
 862         } else {
 863                 return '';
 864         }
 865 }
 866
 867 =head2 regex
 868
 869 Apply regex to some or all values
 870
 871   @v = regex( 's/foo/bar/g', @v );
 872
 873 =cut
 874
 875 sub regex {
 876         my $r = shift;
 877         my @out;
 878         #warn "r: $r\n", dump(\@_);
 879         foreach my $t (@_) {
 880                 next unless ($t);
 881                 eval "\$t =~ $r";
 882                 push @out, $t if ($t && $t ne '');
 883         }
 884         return @out;
 885 }
 886
 887 =head2 prefix
 888
 889 Prefix all values with a string
 890
 891   @v = prefix( 'my_', @v );
 892
 893 =cut
 894
 895 sub prefix {
 896         my $p = shift or return;
 897         return map { $p . $_ } grep { defined($_) } @_;
 898 }
 899
 900 =head2 suffix
 901
 902 suffix all values with a string
 903
 904   @v = suffix( '_my', @v );
 905
 906 =cut
 907
 908 sub suffix {
 909         my $s = shift or die "suffix needs string as first argument";
 910         return map { $_ . $s } grep { defined($_) } @_;
 911 }
 912
 913 =head2 surround
 914
 915 surround all values with a two strings
 916
 917   @v = surround( 'prefix_', '_suffix', @v );
 918
 919 =cut
 920
 921 sub surround {
 922         my $p = shift or die "surround need prefix as first argument";
 923         my $s = shift or die "surround needs suffix as second argument";
 924         return map { $p . $_ . $s } grep { defined($_) } @_;
 925 }
 926
 927 =head2 first
 928
 929 Return first element
 930
 931   $v = first( @v );
 932
 933 =cut
 934
 935 sub first {
 936         my $r = shift;
 937         return $r;
 938 }
 939
 940 =head2 lookup
 941
 942 Consult lookup hashes for some value
 943
 944   @v = lookup( $v );
 945   @v = lookup( @v );
 946
 947 FIXME B<currently this one is broken!>
 948
 949 =cut
 950
 951 sub lookup {
 952         my $k = shift or return;
 953         return unless (defined($lookup->{$k}));
 954         if (ref($lookup->{$k}) eq 'ARRAY') {
 955                 return @{ $lookup->{$k} };
 956         } else {
 957                 return $lookup->{$k};
 958         }
 959 }
 960
 961 =head2 save_into_lookup
 962
 963 Save value into lookup.
 964
 965   save_into_lookup($database,$input,$key,sub {
 966         # code which produce one or more values
 967   });
 968
 969 This function shouldn't be called directly, it's called from code created by L<WebPAC::Parser>.
 970
 971 =cut
 972
 973 sub save_into_lookup {
 974         my ($database,$input,$key,$coderef) = @_;
 975         die "save_into_lookup needs database" unless defined($database);
 976         die "save_into_lookup needs input" unless defined($input);
 977         die "save_into_lookup needs key" unless defined($key);
 978         die "save_into_lookup needs CODE" unless ( defined($coderef) && ref($coderef) eq 'CODE' );
 979         my $mfn = $rec->{'000'}->[0] || die "mfn not defined or zero";
 980         foreach my $v ( $coderef->() ) {
 981                 $lookup->{$database}->{$input}->{$key}->{$v}->{$mfn}++;
 982                 warn "# saved lookup $database/$input/$key [$v] $mfn\n"; #if ($debug > 1);
 983         }
 984 }
 985
 986 =head2 config
 987
 988 Consult config values stored in C<config.yml>
 989
 990   # return database code (key under databases in yaml)
 991   $database_code = config();    # use _ from hash
 992   $database_name = config('name');
 993   $database_input_name = config('input name');
 994   $tag = config('input normalize tag');
 995
 996 Up to three levels are supported.
 997
 998 =cut
 999
1000 sub config {
1001         return unless ($config);
1002
1003         my $p = shift;
1004
1005         $p ||= '';
1006
1007         my $v;
1008
1009         warn "### getting config($p)\n" if ($debug > 1);
1010
1011         my @p = split(/\s+/,$p);
1012         if ($#p < 0) {
1013                 $v = $config->{ '_' };  # special, database code
1014         } else {
1015
1016                 my $c = dclone( $config );
1017
1018                 foreach my $k (@p) {
1019                         warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
1020                         if (ref($c) eq 'ARRAY') {
1021                                 $c = shift @$c;
1022                                 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
1023                                 last;
1024                         }
1025
1026                         if (! defined($c->{$k}) ) {
1027                                 $c = undef;
1028                                 last;
1029                         } else {
1030                                 $c = $c->{$k};
1031                         }
1032                 }
1033                 $v = $c if ($c);
1034
1035         }
1036
1037         warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
1038         warn "config( '$p' ) is empty\n" if (! $v);
1039
1040         return $v;
1041 }
1042
1043 =head2 id
1044
1045 Returns unique id of this record
1046
1047   $id = id();
1048
1049 Returns C<42/2> for 2nd occurence of MFN 42.
1050
1051 =cut
1052
1053 sub id {
1054         my $mfn = $config->{_mfn} || die "no _mfn in config data";
1055         return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1056 }
1057
1058 =head2 join_with
1059
1060 Joins walues with some delimiter
1061
1062   $v = join_with(", ", @v);
1063
1064 =cut
1065
1066 sub join_with {
1067         my $d = shift;
1068         warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1069         my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1070         return '' unless defined($v);
1071         return $v;
1072 }
1073
1074 =head2 split_rec_on
1075
1076 Split record subfield on some regex and take one of parts out
1077
1078   $a_before_semi_column =
1079         split_rec_on('200','a', /\s*;\s*/, $part);
1080
1081 C<$part> is optional number of element. First element is
1082 B<1>, not 0!
1083
1084 If there is no C<$part> parameter or C<$part> is 0, this function will
1085 return all values produced by splitting.
1086
1087 =cut
1088
1089 sub split_rec_on {
1090         die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1091
1092         my ($fld, $sf, $regex, $part) = @_;
1093         warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1094
1095         my @r = rec( $fld, $sf );
1096         my $v = shift @r;
1097         warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1098
1099         return '' if ( ! defined($v) || $v =~ /^\s*$/);
1100
1101         my @s = split( $regex, $v );
1102         warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1103         if ($part && $part > 0) {
1104                 return $s[ $part - 1 ];
1105         } else {
1106                 return @s;
1107         }
1108 }
1109
1110 # END
1111 1;