git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2 use Exporter 'import';
   3 @EXPORT = qw/
   4         _set_rec _set_lookup
   5         _get_ds _clean_ds
   6         _debug
   7         _pack_subfields_hash
   8
   9         tag search display
  10         marc marc_indicators marc_repeatable_subfield
  11         marc_compose marc_leader
  12         marc_duplicate marc_remove
  13         marc_original_order
  14
  15         rec1 rec2 rec
  16         regex prefix suffix surround
  17         first lookup join_with
  18
  19         split_rec_on
  20 /;
  21
  22 use warnings;
  23 use strict;
  24
  25 #use base qw/WebPAC::Common/;
  26 use Data::Dump qw/dump/;
  27 use Storable qw/dclone/;
  28
  29 # debugging warn(s)
  30 my $debug = 0;
  31
  32
  33 =head1 NAME
  34
  35 WebPAC::Normalize - describe normalisaton rules using sets
  36
  37 =head1 VERSION
  38
  39 Version 0.19
  40
  41 =cut
  42
  43 our $VERSION = '0.19';
  44
  45 =head1 SYNOPSIS
  46
  47 This module uses C<conf/normalize/*.pl> files to perform normalisation
  48 from input records using perl functions which are specialized for set
  49 processing.
  50
  51 Sets are implemented as arrays, and normalisation file is valid perl, which
  52 means that you check it's validity before running WebPAC using
  53 C<perl -c normalize.pl>.
  54
  55 Normalisation can generate multiple output normalized data. For now, supported output
  56 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
  57 C<marc>.
  58
  59 =head1 FUNCTIONS
  60
  61 Functions which start with C<_> are private and used by WebPAC internally.
  62 All other functions are available for use within normalisation rules.
  63
  64 =head2 data_structure
  65
  66 Return data structure
  67
  68   my $ds = WebPAC::Normalize::data_structure(
  69         lookup => $lookup->lookup_hash,
  70         row => $row,
  71         rules => $normalize_pl_config,
  72         marc_encoding => 'utf-8',
  73         config => $config,
  74   );
  75
  76 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
  77 other are optional.
  78
  79 This function will B<die> if normalizastion can't be evaled.
  80
  81 Since this function isn't exported you have to call it with
  82 C<WebPAC::Normalize::data_structure>.
  83
  84 =cut
  85
  86 sub data_structure {
  87         my $arg = {@_};
  88
  89         die "need row argument" unless ($arg->{row});
  90         die "need normalisation argument" unless ($arg->{rules});
  91
  92         no strict 'subs';
  93         _set_lookup( $arg->{lookup} );
  94         _set_rec( $arg->{row} );
  95         _set_config( $arg->{config} );
  96         _clean_ds( %{ $arg } );
  97         eval "$arg->{rules}";
  98         die "error evaling $arg->{rules}: $@\n" if ($@);
  99
 100         return _get_ds();
 101 }
 102
 103 =head2 _set_rec
 104
 105 Set current record hash
 106
 107   _set_rec( $rec );
 108
 109 =cut
 110
 111 my $rec;
 112
 113 sub _set_rec {
 114         $rec = shift or die "no record hash";
 115 }
 116
 117 =head2 _set_config
 118
 119 Set current config hash
 120
 121   _set_config( $config );
 122
 123 Magic keys are:
 124
 125 =over 4
 126
 127 =item _
 128
 129 Code of current database
 130
 131 =item _mfn
 132
 133 Current MFN
 134
 135 =back
 136
 137 =cut
 138
 139 my $config;
 140
 141 sub _set_config {
 142         $config = shift;
 143 }
 144
 145 =head2 _get_ds
 146
 147 Return hash formatted as data structure
 148
 149   my $ds = _get_ds();
 150
 151 =cut
 152
 153 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
 154 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
 155
 156 sub _get_ds {
 157         return $out;
 158 }
 159
 160 =head2 _clean_ds
 161
 162 Clean data structure hash for next record
 163
 164   _clean_ds();
 165
 166 =cut
 167
 168 sub _clean_ds {
 169         my $a = {@_};
 170         ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
 171         ($marc_record_offset, $marc_fetch_offset) = (0,0);
 172         $marc_encoding = $a->{marc_encoding};
 173 }
 174
 175 =head2 _set_lookup
 176
 177 Set current lookup hash
 178
 179   _set_lookup( $lookup );
 180
 181 =cut
 182
 183 my $lookup;
 184
 185 sub _set_lookup {
 186         $lookup = shift;
 187 }
 188
 189 =head2 _get_marc_fields
 190
 191 Get all fields defined by calls to C<marc>
 192
 193         $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
 194
 195 We are using I<magic> which detect repeatable fields only from
 196 sequence of field/subfield data generated by normalization.
 197
 198 Repeatable field is created when there is second occurence of same subfield or
 199 if any of indicators are different.
 200
 201 This is sane for most cases. Something like:
 202
 203   900a-1 900b-1 900c-1
 204   900a-2 900b-2
 205   900a-3
 206
 207 will be created from any combination of:
 208
 209   900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
 210
 211 and following rules:
 212
 213   marc('900','a', rec('200','a') );
 214   marc('900','b', rec('200','b') );
 215   marc('900','c', rec('200','c') );
 216
 217 which might not be what you have in mind. If you need repeatable subfield,
 218 define it using C<marc_repeatable_subfield> like this:
 219
 220   marc_repeatable_subfield('900','a');
 221   marc('900','a', rec('200','a') );
 222   marc('900','b', rec('200','b') );
 223   marc('900','c', rec('200','c') );
 224
 225 will create:
 226
 227   900a-1 900a-2 900a-3 900b-1 900c-1
 228   900b-2
 229
 230 There is also support for returning next or specific using:
 231
 232   while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
 233         # do something with $mf
 234   }
 235
 236 will always return fields from next MARC record or
 237
 238   my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
 239
 240 will return 42th copy record (if it exists).
 241
 242 =cut
 243
 244 sub _get_marc_fields {
 245
 246         my $arg = {@_};
 247         warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
 248         my $offset = $marc_fetch_offset;
 249         if ($arg->{offset}) {
 250                 $offset = $arg->{offset};
 251         } elsif($arg->{fetch_next}) {
 252                 $marc_fetch_offset++;
 253         }
 254
 255         return if (! $marc_record || ref($marc_record) ne 'ARRAY');
 256
 257         warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
 258
 259         my $marc_rec = $marc_record->[ $offset ];
 260
 261         warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
 262
 263         return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
 264
 265         # first, sort all existing fields
 266         # XXX might not be needed, but modern perl might randomize elements in hash
 267         my @sorted_marc_record = sort {
 268                 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
 269         } @{ $marc_rec };
 270
 271         @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
 272
 273         # output marc fields
 274         my @m;
 275
 276         # count unique field-subfields (used for offset when walking to next subfield)
 277         my $u;
 278         map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;
 279
 280         if ($debug) {
 281                 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
 282                 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
 283                 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
 284                 warn "## subfield count = ", dump( $u ), $/;
 285         }
 286
 287         my $len = $#sorted_marc_record;
 288         my $visited;
 289         my $i = 0;
 290         my $field;
 291
 292         foreach ( 0 .. $len ) {
 293
 294                 # find next element which isn't visited
 295                 while ($visited->{$i}) {
 296                         $i = ($i + 1) % ($len + 1);
 297                 }
 298
 299                 # mark it visited
 300                 $visited->{$i}++;
 301
 302                 my $row = dclone( $sorted_marc_record[$i] );
 303
 304                 # field and subfield which is key for
 305                 # marc_repeatable_subfield and u
 306                 my $fsf = $row->[0] . ( $row->[3] || '' );
 307
 308                 if ($debug > 1) {
 309
 310                         print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
 311                         print "### this [$i]: ", dump( $row ),$/;
 312                         print "### sf: ", $row->[3], " vs ", $field->[3],
 313                                 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
 314                                 if ($#$field >= 0);
 315
 316                 }
 317
 318                 # if field exists
 319                 if ( $#$field >= 0 ) {
 320                         if (
 321                                 $row->[0] ne $field->[0] ||             # field
 322                                 $row->[1] ne $field->[1] ||             # i1
 323                                 $row->[2] ne $field->[2]                # i2
 324                         ) {
 325                                 push @m, $field;
 326                                 warn "## saved/1 ", dump( $field ),$/ if ($debug);
 327                                 $field = $row;
 328
 329                         } elsif (
 330                                 ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
 331                                 ||
 332                                 ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
 333                                         ! $marc_repeatable_subfield->{ $fsf }
 334                                 )
 335                         ) {
 336                                 push @m, $field;
 337                                 warn "## saved/2 ", dump( $field ),$/ if ($debug);
 338                                 $field = $row;
 339
 340                         } else {
 341                                 # append new subfields to existing field
 342                                 push @$field, ( $row->[3], $row->[4] );
 343                         }
 344                 } else {
 345                         # insert first field
 346                         $field = $row;
 347                 }
 348
 349                 if (! $marc_repeatable_subfield->{ $fsf }) {
 350                         # make step to next subfield
 351                         $i = ($i + $u->{ $fsf } ) % ($len + 1);
 352                 }
 353         }
 354
 355         if ($#$field >= 0) {
 356                 push @m, $field;
 357                 warn "## saved/3 ", dump( $field ),$/ if ($debug);
 358         }
 359
 360         return \@m;
 361 }
 362
 363 =head2 _debug
 364
 365 Change level of debug warnings
 366
 367   _debug( 2 );
 368
 369 =cut
 370
 371 sub _debug {
 372         my $l = shift;
 373         return $debug unless defined($l);
 374         warn "debug level $l",$/ if ($l > 0);
 375         $debug = $l;
 376 }
 377
 378 =head1 Functions to create C<data_structure>
 379
 380 Those functions generally have to first in your normalization file.
 381
 382 =head2 tag
 383
 384 Define new tag for I<search> and I<display>.
 385
 386   tag('Title', rec('200','a') );
 387
 388
 389 =cut
 390
 391 sub tag {
 392         my $name = shift or die "tag needs name as first argument";
 393         my @o = grep { defined($_) && $_ ne '' } @_;
 394         return unless (@o);
 395         $out->{$name}->{tag} = $name;
 396         $out->{$name}->{search} = \@o;
 397         $out->{$name}->{display} = \@o;
 398 }
 399
 400 =head2 display
 401
 402 Define tag just for I<display>
 403
 404   @v = display('Title', rec('200','a') );
 405
 406 =cut
 407
 408 sub display {
 409         my $name = shift or die "display needs name as first argument";
 410         my @o = grep { defined($_) && $_ ne '' } @_;
 411         return unless (@o);
 412         $out->{$name}->{tag} = $name;
 413         $out->{$name}->{display} = \@o;
 414 }
 415
 416 =head2 search
 417
 418 Prepare values just for I<search>
 419
 420   @v = search('Title', rec('200','a') );
 421
 422 =cut
 423
 424 sub search {
 425         my $name = shift or die "search needs name as first argument";
 426         my @o = grep { defined($_) && $_ ne '' } @_;
 427         return unless (@o);
 428         $out->{$name}->{tag} = $name;
 429         $out->{$name}->{search} = \@o;
 430 }
 431
 432 =head2 marc_leader
 433
 434 Setup fields within MARC leader or get leader
 435
 436   marc_leader('05','c');
 437   my $leader = marc_leader();
 438
 439 =cut
 440
 441 sub marc_leader {
 442         my ($offset,$value) = @_;
 443
 444         if ($offset) {
 445                 $out->{' leader'}->{ $offset } = $value;
 446         } else {
 447                 return $out->{' leader'};
 448         }
 449 }
 450
 451 =head2 marc
 452
 453 Save value for MARC field
 454
 455   marc('900','a', rec('200','a') );
 456   marc('001', rec('000') );
 457
 458 =cut
 459
 460 sub marc {
 461         my $f = shift or die "marc needs field";
 462         die "marc field must be numer" unless ($f =~ /^\d+$/);
 463
 464         my $sf;
 465         if ($f >= 10) {
 466                 $sf = shift or die "marc needs subfield";
 467         }
 468
 469         foreach (@_) {
 470                 my $v = $_;             # make var read-write for Encode
 471                 next unless (defined($v) && $v !~ /^\s*$/);
 472                 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 473                 if (defined $sf) {
 474                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
 475                 } else {
 476                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
 477                 }
 478         }
 479 }
 480
 481 =head2 marc_repeatable_subfield
 482
 483 Save values for MARC repetable subfield
 484
 485   marc_repeatable_subfield('910', 'z', rec('909') );
 486
 487 =cut
 488
 489 sub marc_repeatable_subfield {
 490         my ($f,$sf) = @_;
 491         die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
 492         $marc_repeatable_subfield->{ $f . $sf }++;
 493         marc(@_);
 494 }
 495
 496 =head2 marc_indicators
 497
 498 Set both indicators for MARC field
 499
 500   marc_indicators('900', ' ', 1);
 501
 502 Any indicator value other than C<0-9> will be treated as undefined.
 503
 504 =cut
 505
 506 sub marc_indicators {
 507         my $f = shift || die "marc_indicators need field!\n";
 508         my ($i1,$i2) = @_;
 509         die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
 510         die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
 511
 512         $i1 = ' ' if ($i1 !~ /^\d$/);
 513         $i2 = ' ' if ($i2 !~ /^\d$/);
 514         @{ $marc_indicators->{$f} } = ($i1,$i2);
 515 }
 516
 517 =head2 marc_compose
 518
 519 Save values for each MARC subfield explicitly
 520
 521   marc_compose('900',
 522         'a', rec('200','a')
 523         'b', rec('201','a')
 524         'a', rec('200','b')
 525         'c', rec('200','c')
 526   );
 527
 528 If you specify C<+> for subfield, value will be appended
 529 to previous defined subfield.
 530
 531 =cut
 532
 533 sub marc_compose {
 534         my $f = shift or die "marc_compose needs field";
 535         die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
 536
 537         my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 538         my $m = [ $f, $i1, $i2 ];
 539
 540         warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
 541
 542         if ($#_ % 2 != 1) {
 543                 die "ERROR: marc_compose",dump($f,@_)," not valid (must be even).\nDo you need to add first() or join() around some argument?\n";
 544         }
 545
 546         while (@_) {
 547                 my $sf = shift;
 548                 my $v = shift;
 549
 550                 next unless (defined($v) && $v !~ /^\s*$/);
 551                 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
 552                 if ($sf ne '+') {
 553                         push @$m, ( $sf, $v );
 554                 } else {
 555                         $m->[ $#$m ] .= $v;
 556                 }
 557         }
 558
 559         warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
 560
 561         push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
 562 }
 563
 564 =head2 marc_duplicate
 565
 566 Generate copy of current MARC record and continue working on copy
 567
 568   marc_duplicate();
 569
 570 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
 571 C<< _get_marc_fields( offset => 42 ) >>.
 572
 573 =cut
 574
 575 sub marc_duplicate {
 576          my $m = $marc_record->[ -1 ];
 577          die "can't duplicate record which isn't defined" unless ($m);
 578          push @{ $marc_record }, dclone( $m );
 579          warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
 580          $marc_record_offset = $#{ $marc_record };
 581          warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
 582 }
 583
 584 =head2 marc_remove
 585
 586 Remove some field or subfield from MARC record.
 587
 588   marc_remove('200');
 589   marc_remove('200','a');
 590
 591 This will erase field C<200> or C<200^a> from current MARC record.
 592
 593 This is useful after calling C<marc_duplicate> or on it's own (but, you
 594 should probably just remove that subfield definition if you are not
 595 using C<marc_duplicate>).
 596
 597 FIXME: support fields < 10.
 598
 599 =cut
 600
 601 sub marc_remove {
 602         my ($f, $sf) = @_;
 603
 604         die "marc_remove needs record number" unless defined($f);
 605
 606         my $marc = $marc_record->[ $marc_record_offset ];
 607
 608         warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
 609
 610         my $i = 0;
 611         foreach ( 0 .. $#{ $marc } ) {
 612                 last unless (defined $marc->[$i]);
 613                 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
 614                 if ($marc->[$i]->[0] eq $f) {
 615                         if (! defined $sf) {
 616                                 # remove whole field
 617                                 splice @$marc, $i, 1;
 618                                 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
 619                                 $i--;
 620                         } else {
 621                                 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
 622                                         my $o = ($j * 2) + 3;
 623                                         if ($marc->[$i]->[$o] eq $sf) {
 624                                                 # remove subfield
 625                                                 splice @{$marc->[$i]}, $o, 2;
 626                                                 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
 627                                                 # is record now empty?
 628                                                 if ($#{ $marc->[$i] } == 2) {
 629                                                         splice @$marc, $i, 1;
 630                                                         warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
 631                                                         $i--;
 632                                                 };
 633                                         }
 634                                 }
 635                         }
 636                 }
 637                 $i++;
 638         }
 639
 640         warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
 641
 642         $marc_record->[ $marc_record_offset ] = $marc;
 643
 644         warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
 645 }
 646
 647 =head2 marc_original_order
 648
 649 Copy all subfields preserving original order to marc field.
 650
 651   marc_original_order( marc_field_number, original_input_field_number );
 652
 653 Please note that field numbers are consistent with other commands (marc
 654 field number first), but somewhat counter-intuitive (destination and then
 655 source).
 656
 657 You might want to use this command if you are just renaming subfields or
 658 using pre-processing modify_record in C<config.yml> and don't need any
 659 post-processing or want to preserve order of original subfields.
 660
 661
 662 =cut
 663
 664 sub marc_original_order {
 665
 666         my ($to, $from) = @_;
 667         die "marc_original_order needs from and to fields\n" unless ($from && $to);
 668
 669         return unless defined($rec->{$from});
 670
 671         my $r = $rec->{$from};
 672         die "record field $from isn't array\n" unless (ref($r) eq 'ARRAY');
 673
 674         my ($i1,$i2) = defined($marc_indicators->{$to}) ? @{ $marc_indicators->{$to} } : (' ',' ');
 675         warn "## marc_original_order($to,$from) source = ", dump( $r ),$/ if ($debug > 1);
 676
 677         foreach my $d (@$r) {
 678
 679                 if (! defined($d->{subfields}) && ref($d->{subfields}) ne 'ARRAY') {
 680                         warn "# marc_original_order($to,$from): field $from doesn't have subfields specification\n";
 681                         next;
 682                 }
 683
 684                 my @sfs = @{ $d->{subfields} };
 685
 686                 die "field $from doesn't have even number of subfields specifications\n" unless($#sfs % 2 == 1);
 687
 688                 warn "#--> d: ",dump($d), "\n#--> sfs: ",dump(@sfs),$/ if ($debug > 2);
 689
 690                 my $m = [ $to, $i1, $i2 ];
 691
 692                 while (my $sf = shift @sfs) {
 693
 694                         warn "#--> sf: ",dump($sf), $/ if ($debug > 2);
 695                         my $offset = shift @sfs;
 696                         die "corrupted sufields specification for field $from\n" unless defined($offset);
 697
 698                         my $v;
 699                         if (ref($d->{$sf}) eq 'ARRAY') {
 700                                 $v = $d->{$sf}->[$offset] if (defined($d->{$sf}->[$offset]));
 701                         } elsif ($offset == 0) {
 702                                 $v = $d->{$sf};
 703                         } else {
 704                                 die "field $from subfield '$sf' need occurence $offset which doesn't exist", dump($d->{$sf});
 705                         }
 706                         push @$m, ( $sf, $v ) if (defined($v));
 707                 }
 708
 709                 if ($#{$m} > 2) {
 710                         push @{ $marc_record->[ $marc_record_offset ] }, $m;
 711                 }
 712         }
 713
 714         warn "## marc_record = ", dump( $marc_record ),$/ if ($debug > 1);
 715 }
 716
 717
 718 =head1 Functions to extract data from input
 719
 720 This function should be used inside functions to create C<data_structure> described
 721 above.
 722
 723 =head2 _pack_subfields_hash
 724
 725  @values = _pack_subfields_hash( $h, $include_subfields )
 726
 727 =cut
 728
 729 sub _pack_subfields_hash {
 730
 731         warn "## _pack_subfields_hash( ",dump(@_), " )\n" if ($debug > 1);
 732
 733         my ($h,$include_subfields) = @_;
 734
 735         if ( defined($h->{subfields}) ) {
 736                 my $sfs = delete $h->{subfields} || die "no subfields?";
 737                 my @out;
 738                 while (@$sfs) {
 739                         my $sf = shift @$sfs;
 740                         push @out, '^' . $sf if ($include_subfields);
 741                         my $o = shift @$sfs;
 742                         if ($o == 0 && ref( $h->{$sf} ) ne 'ARRAY' ) {
 743                                 # single element subfields are not arrays
 744 warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 745
 746                                 push @out, $h->{$sf};
 747                         } else {
 748 warn "====> $sf $o / $#$sfs ", dump( $sfs, $h->{$sf} ), "\n";
 749                                 push @out, $h->{$sf}->[$o];
 750                         }
 751                 }
 752                 if ($include_subfields) {
 753                         return join('', @out);
 754                 } else {
 755                         return @out;
 756                 }
 757         } else {
 758                 if ($include_subfields) {
 759                         my $out = '';
 760                         foreach my $sf (keys %$h) {
 761                                 if (ref($h->{$sf}) eq 'ARRAY') {
 762                                         $out .= '^' . $sf . join('^' . $sf, @{ $h->{$sf} });
 763                                 } else {
 764                                         $out .= '^' . $sf . $h->{$sf};
 765                                 }
 766                         }
 767                         return $out;
 768                 } else {
 769                         # FIXME this should probably be in alphabetical order instead of hash order
 770                         values %{$h};
 771                 }
 772         }
 773 }
 774
 775 =head2 rec1
 776
 777 Return all values in some field
 778
 779   @v = rec1('200')
 780
 781 TODO: order of values is probably same as in source data, need to investigate that
 782
 783 =cut
 784
 785 sub rec1 {
 786         my $f = shift;
 787         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 788         return unless (defined($rec) && defined($rec->{$f}));
 789         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 790         if (ref($rec->{$f}) eq 'ARRAY') {
 791                 my @out;
 792                 foreach my $h ( @{ $rec->{$f} } ) {
 793                         if (ref($h) eq 'HASH') {
 794                                 push @out, ( _pack_subfields_hash( $h ) );
 795                         } else {
 796                                 push @out, $h;
 797                         }
 798                 }
 799                 return @out;
 800         } elsif( defined($rec->{$f}) ) {
 801                 return $rec->{$f};
 802         }
 803 }
 804
 805 =head2 rec2
 806
 807 Return all values in specific field and subfield
 808
 809   @v = rec2('200','a')
 810
 811 =cut
 812
 813 sub rec2 {
 814         my $f = shift;
 815         return unless (defined($rec && $rec->{$f}));
 816         my $sf = shift;
 817         warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 818         return map {
 819                 if (ref($_->{$sf}) eq 'ARRAY') {
 820                         @{ $_->{$sf} };
 821                 } else {
 822                         $_->{$sf};
 823                 }
 824         } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
 825 }
 826
 827 =head2 rec
 828
 829 syntaxtic sugar for
 830
 831   @v = rec('200')
 832   @v = rec('200','a')
 833
 834 =cut
 835
 836 sub rec {
 837         my @out;
 838         if ($#_ == 0) {
 839                 @out = rec1(@_);
 840         } elsif ($#_ == 1) {
 841                 @out = rec2(@_);
 842         }
 843         if (@out) {
 844                 return @out;
 845         } else {
 846                 return '';
 847         }
 848 }
 849
 850 =head2 regex
 851
 852 Apply regex to some or all values
 853
 854   @v = regex( 's/foo/bar/g', @v );
 855
 856 =cut
 857
 858 sub regex {
 859         my $r = shift;
 860         my @out;
 861         #warn "r: $r\n", dump(\@_);
 862         foreach my $t (@_) {
 863                 next unless ($t);
 864                 eval "\$t =~ $r";
 865                 push @out, $t if ($t && $t ne '');
 866         }
 867         return @out;
 868 }
 869
 870 =head2 prefix
 871
 872 Prefix all values with a string
 873
 874   @v = prefix( 'my_', @v );
 875
 876 =cut
 877
 878 sub prefix {
 879         my $p = shift or return;
 880         return map { $p . $_ } grep { defined($_) } @_;
 881 }
 882
 883 =head2 suffix
 884
 885 suffix all values with a string
 886
 887   @v = suffix( '_my', @v );
 888
 889 =cut
 890
 891 sub suffix {
 892         my $s = shift or die "suffix needs string as first argument";
 893         return map { $_ . $s } grep { defined($_) } @_;
 894 }
 895
 896 =head2 surround
 897
 898 surround all values with a two strings
 899
 900   @v = surround( 'prefix_', '_suffix', @v );
 901
 902 =cut
 903
 904 sub surround {
 905         my $p = shift or die "surround need prefix as first argument";
 906         my $s = shift or die "surround needs suffix as second argument";
 907         return map { $p . $_ . $s } grep { defined($_) } @_;
 908 }
 909
 910 =head2 first
 911
 912 Return first element
 913
 914   $v = first( @v );
 915
 916 =cut
 917
 918 sub first {
 919         my $r = shift;
 920         return $r;
 921 }
 922
 923 =head2 lookup
 924
 925 Consult lookup hashes for some value
 926
 927   @v = lookup( $v );
 928   @v = lookup( @v );
 929
 930 =cut
 931
 932 sub lookup {
 933         my $k = shift or return;
 934         return unless (defined($lookup->{$k}));
 935         if (ref($lookup->{$k}) eq 'ARRAY') {
 936                 return @{ $lookup->{$k} };
 937         } else {
 938                 return $lookup->{$k};
 939         }
 940 }
 941
 942 =head2 config
 943
 944 Consult config values stored in C<config.yml>
 945
 946   # return database code (key under databases in yaml)
 947   $database_code = config();    # use _ from hash
 948   $database_name = config('name');
 949   $database_input_name = config('input name');
 950   $tag = config('input normalize tag');
 951
 952 Up to three levels are supported.
 953
 954 =cut
 955
 956 sub config {
 957         return unless ($config);
 958
 959         my $p = shift;
 960
 961         $p ||= '';
 962
 963         my $v;
 964
 965         warn "### getting config($p)\n" if ($debug > 1);
 966
 967         my @p = split(/\s+/,$p);
 968         if ($#p < 0) {
 969                 $v = $config->{ '_' };  # special, database code
 970         } else {
 971
 972                 my $c = dclone( $config );
 973
 974                 foreach my $k (@p) {
 975                         warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
 976                         if (ref($c) eq 'ARRAY') {
 977                                 $c = shift @$c;
 978                                 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
 979                                 last;
 980                         }
 981
 982                         if (! defined($c->{$k}) ) {
 983                                 $c = undef;
 984                                 last;
 985                         } else {
 986                                 $c = $c->{$k};
 987                         }
 988                 }
 989                 $v = $c if ($c);
 990
 991         }
 992
 993         warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
 994         warn "config( '$p' ) is empty\n" if (! $v);
 995
 996         return $v;
 997 }
 998
 999 =head2 id
1000
1001 Returns unique id of this record
1002
1003   $id = id();
1004
1005 Returns C<42/2> for 2nd occurence of MFN 42.
1006
1007 =cut
1008
1009 sub id {
1010         my $mfn = $config->{_mfn} || die "no _mfn in config data";
1011         return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
1012 }
1013
1014 =head2 join_with
1015
1016 Joins walues with some delimiter
1017
1018   $v = join_with(", ", @v);
1019
1020 =cut
1021
1022 sub join_with {
1023         my $d = shift;
1024         warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
1025         my $v = join($d, grep { defined($_) && $_ ne '' } @_);
1026         return '' unless defined($v);
1027         return $v;
1028 }
1029
1030 =head2 split_rec_on
1031
1032 Split record subfield on some regex and take one of parts out
1033
1034   $a_before_semi_column =
1035         split_rec_on('200','a', /\s*;\s*/, $part);
1036
1037 C<$part> is optional number of element. First element is
1038 B<1>, not 0!
1039
1040 If there is no C<$part> parameter or C<$part> is 0, this function will
1041 return all values produced by splitting.
1042
1043 =cut
1044
1045 sub split_rec_on {
1046         die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
1047
1048         my ($fld, $sf, $regex, $part) = @_;
1049         warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
1050
1051         my @r = rec( $fld, $sf );
1052         my $v = shift @r;
1053         warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
1054
1055         return '' if ( ! defined($v) || $v =~ /^\s*$/);
1056
1057         my @s = split( $regex, $v );
1058         warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
1059         if ($part && $part > 0) {
1060                 return $s[ $part - 1 ];
1061         } else {
1062                 return @s;
1063         }
1064 }
1065
1066 # END
1067 1;