git.rot13.org Git - webpac2/blob - lib/WebPAC/Normalize.pm

   1 package WebPAC::Normalize;
   2 use Exporter 'import';
   3 @EXPORT = qw/
   4         _set_rec _set_lookup
   5         _get_ds _clean_ds
   6         _debug
   7
   8         tag search display
   9         marc marc_indicators marc_repeatable_subfield
  10         marc_compose marc_leader
  11         marc_duplicate marc_remove
  12
  13         rec1 rec2 rec
  14         regex prefix suffix surround
  15         first lookup join_with
  16
  17         split_rec_on
  18 /;
  19
  20 use warnings;
  21 use strict;
  22
  23 #use base qw/WebPAC::Common/;
  24 use Data::Dump qw/dump/;
  25 use Encode qw/from_to/;
  26 use Storable qw/dclone/;
  27
  28 # debugging warn(s)
  29 my $debug = 0;
  30
  31
  32 =head1 NAME
  33
  34 WebPAC::Normalize - describe normalisaton rules using sets
  35
  36 =head1 VERSION
  37
  38 Version 0.15
  39
  40 =cut
  41
  42 our $VERSION = '0.15';
  43
  44 =head1 SYNOPSIS
  45
  46 This module uses C<conf/normalize/*.pl> files to perform normalisation
  47 from input records using perl functions which are specialized for set
  48 processing.
  49
  50 Sets are implemented as arrays, and normalisation file is valid perl, which
  51 means that you check it's validity before running WebPAC using
  52 C<perl -c normalize.pl>.
  53
  54 Normalisation can generate multiple output normalized data. For now, supported output
  55 types (on the left side of definition) are: C<tag>, C<display>, C<search> and
  56 C<marc>.
  57
  58 =head1 FUNCTIONS
  59
  60 Functions which start with C<_> are private and used by WebPAC internally.
  61 All other functions are available for use within normalisation rules.
  62
  63 =head2 data_structure
  64
  65 Return data structure
  66
  67   my $ds = WebPAC::Normalize::data_structure(
  68         lookup => $lookup->lookup_hash,
  69         row => $row,
  70         rules => $normalize_pl_config,
  71         marc_encoding => 'utf-8',
  72         config => $config,
  73   );
  74
  75 Options C<lookup>, C<row>, C<rules> and C<log> are mandatory while all
  76 other are optional.
  77
  78 This function will B<die> if normalizastion can't be evaled.
  79
  80 Since this function isn't exported you have to call it with
  81 C<WebPAC::Normalize::data_structure>.
  82
  83 =cut
  84
  85 sub data_structure {
  86         my $arg = {@_};
  87
  88         die "need row argument" unless ($arg->{row});
  89         die "need normalisation argument" unless ($arg->{rules});
  90
  91         no strict 'subs';
  92         _set_lookup( $arg->{lookup} );
  93         _set_rec( $arg->{row} );
  94         _set_config( $arg->{config} );
  95         _clean_ds( %{ $arg } );
  96         eval "$arg->{rules}";
  97         die "error evaling $arg->{rules}: $@\n" if ($@);
  98
  99         return _get_ds();
 100 }
 101
 102 =head2 _set_rec
 103
 104 Set current record hash
 105
 106   _set_rec( $rec );
 107
 108 =cut
 109
 110 my $rec;
 111
 112 sub _set_rec {
 113         $rec = shift or die "no record hash";
 114 }
 115
 116 =head2 _set_config
 117
 118 Set current config hash
 119
 120   _set_config( $config );
 121
 122 Magic keys are:
 123
 124 =over 4
 125
 126 =item _
 127
 128 Code of current database
 129
 130 =item _mfn
 131
 132 Current MFN
 133
 134 =back
 135
 136 =cut
 137
 138 my $config;
 139
 140 sub _set_config {
 141         $config = shift;
 142 }
 143
 144 =head2 _get_ds
 145
 146 Return hash formatted as data structure
 147
 148   my $ds = _get_ds();
 149
 150 =cut
 151
 152 my ($out, $marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators);
 153 my ($marc_record_offset, $marc_fetch_offset) = (0, 0);
 154
 155 sub _get_ds {
 156         return $out;
 157 }
 158
 159 =head2 _clean_ds
 160
 161 Clean data structure hash for next record
 162
 163   _clean_ds();
 164
 165 =cut
 166
 167 sub _clean_ds {
 168         my $a = {@_};
 169         ($out,$marc_record, $marc_encoding, $marc_repeatable_subfield, $marc_indicators) = ();
 170         ($marc_record_offset, $marc_fetch_offset) = (0,0);
 171         $marc_encoding = $a->{marc_encoding};
 172 }
 173
 174 =head2 _set_lookup
 175
 176 Set current lookup hash
 177
 178   _set_lookup( $lookup );
 179
 180 =cut
 181
 182 my $lookup;
 183
 184 sub _set_lookup {
 185         $lookup = shift;
 186 }
 187
 188 =head2 _get_marc_fields
 189
 190 Get all fields defined by calls to C<marc>
 191
 192         $marc->add_fields( WebPAC::Normalize:_get_marc_fields() );
 193
 194 We are using I<magic> which detect repeatable fields only from
 195 sequence of field/subfield data generated by normalization.
 196
 197 Repeatable field is created when there is second occurence of same subfield or
 198 if any of indicators are different.
 199
 200 This is sane for most cases. Something like:
 201
 202   900a-1 900b-1 900c-1
 203   900a-2 900b-2
 204   900a-3
 205
 206 will be created from any combination of:
 207
 208   900a-1 900a-2 900a-3 900b-1 900b-2 900c-1
 209
 210 and following rules:
 211
 212   marc('900','a', rec('200','a') );
 213   marc('900','b', rec('200','b') );
 214   marc('900','c', rec('200','c') );
 215
 216 which might not be what you have in mind. If you need repeatable subfield,
 217 define it using C<marc_repeatable_subfield> like this:
 218
 219   marc_repeatable_subfield('900','a');
 220   marc('900','a', rec('200','a') );
 221   marc('900','b', rec('200','b') );
 222   marc('900','c', rec('200','c') );
 223
 224 will create:
 225
 226   900a-1 900a-2 900a-3 900b-1 900c-1
 227   900b-2
 228
 229 There is also support for returning next or specific using:
 230
 231   while (my $mf = WebPAC::Normalize:_get_marc_fields( fetch_next => 1 ) ) {
 232         # do something with $mf
 233   }
 234
 235 will always return fields from next MARC record or
 236
 237   my $mf = WebPAC::Normalize::_get_marc_fields( offset => 42 );
 238
 239 will return 42th copy record (if it exists).
 240
 241 =cut
 242
 243 sub _get_marc_fields {
 244
 245         my $arg = {@_};
 246         warn "### _get_marc_fields arg: ", dump($arg), $/ if ($debug > 2);
 247         my $offset = $marc_fetch_offset;
 248         if ($arg->{offset}) {
 249                 $offset = $arg->{offset};
 250         } elsif($arg->{fetch_next}) {
 251                 $marc_fetch_offset++;
 252         }
 253
 254         return if (! $marc_record || ref($marc_record) ne 'ARRAY');
 255
 256         warn "### full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 2);
 257
 258         my $marc_rec = $marc_record->[ $offset ];
 259
 260         warn "## _get_marc_fields (at offset: $offset) -- marc_record = ", dump( @$marc_rec ), $/ if ($debug > 1);
 261
 262         return if (! $marc_rec || ref($marc_rec) ne 'ARRAY' || $#{ $marc_rec } < 0);
 263
 264         # first, sort all existing fields
 265         # XXX might not be needed, but modern perl might randomize elements in hash
 266         my @sorted_marc_record = sort {
 267                 $a->[0] . ( $a->[3] || '' ) cmp $b->[0] . ( $b->[3] || '')
 268         } @{ $marc_rec };
 269
 270         @sorted_marc_record = @{ $marc_rec };   ### FIXME disable sorting
 271
 272         # output marc fields
 273         my @m;
 274
 275         # count unique field-subfields (used for offset when walking to next subfield)
 276         my $u;
 277         map { $u->{ $_->[0] . ( $_->[3] || '')  }++ } @sorted_marc_record;
 278
 279         if ($debug) {
 280                 warn "## marc_repeatable_subfield = ", dump( $marc_repeatable_subfield ), $/ if ( $marc_repeatable_subfield );
 281                 warn "## marc_record[$offset] = ", dump( $marc_rec ), $/;
 282                 warn "## sorted_marc_record = ", dump( \@sorted_marc_record ), $/;
 283                 warn "## subfield count = ", dump( $u ), $/;
 284         }
 285
 286         my $len = $#sorted_marc_record;
 287         my $visited;
 288         my $i = 0;
 289         my $field;
 290
 291         foreach ( 0 .. $len ) {
 292
 293                 # find next element which isn't visited
 294                 while ($visited->{$i}) {
 295                         $i = ($i + 1) % ($len + 1);
 296                 }
 297
 298                 # mark it visited
 299                 $visited->{$i}++;
 300
 301                 my $row = dclone( $sorted_marc_record[$i] );
 302
 303                 # field and subfield which is key for
 304                 # marc_repeatable_subfield and u
 305                 my $fsf = $row->[0] . ( $row->[3] || '' );
 306
 307                 if ($debug > 1) {
 308
 309                         print "### field so far [", $#$field, "] : ", dump( $field ), " ", $field ? 'T' : 'F', $/;
 310                         print "### this [$i]: ", dump( $row ),$/;
 311                         print "### sf: ", $row->[3], " vs ", $field->[3],
 312                                 $marc_repeatable_subfield->{ $row->[0] . $row->[3] } ? ' (repeatable)' : '', $/,
 313                                 if ($#$field >= 0);
 314
 315                 }
 316
 317                 # if field exists
 318                 if ( $#$field >= 0 ) {
 319                         if (
 320                                 $row->[0] ne $field->[0] ||             # field
 321                                 $row->[1] ne $field->[1] ||             # i1
 322                                 $row->[2] ne $field->[2]                # i2
 323                         ) {
 324                                 push @m, $field;
 325                                 warn "## saved/1 ", dump( $field ),$/ if ($debug);
 326                                 $field = $row;
 327
 328                         } elsif (
 329                                 ( $row->[3] lt $field->[-2] )           # subfield which is not next (e.g. a after c)
 330                                 ||
 331                                 ( $row->[3] eq $field->[-2] &&          # same subfield, but not repeatable
 332                                         ! $marc_repeatable_subfield->{ $fsf }
 333                                 )
 334                         ) {
 335                                 push @m, $field;
 336                                 warn "## saved/2 ", dump( $field ),$/ if ($debug);
 337                                 $field = $row;
 338
 339                         } else {
 340                                 # append new subfields to existing field
 341                                 push @$field, ( $row->[3], $row->[4] );
 342                         }
 343                 } else {
 344                         # insert first field
 345                         $field = $row;
 346                 }
 347
 348                 if (! $marc_repeatable_subfield->{ $fsf }) {
 349                         # make step to next subfield
 350                         $i = ($i + $u->{ $fsf } ) % ($len + 1);
 351                 }
 352         }
 353
 354         if ($#$field >= 0) {
 355                 push @m, $field;
 356                 warn "## saved/3 ", dump( $field ),$/ if ($debug);
 357         }
 358
 359         return \@m;
 360 }
 361
 362 =head2 _debug
 363
 364 Change level of debug warnings
 365
 366   _debug( 2 );
 367
 368 =cut
 369
 370 sub _debug {
 371         my $l = shift;
 372         return $debug unless defined($l);
 373         warn "debug level $l",$/ if ($l > 0);
 374         $debug = $l;
 375 }
 376
 377 =head1 Functions to create C<data_structure>
 378
 379 Those functions generally have to first in your normalization file.
 380
 381 =head2 tag
 382
 383 Define new tag for I<search> and I<display>.
 384
 385   tag('Title', rec('200','a') );
 386
 387
 388 =cut
 389
 390 sub tag {
 391         my $name = shift or die "tag needs name as first argument";
 392         my @o = grep { defined($_) && $_ ne '' } @_;
 393         return unless (@o);
 394         $out->{$name}->{tag} = $name;
 395         $out->{$name}->{search} = \@o;
 396         $out->{$name}->{display} = \@o;
 397 }
 398
 399 =head2 display
 400
 401 Define tag just for I<display>
 402
 403   @v = display('Title', rec('200','a') );
 404
 405 =cut
 406
 407 sub display {
 408         my $name = shift or die "display needs name as first argument";
 409         my @o = grep { defined($_) && $_ ne '' } @_;
 410         return unless (@o);
 411         $out->{$name}->{tag} = $name;
 412         $out->{$name}->{display} = \@o;
 413 }
 414
 415 =head2 search
 416
 417 Prepare values just for I<search>
 418
 419   @v = search('Title', rec('200','a') );
 420
 421 =cut
 422
 423 sub search {
 424         my $name = shift or die "search needs name as first argument";
 425         my @o = grep { defined($_) && $_ ne '' } @_;
 426         return unless (@o);
 427         $out->{$name}->{tag} = $name;
 428         $out->{$name}->{search} = \@o;
 429 }
 430
 431 =head2 marc_leader
 432
 433 Setup fields within MARC leader or get leader
 434
 435   marc_leader('05','c');
 436   my $leader = marc_leader();
 437
 438 =cut
 439
 440 sub marc_leader {
 441         my ($offset,$value) = @_;
 442
 443         if ($offset) {
 444                 $out->{' leader'}->{ $offset } = $value;
 445         } else {
 446                 return $out->{' leader'};
 447         }
 448 }
 449
 450 =head2 marc
 451
 452 Save value for MARC field
 453
 454   marc('900','a', rec('200','a') );
 455   marc('001', rec('000') );
 456
 457 =cut
 458
 459 sub marc {
 460         my $f = shift or die "marc needs field";
 461         die "marc field must be numer" unless ($f =~ /^\d+$/);
 462
 463         my $sf;
 464         if ($f >= 10) {
 465                 $sf = shift or die "marc needs subfield";
 466         }
 467
 468         foreach (@_) {
 469                 my $v = $_;             # make var read-write for Encode
 470                 next unless (defined($v) && $v !~ /^\s*$/);
 471                 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
 472                 my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 473                 if (defined $sf) {
 474                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $i1, $i2, $sf => $v ];
 475                 } else {
 476                         push @{ $marc_record->[ $marc_record_offset ] }, [ $f, $v ];
 477                 }
 478         }
 479 }
 480
 481 =head2 marc_repeatable_subfield
 482
 483 Save values for MARC repetable subfield
 484
 485   marc_repeatable_subfield('910', 'z', rec('909') );
 486
 487 =cut
 488
 489 sub marc_repeatable_subfield {
 490         my ($f,$sf) = @_;
 491         die "marc_repeatable_subfield need field and subfield!\n" unless ($f && $sf);
 492         $marc_repeatable_subfield->{ $f . $sf }++;
 493         marc(@_);
 494 }
 495
 496 =head2 marc_indicators
 497
 498 Set both indicators for MARC field
 499
 500   marc_indicators('900', ' ', 1);
 501
 502 Any indicator value other than C<0-9> will be treated as undefined.
 503
 504 =cut
 505
 506 sub marc_indicators {
 507         my $f = shift || die "marc_indicators need field!\n";
 508         my ($i1,$i2) = @_;
 509         die "marc_indicators($f, ...) need i1!\n" unless(defined($i1));
 510         die "marc_indicators($f, $i1, ...) need i2!\n" unless(defined($i2));
 511
 512         $i1 = ' ' if ($i1 !~ /^\d$/);
 513         $i2 = ' ' if ($i2 !~ /^\d$/);
 514         @{ $marc_indicators->{$f} } = ($i1,$i2);
 515 }
 516
 517 =head2 marc_compose
 518
 519 Save values for each MARC subfield explicitly
 520
 521   marc_compose('900',
 522         'a', rec('200','a')
 523         'b', rec('201','a')
 524         'a', rec('200','b')
 525         'c', rec('200','c')
 526   );
 527
 528 If you specify C<+> for subfield, value will be appended
 529 to previous defined subfield.
 530
 531 =cut
 532
 533 sub marc_compose {
 534         my $f = shift or die "marc_compose needs field";
 535         die "marc_compose field must be numer" unless ($f =~ /^\d+$/);
 536
 537         my ($i1,$i2) = defined($marc_indicators->{$f}) ? @{ $marc_indicators->{$f} } : (' ',' ');
 538         my $m = [ $f, $i1, $i2 ];
 539
 540         warn "### marc_compose input subfields = ", dump(@_),$/ if ($debug > 2);
 541
 542         while (@_) {
 543                 my $sf = shift or die "marc_compose $f needs subfield";
 544                 my $v = shift;
 545
 546                 next unless (defined($v) && $v !~ /^\s*$/);
 547                 from_to($v, 'iso-8859-2', $marc_encoding) if ($marc_encoding);
 548                 warn "## ++ marc_compose($f,$sf,$v) ", dump( $m ),$/ if ($debug > 1);
 549                 if ($sf ne '+') {
 550                         push @$m, ( $sf, $v );
 551                 } else {
 552                         $m->[ $#$m ] .= $v;
 553                 }
 554         }
 555
 556         warn "## marc_compose current marc = ", dump( $m ),$/ if ($debug > 1);
 557
 558         push @{ $marc_record->[ $marc_record_offset ] }, $m if ($#{$m} > 2);
 559 }
 560
 561 =head2 marc_duplicate
 562
 563 Generate copy of current MARC record and continue working on copy
 564
 565   marc_duplicate();
 566
 567 Copies can be accessed using C<< _get_marc_fields( fetch_next => 1 ) >> or
 568 C<< _get_marc_fields( offset => 42 ) >>.
 569
 570 =cut
 571
 572 sub marc_duplicate {
 573          my $m = $marc_record->[ -1 ];
 574          die "can't duplicate record which isn't defined" unless ($m);
 575          push @{ $marc_record }, dclone( $m );
 576          warn "## marc_duplicate = ", dump(@$marc_record), $/ if ($debug > 1);
 577          $marc_record_offset = $#{ $marc_record };
 578          warn "## marc_record_offset = $marc_record_offset", $/ if ($debug > 1);
 579 }
 580
 581 =head2 marc_remove
 582
 583 Remove some field or subfield from MARC record.
 584
 585   marc_remove('200');
 586   marc_remove('200','a');
 587
 588 This will erase field C<200> or C<200^a> from current MARC record.
 589
 590 This is useful after calling C<marc_duplicate> or on it's own (but, you
 591 should probably just remove that subfield definition if you are not
 592 using C<marc_duplicate>).
 593
 594 FIXME: support fields < 10.
 595
 596 =cut
 597
 598 sub marc_remove {
 599         my ($f, $sf) = @_;
 600
 601         die "marc_remove needs record number" unless defined($f);
 602
 603         my $marc = $marc_record->[ $marc_record_offset ];
 604
 605         warn "### marc_remove before = ", dump( $marc ), $/ if ($debug > 2);
 606
 607         my $i = 0;
 608         foreach ( 0 .. $#{ $marc } ) {
 609                 last unless (defined $marc->[$i]);
 610                 warn "#### working on ",dump( @{ $marc->[$i] }), $/ if ($debug > 3);
 611                 if ($marc->[$i]->[0] eq $f) {
 612                         if (! defined $sf) {
 613                                 # remove whole field
 614                                 splice @$marc, $i, 1;
 615                                 warn "#### slice \@\$marc, $i, 1 = ",dump( @{ $marc }), $/ if ($debug > 3);
 616                                 $i--;
 617                         } else {
 618                                 foreach my $j ( 0 .. (( $#{ $marc->[$i] } - 3 ) / 2) ) {
 619                                         my $o = ($j * 2) + 3;
 620                                         if ($marc->[$i]->[$o] eq $sf) {
 621                                                 # remove subfield
 622                                                 splice @{$marc->[$i]}, $o, 2;
 623                                                 warn "#### slice \@{\$marc->[$i]}, $o, 2 = ", dump( @{ $marc }), $/ if ($debug > 3);
 624                                                 # is record now empty?
 625                                                 if ($#{ $marc->[$i] } == 2) {
 626                                                         splice @$marc, $i, 1;
 627                                                         warn "#### slice \@\$marc, $i, 1 = ", dump( @{ $marc }), $/ if ($debug > 3);
 628                                                         $i--;
 629                                                 };
 630                                         }
 631                                 }
 632                         }
 633                 }
 634                 $i++;
 635         }
 636
 637         warn "### marc_remove($f", $sf ? ",$sf" : "", ") after = ", dump( $marc ), $/ if ($debug > 2);
 638
 639         $marc_record->[ $marc_record_offset ] = $marc;
 640
 641         warn "## full marc_record = ", dump( @{ $marc_record }), $/ if ($debug > 1);
 642 }
 643
 644 =head1 Functions to extract data from input
 645
 646 This function should be used inside functions to create C<data_structure> described
 647 above.
 648
 649 =head2 rec1
 650
 651 Return all values in some field
 652
 653   @v = rec1('200')
 654
 655 TODO: order of values is probably same as in source data, need to investigate that
 656
 657 =cut
 658
 659 sub rec1 {
 660         my $f = shift;
 661         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 662         return unless (defined($rec) && defined($rec->{$f}));
 663         warn "rec1($f) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 664         if (ref($rec->{$f}) eq 'ARRAY') {
 665                 return map {
 666                         if (ref($_) eq 'HASH') {
 667                                 values %{$_};
 668                         } else {
 669                                 $_;
 670                         }
 671                 } @{ $rec->{$f} };
 672         } elsif( defined($rec->{$f}) ) {
 673                 return $rec->{$f};
 674         }
 675 }
 676
 677 =head2 rec2
 678
 679 Return all values in specific field and subfield
 680
 681   @v = rec2('200','a')
 682
 683 =cut
 684
 685 sub rec2 {
 686         my $f = shift;
 687         return unless (defined($rec && $rec->{$f}));
 688         my $sf = shift;
 689         warn "rec2($f,$sf) = ", dump( $rec->{$f} ), $/ if ($debug > 1);
 690         return map {
 691                 if (ref($_->{$sf}) eq 'ARRAY') {
 692                         @{ $_->{$sf} };
 693                 } else {
 694                         $_->{$sf};
 695                 }
 696         } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
 697 }
 698
 699 =head2 rec
 700
 701 syntaxtic sugar for
 702
 703   @v = rec('200')
 704   @v = rec('200','a')
 705
 706 =cut
 707
 708 sub rec {
 709         my @out;
 710         if ($#_ == 0) {
 711                 @out = rec1(@_);
 712         } elsif ($#_ == 1) {
 713                 @out = rec2(@_);
 714         }
 715         if (@out) {
 716                 return @out;
 717         } else {
 718                 return '';
 719         }
 720 }
 721
 722 =head2 regex
 723
 724 Apply regex to some or all values
 725
 726   @v = regex( 's/foo/bar/g', @v );
 727
 728 =cut
 729
 730 sub regex {
 731         my $r = shift;
 732         my @out;
 733         #warn "r: $r\n", dump(\@_);
 734         foreach my $t (@_) {
 735                 next unless ($t);
 736                 eval "\$t =~ $r";
 737                 push @out, $t if ($t && $t ne '');
 738         }
 739         return @out;
 740 }
 741
 742 =head2 prefix
 743
 744 Prefix all values with a string
 745
 746   @v = prefix( 'my_', @v );
 747
 748 =cut
 749
 750 sub prefix {
 751         my $p = shift or return;
 752         return map { $p . $_ } grep { defined($_) } @_;
 753 }
 754
 755 =head2 suffix
 756
 757 suffix all values with a string
 758
 759   @v = suffix( '_my', @v );
 760
 761 =cut
 762
 763 sub suffix {
 764         my $s = shift or die "suffix needs string as first argument";
 765         return map { $_ . $s } grep { defined($_) } @_;
 766 }
 767
 768 =head2 surround
 769
 770 surround all values with a two strings
 771
 772   @v = surround( 'prefix_', '_suffix', @v );
 773
 774 =cut
 775
 776 sub surround {
 777         my $p = shift or die "surround need prefix as first argument";
 778         my $s = shift or die "surround needs suffix as second argument";
 779         return map { $p . $_ . $s } grep { defined($_) } @_;
 780 }
 781
 782 =head2 first
 783
 784 Return first element
 785
 786   $v = first( @v );
 787
 788 =cut
 789
 790 sub first {
 791         my $r = shift;
 792         return $r;
 793 }
 794
 795 =head2 lookup
 796
 797 Consult lookup hashes for some value
 798
 799   @v = lookup( $v );
 800   @v = lookup( @v );
 801
 802 =cut
 803
 804 sub lookup {
 805         my $k = shift or return;
 806         return unless (defined($lookup->{$k}));
 807         if (ref($lookup->{$k}) eq 'ARRAY') {
 808                 return @{ $lookup->{$k} };
 809         } else {
 810                 return $lookup->{$k};
 811         }
 812 }
 813
 814 =head2 config
 815
 816 Consult config values stored in C<config.yml>
 817
 818   # return database code (key under databases in yaml)
 819   $database_code = config();    # use _ from hash
 820   $database_name = config('name');
 821   $database_input_name = config('input name');
 822   $tag = config('input normalize tag');
 823
 824 Up to three levels are supported.
 825
 826 =cut
 827
 828 sub config {
 829         return unless ($config);
 830
 831         my $p = shift;
 832
 833         $p ||= '';
 834
 835         my $v;
 836
 837         warn "### getting config($p)\n" if ($debug > 1);
 838
 839         my @p = split(/\s+/,$p);
 840         if ($#p < 0) {
 841                 $v = $config->{ '_' };  # special, database code
 842         } else {
 843
 844                 my $c = dclone( $config );
 845
 846                 foreach my $k (@p) {
 847                         warn "### k: $k c = ",dump($c),$/ if ($debug > 1);
 848                         if (ref($c) eq 'ARRAY') {
 849                                 $c = shift @$c;
 850                                 warn "config($p) taking first occurence of '$k', probably not what you wanted!\n";
 851                                 last;
 852                         }
 853
 854                         if (! defined($c->{$k}) ) {
 855                                 $c = undef;
 856                                 last;
 857                         } else {
 858                                 $c = $c->{$k};
 859                         }
 860                 }
 861                 $v = $c if ($c);
 862
 863         }
 864
 865         warn "## config( '$p' ) = ",dump( $v ),$/ if ($v && $debug);
 866         warn "config( '$p' ) is empty\n" if (! $v);
 867
 868         return $v;
 869 }
 870
 871 =head2 id
 872
 873 Returns unique id of this record
 874
 875   $id = id();
 876
 877 Returns C<42/2> for 2nd occurence of MFN 42.
 878
 879 =cut
 880
 881 sub id {
 882         my $mfn = $config->{_mfn} || die "no _mfn in config data";
 883         return $mfn . $#{$marc_record} ? $#{$marc_record} + 1 : '';
 884 }
 885
 886 =head2 join_with
 887
 888 Joins walues with some delimiter
 889
 890   $v = join_with(", ", @v);
 891
 892 =cut
 893
 894 sub join_with {
 895         my $d = shift;
 896         warn "### join_with('$d',",dump(@_),")\n" if ($debug > 2);
 897         my $v = join($d, grep { defined($_) && $_ ne '' } @_);
 898         return '' unless defined($v);
 899         return $v;
 900 }
 901
 902 =head2 split_rec_on
 903
 904 Split record subfield on some regex and take one of parts out
 905
 906   $a_before_semi_column =
 907         split_rec_on('200','a', /\s*;\s*/, $part);
 908
 909 C<$part> is optional number of element. First element is
 910 B<1>, not 0!
 911
 912 If there is no C<$part> parameter or C<$part> is 0, this function will
 913 return all values produced by splitting.
 914
 915 =cut
 916
 917 sub split_rec_on {
 918         die "split_rec_on need (fld,sf,regex[,part]" if ($#_ < 2);
 919
 920         my ($fld, $sf, $regex, $part) = @_;
 921         warn "### regex ", ref($regex), $regex, $/ if ($debug > 2);
 922
 923         my @r = rec( $fld, $sf );
 924         my $v = shift @r;
 925         warn "### first rec($fld,$sf) = ",dump($v),$/ if ($debug > 2);
 926
 927         return '' if( ! defined($v) || $v =~ /^\s*$/);
 928
 929         my @s = split( $regex, $v );
 930         warn "## split_rec_on($fld,$sf,$regex,$part) = ",dump(@s),$/ if ($debug > 1);
 931         if ($part && $part > 0) {
 932                 return $s[ $part - 1 ];
 933         } else {
 934                 return @s;
 935         }
 936 }
 937
 938 # END
 939 1;