lib/WebPAC/Input.pm
lib/WebPAC/Input/ISIS.pm
lib/WebPAC/Normalize.pm
-lib/WebPAC/Normalize/XML.pm
lib/WebPAC/Output.pm
lib/WebPAC/Output/CDBI.pm
lib/WebPAC/Output/Estraier.pm
t/pod.t
conf/lookup/example.pm
conf/lookup/isis.pm
-conf/normalize/isis.xml
'Log::Log4perl' => 1.02,
'Data::Dumper' => 0,
'Cwd' => 0,
- 'Text::Iconv' => 0,
'Storable' => 0,
'DBM::Deep' => 0,
- 'XML::Simple' => 0,
'Template' => 0,
'Time::HiRes' => 0,
'File::Temp' => 0,
+ added --stats to report field and subfield usage [2.14]
+ add validator for input data [2.15]
+ add Excel input format [2.16]
-- fix varous annoyences in code [2.17]
++ remove WebPAC::Normalize::XML and promote WebPAC::Normalize::Set to WebPAC::Normalize [2.20]
- support arrays for normalize/path and lookup
- add dBase input format
- remove delimiters characters from index and query entered
+++ /dev/null
-<?xml version="1.0" encoding="ISO-8859-2"?>
-<!--
- isis_codepage is code page which isis uses
- index_codepage is code page which will be stored in index
-
- format of this file is described in: doc/formating_xml.txt
--->
-<config
- isis_codepage="852"
->
-
-<!--
- <isis type="swish|display|index">_pre_v000^x_sep_v000^x_sep_v000^x_post_</isis>
- <config type="swish|display">name of var from config file</config>
--->
-
- <format name="IDths">
- <!--
- <![CDATA[<a href="?rm=results&show_full=1&f=IDths&v=%s">%s</a>]]>
- -->
- <![CDATA[<a href="lookup{900_mfn:%s}.html">%s</a>]]>
- </format>
-
- <indexer>
- <headline>
- <isis type="display">v250^a</isis>
- <isis type="display" append="1">[v251]</isis>
- <isis type="display" append="1"> / [v562^4] v562^a</isis>
- <isis type="display" append="1">filter{CROVOC}v800</isis>
- </headline>
-
- <filename>
- <isis type="filename">out/thes/v000.html</isis>
- </filename>
-
- <AllThes>
- <isis type="swish">v250 v450 v258 v458 v253 v453 v254 v454 v330 v338</isis>
- <!--
- <isis type="index">v250^a / [v561^4]</isis>
- <isis type="index">v450^a</isis>
- <isis type="index">v258^a</isis>
- <isis type="index">v458^a</isis>
- <isis type="index">v253^a</isis>
- <isis type="index">v453^a</isis>
- <isis type="index">v254^a</isis>
- <isis type="index">v454^a</isis>
- <isis type="index">v330^a</isis>
- <isis type="index">v338^a</isis>
- -->
- </AllThes>
-
- <Source name="Izvor: " order="5">
- <isis type="display">v800 -- v901^c</isis>
- </Source>
-
- <line name="..................." order="9">
- <config type="display">line</config>
- </line>
-
- <DescriptorHR name="HRVATSKI:" order="10">
- <isis type="swish">eval{"v901^a" eq "Deskriptor"}v250^a</isis>
- <isis type="swish">eval{"s901^a" eq "Deskriptor"}v450^a</isis>
- <isis type="index">eval{"v901^a" eq "Deskriptor"}v250^a / [v562^4] v562^a</isis>
- <isis type="index" append="1">eval{qq#v250^a# and "v901^a" eq "Deskriptor"}filter{CROVOC}v800</isis>
- <!--
- <isis type="index">eval{"v901^a" ne "Deskriptor"}v250^a [v251]</isis>
- -->
- <isis type="index">eval{qq#v450^a#}filter{CROVOC}v450^a >> s250^a / [s562^4] s562^a s800</isis>
- <isis type="display">[v251]</isis>
- <isis type="display" append="1">v250^a</isis>
- <isis type="display" append="1">eval{qq#v250^a#}filter{CROVOC}v800</isis>
- </DescriptorHR>
-
- <NonDescriptorHR name="Uporabi za: " order="20">
- <isis type="display" >
- <delimiter> * </delimiter>
- <value>v450^a</value>
- </isis>
- </NonDescriptorHR>
-
- <ScopeNoteHR name="Napomena: " order="30">
- <isis type="display">
- <delimiter><![CDATA[<br/>]]></delimiter>
- <value>v330^a</value>
- </isis>
- </ScopeNoteHR>
-
- <SubjectAreaHR name="Podruèje: " order="50">
- <!--<isis type="swish">eval{"v901^a" ne "Deskriptor"}v250^a</isis>-->
- <isis type="swish">eval{"v901^a" eq "Podruèje"}v250^a</isis>
- <isis type="index">eval{"v901^a" eq "Podruèje"}[v251] v250^a</isis>
- <!--
- <isis type="index" append="1">[v251]</isis>
- -->
- <isis type="display">[v561^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v561^1;;v561^a</isis>
- <isis type="display" append="1" delimiter=" ">lookup{crovoc:v561^1}</isis>
- </SubjectAreaHR>
-
- <MicrothesaurusHR name="Potpojmovnik: " order="60">
- <isis type="display">[v562^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v562^1;;v562^a</isis>
- <isis type="display" append="1" delimiter=" ">lookup{crovoc:v562^1}</isis>
- </MicrothesaurusHR>
-<!--
- <ClassCode name="Klasifikacijski kod:" order="70">
- <isis>v251</isis>
- </ClassCode>
--->
- <URL name="URL: " order="80">
- <isis type="display">v856^u</isis>
- </URL>
-
- <BroaderTerm name="©iri pojam: " order="90">
- <isis type="display" format_name="IDths" format_delimiter=";;">
- <delimiter><![CDATA[</li><li>]]></delimiter>
- <value>v556^1;;v556^a lookup{crovoc:v556^1}</value>
- </isis>
- <isis type="display" format_name="IDths" format_delimiter=";;">
- <delimiter><![CDATA[</li><li>]]></delimiter>
- <value>v461^1;;v461^a lookup{crovoc:v461^1}</value>
- </isis>
- </BroaderTerm>
-
- <NarrowerTerm name="U¾i pojam: " order="100">
- <isis type="display" format_name="IDths" format_delimiter=";;">
- <delimiter><![CDATA[</li><li>]]></delimiter>
- <value>v553^1;;v553^a lookup{crovoc:v553^1}</value>
- </isis>
- <!--
- <isis type="display" format_name="IDths" format_delimiter=";;" >
- <delimiter> * </delimiter>
- <value>[a:v561^4];;[d:[a:v561^4]]</value>
- </isis>
- -->
- <isis type="display" format_name="IDths" format_delimiter=";;" >
- <delimiter><![CDATA[</li><li>]]></delimiter>
- <value>eval{"v901^a" eq "Podruèje"}lookup{a:v251::};;lookup{d:lookup{a:v251::}}</value>
- </isis>
- <isis type="display" append="1" format_name="IDths" format_delimiter=";;" >
- <delimiter><![CDATA[</li><li>]]></delimiter>
- <value>eval{"v901^a" eq "Mikrotezaurus"}lookup{a:v561^4:v251:};;lookup{d:lookup{a:v561^4:v251:}}</value>
- </isis>
- <isis type="display" format_name="IDths" format_delimiter=";;" >
- <delimiter><![CDATA[</li><li>]]></delimiter>
- <value>eval{"v901^a" eq "Deskriptor"}lookup{a:v561^4:v562^4:v900};;lookup{d:lookup{a:v561^4:v562^4:v900}}</value>
- </isis>
- </NarrowerTerm>
-
- <RelatedTerm name="Srodni pojam: " order="110">
- <isis type="display" format_name="IDths" format_delimiter=";;" sort="1">
- <delimiter> * </delimiter>
- <value>v550^1;;v550^a lookup{crovoc:v550^1}</value>
- </isis>
- </RelatedTerm>
-
- <SeeNext name="Slijedi: " order="120">
- <isis type="display" format_name="IDths" format_delimiter=";;">
- <delimiter>, </delimiter>
- <value>v440^1;;v440^a lookup{crovoc:v440^1}</value>
- </isis>
- </SeeNext>
-
- <SeeBefore name="Prethodi: " order="130">
- <isis type="display" format_name="IDths" format_delimiter=";;">
- <delimiter>, </delimiter>
- <value>v430^1;;v430^a lookup{crovoc:v430^1}</value>
- </isis>
- </SeeBefore>
-
- <line2 name=".................." order="139">
- <config type="display">line</config>
- </line2>
-
- <DescriptorEN name="ENGLESKI:" order="140">
- <isis type="swish">eval{"v901^a" eq "Deskriptor"}v258^a</isis>
- <isis type="swish">eval{"s901^a" eq "Deskriptor"}v458^a</isis>
- <isis type="index">eval{"v901^a" eq "Deskriptor"}v258^a / [v572^4] v572^a</isis>
- <isis type="index">eval{qq#v458^a#}filter{CROVOC}v458^a >> s258^a / [s572^4] s572^a s800</isis>
- <isis type="display">v258^a</isis>
- </DescriptorEN>
-
- <ScopeNoteEN name="Napomena: " order="150">
- <isis type="display">
- <delimiter><![CDATA[<br/>]]></delimiter>
- <value>v338^a</value>
- </isis>
- </ScopeNoteEN>
-
- <NonDescriptorEN name="Uporabi za: " order="160">
- <isis type="display" >
- <delimiter> * </delimiter>
- <value>v458^a</value>
- </isis>
- </NonDescriptorEN>
-
-
- <SubjectAreaEN name="Podruèje: " order="170">
- <isis type="swish">v571 v572</isis>
- <isis type="index">[v251]</isis>
- <isis type="index" append="1">v571^a. v572^a</isis>
-<!-- <isis type="display">[v571^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v571^1;;v571^a</isis>
---> </SubjectAreaEN>
-
-<!-- <MicrothesaurusEN name="Potpojmovnik: " order="180">
- <isis type="display">[v572^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v572^1;;v572^a</isis>
- </MicrothesaurusEN>
--->
- <line3 name=".................." order="189">
- <config type="display">eval{"v253^9" eq "1"} line</config>
- </line3>
-
- <DescriptorFR name="FRANCUSKI:" order="190">
- <isis type="swish">eval{"v901^a" eq "Deskriptor"}v253^a</isis>
- <isis type="swish">eval{"s901^a" eq "Deskriptor"}v453^a</isis>
- <isis type="index">eval{"v901^a" eq "Deskriptor"}v253^a / [v572^4] v572^a</isis>
- <isis type="index">eval{qq#v453^a#}filter{CROVOC}v453^a >> s253^a / [s572^4] s572^a s800</isis>
- <isis type="display">v253^a</isis>
- </DescriptorFR>
-
- <NonDescriptorFR name="Uporabi za: " order="200">
- <isis type="display">v453^a</isis>
- </NonDescriptorFR>
-
- <SubjectAreaFR name="Podruèje: " order="210">
- <isis type="swish">v573 v574</isis>
- <isis type="index">[v251] </isis>
- <isis type="index" append="1">v573^a. v574^a</isis>
-<!-- <isis type="display">[v573^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v573^1;;v573^a</isis>
---> </SubjectAreaFR>
-<!--
- <MicrothesaurusFR name="Potpojmovnik: " order="220">
- <isis type="display">[v574^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v574^1;;v574^a</isis>
- </MicrothesaurusFR>
--->
- <line4 name=".................." order="229">
- <config type="display">eval{"v254^9" eq "1"} line</config>
- </line4>
-
- <DescriptorGE name="NJEMAÈKI:" order="230">
- <isis type="swish">eval{"v901^a" eq "Deskriptor"}v254^a</isis>
- <isis type="swish">eval{"s901^a" eq "Deskriptor"}v454^a</isis>
- <isis type="index">eval{"v901^a" eq "Deskriptor"}v254^a / [v572^4] v572^a</isis>
- <isis type="index">eval{qq#v454^a#}filter{CROVOC}v454^a >> s254^a / [s572^4] s572^a s800</isis>
- <isis type="display">v254^a</isis>
- </DescriptorGE>
-
- <NonDescriptorGE name="Uporabi za: " order="240">
- <isis type="display">v454^a</isis>
- </NonDescriptorGE>
-
- <SubjectAreaGE name="Podruèje: " order="250">
- <isis type="swish">v575 v576</isis>
- <isis type="index">[v251] </isis>
- <isis type="index" append="1">v575^a. v576^a.</isis>
-<!-- <isis type="display">[v575^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v575^1;;v575^a</isis>
---> </SubjectAreaGE>
-<!--
- <MicrothesaurusGE name="Potpojmovnik: " order="260">
- <isis type="display">[v576^4]</isis>
- <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v576^1;;v576^a</isis>
- </MicrothesaurusGE>
--->
- <!--
- order is important for type="lookup_key|lookup_val"
- They create internal hash file for lookups from other
- fields. Therefore, you mist ensure (in config file) that
- databases which are source of lookup are called before
- databases that use lookup fields.
- Those lookup are then used using special ##lookup##
- token inside formating instructions like this:
- <isis type="lookup">v999^x</isis>
- -->
-
- <IDths name="ID" order="300">
- <isis type="lookup_key">v900</isis>
- <isis type="swish">v900</isis>
- </IDths>
-
- <SubjectIndex name="Predmetno kazalo" order="301">
- <isis type="lookup_val">eval{"v901^a" eq "Deskriptor"} / [v562^4] v562^a</isis>
- <isis type="lookup_val">eval{"v901^a" eq "Deskriptor"} [v251]</isis>
- <!--
- <isis type="lookup_val" append="1"> v562^a</isis>
- -->
- </SubjectIndex>
-
- </indexer>
-</config>
+++ /dev/null
-<?xml version="1.0" encoding="ISO-8859-2"?>
-<!--
- Here are few examples of syntax.
-
- <isis type="search|display">something v200^a something else</isis>
- <config type="search|display">name of var from config file</config>
--->
-<config>
-<indexer>
- <MFN>
- <isis>v000</isis>
- </MFN>
-
- <ISBN>
- <isis type="display">v10</isis>
- </ISBN>
-
- <ISSN>
- <isis type="display">v11</isis>
- </ISSN>
-
- <IdentificationNumbers name="ISN">
- <isis type="search">v10 v11</isis>
- </IdentificationNumbers>
-
- <Language>
- <isis>v101</isis>
- </Language>
-
- <TitleProper>
- <isis><![CDATA[filter{regex(s/<[^>]*>/)}v200^a]]></isis>
- </TitleProper>
-
- <titleNo>
- <isis>v200^9</isis>
- </titleNo>
-
- <Subtitle>
- <isis>v200^e</isis>
- </Subtitle>
-
- <TitleProper2>
- <isis>v200^c</isis>
- </TitleProper2>
-
- <ParallelTitle>
- <isis>v200^d</isis>
- </ParallelTitle>
-
- <Responsibility>
- <isis>v200^f ; v200^g</isis>
- </Responsibility>
-
- <ResponsibilityFirst>
- <isis type="display">v200^f</isis>
- </ResponsibilityFirst>
-
- <ResponsibilitySecond>
- <isis type="display">v200^g</isis>
- </ResponsibilitySecond>
-
- <VolumeDesignation>
- <isis>v200^v</isis>
- </VolumeDesignation>
-
- <EditionStatement>
- <isis>v205^a</isis>
- </EditionStatement>
-
- <SerialNo>
- <isis>v207^a</isis>
- </SerialNo>
-
- <fond>
- <isis>v209^a</isis>
- </fond>
-
- <PlacePublication>
- <isis>v210^a</isis>
- </PlacePublication>
-
- <NamePublisher>
- <isis>v210^c</isis>
- </NamePublisher>
-
- <DatePublication>
- <isis>v210^d</isis>
- </DatePublication>
-
- <PhysicalDescription>
- <isis>v215^a : v215^c ; v215^d</isis>
- </PhysicalDescription>
-
- <MaterialDesignation>
- <isis>v215^a</isis>
- </MaterialDesignation>
-
- <PhysicalDetails>
- <isis>v215^c</isis>
- </PhysicalDetails>
-
- <AccompanyingMaterial>
- <isis>v215^e</isis>
- </AccompanyingMaterial>
-
- <Series>
- <isis>v225^a = v225^d : v225^e ; v225^v. v225h, v225^i ; v225^w</isis>
- </Series>
-
- <SeriesTitle>
- <isis>v225^a</isis>
- </SeriesTitle>
-
- <GeneralNote>
- <isis>v300</isis>
- </GeneralNote>
-
- <EditionNote>
- <isis>v305</isis>
- </EditionNote>
-
- <PhysicalDescriptionNote>
- <isis>v307</isis>
- </PhysicalDescriptionNote>
-
- <IntellectResponsNote>
- <isis>v314</isis>
- </IntellectResponsNote>
-
- <InternalBibliographies>
- <isis>v320</isis>
- </InternalBibliographies>
-
- <Frequency>
- <isis>v326</isis>
- </Frequency>
-
- <ContentsNote>
- <isis>v327</isis>
- </ContentsNote>
-
- <Summary>
- <isis>v330</isis>
- </Summary>
-
- <SystemRequirements>
- <isis>v337</isis>
- </SystemRequirements>
-
- <IssuedWith>
- <isis>v423^z: v423^a / v423^c v423^b</isis>
- </IssuedWith>
-
- <Parts>
- <isis type="display">lookup{dio-jzav:v900}</isis>
- <isis type="search">lookup{id-dio-jzav:v900}</isis>
- </Parts>
-
- <PartsEF>
- <isis type="display">lookup{naslov-efzg:001v001}</isis>
- </PartsEF>
-
- <PartsID>
- <isis>v463^1</isis>
- </PartsID>
-
- <Piece>
- <isis>lookup{naslov-efzg:s463^1}</isis>
- </Piece>
-
- <PieceSubtitle>
- <isis>lookup{podnaslov-efzg:s463^1}</isis>
- </PieceSubtitle>
-
- <PieceNum>
- <isis>v463^v</isis>
- </PieceNum>
-
-
- <PieceAnalitic>
- <isis>v464^a / v464^g v464^f</isis>
- </PieceAnalitic>
-
- <UniformHeading>
- <isis>v500^a. v503^b</isis>
- </UniformHeading>
-
- <ExpandedTitle>
- <isis>v532</isis>
- </ExpandedTitle>
-
- <Form>
- <isis>v608</isis>
- </Form>
-
- <UncontrolledTerms>
- <isis>v610</isis>
- </UncontrolledTerms>
-
- <UDC_All>
- <isis type="search">v675</isis>
- </UDC_All>
-
- <UDC>
- <isis>v675^a</isis>
- </UDC>
-
- <UDCb>
- <isis>v675^b</isis>
- </UDCb>
-
- <APA>
- <isis>v686</isis>
- </APA>
-
- <PersonalName>
- <isis>v700^a, v700^b</isis>
- <isis type="index">v700^a, v700^b</isis>
- </PersonalName>
-
- <PersonalName2>
- <isis>v701^a, v701^b</isis>
- <isis type="index">v701^a, v701^b</isis>
- </PersonalName2>
-
- <PersonalNameOther>
- <isis>v702^a, v702^b</isis>
- <isis type="index">v702^a, v702^b</isis>
- </PersonalNameOther>
-
- <Names>
- <isis>v700^a, v700^b</isis>
- <isis>v701^a, v701^b</isis>
- <isis>v702^a, v702^b</isis>
- <isis type="index">v700^a, v700^b</isis>
- <isis type="index">v701^a, v701^b</isis>
- <isis type="index">v702^a, v702^b</isis>
- </Names>
-
- <CorporateName>
- <isis>v710^a</isis>
- </CorporateName>
-
- <CorporateName2>
- <isis>v711^a</isis>
- </CorporateName2>
-
- <OriginatingSource>
- <isis>v801</isis>
- </OriginatingSource>
-
- <URL>
- <isis>v856^u</isis>
- </URL>
-
- <level>
- <isis>v909</isis>
- </level>
-
- <ID>
- <isis>v900</isis>
- <isis>001v001</isis>
- </ID>
-
- <Set>
- <isis type="display">lookup{set-jzav:v946^1}</isis>
- <isis type="search">v946^1</isis>
- <isis type="display">lookup{set-efzg:v461^1}</isis>
- <isis type="search">v461^1</isis>
- </Set>
-
- <Set2>
- <isis>lookup{set-jzav:lookup{parent-id:v946^1}}</isis>
- </Set2>
-
- <CallNo>
- <isis>v990</isis>
- </CallNo>
-
- <InvNo>
- <isis>v991</isis>
- </InvNo>
-
-</indexer>
-</config>
=head1 VERSION
-Version 2.17
+Version 2.20
=cut
-our $VERSION = '2.17';
+our $VERSION = '2.20';
=head1 SYNOPSIS
-This is quick description of what WebPAC is. This is third iteration of
-WebPAC design (second one was semi-private creatation of CD ROM with L<jsFind>
-module). This code will eventually become official WebPAC version 2.
+This is a quick description of what WebPAC is. This is another iteration of
+WebPAC design (the first was a system with XML files and CGI, the second was
+a semi-private creation of a CD ROM with the L<jsFind> module, and the third
+was the older version 2 with support for legacy XML and YAML).
+
+Current version supports different input formats and normalisation using set rules.
=head1 AUTHOR
Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-=head1 BUGS
-
-Please report any bugs or feature requests to
-C<bug-webpac@rt.cpan.org>, or through the web interface at
-L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.
-I will be notified, and then you'll automatically be notified of progress on
-your bug as I make changes.
-
=head1 SEE ALSO
To undestand concpets behind WebPAC examine L<WebPAC::Manual>, and then
=head1 COPYRIGHT & LICENSE
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
use warnings;
use strict;
-use base qw/WebPAC::Common WebPAC::Normalize/;
+use base qw/WebPAC::Common WebPAC::Lookup::Normalize/;
use File::Slurp;
use YAML qw/LoadFile/;
use Data::Dumper;
=head1 COPYRIGHT & LICENSE
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
--- /dev/null
+package WebPAC::Lookup::Normalize;
+
+use warnings;
+use strict;
+use blib;
+use WebPAC::Common;
+use base 'WebPAC::Common';
+use Data::Dumper;
+
+=head1 NAME
+
+WebPAC::Lookup::Normalize - data munging for normalisation
+
+=head1 VERSION
+
+Version 0.09
+
+=cut
+
+our $VERSION = '0.09';
+
+=head1 SYNOPSIS
+
+This package contains code that munges data to produce normalized format.
+
+B<WARNING:>
+
+This code is obsolete. It was moved here so that I don't have to re-write
+L<WebPAC::Lookup> to use set configuration files (using L<WebPAC::Normalize>)
+just yet. But it will disappear real soon!
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at the B<beginning of format> will apply perl
+code defined as a code ref to the format after field substitution to produce
+output
+
+There is one built-in filter called C<regex> which can be use like this:
+
+ filter{regex(s/foo/bar/)}
+
+=item *
+
+optional C<lookup{...}> will then be performed. See C<WebPAC::Lookup>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes the order in which transformations are applied (eval,
+filter, lookup, format) which is important to understand when deciding how
+to solve your data munging and normalisation process.
+
+
+
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Create new normalisation object
+
+ my $n = new WebPAC::Lookup::Normalize::Something(
+ filter => {
+ 'filter_name_1' => sub {
+ # filter code
+ return length($_);
+ }, ...
+ },
+ db => $db_obj,
+ lookup_regex => $lookup->regex,
+ lookup => $lookup_obj,
+ prefix => 'foobar',
+ );
+
+Parameter C<filter> defines user supplied snippets of perl code which can
+be used with C<filter{...}> notation.
+
+C<prefix> is used to form filename for database record (to support multiple
+source files which are joined in one database).
+
+Recommended parameter C<lookup_regex> is used to enable parsing of lookups
+in structures. If you pass this parameter, you must also pass C<lookup>
+which is a C<WebPAC::Lookup> object.
+
+=cut
+
+sub new {
+ my $class = shift;
+ my $self = {@_};
+ bless($self, $class);
+
+ my $r = $self->{'lookup_regex'} ? 1 : 0;
+ my $l = $self->{'lookup'} ? 1 : 0;
+
+ my $log = $self->_get_logger();
+
+ # those two must be in pair
+ if ( ($r & $l) != ($r || $l) ) {
+ my $log = $self->_get_logger();
+ $log->logdie("lookup_regex and lookup must be in pair");
+ }
+
+ $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
+
+ $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
+
+ $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
+
+ if (! $self->{filter} || ! $self->{filter}->{regex}) {
+ $log->debug("adding built-in filter regex");
+ $self->{filter}->{regex} = sub {
+ my ($val, $regex) = @_;
+ eval "\$val =~ $regex";
+ return $val;
+ };
+ }
+
+ $self ? return $self : return undef;
+}
+
+=head2 all_tags
+
+Returns all tags in document in specified order
+
+ my $sorted_tags = $self->all_tags();
+
+=cut
+
+sub all_tags {
+ my $self = shift;
+
+ if (! $self->{_tags_by_order}) {
+
+ my $log = $self->_get_logger;
+ # sanity check
+ $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
+
+ my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
+ $log->debug("unsorted tags: " . join(", ", @tags));
+
+ @tags = sort { $self->_sort_by_order } @tags;
+
+ $log->debug("sorted tags: " . join(",", @tags) );
+
+ $self->{_tags_by_order} = \@tags;
+ }
+
+ return $self->{_tags_by_order};
+}
+
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
+
+ my $ds = $webpac->data_structure($rec);
+
+=cut
+
+sub data_structure {
+ my $self = shift;
+
+ my $log = $self->_get_logger();
+
+ my $rec = shift;
+ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ $log->debug("data_structure rec = ", sub { Dumper($rec) });
+
+ $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
+
+ my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
+
+ my $cache_file;
+
+ if ($self->{'db'}) {
+ my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
+ $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
+ return $ds if ($ds);
+ $log->debug("cache miss, creating");
+ }
+
+ my $tags = $self->all_tags();
+
+ $log->debug("tags: ",sub { join(", ",@{ $tags }) });
+
+ my $ds;
+
+ foreach my $field (@{ $tags }) {
+
+ my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+ foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+ my $format;
+
+ $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
+ $format = $tag->{'value'} || $tag->{'content'};
+
+ my @v;
+ if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
+ @v = $self->_rec_to_arr($rec,$format,'fill_in');
+ } else {
+ @v = $self->_rec_to_arr($rec,$format,'parse');
+ }
+ if (! @v) {
+ $log->debug("$field <",$self->{tag},"> format: $format no values");
+ next;
+ } else {
+ $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
+ }
+
+ if ($tag->{'sort'}) {
+ @v = $self->sort_arr(@v);
+ }
+
+ # use format?
+ if ($tag->{'format_name'}) {
+ @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+ }
+
+ # delimiter will join repeatable fields
+ if ($tag->{'delimiter'}) {
+ @v = ( join($tag->{'delimiter'}, @v) );
+ }
+
+ # default types
+ my @types = qw(display search);
+ # override by type attribute
+ @types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+ foreach my $type (@types) {
+ # append to previous line?
+ $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
+ if ($tag->{'append'}) {
+
+ # I will delimit appended part with
+ # delimiter (or ,)
+ my $d = $tag->{'delimiter'};
+ # default delimiter
+ $d ||= " ";
+
+ my $last = pop @{$row->{$type}};
+ $d = "" if (! $last);
+ $last .= $d . join($d, @v);
+ push @{$row->{$type}}, $last;
+
+ } else {
+ push @{$row->{$type}}, @v;
+ }
+ }
+
+
+ }
+
+ if ($row) {
+ $row->{'tag'} = $field;
+
+ # TODO: name_sigular, name_plural
+ my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+ my $row_name = $name ? $self->_x($name) : $field;
+
+ # post-sort all values in field
+ if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+ $log->warn("sort at field tag not implemented");
+ }
+
+ $ds->{$row_name} = $row;
+
+ $log->debug("row $field: ",sub { Dumper($row) });
+ }
+
+ }
+
+ $self->{'db'}->save_ds(
+ id => $id,
+ ds => $ds,
+ prefix => $self->{prefix},
+ ) if ($self->{'db'});
+
+ $log->debug("ds: ", sub { Dumper($ds) });
+
+ $log->logconfess("data structure returned is not array any more!") if wantarray;
+
+ return $ds;
+
+}
+
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C<eval{...}> and
+return output or nothing depending on eval code.
+
+ my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+Filters are implemented here. While simple form of filters looks like this:
+
+ filter{name_of_filter}
+
+but, filters can also have variable number of parametars like this:
+
+ filter{name_of_filter(param,param,param)}
+
+=cut
+
+my $warn_once;
+
+sub parse {
+ my $self = shift;
+
+ my ($rec, $format_utf8, $i, $rec_size) = @_;
+
+ return if (! $format_utf8);
+
+ my $log = $self->_get_logger();
+
+ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ $i = 0 if (! $i);
+
+ my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+ my @out;
+
+ $log->debug("format: $format [$i]");
+
+ my $eval_code;
+ # remove eval{...} from beginning
+ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+ my $filter_name;
+ # remove filter{...} from beginning
+ $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	# did we find any (at all) fields from the format in this row?
+ my $found_any;
+	# prefix before the first field; preserved and re-added if $found_any
+ my $prefix;
+
+ my $f_step = 1;
+
+ while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
+
+ my $del = $1 || '';
+ $prefix = $del if ($f_step == 1);
+
+ my $fld_type = lc($2);
+
+ # repeatable index
+ my $r = $i;
+ if ($fld_type eq 's') {
+ if ($found_any->{'v'}) {
+ $r = 0;
+ } else {
+ return;
+ }
+ }
+
+ my $found = 0;
+ my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
+
+ if ($found) {
+ $found_any->{$fld_type} += $found;
+
+ # we will skip delimiter before first occurence of field!
+ push @out, $del unless($found_any->{$fld_type} == 1);
+ push @out, $tmp if ($tmp);
+ }
+ $f_step++;
+ }
+
+ # test if any fields found?
+ return if (! $found_any->{'v'} && ! $found_any->{'s'});
+
+ my $out = join('',@out);
+
+ if ($out) {
+ # add rest of format (suffix)
+ $out .= $format;
+
+ # add prefix if not there
+ $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+ $log->debug("result: $out");
+ }
+
+ if ($eval_code) {
+ my $eval = $self->fill_in($rec,$eval_code,$i) || return;
+ $log->debug("about to eval{$eval} format: $out");
+ return if (! $self->_eval($eval));
+ }
+
+ if ($filter_name) {
+ my @filter_args;
+ if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
+ @filter_args = split(/,/, $2);
+ }
+ if ($self->{'filter'}->{$filter_name}) {
+ $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
+ unshift @filter_args, $out;
+ $out = $self->{'filter'}->{$filter_name}->(@filter_args);
+ return unless(defined($out));
+ $log->debug("filter result: $out");
+ } elsif (! $warn_once->{$filter_name}) {
+ $log->warn("trying to use undefined filter $filter_name");
+ $warn_once->{$filter_name}++;
+ }
+ }
+
+ return $out;
+}
+
+=head2 fill_in
+
+Workhorse of all: takes a record from the in-memory structure of the
+database and strings with placeholders and returns a string or array with
+substituted values from the record.
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is the ordinal number for repeatable fields. By default,
+it's assumed to be the first repeatable field (fields are a perl array, so
+the first element is 0).
+The following example will read the second value from a repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of the format to intelligently
+skip delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+There is an optional parameter C<$rec_size> which can be used to get sizes
+of all C<field^subfield> combinations in this format.
+
+ my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
+
+=cut
+
+sub fill_in {
+ my $self = shift;
+
+ my $log = $self->_get_logger();
+
+ my ($rec,$format,$i,$rec_size) = @_;
+
+ $log->logconfess("need data record") unless ($rec);
+ $log->logconfess("need format to parse") unless($format);
+
+ # iteration (for repeatable fields)
+ $i ||= 0;
+
+ $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+ # FIXME remove for speedup?
+ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+ if (utf8::is_utf8($format)) {
+ $format = $self->_x($format);
+ }
+
+ my $found = 0;
+ my $just_single = 1;
+
+ my $eval_code;
+ # remove eval{...} from beginning
+ $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+ my $filter_name;
+ # remove filter{...} from beginning
+ $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+ {
+ # fix warnings
+ no warnings 'uninitialized';
+
+ # do actual replacement of placeholders
+ # repeatable fields
+ if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
+ $just_single = 0;
+ }
+
+ # non-repeatable fields
+ if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
+ return if ($i > 0 && $just_single);
+ }
+ }
+
+ if ($found) {
+ $log->debug("format: $format");
+ if ($eval_code) {
+ my $eval = $self->fill_in($rec,$eval_code,$i);
+ return if (! $self->_eval($eval));
+ }
+ if ($filter_name && $self->{'filter'}->{$filter_name}) {
+ $log->debug("filter '$filter_name' for $format");
+ $format = $self->{'filter'}->{$filter_name}->($format);
+ return unless(defined($format));
+ $log->debug("filter result: $format");
+ }
+ # do we have lookups?
+ if ($self->{'lookup'}) {
+ if ($self->{'lookup'}->can('lookup')) {
+ my @lookup = $self->{lookup}->lookup($format);
+ $log->debug("lookup $format", join(", ", @lookup));
+ return @lookup;
+ } else {
+ $log->warn("Have lookup object but can't invoke lookup method");
+ }
+ } else {
+ return $format;
+ }
+ } else {
+ return;
+ }
+}
+
+
+=head2 _rec_to_arr
+
+Similar to C<parse> and C<fill_in>, but returns an array of all repeatable
+fields. Usable for fields which have lookups, so they shouldn't be parsed
+but rather C<fill_in>ed. Last argument is the name of the operation:
+C<parse> or C<fill_in>.
+
+  my @arr = $webpac->_rec_to_arr($rec,'[v900];;[v250^a]','fill_in');
+
+=cut
+
+sub _rec_to_arr {
+ my $self = shift;
+
+ my ($rec, $format_utf8, $code) = @_;
+
+ my $log = $self->_get_logger();
+
+ $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+ return if (! $format_utf8);
+
+ $log->debug("using $code on $format_utf8");
+
+ my $i = 0;
+ my $max = 0;
+ my @arr;
+ my $rec_size = {};
+
+ while ($i <= $max) {
+ my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
+ if ($rec_size) {
+ foreach my $f (keys %{ $rec_size }) {
+ $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
+ }
+ $log->debug("max set to $max");
+ undef $rec_size;
+ }
+ if (@v) {
+ push @arr, @v;
+ } else {
+ push @arr, '' if ($max > $i);
+ }
+ }
+
+ $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+ return @arr;
+}
+
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
+
+Required arguments are:
+
+=over 8
+
+=item C<$rec>
+
+record reference
+
+=item C<$f>
+
+field
+
+=item C<$sf>
+
+optional subfield
+
+=item C<$i>
+
+index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
+
+=item C<$found>
+
+optional variable that will be incremented if present
+
+=item C<$rec_size>
+
+hash to hold maximum occurrences of C<field^subfield> combinations
+(which can be accessed using keys in same format)
+
+=back
+
+Returns value or empty string, updates C<$found> and C<$rec_size>
+if present.
+
+=cut
+
+sub get_data {
+ my $self = shift;
+
+ my ($rec,$f,$sf,$i,$found,$cache) = @_;
+
+ return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
+
+ if (defined($$cache)) {
+ $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
+ }
+
+ return '' unless ($$rec->{$f}->[$i]);
+
+ {
+ no strict 'refs';
+ if (defined($sf)) {
+ $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
+ return $$rec->{$f}->[$i]->{$sf};
+ } else {
+ $$found++ if (defined($$found));
+ # it still might have subfields, just
+ # not specified, so we'll dump some debug info
+ if ($$rec->{$f}->[$i] =~ /HASH/o) {
+ my $out;
+ foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+ my $v = $$rec->{$f}->[$i]->{$k};
+ $out .= '$' . $k .':' . $v if ($v);
+ }
+ return $out;
+ } else {
+ return $$rec->{$f}->[$i];
+ }
+ }
+ }
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+ my $self = shift;
+
+ my ($name,$delimiter,$data) = @_;
+
+ my $log = $self->_get_logger();
+
+ if (! $self->{'import_xml'}->{'format'}->{$name}) {
+ $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+ return $data;
+ }
+
+ $log->warn("no delimiter for format $name") if (! $delimiter);
+
+ my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+ my @data = split(/\Q$delimiter\E/, $data);
+
+ my $out = sprintf($format, @data);
+ $log->debug("using format $name [$format] on $data to produce: $out");
+
+ if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+ return $self->{'lookup'}->lookup($out);
+ } else {
+ return $out;
+ }
+
+}
+
+=head2 sort_arr
+
+Sort array ignoring case and html in data
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+ my $self = shift;
+
+ my $log = $self->_get_logger();
+
+ # FIXME add Schwartzian Transformation?
+
+ my @sorted = sort {
+ $a =~ s#<[^>]+/*>##;
+ $b =~ s#<[^>]+/*>##;
+ lc($b) cmp lc($a)
+ } @_;
+ $log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+ return @sorted;
+}
+
+
+=head1 INTERNAL METHODS
+
+=head2 _sort_by_order
+
+Sort xml tags data structure according to C<order=""> attribute.
+
+=cut
+
+sub _sort_by_order {
+ my $self = shift;
+
+ my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+ $self->{'import_xml'}->{'indexer'}->{$a};
+ my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+ $self->{'import_xml'}->{'indexer'}->{$b};
+
+ return $va <=> $vb;
+}
+
+=head2 _x
+
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optionally specified using C<code_page> to C<new>
+constructor).
+
+ my $text = $n->_x('normalize text string');
+
+This is a stub so that other modules don't have to implement it.
+
+=cut
+
+sub _x {
+ my $self = shift;
+ return shift;
+}
+
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Lookup::Normalize
package WebPAC::Normalize;
+use Exporter 'import';
+@EXPORT = qw/
+ set_rec set_lookup
+ get_ds clean_ds
+ tag search display
+ rec1 rec2 rec
+ regex prefix suffix surround
+ first lookup join_with
+/;
use warnings;
use strict;
-use blib;
-use WebPAC::Common;
-use base 'WebPAC::Common';
+
+#use base qw/WebPAC::Common/;
use Data::Dumper;
=head1 NAME
-WebPAC::Normalize - data mungling for normalisation
+WebPAC::Normalize - describe normalisation rules using sets
=head1 VERSION
-Version 0.09
+Version 0.04
=cut
-our $VERSION = '0.09';
+our $VERSION = '0.04';
=head1 SYNOPSIS
-This package contains code that mungle data to produce normalized format.
-
-It contains several assumptions:
-
-=over
-
-=item *
-
-format of fields is defined using C<v123^a> notation for repeatable fields
-or C<s123^a> for single (or first) value, where C<123> is field number and
-C<a> is subfield.
-
-=item *
-
-source data records (C<$rec>) have unique identifiers in field C<000>
-
-=item *
-
-optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
-perl code that is evaluated before producing output (value of field will be
-interpolated before that)
-
-=item *
-
-optional C<filter{filter_name}> at B<begining of format> will apply perl
-code defined as code ref on format after field substitution to producing
-output
-
-There is one built-in filter called C<regex> which can be use like this:
-
- filter{regex(s/foo/bar/)}
-
-=item *
-
-optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
-
-=item *
-
-at end, optional C<format>s rules are resolved. Format rules are similar to
-C<sprintf> and can also contain C<lookup{...}> which is performed after
-values are inserted in format.
-
-=back
-
-This also describes order in which transformations are applied (eval,
-filter, lookup, format) which is important to undestand when deciding how to
-solve your data mungling and normalisation process.
-
+This module uses C<conf/normalize/*.pl> files to perform normalisation
+from input records using perl functions which are specialized for set
+processing.
+Sets are implemented as arrays, and normalisation file is valid perl, which
+means that you check it's validity before running WebPAC using
+C<perl -c normalize.pl>.
+Normalisation can generate multiple output normalized data. For now, supported output
+types (on the left side of definition) are: C<tag>, C<display> and C<search>.
=head1 FUNCTIONS
-=head2 new
-
-Create new normalisation object
-
- my $n = new WebPAC::Normalize::Something(
- filter => {
- 'filter_name_1' => sub {
- # filter code
- return length($_);
- }, ...
- },
- db => $db_obj,
- lookup_regex => $lookup->regex,
- lookup => $lookup_obj,
- prefix => 'foobar',
- );
+=head2 data_structure
-Parametar C<filter> defines user supplied snippets of perl code which can
-be use with C<filter{...}> notation.
+Return data structure
-C<prefix> is used to form filename for database record (to support multiple
-source files which are joined in one database).
+ my $ds = WebPAC::Normalize::data_structure(
+ lookup => $lookup->lookup_hash,
+ row => $row,
+ rules => $normalize_pl_config,
+ );
-Recommended parametar C<lookup_regex> is used to enable parsing of lookups
-in structures. If you pass this parametar, you must also pass C<lookup>
-which is C<WebPAC::Lookup> object.
+This function will B<die> if normalisation can't be evaled.
=cut
-sub new {
- my $class = shift;
- my $self = {@_};
- bless($self, $class);
-
- my $r = $self->{'lookup_regex'} ? 1 : 0;
- my $l = $self->{'lookup'} ? 1 : 0;
-
- my $log = $self->_get_logger();
-
- # those two must be in pair
- if ( ($r & $l) != ($r || $l) ) {
- my $log = $self->_get_logger();
- $log->logdie("lookup_regex and lookup must be in pair");
- }
-
- $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
-
- $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
-
- $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
-
- if (! $self->{filter} || ! $self->{filter}->{regex}) {
- $log->debug("adding built-in filter regex");
- $self->{filter}->{regex} = sub {
- my ($val, $regex) = @_;
- eval "\$val =~ $regex";
- return $val;
- };
- }
-
- $self ? return $self : return undef;
+sub data_structure {
+ my $arg = {@_};
+
+ die "need row argument" unless ($arg->{row});
+ die "need normalisation argument" unless ($arg->{rules});
+
+ no strict 'subs';
+ set_lookup( $arg->{lookup} );
+ set_rec( $arg->{row} );
+ clean_ds();
+ eval "$arg->{rules}";
+ die "error evaling $arg->{rules}: $@\n" if ($@);
+ return get_ds();
}
-=head2 all_tags
+=head2 set_rec
-Returns all tags in document in specified order
+Set current record hash
- my $sorted_tags = $self->all_tags();
+ set_rec( $rec );
=cut
-sub all_tags {
- my $self = shift;
-
- if (! $self->{_tags_by_order}) {
-
- my $log = $self->_get_logger;
- # sanity check
- $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
-
- my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
- $log->debug("unsorted tags: " . join(", ", @tags));
+my $rec;
- @tags = sort { $self->_sort_by_order } @tags;
-
- $log->debug("sorted tags: " . join(",", @tags) );
-
- $self->{_tags_by_order} = \@tags;
- }
-
- return $self->{_tags_by_order};
+sub set_rec {
+ $rec = shift or die "no record hash";
}
+=head2 tag
+Define new tag for I<search> and I<display>.
-=head2 data_structure
-
-Create in-memory data structure which represents normalized layout from
-C<conf/normalize/*.xml>.
-
-This structures are used to produce output.
+ tag('Title', rec('200','a') );
- my $ds = $webpac->data_structure($rec);
=cut
-sub data_structure {
- my $self = shift;
-
- my $log = $self->_get_logger();
-
- my $rec = shift;
- $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
- $log->debug("data_structure rec = ", sub { Dumper($rec) });
-
- $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
-
- my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
-
- my $cache_file;
-
- if ($self->{'db'}) {
- my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
- $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
- return $ds if ($ds);
- $log->debug("cache miss, creating");
- }
-
- my $tags = $self->all_tags();
-
- $log->debug("tags: ",sub { join(", ",@{ $tags }) });
-
- my $ds;
-
- foreach my $field (@{ $tags }) {
-
- my $row;
-
-#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
-
- foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
- my $format;
-
- $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
- $format = $tag->{'value'} || $tag->{'content'};
-
- my @v;
- if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
- @v = $self->_rec_to_arr($rec,$format,'fill_in');
- } else {
- @v = $self->_rec_to_arr($rec,$format,'parse');
- }
- if (! @v) {
- $log->debug("$field <",$self->{tag},"> format: $format no values");
- next;
- } else {
- $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
- }
-
- if ($tag->{'sort'}) {
- @v = $self->sort_arr(@v);
- }
-
- # use format?
- if ($tag->{'format_name'}) {
- @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
- }
-
- # delimiter will join repeatable fields
- if ($tag->{'delimiter'}) {
- @v = ( join($tag->{'delimiter'}, @v) );
- }
-
- # default types
- my @types = qw(display search);
- # override by type attribute
- @types = ( $tag->{'type'} ) if ($tag->{'type'});
-
- foreach my $type (@types) {
- # append to previous line?
- $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
- if ($tag->{'append'}) {
-
- # I will delimit appended part with
- # delimiter (or ,)
- my $d = $tag->{'delimiter'};
- # default delimiter
- $d ||= " ";
-
- my $last = pop @{$row->{$type}};
- $d = "" if (! $last);
- $last .= $d . join($d, @v);
- push @{$row->{$type}}, $last;
-
- } else {
- push @{$row->{$type}}, @v;
- }
- }
-
-
- }
-
- if ($row) {
- $row->{'tag'} = $field;
-
- # TODO: name_sigular, name_plural
- my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
- my $row_name = $name ? $self->_x($name) : $field;
-
- # post-sort all values in field
- if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
- $log->warn("sort at field tag not implemented");
- }
-
- $ds->{$row_name} = $row;
-
- $log->debug("row $field: ",sub { Dumper($row) });
- }
-
- }
-
- $self->{'db'}->save_ds(
- id => $id,
- ds => $ds,
- prefix => $self->{prefix},
- ) if ($self->{'db'});
-
- $log->debug("ds: ", sub { Dumper($ds) });
-
- $log->logconfess("data structure returned is not array any more!") if wantarray;
-
- return $ds;
+my $out;
+sub tag {
+ my $name = shift or die "tag needs name as first argument";
+ my @o = grep { defined($_) && $_ ne '' } @_;
+ return unless (@o);
+ $out->{$name}->{tag} = $name;
+ $out->{$name}->{search} = \@o;
+ $out->{$name}->{display} = \@o;
}
-=head2 parse
-
-Perform smart parsing of string, skipping delimiters for fields which aren't
-defined. It can also eval code in format starting with C<eval{...}> and
-return output or nothing depending on eval code.
-
- my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
-
-Filters are implemented here. While simple form of filters looks like this:
+=head2 display
- filter{name_of_filter}
+Define tag just for I<display>
-but, filters can also have variable number of parametars like this:
-
- filter{name_of_filter(param,param,param)}
+ @v = display('Title', rec('200','a') );
=cut
-my $warn_once;
-
-sub parse {
- my $self = shift;
-
- my ($rec, $format_utf8, $i, $rec_size) = @_;
-
- return if (! $format_utf8);
-
- my $log = $self->_get_logger();
-
- $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
- $i = 0 if (! $i);
-
- my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
-
- my @out;
-
- $log->debug("format: $format [$i]");
-
- my $eval_code;
- # remove eval{...} from beginning
- $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
-
- my $filter_name;
- # remove filter{...} from beginning
- $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
-
- # did we found any (att all) field from format in row?
- my $found_any;
- # prefix before first field which we preserve it $found_any
- my $prefix;
-
- my $f_step = 1;
-
- while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
-
- my $del = $1 || '';
- $prefix = $del if ($f_step == 1);
-
- my $fld_type = lc($2);
-
- # repeatable index
- my $r = $i;
- if ($fld_type eq 's') {
- if ($found_any->{'v'}) {
- $r = 0;
- } else {
- return;
- }
- }
+sub display {
+ my $name = shift or die "display needs name as first argument";
+ my @o = grep { defined($_) && $_ ne '' } @_;
+ return unless (@o);
+ $out->{$name}->{tag} = $name;
+ $out->{$name}->{display} = \@o;
+}
- my $found = 0;
- my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
+=head2 search
- if ($found) {
- $found_any->{$fld_type} += $found;
+Prepare values just for I<search>
- # we will skip delimiter before first occurence of field!
- push @out, $del unless($found_any->{$fld_type} == 1);
- push @out, $tmp if ($tmp);
- }
- $f_step++;
- }
+ @v = search('Title', rec('200','a') );
- # test if any fields found?
- return if (! $found_any->{'v'} && ! $found_any->{'s'});
+=cut
- my $out = join('',@out);
+sub search {
+ my $name = shift or die "search needs name as first argument";
+ my @o = grep { defined($_) && $_ ne '' } @_;
+ return unless (@o);
+ $out->{$name}->{tag} = $name;
+ $out->{$name}->{search} = \@o;
+}
- if ($out) {
- # add rest of format (suffix)
- $out .= $format;
+=head2 get_ds
- # add prefix if not there
- $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+Return hash formatted as data structure
- $log->debug("result: $out");
- }
+ my $ds = get_ds();
- if ($eval_code) {
- my $eval = $self->fill_in($rec,$eval_code,$i) || return;
- $log->debug("about to eval{$eval} format: $out");
- return if (! $self->_eval($eval));
- }
-
- if ($filter_name) {
- my @filter_args;
- if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
- @filter_args = split(/,/, $2);
- }
- if ($self->{'filter'}->{$filter_name}) {
- $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
- unshift @filter_args, $out;
- $out = $self->{'filter'}->{$filter_name}->(@filter_args);
- return unless(defined($out));
- $log->debug("filter result: $out");
- } elsif (! $warn_once->{$filter_name}) {
- $log->warn("trying to use undefined filter $filter_name");
- $warn_once->{$filter_name}++;
- }
- }
+=cut
+sub get_ds {
return $out;
}
-=head2 fill_in
-
-Workhourse of all: takes record from in-memory structure of database and
-strings with placeholders and returns string or array of with substituted
-values from record.
+=head2 clean_ds
- my $text = $webpac->fill_in($rec,'v250^a');
+Clean data structure hash for next record
-Optional argument is ordinal number for repeatable fields. By default,
-it's assume to be first repeatable field (fields are perl array, so first
-element is 0).
-Following example will read second value from repeatable field.
-
- my $text = $webpac->fill_in($rec,'Title: v250^a',1);
-
-This function B<does not> perform parsing of format to inteligenty skip
-delimiters before fields which aren't used.
-
-This method will automatically decode UTF-8 string to local code page
-if needed.
-
-There is optional parametar C<$record_size> which can be used to get sizes of
-all C<field^subfield> combinations in this format.
-
- my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size);
+ clean_ds();
=cut
-sub fill_in {
- my $self = shift;
-
- my $log = $self->_get_logger();
-
- my ($rec,$format,$i,$rec_size) = @_;
+sub clean_ds {
+ $out = undef;
+}
- $log->logconfess("need data record") unless ($rec);
- $log->logconfess("need format to parse") unless($format);
+=head2 set_lookup
- # iteration (for repeatable fields)
- $i ||= 0;
+Set current lookup hash
- $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+ set_lookup( $lookup );
- # FIXME remove for speedup?
- $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+=cut
- if (utf8::is_utf8($format)) {
- $format = $self->_x($format);
- }
+my $lookup;
- my $found = 0;
- my $just_single = 1;
+sub set_lookup {
+ $lookup = shift;
+}
- my $eval_code;
- # remove eval{...} from beginning
- $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+=head2 rec1
- my $filter_name;
- # remove filter{...} from beginning
- $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+Return all values in some field
- {
- # fix warnings
- no warnings 'uninitialized';
+ @v = rec1('200')
- # do actual replacement of placeholders
- # repeatable fields
- if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
- $just_single = 0;
- }
+TODO: order of values is probably same as in source data, need to investigate that
- # non-repeatable fields
- if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
- return if ($i > 0 && $just_single);
- }
- }
+=cut
- if ($found) {
- $log->debug("format: $format");
- if ($eval_code) {
- my $eval = $self->fill_in($rec,$eval_code,$i);
- return if (! $self->_eval($eval));
- }
- if ($filter_name && $self->{'filter'}->{$filter_name}) {
- $log->debug("filter '$filter_name' for $format");
- $format = $self->{'filter'}->{$filter_name}->($format);
- return unless(defined($format));
- $log->debug("filter result: $format");
- }
- # do we have lookups?
- if ($self->{'lookup'}) {
- if ($self->{'lookup'}->can('lookup')) {
- my @lookup = $self->{lookup}->lookup($format);
- $log->debug("lookup $format", join(", ", @lookup));
- return @lookup;
+sub rec1 {
+ my $f = shift;
+ return unless (defined($rec) && defined($rec->{$f}));
+ if (ref($rec->{$f}) eq 'ARRAY') {
+ return map {
+ if (ref($_) eq 'HASH') {
+ values %{$_};
} else {
- $log->warn("Have lookup object but can't invoke lookup method");
+ $_;
}
- } else {
- return $format;
- }
- } else {
- return;
+ } @{ $rec->{$f} };
+ } elsif( defined($rec->{$f}) ) {
+ return $rec->{$f};
}
}
+=head2 rec2
-=head2 _rec_to_arr
-
-Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
-for fields which have lookups, so they shouldn't be parsed but rather
-C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
+Return all values in specific field and subfield
- my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
+ @v = rec2('200','a')
=cut
-sub _rec_to_arr {
- my $self = shift;
-
- my ($rec, $format_utf8, $code) = @_;
+sub rec2 {
+ my $f = shift;
+ return unless (defined($rec && $rec->{$f}));
+ my $sf = shift;
+ return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
+}
- my $log = $self->_get_logger();
+=head2 rec
- $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
- return if (! $format_utf8);
+syntactic sugar for
- $log->debug("using $code on $format_utf8");
+ @v = rec('200')
+ @v = rec('200','a')
- my $i = 0;
- my $max = 0;
- my @arr;
- my $rec_size = {};
+=cut
- while ($i <= $max) {
- my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
- if ($rec_size) {
- foreach my $f (keys %{ $rec_size }) {
- $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
- }
- $log->debug("max set to $max");
- undef $rec_size;
- }
- if (@v) {
- push @arr, @v;
- } else {
- push @arr, '' if ($max > $i);
- }
+sub rec {
+ if ($#_ == 0) {
+ return rec1(@_);
+ } elsif ($#_ == 1) {
+ return rec2(@_);
}
-
- $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
-
- return @arr;
}
+=head2 regex
-=head2 get_data
-
-Returns value from record.
-
- my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
-
-Required arguments are:
+Apply regex to some or all values
-=over 8
-
-=item C<$rec>
-
-record reference
-
-=item C<$f>
-
-field
-
-=item C<$sf>
-
-optional subfield
-
-=item C<$i>
-
-index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
-
-=item C<$found>
-
-optional variable that will be incremeted if preset
-
-=item C<$rec_size>
-
-hash to hold maximum occurances of C<field^subfield> combinations
-(which can be accessed using keys in same format)
-
-=back
-
-Returns value or empty string, updates C<$found> and C<rec_size>
-if present.
+ @v = regex( 's/foo/bar/g', @v );
=cut
-sub get_data {
- my $self = shift;
-
- my ($rec,$f,$sf,$i,$found,$cache) = @_;
-
- return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
-
- if (defined($$cache)) {
- $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
- }
-
- return '' unless ($$rec->{$f}->[$i]);
-
- {
- no strict 'refs';
- if (defined($sf)) {
- $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
- return $$rec->{$f}->[$i]->{$sf};
- } else {
- $$found++ if (defined($$found));
- # it still might have subfields, just
- # not specified, so we'll dump some debug info
- if ($$rec->{$f}->[$i] =~ /HASH/o) {
- my $out;
- foreach my $k (keys %{$$rec->{$f}->[$i]}) {
- my $v = $$rec->{$f}->[$i]->{$k};
- $out .= '$' . $k .':' . $v if ($v);
- }
- return $out;
- } else {
- return $$rec->{$f}->[$i];
- }
- }
+sub regex {
+ my $r = shift;
+ my @out;
+ #warn "r: $r\n",Dumper(\@_);
+ foreach my $t (@_) {
+ next unless ($t);
+ eval "\$t =~ $r";
+ push @out, $t if ($t && $t ne '');
}
+ return @out;
}
+=head2 prefix
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+Prefix all values with a string
-Formats can contain C<lookup{...}> if you need them.
+ @v = prefix( 'my_', @v );
=cut
-sub apply_format {
- my $self = shift;
-
- my ($name,$delimiter,$data) = @_;
-
- my $log = $self->_get_logger();
-
- if (! $self->{'import_xml'}->{'format'}->{$name}) {
- $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
- return $data;
- }
-
- $log->warn("no delimiter for format $name") if (! $delimiter);
+sub prefix {
+ my $p = shift or die "prefix needs string as first argument";
+ return map { $p . $_ } grep { defined($_) } @_;
+}
- my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+=head2 suffix
- my @data = split(/\Q$delimiter\E/, $data);
+suffix all values with a string
- my $out = sprintf($format, @data);
- $log->debug("using format $name [$format] on $data to produce: $out");
+ @v = suffix( '_my', @v );
- if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
- return $self->{'lookup'}->lookup($out);
- } else {
- return $out;
- }
+=cut
+sub suffix {
+ my $s = shift or die "suffix needs string as first argument";
+ return map { $_ . $s } grep { defined($_) } @_;
}
-=head2 sort_arr
+=head2 surround
-Sort array ignoring case and html in data
+surround all values with a two strings
- my @sorted = $webpac->sort_arr(@unsorted);
+ @v = surround( 'prefix_', '_suffix', @v );
=cut
-sub sort_arr {
- my $self = shift;
-
- my $log = $self->_get_logger();
-
- # FIXME add Schwartzian Transformation?
-
- my @sorted = sort {
- $a =~ s#<[^>]+/*>##;
- $b =~ s#<[^>]+/*>##;
- lc($b) cmp lc($a)
- } @_;
- $log->debug("sorted values: ",sub { join(", ",@sorted) });
-
- return @sorted;
+sub surround {
+ my $p = shift or die "surround need prefix as first argument";
+ my $s = shift or die "surround needs suffix as second argument";
+ return map { $p . $_ . $s } grep { defined($_) } @_;
}
+=head2 first
-=head1 INTERNAL METHODS
-
-=head2 _sort_by_order
+Return first element
-Sort xml tags data structure accoding to C<order=""> attribute.
+ $v = first( @v );
=cut
-sub _sort_by_order {
- my $self = shift;
-
- my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
- $self->{'import_xml'}->{'indexer'}->{$a};
- my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
- $self->{'import_xml'}->{'indexer'}->{$b};
-
- return $va <=> $vb;
+sub first {
+ my $r = shift;
+ return $r;
}
-=head2 _x
-
-Convert strings from C<conf/normalize/*.xml> encoding into application
-specific encoding (optinally specified using C<code_page> to C<new>
-constructor).
+=head2 lookup
- my $text = $n->_x('normalize text string');
+Consult lookup hashes for some value
-This is a stub so that other modules doesn't have to implement it.
+ @v = lookup( $v );
+ @v = lookup( @v );
=cut
-sub _x {
- my $self = shift;
- return shift;
+sub lookup {
+ my $k = shift or return;
+ return unless (defined($lookup->{$k}));
+ if (ref($lookup->{$k}) eq 'ARRAY') {
+ return @{ $lookup->{$k} };
+ } else {
+ return $lookup->{$k};
+ }
}
+=head2 join_with
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 COPYRIGHT & LICENSE
+Joins values with some delimiter
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
+ $v = join_with(", ", @v);
=cut
-1; # End of WebPAC::Normalize
+sub join_with {
+ my $d = shift;
+ return join($d, grep { defined($_) && $_ ne '' } @_);
+}
+
+# END
+1;
+++ /dev/null
-package WebPAC::Normalize::Set;
-use Exporter 'import';
-@EXPORT = qw/
- set_rec set_lookup
- get_ds clean_ds
- tag search display
- rec1 rec2 rec
- regex prefix suffix surround
- first lookup join_with
-/;
-
-use warnings;
-use strict;
-
-#use base qw/WebPAC::Common/;
-use Data::Dumper;
-
-=head1 NAME
-
-WebPAC::Normalize::Set - describe normalisaton rules using sets
-
-=head1 VERSION
-
-Version 0.04
-
-=cut
-
-our $VERSION = '0.04';
-
-=head1 SYNOPSIS
-
-This module uses C<conf/normalize/*.pl> files to perform normalisation
-from input records using perl functions which are specialized for set
-processing.
-
-Sets are implemented as arrays, and normalisation file is valid perl, which
-means that you check it's validity before running WebPAC using
-C<perl -c normalize.pl>.
-
-Normalisation can generate multiple output normalized data. For now, supported output
-types (on the left side of definition) are: C<tag>, C<display> and C<search>.
-
-=head1 FUNCTIONS
-
-=head2 data_structure
-
-Return data structure
-
- my $ds = WebPAC::Normalize::Set(
- lookup => $lookup->lookup_hash,
- row => $row,
- rules => $normalize_pl_config,
- );
-
-This function will B<die> if normalizastion can't be evaled.
-
-=cut
-
-sub data_structure {
- my $arg = {@_};
-
- die "need row argument" unless ($arg->{row});
- die "need normalisation argument" unless ($arg->{rules});
-
- no strict 'subs';
- set_lookup( $arg->{lookup} );
- set_rec( $arg->{row} );
- clean_ds();
- eval "$arg->{rules}";
- die "error evaling $arg->{rules}: $@\n" if ($@);
- return get_ds();
-}
-
-=head2 set_rec
-
-Set current record hash
-
- set_rec( $rec );
-
-=cut
-
-my $rec;
-
-sub set_rec {
- $rec = shift or die "no record hash";
-}
-
-=head2 tag
-
-Define new tag for I<search> and I<display>.
-
- tag('Title', rec('200','a') );
-
-
-=cut
-
-my $out;
-
-sub tag {
- my $name = shift or die "tag needs name as first argument";
- my @o = grep { defined($_) && $_ ne '' } @_;
- return unless (@o);
- $out->{$name}->{tag} = $name;
- $out->{$name}->{search} = \@o;
- $out->{$name}->{display} = \@o;
-}
-
-=head2 display
-
-Define tag just for I<display>
-
- @v = display('Title', rec('200','a') );
-
-=cut
-
-sub display {
- my $name = shift or die "display needs name as first argument";
- my @o = grep { defined($_) && $_ ne '' } @_;
- return unless (@o);
- $out->{$name}->{tag} = $name;
- $out->{$name}->{display} = \@o;
-}
-
-=head2 search
-
-Prepare values just for I<search>
-
- @v = search('Title', rec('200','a') );
-
-=cut
-
-sub search {
- my $name = shift or die "search needs name as first argument";
- my @o = grep { defined($_) && $_ ne '' } @_;
- return unless (@o);
- $out->{$name}->{tag} = $name;
- $out->{$name}->{search} = \@o;
-}
-
-=head2 get_ds
-
-Return hash formatted as data structure
-
- my $ds = get_ds();
-
-=cut
-
-sub get_ds {
- return $out;
-}
-
-=head2 clean_ds
-
-Clean data structure hash for next record
-
- clean_ds();
-
-=cut
-
-sub clean_ds {
- $out = undef;
-}
-
-=head2 set_lookup
-
-Set current lookup hash
-
- set_lookup( $lookup );
-
-=cut
-
-my $lookup;
-
-sub set_lookup {
- $lookup = shift;
-}
-
-=head2 rec1
-
-Return all values in some field
-
- @v = rec1('200')
-
-TODO: order of values is probably same as in source data, need to investigate that
-
-=cut
-
-sub rec1 {
- my $f = shift;
- return unless (defined($rec) && defined($rec->{$f}));
- if (ref($rec->{$f}) eq 'ARRAY') {
- return map {
- if (ref($_) eq 'HASH') {
- values %{$_};
- } else {
- $_;
- }
- } @{ $rec->{$f} };
- } elsif( defined($rec->{$f}) ) {
- return $rec->{$f};
- }
-}
-
-=head2 rec2
-
-Return all values in specific field and subfield
-
- @v = rec2('200','a')
-
-=cut
-
-sub rec2 {
- my $f = shift;
- return unless (defined($rec && $rec->{$f}));
- my $sf = shift;
- return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
-}
-
-=head2 rec
-
-syntaxtic sugar for
-
- @v = rec('200')
- @v = rec('200','a')
-
-=cut
-
-sub rec {
- if ($#_ == 0) {
- return rec1(@_);
- } elsif ($#_ == 1) {
- return rec2(@_);
- }
-}
-
-=head2 regex
-
-Apply regex to some or all values
-
- @v = regex( 's/foo/bar/g', @v );
-
-=cut
-
-sub regex {
- my $r = shift;
- my @out;
- #warn "r: $r\n",Dumper(\@_);
- foreach my $t (@_) {
- next unless ($t);
- eval "\$t =~ $r";
- push @out, $t if ($t && $t ne '');
- }
- return @out;
-}
-
-=head2 prefix
-
-Prefix all values with a string
-
- @v = prefix( 'my_', @v );
-
-=cut
-
-sub prefix {
- my $p = shift or die "prefix needs string as first argument";
- return map { $p . $_ } grep { defined($_) } @_;
-}
-
-=head2 suffix
-
-suffix all values with a string
-
- @v = suffix( '_my', @v );
-
-=cut
-
-sub suffix {
- my $s = shift or die "suffix needs string as first argument";
- return map { $_ . $s } grep { defined($_) } @_;
-}
-
-=head2 surround
-
-surround all values with a two strings
-
- @v = surround( 'prefix_', '_suffix', @v );
-
-=cut
-
-sub surround {
- my $p = shift or die "surround need prefix as first argument";
- my $s = shift or die "surround needs suffix as second argument";
- return map { $p . $_ . $s } grep { defined($_) } @_;
-}
-
-=head2 first
-
-Return first element
-
- $v = first( @v );
-
-=cut
-
-sub first {
- my $r = shift;
- return $r;
-}
-
-=head2 lookup
-
-Consult lookup hashes for some value
-
- @v = lookup( $v );
- @v = lookup( @v );
-
-=cut
-
-sub lookup {
- my $k = shift or return;
- return unless (defined($lookup->{$k}));
- if (ref($lookup->{$k}) eq 'ARRAY') {
- return @{ $lookup->{$k} };
- } else {
- return $lookup->{$k};
- }
-}
-
-=head2 join_with
-
-Joins walues with some delimiter
-
- $v = join_with(", ", @v);
-
-=cut
-
-sub join_with {
- my $d = shift;
- return join($d, grep { defined($_) && $_ ne '' } @_);
-}
-
-# END
-1;
+++ /dev/null
-package WebPAC::Normalize::XML;
-
-use warnings;
-use strict;
-
-use base qw/WebPAC::Common WebPAC::Normalize/;
-use XML::Simple;
-use Data::Dumper;
-use Text::Iconv;
-use YAML qw/Dump LoadFile/;
-
-=head1 NAME
-
-WebPAC::Normalize::XML - apply XML or YAML normalisaton rules
-
-=head1 VERSION
-
-Version 0.03
-
-=cut
-
-our $VERSION = '0.03';
-
-=head1 SYNOPSIS
-
-This module uses C<conf/normalize/*.xml> files to perform normalisation
-from input records
-
-=cut
-
-=head1 FUNCTIONS
-
-=head2 open
-
-Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
-parse it.
-
- my $n = new WebPAC::Normalize::XML;
- $n->open(
- tag => 'isis',
- xml_file => '/path/to/conf/normalize/isis.xml',
- );
-
-C<tag> defines tag to use within C<xml_file>
-
-C<xml_file> defines path to normalize XML
-
-C<tags> define additional tags that can be forced (and an be array).
-
-=cut
-
-sub open {
- my $self = shift;
-
- my $arg = {@_};
-
- my $log = $self->_get_logger();
-
- foreach my $req (qw/tag xml_file/) {
- $log->logconfess("need argument $req") unless $arg->{$req};
- }
-
- $self->{'tag'} = $arg->{'tag'};
- my $xml_file = $arg->{'xml_file'};
-
- $log->info("using $xml_file tag <",$self->{'tag'},">");
-
- $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
-
- $self->{'import_xml_file'} = $xml_file;
-
- my @force_array = [ $self->{'tag'}, 'config', 'format' ];
- push @force_array, $self->{'tags'} if ($self->{'tags'});
-
- $self->{'import_xml'} = XMLin($xml_file,
- ForceArray => @force_array,
- ForceContent => 1,
- );
-
- $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled");
-
- #print STDERR Dump($self->{import_xml});
-
- return $self;
-}
-
-=head2 open_yaml
-
-Read normalisation rules defined in YAML file located usually at
-C<conf/normalize/*.yml> and parse it.
-
- my $n = new WebPAC::Normalize::XML;
- $n->open_yaml(
- tag => 'isis',
- path => '/path/to/conf/normalize/isis.yml',
- );
-
-=cut
-
-sub open_yaml {
- my $self = shift;
-
- my $arg = {@_};
-
- my $log = $self->_get_logger();
-
- foreach my $req (qw/tag path/) {
- $log->logconfess("need argument $req") unless $arg->{$req};
- }
-
- my $path = $arg->{path};
- $self->{tag} = $arg->{tag};
-
- $log->logdie("normalisation yaml file '$path' doesn't exist!") if (! -e $path);
-
- $log->info("using $path normalization YAML");
-
- $self->{'import_xml'} = LoadFile( $path ) || $log->die("can't load $path: $!");
-
- $log->debug("import yaml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled");
-
- $self->{_skip_x} = 1;
-
- return $self;
-}
-
-=head2 _x
-
-Convert string from XML UTF-8 encoding to code page defined in C<xml_file>.
-
- my $text = $n->_x('utf8 text');
-
-Default application code page is C<ISO-8859-2>. You will probably want to
-change that when creating new instance of object based on this one.
-
-=cut
-
-sub _x {
- my $self = shift;
- my $utf8 = shift || return;
- return $utf8 if ($self->{_skip_x});
-
- # create UTF-8 convertor for import_xml files
- $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2');
-
- return $self->{'utf2cp'}->convert($utf8) ||
- $self->_get_logger()->logwarn("can't convert '$utf8'");
-}
-
-
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 COPYRIGHT & LICENSE
-
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
-
-=cut
-
-1; # End of WebPAC::Normalize::XML
use KinoSearch::Analysis::PolyAnalyzer;
use Encode qw/from_to/;
use Data::Dumper;
+use Storable;
=head1 NAME
=head1 VERSION
-Version 0.01
+Version 0.02
=cut
-our $VERSION = '0.01';
+our $VERSION = '0.02';
=head1 SYNOPSIS
analyzer => $analyzer,
);
+ my $fields_path = $self->{index_path} . '/fields.storable';
+ $fields_path =~ s#//#/#g;
+ if (-e $fields_path) {
+	# low-precedence 'or': with '||' a failed retrieve() would assign
+	# $log->warn()'s return value into $self->{fields} instead of undef
+	$self->{fields} = retrieve($fields_path)
+		or $log->warn("can't open $fields_path: $!");
+ } else {
+ $log->error("This will be dummy run since no fields statistics are found!");
+ $log->error("You will have to re-run indexing to get search results!");
+ $self->{dummy_run} = 1;
+ }
+ $self->{fields_path} = $fields_path;
+
foreach my $f (@{ $self->{fields} }) {
$self->{invindex}->spec_field(
name => $f,
sub add_value($$$$$) {
my ($self,$log,$doc,$n,$v) = @_;
return unless ($v);
+
+ $self->{value_usage}->{$n}++;
+ return if ($self->{dummy_run});
+
eval { $doc->set_value($n, $self->convert($v) ) };
$log->warn("can't insert: $n = $v") if ($@);
}
sub finish {
my $self = shift;
- $self->_get_logger()->info("finish index writing to disk");
+ my $log = $self->_get_logger();
+
+ $log->info("finish index writing to disk");
$self->{invindex}->finish;
+
+ $log->info("writing value usage file");
+
+ # add fields from last run
+ map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };
+
+ my @fields = keys %{ $self->{value_usage} };
+	# 'or' after the full call (not '||' inside the argument list): the old
+	# form parenthesized as store(\@fields, $path || warn), so the warn could
+	# never fire on a failed store()
+	store(\@fields, $self->{fields_path})
+		or $log->warn("can't write $self->{fields_path}: $!");
+
}
=head2 convert
use WebPAC::Lookup;
use WebPAC::Input 0.03;
use WebPAC::Store 0.03;
-use WebPAC::Normalize::XML;
-use WebPAC::Normalize::Set;
+use WebPAC::Normalize;
use WebPAC::Output::TT;
use WebPAC::Validate;
use YAML qw/LoadFile/;
path to YAML configuration file
-=item --force-set
-
-force conversion C<< normalize->path >> in C<config.yml> from
-C<.xml> to C<.pl>
-
=item --stats
disable indexing and dump statistics about field and subfield
my $config = 'conf/config.yml';
my $debug = 0;
my $only_filter;
-my $force_set = 0;
my $stats = 0;
my $validate_path;
"only=s" => \$only_filter,
"config" => \$config,
"debug" => \$debug,
- "force-set" => \$force_set,
"stats" => \$stats,
"validate=s" => \$validate_path,
);
%{ $input },
);
- my $n = new WebPAC::Normalize::XML(
- # filter => { 'foo' => sub { shift } },
- db => $db,
- lookup_regex => $lookup ? $lookup->regex : undef,
- lookup => $lookup,
- prefix => $input->{name},
- );
-
my $rules;
my $normalize_path = $input->{normalize}->{path};
- if ($force_set) {
- my $new_norm_path = $normalize_path;
- $new_norm_path =~ s/\.xml$/.pl/;
- if (-e $new_norm_path) {
- $log->debug("--force-set replaced $normalize_path with $new_norm_path");
- $normalize_path = $new_norm_path;
- } else {
- $log->debug("--force-set failed on $new_norm_path, fallback to $normalize_path");
- }
- }
+ $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
- if ($normalize_path =~ m/\.xml$/i) {
- $n->open(
- tag => $input->{normalize}->{tag},
- xml_file => $normalize_path,
- );
- } elsif ($normalize_path =~ m/\.(?:yml|yaml)$/i) {
- $n->open_yaml(
- path => $normalize_path,
- tag => $input->{normalize}->{tag},
- );
- } elsif ($normalize_path =~ m/\.(?:pl)$/i) {
- $n = undef;
- $log->info("using WebPAC::Normalize::Set to process $normalize_path");
- $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
- }
+	# no 'my': $rules is already declared above (kept context line), a second
+	# 'my' here would mask it and leave the outer variable undef
+	$rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
foreach my $pos ( 0 ... $input_db->size ) {
}
- my $ds;
- if ($n) {
- $ds = $n->data_structure($row);
- } else {
- $ds = WebPAC::Normalize::Set::data_structure(
- row => $row,
- rules => $rules,
- lookup => $lookup ? $lookup->lookup_hash : undef,
- );
-
- $db->save_ds(
- id => $mfn,
- ds => $ds,
- prefix => $input->{name},
- ) if ($ds && !$stats);
- }
+ my $ds = WebPAC::Normalize::data_structure(
+ row => $row,
+ rules => $rules,
+ lookup => $lookup ? $lookup->lookup_hash : undef,
+ );
+
+ $db->save_ds(
+ id => $mfn,
+ ds => $ds,
+ prefix => $input->{name},
+ ) if ($ds && !$stats);
$indexer->add(
id => $input->{name} . "/" . $mfn,
use_ok( 'WebPAC::Input::ISIS' );
use_ok( 'WebPAC::Store' );
use_ok( 'WebPAC::Lookup' );
-use_ok( 'WebPAC::Normalize::XML' );
+use_ok( 'WebPAC::Normalize' );
use_ok( 'WebPAC::Output' );
use_ok( 'WebPAC::Output::Estraier' );
use_ok( 'WebPAC::Output::TT' );
+++ /dev/null
-#!/usr/bin/perl -w
-
-use strict;
-
-use Test::More tests => 67;
-use Test::Exception;
-use Cwd qw/abs_path/;
-use blib;
-use File::Slurp;
-
-use Data::Dumper;
-my $debug = shift @ARGV;
-
-BEGIN {
- use_ok( 'WebPAC::Normalize::Set' );
-}
-
-ok(my $abs_path = abs_path($0), "abs_path");
-$abs_path =~ s#/[^/]*$#/#;
-diag "abs_path: $abs_path" if ($debug);
-
-#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
-
-my $rec1 = {
- '200' => [{
- 'a' => '200a',
- 'b' => '200b',
- },{
- 'c' => '200c',
- 'd' => '200d',
- },{
- 'a' => '200a*2',
- 'd' => '200d*2',
- }],
- '201' => [{
- 'x' => '201x',
- 'y' => '201y',
- }],
- '900' => [
- '900-no_subfield'
- ],
- '901' => [{
- 'a' => '900a',
- }],
- '902' => [{
- 'z' => '900',
- }],
-};
-
-my $rec2 = {
- '675' => [ {
- 'a' => '159.9'
- } ],
- '210' => [ {
- 'c' => 'New York University press',
- 'a' => 'New York',
- 'd' => 'cop. 1988'
- } ],
- '700' => [ {
- 'a' => 'Haynal',
- 'b' => 'André'
- } ],
- '801' => [ 'FFZG' ],
- '991' => [ '8302' ],
- '000' => [ 1 ],
- '702' => [ {
- 'a' => 'Holder',
- 'b' => 'Elizabeth'
- } ],
- '215' => [ {
- 'c' => 'ilustr',
- 'a' => 'xix, 202 str',
- 'd' => '23cm'
- } ],
- '990' => [
- '2140',
- '88',
- 'HAY'
- ],
- '200' => [ {
- 'e' => 'from Freud and Ferenczi to Michael balint',
- 'a' => 'Controversies in psychoanalytic method',
- 'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
- 'f' => 'by André E. Haynal'
- } ],
- '610' => [ 'povijest psihoanalize' ],
- '994' => [ {
- 'c' => '',
- 'a' => 'PS',
- 'b' => 'MG'
- } ],
- '320' => [ 'Kazalo' ],
- '101' => [ 'ENG' ],
- '686' => [ '2140' ],
- '300' => [ 'Prijevod djela: ' ],
-};
-
-
-my $lookup1 = {
- '00900' => [
- 'lookup 1',
- 'lookup 2',
- ],
-};
-
-my $lookup2 = {
- '00900' => 'lookup',
-};
-
-
-sub test {
- print Dumper( @_ ), ("-" x 78), "\n";
- ok( defined(@_) );
-}
-
-# how much of string evaled to display?
-my $max_eval_output = 170;
-
-sub dump_error {
- my ($msg,$code) = @_;
-
- my @l = split(/[\n\r]/, $code);
- my $out = "$msg\n";
-
- foreach my $i ( 0 .. $#l ) {
- $out .= sprintf("%2d: %s\n", $i, $l[$i]);
- }
-
- return $out;
-}
-
-sub test_s {
- my $t = shift || die;
-
- my $eval_t = $t;
- $eval_t =~ s/[\n\r\s]+/ /gs;
- $eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output);
-
- eval "$t";
- ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t");
-}
-
-{
- no strict 'subs';
- use WebPAC::Normalize::Set;
-
- ok(! set_lookup( undef ), "set_lookup(undef)");
-
- set_rec( $rec1 );
-
- cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' );
- cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' );
- cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' );
- diag "is_deeply checks\n";
- is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] );
- is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]);
- is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]);
- is_deeply( \[ rec('902') ], \[ '900' ] );
-
- cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' );
-
- # simple list manipulatons
- cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix');
- cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix');
- cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround');
-
-
- set_lookup( $lookup1 );
-
- cmp_ok(
- join_with(" i ",
- lookup(
- regex( 's/^/00/',
- rec2('902','z')
- )
- )
- ),
- 'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2');
-
- # check join_with operations
-
- sub test_join_with_2 {
- my ($a,$b,$e) = @_;
-
- cmp_ok(
- join_with(" <1> ",
- rec('201',$a),
- rec('201',$b),
- ),
- 'eq', $e, "join_with $a <1> $b = $e");
- }
-
- test_join_with_2('_','_','');
- test_join_with_2('x','_','201x');
- test_join_with_2('_','x','201x');
- test_join_with_2('x','y','201x <1> 201y');
-
- sub test_join_with_3 {
- my ($a,$b,$c,$e) = @_;
-
- cmp_ok(
- join_with(" <1> ", rec('201',$a),
- join_with(" <2> ", rec('201',$b),
- rec('201',$c),
- )
- ),
- 'eq', $e, "join_with $a <1> $b <2> $c = $e");
- };
-
- test_join_with_3('_','_','_','');
- test_join_with_3('x','_','_','201x');
- test_join_with_3('_','x','_','201x');
- test_join_with_3('_','_','x','201x');
- test_join_with_3('x','y','_','201x <1> 201y');
- test_join_with_3('x','_','y','201x <1> 201y');
- test_join_with_3('_','x','y','201x <2> 201y');
- test_join_with_3('x','_','y','201x <1> 201y');
- test_join_with_3('x','y','x','201x <1> 201y <2> 201x');
-
- # test lookups
-
- set_lookup( $lookup2 );
-
- is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' );
-
- ok(! lookup('non-existent'), 'lookup non-existant' );
-
- set_rec( $rec2 );
-
- test_s(qq{
- tag('Title',
- rec('200','a')
- );
- });
- test_s(qq{
- tag('Who',
- join_with(" ",
- rec('702','a'),
- rec('702','b')
- )
- );
- });
-
- test_s(qq{
- display('Publisher',
- rec('210','c')
- )
- });
-
- test_s(qq{
- search('Year',
- regex( 's/[^\\d]+//',
- rec('210','d')
- )
- )
- });
-
- ok(my $ds = get_ds(), "get_ds");
- diag "ds = ", Dumper($ds) if ($debug);
-
-
- sub test_check_ds {
-
- my $t = shift;
-
- ok($ds = get_ds(), 'get_ds');
- diag Dumper( $ds ) if ($debug);
-
- ok( $ds && $ds->{something}, 'get_ds->something exists' );
- ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t);
- ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' );
-
- return $ds;
- }
-
- clean_ds();
- test_s(qq{ search('something', '42'); });
- test_s(qq{ search('empty', ''); });
- test_check_ds('search');
-
- clean_ds();
- test_s(qq{ display('something', '42'); });
- test_s(qq{ display('empty', ''); });
- test_check_ds('display');
-
- clean_ds();
- test_s(qq{ tag('something', '42'); });
- test_s(qq{ tag('empty', ''); });
- test_check_ds('search');
- test_check_ds('display');
-
- clean_ds();
- my $n = read_file( "$abs_path/data/normalize.pl" );
- $n .= "\n1;\n";
- #diag "normalize code:\n$n\n";
- test_s( $n );
-
- ok($ds = get_ds(), "get_ds");
- diag "ds = ", Dumper($ds) if ($debug);
-
- my $rec = {
- '200' => [{
- 'a' => '200a',
- 'b' => '200b',
- }],
- };
- my $rules = qq{ search('mixed', rec('200') ) };
-
- clean_ds();
- set_rec( $rec );
- test_s( $rules );
- ok($ds = get_ds(), "get_ds");
- is_deeply( $ds, {
- 'mixed' => {
- 'search' => [ '200a', '200b' ],
- 'tag' => 'mixed'
- }
- }, 'correct get_ds');
-
- ok(my $ds2 = WebPAC::Normalize::Set::data_structure(
- row => $rec,
- rules => $rules,
- ), 'data_structure');
- is_deeply( $ds, $ds2, 'data_structure(s) same');
-
- # wird and non-valid structure which is supported anyway
- clean_ds();
- set_rec({
- '200' => [{
- 'a' => '200a',
- },
- '200-solo'
- ]
- });
- test_s(qq{ search('mixed', rec('200') ) });
- ok($ds = get_ds(), "get_ds");
- is_deeply( $ds, {
- 'mixed' => {
- 'search' => [ '200a', '200-solo' ],
- 'tag' => 'mixed'
- }
- }, 'correct get_ds');
-
-}
-
+++ /dev/null
-#!/usr/bin/perl -w
-
-use Test::More tests => 140;
-use Test::Exception;
-use Cwd qw/abs_path/;
-use blib;
-use strict;
-use Data::Dumper;
-
-BEGIN {
-use_ok( 'WebPAC::Normalize::XML' );
-}
-
-ok(my $abs_path = abs_path($0), "abs_path");
-$abs_path =~ s#/[^/]*$#/#;
-#diag "abs_path: $abs_path";
-
-throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
-throws_ok { new WebPAC::Normalize::XML( lookup => 'bar' ) } qr/pair/, "lookup without lookup_regex";
-
-ok(my $n = new WebPAC::Normalize::XML(
- debug => 0,
- filter => {
- regex => sub {
- my ($val, $regex) = @_;
- eval "\$val =~ $regex";
- return $val;
- },
- },
-), "new");
-
-throws_ok { $n->open() } qr/tag/, "open without tag";
-throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file";
-throws_ok { $n->open( tag => 'isis', xml_file => '/foo/bar/baz' ) } qr/file.*doesn't exist/, "open with invalid xml_file";
-ok( $n->open(
- tag => 'isis',
- xml_file => "$abs_path/data/normalize.xml",
-), "open");
-
-my $rec = {
- '675' => [
- {
- 'a' => '159.9'
- }
- ],
- '210' => [
- {
- 'c' => 'New York University press',
- 'a' => 'New York',
- 'd' => 'cop. 1988'
- }
- ],
- '700' => [
- {
- 'a' => 'Haynal',
- 'b' => 'André'
- }
- ],
- '801' => [
- 'FFZG'
- ],
- '991' => [
- '8302'
- ],
- '000' => [
- 1
- ],
- '702' => [
- {
- 'a' => 'Holder',
- 'b' => 'Elizabeth'
- }
- ],
- '215' => [
- {
- 'c' => 'ilustr',
- 'a' => 'xix, 202 str',
- 'd' => '23cm'
- }
- ],
- '990' => [
- '2140',
- '88',
- 'HAY'
- ],
- '200' => [
- {
- 'e' => 'from Freud and Ferenczi to Michael balint',
- 'a' => 'Controversies in psychoanalytic method',
- 'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
- 'f' => 'by André E. Haynal'
- }
- ],
- '610' => [
- 'povijest psihoanalize'
- ],
- '994' => [
- {
- 'c' => '',
- 'a' => 'PS',
- 'b' => 'MG'
- }
- ],
- '320' => [
- 'Kazalo'
- ],
- '101' => [
- 'ENG'
- ],
- '686' => [
- '2140'
- ],
- '300' => [
- 'Prijevod djela: '
- ]
-};
-
-foreach my $fld (keys %$rec) {
- my $r = 0;
- foreach my $item ($rec->{$fld}) {
- if (ref($item) eq 'HASH') {
- foreach my $sf (keys %$item) {
- my $found = 0;
- ok($n->get_data(\$rec, $fld, $sf, $r, \$found), "v${fld}^${sf} / $r");
- ok($found, "found");
- }
- my $found = 0;
- ok(! $n->get_data(\$rec, $fld, 'x', $r, \$found), "no v${fld}^x / $r");
- ok(! $found, "not found");
- } else {
- my $found = 0;
- ok($n->get_data(\$rec, $fld, undef, $r, \$found), "v${fld} / $r");
- ok($found, "found");
- }
- }
- my $found = 0;
- ok(! $n->get_data(\$rec, '999', '', $r, \$found), "no v${fld} / $r");
- ok(! $found, "not found");
-}
-
-ok(my $ds = $n->data_structure( $rec ), "data_structure");
-
-#diag Dumper($rec, $ds);
-
-# fake load of our test normalisation data
-$n->{tag} = 'isis';
-
-#diag Dumper($n->{import_xml}->{indexer});
-
-
-$rec = {
- '900' => [ {
- 'a' => '1',
- 'b' => '2',
- 'c' => '3',
- 'x' => 'yap',
- } ],
-};
-
-my $import = {
- 'Tag' => { 'isis' => [
- { content => 'v900^a + v900^b = v900^c [v900^x]' },
- ] },
-};
-
-sub parse_test($$$$$) {
- my ($import,$tag,$rec,$i,$r) = @_;
- $n->{import_xml}->{indexer} = $import;
- # erase internal cache (yak!)
- delete($n->{_tags_by_order});
- push @{$rec->{'000'}}, 42 unless ($rec->{'000'});
- #diag "test normalisation of: ",Dumper($n->{import_xml}->{indexer}, $rec);
- ok(my $ds = $n->data_structure( $rec ), "data_structure");
- #diag Dumper($ds->{$tag}->{display}); # if ($i == 0);
- cmp_ok($ds->{$tag}->{display}->[$i], 'eq', $r, "parse $tag/$i - $r");
-}
-
-parse_test($import, 'Tag', $rec, 0, '1 + 2 = 3 [yap]');
-
-delete($rec->{'900'}->[0]->{'b'});
-parse_test($import, 'Tag', $rec, 0, '1 = 3 [yap]');
-
-$rec->{'900'}->[0]->{'b'} = 5;
-$rec->{'900'}->[0]->{'c'} = 6;
-parse_test($import, 'Tag', $rec, 0, '1 + 5 = 6 [yap]');
-
-delete($rec->{'900'}->[0]->{'c'});
-$rec->{'900'}->[0]->{'x'} = 'hmmm';
-parse_test($import, 'Tag', $rec, 0, '1 + 5 [hmmm]');
-
-$rec->{'900'}->[0]->{'x'} = 'nope!';
-delete($rec->{'900'}->[0]->{'a'});
-parse_test($import, 'Tag', $rec, 0, '5 [nope!]');
-
-$rec = {
- '900' => [ {
- 'b' => 'b1',
- 'x' => 'b1',
- },{
- 'a' => 'a1',
- 'b' => 'b2',
- 'x' => 'a1_b2',
- },{
- 'b' => 'b3',
- 'c' => 'c1',
- 'x' => 'b3_c1',
- },{
- 'a' => 'a2',
- 'b' => 'b4',
- 'c' => 'c2',
- 'x' => 'a2_b4_c2',
- } ],
-};
-
-$import = {
- 'Tag' => { 'isis' => [
- { content => '0 v900^a 1 v900^b 2 v900^c 3 v900^x 4' },
- ] },
-};
-
-parse_test($import, 'Tag', $rec, 0, '0 b1 3 b1 4');
-parse_test($import, 'Tag', $rec, 1, '0 a1 1 b2 3 a1_b2 4');
-parse_test($import, 'Tag', $rec, 2, '0 b3 2 c1 3 b3_c1 4');
-parse_test($import, 'Tag', $rec, 3, '0 a2 1 b4 2 c2 3 a2_b4_c2 4');
-
-sub parse_test_arr($$$$) {
- my ($import, $tag, $rec, $arr) = @_;
- my $i = 0;
- foreach my $res (@{$arr}) {
- parse_test($import, $tag, $rec, $i, $res);
- $i++;
- }
-}
-
-$import = {
- 'Tag_a' => { 'isis' => [
- { content => 'v900^a' },
- ] },
- 'Tag_b' => { 'isis' => [
- { content => 'v900^b' },
- ] },
- 'Tag_c' => { 'isis' => [
- { content => 'v900^c' },
- ] },
- 'Tag_x' => { 'isis' => [
- { content => 'v900^x' },
- ] },
- 'Tag_s1' => { 'isis' => [
- { content => 'v900^a = v900^c' },
- ] },
- 'Tag_noval' => { 'isis' => [
- { content => 'v911^1' },
- { content => 'v900^c' },
- ] },
-};
-
-parse_test_arr($import, 'Tag_a', $rec, [ '','a1','','a2' ] );
-parse_test_arr($import, 'Tag_b', $rec, [ 'b1','b2','b3','b4' ] );
-parse_test_arr($import, 'Tag_c', $rec, [ '','','c1','c2' ] );
-parse_test_arr($import, 'Tag_x', $rec, [ 'b1','a1_b2','b3_c1','a2_b4_c2' ] );
-parse_test_arr($import, 'Tag_s1', $rec, [ '', 'a1', 'c1', 'a2 = c2' ] );
-parse_test_arr($import, 'Tag_noval', $rec, [ '','','c1','c2' ] );
-
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+
+use Test::More tests => 67;
+use Test::Exception;
+use Cwd qw/abs_path/;
+use blib;
+use File::Slurp;
+
+use Data::Dumper;
+my $debug = shift @ARGV;
+
+BEGIN {
+ use_ok( 'WebPAC::Normalize' );
+}
+
+ok(my $abs_path = abs_path($0), "abs_path");
+$abs_path =~ s#/[^/]*$#/#;
+diag "abs_path: $abs_path" if ($debug);
+
+#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
+
+my $rec1 = {
+ '200' => [{
+ 'a' => '200a',
+ 'b' => '200b',
+ },{
+ 'c' => '200c',
+ 'd' => '200d',
+ },{
+ 'a' => '200a*2',
+ 'd' => '200d*2',
+ }],
+ '201' => [{
+ 'x' => '201x',
+ 'y' => '201y',
+ }],
+ '900' => [
+ '900-no_subfield'
+ ],
+ '901' => [{
+ 'a' => '900a',
+ }],
+ '902' => [{
+ 'z' => '900',
+ }],
+};
+
+my $rec2 = {
+ '675' => [ {
+ 'a' => '159.9'
+ } ],
+ '210' => [ {
+ 'c' => 'New York University press',
+ 'a' => 'New York',
+ 'd' => 'cop. 1988'
+ } ],
+ '700' => [ {
+ 'a' => 'Haynal',
+ 'b' => 'André'
+ } ],
+ '801' => [ 'FFZG' ],
+ '991' => [ '8302' ],
+ '000' => [ 1 ],
+ '702' => [ {
+ 'a' => 'Holder',
+ 'b' => 'Elizabeth'
+ } ],
+ '215' => [ {
+ 'c' => 'ilustr',
+ 'a' => 'xix, 202 str',
+ 'd' => '23cm'
+ } ],
+ '990' => [
+ '2140',
+ '88',
+ 'HAY'
+ ],
+ '200' => [ {
+ 'e' => 'from Freud and Ferenczi to Michael balint',
+ 'a' => 'Controversies in psychoanalytic method',
+ 'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
+ 'f' => 'by André E. Haynal'
+ } ],
+ '610' => [ 'povijest psihoanalize' ],
+ '994' => [ {
+ 'c' => '',
+ 'a' => 'PS',
+ 'b' => 'MG'
+ } ],
+ '320' => [ 'Kazalo' ],
+ '101' => [ 'ENG' ],
+ '686' => [ '2140' ],
+ '300' => [ 'Prijevod djela: ' ],
+};
+
+
+my $lookup1 = {
+ '00900' => [
+ 'lookup 1',
+ 'lookup 2',
+ ],
+};
+
+my $lookup2 = {
+ '00900' => 'lookup',
+};
+
+
+sub test {
+ print Dumper( @_ ), ("-" x 78), "\n";
+ ok( defined(@_) );
+}
+
+# how much of string evaled to display?
+my $max_eval_output = 170;
+
+sub dump_error {
+ my ($msg,$code) = @_;
+
+ my @l = split(/[\n\r]/, $code);
+ my $out = "$msg\n";
+
+ foreach my $i ( 0 .. $#l ) {
+ $out .= sprintf("%2d: %s\n", $i, $l[$i]);
+ }
+
+ return $out;
+}
+
+sub test_s {
+ my $t = shift || die;
+
+ my $eval_t = $t;
+ $eval_t =~ s/[\n\r\s]+/ /gs;
+ $eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output);
+
+ eval "$t";
+ ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t");
+}
+
+{
+ no strict 'subs';
+ use WebPAC::Normalize;
+
+ ok(! set_lookup( undef ), "set_lookup(undef)");
+
+ set_rec( $rec1 );
+
+ cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' );
+ cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' );
+ cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' );
+ diag "is_deeply checks\n";
+ is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] );
+ is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]);
+ is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]);
+ is_deeply( \[ rec('902') ], \[ '900' ] );
+
+ cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' );
+
+	# simple list manipulations
+ cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix');
+ cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix');
+ cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround');
+
+
+ set_lookup( $lookup1 );
+
+ cmp_ok(
+ join_with(" i ",
+ lookup(
+ regex( 's/^/00/',
+ rec2('902','z')
+ )
+ )
+ ),
+ 'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2');
+
+ # check join_with operations
+
+ sub test_join_with_2 {
+ my ($a,$b,$e) = @_;
+
+ cmp_ok(
+ join_with(" <1> ",
+ rec('201',$a),
+ rec('201',$b),
+ ),
+ 'eq', $e, "join_with $a <1> $b = $e");
+ }
+
+ test_join_with_2('_','_','');
+ test_join_with_2('x','_','201x');
+ test_join_with_2('_','x','201x');
+ test_join_with_2('x','y','201x <1> 201y');
+
+ sub test_join_with_3 {
+ my ($a,$b,$c,$e) = @_;
+
+ cmp_ok(
+ join_with(" <1> ", rec('201',$a),
+ join_with(" <2> ", rec('201',$b),
+ rec('201',$c),
+ )
+ ),
+ 'eq', $e, "join_with $a <1> $b <2> $c = $e");
+ };
+
+ test_join_with_3('_','_','_','');
+ test_join_with_3('x','_','_','201x');
+ test_join_with_3('_','x','_','201x');
+ test_join_with_3('_','_','x','201x');
+ test_join_with_3('x','y','_','201x <1> 201y');
+ test_join_with_3('x','_','y','201x <1> 201y');
+ test_join_with_3('_','x','y','201x <2> 201y');
+ test_join_with_3('x','_','y','201x <1> 201y');
+ test_join_with_3('x','y','x','201x <1> 201y <2> 201x');
+
+ # test lookups
+
+ set_lookup( $lookup2 );
+
+ is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' );
+
+ ok(! lookup('non-existent'), 'lookup non-existant' );
+
+ set_rec( $rec2 );
+
+ test_s(qq{
+ tag('Title',
+ rec('200','a')
+ );
+ });
+ test_s(qq{
+ tag('Who',
+ join_with(" ",
+ rec('702','a'),
+ rec('702','b')
+ )
+ );
+ });
+
+ test_s(qq{
+ display('Publisher',
+ rec('210','c')
+ )
+ });
+
+ test_s(qq{
+ search('Year',
+ regex( 's/[^\\d]+//',
+ rec('210','d')
+ )
+ )
+ });
+
+ ok(my $ds = get_ds(), "get_ds");
+ diag "ds = ", Dumper($ds) if ($debug);
+
+
+ sub test_check_ds {
+
+ my $t = shift;
+
+ ok($ds = get_ds(), 'get_ds');
+ diag Dumper( $ds ) if ($debug);
+
+ ok( $ds && $ds->{something}, 'get_ds->something exists' );
+ ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t);
+ ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' );
+
+ return $ds;
+ }
+
+ clean_ds();
+ test_s(qq{ search('something', '42'); });
+ test_s(qq{ search('empty', ''); });
+ test_check_ds('search');
+
+ clean_ds();
+ test_s(qq{ display('something', '42'); });
+ test_s(qq{ display('empty', ''); });
+ test_check_ds('display');
+
+ clean_ds();
+ test_s(qq{ tag('something', '42'); });
+ test_s(qq{ tag('empty', ''); });
+ test_check_ds('search');
+ test_check_ds('display');
+
+ clean_ds();
+ my $n = read_file( "$abs_path/data/normalize.pl" );
+ $n .= "\n1;\n";
+ #diag "normalize code:\n$n\n";
+ test_s( $n );
+
+ ok($ds = get_ds(), "get_ds");
+ diag "ds = ", Dumper($ds) if ($debug);
+
+ my $rec = {
+ '200' => [{
+ 'a' => '200a',
+ 'b' => '200b',
+ }],
+ };
+ my $rules = qq{ search('mixed', rec('200') ) };
+
+ clean_ds();
+ set_rec( $rec );
+ test_s( $rules );
+ ok($ds = get_ds(), "get_ds");
+ is_deeply( $ds, {
+ 'mixed' => {
+ 'search' => [ '200a', '200b' ],
+ 'tag' => 'mixed'
+ }
+ }, 'correct get_ds');
+
+ ok(my $ds2 = WebPAC::Normalize::data_structure(
+ row => $rec,
+ rules => $rules,
+ ), 'data_structure');
+ is_deeply( $ds, $ds2, 'data_structure(s) same');
+
+	# weird and non-valid structure which is supported anyway
+ clean_ds();
+ set_rec({
+ '200' => [{
+ 'a' => '200a',
+ },
+ '200-solo'
+ ]
+ });
+ test_s(qq{ search('mixed', rec('200') ) });
+ ok($ds = get_ds(), "get_ds");
+ is_deeply( $ds, {
+ 'mixed' => {
+ 'search' => [ '200a', '200-solo' ],
+ 'tag' => 'mixed'
+ }
+ }, 'correct get_ds');
+
+}
+
use strict;
-use Test::More tests => 41;
+use Test::More tests => 24;
use Test::Exception;
use Cwd qw/abs_path/;
use File::Temp qw/tempdir/;
use_ok( 'WebPAC::Lookup' );
use_ok( 'WebPAC::Input' );
use_ok( 'WebPAC::Store' );
-use_ok( 'WebPAC::Normalize::XML' );
-use_ok( 'WebPAC::Normalize::Set' );
+use_ok( 'WebPAC::Lookup::Normalize' );
+use_ok( 'WebPAC::Normalize' );
use_ok( 'WebPAC::Output::TT' );
}
my $isis_file = "$abs_path../t/winisis/BIBL";
#$isis_file = '/data/hidra/THS/THS';
-$isis_file = '/data/isis_data/ffkk/';
+#$isis_file = '/data/isis_data/ffkk/';
diag "isis_file: $isis_file" if ($debug);
my $normalize_set_pl = "$abs_path/data/normalize.pl";
my $lookup_file = "$abs_path../conf/lookup/isis.pm";
-my ($t1,$t2) = (0,0);
-
ok(my $lookup = new WebPAC::Lookup(
lookup_file => $lookup_file,
), "new Lookup");
database => '.',
), "new Store");
-ok(my $n = new WebPAC::Normalize::XML(
-# filter => { 'foo' => sub { shift } },
- db => $db,
- lookup_regex => $lookup->regex,
- lookup => $lookup,
- no_progress_bar => 1,
-), "new Normalize::XML");
-
-ok($n->open(
- tag => 'isis',
- xml_file => "$abs_path/data/normalize.xml",
-), "Normalize::XML->open");
-
ok(my $norm_pl = read_file( $normalize_set_pl ), "set definitions: $normalize_set_pl" );
ok(my $out = new WebPAC::Output::TT(
diag " lookup => ",Dumper($lookup->lookup_hash) if ($debug);
+my $t_norm = 0;
+
foreach my $pos ( 0 ... $isis->size ) {
my $row = $isis->fetch || next;
diag " row $pos => ",Dumper($row) if ($debug);
my $t = time();
- ok(my $ds = $n->data_structure($row), "XML data_structure");
- $t1 += time() - $t;
-
- diag " ds $pos => ",Dumper($ds) if ($debug);
-
- $t = time();
- ok( my $ds2 = WebPAC::Normalize::Set::data_structure(
+ ok( my $ds = WebPAC::Normalize::data_structure(
lookup => $lookup->lookup_hash,
row => $row,
rules => $norm_pl,
), "Set data_structure");
- $t2 += time() - $t;
+ $t_norm += time() - $t;
- diag " ds2 $pos => ",Dumper($ds2) if ($debug);
- is_deeply( $ds, $ds2, 'ds same for xml and sets');
+ diag " ds $pos => ",Dumper($ds) if ($debug);
ok(my $html = $out->apply(
template => 'html.tt',
};
-diag sprintf("timings: %.2fs vs %.2fs [%1.2f%%]\n", $t1, $t2, ($t1 / $t2) * 100);
+diag sprintf("timings: %.2fs\n", $t_norm);