r719@llin: dpavlin | 2006-06-26 18:40:57 +0200
authorDobrica Pavlinusic <dpavlin@rot13.org>
Mon, 26 Jun 2006 16:39:51 +0000 (16:39 +0000)
committerDobrica Pavlinusic <dpavlin@rot13.org>
Mon, 26 Jun 2006 16:39:51 +0000 (16:39 +0000)
 big refacture: depriciate and remove all normalisation formats except .pl sets (but
 old code is still available in WebPAC::Lookup::Normalize because lookups use it) [2.20]

git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@536 07558da8-63fa-0310-ba24-9fe276d99e06

18 files changed:
MANIFEST
Makefile.PL
TODO
conf/normalize/isis.xml [deleted file]
conf/normalize/isis_ffzg.xml [deleted file]
lib/WebPAC.pm
lib/WebPAC/Lookup.pm
lib/WebPAC/Lookup/Normalize.pm [new file with mode: 0644]
lib/WebPAC/Normalize.pm
lib/WebPAC/Normalize/Set.pm [deleted file]
lib/WebPAC/Normalize/XML.pm [deleted file]
lib/WebPAC/Output/KinoSearch.pm
run.pl
t/0-load.t
t/3-normalize-set.t [deleted file]
t/3-normalize-xml.t [deleted file]
t/3-normalize.t [new file with mode: 0755]
t/6-unit.t

index 469227b..94d97ae 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -10,7 +10,6 @@ lib/WebPAC/Store.pm
 lib/WebPAC/Input.pm
 lib/WebPAC/Input/ISIS.pm
 lib/WebPAC/Normalize.pm
-lib/WebPAC/Normalize/XML.pm
 lib/WebPAC/Output.pm
 lib/WebPAC/Output/CDBI.pm
 lib/WebPAC/Output/Estraier.pm
@@ -33,4 +32,3 @@ t/pod-coverage.t
 t/pod.t
 conf/lookup/example.pm
 conf/lookup/isis.pm
-conf/normalize/isis.xml
index 2261619..ee3c0d3 100644 (file)
@@ -15,10 +15,8 @@ WriteMakefile(
        'Log::Log4perl' => 1.02,
        'Data::Dumper' => 0,
        'Cwd' => 0,
-       'Text::Iconv' => 0,
        'Storable' => 0,
        'DBM::Deep' => 0,
-       'XML::Simple' => 0,
        'Template' => 0,
        'Time::HiRes' => 0,
        'File::Temp' => 0,
diff --git a/TODO b/TODO
index a643614..4171ec9 100644 (file)
--- a/TODO
+++ b/TODO
@@ -16,7 +16,7 @@
 + added --stats to report field and subfield usage [2.14]
 + add validator for input data [2.15]
 + add Excel input format [2.16]
-- fix varous annoyences in code [2.17]
++ remove WebPAC::Normalize::XML and promote WebPAC::Normalize::Set to WebPAC::Normalize [2.20]
 - support arrays for normalize/path and lookup
 - add dBase input format
 - remove delimiters characters from index and query entered
diff --git a/conf/normalize/isis.xml b/conf/normalize/isis.xml
deleted file mode 100644 (file)
index c94018a..0000000
+++ /dev/null
@@ -1,294 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-2"?>
-<!--
-       isis_codepage is code page which isis uses
-       index_codepage is code page which will be stored in index
-
-       format of this file is described in: doc/formating_xml.txt
--->
-<config
-       isis_codepage="852"
->
-
-<!--
-       <isis type="swish|display|index">_pre_v000^x_sep_v000^x_sep_v000^x_post_</isis>
-       <config type="swish|display">name of var from config file</config>
--->
-
-       <format name="IDths">
-               <!--
-               <![CDATA[<a href="?rm=results&show_full=1&f=IDths&v=%s">%s</a>]]>
-               -->
-               <![CDATA[<a href="lookup{900_mfn:%s}.html">%s</a>]]>
-       </format>
-
-    <indexer>
-       <headline>
-               <isis type="display">v250^a</isis>
-               <isis type="display" append="1">[v251]</isis>
-               <isis type="display" append="1"> / [v562^4] v562^a</isis> 
-               <isis type="display" append="1">filter{CROVOC}v800</isis>
-       </headline>
-
-       <filename>
-               <isis type="filename">out/thes/v000.html</isis> 
-       </filename>
-       
-       <AllThes>
-               <isis type="swish">v250 v450 v258 v458 v253 v453 v254 v454 v330 v338</isis>
-       <!--
-               <isis type="index">v250^a / [v561^4]</isis> 
-               <isis type="index">v450^a</isis>
-               <isis type="index">v258^a</isis>
-               <isis type="index">v458^a</isis>
-               <isis type="index">v253^a</isis>
-               <isis type="index">v453^a</isis>
-               <isis type="index">v254^a</isis>
-               <isis type="index">v454^a</isis>
-               <isis type="index">v330^a</isis>
-               <isis type="index">v338^a</isis>
-       -->
-       </AllThes>
-
-       <Source name="Izvor: " order="5">
-               <isis type="display">v800 -- v901^c</isis>
-       </Source>
-       
-       <line name="..................." order="9">
-               <config type="display">line</config>
-       </line>
-       
-       <DescriptorHR name="HRVATSKI:" order="10">
-               <isis type="swish">eval{"v901^a" eq "Deskriptor"}v250^a</isis>
-               <isis type="swish">eval{"s901^a" eq "Deskriptor"}v450^a</isis>
-               <isis type="index">eval{"v901^a" eq "Deskriptor"}v250^a / [v562^4] v562^a</isis>
-               <isis type="index" append="1">eval{qq#v250^a# and "v901^a" eq "Deskriptor"}filter{CROVOC}v800</isis>
-               <!--
-               <isis type="index">eval{"v901^a" ne "Deskriptor"}v250^a [v251]</isis>
-               -->
-               <isis type="index">eval{qq#v450^a#}filter{CROVOC}v450^a >> s250^a / [s562^4] s562^a s800</isis>
-               <isis type="display">[v251]</isis>
-               <isis type="display" append="1">v250^a</isis>
-               <isis type="display" append="1">eval{qq#v250^a#}filter{CROVOC}v800</isis>
-       </DescriptorHR>
-       
-       <NonDescriptorHR name="Uporabi za: " order="20">
-               <isis type="display" >
-                       <delimiter> * </delimiter>
-                       <value>v450^a</value>
-               </isis>
-       </NonDescriptorHR>
-
-       <ScopeNoteHR name="Napomena: " order="30">
-               <isis type="display">
-               <delimiter><![CDATA[<br/>]]></delimiter>
-               <value>v330^a</value>
-               </isis>
-       </ScopeNoteHR>
-       
-       <SubjectAreaHR name="Podruèje: " order="50">
-               <!--<isis type="swish">eval{"v901^a" ne "Deskriptor"}v250^a</isis>-->
-               <isis type="swish">eval{"v901^a" eq "Podruèje"}v250^a</isis>
-               <isis type="index">eval{"v901^a" eq "Podruèje"}[v251] v250^a</isis>
-               <!--
-               <isis type="index" append="1">[v251]</isis>
-               -->
-               <isis type="display">[v561^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v561^1;;v561^a</isis>
-               <isis type="display" append="1" delimiter=" ">lookup{crovoc:v561^1}</isis>
-       </SubjectAreaHR>
-
-       <MicrothesaurusHR name="Potpojmovnik: " order="60">
-               <isis type="display">[v562^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v562^1;;v562^a</isis>
-               <isis type="display" append="1" delimiter=" ">lookup{crovoc:v562^1}</isis>
-       </MicrothesaurusHR>
-<!--
-       <ClassCode name="Klasifikacijski kod:" order="70">
-               <isis>v251</isis>
-       </ClassCode>
--->    
-       <URL name="URL: " order="80">
-               <isis type="display">v856^u</isis>
-       </URL>
-       
-       <BroaderTerm name="©iri pojam: " order="90">
-               <isis type="display" format_name="IDths" format_delimiter=";;">
-                       <delimiter><![CDATA[</li><li>]]></delimiter>
-                       <value>v556^1;;v556^a lookup{crovoc:v556^1}</value>
-               </isis>
-               <isis type="display" format_name="IDths" format_delimiter=";;">
-                       <delimiter><![CDATA[</li><li>]]></delimiter>
-                       <value>v461^1;;v461^a lookup{crovoc:v461^1}</value>
-               </isis> 
-       </BroaderTerm>
-
-       <NarrowerTerm name="U¾i pojam: " order="100">
-               <isis type="display" format_name="IDths" format_delimiter=";;">
-                       <delimiter><![CDATA[</li><li>]]></delimiter>
-                       <value>v553^1;;v553^a lookup{crovoc:v553^1}</value>
-               </isis>
-               <!--
-               <isis type="display" format_name="IDths" format_delimiter=";;" >
-                       <delimiter> * </delimiter>
-                       <value>[a:v561^4];;[d:[a:v561^4]]</value>
-               </isis>
-               -->
-               <isis type="display" format_name="IDths" format_delimiter=";;" >
-                       <delimiter><![CDATA[</li><li>]]></delimiter>
-                       <value>eval{"v901^a" eq "Podruèje"}lookup{a:v251::};;lookup{d:lookup{a:v251::}}</value>
-               </isis>
-               <isis type="display" append="1" format_name="IDths" format_delimiter=";;" >
-                       <delimiter><![CDATA[</li><li>]]></delimiter>
-                       <value>eval{"v901^a" eq "Mikrotezaurus"}lookup{a:v561^4:v251:};;lookup{d:lookup{a:v561^4:v251:}}</value>
-               </isis>
-               <isis type="display" format_name="IDths" format_delimiter=";;" >
-                       <delimiter><![CDATA[</li><li>]]></delimiter>
-                       <value>eval{"v901^a" eq "Deskriptor"}lookup{a:v561^4:v562^4:v900};;lookup{d:lookup{a:v561^4:v562^4:v900}}</value>
-               </isis>
-       </NarrowerTerm> 
-
-       <RelatedTerm name="Srodni pojam: " order="110">
-               <isis type="display"  format_name="IDths" format_delimiter=";;" sort="1">
-                       <delimiter> * </delimiter>
-                       <value>v550^1;;v550^a lookup{crovoc:v550^1}</value>
-               </isis> 
-       </RelatedTerm>
-
-       <SeeNext name="Slijedi: " order="120">
-               <isis type="display"  format_name="IDths" format_delimiter=";;">
-                       <delimiter>, </delimiter>
-                       <value>v440^1;;v440^a lookup{crovoc:v440^1}</value>
-               </isis> 
-       </SeeNext>
-
-       <SeeBefore name="Prethodi: " order="130">
-               <isis type="display"  format_name="IDths" format_delimiter=";;">
-                       <delimiter>, </delimiter>
-                       <value>v430^1;;v430^a lookup{crovoc:v430^1}</value>
-               </isis> 
-       </SeeBefore>
-
-       <line2 name=".................." order="139">
-               <config type="display">line</config>
-       </line2>
-
-       <DescriptorEN name="ENGLESKI:" order="140">
-               <isis type="swish">eval{"v901^a" eq "Deskriptor"}v258^a</isis>
-               <isis type="swish">eval{"s901^a" eq "Deskriptor"}v458^a</isis>
-               <isis type="index">eval{"v901^a" eq "Deskriptor"}v258^a / [v572^4] v572^a</isis>
-               <isis type="index">eval{qq#v458^a#}filter{CROVOC}v458^a >> s258^a / [s572^4] s572^a s800</isis>
-               <isis type="display">v258^a</isis>
-       </DescriptorEN>
-       
-       <ScopeNoteEN name="Napomena: " order="150">
-               <isis type="display">
-               <delimiter><![CDATA[<br/>]]></delimiter>
-               <value>v338^a</value>
-               </isis>
-       </ScopeNoteEN>
-       
-       <NonDescriptorEN name="Uporabi za: " order="160">
-               <isis type="display" >
-                       <delimiter> * </delimiter>
-                       <value>v458^a</value>
-               </isis>
-       </NonDescriptorEN>
-
-       
-       <SubjectAreaEN name="Podruèje: " order="170">
-               <isis type="swish">v571 v572</isis>
-               <isis type="index">[v251]</isis>
-               <isis type="index" append="1">v571^a. v572^a</isis>
-<!--           <isis type="display">[v571^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v571^1;;v571^a</isis>
--->    </SubjectAreaEN>
-
-<!--   <MicrothesaurusEN name="Potpojmovnik: " order="180">
-               <isis type="display">[v572^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v572^1;;v572^a</isis>
-       </MicrothesaurusEN>
--->
-       <line3 name=".................." order="189">
-               <config type="display">eval{"v253^9" eq "1"} line</config>
-       </line3>
-       
-       <DescriptorFR name="FRANCUSKI:" order="190">
-               <isis type="swish">eval{"v901^a" eq "Deskriptor"}v253^a</isis>
-               <isis type="swish">eval{"s901^a" eq "Deskriptor"}v453^a</isis>
-               <isis type="index">eval{"v901^a" eq "Deskriptor"}v253^a / [v572^4] v572^a</isis>
-               <isis type="index">eval{qq#v453^a#}filter{CROVOC}v453^a >> s253^a / [s572^4] s572^a s800</isis>
-               <isis type="display">v253^a</isis>
-       </DescriptorFR>
-
-       <NonDescriptorFR name="Uporabi za: " order="200">
-               <isis type="display">v453^a</isis>
-       </NonDescriptorFR>
-
-       <SubjectAreaFR name="Podruèje: " order="210">
-               <isis type="swish">v573 v574</isis>
-               <isis type="index">[v251] </isis>
-               <isis type="index" append="1">v573^a. v574^a</isis>
-<!--           <isis type="display">[v573^4]</isis>
-       <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v573^1;;v573^a</isis>
--->            </SubjectAreaFR>
-<!--
-       <MicrothesaurusFR name="Potpojmovnik: " order="220">
-               <isis type="display">[v574^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v574^1;;v574^a</isis>
-       </MicrothesaurusFR>
--->    
-       <line4 name=".................." order="229">
-               <config type="display">eval{"v254^9" eq "1"} line</config>
-       </line4>
-       
-       <DescriptorGE name="NJEMAÈKI:" order="230">
-               <isis type="swish">eval{"v901^a" eq "Deskriptor"}v254^a</isis>
-               <isis type="swish">eval{"s901^a" eq "Deskriptor"}v454^a</isis>
-               <isis type="index">eval{"v901^a" eq "Deskriptor"}v254^a / [v572^4] v572^a</isis>
-               <isis type="index">eval{qq#v454^a#}filter{CROVOC}v454^a >> s254^a / [s572^4] s572^a s800</isis>
-               <isis type="display">v254^a</isis>
-       </DescriptorGE>
-
-       <NonDescriptorGE name="Uporabi za: " order="240">
-               <isis type="display">v454^a</isis>
-       </NonDescriptorGE>
-
-       <SubjectAreaGE name="Podruèje: " order="250">
-               <isis type="swish">v575 v576</isis>
-               <isis type="index">[v251] </isis>
-               <isis type="index" append="1">v575^a. v576^a.</isis>
-<!--           <isis type="display">[v575^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v575^1;;v575^a</isis>
--->    </SubjectAreaGE>
-<!--
-       <MicrothesaurusGE name="Potpojmovnik: " order="260">
-               <isis type="display">[v576^4]</isis>
-               <isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v576^1;;v576^a</isis>
-       </MicrothesaurusGE>
--->
-       <!--
-               order is important for type="lookup_key|lookup_val"
-               They create internal hash file for lookups from other
-               fields. Therefore, you mist ensure (in config file) that
-               databases which are source of lookup are called before
-               databases that use lookup fields.
-               Those lookup are then used using special ##lookup##
-               token inside formating instructions like this:
-               <isis type="lookup">v999^x</isis>
-       -->
-
-       <IDths name="ID" order="300">
-               <isis type="lookup_key">v900</isis>
-               <isis type="swish">v900</isis>
-       </IDths>
-
-       <SubjectIndex name="Predmetno kazalo" order="301">
-               <isis type="lookup_val">eval{"v901^a" eq "Deskriptor"} / [v562^4] v562^a</isis>
-               <isis type="lookup_val">eval{"v901^a" eq "Deskriptor"} [v251]</isis>
-               <!--
-               <isis type="lookup_val" append="1"> v562^a</isis>
-               -->
-       </SubjectIndex>
-
-    </indexer>
-</config>
diff --git a/conf/normalize/isis_ffzg.xml b/conf/normalize/isis_ffzg.xml
deleted file mode 100644 (file)
index 54383f1..0000000
+++ /dev/null
@@ -1,285 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-2"?>
-<!--
-       Here are few examples of syntax.
-
-       <isis type="search|display">something v200^a something else</isis>
-       <config type="search|display">name of var from config file</config>
--->
-<config>
-<indexer>
-       <MFN>
-               <isis>v000</isis>
-       </MFN>
-
-       <ISBN>
-               <isis type="display">v10</isis>
-       </ISBN>
-
-       <ISSN>
-               <isis type="display">v11</isis>
-       </ISSN>
-
-       <IdentificationNumbers name="ISN">
-               <isis type="search">v10 v11</isis>
-       </IdentificationNumbers>
-       
-       <Language>
-               <isis>v101</isis>
-       </Language>
-       
-       <TitleProper>
-               <isis><![CDATA[filter{regex(s/<[^>]*>/)}v200^a]]></isis>
-       </TitleProper>
-
-       <titleNo>
-               <isis>v200^9</isis>
-       </titleNo>
-
-       <Subtitle>
-               <isis>v200^e</isis>
-       </Subtitle>
-
-       <TitleProper2>
-               <isis>v200^c</isis>
-       </TitleProper2>
-       
-       <ParallelTitle>
-               <isis>v200^d</isis>
-       </ParallelTitle>
-
-       <Responsibility>
-               <isis>v200^f ; v200^g</isis>
-       </Responsibility>
-               
-       <ResponsibilityFirst>
-               <isis type="display">v200^f</isis>
-       </ResponsibilityFirst>
-               
-       <ResponsibilitySecond>
-               <isis type="display">v200^g</isis>
-       </ResponsibilitySecond>
-
-       <VolumeDesignation>
-               <isis>v200^v</isis>
-       </VolumeDesignation>
-
-       <EditionStatement>
-               <isis>v205^a</isis>
-       </EditionStatement>
-
-       <SerialNo>
-               <isis>v207^a</isis>
-       </SerialNo>
-
-       <fond>
-               <isis>v209^a</isis>
-       </fond>
-
-       <PlacePublication>
-               <isis>v210^a</isis>
-       </PlacePublication>
-
-       <NamePublisher>
-               <isis>v210^c</isis>
-       </NamePublisher>
-
-       <DatePublication> 
-               <isis>v210^d</isis>
-       </DatePublication>
-
-       <PhysicalDescription>
-               <isis>v215^a : v215^c ; v215^d</isis>
-       </PhysicalDescription>
-       
-       <MaterialDesignation>
-               <isis>v215^a</isis>
-       </MaterialDesignation>
-       
-       <PhysicalDetails>
-               <isis>v215^c</isis>
-       </PhysicalDetails>
-       
-       <AccompanyingMaterial>
-               <isis>v215^e</isis>
-       </AccompanyingMaterial>
-       
-       <Series>
-               <isis>v225^a = v225^d : v225^e ; v225^v. v225h, v225^i ; v225^w</isis>
-       </Series>
-       
-       <SeriesTitle>
-               <isis>v225^a</isis>
-       </SeriesTitle>
-       
-       <GeneralNote>
-               <isis>v300</isis>
-       </GeneralNote>
-
-       <EditionNote>
-               <isis>v305</isis>
-       </EditionNote>
-
-       <PhysicalDescriptionNote>
-               <isis>v307</isis>
-       </PhysicalDescriptionNote>
-
-       <IntellectResponsNote>
-               <isis>v314</isis>
-       </IntellectResponsNote>
-
-       <InternalBibliographies>
-               <isis>v320</isis>
-       </InternalBibliographies>
-
-       <Frequency>
-               <isis>v326</isis>
-       </Frequency>
-
-       <ContentsNote>
-               <isis>v327</isis>
-       </ContentsNote>
-
-       <Summary>
-               <isis>v330</isis>
-       </Summary>
-
-       <SystemRequirements>
-               <isis>v337</isis>
-       </SystemRequirements>
-
-       <IssuedWith>
-               <isis>v423^z: v423^a / v423^c v423^b</isis>
-       </IssuedWith>
-
-       <Parts>
-               <isis type="display">lookup{dio-jzav:v900}</isis>
-               <isis type="search">lookup{id-dio-jzav:v900}</isis>
-       </Parts>
-       
-       <PartsEF>
-               <isis type="display">lookup{naslov-efzg:001v001}</isis>
-       </PartsEF>
-
-       <PartsID>
-               <isis>v463^1</isis>
-       </PartsID>
-
-       <Piece>
-               <isis>lookup{naslov-efzg:s463^1}</isis>
-       </Piece>
-
-       <PieceSubtitle>
-               <isis>lookup{podnaslov-efzg:s463^1}</isis>
-       </PieceSubtitle>
-
-       <PieceNum>
-               <isis>v463^v</isis>
-       </PieceNum>
-       
-
-       <PieceAnalitic>
-               <isis>v464^a / v464^g v464^f</isis>
-       </PieceAnalitic>
-
-       <UniformHeading>
-               <isis>v500^a. v503^b</isis>
-       </UniformHeading>
-       
-       <ExpandedTitle>
-               <isis>v532</isis>
-       </ExpandedTitle>
-       
-       <Form>
-               <isis>v608</isis>
-       </Form>
-
-       <UncontrolledTerms>
-               <isis>v610</isis>       
-       </UncontrolledTerms>
-       
-       <UDC_All>
-               <isis type="search">v675</isis>
-       </UDC_All>
-       
-       <UDC>
-               <isis>v675^a</isis>
-       </UDC>
-
-       <UDCb>
-               <isis>v675^b</isis>
-       </UDCb>
-
-       <APA>
-               <isis>v686</isis>
-       </APA>
-
-       <PersonalName>
-               <isis>v700^a, v700^b</isis>
-               <isis type="index">v700^a, v700^b</isis>
-       </PersonalName>
-
-       <PersonalName2>
-               <isis>v701^a, v701^b</isis>
-               <isis type="index">v701^a, v701^b</isis>
-       </PersonalName2>
-
-       <PersonalNameOther>
-               <isis>v702^a, v702^b</isis>
-               <isis type="index">v702^a, v702^b</isis>
-       </PersonalNameOther>
-
-       <Names>
-               <isis>v700^a, v700^b</isis>
-               <isis>v701^a, v701^b</isis>
-               <isis>v702^a, v702^b</isis>
-               <isis type="index">v700^a, v700^b</isis>
-               <isis type="index">v701^a, v701^b</isis>
-               <isis type="index">v702^a, v702^b</isis>
-       </Names>
-
-       <CorporateName>
-               <isis>v710^a</isis>
-       </CorporateName>
-
-       <CorporateName2>
-               <isis>v711^a</isis>
-       </CorporateName2>
-
-       <OriginatingSource>
-               <isis>v801</isis>
-       </OriginatingSource>
-       
-       <URL>
-               <isis>v856^u</isis>
-       </URL>
-
-       <level>
-               <isis>v909</isis>
-       </level>
-       
-       <ID>
-               <isis>v900</isis>
-               <isis>001v001</isis>
-       </ID>
-       
-       <Set>
-               <isis type="display">lookup{set-jzav:v946^1}</isis>
-               <isis type="search">v946^1</isis>
-               <isis type="display">lookup{set-efzg:v461^1}</isis>
-               <isis type="search">v461^1</isis>
-       </Set>
-
-       <Set2>
-               <isis>lookup{set-jzav:lookup{parent-id:v946^1}}</isis>
-       </Set2>
-
-       <CallNo>
-               <isis>v990</isis>
-       </CallNo>
-
-       <InvNo>
-               <isis>v991</isis>
-       </InvNo>
-
-</indexer>
-</config>
index b1970c0..567b5c8 100644 (file)
@@ -9,30 +9,25 @@ WebPAC - core module
 
 =head1 VERSION
 
-Version 2.17
+Version 2.20
 
 =cut
 
-our $VERSION = '2.17';
+our $VERSION = '2.20';
 
 =head1 SYNOPSIS
 
-This is quick description of what WebPAC is. This is third iteration of
-WebPAC design (second one was semi-private creatation of CD ROM with L<jsFind>
-module). This code will eventually become official WebPAC version 2.
+This is quick description of what WebPAC is. This is another iteration of
+WebPAC design (first was system with XML files and CGI, second one was semi-private
+creatation of CD ROM with L<jsFind> module and third was older version 2
+with supprot for lagacy XML and YAML).
+
+Current version supports different input formats and normalisation using set rules.
 
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 
-=head1 BUGS
-
-Please report any bugs or feature requests to
-C<bug-webpac@rt.cpan.org>, or through the web interface at
-L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.
-I will be notified, and then you'll automatically be notified of progress on
-your bug as I make changes.
-
 =head1 SEE ALSO
 
 To undestand concpets behind WebPAC examine L<WebPAC::Manual>, and then
@@ -40,7 +35,7 @@ respective documentation for each component.
 
 =head1 COPYRIGHT & LICENSE
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it
 under the same terms as Perl itself.
index 2f1ba12..db772fe 100644 (file)
@@ -3,7 +3,7 @@ package WebPAC::Lookup;
 use warnings;
 use strict;
 
-use base qw/WebPAC::Common WebPAC::Normalize/;
+use base qw/WebPAC::Common WebPAC::Lookup::Normalize/;
 use File::Slurp;
 use YAML qw/LoadFile/;
 use Data::Dumper;
@@ -219,7 +219,7 @@ Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 
 =head1 COPYRIGHT & LICENSE
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it
 under the same terms as Perl itself.
diff --git a/lib/WebPAC/Lookup/Normalize.pm b/lib/WebPAC/Lookup/Normalize.pm
new file mode 100644 (file)
index 0000000..dd10717
--- /dev/null
@@ -0,0 +1,795 @@
+package WebPAC::Lookup::Normalize;
+
+use warnings;
+use strict;
+use blib;
+use WebPAC::Common;
+use base 'WebPAC::Common';
+use Data::Dumper;
+
+=head1 NAME
+
+WebPAC::Lookup::Normalize - data mungling for normalisation
+
+=head1 VERSION
+
+Version 0.09
+
+=cut
+
+our $VERSION = '0.09';
+
+=head1 SYNOPSIS
+
+This package contains code that mungle data to produce normalized format.
+
+B<WARNING:>
+
+This code is obsolete. It moved to here so that I don't have to re-write
+L<WebPAC::Lookup> to use set configuration files (using L<WebPAC::Normalize>)
+just yet. But it will dissapear real soon!
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at B<begining of format> will apply perl
+code defined as code ref on format after field substitution to producing
+output
+
+There is one built-in filter called C<regex> which can be use like this:
+
+  filter{regex(s/foo/bar/)}
+
+=item *
+
+optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes order in which transformations are applied (eval,
+filter, lookup, format) which is important to undestand when deciding how to
+solve your data mungling and normalisation process.
+
+
+
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Create new normalisation object
+
+  my $n = new WebPAC::Lookup::Normalize::Something(
+       filter => {
+               'filter_name_1' => sub {
+                       # filter code
+                       return length($_);
+               }, ...
+       },
+       db => $db_obj,
+       lookup_regex => $lookup->regex,
+       lookup => $lookup_obj,
+       prefix => 'foobar',
+  );
+
+Parametar C<filter> defines user supplied snippets of perl code which can
+be use with C<filter{...}> notation.
+
+C<prefix> is used to form filename for database record (to support multiple
+source files which are joined in one database).
+
+Recommended parametar C<lookup_regex> is used to enable parsing of lookups
+in structures. If you pass this parametar, you must also pass C<lookup>
+which is C<WebPAC::Lookup> object.
+
+=cut
+
+sub new {
+       my $class = shift;
+        my $self = {@_};
+        bless($self, $class);
+
+       my $r = $self->{'lookup_regex'} ? 1 : 0;
+       my $l = $self->{'lookup'} ? 1 : 0;
+
+       my $log = $self->_get_logger();
+
+       # those two must be in pair
+       if ( ($r & $l) != ($r || $l) ) {
+               my $log = $self->_get_logger();
+               $log->logdie("lookup_regex and lookup must be in pair");
+       }
+
+       $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
+
+       $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
+
+       $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
+
+       if (! $self->{filter} || ! $self->{filter}->{regex}) {
+               $log->debug("adding built-in filter regex");
+               $self->{filter}->{regex} = sub {
+                       my ($val, $regex) = @_;
+                       eval "\$val =~ $regex";
+                       return $val;
+               };
+       }
+
+       $self ? return $self : return undef;
+}
+
+=head2 all_tags
+
+Returns all tags in document in specified order
+
+  my $sorted_tags = $self->all_tags();
+
+=cut
+
+sub all_tags {
+       my $self = shift;
+
+       if (! $self->{_tags_by_order}) {
+
+               my $log = $self->_get_logger;
+               # sanity check
+               $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
+
+               my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
+               $log->debug("unsorted tags: " . join(", ", @tags));
+
+               @tags = sort { $self->_sort_by_order } @tags;
+
+               $log->debug("sorted tags: " . join(",", @tags) );
+
+               $self->{_tags_by_order} = \@tags;
+       }
+
+       return $self->{_tags_by_order};
+}
+
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
+
+ my $ds = $webpac->data_structure($rec);
+
+=cut
+
+sub data_structure {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       my $rec = shift;
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+       $log->debug("data_structure rec = ", sub { Dumper($rec) });
+
+       $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
+
+       my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
+
+       my $cache_file;
+
+       if ($self->{'db'}) {
+               my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
+               $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
+               return $ds if ($ds);
+               $log->debug("cache miss, creating");
+       }
+
+       my $tags = $self->all_tags();
+
+       $log->debug("tags: ",sub { join(", ",@{ $tags }) });
+
+       my $ds;
+
+       foreach my $field (@{ $tags }) {
+
+               my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+               foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+                       my $format;
+
+                       $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
+                       $format = $tag->{'value'} || $tag->{'content'};
+
+                       my @v;
+                       if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
+                               @v = $self->_rec_to_arr($rec,$format,'fill_in');
+                       } else {
+                               @v = $self->_rec_to_arr($rec,$format,'parse');
+                       }
+                       if (! @v) {
+                               $log->debug("$field <",$self->{tag},"> format: $format no values");
+                               next;
+                       } else {
+                               $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
+                       }
+
+                       if ($tag->{'sort'}) {
+                               @v = $self->sort_arr(@v);
+                       }
+
+                       # use format?
+                       if ($tag->{'format_name'}) {
+                               @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+                       }
+
+                       # delimiter will join repeatable fields
+                       if ($tag->{'delimiter'}) {
+                               @v = ( join($tag->{'delimiter'}, @v) );
+                       }
+
+                       # default types 
+                       my @types = qw(display search);
+                       # override by type attribute
+                       @types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+                       foreach my $type (@types) {
+                               # append to previous line?
+                               $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
+                               if ($tag->{'append'}) {
+
+                                       # I will delimit appended part with
+                                       # delimiter (or ,)
+                                       my $d = $tag->{'delimiter'};
+                                       # default delimiter
+                                       $d ||= " ";
+
+                                       my $last = pop @{$row->{$type}};
+                                       $d = "" if (! $last);
+                                       $last .= $d . join($d, @v);
+                                       push @{$row->{$type}}, $last;
+
+                               } else {
+                                       push @{$row->{$type}}, @v;
+                               }
+                       }
+
+
+               }
+
+               if ($row) {
+                       $row->{'tag'} = $field;
+
+                       # TODO: name_sigular, name_plural
+                       my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+                       my $row_name = $name ? $self->_x($name) : $field;
+
+                       # post-sort all values in field
+                       if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+                               $log->warn("sort at field tag not implemented");
+                       }
+
+                       $ds->{$row_name} = $row;
+
+                       $log->debug("row $field: ",sub { Dumper($row) });
+               }
+
+       }
+
+       $self->{'db'}->save_ds(
+               id => $id,
+               ds => $ds,
+               prefix => $self->{prefix},
+       ) if ($self->{'db'});
+
+       $log->debug("ds: ", sub { Dumper($ds) });
+
+       $log->logconfess("data structure returned is not array any more!") if wantarray;
+
+       return $ds;
+
+}
+
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C<eval{...}> and
+return output or nothing depending on eval code.
+
+ my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+Filters are implemented here. While simple form of filters looks like this:
+
+  filter{name_of_filter}
+
+but, filters can also have variable number of parametars like this:
+
+  filter{name_of_filter(param,param,param)}
+
+=cut
+
+my $warn_once;
+
+sub parse {
+       my $self = shift;
+
+       my ($rec, $format_utf8, $i, $rec_size) = @_;
+
+       return if (! $format_utf8);
+
+       my $log = $self->_get_logger();
+
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+       $i = 0 if (! $i);
+
+       my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+       my @out;
+
+       $log->debug("format: $format [$i]");
+
+       my $eval_code;
+       # remove eval{...} from beginning
+       $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+       my $filter_name;
+       # remove filter{...} from beginning
+       $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+       # did we found any (att all) field from format in row?
+       my $found_any;
+       # prefix before first field which we preserve it $found_any
+       my $prefix;
+
+       my $f_step = 1;
+
+       while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
+
+               my $del = $1 || '';
+               $prefix = $del if ($f_step == 1);
+
+               my $fld_type = lc($2);
+
+               # repeatable index
+               my $r = $i;
+               if ($fld_type eq 's') {
+                       if ($found_any->{'v'}) {
+                               $r = 0;
+                       } else {
+                               return;
+                       }
+               }
+
+               my $found = 0;
+               my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
+
+               if ($found) {
+                       $found_any->{$fld_type} += $found;
+
+                       # we will skip delimiter before first occurence of field!
+                       push @out, $del unless($found_any->{$fld_type} == 1);
+                       push @out, $tmp if ($tmp);
+               }
+               $f_step++;
+       }
+
+       # test if any fields found?
+       return if (! $found_any->{'v'} && ! $found_any->{'s'});
+
+       my $out = join('',@out);
+
+       if ($out) {
+               # add rest of format (suffix)
+               $out .= $format;
+
+               # add prefix if not there
+               $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+               $log->debug("result: $out");
+       }
+
+       if ($eval_code) {
+               my $eval = $self->fill_in($rec,$eval_code,$i) || return;
+               $log->debug("about to eval{$eval} format: $out");
+               return if (! $self->_eval($eval));
+       }
+       
+       if ($filter_name) {
+               my @filter_args;
+               if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
+                       @filter_args = split(/,/, $2);
+               }
+               if ($self->{'filter'}->{$filter_name}) {
+                       $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
+                       unshift @filter_args, $out;
+                       $out = $self->{'filter'}->{$filter_name}->(@filter_args);
+                       return unless(defined($out));
+                       $log->debug("filter result: $out");
+               } elsif (! $warn_once->{$filter_name}) {
+                       $log->warn("trying to use undefined filter $filter_name");
+                       $warn_once->{$filter_name}++;
+               }
+       }
+
+       return $out;
+}
+
+=head2 fill_in
+
+Workhourse of all: takes record from in-memory structure of database and
+strings with placeholders and returns string or array of with substituted
+values from record.
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is ordinal number for repeatable fields. By default,
+it's assume to be first repeatable field (fields are perl array, so first
+element is 0).
+Following example will read second value from repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of format to inteligenty skip
+delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+There is optional parametar C<$record_size> which can be used to get sizes of
+all C<field^subfield> combinations in this format.
+
+ my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 
+
+=cut
+
+sub fill_in {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       my ($rec,$format,$i,$rec_size) = @_;
+
+       $log->logconfess("need data record") unless ($rec);
+       $log->logconfess("need format to parse") unless($format);
+
+       # iteration (for repeatable fields)
+       $i ||= 0;
+
+       $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+       # FIXME remove for speedup?
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+       if (utf8::is_utf8($format)) {
+               $format = $self->_x($format);
+       }
+
+       my $found = 0;
+       my $just_single = 1;
+
+       my $eval_code;
+       # remove eval{...} from beginning
+       $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+       my $filter_name;
+       # remove filter{...} from beginning
+       $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+       {
+               # fix warnings
+               no warnings 'uninitialized';
+
+               # do actual replacement of placeholders
+               # repeatable fields
+               if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
+                       $just_single = 0;
+               }
+
+               # non-repeatable fields
+               if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
+                       return if ($i > 0 && $just_single);
+               }
+       }
+
+       if ($found) {
+               $log->debug("format: $format");
+               if ($eval_code) {
+                       my $eval = $self->fill_in($rec,$eval_code,$i);
+                       return if (! $self->_eval($eval));
+               }
+               if ($filter_name && $self->{'filter'}->{$filter_name}) {
+                       $log->debug("filter '$filter_name' for $format");
+                       $format = $self->{'filter'}->{$filter_name}->($format);
+                       return unless(defined($format));
+                       $log->debug("filter result: $format");
+               }
+               # do we have lookups?
+               if ($self->{'lookup'}) {
+                       if ($self->{'lookup'}->can('lookup')) {
+                               my @lookup = $self->{lookup}->lookup($format);
+                               $log->debug("lookup $format", join(", ", @lookup));
+                               return @lookup;
+                       } else {
+                               $log->warn("Have lookup object but can't invoke lookup method");
+                       }
+               } else {
+                       return $format;
+               }
+       } else {
+               return;
+       }
+}
+
+
+=head2 _rec_to_arr
+
+Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
+for fields which have lookups, so they shouldn't be parsed but rather
+C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
+
+ my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
+
+=cut
+
+sub _rec_to_arr {
+       my $self = shift;
+
+       my ($rec, $format_utf8, $code) = @_;
+
+       my $log = $self->_get_logger();
+
+       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+       return if (! $format_utf8);
+
+       $log->debug("using $code on $format_utf8");
+
+       my $i = 0;
+       my $max = 0;
+       my @arr;
+       my $rec_size = {};
+
+       while ($i <= $max) {
+               my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
+               if ($rec_size) {
+                       foreach my $f (keys %{ $rec_size }) {
+                               $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
+                       }
+                       $log->debug("max set to $max");
+                       undef $rec_size;
+               }
+               if (@v) {
+                       push @arr, @v;
+               } else {
+                       push @arr, '' if ($max > $i);
+               }
+       }
+
+       $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+       return @arr;
+}
+
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
+
+Required arguments are:
+
+=over 8
+
+=item C<$rec>
+
+record reference
+
+=item C<$f>
+
+field
+
+=item C<$sf>
+
+optional subfield
+
+=item C<$i>
+
+index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
+
+=item C<$found>
+
+optional variable that will be incremeted if preset
+
+=item C<$rec_size>
+
+hash to hold maximum occurances of C<field^subfield> combinations
+(which can be accessed using keys in same format)
+
+=back
+
+Returns value or empty string, updates C<$found> and C<rec_size>
+if present.
+
+=cut
+
+sub get_data {
+       my $self = shift;
+
+       my ($rec,$f,$sf,$i,$found,$cache) = @_;
+
+       return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
+
+       if (defined($$cache)) {
+               $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
+       }
+
+       return '' unless ($$rec->{$f}->[$i]);
+
+       {
+               no strict 'refs';
+               if (defined($sf)) {
+                       $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
+                       return $$rec->{$f}->[$i]->{$sf};
+               } else {
+                       $$found++ if (defined($$found));
+                       # it still might have subfields, just
+                       # not specified, so we'll dump some debug info
+                       if ($$rec->{$f}->[$i] =~ /HASH/o) {
+                               my $out;
+                               foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+                                       my $v = $$rec->{$f}->[$i]->{$k};
+                                       $out .= '$' . $k .':' . $v if ($v);
+                               }
+                               return $out;
+                       } else {
+                               return $$rec->{$f}->[$i];
+                       }
+               }
+       }
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+       my $self = shift;
+
+       my ($name,$delimiter,$data) = @_;
+
+       my $log = $self->_get_logger();
+
+       if (! $self->{'import_xml'}->{'format'}->{$name}) {
+               $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+               return $data;
+       }
+
+       $log->warn("no delimiter for format $name") if (! $delimiter);
+
+       my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+       my @data = split(/\Q$delimiter\E/, $data);
+
+       my $out = sprintf($format, @data);
+       $log->debug("using format $name [$format] on $data to produce: $out");
+
+       if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+               return $self->{'lookup'}->lookup($out);
+       } else {
+               return $out;
+       }
+
+}
+
+=head2 sort_arr
+
+Sort array ignoring case and html in data
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+       my $self = shift;
+
+       my $log = $self->_get_logger();
+
+       # FIXME add Schwartzian Transformation?
+
+       my @sorted = sort {
+               $a =~ s#<[^>]+/*>##;
+               $b =~ s#<[^>]+/*>##;
+               lc($b) cmp lc($a)
+       } @_;
+       $log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+       return @sorted;
+}
+
+
+=head1 INTERNAL METHODS
+
+=head2 _sort_by_order
+
+Sort xml tags data structure accoding to C<order=""> attribute.
+
+=cut
+
+sub _sort_by_order {
+       my $self = shift;
+
+       my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+               $self->{'import_xml'}->{'indexer'}->{$a};
+       my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+               $self->{'import_xml'}->{'indexer'}->{$b};
+
+       return $va <=> $vb;
+}
+
+=head2 _x
+
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optinally specified using C<code_page> to C<new>
+constructor).
+
+ my $text = $n->_x('normalize text string');
+
+This is a stub so that other modules doesn't have to implement it.
+
+=cut
+
+sub _x {
+       my $self = shift;
+       return shift;
+}
+
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Lookup::Normalize
index 93810ab..cf78071 100644 (file)
 package WebPAC::Normalize;
+use Exporter 'import';
+@EXPORT = qw/
+       set_rec set_lookup
+       get_ds clean_ds
+       tag search display
+       rec1 rec2 rec
+       regex prefix suffix surround
+       first lookup join_with
+/;
 
 use warnings;
 use strict;
-use blib;
-use WebPAC::Common;
-use base 'WebPAC::Common';
+
+#use base qw/WebPAC::Common/;
 use Data::Dumper;
 
 =head1 NAME
 
-WebPAC::Normalize - data mungling for normalisation
+WebPAC::Normalize - describe normalisaton rules using sets
 
 =head1 VERSION
 
-Version 0.09
+Version 0.04
 
 =cut
 
-our $VERSION = '0.09';
+our $VERSION = '0.04';
 
 =head1 SYNOPSIS
 
-This package contains code that mungle data to produce normalized format.
-
-It contains several assumptions:
-
-=over
-
-=item *
-
-format of fields is defined using C<v123^a> notation for repeatable fields
-or C<s123^a> for single (or first) value, where C<123> is field number and
-C<a> is subfield.
-
-=item *
-
-source data records (C<$rec>) have unique identifiers in field C<000>
-
-=item *
-
-optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
-perl code that is evaluated before producing output (value of field will be
-interpolated before that)
-
-=item *
-
-optional C<filter{filter_name}> at B<begining of format> will apply perl
-code defined as code ref on format after field substitution to producing
-output
-
-There is one built-in filter called C<regex> which can be use like this:
-
-  filter{regex(s/foo/bar/)}
-
-=item *
-
-optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
-
-=item *
-
-at end, optional C<format>s rules are resolved. Format rules are similar to
-C<sprintf> and can also contain C<lookup{...}> which is performed after
-values are inserted in format.
-
-=back
-
-This also describes order in which transformations are applied (eval,
-filter, lookup, format) which is important to undestand when deciding how to
-solve your data mungling and normalisation process.
-
+This module uses C<conf/normalize/*.pl> files to perform normalisation
+from input records using perl functions which are specialized for set
+processing.
 
+Sets are implemented as arrays, and normalisation file is valid perl, which
+means that you check it's validity before running WebPAC using
+C<perl -c normalize.pl>.
 
+Normalisation can generate multiple output normalized data. For now, supported output
+types (on the left side of definition) are: C<tag>, C<display> and C<search>.
 
 =head1 FUNCTIONS
 
-=head2 new
-
-Create new normalisation object
-
-  my $n = new WebPAC::Normalize::Something(
-       filter => {
-               'filter_name_1' => sub {
-                       # filter code
-                       return length($_);
-               }, ...
-       },
-       db => $db_obj,
-       lookup_regex => $lookup->regex,
-       lookup => $lookup_obj,
-       prefix => 'foobar',
-  );
+=head2 data_structure
 
-Parametar C<filter> defines user supplied snippets of perl code which can
-be use with C<filter{...}> notation.
+Return data structure
 
-C<prefix> is used to form filename for database record (to support multiple
-source files which are joined in one database).
+  my $ds = WebPAC::Normalize(
+       lookup => $lookup->lookup_hash,
+       row => $row,
+       rules => $normalize_pl_config,
+  );
 
-Recommended parametar C<lookup_regex> is used to enable parsing of lookups
-in structures. If you pass this parametar, you must also pass C<lookup>
-which is C<WebPAC::Lookup> object.
+This function will B<die> if normalizastion can't be evaled.
 
 =cut
 
-sub new {
-       my $class = shift;
-        my $self = {@_};
-        bless($self, $class);
-
-       my $r = $self->{'lookup_regex'} ? 1 : 0;
-       my $l = $self->{'lookup'} ? 1 : 0;
-
-       my $log = $self->_get_logger();
-
-       # those two must be in pair
-       if ( ($r & $l) != ($r || $l) ) {
-               my $log = $self->_get_logger();
-               $log->logdie("lookup_regex and lookup must be in pair");
-       }
-
-       $log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
-
-       $log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
-
-       $log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
-
-       if (! $self->{filter} || ! $self->{filter}->{regex}) {
-               $log->debug("adding built-in filter regex");
-               $self->{filter}->{regex} = sub {
-                       my ($val, $regex) = @_;
-                       eval "\$val =~ $regex";
-                       return $val;
-               };
-       }
-
-       $self ? return $self : return undef;
+sub data_structure {
+       my $arg = {@_};
+
+       die "need row argument" unless ($arg->{row});
+       die "need normalisation argument" unless ($arg->{rules});
+
+       no strict 'subs';
+       set_lookup( $arg->{lookup} );
+       set_rec( $arg->{row} );
+       clean_ds();
+       eval "$arg->{rules}";
+       die "error evaling $arg->{rules}: $@\n" if ($@);
+       return get_ds();
 }
 
-=head2 all_tags
+=head2 set_rec
 
-Returns all tags in document in specified order
+Set current record hash
 
-  my $sorted_tags = $self->all_tags();
+  set_rec( $rec );
 
 =cut
 
-sub all_tags {
-       my $self = shift;
-
-       if (! $self->{_tags_by_order}) {
-
-               my $log = $self->_get_logger;
-               # sanity check
-               $log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
-
-               my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
-               $log->debug("unsorted tags: " . join(", ", @tags));
+my $rec;
 
-               @tags = sort { $self->_sort_by_order } @tags;
-
-               $log->debug("sorted tags: " . join(",", @tags) );
-
-               $self->{_tags_by_order} = \@tags;
-       }
-
-       return $self->{_tags_by_order};
+sub set_rec {
+       $rec = shift or die "no record hash";
 }
 
+=head2 tag
 
+Define new tag for I<search> and I<display>.
 
-=head2 data_structure
-
-Create in-memory data structure which represents normalized layout from
-C<conf/normalize/*.xml>.
-
-This structures are used to produce output.
+  tag('Title', rec('200','a') );
 
- my $ds = $webpac->data_structure($rec);
 
 =cut
 
-sub data_structure {
-       my $self = shift;
-
-       my $log = $self->_get_logger();
-
-       my $rec = shift;
-       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
-       $log->debug("data_structure rec = ", sub { Dumper($rec) });
-
-       $log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
-
-       my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
-
-       my $cache_file;
-
-       if ($self->{'db'}) {
-               my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
-               $log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
-               return $ds if ($ds);
-               $log->debug("cache miss, creating");
-       }
-
-       my $tags = $self->all_tags();
-
-       $log->debug("tags: ",sub { join(", ",@{ $tags }) });
-
-       my $ds;
-
-       foreach my $field (@{ $tags }) {
-
-               my $row;
-
-#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
-
-               foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
-                       my $format;
-
-                       $log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
-                       $format = $tag->{'value'} || $tag->{'content'};
-
-                       my @v;
-                       if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
-                               @v = $self->_rec_to_arr($rec,$format,'fill_in');
-                       } else {
-                               @v = $self->_rec_to_arr($rec,$format,'parse');
-                       }
-                       if (! @v) {
-                               $log->debug("$field <",$self->{tag},"> format: $format no values");
-                               next;
-                       } else {
-                               $log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
-                       }
-
-                       if ($tag->{'sort'}) {
-                               @v = $self->sort_arr(@v);
-                       }
-
-                       # use format?
-                       if ($tag->{'format_name'}) {
-                               @v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
-                       }
-
-                       # delimiter will join repeatable fields
-                       if ($tag->{'delimiter'}) {
-                               @v = ( join($tag->{'delimiter'}, @v) );
-                       }
-
-                       # default types 
-                       my @types = qw(display search);
-                       # override by type attribute
-                       @types = ( $tag->{'type'} ) if ($tag->{'type'});
-
-                       foreach my $type (@types) {
-                               # append to previous line?
-                               $log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
-                               if ($tag->{'append'}) {
-
-                                       # I will delimit appended part with
-                                       # delimiter (or ,)
-                                       my $d = $tag->{'delimiter'};
-                                       # default delimiter
-                                       $d ||= " ";
-
-                                       my $last = pop @{$row->{$type}};
-                                       $d = "" if (! $last);
-                                       $last .= $d . join($d, @v);
-                                       push @{$row->{$type}}, $last;
-
-                               } else {
-                                       push @{$row->{$type}}, @v;
-                               }
-                       }
-
-
-               }
-
-               if ($row) {
-                       $row->{'tag'} = $field;
-
-                       # TODO: name_sigular, name_plural
-                       my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
-                       my $row_name = $name ? $self->_x($name) : $field;
-
-                       # post-sort all values in field
-                       if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
-                               $log->warn("sort at field tag not implemented");
-                       }
-
-                       $ds->{$row_name} = $row;
-
-                       $log->debug("row $field: ",sub { Dumper($row) });
-               }
-
-       }
-
-       $self->{'db'}->save_ds(
-               id => $id,
-               ds => $ds,
-               prefix => $self->{prefix},
-       ) if ($self->{'db'});
-
-       $log->debug("ds: ", sub { Dumper($ds) });
-
-       $log->logconfess("data structure returned is not array any more!") if wantarray;
-
-       return $ds;
+my $out;
 
+sub tag {
+       my $name = shift or die "tag needs name as first argument";
+       my @o = grep { defined($_) && $_ ne '' } @_;
+       return unless (@o);
+       $out->{$name}->{tag} = $name;
+       $out->{$name}->{search} = \@o;
+       $out->{$name}->{display} = \@o;
 }
 
-=head2 parse
-
-Perform smart parsing of string, skipping delimiters for fields which aren't
-defined. It can also eval code in format starting with C<eval{...}> and
-return output or nothing depending on eval code.
-
- my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
-
-Filters are implemented here. While simple form of filters looks like this:
+=head2 display
 
-  filter{name_of_filter}
+Define tag just for I<display>
 
-but, filters can also have variable number of parametars like this:
-
-  filter{name_of_filter(param,param,param)}
+  @v = display('Title', rec('200','a') );
 
 =cut
 
-my $warn_once;
-
-sub parse {
-       my $self = shift;
-
-       my ($rec, $format_utf8, $i, $rec_size) = @_;
-
-       return if (! $format_utf8);
-
-       my $log = $self->_get_logger();
-
-       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
-       $i = 0 if (! $i);
-
-       my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
-
-       my @out;
-
-       $log->debug("format: $format [$i]");
-
-       my $eval_code;
-       # remove eval{...} from beginning
-       $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
-
-       my $filter_name;
-       # remove filter{...} from beginning
-       $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
-
-       # did we found any (att all) field from format in row?
-       my $found_any;
-       # prefix before first field which we preserve it $found_any
-       my $prefix;
-
-       my $f_step = 1;
-
-       while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
-
-               my $del = $1 || '';
-               $prefix = $del if ($f_step == 1);
-
-               my $fld_type = lc($2);
-
-               # repeatable index
-               my $r = $i;
-               if ($fld_type eq 's') {
-                       if ($found_any->{'v'}) {
-                               $r = 0;
-                       } else {
-                               return;
-                       }
-               }
+sub display {
+       my $name = shift or die "display needs name as first argument";
+       my @o = grep { defined($_) && $_ ne '' } @_;
+       return unless (@o);
+       $out->{$name}->{tag} = $name;
+       $out->{$name}->{display} = \@o;
+}
 
-               my $found = 0;
-               my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
+=head2 search
 
-               if ($found) {
-                       $found_any->{$fld_type} += $found;
+Prepare values just for I<search>
 
-                       # we will skip delimiter before first occurence of field!
-                       push @out, $del unless($found_any->{$fld_type} == 1);
-                       push @out, $tmp if ($tmp);
-               }
-               $f_step++;
-       }
+  @v = search('Title', rec('200','a') );
 
-       # test if any fields found?
-       return if (! $found_any->{'v'} && ! $found_any->{'s'});
+=cut
 
-       my $out = join('',@out);
+sub search {
+       my $name = shift or die "search needs name as first argument";
+       my @o = grep { defined($_) && $_ ne '' } @_;
+       return unless (@o);
+       $out->{$name}->{tag} = $name;
+       $out->{$name}->{search} = \@o;
+}
 
-       if ($out) {
-               # add rest of format (suffix)
-               $out .= $format;
+=head2 get_ds
 
-               # add prefix if not there
-               $out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+Return hash formatted as data structure
 
-               $log->debug("result: $out");
-       }
+  my $ds = get_ds();
 
-       if ($eval_code) {
-               my $eval = $self->fill_in($rec,$eval_code,$i) || return;
-               $log->debug("about to eval{$eval} format: $out");
-               return if (! $self->_eval($eval));
-       }
-       
-       if ($filter_name) {
-               my @filter_args;
-               if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
-                       @filter_args = split(/,/, $2);
-               }
-               if ($self->{'filter'}->{$filter_name}) {
-                       $log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
-                       unshift @filter_args, $out;
-                       $out = $self->{'filter'}->{$filter_name}->(@filter_args);
-                       return unless(defined($out));
-                       $log->debug("filter result: $out");
-               } elsif (! $warn_once->{$filter_name}) {
-                       $log->warn("trying to use undefined filter $filter_name");
-                       $warn_once->{$filter_name}++;
-               }
-       }
+=cut
 
+sub get_ds {
        return $out;
 }
 
-=head2 fill_in
-
-Workhourse of all: takes record from in-memory structure of database and
-strings with placeholders and returns string or array of with substituted
-values from record.
+=head2 clean_ds
 
- my $text = $webpac->fill_in($rec,'v250^a');
+Clean data structure hash for next record
 
-Optional argument is ordinal number for repeatable fields. By default,
-it's assume to be first repeatable field (fields are perl array, so first
-element is 0).
-Following example will read second value from repeatable field.
-
- my $text = $webpac->fill_in($rec,'Title: v250^a',1);
-
-This function B<does not> perform parsing of format to inteligenty skip
-delimiters before fields which aren't used.
-
-This method will automatically decode UTF-8 string to local code page
-if needed.
-
-There is optional parametar C<$record_size> which can be used to get sizes of
-all C<field^subfield> combinations in this format.
-
- my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 
+  clean_ds();
 
 =cut
 
-sub fill_in {
-       my $self = shift;
-
-       my $log = $self->_get_logger();
-
-       my ($rec,$format,$i,$rec_size) = @_;
+sub clean_ds {
+       $out = undef;
+}
 
-       $log->logconfess("need data record") unless ($rec);
-       $log->logconfess("need format to parse") unless($format);
+=head2 set_lookup
 
-       # iteration (for repeatable fields)
-       $i ||= 0;
+Set current lookup hash
 
-       $log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+  set_lookup( $lookup );
 
-       # FIXME remove for speedup?
-       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+=cut
 
-       if (utf8::is_utf8($format)) {
-               $format = $self->_x($format);
-       }
+my $lookup;
 
-       my $found = 0;
-       my $just_single = 1;
+sub set_lookup {
+       $lookup = shift;
+}
 
-       my $eval_code;
-       # remove eval{...} from beginning
-       $eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+=head2 rec1
 
-       my $filter_name;
-       # remove filter{...} from beginning
-       $filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+Return all values in some field
 
-       {
-               # fix warnings
-               no warnings 'uninitialized';
+  @v = rec1('200')
 
-               # do actual replacement of placeholders
-               # repeatable fields
-               if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
-                       $just_single = 0;
-               }
+TODO: order of values is probably same as in source data, need to investigate that
 
-               # non-repeatable fields
-               if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
-                       return if ($i > 0 && $just_single);
-               }
-       }
+=cut
 
-       if ($found) {
-               $log->debug("format: $format");
-               if ($eval_code) {
-                       my $eval = $self->fill_in($rec,$eval_code,$i);
-                       return if (! $self->_eval($eval));
-               }
-               if ($filter_name && $self->{'filter'}->{$filter_name}) {
-                       $log->debug("filter '$filter_name' for $format");
-                       $format = $self->{'filter'}->{$filter_name}->($format);
-                       return unless(defined($format));
-                       $log->debug("filter result: $format");
-               }
-               # do we have lookups?
-               if ($self->{'lookup'}) {
-                       if ($self->{'lookup'}->can('lookup')) {
-                               my @lookup = $self->{lookup}->lookup($format);
-                               $log->debug("lookup $format", join(", ", @lookup));
-                               return @lookup;
+sub rec1 {
+       my $f = shift;
+       return unless (defined($rec) && defined($rec->{$f}));
+       if (ref($rec->{$f}) eq 'ARRAY') {
+               return map { 
+                       if (ref($_) eq 'HASH') {
+                               values %{$_};
                        } else {
-                               $log->warn("Have lookup object but can't invoke lookup method");
+                               $_;
                        }
-               } else {
-                       return $format;
-               }
-       } else {
-               return;
+               } @{ $rec->{$f} };
+       } elsif( defined($rec->{$f}) ) {
+               return $rec->{$f};
        }
 }
 
+=head2 rec2
 
-=head2 _rec_to_arr
-
-Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
-for fields which have lookups, so they shouldn't be parsed but rather
-C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
+Return all values in specific field and subfield
 
- my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
+  @v = rec2('200','a')
 
 =cut
 
-sub _rec_to_arr {
-       my $self = shift;
-
-       my ($rec, $format_utf8, $code) = @_;
+sub rec2 {
+       my $f = shift;
+       return unless (defined($rec && $rec->{$f}));
+       my $sf = shift;
+       return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
+}
 
-       my $log = $self->_get_logger();
+=head2 rec
 
-       $log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-       return if (! $format_utf8);
+syntaxtic sugar for
 
-       $log->debug("using $code on $format_utf8");
+  @v = rec('200')
+  @v = rec('200','a')
 
-       my $i = 0;
-       my $max = 0;
-       my @arr;
-       my $rec_size = {};
+=cut
 
-       while ($i <= $max) {
-               my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
-               if ($rec_size) {
-                       foreach my $f (keys %{ $rec_size }) {
-                               $max = $rec_size->{$f} if ($rec_size->{$f} > $max);
-                       }
-                       $log->debug("max set to $max");
-                       undef $rec_size;
-               }
-               if (@v) {
-                       push @arr, @v;
-               } else {
-                       push @arr, '' if ($max > $i);
-               }
+sub rec {
+       if ($#_ == 0) {
+               return rec1(@_);
+       } elsif ($#_ == 1) {
+               return rec2(@_);
        }
-
-       $log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
-
-       return @arr;
 }
 
+=head2 regex
 
-=head2 get_data
-
-Returns value from record.
-
- my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
-
-Required arguments are:
+Apply regex to some or all values
 
-=over 8
-
-=item C<$rec>
-
-record reference
-
-=item C<$f>
-
-field
-
-=item C<$sf>
-
-optional subfield
-
-=item C<$i>
-
-index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
-
-=item C<$found>
-
-optional variable that will be incremeted if preset
-
-=item C<$rec_size>
-
-hash to hold maximum occurances of C<field^subfield> combinations
-(which can be accessed using keys in same format)
-
-=back
-
-Returns value or empty string, updates C<$found> and C<rec_size>
-if present.
+  @v = regex( 's/foo/bar/g', @v );
 
 =cut
 
-sub get_data {
-       my $self = shift;
-
-       my ($rec,$f,$sf,$i,$found,$cache) = @_;
-
-       return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
-
-       if (defined($$cache)) {
-               $$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
-       }
-
-       return '' unless ($$rec->{$f}->[$i]);
-
-       {
-               no strict 'refs';
-               if (defined($sf)) {
-                       $$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
-                       return $$rec->{$f}->[$i]->{$sf};
-               } else {
-                       $$found++ if (defined($$found));
-                       # it still might have subfields, just
-                       # not specified, so we'll dump some debug info
-                       if ($$rec->{$f}->[$i] =~ /HASH/o) {
-                               my $out;
-                               foreach my $k (keys %{$$rec->{$f}->[$i]}) {
-                                       my $v = $$rec->{$f}->[$i]->{$k};
-                                       $out .= '$' . $k .':' . $v if ($v);
-                               }
-                               return $out;
-                       } else {
-                               return $$rec->{$f}->[$i];
-                       }
-               }
+sub regex {
+       my $r = shift;
+       my @out;
+       #warn "r: $r\n",Dumper(\@_);
+       foreach my $t (@_) {
+               next unless ($t);
+               eval "\$t =~ $r";
+               push @out, $t if ($t && $t ne '');
        }
+       return @out;
 }
 
+=head2 prefix
 
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+Prefix all values with a string
 
-Formats can contain C<lookup{...}> if you need them.
+  @v = prefix( 'my_', @v );
 
 =cut
 
-sub apply_format {
-       my $self = shift;
-
-       my ($name,$delimiter,$data) = @_;
-
-       my $log = $self->_get_logger();
-
-       if (! $self->{'import_xml'}->{'format'}->{$name}) {
-               $log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
-               return $data;
-       }
-
-       $log->warn("no delimiter for format $name") if (! $delimiter);
+sub prefix {
+       my $p = shift or die "prefix needs string as first argument";
+       return map { $p . $_ } grep { defined($_) } @_;
+}
 
-       my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+=head2 suffix
 
-       my @data = split(/\Q$delimiter\E/, $data);
+suffix all values with a string
 
-       my $out = sprintf($format, @data);
-       $log->debug("using format $name [$format] on $data to produce: $out");
+  @v = suffix( '_my', @v );
 
-       if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
-               return $self->{'lookup'}->lookup($out);
-       } else {
-               return $out;
-       }
+=cut
 
+sub suffix {
+       my $s = shift or die "suffix needs string as first argument";
+       return map { $_ . $s } grep { defined($_) } @_;
 }
 
-=head2 sort_arr
+=head2 surround
 
-Sort array ignoring case and html in data
+surround all values with a two strings
 
my @sorted = $webpac->sort_arr(@unsorted);
 @v = surround( 'prefix_', '_suffix', @v );
 
 =cut
 
-sub sort_arr {
-       my $self = shift;
-
-       my $log = $self->_get_logger();
-
-       # FIXME add Schwartzian Transformation?
-
-       my @sorted = sort {
-               $a =~ s#<[^>]+/*>##;
-               $b =~ s#<[^>]+/*>##;
-               lc($b) cmp lc($a)
-       } @_;
-       $log->debug("sorted values: ",sub { join(", ",@sorted) });
-
-       return @sorted;
+sub surround {
+       my $p = shift or die "surround need prefix as first argument";
+       my $s = shift or die "surround needs suffix as second argument";
+       return map { $p . $_ . $s } grep { defined($_) } @_;
 }
 
+=head2 first
 
-=head1 INTERNAL METHODS
-
-=head2 _sort_by_order
+Return first element
 
-Sort xml tags data structure accoding to C<order=""> attribute.
+  $v = first( @v );
 
 =cut
 
-sub _sort_by_order {
-       my $self = shift;
-
-       my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
-               $self->{'import_xml'}->{'indexer'}->{$a};
-       my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
-               $self->{'import_xml'}->{'indexer'}->{$b};
-
-       return $va <=> $vb;
+sub first {
+       my $r = shift;
+       return $r;
 }
 
-=head2 _x
-
-Convert strings from C<conf/normalize/*.xml> encoding into application
-specific encoding (optinally specified using C<code_page> to C<new>
-constructor).
+=head2 lookup
 
- my $text = $n->_x('normalize text string');
+Consult lookup hashes for some value
 
-This is a stub so that other modules doesn't have to implement it.
+  @v = lookup( $v );
+  @v = lookup( @v );
 
 =cut
 
-sub _x {
-       my $self = shift;
-       return shift;
+sub lookup {
+       my $k = shift or return;
+       return unless (defined($lookup->{$k}));
+       if (ref($lookup->{$k}) eq 'ARRAY') {
+               return @{ $lookup->{$k} };
+       } else {
+               return $lookup->{$k};
+       }
 }
 
+=head2 join_with
 
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 COPYRIGHT & LICENSE
+Joins walues with some delimiter
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
+  $v = join_with(", ", @v);
 
 =cut
 
-1; # End of WebPAC::Normalize
+sub join_with {
+       my $d = shift;
+       return join($d, grep { defined($_) && $_ ne '' } @_);
+}
+
+# END
+1;
diff --git a/lib/WebPAC/Normalize/Set.pm b/lib/WebPAC/Normalize/Set.pm
deleted file mode 100644 (file)
index 0b8a17e..0000000
+++ /dev/null
@@ -1,342 +0,0 @@
-package WebPAC::Normalize::Set;
-use Exporter 'import';
-@EXPORT = qw/
-       set_rec set_lookup
-       get_ds clean_ds
-       tag search display
-       rec1 rec2 rec
-       regex prefix suffix surround
-       first lookup join_with
-/;
-
-use warnings;
-use strict;
-
-#use base qw/WebPAC::Common/;
-use Data::Dumper;
-
-=head1 NAME
-
-WebPAC::Normalize::Set - describe normalisaton rules using sets
-
-=head1 VERSION
-
-Version 0.04
-
-=cut
-
-our $VERSION = '0.04';
-
-=head1 SYNOPSIS
-
-This module uses C<conf/normalize/*.pl> files to perform normalisation
-from input records using perl functions which are specialized for set
-processing.
-
-Sets are implemented as arrays, and normalisation file is valid perl, which
-means that you check it's validity before running WebPAC using
-C<perl -c normalize.pl>.
-
-Normalisation can generate multiple output normalized data. For now, supported output
-types (on the left side of definition) are: C<tag>, C<display> and C<search>.
-
-=head1 FUNCTIONS
-
-=head2 data_structure
-
-Return data structure
-
-  my $ds = WebPAC::Normalize::Set(
-       lookup => $lookup->lookup_hash,
-       row => $row,
-       rules => $normalize_pl_config,
-  );
-
-This function will B<die> if normalizastion can't be evaled.
-
-=cut
-
-sub data_structure {
-       my $arg = {@_};
-
-       die "need row argument" unless ($arg->{row});
-       die "need normalisation argument" unless ($arg->{rules});
-
-       no strict 'subs';
-       set_lookup( $arg->{lookup} );
-       set_rec( $arg->{row} );
-       clean_ds();
-       eval "$arg->{rules}";
-       die "error evaling $arg->{rules}: $@\n" if ($@);
-       return get_ds();
-}
-
-=head2 set_rec
-
-Set current record hash
-
-  set_rec( $rec );
-
-=cut
-
-my $rec;
-
-sub set_rec {
-       $rec = shift or die "no record hash";
-}
-
-=head2 tag
-
-Define new tag for I<search> and I<display>.
-
-  tag('Title', rec('200','a') );
-
-
-=cut
-
-my $out;
-
-sub tag {
-       my $name = shift or die "tag needs name as first argument";
-       my @o = grep { defined($_) && $_ ne '' } @_;
-       return unless (@o);
-       $out->{$name}->{tag} = $name;
-       $out->{$name}->{search} = \@o;
-       $out->{$name}->{display} = \@o;
-}
-
-=head2 display
-
-Define tag just for I<display>
-
-  @v = display('Title', rec('200','a') );
-
-=cut
-
-sub display {
-       my $name = shift or die "display needs name as first argument";
-       my @o = grep { defined($_) && $_ ne '' } @_;
-       return unless (@o);
-       $out->{$name}->{tag} = $name;
-       $out->{$name}->{display} = \@o;
-}
-
-=head2 search
-
-Prepare values just for I<search>
-
-  @v = search('Title', rec('200','a') );
-
-=cut
-
-sub search {
-       my $name = shift or die "search needs name as first argument";
-       my @o = grep { defined($_) && $_ ne '' } @_;
-       return unless (@o);
-       $out->{$name}->{tag} = $name;
-       $out->{$name}->{search} = \@o;
-}
-
-=head2 get_ds
-
-Return hash formatted as data structure
-
-  my $ds = get_ds();
-
-=cut
-
-sub get_ds {
-       return $out;
-}
-
-=head2 clean_ds
-
-Clean data structure hash for next record
-
-  clean_ds();
-
-=cut
-
-sub clean_ds {
-       $out = undef;
-}
-
-=head2 set_lookup
-
-Set current lookup hash
-
-  set_lookup( $lookup );
-
-=cut
-
-my $lookup;
-
-sub set_lookup {
-       $lookup = shift;
-}
-
-=head2 rec1
-
-Return all values in some field
-
-  @v = rec1('200')
-
-TODO: order of values is probably same as in source data, need to investigate that
-
-=cut
-
-sub rec1 {
-       my $f = shift;
-       return unless (defined($rec) && defined($rec->{$f}));
-       if (ref($rec->{$f}) eq 'ARRAY') {
-               return map { 
-                       if (ref($_) eq 'HASH') {
-                               values %{$_};
-                       } else {
-                               $_;
-                       }
-               } @{ $rec->{$f} };
-       } elsif( defined($rec->{$f}) ) {
-               return $rec->{$f};
-       }
-}
-
-=head2 rec2
-
-Return all values in specific field and subfield
-
-  @v = rec2('200','a')
-
-=cut
-
-sub rec2 {
-       my $f = shift;
-       return unless (defined($rec && $rec->{$f}));
-       my $sf = shift;
-       return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
-}
-
-=head2 rec
-
-syntaxtic sugar for
-
-  @v = rec('200')
-  @v = rec('200','a')
-
-=cut
-
-sub rec {
-       if ($#_ == 0) {
-               return rec1(@_);
-       } elsif ($#_ == 1) {
-               return rec2(@_);
-       }
-}
-
-=head2 regex
-
-Apply regex to some or all values
-
-  @v = regex( 's/foo/bar/g', @v );
-
-=cut
-
-sub regex {
-       my $r = shift;
-       my @out;
-       #warn "r: $r\n",Dumper(\@_);
-       foreach my $t (@_) {
-               next unless ($t);
-               eval "\$t =~ $r";
-               push @out, $t if ($t && $t ne '');
-       }
-       return @out;
-}
-
-=head2 prefix
-
-Prefix all values with a string
-
-  @v = prefix( 'my_', @v );
-
-=cut
-
-sub prefix {
-       my $p = shift or die "prefix needs string as first argument";
-       return map { $p . $_ } grep { defined($_) } @_;
-}
-
-=head2 suffix
-
-suffix all values with a string
-
-  @v = suffix( '_my', @v );
-
-=cut
-
-sub suffix {
-       my $s = shift or die "suffix needs string as first argument";
-       return map { $_ . $s } grep { defined($_) } @_;
-}
-
-=head2 surround
-
-surround all values with a two strings
-
-  @v = surround( 'prefix_', '_suffix', @v );
-
-=cut
-
-sub surround {
-       my $p = shift or die "surround need prefix as first argument";
-       my $s = shift or die "surround needs suffix as second argument";
-       return map { $p . $_ . $s } grep { defined($_) } @_;
-}
-
-=head2 first
-
-Return first element
-
-  $v = first( @v );
-
-=cut
-
-sub first {
-       my $r = shift;
-       return $r;
-}
-
-=head2 lookup
-
-Consult lookup hashes for some value
-
-  @v = lookup( $v );
-  @v = lookup( @v );
-
-=cut
-
-sub lookup {
-       my $k = shift or return;
-       return unless (defined($lookup->{$k}));
-       if (ref($lookup->{$k}) eq 'ARRAY') {
-               return @{ $lookup->{$k} };
-       } else {
-               return $lookup->{$k};
-       }
-}
-
-=head2 join_with
-
-Joins walues with some delimiter
-
-  $v = join_with(", ", @v);
-
-=cut
-
-sub join_with {
-       my $d = shift;
-       return join($d, grep { defined($_) && $_ ne '' } @_);
-}
-
-# END
-1;
diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm
deleted file mode 100644 (file)
index 7e829b5..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-package WebPAC::Normalize::XML;
-
-use warnings;
-use strict;
-
-use base qw/WebPAC::Common WebPAC::Normalize/;
-use XML::Simple;
-use Data::Dumper;
-use Text::Iconv;
-use YAML qw/Dump LoadFile/;
-
-=head1 NAME
-
-WebPAC::Normalize::XML - apply XML or YAML normalisaton rules
-
-=head1 VERSION
-
-Version 0.03
-
-=cut
-
-our $VERSION = '0.03';
-
-=head1 SYNOPSIS
-
-This module uses C<conf/normalize/*.xml> files to perform normalisation
-from input records
-
-=cut
-
-=head1 FUNCTIONS
-
-=head2 open
-
-Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
-parse it.
-
- my $n = new WebPAC::Normalize::XML;
- $n->open(
-       tag => 'isis',
-       xml_file => '/path/to/conf/normalize/isis.xml',
- );
-
-C<tag> defines tag to use within C<xml_file>
-
-C<xml_file> defines path to normalize XML
-
-C<tags> define additional tags that can be forced (and an be array).
-
-=cut
-
-sub open {
-       my $self = shift;
-
-       my $arg = {@_};
-
-       my $log = $self->_get_logger();
-
-       foreach my $req (qw/tag xml_file/) {
-               $log->logconfess("need argument $req") unless $arg->{$req};
-       }
-
-       $self->{'tag'} = $arg->{'tag'};
-       my $xml_file = $arg->{'xml_file'};
-
-       $log->info("using $xml_file tag <",$self->{'tag'},">");
-
-       $log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
-
-       $self->{'import_xml_file'} = $xml_file;
-
-       my @force_array = [ $self->{'tag'}, 'config', 'format' ];
-       push @force_array, $self->{'tags'} if ($self->{'tags'});
-
-       $self->{'import_xml'} = XMLin($xml_file,
-               ForceArray => @force_array,
-               ForceContent => 1,
-       );
-
-       $log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled");
-
-       #print STDERR Dump($self->{import_xml});
-
-       return $self;
-}
-
-=head2 open_yaml
-
-Read normalisation rules defined in YAML file located usually at
-C<conf/normalize/*.yml> and parse it.
-
- my $n = new WebPAC::Normalize::XML;
- $n->open_yaml(
-       tag => 'isis',
-       path => '/path/to/conf/normalize/isis.yml',
- );
-
-=cut
-
-sub open_yaml {
-       my $self = shift;
-
-       my $arg = {@_};
-
-       my $log = $self->_get_logger();
-
-       foreach my $req (qw/tag path/) {
-               $log->logconfess("need argument $req") unless $arg->{$req};
-       }
-
-       my $path = $arg->{path};
-       $self->{tag} = $arg->{tag};
-
-       $log->logdie("normalisation yaml file '$path' doesn't exist!") if (! -e $path);
-
-       $log->info("using $path normalization YAML");
-
-       $self->{'import_xml'} = LoadFile( $path ) || $log->die("can't load $path: $!");
-
-       $log->debug("import yaml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled");
-
-       $self->{_skip_x} = 1;
-
-       return $self;
-}
-
-=head2 _x
-
-Convert string from XML UTF-8 encoding to code page defined in C<xml_file>.
-
- my $text = $n->_x('utf8 text');
-
-Default application code page is C<ISO-8859-2>. You will probably want to
-change that when creating new instance of object based on this one.
-
-=cut
-
-sub _x {
-       my $self = shift;
-       my $utf8 = shift || return;
-       return $utf8 if ($self->{_skip_x});
-
-       # create UTF-8 convertor for import_xml files
-       $self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2');
-
-       return $self->{'utf2cp'}->convert($utf8) ||
-               $self->_get_logger()->logwarn("can't convert '$utf8'");
-}
-
-
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 COPYRIGHT & LICENSE
-
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
-
-=cut
-
-1; # End of WebPAC::Normalize::XML
index 7f562a4..6c50ac3 100644 (file)
@@ -9,6 +9,7 @@ use KinoSearch::InvIndexer;
 use KinoSearch::Analysis::PolyAnalyzer;
 use Encode qw/from_to/;
 use Data::Dumper;
+use Storable;
 
 =head1 NAME
 
@@ -16,11 +17,11 @@ WebPAC::Output::KinoSearch - Create KinoSearch full text index
 
 =head1 VERSION
 
-Version 0.01
+Version 0.02
 
 =cut
 
-our $VERSION = '0.01';
+our $VERSION = '0.02';
 
 =head1 SYNOPSIS
 
@@ -99,6 +100,18 @@ sub new {
                analyzer => $analyzer,
        );
 
+       my $fields_path = $self->{index_path} . '/fields.storable';
+       $fields_path =~ s#//#/#g;
+       if (-e $fields_path) {
+               $self->{fields} = retrieve($fields_path) ||
+                       $log->warn("can't open $fields_path: $!");
+       } else {
+               $log->error("This will be dummy run since no fields statistics are found!");
+               $log->error("You will have to re-run indexing to get search results!");
+               $self->{dummy_run} = 1;
+       }
+       $self->{fields_path} = $fields_path;
+
        foreach my $f (@{ $self->{fields} }) {
                $self->{invindex}->spec_field( 
                        name  => $f,
@@ -158,6 +171,10 @@ sub add {
        sub add_value($$$$$) {
                my ($self,$log,$doc,$n,$v) = @_;
                return unless ($v);
+
+               $self->{value_usage}->{$n}++;
+               return if ($self->{dummy_run});
+
                eval { $doc->set_value($n, $self->convert($v) ) };
                $log->warn("can't insert: $n = $v") if ($@);
        }
@@ -208,8 +225,20 @@ Close index
 sub finish {
        my $self = shift;
 
-       $self->_get_logger()->info("finish index writing to disk");
+       my $log = $self->_get_logger();
+
+       $log->info("finish index writing to disk");
        $self->{invindex}->finish;
+
+       $log->info("writing value usage file");
+
+       # add fields from last run
+       map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };
+
+       my @fields = keys %{ $self->{value_usage} };
+       store \@fields, $self->{fields_path} ||
+               $log->warn("can't write $self->{fields_path}: $!");
+
 }
 
 =head2 convert
diff --git a/run.pl b/run.pl
index b8fb7ce..825facd 100755 (executable)
--- a/run.pl
+++ b/run.pl
@@ -11,8 +11,7 @@ use WebPAC::Common 0.02;
 use WebPAC::Lookup;
 use WebPAC::Input 0.03;
 use WebPAC::Store 0.03;
-use WebPAC::Normalize::XML;
-use WebPAC::Normalize::Set;
+use WebPAC::Normalize;
 use WebPAC::Output::TT;
 use WebPAC::Validate;
 use YAML qw/LoadFile/;
@@ -54,11 +53,6 @@ or C<type> from input
 
 path to YAML configuration file
 
-=item --force-set
-
-force conversion C<< normalize->path >> in C<config.yml> from
-C<.xml> to C<.pl>
-
 =item --stats
 
 disable indexing and dump statistics about field and subfield
@@ -79,7 +73,6 @@ my $clean = 0;
 my $config = 'conf/config.yml';
 my $debug = 0;
 my $only_filter;
-my $force_set = 0;
 my $stats = 0;
 my $validate_path;
 
@@ -91,7 +84,6 @@ GetOptions(
        "only=s" => \$only_filter,
        "config" => \$config,
        "debug" => \$debug,
-       "force-set" => \$force_set,
        "stats" => \$stats,
        "validate=s" => \$validate_path,
 );
@@ -228,43 +220,12 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) {
                        %{ $input },
                );
 
-               my $n = new WebPAC::Normalize::XML(
-               #       filter => { 'foo' => sub { shift } },
-                       db => $db,
-                       lookup_regex => $lookup ? $lookup->regex : undef,
-                       lookup => $lookup,
-                       prefix => $input->{name},
-               );
-
                my $rules;
                my $normalize_path = $input->{normalize}->{path};
 
-               if ($force_set) {
-                       my $new_norm_path = $normalize_path;
-                       $new_norm_path =~ s/\.xml$/.pl/;
-                       if (-e $new_norm_path) {
-                               $log->debug("--force-set replaced $normalize_path with $new_norm_path");
-                               $normalize_path = $new_norm_path;
-                       } else {
-                               $log->debug("--force-set failed on $new_norm_path, fallback to $normalize_path");
-                       }
-               }
+               $log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
 
-               if ($normalize_path =~ m/\.xml$/i) {
-                       $n->open(
-                               tag => $input->{normalize}->{tag},
-                               xml_file => $normalize_path,
-                       );
-               } elsif ($normalize_path =~ m/\.(?:yml|yaml)$/i) {
-                       $n->open_yaml(
-                               path => $normalize_path,
-                               tag => $input->{normalize}->{tag},
-                       );
-               } elsif ($normalize_path =~ m/\.(?:pl)$/i) {
-                       $n = undef;
-                       $log->info("using WebPAC::Normalize::Set to process $normalize_path");
-                       $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
-               }
+               my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
 
                foreach my $pos ( 0 ... $input_db->size ) {
 
@@ -285,22 +246,17 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) {
                        }
 
                                
-                       my $ds;
-                       if ($n) {
-                               $ds = $n->data_structure($row);
-                       } else {
-                               $ds = WebPAC::Normalize::Set::data_structure(
-                                       row => $row,
-                                       rules => $rules,
-                                       lookup => $lookup ? $lookup->lookup_hash : undef,
-                               );
-
-                               $db->save_ds(
-                                       id => $mfn,
-                                       ds => $ds,
-                                       prefix => $input->{name},
-                               ) if ($ds && !$stats);
-                       }
+                       my $ds = WebPAC::Normalize::data_structure(
+                               row => $row,
+                               rules => $rules,
+                               lookup => $lookup ? $lookup->lookup_hash : undef,
+                       );
+
+                       $db->save_ds(
+                               id => $mfn,
+                               ds => $ds,
+                               prefix => $input->{name},
+                       ) if ($ds && !$stats);
 
                        $indexer->add(
                                id => $input->{name} . "/" . $mfn,
index 06d6667..633eab2 100644 (file)
@@ -11,7 +11,7 @@ use_ok( 'WebPAC::Input' );
 use_ok( 'WebPAC::Input::ISIS' );
 use_ok( 'WebPAC::Store' );
 use_ok( 'WebPAC::Lookup' );
-use_ok( 'WebPAC::Normalize::XML' );
+use_ok( 'WebPAC::Normalize' );
 use_ok( 'WebPAC::Output' );
 use_ok( 'WebPAC::Output::Estraier' );
 use_ok( 'WebPAC::Output::TT' );
diff --git a/t/3-normalize-set.t b/t/3-normalize-set.t
deleted file mode 100755 (executable)
index da7a881..0000000
+++ /dev/null
@@ -1,345 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-use Test::More tests => 67;
-use Test::Exception;
-use Cwd qw/abs_path/;
-use blib;
-use File::Slurp;
-
-use Data::Dumper;
-my $debug = shift @ARGV;
-
-BEGIN {
-       use_ok( 'WebPAC::Normalize::Set' );
-}
-
-ok(my $abs_path = abs_path($0), "abs_path");
-$abs_path =~ s#/[^/]*$#/#;
-diag "abs_path: $abs_path" if ($debug);
-
-#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
-
-my $rec1 = {
-       '200' => [{
-               'a' => '200a',
-               'b' => '200b',
-               },{
-               'c' => '200c',
-               'd' => '200d',
-               },{
-               'a' => '200a*2',
-               'd' => '200d*2',
-               }],
-       '201' => [{
-               'x' => '201x',
-               'y' => '201y',
-               }],
-       '900' => [
-               '900-no_subfield'
-               ],
-       '901' => [{
-               'a' => '900a',
-               }],
-       '902' => [{
-               'z' => '900',
-               }],
-};
-
-my $rec2 = {
- '675' => [ {
-              'a' => '159.9'
-            } ],
- '210' => [ {
-              'c' => 'New York University press',
-              'a' => 'New York',
-              'd' => 'cop. 1988'
-            } ],
- '700' => [ {
-              'a' => 'Haynal',
-              'b' => 'André'
-            } ],
- '801' => [ 'FFZG' ],
- '991' => [ '8302' ],
- '000' => [ 1 ],
- '702' => [ {
-              'a' => 'Holder',
-              'b' => 'Elizabeth'
-            } ],
- '215' => [ {
-              'c' => 'ilustr',
-              'a' => 'xix, 202 str',
-              'd' => '23cm'
-            } ],
- '990' => [
-            '2140',
-            '88',
-            'HAY'
-          ],
- '200' => [ {
-              'e' => 'from Freud and Ferenczi to Michael balint',
-              'a' => 'Controversies in psychoanalytic method',
-              'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
-              'f' => 'by André E. Haynal'
-            } ],
- '610' => [ 'povijest psihoanalize' ],
- '994' => [ {
-              'c' => '',
-              'a' => 'PS',
-              'b' => 'MG'
-            } ],
- '320' => [ 'Kazalo' ],
- '101' => [ 'ENG' ],
- '686' => [ '2140' ],
- '300' => [ 'Prijevod djela: ' ],
-};
-
-
-my $lookup1 = {
-       '00900' => [
-               'lookup 1',
-               'lookup 2',
-       ],
-};
-
-my $lookup2 = {
-       '00900' => 'lookup',
-};
-
-
-sub test {
-       print Dumper( @_ ), ("-" x 78), "\n";
-       ok( defined(@_) );
-}
-
-# how much of string evaled to display?
-my $max_eval_output = 170;
-
-sub dump_error {
-       my ($msg,$code) = @_;
-
-       my @l = split(/[\n\r]/, $code);
-       my $out = "$msg\n";
-
-       foreach my $i ( 0 .. $#l ) {
-               $out .= sprintf("%2d: %s\n", $i, $l[$i]);
-       }
-
-       return $out;
-}
-
-sub test_s {
-       my $t = shift || die;
-
-       my $eval_t = $t;
-       $eval_t =~ s/[\n\r\s]+/ /gs;
-       $eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output);
-
-       eval "$t";
-       ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t");
-}
-
-{
-       no strict 'subs';
-       use WebPAC::Normalize::Set;
-
-       ok(! set_lookup( undef ), "set_lookup(undef)");
-
-       set_rec( $rec1 );
-
-       cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' );
-       cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' );
-       cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' );
-       diag "is_deeply checks\n";
-       is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] );
-       is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ],  \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]);
-       is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]);
-       is_deeply( \[ rec('902') ], \[ '900' ] );
-
-       cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' );
-
-       # simple list manipulatons
-       cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix');
-       cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix');
-       cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround');
-
-
-       set_lookup( $lookup1 );
-       
-       cmp_ok(
-               join_with(" i ",
-                       lookup( 
-                               regex( 's/^/00/',
-                                       rec2('902','z')
-                               )
-                       ) 
-               ),
-       'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2');
-
-       # check join_with operations
-
-       sub test_join_with_2 {
-               my ($a,$b,$e) = @_;
-
-               cmp_ok(
-                       join_with(" <1> ",
-                               rec('201',$a),
-                               rec('201',$b),
-                       ), 
-               'eq', $e, "join_with $a <1> $b = $e");
-       }
-
-       test_join_with_2('_','_','');
-       test_join_with_2('x','_','201x');
-       test_join_with_2('_','x','201x');
-       test_join_with_2('x','y','201x <1> 201y');
-
-       sub test_join_with_3 {
-               my ($a,$b,$c,$e) = @_;
-
-               cmp_ok(
-                       join_with(" <1> ", rec('201',$a),
-                               join_with(" <2> ", rec('201',$b),
-                                       rec('201',$c),
-                               )
-                       ), 
-               'eq', $e, "join_with $a <1> $b <2> $c = $e");
-       };
-
-       test_join_with_3('_','_','_','');
-       test_join_with_3('x','_','_','201x');
-       test_join_with_3('_','x','_','201x');
-       test_join_with_3('_','_','x','201x');
-       test_join_with_3('x','y','_','201x <1> 201y');
-       test_join_with_3('x','_','y','201x <1> 201y');
-       test_join_with_3('_','x','y','201x <2> 201y');
-       test_join_with_3('x','_','y','201x <1> 201y');
-       test_join_with_3('x','y','x','201x <1> 201y <2> 201x');
-
-       # test lookups
-
-       set_lookup( $lookup2 );
-
-       is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' );
-
-       ok(! lookup('non-existent'), 'lookup non-existant' );
-
-       set_rec( $rec2 );
-
-       test_s(qq{
-               tag('Title',
-                       rec('200','a')
-               );
-       });
-       test_s(qq{
-               tag('Who',
-                       join_with(" ",
-                               rec('702','a'),
-                               rec('702','b')
-                       )
-               );
-       });
-
-       test_s(qq{
-               display('Publisher',
-                       rec('210','c')
-               )
-       });
-       
-       test_s(qq{
-               search('Year',
-                       regex( 's/[^\\d]+//',
-                               rec('210','d')
-                       )
-               )
-       });
-
-       ok(my $ds = get_ds(), "get_ds");
-       diag "ds = ", Dumper($ds) if ($debug);
-
-
-       sub test_check_ds {
-
-               my $t = shift;
-
-               ok($ds = get_ds(), 'get_ds');
-               diag Dumper( $ds ) if ($debug);
-
-               ok( $ds && $ds->{something}, 'get_ds->something exists' );
-               ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t);
-               ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' );
-
-               return $ds;
-       }
-
-       clean_ds();
-       test_s(qq{ search('something', '42'); });
-       test_s(qq{ search('empty', ''); });
-       test_check_ds('search');
-
-       clean_ds();
-       test_s(qq{ display('something', '42'); });
-       test_s(qq{ display('empty', ''); });
-       test_check_ds('display');
-
-       clean_ds();
-       test_s(qq{ tag('something', '42'); });
-       test_s(qq{ tag('empty', ''); });
-       test_check_ds('search');
-       test_check_ds('display');
-
-       clean_ds();
-       my $n = read_file( "$abs_path/data/normalize.pl" );
-       $n .= "\n1;\n";
-       #diag "normalize code:\n$n\n";
-       test_s( $n );
-
-       ok($ds = get_ds(), "get_ds");
-       diag "ds = ", Dumper($ds) if ($debug);
-
-       my $rec = {
-               '200' => [{
-                       'a' => '200a',
-                       'b' => '200b',
-               }],
-       };
-       my $rules = qq{ search('mixed', rec('200') ) };
-       
-       clean_ds();
-       set_rec( $rec );
-       test_s( $rules );
-       ok($ds = get_ds(), "get_ds");
-       is_deeply( $ds, {
-               'mixed' => {
-                       'search' => [ '200a', '200b' ],
-                       'tag' => 'mixed'
-               }
-       }, 'correct get_ds');
-
-       ok(my $ds2 = WebPAC::Normalize::Set::data_structure(
-               row => $rec,
-               rules => $rules,
-       ), 'data_structure');
-       is_deeply( $ds, $ds2, 'data_structure(s) same');
-
-       # wird and non-valid structure which is supported anyway
-       clean_ds();
-       set_rec({
-               '200' => [{
-                       'a' => '200a',
-               },
-                       '200-solo'
-               ]
-       });
-       test_s(qq{ search('mixed', rec('200') ) });
-       ok($ds = get_ds(), "get_ds");
-       is_deeply( $ds, {
-               'mixed' => {
-                       'search' => [ '200a', '200-solo' ],
-                       'tag' => 'mixed'
-               }
-       }, 'correct get_ds');
-
-}
-
diff --git a/t/3-normalize-xml.t b/t/3-normalize-xml.t
deleted file mode 100755 (executable)
index a55e52b..0000000
+++ /dev/null
@@ -1,263 +0,0 @@
-#!/usr/bin/perl -w
-
-use Test::More tests => 140;
-use Test::Exception;
-use Cwd qw/abs_path/;
-use blib;
-use strict;
-use Data::Dumper;
-
-BEGIN {
-use_ok( 'WebPAC::Normalize::XML' );
-}
-
-ok(my $abs_path = abs_path($0), "abs_path");
-$abs_path =~ s#/[^/]*$#/#;
-#diag "abs_path: $abs_path";
-
-throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
-throws_ok { new WebPAC::Normalize::XML( lookup => 'bar' ) } qr/pair/, "lookup without lookup_regex";
-
-ok(my $n = new WebPAC::Normalize::XML(
-       debug => 0,
-       filter => {
-               regex => sub {
-                       my ($val, $regex) = @_;
-                       eval "\$val =~ $regex";
-                       return $val;
-               },
-       },
-), "new");
-
-throws_ok { $n->open() } qr/tag/, "open without tag";
-throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file";
-throws_ok { $n->open( tag => 'isis', xml_file => '/foo/bar/baz' ) } qr/file.*doesn't exist/, "open with invalid xml_file";
-ok( $n->open(
-       tag => 'isis',
-       xml_file => "$abs_path/data/normalize.xml",
-), "open");
-
-my $rec = {
- '675' => [
-            {
-              'a' => '159.9'
-            }
-          ],
- '210' => [
-            {
-              'c' => 'New York University press',
-              'a' => 'New York',
-              'd' => 'cop. 1988'
-            }
-          ],
- '700' => [
-            {
-              'a' => 'Haynal',
-              'b' => 'André'
-            }
-          ],
- '801' => [
-            'FFZG'
-          ],
- '991' => [
-            '8302'
-          ],
- '000' => [
-            1
-          ],
- '702' => [
-            {
-              'a' => 'Holder',
-              'b' => 'Elizabeth'
-            }
-          ],
- '215' => [
-            {
-              'c' => 'ilustr',
-              'a' => 'xix, 202 str',
-              'd' => '23cm'
-            }
-          ],
- '990' => [
-            '2140',
-            '88',
-            'HAY'
-          ],
- '200' => [
-            {
-              'e' => 'from Freud and Ferenczi to Michael balint',
-              'a' => 'Controversies in psychoanalytic method',
-              'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
-              'f' => 'by André E. Haynal'
-            }
-          ],
- '610' => [
-            'povijest psihoanalize'
-          ],
- '994' => [
-            {
-              'c' => '',
-              'a' => 'PS',
-              'b' => 'MG'
-            }
-          ],
- '320' => [
-            'Kazalo'
-          ],
- '101' => [
-            'ENG'
-          ],
- '686' => [
-            '2140'
-          ],
- '300' => [
-            'Prijevod djela: '
-          ]
-};
-
-foreach my $fld (keys %$rec) {
-       my $r = 0;
-       foreach my $item ($rec->{$fld}) {
-               if (ref($item) eq 'HASH') {
-                       foreach my $sf (keys %$item) {
-                               my $found = 0;
-                               ok($n->get_data(\$rec, $fld, $sf, $r, \$found), "v${fld}^${sf} / $r");
-                               ok($found, "found");
-                       }
-                       my $found = 0;
-                       ok(! $n->get_data(\$rec, $fld, 'x', $r, \$found), "no v${fld}^x / $r");
-                       ok(! $found, "not found");
-               } else {
-                       my $found = 0;
-                       ok($n->get_data(\$rec, $fld, undef, $r, \$found), "v${fld} / $r");
-                       ok($found, "found");
-               }
-       }
-       my $found = 0;
-       ok(! $n->get_data(\$rec, '999', '', $r, \$found), "no v${fld} / $r");
-       ok(! $found, "not found");
-}
-
-ok(my $ds = $n->data_structure( $rec ), "data_structure");
-
-#diag Dumper($rec, $ds);
-
-# fake load of our test normalisation data
-$n->{tag} = 'isis';
-
-#diag Dumper($n->{import_xml}->{indexer});
-
-
-$rec = {
-       '900' => [ {
-               'a' => '1',
-               'b' => '2',
-               'c' => '3',
-               'x' => 'yap',
-       } ],
-};
-
-my $import = {
-       'Tag' => { 'isis' => [
-                       { content => 'v900^a + v900^b = v900^c [v900^x]' },
-       ] },
-};
-
-sub parse_test($$$$$) {
-       my ($import,$tag,$rec,$i,$r) = @_;
-       $n->{import_xml}->{indexer} = $import;
-       # erase internal cache (yak!)
-       delete($n->{_tags_by_order});
-       push @{$rec->{'000'}}, 42 unless ($rec->{'000'});
-       #diag "test normalisation of: ",Dumper($n->{import_xml}->{indexer}, $rec);
-       ok(my $ds = $n->data_structure( $rec ), "data_structure");
-       #diag Dumper($ds->{$tag}->{display}); # if ($i == 0);
-       cmp_ok($ds->{$tag}->{display}->[$i], 'eq', $r, "parse $tag/$i - $r");
-}
-
-parse_test($import, 'Tag', $rec, 0, '1 + 2 = 3 [yap]');
-
-delete($rec->{'900'}->[0]->{'b'});
-parse_test($import, 'Tag', $rec, 0, '1 = 3 [yap]');
-
-$rec->{'900'}->[0]->{'b'} = 5;
-$rec->{'900'}->[0]->{'c'} = 6;
-parse_test($import, 'Tag', $rec, 0, '1 + 5 = 6 [yap]');
-
-delete($rec->{'900'}->[0]->{'c'});
-$rec->{'900'}->[0]->{'x'} = 'hmmm';
-parse_test($import, 'Tag', $rec, 0, '1 + 5 [hmmm]');
-
-$rec->{'900'}->[0]->{'x'} = 'nope!';
-delete($rec->{'900'}->[0]->{'a'});
-parse_test($import, 'Tag', $rec, 0, '5 [nope!]');
-
-$rec = {
-       '900' => [ {
-               'b' => 'b1',
-               'x' => 'b1',
-       },{
-               'a' => 'a1',
-               'b' => 'b2',
-               'x' => 'a1_b2',
-       },{
-               'b' => 'b3',
-               'c' => 'c1',
-               'x' => 'b3_c1',
-       },{
-               'a' => 'a2',
-               'b' => 'b4',
-               'c' => 'c2',
-               'x' => 'a2_b4_c2',
-       } ],
-};
-
-$import = {
-       'Tag' => { 'isis' => [
-                       { content => '0 v900^a 1 v900^b 2 v900^c 3 v900^x 4' },
-       ] },
-};
-
-parse_test($import, 'Tag', $rec, 0, '0 b1 3 b1 4');
-parse_test($import, 'Tag', $rec, 1, '0 a1 1 b2 3 a1_b2 4');
-parse_test($import, 'Tag', $rec, 2, '0 b3 2 c1 3 b3_c1 4');
-parse_test($import, 'Tag', $rec, 3, '0 a2 1 b4 2 c2 3 a2_b4_c2 4');
-
-sub parse_test_arr($$$$) {
-       my ($import, $tag, $rec, $arr) = @_;
-       my $i = 0;
-       foreach my $res (@{$arr}) {
-               parse_test($import, $tag, $rec, $i, $res);
-               $i++;
-       }
-}
-
-$import = {
-       'Tag_a' => { 'isis' => [
-               { content => 'v900^a' },
-       ] },
-       'Tag_b' => { 'isis' => [
-               { content => 'v900^b' },
-       ] },
-       'Tag_c' => { 'isis' => [
-               { content => 'v900^c' },
-       ] },
-       'Tag_x' => { 'isis' => [
-               { content => 'v900^x' },
-       ] },
-       'Tag_s1' => { 'isis' => [
-               { content => 'v900^a = v900^c' },
-       ] },
-       'Tag_noval' => { 'isis' => [
-               { content => 'v911^1' },
-               { content => 'v900^c' },
-       ] },
-};
-
-parse_test_arr($import, 'Tag_a', $rec, [ '','a1','','a2' ] );
-parse_test_arr($import, 'Tag_b', $rec, [ 'b1','b2','b3','b4' ] );
-parse_test_arr($import, 'Tag_c', $rec, [ '','','c1','c2' ] );
-parse_test_arr($import, 'Tag_x', $rec, [ 'b1','a1_b2','b3_c1','a2_b4_c2' ] );
-parse_test_arr($import, 'Tag_s1', $rec, [ '', 'a1', 'c1', 'a2 = c2' ] );
-parse_test_arr($import, 'Tag_noval', $rec, [ '','','c1','c2' ] );
-
diff --git a/t/3-normalize.t b/t/3-normalize.t
new file mode 100755 (executable)
index 0000000..7321bf5
--- /dev/null
@@ -0,0 +1,345 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use Test::More tests => 67;
+use Test::Exception;
+use Cwd qw/abs_path/;
+use blib;
+use File::Slurp;
+
+use Data::Dumper;
+my $debug = shift @ARGV;
+
+BEGIN {
+       use_ok( 'WebPAC::Normalize' );
+}
+
+ok(my $abs_path = abs_path($0), "abs_path");
+$abs_path =~ s#/[^/]*$#/#;
+diag "abs_path: $abs_path" if ($debug);
+
+#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
+
+my $rec1 = {
+       '200' => [{
+               'a' => '200a',
+               'b' => '200b',
+               },{
+               'c' => '200c',
+               'd' => '200d',
+               },{
+               'a' => '200a*2',
+               'd' => '200d*2',
+               }],
+       '201' => [{
+               'x' => '201x',
+               'y' => '201y',
+               }],
+       '900' => [
+               '900-no_subfield'
+               ],
+       '901' => [{
+               'a' => '900a',
+               }],
+       '902' => [{
+               'z' => '900',
+               }],
+};
+
+my $rec2 = {
+ '675' => [ {
+              'a' => '159.9'
+            } ],
+ '210' => [ {
+              'c' => 'New York University press',
+              'a' => 'New York',
+              'd' => 'cop. 1988'
+            } ],
+ '700' => [ {
+              'a' => 'Haynal',
+              'b' => 'André'
+            } ],
+ '801' => [ 'FFZG' ],
+ '991' => [ '8302' ],
+ '000' => [ 1 ],
+ '702' => [ {
+              'a' => 'Holder',
+              'b' => 'Elizabeth'
+            } ],
+ '215' => [ {
+              'c' => 'ilustr',
+              'a' => 'xix, 202 str',
+              'd' => '23cm'
+            } ],
+ '990' => [
+            '2140',
+            '88',
+            'HAY'
+          ],
+ '200' => [ {
+              'e' => 'from Freud and Ferenczi to Michael balint',
+              'a' => 'Controversies in psychoanalytic method',
+              'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
+              'f' => 'by André E. Haynal'
+            } ],
+ '610' => [ 'povijest psihoanalize' ],
+ '994' => [ {
+              'c' => '',
+              'a' => 'PS',
+              'b' => 'MG'
+            } ],
+ '320' => [ 'Kazalo' ],
+ '101' => [ 'ENG' ],
+ '686' => [ '2140' ],
+ '300' => [ 'Prijevod djela: ' ],
+};
+
+
+my $lookup1 = {
+       '00900' => [
+               'lookup 1',
+               'lookup 2',
+       ],
+};
+
+my $lookup2 = {
+       '00900' => 'lookup',
+};
+
+
+sub test {
+       print Dumper( @_ ), ("-" x 78), "\n";
+       ok( defined(@_) );
+}
+
+# how much of string evaled to display?
+my $max_eval_output = 170;
+
+sub dump_error {
+       my ($msg,$code) = @_;
+
+       my @l = split(/[\n\r]/, $code);
+       my $out = "$msg\n";
+
+       foreach my $i ( 0 .. $#l ) {
+               $out .= sprintf("%2d: %s\n", $i, $l[$i]);
+       }
+
+       return $out;
+}
+
+sub test_s {
+       my $t = shift || die;
+
+       my $eval_t = $t;
+       $eval_t =~ s/[\n\r\s]+/ /gs;
+       $eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output);
+
+       eval "$t";
+       ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t");
+}
+
+{
+       no strict 'subs';
+       use WebPAC::Normalize;
+
+       ok(! set_lookup( undef ), "set_lookup(undef)");
+
+       set_rec( $rec1 );
+
+       cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' );
+       cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' );
+       cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' );
+       diag "is_deeply checks\n";
+       is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] );
+       is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ],  \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]);
+       is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]);
+       is_deeply( \[ rec('902') ], \[ '900' ] );
+
+       cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' );
+
+       # simple list manipulatons
+       cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix');
+       cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix');
+       cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround');
+
+
+       set_lookup( $lookup1 );
+       
+       cmp_ok(
+               join_with(" i ",
+                       lookup( 
+                               regex( 's/^/00/',
+                                       rec2('902','z')
+                               )
+                       ) 
+               ),
+       'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2');
+
+       # check join_with operations
+
+       sub test_join_with_2 {
+               my ($a,$b,$e) = @_;
+
+               cmp_ok(
+                       join_with(" <1> ",
+                               rec('201',$a),
+                               rec('201',$b),
+                       ), 
+               'eq', $e, "join_with $a <1> $b = $e");
+       }
+
+       test_join_with_2('_','_','');
+       test_join_with_2('x','_','201x');
+       test_join_with_2('_','x','201x');
+       test_join_with_2('x','y','201x <1> 201y');
+
+       sub test_join_with_3 {
+               my ($a,$b,$c,$e) = @_;
+
+               cmp_ok(
+                       join_with(" <1> ", rec('201',$a),
+                               join_with(" <2> ", rec('201',$b),
+                                       rec('201',$c),
+                               )
+                       ), 
+               'eq', $e, "join_with $a <1> $b <2> $c = $e");
+       };
+
+       test_join_with_3('_','_','_','');
+       test_join_with_3('x','_','_','201x');
+       test_join_with_3('_','x','_','201x');
+       test_join_with_3('_','_','x','201x');
+       test_join_with_3('x','y','_','201x <1> 201y');
+       test_join_with_3('x','_','y','201x <1> 201y');
+       test_join_with_3('_','x','y','201x <2> 201y');
+       test_join_with_3('x','_','y','201x <1> 201y');
+       test_join_with_3('x','y','x','201x <1> 201y <2> 201x');
+
+       # test lookups
+
+       set_lookup( $lookup2 );
+
+       is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' );
+
+       ok(! lookup('non-existent'), 'lookup non-existant' );
+
+       set_rec( $rec2 );
+
+       test_s(qq{
+               tag('Title',
+                       rec('200','a')
+               );
+       });
+       test_s(qq{
+               tag('Who',
+                       join_with(" ",
+                               rec('702','a'),
+                               rec('702','b')
+                       )
+               );
+       });
+
+       test_s(qq{
+               display('Publisher',
+                       rec('210','c')
+               )
+       });
+       
+       test_s(qq{
+               search('Year',
+                       regex( 's/[^\\d]+//',
+                               rec('210','d')
+                       )
+               )
+       });
+
+       ok(my $ds = get_ds(), "get_ds");
+       diag "ds = ", Dumper($ds) if ($debug);
+
+
+       sub test_check_ds {
+
+               my $t = shift;
+
+               ok($ds = get_ds(), 'get_ds');
+               diag Dumper( $ds ) if ($debug);
+
+               ok( $ds && $ds->{something}, 'get_ds->something exists' );
+               ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t);
+               ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' );
+
+               return $ds;
+       }
+
+       clean_ds();
+       test_s(qq{ search('something', '42'); });
+       test_s(qq{ search('empty', ''); });
+       test_check_ds('search');
+
+       clean_ds();
+       test_s(qq{ display('something', '42'); });
+       test_s(qq{ display('empty', ''); });
+       test_check_ds('display');
+
+       clean_ds();
+       test_s(qq{ tag('something', '42'); });
+       test_s(qq{ tag('empty', ''); });
+       test_check_ds('search');
+       test_check_ds('display');
+
+       clean_ds();
+       my $n = read_file( "$abs_path/data/normalize.pl" );
+       $n .= "\n1;\n";
+       #diag "normalize code:\n$n\n";
+       test_s( $n );
+
+       ok($ds = get_ds(), "get_ds");
+       diag "ds = ", Dumper($ds) if ($debug);
+
+       my $rec = {
+               '200' => [{
+                       'a' => '200a',
+                       'b' => '200b',
+               }],
+       };
+       my $rules = qq{ search('mixed', rec('200') ) };
+       
+       clean_ds();
+       set_rec( $rec );
+       test_s( $rules );
+       ok($ds = get_ds(), "get_ds");
+       is_deeply( $ds, {
+               'mixed' => {
+                       'search' => [ '200a', '200b' ],
+                       'tag' => 'mixed'
+               }
+       }, 'correct get_ds');
+
+       ok(my $ds2 = WebPAC::Normalize::data_structure(
+               row => $rec,
+               rules => $rules,
+       ), 'data_structure');
+       is_deeply( $ds, $ds2, 'data_structure(s) same');
+
+       # wird and non-valid structure which is supported anyway
+       clean_ds();
+       set_rec({
+               '200' => [{
+                       'a' => '200a',
+               },
+                       '200-solo'
+               ]
+       });
+       test_s(qq{ search('mixed', rec('200') ) });
+       ok($ds = get_ds(), "get_ds");
+       is_deeply( $ds, {
+               'mixed' => {
+                       'search' => [ '200a', '200-solo' ],
+                       'tag' => 'mixed'
+               }
+       }, 'correct get_ds');
+
+}
+
index a17aa4a..5658ce6 100755 (executable)
@@ -2,7 +2,7 @@
 
 use strict;
 
-use Test::More tests => 41;
+use Test::More tests => 24;
 use Test::Exception;
 use Cwd qw/abs_path/;
 use File::Temp qw/tempdir/;
@@ -17,8 +17,8 @@ BEGIN {
 use_ok( 'WebPAC::Lookup' );
 use_ok( 'WebPAC::Input' );
 use_ok( 'WebPAC::Store' );
-use_ok( 'WebPAC::Normalize::XML' );
-use_ok( 'WebPAC::Normalize::Set' );
+use_ok( 'WebPAC::Lookup::Normalize' );
+use_ok( 'WebPAC::Normalize' );
 use_ok( 'WebPAC::Output::TT' );
 }
 
@@ -28,15 +28,13 @@ diag "abs_path: $abs_path" if ($debug);
 
 my $isis_file = "$abs_path../t/winisis/BIBL";
 #$isis_file = '/data/hidra/THS/THS';
-$isis_file = '/data/isis_data/ffkk/';
+#$isis_file = '/data/isis_data/ffkk/';
 
 diag "isis_file: $isis_file" if ($debug);
 
 my $normalize_set_pl = "$abs_path/data/normalize.pl";
 my $lookup_file = "$abs_path../conf/lookup/isis.pm";
 
-my ($t1,$t2) = (0,0);
-
 ok(my $lookup = new WebPAC::Lookup(
        lookup_file => $lookup_file,
 ), "new Lookup");
@@ -61,19 +59,6 @@ ok(my $db = new WebPAC::Store(
        database => '.',
 ), "new Store");
 
-ok(my $n = new WebPAC::Normalize::XML(
-#      filter => { 'foo' => sub { shift } },
-       db => $db,
-       lookup_regex => $lookup->regex,
-       lookup => $lookup,
-       no_progress_bar => 1,
-), "new Normalize::XML");
-
-ok($n->open(
-       tag => 'isis',
-       xml_file => "$abs_path/data/normalize.xml",
-), "Normalize::XML->open");
-
 ok(my $norm_pl = read_file( $normalize_set_pl ), "set definitions: $normalize_set_pl" );
 
 ok(my $out = new WebPAC::Output::TT(
@@ -83,6 +68,8 @@ ok(my $out = new WebPAC::Output::TT(
 
 diag " lookup => ",Dumper($lookup->lookup_hash) if ($debug);
 
+my $t_norm = 0;
+
 foreach my $pos ( 0 ... $isis->size ) {
 
        my $row = $isis->fetch || next;
@@ -90,21 +77,14 @@ foreach my $pos ( 0 ... $isis->size ) {
        diag " row $pos => ",Dumper($row) if ($debug);
 
        my $t = time();
-       ok(my $ds = $n->data_structure($row), "XML data_structure");
-       $t1 += time() - $t;
-
-       diag " ds $pos => ",Dumper($ds) if ($debug);
-
-       $t = time();
-       ok( my $ds2 = WebPAC::Normalize::Set::data_structure(
+       ok( my $ds = WebPAC::Normalize::data_structure(
                lookup => $lookup->lookup_hash,
                row => $row,
                rules => $norm_pl,
        ), "Set data_structure");
-       $t2 += time() - $t;
+       $t_norm += time() - $t;
 
-       diag " ds2 $pos => ",Dumper($ds2) if ($debug);
-       is_deeply( $ds, $ds2, 'ds same for xml and sets');
+       diag " ds $pos => ",Dumper($ds) if ($debug);
 
        ok(my $html = $out->apply(
                template => 'html.tt',
@@ -117,4 +97,4 @@ foreach my $pos ( 0 ... $isis->size ) {
 
 };
 
-diag sprintf("timings: %.2fs vs %.2fs [%1.2f%%]\n", $t1, $t2, ($t1 / $t2) * 100);
+diag sprintf("timings: %.2fs\n", $t_norm);