From: Dobrica Pavlinusic <dpavlin@rot13.org>
Date: Mon, 26 Jun 2006 16:39:51 +0000 (+0000)
Subject:  r719@llin:  dpavlin | 2006-06-26 18:40:57 +0200
X-Git-Url: http://git.rot13.org/?a=commitdiff_plain;h=bbacd62d35e8d7b421f5e8cc0134d9282d6310dc;p=webpac2

 r719@llin:  dpavlin | 2006-06-26 18:40:57 +0200
 big refacture: depriciate and remove all normalisation formats except .pl sets (but
 old code is still available in WebPAC::Lookup::Normalize because lookups use it) [2.20]


git-svn-id: svn+ssh://mjesec/home/dpavlin/svn/webpac2/trunk@536 07558da8-63fa-0310-ba24-9fe276d99e06
---

diff --git a/MANIFEST b/MANIFEST
index 469227b..94d97ae 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -10,7 +10,6 @@ lib/WebPAC/Store.pm
 lib/WebPAC/Input.pm
 lib/WebPAC/Input/ISIS.pm
 lib/WebPAC/Normalize.pm
-lib/WebPAC/Normalize/XML.pm
 lib/WebPAC/Output.pm
 lib/WebPAC/Output/CDBI.pm
 lib/WebPAC/Output/Estraier.pm
@@ -33,4 +32,3 @@ t/pod-coverage.t
 t/pod.t
 conf/lookup/example.pm
 conf/lookup/isis.pm
-conf/normalize/isis.xml
diff --git a/Makefile.PL b/Makefile.PL
index 2261619..ee3c0d3 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -15,10 +15,8 @@ WriteMakefile(
 	'Log::Log4perl' => 1.02,
 	'Data::Dumper' => 0,
 	'Cwd' => 0,
-	'Text::Iconv' => 0,
 	'Storable' => 0,
 	'DBM::Deep' => 0,
-	'XML::Simple' => 0,
 	'Template' => 0,
 	'Time::HiRes' => 0,
 	'File::Temp' => 0,
diff --git a/TODO b/TODO
index a643614..4171ec9 100644
--- a/TODO
+++ b/TODO
@@ -16,7 +16,7 @@
 + added --stats to report field and subfield usage [2.14]
 + add validator for input data [2.15]
 + add Excel input format [2.16]
-- fix varous annoyences in code [2.17]
++ remove WebPAC::Normalize::XML and promote WebPAC::Normalize::Set to WebPAC::Normalize [2.20]
 - support arrays for normalize/path and lookup
 - add dBase input format
 - remove delimiters characters from index and query entered
diff --git a/conf/normalize/isis.xml b/conf/normalize/isis.xml
deleted file mode 100644
index c94018a..0000000
--- a/conf/normalize/isis.xml
+++ /dev/null
@@ -1,294 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-2"?>
-<!--
-	isis_codepage is code page which isis uses
-	index_codepage is code page which will be stored in index
-
-	format of this file is described in: doc/formating_xml.txt
--->
-<config
-	isis_codepage="852"
->
-
-<!--
-	<isis type="swish|display|index">_pre_v000^x_sep_v000^x_sep_v000^x_post_</isis>
-	<config type="swish|display">name of var from config file</config>
--->
-
-	<format name="IDths">
-		<!--
-		<![CDATA[<a href="?rm=results&show_full=1&f=IDths&v=%s">%s</a>]]>
-		-->
-		<![CDATA[<a href="lookup{900_mfn:%s}.html">%s</a>]]>
-	</format>
-
-    <indexer>
-	<headline>
-		<isis type="display">v250^a</isis>
-		<isis type="display" append="1">[v251]</isis>
-		<isis type="display" append="1"> / [v562^4] v562^a</isis> 
-		<isis type="display" append="1">filter{CROVOC}v800</isis>
-	</headline>
-
-	<filename>
-		<isis type="filename">out/thes/v000.html</isis>	
-	</filename>
- 	
-	<AllThes>
-		<isis type="swish">v250 v450 v258 v458 v253 v453 v254 v454 v330 v338</isis>
-	<!--
-		<isis type="index">v250^a / [v561^4]</isis> 
-		<isis type="index">v450^a</isis>
-		<isis type="index">v258^a</isis>
-		<isis type="index">v458^a</isis>
-		<isis type="index">v253^a</isis>
-		<isis type="index">v453^a</isis>
-		<isis type="index">v254^a</isis>
-		<isis type="index">v454^a</isis>
-		<isis type="index">v330^a</isis>
-		<isis type="index">v338^a</isis>
-	-->
-	</AllThes>
-
-	<Source name="Izvor: " order="5">
-		<isis type="display">v800 -- v901^c</isis>
-	</Source>
-	
-	<line name="..................." order="9">
-		<config type="display">line</config>
-	</line>
-	
-	<DescriptorHR name="HRVATSKI:" order="10">
-		<isis type="swish">eval{"v901^a" eq "Deskriptor"}v250^a</isis>
-		<isis type="swish">eval{"s901^a" eq "Deskriptor"}v450^a</isis>
-		<isis type="index">eval{"v901^a" eq "Deskriptor"}v250^a / [v562^4] v562^a</isis>
-		<isis type="index" append="1">eval{qq#v250^a# and "v901^a" eq "Deskriptor"}filter{CROVOC}v800</isis>
-		<!--
-		<isis type="index">eval{"v901^a" ne "Deskriptor"}v250^a [v251]</isis>
-		-->
-		<isis type="index">eval{qq#v450^a#}filter{CROVOC}v450^a >> s250^a / [s562^4] s562^a s800</isis>
-		<isis type="display">[v251]</isis>
-		<isis type="display" append="1">v250^a</isis>
-		<isis type="display" append="1">eval{qq#v250^a#}filter{CROVOC}v800</isis>
-	</DescriptorHR>
-	
-	<NonDescriptorHR name="Uporabi za: " order="20">
-		<isis type="display" >
-			<delimiter> * </delimiter>
-			<value>v450^a</value>
-		</isis>
-	</NonDescriptorHR>
-
-	<ScopeNoteHR name="Napomena: " order="30">
-		<isis type="display">
-		<delimiter><![CDATA[<br/>]]></delimiter>
-		<value>v330^a</value>
-		</isis>
-	</ScopeNoteHR>
-	
-	<SubjectAreaHR name="Podruèje: " order="50">
-		<!--<isis type="swish">eval{"v901^a" ne "Deskriptor"}v250^a</isis>-->
-		<isis type="swish">eval{"v901^a" eq "Podruèje"}v250^a</isis>
-		<isis type="index">eval{"v901^a" eq "Podruèje"}[v251] v250^a</isis>
-		<!--
-		<isis type="index" append="1">[v251]</isis>
-		-->
-		<isis type="display">[v561^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v561^1;;v561^a</isis>
-		<isis type="display" append="1" delimiter=" ">lookup{crovoc:v561^1}</isis>
-	</SubjectAreaHR>
-
-	<MicrothesaurusHR name="Potpojmovnik: " order="60">
-		<isis type="display">[v562^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v562^1;;v562^a</isis>
-		<isis type="display" append="1" delimiter=" ">lookup{crovoc:v562^1}</isis>
-	</MicrothesaurusHR>
-<!--
-	<ClassCode name="Klasifikacijski kod:" order="70">
-		<isis>v251</isis>
-	</ClassCode>
--->	
-	<URL name="URL: " order="80">
-		<isis type="display">v856^u</isis>
-	</URL>
-	
-	<BroaderTerm name="©iri pojam: " order="90">
-		<isis type="display" format_name="IDths" format_delimiter=";;">
-			<delimiter><![CDATA[</li><li>]]></delimiter>
-			<value>v556^1;;v556^a lookup{crovoc:v556^1}</value>
-		</isis>
-		<isis type="display" format_name="IDths" format_delimiter=";;">
-			<delimiter><![CDATA[</li><li>]]></delimiter>
-			<value>v461^1;;v461^a lookup{crovoc:v461^1}</value>
-		</isis> 
-	</BroaderTerm>
-
-	<NarrowerTerm name="U¾i pojam: " order="100">
-		<isis type="display" format_name="IDths" format_delimiter=";;">
-			<delimiter><![CDATA[</li><li>]]></delimiter>
-			<value>v553^1;;v553^a lookup{crovoc:v553^1}</value>
-		</isis>
-		<!--
-		<isis type="display" format_name="IDths" format_delimiter=";;" >
-			<delimiter> * </delimiter>
-			<value>[a:v561^4];;[d:[a:v561^4]]</value>
-		</isis>
-		-->
-		<isis type="display" format_name="IDths" format_delimiter=";;" >
-			<delimiter><![CDATA[</li><li>]]></delimiter>
-			<value>eval{"v901^a" eq "Podruèje"}lookup{a:v251::};;lookup{d:lookup{a:v251::}}</value>
-		</isis>
-		<isis type="display" append="1" format_name="IDths" format_delimiter=";;" >
-			<delimiter><![CDATA[</li><li>]]></delimiter>
-			<value>eval{"v901^a" eq "Mikrotezaurus"}lookup{a:v561^4:v251:};;lookup{d:lookup{a:v561^4:v251:}}</value>
-		</isis>
-		<isis type="display" format_name="IDths" format_delimiter=";;" >
-			<delimiter><![CDATA[</li><li>]]></delimiter>
-			<value>eval{"v901^a" eq "Deskriptor"}lookup{a:v561^4:v562^4:v900};;lookup{d:lookup{a:v561^4:v562^4:v900}}</value>
-		</isis>
-	</NarrowerTerm> 
-
-	<RelatedTerm name="Srodni pojam: " order="110">
-		<isis type="display"  format_name="IDths" format_delimiter=";;" sort="1">
-			<delimiter> * </delimiter>
-			<value>v550^1;;v550^a lookup{crovoc:v550^1}</value>
-		</isis> 
-	</RelatedTerm>
-
-	<SeeNext name="Slijedi: " order="120">
-		<isis type="display"  format_name="IDths" format_delimiter=";;">
-			<delimiter>, </delimiter>
-			<value>v440^1;;v440^a lookup{crovoc:v440^1}</value>
-		</isis> 
-	</SeeNext>
-
-	<SeeBefore name="Prethodi: " order="130">
-		<isis type="display"  format_name="IDths" format_delimiter=";;">
-			<delimiter>, </delimiter>
-			<value>v430^1;;v430^a lookup{crovoc:v430^1}</value>
-		</isis> 
-	</SeeBefore>
-
-	<line2 name=".................." order="139">
-		<config type="display">line</config>
-	</line2>
-
-	<DescriptorEN name="ENGLESKI:" order="140">
-		<isis type="swish">eval{"v901^a" eq "Deskriptor"}v258^a</isis>
-		<isis type="swish">eval{"s901^a" eq "Deskriptor"}v458^a</isis>
-		<isis type="index">eval{"v901^a" eq "Deskriptor"}v258^a / [v572^4] v572^a</isis>
-		<isis type="index">eval{qq#v458^a#}filter{CROVOC}v458^a >> s258^a / [s572^4] s572^a s800</isis>
-		<isis type="display">v258^a</isis>
-	</DescriptorEN>
-	
-	<ScopeNoteEN name="Napomena: " order="150">
-		<isis type="display">
-		<delimiter><![CDATA[<br/>]]></delimiter>
-		<value>v338^a</value>
-		</isis>
-	</ScopeNoteEN>
-	
-	<NonDescriptorEN name="Uporabi za: " order="160">
-		<isis type="display" >
-			<delimiter> * </delimiter>
-			<value>v458^a</value>
-		</isis>
-	</NonDescriptorEN>
-
-	
-	<SubjectAreaEN name="Podruèje: " order="170">
-		<isis type="swish">v571 v572</isis>
-		<isis type="index">[v251]</isis>
-		<isis type="index" append="1">v571^a. v572^a</isis>
-<!--		<isis type="display">[v571^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v571^1;;v571^a</isis>
--->	</SubjectAreaEN>
-
-<!--	<MicrothesaurusEN name="Potpojmovnik: " order="180">
-		<isis type="display">[v572^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v572^1;;v572^a</isis>
-	</MicrothesaurusEN>
--->
-	<line3 name=".................." order="189">
-		<config type="display">eval{"v253^9" eq "1"} line</config>
-	</line3>
-	
-	<DescriptorFR name="FRANCUSKI:" order="190">
-		<isis type="swish">eval{"v901^a" eq "Deskriptor"}v253^a</isis>
-		<isis type="swish">eval{"s901^a" eq "Deskriptor"}v453^a</isis>
-		<isis type="index">eval{"v901^a" eq "Deskriptor"}v253^a / [v572^4] v572^a</isis>
-		<isis type="index">eval{qq#v453^a#}filter{CROVOC}v453^a >> s253^a / [s572^4] s572^a s800</isis>
-		<isis type="display">v253^a</isis>
-	</DescriptorFR>
-
-	<NonDescriptorFR name="Uporabi za: " order="200">
-		<isis type="display">v453^a</isis>
-	</NonDescriptorFR>
-
-	<SubjectAreaFR name="Podruèje: " order="210">
-		<isis type="swish">v573 v574</isis>
-		<isis type="index">[v251] </isis>
-		<isis type="index" append="1">v573^a. v574^a</isis>
-<!--		<isis type="display">[v573^4]</isis>
-	<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v573^1;;v573^a</isis>
--->		</SubjectAreaFR>
-<!--
-	<MicrothesaurusFR name="Potpojmovnik: " order="220">
-		<isis type="display">[v574^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v574^1;;v574^a</isis>
-	</MicrothesaurusFR>
--->	
-	<line4 name=".................." order="229">
-		<config type="display">eval{"v254^9" eq "1"} line</config>
-	</line4>
-	
-	<DescriptorGE name="NJEMAÈKI:" order="230">
-		<isis type="swish">eval{"v901^a" eq "Deskriptor"}v254^a</isis>
-		<isis type="swish">eval{"s901^a" eq "Deskriptor"}v454^a</isis>
-		<isis type="index">eval{"v901^a" eq "Deskriptor"}v254^a / [v572^4] v572^a</isis>
-		<isis type="index">eval{qq#v454^a#}filter{CROVOC}v454^a >> s254^a / [s572^4] s572^a s800</isis>
-		<isis type="display">v254^a</isis>
-	</DescriptorGE>
-
-	<NonDescriptorGE name="Uporabi za: " order="240">
-		<isis type="display">v454^a</isis>
-	</NonDescriptorGE>
-
-	<SubjectAreaGE name="Podruèje: " order="250">
-		<isis type="swish">v575 v576</isis>
-		<isis type="index">[v251] </isis>
-		<isis type="index" append="1">v575^a. v576^a.</isis>
-<!--		<isis type="display">[v575^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v575^1;;v575^a</isis>
--->	</SubjectAreaGE>
-<!--
-	<MicrothesaurusGE name="Potpojmovnik: " order="260">
-		<isis type="display">[v576^4]</isis>
-		<isis type="display" append="1" delimiter=" " format_name="IDths" format_delimiter=";;">v576^1;;v576^a</isis>
-	</MicrothesaurusGE>
--->
-	<!--
-		order is important for type="lookup_key|lookup_val"
-		They create internal hash file for lookups from other
-		fields. Therefore, you mist ensure (in config file) that
-		databases which are source of lookup are called before
-		databases that use lookup fields.
-		Those lookup are then used using special ##lookup##
-		token inside formating instructions like this:
-		<isis type="lookup">v999^x</isis>
-	-->
-
-	<IDths name="ID" order="300">
-		<isis type="lookup_key">v900</isis>
-		<isis type="swish">v900</isis>
-	</IDths>
-
-	<SubjectIndex name="Predmetno kazalo" order="301">
-		<isis type="lookup_val">eval{"v901^a" eq "Deskriptor"} / [v562^4] v562^a</isis>
-		<isis type="lookup_val">eval{"v901^a" eq "Deskriptor"} [v251]</isis>
-		<!--
-		<isis type="lookup_val" append="1"> v562^a</isis>
-		-->
-	</SubjectIndex>
-
-    </indexer>
-</config>
diff --git a/conf/normalize/isis_ffzg.xml b/conf/normalize/isis_ffzg.xml
deleted file mode 100644
index 54383f1..0000000
--- a/conf/normalize/isis_ffzg.xml
+++ /dev/null
@@ -1,285 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-2"?>
-<!--
-	Here are few examples of syntax.
-
-	<isis type="search|display">something v200^a something else</isis>
-	<config type="search|display">name of var from config file</config>
--->
-<config>
-<indexer>
-	<MFN>
-		<isis>v000</isis>
-	</MFN>
-
-	<ISBN>
-		<isis type="display">v10</isis>
-	</ISBN>
-
-	<ISSN>
-		<isis type="display">v11</isis>
-	</ISSN>
-
-	<IdentificationNumbers name="ISN">
-		<isis type="search">v10 v11</isis>
-	</IdentificationNumbers>
-	
-	<Language>
-		<isis>v101</isis>
-	</Language>
-	
-	<TitleProper>
-		<isis><![CDATA[filter{regex(s/<[^>]*>/)}v200^a]]></isis>
-	</TitleProper>
-
-	<titleNo>
-		<isis>v200^9</isis>
-	</titleNo>
-
-	<Subtitle>
-		<isis>v200^e</isis>
-	</Subtitle>
-
-	<TitleProper2>
-		<isis>v200^c</isis>
-	</TitleProper2>
-	
-	<ParallelTitle>
-		<isis>v200^d</isis>
-	</ParallelTitle>
-
-	<Responsibility>
-		<isis>v200^f ; v200^g</isis>
-	</Responsibility>
-		
-	<ResponsibilityFirst>
-		<isis type="display">v200^f</isis>
-	</ResponsibilityFirst>
-		
-	<ResponsibilitySecond>
-		<isis type="display">v200^g</isis>
-	</ResponsibilitySecond>
-
-	<VolumeDesignation>
-		<isis>v200^v</isis>
-	</VolumeDesignation>
-
-	<EditionStatement>
-		<isis>v205^a</isis>
-	</EditionStatement>
-
-	<SerialNo>
-		<isis>v207^a</isis>
-	</SerialNo>
-
-	<fond>
-		<isis>v209^a</isis>
-	</fond>
-
-	<PlacePublication>
-		<isis>v210^a</isis>
-	</PlacePublication>
-
-	<NamePublisher>
-	 	<isis>v210^c</isis>
-	</NamePublisher>
-
-	<DatePublication> 
-		<isis>v210^d</isis>
-	</DatePublication>
-
-	<PhysicalDescription>
-		<isis>v215^a : v215^c ; v215^d</isis>
-	</PhysicalDescription>
-	
-	<MaterialDesignation>
-		<isis>v215^a</isis>
-	</MaterialDesignation>
-	
-	<PhysicalDetails>
-		<isis>v215^c</isis>
-	</PhysicalDetails>
-	
-	<AccompanyingMaterial>
-		<isis>v215^e</isis>
-	</AccompanyingMaterial>
-	
-	<Series>
-		<isis>v225^a = v225^d : v225^e ; v225^v. v225h, v225^i ; v225^w</isis>
-	</Series>
-	
-	<SeriesTitle>
-		<isis>v225^a</isis>
-	</SeriesTitle>
-	
-	<GeneralNote>
-		<isis>v300</isis>
-	</GeneralNote>
-
-	<EditionNote>
-		<isis>v305</isis>
-	</EditionNote>
-
-	<PhysicalDescriptionNote>
-		<isis>v307</isis>
-	</PhysicalDescriptionNote>
-
-	<IntellectResponsNote>
-		<isis>v314</isis>
-	</IntellectResponsNote>
-
-	<InternalBibliographies>
-		<isis>v320</isis>
-	</InternalBibliographies>
-
-	<Frequency>
-		<isis>v326</isis>
-	</Frequency>
-
-	<ContentsNote>
-		<isis>v327</isis>
-	</ContentsNote>
-
-	<Summary>
-		<isis>v330</isis>
-	</Summary>
-
-	<SystemRequirements>
-		<isis>v337</isis>
-	</SystemRequirements>
-
-	<IssuedWith>
-		<isis>v423^z: v423^a / v423^c v423^b</isis>
-	</IssuedWith>
-
-	<Parts>
-		<isis type="display">lookup{dio-jzav:v900}</isis>
-		<isis type="search">lookup{id-dio-jzav:v900}</isis>
-	</Parts>
-	
-	<PartsEF>
-		<isis type="display">lookup{naslov-efzg:001v001}</isis>
-	</PartsEF>
-
-	<PartsID>
-		<isis>v463^1</isis>
-	</PartsID>
-
-	<Piece>
-		<isis>lookup{naslov-efzg:s463^1}</isis>
-	</Piece>
-
-	<PieceSubtitle>
-		<isis>lookup{podnaslov-efzg:s463^1}</isis>
-	</PieceSubtitle>
-
-	<PieceNum>
-		<isis>v463^v</isis>
-	</PieceNum>
-	
-
-	<PieceAnalitic>
-		<isis>v464^a / v464^g v464^f</isis>
-	</PieceAnalitic>
-
-	<UniformHeading>
-		<isis>v500^a. v503^b</isis>
-	</UniformHeading>
-	
-	<ExpandedTitle>
-		<isis>v532</isis>
-	</ExpandedTitle>
-	
-	<Form>
-		<isis>v608</isis>
-	</Form>
-
-	<UncontrolledTerms>
-		<isis>v610</isis>	
-	</UncontrolledTerms>
-	
-	<UDC_All>
-		<isis type="search">v675</isis>
-	</UDC_All>
-	
-	<UDC>
-		<isis>v675^a</isis>
-	</UDC>
-
-	<UDCb>
-		<isis>v675^b</isis>
-	</UDCb>
-
-	<APA>
-		<isis>v686</isis>
-	</APA>
-
-	<PersonalName>
-		<isis>v700^a, v700^b</isis>
-		<isis type="index">v700^a, v700^b</isis>
-	</PersonalName>
-
-	<PersonalName2>
-		<isis>v701^a, v701^b</isis>
-		<isis type="index">v701^a, v701^b</isis>
-	</PersonalName2>
-
-	<PersonalNameOther>
-		<isis>v702^a, v702^b</isis>
-		<isis type="index">v702^a, v702^b</isis>
-	</PersonalNameOther>
-
-	<Names>
-		<isis>v700^a, v700^b</isis>
-		<isis>v701^a, v701^b</isis>
-		<isis>v702^a, v702^b</isis>
-		<isis type="index">v700^a, v700^b</isis>
-		<isis type="index">v701^a, v701^b</isis>
-		<isis type="index">v702^a, v702^b</isis>
-	</Names>
-
-	<CorporateName>
-		<isis>v710^a</isis>
-	</CorporateName>
-
-	<CorporateName2>
-		<isis>v711^a</isis>
-	</CorporateName2>
-
-	<OriginatingSource>
-		<isis>v801</isis>
-	</OriginatingSource>
-	
-	<URL>
-		<isis>v856^u</isis>
-	</URL>
-
-	<level>
-		<isis>v909</isis>
-	</level>
-	
-	<ID>
-		<isis>v900</isis>
-		<isis>001v001</isis>
-	</ID>
-	
-	<Set>
-		<isis type="display">lookup{set-jzav:v946^1}</isis>
-		<isis type="search">v946^1</isis>
-		<isis type="display">lookup{set-efzg:v461^1}</isis>
-		<isis type="search">v461^1</isis>
-	</Set>
-
-	<Set2>
-		<isis>lookup{set-jzav:lookup{parent-id:v946^1}}</isis>
-	</Set2>
-
-	<CallNo>
-		<isis>v990</isis>
-	</CallNo>
-
-	<InvNo>
-		<isis>v991</isis>
-	</InvNo>
-
-</indexer>
-</config>
diff --git a/lib/WebPAC.pm b/lib/WebPAC.pm
index b1970c0..567b5c8 100644
--- a/lib/WebPAC.pm
+++ b/lib/WebPAC.pm
@@ -9,30 +9,25 @@ WebPAC - core module
 
 =head1 VERSION
 
-Version 2.17
+Version 2.20
 
 =cut
 
-our $VERSION = '2.17';
+our $VERSION = '2.20';
 
 =head1 SYNOPSIS
 
-This is quick description of what WebPAC is. This is third iteration of
-WebPAC design (second one was semi-private creatation of CD ROM with L<jsFind>
-module). This code will eventually become official WebPAC version 2.
+This is quick description of what WebPAC is. This is another iteration of
+WebPAC design (first was system with XML files and CGI, second one was semi-private
+creatation of CD ROM with L<jsFind> module and third was older version 2
+with supprot for lagacy XML and YAML).
+
+Current version supports different input formats and normalisation using set rules.
 
 =head1 AUTHOR
 
 Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 
-=head1 BUGS
-
-Please report any bugs or feature requests to
-C<bug-webpac@rt.cpan.org>, or through the web interface at
-L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=WebPAC>.
-I will be notified, and then you'll automatically be notified of progress on
-your bug as I make changes.
-
 =head1 SEE ALSO
 
 To undestand concpets behind WebPAC examine L<WebPAC::Manual>, and then
@@ -40,7 +35,7 @@ respective documentation for each component.
 
 =head1 COPYRIGHT & LICENSE
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it
 under the same terms as Perl itself.
diff --git a/lib/WebPAC/Lookup.pm b/lib/WebPAC/Lookup.pm
index 2f1ba12..db772fe 100644
--- a/lib/WebPAC/Lookup.pm
+++ b/lib/WebPAC/Lookup.pm
@@ -3,7 +3,7 @@ package WebPAC::Lookup;
 use warnings;
 use strict;
 
-use base qw/WebPAC::Common WebPAC::Normalize/;
+use base qw/WebPAC::Common WebPAC::Lookup::Normalize/;
 use File::Slurp;
 use YAML qw/LoadFile/;
 use Data::Dumper;
@@ -219,7 +219,7 @@ Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
 
 =head1 COPYRIGHT & LICENSE
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+Copyright 2005-2006 Dobrica Pavlinusic, All Rights Reserved.
 
 This program is free software; you can redistribute it and/or modify it
 under the same terms as Perl itself.
diff --git a/lib/WebPAC/Lookup/Normalize.pm b/lib/WebPAC/Lookup/Normalize.pm
new file mode 100644
index 0000000..dd10717
--- /dev/null
+++ b/lib/WebPAC/Lookup/Normalize.pm
@@ -0,0 +1,795 @@
+package WebPAC::Lookup::Normalize;
+
+use warnings;
+use strict;
+use blib;
+use WebPAC::Common;
+use base 'WebPAC::Common';
+use Data::Dumper;
+
+=head1 NAME
+
+WebPAC::Lookup::Normalize - data mungling for normalisation
+
+=head1 VERSION
+
+Version 0.09
+
+=cut
+
+our $VERSION = '0.09';
+
+=head1 SYNOPSIS
+
+This package contains code that mungle data to produce normalized format.
+
+B<WARNING:>
+
+This code is obsolete. It moved to here so that I don't have to re-write
+L<WebPAC::Lookup> to use set configuration files (using L<WebPAC::Normalize>)
+just yet. But it will dissapear real soon!
+
+It contains several assumptions:
+
+=over
+
+=item *
+
+format of fields is defined using C<v123^a> notation for repeatable fields
+or C<s123^a> for single (or first) value, where C<123> is field number and
+C<a> is subfield.
+
+=item *
+
+source data records (C<$rec>) have unique identifiers in field C<000>
+
+=item *
+
+optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
+perl code that is evaluated before producing output (value of field will be
+interpolated before that)
+
+=item *
+
+optional C<filter{filter_name}> at B<begining of format> will apply perl
+code defined as code ref on format after field substitution to producing
+output
+
+There is one built-in filter called C<regex> which can be use like this:
+
+  filter{regex(s/foo/bar/)}
+
+=item *
+
+optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
+
+=item *
+
+at end, optional C<format>s rules are resolved. Format rules are similar to
+C<sprintf> and can also contain C<lookup{...}> which is performed after
+values are inserted in format.
+
+=back
+
+This also describes order in which transformations are applied (eval,
+filter, lookup, format) which is important to undestand when deciding how to
+solve your data mungling and normalisation process.
+
+
+
+
+=head1 FUNCTIONS
+
+=head2 new
+
+Create new normalisation object
+
+  my $n = new WebPAC::Lookup::Normalize::Something(
+ 	filter => {
+		'filter_name_1' => sub {
+			# filter code
+			return length($_);
+		}, ...
+	},
+	db => $db_obj,
+	lookup_regex => $lookup->regex,
+	lookup => $lookup_obj,
+	prefix => 'foobar',
+  );
+
+Parametar C<filter> defines user supplied snippets of perl code which can
+be use with C<filter{...}> notation.
+
+C<prefix> is used to form filename for database record (to support multiple
+source files which are joined in one database).
+
+Recommended parametar C<lookup_regex> is used to enable parsing of lookups
+in structures. If you pass this parametar, you must also pass C<lookup>
+which is C<WebPAC::Lookup> object.
+
+=cut
+
+sub new {
+	my $class = shift;
+        my $self = {@_};
+        bless($self, $class);
+
+	my $r = $self->{'lookup_regex'} ? 1 : 0;
+	my $l = $self->{'lookup'} ? 1 : 0;
+
+	my $log = $self->_get_logger();
+
+	# those two must be in pair
+	if ( ($r & $l) != ($r || $l) ) {
+		my $log = $self->_get_logger();
+		$log->logdie("lookup_regex and lookup must be in pair");
+	}
+
+	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
+
+	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
+
+	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
+
+	if (! $self->{filter} || ! $self->{filter}->{regex}) {
+		$log->debug("adding built-in filter regex");
+		$self->{filter}->{regex} = sub {
+			my ($val, $regex) = @_;
+			eval "\$val =~ $regex";
+			return $val;
+		};
+	}
+
+	$self ? return $self : return undef;
+}
+
+=head2 all_tags
+
+Returns all tags in document in specified order
+
+  my $sorted_tags = $self->all_tags();
+
+=cut
+
+sub all_tags {
+	my $self = shift;
+
+	if (! $self->{_tags_by_order}) {
+
+		my $log = $self->_get_logger;
+		# sanity check
+		$log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
+
+		my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
+		$log->debug("unsorted tags: " . join(", ", @tags));
+
+		@tags = sort { $self->_sort_by_order } @tags;
+
+		$log->debug("sorted tags: " . join(",", @tags) );
+
+		$self->{_tags_by_order} = \@tags;
+	}
+
+	return $self->{_tags_by_order};
+}
+
+
+
+=head2 data_structure
+
+Create in-memory data structure which represents normalized layout from
+C<conf/normalize/*.xml>.
+
+This structures are used to produce output.
+
+ my $ds = $webpac->data_structure($rec);
+
+=cut
+
+sub data_structure {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my $rec = shift;
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	$log->debug("data_structure rec = ", sub { Dumper($rec) });
+
+	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
+
+	my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
+
+	my $cache_file;
+
+	if ($self->{'db'}) {
+		my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
+		$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
+		return $ds if ($ds);
+		$log->debug("cache miss, creating");
+	}
+
+	my $tags = $self->all_tags();
+
+	$log->debug("tags: ",sub { join(", ",@{ $tags }) });
+
+	my $ds;
+
+	foreach my $field (@{ $tags }) {
+
+		my $row;
+
+#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
+
+		foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
+			my $format;
+
+			$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
+			$format = $tag->{'value'} || $tag->{'content'};
+
+			my @v;
+			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
+				@v = $self->_rec_to_arr($rec,$format,'fill_in');
+			} else {
+				@v = $self->_rec_to_arr($rec,$format,'parse');
+			}
+			if (! @v) {
+				$log->debug("$field <",$self->{tag},"> format: $format no values");
+				next;
+			} else {
+				$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
+			}
+
+			if ($tag->{'sort'}) {
+				@v = $self->sort_arr(@v);
+			}
+
+			# use format?
+			if ($tag->{'format_name'}) {
+				@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
+			}
+
+			# delimiter will join repeatable fields
+			if ($tag->{'delimiter'}) {
+				@v = ( join($tag->{'delimiter'}, @v) );
+			}
+
+			# default types 
+			my @types = qw(display search);
+			# override by type attribute
+			@types = ( $tag->{'type'} ) if ($tag->{'type'});
+
+			foreach my $type (@types) {
+				# append to previous line?
+				$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
+				if ($tag->{'append'}) {
+
+					# I will delimit appended part with
+					# delimiter (or ,)
+					my $d = $tag->{'delimiter'};
+					# default delimiter
+					$d ||= " ";
+
+					my $last = pop @{$row->{$type}};
+					$d = "" if (! $last);
+					$last .= $d . join($d, @v);
+					push @{$row->{$type}}, $last;
+
+				} else {
+					push @{$row->{$type}}, @v;
+				}
+			}
+
+
+		}
+
+		if ($row) {
+			$row->{'tag'} = $field;
+
+			# TODO: name_sigular, name_plural
+			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
+			my $row_name = $name ? $self->_x($name) : $field;
+
+			# post-sort all values in field
+			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
+				$log->warn("sort at field tag not implemented");
+			}
+
+			$ds->{$row_name} = $row;
+
+			$log->debug("row $field: ",sub { Dumper($row) });
+		}
+
+	}
+
+	$self->{'db'}->save_ds(
+		id => $id,
+		ds => $ds,
+		prefix => $self->{prefix},
+	) if ($self->{'db'});
+
+	$log->debug("ds: ", sub { Dumper($ds) });
+
+	$log->logconfess("data structure returned is not array any more!") if wantarray;
+
+	return $ds;
+
+}
+
+=head2 parse
+
+Perform smart parsing of string, skipping delimiters for fields which aren't
+defined. It can also eval code in format starting with C<eval{...}> and
+return output or nothing depending on eval code.
+
+ my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
+
+Filters are implemented here. While simple form of filters looks like this:
+
+  filter{name_of_filter}
+
+but, filters can also have variable number of parametars like this:
+
+  filter{name_of_filter(param,param,param)}
+
+=cut
+
+my $warn_once;
+
+sub parse {
+	my $self = shift;
+
+	my ($rec, $format_utf8, $i, $rec_size) = @_;
+
+	return if (! $format_utf8);
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	$i = 0 if (! $i);
+
+	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
+
+	my @out;
+
+	$log->debug("format: $format [$i]");
+
+	my $eval_code;
+	# remove eval{...} from beginning
+	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+	my $filter_name;
+	# remove filter{...} from beginning
+	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	# did we found any (att all) field from format in row?
+	my $found_any;
+	# prefix before first field which we preserve it $found_any
+	my $prefix;
+
+	my $f_step = 1;
+
+	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
+
+		my $del = $1 || '';
+		$prefix = $del if ($f_step == 1);
+
+		my $fld_type = lc($2);
+
+		# repeatable index
+		my $r = $i;
+		if ($fld_type eq 's') {
+			if ($found_any->{'v'}) {
+				$r = 0;
+			} else {
+				return;
+			}
+		}
+
+		my $found = 0;
+		my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
+
+		if ($found) {
+			$found_any->{$fld_type} += $found;
+
+			# we will skip delimiter before first occurence of field!
+			push @out, $del unless($found_any->{$fld_type} == 1);
+			push @out, $tmp if ($tmp);
+		}
+		$f_step++;
+	}
+
+	# test if any fields found?
+	return if (! $found_any->{'v'} && ! $found_any->{'s'});
+
+	my $out = join('',@out);
+
+	if ($out) {
+		# add rest of format (suffix)
+		$out .= $format;
+
+		# add prefix if not there
+		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+
+		$log->debug("result: $out");
+	}
+
+	if ($eval_code) {
+		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
+		$log->debug("about to eval{$eval} format: $out");
+		return if (! $self->_eval($eval));
+	}
+	
+	if ($filter_name) {
+		my @filter_args;
+		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
+			@filter_args = split(/,/, $2);
+		}
+		if ($self->{'filter'}->{$filter_name}) {
+			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
+			unshift @filter_args, $out;
+			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
+			return unless(defined($out));
+			$log->debug("filter result: $out");
+		} elsif (! $warn_once->{$filter_name}) {
+			$log->warn("trying to use undefined filter $filter_name");
+			$warn_once->{$filter_name}++;
+		}
+	}
+
+	return $out;
+}
+
+=head2 fill_in
+
+Workhourse of all: takes record from in-memory structure of database and
+strings with placeholders and returns string or array of with substituted
+values from record.
+
+ my $text = $webpac->fill_in($rec,'v250^a');
+
+Optional argument is ordinal number for repeatable fields. By default,
+it's assume to be first repeatable field (fields are perl array, so first
+element is 0).
+Following example will read second value from repeatable field.
+
+ my $text = $webpac->fill_in($rec,'Title: v250^a',1);
+
+This function B<does not> perform parsing of format to inteligenty skip
+delimiters before fields which aren't used.
+
+This method will automatically decode UTF-8 string to local code page
+if needed.
+
+There is optional parametar C<$record_size> which can be used to get sizes of
+all C<field^subfield> combinations in this format.
+
+ my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 
+
+=cut
+
+sub fill_in {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	my ($rec,$format,$i,$rec_size) = @_;
+
+	$log->logconfess("need data record") unless ($rec);
+	$log->logconfess("need format to parse") unless($format);
+
+	# iteration (for repeatable fields)
+	$i ||= 0;
+
+	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+
+	# FIXME remove for speedup?
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+
+	if (utf8::is_utf8($format)) {
+		$format = $self->_x($format);
+	}
+
+	my $found = 0;
+	my $just_single = 1;
+
+	my $eval_code;
+	# remove eval{...} from beginning
+	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+
+	my $filter_name;
+	# remove filter{...} from beginning
+	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+
+	{
+		# fix warnings
+		no warnings 'uninitialized';
+
+		# do actual replacement of placeholders
+		# repeatable fields
+		if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
+			$just_single = 0;
+		}
+
+		# non-repeatable fields
+		if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
+			return if ($i > 0 && $just_single);
+		}
+	}
+
+	if ($found) {
+		$log->debug("format: $format");
+		if ($eval_code) {
+			my $eval = $self->fill_in($rec,$eval_code,$i);
+			return if (! $self->_eval($eval));
+		}
+		if ($filter_name && $self->{'filter'}->{$filter_name}) {
+			$log->debug("filter '$filter_name' for $format");
+			$format = $self->{'filter'}->{$filter_name}->($format);
+			return unless(defined($format));
+			$log->debug("filter result: $format");
+		}
+		# do we have lookups?
+		if ($self->{'lookup'}) {
+			if ($self->{'lookup'}->can('lookup')) {
+				my @lookup = $self->{lookup}->lookup($format);
+				$log->debug("lookup $format", join(", ", @lookup));
+				return @lookup;
+			} else {
+				$log->warn("Have lookup object but can't invoke lookup method");
+			}
+		} else {
+			return $format;
+		}
+	} else {
+		return;
+	}
+}
+
+
+=head2 _rec_to_arr
+
+Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
+for fields which have lookups, so they shouldn't be parsed but rather
+C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
+
+ my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
+
+=cut
+
+sub _rec_to_arr {
+	my $self = shift;
+
+	my ($rec, $format_utf8, $code) = @_;
+
+	my $log = $self->_get_logger();
+
+	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+	return if (! $format_utf8);
+
+	$log->debug("using $code on $format_utf8");
+
+	my $i = 0;
+	my $max = 0;
+	my @arr;
+	my $rec_size = {};
+
+	while ($i <= $max) {
+		my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
+		if ($rec_size) {
+			foreach my $f (keys %{ $rec_size }) {
+				$max = $rec_size->{$f} if ($rec_size->{$f} > $max);
+			}
+			$log->debug("max set to $max");
+			undef $rec_size;
+		}
+		if (@v) {
+			push @arr, @v;
+		} else {
+			push @arr, '' if ($max > $i);
+		}
+	}
+
+	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
+
+	return @arr;
+}
+
+
+=head2 get_data
+
+Returns value from record.
+
+ my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
+
+Required arguments are:
+
+=over 8
+
+=item C<$rec>
+
+record reference
+
+=item C<$f>
+
+field
+
+=item C<$sf>
+
+optional subfield
+
+=item C<$i>
+
+index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
+
+=item C<$found>
+
+optional variable that will be incremeted if preset
+
+=item C<$rec_size>
+
+hash to hold maximum occurances of C<field^subfield> combinations
+(which can be accessed using keys in same format)
+
+=back
+
+Returns value or empty string, updates C<$found> and C<rec_size>
+if present.
+
+=cut
+
+sub get_data {
+	my $self = shift;
+
+	my ($rec,$f,$sf,$i,$found,$cache) = @_;
+
+	return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
+
+	if (defined($$cache)) {
+		$$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
+	}
+
+	return '' unless ($$rec->{$f}->[$i]);
+
+	{
+		no strict 'refs';
+		if (defined($sf)) {
+			$$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
+			return $$rec->{$f}->[$i]->{$sf};
+		} else {
+			$$found++ if (defined($$found));
+			# it still might have subfields, just
+			# not specified, so we'll dump some debug info
+			if ($$rec->{$f}->[$i] =~ /HASH/o) {
+				my $out;
+				foreach my $k (keys %{$$rec->{$f}->[$i]}) {
+					my $v = $$rec->{$f}->[$i]->{$k};
+					$out .= '$' . $k .':' . $v if ($v);
+				}
+				return $out;
+			} else {
+				return $$rec->{$f}->[$i];
+			}
+		}
+	}
+}
+
+
+=head2 apply_format
+
+Apply format specified in tag with C<format_name="name"> and
+C<format_delimiter=";;">.
+
+ my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+
+Formats can contain C<lookup{...}> if you need them.
+
+=cut
+
+sub apply_format {
+	my $self = shift;
+
+	my ($name,$delimiter,$data) = @_;
+
+	my $log = $self->_get_logger();
+
+	if (! $self->{'import_xml'}->{'format'}->{$name}) {
+		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
+		return $data;
+	}
+
+	$log->warn("no delimiter for format $name") if (! $delimiter);
+
+	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+
+	my @data = split(/\Q$delimiter\E/, $data);
+
+	my $out = sprintf($format, @data);
+	$log->debug("using format $name [$format] on $data to produce: $out");
+
+	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
+		return $self->{'lookup'}->lookup($out);
+	} else {
+		return $out;
+	}
+
+}
+
+=head2 sort_arr
+
+Sort array ignoring case and html in data
+
+ my @sorted = $webpac->sort_arr(@unsorted);
+
+=cut
+
+sub sort_arr {
+	my $self = shift;
+
+	my $log = $self->_get_logger();
+
+	# FIXME add Schwartzian Transformation?
+
+	my @sorted = sort {
+		$a =~ s#<[^>]+/*>##;
+		$b =~ s#<[^>]+/*>##;
+		lc($b) cmp lc($a)
+	} @_;
+	$log->debug("sorted values: ",sub { join(", ",@sorted) });
+
+	return @sorted;
+}
+
+
+=head1 INTERNAL METHODS
+
+=head2 _sort_by_order
+
+Sort xml tags data structure accoding to C<order=""> attribute.
+
+=cut
+
+sub _sort_by_order {
+	my $self = shift;
+
+	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
+		$self->{'import_xml'}->{'indexer'}->{$a};
+	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
+		$self->{'import_xml'}->{'indexer'}->{$b};
+
+	return $va <=> $vb;
+}
+
+=head2 _x
+
+Convert strings from C<conf/normalize/*.xml> encoding into application
+specific encoding (optinally specified using C<code_page> to C<new>
+constructor).
+
+ my $text = $n->_x('normalize text string');
+
+This is a stub so that other modules doesn't have to implement it.
+
+=cut
+
+sub _x {
+	my $self = shift;
+	return shift;
+}
+
+
+=head1 AUTHOR
+
+Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
+
+=head1 COPYRIGHT & LICENSE
+
+Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
+
+1; # End of WebPAC::Lookup::Normalize
diff --git a/lib/WebPAC/Normalize.pm b/lib/WebPAC/Normalize.pm
index 93810ab..cf78071 100644
--- a/lib/WebPAC/Normalize.pm
+++ b/lib/WebPAC/Normalize.pm
@@ -1,789 +1,342 @@
 package WebPAC::Normalize;
+use Exporter 'import';
+@EXPORT = qw/
+	set_rec set_lookup
+	get_ds clean_ds
+	tag search display
+	rec1 rec2 rec
+	regex prefix suffix surround
+	first lookup join_with
+/;
 
 use warnings;
 use strict;
-use blib;
-use WebPAC::Common;
-use base 'WebPAC::Common';
+
+#use base qw/WebPAC::Common/;
 use Data::Dumper;
 
 =head1 NAME
 
-WebPAC::Normalize - data mungling for normalisation
+WebPAC::Normalize - describe normalisaton rules using sets
 
 =head1 VERSION
 
-Version 0.09
+Version 0.04
 
 =cut
 
-our $VERSION = '0.09';
+our $VERSION = '0.04';
 
 =head1 SYNOPSIS
 
-This package contains code that mungle data to produce normalized format.
-
-It contains several assumptions:
-
-=over
-
-=item *
-
-format of fields is defined using C<v123^a> notation for repeatable fields
-or C<s123^a> for single (or first) value, where C<123> is field number and
-C<a> is subfield.
-
-=item *
-
-source data records (C<$rec>) have unique identifiers in field C<000>
-
-=item *
-
-optional C<eval{length('v123^a') == 3}> tag at B<beginning of format> will be
-perl code that is evaluated before producing output (value of field will be
-interpolated before that)
-
-=item *
-
-optional C<filter{filter_name}> at B<begining of format> will apply perl
-code defined as code ref on format after field substitution to producing
-output
-
-There is one built-in filter called C<regex> which can be use like this:
-
-  filter{regex(s/foo/bar/)}
-
-=item *
-
-optional C<lookup{...}> will be then performed. See C<WebPAC::Lookups>.
-
-=item *
-
-at end, optional C<format>s rules are resolved. Format rules are similar to
-C<sprintf> and can also contain C<lookup{...}> which is performed after
-values are inserted in format.
-
-=back
-
-This also describes order in which transformations are applied (eval,
-filter, lookup, format) which is important to undestand when deciding how to
-solve your data mungling and normalisation process.
-
+This module uses C<conf/normalize/*.pl> files to perform normalisation
+from input records using perl functions which are specialized for set
+processing.
 
+Sets are implemented as arrays, and normalisation file is valid perl, which
+means that you check it's validity before running WebPAC using
+C<perl -c normalize.pl>.
 
+Normalisation can generate multiple output normalized data. For now, supported output
+types (on the left side of definition) are: C<tag>, C<display> and C<search>.
 
 =head1 FUNCTIONS
 
-=head2 new
-
-Create new normalisation object
-
-  my $n = new WebPAC::Normalize::Something(
- 	filter => {
-		'filter_name_1' => sub {
-			# filter code
-			return length($_);
-		}, ...
-	},
-	db => $db_obj,
-	lookup_regex => $lookup->regex,
-	lookup => $lookup_obj,
-	prefix => 'foobar',
-  );
+=head2 data_structure
 
-Parametar C<filter> defines user supplied snippets of perl code which can
-be use with C<filter{...}> notation.
+Return data structure
 
-C<prefix> is used to form filename for database record (to support multiple
-source files which are joined in one database).
+  my $ds = WebPAC::Normalize(
+  	lookup => $lookup->lookup_hash,
+	row => $row,
+	rules => $normalize_pl_config,
+  );
 
-Recommended parametar C<lookup_regex> is used to enable parsing of lookups
-in structures. If you pass this parametar, you must also pass C<lookup>
-which is C<WebPAC::Lookup> object.
+This function will B<die> if normalizastion can't be evaled.
 
 =cut
 
-sub new {
-	my $class = shift;
-        my $self = {@_};
-        bless($self, $class);
-
-	my $r = $self->{'lookup_regex'} ? 1 : 0;
-	my $l = $self->{'lookup'} ? 1 : 0;
-
-	my $log = $self->_get_logger();
-
-	# those two must be in pair
-	if ( ($r & $l) != ($r || $l) ) {
-		my $log = $self->_get_logger();
-		$log->logdie("lookup_regex and lookup must be in pair");
-	}
-
-	$log->logdie("lookup must be WebPAC::Lookup object") if ($self->{'lookup'} && ! $self->{'lookup'}->isa('WebPAC::Lookup'));
-
-	$log->warn("no prefix defined. please check that!") unless ($self->{'prefix'});
-
-	$log->debug("using lookup regex: ", $self->{lookup_regex}) if ($r && $l);
-
-	if (! $self->{filter} || ! $self->{filter}->{regex}) {
-		$log->debug("adding built-in filter regex");
-		$self->{filter}->{regex} = sub {
-			my ($val, $regex) = @_;
-			eval "\$val =~ $regex";
-			return $val;
-		};
-	}
-
-	$self ? return $self : return undef;
+sub data_structure {
+	my $arg = {@_};
+
+	die "need row argument" unless ($arg->{row});
+	die "need normalisation argument" unless ($arg->{rules});
+
+	no strict 'subs';
+	set_lookup( $arg->{lookup} );
+	set_rec( $arg->{row} );
+	clean_ds();
+	eval "$arg->{rules}";
+	die "error evaling $arg->{rules}: $@\n" if ($@);
+	return get_ds();
 }
 
-=head2 all_tags
+=head2 set_rec
 
-Returns all tags in document in specified order
+Set current record hash
 
-  my $sorted_tags = $self->all_tags();
+  set_rec( $rec );
 
 =cut
 
-sub all_tags {
-	my $self = shift;
-
-	if (! $self->{_tags_by_order}) {
-
-		my $log = $self->_get_logger;
-		# sanity check
-		$log->logdie("can't find self->{inport_xml}->{indexer}") unless ($self->{import_xml}->{indexer});
-
-		my @tags = keys %{ $self->{'import_xml'}->{'indexer'}};
-		$log->debug("unsorted tags: " . join(", ", @tags));
+my $rec;
 
-		@tags = sort { $self->_sort_by_order } @tags;
-
-		$log->debug("sorted tags: " . join(",", @tags) );
-
-		$self->{_tags_by_order} = \@tags;
-	}
-
-	return $self->{_tags_by_order};
+sub set_rec {
+	$rec = shift or die "no record hash";
 }
 
+=head2 tag
 
+Define new tag for I<search> and I<display>.
 
-=head2 data_structure
-
-Create in-memory data structure which represents normalized layout from
-C<conf/normalize/*.xml>.
-
-This structures are used to produce output.
+  tag('Title', rec('200','a') );
 
- my $ds = $webpac->data_structure($rec);
 
 =cut
 
-sub data_structure {
-	my $self = shift;
-
-	my $log = $self->_get_logger();
-
-	my $rec = shift;
-	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
-	$log->debug("data_structure rec = ", sub { Dumper($rec) });
-
-	$log->logdie("need unique ID (mfn) in field 000 of record " . Dumper($rec) ) unless (defined($rec->{'000'}));
-
-	my $id = $rec->{'000'}->[0] || $log->logdie("field 000 isn't array!");
-
-	my $cache_file;
-
-	if ($self->{'db'}) {
-		my $ds = $self->{'db'}->load_ds( id => $id, prefix => $self->{prefix} );
-		$log->debug("load_ds( rec = ", sub { Dumper($rec) }, ") = ", sub { Dumper($ds) });
-		return $ds if ($ds);
-		$log->debug("cache miss, creating");
-	}
-
-	my $tags = $self->all_tags();
-
-	$log->debug("tags: ",sub { join(", ",@{ $tags }) });
-
-	my $ds;
-
-	foreach my $field (@{ $tags }) {
-
-		my $row;
-
-#print "field $field [",$self->{'tag'},"] = ",Dumper($self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}});
-
-		foreach my $tag (@{$self->{'import_xml'}->{'indexer'}->{$field}->{$self->{'tag'}}}) {
-			my $format;
-
-			$log->logdie("expected tag HASH and got $tag") unless (ref($tag) eq 'HASH');
-			$format = $tag->{'value'} || $tag->{'content'};
-
-			my @v;
-			if ($self->{'lookup_regex'} && $format =~ $self->{'lookup_regex'}) {
-				@v = $self->_rec_to_arr($rec,$format,'fill_in');
-			} else {
-				@v = $self->_rec_to_arr($rec,$format,'parse');
-			}
-			if (! @v) {
-				$log->debug("$field <",$self->{tag},"> format: $format no values");
-				next;
-			} else {
-				$log->debug("$field <",$self->{tag},"> format: $format values: ", join(",", @v));
-			}
-
-			if ($tag->{'sort'}) {
-				@v = $self->sort_arr(@v);
-			}
-
-			# use format?
-			if ($tag->{'format_name'}) {
-				@v = map { $self->apply_format($tag->{'format_name'},$tag->{'format_delimiter'},$_) } @v;
-			}
-
-			# delimiter will join repeatable fields
-			if ($tag->{'delimiter'}) {
-				@v = ( join($tag->{'delimiter'}, @v) );
-			}
-
-			# default types 
-			my @types = qw(display search);
-			# override by type attribute
-			@types = ( $tag->{'type'} ) if ($tag->{'type'});
-
-			foreach my $type (@types) {
-				# append to previous line?
-				$log->debug("tag $field / $type [",sub { join(",",@v) }, "] ", $row->{'append'} || 'no append');
-				if ($tag->{'append'}) {
-
-					# I will delimit appended part with
-					# delimiter (or ,)
-					my $d = $tag->{'delimiter'};
-					# default delimiter
-					$d ||= " ";
-
-					my $last = pop @{$row->{$type}};
-					$d = "" if (! $last);
-					$last .= $d . join($d, @v);
-					push @{$row->{$type}}, $last;
-
-				} else {
-					push @{$row->{$type}}, @v;
-				}
-			}
-
-
-		}
-
-		if ($row) {
-			$row->{'tag'} = $field;
-
-			# TODO: name_sigular, name_plural
-			my $name = $self->{'import_xml'}->{'indexer'}->{$field}->{'name'};
-			my $row_name = $name ? $self->_x($name) : $field;
-
-			# post-sort all values in field
-			if ($self->{'import_xml'}->{'indexer'}->{$field}->{'sort'}) {
-				$log->warn("sort at field tag not implemented");
-			}
-
-			$ds->{$row_name} = $row;
-
-			$log->debug("row $field: ",sub { Dumper($row) });
-		}
-
-	}
-
-	$self->{'db'}->save_ds(
-		id => $id,
-		ds => $ds,
-		prefix => $self->{prefix},
-	) if ($self->{'db'});
-
-	$log->debug("ds: ", sub { Dumper($ds) });
-
-	$log->logconfess("data structure returned is not array any more!") if wantarray;
-
-	return $ds;
+my $out;
 
+sub tag {
+	my $name = shift or die "tag needs name as first argument";
+	my @o = grep { defined($_) && $_ ne '' } @_;
+	return unless (@o);
+	$out->{$name}->{tag} = $name;
+	$out->{$name}->{search} = \@o;
+	$out->{$name}->{display} = \@o;
 }
 
-=head2 parse
-
-Perform smart parsing of string, skipping delimiters for fields which aren't
-defined. It can also eval code in format starting with C<eval{...}> and
-return output or nothing depending on eval code.
-
- my $text = $webpac->parse($rec,'eval{"v901^a" eq "Deskriptor"}descriptor: v250^a', $i);
-
-Filters are implemented here. While simple form of filters looks like this:
+=head2 display
 
-  filter{name_of_filter}
+Define tag just for I<display>
 
-but, filters can also have variable number of parametars like this:
-
-  filter{name_of_filter(param,param,param)}
+  @v = display('Title', rec('200','a') );
 
 =cut
 
-my $warn_once;
-
-sub parse {
-	my $self = shift;
-
-	my ($rec, $format_utf8, $i, $rec_size) = @_;
-
-	return if (! $format_utf8);
-
-	my $log = $self->_get_logger();
-
-	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-
-	$i = 0 if (! $i);
-
-	my $format = $self->_x($format_utf8) || $log->logconfess("can't convert '$format_utf8' from UTF-8 to ",$self->{'code_page'});
-
-	my @out;
-
-	$log->debug("format: $format [$i]");
-
-	my $eval_code;
-	# remove eval{...} from beginning
-	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
-
-	my $filter_name;
-	# remove filter{...} from beginning
-	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
-
-	# did we found any (att all) field from format in row?
-	my $found_any;
-	# prefix before first field which we preserve it $found_any
-	my $prefix;
-
-	my $f_step = 1;
-
-	while ($format =~ s/^(.*?)(v|s)(\d+)(?:\^(\w))?//s) {
-
-		my $del = $1 || '';
-		$prefix = $del if ($f_step == 1);
-
-		my $fld_type = lc($2);
-
-		# repeatable index
-		my $r = $i;
-		if ($fld_type eq 's') {
-			if ($found_any->{'v'}) {
-				$r = 0;
-			} else {
-				return;
-			}
-		}
+sub display {
+	my $name = shift or die "display needs name as first argument";
+	my @o = grep { defined($_) && $_ ne '' } @_;
+	return unless (@o);
+	$out->{$name}->{tag} = $name;
+	$out->{$name}->{display} = \@o;
+}
 
-		my $found = 0;
-		my $tmp = $self->get_data(\$rec,$3,$4,$r,\$found,$rec_size);
+=head2 search
 
-		if ($found) {
-			$found_any->{$fld_type} += $found;
+Prepare values just for I<search>
 
-			# we will skip delimiter before first occurence of field!
-			push @out, $del unless($found_any->{$fld_type} == 1);
-			push @out, $tmp if ($tmp);
-		}
-		$f_step++;
-	}
+  @v = search('Title', rec('200','a') );
 
-	# test if any fields found?
-	return if (! $found_any->{'v'} && ! $found_any->{'s'});
+=cut
 
-	my $out = join('',@out);
+sub search {
+	my $name = shift or die "search needs name as first argument";
+	my @o = grep { defined($_) && $_ ne '' } @_;
+	return unless (@o);
+	$out->{$name}->{tag} = $name;
+	$out->{$name}->{search} = \@o;
+}
 
-	if ($out) {
-		# add rest of format (suffix)
-		$out .= $format;
+=head2 get_ds
 
-		# add prefix if not there
-		$out = $prefix . $out if ($out !~ m/^\Q$prefix\E/);
+Return hash formatted as data structure
 
-		$log->debug("result: $out");
-	}
+  my $ds = get_ds();
 
-	if ($eval_code) {
-		my $eval = $self->fill_in($rec,$eval_code,$i) || return;
-		$log->debug("about to eval{$eval} format: $out");
-		return if (! $self->_eval($eval));
-	}
-	
-	if ($filter_name) {
-		my @filter_args;
-		if ($filter_name =~ s/(\w+)\((.*)\)/$1/) {
-			@filter_args = split(/,/, $2);
-		}
-		if ($self->{'filter'}->{$filter_name}) {
-			$log->debug("about to filter{$filter_name} format: $out with arguments: ", join(",", @filter_args));
-			unshift @filter_args, $out;
-			$out = $self->{'filter'}->{$filter_name}->(@filter_args);
-			return unless(defined($out));
-			$log->debug("filter result: $out");
-		} elsif (! $warn_once->{$filter_name}) {
-			$log->warn("trying to use undefined filter $filter_name");
-			$warn_once->{$filter_name}++;
-		}
-	}
+=cut
 
+sub get_ds {
 	return $out;
 }
 
-=head2 fill_in
-
-Workhourse of all: takes record from in-memory structure of database and
-strings with placeholders and returns string or array of with substituted
-values from record.
+=head2 clean_ds
 
- my $text = $webpac->fill_in($rec,'v250^a');
+Clean data structure hash for next record
 
-Optional argument is ordinal number for repeatable fields. By default,
-it's assume to be first repeatable field (fields are perl array, so first
-element is 0).
-Following example will read second value from repeatable field.
-
- my $text = $webpac->fill_in($rec,'Title: v250^a',1);
-
-This function B<does not> perform parsing of format to inteligenty skip
-delimiters before fields which aren't used.
-
-This method will automatically decode UTF-8 string to local code page
-if needed.
-
-There is optional parametar C<$record_size> which can be used to get sizes of
-all C<field^subfield> combinations in this format.
-
- my $text = $webpac->fill_in($rec,'got: v900^a v900^x',0,\$rec_size); 
+  clean_ds();
 
 =cut
 
-sub fill_in {
-	my $self = shift;
-
-	my $log = $self->_get_logger();
-
-	my ($rec,$format,$i,$rec_size) = @_;
+sub clean_ds {
+	$out = undef;
+}
 
-	$log->logconfess("need data record") unless ($rec);
-	$log->logconfess("need format to parse") unless($format);
+=head2 set_lookup
 
-	# iteration (for repeatable fields)
-	$i ||= 0;
+Set current lookup hash
 
-	$log->logdie("infitite loop in format $format") if ($i > ($self->{'max_mfn'} || 9999));
+  set_lookup( $lookup );
 
-	# FIXME remove for speedup?
-	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
+=cut
 
-	if (utf8::is_utf8($format)) {
-		$format = $self->_x($format);
-	}
+my $lookup;
 
-	my $found = 0;
-	my $just_single = 1;
+sub set_lookup {
+	$lookup = shift;
+}
 
-	my $eval_code;
-	# remove eval{...} from beginning
-	$eval_code = $1 if ($format =~ s/^eval{([^}]+)}//s);
+=head2 rec1
 
-	my $filter_name;
-	# remove filter{...} from beginning
-	$filter_name = $1 if ($format =~ s/^filter{([^}]+)}//s);
+Return all values in some field
 
-	{
-		# fix warnings
-		no warnings 'uninitialized';
+  @v = rec1('200')
 
-		# do actual replacement of placeholders
-		# repeatable fields
-		if ($format =~ s/v(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,$i,\$found,$rec_size)/ges) {
-			$just_single = 0;
-		}
+TODO: order of values is probably same as in source data, need to investigate that
 
-		# non-repeatable fields
-		if ($format =~ s/s(\d+)(?:\^(\w))?/$self->get_data(\$rec,$1,$2,0,\$found,$rec_size)/ges) {
-			return if ($i > 0 && $just_single);
-		}
-	}
+=cut
 
-	if ($found) {
-		$log->debug("format: $format");
-		if ($eval_code) {
-			my $eval = $self->fill_in($rec,$eval_code,$i);
-			return if (! $self->_eval($eval));
-		}
-		if ($filter_name && $self->{'filter'}->{$filter_name}) {
-			$log->debug("filter '$filter_name' for $format");
-			$format = $self->{'filter'}->{$filter_name}->($format);
-			return unless(defined($format));
-			$log->debug("filter result: $format");
-		}
-		# do we have lookups?
-		if ($self->{'lookup'}) {
-			if ($self->{'lookup'}->can('lookup')) {
-				my @lookup = $self->{lookup}->lookup($format);
-				$log->debug("lookup $format", join(", ", @lookup));
-				return @lookup;
+sub rec1 {
+	my $f = shift;
+	return unless (defined($rec) && defined($rec->{$f}));
+	if (ref($rec->{$f}) eq 'ARRAY') {
+		return map { 
+			if (ref($_) eq 'HASH') {
+				values %{$_};
 			} else {
-				$log->warn("Have lookup object but can't invoke lookup method");
+				$_;
 			}
-		} else {
-			return $format;
-		}
-	} else {
-		return;
+		} @{ $rec->{$f} };
+	} elsif( defined($rec->{$f}) ) {
+		return $rec->{$f};
 	}
 }
 
+=head2 rec2
 
-=head2 _rec_to_arr
-
-Similar to C<parse> and C<fill_in>, but returns array of all repeatable fields. Usable
-for fields which have lookups, so they shouldn't be parsed but rather
-C<paste>d or C<fill_id>ed. Last argument is name of operation: C<paste> or C<fill_in>.
+Return all values in specific field and subfield
 
- my @arr = $webpac->fill_in_to_arr($rec,'[v900];;[v250^a]','paste');
+  @v = rec2('200','a')
 
 =cut
 
-sub _rec_to_arr {
-	my $self = shift;
-
-	my ($rec, $format_utf8, $code) = @_;
+sub rec2 {
+	my $f = shift;
+	return unless (defined($rec && $rec->{$f}));
+	my $sf = shift;
+	return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
+}
 
-	my $log = $self->_get_logger();
+=head2 rec
 
-	$log->logconfess("need HASH as first argument!") if ($rec !~ /HASH/o);
-	return if (! $format_utf8);
+syntaxtic sugar for
 
-	$log->debug("using $code on $format_utf8");
+  @v = rec('200')
+  @v = rec('200','a')
 
-	my $i = 0;
-	my $max = 0;
-	my @arr;
-	my $rec_size = {};
+=cut
 
-	while ($i <= $max) {
-		my @v = $self->$code($rec,$format_utf8,$i++,\$rec_size);
-		if ($rec_size) {
-			foreach my $f (keys %{ $rec_size }) {
-				$max = $rec_size->{$f} if ($rec_size->{$f} > $max);
-			}
-			$log->debug("max set to $max");
-			undef $rec_size;
-		}
-		if (@v) {
-			push @arr, @v;
-		} else {
-			push @arr, '' if ($max > $i);
-		}
+sub rec {
+	if ($#_ == 0) {
+		return rec1(@_);
+	} elsif ($#_ == 1) {
+		return rec2(@_);
 	}
-
-	$log->debug("format '$format_utf8' returned ",--$i," elements: ", sub { join(" | ",@arr) }) if (@arr);
-
-	return @arr;
 }
 
+=head2 regex
 
-=head2 get_data
-
-Returns value from record.
-
- my $text = $self->get_data(\$rec,$f,$sf,$i,\$found,\$rec_size);
-
-Required arguments are:
+Apply regex to some or all values
 
-=over 8
-
-=item C<$rec>
-
-record reference
-
-=item C<$f>
-
-field
-
-=item C<$sf>
-
-optional subfield
-
-=item C<$i>
-
-index offset for repeatable values ( 0 ... $rec_size->{'400^a'} )
-
-=item C<$found>
-
-optional variable that will be incremeted if preset
-
-=item C<$rec_size>
-
-hash to hold maximum occurances of C<field^subfield> combinations
-(which can be accessed using keys in same format)
-
-=back
-
-Returns value or empty string, updates C<$found> and C<rec_size>
-if present.
+  @v = regex( 's/foo/bar/g', @v );
 
 =cut
 
-sub get_data {
-	my $self = shift;
-
-	my ($rec,$f,$sf,$i,$found,$cache) = @_;
-
-	return '' unless ($$rec->{$f} && ref($$rec->{$f}) eq 'ARRAY');
-
-	if (defined($$cache)) {
-		$$cache->{ $f . ( $sf ? '^' . $sf : '' ) } ||= scalar @{ $$rec->{$f} };
-	}
-
-	return '' unless ($$rec->{$f}->[$i]);
-
-	{
-		no strict 'refs';
-		if (defined($sf)) {
-			$$found++ if (defined($$found) && $$rec->{$f}->[$i]->{$sf});
-			return $$rec->{$f}->[$i]->{$sf};
-		} else {
-			$$found++ if (defined($$found));
-			# it still might have subfields, just
-			# not specified, so we'll dump some debug info
-			if ($$rec->{$f}->[$i] =~ /HASH/o) {
-				my $out;
-				foreach my $k (keys %{$$rec->{$f}->[$i]}) {
-					my $v = $$rec->{$f}->[$i]->{$k};
-					$out .= '$' . $k .':' . $v if ($v);
-				}
-				return $out;
-			} else {
-				return $$rec->{$f}->[$i];
-			}
-		}
+sub regex {
+	my $r = shift;
+	my @out;
+	#warn "r: $r\n",Dumper(\@_);
+	foreach my $t (@_) {
+		next unless ($t);
+		eval "\$t =~ $r";
+		push @out, $t if ($t && $t ne '');
 	}
+	return @out;
 }
 
+=head2 prefix
 
-=head2 apply_format
-
-Apply format specified in tag with C<format_name="name"> and
-C<format_delimiter=";;">.
-
- my $text = $webpac->apply_format($format_name,$format_delimiter,$data);
+Prefix all values with a string
 
-Formats can contain C<lookup{...}> if you need them.
+  @v = prefix( 'my_', @v );
 
 =cut
 
-sub apply_format {
-	my $self = shift;
-
-	my ($name,$delimiter,$data) = @_;
-
-	my $log = $self->_get_logger();
-
-	if (! $self->{'import_xml'}->{'format'}->{$name}) {
-		$log->warn("<format name=\"$name\"> is not defined in ",$self->{'import_xml_file'});
-		return $data;
-	}
-
-	$log->warn("no delimiter for format $name") if (! $delimiter);
+sub prefix {
+	my $p = shift or die "prefix needs string as first argument";
+	return map { $p . $_ } grep { defined($_) } @_;
+}
 
-	my $format = $self->_x($self->{'import_xml'}->{'format'}->{$name}->{'content'}) || $log->logdie("can't find format '$name'");
+=head2 suffix
 
-	my @data = split(/\Q$delimiter\E/, $data);
+suffix all values with a string
 
-	my $out = sprintf($format, @data);
-	$log->debug("using format $name [$format] on $data to produce: $out");
+  @v = suffix( '_my', @v );
 
-	if ($self->{'lookup_regex'} && $out =~ $self->{'lookup_regex'}) {
-		return $self->{'lookup'}->lookup($out);
-	} else {
-		return $out;
-	}
+=cut
 
+sub suffix {
+	my $s = shift or die "suffix needs string as first argument";
+	return map { $_ . $s } grep { defined($_) } @_;
 }
 
-=head2 sort_arr
+=head2 surround
 
-Sort array ignoring case and html in data
+surround all values with a two strings
 
- my @sorted = $webpac->sort_arr(@unsorted);
+  @v = surround( 'prefix_', '_suffix', @v );
 
 =cut
 
-sub sort_arr {
-	my $self = shift;
-
-	my $log = $self->_get_logger();
-
-	# FIXME add Schwartzian Transformation?
-
-	my @sorted = sort {
-		$a =~ s#<[^>]+/*>##;
-		$b =~ s#<[^>]+/*>##;
-		lc($b) cmp lc($a)
-	} @_;
-	$log->debug("sorted values: ",sub { join(", ",@sorted) });
-
-	return @sorted;
+sub surround {
+	my $p = shift or die "surround need prefix as first argument";
+	my $s = shift or die "surround needs suffix as second argument";
+	return map { $p . $_ . $s } grep { defined($_) } @_;
 }
 
+=head2 first
 
-=head1 INTERNAL METHODS
-
-=head2 _sort_by_order
+Return first element
 
-Sort xml tags data structure accoding to C<order=""> attribute.
+  $v = first( @v );
 
 =cut
 
-sub _sort_by_order {
-	my $self = shift;
-
-	my $va = $self->{'import_xml'}->{'indexer'}->{$a}->{'order'} ||
-		$self->{'import_xml'}->{'indexer'}->{$a};
-	my $vb = $self->{'import_xml'}->{'indexer'}->{$b}->{'order'} ||
-		$self->{'import_xml'}->{'indexer'}->{$b};
-
-	return $va <=> $vb;
+sub first {
+	my $r = shift;
+	return $r;
 }
 
-=head2 _x
-
-Convert strings from C<conf/normalize/*.xml> encoding into application
-specific encoding (optinally specified using C<code_page> to C<new>
-constructor).
+=head2 lookup
 
- my $text = $n->_x('normalize text string');
+Consult lookup hashes for some value
 
-This is a stub so that other modules doesn't have to implement it.
+  @v = lookup( $v );
+  @v = lookup( @v );
 
 =cut
 
-sub _x {
-	my $self = shift;
-	return shift;
+sub lookup {
+	my $k = shift or return;
+	return unless (defined($lookup->{$k}));
+	if (ref($lookup->{$k}) eq 'ARRAY') {
+		return @{ $lookup->{$k} };
+	} else {
+		return $lookup->{$k};
+	}
 }
 
+=head2 join_with
 
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 COPYRIGHT & LICENSE
+Joins walues with some delimiter
 
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
+  $v = join_with(", ", @v);
 
 =cut
 
-1; # End of WebPAC::Normalize
+sub join_with {
+	my $d = shift;
+	return join($d, grep { defined($_) && $_ ne '' } @_);
+}
+
+# END
+1;
diff --git a/lib/WebPAC/Normalize/Set.pm b/lib/WebPAC/Normalize/Set.pm
deleted file mode 100644
index 0b8a17e..0000000
--- a/lib/WebPAC/Normalize/Set.pm
+++ /dev/null
@@ -1,342 +0,0 @@
-package WebPAC::Normalize::Set;
-use Exporter 'import';
-@EXPORT = qw/
-	set_rec set_lookup
-	get_ds clean_ds
-	tag search display
-	rec1 rec2 rec
-	regex prefix suffix surround
-	first lookup join_with
-/;
-
-use warnings;
-use strict;
-
-#use base qw/WebPAC::Common/;
-use Data::Dumper;
-
-=head1 NAME
-
-WebPAC::Normalize::Set - describe normalisaton rules using sets
-
-=head1 VERSION
-
-Version 0.04
-
-=cut
-
-our $VERSION = '0.04';
-
-=head1 SYNOPSIS
-
-This module uses C<conf/normalize/*.pl> files to perform normalisation
-from input records using perl functions which are specialized for set
-processing.
-
-Sets are implemented as arrays, and normalisation file is valid perl, which
-means that you check it's validity before running WebPAC using
-C<perl -c normalize.pl>.
-
-Normalisation can generate multiple output normalized data. For now, supported output
-types (on the left side of definition) are: C<tag>, C<display> and C<search>.
-
-=head1 FUNCTIONS
-
-=head2 data_structure
-
-Return data structure
-
-  my $ds = WebPAC::Normalize::Set(
-  	lookup => $lookup->lookup_hash,
-	row => $row,
-	rules => $normalize_pl_config,
-  );
-
-This function will B<die> if normalizastion can't be evaled.
-
-=cut
-
-sub data_structure {
-	my $arg = {@_};
-
-	die "need row argument" unless ($arg->{row});
-	die "need normalisation argument" unless ($arg->{rules});
-
-	no strict 'subs';
-	set_lookup( $arg->{lookup} );
-	set_rec( $arg->{row} );
-	clean_ds();
-	eval "$arg->{rules}";
-	die "error evaling $arg->{rules}: $@\n" if ($@);
-	return get_ds();
-}
-
-=head2 set_rec
-
-Set current record hash
-
-  set_rec( $rec );
-
-=cut
-
-my $rec;
-
-sub set_rec {
-	$rec = shift or die "no record hash";
-}
-
-=head2 tag
-
-Define new tag for I<search> and I<display>.
-
-  tag('Title', rec('200','a') );
-
-
-=cut
-
-my $out;
-
-sub tag {
-	my $name = shift or die "tag needs name as first argument";
-	my @o = grep { defined($_) && $_ ne '' } @_;
-	return unless (@o);
-	$out->{$name}->{tag} = $name;
-	$out->{$name}->{search} = \@o;
-	$out->{$name}->{display} = \@o;
-}
-
-=head2 display
-
-Define tag just for I<display>
-
-  @v = display('Title', rec('200','a') );
-
-=cut
-
-sub display {
-	my $name = shift or die "display needs name as first argument";
-	my @o = grep { defined($_) && $_ ne '' } @_;
-	return unless (@o);
-	$out->{$name}->{tag} = $name;
-	$out->{$name}->{display} = \@o;
-}
-
-=head2 search
-
-Prepare values just for I<search>
-
-  @v = search('Title', rec('200','a') );
-
-=cut
-
-sub search {
-	my $name = shift or die "search needs name as first argument";
-	my @o = grep { defined($_) && $_ ne '' } @_;
-	return unless (@o);
-	$out->{$name}->{tag} = $name;
-	$out->{$name}->{search} = \@o;
-}
-
-=head2 get_ds
-
-Return hash formatted as data structure
-
-  my $ds = get_ds();
-
-=cut
-
-sub get_ds {
-	return $out;
-}
-
-=head2 clean_ds
-
-Clean data structure hash for next record
-
-  clean_ds();
-
-=cut
-
-sub clean_ds {
-	$out = undef;
-}
-
-=head2 set_lookup
-
-Set current lookup hash
-
-  set_lookup( $lookup );
-
-=cut
-
-my $lookup;
-
-sub set_lookup {
-	$lookup = shift;
-}
-
-=head2 rec1
-
-Return all values in some field
-
-  @v = rec1('200')
-
-TODO: order of values is probably same as in source data, need to investigate that
-
-=cut
-
-sub rec1 {
-	my $f = shift;
-	return unless (defined($rec) && defined($rec->{$f}));
-	if (ref($rec->{$f}) eq 'ARRAY') {
-		return map { 
-			if (ref($_) eq 'HASH') {
-				values %{$_};
-			} else {
-				$_;
-			}
-		} @{ $rec->{$f} };
-	} elsif( defined($rec->{$f}) ) {
-		return $rec->{$f};
-	}
-}
-
-=head2 rec2
-
-Return all values in specific field and subfield
-
-  @v = rec2('200','a')
-
-=cut
-
-sub rec2 {
-	my $f = shift;
-	return unless (defined($rec && $rec->{$f}));
-	my $sf = shift;
-	return map { $_->{$sf} } grep { ref($_) eq 'HASH' && $_->{$sf} } @{ $rec->{$f} };
-}
-
-=head2 rec
-
-syntaxtic sugar for
-
-  @v = rec('200')
-  @v = rec('200','a')
-
-=cut
-
-sub rec {
-	if ($#_ == 0) {
-		return rec1(@_);
-	} elsif ($#_ == 1) {
-		return rec2(@_);
-	}
-}
-
-=head2 regex
-
-Apply regex to some or all values
-
-  @v = regex( 's/foo/bar/g', @v );
-
-=cut
-
-sub regex {
-	my $r = shift;
-	my @out;
-	#warn "r: $r\n",Dumper(\@_);
-	foreach my $t (@_) {
-		next unless ($t);
-		eval "\$t =~ $r";
-		push @out, $t if ($t && $t ne '');
-	}
-	return @out;
-}
-
-=head2 prefix
-
-Prefix all values with a string
-
-  @v = prefix( 'my_', @v );
-
-=cut
-
-sub prefix {
-	my $p = shift or die "prefix needs string as first argument";
-	return map { $p . $_ } grep { defined($_) } @_;
-}
-
-=head2 suffix
-
-suffix all values with a string
-
-  @v = suffix( '_my', @v );
-
-=cut
-
-sub suffix {
-	my $s = shift or die "suffix needs string as first argument";
-	return map { $_ . $s } grep { defined($_) } @_;
-}
-
-=head2 surround
-
-surround all values with a two strings
-
-  @v = surround( 'prefix_', '_suffix', @v );
-
-=cut
-
-sub surround {
-	my $p = shift or die "surround need prefix as first argument";
-	my $s = shift or die "surround needs suffix as second argument";
-	return map { $p . $_ . $s } grep { defined($_) } @_;
-}
-
-=head2 first
-
-Return first element
-
-  $v = first( @v );
-
-=cut
-
-sub first {
-	my $r = shift;
-	return $r;
-}
-
-=head2 lookup
-
-Consult lookup hashes for some value
-
-  @v = lookup( $v );
-  @v = lookup( @v );
-
-=cut
-
-sub lookup {
-	my $k = shift or return;
-	return unless (defined($lookup->{$k}));
-	if (ref($lookup->{$k}) eq 'ARRAY') {
-		return @{ $lookup->{$k} };
-	} else {
-		return $lookup->{$k};
-	}
-}
-
-=head2 join_with
-
-Joins walues with some delimiter
-
-  $v = join_with(", ", @v);
-
-=cut
-
-sub join_with {
-	my $d = shift;
-	return join($d, grep { defined($_) && $_ ne '' } @_);
-}
-
-# END
-1;
diff --git a/lib/WebPAC/Normalize/XML.pm b/lib/WebPAC/Normalize/XML.pm
deleted file mode 100644
index 7e829b5..0000000
--- a/lib/WebPAC/Normalize/XML.pm
+++ /dev/null
@@ -1,164 +0,0 @@
-package WebPAC::Normalize::XML;
-
-use warnings;
-use strict;
-
-use base qw/WebPAC::Common WebPAC::Normalize/;
-use XML::Simple;
-use Data::Dumper;
-use Text::Iconv;
-use YAML qw/Dump LoadFile/;
-
-=head1 NAME
-
-WebPAC::Normalize::XML - apply XML or YAML normalisaton rules
-
-=head1 VERSION
-
-Version 0.03
-
-=cut
-
-our $VERSION = '0.03';
-
-=head1 SYNOPSIS
-
-This module uses C<conf/normalize/*.xml> files to perform normalisation
-from input records
-
-=cut
-
-=head1 FUNCTIONS
-
-=head2 open
-
-Read normalisation rules defined using XML from C<conf/normalize/*.xml> and
-parse it.
-
- my $n = new WebPAC::Normalize::XML;
- $n->open(
- 	tag => 'isis',
- 	xml_file => '/path/to/conf/normalize/isis.xml',
- );
-
-C<tag> defines tag to use within C<xml_file>
-
-C<xml_file> defines path to normalize XML
-
-C<tags> define additional tags that can be forced (and an be array).
-
-=cut
-
-sub open {
-	my $self = shift;
-
-	my $arg = {@_};
-
-	my $log = $self->_get_logger();
-
-	foreach my $req (qw/tag xml_file/) {
-		$log->logconfess("need argument $req") unless $arg->{$req};
-	}
-
-	$self->{'tag'} = $arg->{'tag'};
-	my $xml_file = $arg->{'xml_file'};
-
-	$log->info("using $xml_file tag <",$self->{'tag'},">");
-
-	$log->logdie("normalisation xml file '$xml_file' doesn't exist!") if (! -e $xml_file);
-
-	$self->{'import_xml_file'} = $xml_file;
-
-	my @force_array = [ $self->{'tag'}, 'config', 'format' ];
-	push @force_array, $self->{'tags'} if ($self->{'tags'});
-
-	$self->{'import_xml'} = XMLin($xml_file,
-		ForceArray => @force_array,
-		ForceContent => 1,
-	);
-
-	$log->debug("import xml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled");
-
-	#print STDERR Dump($self->{import_xml});
-
-	return $self;
-}
-
-=head2 open_yaml
-
-Read normalisation rules defined in YAML file located usually at
-C<conf/normalize/*.yml> and parse it.
-
- my $n = new WebPAC::Normalize::XML;
- $n->open_yaml(
- 	tag => 'isis',
- 	path => '/path/to/conf/normalize/isis.yml',
- );
-
-=cut
-
-sub open_yaml {
-	my $self = shift;
-
-	my $arg = {@_};
-
-	my $log = $self->_get_logger();
-
-	foreach my $req (qw/tag path/) {
-		$log->logconfess("need argument $req") unless $arg->{$req};
-	}
-
-	my $path = $arg->{path};
-	$self->{tag} = $arg->{tag};
-
-	$log->logdie("normalisation yaml file '$path' doesn't exist!") if (! -e $path);
-
-	$log->info("using $path normalization YAML");
-
-	$self->{'import_xml'} = LoadFile( $path ) || $log->die("can't load $path: $!");
-
-	$log->debug("import yaml is ",sub { Dumper($self->{'import_xml'}) }, $self->{lookup} ? " using lookups" : "lookups disabled");
-
-	$self->{_skip_x} = 1;
-
-	return $self;
-}
-
-=head2 _x
-
-Convert string from XML UTF-8 encoding to code page defined in C<xml_file>.
-
- my $text = $n->_x('utf8 text');
-
-Default application code page is C<ISO-8859-2>. You will probably want to
-change that when creating new instance of object based on this one.
-
-=cut
-
-sub _x {
-	my $self = shift;
-	my $utf8 = shift || return;
-	return $utf8 if ($self->{_skip_x});
-
-	# create UTF-8 convertor for import_xml files
-	$self->{'utf2cp'} ||= Text::Iconv->new('UTF-8' ,$self->{'code_page'} || 'ISO-8859-2');
-
-	return $self->{'utf2cp'}->convert($utf8) ||
-		$self->_get_logger()->logwarn("can't convert '$utf8'");
-}
-
-
-=head1 AUTHOR
-
-Dobrica Pavlinusic, C<< <dpavlin@rot13.org> >>
-
-=head1 COPYRIGHT & LICENSE
-
-Copyright 2005 Dobrica Pavlinusic, All Rights Reserved.
-
-This program is free software; you can redistribute it and/or modify it
-under the same terms as Perl itself.
-
-=cut
-
-1; # End of WebPAC::Normalize::XML
diff --git a/lib/WebPAC/Output/KinoSearch.pm b/lib/WebPAC/Output/KinoSearch.pm
index 7f562a4..6c50ac3 100644
--- a/lib/WebPAC/Output/KinoSearch.pm
+++ b/lib/WebPAC/Output/KinoSearch.pm
@@ -9,6 +9,7 @@ use KinoSearch::InvIndexer;
 use KinoSearch::Analysis::PolyAnalyzer;
 use Encode qw/from_to/;
 use Data::Dumper;
+use Storable;
 
 =head1 NAME
 
@@ -16,11 +17,11 @@ WebPAC::Output::KinoSearch - Create KinoSearch full text index
 
 =head1 VERSION
 
-Version 0.01
+Version 0.02
 
 =cut
 
-our $VERSION = '0.01';
+our $VERSION = '0.02';
 
 =head1 SYNOPSIS
 
@@ -99,6 +100,18 @@ sub new {
 		analyzer => $analyzer,
 	);
 
+	my $fields_path = $self->{index_path} . '/fields.storable';
+	$fields_path =~ s#//#/#g;
+	if (-e $fields_path) {
+		$self->{fields} = retrieve($fields_path) ||
+			$log->warn("can't open $fields_path: $!");
+	} else {
+		$log->error("This will be dummy run since no fields statistics are found!");
+		$log->error("You will have to re-run indexing to get search results!");
+		$self->{dummy_run} = 1;
+	}
+	$self->{fields_path} = $fields_path;
+
 	foreach my $f (@{ $self->{fields} }) {
 		$self->{invindex}->spec_field( 
 			name  => $f,
@@ -158,6 +171,10 @@ sub add {
 	sub add_value($$$$$) {
 		my ($self,$log,$doc,$n,$v) = @_;
 		return unless ($v);
+
+		$self->{value_usage}->{$n}++;
+		return if ($self->{dummy_run});
+
 		eval { $doc->set_value($n, $self->convert($v) ) };
 		$log->warn("can't insert: $n = $v") if ($@);
 	}
@@ -208,8 +225,20 @@ Close index
 sub finish {
 	my $self = shift;
 
-	$self->_get_logger()->info("finish index writing to disk");
+	my $log = $self->_get_logger();
+
+	$log->info("finish index writing to disk");
 	$self->{invindex}->finish;
+
+	$log->info("writing value usage file");
+
+	# add fields from last run
+	map { $self->{value_usage}->{$_}++ } @{ $self->{fields} };
+
+	my @fields = keys %{ $self->{value_usage} };
+	store \@fields, $self->{fields_path} ||
+		$log->warn("can't write $self->{fields_path}: $!");
+
 }
 
 =head2 convert
diff --git a/run.pl b/run.pl
index b8fb7ce..825facd 100755
--- a/run.pl
+++ b/run.pl
@@ -11,8 +11,7 @@ use WebPAC::Common 0.02;
 use WebPAC::Lookup;
 use WebPAC::Input 0.03;
 use WebPAC::Store 0.03;
-use WebPAC::Normalize::XML;
-use WebPAC::Normalize::Set;
+use WebPAC::Normalize;
 use WebPAC::Output::TT;
 use WebPAC::Validate;
 use YAML qw/LoadFile/;
@@ -54,11 +53,6 @@ or C<type> from input
 
 path to YAML configuration file
 
-=item --force-set
-
-force conversion C<< normalize->path >> in C<config.yml> from
-C<.xml> to C<.pl>
-
 =item --stats
 
 disable indexing and dump statistics about field and subfield
@@ -79,7 +73,6 @@ my $clean = 0;
 my $config = 'conf/config.yml';
 my $debug = 0;
 my $only_filter;
-my $force_set = 0;
 my $stats = 0;
 my $validate_path;
 
@@ -91,7 +84,6 @@ GetOptions(
 	"only=s" => \$only_filter,
 	"config" => \$config,
 	"debug" => \$debug,
-	"force-set" => \$force_set,
 	"stats" => \$stats,
 	"validate=s" => \$validate_path,
 );
@@ -228,43 +220,12 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) {
 			%{ $input },
 		);
 
-		my $n = new WebPAC::Normalize::XML(
-		#	filter => { 'foo' => sub { shift } },
-			db => $db,
-			lookup_regex => $lookup ? $lookup->regex : undef,
-			lookup => $lookup,
-			prefix => $input->{name},
-		);
-
 		my $rules;
 		my $normalize_path = $input->{normalize}->{path};
 
-		if ($force_set) {
-			my $new_norm_path = $normalize_path;
-			$new_norm_path =~ s/\.xml$/.pl/;
-			if (-e $new_norm_path) {
-				$log->debug("--force-set replaced $normalize_path with $new_norm_path");
-				$normalize_path = $new_norm_path;
-			} else {
-				$log->debug("--force-set failed on $new_norm_path, fallback to $normalize_path");
-			}
-		}
+		$log->logdie("Found '$normalize_path' as normalization file which isn't supported any more!") unless ( $normalize_path =~ m!\.pl$!i );
 
-		if ($normalize_path =~ m/\.xml$/i) {
-			$n->open(
-				tag => $input->{normalize}->{tag},
-				xml_file => $normalize_path,
-			);
-		} elsif ($normalize_path =~ m/\.(?:yml|yaml)$/i) {
-			$n->open_yaml(
-				path => $normalize_path,
-				tag => $input->{normalize}->{tag},
-			);
-		} elsif ($normalize_path =~ m/\.(?:pl)$/i) {
-			$n = undef;
-			$log->info("using WebPAC::Normalize::Set to process $normalize_path");
-			$rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
-		}
+		my $rules = read_file( $normalize_path ) or die "can't open $normalize_path: $!";
 
 		foreach my $pos ( 0 ... $input_db->size ) {
 
@@ -285,22 +246,17 @@ while (my ($database, $db_config) = each %{ $config->{databases} }) {
 			}
 
 				
-			my $ds;
-			if ($n) {
-				$ds = $n->data_structure($row);
-			} else {
-				$ds = WebPAC::Normalize::Set::data_structure(
-					row => $row,
-					rules => $rules,
-					lookup => $lookup ? $lookup->lookup_hash : undef,
-				);
-
-				$db->save_ds(
-					id => $mfn,
-					ds => $ds,
-					prefix => $input->{name},
-				) if ($ds && !$stats);
-			}
+			my $ds = WebPAC::Normalize::data_structure(
+				row => $row,
+				rules => $rules,
+				lookup => $lookup ? $lookup->lookup_hash : undef,
+			);
+
+			$db->save_ds(
+				id => $mfn,
+				ds => $ds,
+				prefix => $input->{name},
+			) if ($ds && !$stats);
 
 			$indexer->add(
 				id => $input->{name} . "/" . $mfn,
diff --git a/t/0-load.t b/t/0-load.t
index 06d6667..633eab2 100644
--- a/t/0-load.t
+++ b/t/0-load.t
@@ -11,7 +11,7 @@ use_ok( 'WebPAC::Input' );
 use_ok( 'WebPAC::Input::ISIS' );
 use_ok( 'WebPAC::Store' );
 use_ok( 'WebPAC::Lookup' );
-use_ok( 'WebPAC::Normalize::XML' );
+use_ok( 'WebPAC::Normalize' );
 use_ok( 'WebPAC::Output' );
 use_ok( 'WebPAC::Output::Estraier' );
 use_ok( 'WebPAC::Output::TT' );
diff --git a/t/3-normalize-set.t b/t/3-normalize-set.t
deleted file mode 100755
index da7a881..0000000
--- a/t/3-normalize-set.t
+++ /dev/null
@@ -1,345 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-use Test::More tests => 67;
-use Test::Exception;
-use Cwd qw/abs_path/;
-use blib;
-use File::Slurp;
-
-use Data::Dumper;
-my $debug = shift @ARGV;
-
-BEGIN {
-	use_ok( 'WebPAC::Normalize::Set' );
-}
-
-ok(my $abs_path = abs_path($0), "abs_path");
-$abs_path =~ s#/[^/]*$#/#;
-diag "abs_path: $abs_path" if ($debug);
-
-#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
-
-my $rec1 = {
-	'200' => [{
-		'a' => '200a',
-		'b' => '200b',
-		},{
-		'c' => '200c',
-		'd' => '200d',
-		},{
-		'a' => '200a*2',
-		'd' => '200d*2',
-		}],
-	'201' => [{
-		'x' => '201x',
-		'y' => '201y',
-		}],
-	'900' => [
-		'900-no_subfield'
-		],
-	'901' => [{
-		'a' => '900a',
-		}],
-	'902' => [{
-		'z' => '900',
-		}],
-};
-
-my $rec2 = {
- '675' => [ {
-              'a' => '159.9'
-            } ],
- '210' => [ {
-              'c' => 'New York University press',
-              'a' => 'New York',
-              'd' => 'cop. 1988'
-            } ],
- '700' => [ {
-              'a' => 'Haynal',
-              'b' => 'André'
-            } ],
- '801' => [ 'FFZG' ],
- '991' => [ '8302' ],
- '000' => [ 1 ],
- '702' => [ {
-              'a' => 'Holder',
-              'b' => 'Elizabeth'
-            } ],
- '215' => [ {
-              'c' => 'ilustr',
-              'a' => 'xix, 202 str',
-              'd' => '23cm'
-            } ],
- '990' => [
-            '2140',
-            '88',
-            'HAY'
-          ],
- '200' => [ {
-              'e' => 'from Freud and Ferenczi to Michael balint',
-              'a' => 'Controversies in psychoanalytic method',
-              'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
-              'f' => 'by André E. Haynal'
-            } ],
- '610' => [ 'povijest psihoanalize' ],
- '994' => [ {
-              'c' => '',
-              'a' => 'PS',
-              'b' => 'MG'
-            } ],
- '320' => [ 'Kazalo' ],
- '101' => [ 'ENG' ],
- '686' => [ '2140' ],
- '300' => [ 'Prijevod djela: ' ],
-};
-
-
-my $lookup1 = {
-	'00900' => [
-		'lookup 1',
-		'lookup 2',
-	],
-};
-
-my $lookup2 = {
-	'00900' => 'lookup',
-};
-
-
-sub test {
-	print Dumper( @_ ), ("-" x 78), "\n";
-	ok( defined(@_) );
-}
-
-# how much of string evaled to display?
-my $max_eval_output = 170;
-
-sub dump_error {
-	my ($msg,$code) = @_;
-
-	my @l = split(/[\n\r]/, $code);
-	my $out = "$msg\n";
-
-	foreach my $i ( 0 .. $#l ) {
-		$out .= sprintf("%2d: %s\n", $i, $l[$i]);
-	}
-
-	return $out;
-}
-
-sub test_s {
-	my $t = shift || die;
-
-	my $eval_t = $t;
-	$eval_t =~ s/[\n\r\s]+/ /gs;
-	$eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output);
-
-	eval "$t";
-	ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t");
-}
-
-{
-	no strict 'subs';
-	use WebPAC::Normalize::Set;
-
-	ok(! set_lookup( undef ), "set_lookup(undef)");
-
-	set_rec( $rec1 );
-
-	cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' );
-	cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' );
-	cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' );
-	diag "is_deeply checks\n";
-	is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] );
-	is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ],  \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]);
-	is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]);
-	is_deeply( \[ rec('902') ], \[ '900' ] );
-
-	cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' );
-
-	# simple list manipulatons
-	cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix');
-	cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix');
-	cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround');
-
-
-	set_lookup( $lookup1 );
-	
-	cmp_ok(
-		join_with(" i ",
-			lookup( 
-				regex( 's/^/00/',
-					rec2('902','z')
-				)
-			) 
-		),
-	'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2');
-
-	# check join_with operations
-
-	sub test_join_with_2 {
-		my ($a,$b,$e) = @_;
-
-		cmp_ok(
-			join_with(" <1> ",
-				rec('201',$a),
-				rec('201',$b),
-			), 
-		'eq', $e, "join_with $a <1> $b = $e");
-	}
-
-	test_join_with_2('_','_','');
-	test_join_with_2('x','_','201x');
-	test_join_with_2('_','x','201x');
-	test_join_with_2('x','y','201x <1> 201y');
-
-	sub test_join_with_3 {
-		my ($a,$b,$c,$e) = @_;
-
-		cmp_ok(
-			join_with(" <1> ", rec('201',$a),
-				join_with(" <2> ", rec('201',$b),
-					rec('201',$c),
-				)
-			), 
-		'eq', $e, "join_with $a <1> $b <2> $c = $e");
-	};
-
-	test_join_with_3('_','_','_','');
-	test_join_with_3('x','_','_','201x');
-	test_join_with_3('_','x','_','201x');
-	test_join_with_3('_','_','x','201x');
-	test_join_with_3('x','y','_','201x <1> 201y');
-	test_join_with_3('x','_','y','201x <1> 201y');
-	test_join_with_3('_','x','y','201x <2> 201y');
-	test_join_with_3('x','_','y','201x <1> 201y');
-	test_join_with_3('x','y','x','201x <1> 201y <2> 201x');
-
-	# test lookups
-
-	set_lookup( $lookup2 );
-
-	is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' );
-
-	ok(! lookup('non-existent'), 'lookup non-existant' );
-
-	set_rec( $rec2 );
-
-	test_s(qq{
-		tag('Title',
-			rec('200','a')
-		);
-	});
-	test_s(qq{
-		tag('Who',
-			join_with(" ",
-				rec('702','a'),
-				rec('702','b')
-			)
-		);
-	});
-
-	test_s(qq{
-		display('Publisher',
-			rec('210','c')
-		)
-	});
-	
-	test_s(qq{
-		search('Year',
-			regex( 's/[^\\d]+//',
-				rec('210','d')
-			)
-		)
-	});
-
-	ok(my $ds = get_ds(), "get_ds");
-	diag "ds = ", Dumper($ds) if ($debug);
-
-
-	sub test_check_ds {
-
-		my $t = shift;
-
-		ok($ds = get_ds(), 'get_ds');
-		diag Dumper( $ds ) if ($debug);
-
-		ok( $ds && $ds->{something}, 'get_ds->something exists' );
-		ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t);
-		ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' );
-
-		return $ds;
-	}
-
-	clean_ds();
-	test_s(qq{ search('something', '42'); });
-	test_s(qq{ search('empty', ''); });
-	test_check_ds('search');
-
-	clean_ds();
-	test_s(qq{ display('something', '42'); });
-	test_s(qq{ display('empty', ''); });
-	test_check_ds('display');
-
-	clean_ds();
-	test_s(qq{ tag('something', '42'); });
-	test_s(qq{ tag('empty', ''); });
-	test_check_ds('search');
-	test_check_ds('display');
-
-	clean_ds();
-	my $n = read_file( "$abs_path/data/normalize.pl" );
-	$n .= "\n1;\n";
-	#diag "normalize code:\n$n\n";
-	test_s( $n );
-
-	ok($ds = get_ds(), "get_ds");
-	diag "ds = ", Dumper($ds) if ($debug);
-
-	my $rec = {
-		'200' => [{
-			'a' => '200a',
-			'b' => '200b',
-		}],
-	};
-	my $rules = qq{ search('mixed', rec('200') ) };
-	
-	clean_ds();
-	set_rec( $rec );
-	test_s( $rules );
-	ok($ds = get_ds(), "get_ds");
-	is_deeply( $ds, {
-		'mixed' => {
-			'search' => [ '200a', '200b' ],
-			'tag' => 'mixed'
-		}
-	}, 'correct get_ds');
-
-	ok(my $ds2 = WebPAC::Normalize::Set::data_structure(
-		row => $rec,
-		rules => $rules,
-	), 'data_structure');
-	is_deeply( $ds, $ds2, 'data_structure(s) same');
-
-	# wird and non-valid structure which is supported anyway
-	clean_ds();
-	set_rec({
-		'200' => [{
-			'a' => '200a',
-		},
-			'200-solo'
-		]
-	});
-	test_s(qq{ search('mixed', rec('200') ) });
-	ok($ds = get_ds(), "get_ds");
-	is_deeply( $ds, {
-		'mixed' => {
-			'search' => [ '200a', '200-solo' ],
-			'tag' => 'mixed'
-		}
-	}, 'correct get_ds');
-
-}
-
diff --git a/t/3-normalize-xml.t b/t/3-normalize-xml.t
deleted file mode 100755
index a55e52b..0000000
--- a/t/3-normalize-xml.t
+++ /dev/null
@@ -1,263 +0,0 @@
-#!/usr/bin/perl -w
-
-use Test::More tests => 140;
-use Test::Exception;
-use Cwd qw/abs_path/;
-use blib;
-use strict;
-use Data::Dumper;
-
-BEGIN {
-use_ok( 'WebPAC::Normalize::XML' );
-}
-
-ok(my $abs_path = abs_path($0), "abs_path");
-$abs_path =~ s#/[^/]*$#/#;
-#diag "abs_path: $abs_path";
-
-throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
-throws_ok { new WebPAC::Normalize::XML( lookup => 'bar' ) } qr/pair/, "lookup without lookup_regex";
-
-ok(my $n = new WebPAC::Normalize::XML(
-	debug => 0,
-	filter => {
-		regex => sub {
-			my ($val, $regex) = @_;
-			eval "\$val =~ $regex";
-			return $val;
-		},
-	},
-), "new");
-
-throws_ok { $n->open() } qr/tag/, "open without tag";
-throws_ok { $n->open( tag => 'isis' ) } qr/xml_file/, "open without xml_file";
-throws_ok { $n->open( tag => 'isis', xml_file => '/foo/bar/baz' ) } qr/file.*doesn't exist/, "open with invalid xml_file";
-ok( $n->open(
-	tag => 'isis',
-	xml_file => "$abs_path/data/normalize.xml",
-), "open");
-
-my $rec = {
- '675' => [
-            {
-              'a' => '159.9'
-            }
-          ],
- '210' => [
-            {
-              'c' => 'New York University press',
-              'a' => 'New York',
-              'd' => 'cop. 1988'
-            }
-          ],
- '700' => [
-            {
-              'a' => 'Haynal',
-              'b' => 'André'
-            }
-          ],
- '801' => [
-            'FFZG'
-          ],
- '991' => [
-            '8302'
-          ],
- '000' => [
-            1
-          ],
- '702' => [
-            {
-              'a' => 'Holder',
-              'b' => 'Elizabeth'
-            }
-          ],
- '215' => [
-            {
-              'c' => 'ilustr',
-              'a' => 'xix, 202 str',
-              'd' => '23cm'
-            }
-          ],
- '990' => [
-            '2140',
-            '88',
-            'HAY'
-          ],
- '200' => [
-            {
-              'e' => 'from Freud and Ferenczi to Michael balint',
-              'a' => 'Controversies in psychoanalytic method',
-              'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
-              'f' => 'by André E. Haynal'
-            }
-          ],
- '610' => [
-            'povijest psihoanalize'
-          ],
- '994' => [
-            {
-              'c' => '',
-              'a' => 'PS',
-              'b' => 'MG'
-            }
-          ],
- '320' => [
-            'Kazalo'
-          ],
- '101' => [
-            'ENG'
-          ],
- '686' => [
-            '2140'
-          ],
- '300' => [
-            'Prijevod djela: '
-          ]
-};
-
-foreach my $fld (keys %$rec) {
-	my $r = 0;
-	foreach my $item ($rec->{$fld}) {
-		if (ref($item) eq 'HASH') {
-			foreach my $sf (keys %$item) {
-				my $found = 0;
-				ok($n->get_data(\$rec, $fld, $sf, $r, \$found), "v${fld}^${sf} / $r");
-				ok($found, "found");
-			}
-			my $found = 0;
-			ok(! $n->get_data(\$rec, $fld, 'x', $r, \$found), "no v${fld}^x / $r");
-			ok(! $found, "not found");
-		} else {
-			my $found = 0;
-			ok($n->get_data(\$rec, $fld, undef, $r, \$found), "v${fld} / $r");
-			ok($found, "found");
-		}
-	}
-	my $found = 0;
-	ok(! $n->get_data(\$rec, '999', '', $r, \$found), "no v${fld} / $r");
-	ok(! $found, "not found");
-}
-
-ok(my $ds = $n->data_structure( $rec ), "data_structure");
-
-#diag Dumper($rec, $ds);
-
-# fake load of our test normalisation data
-$n->{tag} = 'isis';
-
-#diag Dumper($n->{import_xml}->{indexer});
-
-
-$rec = {
-	'900' => [ {
-		'a' => '1',
-		'b' => '2',
-		'c' => '3',
-		'x' => 'yap',
-	} ],
-};
-
-my $import = {
-	'Tag' => { 'isis' => [
-			{ content => 'v900^a + v900^b = v900^c [v900^x]' },
-	] },
-};
-
-sub parse_test($$$$$) {
-	my ($import,$tag,$rec,$i,$r) = @_;
-	$n->{import_xml}->{indexer} = $import;
-	# erase internal cache (yak!)
-	delete($n->{_tags_by_order});
-	push @{$rec->{'000'}}, 42 unless ($rec->{'000'});
-	#diag "test normalisation of: ",Dumper($n->{import_xml}->{indexer}, $rec);
-	ok(my $ds = $n->data_structure( $rec ), "data_structure");
-	#diag Dumper($ds->{$tag}->{display}); # if ($i == 0);
-	cmp_ok($ds->{$tag}->{display}->[$i], 'eq', $r, "parse $tag/$i - $r");
-}
-
-parse_test($import, 'Tag', $rec, 0, '1 + 2 = 3 [yap]');
-
-delete($rec->{'900'}->[0]->{'b'});
-parse_test($import, 'Tag', $rec, 0, '1 = 3 [yap]');
-
-$rec->{'900'}->[0]->{'b'} = 5;
-$rec->{'900'}->[0]->{'c'} = 6;
-parse_test($import, 'Tag', $rec, 0, '1 + 5 = 6 [yap]');
-
-delete($rec->{'900'}->[0]->{'c'});
-$rec->{'900'}->[0]->{'x'} = 'hmmm';
-parse_test($import, 'Tag', $rec, 0, '1 + 5 [hmmm]');
-
-$rec->{'900'}->[0]->{'x'} = 'nope!';
-delete($rec->{'900'}->[0]->{'a'});
-parse_test($import, 'Tag', $rec, 0, '5 [nope!]');
-
-$rec = {
-	'900' => [ {
-		'b' => 'b1',
-		'x' => 'b1',
-	},{
-		'a' => 'a1',
-		'b' => 'b2',
-		'x' => 'a1_b2',
-	},{
-		'b' => 'b3',
-		'c' => 'c1',
-		'x' => 'b3_c1',
-	},{
-		'a' => 'a2',
-		'b' => 'b4',
-		'c' => 'c2',
-		'x' => 'a2_b4_c2',
-	} ],
-};
-
-$import = {
-	'Tag' => { 'isis' => [
-			{ content => '0 v900^a 1 v900^b 2 v900^c 3 v900^x 4' },
-	] },
-};
-
-parse_test($import, 'Tag', $rec, 0, '0 b1 3 b1 4');
-parse_test($import, 'Tag', $rec, 1, '0 a1 1 b2 3 a1_b2 4');
-parse_test($import, 'Tag', $rec, 2, '0 b3 2 c1 3 b3_c1 4');
-parse_test($import, 'Tag', $rec, 3, '0 a2 1 b4 2 c2 3 a2_b4_c2 4');
-
-sub parse_test_arr($$$$) {
-	my ($import, $tag, $rec, $arr) = @_;
-	my $i = 0;
-	foreach my $res (@{$arr}) {
-		parse_test($import, $tag, $rec, $i, $res);
-		$i++;
-	}
-}
-
-$import = {
-	'Tag_a' => { 'isis' => [
-		{ content => 'v900^a' },
-	] },
-	'Tag_b' => { 'isis' => [
-		{ content => 'v900^b' },
-	] },
-	'Tag_c' => { 'isis' => [
-		{ content => 'v900^c' },
-	] },
-	'Tag_x' => { 'isis' => [
-		{ content => 'v900^x' },
-	] },
-	'Tag_s1' => { 'isis' => [
-		{ content => 'v900^a = v900^c' },
-	] },
-	'Tag_noval' => { 'isis' => [
-		{ content => 'v911^1' },
-		{ content => 'v900^c' },
-	] },
-};
-
-parse_test_arr($import, 'Tag_a', $rec, [ '','a1','','a2' ] );
-parse_test_arr($import, 'Tag_b', $rec, [ 'b1','b2','b3','b4' ] );
-parse_test_arr($import, 'Tag_c', $rec, [ '','','c1','c2' ] );
-parse_test_arr($import, 'Tag_x', $rec, [ 'b1','a1_b2','b3_c1','a2_b4_c2' ] );
-parse_test_arr($import, 'Tag_s1', $rec, [ '', 'a1', 'c1', 'a2 = c2' ] );
-parse_test_arr($import, 'Tag_noval', $rec, [ '','','c1','c2' ] );
-
diff --git a/t/3-normalize.t b/t/3-normalize.t
new file mode 100755
index 0000000..7321bf5
--- /dev/null
+++ b/t/3-normalize.t
@@ -0,0 +1,345 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+use Test::More tests => 67;
+use Test::Exception;
+use Cwd qw/abs_path/;
+use blib;
+use File::Slurp;
+
+use Data::Dumper;
+my $debug = shift @ARGV;
+
+BEGIN {
+	use_ok( 'WebPAC::Normalize' );
+}
+
+ok(my $abs_path = abs_path($0), "abs_path");
+$abs_path =~ s#/[^/]*$#/#;
+diag "abs_path: $abs_path" if ($debug);
+
+#throws_ok { new WebPAC::Normalize::XML( lookup_regex => 'foo' ) } qr/pair/, "lookup_regex without lookup";
+
+my $rec1 = {
+	'200' => [{
+		'a' => '200a',
+		'b' => '200b',
+		},{
+		'c' => '200c',
+		'd' => '200d',
+		},{
+		'a' => '200a*2',
+		'd' => '200d*2',
+		}],
+	'201' => [{
+		'x' => '201x',
+		'y' => '201y',
+		}],
+	'900' => [
+		'900-no_subfield'
+		],
+	'901' => [{
+		'a' => '900a',
+		}],
+	'902' => [{
+		'z' => '900',
+		}],
+};
+
+my $rec2 = {
+ '675' => [ {
+              'a' => '159.9'
+            } ],
+ '210' => [ {
+              'c' => 'New York University press',
+              'a' => 'New York',
+              'd' => 'cop. 1988'
+            } ],
+ '700' => [ {
+              'a' => 'Haynal',
+              'b' => 'André'
+            } ],
+ '801' => [ 'FFZG' ],
+ '991' => [ '8302' ],
+ '000' => [ 1 ],
+ '702' => [ {
+              'a' => 'Holder',
+              'b' => 'Elizabeth'
+            } ],
+ '215' => [ {
+              'c' => 'ilustr',
+              'a' => 'xix, 202 str',
+              'd' => '23cm'
+            } ],
+ '990' => [
+            '2140',
+            '88',
+            'HAY'
+          ],
+ '200' => [ {
+              'e' => 'from Freud and Ferenczi to Michael balint',
+              'a' => 'Controversies in psychoanalytic method',
+              'g' => 'translated by Elizabeth Holder on the basisi of a first draft by Archie Hooton ; with a preface by Daniel N. Stern',
+              'f' => 'by André E. Haynal'
+            } ],
+ '610' => [ 'povijest psihoanalize' ],
+ '994' => [ {
+              'c' => '',
+              'a' => 'PS',
+              'b' => 'MG'
+            } ],
+ '320' => [ 'Kazalo' ],
+ '101' => [ 'ENG' ],
+ '686' => [ '2140' ],
+ '300' => [ 'Prijevod djela: ' ],
+};
+
+
+my $lookup1 = {
+	'00900' => [
+		'lookup 1',
+		'lookup 2',
+	],
+};
+
+my $lookup2 = {
+	'00900' => 'lookup',
+};
+
+
+sub test {
+	print Dumper( @_ ), ("-" x 78), "\n";
+	ok( defined(@_) );
+}
+
+# how much of string evaled to display?
+my $max_eval_output = 170;
+
+sub dump_error {
+	my ($msg,$code) = @_;
+
+	my @l = split(/[\n\r]/, $code);
+	my $out = "$msg\n";
+
+	foreach my $i ( 0 .. $#l ) {
+		$out .= sprintf("%2d: %s\n", $i, $l[$i]);
+	}
+
+	return $out;
+}
+
+sub test_s {
+	my $t = shift || die;
+
+	my $eval_t = $t;
+	$eval_t =~ s/[\n\r\s]+/ /gs;
+	$eval_t = substr($eval_t,0,$max_eval_output) . '...' if (length($eval_t) > $max_eval_output);
+
+	eval "$t";
+	ok(! $@, $@ ? dump_error($@, $t) : "eval: $eval_t");
+}
+
+{
+	no strict 'subs';
+	use WebPAC::Normalize;
+
+	ok(! set_lookup( undef ), "set_lookup(undef)");
+
+	set_rec( $rec1 );
+
+	cmp_ok( join(",", rec2('200','a') ), 'eq', '200a,200a*2', 'join rec2' );
+	cmp_ok( join(",", rec2('200','a'), rec2('200','b') ), 'eq', '200a,200a*2,200b', 'join rec2 rec2' );
+	cmp_ok( join(" * ", sort(rec1('200'), rec1('201') )), 'eq', '200a * 200a*2 * 200b * 200c * 200d * 200d*2 * 201x * 201y', 'join sort rec1 rec1' );
+	diag "is_deeply checks\n";
+	is_deeply( \[ rec1('200') ], \[ qw/200a 200b 200c 200d 200a*2 200d*2/ ] );
+	is_deeply( \[ regex( 's/0/o/g', rec1('200') ) ],  \[ qw/2ooa 2oob 2ooc 2ood 2ooa*2 2ood*2/ ]);
+	is_deeply( \[ grep { /\*/ } regex( 's/0/o/g', rec1('200') ) ], \[ qw/2ooa*2 2ood*2/ ]);
+	is_deeply( \[ rec('902') ], \[ '900' ] );
+
+	cmp_ok( rec('902'), 'eq', rec('902','z'), 'rec sf' );
+
+	# simple list manipulatons
+	cmp_ok( join('', prefix('ab', 'cd') ), 'eq', 'abcd', 'prefix');
+	cmp_ok( join('', suffix('xy', 'cd') ), 'eq', 'cdxy', 'suffix');
+	cmp_ok( join('', surround('->', '<-', 'a','b','c') ), 'eq', '->a<-->b<-->c<-', 'surround');
+
+
+	set_lookup( $lookup1 );
+	
+	cmp_ok(
+		join_with(" i ",
+			lookup( 
+				regex( 's/^/00/',
+					rec2('902','z')
+				)
+			) 
+		),
+	'eq', 'lookup 1 i lookup 2', 'join lookup regex rec2');
+
+	# check join_with operations
+
+	sub test_join_with_2 {
+		my ($a,$b,$e) = @_;
+
+		cmp_ok(
+			join_with(" <1> ",
+				rec('201',$a),
+				rec('201',$b),
+			), 
+		'eq', $e, "join_with $a <1> $b = $e");
+	}
+
+	test_join_with_2('_','_','');
+	test_join_with_2('x','_','201x');
+	test_join_with_2('_','x','201x');
+	test_join_with_2('x','y','201x <1> 201y');
+
+	sub test_join_with_3 {
+		my ($a,$b,$c,$e) = @_;
+
+		cmp_ok(
+			join_with(" <1> ", rec('201',$a),
+				join_with(" <2> ", rec('201',$b),
+					rec('201',$c),
+				)
+			), 
+		'eq', $e, "join_with $a <1> $b <2> $c = $e");
+	};
+
+	test_join_with_3('_','_','_','');
+	test_join_with_3('x','_','_','201x');
+	test_join_with_3('_','x','_','201x');
+	test_join_with_3('_','_','x','201x');
+	test_join_with_3('x','y','_','201x <1> 201y');
+	test_join_with_3('x','_','y','201x <1> 201y');
+	test_join_with_3('_','x','y','201x <2> 201y');
+	test_join_with_3('x','_','y','201x <1> 201y');
+	test_join_with_3('x','y','x','201x <1> 201y <2> 201x');
+
+	# test lookups
+
+	set_lookup( $lookup2 );
+
+	is_deeply( \[ lookup( prefix( '00', rec('902') ) ) ], \[ 'lookup' ], 'lookup prefix' );
+
+	ok(! lookup('non-existent'), 'lookup non-existant' );
+
+	set_rec( $rec2 );
+
+	test_s(qq{
+		tag('Title',
+			rec('200','a')
+		);
+	});
+	test_s(qq{
+		tag('Who',
+			join_with(" ",
+				rec('702','a'),
+				rec('702','b')
+			)
+		);
+	});
+
+	test_s(qq{
+		display('Publisher',
+			rec('210','c')
+		)
+	});
+	
+	test_s(qq{
+		search('Year',
+			regex( 's/[^\\d]+//',
+				rec('210','d')
+			)
+		)
+	});
+
+	ok(my $ds = get_ds(), "get_ds");
+	diag "ds = ", Dumper($ds) if ($debug);
+
+
+	sub test_check_ds {
+
+		my $t = shift;
+
+		ok($ds = get_ds(), 'get_ds');
+		diag Dumper( $ds ) if ($debug);
+
+		ok( $ds && $ds->{something}, 'get_ds->something exists' );
+		ok( $ds && $ds->{something}->{$t}, 'get_ds->something->'.$t.' exists') if ($t);
+		ok( $ds && !$ds->{empty}, 'get_ds->empty doesn\'t' );
+
+		return $ds;
+	}
+
+	clean_ds();
+	test_s(qq{ search('something', '42'); });
+	test_s(qq{ search('empty', ''); });
+	test_check_ds('search');
+
+	clean_ds();
+	test_s(qq{ display('something', '42'); });
+	test_s(qq{ display('empty', ''); });
+	test_check_ds('display');
+
+	clean_ds();
+	test_s(qq{ tag('something', '42'); });
+	test_s(qq{ tag('empty', ''); });
+	test_check_ds('search');
+	test_check_ds('display');
+
+	clean_ds();
+	my $n = read_file( "$abs_path/data/normalize.pl" );
+	$n .= "\n1;\n";
+	#diag "normalize code:\n$n\n";
+	test_s( $n );
+
+	ok($ds = get_ds(), "get_ds");
+	diag "ds = ", Dumper($ds) if ($debug);
+
+	my $rec = {
+		'200' => [{
+			'a' => '200a',
+			'b' => '200b',
+		}],
+	};
+	my $rules = qq{ search('mixed', rec('200') ) };
+	
+	clean_ds();
+	set_rec( $rec );
+	test_s( $rules );
+	ok($ds = get_ds(), "get_ds");
+	is_deeply( $ds, {
+		'mixed' => {
+			'search' => [ '200a', '200b' ],
+			'tag' => 'mixed'
+		}
+	}, 'correct get_ds');
+
+	ok(my $ds2 = WebPAC::Normalize::data_structure(
+		row => $rec,
+		rules => $rules,
+	), 'data_structure');
+	is_deeply( $ds, $ds2, 'data_structure(s) same');
+
+	# wird and non-valid structure which is supported anyway
+	clean_ds();
+	set_rec({
+		'200' => [{
+			'a' => '200a',
+		},
+			'200-solo'
+		]
+	});
+	test_s(qq{ search('mixed', rec('200') ) });
+	ok($ds = get_ds(), "get_ds");
+	is_deeply( $ds, {
+		'mixed' => {
+			'search' => [ '200a', '200-solo' ],
+			'tag' => 'mixed'
+		}
+	}, 'correct get_ds');
+
+}
+
diff --git a/t/6-unit.t b/t/6-unit.t
index a17aa4a..5658ce6 100755
--- a/t/6-unit.t
+++ b/t/6-unit.t
@@ -2,7 +2,7 @@
 
 use strict;
 
-use Test::More tests => 41;
+use Test::More tests => 24;
 use Test::Exception;
 use Cwd qw/abs_path/;
 use File::Temp qw/tempdir/;
@@ -17,8 +17,8 @@ BEGIN {
 use_ok( 'WebPAC::Lookup' );
 use_ok( 'WebPAC::Input' );
 use_ok( 'WebPAC::Store' );
-use_ok( 'WebPAC::Normalize::XML' );
-use_ok( 'WebPAC::Normalize::Set' );
+use_ok( 'WebPAC::Lookup::Normalize' );
+use_ok( 'WebPAC::Normalize' );
 use_ok( 'WebPAC::Output::TT' );
 }
 
@@ -28,15 +28,13 @@ diag "abs_path: $abs_path" if ($debug);
 
 my $isis_file = "$abs_path../t/winisis/BIBL";
 #$isis_file = '/data/hidra/THS/THS';
-$isis_file = '/data/isis_data/ffkk/';
+#$isis_file = '/data/isis_data/ffkk/';
 
 diag "isis_file: $isis_file" if ($debug);
 
 my $normalize_set_pl = "$abs_path/data/normalize.pl";
 my $lookup_file = "$abs_path../conf/lookup/isis.pm";
 
-my ($t1,$t2) = (0,0);
-
 ok(my $lookup = new WebPAC::Lookup(
 	lookup_file => $lookup_file,
 ), "new Lookup");
@@ -61,19 +59,6 @@ ok(my $db = new WebPAC::Store(
 	database => '.',
 ), "new Store");
 
-ok(my $n = new WebPAC::Normalize::XML(
-#	filter => { 'foo' => sub { shift } },
-	db => $db,
-	lookup_regex => $lookup->regex,
-	lookup => $lookup,
-	no_progress_bar => 1,
-), "new Normalize::XML");
-
-ok($n->open(
-	tag => 'isis',
-	xml_file => "$abs_path/data/normalize.xml",
-), "Normalize::XML->open");
-
 ok(my $norm_pl = read_file( $normalize_set_pl ), "set definitions: $normalize_set_pl" );
 
 ok(my $out = new WebPAC::Output::TT(
@@ -83,6 +68,8 @@ ok(my $out = new WebPAC::Output::TT(
 
 diag " lookup => ",Dumper($lookup->lookup_hash) if ($debug);
 
+my $t_norm = 0;
+
 foreach my $pos ( 0 ... $isis->size ) {
 
 	my $row = $isis->fetch || next;
@@ -90,21 +77,14 @@ foreach my $pos ( 0 ... $isis->size ) {
 	diag " row $pos => ",Dumper($row) if ($debug);
 
 	my $t = time();
-	ok(my $ds = $n->data_structure($row), "XML data_structure");
-	$t1 += time() - $t;
-
-	diag " ds $pos => ",Dumper($ds) if ($debug);
-
-	$t = time();
-	ok( my $ds2 = WebPAC::Normalize::Set::data_structure(
+	ok( my $ds = WebPAC::Normalize::data_structure(
 		lookup => $lookup->lookup_hash,
 		row => $row,
 		rules => $norm_pl,
 	), "Set data_structure");
-	$t2 += time() - $t;
+	$t_norm += time() - $t;
 
-	diag " ds2 $pos => ",Dumper($ds2) if ($debug);
-	is_deeply( $ds, $ds2, 'ds same for xml and sets');
+	diag " ds $pos => ",Dumper($ds) if ($debug);
 
 	ok(my $html = $out->apply(
 		template => 'html.tt',
@@ -117,4 +97,4 @@ foreach my $pos ( 0 ... $isis->size ) {
 
 };
 
-diag sprintf("timings: %.2fs vs %.2fs [%1.2f%%]\n", $t1, $t2, ($t1 / $t2) * 100);
+diag sprintf("timings: %.2fs\n", $t_norm);