Updates to date indexing and search processing
author Joshua Ferraro <jmf@liblime.com>
Mon, 17 Dec 2007 17:54:54 +0000 (11:54 -0600)
committer Joshua Ferraro <jmf@liblime.com>
Mon, 17 Dec 2007 18:00:30 +0000 (12:00 -0600)
Summary of Koha 3.0 date indexing for MARC21:

Index                   Expected format         Notes
-----------------------------------------------------
date-entered-on-file    [yymmdd]                (008/0-5, indexed in word and sort indexes)
copydate                [yyyy]                  (260$c, indexed in word and sort indexes)
acqdate                 [yyyy-mm-dd]            (952$d, indexed in date,word,sort indexes)
pubdate                 [yyyy]                  (008/7-10, indexed in year,word,num,sort indexes)

Template Search Parameters Tested:
        limit-yr (either yyyy or yyyy-yyyy) (added processing for ge le, structure attribute st-numeric, etc.)
        yr pubdate (yyyy)
        acqdate,st-date-normalized (yyyy-mm-dd)

Template Sort Parameters Tested:
        pubdate_dsc
        pubdate_asc
        acqdate_dsc
        acqdate_asc

Signed-off-by: Joshua Ferraro <jmf@liblime.com>
C4/Search.pm
catalogue/search.pl
etc/zebradb/biblios/etc/bib1.att
etc/zebradb/biblios/etc/record.abs
etc/zebradb/ccl.properties
koha-tmpl/intranet-tmpl/prog/en/includes/search_indexes.inc
koha-tmpl/intranet-tmpl/prog/en/modules/labels/search.tmpl
koha-tmpl/opac-tmpl/prog/en/modules/opac-advsearch.tmpl
opac/opac-search.pl

index 72121a8..b3bf8f3 100644 (file)
@@ -660,9 +660,9 @@ sub _build_weighted_query {
         $weighted_query .= " or ti,phr,r3=\"$operand\"";            # phrase title
        #$weighted_query .= " or any,ext,r4=$operand";               # exact any
        #$weighted_query .=" or kw,wrdl,r5=\"$operand\"";            # word list any
-        $weighted_query .= " or wrd,fuzzy,r8=\"$operand\"" if $fuzzy_enabled; # add fuzzy, word list
-        $weighted_query .= " or wrd,right-Truncation,r9=\"$stemmed_operand\"" if ($stemming and $stemmed_operand); # add stemming, right truncation
-               $weighted_query .= " or wrd,r9=\"$operand\"";
+        $weighted_query .= " or wrdl,fuzzy,r8=\"$operand\"" if $fuzzy_enabled; # add fuzzy, word list
+        $weighted_query .= " or wrdl,right-Truncation,r9=\"$stemmed_operand\"" if ($stemming and $stemmed_operand); # add stemming, right truncation
+               $weighted_query .= " or wrdl,r9=\"$operand\"";
 
        # embedded sorting: 0 a-z; 1 z-a
        # $weighted_query .= ") or (sort1,aut=1";
@@ -680,7 +680,7 @@ sub _build_weighted_query {
        $weighted_query .= " $index,ext,r1=\"$operand\"";            # exact index
        #$weighted_query .= " or (title-sort-az=0 or $index,startswithnt,st-word,r3=$operand #)";
        $weighted_query .= " or $index,phr,r3=\"$operand\"";         # phrase index
-       $weighted_query .= " or $index,rt,wrd,r3=\"$operand\"";      # word list index
+       $weighted_query .= " or $index,rt,wrdl,r3=\"$operand\"";      # word list index
     }
     $weighted_query .= "))";    # close rank specification
     return $weighted_query;
@@ -753,7 +753,8 @@ sub buildQuery {
 
                                # a flag to determine whether or not to add the index to the query
                                my $indexes_set;
-                               # if the user is sophisticated enough to specify an index, turn off some defaults
+
+                               # if the user is sophisticated enough to specify an index, turn off field weighting, stemming, and stopword handling
                                if ($operands[$i] =~ /(:|=)/ || $scan) {
                                        $weight_fields = 0;
                                        $stemming = 0;
@@ -761,15 +762,30 @@ sub buildQuery {
                                }
                 my $operand = $operands[$i];
                 my $index   = $indexes[$i];
-                               $DEBUG=1;
-                               # some helpful index modifs
 
-                               my $wrdl;
-                               unless (!$index || $index =~ /(phr|ext)/) {
-                                       $wrdl = ",wrdl";
+                               # add some attributes for certain index types
+                               # Date of Publication
+                               if ($index eq 'yr') {
+                                       $index .=",st-numeric";
+                                       $indexes_set++;
+                                       ($stemming,$auto_truncation,$weight_fields, $fuzzy_enabled, $remove_stopwords) = (0,0,0,0,0);
+                               }
+                               # Date of Acquisition
+                               elsif ($index eq 'acqdate') {
+                                       $index.=",st-date-normalized";
+                                       $indexes_set++;
+                                       ($stemming,$auto_truncation,$weight_fields, $fuzzy_enabled, $remove_stopwords) = (0,0,0,0,0);
+
                                }
-                my $index_plus = $index.$wrdl.":" if $index;
-                my $index_plus_comma=$index.$wrdl."," if $index;
+
+                               # set default structure attribute (word list)
+                               my $struct_attr;
+                               unless (!$index || $index =~ /(st-|phr|ext|wrdl)/) {
+                                       $struct_attr = ",wrdl";
+                               }
+                               # some helpful index modifs
+                my $index_plus = $index.$struct_attr.":" if $index;
+                my $index_plus_comma=$index.$struct_attr."," if $index;
 
                 # Remove Stopwords
                                if ($remove_stopwords) {
@@ -885,7 +901,6 @@ sub buildQuery {
                        $limit_cgi .="&limit=$this_limit";
                        $limit_desc .= "$this_limit";
         }
-
                # regular old limits
                else {
                        $limit .= " and " if $limit || $query;
index 40be2cd..c9c21ad 100755 (executable)
@@ -387,7 +387,18 @@ foreach my $limit(@limits) {
 $template->param(available => $available);
 
 # append year limits if they exist
-push @limits, map "yr:".$_, split("\0",$params->{'limit-yr'}) if $params->{'limit-yr'};
+if ($params->{'limit-yr'}) {
+       if ($params->{'limit-yr'} =~ /\d{4}-\d{4}/) {
+               my ($yr1,$yr2) = split(/-/, $params->{'limit-yr'});
+               push @limits, "yr,st-numeric,ge=$yr1 and yr,st-numeric,le=$yr2";
+       }
+       elsif ($params->{'limit-yr'} =~ /\d{4}/) {
+               push @limits, "yr,st-numeric=$params->{'limit-yr'}";
+       }
+       else {
+               #FIXME: Should return an error to the user: incorrect date format specified
+       }
+}
 
 # Params that can only have one value
 my $scan = $params->{'scan'};
@@ -546,7 +557,7 @@ for (my $i=0;$i<=@servers;$i++) {
                        $template->param(       PAGE_NUMBERS => \@page_numbers,
                                                                previous_page_offset => $previous_page_offset) unless $pages < 2;
                        $template->param(next_page_offset => $next_page_offset) unless $pages eq $current_page_number;
-         }
+               }
     } # end of the if local
     else {
         # check if it's a z3950 or opensearch source
index 3fa2721..e4c584b 100644 (file)
@@ -32,7 +32,7 @@ att 26              PA-subject
 att 27              LC-subject-heading
 att 28              RVM-subject-heading
 att 29              Local-subject-index
-att 30              Date
+att 30              copydate
 att 31              pubdate
 att 32              Date-of-acquisition
 att 33              Title-key
@@ -78,7 +78,7 @@ att 1007            Identifier-standard
 att 1008            Subject-LC-childrens
 att 1009            Subject-name-personal
 att 1010            Body-of-text
-att 1011            dateaddeddb
+att 1011            date-entered-on-file
 att 1012            Date/time-last-modified
 att 1013            Authority/format-id
 att 1014            Concept-text
@@ -131,7 +131,8 @@ att 8010                    itemnumber
 att 8011                       homebranch
 att 8012                       holdingbranch
 att 8013                       location
-att 8014                       Date-of-acquisition
+# handled in bib1 attr 1=32
+#att 8014                      Date-of-acquisition
 att 8015                       acqsource
 att 8016                       coded-location-qualifier
 att 8017                       price
index 30f612c..0eed7b4 100644 (file)
@@ -16,7 +16,7 @@ esetname B @
 marc usmarc.mar
 systag sysno rank
 xpath enable
-
+# Some notes:
 # pl = Published Place
 # ta = Target Audience 002/22
 # ff8-23
@@ -27,6 +27,13 @@ xpath enable
 # ctype = Content type: review, catalog, encyclopedia, dictionary
 # pubdate Publication Date
 # rtype =  Record type (leader 06)
+#
+# Date indexing in Koha 3.0 for MARC21:
+# Index                   Expected format         Notes
+# date-entered-on-file    [yymmdd]        (008/0-5, indexed in word and sort indexes)
+# copydate                [yyyy]          (260$c, indexed in word and sort indexes)
+# acqdate                 [yyyy-mm-dd]    (952$d, indexed in date,word,sort indexes)
+# pubdate                 [yyyy]          (008/7-10, indexed in year,word,num,sort indexes)
 
 all any
 # melm 000             rtype:n:range(data,06,1),Bib-level:w:range(data,07,01)
@@ -34,10 +41,10 @@ xelm /record/leader llength:w:range(data,0,5),rtype:w:range(data,6,1),Bib-level:
 # example: xelm /record/leader l1:w:range(data,0,5),l2:w:range(data,10,2)
 
 melm 001               Control-number
-melm 005               Date,Date/time-last-modified
+melm 005               Date/time-last-modified
 melm 007               Microform-generation:n:range(data,11,1),Material-type,ff7-00:w:range(data,0,1),ff7-01:w:range(data,1,1),ff7-02:w:range(data,2,1),ff7-01-02:w:range(data,0,2)
 
-melm 008               ln:n:range(data,35,3),ctype:w:range(data,24,4),Date:n:range(data,0,5),Date:s:range(data,0,5),Date:n:range(data,7,4),Date:s:range(data,7,4),Date:n:range(data,11,4),Date:s:range(data,11,4),pubdate:n:range(data,7,4),pubdate:s:range(data,7,4),dateaddeddb:n:range(data,0,5),dateaddeddb:s:range(data,0,5),pl:w:range(data,15,3),ta:w:range(data,22,1),ff8-23:w:range(data,23,1),ff8-29:w:range(data,29,1),lf:w:range(data,33,1),bio:w:range(data,34,1),Record-source:w:range(data,39,0)
+melm 008               date-entered-on-file:n:range(data,0,5),date-entered-on-file:s:range(data,0,5),pubdate:w:range(data,7,4),pubdate:n:range(data,7,4),pubdate:y:range(data,7,4),pubdate:s:range(data,7,4),pl:w:range(data,15,3),ta:w:range(data,22,1),ff8-23:w:range(data,23,1),ff8-29:w:range(data,29,1),lf:w:range(data,33,1),bio:w:range(data,34,1),ln:n:range(data,35,3),ctype:w:range(data,24,4),Record-source:w:range(data,39,0)
 
 melm 010               LC-card-number,Identifier-standard
 melm 011               LC-card-number,Identifier-standard
@@ -54,7 +61,7 @@ melm 025              Identifier-standard
 melm 027               Report-number,Identifier-standard
 melm 028               Number-music-publisher,Identifier-standard
 melm 030               CODEN,Identifier-standard
-melm 033               Date
+#melm 033              Date
 melm 034        Map-scale
 #melm 035              Local-number,Identifier-standard
 melm 037               Identifier-standard,Stock-number
@@ -107,8 +114,7 @@ melm 246            Title,Title:p,Title-abbreviated,Title-expanded,Title-former
 melm 247               Title,Title:p,Title-former,Title-other-variant,Related-periodical
 melm 260$a             pl:w,pl:p
 melm 260$b             Publisher:w,Publisher:p
-melm 260$c             Date,Date:s,Date:y
-#,pubdate,pubdate:s
+melm 260$c             copydate,copydate:s
 melm 260               pl
 melm 300               Extent:w,Extent:p
 melm 400$a             Name-and-title
index 8643e2a..7e49986 100644 (file)
@@ -183,7 +183,7 @@ aut 1=1003
 #                           number from a system not
 #                           specified elsewhere in this
 #                           list of attributes.
-Local-classification 4=1 1=20
+Local-classification 1=20
 lcn Local-classification
 callnum Local-classification
 #Local-classification cc callnum dewey
@@ -326,18 +326,19 @@ Local-number 1=12
 #Date                   30  The point of time at which      005, 008/00-05,
 #                           a transaction or event          008/07-10, 260$c,
 #                           takes place.                    008/11-14, 033,etc.
-Date 1=30 4=109 r=r
-#yr Date
+# interpreting this as the copyright date in 260$c
+copydate 1=30 r=r
 
 #Date-publication       31  The date (usually year) in      008/07-10, 260$c
 #                           which a document is published.  046, 533$d
-Date-of-publication 1=31 4=109 r=r
+Date-of-publication 1=pubdate r=r
 #dp Date-of-publication
 yr Date-of-publication
+pubdate Date-of-publication
 
 #Date-acquisition       32  The date when a document was    541$d
 #                           acquired.
-Date-of-acquisition 1=32
+Date-of-acquisition 1=Date-of-acquisition
 acqdate Date-of-acquisition
 #da Date-of-acquisition
 
@@ -847,12 +848,7 @@ st-key     4=3
 st-year        4=4
 st-date-normalized     4=5
 st-word-list   4=6
-wrdl 4=6
-
-# there was a reason I didn't want to use this but it's
-# escaped me -- JF
-wrd 4=6
-
+wrdl st-word-list
 #st-word
 st-date-un-normalized  4=100
 st-name-normalized     4=101
@@ -861,7 +857,7 @@ st-structure        4=103
 st-urx         4=104
 st-free-form-text      4=105
 st-document-text       4=106
-st-local       number 4=107
+st-local-number 4=107
 st-string      4=108
 st-numeric     4=109
 #string 109
@@ -908,6 +904,7 @@ cn-item 1=9008
 cn-prefix 1=9009
 cn-suffix 1=9010
 Suppress 1=9011
+date-entered-on-file 1=date-entered-on-file
 
 # Items Index
 withdrawn 1=8001
@@ -923,14 +920,14 @@ itemnumber 1=8010
 Code-institution 1=8011
 holdingbranch 1=8012
 location 1=8013
-Date-of-acquisition 1=8014
+#Date-of-acquisition 1=8014
 acqsource 1=8015
 coded-location-qualifier 1=8016
 price 1=8017
-stack 1=8018 4=109
-issues 1=8019 4=109
-renewals 1=8020 4=109
-reserves 1=8021 4=109
+stack 1=8018
+issues 1=8019
+renewals 1=8020
+reserves 1=8021
 Local-classification 1=8022
 barcode 1=8023
 bc barcode
@@ -954,7 +951,6 @@ pl Place-publication
 
 #att 8900            
 #Call-Number 1=8900
-#date-entered-on-file 1=8800
 #date1 1=8801
 #date2 1=8802
 #language 8805
index 5f255dc..21788b6 100644 (file)
 <option value="nt" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Notes/Comments</option>
 <option value="pb" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Publisher</option>
 <option value="pl" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Publisher Location</option>
-<option value="yr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Publication Date</option>
+<option value="yr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Publication Date (yyyy)</option>
+<option value="acqdate" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Acquisition Date (yyyy-mm-dd)</option>
 <option value="sn" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Standard Number</option>
 <option value="nb" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>&nbsp;&nbsp;&nbsp;&nbsp; ISBN</option>
 <option value="ns" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>&nbsp;&nbsp;&nbsp;&nbsp; ISSN</option>
-<option value="lcn" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>&nbsp;&nbsp;&nbsp;&nbsp; Call Number</option>
+<option value="lcn,phr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>&nbsp;&nbsp;&nbsp;&nbsp; Call Number</option>
 <option value="su" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Subject</option>
 <option value="su,phr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>&nbsp;&nbsp;&nbsp;&nbsp; Subject as Phrase</option>
 <option value="ti" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Title</option>
index a024ee8..c1160d8 100644 (file)
@@ -27,7 +27,7 @@
     <option value="sn" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Standard Number</option>
     <option value="nb" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>ISBN</option>
     <option value="ns" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>ISSN</option>
-    <option value="lcn" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Call Number</option>
+    <option value="lcn,phr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Call Number</option>
     <option value="su" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Subject</option>
    <option value="ti" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Title</option>
     <option value="ti,phr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Title Phrase</option>
@@ -44,7 +44,7 @@
     <option value="sn" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Standard Number</option>
     <option value="nb" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>ISBN</option>
     <option value="ns" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>ISSN</option>
-    <option value="lcn" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Call Number</option>
+    <option value="lcn,phr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Call Number</option>
     <option value="su" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Subject</option>
    <option value="ti" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Title</option>
     <option value="ti,phr" <!-- TMPL_IF NAME="selected" -->selected="<!-- TMPL_VAR NAME="selected" -->"<!-- /TMPL_IF -->>Title Phrase</option>
index b7251a4..d70291a 100644 (file)
                     <option value="ns">&nbsp;&nbsp;&nbsp;&nbsp; ISSN</option><!-- /TMPL_IF -->
                                
                                <!-- TMPL_IF NAME="selected" -->
-                    <option value="lcn" selected="selected">&nbsp;&nbsp;&nbsp;&nbsp; Call Number</option>
+                    <option value="lcn,phr" selected="selected">&nbsp;&nbsp;&nbsp;&nbsp; Call Number</option>
                                        <!-- TMPL_ELSE -->
-                    <option value="lcn">&nbsp;&nbsp;&nbsp;&nbsp; Call Number</option><!-- /TMPL_IF -->
+                    <option value="lcn,phr">&nbsp;&nbsp;&nbsp;&nbsp; Call Number</option><!-- /TMPL_IF -->
 
 
                 <!-- /TMPL_IF -->
index fb0e46d..54838e9 100755 (executable)
@@ -394,7 +394,18 @@ foreach my $limit(@limits) {
 $template->param(available => $available);
 
 # append year limits if they exist
-push @limits, map "yr:".$_, split("\0",$params->{'limit-yr'}) if $params->{'limit-yr'};
+if ($params->{'limit-yr'}) {
+    if ($params->{'limit-yr'} =~ /\d{4}-\d{4}/) {
+        my ($yr1,$yr2) = split(/-/, $params->{'limit-yr'});
+        push @limits, "yr,st-numeric,ge=$yr1 and yr,st-numeric,le=$yr2";
+    }
+    elsif ($params->{'limit-yr'} =~ /\d{4}/) {
+        push @limits, "yr,st-numeric=$params->{'limit-yr'}";
+    }
+    else {
+        #FIXME: Should return an error to the user: incorrect date format specified
+    }
+}
 
 # Params that can only have one value
 my $scan = $params->{'scan'};