configuration to index git schema

author Dobrica Pavlinusic <dpavlin@rot13.org>

Wed, 5 May 2010 21:45:15 +0000 (23:45 +0200)

committer Dobrica Pavlinusic <dpavlin@rot13.org>

Wed, 5 May 2010 21:45:15 +0000 (23:45 +0200)
author Dobrica Pavlinusic <dpavlin@rot13.org>
Wed, 5 May 2010 21:45:15 +0000 (23:45 +0200)
committer Dobrica Pavlinusic <dpavlin@rot13.org>
Wed, 5 May 2010 21:45:15 +0000 (23:45 +0200)
diff --git a/sphinx.conf b/sphinx.conf

index 7e56288..acb71b4 100644 (file)
--- a/sphinx.conf
+++ b/sphinx.conf
@@ -183,6 +183,10 @@ source src1
         # lets you store and retrieve strings
         #
         # sql_attr_string                       = stitle
         # lets you store and retrieve strings
         #
         # sql_attr_string                       = stitle
+       sql_attr_string                 = hash
+       sql_attr_string                 = parent
+       sql_attr_string                 = subject
+#      sql_field_str2wordcount = subject
  
  
         # wordcount attribute declaration
  
  
         # wordcount attribute declaration
@@ -815,3 +819,523 @@ searchd
  }
  
  # --eof--
  }
  
  # --eof--
+
+source git
+{
+       # data source type. mandatory, no default value
+       # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
+       type                                    = mysql
+
+       #####################################################################
+       ## SQL settings (for 'mysql' and 'pgsql' types)
+       #####################################################################
+
+       # some straightforward parameters for SQL source types
+       sql_host                                = localhost
+       sql_user                                = dpavlin
+       sql_pass                                =
+       sql_db                                  = git
+       sql_port                                = 3306  # optional, default is 3306
+
+       # UNIX socket name
+       # optional, default is empty (reuse client library defaults)
+       # usually '/var/lib/mysql/mysql.sock' on Linux
+       # usually '/tmp/mysql.sock' on FreeBSD
+       #
+       # sql_sock                              = /tmp/mysql.sock
+
+
+       # MySQL specific client connection flags
+       # optional, default is 0
+       #
+       # mysql_connect_flags   = 32 # enable compression
+
+       # MySQL specific SSL certificate settings
+       # optional, defaults are empty
+       #
+       # mysql_ssl_cert                = /etc/ssl/client-cert.pem
+       # mysql_ssl_key         = /etc/ssl/client-key.pem
+       # mysql_ssl_ca          = /etc/ssl/cacert.pem
+
+       # MS SQL specific Windows authentication mode flag
+       # MUST be in sync with charset_type index-level setting
+       # optional, default is 0
+       #
+       # mssql_winauth                 = 1 # use currently logged on user credentials
+
+
+       # MS SQL specific Unicode indexing flag
+       # optional, default is 0 (request SBCS data)
+       #
+       # mssql_unicode                 = 1 # request Unicode data from server
+
+
+       # ODBC specific DSN (data source name)
+       # mandatory for odbc source type, no default value
+       #
+       # odbc_dsn                              = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
+       # sql_query                             = SELECT id, data FROM documents.csv
+
+
+       # pre-query, executed before the main fetch query
+       # multi-value, optional, default is empty list of queries
+       #
+       # sql_query_pre                 = SET NAMES utf8
+       # sql_query_pre                 = SET SESSION query_cache_type=OFF
+
+
+       # main document fetch query
+       # mandatory, integer document ID field MUST be the first selected column
+       sql_query                               = \
+               SELECT id, hash, parent, UNIX_TIMESTAMP(timestamp) AS timestamp, subject \
+               FROM log
+
+
+       # joined/payload field fetch query
+       # joined fields let you avoid (slow) JOIN and GROUP_CONCAT
+       # payload fields let you attach custom per-keyword values (eg. for ranking)
+       #
+       # syntax is FIELD-NAME 'from'  ( 'query' | 'payload-query' ); QUERY
+       # joined field QUERY should return 2 columns (docid, text)
+       # payload field QUERY should return 3 columns (docid, keyword, weight)
+       #
+       # REQUIRES that query results are in ascending document ID order!
+       # multi-value, optional, default is empty list of queries
+       #
+       # sql_joined_field                      = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
+       # sql_joined_field                      = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC
+
+
+       # range query setup, query that must return min and max ID values
+       # optional, default is empty
+       #
+       # sql_query will need to reference $start and $end boundaries
+       # if using ranged query:
+       #
+       # sql_query                             = \
+       #       SELECT doc.id, doc.id AS group, doc.title, doc.data \
+       #       FROM documents doc \
+       #       WHERE id>=$start AND id<=$end
+       #
+       # sql_query_range               = SELECT MIN(id),MAX(id) FROM documents
+
+
+       # range query step
+       # optional, default is 1024
+       #
+       # sql_range_step                = 1000
+
+
+       # unsigned integer attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # optional bit size can be specified, default is 32
+       #
+       # sql_attr_uint                 = author_id
+       # sql_attr_uint                 = forum_id:9 # 9 bits for forum_id
+       #sql_attr_uint                  = group_id
+
+       # boolean attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # equivalent to sql_attr_uint with 1-bit size
+       #
+       # sql_attr_bool                 = is_deleted
+
+
+       # bigint attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # declares a signed (unlike uint!) 64-bit attribute
+       #
+       # sql_attr_bigint                       = my_bigint_id
+
+
+       # UNIX timestamp attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # similar to integer, but can also be used in date functions
+       #
+       # sql_attr_timestamp    = posted_ts
+       # sql_attr_timestamp    = last_edited_ts
+       sql_attr_timestamp              = timestamp
+
+       # string ordinal attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # sorts strings (bytewise), and stores their indexes in the sorted list
+       # sorting by this attr is equivalent to sorting by the original strings
+       #
+       # sql_attr_str2ordinal  = author_name
+
+
+       # floating point attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # values are stored in single precision, 32-bit IEEE 754 format
+       #
+       # sql_attr_float = lat_radians
+       # sql_attr_float = long_radians
+
+
+       # multi-valued attribute (MVA) attribute declaration
+       # multi-value (an arbitrary number of attributes is allowed), optional
+       # MVA values are variable length lists of unsigned 32-bit integers
+       #
+       # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
+       # ATTR-TYPE is 'uint' or 'timestamp'
+       # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
+       # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
+       # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
+       #
+       # sql_attr_multi        = uint tag from query; SELECT id, tag FROM tags
+       # sql_attr_multi        = uint tag from ranged-query; \
+       #       SELECT id, tag FROM tags WHERE id>=$start AND id<=$end; \
+       #       SELECT MIN(id), MAX(id) FROM tags
+
+
+       # string attribute declaration
+       # multi-value (an arbitrary number of these is allowed), optional
+       # lets you store and retrieve strings
+       #
+       # sql_attr_string                       = stitle
+
+
+       # wordcount attribute declaration
+       # multi-value (an arbitrary number of these is allowed), optional
+       # lets you count the words at indexing time
+       #
+       # sql_attr_str2wordcount        = stitle
+
+
+       # combined field plus attribute declaration (from a single column)
+       # stores column as an attribute, but also indexes it as a full-text field
+       #
+       # sql_field_string                      = author
+       # sql_field_str2wordcount       = title
+
+       
+       # post-query, executed on sql_query completion
+       # optional, default is empty
+       #
+       # sql_query_post                =
+
+       
+       # post-index-query, executed on successful indexing completion
+       # optional, default is empty
+       # $maxid expands to max document ID actually fetched from DB
+       #
+       # sql_query_post_index = REPLACE INTO counters ( id, val ) \
+       #       VALUES ( 'max_indexed_id', $maxid )
+
+
+       # ranged query throttling, in milliseconds
+       # optional, default is 0 which means no delay
+       # enforces given delay before each query step
+       sql_ranged_throttle     = 0
+
+       # document info query, ONLY for CLI search (ie. testing and debugging)
+       # optional, default is empty
+       # must contain $id macro and must fetch the document by that id
+       sql_query_info          = SELECT * FROM log WHERE id=$id
+
+       # kill-list query, fetches the document IDs for kill-list
+       # k-list will suppress matches from preceding indexes in the same query
+       # optional, default is empty
+       #
+       # sql_query_killlist    = SELECT id FROM documents WHERE edited>=@last_reindex
+
+
+       # columns to unpack on indexer side when indexing
+       # multi-value, optional, default is empty list
+       #
+       # unpack_zlib = zlib_column
+       # unpack_mysqlcompress = compressed_column
+       # unpack_mysqlcompress = compressed_column_2
+
+
+       # maximum unpacked length allowed in MySQL COMPRESS() unpacker
+       # optional, default is 16M
+       #
+       # unpack_mysqlcompress_maxsize = 16M
+
+
+       #####################################################################
+       ## xmlpipe settings
+       #####################################################################
+
+       # type                          = xmlpipe
+
+       # shell command to invoke xmlpipe stream producer
+       # mandatory
+       #
+       # xmlpipe_command       = cat /var/test.xml
+
+       #####################################################################
+       ## xmlpipe2 settings
+       #####################################################################
+
+       # type                          = xmlpipe2
+       # xmlpipe_command       = cat /var/test2.xml
+
+
+       # xmlpipe2 field declaration
+       # multi-value, optional, default is empty
+       #
+       # xmlpipe_field                         = subject
+       # xmlpipe_field                         = content
+
+
+       # xmlpipe2 attribute declaration
+       # multi-value, optional, default is empty
+       # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
+       #
+       # xmlpipe_attr_timestamp        = published
+       # xmlpipe_attr_uint                     = author_id
+
+
+       # perform UTF-8 validation, and filter out incorrect codes
+       # avoids XML parser choking on non-UTF-8 documents
+       # optional, default is 0
+       #
+       # xmlpipe_fixup_utf8            = 1
+}
+
+#############################################################################
+## index definition
+#############################################################################
+
+# local index example
+#
+# this is an index which is stored locally in the filesystem
+#
+# all indexing-time options (such as morphology and charsets)
+# are configured per local index
+index git
+{
+       # index type
+       # optional, default is 'plain'
+       # known values are 'plain', 'distributed', and 'rt' (see samples below)
+       # type                  = plain
+
+       # document source(s) to index
+       # multi-value, mandatory
+       # document IDs must be globally unique across all sources
+       source                  = git
+
+       # index files path and file name, without extension
+       # mandatory, path must be writable, extensions will be auto-appended
+       path                    = data/git
+
+       # document attribute values (docinfo) storage mode
+       # optional, default is 'extern'
+       # known values are 'none', 'extern' and 'inline'
+       docinfo                 = inline
+
+       # memory locking for cached data (.spa and .spi), to prevent swapping
+       # optional, default is 0 (do not mlock)
+       # requires searchd to be run as root
+       mlock                   = 0
+
+       # a list of morphology preprocessors to apply
+       # optional, default is empty
+       #
+       # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
+       # 'soundex', and 'metaphone'; additional preprocessors available from
+       # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
+       # (see libstemmer_c/libstemmer/modules.txt)
+       #
+       # morphology    = stem_en, stem_ru, soundex
+       # morphology    = libstemmer_german
+       # morphology    = libstemmer_sv
+       morphology              = none
+
+       # minimum word length at which to enable stemming
+       # optional, default is 1 (stem everything)
+       #
+       # min_stemming_len      = 1
+
+
+       # stopword files list (space separated)
+       # optional, default is empty
+       # contents are plain text, charset_table and stemming are both applied
+       #
+       # stopwords                     = data/stopwords.txt
+
+
+       # wordforms file, in "mapfrom > mapto" plain text format
+       # optional, default is empty
+       #
+       # wordforms                     = data/wordforms.txt
+
+
+       # tokenizing exceptions file
+       # optional, default is empty
+       #
+       # plain text, case sensitive, space insensitive in map-from part
+       # one "Map Several Words => ToASingleOne" entry per line
+       #
+       # exceptions            = data/exceptions.txt
+
+
+       # minimum indexed word length
+       # default is 1 (index everything)
+       min_word_len            = 1
+
+       # charset encoding type
+       # optional, default is 'sbcs'
+       # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
+       charset_type            = sbcs
+
+       # charset definition and case folding rules "table"
+       # optional, default value depends on charset_type
+       #
+       # defaults are configured to include English and Russian characters only
+       # you need to change the table to include additional ones
+       # this behavior MAY change in future versions
+       #
+       # 'sbcs' default value is
+       # charset_table         = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
+       #
+       # 'utf-8' default value is
+       # charset_table         = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
+
+
+       # ignored characters list
+       # optional, default value is empty
+       #
+       # ignore_chars          = U+00AD
+
+
+       # minimum word prefix length to index
+       # optional, default is 0 (do not index prefixes)
+       #
+       # min_prefix_len        = 0
+
+
+       # minimum word infix length to index
+       # optional, default is 0 (do not index infixes)
+       #
+       # min_infix_len         = 0
+
+
+       # list of fields to limit prefix/infix indexing to
+       # optional, default value is empty (index all fields in prefix/infix mode)
+       #
+       # prefix_fields         = filename
+       # infix_fields          = url, domain
+
+
+       # enable star-syntax (wildcards) when searching prefix/infix indexes
+       # search-time only, does not affect indexing, can be 0 or 1
+       # optional, default is 0 (do not use wildcard syntax)
+       #
+       # enable_star           = 1
+
+
+       # expand keywords with exact forms and/or stars when searching suitable indexes
+       # search-time only, does not affect indexing, can be 0 or 1
+       # optional, default is 0 (do not expand keywords)
+       #
+       # expand_keywords               = 1
+
+       
+       # n-gram length to index, for CJK indexing
+       # only supports 0 and 1 for now, other lengths to be implemented
+       # optional, default is 0 (disable n-grams)
+       #
+       # ngram_len                             = 1
+
+
+       # n-gram characters list, for CJK indexing
+       # optional, default is empty
+       #
+       # ngram_chars                   = U+3000..U+2FA1F
+
+
+       # phrase boundary characters list
+       # optional, default is empty
+       #
+       # phrase_boundary               = ., ?, !, U+2026 # horizontal ellipsis
+
+
+       # phrase boundary word position increment
+       # optional, default is 0
+       #
+       # phrase_boundary_step  = 100
+
+
+       # blended characters list
+       # blended chars are indexed both as separators and valid characters
+       # for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
+       # optional, default is empty
+       #
+       # blend_chars                           = +, &, U+23
+
+
+       # whether to strip HTML tags from incoming documents
+       # known values are 0 (do not strip) and 1 (do strip)
+       # optional, default is 0
+       html_strip                              = 0
+
+       # what HTML attributes to index if stripping HTML
+       # optional, default is empty (do not index anything)
+       #
+       # html_index_attrs              = img=alt,title; a=title;
+
+
+       # what HTML elements contents to strip
+       # optional, default is empty (do not strip element contents)
+       #
+       # html_remove_elements  = style, script
+
+
+       # whether to preopen index data files on startup
+       # optional, default is 0 (do not preopen), searchd-only
+       #
+       # preopen                                       = 1
+
+
+       # whether to keep dictionary (.spi) on disk, or cache it in RAM
+       # optional, default is 0 (cache in RAM), searchd-only
+       #
+       # ondisk_dict                           = 1
+
+
+       # whether to enable in-place inversion (2x less disk, 90-95% speed)
+       # optional, default is 0 (use separate temporary files), indexer-only
+       #
+       # inplace_enable                        = 1
+
+
+       # in-place fine-tuning options
+       # optional, defaults are listed below
+       #
+       # inplace_hit_gap                       = 0             # preallocated hitlist gap size
+       # inplace_docinfo_gap           = 0             # preallocated docinfo gap size
+       # inplace_reloc_factor  = 0.1   # relocation buffer size within arena
+       # inplace_write_factor  = 0.1   # write buffer size within arena
+
+
+       # whether to index original keywords along with stemmed versions
+       # enables "=exactform" operator to work
+       # optional, default is 0
+       #
+       # index_exact_words             = 1
+
+
+       # position increment on overshort (less than min_word_len) words
+       # optional, allowed values are 0 and 1, default is 1
+       #
+       # overshort_step                        = 1
+
+
+       # position increment on stopword
+       # optional, allowed values are 0 and 1, default is 1
+       #
+       # stopword_step                 = 1
+
+
+       # hitless words list
+       # positions for these keywords will not be stored in the index
+       # optional, allowed values are 'all', or a list file name
+       #
+       # hitless_words                 = all
+       # hitless_words                 = hitless.txt
+}
+
+
author	Dobrica Pavlinusic <dpavlin@rot13.org>
	Wed, 5 May 2010 21:45:15 +0000 (23:45 +0200)
committer	Dobrica Pavlinusic <dpavlin@rot13.org>
	Wed, 5 May 2010 21:45:15 +0000 (23:45 +0200)