From b798b4e1d276c5eeb87174e6d47e49001db7bb65 Mon Sep 17 00:00:00 2001 From: Dobrica Pavlinusic Date: Wed, 5 May 2010 23:45:15 +0200 Subject: [PATCH] configuration to index git schema --- sphinx.conf | 524 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 524 insertions(+) diff --git a/sphinx.conf b/sphinx.conf index 7e56288..acb71b4 100644 --- a/sphinx.conf +++ b/sphinx.conf @@ -183,6 +183,10 @@ source src1 # lets you store and retrieve strings # # sql_attr_string = stitle + sql_attr_string = hash + sql_attr_string = parent + sql_attr_string = subject +# sql_field_str2wordcount = subject # wordcount attribute declaration @@ -815,3 +819,523 @@ searchd } # --eof-- + +source git +{ + # data source type. mandatory, no default value + # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc + type = mysql + + ##################################################################### + ## SQL settings (for 'mysql' and 'pgsql' types) + ##################################################################### + + # some straightforward parameters for SQL source types + sql_host = localhost + sql_user = dpavlin + sql_pass = + sql_db = git + sql_port = 3306 # optional, default is 3306 + + # UNIX socket name + # optional, default is empty (reuse client library defaults) + # usually '/var/lib/mysql/mysql.sock' on Linux + # usually '/tmp/mysql.sock' on FreeBSD + # + # sql_sock = /tmp/mysql.sock + + + # MySQL specific client connection flags + # optional, default is 0 + # + # mysql_connect_flags = 32 # enable compression + + # MySQL specific SSL certificate settings + # optional, defaults are empty + # + # mysql_ssl_cert = /etc/ssl/client-cert.pem + # mysql_ssl_key = /etc/ssl/client-key.pem + # mysql_ssl_ca = /etc/ssl/cacert.pem + + # MS SQL specific Windows authentication mode flag + # MUST be in sync with charset_type index-level setting + # optional, default is 0 + # + # mssql_winauth = 1 # use currently logged on user credentials + + + # MS SQL specific 
Unicode indexing flag + # optional, default is 0 (request SBCS data) + # + # mssql_unicode = 1 # request Unicode data from server + + + # ODBC specific DSN (data source name) + # mandatory for odbc source type, no default value + # + # odbc_dsn = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)}; + # sql_query = SELECT id, data FROM documents.csv + + + # pre-query, executed before the main fetch query + # multi-value, optional, default is empty list of queries + # + # sql_query_pre = SET NAMES utf8 + # sql_query_pre = SET SESSION query_cache_type=OFF + + + # main document fetch query + # mandatory, integer document ID field MUST be the first selected column + sql_query = \ + SELECT id, hash, parent, UNIX_TIMESTAMP(timestamp) AS timestamp, subject \ + FROM log + + + # joined/payload field fetch query + # joined fields let you avoid (slow) JOIN and GROUP_CONCAT + # payload fields let you attach custom per-keyword values (eg. for ranking) + # + # syntax is FIELD-NAME 'from' ( 'query' | 'payload-query' ); QUERY + # joined field QUERY should return 2 columns (docid, text) + # payload field QUERY should return 3 columns (docid, keyword, weight) + # + # REQUIRES that query results are in ascending document ID order! 
+ # multi-value, optional, default is empty list of queries + # + # sql_joined_field = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC + # sql_joined_field = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC + + + # range query setup, query that must return min and max ID values + # optional, default is empty + # + # sql_query will need to reference $start and $end boundaries + # if using ranged query: + # + # sql_query = \ + # SELECT doc.id, doc.id AS group, doc.title, doc.data \ + # FROM documents doc \ + # WHERE id>=$start AND id<=$end + # + # sql_query_range = SELECT MIN(id),MAX(id) FROM documents + + + # range query step + # optional, default is 1024 + # + # sql_range_step = 1000 + + + # unsigned integer attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # optional bit size can be specified, default is 32 + # + # sql_attr_uint = author_id + # sql_attr_uint = forum_id:9 # 9 bits for forum_id + #sql_attr_uint = group_id + + # boolean attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # equivalent to sql_attr_uint with 1-bit size + # + # sql_attr_bool = is_deleted + + + # bigint attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # declares a signed (unlike uint!) 
64-bit attribute + # + # sql_attr_bigint = my_bigint_id + + + # UNIX timestamp attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # similar to integer, but can also be used in date functions + # + # sql_attr_timestamp = posted_ts + # sql_attr_timestamp = last_edited_ts + sql_attr_timestamp = timestamp + + # string ordinal attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # sorts strings (bytewise), and stores their indexes in the sorted list + # sorting by this attr is equivalent to sorting by the original strings + # + # sql_attr_str2ordinal = author_name + + + # floating point attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # values are stored in single precision, 32-bit IEEE 754 format + # + # sql_attr_float = lat_radians + # sql_attr_float = long_radians + + + # multi-valued attribute (MVA) attribute declaration + # multi-value (an arbitrary number of attributes is allowed), optional + # MVA values are variable length lists of unsigned 32-bit integers + # + # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY] + # ATTR-TYPE is 'uint' or 'timestamp' + # SOURCE-TYPE is 'field', 'query', or 'ranged-query' + # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs + # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range' + # + # sql_attr_multi = uint tag from query; SELECT id, tag FROM tags + # sql_attr_multi = uint tag from ranged-query; \ + # SELECT id, tag FROM tags WHERE id>=$start AND id<=$end; \ + # SELECT MIN(id), MAX(id) FROM tags + + + # string attribute declaration + # multi-value (an arbitrary number of these is allowed), optional + # lets you store and retrieve strings + # + # sql_attr_string = stitle + + + # wordcount attribute declaration + # multi-value (an arbitrary number of these is allowed), optional + # lets you count the words at indexing 
time + # + # sql_attr_str2wordcount = stitle + + + # combined field plus attribute declaration (from a single column) + # stores column as an attribute, but also indexes it as a full-text field + # + # sql_field_string = author + # sql_field_str2wordcount = title + + + # post-query, executed on sql_query completion + # optional, default is empty + # + # sql_query_post = + + + # post-index-query, executed on successful indexing completion + # optional, default is empty + # $maxid expands to max document ID actually fetched from DB + # + # sql_query_post_index = REPLACE INTO counters ( id, val ) \ + # VALUES ( 'max_indexed_id', $maxid ) + + + # ranged query throttling, in milliseconds + # optional, default is 0 which means no delay + # enforces given delay before each query step + sql_ranged_throttle = 0 + + # document info query, ONLY for CLI search (ie. testing and debugging) + # optional, default is empty + # must contain $id macro and must fetch the document by that id + sql_query_info = SELECT * FROM log WHERE id=$id + + # kill-list query, fetches the document IDs for kill-list + # k-list will suppress matches from preceding indexes in the same query + # optional, default is empty + # + # sql_query_killlist = SELECT id FROM documents WHERE edited>=@last_reindex + + + # columns to unpack on indexer side when indexing + # multi-value, optional, default is empty list + # + # unpack_zlib = zlib_column + # unpack_mysqlcompress = compressed_column + # unpack_mysqlcompress = compressed_column_2 + + + # maximum unpacked length allowed in MySQL COMPRESS() unpacker + # optional, default is 16M + # + # unpack_mysqlcompress_maxsize = 16M + + + ##################################################################### + ## xmlpipe settings + ##################################################################### + + # type = xmlpipe + + # shell command to invoke xmlpipe stream producer + # mandatory + # + # xmlpipe_command = cat /var/test.xml + + 
##################################################################### + ## xmlpipe2 settings + ##################################################################### + + # type = xmlpipe2 + # xmlpipe_command = cat /var/test2.xml + + + # xmlpipe2 field declaration + # multi-value, optional, default is empty + # + # xmlpipe_field = subject + # xmlpipe_field = content + + + # xmlpipe2 attribute declaration + # multi-value, optional, default is empty + # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX + # + # xmlpipe_attr_timestamp = published + # xmlpipe_attr_uint = author_id + + + # perform UTF-8 validation, and filter out incorrect codes + # avoids XML parser choking on non-UTF-8 documents + # optional, default is 0 + # + # xmlpipe_fixup_utf8 = 1 +} + +############################################################################# +## index definition +############################################################################# + +# local index example +# +# this is an index which is stored locally in the filesystem +# +# all indexing-time options (such as morphology and charsets) +# are configured per local index +index git +{ + # index type + # optional, default is 'plain' + # known values are 'plain', 'distributed', and 'rt' (see samples below) + # type = plain + + # document source(s) to index + # multi-value, mandatory + # document IDs must be globally unique across all sources + source = git + + # index files path and file name, without extension + # mandatory, path must be writable, extensions will be auto-appended + path = data/git + + # document attribute values (docinfo) storage mode + # optional, default is 'extern' + # known values are 'none', 'extern' and 'inline' + docinfo = inline + + # memory locking for cached data (.spa and .spi), to prevent swapping + # optional, default is 0 (do not mlock) + # requires searchd to be run from root + mlock = 0 + + # a list of morphology preprocessors to apply + # optional, default is empty + # + # 
builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru', + # 'soundex', and 'metaphone'; additional preprocessors available from + # libstemmer are 'libstemmer_XXX', where XXX is algorithm code + # (see libstemmer_c/libstemmer/modules.txt) + # + # morphology = stem_en, stem_ru, soundex + # morphology = libstemmer_german + # morphology = libstemmer_sv + morphology = none + + # minimum word length at which to enable stemming + # optional, default is 1 (stem everything) + # + # min_stemming_len = 1 + + + # stopword files list (space separated) + # optional, default is empty + # contents are plain text, charset_table and stemming are both applied + # + # stopwords = data/stopwords.txt + + + # wordforms file, in "mapfrom > mapto" plain text format + # optional, default is empty + # + # wordforms = data/wordforms.txt + + + # tokenizing exceptions file + # optional, default is empty + # + # plain text, case sensitive, space insensitive in map-from part + # one "Map Several Words => ToASingleOne" entry per line + # + # exceptions = data/exceptions.txt + + + # minimum indexed word length + # default is 1 (index everything) + min_word_len = 1 + + # charset encoding type + # optional, default is 'sbcs' + # known types are 'sbcs' (Single Byte CharSet) and 'utf-8' + charset_type = sbcs + + # charset definition and case folding rules "table" + # optional, default value depends on charset_type + # + # defaults are configured to include English and Russian characters only + # you need to change the table to include additional ones + # this behavior MAY change in future versions + # + # 'sbcs' default value is + # charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF + # + # 'utf-8' default value is + # charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F + + + # ignored characters list + # optional, default value is empty + # + # ignore_chars = U+00AD + + + # minimum word prefix length to index + 
# optional, default is 0 (do not index prefixes) + # + # min_prefix_len = 0 + + + # minimum word infix length to index + # optional, default is 0 (do not index infixes) + # + # min_infix_len = 0 + + + # list of fields to limit prefix/infix indexing to + # optional, default value is empty (index all fields in prefix/infix mode) + # + # prefix_fields = filename + # infix_fields = url, domain + + + # enable star-syntax (wildcards) when searching prefix/infix indexes + # search-time only, does not affect indexing, can be 0 or 1 + # optional, default is 0 (do not use wildcard syntax) + # + # enable_star = 1 + + + # expand keywords with exact forms and/or stars when searching fit indexes + # search-time only, does not affect indexing, can be 0 or 1 + # optional, default is 0 (do not expand keywords) + # + # expand_keywords = 1 + + + # n-gram length to index, for CJK indexing + # only supports 0 and 1 for now, other lengths to be implemented + # optional, default is 0 (disable n-grams) + # + # ngram_len = 1 + + + # n-gram characters list, for CJK indexing + # optional, default is empty + # + # ngram_chars = U+3000..U+2FA1F + + + # phrase boundary characters list + # optional, default is empty + # + # phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis + + + # phrase boundary word position increment + # optional, default is 0 + # + # phrase_boundary_step = 100 + + + # blended characters list + # blended chars are indexed both as separators and valid characters + # for instance, AT&T will result in 3 tokens ("at", "t", and "at&t") + # optional, default is empty + # + # blend_chars = +, &, U+23 + + + # whether to strip HTML tags from incoming documents + # known values are 0 (do not strip) and 1 (do strip) + # optional, default is 0 + html_strip = 0 + + # what HTML attributes to index if stripping HTML + # optional, default is empty (do not index anything) + # + # html_index_attrs = img=alt,title; a=title; + + + # what HTML elements contents to strip + # optional, 
default is empty (do not strip element contents) + # + # html_remove_elements = style, script + + + # whether to preopen index data files on startup + # optional, default is 0 (do not preopen), searchd-only + # + # preopen = 1 + + + # whether to keep dictionary (.spi) on disk, or cache it in RAM + # optional, default is 0 (cache in RAM), searchd-only + # + # ondisk_dict = 1 + + + # whether to enable in-place inversion (2x less disk, 90-95% speed) + # optional, default is 0 (use separate temporary files), indexer-only + # + # inplace_enable = 1 + + + # in-place fine-tuning options + # optional, defaults are listed below + # + # inplace_hit_gap = 0 # preallocated hitlist gap size + # inplace_docinfo_gap = 0 # preallocated docinfo gap size + # inplace_reloc_factor = 0.1 # relocation buffer size within arena + # inplace_write_factor = 0.1 # write buffer size within arena + + + # whether to index original keywords along with stemmed versions + # enables "=exactform" operator to work + # optional, default is 0 + # + # index_exact_words = 1 + + + # position increment on overshort (less than min_word_len) words + # optional, allowed values are 0 and 1, default is 1 + # + # overshort_step = 1 + + + # position increment on stopword + # optional, allowed values are 0 and 1, default is 1 + # + # stopword_step = 1 + + + # hitless words list + # positions for these keywords will not be stored in the index + # optional, allowed values are 'all', or a list file name + # + # hitless_words = all + # hitless_words = hitless.txt +} + + -- 2.20.1