From b798b4e1d276c5eeb87174e6d47e49001db7bb65 Mon Sep 17 00:00:00 2001
From: Dobrica Pavlinusic <dpavlin@rot13.org>
Date: Wed, 5 May 2010 23:45:15 +0200
Subject: [PATCH] configuration to index git schema

---
 sphinx.conf | 524 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 524 insertions(+)

diff --git a/sphinx.conf b/sphinx.conf
index 7e56288..acb71b4 100644
--- a/sphinx.conf
+++ b/sphinx.conf
@@ -183,6 +183,10 @@ source src1
 	# lets you store and retrieve strings
 	#
 	# sql_attr_string			= stitle
+	sql_attr_string			= hash
+	sql_attr_string			= parent
+	sql_attr_string			= subject
+#	sql_field_str2wordcount	= subject
 
 
 	# wordcount attribute declaration
@@ -815,3 +819,523 @@ searchd
 }
 
 # --eof--
+
+source git
+{
+	# data source type. mandatory, no default value
+	# known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
+	type					= mysql
+
+	#####################################################################
+	## SQL settings (for 'mysql' and 'pgsql' types)
+	#####################################################################
+
+	# some straightforward parameters for SQL source types
+	sql_host				= localhost
+	sql_user				= dpavlin
+	sql_pass				=
+	sql_db					= git
+	sql_port				= 3306	# optional, default is 3306
+
+	# UNIX socket name
+	# optional, default is empty (reuse client library defaults)
+	# usually '/var/lib/mysql/mysql.sock' on Linux
+	# usually '/tmp/mysql.sock' on FreeBSD
+	#
+	# sql_sock				= /tmp/mysql.sock
+
+
+	# MySQL specific client connection flags
+	# optional, default is 0
+	#
+	# mysql_connect_flags	= 32 # enable compression
+
+	# MySQL specific SSL certificate settings
+	# optional, defaults are empty
+	#
+	# mysql_ssl_cert		= /etc/ssl/client-cert.pem
+	# mysql_ssl_key		= /etc/ssl/client-key.pem
+	# mysql_ssl_ca		= /etc/ssl/cacert.pem
+
+	# MS SQL specific Windows authentication mode flag
+	# optional, default is 0
+	#
+	# mssql_winauth			= 1 # use currently logged on user credentials
+
+
+	# MS SQL specific Unicode indexing flag
+	# MUST be in sync with charset_type index-level setting
+	# optional, default is 0 (request SBCS data)
+	#
+	# mssql_unicode			= 1 # request Unicode data from server
+
+
+	# ODBC specific DSN (data source name)
+	# mandatory for odbc source type, no default value
+	#
+	# odbc_dsn				= DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
+	# sql_query				= SELECT id, data FROM documents.csv
+
+
+	# pre-query, executed before the main fetch query
+	# multi-value, optional, default is empty list of queries
+	#
+	# sql_query_pre			= SET NAMES utf8
+	# sql_query_pre			= SET SESSION query_cache_type=OFF
+
+
+	# main document fetch query
+	# mandatory, integer document ID field MUST be the first selected column
+	sql_query				= \
+		SELECT id, hash, parent, UNIX_TIMESTAMP(timestamp) AS timestamp, subject \
+		FROM log
+
+
+	# joined/payload field fetch query
+	# joined fields let you avoid (slow) JOIN and GROUP_CONCAT
+	# payload fields let you attach custom per-keyword values (eg. for ranking)
+	#
+	# syntax is FIELD-NAME 'from'  ( 'query' | 'payload-query' ); QUERY
+	# joined field QUERY should return 2 columns (docid, text)
+	# payload field QUERY should return 3 columns (docid, keyword, weight)
+	#
+	# REQUIRES that query results are in ascending document ID order!
+	# multi-value, optional, default is empty list of queries
+	#
+	# sql_joined_field			= tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
+	# sql_joined_field			= wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC
+
+
+	# range query setup, query that must return min and max ID values
+	# optional, default is empty
+	#
+	# sql_query will need to reference $start and $end boundaries
+	# if using ranged query:
+	#
+	# sql_query				= \
+	#	SELECT doc.id, doc.id AS group, doc.title, doc.data \
+	#	FROM documents doc \
+	#	WHERE id>=$start AND id<=$end
+	#
+	# sql_query_range		= SELECT MIN(id),MAX(id) FROM documents
+
+
+	# range query step
+	# optional, default is 1024
+	#
+	# sql_range_step		= 1000
+
+
+	# unsigned integer attribute declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# optional bit size can be specified, default is 32
+	#
+	# sql_attr_uint			= author_id
+	# sql_attr_uint			= forum_id:9 # 9 bits for forum_id
+	#sql_attr_uint			= group_id
+
+	# boolean attribute declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# equivalent to sql_attr_uint with 1-bit size
+	#
+	# sql_attr_bool			= is_deleted
+
+
+	# bigint attribute declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# declares a signed (unlike uint!) 64-bit attribute
+	#
+	# sql_attr_bigint			= my_bigint_id
+
+
+	# UNIX timestamp attribute declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# similar to integer, but can also be used in date functions
+	#
+	# sql_attr_timestamp	= posted_ts
+	# sql_attr_timestamp	= last_edited_ts
+	sql_attr_timestamp		= timestamp
+
+	# string ordinal attribute declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# sorts strings (bytewise), and stores their indexes in the sorted list
+	# sorting by this attr is equivalent to sorting by the original strings
+	#
+	# sql_attr_str2ordinal	= author_name
+
+
+	# floating point attribute declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# values are stored in single precision, 32-bit IEEE 754 format
+	#
+	# sql_attr_float = lat_radians
+	# sql_attr_float = long_radians
+
+
+	# multi-valued attribute (MVA) declaration
+	# multi-value (an arbitrary number of attributes is allowed), optional
+	# MVA values are variable length lists of unsigned 32-bit integers
+	#
+	# syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
+	# ATTR-TYPE is 'uint' or 'timestamp'
+	# SOURCE-TYPE is 'field', 'query', or 'ranged-query'
+	# QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
+	# RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
+	#
+	# sql_attr_multi	= uint tag from query; SELECT id, tag FROM tags
+	# sql_attr_multi	= uint tag from ranged-query; \
+	#	SELECT id, tag FROM tags WHERE id>=$start AND id<=$end; \
+	#	SELECT MIN(id), MAX(id) FROM tags
+
+
+	# string attribute declaration
+	# multi-value (an arbitrary number of these is allowed), optional
+	# lets you store and retrieve strings
+	#
+	# sql_attr_string			= stitle
+
+
+	# wordcount attribute declaration
+	# multi-value (an arbitrary number of these is allowed), optional
+	# lets you count the words at indexing time
+	#
+	# sql_attr_str2wordcount	= stitle
+
+
+	# combined field plus attribute declaration (from a single column)
+	# stores column as an attribute, but also indexes it as a full-text field
+	#
+	# sql_field_string			= author
+	# sql_field_str2wordcount	= title
+
+	
+	# post-query, executed on sql_query completion
+	# optional, default is empty
+	#
+	# sql_query_post		=
+
+	
+	# post-index-query, executed on successful indexing completion
+	# optional, default is empty
+	# $maxid expands to max document ID actually fetched from DB
+	#
+	# sql_query_post_index = REPLACE INTO counters ( id, val ) \
+	#	VALUES ( 'max_indexed_id', $maxid )
+
+
+	# ranged query throttling, in milliseconds
+	# optional, default is 0 which means no delay
+	# enforces given delay before each query step
+	sql_ranged_throttle	= 0
+
+	# document info query, ONLY for CLI search (ie. testing and debugging)
+	# optional, default is empty
+	# must contain $id macro and must fetch the document by that id
+	sql_query_info		= SELECT * FROM log WHERE id=$id
+
+	# kill-list query, fetches the document IDs for kill-list
+	# k-list will suppress matches from preceding indexes in the same query
+	# optional, default is empty
+	#
+	# sql_query_killlist	= SELECT id FROM documents WHERE edited>=@last_reindex
+
+
+	# columns to unpack on indexer side when indexing
+	# multi-value, optional, default is empty list
+	#
+	# unpack_zlib = zlib_column
+	# unpack_mysqlcompress = compressed_column
+	# unpack_mysqlcompress = compressed_column_2
+
+
+	# maximum unpacked length allowed in MySQL COMPRESS() unpacker
+	# optional, default is 16M
+	#
+	# unpack_mysqlcompress_maxsize = 16M
+
+
+	#####################################################################
+	## xmlpipe settings
+	#####################################################################
+
+	# type				= xmlpipe
+
+	# shell command to invoke xmlpipe stream producer
+	# mandatory
+	#
+	# xmlpipe_command	= cat /var/test.xml
+
+	#####################################################################
+	## xmlpipe2 settings
+	#####################################################################
+
+	# type				= xmlpipe2
+	# xmlpipe_command	= cat /var/test2.xml
+
+
+	# xmlpipe2 field declaration
+	# multi-value, optional, default is empty
+	#
+	# xmlpipe_field				= subject
+	# xmlpipe_field				= content
+
+
+	# xmlpipe2 attribute declaration
+	# multi-value, optional, default is empty
+	# all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
+	#
+	# xmlpipe_attr_timestamp	= published
+	# xmlpipe_attr_uint			= author_id
+
+
+	# perform UTF-8 validation, and filter out incorrect codes
+	# avoids XML parser choking on non-UTF-8 documents
+	# optional, default is 0
+	#
+	# xmlpipe_fixup_utf8		= 1
+}
+
+#############################################################################
+## index definition
+#############################################################################
+
+# local index example
+#
+# this is an index which is stored locally in the filesystem
+#
+# all indexing-time options (such as morphology and charsets)
+# are configured per local index
+index git
+{
+	# index type
+	# optional, default is 'plain'
+	# known values are 'plain', 'distributed', and 'rt' (see samples below)
+	# type			= plain
+
+	# document source(s) to index
+	# multi-value, mandatory
+	# document IDs must be globally unique across all sources
+	source			= git
+
+	# index files path and file name, without extension
+	# mandatory, path must be writable, extensions will be auto-appended
+	path			= data/git
+
+	# document attribute values (docinfo) storage mode
+	# optional, default is 'extern'
+	# known values are 'none', 'extern' and 'inline'
+	docinfo			= inline
+
+	# memory locking for cached data (.spa and .spi), to prevent swapping
+	# optional, default is 0 (do not mlock)
+	# requires searchd to be run as root
+	mlock			= 0
+
+	# a list of morphology preprocessors to apply
+	# optional, default is empty
+	#
+	# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
+	# 'soundex', and 'metaphone'; additional preprocessors available from
+	# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
+	# (see libstemmer_c/libstemmer/modules.txt)
+	#
+	# morphology 	= stem_en, stem_ru, soundex
+	# morphology	= libstemmer_german
+	# morphology	= libstemmer_sv
+	morphology		= none
+
+	# minimum word length at which to enable stemming
+	# optional, default is 1 (stem everything)
+	#
+	# min_stemming_len	= 1
+
+
+	# stopword files list (space separated)
+	# optional, default is empty
+	# contents are plain text, charset_table and stemming are both applied
+	#
+	# stopwords			= data/stopwords.txt
+
+
+	# wordforms file, in "mapfrom > mapto" plain text format
+	# optional, default is empty
+	#
+	# wordforms			= data/wordforms.txt
+
+
+	# tokenizing exceptions file
+	# optional, default is empty
+	#
+	# plain text, case sensitive, space insensitive in map-from part
+	# one "Map Several Words => ToASingleOne" entry per line
+	#
+	# exceptions		= data/exceptions.txt
+
+
+	# minimum indexed word length
+	# default is 1 (index everything)
+	min_word_len		= 1
+
+	# charset encoding type
+	# optional, default is 'sbcs'
+	# known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
+	charset_type		= sbcs
+
+	# charset definition and case folding rules "table"
+	# optional, default value depends on charset_type
+	#
+	# defaults are configured to include English and Russian characters only
+	# you need to change the table to include additional ones
+	# this behavior MAY change in future versions
+	#
+	# 'sbcs' default value is
+	# charset_table		= 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
+	#
+	# 'utf-8' default value is
+	# charset_table		= 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
+
+
+	# ignored characters list
+	# optional, default value is empty
+	#
+	# ignore_chars		= U+00AD
+
+
+	# minimum word prefix length to index
+	# optional, default is 0 (do not index prefixes)
+	#
+	# min_prefix_len	= 0
+
+
+	# minimum word infix length to index
+	# optional, default is 0 (do not index infixes)
+	#
+	# min_infix_len		= 0
+
+
+	# list of fields to limit prefix/infix indexing to
+	# optional, default value is empty (index all fields in prefix/infix mode)
+	#
+	# prefix_fields		= filename
+	# infix_fields		= url, domain
+
+
+	# enable star-syntax (wildcards) when searching prefix/infix indexes
+	# search-time only, does not affect indexing, can be 0 or 1
+	# optional, default is 0 (do not use wildcard syntax)
+	#
+	# enable_star		= 1
+
+
+	# expand keywords with exact forms and/or stars when the searched index supports them
+	# search-time only, does not affect indexing, can be 0 or 1
+	# optional, default is 0 (do not expand keywords)
+	#
+	# expand_keywords		= 1
+
+	
+	# n-gram length to index, for CJK indexing
+	# only supports 0 and 1 for now, other lengths to be implemented
+	# optional, default is 0 (disable n-grams)
+	#
+	# ngram_len				= 1
+
+
+	# n-gram characters list, for CJK indexing
+	# optional, default is empty
+	#
+	# ngram_chars			= U+3000..U+2FA1F
+
+
+	# phrase boundary characters list
+	# optional, default is empty
+	#
+	# phrase_boundary		= ., ?, !, U+2026 # horizontal ellipsis
+
+
+	# phrase boundary word position increment
+	# optional, default is 0
+	#
+	# phrase_boundary_step	= 100
+
+
+	# blended characters list
+	# blended chars are indexed both as separators and valid characters
+	# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
+	# optional, default is empty
+	#
+	# blend_chars				= +, &, U+23
+
+
+	# whether to strip HTML tags from incoming documents
+	# known values are 0 (do not strip) and 1 (do strip)
+	# optional, default is 0
+	html_strip				= 0
+
+	# what HTML attributes to index if stripping HTML
+	# optional, default is empty (do not index anything)
+	#
+	# html_index_attrs		= img=alt,title; a=title;
+
+
+	# what HTML elements contents to strip
+	# optional, default is empty (do not strip element contents)
+	#
+	# html_remove_elements	= style, script
+
+
+	# whether to preopen index data files on startup
+	# optional, default is 0 (do not preopen), searchd-only
+	#
+	# preopen					= 1
+
+
+	# whether to keep dictionary (.spi) on disk, or cache it in RAM
+	# optional, default is 0 (cache in RAM), searchd-only
+	#
+	# ondisk_dict				= 1
+
+
+	# whether to enable in-place inversion (2x less disk, 90-95% speed)
+	# optional, default is 0 (use separate temporary files), indexer-only
+	#
+	# inplace_enable			= 1
+
+
+	# in-place fine-tuning options
+	# optional, defaults are listed below
+	#
+	# inplace_hit_gap			= 0		# preallocated hitlist gap size
+	# inplace_docinfo_gap		= 0		# preallocated docinfo gap size
+	# inplace_reloc_factor	= 0.1	# relocation buffer size within arena
+	# inplace_write_factor	= 0.1	# write buffer size within arena
+
+
+	# whether to index original keywords along with stemmed versions
+	# enables "=exactform" operator to work
+	# optional, default is 0
+	#
+	# index_exact_words		= 1
+
+
+	# position increment on overshort (less than min_word_len) words
+	# optional, allowed values are 0 and 1, default is 1
+	#
+	# overshort_step			= 1
+
+
+	# position increment on stopword
+	# optional, allowed values are 0 and 1, default is 1
+	#
+	# stopword_step			= 1
+
+
+	# hitless words list
+	# positions for these keywords will not be stored in the index
+	# optional, allowed values are 'all', or a list file name
+	#
+	# hitless_words			= all
+	# hitless_words			= hitless.txt
+}
+
+
-- 
2.20.1