sphinx.conf

   1 #
   2 # Sphinx configuration file sample
   3 #
   4 # WARNING! While this sample file mentions all available options,
   5 # it contains (very) short helper descriptions only. Please refer to
   6 # doc/sphinx.html for details.
   7 #
   8
   9 #############################################################################
  10 ## data source definition
  11 #############################################################################
  12
  13 source src1
  14 {
  15         # data source type. mandatory, no default value
  16         # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
  17         type                                    = mysql
  18
  19         #####################################################################
  20         ## SQL settings (for 'mysql' and 'pgsql' types)
  21         #####################################################################
  22
  23         # some straightforward parameters for SQL source types
  24         sql_host                                = localhost
  25         sql_user                                = test
  26         sql_pass                                =
  27         sql_db                                  = test
  28         sql_port                                = 3306  # optional, default is 3306
  29
  30         # UNIX socket name
  31         # optional, default is empty (reuse client library defaults)
  32         # usually '/var/lib/mysql/mysql.sock' on Linux
  33         # usually '/tmp/mysql.sock' on FreeBSD
  34         #
  35         # sql_sock                              = /tmp/mysql.sock
  36
  37
  38         # MySQL specific client connection flags
  39         # optional, default is 0
  40         #
  41         # mysql_connect_flags   = 32 # enable compression
  42
  43         # MySQL specific SSL certificate settings
  44         # optional, defaults are empty
  45         #
  46         # mysql_ssl_cert                = /etc/ssl/client-cert.pem
  47         # mysql_ssl_key         = /etc/ssl/client-key.pem
  48         # mysql_ssl_ca          = /etc/ssl/cacert.pem
  49
  50         # MS SQL specific Windows authentication mode flag
  51         # when enabled, connects using the currently logged on user's credentials
  52         # optional, default is 0
  53         #
  54         # mssql_winauth                 = 1 # use currently logged on user credentials
  55
  56
  57         # MS SQL specific Unicode indexing flag
  58         # MUST be in sync with charset_type; optional, default is 0 (request SBCS data)
  59         #
  60         # mssql_unicode                 = 1 # request Unicode data from server
  61
  62
  63         # ODBC specific DSN (data source name)
  64         # mandatory for odbc source type, no default value
  65         #
  66         # odbc_dsn                              = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
  67         # sql_query                             = SELECT id, data FROM documents.csv
  68
  69
  70         # pre-query, executed before the main fetch query
  71         # multi-value, optional, default is empty list of queries
  72         #
  73         # sql_query_pre                 = SET NAMES utf8
  74         # sql_query_pre                 = SET SESSION query_cache_type=OFF
  75
  76
  77         # main document fetch query
  78         # mandatory, integer document ID field MUST be the first selected column
  79         sql_query                               = \
  80                 SELECT id, group_id, UNIX_TIMESTAMP(date_added) AS date_added, title, content \
  81                 FROM documents
  82
  83
  84         # joined/payload field fetch query
  85         # joined fields let you avoid (slow) JOIN and GROUP_CONCAT
  86         # payload fields let you attach custom per-keyword values (eg. for ranking)
  87         #
  88         # syntax is FIELD-NAME 'from'  ( 'query' | 'payload-query' ); QUERY
  89         # joined field QUERY should return 2 columns (docid, text)
  90         # payload field QUERY should return 3 columns (docid, keyword, weight)
  91         #
  92         # REQUIRES that query results are in ascending document ID order!
  93         # multi-value, optional, default is empty list of queries
  94         #
  95         # sql_joined_field                      = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
  96         # sql_joined_field                      = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC
  97
  98
  99         # range query setup, query that must return min and max ID values
 100         # optional, default is empty
 101         #
 102         # sql_query will need to reference $start and $end boundaries
 103         # if using ranged query:
 104         #
 105         # sql_query                             = \
 106         #       SELECT doc.id, doc.id AS group, doc.title, doc.data \
 107         #       FROM documents doc \
 108         #       WHERE id>=$start AND id<=$end
 109         #
 110         # sql_query_range               = SELECT MIN(id),MAX(id) FROM documents
 111
 112
 113         # range query step
 114         # optional, default is 1024
 115         #
 116         # sql_range_step                = 1000
 117
 118
 119         # unsigned integer attribute declaration
 120         # multi-value (an arbitrary number of attributes is allowed), optional
 121         # optional bit size can be specified, default is 32
 122         #
 123         # sql_attr_uint                 = author_id
 124         # sql_attr_uint                 = forum_id:9 # 9 bits for forum_id
 125         sql_attr_uint                   = group_id
 126
 127         # boolean attribute declaration
 128         # multi-value (an arbitrary number of attributes is allowed), optional
 129         # equivalent to sql_attr_uint with 1-bit size
 130         #
 131         # sql_attr_bool                 = is_deleted
 132
 133
 134         # bigint attribute declaration
 135         # multi-value (an arbitrary number of attributes is allowed), optional
 136         # declares a signed (unlike uint!) 64-bit attribute
 137         #
 138         # sql_attr_bigint                       = my_bigint_id
 139
 140
 141         # UNIX timestamp attribute declaration
 142         # multi-value (an arbitrary number of attributes is allowed), optional
 143         # similar to integer, but can also be used in date functions
 144         #
 145         # sql_attr_timestamp    = posted_ts
 146         # sql_attr_timestamp    = last_edited_ts
 147         sql_attr_timestamp              = date_added
 148
 149         # string ordinal attribute declaration
 150         # multi-value (an arbitrary number of attributes is allowed), optional
 151         # sorts strings (bytewise), and stores their indexes in the sorted list
 152         # sorting by this attr is equivalent to sorting by the original strings
 153         #
 154         # sql_attr_str2ordinal  = author_name
 155
 156
 157         # floating point attribute declaration
 158         # multi-value (an arbitrary number of attributes is allowed), optional
 159         # values are stored in single precision, 32-bit IEEE 754 format
 160         #
 161         # sql_attr_float = lat_radians
 162         # sql_attr_float = long_radians
 163
 164
 165         # multi-valued attribute (MVA) declaration
 166         # multi-value (an arbitrary number of attributes is allowed), optional
 167         # MVA values are variable length lists of unsigned 32-bit integers
 168         #
 169         # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
 170         # ATTR-TYPE is 'uint' or 'timestamp'
 171         # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
 172         # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
 173         # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
 174         #
 175         # sql_attr_multi        = uint tag from query; SELECT id, tag FROM tags
 176         # sql_attr_multi        = uint tag from ranged-query; \
 177         #       SELECT id, tag FROM tags WHERE id>=$start AND id<=$end; \
 178         #       SELECT MIN(id), MAX(id) FROM tags
 179
 180
 181         # string attribute declaration
 182         # multi-value (an arbitrary number of these is allowed), optional
 183         # lets you store and retrieve strings; each name must match a column returned by sql_query
 184         #
 185         # sql_attr_string                       = stitle
 186         sql_attr_string                 = hash
 187         sql_attr_string                 = parent
 188         sql_attr_string                 = subject
 189 #       sql_field_str2wordcount = subject
 190
 191
 192         # wordcount attribute declaration
 193         # multi-value (an arbitrary number of these is allowed), optional
 194         # lets you count the words at indexing time
 195         #
 196         # sql_attr_str2wordcount        = stitle
 197
 198
 199         # combined field plus attribute declaration (from a single column)
 200         # stores column as an attribute, but also indexes it as a full-text field
 201         #
 202         # sql_field_string                      = author
 203         # sql_field_str2wordcount       = title
 204
 205
 206         # post-query, executed on sql_query completion
 207         # optional, default is empty
 208         #
 209         # sql_query_post                =
 210
 211
 212         # post-index-query, executed on successful indexing completion
 213         # optional, default is empty
 214         # $maxid expands to max document ID actually fetched from DB
 215         #
 216         # sql_query_post_index = REPLACE INTO counters ( id, val ) \
 217         #       VALUES ( 'max_indexed_id', $maxid )
 218
 219
 220         # ranged query throttling, in milliseconds
 221         # optional, default is 0 which means no delay
 222         # enforces given delay before each query step
 223         sql_ranged_throttle     = 0
 224
 225         # document info query, ONLY for CLI search (ie. testing and debugging)
 226         # optional, default is empty
 227         # must contain $id macro and must fetch the document by that id
 228         sql_query_info          = SELECT * FROM documents WHERE id=$id
 229
 230         # kill-list query, fetches the document IDs for kill-list
 231         # k-list will suppress matches from preceding indexes in the same query
 232         # optional, default is empty
 233         #
 234         # sql_query_killlist    = SELECT id FROM documents WHERE edited>=@last_reindex
 235
 236
 237         # columns to unpack on indexer side when indexing
 238         # multi-value, optional, default is empty list
 239         #
 240         # unpack_zlib = zlib_column
 241         # unpack_mysqlcompress = compressed_column
 242         # unpack_mysqlcompress = compressed_column_2
 243
 244
 245         # maximum unpacked length allowed in MySQL COMPRESS() unpacker
 246         # optional, default is 16M
 247         #
 248         # unpack_mysqlcompress_maxsize = 16M
 249
 250
 251         #####################################################################
 252         ## xmlpipe settings
 253         #####################################################################
 254
 255         # type                          = xmlpipe
 256
 257         # shell command to invoke xmlpipe stream producer
 258         # mandatory
 259         #
 260         # xmlpipe_command       = cat /var/test.xml
 261
 262         #####################################################################
 263         ## xmlpipe2 settings
 264         #####################################################################
 265
 266         # type                          = xmlpipe2
 267         # xmlpipe_command       = cat /var/test2.xml
 268
 269
 270         # xmlpipe2 field declaration
 271         # multi-value, optional, default is empty
 272         #
 273         # xmlpipe_field                         = subject
 274         # xmlpipe_field                         = content
 275
 276
 277         # xmlpipe2 attribute declaration
 278         # multi-value, optional, default is empty
 279         # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
 280         #
 281         # xmlpipe_attr_timestamp        = published
 282         # xmlpipe_attr_uint                     = author_id
 283
 284
 285         # perform UTF-8 validation, and filter out incorrect codes
 286         # avoids XML parser choking on non-UTF-8 documents
 287         # optional, default is 0
 288         #
 289         # xmlpipe_fixup_utf8            = 1
 290 }
 291
 292
 293 # inherited source example
 294 #
 295 # all the parameters are copied from the parent source,
 296 # and may then be overridden in this source definition
 297 source src1throttled : src1
 298 {
 299         sql_ranged_throttle                     = 100
 300 }
 301
 302 #############################################################################
 303 ## index definition
 304 #############################################################################
 305
 306 # local index example
 307 #
 308 # this is an index which is stored locally in the filesystem
 309 #
 310 # all indexing-time options (such as morphology and charsets)
 311 # are configured per local index
 312 index test1
 313 {
 314         # index type
 315         # optional, default is 'plain'
 316         # known values are 'plain', 'distributed', and 'rt' (see samples below)
 317         # type                  = plain
 318
 319         # document source(s) to index
 320         # multi-value, mandatory
 321         # document IDs must be globally unique across all sources
 322         source                  = src1
 323
 324         # index files path and file name, without extension
 325         # mandatory, path must be writable, extensions will be auto-appended
 326         path                    = data/test1
 327
 328         # document attribute values (docinfo) storage mode
 329         # optional, default is 'extern'
 330         # known values are 'none', 'extern' and 'inline'
 331         docinfo                 = extern
 332
 333         # memory locking for cached data (.spa and .spi), to prevent swapping
 334         # optional, default is 0 (do not mlock)
 335         # requires searchd to be run from root
 336         mlock                   = 0
 337
 338         # a list of morphology preprocessors to apply
 339         # optional, default is empty
 340         #
 341         # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
 342         # 'soundex', and 'metaphone'; additional preprocessors available from
 343         # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
 344         # (see libstemmer_c/libstemmer/modules.txt)
 345         #
 346         # morphology    = stem_en, stem_ru, soundex
 347         # morphology    = libstemmer_german
 348         # morphology    = libstemmer_sv
 349         morphology              = none
 350
 351         # minimum word length at which to enable stemming
 352         # optional, default is 1 (stem everything)
 353         #
 354         # min_stemming_len      = 1
 355
 356
 357         # stopword files list (space separated)
 358         # optional, default is empty
 359         # contents are plain text, charset_table and stemming are both applied
 360         #
 361         # stopwords                     = data/stopwords.txt
 362
 363
 364         # wordforms file, in "mapfrom > mapto" plain text format
 365         # optional, default is empty
 366         #
 367         # wordforms                     = data/wordforms.txt
 368
 369
 370         # tokenizing exceptions file
 371         # optional, default is empty
 372         #
 373         # plain text, case sensitive, space insensitive in map-from part
 374         # one "Map Several Words => ToASingleOne" entry per line
 375         #
 376         # exceptions            = data/exceptions.txt
 377
 378
 379         # minimum indexed word length
 380         # default is 1 (index everything)
 381         min_word_len            = 1
 382
 383         # charset encoding type
 384         # optional, default is 'sbcs'
 385         # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
 386         charset_type            = sbcs
 387
 388         # charset definition and case folding rules "table"
 389         # optional, default value depends on charset_type
 390         #
 391         # defaults are configured to include English and Russian characters only
 392         # you need to change the table to include additional ones
 393         # this behavior MAY change in future versions
 394         #
 395         # 'sbcs' default value is
 396         # charset_table         = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
 397         #
 398         # 'utf-8' default value is
 399         # charset_table         = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
 400
 401
 402         # ignored characters list
 403         # optional, default value is empty
 404         #
 405         # ignore_chars          = U+00AD
 406
 407
 408         # minimum word prefix length to index
 409         # optional, default is 0 (do not index prefixes)
 410         #
 411         # min_prefix_len        = 0
 412
 413
 414         # minimum word infix length to index
 415         # optional, default is 0 (do not index infixes)
 416         #
 417         # min_infix_len         = 0
 418
 419
 420         # list of fields to limit prefix/infix indexing to
 421         # optional, default value is empty (index all fields in prefix/infix mode)
 422         #
 423         # prefix_fields         = filename
 424         # infix_fields          = url, domain
 425
 426
 427         # enable star-syntax (wildcards) when searching prefix/infix indexes
 428         # search-time only, does not affect indexing, can be 0 or 1
 429         # optional, default is 0 (do not use wildcard syntax)
 430         #
 431         # enable_star           = 1
 432
 433
 434         # expand keywords with exact forms and/or stars when searching suitable indexes
 435         # search-time only, does not affect indexing, can be 0 or 1
 436         # optional, default is 0 (do not expand keywords)
 437         #
 438         # expand_keywords               = 1
 439
 440
 441         # n-gram length to index, for CJK indexing
 442         # only supports 0 and 1 for now, other lengths to be implemented
 443         # optional, default is 0 (disable n-grams)
 444         #
 445         # ngram_len                             = 1
 446
 447
 448         # n-gram characters list, for CJK indexing
 449         # optional, default is empty
 450         #
 451         # ngram_chars                   = U+3000..U+2FA1F
 452
 453
 454         # phrase boundary characters list
 455         # optional, default is empty
 456         #
 457         # phrase_boundary               = ., ?, !, U+2026 # horizontal ellipsis
 458
 459
 460         # phrase boundary word position increment
 461         # optional, default is 0
 462         #
 463         # phrase_boundary_step  = 100
 464
 465
 466         # blended characters list
 467         # blended chars are indexed both as separators and valid characters
 468         # for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
 469         # optional, default is empty
 470         #
 471         # blend_chars                           = +, &, U+23
 472
 473
 474         # whether to strip HTML tags from incoming documents
 475         # known values are 0 (do not strip) and 1 (do strip)
 476         # optional, default is 0
 477         html_strip                              = 0
 478
 479         # what HTML attributes to index if stripping HTML
 480         # optional, default is empty (do not index anything)
 481         #
 482         # html_index_attrs              = img=alt,title; a=title;
 483
 484
 485         # what HTML elements contents to strip
 486         # optional, default is empty (do not strip element contents)
 487         #
 488         # html_remove_elements  = style, script
 489
 490
 491         # whether to preopen index data files on startup
 492         # optional, default is 0 (do not preopen), searchd-only
 493         #
 494         # preopen                                       = 1
 495
 496
 497         # whether to keep dictionary (.spi) on disk, or cache it in RAM
 498         # optional, default is 0 (cache in RAM), searchd-only
 499         #
 500         # ondisk_dict                           = 1
 501
 502
 503         # whether to enable in-place inversion (2x less disk, 90-95% speed)
 504         # optional, default is 0 (use separate temporary files), indexer-only
 505         #
 506         # inplace_enable                        = 1
 507
 508
 509         # in-place fine-tuning options
 510         # optional, defaults are listed below
 511         #
 512         # inplace_hit_gap                       = 0             # preallocated hitlist gap size
 513         # inplace_docinfo_gap           = 0             # preallocated docinfo gap size
 514         # inplace_reloc_factor  = 0.1   # relocation buffer size within arena
 515         # inplace_write_factor  = 0.1   # write buffer size within arena
 516
 517
 518         # whether to index original keywords along with stemmed versions
 519         # enables "=exactform" operator to work
 520         # optional, default is 0
 521         #
 522         # index_exact_words             = 1
 523
 524
 525         # position increment on overshort (less than min_word_len) words
 526         # optional, allowed values are 0 and 1, default is 1
 527         #
 528         # overshort_step                        = 1
 529
 530
 531         # position increment on stopword
 532         # optional, allowed values are 0 and 1, default is 1
 533         #
 534         # stopword_step                 = 1
 535
 536
 537         # hitless words list
 538         # positions for these keywords will not be stored in the index
 539         # optional, allowed values are 'all', or a list file name
 540         #
 541         # hitless_words                 = all
 542         # hitless_words                 = hitless.txt
 543 }
 544
 545
 546 # inherited index example
 547 #
 548 # all the parameters are copied from the parent index,
 549 # and may then be overridden in this index definition
 550 index test1stemmed : test1
 551 {
 552         path                    = data/test1stemmed
 553         morphology              = stem_en
 554 }
 555
 556
 557 # distributed index example
 558 #
 559 # this is a virtual index which can NOT be directly indexed,
 560 # and only contains references to other local and/or remote indexes
 561 index dist1
 562 {
 563         # 'distributed' index type MUST be specified
 564         type                            = distributed
 565
 566         # local index to be searched
 567         # there can be many local indexes configured
 568         local                           = test1
 569         local                           = test1stemmed
 570
 571         # remote agent
 572         # multiple remote agents may be specified
 573         # syntax for TCP connections is 'hostname:port:index1[,index2[,...]]'
 574         # syntax for local UNIX connections is '/path/to/socket:index1[,index2[,...]]'
 575         agent                           = localhost:9313:remote1
 576         agent                           = localhost:9314:remote2,remote3
 577         # agent                         = run/searchd.sock:remote4
 578
 579         # blackhole remote agent, for debugging/testing
 580         # network errors and search results will be ignored
 581         #
 582         # agent_blackhole               = testbox:9312:testindex1,testindex2
 583
 584
 585         # remote agent connection timeout, milliseconds
 586         # optional, default is 1000 ms, ie. 1 sec
 587         agent_connect_timeout   = 1000
 588
 589         # remote agent query timeout, milliseconds
 590         # optional, default is 3000 ms, ie. 3 sec
 591         agent_query_timeout             = 3000
 592 }
 593
 594
 595 # realtime index example
 596 #
 597 # you can run INSERT, REPLACE, and DELETE on this index on the fly
 598 # using MySQL protocol (see 'listen' directive below)
 599 index rt
 600 {
 601         # 'rt' index type must be specified to use RT index
 602         type                            = rt
 603
 604         # index files path and file name, without extension
 605         # mandatory, path must be writable, extensions will be auto-appended
 606         path                            = data/rt
 607
 608         # RAM chunk size limit
 609         # RT index will keep at most this much data in RAM, then flush to disk
 610         # optional, default is 32M
 611         #
 612         # rt_mem_limit          = 512M
 613
 614         # full-text field declaration
 615         # multi-value, mandatory
 616         rt_field                        = title
 617         rt_field                        = content
 618
 619         # unsigned integer attribute declaration
 620         # multi-value (an arbitrary number of attributes is allowed), optional
 621         # declares an unsigned 32-bit attribute
 622         rt_attr_uint                    = gid
 623
 624         # bigint attribute declaration
 625         # multi-value (an arbitrary number of attributes is allowed), optional
 626         # declares a signed 64-bit attribute
 627         #
 628         # rt_attr_bigint                = guid
 629
 630
 631         # floating point attribute declaration
 632         # multi-value (an arbitrary number of attributes is allowed), optional
 633         # declares a single precision, 32-bit IEEE 754 format float attribute
 634         #
 635         # rt_attr_float         = gpa
 636 }
 637
 638 #############################################################################
 639 ## indexer settings
 640 #############################################################################
 641
 642 indexer
 643 {
 644         # memory limit, in bytes, kilobytes (16384K) or megabytes (256M)
 645         # optional, default is 32M, max is 2047M, recommended is 256M to 1024M
 646         mem_limit                       = 32M
 647
 648         # maximum IO calls per second (for I/O throttling)
 649         # optional, default is 0 (unlimited)
 650         #
 651         # max_iops                      = 40
 652
 653
 654         # maximum IO call size, bytes (for I/O throttling)
 655         # optional, default is 0 (unlimited)
 656         #
 657         # max_iosize            = 1048576
 658
 659
 660         # maximum xmlpipe2 field length, bytes
 661         # optional, default is 2M
 662         #
 663         # max_xmlpipe2_field    = 4M
 664
 665
 666         # write buffer size, bytes
 667         # several (currently up to 4) buffers will be allocated
 668         # write buffers are allocated in addition to mem_limit
 669         # optional, default is 1M
 670         #
 671         # write_buffer          = 1M
 672 }
 673
 674 #############################################################################
 675 ## searchd settings
 676 #############################################################################
 677
 678 searchd
 679 {
 680         # hostname, port, or hostname:port, or /unix/socket/path to listen on
 681         # multi-value, multiple listen points are allowed
 682         # optional, default is 0.0.0.0:9312 (listen on all interfaces, port 9312)
 683         #
 684         # listen                                = 127.0.0.1
 685         # listen                                = 192.168.0.1:9312
 686         # listen                                = 9312
 687         # listen                                = run/searchd.sock
 688         listen = localhost:9306:mysql41
 689
 690
 691         # log file, searchd run info is logged here
 692         # optional, default is 'searchd.log'
 693         log                                     = log/searchd.log
 694
 695         # query log file, all search queries are logged here
 696         # optional, default is empty (do not log queries)
 697         query_log                       = log/query.log
 698
 699         # client read timeout, seconds
 700         # optional, default is 5
 701         read_timeout            = 5
 702
 703         # request timeout, seconds
 704         # optional, default is 5 minutes
 705         client_timeout          = 300
 706
 707         # maximum amount of children to fork (concurrent searches to run)
 708         # optional, default is 0 (unlimited)
 709         max_children            = 30
 710
 711         # PID file, searchd process ID file name
 712         # mandatory
 713         pid_file                        = log/searchd.pid
 714
 715         # max amount of matches the daemon ever keeps in RAM, per-index
 716         # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
 717         # default is 1000 (just like Google)
 718         max_matches                     = 1000
 719
 720         # seamless rotate, prevents rotate stalls if precaching huge datasets
 721         # optional, default is 1
 722         seamless_rotate         = 1
 723
 724         # whether to forcibly preopen all indexes on startup
 725         # optional, default is 0 (do not preopen)
 726         preopen_indexes         = 0
 727
 728         # whether to unlink .old index copies on successful rotation.
 729         # optional, default is 1 (do unlink)
 730         unlink_old                      = 1
 731
 732         # attribute updates periodic flush timeout, seconds
 733         # updates will be automatically dumped to disk this frequently
 734         # optional, default is 0 (disable periodic flush)
 735         #
 736         # attr_flush_period     = 900
 737
 738
 739         # instance-wide ondisk_dict defaults (per-index value take precedence)
 740         # optional, default is 0 (precache all dictionaries in RAM)
 741         #
 742         # ondisk_dict_default   = 1
 743
 744
 745         # MVA updates pool size
 746         # shared between all instances of searchd, disables attr flushes!
 747         # optional, default size is 1M
 748         mva_updates_pool        = 1M
 749
 750         # max allowed network packet size
 751         # limits both query packets from clients, and responses from agents
 752         # optional, default size is 8M
 753         max_packet_size         = 8M
 754
 755         # crash log path
 756         # searchd will (try to) log crashed query to 'crash_log_path.PID' file
 757         # optional, default is empty (do not create crash logs)
 758         #
 759         # crash_log_path                = log/crash
 760
 761
 762         # max allowed per-query filter count
 763         # optional, default is 256
 764         max_filters                     = 256
 765
 766         # max allowed per-filter values count
 767         # optional, default is 4096
 768         max_filter_values       = 4096
 769
 770
 771         # socket listen queue length
 772         # optional, default is 5
 773         #
 774         # listen_backlog                = 5
 775
 776
 777         # per-keyword read buffer size
 778         # optional, default is 256K
 779         #
 780         # read_buffer                   = 256K
 781
 782
 783         # unhinted read size (currently used when reading hits)
 784         # optional, default is 32K
 785         #
 786         # read_unhinted         = 32K
 787
 788
 789         # max allowed per-batch query count (aka multi-query count)
 790         # optional, default is 32
 791         max_batch_queries       = 32
 792
 793
 794         # max common subtree document cache size, per-query
 795         # optional, default is 512K, 0 means disable subtree optimization
 796         #
 797         # subtree_docs_cache    = 4M
 798
 799
 800         # max common subtree hit cache size, per-query
 801         # optional, default is 1M, 0 means disable subtree optimization
 802         #
 803         # subtree_hits_cache    = 8M
 804
 805
 806         # multi-processing mode (MPM)
 807         # known values are none, fork, prefork, and threads
 808         # optional, default is fork
 809         #
 810         # workers                               = fork
 811         workers=threads
 812
 813
 814         # max threads to create for searching local parts of a distributed index
 815         # optional, default is 0, which means disable multi-threaded searching
 816         # should work with all MPMs (ie. does NOT require workers=threads)
 817         #
 818         # dist_threads          = 4
 819 }
 820
 821 # --eof--
 822
 823 source git
 824 {
 825         # data source type. mandatory, no default value
 826         # known types are mysql, pgsql, mssql, xmlpipe, xmlpipe2, odbc
 827         type                                    = mysql
 828
 829         #####################################################################
 830         ## SQL settings (for 'mysql' and 'pgsql' types)
 831         #####################################################################
 832
 833         # some straightforward parameters for SQL source types
 834         sql_host                                = localhost
 835         sql_user                                = dpavlin
 836         sql_pass                                =
 837         sql_db                                  = git
 838         sql_port                                = 3306  # optional, default is 3306
 839
 840         # UNIX socket name
 841         # optional, default is empty (reuse client library defaults)
 842         # usually '/var/lib/mysql/mysql.sock' on Linux
 843         # usually '/tmp/mysql.sock' on FreeBSD
 844         #
 845         # sql_sock                              = /tmp/mysql.sock
 846
 847
 848         # MySQL specific client connection flags
 849         # optional, default is 0
 850         #
 851         # mysql_connect_flags   = 32 # enable compression
 852
 853         # MySQL specific SSL certificate settings
 854         # optional, defaults are empty
 855         #
 856         # mysql_ssl_cert                = /etc/ssl/client-cert.pem
 857         # mysql_ssl_key         = /etc/ssl/client-key.pem
 858         # mysql_ssl_ca          = /etc/ssl/cacert.pem
 859
 860         # MS SQL specific Windows authentication mode flag
 861         # optional, default is 0
 862         #
 863         # mssql_winauth                 = 1 # use currently logged on user credentials
 864
 865
 866         # MS SQL specific Unicode indexing flag
 867         # MUST be in sync with charset_type index-level setting
 868         # optional, default is 0 (request SBCS data)
 869         #
 870         # mssql_unicode                 = 1 # request Unicode data from server
 871
 872
 873         # ODBC specific DSN (data source name)
 874         # mandatory for odbc source type, no default value
 875         #
 876         # odbc_dsn                              = DBQ=C:\data;DefaultDir=C:\data;Driver={Microsoft Text Driver (*.txt; *.csv)};
 877         # sql_query                             = SELECT id, data FROM documents.csv
 878
 879
 880         # pre-query, executed before the main fetch query
 881         # multi-value, optional, default is empty list of queries
 882         #
 883         # sql_query_pre                 = SET NAMES utf8
 884         # sql_query_pre                 = SET SESSION query_cache_type=OFF
 885
 886
 887         # main document fetch query
 888         # mandatory, integer document ID field MUST be the first selected column
 889         sql_query                               = \
 890                 SELECT id, hash, parent, UNIX_TIMESTAMP(timestamp) AS timestamp, subject \
 891                 FROM log
 892
 893
 894         # joined/payload field fetch query
 895         # joined fields let you avoid (slow) JOIN and GROUP_CONCAT
 896         # payload fields let you attach custom per-keyword values (eg. for ranking)
 897         #
 898         # syntax is FIELD-NAME 'from'  ( 'query' | 'payload-query' ); QUERY
 899         # joined field QUERY should return 2 columns (docid, text)
 900         # payload field QUERY should return 3 columns (docid, keyword, weight)
 901         #
 902         # REQUIRES that query results are in ascending document ID order!
 903         # multi-value, optional, default is empty list of queries
 904         #
 905         # sql_joined_field                      = tags from query; SELECT docid, CONCAT('tag',tagid) FROM tags ORDER BY docid ASC
 906         # sql_joined_field                      = wtags from payload-query; SELECT docid, tag, tagweight FROM tags ORDER BY docid ASC
 907
 908
 909         # range query setup, query that must return min and max ID values
 910         # optional, default is empty
 911         #
 912         # sql_query will need to reference $start and $end boundaries
 913         # if using ranged query:
 914         #
 915         # sql_query                             = \
 916         #       SELECT doc.id, doc.id AS group, doc.title, doc.data \
 917         #       FROM documents doc \
 918         #       WHERE id>=$start AND id<=$end
 919         #
 920         # sql_query_range               = SELECT MIN(id),MAX(id) FROM documents
 921
 922
 923         # range query step
 924         # optional, default is 1024
 925         #
 926         # sql_range_step                = 1000
 927
 928
 929         # unsigned integer attribute declaration
 930         # multi-value (an arbitrary number of attributes is allowed), optional
 931         # optional bit size can be specified, default is 32
 932         #
 933         # sql_attr_uint                 = author_id
 934         # sql_attr_uint                 = forum_id:9 # 9 bits for forum_id
 935         #sql_attr_uint                  = group_id
 936
 937         # boolean attribute declaration
 938         # multi-value (an arbitrary number of attributes is allowed), optional
 939         # equivalent to sql_attr_uint with 1-bit size
 940         #
 941         # sql_attr_bool                 = is_deleted
 942
 943
 944         # bigint attribute declaration
 945         # multi-value (an arbitrary number of attributes is allowed), optional
 946         # declares a signed (unlike uint!) 64-bit attribute
 947         #
 948         # sql_attr_bigint                       = my_bigint_id
 949
 950
 951         # UNIX timestamp attribute declaration
 952         # multi-value (an arbitrary number of attributes is allowed), optional
 953         # similar to integer, but can also be used in date functions
 954         #
 955         # sql_attr_timestamp    = posted_ts
 956         # sql_attr_timestamp    = last_edited_ts
 957         sql_attr_timestamp              = timestamp
 958
 959         # string ordinal attribute declaration
 960         # multi-value (an arbitrary number of attributes is allowed), optional
 961         # sorts strings (bytewise), and stores their indexes in the sorted list
 962         # sorting by this attr is equivalent to sorting by the original strings
 963         #
 964         # sql_attr_str2ordinal  = author_name
 965
 966
 967         # floating point attribute declaration
 968         # multi-value (an arbitrary number of attributes is allowed), optional
 969         # values are stored in single precision, 32-bit IEEE 754 format
 970         #
 971         # sql_attr_float = lat_radians
 972         # sql_attr_float = long_radians
 973
 974
 975         # multi-valued attribute (MVA) attribute declaration
 976         # multi-value (an arbitrary number of attributes is allowed), optional
 977         # MVA values are variable length lists of unsigned 32-bit integers
 978         #
 979         # syntax is ATTR-TYPE ATTR-NAME 'from' SOURCE-TYPE [;QUERY] [;RANGE-QUERY]
 980         # ATTR-TYPE is 'uint' or 'timestamp'
 981         # SOURCE-TYPE is 'field', 'query', or 'ranged-query'
 982         # QUERY is SQL query used to fetch all ( docid, attrvalue ) pairs
 983         # RANGE-QUERY is SQL query used to fetch min and max ID values, similar to 'sql_query_range'
 984         #
 985         # sql_attr_multi        = uint tag from query; SELECT id, tag FROM tags
 986         # sql_attr_multi        = uint tag from ranged-query; \
 987         #       SELECT id, tag FROM tags WHERE id>=$start AND id<=$end; \
 988         #       SELECT MIN(id), MAX(id) FROM tags
 989
 990
 991         # string attribute declaration
 992         # multi-value (an arbitrary number of these is allowed), optional
 993         # lets you store and retrieve strings
 994         #
 995         # sql_attr_string                       = stitle
 996
 997
 998         # wordcount attribute declaration
 999         # multi-value (an arbitrary number of these is allowed), optional
1000         # lets you count the words at indexing time
1001         #
1002         # sql_attr_str2wordcount        = stitle
1003
1004
1005         # combined field plus attribute declaration (from a single column)
1006         # stores column as an attribute, but also indexes it as a full-text field
1007         #
1008         # sql_field_string                      = author
1009         # sql_field_str2wordcount       = title
1010
1011
1012         # post-query, executed on sql_query completion
1013         # optional, default is empty
1014         #
1015         # sql_query_post                =
1016
1017
1018         # post-index-query, executed on successful indexing completion
1019         # optional, default is empty
1020         # $maxid expands to max document ID actually fetched from DB
1021         #
1022         # sql_query_post_index = REPLACE INTO counters ( id, val ) \
1023         #       VALUES ( 'max_indexed_id', $maxid )
1024
1025
1026         # ranged query throttling, in milliseconds
1027         # optional, default is 0 which means no delay
1028         # enforces given delay before each query step
1029         sql_ranged_throttle     = 0
1030
1031         # document info query, ONLY for CLI search (ie. testing and debugging)
1032         # optional, default is empty
1033         # must contain $id macro and must fetch the document by that id
1034         sql_query_info          = SELECT * FROM log WHERE id=$id
1035
1036         # kill-list query, fetches the document IDs for kill-list
1037         # k-list will suppress matches from preceding indexes in the same query
1038         # optional, default is empty
1039         #
1040         # sql_query_killlist    = SELECT id FROM documents WHERE edited>=@last_reindex
1041
1042
1043         # columns to unpack on indexer side when indexing
1044         # multi-value, optional, default is empty list
1045         #
1046         # unpack_zlib = zlib_column
1047         # unpack_mysqlcompress = compressed_column
1048         # unpack_mysqlcompress = compressed_column_2
1049
1050
1051         # maximum unpacked length allowed in MySQL COMPRESS() unpacker
1052         # optional, default is 16M
1053         #
1054         # unpack_mysqlcompress_maxsize = 16M
1055
1056
1057         #####################################################################
1058         ## xmlpipe settings
1059         #####################################################################
1060
1061         # type                          = xmlpipe
1062
1063         # shell command to invoke xmlpipe stream producer
1064         # mandatory
1065         #
1066         # xmlpipe_command       = cat /var/test.xml
1067
1068         #####################################################################
1069         ## xmlpipe2 settings
1070         #####################################################################
1071
1072         # type                          = xmlpipe2
1073         # xmlpipe_command       = cat /var/test2.xml
1074
1075
1076         # xmlpipe2 field declaration
1077         # multi-value, optional, default is empty
1078         #
1079         # xmlpipe_field                         = subject
1080         # xmlpipe_field                         = content
1081
1082
1083         # xmlpipe2 attribute declaration
1084         # multi-value, optional, default is empty
1085         # all xmlpipe_attr_XXX options are fully similar to sql_attr_XXX
1086         #
1087         # xmlpipe_attr_timestamp        = published
1088         # xmlpipe_attr_uint                     = author_id
1089
1090
1091         # perform UTF-8 validation, and filter out incorrect codes
1092         # avoids XML parser choking on non-UTF-8 documents
1093         # optional, default is 0
1094         #
1095         # xmlpipe_fixup_utf8            = 1
1096 }
1097
1098 #############################################################################
1099 ## index definition
1100 #############################################################################
1101
1102 # local index example
1103 #
1104 # this is an index which is stored locally in the filesystem
1105 #
1106 # all indexing-time options (such as morphology and charsets)
1107 # are configured per local index
1108 index git
1109 {
1110         # index type
1111         # optional, default is 'plain'
1112         # known values are 'plain', 'distributed', and 'rt' (see samples below)
1113         # type                  = plain
1114
1115         # document source(s) to index
1116         # multi-value, mandatory
1117         # document IDs must be globally unique across all sources
1118         source                  = git
1119
1120         # index files path and file name, without extension
1121         # mandatory, path must be writable, extensions will be auto-appended
1122         path                    = data/git
1123
1124         # document attribute values (docinfo) storage mode
1125         # optional, default is 'extern'
1126         # known values are 'none', 'extern' and 'inline'
1127         docinfo                 = inline
1128
1129         # memory locking for cached data (.spa and .spi), to prevent swapping
1130         # optional, default is 0 (do not mlock)
1131         # requires searchd to be run as root
1132         mlock                   = 0
1133
1134         # a list of morphology preprocessors to apply
1135         # optional, default is empty
1136         #
1137         # builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
1138         # 'soundex', and 'metaphone'; additional preprocessors available from
1139         # libstemmer are 'libstemmer_XXX', where XXX is algorithm code
1140         # (see libstemmer_c/libstemmer/modules.txt)
1141         #
1142         # morphology    = stem_en, stem_ru, soundex
1143         # morphology    = libstemmer_german
1144         # morphology    = libstemmer_sv
1145         morphology              = none
1146
1147         # minimum word length at which to enable stemming
1148         # optional, default is 1 (stem everything)
1149         #
1150         # min_stemming_len      = 1
1151
1152
1153         # stopword files list (space separated)
1154         # optional, default is empty
1155         # contents are plain text, charset_table and stemming are both applied
1156         #
1157         # stopwords                     = data/stopwords.txt
1158
1159
1160         # wordforms file, in "mapfrom > mapto" plain text format
1161         # optional, default is empty
1162         #
1163         # wordforms                     = data/wordforms.txt
1164
1165
1166         # tokenizing exceptions file
1167         # optional, default is empty
1168         #
1169         # plain text, case sensitive, space insensitive in map-from part
1170         # one "Map Several Words => ToASingleOne" entry per line
1171         #
1172         # exceptions            = data/exceptions.txt
1173
1174
1175         # minimum indexed word length
1176         # default is 1 (index everything)
1177         min_word_len            = 1
1178
1179         # charset encoding type
1180         # optional, default is 'sbcs'
1181         # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
1182         charset_type            = sbcs
1183
1184         # charset definition and case folding rules "table"
1185         # optional, default value depends on charset_type
1186         #
1187         # defaults are configured to include English and Russian characters only
1188         # you need to change the table to include additional ones
1189         # this behavior MAY change in future versions
1190         #
1191         # 'sbcs' default value is
1192         # charset_table         = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
1193         #
1194         # 'utf-8' default value is
1195         # charset_table         = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
1196
1197
1198         # ignored characters list
1199         # optional, default value is empty
1200         #
1201         # ignore_chars          = U+00AD
1202
1203
1204         # minimum word prefix length to index
1205         # optional, default is 0 (do not index prefixes)
1206         #
1207         # min_prefix_len        = 0
1208
1209
1210         # minimum word infix length to index
1211         # optional, default is 0 (do not index infixes)
1212         #
1213         # min_infix_len         = 0
1214
1215
1216         # list of fields to limit prefix/infix indexing to
1217         # optional, default value is empty (index all fields in prefix/infix mode)
1218         #
1219         # prefix_fields         = filename
1220         # infix_fields          = url, domain
1221
1222
1223         # enable star-syntax (wildcards) when searching prefix/infix indexes
1224         # search-time only, does not affect indexing, can be 0 or 1
1225         # optional, default is 0 (do not use wildcard syntax)
1226         #
1227         # enable_star           = 1
1228
1229
1230         # expand keywords with exact forms and/or stars when searching suitable indexes
1231         # search-time only, does not affect indexing, can be 0 or 1
1232         # optional, default is 0 (do not expand keywords)
1233         #
1234         # expand_keywords               = 1
1235
1236
1237         # n-gram length to index, for CJK indexing
1238         # only supports 0 and 1 for now, other lengths to be implemented
1239         # optional, default is 0 (disable n-grams)
1240         #
1241         # ngram_len                             = 1
1242
1243
1244         # n-gram characters list, for CJK indexing
1245         # optional, default is empty
1246         #
1247         # ngram_chars                   = U+3000..U+2FA1F
1248
1249
1250         # phrase boundary characters list
1251         # optional, default is empty
1252         #
1253         # phrase_boundary               = ., ?, !, U+2026 # horizontal ellipsis
1254
1255
1256         # phrase boundary word position increment
1257         # optional, default is 0
1258         #
1259         # phrase_boundary_step  = 100
1260
1261
1262         # blended characters list
1263         # blended chars are indexed both as separators and valid characters
1264         # for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
1265         # optional, default is empty
1266         #
1267         # blend_chars                           = +, &, U+23
1268
1269
1270         # whether to strip HTML tags from incoming documents
1271         # known values are 0 (do not strip) and 1 (do strip)
1272         # optional, default is 0
1273         html_strip                              = 0
1274
1275         # what HTML attributes to index if stripping HTML
1276         # optional, default is empty (do not index anything)
1277         #
1278         # html_index_attrs              = img=alt,title; a=title;
1279
1280
1281         # what HTML elements contents to strip
1282         # optional, default is empty (do not strip element contents)
1283         #
1284         # html_remove_elements  = style, script
1285
1286
1287         # whether to preopen index data files on startup
1288         # optional, default is 0 (do not preopen), searchd-only
1289         #
1290         # preopen                                       = 1
1291
1292
1293         # whether to keep dictionary (.spi) on disk, or cache it in RAM
1294         # optional, default is 0 (cache in RAM), searchd-only
1295         #
1296         # ondisk_dict                           = 1
1297
1298
1299         # whether to enable in-place inversion (2x less disk, 90-95% speed)
1300         # optional, default is 0 (use separate temporary files), indexer-only
1301         #
1302         # inplace_enable                        = 1
1303
1304
1305         # in-place fine-tuning options
1306         # optional, defaults are listed below
1307         #
1308         # inplace_hit_gap                       = 0             # preallocated hitlist gap size
1309         # inplace_docinfo_gap           = 0             # preallocated docinfo gap size
1310         # inplace_reloc_factor  = 0.1   # relocation buffer size within arena
1311         # inplace_write_factor  = 0.1   # write buffer size within arena
1312
1313
1314         # whether to index original keywords along with stemmed versions
1315         # enables "=exactform" operator to work
1316         # optional, default is 0
1317         #
1318         # index_exact_words             = 1
1319
1320
1321         # position increment on overshort (less than min_word_len) words
1322         # optional, allowed values are 0 and 1, default is 1
1323         #
1324         # overshort_step                        = 1
1325
1326
1327         # position increment on stopword
1328         # optional, allowed values are 0 and 1, default is 1
1329         #
1330         # stopword_step                 = 1
1331
1332
1333         # hitless words list
1334         # positions for these keywords will not be stored in the index
1335         # optional, allowed values are 'all', or a list file name
1336         #
1337         # hitless_words                 = all
1338         # hitless_words                 = hitless.txt
1339 }
1340
1341