2 openisis - an open implementation of the CDS/ISIS database
3 Version 0.8.x (patchlevel see file Version)
4 Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with this library; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 see README for more information
24 $Id: openisis.c,v 1.67 2003/06/03 11:25:02 kripke Exp $
25 main file of openisis executable.
33 #include <stdlib.h> /* free */
34 #include <string.h> /* strcmp */
36 # include <sys/timeb.h>
37 # include <sys/types.h>
38 # define timeval _timeb
40 # include <unistd.h> /* gettimeofday */
42 #include <sys/time.h> /* gettimeofday */
44 #include <pthread.h> /* threaded crashtest */
50 temporary library includes for testing of functions
51 that should later be accessible via openisis.h
54 extern Db* ldb_getdb (int dbid);
57 /* ************************************************************
/*
	Actions the executable can perform.
	main keeps the selected one in the variable what and dispatches on it.
	The line closing this enum (with its typedef name) is not part of this listing.
*/
60 typedef enum { /* what to do */
62 DO_SCAN, /* simple full scan */
63 DO_SEARCH, /* basic index based searching */
64 DO_TERMS, /* list terms */
65 DO_CHECK, /* check db */
66 DO_PERF, /* do random reads for performance checking */
67 DO_CRASH, /* do multi-threaded crashtest */
68 DO_SPLIT, /* split a field value */
69 DO_STREAM, /* stream in records */
70 DO_MFNLIST, /* fetch records by mfn list */
71 DO_IFLOAD, /* read a lk2-style file from stdin */
72 DO_SWLOAD, /* load stopwords */
73 DO_IFDUMP, /* dump a lk2-style file to stdout */
74 DO_FDT, /* print fdt */
75 DO_VUTF /* validate UTF-8 input */
/*
	Output formats (type format, used by print and printid; main defaults to FMT_TXT).
	Order matters: print tests FMT_TXT <= f to catch both plaintext styles,
	so FMT_TXT and FMT_TXTW have to stay the last two enumerators.
	The opening and closing lines of this enum are not part of this listing.
*/
79 FMT_MFN, /* rowid only */
80 FMT_MFNF, /* rowid, 1st field */
81 FMT_PROP, /* property style */
82 FMT_TXT, /* plaintext masterfile style */
83 FMT_TXTW /* plaintext masterfile style with W lines */
/*
	Index dump modes (type ifmode). main starts out with IFM_DUMP, an enumerator
	whose declaration line is not part of this listing, and finishes the target
	index after the loop when the mode is IFM_COPY.
*/
89 IFM_OLD, /* dump old index */
90 IFM_COPY, /* copy old index */
91 IFM_CHK /* check (new oxi) index */
95 /* ************************************************************
/* format string handed to openIsisFmt by print; presumably set from the -pft option */
98 static const char *pft;
/* crashtest tables: term is sized by the number of terms, val by the highest mfn + 1 */
99 static const char **term, **val;
/* crashtest: one posting set per term, filled by openIsisQuery in crash */
100 static OpenIsisSet *post;
/*
	db:     database being read, -1 until opened
	wdb:    write target database, negative when none was opened
	append: related to the -append option; its use is not visible in this listing
	idxall: when set, print builds index entries for every field and writes the record
*/
102 static int db = -1, wdb = -1, append = 0, idxall = 0;
104 /* ************************************************************
/*
	Match an option name that requires a value.
	param: the option name this call tests for
	n:     the option name found on the command line
	v:     the value found for it, or 0 when none was given
	returns 1 when n equals param and v is non-null, otherwise 0.
	When the names match but v is missing, OPENISIS_ERR_INVAL is reported
	through openIsisSMsg before 0 is returned.
*/
107 static int argchk ( const char *param, const char *n, const char *v )
109 return strcmp( param, n ) ? 0 : v ? 1 :
110 (openIsisSMsg( OPENISIS_ERR_INVAL, "no value for param '%s'", param ), 0);
/*
	Print record r in output format f and, when idxall is set, index all its
	fields and write it to wdb.
	r:      record to print
	freeit: presumably tells print to release r afterwards; the lines using it
	        are not part of this listing -- TODO confirm
	f:      output format
	The return statement is not visible either. The DO_SCAN loop in main assigns
	the result to its rowid counter and stops when it is not positive.
*/
114 static int print ( OpenIsisRec *r, int freeit, format f )
/* local record with 10000 bytes of storage, used as the target of openIsisFmt */
116 union { OpenIsisRec r; char buf[10000]; } x;
120 openIsisSMsg( 1, "\n" );
/* run the record through the pft format string; the guarding condition is not visible */
127 q = openIsisFmt( &x.r, pft, r );
134 openIsisSMsg( 1, "%d\n", r->rowid );
135 else if ( FMT_MFNF == f )
/* rowid plus first field; a record without fields prints a single dash instead */
136 openIsisSMsg( 1, "%d %.*s\n", r->rowid,
137 0 == r->len ? 1 : (int)r->field[0].len,
138 0 == r->len ? "-" : r->field[0].val );
139 else if ( FMT_TXT <= f ) {
/* FMT_TXT and FMT_TXTW: one line per field as tag TAB value */
140 openIsisSMsg( 1, "\n" ); /* blank line */
142 openIsisSMsg( 1, "W\t%d\n", r->rowid );
143 for ( i=0; i<r->len; i++ ) {
144 if ( r->field[i].val )
145 openIsisSMsg( 1, "%d\t%.*s\n", r->field[i].tag,
146 (int)r->field[i].len, r->field[i].val );
/* field without a value pointer: its len is printed as a number */
148 openIsisSMsg( 1, "%d\t%d\n", r->field[i].tag, r->field[i].len );
/* remaining formats: one line per field as rowid.tag=value */
150 } else for ( i=0; i<r->len; i++ ) {
151 if ( ! r->field[i].val ) { /* shouldn't happen -- numeric ? */
152 openIsisSMsg( 1, "%d.?=%d\n", r->rowid, r->field[i].len );
155 openIsisSMsg( 1, "%d.%d=%.*s\n", r->rowid, r->field[i].tag,
156 (int)r->field[i].len, r->field[i].val );
157 if ( r->field[i].len && '^' == *r->field[i].val ) { /* split subfields */
158 OpenIsisRec *rf = openIsisReadField( 0, r->field+i );
/*
	one line per subfield as rowid.tag.c=value, where c is the subfield tag
	printed as a character when any of its 0x60 bits is set, else a blank
*/
161 for ( j=0; j<rf->len; j++ )
162 openIsisSMsg( 1, "%d.%d.%c=%.*s\n",
163 r->rowid, r->field[i].tag,
164 (0x60 & (int)rf->field[j].tag ) ?
165 (int)rf->field[j].tag : ' ',
166 (int)rf->field[j].len, rf->field[j].val );
176 if ( idxall && r != &x.r ) { /* add index entries for all fields */
179 for ( i=0; i<r->len; i++ ) {
181 OpenIsisField *fld = r->field + i;
/* hit prefix is rowid.tag.occurrence.1 followed by a blank; the field value is appended to it */
182 sprintf( hit, "%d.%d.%d.1 ", r->rowid, fld->tag, i );
183 OPENISIS_RADDS( q, OPENISIS_XHIT, hit, q != &x.r );
184 OPENISIS_RCAT( q, fld->val, fld->len, q != &x.r );
/* write the record together with the collected index entries and report the result code */
187 ok = openIsisDWritex( wdb, r, q );
188 openIsisSMsg( 1, "wrote mfn %d (%d)\n", r->rowid, ok );
/*
	Output the row with the given id in format f.
	When only mfns are wanted and no write database is open, the number is
	printed without reading the row. Otherwise the row is read from db and
	handed to print with freeit set. The line between the two calls
	(presumably an else) is not part of this listing.
*/
196 static void printid ( int id, format f )
198 if ( FMT_MFN == f && 0 > wdb )
199 openIsisSMsg( 1, "%d\n", id );
201 print( openIsisReadRow( db, id ), !0, f );
/*
	Index loop callback (installed by main as an OpenIsisIdxCb): print one
	key/hit pair as a fixed-width lk2-style line.
	me is not used in the visible lines; the return statement is not visible.
*/
205 static int printlk2 ( void *me, OpenIsisKey *key, OpenIsisHit *hit )
209 /* 30 key BLANK 7 mfn BLANK 5 tag BLANK 4 occ BLANK 4 pos*/
210 openIsisSMsg( 1, "%-30.*s %7u %5u %4u %4u\n",
211 key->len, key->byt, hit->mfn, hit->tag, hit->occ, hit->pos );
/*
	Index loop callback (installed by main as an OpenIsisIdxCb): print one
	key/hit pair as a tab separated line.
	me is not used in the visible lines; the return statement is not visible.
*/
216 static int printtab ( void *me, OpenIsisKey *key, OpenIsisHit *hit )
220 /* key TAB mfn TAB tag TAB occ TAB pos, no padding */
221 openIsisSMsg( 1, "%.*s\t%u\t%u\t%u\t%u\n",
222 key->len, key->byt, hit->mfn, hit->tag, hit->occ, hit->pos );
227 /* timing utility. set the timeval, return milliseconds since last call. */
/*
	Two variants follow; the preprocessor lines selecting between them are not
	part of this listing. The first works on struct _timeb (the include section
	defines timeval as _timeb in one branch), the second on struct timeval.
	Both copy the previous timestamp, refresh the caller's struct and return
	the difference in milliseconds as int.
*/
229 static int millis ( struct _timeb *tb )
231 struct _timeb otb = *tb;
/* the call refreshing *tb sits on a line not shown here */
233 return (tb->time - otb.time)*1000 + (tb->millitm - otb.millitm);
236 static int millis ( struct timeval *tv )
238 struct timeval otv = *tv;
239 gettimeofday( tv, 0 );
/* seconds scaled to ms plus the microsecond difference scaled down to ms */
240 return (tv->tv_sec - otv.tv_sec)*1000 + (tv->tv_usec - otv.tv_usec)/1000;
/*
	Lock callback built on one static mutex and one static condition variable.
	lock: operation code; it is masked with OPENISIS_WAIT to select one of
	      release, lock, wake (broadcast) and wait
	returns the result of the pthread call for these four operations; the
	return for any other value is on lines not shown here.
*/
245 int myOpenIsisLockFunc ( int lock )
247 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER; /* the "fast" kind */
248 static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
/* NOTE(review): pthread_t is cast to int for logging only; not portable where pthread_t is not an integer */
250 LOG_DBG( LOG_ERROR, "thread %d op 0x%08x", (int)pthread_self(), lock );
252 switch ( OPENISIS_WAIT & lock ) {
253 case OPENISIS_RELE: return pthread_mutex_unlock( &mut );
254 case OPENISIS_LOCK: return pthread_mutex_lock( &mut );
255 case OPENISIS_WAKE: return pthread_cond_broadcast( &cond );
/* waits on cond with mut, so the caller is expected to hold mut here */
256 case OPENISIS_WAIT: return pthread_cond_wait( &cond, &mut );
/* fallback: no lock function at all; presumably the branch built without threads -- TODO confirm */
261 # define myOpenIsisLockFunc 0
268 /* (p)thread routine */
/*
	Crashtest worker. arg points to a threadarg whose start member gives the
	position in the term table where this thread begins. For each posting of
	each visited term the row is read again and its first field is compared
	with the value cached in val. Problems are reported on channel 2, and a
	timing line on channel 1 at the end.
	The returned pointer is produced on lines not shown here.
*/
269 static void * run ( void *arg )
273 threadarg *my = (threadarg *)arg;
274 int start = my->start;
/* loop ends once i comes back to start; the wrap-around handling is on lines not shown */
278 for ( i=start+1; i++!=start; ) {
282 for ( j=0; j<post[i].len; j++ ) {
283 int mfn = post[i].id[j];
284 OpenIsisRec *r = openIsisReadRow( db, mfn );
286 openIsisSMsg( 2, "no rec %d\n", mfn );
290 openIsisSMsg( 2, "no fields for %d\n", mfn );
/* NOTE(review): val[mfn] is passed to strlen; confirm it cannot be null for rows crash failed to read */
291 else if ( r->field[0].len != (int)strlen(val[mfn])
292 || memcmp( r->field[0].val, val[mfn], r->field[0].len )
294 openIsisSMsg( 2, "mismatch on %d\n", mfn );
299 openIsisSMsg( 1, "thread %d@%d terminated after %.3f seconds\n",
310 /* multithreaded crashtest */
/*
	Build in-memory tables of all terms starting with pre, their posting sets
	and the first field of every row up to the highest mfn seen, then time
	re-reading them: sequentially, in the calling thread, and with several
	thread counts over a number of passes.
	pre: term prefix handed to openIsisTerm
	The return value is passed on by main as its result; the return statement
	is not part of this listing.
*/
311 static int crash ( const char *pre )
314 union { OpenIsisRec r; char buf[10000]; } x;
323 x.r.bytes = sizeof(x);
/* first pass over the terms: counting only (the loop body is on a line not shown) */
325 while ( openIsisTerm( &x.r, db, pre ) && x.r.len )
327 openIsisSMsg( fd, "%d terms\n", nterm );
328 openIsisSMsg( fd, "%.3f sec\n", millis(&tv)/1000. );
/* NOTE(review): no check of the malloc results is visible */
330 term = (const char **)malloc( nterm * sizeof(*term) );
331 post = (OpenIsisSet *)malloc( nterm * sizeof(*post) );
/* second pass: keep a terminated copy of each term and query its postings by exact key */
334 while ( openIsisTerm( &x.r, db, pre ) && x.r.len ) {
335 for ( i=0; i<x.r.len; i++, nterm++ ) {
337 OpenIsisSet *set = post+nterm;
338 char *c = malloc( x.r.field[i].len + 1 );
340 memcpy( c, x.r.field[i].val, x.r.field[i].len );
341 c[ x.r.field[i].len ] = 0;
345 openIsisQuery( set, db, c, OPENISIS_QRY_KEYEQ, 0 );
346 if ( 0 >= set->len ) {
347 openIsisSMsg( 2, "no results for '%s'\n", c );
/* track the highest mfn among all postings in nval */
350 for ( cnt = 0; cnt < set->len; cnt++ )
351 if ( nval < set->id[cnt] )
356 openIsisSMsg( fd, "%d postings max mfn %d\n", p, nval );
357 openIsisSMsg( fd, "%.3f sec\n", millis(&tv)/1000. );
/* val is indexed directly by mfn, hence 1+nval slots and a loop starting at 1 */
359 val = (const char **)malloc( (1+nval) * sizeof(*val) );
360 for ( i=1; i<=nval; i++ ) {
361 OpenIsisRec *r = openIsisReadRow( db, i );
364 openIsisSMsg( 2, "no rec %d\n", i );
368 char *c = malloc( r->field[0].len + 1 );
370 memcpy( c, r->field[0].val, r->field[0].len );
371 c[ r->field[0].len ] = 0;
/* l holds elapsed milliseconds; the divisor is forced to at least 1 */
377 openIsisSMsg( fd, "sequential read %d rows in %.3f seconds %d rows per sec\n",
378 nval, l/1000., nval*1000/(l?l:1) );
388 openIsisSMsg( fd, "in-thread run in %.3f seconds %d rows per sec\n",
389 l/1000., p*1000/(l?l:1) );
/* thread counts tried per pass; th and arg are sized for the largest of them */
393 static int nThreads[] = { 8, 2, 1, 4 };
394 int res[ sizeof(nThreads)/sizeof(nThreads[0]) ];
395 pthread_t th[ 8 /* max. nThreads */ ];
396 threadarg arg[ sizeof(th)/sizeof(th[0]) ];
400 for ( i=0; i<(int)(sizeof(nThreads)/sizeof(nThreads[0])); i++ )
/* one session per slot in th, fetched up front */
402 for ( j=0; j<(int)(sizeof(th)/sizeof(th[0])); j++ )
404 if ( ! (arg[j].ses = openIsisSesGet( -1, 0 )) ) {
405 openIsisSMsg( fd, "could not get %dth session\n", j );
409 for ( pass=0; pass<passes; pass++ )
410 for ( i=0; i<(int)(sizeof(nThreads)/sizeof(nThreads[0])); i++ ) {
413 assert( nThreads[i] <= (int)(sizeof(th)/sizeof(th[0])) );
414 for ( j=0; j<nThreads[i]; j++ ) {
/* spread the starting positions evenly over the term table */
415 arg[j].start = j*nterm/nThreads[i];
416 if ( pthread_create( th+j, 0, run, arg+j ) )
417 th[j] = (pthread_t)0;
419 openIsisSMsg( fd, "started %d threads\n", nThreads[i] );
/* NOTE(review): no check for a zeroed handle is visible before the join below */
421 for ( j=0; j<nThreads[i]; j++ ) {
423 pthread_join( th[j], (void**)&t );
/* NOTE(review): avg is used as divisor; confirm on the lines not shown that it cannot be zero */
428 rps = (int)(nThreads[i] * p * 1000 / avg);
429 openIsisSMsg( fd, "joined %d threads avg %.3f after %.3f seconds %d rows per sec\n",
430 nThreads[i], avg/1000., l/1000., rps );
/* summary: res divided by the number of passes for each thread count */
433 for ( i=0; i<(int)(sizeof(nThreads)/sizeof(nThreads[0])); i++ )
434 openIsisSMsg( fd, "%d threads %d rows per sec\n", nThreads[i], res[i]/passes );
437 (void)pass; /* avoid compiler warning */
438 #endif /* HAVE_THREADS */
443 /* ************************************************************
446 /* ************************************************************
/*
	Entry point of the openisis executable.
	Parses the command line options, opens the database (and optionally a write
	target), then performs the selected action: dump, mfn list, scan, search,
	term listing, performance or crash test, field split, record streaming,
	index load or dump, fdt print or UTF-8 validation.
	Many lines of this function (case labels, declarations, assignments, the
	return) are not part of this listing, so the comments below describe the
	visible statements only.
*/
449 int main ( int argc, const char **argv )
454 format fmt = FMT_TXT;
455 int check = OPENISIS_CHK_FIX;
456 int searchmode = OPENISIS_QRY_KEYAT;
457 int idxmode = OPENISIS_IDXPF;
458 const char *search = 0;
459 const char *idxto = 0;
461 const char *dowrite = 0;
462 ifmode ifm = IFM_DUMP;
466 /* initialize minimal env */
/* build an argument record from everything after the program name */
468 argr = openIsisRSet( 0,
469 OPENISIS_RFDT|OPENISIS_RARGV|OPENISIS_RIGN | (argc-1),
470 openIsisFdtSyspar, argv+1 );
/* -version as the only argument prints the version string */
472 if ( 2 == argc && ! strcmp("-version",argv[1]) ) {
473 openIsisSMsg( 0, "%s\n", OPENISIS_VERSION );
476 /* check options ... */
477 for ( i=0; i < argc; ) {
478 const char *n = argv[i], *v = 0;
/* an option has no value when it is the last argument or the next argument starts with a dash */
482 if ( 1 == argc - i || '-' == argv[i+1][0] ) { /* no value */
/* options tested with argchk need a value, those tested with strcmp alone do not */
490 if ( argchk("logfile",n,v) )
491 openIsisLog( '=', v );
492 else if ( argchk("v",n,v) )
493 openIsisLog( *v, 0 );
494 else if ( argchk("scan",n,v) ) {
498 else if ( argchk("search",n,v) ) {
502 else if ( argchk("upto",n,v) ) {
503 idxmode = OPENISIS_IDXUPTO;
506 else if ( argchk("incl",n,v) ) {
507 idxmode = OPENISIS_IDXINCL;
510 else if ( argchk("query",n,v) ) {
512 searchmode = OPENISIS_QRY_SIMPLE;
515 else if ( argchk("terms",n,v) ) {
519 else if ( argchk("perf",n,v) ) {
523 else if ( argchk("crash",n,v) ) {
527 else if ( argchk("split",n,v) ) {
532 else if ( argchk("fmt",n,v) ) {
533 if ( ! strcmp("mfn",v) )
535 else if ( ! strcmp("mfnf",v) )
537 else if ( ! strcmp("prop",v) )
539 else if ( ! strcmp("txt",v) )
541 else if ( ! strcmp("txtw",v) )
544 else if ( ! strcmp("check",n) )
546 else if ( ! strcmp("vutf",n) ) {
549 } else if ( argchk("pft",n,v) )
551 else if ( ! strcmp("stream",n) ) {
554 } else if ( argchk("write",n,v) )
556 else if ( argchk("append",n,v) ) {
559 } else if ( ! strcmp("idxall",n) )
561 else if ( ! strcmp("mfnlist",n) )
563 else if ( argchk("ifload",n,v) ) {
566 } else if ( ! strcmp("swload",n) ) {
568 } else if ( ! strcmp("ifadd",n) ) {
571 } else if ( ! strcmp("ifdel",n) ) {
574 } else if ( ! strcmp("ifcopy",n) ) {
577 } else if ( ! strcmp("ifchk",n) ) {
580 } else if ( ! strcmp("iftab",n) ) {
583 } else if ( ! strcmp("ifdump",n) )
585 else if ( ! strcmp("noxi",n) )
587 else if ( argchk("out",n,v) ) {
/* NOTE(review): l bytes of v are copied to buf+1; the size of buf and any bound check are not visible */
591 memcpy( buf+1, v, l );
593 openIsisSOpen( buf, 0, 0 );
596 else if ( argchk("fdtdump",n,v) ) {
/* open the database from the command line arguments when the action needs one; on failure the usage text follows */
603 if ( needdb && 0 > (db = openIsisOpen( 0, argv + 1, argc - 1 )) ) {
605 "openisis " OPENISIS_VERSION "\n\n"
606 "please specify a valid database with -db, e.g.\n"
607 "-db /winisis/data/cds\n"
609 "other options are:\n"
610 "-search term search for term\n"
611 "-query \"query\" run a query like \"water * plant\"\n"
612 "-terms term list terms matching term (e.g. plant$)\n"
614 /* warning: string length `580' is greater than the minimum length
615 * `509' ISO C89 is required to support
618 "-fmt mfn for a search or query, list only the mfn\n"
619 "-fmt mfnf for a search or query, list the mfn and 1st field\n"
620 "-pft \"pft\" use printformat (currently very limited)\n"
621 "-write dbpath specify a db where records are written to\n"
622 "-mfnlist read mfns from stdin\n"
623 "-ifload pctfree read .lk2-index from stdin\n"
625 "default output format is one field per line like tag<TAB>value\n"
631 if ( dowrite && 0 > (wdb = openIsisOpen( dowrite, 0, 0 )) ) {
632 openIsisSMsg( 2, "could not open write target db '%s'\n", dowrite );
640 int max = openIsisMaxRowid( db );
642 for ( rowid = 1; rowid <= max; rowid++ )
643 printid( rowid, fmt );
644 } break; /* DO_DUMP */
/* mfn list: read lines and accumulate decimal digits into id; the digit test is on lines not shown */
648 while ( 0 <= (l = openIsisSReadln( &buf )) ) {
651 id = 10*id + *buf++ - '0';
655 } break; /* DO_MFNLIST */
657 int max = openIsisMaxRowid( db );
/* scan: print returns the next rowid value; a result that is not positive ends the loop */
659 for ( rowid = 1; 0 < rowid && rowid <= max; rowid++ )
660 rowid = print( openIsisScan( db, rowid, 0, search ), !0, fmt );
661 } break; /* DO_SCAN */
666 openIsisQuery( &set, db, search, searchmode, 0 );
667 if ( 0 >= set.len ) {
668 openIsisSMsg( 2, "no results for '%s'\n", search );
672 /* openIsisSMsg( 2, "%d\trows for\t%s\n", set.len, search ); */
673 for ( cnt = 0; cnt < set.len; cnt++ )
674 printid( set.id[cnt], fmt );
675 } break; /* DO_SEARCH */
677 union { OpenIsisRec r; char buf[10000]; } x;
679 x.r.bytes = sizeof(x);
/* terms: openIsisTerm fills the local record with a batch of terms per call until it is empty */
680 while ( openIsisTerm( &x.r, db, search ) && x.r.len ) {
681 /* openIsisSMsg( 1, "%d terms\n", x.r.len ); */
682 for ( i=0; i<x.r.len; i++ )
683 openIsisSMsg( 1, "%.*s\n", (int)x.r.field[i].len, x.r.field[i].val );
685 } break; /* DO_TERMS */
687 int max = openIsisMaxRowid( db );
/* NOTE(review): no guard is visible for max being zero, which would make the modulo below undefined */
688 while ( 0 < intarg-- ) {
689 OpenIsisRec *r = openIsisReadRow( db, 1+((int)rand() % max) );
692 } break; /* DO_PERF */
694 ret = openIsisCheck( db, check );
697 ret = crash( search );
/* split: wrap the search string in a field with tag 24 and print its subfields as c=value */
702 f.tag = 24; f.val = search; f.len = strlen(search);
703 r = openIsisReadField( 0, &f );
705 for ( i=0; i<r->len; i++ )
706 openIsisSMsg( 1, "%c=%.*s\n", (int)r->field[i].tag,
707 (int)r->field[i].len, r->field[i].val );
/* stream: read records from stdin and print each one without freeing it */
711 OpenIsisRecStream rs = { 0, OPENISIS_STOPONEMPTY, 0, 0, 0 };
712 LIO_SINIT( &ios, lio_stdio, "stdin", LIO_IN );
713 rs.in = &ios; /* some gcc versions need it this way */
714 while ( 0 < (i = openIsisSGetr( &rs )) )
715 print( rs.rec, 0, fmt );
/* index load: each input line becomes one key/hit pair added to the index */
721 OpenIsisIndex idx = openIsisIdxOpen( db, intarg );
725 memset( &hit, 0, sizeof(hit) );
726 hit.dbn = (-2 == intarg) ? 0xffff : 0; /* secret key for ifdel */
727 while ( 0 <= (l = openIsisSReadln( &buf )) && buf ) {
728 char *t = memchr( buf, '\t', l );
729 if ( DO_SWLOAD == what ) {
/* stopword: the whole line is the key. NOTE(review): l is truncated to unsigned char before the copy */
730 memcpy( key.byt, buf, key.len = (unsigned char)l );
731 } else if ( t ) { /* tab delimited */
732 key.len = (unsigned char)(t - buf);
733 memcpy( key.byt, buf, key.len );
/* each step below skips past the previous tab and requires another one, then converts the piece in between */
734 if ( 0 >= (l -= t-buf+1) || !(t = memchr( buf=t+1, '\t', l )) )
736 hit.mfn = (unsigned) openIsisA2i( buf, t-buf );
737 if ( 0 >= (l -= t-buf+1) || !(t = memchr( buf=t+1, '\t', l )) )
739 hit.tag = (unsigned short) openIsisA2i( buf, t-buf );
740 if ( 0 >= (l -= t-buf+1) || !(t = memchr( buf=t+1, '\t', l )) )
742 hit.occ = (unsigned short) openIsisA2i( buf, t-buf );
743 if ( 0 >= (l -= t-buf+1) )
745 hit.pos = (unsigned short) openIsisA2i( t+1, l );
747 /* 10/30 key BLANK 7 mfn BLANK 5 tag BLANK 4 occ BLANK 4 pos*/
748 int eok = l - 24; /* pos of blank after key, 10 or 30 */
749 if ( 54 != l && 34 != l ) {
/* NOTE(review): the message text below reads or54, a blank is missing */
750 openIsisSMsg( OPENISIS_ERR_INVAL,
751 "bad ifload input len %d, want 34 or54 bytes + newline\n", l );
/* strip the blank padding at the end of the key column */
754 for ( i=eok-1; ' ' == buf[i] && i--; )
756 key.len = (unsigned char) (++i);
757 memcpy( key.byt, buf, key.len );
758 log_msg( LOG_VERBOSE, "'%.*s'", 7, buf+eok+1 );
/* fixed columns after the key: mfn at +1, tag at +9, occ at +15, pos at +20 */
759 hit.mfn = (unsigned) openIsisA2i( buf+eok+1, 7 );
760 hit.tag = (unsigned short)openIsisA2i( buf+eok+9, 5 );
761 hit.occ = (unsigned short)openIsisA2i( buf+eok+15, 4 );
762 hit.pos = (unsigned short)openIsisA2i( buf+eok+20, 4 );
764 log_msg( LOG_VERBOSE, "'%.*s' %d %d %d %d",
765 key.len, key.byt, hit.mfn, hit.tag, hit.occ, hit.pos );
766 if ( openIsisIdxAdd( idx, &key, &hit ) )
/* progress message every 1024 lines */
768 if ( !(++lines & 0x3ff) )
769 log_msg( LOG_INFO, "%dK lines", lines >> 10 );
771 openIsisIdxDone( idx );
/* index dump: set up the loop descriptor l with flags, callback and key range, then run the loop */
775 memset( &l, 0, sizeof(l) );
779 l.flg |= OPENISIS_IDXTRAD;
781 l.cb = (OpenIsisIdxCb*)printlk2;
784 l.cb = (OpenIsisIdxCb*)printtab;
787 l.flg |= OPENISIS_IDXTRAD;
/* copy: the callback adds every entry to an index opened on the write db if there is one, else on db */
788 l.me = openIsisIdxOpen( 0 <= wdb ? wdb : db, 0 );
789 l.cb = (OpenIsisIdxCb*)openIsisIdxAdd;
/* NOTE(review): key lengths are truncated to unsigned char; the size of the byt arrays is not visible */
796 memcpy( l.key.byt, search,
797 l.key.len = (unsigned char)strlen( search ) );
799 memcpy( l.to.byt, idxto,
800 l.to.len = (unsigned char)strlen( idxto ) );
801 openIsisIdxLoop( db, &l );
802 if ( IFM_COPY == ifm )
803 openIsisIdxDone( (OpenIsisIndex)l.me );
/* fdt: convert the field description table of the db to a record and print it */
806 odb = ldb_getdb( db );
807 if ( odb && odb->fdt )
808 print( openIsisFFdt2Rec( odb->fdt, 0, intarg ), 0, fmt );
/* vutf: validate the input chunk by chunk and report the offset in the chunk and in total */
813 while ( 0 < (g = lio_read( &lio_in, buf, sizeof(buf) )) ) {
814 int l = openIsisValidUTF8( buf, g, &f );
816 openIsisSMsg( OPENISIS_ERR_INVAL,
817 "at total %d = %d+%d\n", l-1+t, l-1, t );
824 } /* switch ( what ) */
829 openIsisClose( wdb );
831 /* at least with WINE,
832 atexit cleanup is not performed
833 unless we explicitly call exit :(
837 } /* openisis main */