fixed parsing for science direct html with more than one <a href=> per one <tr>
[webpac] / openisis / lqry.c
1 /*
2         openisis - an open implementation of the CDS/ISIS database
3         Version 0.8.x (patchlevel see file Version)
4         Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
5
6         This library is free software; you can redistribute it and/or
7         modify it under the terms of the GNU Lesser General Public
8         License as published by the Free Software Foundation; either
9         version 2.1 of the License, or (at your option) any later version.
10
11         This library is distributed in the hope that it will be useful,
12         but WITHOUT ANY WARRANTY; without even the implied warranty of
13         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14         Lesser General Public License for more details.
15
16         You should have received a copy of the GNU Lesser General Public
17         License along with this library; if not, write to the Free Software
18         Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19
20         see README for more information
21 EOH */
22
23 /*
24         $Id: lqry.c,v 1.7 2003/04/08 00:20:52 kripke Exp $
25         implementation of query processing (key lookups and simple query expressions).
26 */
27
28 #include <string.h> /* memset et al */
29
30 #include "ldb.h"
31
32
33 /* ************************************************************
34         private types
35 */
36 /* ************************************************************
37         private data
38 */
39
/*
	Lookup table indexed by byte value: non-zero marks a character that
	acts as an operator and therefore ends an unquoted query term.
	All zero at program start; dQuery() fills it in lazily the first time
	it parses a query expression (it tests op['.'] to see whether that
	has happened yet).
*/
static unsigned char op[256];
41
42
43 /* ************************************************************
44         private functions
45 */
46 static int lqry_scan ( OpenIsisSet *set, int db,
47         const char *key, int mode, int skip )
48 {
49         (void)set;(void)db;(void)key;(void)mode;(void)skip;
50         log_msg( LOG_ERROR, "scan mode not implemented yet, sorry!" );
51         return -1;
52 }       /* lqry_scan */
53
54 /* ************************************************************
55         package functions
56 */
57 /* ************************************************************
58         public functions
59 */
60
61
62 int dQuery ( Set *set, int db,
63         const char *key, int mode, int skip )
64 {
65         LdbPost p;
66         int ret = 0;
67
68         memset( &p, 0, sizeof(p) );
69         p.skp = skip;
70         p.tag = mode >> 16;
71         mode &= 0xff;
72
73         if ( OPENISIS_QRY_SIMPLE > mode ) {
74                 char *tmp = (char*)key;
75                 if ( OPENISIS_QRY_SCANE <= mode )
76                         return lqry_scan( set, db, key, mode, skip );
77                 switch ( (int)mode ) {
78                 case OPENISIS_QRY_KEYEQ: break;
79                 case OPENISIS_QRY_KEYAT:
80                         {
81                                 size_t l = strlen(key);
82                                 if ( !l || '$' != key[l-1] )
83                                         break;
84                                 tmp = mAlloc( l );
85                                 if ( ! tmp )
86                                         return -ERR_NOMEM;
87                                 memcpy( tmp, key, l-1 );
88                                 tmp[l-1] = 0;
89                         }
90                 case OPENISIS_QRY_KEYPF:
91                         p.mode = LDB_PFX;
92                 }
93                 ret = ldb_search( db, tmp, &p, 0 );
94                 if ( tmp != key )
95                         mFree( tmp );
96         /* } else if ( OPENISIS_QRY_SIMPLE < mode ) { */
97         } else { /* handling query expression (simple) */
98                 size_t klen = strlen(key);
99                 unsigned char *buf, *uc, *term, *end, utmp;
100                 if ( ! op['.'] ) /* operators */
101                         op['.'] = op['$'] = op['('] = op['*'] = op['+'] = op['^'] = op['/'] = 1;
102                 buf = (unsigned char*)mAlloc( klen+1 );
103                 if ( ! buf )
104                         return -ERR_NOMEM;
105         /* startover: */
106                 memcpy( buf, key, klen+1 ); /* reload, might have frobed the buf on \ */
107                 uc = buf;
108                 p.mode = LDB_OR;
109         nextterm:
110                 while ( *uc &&  ' ' >= *uc ) uc++; /* skip white */
111                 if ( !*uc ) goto done;
112                 /* take as term, whatever it is ... */
113                 term = uc;
114                 if ( '"' == *term ) { /* anything up to next unescaped " */
115                         int bs = 0; /* count of backslashes seen so far */
116                         term++;
117                         while ( *++uc && '"' != *uc ) {
118                                 if ( '\\' == *uc ) {
119                                         if ( ! *++uc ) break; /* trailing \ */
120                                         bs++;
121                                 }
122                                 if ( bs ) uc[-bs] = *uc;
123                         }
124                         end = uc++ - bs;
125                         if ( '$' == end[-1] ) {
126                                 p.mode |= LDB_PFX;
127                                 end--;
128                         }
129                 } else {
130                         while ( *++uc && !op[*uc]
131                                 && (' '<*uc || (uc==term+3 && !memcmp("ANY",term,3)))
132                         ); /* find white or op */
133                         end = uc;
134                         if ( '$' == *uc ) {
135                                 p.mode |= LDB_PFX;
136                                 uc++;
137                         }
138                 }
139                 /* now end is on 1st char after term, possibly on a $ or ".
140                         uc is on next pos to scan. */
141                 /* check for tag */
142                 p.tag = 0;
143                 if ( '/' == *uc ) {
144                         int parenths = '(' == *++uc;
145                         if ( parenths ) uc++; /* accept tag w o w/o () */
146                         /* eat arbitrary large numbers :)) */
147                         while ( '0'<=*uc && *uc<='9' ) p.tag = 10*p.tag + *uc++ - '0';
148                         if ( parenths ) while ( *uc && ')'!=*uc++ ); /* mv behind ) */
149                 }
150
151                 /* do it */
152                 utmp = *end; *end = 0; /* terminate term */
153                 if ( !(p.mode & (LDB_AND|LDB_NOT)) )
154                         p.near = -1; /* collect all pos */
155                 else if ( !p.fil ) /* nothing to AND */
156                         goto operator;
157                 if ( 0 > (ret = ldb_search( db, (const char*)term, &p, 0 )) )
158                         goto done;
159                 sMsg( LOG_INFO, "src %d '%.30s'@%d %hd(%hd): fill %d cut %d",
160                         p.skp, term, p.tag, p.mode, p.near, p.fil, p.cut );
161         operator:
162                 *end = utmp; /* restore */
163
164                 /* look for next operator */
165                 p.mode = LDB_AND;
166                 p.near = 0;
167                 while ( *uc &&  ' ' >= *uc ) uc++; /* skip white */
168                 if ( !*uc ) goto done;
169                 switch ( *uc ) {
170                 case '*': uc++; break;
171                 case '^': p.mode = LDB_NOT; uc++; break;
172                 case '+': p.mode = LDB_OR; uc++; break;
173                 case '.': while ( '.' == *uc++ ) p.near++; break;
174                 case '$': while ( '$' == *uc++ ) p.near--; break;
175                 case '(':
176                         if ( ! uc[1] ) goto done;
177                         if ( ')' != uc[2] ) break;
178                         if ( 'F' == uc[1] ) p.near = LDB_NEAR_F;
179                         if ( 'G' == uc[1] ) p.near = LDB_NEAR_G;
180                         if ( '0'<uc[1] && uc[1]<='9' ) p.near = uc[1]-'0';
181                         uc += 3;
182                         break;
183                 case 'O':
184                         if ( 'R' == uc[1] && ' ' >= uc[2] ) { p.mode = LDB_OR; uc += 3; }
185                         break;
186                 case 'A':
187                         if ( 'N' == uc[1] && 'D' == uc[2] && ' ' >= uc[3] ) uc += 4;
188                         break;
189                 case 'N':
190                         if ( 'O' == uc[1] && 'T' == uc[2] && ' ' >= uc[3] ) {
191                                 p.mode = LDB_NOT; uc += 4;
192                         }
193                         break;
194                 }
195                 goto nextterm;
196
197 done:
198                 /* somewhat screws the idea of limiting costs ...
199                 if ( !p.fil && p.cut > p.skp ) {
200                         sMsg( LOG_INFO, "OUCH! full cut at %d for '%.50s' >= %d",
201                                 p.cut, key, p.skp );
202                         p.skp = p.cut;
203                         p.cut = 0;
204                         goto startover;
205                 }
206                 */
207                 mFree( buf );
208         }
209         return 0 > ret ? ret : ldb_p2s( set, &p );
210 }       /* openIsisQuery */