From: Michael Ang
Date: Fri, 16 Sep 2011 00:53:52 +0000 (+0000)
Subject: Add files related to Internet Archive's implementation of fulltext search
X-Git-Url: http://git.rot13.org/?p=bookreader.git;a=commitdiff_plain;h=5b1ca1a957386759108d034256ab4e13efc00516

Add files related to Internet Archive's implementation of fulltext search
---
diff --git a/BookReaderIA/fulltext/abbyy_to_text.php b/BookReaderIA/fulltext/abbyy_to_text.php
new file mode 100644
index 0000000..c13f94f
--- /dev/null
+++ b/BookReaderIA/fulltext/abbyy_to_text.php
@@ -0,0 +1,13 @@
+&1");
+?>
diff --git a/BookReaderIA/fulltext/extract_paragraphs.py b/BookReaderIA/fulltext/extract_paragraphs.py
new file mode 100644
index 0000000..185901a
--- /dev/null
+++ b/BookReaderIA/fulltext/extract_paragraphs.py
@@ -0,0 +1,155 @@
#!/usr/bin/python

from lxml.etree import iterparse, tostring, Element, parse
import sys, re, gzip, zipfile
from time import time

ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
page_tag = ns + 'page'

re_par_end_dot = re.compile(r'\.\W*$')

def read_text_line(line):
    # Concatenate the characters of every formatting run in an ABBYY line.
    text = ''
    for fmt in line:
        for c in fmt:
            text += c.text
    return text

def par_text(lines):
    # Join the lines of a paragraph into a single string, undoing
    # end-of-line hyphenation where ABBYY marks the next line as starting
    # mid-word.
    cur = ''
    for line_num, line in enumerate(lines):
        first_char = line[0][0]
        if (first_char.attrib.get('wordStart') == 'false' or first_char.attrib.get('wordFromDictionary') == 'false') and cur.endswith('- '):
            cur = cur[:-2]
        for fmt in line:
            cur += ''.join(c.text for c in fmt)
        if line_num + 1 != len(lines):
            cur += ' '
    return cur

def line_end_dot(line):
    # True if the line ends with a full stop, optionally followed by other
    # non-word characters.
    return bool(re_par_end_dot.search(read_text_line(line)))

def par_unfinished(last_line, page_w):
    # Heuristic: the last paragraph of a page is probably continued on the
    # next page if its final line is at least 15 characters long, lacks a
    # closing dot, and reaches past three quarters of the page width.
    last_line_len = sum(len(fmt) for fmt in last_line)
    if last_line_len < 15 or line_end_dot(last_line):
        return False
    last_line_last_char = last_line[-1][-1]
    r = float(last_line_last_char.attrib['r'])
    return r / page_w > 0.75

def col_unfinished(last_line):
    # Heuristic: the last paragraph of a column is unfinished if its final
    # line is longer than 14 characters and does not end with a dot.
    return sum(len(fmt) for fmt in last_line) > 14 and not line_end_dot(last_line)

def par_iter(f):
    # Iterate over an ABBYY XML file, yielding the marker string 'page' at
    # each page boundary and a list of line elements for each paragraph.
    # Paragraphs that the heuristics above judge to be split across a
    # column or page boundary are stitched back together before yielding.
    incomplete_par = None
    end_column_par = None
    skipped_par = []
    page_num = 0
    t0 = time()
    for event, page in iterparse(f):
        if page.tag != page_tag:
            continue
        yield 'page'

        page_w = float(page.attrib['width'])

        for block_num, block in enumerate(page):
            if block.attrib['blockType'] != 'Text':
                continue
            # Record the page number and geometry on the block so later
            # stages can build highlight boxes after page.clear().
            block.set('page', str(page_num))
            block.set('page_width', page.get('width'))
            block.set('page_height', page.get('height'))
            # Each Text block holds a region element and a text element.
            region, text = block
            for par_num, par in enumerate(text):
                if len(par) == 0 or len(par[0]) == 0 or len(par[0][0]) == 0:
                    continue
                last_line = par[-1]
                if end_column_par is not None:
                    # A paragraph was left open at the end of the previous
                    # column; join it to this paragraph if this one
                    # completes it.
                    if line_end_dot(last_line) and int(par[0].attrib['t']) < int(end_column_par[0].attrib['b']):
                        yield list(end_column_par) + list(par)
                        end_column_par = None
                        continue
                    else:
                        yield list(end_column_par)
                        end_column_par = None

                if incomplete_par is not None:
                    # A paragraph was left open at the end of the previous
                    # page; buffer intervening paragraphs (such as running
                    # heads) in skipped_par until its continuation is found.
                    if line_end_dot(last_line):
                        yield list(incomplete_par) + list(par)
                        for p in skipped_par:
                            yield list(p)
                        incomplete_par = None
                        skipped_par = []
                    else:
                        skipped_par.append(par)
                elif par_num + 1 == len(text) and block_num + 1 == len(page) and par_unfinished(last_line, page_w):
                    incomplete_par = par
                elif par_num + 1 == len(text) and block_num + 1 != len(page) and col_unfinished(last_line):
                    end_column_par = par
                else:
                    yield list(par)

        page_num += 1
        page.clear()
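# A minimal sketch of how par_iter and par_text fit together, mirroring the
# __main__ block at the bottom of this file (the filename is hypothetical):
#
#     f = open_abbyy('/items/example/example_abbyy.gz')
#     for lines in par_iter(f):
#         if lines == 'page':
#             continue  # page boundary marker, not a paragraph
#         print par_text(lines).encode('utf-8')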
def open_abbyy(filename):
    # Open the ABBYY OCR output, which may be gzipped, plain XML, or a zip
    # archive containing a single *_abbyy.xml member.
    if filename.endswith('abbyy.gz'):
        return gzip.open(filename, 'rb')
    elif filename.endswith('abbyy.xml'):
        return open(filename)
    else:
        assert filename.endswith('abbyy.zip')
        z = zipfile.ZipFile(filename, 'r')
        names = z.namelist()
        assert len(names) == 1
        assert names[0].endswith('_abbyy.xml')
        return z.open(names[0])

# Map common language names and ISO 639-1 codes to the codes used below.
lang_map = {
    'english': 'eng',
    'en': 'eng',
    'french': 'fre',
    'fr': 'fre',
    'german': 'deu',
    'de': 'deu',
    'ger': 'deu',
    'spanish': 'spa',
    'es': 'spa',
}

langs = set(['eng', 'fre', 'deu', 'spa'])

def read_meta(ia, path):
    # Read the item's _meta.xml and return a normalized language code, or
    # None if the title or language is missing or unrecognized.
    root = parse(path + '/' + ia + '_meta.xml').getroot()
    title_elem = root.find('title')
    if title_elem is None or not title_elem.text:
        return
    ocr_elem = root.find('ocr')
    if ocr_elem is not None and ocr_elem.text == 'language not currently OCRable':
        print 'language not currently OCRable'
        sys.exit(0)
    lang_elem = root.find('language')
    if lang_elem is None:
        return
    l = lang_elem.text.lower()
    return l if l in langs else lang_map.get(l)

if __name__ == '__main__':
    page_count = 0
    ia = sys.argv[1]
    path = sys.argv[2]
    filename = path + '/' + sys.argv[3]
    lang = read_meta(ia, path)
    if not lang:
        lang = 'other'
    f = open_abbyy(filename)
    for lines in par_iter(f):
        if lines == 'page':
            page_count += 1
            continue
        text = par_text(lines)
        print text.encode('utf-8')
    print 'meta: %s %d' % (lang, page_count)
diff --git a/BookReaderIA/fulltext/inside.php b/BookReaderIA/fulltext/inside.php
new file mode 100644
index 0000000..3699b56
--- /dev/null
+++ b/BookReaderIA/fulltext/inside.php
@@ -0,0 +1,50 @@
+&1");
+?>
+
diff --git a/BookReaderIA/fulltext/inside.py b/BookReaderIA/fulltext/inside.py
new file mode 100644
index 0000000..747c69e
--- /dev/null
+++ b/BookReaderIA/fulltext/inside.py
@@ -0,0 +1,213 @@
#!/usr/bin/python
# written by Edward Betts in October 2010

from lxml.etree import iterparse, tostring, Element
from itertools import izip
from urllib import urlopen
import sys, re, json, os, urllib
from extract_paragraphs import par_iter, open_abbyy
from subprocess import Popen, PIPE

ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
page_tag = ns + 'page'

# Solr query URL: highlight the whole body field, marking matched terms
# with {{{ and }}} so they can be aligned with the ABBYY character stream.
solr_inside = 'http://ol-search-inside:8983/solr/inside/select?rows=1&wt=json&fl=ia,body_length,page_count&hl=true&hl.fl=body&hl.fragsize=0&hl.maxAnalyzedChars=-1&hl.usePhraseHighlighter=true&hl.simple.pre={{{&hl.simple.post=}}}&q.op=AND&q='

class Space(object):
    # Stand-in for the space between lines; gives the join below the same
    # .text interface as an ABBYY charParams element.
    text = ' '

space = Space()

def par_char(lines):
    # Flatten a paragraph into a list of character elements, undoing
    # end-of-line hyphenation and inserting a space object between lines.
    cur = []
    for line_num, line in enumerate(lines):
        first_char = line[0][0]
        if (first_char.attrib.get('wordStart') == 'false' or first_char.attrib.get('wordFromDictionary') == 'false') and len(cur) > 1 and cur[-2].text == '-':
            cur = cur[:-2]
        for fmt in line:
            cur += [c for c in fmt]
        if line_num + 1 != len(lines):
            cur += [space]
    return cur

def skip_page(abbyy_iter):
    # Drop the 'page' boundary markers from par_iter's output.
    for par in abbyy_iter:
        if par != 'page':
            yield par
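# How the alignment works, illustrated with hypothetical text: Solr returns
# the body with matched terms wrapped in the {{{ }}} markers requested
# above, one paragraph per line, e.g.
#
#     'It was the best of {{{times}}}, it was the worst of times.'
#
# find_matches (below) walks such a line in step with the paragraph's ABBYY
# characters; stripping the markers must reproduce the OCR text exactly,
# otherwise the match is reported as a 'mismatch' error rather than risking
# wrong highlight boxes.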
re_braces = re.compile(r'(\{\{\{|\}\}\})')

def find_matches(hl_body, abbyy_iter, leaf0_missing=False):
    # Align Solr's highlighted body text with the ABBYY character stream,
    # yielding (match_number, match) pairs where each match carries the
    # highlighted text plus page and box coordinates for drawing.
    text_offset = 0
    match_number = 0
    leaf_offset = 1 if leaf0_missing else 0
    for solr_line, par in izip(hl_body.split('\n'), skip_page(abbyy_iter)):
        if '{{{' not in solr_line:
            text_offset += len(solr_line)
            continue
        match_with = solr_line
        abbyy_text = ''.join(c.text for c in par_char(par))
        cur = {
            'text': solr_line,
            'par': [],
        }
        if re_braces.sub('', cur['text']) != abbyy_text:
            # The highlighted line and the ABBYY paragraph disagree, so no
            # reliable boxes can be computed for this match.
            cur['error'] = 'mismatch'
            match_number += 1
            yield match_number, cur
            continue
        prev_char = None
        match_line = None
        match_par = None
        for c in par_char(par):
            text_offset += 1
            if match_with.startswith('{{{'):
                # Start of a highlighted span: open a new paragraph record
                # if needed and start a new box on the current line.
                match_with = match_with[3:]
                match_line = c.getparent().getparent()
                if not cur['par'] or match_line.getparent() != match_par:
                    match_par = match_line.getparent()
                    block = match_par.getparent().getparent()
                    cur['par'].append({
                        't': int(match_par[0].get('t')),
                        'b': int(match_par[-1].get('b')),
                        'l': int(block.get('l')),
                        'r': int(block.get('r')),
                        'boxes': [],
                        'page': int(block.get('page')) + leaf_offset,
                        'page_width': int(block.get('page_width')),
                        'page_height': int(block.get('page_height')),
                    })
                line = c.getparent().getparent()
                cur['par'][-1]['boxes'].append({
                    't': int(line.get('t')),
                    'b': int(line.get('b')),
                    'l': int(c.get('l')),
                    'page': int(block.get('page')) + leaf_offset,
                })
            elif match_with.startswith('}}}'):
                # End of a highlighted span: close the current box at the
                # right edge of the previous character.
                cur['par'][-1]['boxes'][-1]['r'] = int(prev_char.get('r'))
                match_with = match_with[3:]
                match_line = None
            elif match_line is not None and c.getparent().getparent() != match_line:
                # The highlighted span continues onto a new line: close the
                # box at the end of the old line and open one on the new.
                end_line_char = match_line[-1][-1]
                cur['par'][-1]['boxes'][-1]['r'] = int(end_line_char.get('r'))
                match_line = c.getparent().getparent()
                if match_line.getparent() != match_par:
                    match_par = match_line.getparent()
                    block = match_par.getparent().getparent()
                    cur['par'].append({
                        't': int(match_par[0].get('t')),
                        'b': int(match_par[-1].get('b')),
                        'l': int(block.get('l')),
                        'r': int(block.get('r')),
                        'boxes': [],
                        'page': int(block.get('page')) + leaf_offset,
                        'page_width': int(block.get('page_width')),
                        'page_height': int(block.get('page_height')),
                    })

                cur['par'][-1]['boxes'].append({
                    't': int(match_line.get('t')),
                    'b': int(match_line.get('b')),
                    'l': int(c.get('l')),
                })

            if len(match_with) == 0:
                break
            assert c.text == match_with[0]
            match_with = match_with[1:]
            prev_char = c
        if match_with == '}}}':
            # The highlight closed exactly at the end of the line.
            cur['par'][-1]['boxes'][-1]['r'] = int(prev_char.get('r'))
        match_number += 1
        yield match_number, cur
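# A minimal sketch of how find_matches is consumed, mirroring the __main__
# block below (hl_body comes from Solr's highlighting response):
#
#     abbyy_iter = par_iter(open_abbyy(filename))
#     for num, match in find_matches(hl_body, abbyy_iter, leaf0_missing):
#         print json.dumps(match, indent=4)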
if __name__ == '__main__':
    (item_id, doc, path, q) = sys.argv[1:5]
    callback = sys.argv[5] if len(sys.argv) > 5 else None
    q = q.strip()
    if not q:
        # Empty query: return an error object, JSONP-wrapped if a callback
        # was supplied.
        if callback:
            print callback + '(',
        print json.dumps({ 'ia': item_id, 'q': q, 'matches': [], 'error': 'You must enter a query.' }, indent=2),
        print ')' if callback else ''
        sys.exit(0)
    reply = urllib.urlopen(solr_inside + urllib.quote('ia:' + item_id)).read()
    results = json.loads(reply)
    assert os.path.exists(path)
    re_item = re.compile('^/\d+/items/([^/]+)')
    filename = None
    for ending in 'abbyy.gz', 'abbyy.xml', 'abbyy.zip':
        test_filename = os.path.join(path, doc + '_' + ending)
        if os.path.exists(test_filename):
            filename = test_filename
            break
    if callback:
        print callback + '(',
    if not results['response']['docs']:
        # The item isn't in the search-inside index yet; ask the indexer to
        # add it, and report indexed: false if that doesn't finish now.
        index_result = urlopen('http://edward.openlibrary.org/index_now/' + item_id).read()
        if not index_result.startswith('done'):
            print json.dumps({ 'ia': item_id, 'q': q, 'matches': [], 'indexed': False}, indent=2),
            print ')' if callback else ''
            sys.exit(0)
    if not filename:
        # No ABBYY OCR file was found for this item.
        print """{
  "ia": %s,
  "q": %s,
  "matches": []
}""" % (json.dumps(item_id), json.dumps(q)),

        print ')' if callback else ''
        sys.exit(0)
    solr_q = 'ia:%s AND %s' % (item_id, q)
    reply = urllib.urlopen(solr_inside + urllib.quote(solr_q)).read()
    try:
        results = json.loads(reply)
    except ValueError:
        print reply
        raise
    if not results['response']['docs']:
        print """{
  "ia": %s,
  "q": %s,
  "indexed": true,
  "matches": []
}""" % (json.dumps(item_id), json.dumps(q)),

        print ')' if callback else ''
        sys.exit(0)
    solr_doc = results['response']['docs'][0]
    hl_body = results['highlighting'][item_id]['body'][0]
    # If leaf 0 is missing from the page images, page numbers derived from
    # the ABBYY file must be offset by one.
    jp2_zip = os.path.join(path, doc + '_jp2.zip')
    tif_zip = os.path.join(path, doc + '_tif.zip')
    leaf0_missing = False
    if os.path.exists(jp2_zip):
        leaf0_missing = '0000.jp2' not in Popen(['unzip', '-l', jp2_zip], stdout=PIPE).communicate()[0]
    elif os.path.exists(tif_zip):
        leaf0_missing = '0000.tif' not in Popen(['unzip', '-l', tif_zip], stdout=PIPE).communicate()[0]

    f = open_abbyy(filename)
    abbyy_iter = par_iter(f)

    print """{
  "ia": %s,
  "q": %s,
  "indexed": true,
  "page_count": %d,
  "body_length": %d,
  "leaf0_missing": %s,
  "matches": [""" % (json.dumps(item_id), json.dumps(q), solr_doc['page_count'], solr_doc['body_length'], 'true' if leaf0_missing else 'false')
    prev = ''
    error = None
    for num, match in find_matches(hl_body, abbyy_iter, leaf0_missing):
        if 'error' in match:
            error = match['error']
            break
        if prev:
            print prev + ','
        prev = json.dumps(match, indent=4)
    if error:
        print prev, '],\n  "error": %s' % (json.dumps(error)),
    else:
        print prev, ']',
    print '\n}' + (')' if callback else '')
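
Invocation, as read from the argument handling above (an illustrative
sketch; the identifier and paths are hypothetical):

    python extract_paragraphs.py <identifier> <item_path> <identifier>_abbyy.gz
    python inside.py <identifier> <doc> <item_path> "search terms" [callback]

extract_paragraphs.py prints one paragraph of plain text per line followed
by a 'meta: <lang> <page_count>' footer; inside.py prints a JSON object
(wrapped in 'callback(...)' when a callback is given) whose matches carry
page numbers and box coordinates for highlighting.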