Add files related to Internet Archive's implementation of fulltext search
author    Michael Ang <mang@archive.org>
          Fri, 16 Sep 2011 00:53:52 +0000 (00:53 +0000)
committer Michael Ang <mang@archive.org>
          Fri, 16 Sep 2011 00:53:52 +0000 (00:53 +0000)
BookReaderIA/fulltext/abbyy_to_text.php [new file with mode: 0644]
BookReaderIA/fulltext/extract_paragraphs.py [new file with mode: 0644]
BookReaderIA/fulltext/inside.php [new file with mode: 0644]
BookReaderIA/fulltext/inside.py [new file with mode: 0644]

diff --git a/BookReaderIA/fulltext/abbyy_to_text.php b/BookReaderIA/fulltext/abbyy_to_text.php
new file mode 100644 (file)
index 0000000..c13f94f
--- /dev/null
@@ -0,0 +1,13 @@
+<?php
+// Stream the plain text of an ABBYY OCR file by shelling out to
+// extract_paragraphs.py.  All request parameters are shell-escaped.
+$ia = escapeshellarg($_GET['ia']);
+$path = escapeshellarg($_GET['path']);
+$file = escapeshellarg($_GET['file']);
+
+$full = $_GET['path'] . '/' . $_GET['file'];
+if (!is_readable($full)) {
+    header("HTTP/1.1 403 Forbidden");
+    exit(0);
+}
+header('Content-type: text/plain');
+passthru("python extract_paragraphs.py $ia $path $file 2>&1");
+?>
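
For reference, a minimal sketch of calling this endpoint from a Python 2
client, matching the style of the scripts in this commit; the host and item
values are hypothetical, not part of the commit:

    import urllib
    params = urllib.urlencode({'ia': 'exampleitem',
                               'path': '/1/items/exampleitem',
                               'file': 'exampleitem_abbyy.gz'})
    # prints one paragraph per line, then a 'meta: <lang> <pages>' footer
    print urllib.urlopen('http://www.example.org/fulltext/abbyy_to_text.php?'
                         + params).read()
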
diff --git a/BookReaderIA/fulltext/extract_paragraphs.py b/BookReaderIA/fulltext/extract_paragraphs.py
new file mode 100644 (file)
index 0000000..185901a
--- /dev/null
@@ -0,0 +1,155 @@
+#!/usr/bin/python
+
+from lxml.etree import iterparse, parse
+import sys, re, gzip, zipfile
+
+ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
+page_tag = ns + 'page'
+
+re_par_end_dot = re.compile(r'\.\W*$')
+
+# An ABBYY <line> holds <formatting> runs, each holding <charParams> elements;
+# concatenating their text yields the characters of the line.
+def read_text_line(line):
+    text = ''
+    for fmt in line:
+        for c in fmt:
+            text += c.text
+    return text
+
+def par_text(lines):
+    cur = ''
+    for line_num, line in enumerate(lines):
+        first_char = line[0][0]
+        # undo end-of-line hyphenation: if ABBYY marks this line as starting
+        # mid-word, drop the '- ' left over from the previous line
+        if (first_char.attrib.get('wordStart') == 'false'
+                or (first_char.attrib.get('wordFromDictionary') == 'false'
+                    and cur.endswith('- '))):
+            cur = cur[:-2]
+        for fmt in line:
+            cur += ''.join(c.text for c in fmt)
+        if line_num + 1 != len(lines):
+            cur += ' '
+    return cur
+
+def line_end_dot(line):
+    return bool(re_par_end_dot.search(read_text_line(line)))
+
+# Heuristic: a paragraph probably continues on the next page if its last line
+# is reasonably long, does not end in a full stop, and reaches past 75% of the
+# page width.
+def par_unfinished(last_line, page_w):
+    last_line_len = sum(len(fmt) for fmt in last_line)
+    if last_line_len < 15 or line_end_dot(last_line):
+        return False
+    last_line_last_char = last_line[-1][-1]
+    r = float(last_line_last_char.attrib['r'])
+    return r / page_w > 0.75
+
+# Heuristic: a column's last paragraph continues in the next column if its
+# last line is long and does not end in a full stop.
+def col_unfinished(last_line):
+    return sum(len(fmt) for fmt in last_line) > 14 and not line_end_dot(last_line)
+
+# Generator over an ABBYY file: yields the string 'page' once per page, plus a
+# list of <line> elements for each paragraph.  A paragraph that appears to
+# continue across a column or page boundary is held back and re-joined with
+# its continuation before being yielded.
+def par_iter(f):
+    incomplete_par = None
+    end_column_par = None
+    skipped_par = []
+    page_num = 0
+    for event, page in iterparse(f):
+        if page.tag != page_tag:
+            continue
+        yield 'page'
+
+        page_w = float(page.attrib['width'])
+
+        for block_num, block in enumerate(page):
+            if block.attrib['blockType'] != 'Text':
+                continue
+            # stash the page number and size on each block so inside.py can
+            # map matches back to page coordinates
+            block.set('page', str(page_num))
+            block.set('page_width', page.get('width'))
+            block.set('page_height', page.get('height'))
+            region, text = block
+            for par_num, par in enumerate(text):
+                if len(par) == 0 or len(par[0]) == 0 or len(par[0][0]) == 0:
+                    continue
+                last_line = par[-1]
+                if end_column_par is not None:
+                    if line_end_dot(last_line) and int(par[0].attrib['t']) < int(end_column_par[0].attrib['b']):
+                        yield list(end_column_par) + list(par)
+                        end_column_par = None
+                        continue
+                    else:
+                        yield list(end_column_par)
+                        end_column_par = None
+
+                if incomplete_par is not None:
+                    if line_end_dot(last_line):
+                        yield list(incomplete_par) + list(par)
+                        for p in skipped_par:
+                            yield list(p)
+                        incomplete_par = None
+                        skipped_par = []
+                    else:
+                        skipped_par.append(par)
+                elif par_num + 1 == len(text) and block_num + 1 == len(page) and par_unfinished(last_line, page_w):
+                    incomplete_par = par
+                elif par_num + 1 == len(text) and block_num + 1 != len(page) and col_unfinished(last_line):
+                    end_column_par = par
+                else:
+                    yield list(par)
+
+        page_num += 1
+        page.clear()
+
+# The ABBYY OCR output may be stored gzipped, as plain XML, or as a
+# single-member zip file.
+def open_abbyy(filename):
+    if filename.endswith('abbyy.gz'):
+        return gzip.open(filename, 'rb')
+    elif filename.endswith('abbyy.xml'):
+        return open(filename)
+    else:
+        assert filename.endswith('abbyy.zip')
+        z = zipfile.ZipFile(filename, 'r')
+        names = z.namelist()
+        assert len(names) == 1
+        assert names[0].endswith('_abbyy.xml')
+        return z.open(names[0])
+
+lang_map = {
+    'english': 'eng',
+    'en': 'eng',
+    'french': 'fre',
+    'fr': 'fre',
+    'german': 'deu',
+    'de': 'deu',
+    'ger': 'deu',
+    'spanish': 'spa',
+    'es': 'spa',
+}
+
+langs = set(['eng', 'fre', 'deu', 'spa'])
+
+# Read the item's _meta.xml to decide which language the text is in; returns a
+# three-letter code from `langs`, or None if the language is missing/unknown.
+def read_meta(ia, path):
+    root = parse(path + '/' + ia + '_meta.xml').getroot()
+    title_elem = root.find('title')
+    if title_elem is None or not title_elem.text:
+        return
+    ocr_elem = root.find('ocr')
+    if ocr_elem is not None and ocr_elem.text == 'language not currently OCRable':
+        print 'language not currently OCRable'
+        sys.exit(0)
+    lang_elem = root.find('language')
+    if lang_elem is None:
+        return
+    lang = lang_elem.text.lower()
+    return lang if lang in langs else lang_map.get(lang)
+
+if __name__ == '__main__':
+    page_count = 0
+    ia = sys.argv[1]
+    path = sys.argv[2]
+    filename = path + '/' + sys.argv[3]
+    lang = read_meta(ia, path)
+    if not lang:
+        lang = 'other'
+    f = open_abbyy(filename)
+    for lines in par_iter(f):
+        if lines == 'page':
+            page_count += 1
+            continue
+        text = par_text(lines)
+        print text.encode('utf-8')
+    print 'meta: %s %d' % (lang, page_count)
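
The extractor can also be driven as a library; a minimal sketch mirroring the
__main__ block above, assuming a local exampleitem_abbyy.gz (the filename is
hypothetical):

    from extract_paragraphs import open_abbyy, par_iter, par_text

    page_count = 0
    for lines in par_iter(open_abbyy('exampleitem_abbyy.gz')):
        if lines == 'page':   # par_iter yields one 'page' marker per page
            page_count += 1
            continue
        print par_text(lines).encode('utf-8')
    print '%d pages' % page_count
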
diff --git a/BookReaderIA/fulltext/inside.php b/BookReaderIA/fulltext/inside.php
new file mode 100644 (file)
index 0000000..3699b56
--- /dev/null
@@ -0,0 +1,50 @@
+<?php
+
+// Fulltext search endpoint: looks the query up in Solr and shells out to
+// inside.py, which maps the highlighted matches back to ABBYY coordinates.
+$item_id = $_GET['item_id'];
+$path = $_GET['path'];
+$doc = $_GET['doc'];
+$q = $_GET['q'];
+$callback = isset($_GET['callback']) ? $_GET['callback'] : null;
+
+function isValidCallback($identifier) {
+    $pattern = '/^[a-zA-Z_$.][a-zA-Z0-9_$.]*$/';
+    return preg_match($pattern, $identifier) == 1;
+}
+
+function checkPrivs($filename) {
+    if (!is_readable($filename)) {        
+        header('HTTP/1.1 403 Forbidden');
+        exit(0);
+    }
+}
+
+$filename = "$path/${doc}_abbyy.gz";
+if (file_exists($filename)) {
+    checkPrivs($filename);
+} else {
+    $filename = "$path/${doc}_abbyy.zip";
+    if (file_exists($filename)) {
+        checkPrivs($filename);
+    }
+}
+
+$contentType = 'application/json'; // default
+if ($callback) {
+    if (!isValidCallback($callback) ) {
+        throw new Exception("Invalid callback");
+    }
+    $contentType = 'text/javascript'; // JSONP is not JSON
+}
+
+header('Content-type: ' . $contentType . ';charset=UTF-8');
+header('Access-Control-Allow-Origin: *'); // allow cross-origin requests
+
+$item_id = escapeshellarg($item_id);
+$doc = escapeshellarg($doc);
+$path = escapeshellarg($path);
+$q = escapeshellarg($q);
+// escape the callback too: the identifier pattern above allows '$', which the
+// shell would otherwise expand
+$callback = escapeshellarg($callback);
+
+set_time_limit(120);
+passthru("python inside.py $item_id $doc $path $q $callback 2>&1");
+?>
+
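For reference, a successful response from this endpoint (the shape follows the
JSON emitted by inside.py below; all values here are hypothetical) looks
roughly like:

    {
        "ia": "exampleitem",
        "q": "bird",
        "indexed": true,
        "page_count": 120,
        "body_length": 125000,
        "leaf0_missing": false,
        "matches": [ {
            "text": "... a {{{bird}}} in the hand ...",
            "par": [ {
                "t": 100, "b": 160, "l": 60, "r": 1400,
                "page": 12, "page_width": 1500, "page_height": 2400,
                "boxes": [ { "t": 100, "b": 160, "l": 600, "r": 760, "page": 12 } ]
            } ]
        } ]
    }
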
diff --git a/BookReaderIA/fulltext/inside.py b/BookReaderIA/fulltext/inside.py
new file mode 100644 (file)
index 0000000..747c69e
--- /dev/null
@@ -0,0 +1,213 @@
+#!/usr/bin/python
+# written by Edward Betts <edward@archive.org> in October 2010
+
+from itertools import izip
+from urllib import urlopen
+import sys, re, json, os, urllib
+from extract_paragraphs import par_iter, open_abbyy
+from subprocess import Popen, PIPE
+
+# Solr query against the search-inside index.  hl.simple.pre/post wrap each
+# highlighted match in {{{ }}} so find_matches() can locate it in the body.
+solr_inside = 'http://ol-search-inside:8983/solr/inside/select?rows=1&wt=json&fl=ia,body_length,page_count&hl=true&hl.fl=body&hl.fragsize=0&hl.maxAnalyzedChars=-1&hl.usePhraseHighlighter=true&hl.simple.pre={{{&hl.simple.post=}}}&q.op=AND&q='
+
+# Minimal stand-in for a charParams element; used by par_char() to insert a
+# space between consecutive lines.
+class Space():
+    text = ' '
+
+space = Space()
+
+# Like par_text(), but returns the charParams elements themselves (with Space
+# stand-ins between lines) so each matched character keeps its coordinates.
+def par_char(lines):
+    cur = []
+    for line_num, line in enumerate(lines):
+        first_char = line[0][0]
+        # undo end-of-line hyphenation, as in par_text()
+        if (first_char.attrib.get('wordStart') == 'false'
+                or (first_char.attrib.get('wordFromDictionary') == 'false'
+                    and len(cur) > 1 and cur[-2].text == '-')):
+            cur = cur[:-2]
+        for fmt in line:
+            cur += [c for c in fmt]
+        if line_num + 1 != len(lines):
+            cur += [space]
+    return cur
+
+# Drop the 'page' markers that par_iter() interleaves with the paragraphs.
+def skip_page(abbyy_iter):
+    for par in abbyy_iter:
+        if par != 'page':
+            yield par
+
+re_braces = re.compile(r'(\{\{\{|\}\}\})')
+
+# Walk the Solr-highlighted body text and the ABBYY paragraph stream in
+# lockstep, converting each {{{ ... }}} highlight into paragraph and line
+# bounding boxes.
+def find_matches(hl_body, abbyy_iter, leaf0_missing=False):
+    text_offset = 0
+    match_number = 0
+    leaf_offset = 1 if leaf0_missing else 0
+    for solr_line, par in izip(hl_body.split('\n'), skip_page(abbyy_iter)):
+        if '{{{' not in solr_line:
+            text_offset += len(solr_line)
+            continue
+        match_with = solr_line
+        abbyy_text = ''.join(c.text for c in par_char(par))
+        cur = {
+            'text': solr_line,
+            'par': []
+        }
+        if re_braces.sub('', cur['text']) != abbyy_text:
+            cur['error'] = 'mismatch'
+            match_number += 1
+            yield match_number, cur
+            continue
+        prev_char = None
+        match_line = None
+        match_par = None
+        for c in par_char(par):
+            text_offset += 1
+            if match_with.startswith('{{{'):
+                match_with = match_with[3:]
+                match_line = c.getparent().getparent()
+                if not cur['par'] or match_line.getparent() != match_par:
+                    match_par = match_line.getparent()
+                    block = match_par.getparent().getparent()
+                    cur['par'].append({
+                        't': int(match_par[0].get('t')),
+                        'b': int(match_par[-1].get('b')),
+                        'l': int(block.get('l')),
+                        'r': int(block.get('r')),
+                        'boxes': [],
+                        'page': int(block.get('page')) + leaf_offset,
+                        'page_width': int(block.get('page_width')),
+                        'page_height': int(block.get('page_height')),
+                    })
+                line = c.getparent().getparent()
+                cur['par'][-1]['boxes'].append({
+                    't': int(line.get('t')),
+                    'b': int(line.get('b')),
+                    'l': int(c.get('l')),
+                    'page': int(block.get('page')) + leaf_offset,
+                })
+            elif match_with.startswith('}}}'):
+                cur['par'][-1]['boxes'][-1]['r'] = int(prev_char.get('r'))
+                match_with = match_with[3:]
+                match_line = None
+            elif match_line is not None and c.getparent().getparent() != match_line:
+                # line break inside a match: close off the previous line's box,
+                # then open a new box (and, if the paragraph changed, a new par
+                # entry mirroring the fields set when a match opens; par
+                # elements carry no t/b/l/r of their own)
+                end_line_char = match_line[-1][-1]
+                cur['par'][-1]['boxes'][-1]['r'] = int(end_line_char.get('r'))
+                match_line = c.getparent().getparent()
+                if match_line.getparent() != match_par:
+                    match_par = match_line.getparent()
+                    block = match_par.getparent().getparent()
+                    cur['par'].append({
+                        't': int(match_par[0].get('t')),
+                        'b': int(match_par[-1].get('b')),
+                        'l': int(block.get('l')),
+                        'r': int(block.get('r')),
+                        'boxes': [],
+                        'page': int(block.get('page')) + leaf_offset,
+                        'page_width': int(block.get('page_width')),
+                        'page_height': int(block.get('page_height')),
+                    })
+
+                cur['par'][-1]['boxes'].append({
+                    't': int(match_line.get('t')),
+                    'b': int(match_line.get('b')),
+                    'l': int(c.get('l')),
+                    'page': int(block.get('page')) + leaf_offset,
+                })
+
+            if len(match_with) == 0:
+                break
+            assert c.text == match_with[0]
+            match_with = match_with[1:]
+            prev_char = c
+        if match_with == '}}}':
+            cur['par'][-1]['boxes'][-1]['r'] = int(prev_char.get('r'))
+        match_number += 1
+        yield match_number, cur
+
+if __name__ == '__main__':
+    (item_id, doc, path, q) = sys.argv[1:5]
+    callback = sys.argv[5] if len(sys.argv) > 5 else None
+    q = q.strip()
+    if not q:
+        if callback:
+            print callback + '(',
+        print json.dumps({ 'ia': item_id, 'q': q, 'matches': [], 'error': 'You must enter a query.' }, indent=2),
+        print ')' if callback else ''
+        sys.exit(0)
+    reply = urllib.urlopen(solr_inside + urllib.quote('ia:' + item_id)).read()
+    results = json.loads(reply)
+    assert os.path.exists(path)
+    filename = None
+    for ending in 'abbyy.gz', 'abbyy.xml', 'abbyy.zip':
+        test_filename = os.path.join(path, doc + '_' + ending)
+        if os.path.exists(test_filename):
+            filename = test_filename
+            break
+    if callback:
+        print callback + '(',
+    if not results['response']['docs']:
+        index_result = urlopen('http://edward.openlibrary.org/index_now/' + item_id).read()
+        if not index_result.startswith('done'):
+            print json.dumps({ 'ia': item_id, 'q': q, 'matches': [], 'indexed': False}, indent=2),
+            print ')' if callback else ''
+            sys.exit(0)
+    if not filename:
+        print """{
+    "ia": %s,
+    "q": %s,
+    "matches": [],
+}""" % (json.dumps(item_id), json.dumps(q)),
+
+        print ')' if callback else ''
+        sys.exit(0)
+    solr_q = 'ia:%s AND %s' % (item_id, q)
+    reply = urllib.urlopen(solr_inside + urllib.quote(solr_q)).read()
+    try:
+        results = json.loads(reply)
+    except ValueError:
+        print reply
+        raise
+    if not results['response']['docs']:
+        print """{
+    "ia": %s,
+    "q": %s,
+    "indexed": true,
+    "matches": [],
+}""" % (json.dumps(item_id), json.dumps(q)),
+
+        print ')' if callback else ''
+        sys.exit(0)
+    solr_doc = results['response']['docs'][0]
+    hl_body = results['highlighting'][item_id]['body'][0]
+    jp2_zip = os.path.join(path, doc + '_jp2.zip')
+    tif_zip = os.path.join(path, doc + '_tif.zip')
+    # if leaf 0 was stripped from the images, ABBYY page numbers are off by
+    # one relative to BookReader leaf numbers
+    leaf0_missing = False
+    if os.path.exists(jp2_zip):
+        leaf0_missing = '0000.jp2' not in Popen(['unzip', '-l', jp2_zip], stdout=PIPE).communicate()[0]
+    elif os.path.exists(tif_zip):
+        leaf0_missing = '0000.tif' not in Popen(['unzip', '-l', tif_zip], stdout=PIPE).communicate()[0]
+
+    f = open_abbyy(filename)
+    abbyy_iter = par_iter(f)
+
+    print """{
+    "ia": %s,
+    "q": %s,
+    "indexed": true,
+    "page_count": %d,
+    "body_length": %d,
+    "leaf0_missing": %s,
+    "matches": [ """ % (json.dumps(item_id), json.dumps(q), solr_doc['page_count'], solr_doc['body_length'], 'true' if leaf0_missing else 'false')
+    prev = ''
+    error = None
+    for num, match in find_matches(hl_body, abbyy_iter, leaf0_missing):
+        if 'error' in match:
+            error = match['error']
+            break
+        if prev:
+            print prev + ','
+        prev = json.dumps(match, indent=4)
+    if error:
+        print prev, '],\n    "error": %s' % (json.dumps(error)),
+    else:
+        print prev, ']',
+    print '\n}' + (')' if callback else '')
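
For reference, inside.py can also be run by hand on a datanode with the same
arguments inside.php passes to it; the values here are hypothetical, and the
trailing callback argument is optional:

    python inside.py exampleitem exampleitem /1/items/exampleitem bird myCallback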