BookReaderIA/fulltext/extract_paragraphs.py

   1 #!/usr/bin/python
   2
   3 from lxml.etree import iterparse, tostring, Element, parse
   4 import sys, re, gzip, zipfile
   5 from time import time
   6
   7 ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
   8 page_tag = ns + 'page'
   9
  10 re_par_end_dot = re.compile(r'\.\W*$')
  11
  12 def read_text_line(line):
  13     text = ''
  14     for fmt in line:
  15         for c in fmt:
  16             text += c.text
  17     return text
  18
  19 def par_text(lines):
  20     cur = ''
  21     for line_num, line in enumerate(lines):
  22         first_char = line[0][0]
  23         if first_char.attrib.get('wordStart') == 'false' or first_char.attrib.get('wordFromDictionary') == 'false' and cur.endswith('- '):
  24             cur = cur[:-2]
  25         for fmt in line:
  26             cur += ''.join(c.text for c in fmt)
  27         if line_num + 1 != len(lines):
  28             cur += ' '
  29     return cur
  30
  31 def line_end_dot(line):
  32     return bool(re_par_end_dot.search(read_text_line(line)))
  33
  34 def par_unfinished(last_line, page_w):
  35     last_line_len = sum(len(fmt) for fmt in last_line)
  36     if last_line_len < 15 or line_end_dot(last_line):
  37         return False
  38     last_line_last_char = last_line[-1][-1]
  39     r = float(last_line_last_char.attrib['r'])
  40     return r / page_w > 0.75
  41
  42 def col_unfinished(last_line):
  43     return sum(len(fmt) for fmt in last_line) > 14 and not line_end_dot(last_line)
  44
  45 def par_iter(f):
  46     incomplete_par = None
  47     end_column_par = None
  48     skipped_par = []
  49     page_num = 0
  50     t0 = time()
  51     for eve, page in iterparse(f):
  52         if page.tag != page_tag:
  53             continue
  54         yield 'page'
  55
  56         page_w = float(page.attrib['width'])
  57         assert page.tag == page_tag
  58
  59         for block_num, block in enumerate(page):
  60             if block.attrib['blockType'] != 'Text':
  61                 continue
  62             block.set('page', `page_num`)
  63             block.set('page_width', page.get('width'))
  64             block.set('page_height', page.get('height'))
  65             region, text = block
  66             for par_num, par in enumerate(text):
  67                 if len(par) == 0 or len(par[0]) == 0 or len(par[0][0]) == 0:
  68                     continue
  69                 last_line = par[-1]
  70                 if end_column_par is not None:
  71                     if line_end_dot(last_line) and int(par[0].attrib['t']) < int(end_column_par[0].attrib['b']):
  72                         yield list(end_column_par) + list(par)
  73                         end_column_par = None
  74                         continue
  75                     else:
  76                         yield list(end_column_par)
  77                         end_column_par = None
  78
  79                 if incomplete_par is not None:
  80                     if line_end_dot(last_line):
  81                         yield list(incomplete_par) + list(par)
  82                         for p in skipped_par:
  83                             yield list(p)
  84                         incomplete_par = None
  85                         skipped_par = []
  86                     else:
  87                         skipped_par.append(par)
  88                 elif par_num + 1 == len(text) and block_num + 1 == len(page) and par_unfinished(last_line, page_w):
  89                         incomplete_par = par
  90                 elif par_num + 1 == len(text) and block_num + 1 != len(page) and col_unfinished(last_line):
  91                         end_column_par = par
  92                 else:
  93                     yield list(par)
  94
  95         page_num += 1
  96         page.clear()
  97
  98 def open_abbyy(filename):
  99     if filename.endswith('abbyy.gz'):
 100         return gzip.open(filename, 'rb')
 101     elif filename.endswith('abbyy.xml'):
 102         return open(filename)
 103     else:
 104         assert filename.endswith('abbyy.zip')
 105         z = zipfile.ZipFile(filename, 'r')
 106         names = z.namelist()
 107         assert len(names) == 1
 108         assert names[0].endswith('_abbyy.xml')
 109         return z.open(names[0])
 110
 111 lang_map = {
 112     'english': 'eng',
 113     'en': 'eng',
 114     'french': 'fre',
 115     'fr': 'fre',
 116     'german': 'deu',
 117     'de': 'deu',
 118     'ger': 'deu',
 119     'spanish': 'spa',
 120     'es': 'spa',
 121 }
 122
 123 langs = set(['eng', 'fre', 'deu', 'spa'])
 124
 125 def read_meta(ia, path):
 126     root = parse(path + '/' + ia + '_meta.xml').getroot()
 127     title_elem = root.find('title')
 128     if title_elem is None or not title_elem.text:
 129         return
 130     ocr_elem = root.find('ocr')
 131     if ocr_elem is not None and ocr_elem.text == 'language not currently OCRable':
 132         print 'language not currently OCRable'
 133         sys.exit(0)
 134     lang_elem = root.find('language')
 135     if lang_elem is None:
 136         return
 137     l = lang_elem.text.lower()
 138     return l if l in langs else lang_map.get(l)
 139
 140 if __name__ == '__main__':
 141     page_count = 0
 142     ia = sys.argv[1]
 143     path = sys.argv[2]
 144     filename = path + '/' + sys.argv[3]
 145     lang = read_meta(ia, path)
 146     if not lang:
 147         lang = 'other'
 148     f = open_abbyy(filename)
 149     for lines in par_iter(f):
 150         if lines == 'page':
 151             page_count += 1
 152             continue
 153         text = par_text(lines)
 154         print text.encode('utf-8')
 155     print 'meta: %s %d' % (lang, page_count)