3 from lxml.etree import iterparse, tostring, Element, parse
4 import sys, re, gzip, zipfile
# XML namespace of the ABBYY FineReader 6 schema, written in the '{uri}' form
# (presumably prepended to tag names when matching parsed elements -- confirm).
7 ns = '{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}'
# Matches a '.' followed only by non-word characters up to the end of the
# string; line_end_dot() uses it to detect a line that ends in a full stop.
10 re_par_end_dot = re.compile(r'\.\W*$')
# Takes one parsed line element. line_end_dot() runs a regex search on the
# result, so the return value is presumably the line's text as a string.
12 def read_text_line(line):
# NOTE(review): this loop iterates `lines`, not the `line` parameter above, and
# rebinds `line` -- it presumably belongs to a different function; confirm.
21 for line_num, line in enumerate(lines):
# First child of the first child of the current line.
22 first_char = line[0][0]
# `and` binds tighter than `or`, so this is true when wordStart == 'false', or
# when wordFromDictionary == 'false' and `cur` ends with '- '.
# NOTE(review): confirm that this grouping is the intended one.
23 if first_char.attrib.get('wordStart') == 'false' or first_char.attrib.get('wordFromDictionary') == 'false' and cur.endswith('- '):
# Append the text of every child of `fmt` to the accumulated string `cur`.
26 cur += ''.join(c.text for c in fmt)
# True for every line except the last one in `lines`.
27 if line_num + 1 != len(lines):
def line_end_dot(line):
    """Report whether the text of ``line`` ends in a full stop.

    The text is obtained from ``read_text_line`` and tested against the
    module-level pattern ``re_par_end_dot`` (a dot followed only by
    non-word characters up to the end of the string).

    Returns True when the pattern is found, False otherwise.
    """
    line_text = read_text_line(line)
    found = re_par_end_dot.search(line_text)
    # search() yields a match object or None; a match object is always truthy.
    return found is not None
# Decide from a paragraph's final line whether the paragraph looks unfinished.
# `page_w` is the page width; the visible caller passes
# float(page.attrib['width']).
34 def par_unfinished(last_line, page_w):
# Total number of children summed over the line's child elements.
35 last_line_len = sum(len(fmt) for fmt in last_line)
# Lines with fewer than 15 such children, or ending in a dot, take the early branch.
36 if last_line_len < 15 or line_end_dot(last_line):
# Last child of the line's last child element.
38 last_line_last_char = last_line[-1][-1]
# Its 'r' attribute read as a float (presumably the right edge -- confirm).
39 r = float(last_line_last_char.attrib['r'])
# True when that value lies beyond 75% of the page width.
40 return r / page_w > 0.75
def col_unfinished(last_line):
    """Tell whether a column's final line looks like it continues elsewhere.

    ``last_line`` is a sequence of sized items; their lengths are added up.
    The result is True only when that total exceeds 14 and the line does not
    end in a full stop (as judged by ``line_end_dot``).
    """
    char_total = 0
    for fmt in last_line:
        char_total += len(fmt)
    # Short lines are never treated as unfinished; the dot check is skipped.
    if char_total <= 14:
        return False
    return not line_end_dot(last_line)
# Stream parse events from `f`; elements whose tag differs from page_tag take
# the branch below.
51 for eve, page in iterparse(f):
52 if page.tag != page_tag:
# Page width as a float, later handed to par_unfinished().
56 page_w = float(page.attrib['width'])
57 assert page.tag == page_tag
# Walk the page's child blocks.
59 for block_num, block in enumerate(page):
# Branch taken for blocks whose blockType is not 'Text'.
60 if block.attrib['blockType'] != 'Text':
# Record the page number (Python 2 backticks are repr()) and the page
# dimensions as attributes on the block.
62 block.set('page', `page_num`)
63 block.set('page_width', page.get('width'))
64 block.set('page_height', page.get('height'))
# Walk the paragraphs held in `text`.
66 for par_num, par in enumerate(text):
# Branch taken when the paragraph, its first line, or that line's first child
# is empty.
67 if len(par) == 0 or len(par[0]) == 0 or len(par[0][0]) == 0:
# end_column_par is set: presumably a paragraph held back from the end of an
# earlier column -- confirm.
70 if end_column_par is not None:
# Yield the held paragraph joined with this one when last_line ends in a dot
# and this paragraph's first-line 't' is below the held paragraph's
# first-line 'b' (both compared as ints).
71 if line_end_dot(last_line) and int(par[0].attrib['t']) < int(end_column_par[0].attrib['b']):
72 yield list(end_column_par) + list(par)
# Yield the held paragraph on its own.
76 yield list(end_column_par)
# incomplete_par is set: presumably a paragraph judged unfinished earlier --
# confirm.
79 if incomplete_par is not None:
80 if line_end_dot(last_line):
81 yield list(incomplete_par) + list(par)
87 skipped_par.append(par)
# Last paragraph of the last block on the page: test with par_unfinished().
88 elif par_num + 1 == len(text) and block_num + 1 == len(page) and par_unfinished(last_line, page_w):
# Last paragraph of a block that is not the page's last: test with
# col_unfinished().
90 elif par_num + 1 == len(text) and block_num + 1 != len(page) and col_unfinished(last_line):
# Open an ABBYY OCR file, picking the opener from the filename suffix:
# 'abbyy.gz' -> gzip, 'abbyy.xml' -> plain open(), anything else must be
# 'abbyy.zip'.
98 def open_abbyy(filename):
99 if filename.endswith('abbyy.gz'):
100 return gzip.open(filename, 'rb')
101 elif filename.endswith('abbyy.xml'):
# NOTE(review): opened in the default mode, while the gzip branch asks for
# binary ('rb') -- confirm the difference is intended.
102 return open(filename)
# Any remaining filename is required to be a zip archive.
104 assert filename.endswith('abbyy.zip')
105 z = zipfile.ZipFile(filename, 'r')
# `names` (presumably the archive's member list -- confirm) must hold exactly
# one entry, ending in '_abbyy.xml'.
107 assert len(names) == 1
108 assert names[0].endswith('_abbyy.xml')
# Return a file-like object for that member; `z` is not closed here.
109 return z.open(names[0])
# Language codes that read_meta() returns unchanged; other values are looked
# up in lang_map instead.
123 langs = set(['eng', 'fre', 'deu', 'spa'])
# Parse <path>/<ia>_meta.xml and derive the item's language code from it.
125 def read_meta(ia, path):
126 root = parse(path + '/' + ia + '_meta.xml').getroot()
# Branch taken when the title element is missing or has no text.
127 title_elem = root.find('title')
128 if title_elem is None or not title_elem.text:
# Branch taken when the 'ocr' element carries the "not OCRable" marker text;
# the same message is printed (Python 2 print statement).
130 ocr_elem = root.find('ocr')
131 if ocr_elem is not None and ocr_elem.text == 'language not currently OCRable':
132 print 'language not currently OCRable'
# Branch taken when there is no language element.
134 lang_elem = root.find('language')
135 if lang_elem is None:
# Lower-cased language text: returned as-is when it is in `langs`, otherwise
# looked up in lang_map (None when absent from the map).
# NOTE(review): lang_elem.text may be None for an empty element, in which case
# .lower() would raise -- confirm whether that can occur.
137 l = lang_elem.text.lower()
138 return l if l in langs else lang_map.get(l)
# Script entry point.
140 if __name__ == '__main__':
# The third command-line argument is the OCR file name, joined onto `path`.
144 filename = path + '/' + sys.argv[3]
# Language code derived from the item's _meta.xml.
145 lang = read_meta(ia, path)
148 f = open_abbyy(filename)
# Iterate over what par_iter() yields for the opened OCR file.
149 for lines in par_iter(f):
# Convert the yielded lines to text and print it as UTF-8 bytes
# (Python 2 print statement).
153 text = par_text(lines)
154 print text.encode('utf-8')
# Summary line: language code and page count.
155 print 'meta: %s %d' % (lang, page_count)