+
def simplify_line_text(line):
    """Return a normalized version of a line's text for page-to-page matching.

    The text content of ``line`` is serialized with ``etree.tostring`` and
    lowercased. Every digit and every 'i', 'v' or 'x' character is then
    replaced by '@', and each run of whitespace is collapsed to a single
    space.
    """
    raw = etree.tostring(line, method='text', encoding=unicode)
    # Collapse numbers (roman numerals too) to '@' so that headers differing
    # only by page number look alike from page to page. Note this also
    # masks the letters i, v and x inside ordinary words.
    masked = re.sub(r'[ivx\d]', r'@', raw.lower())
    return re.sub(r'\s+', r' ', masked)
+
+
def get_hf_candidates(page):
    """Collect header/footer candidate lines from the top and bottom of a page.

    The first and the last ``len(weights) // 2`` ``LINE`` elements found
    under ``page`` are inspected (``weights`` is defined elsewhere in this
    module).

    Returns a list with one slot per inspected position: the top positions
    ``0 .. hfwin - 1`` first, followed by the bottom positions
    ``-hfwin .. -1``. Each slot is either a
    ``(line_element, simplified_text)`` tuple or ``None`` when the page has
    too few lines to fill that position.
    """
    # Floor division keeps the window size an int on Python 3 as well;
    # on Python 2 it is identical to the previous integer '/'.
    hfwin = len(weights) // 2
    lines = page.findall('.//LINE')
    # list() lets the two ranges be concatenated on Python 3 too.
    positions = list(range(hfwin)) + list(range(-hfwin, 0))

    result = []
    for i in positions:
        # NOTE(review): for negative i this check also rejects
        # i == -len(lines), which would be a valid index (the first line).
        # Kept as-is to preserve the existing output; confirm whether that
        # exclusion is intended.
        if abs(i) < len(lines):
            candidate = lines[i]
            result.append((candidate, simplify_line_text(candidate)))
        else:
            result.append(None)
    return result
+
+
+def main(args):
+ path = args[0]
+ pageNum = int(args[1])
+ callback = args[2]
+
+ f = open(path)
+ context = etree.iterparse(f, tag='OBJECT')
+ def drop_event(iter):
+ for event, page in iter:
+ yield page
+ pages = drop_event(context)
+ def clear_page(page):
+ page.clear()
+ pages = windowed_iterator(pages, windowsize, clear_page)
+ for i, page in enumerate(pages):
+ if i == pageNum:
+ break
+ hfs = guess_hfs(page, pages)
+
+ lines = page.findall('.//LINE')