# BookReaderIA/datanode/BookReaderGetText.py
#
# Reconstructed from the patch "Add header/footer detection"
# (commit 5eeb59388d375162c4ff9788b726ea98d8b2320b, Mike McCabe, 2010-09-23).
# NOTE(review): the patch hunk starts at line 24 of the original file, so
# anything above that line is not reproduced here -- restore it from the
# repository if this file is used as-is.

from lxml import etree
import sys
import re
import json

from windowed_iterator import windowed_iterator
from diff_match_patch import diff_match_patch

# Text type handed to lxml's tostring(): `unicode` on Python 2, `str` on 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str

minWordsInBlock = 25
maxWordsInBlock = 50

# Header/Footer detection parameters

# 'Window' of neighboring pages to check for similar text that may
# mark headers / footers
windowsize = 10

# Weights to assign to potential headers / footers.
# len(weights) should be even: the first half applies to the first lines of
# a page (header candidates), the second half to the last lines (footer
# candidates).
weights = (1.0, .75,
           .75, 1.0)
# weights = (1.0, .75, .5,
#            .5, .75, 1.0)

# allow potential headers/footers with this length difference
max_length_difference = 4

dmp = diff_match_patch()
dmp.Match_Distance = 2   # number of prepended characters allowed before match
dmp.Match_Threshold = .5  # 0 to 1 ... higher => more fanciful matches,
                          # slower execution.

# minimum match score for a line to be considered a header or footer.
min_score = .9

# Patterns used by simplify_line_text(), compiled once.
_NUMERAL_RE = re.compile(r'[ivx\d]')
_WHITESPACE_RE = re.compile(r'\s+')

# Starting value for the "left" and "top" edges of a bounding box; any real
# coordinate compares smaller.  It never reaches the output because a box is
# only emitted after at least one word has been folded into it.
_UNSET_MIN = float('inf')


def guess_hfs(page, pages):
    """Return a dict whose keys are the likely header/footer lines of `page`.

    A line is considered a likely header/footer if it is near the start/end
    of the page and is textually similar to the line at the same position on
    neighboring pages.

    page  -- the OBJECT element of the page being examined.
    pages -- a 'windowed iterator' whose neighbors() method yields the
             neighboring page elements.

    Each neighbor whose candidate line matches adds weights[i] (i being the
    candidate position) to that line's score; the line is reported as soon
    as its score exceeds min_score.  The values of the returned dict are
    always True; only the keys (LINE elements) matter.
    """
    result = {}
    hf_candidates = get_hf_candidates(page)
    # Cache of candidates per neighbor page, so each neighbor is analysed
    # once even though it is visited for every candidate position.
    neighbor_info = {}

    for i, candidate in enumerate(hf_candidates):
        if candidate is None:
            continue
        line, text = candidate
        score = 0
        for neighbor_page in pages.neighbors():
            if neighbor_page not in neighbor_info:
                neighbor_info[neighbor_page] = get_hf_candidates(neighbor_page)
            neighbor_candidate = neighbor_info[neighbor_page][i]
            if neighbor_candidate is None:
                continue
            neighbor_text = neighbor_candidate[1]
            # Cheap length filter before the fuzzy match.
            if abs(len(text) - len(neighbor_text)) > max_length_difference:
                continue
            if dmp.match_main(text, neighbor_text, 0) != -1:
                score += weights[i]
                if score > min_score:
                    result[line] = True
                    break
    return result


def simplify_line_text(line):
    """Return the text of a LINE element, normalised for fuzzy comparison.

    The text is lower-cased, every digit and every 'i', 'v' or 'x' character
    is replaced by '@', and runs of whitespace are collapsed to one space.
    """
    text = etree.tostring(line, method='text', encoding=_text_type).lower()
    # collapse numbers (roman too) to '@' so headers will be more
    # similar from page to page
    text = _NUMERAL_RE.sub('@', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text


def get_hf_candidates(page):
    """Return the header/footer candidate lines of `page`.

    The result has len(weights) entries (for an even len(weights)): the
    first half are the first lines of the page, the second half the last
    lines.  Each entry is a (line_element, simplified_text) tuple, or None
    when the page has too few lines for that position.
    """
    hfwin = len(weights) // 2
    lines = page.findall('.//LINE')
    count = len(lines)
    result = []
    for i in list(range(hfwin)) + list(range(-hfwin, 0)):
        # lines[i] exists for 0 <= i < count and for -count <= i < 0.
        exists = (i < count) if i >= 0 else (-i <= count)
        if exists:
            result.append((lines[i], simplify_line_text(lines[i])))
        else:
            result.append(None)
    return result


def _new_box():
    """Return an empty bounding box as [left, bottom, right, top]."""
    return [_UNSET_MIN, -1, -1, _UNSET_MIN]


def _extend_box(box, coords):
    """Grow `box` in place so it covers `coords` (l, b, r, t integers)."""
    if coords[0] < box[0]:
        box[0] = coords[0]
    if coords[1] > box[1]:
        box[1] = coords[1]
    if coords[2] > box[2]:
        box[2] = coords[2]
    if coords[3] < box[3]:
        box[3] = coords[3]


def _extract_text_blocks(lines, hfs):
    """Group the words of `lines` into text blocks with their rectangles.

    lines -- LINE elements of one page, in document order.
    hfs   -- container of LINE elements to skip (headers/footers).

    Returns a list of blocks.  Each block is a list whose first item is the
    block text and whose remaining items are [left, bottom, right, top]
    rectangles, one per line (or part of a line) contributing to the block.
    A block ends at a word ending in '.' once it holds more than
    minWordsInBlock words, or at the end of a line once it holds more than
    maxWordsInBlock words.
    """
    textBlocks = []
    block = ''
    rects = []
    numWords = 0

    for line in lines:
        # skip headers/footers
        if line in hfs:
            continue

        box = _new_box()
        numWordsInLine = 0

        for word in line.findall('.//WORD'):
            numWordsInLine += 1

            # An empty <WORD/> has text None; treat it as an empty string.
            text = word.text or ''

            coords = [int(c) for c in word.get('coords').split(',')]  # l,b,r,t
            _extend_box(box, coords)

            block += text + ' '
            numWords += 1

            if text.endswith('.') and numWords > minWordsInBlock:
                # Sentence end in a long-enough block: close the block here.
                rects.append(box)
                rects.insert(0, block.strip())
                textBlocks.append(rects)
                block = ''
                rects = []
                numWords = 0
                numWordsInLine = 0
                box = _new_box()

        # end of line
        if numWordsInLine > 0:
            rects.append(box)

        if numWords > maxWordsInBlock:
            rects.insert(0, block.strip())
            textBlocks.append(rects)
            block = ''
            numWords = 0
            rects = []

    if block != '':
        rects.insert(0, block.strip())
        textBlocks.append(rects)

    return textBlocks


def main(args):
    """Print the text blocks of one page as a JavaScript callback call.

    args -- [path, pageNum, callback]:
            path     -- XML file containing OBJECT (page) elements,
            pageNum  -- zero-based index of the page to extract,
            callback -- name of the `br` method to call in the output.

    Writes 'br.<callback>(<json>);' to standard output.

    Raises IndexError if the document has no page with index pageNum.
    """
    path = args[0]
    pageNum = int(args[1])
    # NOTE(review): callback is written into the output unescaped; if it can
    # come from an untrusted request it should be validated by the caller.
    callback = args[2]

    def drop_event(events):
        for event, element in events:
            yield element

    def clear_page(old_page):
        old_page.clear()

    # The page iterator parses lazily, so the file has to stay open until
    # the neighbors of the requested page have been examined.
    with open(path, 'rb') as f:
        context = etree.iterparse(f, tag='OBJECT')
        pages = windowed_iterator(drop_event(context), windowsize, clear_page)

        page = None
        for i, candidate in enumerate(pages):
            if i == pageNum:
                page = candidate
                break
        if page is None:
            raise IndexError('page %d not found in %s' % (pageNum, path))

        hfs = guess_hfs(page, pages)
        textBlocks = _extract_text_blocks(page.findall('.//LINE'), hfs)

    print('br.%s(%s);' % (callback, json.dumps(textBlocks)))


if __name__ == '__main__':
    main(sys.argv[1:])