from lxml import etree
import sys
+import re
import json
+import re
-minWordsInBlock = 50
-maxWordsInBlock = 100
+from windowed_iterator import windowed_iterator
+from diff_match_patch import diff_match_patch
-path = sys.argv[1]
-pageNum = int(sys.argv[2])
+# Block-size thresholds, counted in words.  main() closes a text block at a
+# word ending in '.' once the block holds more than minWordsInBlock words,
+# and at the end of a line once it holds more than maxWordsInBlock words.
+minWordsInBlock = 25
+maxWordsInBlock = 50
-tree = etree.parse(path)
+# Header/Footer detection parameters
-objects = tree.findall('//OBJECT')
+# 'Window' of neighboring pages to check for similar text that may
+# mark headers / footers
+windowsize = 10
-#print 'got %s objects' % len(objects)
+# Weights to assign to potential headers / footers.
+# len(weights) should be even.
+# Order matches get_hf_candidates(): the first len(weights)/2 entries are
+# for the lines at the top of the page, the remaining ones for the lines
+# at the bottom.
+weights = (1.0, .75,
+ .75, 1.0)
+# weights = (1.0, .75, .5,
+# .5, .75, 1.0)
-page = objects[pageNum]
+# allow potential headers/footers with this length difference
+# (compared on the simplified line text, in characters)
+max_length_difference = 4
-lines = page.findall('.//LINE')
+# matcher instance used by guess_hfs() to compare candidate lines
+dmp = diff_match_patch()
+dmp.Match_Distance = 2 # number of prepended characters allowed before match
+dmp.Match_Threshold = .5 # 0 to 1 ... higher => more fanciful matches,
+ # slower execution.
-#print 'got %s .//lines' % len(lines)
+# minimum match score for a line to be considered a header or footer.
+# guess_hfs() adds weights[i] for every matching neighbor page and flags
+# the line once the running total is greater than this value.
+min_score = .9
-textBlocks = []
-block = ''
-rects = []
-numWords = 0
+def guess_hfs(page, pages):
+ """ Given a page and a 'windowed iterator' giving access to neighboring
+ pages, return a dict containing likely header/footer lines on that page.
-for line in lines:
+ A line is considered a likely header/footer if it's near the
+ start/end of the page, and if it is textually similar to the same
+ line on neighboring pages.
+
+ The returned dict maps each flagged LINE element to True; lines that
+ are not flagged are simply absent.  Neighboring pages are obtained
+ from pages.neighbors().
+ """
+
+ result = {}
+
+ hf_candidates = get_hf_candidates(page)
+ # header/footer candidates already computed for a neighbor page, so each
+ # neighbor is only analysed once across all candidate positions
+ neighbor_info = {}
+ for i in range(len(weights)):
+ if hf_candidates[i] is None:
+ continue
+ # running total of weights[i], one increment per matching neighbor
+ score = 0
+ for neighbor_page in pages.neighbors():
+ if neighbor_page in neighbor_info:
+ neighbor_candidates = neighbor_info[neighbor_page]
+ else:
+ neighbor_candidates = get_hf_candidates(neighbor_page)
+ neighbor_info[neighbor_page] = neighbor_candidates
+ if neighbor_candidates[i] is None:
+ continue
+ text = hf_candidates[i][1]
+ neighbor_text = neighbor_candidates[i][1]
+ # lines whose simplified texts differ too much in length are not
+ # compared at all
+ if abs(len(text) - len(neighbor_text)) > max_length_difference:
+ continue
+
+ # a result of -1 is treated as "no match"
+ matchstart = dmp.match_main(hf_candidates[i][1],
+ neighbor_candidates[i][1], 0)
+ if matchstart != -1:
+ score += weights[i]
+ # NOTE(review): indentation is not visible in this view; the break
+ # presumably stops scanning further neighbors for this candidate
+ # once it has been flagged -- confirm against the real file.
+ if score > min_score:
+ result[hf_candidates[i][0]] = True
+ break
+ return result
- top = sys.maxint
- left = sys.maxint
- right = -1
- bottom = -1
+
+def simplify_line_text(line):
+    """Return a normalized form of a line's text for page-to-page comparison.
+
+    The text content of ``line`` is lowercased, every digit and every
+    'i', 'v' or 'x' character is replaced by '@' (so arabic and roman
+    numerals look alike from page to page), and each run of whitespace
+    is collapsed to a single space.
+    """
+    raw = etree.tostring(line, method='text', encoding=unicode)
+    # collapse numbers (roman too) to '@' so headers will be more
+    # similar from page to page
+    numbers_masked = re.sub(r'[ivx\d]', r'@', raw.lower())
+    return re.sub(r'\s+', r' ', numbers_masked)
+
+
+def get_hf_candidates(page):
+    """Collect the header/footer candidate lines of ``page``.
+
+    Looks at the first and the last ``len(weights) / 2`` LINE elements of
+    the page, top positions first.  Each slot of the returned list is
+    either a ``(line_element, simplified_text)`` tuple or ``None`` when
+    the page has too few lines for that position.
+    """
+    page_lines = page.findall('.//LINE')
+    half = len(weights) // 2
+    # indexes 0..half-1 address lines from the top, -half..-1 from the bottom
+    positions = list(range(half)) + list(range(-half, 0))
+    candidates = []
+    for pos in positions:
+        if abs(pos) >= len(page_lines):
+            candidates.append(None)
+            continue
+        candidate_line = page_lines[pos]
+        candidates.append((candidate_line, simplify_line_text(candidate_line)))
+    return candidates
+
+
+def main(args):
+ """ Print the text blocks of one page as a JavaScript callback call.
+
+ args[0] is the path of the djvu XML file, args[1] the zero-based index
+ of the page (OBJECT element) to read and args[2] the callback name.
+ The output has the form br.<callback>(<json>); where every JSON entry
+ is a list holding the block text followed by [left, bottom, right, top]
+ rectangles.  Lines reported by guess_hfs() are left out.
+ """
+ path = args[0]
+ pageNum = int(args[1])
+ callback = args[2]
+
+ # refuse any path that does not look like /<1-2 digits>/items/..._djvu.xml
+ # NOTE(review): the '.' before 'xml' is unescaped, so it matches any
+ # character.
+ if not re.match('^/\d{1,2}/items/.+_djvu.xml$', path):
+ sys.exit(-1);
+
+ # only two callback names are ever emitted: anything other than
+ # 'ttsNextPageCB' is replaced by 'ttsStartCB'
+ if ('ttsNextPageCB' != callback):
+ callback = 'ttsStartCB'
+
+ # NOTE(review): f is not explicitly closed anywhere in this function
+ f = open(path)
+ context = etree.iterparse(f, tag='OBJECT')
+ # iterparse yields (event, element) pairs; only the element is needed
+ def drop_event(iter):
+ for event, page in iter:
+ yield page
+ pages = drop_event(context)
+ # clear_page is handed to windowed_iterator; presumably it is called on
+ # pages that leave the window -- TODO confirm in windowed_iterator
+ def clear_page(page):
+ page.clear()
+ pages = windowed_iterator(pages, windowsize, clear_page)
+ # advance to the requested page
+ # NOTE(review): if pageNum is beyond the last page the loop finishes
+ # without break and 'page' is simply the last page that was iterated.
+ for i, page in enumerate(pages):
+ if i == pageNum:
+ break
+ hfs = guess_hfs(page, pages)
+
+ lines = page.findall('.//LINE')
- numWordsInLine = 0
+ #print 'got %s .//lines' % len(lines)
- words = line.findall('.//WORD')
+ textBlocks = []
+ block = ''
+ rects = []
- #print 'at start of line, rects ='
- #print rects
-
- for word in words:
+ numWords = 0
- numWordsInLine += 1
-
- text = word.text
- #print 'got text ' + text
-
- coords = word.get('coords').split(',') #l,b,r,t
- coords = map(int, coords)
-
- if int(coords[0]) < left:
- left = coords[0]
-
- if coords[1] > bottom:
- bottom = coords[1]
+ for line in lines:
+ # skip headers/footers
+ if line in hfs:
+ continue
- if coords[2] > right:
- right = coords[2]
+ top = sys.maxint
+ left = sys.maxint
+ right = -1
+ bottom = -1
- if coords[3] < top:
- top = coords[3]
-
- block += word.text + ' '
- numWords += 1
-
- if text.endswith('.') and (numWords>minWordsInBlock):
- #print 'end of block with numWords=%d' % numWords
- #print 'block = ' + block
-
- rects.append([left, bottom, right, top])
-
- #textBlocks.append(block.strip())
+ numWordsInLine = 0
+
+ words = line.findall('.//WORD')
+
+ #print 'at start of line, rects ='
+ #print rects
+
+ for word in words:
+
+ numWordsInLine += 1
+
+ text = word.text
+ #print 'got text ' + text
+
+ coords = word.get('coords').split(',') #l,b,r,t
+ coords = map(int, coords)
+
+ # grow the current bounding rectangle to include this word
+ if int(coords[0]) < left:
+ left = coords[0]
+
+ if coords[1] > bottom:
+ bottom = coords[1]
+
+ if coords[2] > right:
+ right = coords[2]
+
+ if coords[3] < top:
+ top = coords[3]
+
+ block += word.text + ' '
+ numWords += 1
+
+ # a word ending in '.' closes the block once the block holds more
+ # than minWordsInBlock words
+ if text.endswith('.') and (numWords>minWordsInBlock):
+ #print 'end of block with numWords=%d' % numWords
+ #print 'block = ' + block
+
+ rects.append([left, bottom, right, top])
+
+ #textBlocks.append(block.strip())
+ rects.insert(0, block.strip())
+ textBlocks.append(rects)
+ block = ''
+ rects = []
+ numWords = 0
+ numWordsInLine = 0
+ top = sys.maxint
+ left = sys.maxint
+ right = -1
+ bottom = -1
+
+ #end of line
+ if numWordsInLine > 0:
+ rects.append([left, bottom, right, top])
+
+ # force a block boundary at the end of a line once the block holds
+ # more than maxWordsInBlock words
+ if numWords>maxWordsInBlock:
+ #textBlocks.append(block.strip())
 rects.insert(0, block.strip())
- textBlocks.append(rects)
+ textBlocks.append(rects)
 block = ''
- rects = []
 numWords = 0
- numWordsInLine = 0
- top = sys.maxint
- left = sys.maxint
- right = -1
- bottom = -1
-
- #end of line
- if numWordsInLine > 0:
- rects.append([left, bottom, right, top])
-
- if numWords>maxWordsInBlock:
- #textBlocks.append(block.strip())
+ rects = []
+
+ #print 'at end of line, rects ='
+ #print rects
+
+ # emit whatever text is still pending after the last line
+ if '' != block:
+ #textBlocks.append(block.strip())
 rects.insert(0, block.strip())
- textBlocks.append(rects)
- block = ''
- numWords = 0
- rects = []
-
- #print 'at end of line, rects ='
- #print rects
-
-if '' != block:
- #textBlocks.append(block.strip())
- rects.insert(0, block.strip())
- textBlocks.append(rects)
-
-print json.dumps(textBlocks)
\ No newline at end of file
+ textBlocks.append(rects)
+
+ print 'br.%s(%s);' % (callback, json.dumps(textBlocks))
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])