3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
5 # This file is part of BookReader.
7 # BookReader is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # BookReader is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with BookReader. If not, see <http://www.gnu.org/licenses/>.
20 # The BookReader source is hosted at http://github.com/openlibrary/bookreader/
23 #watch out for blank lines (<LINE></LINE>)
25 from lxml import etree
31 from windowed_iterator import windowed_iterator
32 from diff_match_patch import diff_match_patch
37 # Header/Footer detection parameters
39 # 'Window' of neighboring pages to check for similar text that may
40 # mark headers / footers
43 # Weights to assign to potential headers / footers.
44 # len(weights) should be even.
47 # weights = (1.0, .75, .5,
50 # allow potential headers/footers with this length difference
51 max_length_difference = 4
53 dmp = diff_match_patch()
54 dmp.Match_Distance = 2 # number of prepended characters allowed before match
55 dmp.Match_Threshold = .5 # 0 to 1 ... higher => more fanciful matches,
58 # minimum match score for a line to be considered a header or footer.
62 def guess_hfs(page, pages):
63 """ Given a page and a 'windowed iterator' giving access to neighboring
64 pages, return a dict containing likely header/footer lines on that page.
66 A line is considered a likely header/footer if it's near the
67 start/end of the page, and if it is textually similar to the same
68 line on neighboring pages.
73 hf_candidates = get_hf_candidates(page)
75 for i in range(len(weights)):
76 if hf_candidates[i] is None:
79 for neighbor_page in pages.neighbors():
80 if neighbor_page in neighbor_info:
81 neighbor_candidates = neighbor_info[neighbor_page]
83 neighbor_candidates = get_hf_candidates(neighbor_page)
84 neighbor_info[neighbor_page] = neighbor_candidates
85 if neighbor_candidates[i] is None:
87 text = hf_candidates[i][1]
88 neighbor_text = neighbor_candidates[i][1]
89 if abs(len(text) - len(neighbor_text)) > max_length_difference:
92 matchstart = dmp.match_main(hf_candidates[i][1],
93 neighbor_candidates[i][1], 0)
97 result[hf_candidates[i][0]] = True
102 def simplify_line_text(line):
103 text = etree.tostring(line, method='text', encoding=unicode).lower();
104 # collapse numbers (roman too) to '@' so headers will be more
105 # similar from page to page
106 text = re.sub(r'[ivx\d]', r'@', text)
107 text = re.sub(r'\s+', r' ', text)
111 def get_hf_candidates(page):
113 hfwin = len(weights) / 2
114 lines = [line for line in page.findall('.//LINE')]
115 for i in range(hfwin) + range(-hfwin, 0):
116 if abs(i) < len(lines):
117 result.append((lines[i], simplify_line_text(lines[i])))
125 pageNum = int(args[1])
128 if not re.match('^/\d{1,2}/items/.+_djvu.xml$', path):
131 if ('ttsNextPageCB' != callback):
132 callback = 'ttsStartCB'
135 context = etree.iterparse(f, tag='OBJECT')
136 def drop_event(iter):
137 for event, page in iter:
139 pages = drop_event(context)
140 def clear_page(page):
142 pages = windowed_iterator(pages, windowsize, clear_page)
143 for i, page in enumerate(pages):
146 hfs = guess_hfs(page, pages)
148 lines = page.findall('.//LINE')
150 #print 'got %s .//lines' % len(lines)
159 # skip headers/footers
170 words = line.findall('.//WORD')
172 #print 'at start of line, rects ='
180 #print 'got text ' + text
182 coords = word.get('coords').split(',') #l,b,r,t
183 coords = map(int, coords)
185 if int(coords[0]) < left:
188 if coords[1] > bottom:
191 if coords[2] > right:
197 block += word.text + ' '
200 if text.endswith('.') and (numWords>minWordsInBlock):
201 #print 'end of block with numWords=%d' % numWords
202 #print 'block = ' + block
204 rects.append([left, bottom, right, top])
206 #textBlocks.append(block.strip())
207 rects.insert(0, block.strip())
208 textBlocks.append(rects)
219 if numWordsInLine > 0:
220 rects.append([left, bottom, right, top])
222 if numWords>maxWordsInBlock:
223 #textBlocks.append(block.strip())
224 rects.insert(0, block.strip())
225 textBlocks.append(rects)
230 #print 'at end of line, rects ='
234 #textBlocks.append(block.strip())
235 rects.insert(0, block.strip())
236 textBlocks.append(rects)
238 print 'br.%s(%s);' % (callback, json.dumps(textBlocks))
241 if __name__ == '__main__':