3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
5 # This file is part of BookReader.
7 # BookReader is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # BookReader is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with BookReader. If not, see <http://www.gnu.org/licenses/>.
20 # The BookReader source is hosted at http://github.com/openlibrary/bookreader/
23 #watch out for blank lines (<LINE></LINE>)
25 from lxml import etree
30 from windowed_iterator import windowed_iterator
31 from diff_match_patch import diff_match_patch
36 # Header/Footer detection parameters
38 # 'Window' of neighboring pages to check for similar text that may
39 # mark headers / footers
42 # Weights to assign to potential headers / footers.
43 # len(weights) should be even.
46 # weights = (1.0, .75, .5,
# Allow potential headers/footers with this length difference: candidate
# lines whose simplified texts differ in length by more than this many
# characters are not compared at all.
max_length_difference = 4

# Module-wide diff_match_patch instance; its match_main() is what compares a
# candidate line with the same line on a neighboring page.
dmp = diff_match_patch()
dmp.Match_Distance = 2 # number of prepended characters allowed before match
dmp.Match_Threshold = .5 # 0 to 1 ... higher => more fanciful matches
57 # minimum match score for a line to be considered a header or footer.
61 def guess_hfs(page, pages):
62 """ Given a page and a 'windowed iterator' giving access to neighboring
63 pages, return a dict containing likely header/footer lines on that page.
65 A line is considered a likely header/footer if it's near the
66 start/end of the page, and if it is textually similar to the same
67 line on neighboring pages.
72 hf_candidates = get_hf_candidates(page)
74 for i in range(len(weights)):
75 if hf_candidates[i] is None:
78 for neighbor_page in pages.neighbors():
79 if neighbor_page in neighbor_info:
80 neighbor_candidates = neighbor_info[neighbor_page]
82 neighbor_candidates = get_hf_candidates(neighbor_page)
83 neighbor_info[neighbor_page] = neighbor_candidates
84 if neighbor_candidates[i] is None:
86 text = hf_candidates[i][1]
87 neighbor_text = neighbor_candidates[i][1]
88 if abs(len(text) - len(neighbor_text)) > max_length_difference:
91 matchstart = dmp.match_main(hf_candidates[i][1],
92 neighbor_candidates[i][1], 0)
96 result[hf_candidates[i][0]] = True
def simplify_line_text(line):
    """ Return a normalized, lowercased text form of a LINE element.

    The element is serialized as plain text (markup dropped) and lowercased.
    Every digit and every 'i', 'v' or 'x' character is then replaced by '@',
    and each run of whitespace becomes a single space.  The result is the
    string that gets compared between neighboring pages.
    """
    text = etree.tostring(line, method='text', encoding=unicode).lower()
    # Collapse numbers (roman too) to '@' so headers will be more similar
    # from page to page.  The character class is applied to the whole line,
    # so i/v/x inside ordinary words are replaced as well.
    text = re.sub(r'[ivx\d]', r'@', text)
    text = re.sub(r'\s+', r' ', text)
    # NOTE(review): this listing ended without a return, which made the
    # function yield None; callers take len() of the result and pass it to
    # dmp.match_main, so the simplified text is returned here.
    return text
def get_hf_candidates(page):
    """ Return the header/footer candidate lines of a page.

    The first and the last len(weights) // 2 LINE elements of the page are
    visited, top positions first and bottom positions after them, so that
    (for an even number of weights) the result has one slot per weight.
    Each slot is a (line_element, simplified_text) tuple, or None when the
    page has too few lines to fill that position.
    """
    # NOTE(review): the initialisation of result, the None padding and the
    # return were absent from this listing; they are restored from how
    # guess_hfs indexes the result by range(len(weights)) and tests entries
    # with 'is None'.
    result = []
    # Floor division keeps the window size an int on Python 2 and Python 3.
    hfwin = len(weights) // 2
    lines = list(page.findall('.//LINE'))
    # Offsets 0..hfwin-1 address the top of the page, -hfwin..-1 the bottom.
    offsets = list(range(hfwin)) + list(range(-hfwin, 0))
    for i in offsets:
        if abs(i) < len(lines):
            result.append((lines[i], simplify_line_text(lines[i])))
        else:
            # Keep positions aligned with weights even on short pages.
            result.append(None)
    return result
124 pageNum = int(args[1])
128 context = etree.iterparse(f, tag='OBJECT')
129 def drop_event(iter):
130 for event, page in iter:
132 pages = drop_event(context)
133 def clear_page(page):
135 pages = windowed_iterator(pages, windowsize, clear_page)
136 for i, page in enumerate(pages):
139 hfs = guess_hfs(page, pages)
141 lines = page.findall('.//LINE')
143 #print 'got %s .//lines' % len(lines)
152 # skip headers/footers
163 words = line.findall('.//WORD')
165 #print 'at start of line, rects ='
173 #print 'got text ' + text
175 coords = word.get('coords').split(',') #l,b,r,t
176 coords = map(int, coords)
178 if int(coords[0]) < left:
181 if coords[1] > bottom:
184 if coords[2] > right:
190 block += word.text + ' '
193 if text.endswith('.') and (numWords>minWordsInBlock):
194 #print 'end of block with numWords=%d' % numWords
195 #print 'block = ' + block
197 rects.append([left, bottom, right, top])
199 #textBlocks.append(block.strip())
200 rects.insert(0, block.strip())
201 textBlocks.append(rects)
212 if numWordsInLine > 0:
213 rects.append([left, bottom, right, top])
215 if numWords>maxWordsInBlock:
216 #textBlocks.append(block.strip())
217 rects.insert(0, block.strip())
218 textBlocks.append(rects)
223 #print 'at end of line, rects ='
227 #textBlocks.append(block.strip())
228 rects.insert(0, block.strip())
229 textBlocks.append(rects)
231 print 'br.%s(%s);' % (callback, json.dumps(textBlocks))
234 if __name__ == '__main__':