BookReaderIA/datanode/BookReaderGetText.py

   1 #!/usr/bin/python
   2
   3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
   4 #
   5 # This file is part of BookReader.
   6 #
   7 #     BookReader is free software: you can redistribute it and/or modify
   8 #     it under the terms of the GNU Affero General Public License as published by
   9 #     the Free Software Foundation, either version 3 of the License, or
  10 #     (at your option) any later version.
  11 #
  12 #     BookReader is distributed in the hope that it will be useful,
  13 #     but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #     GNU Affero General Public License for more details.
  16 #
  17 #     You should have received a copy of the GNU Affero General Public License
  18 #     along with BookReader.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 #     The BookReader source is hosted at http://github.com/openlibrary/bookreader/
  21
  22
  23 #watch out for blank lines (<LINE></LINE>)
  24
  25 from lxml import etree
  26 import sys
  27 import re
  28 import json
  29 import re
  30
  31 from windowed_iterator import windowed_iterator
  32 from diff_match_patch import diff_match_patch
  33
  34 minWordsInBlock = 25
  35 maxWordsInBlock = 50
  36
  37 # Header/Footer detection parameters
  38
  39 # 'Window' of neighboring pages to check for similar text that may
  40 # mark headers / footers
  41 windowsize = 10
  42
  43 # Weights to assign to potential headers / footers.
  44 # len(weights) should be even.
  45 weights = (1.0, .75,
  46            .75, 1.0)
  47 # weights = (1.0, .75, .5,
  48 #            .5, .75, 1.0)
  49
  50 # allow potential headers/footers with this length difference
  51 max_length_difference = 4
  52
  53 dmp = diff_match_patch()
  54 dmp.Match_Distance = 2 # number of prepended characters allowed before match
  55 dmp.Match_Threshold = .5 # 0 to 1 ... higher => more fanciful matches,
  56                          # slower execution.
  57
  58 # minimum match score for a line to be considered a header or footer.
  59 min_score = .9
  60
  61
  62 def guess_hfs(page, pages):
  63     """ Given a page and a 'windowed iterator' giving access to neighboring
  64     pages, return a dict containing likely header/footer lines on that page.
  65
  66     A line is considered a likely header/footer if it's near the
  67     start/end of the page, and if it is textually similar the same
  68     line on neighboring pages.
  69     """
  70
  71     result = {}
  72
  73     hf_candidates = get_hf_candidates(page)
  74     neighbor_info = {}
  75     for i in range(len(weights)):
  76         if hf_candidates[i] is None:
  77             continue
  78         score = 0
  79         for neighbor_page in pages.neighbors():
  80             if neighbor_page in neighbor_info:
  81                 neighbor_candidates = neighbor_info[neighbor_page]
  82             else:
  83                 neighbor_candidates = get_hf_candidates(neighbor_page)
  84                 neighbor_info[neighbor_page] = neighbor_candidates
  85             if neighbor_candidates[i] is None:
  86                 continue
  87             text = hf_candidates[i][1]
  88             neighbor_text = neighbor_candidates[i][1]
  89             if abs(len(text) - len(neighbor_text)) > max_length_difference:
  90                 continue
  91
  92             matchstart = dmp.match_main(hf_candidates[i][1],
  93                                         neighbor_candidates[i][1], 0)
  94             if matchstart != -1:
  95                 score += weights[i]
  96             if score > min_score:
  97                 result[hf_candidates[i][0]] = True
  98                 break
  99     return result
 100
 101
 102 def simplify_line_text(line):
 103     text = etree.tostring(line, method='text', encoding=unicode).lower();
 104     # collape numbers (roman too) to '@' so headers will be more
 105     # similar from page to page
 106     text = re.sub(r'[ivx\d]', r'@', text)
 107     text = re.sub(r'\s+', r' ', text)
 108     return text
 109
 110
 111 def get_hf_candidates(page):
 112     result = []
 113     hfwin = len(weights) / 2
 114     lines = [line for line in page.findall('.//LINE')]
 115     for i in range(hfwin) + range(-hfwin, 0):
 116         if abs(i) < len(lines):
 117             result.append((lines[i], simplify_line_text(lines[i])))
 118         else:
 119             result.append(None)
 120     return result
 121
 122
 123 def main(args):
 124     path = args[0]
 125     pageNum = int(args[1])
 126     callback = args[2]
 127
 128     if not re.match('^/\d{1,2}/items/.+_djvu.xml$', path):
 129         sys.exit(-1);
 130
 131     if ('ttsNextPageCB' != callback):
 132         callback = 'ttsStartCB'
 133
 134     f = open(path)
 135     context = etree.iterparse(f, tag='OBJECT')
 136     def drop_event(iter):
 137         for event, page in iter:
 138             yield page
 139     pages = drop_event(context)
 140     def clear_page(page):
 141         page.clear()
 142     pages = windowed_iterator(pages, windowsize, clear_page)
 143     for i, page in enumerate(pages):
 144         if i == pageNum:
 145             break
 146     hfs = guess_hfs(page, pages)
 147
 148     lines = page.findall('.//LINE')
 149
 150     #print 'got %s .//lines' % len(lines)
 151
 152     textBlocks = []
 153     block = ''
 154     rects = []
 155
 156     numWords = 0
 157
 158     for line in lines:
 159         # skip headers/footers
 160         if line in hfs:
 161             continue
 162
 163         top = sys.maxint
 164         left = sys.maxint
 165         right = -1
 166         bottom = -1
 167
 168         numWordsInLine = 0
 169
 170         words = line.findall('.//WORD')
 171
 172         #print 'at start of line, rects ='
 173         #print rects
 174
 175         for word in words:
 176
 177             numWordsInLine += 1
 178
 179             text = word.text
 180             #print 'got text ' + text
 181
 182             coords = word.get('coords').split(',') #l,b,r,t
 183             coords = map(int, coords)
 184
 185             if int(coords[0]) < left:
 186                 left = coords[0]
 187
 188             if coords[1] > bottom:
 189                 bottom = coords[1]
 190
 191             if coords[2] > right:
 192                 right = coords[2]
 193
 194             if coords[3] < top:
 195                 top = coords[3]
 196
 197             block += word.text + ' '
 198             numWords += 1
 199
 200             if text.endswith('.') and (numWords>minWordsInBlock):
 201                 #print 'end of block with numWords=%d' % numWords
 202                 #print 'block = ' + block
 203
 204                 rects.append([left, bottom, right, top])
 205
 206                 #textBlocks.append(block.strip())
 207                 rects.insert(0, block.strip())
 208                 textBlocks.append(rects)
 209                 block = ''
 210                 rects = []
 211                 numWords = 0
 212                 numWordsInLine = 0
 213                 top = sys.maxint
 214                 left = sys.maxint
 215                 right = -1
 216                 bottom = -1
 217
 218         #end of line
 219         if numWordsInLine > 0:
 220             rects.append([left, bottom, right, top])
 221
 222         if numWords>maxWordsInBlock:
 223             #textBlocks.append(block.strip())
 224             rects.insert(0, block.strip())
 225             textBlocks.append(rects)
 226             block = ''
 227             numWords = 0
 228             rects = []
 229
 230         #print 'at end of line, rects ='
 231         #print rects
 232
 233     if '' != block:
 234         #textBlocks.append(block.strip())
 235         rects.insert(0, block.strip())
 236         textBlocks.append(rects)
 237
 238     print 'br.%s(%s);' % (callback, json.dumps(textBlocks))
 239
 240
 241 if __name__ == '__main__':
 242     main(sys.argv[1:])