BookReaderIA/datanode/BookReaderGetText.py

   1 #!/usr/bin/python
   2
   3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
   4 #
   5 # This file is part of BookReader.
   6 #
   7 #     BookReader is free software: you can redistribute it and/or modify
   8 #     it under the terms of the GNU Affero General Public License as published by
   9 #     the Free Software Foundation, either version 3 of the License, or
  10 #     (at your option) any later version.
  11 #
  12 #     BookReader is distributed in the hope that it will be useful,
  13 #     but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #     GNU Affero General Public License for more details.
  16 #
  17 #     You should have received a copy of the GNU Affero General Public License
  18 #     along with BookReader.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 #     The BookReader source is hosted at http://github.com/openlibrary/bookreader/
  21
  22
  23 #watch out for blank lines (<LINE></LINE>)
  24
  25 from lxml import etree
  26 import sys
  27 import re
  28 import json
  29
  30 from windowed_iterator import windowed_iterator
  31 from diff_match_patch import diff_match_patch
  32
  33 minWordsInBlock = 25
  34 maxWordsInBlock = 50
  35
  36 # Header/Footer detection parameters
  37
  38 # 'Window' of neighboring pages to check for similar text that may
  39 # mark headers / footers
  40 windowsize = 10
  41
  42 # Weights to assign to potential headers / footers.
  43 # len(weights) should be even.
  44 weights = (1.0, .75,
  45            .75, 1.0)
  46 # weights = (1.0, .75, .5,
  47 #            .5, .75, 1.0)
  48
  49 # allow potential headers/footers with this length difference
  50 max_length_difference = 4
  51
  52 dmp = diff_match_patch()
  53 dmp.Match_Distance = 2 # number of prepended characters allowed before match
  54 dmp.Match_Threshold = .5 # 0 to 1 ... higher => more fanciful matches,
  55                          # slower execution.
  56
  57 # minimum match score for a line to be considered a header or footer.
  58 min_score = .9
  59
  60
  61 def guess_hfs(page, pages):
  62     """ Given a page and a 'windowed iterator' giving access to neighboring
  63     pages, return a dict containing likely header/footer lines on that page.
  64
  65     A line is considered a likely header/footer if it's near the
  66     start/end of the page, and if it is textually similar the same
  67     line on neighboring pages.
  68     """
  69
  70     result = {}
  71
  72     hf_candidates = get_hf_candidates(page)
  73     neighbor_info = {}
  74     for i in range(len(weights)):
  75         if hf_candidates[i] is None:
  76             continue
  77         score = 0
  78         for neighbor_page in pages.neighbors():
  79             if neighbor_page in neighbor_info:
  80                 neighbor_candidates = neighbor_info[neighbor_page]
  81             else:
  82                 neighbor_candidates = get_hf_candidates(neighbor_page)
  83                 neighbor_info[neighbor_page] = neighbor_candidates
  84             if neighbor_candidates[i] is None:
  85                 continue
  86             text = hf_candidates[i][1]
  87             neighbor_text = neighbor_candidates[i][1]
  88             if abs(len(text) - len(neighbor_text)) > max_length_difference:
  89                 continue
  90
  91             matchstart = dmp.match_main(hf_candidates[i][1],
  92                                         neighbor_candidates[i][1], 0)
  93             if matchstart != -1:
  94                 score += weights[i]
  95             if score > min_score:
  96                 result[hf_candidates[i][0]] = True
  97                 break
  98     return result
  99
 100
 101 def simplify_line_text(line):
 102     text = etree.tostring(line, method='text', encoding=unicode).lower();
 103     # collape numbers (roman too) to '@' so headers will be more
 104     # similar from page to page
 105     text = re.sub(r'[ivx\d]', r'@', text)
 106     text = re.sub(r'\s+', r' ', text)
 107     return text
 108
 109
 110 def get_hf_candidates(page):
 111     result = []
 112     hfwin = len(weights) / 2
 113     lines = [line for line in page.findall('.//LINE')]
 114     for i in range(hfwin) + range(-hfwin, 0):
 115         if abs(i) < len(lines):
 116             result.append((lines[i], simplify_line_text(lines[i])))
 117         else:
 118             result.append(None)
 119     return result
 120
 121
 122 def main(args):
 123     path = args[0]
 124     pageNum = int(args[1])
 125     callback = args[2]
 126
 127     f = open(path)
 128     context = etree.iterparse(f, tag='OBJECT')
 129     def drop_event(iter):
 130         for event, page in iter:
 131             yield page
 132     pages = drop_event(context)
 133     def clear_page(page):
 134         page.clear()
 135     pages = windowed_iterator(pages, windowsize, clear_page)
 136     for i, page in enumerate(pages):
 137         if i == pageNum:
 138             break
 139     hfs = guess_hfs(page, pages)
 140
 141     lines = page.findall('.//LINE')
 142
 143     #print 'got %s .//lines' % len(lines)
 144
 145     textBlocks = []
 146     block = ''
 147     rects = []
 148
 149     numWords = 0
 150
 151     for line in lines:
 152         # skip headers/footers
 153         if line in hfs:
 154             continue
 155
 156         top = sys.maxint
 157         left = sys.maxint
 158         right = -1
 159         bottom = -1
 160
 161         numWordsInLine = 0
 162
 163         words = line.findall('.//WORD')
 164
 165         #print 'at start of line, rects ='
 166         #print rects
 167
 168         for word in words:
 169
 170             numWordsInLine += 1
 171
 172             text = word.text
 173             #print 'got text ' + text
 174
 175             coords = word.get('coords').split(',') #l,b,r,t
 176             coords = map(int, coords)
 177
 178             if int(coords[0]) < left:
 179                 left = coords[0]
 180
 181             if coords[1] > bottom:
 182                 bottom = coords[1]
 183
 184             if coords[2] > right:
 185                 right = coords[2]
 186
 187             if coords[3] < top:
 188                 top = coords[3]
 189
 190             block += word.text + ' '
 191             numWords += 1
 192
 193             if text.endswith('.') and (numWords>minWordsInBlock):
 194                 #print 'end of block with numWords=%d' % numWords
 195                 #print 'block = ' + block
 196
 197                 rects.append([left, bottom, right, top])
 198
 199                 #textBlocks.append(block.strip())
 200                 rects.insert(0, block.strip())
 201                 textBlocks.append(rects)
 202                 block = ''
 203                 rects = []
 204                 numWords = 0
 205                 numWordsInLine = 0
 206                 top = sys.maxint
 207                 left = sys.maxint
 208                 right = -1
 209                 bottom = -1
 210
 211         #end of line
 212         if numWordsInLine > 0:
 213             rects.append([left, bottom, right, top])
 214
 215         if numWords>maxWordsInBlock:
 216             #textBlocks.append(block.strip())
 217             rects.insert(0, block.strip())
 218             textBlocks.append(rects)
 219             block = ''
 220             numWords = 0
 221             rects = []
 222
 223         #print 'at end of line, rects ='
 224         #print rects
 225
 226     if '' != block:
 227         #textBlocks.append(block.strip())
 228         rects.insert(0, block.strip())
 229         textBlocks.append(rects)
 230
 231     print 'br.%s(%s);' % (callback, json.dumps(textBlocks))
 232
 233
 234 if __name__ == '__main__':
 235     main(sys.argv[1:])