BookReaderIA/datanode/BookReaderGetText.py

   1 #!/usr/bin/python
   2
   3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
   4 #
   5 # This file is part of BookReader.
   6 #
   7 #     BookReader is free software: you can redistribute it and/or modify
   8 #     it under the terms of the GNU Affero General Public License as published by
   9 #     the Free Software Foundation, either version 3 of the License, or
  10 #     (at your option) any later version.
  11 #
  12 #     BookReader is distributed in the hope that it will be useful,
  13 #     but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #     GNU Affero General Public License for more details.
  16 #
  17 #     You should have received a copy of the GNU Affero General Public License
  18 #     along with BookReader.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 #     The BookReader source is hosted at http://github.com/openlibrary/bookreader/
  21
  22
  23 #watch out for blank lines (<LINE></LINE>)
  24
  25 from lxml import etree
  26 import sys
  27 import json
  28
  29 minWordsInBlock = 25
  30 maxWordsInBlock = 50
  31
  32 path = sys.argv[1]
  33 pageNum = int(sys.argv[2])
  34 callback = sys.argv[3]
  35
  36 tree = etree.parse(path)
  37
  38 objects = tree.findall('//OBJECT')
  39
  40 #print 'got %s objects' % len(objects)
  41
  42 page = objects[pageNum]
  43
  44 lines = page.findall('.//LINE')
  45
  46 #print 'got %s .//lines' % len(lines)
  47
  48 textBlocks = []
  49 block = ''
  50 rects = []
  51
  52 numWords = 0
  53
  54 for line in lines:
  55
  56     top = sys.maxint
  57     left = sys.maxint
  58     right = -1
  59     bottom = -1
  60
  61     numWordsInLine = 0
  62
  63     words = line.findall('.//WORD')
  64
  65     #print 'at start of line, rects ='
  66     #print rects
  67
  68     for word in words:
  69
  70         numWordsInLine += 1
  71
  72         text = word.text
  73         #print 'got text ' + text
  74
  75         coords = word.get('coords').split(',') #l,b,r,t
  76         coords = map(int, coords)
  77
  78         if int(coords[0]) < left:
  79             left = coords[0]
  80
  81         if coords[1] > bottom:
  82             bottom = coords[1]
  83
  84         if coords[2] > right:
  85             right = coords[2]
  86
  87         if coords[3] < top:
  88             top = coords[3]
  89
  90         block += word.text + ' '
  91         numWords += 1
  92
  93         if text.endswith('.') and (numWords>minWordsInBlock):
  94             #print 'end of block with numWords=%d' % numWords
  95             #print 'block = ' + block
  96
  97             rects.append([left, bottom, right, top])
  98
  99             #textBlocks.append(block.strip())
 100             rects.insert(0, block.strip())
 101             textBlocks.append(rects)
 102             block = ''
 103             rects = []
 104             numWords = 0
 105             numWordsInLine = 0
 106             top = sys.maxint
 107             left = sys.maxint
 108             right = -1
 109             bottom = -1
 110
 111     #end of line
 112     if numWordsInLine > 0:
 113         rects.append([left, bottom, right, top])
 114
 115     if numWords>maxWordsInBlock:
 116         #textBlocks.append(block.strip())
 117         rects.insert(0, block.strip())
 118         textBlocks.append(rects)
 119         block = ''
 120         numWords = 0
 121         rects = []
 122
 123     #print 'at end of line, rects ='
 124     #print rects
 125
 126 if '' != block:
 127     #textBlocks.append(block.strip())
 128     rects.insert(0, block.strip())
 129     textBlocks.append(rects)
 130
 131 print 'br.%s(%s);' % (callback, json.dumps(textBlocks))