BookReaderIA/datanode/BookReaderGetText.py

   1 #!/usr/bin/python
   2
   3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
   4 #
   5 # This file is part of BookReader.
   6 #
   7 #     BookReader is free software: you can redistribute it and/or modify
   8 #     it under the terms of the GNU Affero General Public License as published by
   9 #     the Free Software Foundation, either version 3 of the License, or
  10 #     (at your option) any later version.
  11 #
  12 #     BookReader is distributed in the hope that it will be useful,
  13 #     but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #     GNU Affero General Public License for more details.
  16 #
  17 #     You should have received a copy of the GNU Affero General Public License
  18 #     along with BookReader.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 #     The BookReader source is hosted at http://github.com/openlibrary/bookreader/
  21
  22
  23 #watch out for blank lines (<LINE></LINE>)
  24
  25 from lxml import etree
  26 import sys
  27 import json
  28 import re
  29
  30 minWordsInBlock = 25
  31 maxWordsInBlock = 50
  32
  33 path = sys.argv[1]
  34 pageNum = int(sys.argv[2])
  35 callback = sys.argv[3]
  36
  37 if not re.match('^/\d{1,2}/items/.+_djvu.xml$', path):
  38     sys.exit(-1);
  39
  40 if ('ttsNextPageCB' != callback):
  41     callback = 'ttsStartCB'
  42
  43 tree = etree.parse(path)
  44
  45 objects = tree.findall('//OBJECT')
  46
  47 #print 'got %s objects' % len(objects)
  48
  49 if pageNum > (len(objects)-1):
  50     sys.exit(-1)
  51
  52 page = objects[pageNum]
  53
  54 lines = page.findall('.//LINE')
  55
  56 #print 'got %s .//lines' % len(lines)
  57
  58 textBlocks = []
  59 block = ''
  60 rects = []
  61
  62 numWords = 0
  63
  64 for line in lines:
  65
  66     top = sys.maxint
  67     left = sys.maxint
  68     right = -1
  69     bottom = -1
  70
  71     numWordsInLine = 0
  72
  73     words = line.findall('.//WORD')
  74
  75     #print 'at start of line, rects ='
  76     #print rects
  77
  78     for word in words:
  79
  80         numWordsInLine += 1
  81
  82         text = word.text
  83         #print 'got text ' + text
  84
  85         coords = word.get('coords').split(',') #l,b,r,t
  86         coords = map(int, coords)
  87
  88         if int(coords[0]) < left:
  89             left = coords[0]
  90
  91         if coords[1] > bottom:
  92             bottom = coords[1]
  93
  94         if coords[2] > right:
  95             right = coords[2]
  96
  97         if coords[3] < top:
  98             top = coords[3]
  99
 100         block += word.text + ' '
 101         numWords += 1
 102
 103         if text.endswith('.') and (numWords>minWordsInBlock):
 104             #print 'end of block with numWords=%d' % numWords
 105             #print 'block = ' + block
 106
 107             rects.append([left, bottom, right, top])
 108
 109             #textBlocks.append(block.strip())
 110             rects.insert(0, block.strip())
 111             textBlocks.append(rects)
 112             block = ''
 113             rects = []
 114             numWords = 0
 115             numWordsInLine = 0
 116             top = sys.maxint
 117             left = sys.maxint
 118             right = -1
 119             bottom = -1
 120
 121     #end of line
 122     if numWordsInLine > 0:
 123         rects.append([left, bottom, right, top])
 124
 125     if numWords>maxWordsInBlock:
 126         #textBlocks.append(block.strip())
 127         rects.insert(0, block.strip())
 128         textBlocks.append(rects)
 129         block = ''
 130         numWords = 0
 131         rects = []
 132
 133     #print 'at end of line, rects ='
 134     #print rects
 135
 136 if '' != block:
 137     #textBlocks.append(block.strip())
 138     rects.insert(0, block.strip())
 139     textBlocks.append(rects)
 140
 141 print 'br.%s(%s);' % (callback, json.dumps(textBlocks))