3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
5 # This file is part of BookReader.
7 # BookReader is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # BookReader is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with BookReader. If not, see <http://www.gnu.org/licenses/>.
20 # The BookReader source is hosted at http://github.com/openlibrary/bookreader/
23 #watch out for blank lines (<LINE></LINE>)
25 from lxml import etree
# NOTE(review): this view of the script has gaps. Loop headers, the bodies of
# the `if` statements, and the assignments of path, line, word, text, left,
# bottom, right, top, block, rects, numWords, numWordsInLine, textBlocks,
# minWordsInBlock and maxWordsInBlock are not visible here, and the original
# indentation is lost. The comments below describe only what each visible
# statement does; anything that depends on the missing lines is hedged.

# Second command-line argument, converted to an int; used below as an index
# into the list of OBJECT elements.
33 pageNum = int(sys.argv[2])
# Parse the XML file at `path` (assigned outside the visible lines).
35 tree = etree.parse(path)
# Collect the OBJECT elements of the document. The page is selected from this
# list by index, so presumably there is one OBJECT per page -- TODO confirm.
37 objects = tree.findall('//OBJECT')
39 #print 'got %s objects' % len(objects)
# Plain list indexing: raises IndexError when pageNum is out of range.
41 page = objects[pageNum]
# Every LINE element anywhere below the selected page element.
43 lines = page.findall('.//LINE')
45 #print 'got %s .//lines' % len(lines)
# WORD elements below the current LINE element. `line` is bound outside the
# visible lines, presumably by a loop over `lines` -- TODO confirm.
62 words = line.findall('.//WORD')
64 #print 'at start of line, rects ='
72 #print 'got text ' + text
# The 'coords' attribute is a comma-separated string; per the inline note the
# order is left, bottom, right, top.
74 coords = word.get('coords').split(',') #l,b,r,t
# Convert every coordinate to int. The result is indexed below, which relies
# on map() returning a list (Python 2; the script also uses the Python 2
# print statement).
75 coords = map(int, coords)
# Compare the word's left edge with the running `left` value; the branch body
# is not visible here. coords[0] is already an int, so int() is redundant.
77 if int(coords[0]) < left:
# Compare the word's bottom edge with the running `bottom` value; the branch
# body is not visible here.
80 if coords[1] > bottom:
# Append the word text plus a separating space to the current block.
# NOTE(review): raises TypeError if word.text is None (lxml reports None for
# an element without text) -- confirm whether empty WORD elements can occur.
89 block += word.text + ' '
# Block-ending condition: `text` ends with a full stop and the block already
# holds more than minWordsInBlock words. `text` is assigned outside the
# visible lines, presumably from word.text -- TODO confirm.
92 if text.endswith('.') and (numWords>minWordsInBlock):
93 #print 'end of block with numWords=%d' % numWords
94 #print 'block = ' + block
# Record the current bounding box as [left, bottom, right, top].
96 rects.append([left, bottom, right, top])
98 #textBlocks.append(block.strip())
# Put the stripped block text at the front of rects, so the entry appended to
# textBlocks has the form [text, rect, rect, ...].
99 rects.insert(0, block.strip())
100 textBlocks.append(rects)
# Presumably guards the append that follows, so a line without words adds no
# rectangle -- the nesting cannot be recovered from this view; TODO confirm.
111 if numWordsInLine > 0:
112 rects.append([left, bottom, right, top])
# Condition on the block exceeding maxWordsInBlock words; presumably it forces
# the block to be emitted by the two statements below -- TODO confirm nesting.
114 if numWords>maxWordsInBlock:
115 #textBlocks.append(block.strip())
116 rects.insert(0, block.strip())
117 textBlocks.append(rects)
122 #print 'at end of line, rects ='
126 #textBlocks.append(block.strip())
# Same text-first entry layout as above. Presumably this flushes the last,
# still-open block after all lines were processed -- TODO confirm.
127 rects.insert(0, block.strip())
128 textBlocks.append(rects)
# Write the collected blocks to stdout as the JSON argument of a
# br.ttsGetTextCB(...) call, terminated with ';'.
130 print 'br.ttsGetTextCB(' + json.dumps(textBlocks) + ');'