3 # Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3.
5 # This file is part of BookReader.
7 # BookReader is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
12 # BookReader is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with BookReader. If not, see <http://www.gnu.org/licenses/>.
20 # The BookReader source is hosted at http://github.com/openlibrary/bookreader/
23 # Watch out for empty LINE elements (<LINE></LINE>): such a line has no WORD children.
25 from lxml import etree
# Extracts the text of one page of an XML document as a list of "text blocks"
# and prints it as a JavaScript call of the form `br.<callback>(<JSON>);`.
#
# NOTE(review): this view of the script is incomplete. The statements that
# bring `sys`, `json` and `path` into scope, that initialise
# `minWordsInBlock`, `maxWordsInBlock`, `textBlocks`, `block`, `rects`,
# `numWords`, `numWordsInLine`, `left`/`bottom`/`right`/`top`, and the loop
# headers over `lines` and `words` are not visible here. Comments below
# describe only what the visible statements do; confirm the rest against the
# full file.
#
# Command-line arguments read here: argv[2] is converted to an integer and
# used directly as a list index into `objects`; argv[3] is the name that is
# interpolated into the printed `br.<callback>(...)` call.
33 pageNum = int(sys.argv[2])
34 callback = sys.argv[3]
# Parse the XML file at `path` with lxml.
36 tree = etree.parse(path)
# Collect the OBJECT elements of the document; presumably one OBJECT per
# page (the selected element is named `page` below) -- TODO confirm.
38 objects = tree.findall('//OBJECT')
40 #print 'got %s objects' % len(objects)
# NOTE(review): no bounds check is visible; an out-of-range pageNum raises
# IndexError here.
42 page = objects[pageNum]
# All LINE elements nested anywhere under the selected OBJECT.
44 lines = page.findall('.//LINE')
46 #print 'got %s .//lines' % len(lines)
# All WORD elements nested under the current LINE (`line` is presumably the
# loop variable of a loop over `lines`; the loop header is not visible).
63 words = line.findall('.//WORD')
65 #print 'at start of line, rects ='
73 #print 'got text ' + text
# The 'coords' attribute is a comma-separated string; the values are
# converted to integers. Index 0 is compared with `left` and index 1 with
# `bottom` below, matching the l,b,r,t order given in the inline note.
# NOTE(review): word.get('coords') returns None when the attribute is
# absent, which would make .split fail.
75 coords = word.get('coords').split(',') #l,b,r,t
# NOTE(review): under Python 2 map() returns a list, which the indexing below
# relies on; under Python 3 it returns an iterator that cannot be indexed.
76 coords = map(int, coords)
# The int() call is redundant: coords already holds integers at this point.
78 if int(coords[0]) < left:
# Presumably tracks the largest bottom coordinate seen on this line (the
# assignment that follows is not visible).
81 if coords[1] > bottom:
# Append the word's text plus a trailing space to the block being built.
# NOTE(review): word.text is None for a WORD element without text, which
# would raise TypeError here.
90 block += word.text + ' '
# A block is closed when the current text ends with '.' and more than
# minWordsInBlock words have been counted in `numWords`.
93 if text.endswith('.') and (numWords>minWordsInBlock):
94 #print 'end of block with numWords=%d' % numWords
95 #print 'block = ' + block
# Record the current bounding box as [left, bottom, right, top].
97 rects.append([left, bottom, right, top])
99 #textBlocks.append(block.strip())
# Put the stripped block text at index 0 of `rects`, so each entry appended
# to textBlocks has the shape [text, rect, rect, ...].
100 rects.insert(0, block.strip())
101 textBlocks.append(rects)
# Only record a bounding box when the line-level word counter is positive;
# this is what skips empty LINE elements.
112 if numWordsInLine > 0:
113 rects.append([left, bottom, right, top])
# Also close the block once more than maxWordsInBlock words were counted,
# regardless of punctuation.
115 if numWords>maxWordsInBlock:
116 #textBlocks.append(block.strip())
117 rects.insert(0, block.strip())
118 textBlocks.append(rects)
123 #print 'at end of line, rects ='
# Same "text first, then rectangles" packaging as above; the condition
# guarding these two statements is not visible in this view.
127 #textBlocks.append(block.strip())
128 rects.insert(0, block.strip())
129 textBlocks.append(rects)
# Emit the result as `br.<callback>(<JSON array>);` on standard output.
# This is a Python 2 print statement.
# NOTE(review): `callback` is interpolated verbatim with no validation
# visible here; if it can come from an untrusted request it should be
# restricted to a safe identifier pattern -- confirm against the caller.
131 print 'br.%s(%s);' % (callback, json.dumps(textBlocks))