From dc807c409c3bfc109203bb4db65429419fc2e8fb Mon Sep 17 00:00:00 2001 From: rajbot Date: Wed, 25 Aug 2010 23:19:19 +0000 Subject: [PATCH] add BookReaderGetText.py and BookReaderGetTextWrapper.php --- BookReaderIA/datanode/BookReaderGetText.py | 130 ++++++++++++++++++ .../datanode/BookReaderGetTextWrapper.php | 29 ++++ 2 files changed, 159 insertions(+) create mode 100644 BookReaderIA/datanode/BookReaderGetText.py create mode 100644 BookReaderIA/datanode/BookReaderGetTextWrapper.php diff --git a/BookReaderIA/datanode/BookReaderGetText.py b/BookReaderIA/datanode/BookReaderGetText.py new file mode 100644 index 0000000..cf01ccb --- /dev/null +++ b/BookReaderIA/datanode/BookReaderGetText.py @@ -0,0 +1,130 @@ +#!/usr/bin/python + +# Copyright(c)2008-2010 Internet Archive. Software license AGPL version 3. +# +# This file is part of BookReader. +# +# BookReader is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# BookReader is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with BookReader. If not, see . 
#
# The BookReader source is hosted at http://github.com/openlibrary/bookreader/

"""Print the text of one page of an OCR XML file as JSON text blocks.

Usage: BookReaderGetText.py <path> <page>

<path> is an XML file in which every OBJECT element is treated as one page
made of LINE elements that contain WORD elements.  Each WORD carries its text
and a comma-separated ``coords`` attribute (left,bottom,right,top).  <page>
is the zero-based index of the OBJECT to read.

The output is a JSON array of blocks.  Each block is a list whose first item
is the block text and whose remaining items are ``[left, bottom, right, top]``
rectangles, one per line (or part of a line) that contributed to the block.
"""

import json
import sys

# A block is closed at the first sentence end ('.') once it holds more than
# minWordsInBlock words, or at the end of a line once it holds more than
# maxWordsInBlock words.
minWordsInBlock = 50
maxWordsInBlock = 100


def parse_coords(raw):
    """Return the first four integers of a ``coords`` attribute, or None.

    None is returned when the attribute is missing, has fewer than four
    fields, or contains a field that is not an integer, so that callers can
    skip the word instead of crashing on it.
    """
    if raw is None:
        return None
    fields = raw.split(',')
    if len(fields) < 4:
        return None
    try:
        return [int(field) for field in fields[:4]]
    except ValueError:
        return None


def extend_box(box, coords):
    """Return the rectangle ``box`` grown to include ``coords``.

    Both are ``[left, bottom, right, top]``.  Bottom is tracked as a maximum
    and top as a minimum, as the original script did.  ``box`` may be None,
    meaning "no rectangle yet".
    """
    left, bottom, right, top = coords[:4]
    if box is None:
        return [left, bottom, right, top]
    return [
        min(box[0], left),
        max(box[1], bottom),
        max(box[2], right),
        min(box[3], top),
    ]


def build_text_blocks(lines, min_words=minWordsInBlock,
                      max_words=maxWordsInBlock):
    """Group the words of a page into text blocks.

    ``lines`` is a sequence of lines, each a sequence of ``(text, coords)``
    pairs where ``coords`` is ``[left, bottom, right, top]``.

    Returns a list of blocks; each block is ``[text, rect, rect, ...]``.
    """
    text_blocks = []
    words = []   # words of the block currently being built
    rects = []   # one rectangle per line (fragment) of the current block

    for line in lines:
        box = None  # bounding box of this line's words in the current block

        for text, coords in line:
            box = extend_box(box, coords)
            words.append(text)

            if text.endswith('.') and len(words) > min_words:
                # Sentence end in a long-enough block: close it mid-line and
                # start a fresh rectangle for the remainder of the line.
                rects.append(box)
                text_blocks.append([' '.join(words).strip()] + rects)
                words, rects, box = [], [], None

        if box is not None:
            rects.append(box)

        if len(words) > max_words:
            # No sentence end in sight; close the block at the line break.
            text_blocks.append([' '.join(words).strip()] + rects)
            words, rects = [], []

    if words:
        text_blocks.append([' '.join(words).strip()] + rects)

    return text_blocks


def extract_words(line):
    """Return the usable ``(text, coords)`` pairs of a LINE element.

    WORD elements without text or without a parsable ``coords`` attribute
    are skipped; they used to abort the whole script with an exception.
    """
    usable = []
    for word in line.findall('.//WORD'):
        coords = parse_coords(word.get('coords'))
        if word.text is None or coords is None:
            continue
        usable.append((word.text, coords))
    return usable


def main(argv):
    """Run the script with ``argv`` (as in ``sys.argv``); return exit status."""
    if len(argv) < 3:
        sys.stderr.write('usage: %s <path> <page>\n' % argv[0])
        return 2

    path = argv[1]
    try:
        page_num = int(argv[2])
    except ValueError:
        sys.stderr.write('page must be an integer, got %r\n' % argv[2])
        return 2

    # Imported here so the helpers above stay importable without lxml.
    from lxml import etree

    tree = etree.parse(path)
    objects = tree.findall('.//OBJECT')

    # Reject negative indexes too: plain list indexing would silently wrap
    # around and return a page counted from the end.
    if not 0 <= page_num < len(objects):
        sys.stderr.write('page %d out of range (file has %d pages)\n'
                         % (page_num, len(objects)))
        return 1

    page = objects[page_num]
    lines = [extract_words(line) for line in page.findall('.//LINE')]

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(json.dumps(build_text_blocks(lines)))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
<?php
/*
NOTE(review): the beginning of this license header was missing from the
reviewed copy of the file; restore the full text from version control.

The BookReader source is hosted at http://github.com/openlibrary/bookreader/
*/

// Thin HTTP wrapper: runs BookReaderGetText.py on the file named by the
// "path" query parameter for the page index given by "page", and streams
// the script's output back to the client.

// Environment prefix placed in front of the python command below.
// Single-quoted on purpose: $PYTHONPATH must be expanded by the shell,
// not interpolated by PHP.
$env = 'LD_LIBRARY_PATH=/petabox/sw/lib/lxml/lib PYTHONPATH=/petabox/sw/lib/lxml/lib/python2.5/site-packages:$PYTHONPATH';

$path = isset($_GET['path']) ? $_GET['path'] : null;
$page = isset($_GET['page']) ? $_GET['page'] : null;

// Both values come straight from the query string and end up on a shell
// command line, so reject anything that is not a non-empty string path and
// a non-negative decimal page index.
if (!is_string($path) || $path === ''
    || !is_string($page) || !preg_match('/^[0-9]+$/', $page)) {
    header('HTTP/1.1 400 Bad Request');
    exit;
}

// NOTE(review): $path is still an arbitrary caller-chosen file name; consider
// restricting it to the directories this service is meant to read from.

// escapeshellarg() keeps shell metacharacters in the parameters from being
// interpreted as additional commands.
passthru("$env python BookReaderGetText.py " . escapeshellarg($path) . ' ' . escapeshellarg($page));

?>