#!/bin/bash
#/*
#  openisis - an open implementation of the CDS/ISIS database
#  Version 0.8.x (patchlevel see file Version)
#  Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
#
#  This library is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2.1 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#  see README for more information
#EOH */
#
# CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $'
#
# utility to convert several document types to text
# used for mail archiving
# similar to anytopnm
#
# usage:
#  any2txt [-v] [-m mtype] filename
#    -v        report "filename: type: code" on stderr before exiting
#    -m mtype  accepted and stored, but not used by this script
#  The last argument is always taken as the filename.
#
# assumptions:
#  filename is a name of the file in the current directory
#  we are allowed to use any filename.*
#
# return code:
#  the lowest valued bit denotes that there is a plain version in filename.txt
#
#  0     file was fine text and is unmodified
#  1     file was sort of (non-iso,long-lined) text, plain version is filename.txt
#  2     file was formatted html text
#  3     file was formatted html text, plain version is filename.txt
#  4,5   likewise for pdf
#  6,7   likewise for rtf
#  8,9   likewise for doc (MS Word)
#  16,17 file was mail text
#  32    gif image
#  34    jpg image
#  36    bmp image
#  38    png image
#  40    tiff image
#  46    other image
#  ??    other
#  124   unknown
#  126   error
#

# A converter pipeline only counts as successful if every stage succeeded
# (otherwise a failing unrtf would be hidden behind a successful lynx).
set -o pipefail

verbose=false

# Print the given message on stderr and terminate with the "error" code 126.
err () {
  printf '%s\n' "$*" >&2
  exit 126
}

# Filter stdin to stdout, replacing the bytes 0x80-0x9F (which are not
# printable in ISO-8859-1; presumably Windows-1252 punctuation) by
# ASCII / ISO-8859-1 look-alikes.
# The replacement set is written with octal escapes where it matters:
#   \326 and \366 are the ISO-8859-1 bytes for O-umlaut and o-umlaut, so the
#     mapping does not depend on the encoding this script is stored in;
#   \055 is a literal '-'; written plainly, "*--" would be read by tr as
#     the range '*'..'-' and shift every following replacement by one.
dewindoofy () {
  tr '\200-\237' 'E?,f".+#^%S<\326?Z??'"''"'""*\055\055"Ts>\366?zY'
}

printf '%s\n' "any2txt $* ($PWD)" >&2

# Consume options; the last remaining argument is the filename, so the
# loop only runs while more than one argument is left.
while (( $# > 1 )); do
  case "$1" in
    -v)
      verbose=true
      ;;
    -m)
      # takes a value: drop it here, the option itself is dropped below
      mtype="$2"
      shift
      ;;
  esac
  shift
done

# No output is expected on stdout, so anything the converters print there
# is treated as diagnostics and sent to stderr (also in verbose mode).
exec >&2

filename="$1"
[[ -f "$filename" && -r "$filename" ]] || err "'$filename' not readable"

# Path handed to external tools as an argument: a leading "./" keeps a
# name starting with "-" from being parsed as an option.  Output files
# are still named after $filename itself.
src=$filename
if [[ "$src" == -* ]]; then
  src=./$src
fi

ftype="$(file -b "$src")"
code=124

# Dispatch on the description printed by file(1).  Order matters: more
# specific patterns come before the generic text patterns.  The
# "long lines" patterns end in * because file(1) may append further
# qualifiers (e.g. line terminator notes) after that phrase.
case "$ftype" in
  *"mail text"*)
    # drop the header, i.e. everything up to the first empty line
    sed -e '1,/^$/d' <"$filename" >"$filename".txt
    code=17
    ;;
  *HTML*)
    # maybe ASCII or ISO-8859 HTML document text, so check before ASCII
    code=2
    lynx -dump -force_html -nolog "$src" >"$filename".txt && code=3
    ;;
  *"Non-ISO extended-ASCII"*"long lines"*)
    # assume windows stuff
    dewindoofy <"$filename" | fmt -s >"$filename".txt
    code=1
    ;;
  *"Non-ISO extended-ASCII"*)
    # assume windows stuff
    dewindoofy <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text*"long lines"* | ISO-8859*text*"long lines"*)
    fmt -s <"$filename" >"$filename".txt
    code=1
    ;;
  ASCII*text* | ISO-8859*text*)
    code=0
    ;;
  PDF*)
    code=4
    pdftotext -raw "$src" - >"$filename".txt && code=5
    ;;
  "Rich Text Format"*)
    code=6
    # rtf -> html -> text; lynx reads the pipe via /proc/self/fd/0
    unrtf --nopict --html "$src" \
      | lynx -dump -force_html -nolist /proc/self/fd/0 >"$filename".txt \
      && code=7
    ;;
  "Microsoft Office Document"*)
    code=8
    antiword "$src" >"$filename".txt && code=9
    ;;
  *"GIF image"*)
    code=32
    ;;
  *"JPEG image"*)
    code=34
    ;;
  *"PC bitmap"*)
    code=36
    ;;
  *"PNG image"*)
    code=38
    ;;
  *"TIFF image"*)
    code=40
    ;;
  *image*)
    code=46
    ;;
esac

if "$verbose"; then
  printf '%s\n' "$filename: $ftype: $code" >&2
fi
exit "$code"