3 # openisis - an open implementation of the CDS/ISIS database
4 # Version 0.8.x (patchlevel see file Version)
5 # Copyright (C) 2001-2003 by Erik Grziwotz, erik@openisis.org
7 # This library is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU Lesser General Public
9 # License as published by the Free Software Foundation; either
10 # version 2.1 of the License, or (at your option) any later version.
12 # This library is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 # Lesser General Public License for more details.
17 # You should have received a copy of the GNU Lesser General Public
18 # License along with this library; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 # see README for more information
# Revision identifier in CVS/RCS "Id" keyword format: file name, revision,
# date/time, author and state. Single-quoted so the shell expands nothing.
25 CVSID='$Id: any2txt,v 1.4 2003/04/08 00:20:52 kripke Exp $'
27 # utility to convert several document types to text
28 # used for mail archiving
33 # filename is a name of the file in the current directory
34 # we are allowed to use any filename.*
38 # the least significant bit of the code is set when a plain text version was written to filename.txt
40 # 0 file was fine text and is unmodified
41 # 1 file was sort of (non-iso,long-lined) text, plain version is filename.txt
42 # 2 file was formatted html text
43 # 3 file was formatted html text, plain version is filename.txt
44 # 4,5 likewise for pdf
45 # 6,7 likewise for rtf
46 # 8,9 likewise for doc (Microsoft Word)
47 # 16,17 file was mail text
# Translate each byte in the octal range 200-237 (0x80-0x9F, 32 values) to
# the character at the same position in the second operand. The second
# operand is two single-quoted halves joined by two backslash-escaped single
# quotes, which contribute two literal quote characters to the set.
# Presumably this is the body of the dewindoofy filter applied to
# "Non-ISO extended-ASCII" input; the enclosing definition is not visible.
# NOTE(review): a one-to-one mapping needs exactly 32 single-byte replacement
# characters; the two o-umlaut characters are one byte each only if this
# script is stored in a single-byte encoding such as ISO-8859-1 -- confirm.
67 tr '\200-\237' 'E?,f".+#^%S<Ö?Z??'\'\''""*--"Ts>ö?zY'
# Trace the invocation on stderr: all arguments plus the working directory.
70 echo "any2txt $* ($PWD)" >&2
# Process options only while more than one argument remains, so the final
# argument is never consumed as an option.
72 while [ 1 -lt $# ]; do # shift options
# -m takes a value: remember it in mtype and shift once here.
# presumably a further shift in the part of the loop not visible here
# removes the value itself -- confirm
75 -m) mtype="$2"; shift;;
83 # no output expected; so any output is error
# The input must be an existing regular file and readable; otherwise err
# (defined outside the visible lines) is called with the message.
# NOTE(review): POSIX marks -a inside [ ] as obsolescent; two tests joined
# with && would be the portable spelling.
88 [ -f "$filename" -a -r "$filename" ] || err "'$filename' not readable"
# Classify the file by content; -b makes file(1) omit the file name prefix,
# so ftype holds only the type description matched by the patterns below.
90 ftype="$(file -b "$filename")"
# Delete everything from line 1 through the first empty line and write the
# remainder to filename.txt.
# presumably the mail branch (strips the header block); its pattern is not
# visible here -- confirm
95 sed -e '1,/^$/d' <"$filename" >"$filename".txt
98 *HTML*) # maybe ASCII or ISO-8859 HTML document text, so check before ASCII
# Render the HTML as plain text with lynx; code becomes 3 only when lynx
# exits successfully.
100 lynx -dump -force_html -nolog "$filename" >"$filename".txt && code=3
102 *"Non-ISO extended-ASCII"*"long lines") # assume windows stuff
# Filter through dewindoofy, then let fmt -s split the long lines (split
# only, no refilling of short lines).
103 dewindoofy <"$filename" | fmt -s >"$filename".txt
106 *"Non-ISO extended-ASCII"*) # assume windows stuff
# Same filtering, no line splitting needed.
107 dewindoofy <"$filename" >"$filename".txt
# ASCII / ISO-8859 text that file(1) reports as having long lines: only split
# the long lines. This arm precedes the general text arm so it matches first.
110 ASCII*text*"long lines"|ISO-8859*text*"long lines")
111 fmt -s <"$filename" >"$filename".txt
# Remaining ASCII / ISO-8859 text; the body of this arm is outside the
# visible lines.
114 ASCII*text*|ISO-8859*text*)
# Extract the PDF text to stdout (the "-" operand) and store it in
# filename.txt; code becomes 5 only on success.
# (the pattern selecting this arm is not visible here)
119 pdftotext -raw "$filename" - >"$filename".txt && code=5
# Convert RTF to HTML with unrtf, then have lynx render that HTML, reading
# the pipe through /proc/self/fd/0; code becomes 7 when the pipeline's last
# command (lynx) succeeds.
# NOTE(review): depends on a /proc filesystem exposing /proc/self/fd.
123 unrtf --nopict --html "$filename" | lynx -dump -force_html -nolist /proc/self/fd/0 >"$filename".txt && code=7
125 "Microsoft Office Document"*)
# antiword writes the document text to stdout; code becomes 9 on success.
127 antiword "$filename" >"$filename".txt && code=9
# Image types: only an even code is assigned; these arms write no
# filename.txt.
129 *"GIF image"*) code=32;;
130 *"JPEG image"*) code=34;;
131 *"PC bitmap"*) code=36;;
132 *"PNG image"*) code=38;;
133 *"TIFF image"*) code=40;;
# The value of verbose is executed as a command; when it succeeds, report
# file name, detected type and result code on stderr.
# presumably verbose holds "true" or "false", set outside the visible lines
137 $verbose && echo "$filename: $ftype: $code" >&2