forked from rainey/antiword-xp-rb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathantiwordxp.rb
292 lines (236 loc) · 8.31 KB
/
antiwordxp.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/ruby
#Matt Smith/Shawn Rainey
#antiword-xp.rb - Convert docx files to plaintext
# add an each_wrapped_line method to the String class
class String
  # Word-wraps the string and returns the resulting lines as an Array.
  #
  # Every input line (as produced by each_line) is treated as one paragraph.
  # Words are separated by a single space; a line keeps a trailing space
  # after its last word unless that would make it wider than +cols+.
  #
  # cols           - maximum line width. 0 (or nil) disables wrapping.
  #                  Defaults to $wrapWidth.
  # p_seperator    - text appended to the last line of every paragraph that
  #                  had to be spread over more than one line.
  # seperateSingle - when true, p_seperator is also appended to paragraphs
  #                  that fit on a single line.
  #
  # A word wider than +cols+ is emitted on a line of its own, unbroken.
  #
  # If a block is given, each wrapped line is yielded to it. The Array of
  # lines is returned in either case.
  def each_wrapped_line(cols = $wrapWidth, p_seperator = "\n", seperateSingle = false)
    # nil.to_i is 0, so an unset $wrapWidth means "do not wrap" instead of
    # raising on the width comparison below.
    cols = cols.to_i
    lines = []
    each_line do |line|
      paragraph = []
      current = String.new
      line.split.each do |word|
        if cols == 0 || current.length + word.length <= cols
          current << word
          # Add the separating space only if it still fits.
          current << " " unless current.length == cols
        else
          # The pending line is full: flush it (never flush an empty one).
          paragraph << current unless current.empty?
          if word.length + 1 > cols
            # Over-long word: give it its own line and start afresh, so the
            # line flushed above is not emitted a second time.
            paragraph << word
            current = String.new
          else
            current = word + " "
          end
        end
      end
      # Keep blank input lines as empty output lines, but do not add an
      # empty line after a paragraph that ended with an over-long word.
      paragraph << current unless current.empty? && !paragraph.empty?
      paragraph[-1] += p_seperator if seperateSingle || paragraph.length > 1
      lines.concat(paragraph)
    end
    # Yield lines if block given; the lines are returned either way.
    lines.each { |wrapped| yield(wrapped) } if block_given?
    lines
  end
end
# The $COLUMNS environment variable is not exported to this process, so the
# terminal width is obtained from `tput cols` instead.
# Sets both $consoleWidth (used for the help text) and $wrapWidth (the
# default document wrap width, which -w may override later).
begin
  IO.popen("tput cols") { |process| $consoleWidth = process.read.to_i }
rescue Errno::ENOENT
  # tput is not installed.
  $consoleWidth = 0
end
# When tput prints nothing usable, to_i yields 0, and a width of 0 would
# silently turn wrapping off. Fall back to 80 columns in that case too.
$consoleWidth = 80 unless $consoleWidth.to_i > 0
$wrapWidth = $consoleWidth
# Prints the command-line help to standard output: a short description,
# the supported arguments and a few example invocations.
# The description and the argument list are wrapped to $consoleWidth; the
# examples are printed verbatim so the command lines stay intact.
def usage
  intro = "Usage: #{$0} takes a .doc or .docx formatted word document. " \
          "It can be called either by piping the document to antiword, " \
          "or by calling `#{$0} filename`\n"
  intro.each_wrapped_line($consoleWidth) { |line| puts line }

  puts "\nArguments:"
  # One paragraph per option; each_wrapped_line wraps them separately.
  options = "-w## or -w ## Set wrap width. If not specified, uses console " \
            "width or 80 if console width cannot be determined.\n" \
            "--notimeout Disable input timeout. This could be necessary for " \
            "large files or files from external sources. Only needed when " \
            "piping in a word file, and not when one is specified in the " \
            "program's argument list."
  options.each_wrapped_line($consoleWidth) { |line| puts line }

  puts "\nExamples:\n" \
       "$#{$0} < mydoc.doc[x]\n" \
       "$#{$0} mydoc.doc[x] -w 60 --notimeout\n" \
       "$cat mydoc.doc[x] | #{$0} -w80\n"
end
# ---- Command-line parsing -------------------------------------------------
# All arguments are joined into one string (argstring) using a separator that
# does not occur in any argument, and that string is then matched against a
# small table of regular expressions (argtokens).
# Results: stdinTimeout, $wrapWidth and filename are set from the arguments.

#Timeout passed to Timeout::timeout when the document is read from stdin
# (--notimeout sets it to 0)
stdinTimeout = 5
#Path of the document to convert; stays nil when no file argument is given
filename = nil
#Generate a hash string from a random number to separate the arguments
#See antiword.rb.txt
require 'digest'
arg_sep = "<"
#Choose a new argument separator until the separator is not found in the
# arguments.
# tr maps the digits 0-9 to the letters G-P because we don't want digits in
# the separator (presumably so it cannot be confused with the number that
# follows -w -- TODO confirm)
until ARGV.join("") !~ /#{arg_sep}/
arg_sep = Digest::hexencode(Digest::SHA2.new().digest(rand().to_s)).tr("0-9", "G-P")
end
argstring = ARGV.join(arg_sep)
#If any whole argument is one or more dashes followed by "h" or "help",
# print the usage text and exit with status 1.
#Only the help flag is tested here; the file name is validated further down.
if (argstring =~ /(?:#{arg_sep}|^)-+h(?:elp)?(?=#{arg_sep}|$)/)
usage
exit(1)
else
#File name candidate taken from the arguments, not yet checked for existence
temp_fname = nil
#Table of recognised arguments: regex source => callback.
# Each callback receives the MatchData of its expression, or nil when the
# expression did not match.
argtokens = {
"--notimeout" => \
lambda { | matchData | stdinTimeout = 0 if matchData
},
#Take a width value. can be "-w10" or "-w 10" on the command line
# ("-w 10" arrives as two arguments, hence the optional separator)
"-w(?:#{arg_sep})?(\\d+)" => \
lambda { | matchData |
$wrapWidth = matchData.to_a[1].to_i unless matchData == nil
},
#Anything ending in .doc or .docx is taken as the input file name
"(.+\\.docx?)" => \
lambda { | matchData |
temp_fname = matchData.to_a[1] unless matchData == nil
}
}
argtokens.each_pair { | expression, callback |
#Call the callback function with the matchdata from the expression-injected RE
# expression is matched between arg_sep or start/end anchors
# ending separator is not included in match, but beginning separator is.
callback.call(argstring.match(/.*(?:#{arg_sep}|^)(?>#{expression})(?=#{arg_sep}|$)/))
}
#If a file name is given,
# test if the given filename exists
# (a missing file prints an error plus the usage text and exits with status 1)
if temp_fname != nil
if File.exist?(temp_fname)
filename = temp_fname
else
puts "#{temp_fname} does not exist!"
usage
Process.exit(1);
end
end
#Clear the argument list of known arguments
# (gsub! edits argstring in place, leaving only unrecognised arguments)
argtokens.each_key { | expression |
argstring.gsub!(/(?:#{arg_sep}|^)#{expression}(?=#{arg_sep}|$)/, "")
}
#If there are still arguments left, and the file name has not been
# assigned, assume that the unrecognized arg is meant to be the file,
# and output the error message + usage.
#This allows us to ignore garbage arguments when a file is supplied.
# Unfortunately, they will still be problematic when the word file is
# piped in.
if(temp_fname == nil && !argstring.empty?)
#Strip the separators so only the user's own text is shown in the message
argstring.gsub!(/#{arg_sep}/, "")
puts "#{argstring} is not a valid word file."
usage
Process.exit(1)
end
end
# True while the document text still has to be extracted from Word XML;
# cleared further down when the legacy antiword fallback produced the text.
process_xml = true
# No file was named on the command line: copy standard input to
# antiword_temp.zip and continue with that file.
if filename == nil
  begin
    require 'timeout'
    # stdinTimeout is 0 after --notimeout; Timeout.timeout treats 0 as
    # "no time limit".
    Timeout::timeout(stdinTimeout) do
      # Word documents are binary data: read and write the bytes untouched
      # (text mode can alter them on some platforms).
      $stdin.binmode
      File.open("antiword_temp.zip", "wb") { |file| file.write($stdin.read) }
      filename = "antiword_temp.zip"
    end
  rescue Timeout::Error
    # Nothing (complete) arrived in time: drop the partial copy and explain.
    File.delete("antiword_temp.zip") if File.exist?("antiword_temp.zip")
    "Timed out. This can happen if you piped in a very large file, or if you did not specify a file at all. To remedy this with very large files, add the --notimeout argument when calling #{$0}.\n".each_wrapped_line { |line| puts line }
    usage
    Process.exit(1)
  end
end
# Text of the document; filled with the raw word/document.xml first and
# converted to plain text later.
document = String.new
# Becomes false when word/document.xml could not be extracted from filename.
gotContents = true
#Set to "if false" if RubyZip is causing problems
# and it will use the system's unzip instead.
if true
  require 'rubygems'
  require 'zip/zipfilesystem'
  begin
    Zip::ZipFile.open(filename) { |awContents|
      document = awContents.read("word/document.xml")
    }
  rescue Zip::ZipError
    gotContents = false
  end
else
  # unzip options: -p pipes the extracted member to stdout, and only
  # word/document.xml is requested. result.read captures that output.
  # The command is passed as an argument array, so no shell is involved and
  # the user's file name needs no escaping; unzip's stderr is discarded.
  # The archive is `filename`, so this also works for a file named on the
  # command line and not only for the stdin copy.
  begin
    IO.popen(["unzip", "-p", filename, "word/document.xml"], "r", :err => File::NULL) { |result|
      document = result.read
    }
    gotContents = $?.success?
  rescue Errno::ENOENT
    # unzip is not installed.
    gotContents = false
  end
end
# The input could not be read as a docx archive. It may be an old binary
# .doc file, so hand it to the system's antiword program instead.
unless gotContents
  # antiword already returns plain text; skip the XML processing below.
  process_xml = nil
  # If the input is a file named by the user, copy it to antiword_temp.zip
  # so that the shell command below never has to escape a user-supplied name.
  unless filename == "antiword_temp.zip"
    # .doc files are binary: copy the bytes unchanged.
    File.open("antiword_temp.zip", "wb") { |awfile|
      File.open(filename, "rb") { |inFile| awfile.write(inFile.read) }
    }
    # The temporary copy is only deleted at the end of the script when
    # filename refers to it.
    filename = "antiword_temp.zip"
  end
  # Set antiword's options: one paragraph per line, text mode, no images.
  # This matches the format of document.xml with the tags processed.
  IO.popen('antiword antiword_temp.zip -w 0 -t -i 1 2> /dev/null') { |result|
    document = result.read
  }
  # antiword failed as well (or is not installed): give up.
  unless $?.success?
    $stderr.write("Unsupported format\n")
    usage
    File.delete("antiword_temp.zip")
    Process.exit 1
  end
end
# Convert the raw word/document.xml into plain text, one paragraph per line.
if process_xml
  # Ordered list of [pattern, replacement] pairs; order matters because the
  # structural tags must be translated before all remaining tags are dropped.
  replacements = [
    # Remove line breaks. There are none in MS-Word's XML, but that could
    # change in the future, or the file could have been generated using
    # something else.
    [/\n|\r/, ''],
    # Add separators where column tags are, using a pipe, unless last in row
    [/<\/w:p><\/w:tc>(?!<\/w:tr>)/, " | "],
    # List elements, may add more soon
    [/<w:numPr>/, "-"],
    # Tabbed columns
    [/<w:tab[^\/]*\/>/, " "],
    # Substitute the end-of-paragraph tag with a newline.
    # Effectively, this puts each paragraph on one line.
    [/<\/w:p>/, "\r\n"],
    # Insert [pic] to replace graphics.
    [/<pic:pic[^>]*>/, '[pic]'],
    # Drop picture position offsets; their numeric content is not text.
    [/<wp:posOffset>\d+?<\/wp:posOffset>/, ""],
    # Remove all other tags
    [/<[^>]*>/, ""],
    # Decode the predefined XML entities. This has to happen after the tags
    # are gone so that a decoded "<" is not mistaken for markup, and "&amp;"
    # must come last so that text such as "&amp;lt;" ends up as "&lt;" and
    # is not decoded twice.
    [/&lt;/, '<'],
    [/&gt;/, '>'],
    [/&quot;/, '"'],
    [/&apos;/, "'"],
    [/&amp;/, "&"]
  ]
  replacements.each { |pattern, substitute|
    document.gsub!(pattern, substitute)
  }
  # Some UTF-8 characters don't print.
  # This translates from UTF-8 to ASCII.
  begin
    require "iconv"
    document = Iconv.conv("ascii//translit", "UTF-8", document)
  rescue LoadError
    # iconv is not available on this Ruby: fall back to String#encode,
    # which cannot transliterate, so unconvertible characters become "?".
    document = document.encode("US-ASCII", "UTF-8",
                               :invalid => :replace, :undef => :replace, :replace => "?")
  end
end
# Write the wrapped document to standard output, one wrapped line per
# output line.
begin
  document.each_wrapped_line { |wrapped| $stdout.write("#{wrapped}\n") }
rescue Errno::EPIPE
  # The reader closed the pipe early; there is nothing left to write.
end
# Remove the temporary copy, but only if this run actually used it.
File.delete("antiword_temp.zip") if filename == "antiword_temp.zip"