lib/mail_handler.rb
# Handles the parsing of email
require 'English'
require 'tmpdir'
module MailHandler
require 'mail'
require 'mail_handler/backends/mail_extensions'
require 'mail_handler/backends/mail_backend'
include Backends::MailBackend
# Raised when the external tnef tool exits with a non-zero status or
# extracts no files from the supplied content.
class TNEFParsingError < StandardError; end
# Returns a set of attachments from the given TNEF contents.
#
# The TNEF contents also contain the message body, but in general this is the
# same as the message body in the message proper.
#
# content - the raw TNEF data; it is written to the stdin of the external
#           `tnef` command, which is pointed at a fresh temporary directory.
#
# Returns an Array of Hashes, one per file found in that directory (in sorted
# filename order), each with :content (the file's bytes, read in binary mode)
# and :filename.
#
# Raises IOError if tnef was terminated by a signal.
# Raises TNEFParsingError if tnef exited with a non-zero status or if no
# files were produced.
def tnef_attachments(content)
  attachments = []
  Dir.mktmpdir do |dir|
    IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
      f.write(content)
      # Closing the pipe waits for the child and sets $CHILD_STATUS.
      f.close
      raise IOError, "tnef exited with signal #{$CHILD_STATUS.termsig}" if $CHILD_STATUS.signaled?
      if $CHILD_STATUS.exited? && $CHILD_STATUS.exitstatus != 0
        raise TNEFParsingError, "tnef exited with status #{$CHILD_STATUS.exitstatus}"
      end
    end

    # Dir.entries and File.binread close their handles before returning, so
    # no directory or file descriptors are left open for the GC to reclaim.
    extracted = Dir.entries(dir).reject { |name| name == "." || name == ".." }
    extracted.sort.each do |name| # sort for deterministic behaviour
      attachments << { content: File.binread(File.join(dir, name)),
                       filename: name }
    end
    raise TNEFParsingError, "tnef produced no attachments" if attachments.empty?
  end
  attachments
end
# Maps known non-standard aliases of a content type onto the single name used
# for that format by the text extraction code in this module.
#
# content_type - the content type string to normalise.
#
# Returns the replacement content type for a recognised alias; any other
# value is returned unchanged.
def normalise_content_type(content_type)
  case content_type
  when 'application/excel', 'application/msexcel', 'application/x-ms-excel'
    # e.g. http://www.whatdotheyknow.com/request/93/response/250
    'application/vnd.ms-excel'
  when 'application/mspowerpoint', 'application/x-ms-powerpoint'
    'application/vnd.ms-powerpoint'
  when 'application/msword', 'application/x-ms-word'
    'application/vnd.ms-word'
  when 'application/x-zip-compressed'
    'application/zip'
  when 'application/acrobat', 'document/pdf'
    # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
    'application/pdf'
  else
    content_type
  end
end
# Extracts plain text from the content of a single attachment.
#
# text/plain bodies are returned directly (with a trailing blank line). For
# every other type the body is written to a temporary file and, where the
# type is recognised, handed to an external converter. Unrecognised types
# produce an empty string.
#
# content_type - content type string selecting the extraction strategy.
# body         - the attachment content.
# charset      - charset elinks is told to assume when dumping text/html
#                (default: 'utf-8'); not used by the other branches.
#
# Returns a String containing the extracted text.
def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
  # NOTE: re. charset: TMail always tries to convert email bodies
  # to UTF8 by default, so normally it should already be that.
  text = ''
  # TODO: - tell all these command line tools to return utf-8
  if content_type == 'text/plain'
    text += body + "\n\n"
  else
    tempfile = Tempfile.new('foiextract')
    begin
      tempfile.binmode
      tempfile.print body
      tempfile.flush
      # NOTE(review): presumably AlaveteliExternalCommand.run appends the
      # command output to the :append_to string in place - confirm there.
      default_params = { append_to: text,
                         binary_output: false,
                         timeout: 1200 }
      if content_type == 'application/vnd.ms-word'
        AlaveteliExternalCommand.run("wvText", tempfile.path, tempfile.path + ".txt",
                                     { memory_limit: 536_870_912, timeout: 120 } )
        # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
        if !File.exist?(tempfile.path + ".txt")
          AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
        else
          text += File.read(tempfile.path + ".txt") + "\n\n"
          File.unlink(tempfile.path + ".txt")
        end
      elsif content_type == 'application/rtf'
        # catdoc on RTF produces less comments and extra bumf than --text option to unrtf
        AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
      elsif content_type == 'text/html'
        # lynx wordwraps links in its output, which then don't
        # get formatted properly by Alaveteli. We use elinks
        # instead, which doesn't do that.
        # NOTE(review): charset is interpolated unescaped into the elinks
        # -eval expression; confirm callers only pass validated charset names.
        AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
                                     "-eval", "set document.codepage.force_assumed = 1",
                                     "-dump-charset", "utf-8",
                                     "-force-html", "-dump",
                                     tempfile.path,
                                     default_params.merge(env: { "LANG" => "C" }))
      elsif content_type == 'application/vnd.ms-excel'
        # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
        # py_xls2txt only extract text from cells, not from floating
        # notes. catdoc may be fooled by weird character sets, but will
        # probably do for UK FOI requests.
        AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
      elsif content_type == 'application/vnd.ms-powerpoint'
        # ppthtml seems to catch more text, but only outputs HTML when
        # we want text, so just use catppt for now
        AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
      elsif content_type == 'application/pdf'
        AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
      elsif content_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        # This is Microsoft's XML office document format.
        # Just pull out the main XML file, and strip it of text.
        xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
                                           "-c",
                                           tempfile.path,
                                           "word/document.xml",
                                           { binary_output: false })
        unless xml.nil?
          doc = REXML::Document.new(xml)
          text += doc.each_element( './/text()' ) {}.join(" ")
        end
      elsif content_type == 'application/zip'
        # recurse into zip files
        begin
          # The block form closes the archive even if extraction raises.
          Zip::File.open(tempfile.path) do |zip_file|
            text += get_attachment_text_from_zip_file(zip_file)
          end
        rescue => e
          # Best effort: a broken archive is reported but does not abort
          # extraction for the caller.
          $stderr.puts("Error processing zip file: #{e.inspect}")
        end
      end
    ensure
      # Close and delete the temporary file on every exit path, including
      # when a converter raises, rather than leaving it for the GC finalizer.
      tempfile.close!
    end
  end
  text
end
# Builds one string from the text of every file entry in an opened zip
# archive, by passing each entry to get_attachment_text_one_file.
#
# zip_file - an enumerable of archive entries; each entry must respond to
#            file?, to_s (used as the filename) and get_input_stream.
#
# Returns the concatenated text of all readable file entries.
def get_attachment_text_from_zip_file(zip_file)
  combined = ""
  zip_file.each do |entry|
    next unless entry.file?

    name = entry.to_s
    begin
      data = entry.get_input_stream.read
    rescue
      # move to next attachment silently if there were problems
      # TODO: really should reduce this to specific exceptions?
      # e.g. password protected
      next
    end

    # Fall back to a generic binary type when the filename is not recognised.
    mime = AlaveteliFileTypes.filename_to_mimetype(name) || 'application/octet-stream'

    plain_binary = mime == 'text/plain' && data.encoding.to_s == 'ASCII-8BIT'
    data = convert_string_to_utf8(data, 'ASCII-8BIT').string if plain_binary

    combined += get_attachment_text_one_file(mime, data)
  end
  combined
end
# Turn instance methods into class methods
extend self
end