# lib/mail_handler/backends/mail_backend.rb
require 'mail'
require 'mapi/msg'
require 'mapi/convert'
require 'config_helper'
require 'alaveteli_file_types'
require 'normalize_string'
module Mail
  class Message
    # TMail compatibility shim for the address getters.
    #
    # Mail and TMail disagree about what 'to', 'cc' and 'bcc' return for
    # an unparseable header such as '<foo@example.org': Mail hands back
    # the raw string (as an ActiveSupport::Multibyte::Chars), TMail
    # returned nil. The originals are kept under old_* names and the
    # public methods are redefined to restore the nil behaviour.
    alias old_to to
    alias old_cc cc
    alias old_bcc bcc

    # Calls the aliased original and passes its result through only when
    # it is exactly a Mail::AddressContainer; anything else becomes nil.
    def clean_addresses(old_method, val)
      result = send(old_method, val)
      result if result.class == Mail::AddressContainer
    end

    def to(val = nil)
      clean_addresses(:old_to, val)
    end

    def cc(val = nil)
      clean_addresses(:old_cc, val)
    end

    def bcc(val = nil)
      clean_addresses(:old_bcc, val)
    end
  end
end
module MailHandler
module Backends
module MailBackend
# Mix in the shared configuration helpers.
# NOTE(review): send_exception_notifications?, used when decoding Outlook
# attachments, presumably comes from here - confirm.
include ConfigHelper
# Raised by attachment_body_for_hexdigest when no attachment of the mail
# has the requested hexdigest.
MismatchedAttachmentHexdigest = Class.new(StandardError)
# Name of the mail library this backend is built on.
#
# @return [String] always 'Mail'
def backend
  'Mail'
end
# Build a Mail object from raw email data.
#
# String input is treated as raw bytes: it is re-tagged as binary and its
# line endings are converted to CRLF before being handed to Mail.new.
# Anything that is not a String is converted with #to_s first.
#
# The caller's string is never modified: String#b returns a binary copy,
# so the argument keeps its original encoding and frozen strings are
# accepted (force_encoding on the argument itself would raise on them).
#
# @param data [String, #to_s] the raw email source
# @return [Mail::Message] whatever Mail.new builds from the CRLF source
def mail_from_raw_email(data)
  raw = data.is_a?(String) ? data.b : data.to_s
  Mail.new(Mail::Utilities.binary_unsafe_to_crlf(raw))
end
# Wrap every attachment found in a TNEF file in a fresh Mail object.
#
# Each item returned by tnef_attachments(content) is added to a new mail
# with add_file; the mail is then marked ready_to_send! and returned.
def mail_from_tnef(content)
  container = Mail.new
  tnef_attachments(content).each { |file| container.add_file(file) }
  container.ready_to_send!
  container
end
# Convert the bytes of an Outlook message into a Mail object.
#
# The content is opened with Mapi::Msg, rendered to MIME text, parsed via
# mail_from_raw_email, marked ready_to_send! and returned.
def mail_from_outlook(content)
  mime_source = Mapi::Msg.open(StringIO.new(content)).to_mime.to_s
  mail_from_raw_email(mime_source).tap(&:ready_to_send!)
end
# The mail's subject converted with convert_string_to_utf8, or nil when
# the mail has no subject.
def get_subject(mail)
  raw_subject = mail.subject
  return unless raw_subject

  convert_string_to_utf8(raw_subject).string
end
# Subject of the attached message that contains this leaf, converted with
# convert_string_to_utf8; nil when that message has no subject.
def get_within_rfc822_subject(leaf)
  subject = leaf.within_rfc822_attachment.subject
  convert_string_to_utf8(subject).string if subject
end
# Return the file name of a mail part as a new string: the part's
# filename is duplicated, CGI-unescaped and converted with
# convert_string_to_utf8 using the part's charset. Returns nil when the
# part has no filename.
def get_part_file_name(part)
  name = part.filename&.dup
  return name unless name

  unescaped = CGI.unescape(name)
  convert_string_to_utf8(unescaped, part.charset).string
end
# Return the decoded body of a mail part.
#
# A frozen decoded body is duplicated first. For parts whose content type
# starts with 'text/', the body is passed through
# convert_string_to_utf8_or_binary with the part's charset; every other
# part is returned as decoded.
def get_part_body(part)
  content = part.body.decoded
  content = content.dup if content.frozen?
  return content unless part.content_type =~ /^text\//

  convert_string_to_utf8_or_binary(content, part.charset)
end
# Return the first address of the From field, if the mail has one.
#
# The field's addresses and decoded value are read once purely to surface
# any error before the first address is returned; if any of those calls
# raises, the raw field value (a string) is returned instead. Returns nil
# when the mail has no From field.
def first_from(mail)
  return unless mail[:from]

  begin
    mail[:from].addrs.first
    mail[:from].decoded
    mail[:from].addrs.first
  rescue
    mail[:from].value
  end
end
# Return the email address of the first From entry, or nil when there is
# no From field or first_from could only supply the raw field string.
def get_from_address(mail)
  sender = first_from(mail)
  return if !sender || sender.is_a?(String)

  sender.address
end
# Return the display name of the first From entry, or nil when there is
# no From field, no display name, or first_from could only supply the raw
# field string.
def get_from_name(mail)
  sender = first_from(mail)
  return if !sender || sender.is_a?(String)

  sender.display_name || nil
end
# Collect every recipient address found on the mail: To, Cc and Bcc, the
# 'envelope-to' header, and any emails found in the Received headers.
#
# When include_invalid is true and one of To/Cc/Bcc yields nil, the raw
# value of that header field is included in its place.
#
# @return [Array] flattened, with nils and duplicates removed
def get_all_addresses(mail, include_invalid: false)
  collected = []

  %i[to cc bcc].each do |field|
    parsed = mail.public_send(field)
    collected << parsed
    collected << mail[field].try(:value) if parsed.nil? && include_invalid
  end

  envelope_to = mail['envelope-to']
  collected << (envelope_to ? envelope_to.value.to_s : nil)
  collected << get_emails_within_received_headers(mail)

  collected.flatten.compact.uniq
end
# True only when the mail has a 'return-path' header whose value is
# blank; false when the header is missing or has content.
def empty_return_path?(mail)
  return_path = mail['return-path']
  return false if return_path.nil?

  return_path.value.blank? ? true : false
end
# Value of the 'auto-submitted' header, or nil when the header is absent.
def get_auto_submitted(mail)
  field = mail['auto-submitted']
  field.value if field
end
# The part's content type with everything from the first ';' onwards
# removed, or nil when the part has no content type.
def get_content_type(part)
  full_type = part.content_type
  return unless full_type

  full_type.split(';').first
end
# String form of the named header on the mail, or nil when the mail's
# header collection has no such entry.
def get_header_string(header, mail)
  field = mail.header[header]
  field.to_s if field
end
# Decide whether a mail part is an Outlook message: either its content
# type is 'application/vnd.ms-outlook', or it has a file name that
# AlaveteliFileTypes maps to that mime type.
def is_outlook?(part)
  filename = get_part_file_name(part)
  return true if get_content_type(part) == 'application/vnd.ms-outlook'
  return false unless filename

  AlaveteliFileTypes.filename_to_mimetype(filename) == 'application/vnd.ms-outlook'
end
# Convert a mail part which is an attached mail in one of
# several formats into a mail object and set it as the
# rfc822_attachment on the part. If the mail part can't be
# converted, the content type on the part is updated to
# 'text/plain' for an RFC822 attachment, and 'application/octet-stream'
# for other types.
#
# Three kinds of part are handled, checked in this order: content type
# 'message/rfc822', anything is_outlook? accepts, and content type
# 'application/ms-tnef'. Any other part is left untouched. The part is
# modified in place; a successfully decoded attachment is then expanded
# with expand_and_normalize_parts, so nested attachments are decoded
# too.
#
# part        - the mail part to inspect and possibly decode
# parent_mail - the top-level mail; passed through to the recursive
#               expansion and included (via inspect) in the exception
#               notification data
def decode_attached_part(part, parent_mail)
if get_content_type(part) == 'message/rfc822'
# An email attached as text
# Note that the body object itself, not its decoded string, is passed.
part.rfc822_attachment = mail_from_raw_email(part.body)
if part.rfc822_attachment.nil?
# Attached mail didn't parse, so treat as text
part.content_type = 'text/plain'
end
elsif is_outlook?(part)
begin
part.rfc822_attachment = mail_from_outlook(part.body.decoded)
# Only encoding incompatibilities are handled here; any other error
# raised while parsing the Outlook message propagates to the caller.
rescue Encoding::CompatibilityError => e
if send_exception_notifications?
data = { message: 'Exception while parsing outlook attachment.',
parent_mail: parent_mail.inspect }
ExceptionNotifier.notify_exception(e, data: data)
end
part.rfc822_attachment = nil
end
if part.rfc822_attachment.nil?
# Attached mail didn't parse, so treat as binary
part.content_type = 'application/octet-stream'
end
elsif get_content_type(part) == 'application/ms-tnef'
# A set of attachments in a TNEF file
begin
part.rfc822_attachment = mail_from_tnef(part.body.decoded)
if part.rfc822_attachment.nil?
# Attached mail didn't parse, so treat as binary
part.content_type = 'application/octet-stream'
end
# NOTE(review): TNEFParsingError is not defined in this file; presumably
# it is raised by tnef_attachments - confirm.
rescue TNEFParsingError
part.rfc822_attachment = nil
part.content_type = 'application/octet-stream'
end
end
# Recurse into whatever was decoded so that its own parts are normalised.
if part.rfc822_attachment
expand_and_normalize_parts(part.rfc822_attachment, parent_mail)
end
end
# Expand and normalize a mail part recursively. Decodes attached messages into
# Mail objects wherever possible. Sets a default content type if none is
# set. Tries to set a more specific content type for binary content types.
#
# The part is modified in place. For a non-multipart part the steps run
# in a fixed order: remember the charset, default a missing content type,
# refine 'application/octet-stream' from the file name and body,
# normalise the content type, decode any attached message, and finally
# restore the remembered charset.
#
# part        - the mail (or mail part) to process
# parent_mail - the top-level mail, passed through to decode_attached_part
def expand_and_normalize_parts(part, parent_mail)
if part.multipart?
# NOTE(review): the PartsList built here is not assigned to anything in
# this method; the visible effect is the recursion over each sub-part.
Mail::PartsList.new(part.parts.each { |sub_part| expand_and_normalize_parts(sub_part, parent_mail) })
else
part_filename = get_part_file_name(part)
if part.has_charset?
original_charset = part.charset # save this, because overwriting content_type also resets charset
else
original_charset = nil
end
# Don't allow nil content_types
if get_content_type(part).nil?
part.content_type = 'application/octet-stream'
end
# PDFs often come with this mime type, fix it up for view code
if get_content_type(part) == 'application/octet-stream'
part_body = get_part_body(part)
calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(part_filename,
part_body)
# Keep the generic type when no better guess is available.
part.content_type = calc_mime if calc_mime
end
# Use standard content types for Word documents etc.
part.content_type = normalise_content_type(get_content_type(part))
decode_attached_part(part, parent_mail)
# Put back the charset saved above, after all content_type assignments.
part.charset = original_charset if original_charset
end
end
# Number the leaf parts of a mail recursively, descending into multipart
# containers and into decoded attached messages.
#
# Each leaf bumps parent_mail.count_parts_count and receives the new
# total as its url_part_number. After every call (at every level)
# parent_mail.count_first_uudecode_count is set to the running total.
def count_parts(part, parent_mail)
  if part.multipart?
    part.parts.each { |child| count_parts(child, parent_mail) }
  elsif (attached_mail = part.rfc822_attachment)
    count_parts(attached_mail, parent_mail)
  else
    next_number = parent_mail.count_parts_count + 1
    parent_mail.count_parts_count = next_number
    part.url_part_number = next_number
  end

  parent_mail.count_first_uudecode_count = parent_mail.count_parts_count
end
# Pick the preferred part out of a set of alternatives: the first
# multipart sub-part if there is one, otherwise the HTML part, otherwise
# the text part, otherwise simply the first part.
def choose_best_alternative(mail)
  nested = mail.parts.detect(&:multipart?)
  return nested if nested

  mail.html_part || mail.text_part || mail.parts.first
end
# Return the leaf parts of a mail. The mail is first expanded and
# normalised, then the leaves are gathered (choosing the best part
# wherever there is an alternative), and finally the parts are counted so
# each leaf carries a url_part_number.
#
# TODO: Most of these methods are modifying in place! :(
def get_attachment_leaves(mail)
  expand_and_normalize_parts(mail, mail)

  _get_attachment_leaves_recursive(mail, nil, mail).tap do
    # Counting restarts from zero every time the leaves are requested.
    mail.count_parts_count = 0
    count_parts(mail, mail)
  end
end
# Walk a mail part and return its leaves as an array, selecting the best
# part wherever there is an alternative.
#
# part                     - the part being visited
# within_rfc822_attachment - the decoded attached message the part lives
#                            in, or nil at the top level; recorded on each
#                            ordinary leaf
# parent_mail              - the top-level mail, passed through unchanged
def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
  unless part.multipart?
    attached_mail = part.rfc822_attachment
    if attached_mail
      # Descend into a decoded attached message; its leaves are marked as
      # living inside that message.
      return _get_attachment_leaves_recursive(attached_mail, attached_mail, parent_mail)
    end

    # An ordinary leaf: remember which attached message (if any) holds it.
    part.within_rfc822_attachment = within_rfc822_attachment
    return [part]
  end

  if part.parts.empty?
    # This is typically caused by a missing final MIME boundary, in which
    # case the text of the message (including the opening MIME boundary)
    # is in part.body, so the part itself becomes a text/plain leaf.
    part.content_type = "text/plain"
    [part]
  elsif part.sub_type == 'alternative'
    _get_attachment_leaves_recursive(choose_best_alternative(part),
                                     within_rfc822_attachment,
                                     parent_mail)
  else
    # Any other multipart: collect the leaves of every sub-part in order.
    part.parts.flat_map do |sub_part|
      _get_attachment_leaves_recursive(sub_part, within_rfc822_attachment, parent_mail)
    end
  end
end
# Return the body of a leaf, prefixed with selected headers of the
# attached message when the leaf is that message's own first text part.
#
# Headers are only added when leaf.within_rfc822_attachment == leaf and
# the leaf is 'text/plain'; the Date, Subject, From, To and Cc headers
# that are present and not blank are written one per line, followed by an
# empty line and the body. Otherwise the body is returned as is.
def extract_attached_message_headers(leaf)
  body = get_part_body(leaf)
  in_first_text_part = leaf.within_rfc822_attachment == leaf &&
                       get_content_type(leaf) == 'text/plain'
  return body unless in_first_text_part

  header_text = %w[Date Subject From To Cc].inject("") do |text, name|
    value = get_header_string(name, leaf.within_rfc822_attachment)
    next text if !value || value.blank?

    text + name + ": " + value.to_s + "\n"
  end

  # TODO: call _convert_part_body_to_text here, but need to get charset somehow
  # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
  header_text + "\n" + body
end
# Build one attributes hash per attachment leaf of a Mail object.
#
# Each hash carries :url_part_number, :content_type, :filename, :charset,
# :within_rfc822_subject, :original_body, :body and :hexdigest. For
# leaves inside an attached message, :body is the result of
# extract_attached_message_headers and :within_rfc822_subject is that
# message's subject; :hexdigest is always the MD5 of :body.
def get_attachment_attributes(mail)
  get_attachment_leaves(mail).map do |leaf|
    original_body = get_part_body(leaf)
    body = original_body
    within_rfc822_subject = nil

    if leaf.within_rfc822_attachment
      within_rfc822_subject = get_within_rfc822_subject(leaf)
      body = extract_attached_message_headers(leaf)
    end

    {
      url_part_number: leaf.url_part_number,
      content_type: get_content_type(leaf),
      filename: get_part_file_name(leaf),
      charset: leaf.charset,
      within_rfc822_subject: within_rfc822_subject,
      original_body: original_body,
      body: body,
      hexdigest: Digest::MD5.hexdigest(body)
    }
  end
end
# Return the body of the attachment whose :hexdigest equals the given
# one.
#
# Raises MismatchedAttachmentHexdigest when none of the mail's
# attachments has that hexdigest.
def attachment_body_for_hexdigest(mail, hexdigest:)
  match = get_attachment_attributes(mail).find do |attrs|
    attrs[:hexdigest] == hexdigest
  end

  unless match
    raise MismatchedAttachmentHexdigest,
          "can't find attachment matching hexdigest: #{hexdigest}"
  end

  match.fetch(:body)
end
# MD5 of a body after normalisation, so that bodies differing only in
# trailing whitespace, line endings or encoding compare equal: the body
# is right-stripped, converted with convert_string_to_utf8, and its line
# endings are converted to LF before hashing.
#
# This used to be defined inside the method below, which re-defined it on
# every call and made it available only after the first call.
def calculate_hexdigest(body)
  Digest::MD5.hexdigest(
    Mail::Utilities.binary_unsafe_to_lf(
      convert_string_to_utf8(body.rstrip).string
    )
  )
end

# Try to find the attributes of the attachment that a given body came
# from.
#
# The lookup is attempted in three stages, returning the first match:
# 1. an attachment whose :body or :original_body has the same normalised
#    hexdigest as the given body;
# 2. the same lookup, using the given body parsed as a mail and reduced
#    to its body text (skipped when that text is empty);
# 3. an attachment uudecoded from the body of any 'text/plain' part.
#
# mail   - the mail whose attachments are searched
# body   - the attachment body to look for
# nested - internal flag used for stage 2; when true only stage 1 runs
#
# Returns the matching attributes hash, or nil when nothing matches.
def attempt_to_find_original_attachment_attributes(mail, body:, nested: false)
  all_attributes = get_attachment_attributes(mail)
  hexdigest = calculate_hexdigest(body)

  attributes = all_attributes.find do |attrs|
    [attrs[:body], attrs[:original_body]].any? do |candidate|
      calculate_hexdigest(candidate) == hexdigest
    end
  end
  return attributes if nested

  mail_body = Mail.new(body).body.to_s
  attributes ||= attempt_to_find_original_attachment_attributes(
    mail, body: mail_body, nested: true
  ) unless mail_body.empty?
  return attributes if attributes

  # check uuencoded attachments which can be located in plain text
  all_attributes.
    select { |attrs| attrs[:content_type] == 'text/plain' }.
    flat_map { |attrs| uudecode(attrs[:body], attrs[:url_part_number]) }.
    find { |attrs| calculate_hexdigest(attrs[:body]) == hexdigest }
end
# Find uuencoded files embedded in a piece of text and decode them.
#
# text              - the text to search
# start_part_number - number of the part the text came from; the decoded
#                     files are numbered start_part_number + 1, + 2, ...
#
# Returns an array with one hash per embedded file, carrying :body,
# :filename, :content_type, :hexdigest (MD5 of the decoded body) and
# :url_part_number. Returns an empty array when nothing is found.
def uudecode(text, start_part_number)
  # Each block runs from a "begin" line to the "`" / "end" terminator
  # lines. The match is non-greedy so that every block in the text is
  # found separately; a greedy match would swallow everything from the
  # first "begin" to the last "end" and lose all but the first file.
  uus = text.scan(/^begin.+?^`\n^end\n/m)

  uus.map.with_index do |uu, index|
    # Drop the "begin <mode> <name>" line and decode the rest
    body = uu.sub(/\Abegin \d+ [^\n]*\n/, '').unpack('u').first

    # Make attachment type from it, working out filename and mime type
    filename = uu.match(/^begin\s+[0-9]+\s+(.*)$/)[1]
    mime_type = AlaveteliFileTypes.filename_and_content_to_mimetype(
      filename, body
    )
    content_type =
      if mime_type
        MailHandler.normalise_content_type(mime_type)
      else
        'application/octet-stream'
      end

    {
      body: body,
      filename: filename,
      content_type: content_type,
      hexdigest: Digest::MD5.hexdigest(body),
      url_part_number: start_part_number + index + 1
    }
  end
end
# Format a display name and an email address as a single address string.
#
# name  - the display name, or nil to format the bare email address
# email - the email address; must pass MySociety::Validate.is_valid_email
#
# Returns the formatted address string produced by Mail::Address.
# Raises RuntimeError when the email is not valid. The message is built
# by interpolation so that a non-String email (for example nil) produces
# this error rather than a TypeError from string concatenation.
def address_from_name_and_email(name, email)
  unless MySociety::Validate.is_valid_email(email)
    raise "invalid email #{email} passed to address_from_name_and_email"
  end
  # Duplicates are handed to Mail::Address, not the caller's own strings.
  return Mail::Address.new(email.dup).to_s if name.nil?

  address = Mail::Address.new
  address.display_name = name.dup
  address.address = email.dup
  address.to_s
end
# Parse a string as a From header on a throwaway Mail object and return
# the first entry of the resulting from list.
def address_from_string(string)
  Mail.new.tap { |message| message.from = string }.from[0]
end
# Return every email address found in the mail's Received headers, using
# MySociety::Validate.email_find_regexp. Returns an empty array when the
# mail has no Received headers.
def get_emails_within_received_headers(email)
  received_headers = Array(email['Received'])
  return [] if received_headers.empty?

  combined = received_headers.map(&:to_s).join(' ')
  combined.scan(MySociety::Validate.email_find_regexp).flatten
end
end
end
end