mysociety/alaveteli

View on GitHub
lib/mail_handler/backends/mail_backend.rb

Summary

Maintainability
D
2 days
Test Coverage
require 'mail'
require 'mapi/msg'
require 'mapi/convert'
require 'config_helper'
require 'alaveteli_file_types'
require 'normalize_string'

module Mail
  class Message
    # Mail and TMail disagree about what the address accessors return
    # for a malformed header such as '<foo@example.org': Mail hands
    # back the string (as an ActiveSupport::Multibyte::Chars), whereas
    # TMail returned nil. The wrappers generated below restore the
    # TMail behaviour for 'to', 'cc' and 'bcc'; the unpatched accessors
    # stay reachable as old_to, old_cc and old_bcc.

    # Calls the original accessor named by old_method with val and
    # returns its result only when that result is exactly a
    # Mail::AddressContainer; anything else becomes nil.
    def clean_addresses(old_method, val)
      addresses = send(old_method, val)
      addresses if addresses.instance_of?(Mail::AddressContainer)
    end

    %i[to cc bcc].each do |field|
      original = :"old_#{field}"
      alias_method original, field

      define_method(field) do |val = nil|
        clean_addresses(original, val)
      end
    end
  end
end

module MailHandler
  module Backends
    module MailBackend
      # NOTE(review): presumably the source of send_exception_notifications?,
      # which decode_attached_part calls - confirm against config_helper.
      include ConfigHelper

      # Raised by attachment_body_for_hexdigest when none of the mail's
      # attachments has the requested hexdigest.
      MismatchedAttachmentHexdigest = Class.new(StandardError)

      # Returns the name of this backend as a string.
      def backend
        'Mail'
      end

      # Builds a Mail object from the raw text of an email.
      #
      # data - the raw message: a String, or any object whose to_s yields
      #        the message text (decode_attached_part passes a part body).
      #
      # A String is re-tagged as binary on a copy (String#b) before its
      # line endings are converted to CRLF, so the caller's string keeps
      # its encoding and frozen strings are accepted. Other objects are
      # converted with to_s and passed on unchanged.
      def mail_from_raw_email(data)
        raw = data.is_a?(String) ? data.b : data.to_s
        Mail.new(Mail::Utilities.binary_unsafe_to_crlf(raw))
      end

      # Wraps every attachment found in the given TNEF content in a new
      # Mail object, marks that object ready to send and returns it.
      def mail_from_tnef(content)
        Mail.new.tap do |container|
          tnef_attachments(content).each { |file| container.add_file(file) }
          container.ready_to_send!
        end
      end

      # Converts the content of an Outlook message into a Mail object by
      # way of its MIME rendering, and marks the result ready to send.
      def mail_from_outlook(content)
        mime_source = Mapi::Msg.open(StringIO.new(content)).to_mime.to_s
        mail_from_raw_email(mime_source).tap(&:ready_to_send!)
      end

      # Returns the subject of the mail converted to UTF-8, or nil when
      # the mail has no subject.
      def get_subject(mail)
        raw_subject = mail.subject
        return unless raw_subject

        convert_string_to_utf8(raw_subject).string
      end

      # Returns the subject of the attached message that contains the
      # given leaf, converted to UTF-8, or nil when that message has no
      # subject.
      def get_within_rfc822_subject(leaf)
        attached_subject = leaf.within_rfc822_attachment.subject
        convert_string_to_utf8(attached_subject).string if attached_subject
      end

      # Returns the file name of the mail part as a new string that has
      # been URL-unescaped and converted to UTF-8 using the part's
      # charset. Returns nil when the part has no file name.
      def get_part_file_name(part)
        name_copy = part.filename&.dup
        return name_copy unless name_copy

        unescaped = CGI.unescape(name_copy)
        convert_string_to_utf8(unescaped, part.charset).string
      end

      # Returns the decoded body of a mail part. A frozen body is
      # duplicated first. Bodies of parts whose content type starts with
      # 'text/' are additionally passed, with the part's charset, through
      # convert_string_to_utf8_or_binary.
      def get_part_body(part)
        content = part.body.decoded
        content = content.dup if content.frozen?
        return content unless part.content_type =~ %r{^text/}

        convert_string_to_utf8_or_binary(content, part.charset)
      end

      # Returns the first address of the From header, or nil when the
      # mail has no From header. If reading the addresses or the decoded
      # field raises, the raw value of the header is returned instead.
      def first_from(mail)
        from_field = mail[:from]
        return unless from_field

        begin
          # The results of the first two calls are discarded.
          # NOTE(review): presumably they force the field to be parsed
          # so that any failure surfaces inside this rescue - confirm.
          from_field.addrs[0]
          from_field.decoded
          from_field.addrs[0]
        rescue
          from_field.value
        end
      end

      # Returns the address of the first From entry, or nil when there
      # is no From entry or first_from could only supply a raw string.
      def get_from_address(mail)
        sender = first_from(mail)
        return nil if !sender || sender.is_a?(String)

        sender.address
      end

      # Returns the display name of the first From entry, or nil when
      # there is no From entry, it has no display name, or first_from
      # could only supply a raw string.
      def get_from_name(mail)
        sender = first_from(mail)
        return nil if !sender || sender.is_a?(String)

        sender.display_name || nil
      end

      # Collects the addresses associated with a mail: the To, Cc and Bcc
      # recipients, the value of the envelope-to header, and any email
      # addresses found in the Received headers.
      #
      # include_invalid - when true, the raw header value is included for
      #                   each of To/Cc/Bcc whose accessor returned nil.
      #
      # Returns a flat array without nils or duplicates.
      def get_all_addresses(mail, include_invalid: false)
        addrs = []

        %i[to cc bcc].each do |field|
          parsed = mail.public_send(field)
          addrs << parsed
          addrs << mail[field].try(:value) if parsed.nil? && include_invalid
        end

        envelope_to = mail['envelope-to']
        addrs << envelope_to.value.to_s if envelope_to
        addrs << get_emails_within_received_headers(mail)

        addrs.flatten.compact.uniq
      end

      # Returns true when the mail has a return-path header whose value
      # is blank, and false when the header is absent or has a value.
      def empty_return_path?(mail)
        return_path = mail['return-path']
        return false if return_path.nil?

        return_path.value.blank? ? true : false
      end

      # Returns the value of the auto-submitted header, or nil when the
      # mail does not have one.
      def get_auto_submitted(mail)
        field = mail['auto-submitted']
        return unless field

        field.value
      end

      # Returns the part's content type up to (not including) the first
      # ';', i.e. without any parameters. Returns nil when the part has
      # no content type.
      def get_content_type(part)
        full_type = part.content_type
        return unless full_type

        full_type.split(';').first
      end

      # Returns the named header of the mail as a string, or nil when
      # the mail has no such header.
      def get_header_string(header, mail)
        field = mail.header[header]
        return unless field

        field.to_s
      end

      # Returns true when the mail part is an Outlook email, judged by
      # its content type or, failing that, by the mimetype its file name
      # maps to; false otherwise.
      def is_outlook?(part)
        outlook_type = 'application/vnd.ms-outlook'
        filename = get_part_file_name(part)
        return true if get_content_type(part) == outlook_type

        !!(filename && AlaveteliFileTypes.filename_to_mimetype(filename) == outlook_type)
      end

      # Turns a mail part that is itself an attached mail (RFC822,
      # Outlook or TNEF) into a mail object stored as the part's
      # rfc822_attachment, and then expands that mail's own parts.
      #
      # When the attached mail cannot be converted, the part's content
      # type is changed instead: to 'text/plain' for an RFC822
      # attachment and to 'application/octet-stream' for the other
      # formats. An Encoding::CompatibilityError raised while converting
      # an Outlook attachment is reported through ExceptionNotifier when
      # send_exception_notifications? is true.
      def decode_attached_part(part, parent_mail)
        mime_type = get_content_type(part)

        if mime_type == 'message/rfc822'
          # An email attached as text
          part.rfc822_attachment = mail_from_raw_email(part.body)
          # Attached mail didn't parse, so treat as text
          part.content_type = 'text/plain' if part.rfc822_attachment.nil?
        elsif is_outlook?(part)
          begin
            part.rfc822_attachment = mail_from_outlook(part.body.decoded)
          rescue Encoding::CompatibilityError => e
            if send_exception_notifications?
              ExceptionNotifier.notify_exception(
                e,
                data: { message: 'Exception while parsing outlook attachment.',
                        parent_mail: parent_mail.inspect }
              )
            end

            part.rfc822_attachment = nil
          end

          # Attached mail didn't parse, so treat as binary
          part.content_type = 'application/octet-stream' if part.rfc822_attachment.nil?
        elsif mime_type == 'application/ms-tnef'
          # A set of attachments in a TNEF file
          part.rfc822_attachment =
            begin
              mail_from_tnef(part.body.decoded)
            rescue TNEFParsingError
              nil
            end

          # Attached mail didn't parse, so treat as binary
          part.content_type = 'application/octet-stream' if part.rfc822_attachment.nil?
        end

        attached_mail = part.rfc822_attachment
        expand_and_normalize_parts(attached_mail, parent_mail) if attached_mail
      end

      # Recursively expands and normalizes a mail part in place.
      #
      # A multipart part has each of its sub-parts processed in turn. Any
      # other part gets a default content type if it has none, a more
      # specific content type if it is 'application/octet-stream' and one
      # can be worked out from its file name and body, a normalised
      # content type, and finally has any attached message decoded into a
      # Mail object.
      def expand_and_normalize_parts(part, parent_mail)
        if part.multipart?
          expanded = part.parts.each do |sub_part|
            expand_and_normalize_parts(sub_part, parent_mail)
          end
          return Mail::PartsList.new(expanded)
        end

        part_filename = get_part_file_name(part)
        # Remember the charset, because overwriting content_type also
        # resets it
        original_charset = part.has_charset? ? part.charset : nil

        # Don't allow nil content_types
        part.content_type = 'application/octet-stream' if get_content_type(part).nil?

        # PDFs often come with this mime type, fix it up for view code
        if get_content_type(part) == 'application/octet-stream'
          calc_mime = AlaveteliFileTypes.filename_and_content_to_mimetype(
            part_filename, get_part_body(part)
          )
          part.content_type = calc_mime if calc_mime
        end

        # Use standard content types for Word documents etc.
        part.content_type = normalise_content_type(get_content_type(part))
        decode_attached_part(part, parent_mail)
        part.charset = original_charset if original_charset
      end

      # Recursively numbers the leaf parts of a mail part, descending
      # into multipart parts and attached messages. Every leaf increments
      # count_parts_count on the parent mail and takes the new count as
      # its url_part_number. At each level the current count is also
      # copied to the parent mail's count_first_uudecode_count.
      def count_parts(part, parent_mail)
        if part.multipart?
          part.parts.each { |child| count_parts(child, parent_mail) }
        elsif (attached_mail = part.rfc822_attachment)
          count_parts(attached_mail, parent_mail)
        else
          number = parent_mail.count_parts_count + 1
          parent_mail.count_parts_count = number
          part.url_part_number = number
        end

        parent_mail.count_first_uudecode_count = parent_mail.count_parts_count
      end

      # Picks one part from a set of alternatives. In order of
      # preference: the first multipart part, the HTML part, the text
      # part, and finally whatever part comes first.
      def choose_best_alternative(mail)
        nested_multipart = mail.parts.detect(&:multipart?)
        return nested_multipart if nested_multipart

        mail.html_part || mail.text_part || mail.parts.first
      end

      # Returns the leaf parts of a mail. The mail's parts are first
      # expanded and normalized, the leaves are then collected (taking
      # the best part wherever there are alternatives), and finally the
      # parts are counted from zero so the leaves get url_part_number
      # values.
      def get_attachment_leaves(mail)
        # TODO: Most of these methods are modifying in place! :(
        expand_and_normalize_parts(mail, mail)

        _get_attachment_leaves_recursive(mail, nil, mail).tap do
          mail.count_parts_count = 0
          count_parts(mail, mail)
        end
      end

      # Walks a mail part and returns its leaf parts as an array, taking
      # only the best part wherever there are alternatives.
      # within_rfc822_attachment is the decoded attached message being
      # walked (nil at the top level); it is recorded on each ordinary
      # leaf.
      def _get_attachment_leaves_recursive(part, within_rfc822_attachment, parent_mail)
        if part.multipart?
          if part.parts.empty?
            # This is typically caused by a missing final
            # MIME boundary, in which case the text of the
            # message (including the opening MIME
            # boundary) is in part.body, so just add this
            # part as a leaf and treat it as text/plain:
            part.content_type = "text/plain"
            [part]
          elsif part.sub_type == 'alternative'
            _get_attachment_leaves_recursive(choose_best_alternative(part),
                                             within_rfc822_attachment,
                                             parent_mail)
          else
            # Gather the leaves of every sub-part
            part.parts.flat_map do |sub_part|
              _get_attachment_leaves_recursive(sub_part,
                                               within_rfc822_attachment,
                                               parent_mail)
            end
          end
        elsif part.rfc822_attachment
          # Gather the leaves of a decoded attached message
          _get_attachment_leaves_recursive(part.rfc822_attachment,
                                           part.rfc822_attachment,
                                           parent_mail)
        else
          # An ordinary leaf: remember which attached message holds it
          part.within_rfc822_attachment = within_rfc822_attachment
          [part]
        end
      end

      # Returns the body of a leaf. When the leaf is the attached RFC822
      # message it belongs to and is 'text/plain', the non-blank Date,
      # Subject, From, To and Cc headers of that message are put in front
      # of the body, followed by an empty line.
      def extract_attached_message_headers(leaf)
        body = get_part_body(leaf)
        attached_mail = leaf.within_rfc822_attachment
        # Only the first, plain-text part of an attached message gets the
        # headers.
        return body unless attached_mail == leaf && get_content_type(leaf) == 'text/plain'

        headers = %w[Date Subject From To Cc].reduce("") do |text, header|
          header_value = get_header_string(header, attached_mail)
          next text if !header_value || header_value.blank?

          text + header + ": " + header_value.to_s + "\n"
        end

        # TODO: call _convert_part_body_to_text here, but need to get charset somehow
        # e.g. http://www.whatdotheyknow.com/request/1593/response/3088/attach/4/Freedom%20of%20Information%20request%20-%20car%20oval%20sticker:%20Article%2020,%20Convention%20on%20Road%20Traffic%201949.txt
        headers + "\n" + body
      end

      # Returns an array with one hash of attributes per attachment leaf
      # of the mail. For a leaf inside an attached message, :body carries
      # the body with the attached message's headers and
      # :within_rfc822_subject the attached message's subject;
      # :original_body is always the plain part body. :hexdigest is the
      # MD5 of :body.
      def get_attachment_attributes(mail)
        get_attachment_leaves(mail).map do |leaf|
          original_body = get_part_body(leaf)
          body = original_body
          within_rfc822_subject = nil

          if leaf.within_rfc822_attachment
            within_rfc822_subject = get_within_rfc822_subject(leaf)
            body = extract_attached_message_headers(leaf)
          end

          {
            url_part_number: leaf.url_part_number,
            content_type: get_content_type(leaf),
            filename: get_part_file_name(leaf),
            charset: leaf.charset,
            within_rfc822_subject: within_rfc822_subject,
            original_body: original_body,
            body: body,
            hexdigest: Digest::MD5.hexdigest(body)
          }
        end
      end

      # Returns the body of the first attachment of the mail whose
      # hexdigest equals the one given.
      #
      # Raises MismatchedAttachmentHexdigest when no attachment matches.
      def attachment_body_for_hexdigest(mail, hexdigest:)
        match = get_attachment_attributes(mail).find do |attrs|
          attrs[:hexdigest] == hexdigest
        end

        unless match
          raise MismatchedAttachmentHexdigest,
            "can't find attachment matching hexdigest: #{hexdigest}"
        end

        match.fetch(:body)
      end

      # Returns the MD5 hexdigest of body after normalising it, so that
      # bodies which differ only in trailing whitespace, encoding or line
      # endings get the same digest: trailing whitespace is stripped, the
      # text is converted to UTF-8 and line endings are converted to LF.
      #
      # This used to be defined inside
      # attempt_to_find_original_attachment_attributes, which redefined
      # it on every call; it is now an ordinary method.
      def calculate_hexdigest(body)
        Digest::MD5.hexdigest(
          Mail::Utilities.binary_unsafe_to_lf(
            convert_string_to_utf8(body.rstrip).string
          )
        )
      end

      # Tries to find the attachment attributes of the mail that the
      # given body originally came from. Returns the matching attributes
      # hash, or nil when nothing matches.
      #
      # mail   - the mail whose attachments are searched
      # body   - the body text to look for
      # nested - true for the internal retry described below; the search
      #          then stops after the direct comparison
      #
      # The search goes through three stages:
      # 1. compare the normalised digest of body with that of each
      #    attachment's :body and :original_body;
      # 2. parse body as a mail and repeat stage 1 with that mail's body
      #    (skipped when that body is empty);
      # 3. uudecode every text/plain attachment and compare body with
      #    the decoded attachments.
      def attempt_to_find_original_attachment_attributes(mail, body:, nested: false)
        all_attributes = get_attachment_attributes(mail)
        hexdigest = calculate_hexdigest(body)

        attributes = all_attributes.find do |attrs|
          [attrs[:body], attrs[:original_body]].any? do |candidate|
            calculate_hexdigest(candidate) == hexdigest
          end
        end

        # A direct match needs no further work, and the nested retry must
        # not recurse again.
        return attributes if attributes || nested

        mail_body = Mail.new(body).body.to_s
        unless mail_body.empty?
          attributes = attempt_to_find_original_attachment_attributes(
            mail, body: mail_body, nested: true
          )
          return attributes if attributes
        end

        # check uuencoded attachments which can be located in plain text
        all_attributes.
          select { |attrs| attrs[:content_type] == 'text/plain' }.
          flat_map { |attrs| uudecode(attrs[:body], attrs[:url_part_number]) }.
          find { |attrs| calculate_hexdigest(attrs[:body]) == hexdigest }
      end

      # Finds the uuencoded files embedded in text and returns one
      # attributes hash per file, with the keys :body, :filename,
      # :content_type, :hexdigest and :url_part_number.
      #
      # text              - the text to search
      # start_part_number - the files are numbered upwards from
      #                     start_part_number + 1, in order of appearance
      #
      # Returns an empty array when the text has no uuencoded files.
      def uudecode(text, start_part_number)
        # Find any uudecoded things buried in it, yeuchly. The match is
        # lazy so that each begin/end pair is picked up on its own; a
        # greedy match would run from the first 'begin' to the last
        # 'end' and swallow every file after the first.
        uus = text.scan(/^begin.+?^`\n^end\n/m)
        uus.map.with_index do |uu, index|
          # Decode the string
          body = uu.sub(/\Abegin \d+ [^\n]*\n/, '').unpack('u').first
          # Make attachment type from it, working out filename and mime
          # type. The filename is nil when the header line does not have
          # the 'begin <mode> <name>' shape.
          filename = uu[/^begin\s+[0-9]+\s+(.*)$/, 1]
          mime_type = AlaveteliFileTypes.filename_and_content_to_mimetype(
            filename, body
          )
          content_type =
            if mime_type
              MailHandler.normalise_content_type(mime_type)
            else
              'application/octet-stream'
            end

          {
            body: body,
            filename: filename,
            content_type: content_type,
            hexdigest: Digest::MD5.hexdigest(body),
            url_part_number: start_part_number + index + 1
          }
        end
      end

      # Formats a name and an email address as a single address string.
      #
      # name  - the display name, or nil to format the email on its own
      # email - the email address; must pass
      #         MySociety::Validate.is_valid_email
      #
      # Raises a RuntimeError for an invalid email. The message is built
      # by interpolation so that a nil email gets this error too, rather
      # than a TypeError from string concatenation.
      def address_from_name_and_email(name, email)
        unless MySociety::Validate.is_valid_email(email)
          raise "invalid email #{email} passed to address_from_name_and_email"
        end
        return Mail::Address.new(email.dup).to_s if name.nil?

        address = Mail::Address.new
        address.display_name = name.dup
        address.address = email.dup
        address.to_s
      end

      # Sets the given string as the From header of a new Mail object
      # and returns the first element of the resulting from value.
      def address_from_string(string)
        Mail.new.tap { |message| message.from = string }.from[0]
      end

      # Returns every email address found in the Received headers of the
      # given email, or an empty array when it has no Received headers.
      def get_emails_within_received_headers(email)
        received = Array(email['Received'])
        return [] if received.empty?

        combined = received.map(&:to_s).join(' ')
        combined.scan(MySociety::Validate.email_find_regexp).flatten
      end
    end
  end
end