mysociety/alaveteli

View on GitHub
lib/mail_handler.rb

Summary

Maintainability
C
1 day
Test Coverage
# Handles the parsing of email
require 'English'
require 'tmpdir'

module MailHandler
  require 'mail'
  require 'mail_handler/backends/mail_extensions'
  require 'mail_handler/backends/mail_backend'
  include Backends::MailBackend

  # Raised by tnef_attachments when the tnef tool exits with a non-zero status
  # or leaves no extracted files behind.
  class TNEFParsingError < StandardError; end

  # Returns a set of attachments from the given TNEF contents
  # The TNEF contents also contains the message body, but in general this is the
  # same as the message body in the message proper.
  #
  # content - the raw TNEF data, which is piped to the external `tnef` command
  #
  # Returns an Array of Hashes, each with :content (the bytes of one extracted
  # file) and :filename, ordered by filename.
  # Raises IOError if tnef is terminated by a signal.
  # Raises TNEFParsingError if tnef exits with a non-zero status or extracts
  # no files.
  def tnef_attachments(content)
    attachments = []
    Dir.mktmpdir do |dir|
      IO.popen("tnef -K -C #{dir} 2> /dev/null", "wb") do |f|
        f.write(content)
        # Closing the pipe waits for the child process and sets $CHILD_STATUS
        f.close
        status = $CHILD_STATUS
        raise IOError, "tnef exited with signal #{status.termsig}" if status.signaled?
        if status.exited? && status.exitstatus != 0
          raise TNEFParsingError, "tnef exited with status #{status.exitstatus}"
        end
      end

      # Dir.entries returns plain names without leaving a directory handle
      # open; sort for deterministic behaviour
      filenames = Dir.entries(dir).sort.reject { |name| name == "." || name == ".." }
      raise TNEFParsingError, "tnef produced no attachments" if filenames.empty?

      filenames.each do |name|
        # File.binread reads in binary mode and closes the file afterwards
        attachments << { content: File.binread(File.join(dir, name)),
                         filename: name }
      end
    end
    attachments
  end

  # Maps known non-standard MIME type aliases onto the single name used for
  # that format; any other value is returned unchanged.
  #
  # content_type - the MIME type String to normalise
  #
  # Returns the normalised content type.
  def normalise_content_type(content_type)
    case content_type
    # e.g. http://www.whatdotheyknow.com/request/93/response/250
    when 'application/excel', 'application/msexcel', 'application/x-ms-excel'
      'application/vnd.ms-excel'
    when 'application/mspowerpoint', 'application/x-ms-powerpoint'
      'application/vnd.ms-powerpoint'
    when 'application/msword', 'application/x-ms-word'
      'application/vnd.ms-word'
    when 'application/x-zip-compressed'
      'application/zip'
    # e.g. http://www.whatdotheyknow.com/request/copy_of_current_swessex_scr_opt#incoming-9928
    when 'application/acrobat', 'document/pdf'
      'application/pdf'
    else
      content_type
    end
  end

  # Extracts the text of a single attachment.
  #
  # Plain text is returned directly. Other recognised types are written to a
  # temporary file and handed to an external command, or, for zip files,
  # unpacked and processed entry by entry.
  #
  # content_type - MIME type String that selects the extraction method
  # body         - the attachment content
  # charset      - charset elinks is told to assume for text/html content
  #                (default: 'utf-8')
  #
  # Returns the extracted text as a String; a content type with no extraction
  # method yields an empty String.
  def get_attachment_text_one_file(content_type, body, charset = 'utf-8')
    # NOTE: re. charset: TMail always tries to convert email bodies
    # to UTF8 by default, so normally it should already be that.
    text = ''
    # TODO: - tell all these command line tools to return utf-8
    if content_type == 'text/plain'
      text += body + "\n\n"
    else
      tempfile = Tempfile.new('foiextract')
      begin
        tempfile.binmode
        tempfile.print body
        tempfile.flush
        # NOTE(review): append_to presumably makes the command append its
        # output to `text` in place - confirm against AlaveteliExternalCommand
        default_params = { append_to: text,
                           binary_output: false,
                           timeout: 1200 }
        case content_type
        when 'application/vnd.ms-word'
          converted_path = tempfile.path + ".txt"
          AlaveteliExternalCommand.run("wvText", tempfile.path, converted_path,
                                       { memory_limit: 536_870_912, timeout: 120 } )
          if File.exist?(converted_path)
            begin
              text += File.read(converted_path) + "\n\n"
            ensure
              # Remove the converted file even if reading it failed
              File.unlink(converted_path)
            end
          else
            # Try catdoc if we get into trouble (e.g. for InfoRequestEvent 2701)
            AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
          end
        when 'application/rtf'
          # catdoc on RTF produces less comments and extra bumf than --text option to unrtf
          AlaveteliExternalCommand.run("catdoc", tempfile.path, default_params)
        when 'text/html'
          # lynx wordwraps links in its output, which then don't
          # get formatted properly by Alaveteli. We use elinks
          # instead, which doesn't do that.
          AlaveteliExternalCommand.run("elinks", "-eval", "set document.codepage.assume = \"#{charset}\"",
                                       "-eval", "set document.codepage.force_assumed = 1",
                                       "-dump-charset", "utf-8",
                                       "-force-html", "-dump",
                                       tempfile.path,
                                       default_params.merge(env: { "LANG" => "C" }))
        when 'application/vnd.ms-excel'
          # Bit crazy using /usr/bin/strings - but xls2csv, xlhtml and
          # py_xls2txt only extract text from cells, not from floating
          # notes. catdoc may be fooled by weird character sets, but will
          # probably do for UK FOI requests.
          AlaveteliExternalCommand.run("/usr/bin/strings", tempfile.path, default_params)
        when 'application/vnd.ms-powerpoint'
          # ppthtml seems to catch more text, but only outputs HTML when
          # we want text, so just use catppt for now
          AlaveteliExternalCommand.run("catppt", tempfile.path, default_params)
        when 'application/pdf'
          AlaveteliExternalCommand.run("pdftotext", tempfile.path, "-", default_params)
        when 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
          # This is Microsoft's XML office document format.
          # Just pull out the main XML file, and strip it of text.
          xml = AlaveteliExternalCommand.run("/usr/bin/unzip", "-qq",
                                             "-c",
                                             tempfile.path,
                                             "word/document.xml",
                                             { binary_output: false })
          unless xml.nil?
            doc = REXML::Document.new(xml)
            text += doc.each_element( './/text()' ) {}.join(" ")
          end
        when 'application/zip'
          # recurse into zip files
          begin
            # The block form closes the archive even if processing raises
            Zip::File.open(tempfile.path) do |zip_file|
              text += get_attachment_text_from_zip_file(zip_file)
            end
          rescue => e
            $stderr.puts("Error processing zip file: #{e.inspect}")
          end
        end
      ensure
        # Close and delete the temporary file even if extraction raised
        tempfile.close!
      end
    end

    text
  end

  # Concatenates the extracted text of every file entry in a zip archive.
  #
  # zip_file - the opened archive; it is iterated with `each`, and every entry
  #            is asked for file?, to_s and get_input_stream
  #
  # Returns a String. Entries whose contents cannot be read are skipped.
  def get_attachment_text_from_zip_file(zip_file)
    combined = ""
    zip_file.each do |entry|
      next unless entry.file?

      entry_name = entry.to_s
      raw =
        begin
          entry.get_input_stream.read
        rescue
          # move to next attachment silently if there were problems
          # TODO: really should reduce this to specific exceptions?
          # e.g. password protected
          next
        end

      # Fall back to a generic binary type when the filename gives no hint
      mime_type = AlaveteliFileTypes.filename_to_mimetype(entry_name) || 'application/octet-stream'
      if mime_type == 'text/plain' && raw.encoding.to_s == 'ASCII-8BIT'
        raw = convert_string_to_utf8(raw, 'ASCII-8BIT').string
      end
      combined += get_attachment_text_one_file(mime_type, raw)
    end
    combined
  end

  # Turn instance methods into class methods
  extend self
end