lib/normalize_string.rb from mysociety/alaveteli

lib/normalize_string.rb
Summary

Maintainability

1 hr
Test Coverage

Issues
require 'charlock_holmes'
require "net/imap"

class EncodingNormalizationError < StandardError
end

def normalize_string_to_utf8(s, suggested_character_encoding=nil)
  # Make a list of encodings to try:
  to_try = []

  guessed_encoding = if CharlockHolmes::EncodingDetector.detect(s).blank?
    ''
  else
    CharlockHolmes::EncodingDetector.detect(s)[:encoding]
  end
  guessed_encoding ||= ''

  # It's reasonably common for windows-1252 text to be mislabelled
  # as ISO-8859-1, so try that first if charlock_holmes guessed
  # that.  However, it can also easily misidentify UTF-8 strings as
  # ISO-8859-1 so we don't want to go with the guess by default...
  to_try.push guessed_encoding if guessed_encoding.downcase == 'windows-1252'

  to_try.push suggested_character_encoding if suggested_character_encoding
  to_try.push 'UTF-8'
  to_try.push guessed_encoding

  to_try.each do |from_encoding|
    if from_encoding == 'utf-7'
      return Net::IMAP.decode_utf7(s)
    else
      begin
        s.force_encoding from_encoding
        return s.encode('UTF-8') if s.valid_encoding?
      rescue ArgumentError, Encoding::UndefinedConversionError
        # We get this is there are invalid bytes when
        # interpreted as from_encoding at the point of
        # the encode('UTF-8'); move onto the next one...
      rescue Encoding::ConverterNotFoundError,
             Encoding::InvalidByteSequenceError => ex
        raise EncodingNormalizationError, ex.message
      end
    end
  end
  raise EncodingNormalizationError, "Couldn't find a valid character encoding for the string"
end

def convert_string_to_utf8_or_binary(s, suggested_character_encoding=nil)
  # This function exists to help to keep consistent with the
  # behaviour of earlier versions of Alaveteli: in the code as it
  # is, there are situations where it's expected that we generally
  # have a UTF-8 encoded string, but if the source data was
  # unintepretable under any character encoding, the string may be
  # binary data (i.e. invalid UTF-8).  Such a string would then be
  # mangled into valid UTF-8 by _sanitize_text for the purposes of
  # display.

  # This seems unsatisfactory to me - two better alternatives would
  # be either: (a) to mangle the data into valid UTF-8 in this
  # method or (b) to treat the 'text/*' attachment as
  # 'application/octet-stream' instead.  However, for the purposes
  # of the transition to Ruby 1.9 and/or Rails 3 we just want the
  # behaviour to be as similar as possible.

  begin
    result = normalize_string_to_utf8 s, suggested_character_encoding
  rescue EncodingNormalizationError
    result = s.force_encoding 'ASCII-8BIT'
  end
  result
end

StringConversionResult = Struct.new(:string, :scrubbed) do
  alias_method :scrubbed?, :scrubbed
end

def convert_string_to_utf8(s, suggested_character_encoding=nil)
  result = normalize_string_to_utf8 s, suggested_character_encoding
  StringConversionResult.new(result, false)

rescue EncodingNormalizationError
  result = scrub(s)
  StringConversionResult.new(result, true)
end

def scrub(string)
  string = string.force_encoding("utf-8")
  string.valid_encoding? ? string : string.encode("utf-16le", invalid: :replace, replace: "").encode("utf-8")
end

def log_text_details(message, text)
  STDERR.puts "#{message}, we have text: #{text}, of class #{text.class} and encoding #{text.encoding}"
  filename = "/var/tmp/#{Digest::MD5.hexdigest(text)}.txt"
  File.open(filename, "wb") { |f| f.write text }
  STDERR.puts "#{message}, the filename is: #{filename}"
end