mysociety/alaveteli

View on GitHub
script/redact-raw-emails.rb

Summary

Maintainability
A
0 mins
Test Coverage
#!/usr/bin/env ruby

require 'optparse'

# Parsed command-line settings. :tmp starts out true, meaning redacted emails
# are only written to temporary files; passing -r/--replace switches it off.
options = { tmp: true }

parser = OptionParser.new do |cli|
  # Restrict processing to the incoming messages of one user.
  cli.on('-uID', '--user=ID', 'User ID to redact from incoming_messages') do |id|
    options[:user_id] = id
  end

  # Restrict processing to a single incoming message.
  cli.on('-mID', '--message=ID', 'Limit to an incoming message with ID') do |id|
    options[:incoming_message_id] = id
  end

  # Opt in to overwriting the raw emails on disk.
  cli.on('-r', '--replace', 'Replace redacted emails on disk') do
    options[:tmp] = false
  end
end

# Consume the recognised flags from ARGV.
parser.parse!

require_relative File.join('..', 'config', 'environment')

# PATCH: Add incoming_messages association to User class.
#
# Declares User#incoming_messages as a has_many reached through the
# :info_requests association. The script uses it below to select the incoming
# messages that belong to the user given with --user.
# NOTE(review): assumes User already declares :info_requests - confirm.
class User
  has_many :incoming_messages, through: :info_requests
end

# PATCH: Remove text masks for email addresses and mobile phone numbers, so
# that binary data is only altered by the censor rules passed in.
module AlaveteliTextMasker
  # Applies each rule in options[:censor_rules] (none when the key is absent)
  # to a copy of +text+ via rule.apply_to_binary and returns the result.
  #
  # Raises RuntimeError if the masked data ends up with a different byte
  # length from the input.
  def apply_binary_masks(text, options = {})
    expected_bytes = text.bytesize
    rules = options[:censor_rules] || []

    # Work on a duplicate so the caller's string is never touched.
    masked = text.dup
    rules.each { |rule| masked = rule.apply_to_binary(masked) }

    unless masked.bytesize == expected_bytes
      raise "internal error in apply_binary_masks"
    end

    masked
  end
end

# Masks attachment data by handing it, together with its content type, to
# apply_masks on the incoming message currently being processed.
def censor_data(data, content_type)
  current_message = @incoming_message
  current_message.apply_masks(data, content_type)
end

# Applies the censor rules of the current message's info request to +text+.
# Returns nil without consulting the request when +text+ is nil or false.
def censor(text)
  if text
    request = @incoming_message.info_request
    request.apply_censor_rules_to_text(text)
  end
end

# Redacts a single MIME part in place, recursing into nested multiparts.
#
# - multipart containers: every child part is redacted in turn
# - attachments: the decoded payload goes through censor_data and is then
#   re-encoded with the part's own Content-Transfer-Encoding before being
#   assigned back as the body
# - anything else: the decoded body is run through censor and assigned back
#
# Raises RuntimeError when Mail::Encodings.get_encoding returns nothing for
# an attachment's transfer encoding.
def censor_part(part)
  if part.multipart?
    part.parts.each { |nested_part| censor_part(nested_part) }

  elsif part.attachment?
    encoding = part.content_transfer_encoding
    encoder = Mail::Encodings.get_encoding(encoding)
    # Name the offending value so the operator can locate the problem part.
    raise "Unknown encoding: #{encoding.inspect}" unless encoder

    part.body = encoder.encode(
      censor_data(part.body.decoded, part.content_type)
    )

    # TODO: censor content disposition

  else
    part.body = censor(part.body.decoded)
  end
end

# Mix the application's route URL helpers into the top-level object and set
# the host used for generated URLs to the configured site domain.
# Presumably this is what makes show_request_url, called when reporting each
# message below, available at script level - confirm against the routes.
include Rails.application.routes.url_helpers
default_url_options[:host] = AlaveteliConfiguration.domain

# Work out which incoming messages to process from the --user and --message
# flags. With neither flag there is nothing to select, so show the usage
# text and stop.
user_id = options[:user_id]
message_id = options[:incoming_message_id]

unless user_id || message_id
  puts parser.help
  exit
end

# Start from the user's messages when a user was given, otherwise from all
# incoming messages; then narrow to a single message if one was requested.
scope = user_id ? User.find(user_id).incoming_messages : IncomingMessage
scope = scope.where(id: message_id) if message_id

# Always remind the operator that the redactions need a manual check.
puts(
  'WARNING: Redactions might not be perfect and if you use this script ' \
  'then you should manually check the redactions to ensure this is doing ' \
  'what you would expect.'
)

# When -r was not given, explain that output goes to tmp files and how to
# apply the changes for real.
if options[:tmp]
  puts(
    "Outputting redactions to tmp files. Please check these redactions " \
    "are working correctly and the emails/attachments haven't been broken.",
    "When you are happy to proceed run with '-r' option to replace raw " \
    "emails on disk."
  )
end

# Blank line between the notices and the per-message report.
puts

# Redact every selected incoming message and report what happened to it.
#
# For each message the raw email is parsed, its body parts, subject and
# To/Cc/Bcc headers are passed through the censor helpers, and the result is
# compared with the stored raw email. Unchanged emails are skipped. Changed
# emails are written to tmp/redacted-<raw email id>.eml when options[:tmp] is
# set; otherwise the file on disk is copied to "<filepath>.bak", the raw email
# data is replaced and the incoming message is re-parsed.
scope.each do |incoming_message|
  # censor and censor_data read the message being processed from this
  # instance variable, so it must be set before any redaction happens.
  @incoming_message = incoming_message
  @raw_email = @incoming_message.raw_email

  mail = Mail.new(@raw_email.data)

  if mail.multipart?
    mail.parts.each { |part| censor_part(part) }
  else
    mail.body = censor(mail.body.decoded)
  end

  mail.subject = censor(mail.subject)
  # NOTE(review): a header that is absent is censored as "" and assigned back
  # as an empty string - confirm this does not add empty headers.
  %i[to cc bcc].each { |header| mail[header] = censor(mail[header].to_s) }

  print show_request_url(
    @incoming_message.info_request.url_title,
    anchor: "incoming-#{@incoming_message.id}"
  )

  print " cached at #{@raw_email.filepath}"

  # Serialise once so the text that is compared is exactly the text that
  # gets written, and large emails are not re-encoded several times.
  redacted = mail.to_s

  if @raw_email.data == redacted
    puts " no changes required"
    next
  end

  if options[:tmp]
    path = Rails.root.join('tmp', "redacted-#{@raw_email.id}.eml")
    File.write(path, redacted)
    puts " will be replaced with #{path.relative_path_from(Rails.root)}"
  else
    # Keep a copy of the current file before its contents are replaced.
    backup_path = "#{@raw_email.filepath}.bak"
    FileUtils.copy(@raw_email.filepath, backup_path)
    @raw_email.data = redacted
    # NOTE(review): the `true` argument presumably forces a re-parse - confirm.
    @incoming_message.parse_raw_email!(true)
    puts " has been updated"
    puts "Backup created at #{backup_path}"
  end
end