mysociety/alaveteli

View on GitHub
lib/alaveteli_text_masker.rb

Summary

Maintainability
A
1 hr
Test Coverage
require 'tempfile'

module AlaveteliTextMasker
  include ConfigHelper

  extend self
  # Content types that apply_masks returns untouched instead of running
  # them through binary masking (images and zip archives may get broken if
  # their bytes are rewritten). Frozen so the list cannot be modified at
  # runtime by accident.
  DoNotBinaryMask = %w[
    image/tiff
    image/gif
    image/jpeg
    image/png
    image/bmp
    application/zip
  ].freeze

  # Content types that apply_masks treats as plain text and therefore sends
  # through apply_text_masks. Frozen so the list cannot be modified at
  # runtime by accident.
  TextMask = %w[
    text/css
    text/csv
    text/html
    text/plain
    text/rfc822-headers
    text/rtf
    text/tab-separated-values
    text/x-c
    text/x-diff
    text/x-fortran
    text/x-mail
    text/xml
    text/x-pascal
    text/x-vcard
  ].freeze

  # Masks email addresses in (possibly binary) data, and applies any custom
  # masks and censor rules supplied in +options+.
  #
  # The masking strategy is picked from +content_type+:
  # * 'application/pdf'            -> apply_pdf_masks
  # * any type in TextMask         -> apply_text_masks
  # * any type in DoNotBinaryMask  -> +text+ is returned unchanged, since
  #   things like zip files and images may get broken by masking
  # * anything else                -> apply_binary_masks; we err on the side
  #   of masking too much, as many unknown types will really be text
  #
  # text         - the data to mask
  # content_type - the content type String used to pick the strategy
  # options      - Hash passed through to the chosen masking method
  #
  # Returns the result of the chosen masking method.
  def apply_masks(text, content_type, options = {})
    return apply_pdf_masks(text, options) if 'application/pdf' == content_type
    return apply_text_masks(text, options) if TextMask.include?(content_type)
    return text if DoNotBinaryMask.include?(content_type)

    apply_binary_masks(text, options)
  end

  private

  # Writes +text+ to a temporary file under ./tmp and runs
  # `pdftk <file> output - uncompress` on it through AlaveteliExternalCommand.
  #
  # text - the raw PDF data
  #
  # Returns whatever AlaveteliExternalCommand.run returns (the caller treats
  # a blank result as a failure to uncompress).
  # Errors from creating or writing the temporary file propagate unchanged.
  def uncompress_pdf(text)
    temp = Tempfile.new('pdftk', './tmp', encoding: 'ascii-8bit')
    temp.write(text)
    temp.close

    AlaveteliExternalCommand.run(
      "pdftk", temp.path, "output", "-", "uncompress"
    )
  ensure
    # temp is nil when Tempfile.new itself raised; guarding keeps that
    # original error visible rather than replacing it with a NoMethodError.
    # close! also closes the handle if we failed before the explicit close.
    temp.close! if temp
  end

  # Writes +text+ to a temporary file under ./tmp and recompresses it with an
  # external tool: Ghostscript (`gs`) when
  # AlaveteliConfiguration.use_ghostscript_compression is truthy, otherwise
  # `pdftk <file> output - compress`.
  #
  # text - the uncompressed PDF data
  #
  # Returns whatever AlaveteliExternalCommand.run returns (the caller treats
  # a blank result as a failure to compress).
  # Errors from creating or writing the temporary file propagate unchanged.
  def compress_pdf(text)
    temp = Tempfile.new('pdftk', './tmp', encoding: 'ascii-8bit')
    temp.write(text)
    temp.close

    command =
      if AlaveteliConfiguration.use_ghostscript_compression
        ["gs",
         "-sDEVICE=pdfwrite",
         "-dCompatibilityLevel=1.4",
         "-dPDFSETTINGS=/screen",
         "-dNOPAUSE",
         "-dQUIET",
         "-dBATCH",
         "-sOutputFile=-",
         temp.path]
      else
        ["pdftk", temp.path, "output", "-", "compress"]
      end

    AlaveteliExternalCommand.run(*command)
  ensure
    # temp is nil when Tempfile.new itself raised; guarding keeps that
    # original error visible rather than replacing it with a NoMethodError.
    # close! also closes the handle if we failed before the explicit close.
    temp.close! if temp
  end

  # Masks a PDF by uncompressing it, applying the binary masks to the
  # uncompressed data, and recompressing the result.
  #
  # text    - the raw PDF data
  # options - Hash passed through to apply_binary_masks
  #
  # Returns the original +text+ when the PDF could not be uncompressed, when
  # masking changed nothing, or when no usable masked data is left;
  # otherwise returns the masked PDF (recompressed where possible).
  def apply_pdf_masks(text, options = {})
    expanded = uncompress_pdf(text)
    # Nothing to work with if the PDF could not be uncompressed
    return text if expanded.blank?

    masked = apply_binary_masks(expanded, options)
    # Keep the original bytes when the masks did not alter anything
    return text if masked == expanded

    result = compress_pdf(masked)

    if result.blank?
      # buggy versions of pdftk sometimes fail on compression; saving an
      # uncompressed version is not considered a disaster in these cases
      result = masked
      Rails.logger.warn "Unable to compress PDF; problem with your pdftk version?"
    end

    result.blank? ? text : result
  end

  # Masks email addresses and applies censor rules to binary data without
  # changing its size in bytes. Works on a copy; +text+ is not modified.
  #
  # text    - the data to mask
  # options - Hash; :censor_rules is an optional list of objects responding
  #           to apply_to_binary
  #
  # Returns the masked copy.
  # Raises RuntimeError if the masked data ends up a different byte size.
  def apply_binary_masks(text, options = {})
    # Remember the size so we can verify nothing was resized
    expected_bytes = text.bytesize
    masked = text.dup

    # Pass 1: plain ASCII addresses. Every character except '@' and '.'
    # becomes 'x', so the replacement has the same length as the match.
    masked.gsub!(MySociety::Validate.email_find_regexp) do |found|
      found.gsub(/[^@.]/, 'x')
    end

    # Pass 2: UCS-2 addresses (for Microsoft Office documents). Stripping
    # the NUL bytes exposes the ASCII equivalent of UCS-2 text, which is
    # then searched for addresses.
    without_nuls = masked.gsub(/\0/, "")
    found_emails = without_nuls.scan(MySociety::Validate.email_find_regexp)

    found_emails.each do |captures|
      # scan yields an array per match here; the first element is taken as
      # the address (presumably the regexp's first capture group - confirm
      # against MySociety::Validate.email_find_regexp)
      address = captures[0]

      # Re-encode both the address and its mask as UTF-16LE, then relabel
      # the bytes as US-ASCII so they can be searched for in the data
      wide_address = address.encode('UTF-16LE').force_encoding('US-ASCII')
      wide_mask = address.gsub(/[^@.]/, 'x').
                    encode('UTF-16LE').force_encoding('US-ASCII')

      masked.gsub!(wide_address, wide_mask)
    end

    # Finally run each censor rule over the data, in order
    (options[:censor_rules] || []).each do |rule|
      masked = rule.apply_to_binary(masked)
    end

    raise "internal error in apply_binary_masks" if masked.bytesize != expected_bytes

    masked
  end

  # Builds the masks applied to every piece of text: they remove any email
  # addresses, mobile phone numbers and login links.
  #
  # Returns an Array of Hashes, each with a :to_replace pattern and the
  # (translated) :replacement String to substitute for it.
  def default_text_masks
    # Escape the configured domain so that characters such as '.' only match
    # themselves; unescaped, "example.com" would also match "exampleXcom".
    # to_s keeps an unset (nil) domain behaving like an empty string.
    escaped_domain = Regexp.escape(AlaveteliConfiguration.domain.to_s)

    [{ to_replace: MySociety::Validate.email_find_regexp,
       replacement: "[#{_("email address")}]" },
     { to_replace: /(Mobile|Mob)([\s\/]*(Fax|Tel))*\s*:?[\s\d]*\d/,
       replacement: "[#{_("mobile number")}]" },
     { to_replace: %r{https?://#{escaped_domain}/c/[^\s]+},
       replacement: "[#{_("{{site_name}} login link",
                          site_name: site_name)}]" }]
  end

  # Masks plain text: first substitutes every mask pattern, then runs the
  # censor rules over the result.
  #
  # text    - the String to mask
  # options - Hash; :masks is an optional list of
  #           { to_replace:, replacement: } Hashes applied before the
  #           default masks, :censor_rules an optional list of objects
  #           responding to apply_to_text
  #
  # Returns the masked text.
  def apply_text_masks(text, options = {})
    # Caller-supplied masks run first, followed by the defaults; building a
    # new array leaves options[:masks] untouched
    all_masks = (options[:masks] || []) + default_text_masks
    rules = options[:censor_rules] || []

    masked = text
    all_masks.each do |mask|
      masked = masked.gsub(mask[:to_replace], mask[:replacement])
    end

    rules.each { |rule| masked = rule.apply_to_text(masked) }
    masked
  end
end