rapid7/metasploit-framework
modules/auxiliary/gather/http_pdf_authors.rb

##
# This module requires Metasploit: https://metasploit.com/download
# Current source: https://github.com/rapid7/metasploit-framework
##

class MetasploitModule < Msf::Auxiliary
  include Msf::Exploit::Remote::HttpClient
  include Msf::Auxiliary::Report

  def initialize(info = {})
    super(update_info(info,
      'Name'        => 'Gather PDF Authors',
      'Description' => %q{
        This module downloads PDF documents and extracts the author's
        name from the document metadata.

        This module expects a URL to be provided using the URL option.
        Alternatively, multiple URLs can be provided by supplying the
        path to a file containing a list of URLs in the URL_LIST option.

        The URL_TYPE option is used to specify the type of URLs supplied.

        By specifying 'pdf' for the URL_TYPE, the module will treat
        the specified URL(s) as PDF documents. The module will
        download the documents and extract the authors' names from the
        document metadata.

        By specifying 'html' for the URL_TYPE, the module will treat
        the specified URL(s) as HTML pages. The module will scrape the
        pages for links to PDF documents, download the PDF documents,
        and extract the authors' names from the document metadata.
      },
      'License'     => MSF_LICENSE,
      'Author'      => 'bcoles'))

    deregister_http_client_options

    register_options(
      [
        OptString.new('URL', [ false, 'The target URL', '' ]),
        OptString.new('URL_LIST', [ false, 'File containing a list of target URLs', '' ]),
        OptEnum.new('URL_TYPE', [ true, 'The type of URL(s) specified', 'html', [ 'pdf', 'html' ] ]),
        OptBool.new('STORE_LOOT', [ false, 'Store authors in loot', true ])
      ])
  end
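
  # Hypothetical msfconsole usage showing both URL_TYPE modes; the host
  # and file paths below are illustrative only:
  #
  #   msf > use auxiliary/gather/http_pdf_authors
  #   msf > set URL_TYPE html
  #   msf > set URL http://example.com/docs/
  #   msf > run
  #
  #   msf > set URL_TYPE pdf
  #   msf > set URL_LIST /tmp/pdf_urls.txt
  #   msf > run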

  def progress(current, total)
    done = (current.to_f / total.to_f) * 100
    percent = "%3.2f%%" % done
    print_status "%7s done (%d/%d files)" % [percent, current, total]
  end

  def load_urls
    return [ datastore['URL'] ] unless datastore['URL'].to_s.eql? ''

    if datastore['URL_LIST'].to_s.eql? ''
      fail_with Failure::BadConfig, 'No URL(s) specified'
    end

    unless File.file? datastore['URL_LIST'].to_s
      fail_with Failure::BadConfig, "File '#{datastore['URL_LIST']}' does not exist"
    end

    File.binread(datastore['URL_LIST']).split(/\r?\n/)
  end

  def read(data)
    require 'pdf-reader'

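    # Cap parsing at 10 seconds, as a malformed PDF can otherwise hang the parser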
    Timeout.timeout(10) do
      reader = PDF::Reader.new data
      return parse reader
    end
  rescue PDF::Reader::MalformedPDFError
    print_error "Could not parse PDF: PDF is malformed (MalformedPDFError)"
    return
  rescue PDF::Reader::UnsupportedFeatureError
    print_error "Could not parse PDF: PDF contains unsupported features (UnsupportedFeatureError)"
    return
  rescue SystemStackError
    print_error "Could not parse PDF: PDF is malformed (SystemStackError)"
    return
  rescue SyntaxError
    print_error "Could not parse PDF: PDF is malformed (SyntaxError)"
    return
  rescue Timeout::Error
    print_error "Could not parse PDF: PDF is malformed (Timeout)"
    return
  rescue => e
    print_error "Could not parse PDF: Unhandled exception: #{e}"
    return
  end

  def parse(reader)
    # PDF
    # print_status "PDF Version: #{reader.pdf_version}"
    # print_status "PDF Title: #{reader.info['title']}"
    # print_status "PDF Info: #{reader.info}"
    # print_status "PDF Metadata: #{reader.metadata}"
    # print_status "PDF Pages: #{reader.page_count}"

    # Software
    # print_status "PDF Creator: #{reader.info[:Creator]}"
    # print_status "PDF Producer: #{reader.info[:Producer]}"

    # Author
    reader.info[:Author].is_a?(String) ? reader.info[:Author].split(/\r?\n/).first : ''
  end

  def run
    urls = load_urls

    if datastore['URL_TYPE'].eql? 'html'
      urls = extract_pdf_links urls

      if urls.empty?
        print_error 'Found no links to PDF files'
        return
      end

      print_line
      print_good "Found links to #{urls.size} PDF files:"
      print_line urls.join "\n"
      print_line
    end

    authors = extract_authors urls

    print_line

    if authors.empty?
      print_status 'Found no authors'
      return
    end

    print_good "Found #{authors.size} authors: #{authors.join ', '}"

    return unless datastore['STORE_LOOT']

    p = store_loot 'pdf.authors', 'text/plain', nil, authors.join("\n"), 'pdf.authors.txt', 'PDF authors'
    print_good "File saved in: #{p}"
  end

  def extract_pdf_links(urls)
    print_status "Processing #{urls.size} URLs..."

    pdf_urls = []
    urls.each_with_index do |url, index|
      next if url.blank?
      html = download url
      next if html.blank?
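      # Parse the page and collect absolute URLs for links ending in .pdf
      # (optionally followed by a query string)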
      doc = Nokogiri::HTML html
      doc.search('a[href]').select { |n| n['href'][/(\.pdf$|\.pdf\?)/] }.each do |n|
        pdf_urls << URI.join(url, n['href']).to_s
      end
      progress(index + 1, urls.size)
    end

    pdf_urls.uniq
  end

  def extract_authors(urls)
    print_status "Processing #{urls.size} URLs..."

    authors = []
    max_len = 256
    urls.each_with_index do |url, index|
      next if url.blank?
      file = download url
      next if file.blank?
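      # Wrap the response body in an IO object, as PDF::Reader expects
      # an IO (or a file path) rather than a raw string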
      pdf = StringIO.new
      pdf.puts file
      author = read pdf
      unless author.blank?
        print_good "PDF Author: #{author}"
        if author.length > max_len
          print_warning "Warning: Truncated author's name at #{max_len} characters"
          authors << author[0...max_len]
        else
          authors << author
        end
      end
      progress(index + 1, urls.size)
    end

    authors.uniq
  end
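
  # Both extract_pdf_links and extract_authors call a download helper that
  # does not appear in this view. The method below is a minimal sketch,
  # assuming it fetches the URL with the HttpClient mixin and returns the
  # response body (or nil on failure); the upstream implementation may differ.
  def download(url)
    print_status "Downloading '#{url}'"

    begin
      target = URI.parse url
      raise 'Invalid URL' unless target.scheme =~ /\Ahttps?\z/
      raise 'Invalid URL' if target.host.to_s.eql? ''
    rescue => e
      print_error "Could not parse URL: #{e}"
      return
    end

    res = send_request_cgi 'rhost'  => target.host,
                           'rport'  => target.port,
                           'ssl'    => target.scheme.eql?('https'),
                           'method' => 'GET',
                           'uri'    => target.path.blank? ? '/' : target.path

    unless res
      print_error 'Connection failed'
      return
    end

    unless res.code == 200
      print_error "Unexpected HTTP response code: #{res.code}"
      return
    end

    res.body
  end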
end