amenezes/headers

View on GitHub
lib/scanner/headers.py

Summary

Maintainability
A
25 mins
Test Coverage
#!/usr/bin/env python

import gevent
import argparse
from gevent import monkey; monkey.patch_all()
from lib.database.db import DB
from lib.utils.util import Util
from lib.scanner.scan import Scan

from lib.utils.config import DEFAULT_CONFIG_FILE

class Headers(object):
    """Collect selected HTTP response headers for a list of sites.

    The instance accumulates four in-memory tables while scanning and hands
    them to the database layer in one call (see ``save_data``):

    * ``site_table``         -- rows of ``[site_id, site, url, code]``
    * ``header_name_table``  -- header name  -> numeric id
    * ``header_value_table`` -- header value -> numeric id
    * ``header_table``       -- rows of ``[site_id, name_id, value_id]``
    """

    def __init__(self):
        """Load the configuration, build the scanner and seed the name table."""
        self.site_table = []
        self.header_name_table = {}
        self.header_value_table = {}
        self.header_table = []
        # Last id handed out for each of the two lookup tables; ids start at 1.
        self.headers_counter = {'name': 0, 'value': 0}
        self.config = Util()
        self.settings = self.config.load_config(DEFAULT_CONFIG_FILE)
        self.scanner = Scan(self.settings)
        self.load_header_name_table()

    def load_header_name_table(self):
        """Assign an id to every header name listed in ``settings['headers']``.

        Only names registered here are recorded by ``work_headers``.
        """
        for header_name in self.settings['headers']:
            self.test_duplicate_value(
                header_name,
                self.header_name_table,
                'name')

    def work_headers(self, item):
        """Scan one site and record its tracked response headers.

        :param item: indexable pair whose element 0 is the site id and
            element 1 is the site passed to ``self.scanner.get_data``.

        A row is always appended to ``site_table``; header rows are appended
        to ``header_table`` only when the returned code is positive.
        """
        site_id = item[0]
        site = item[1]
        url, code, headers = self.scanner.get_data(site)
        self.site_table.append([site_id, site, url, code])
        # NOTE(review): a non-positive code presumably marks a failed
        # request -- confirm against Scan.get_data.
        if code > 0:
            # Assumes ``headers`` yields (name, value) pairs -- TODO confirm
            # against Scan.get_data.
            for header_name, header_value in headers:
                # Skip headers that were not registered from the settings.
                if header_name in self.header_name_table:
                    value_id = self.test_duplicate_value(
                        header_value,
                        self.header_value_table,
                        'value')
                    self.header_table.append(
                        [site_id,
                         self.header_name_table[header_name],
                         value_id])

    def test_duplicate_value(self, value, table, index_name):
        """Return the id of ``value`` in ``table``, allocating one if needed.

        :param value: hashable key to look up (a header name or value).
        :param table: dict mapping keys to ids; updated in place when
            ``value`` is new.
        :param index_name: key into ``self.headers_counter`` selecting which
            counter supplies new ids (``'name'`` or ``'value'``).
        :returns: the existing id, or the newly allocated one.
        """
        if value in table:
            return table[value]
        self.headers_counter[index_name] += 1
        new_id = self.headers_counter[index_name]
        table[value] = new_id
        return new_id

    def save_data(self):
        """Pass the four accumulated tables to ``DB.populate_mysql``."""
        database = DB(self.settings)
        database.populate_mysql(
            self.site_table,
            self.header_name_table,
            self.header_value_table,
            self.header_table)

    def main(self):
        """Parse the command line, scan every site in batches and save.

        Sites are processed ``--threads`` at a time: each batch is spawned
        with ``gevent.spawn`` and joined before the next one starts.

        Exits through ``parser.error`` (``SystemExit``) when the thread count
        is below 1; previously such a value made the batch loop run forever
        because the start offset never advanced past the number of sites.
        """
        parser = argparse.ArgumentParser(
            description='Headers will get all response headers from Alexa top sites.'
        )
        parser.add_argument(
            '-f',
            '--filename',
            default=self.settings['general']['topsites_filename'],
            help='Filename with list of sites.'
        )
        parser.add_argument(
            '-t',
            '--threads',
            type=int,
            default=self.settings['general']['thread_number'],
            help='Number of threads to make parallel request.'
        )
        args = parser.parse_args()

        filename = args.filename
        num_threads = args.threads
        # Validate before any work is done: a batch size of zero or less
        # can never make progress through the site list.
        if num_threads < 1:
            parser.error(
                '--threads must be a positive integer (got {!r})'.format(
                    num_threads))

        dictsites = self.config.get_dictsites(filename)
        sites = len(dictsites)
        batch_starts = range(0, sites, num_threads)
        for pool_number, start in enumerate(batch_starts, 1):
            stop = start + num_threads
            print('Thread pool {} ({} - {})'.format(pool_number, start, stop))
            greenlets = [
                gevent.spawn(self.work_headers, item)
                for item in dictsites[start:stop]
            ]
            # Wait for the whole batch before starting the next one.
            gevent.joinall(greenlets)
        self.scanner.get_summary(self.site_table)
        self.save_data()