autostack-team/autostack

View on GitHub
autostack/so_web_scraper/__init__.py

Summary

Maintainability
A
2 hrs
Test Coverage
'''
Authors: Elijah Sawyers, Benjamin Sanders
Emails: elijahsawyers@gmail.com, ben.sanders97@gmail.com
Date: 03/17/2019
Overview: Contains functions used to scrape Stack Overflow for posts
with accepted answers for a given query, and to print those posts.
'''

from __future__ import absolute_import, division, print_function
from bs4 import BeautifulSoup
import pygments
from pygments.lexers import PythonLexer  # pylint: disable=no-name-in-module
import requests
from termcolor import colored

# Scheme and host of Stack Overflow; the search and post paths built in
# this module are appended to it.
BASE_URL = 'https://stackoverflow.com'


def accepted_posts(query):
    '''
    Lazily walks the Stack Overflow search results for a query and
    yields the page of every post that has an accepted answer.

    Parameter {str} query: the string to query Stack Overflow with.
    Yields {bs4.BeautifulSoup}: accepted posts html documents.
    '''

    for summaries in get_post_summaries(query):
        # A generator expression keeps the fetching lazy: each post is
        # only requested when the consumer asks for the next item.
        candidates = (post_soup(summary) for summary in summaries)

        for candidate in candidates:
            # post_soup gives back None for posts that should be skipped.
            if not candidate:
                continue

            yield candidate


def get_post_summaries(query):
    '''
    Pages through the Stack Overflow search results for a query,
    yielding the post summaries found on each page.

    Iteration ends at the first page that cannot be fetched or that
    contains no post summaries.

    Parameter {str} query: the string to query Stack Overflow with.
    Yields {bs4.element.ResultSet}: ResultSet of post summaries.
    '''

    page_number = 0

    while True:
        page_number += 1

        results_page = query_stack_overflow(
            build_query_url(query, page_number)
        )

        # No page came back: nothing more to yield.
        if not results_page:
            return

        summaries = results_page.find_all(attrs={'class': 'question-summary'})

        # An empty page means the results have been exhausted.
        if not summaries:
            return

        yield summaries


def build_query_url(query, page):
    '''
    Builds a URL to query Stack Overflow with.

    The search is always restricted to the [python] tag. The query is
    percent-encoded (spaces become '+'), so characters such as '&', '#',
    '+' or '?' are sent as part of the search text instead of being
    interpreted as URL syntax.

    e.g. query == 'Test Query' and page == 1 then the url will be:
    https://stackoverflow.com/search?page=1&tab=Relevance&q=%5Bpython%5D+Test+Query

    Parameter {str} query: the string to query Stack Overflow with.
    Parameter {int} page: the page to select in the query.
    Returns {str}: the search URL.
    '''

    # Imported here so the block is self-contained; the fallback covers
    # Python 2, where quote_plus lives directly in urllib.
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus

    # '%5Bpython%5D' is the encoded '[python]' tag filter; the encoded
    # query follows it, separated by '+' (an encoded space).
    return '{}/search?page={}&tab=Relevance&q=%5Bpython%5D+{}'.format(
        BASE_URL,
        page,
        quote_plus(query)
    )


def query_stack_overflow(url, timeout=10):
    '''
    Given a url, this function returns the BeautifulSoup of the
    request.

    Any request failure (HTTP error status, connection problem, or
    timeout) results in None rather than an exception.

    Parameter {str} url: the url to request.
    Parameter {float} timeout: seconds to wait for a response before
    giving up; without one the request could block indefinitely.
    Returns {bs4.BeautifulSoup:None}: the BeautifulSoup of the request,
    or None, if the request failed.
    '''

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # RequestException is the base class of HTTPError, and also of
        # ConnectionError and Timeout.
        return None

    return BeautifulSoup(response.text, 'lxml')


def post_soup(post_summary):
    '''
    Given a post summary, query Stack Overflow, and return the
    BeautifulSoup of the post, if it has an accepted answer.

    Parameter {bs4.Tag} post_summary: the bs4.Tag post summary.
    Returns {bs4.BeautifulSoup:None}: the BeautifulSoup of the post,
    if it has an accepted answer and could be fetched; otherwise, None.
    '''

    if not has_accepted_answer(post_summary):
        return None

    post_url = get_post_url(post_summary)

    # get_post_url is documented to return None when no link is found;
    # concatenating None onto BASE_URL would raise a TypeError.
    if not post_url:
        return None

    # The summary's link is site-relative, so it is appended to BASE_URL.
    # Fetching and parsing are shared with the search request path.
    return query_stack_overflow(BASE_URL + post_url)


def has_accepted_answer(post_summary):
    '''
    Reports whether a post summary is marked as having an accepted
    answer, i.e. whether it contains an element with the
    'answered-accepted' class.

    Parameter {bs4.Tag} post_summary: the post summary.
    Returns {Boolean}: True if the post has an accepted answer;
    otherwise, False.
    '''

    accepted_marker = post_summary.find(attrs={'class': 'answered-accepted'})

    return bool(accepted_marker)


def get_post_url(post_summary):
    '''
    Given a post summary, this function returns the post's url.

    Parameter {bs4.Tag} post_summary: the post summary.
    Returns {str:None}: post url, or None, if the post url couldn't
    be found.
    '''

    link = post_summary.find(
        attrs={
            'class': 'question-hyperlink'
        },
        href=True
    )

    # find() returns None when nothing matches. Indexing that None would
    # raise TypeError, so the missing-link case is handled explicitly.
    if link is None:
        return None

    # .get() returns None instead of raising if 'href' is absent.
    return link.get('href')


def print_accepted_post(post):
    '''
    Prints the question and the accepted answer of a Stack Overflow
    post, each under a red heading and separated by red divider lines.

    Nothing is printed if either the question or the accepted answer
    cannot be found in the post.

    Parameter {bs4.BeautifulSoup} post: The 'soup' of the post
    to print.
    '''

    sections = (
        ('Question:', get_post_text(post, 'question')),
        ('Answer:', get_post_text(post, 'accepted-answer')),
    )

    # Both parts are required; bail out before printing anything.
    if any(body is None for _, body in sections):
        return

    divider = colored(
        '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~',
        'red'
    )

    for heading, body in sections:
        print(divider)
        print(colored(heading, 'red'))
        print_post_text(body)

    print(divider)


def get_post_text(post, html_class):
    '''
    Given a post, and a html class, this function returns a
    bs4.Tag with the post-text.
    Typically, you'd only pass 'question' or 'accepted-answer' as
    the html class.

    Parameter {bs4.BeautifulSoup} post: the post to get post-text from.
    Parameter {str} html_class: the html class of the element to get
    post-text from.
    Returns {bs4.Tag:None}: the post-text, or None, if it couldn't
    be found.
    '''

    try:
        # attrs must be a dict mapping the attribute name to the wanted
        # value. A set such as {'class', html_class} is instead treated
        # as a list of candidate class names.
        container = post.find(
            attrs={
                'class': html_class
            }
        )

        return container.find(
            attrs={
                'class': 'post-text'
            }
        )
    except AttributeError:
        # Raised when post is None, or when no element with html_class
        # was found (container is None).
        return None


def print_post_text(post_text):
    '''
    Prints post-text from Stack Overflow.

    On Stack Overflow, a div with a class of 'post-text'
    indicates that the div is either a question or an answer.

    Different elements of the post-text are printed in different
    colors.

    Headers: White.
    Text: White.
    Quotes: Yellow.
    Lists: Printed by print_ul.
    Code: Printed by print_code_block.

    Children of any other kind are skipped.

    Parameter {bs4.Tag} post_text: HTML 'div' element from a Stack Overflow
    post with class of 'post-text.'
    '''

    element_colors = {
        'h1': 'white',
        'h2': 'white',
        'h3': 'white',
        'p': 'white',
        'blockquote': 'yellow',
    }

    for element in post_text:
        name = element.name

        if name in element_colors:
            print(
                colored(element.text, element_colors[name])
            )
        elif name == 'ul':  # Lists.
            print_ul(element)
        elif name == 'pre':  # Code.
            code = element.find('code')

            # A 'pre' without a nested 'code' element would otherwise
            # pass None on; fall back to the 'pre' element itself.
            print_code_block(element if code is None else code)


def print_ul(ul_element):
    '''
    Prints every 'li' item of an unordered list on its own line, as an
    indented, dash-prefixed bullet in bold green.

    Parameter {bs4.Tag} ul_element: the unordered list to print.
    '''

    for list_item in ul_element.find_all('li'):
        bullet = '    - ' + list_item.text
        print(colored(bullet, 'green', attrs=['bold']))


def print_code_block(code_block):
    '''
    Prints a code block from Stack Overflow with syntax highlighting.

    The text of the 'code' element is first flattened with
    get_src_code, then tokenized with the pygments Python lexer. Each
    token is printed in the color mapped to its token type; tokens
    without a mapping are printed uncolored. A blank line is printed
    before and after the code.

    Parameter {bs4.Tag} code_block: 'soup' of a HTML
    'code' element from a Stack Overflow post.
    '''

    token_colors = {
        'Token.Keyword': 'blue',
        'Token.Name.Builtin.Pseudo': 'blue',
        'Token.Literal.Number.Integer': 'green',
        'Token.Literal.Number.Float': 'green',
        'Token.Comment.Single': 'green',
        'Token.Comment.Hashbang': 'green',
        'Token.Literal.String.Single': 'yellow',
        'Token.Literal.String.Double': 'yellow',
        'Token.Literal.String.Doc': 'yellow'
    }

    print('')

    source = get_src_code(code_block)

    for token_type, text in pygments.lex(source, PythonLexer()):
        color = token_colors.get(str(token_type))

        if color is None:
            # No color configured for this token type: print it as-is.
            print(text, end='')
        else:
            print(colored(text, color), end='')

    print('')


def get_src_code(code_block):
    '''
    Flattens a code block into its plain 'source code' (i.e. text),
    descending into nested elements.

    Parameter {bs4.Tag} code_block: the source code (or text).
    Returns {str}: the source code (or text).
    '''

    pieces = []

    for node in code_block:
        try:
            # String nodes (bs4.NavigableString) concatenate directly.
            piece = '' + node
        except TypeError:
            # Anything else (bs4.Tag) is flattened recursively through
            # its children.
            piece = get_src_code(node.contents)

        pieces.append(piece)

    return ''.join(pieces)