Lambda-School-Labs/pt-synaps-ds

View on GitHub
retrieve_definition.py

Summary

Maintainability
A
1 hr
Test Coverage
#!/usr/bin/python3

"""
    Pull first 190 characters from Wikipedia article for a given term
"""

import requests


def get_API_params(term):
    """
    Build the request parameters for a Wikipedia extract lookup.

    The parameters ask the "query" action for the "extracts" property of the
    page titled `term`, returned as JSON:
      - exchars: requested extract length, 190 characters
      - explaintext: strip Wikipedia's special formatting from the extract
      - exlimit: return only one extract

    Args:
        term: Page title to look up.

    Returns:
        A dict of query-string parameters for the API call.
    """
    return dict(
        action="query",
        prop="extracts",
        exchars="190",
        titles=term,
        format="json",
        explaintext=1,
        exlimit=1,
    )


def get_opensearch_params(term):
    """
    Build the request parameters for a Wikipedia opensearch lookup.

    The parameters tell the API to run the "opensearch" action on `term` and
    return the results as JSON. "redirects": "resolve" asks for redirects to
    be reported as the page they point to.

    Args:
        term: Text to search for.

    Returns:
        A dict of query-string parameters for the API call.
    """
    return {
        "action": "opensearch",
        "search": term,
        "redirects": "resolve",
        "format": "json",
    }


def get_json_extract(term):
    """
    Query the English Wikipedia API for a short extract of `term`.

    Args:
        term: Page title to look up.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = get_API_params(term)

    print("Searching API for: ", term)
    # The context manager closes the session even when the request raises.
    # The timeout keeps an unresponsive server from blocking the caller
    # indefinitely.
    with requests.Session() as session:
        response = session.get(url=url, params=params, timeout=10)

    return response.json()


def get_json_opensearch(term):
    """
    Run a Wikipedia opensearch for `term` and return its suggestions.

    Args:
        term: Text to search for.

    Returns:
        The second element of the decoded JSON response (presumably the list
        of matching page titles; callers index it as a sequence of terms).

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = get_opensearch_params(term)

    # The context manager closes the session even when the request raises.
    # The timeout keeps an unresponsive server from blocking the caller
    # indefinitely.
    with requests.Session() as session:
        response = session.get(url=url, params=params, timeout=10)

    return response.json()[1]


def retrieve_definition(term, term_wrangled=False):
    """
    Given a term, returns the first 190 characters of the matching Wikipedia
    page. If no page is found, the term is cleaned up with text_wrangle() and
    looked up once more; if that also fails, returns the result of
    open_search(), a "Did you mean...?" style prompt.

    Args:
        term: The text to look up.
        term_wrangled: True when `term` has already been through
            text_wrangle(); prevents wrangling (and recursing) a second time.

    Returns:
        The extract text, a suggestion prompt, or a "too long" message when
        `term` is longer than 255 characters.
    """
    if len(term) > 255:
        # Stop here: without the return, the lookup below would still run and
        # replace this message with its own result.
        return 'Sorry, that text is too long to search!'

    data = get_json_extract(term)

    try:
        # Two steps are necessary because one of the dictionary keys is the
        # page ID for that term.
        pages = data['query']['pages']
        pageid = list(pages.keys())[0]
        print("Pulling extract")
        extract = pages[pageid]['extract']
    except (KeyError, IndexError):
        # Sometimes the API returns a "missing" key instead of an extract, or
        # no usable 'query'/'pages' section at all; fall back to suggestions.
        return open_search(term)

    # An extract of length 3 indicates '...', which is what the API usually
    # returns if it doesn't find a page.
    if len(extract) > 3:
        return extract

    if len(extract) == 3 and term_wrangled is False:
        wrangled_term = text_wrangle(term)
        print("Wrangled_term: ", wrangled_term)
        wrangled_extract = retrieve_definition(wrangled_term,
                                               term_wrangled=True)
        print(len(wrangled_extract))
        if len(wrangled_extract) > 3:
            return wrangled_extract

    return open_search(term)


def open_search(term):
    """
    Use opensearch on the Wikipedia API and return the most likely related
    articles for a given term. opensearch is a Wikimedia API feature which
    returns similarly-titled articles within the wiki.

    Args:
        term: The text to search for.

    Returns:
        A "Did you mean ...?" prompt listing up to three suggestions, or an
        apology message when there are no suggestions at all.
    """
    # Take at most three suggestions. Slicing never raises, so one or two
    # matches are still offered instead of being discarded.
    top_suggestions = get_json_opensearch(term)[:3]

    if not top_suggestions:
        # This covers cases where input doesn't have a close Wiki entry
        return "We can't find anything close to that :("

    listed = ", ".join(str(suggestion) for suggestion in top_suggestions)
    return f"Did you mean {listed}?"


def text_wrangle(term):
    """
    Normalize a search term: lowercase it, drop a leading "the " and/or "a ",
    and convert a plural noun to its singular form.

    Args:
        term: The raw search text.

    Returns:
        The cleaned-up term. May be an empty string if nothing is left after
        stripping the leading article.
    """
    # Makes term lowercase
    term = term.lower()
    print("Lowercase search: ", term)

    if term.startswith('the '):
        # Strips 'the' and 'The' from term (already lowercased above)
        term = term[4:]
        print("Search without 'the': ", term)

    if term.startswith('a '):
        term = term[2:]
        print("Search without 'a': ", term)

    if not term:
        # Nothing left to singularize; recent inflect releases reject empty
        # strings, so skip the engine entirely.
        return term

    # Imported here, right before use, so inputs that never reach
    # singularization do not need the package.
    import inflect

    # singular_noun() returns False when the word is not a plural noun, so
    # compute it once and only replace the term on a real result.
    singular = inflect.engine().singular_noun(term)
    if singular:
        term = singular
        print("Search as singular: ", term)

    return term