michadenheijer/pynytimes

View on GitHub
pynytimes/api.py

Summary

Maintainability
A
0 mins
Test Coverage
"""Main function of the wrapper"""
# Import typings dependencies
from __future__ import annotations

# Import standard Python dependencies
import datetime
import math
from typing import Any, Final, Literal, Optional, Union, TypedDict, cast

# Import other dependencies
from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Import version from __init__
from .__version__ import __title__, __version__

# Import own dependencies
from .helpers import *

# Define all URLs that are needed
BASE_URL: Final = "api.nytimes.com/svc/"
BASE_TOP_STORIES: Final = BASE_URL + "topstories/v2/"
BASE_MOST_POPULAR: Final = BASE_URL + "mostpopular/v2/"
BASE_BOOKS: Final = BASE_URL + "books/v3/"
BASE_MOVIE_REVIEWS: Final = BASE_URL + "movies/v2/reviews/search.json"
BASE_META_DATA: Final = BASE_URL + "news/v3/content.json"
BASE_TAGS: Final = BASE_URL + "suggest/v1/timestags"
BASE_ARCHIVE_METADATA: Final = BASE_URL + "archive/v1/"
BASE_ARTICLE_SEARCH: Final = BASE_URL + "search/v2/articlesearch.json"
BASE_LATEST_ARTICLES: Final = BASE_URL + "news/v3/content/"
BASE_SECTION_LIST: Final = BASE_URL + "news/v3/content/section-list.json"
BASE_BOOK_REVIEWS: Final = BASE_BOOKS + "reviews.json"
BASE_BEST_SELLERS_LISTS: Final = BASE_BOOKS + "lists/names.json"
BASE_BEST_SELLERS_LIST: Final = BASE_BOOKS + "lists/"

# Define Requests variables
TIMEOUT: Final = (10, 30)
BACKOFF_FACTOR = 1
RETRY_STATUS_CODES = [429, 509]
MAX_RETRIES = 10
RESULTS_MOVIE = 20
RESULTS_SEARCH = 10

# Set type hints
DateType = Union[datetime.date, datetime.datetime, None]
ArticleSearchOptions = TypedDict(
    "ArticleSearchOptions",
    {
        "sort": Literal["oldest", "newest", "relevance"],
        "sources": "list[str]",
        "news_desk": "list[str]",
        "type_of_material": "list[str]",
        "section_name": "list[str]",
        "subject": "list[str]",
        "body": "list[str]",
        "headline": "list[str]",
        "fq": "str",
    },
    total=False,
)
MovieReviewsOptions = TypedDict(
    "MovieReviewsOptions",
    {
        "order": Literal[
            "by-title", "by-publication-date", "by-opening-date"
        ],
        "reviewer": str,
        "critics_pick": bool,
    },
    total=False,
)


class NYTAPI:
    """
    New York Times API Class loads data from the NYT API.
    """

    key: str
    https: bool
    session: Session
    backoff: bool
    user_agent: str
    parse_dates: bool

    # pylint: disable=too-many-arguments

    def __init__(
        self,
        key: str,
        https: bool = True,
        session: Optional[Session] = None,
        backoff: bool = True,
        user_agent: Optional[str] = None,
        parse_dates: bool = False,
    ):
        """Creates the New York Times API class.

        Args:
            key (str): Your key to access the NYT developer API.
            Get your key at https://developer.nytimes.nl. Defaults to None.
            https (bool, optional): Optionally disable HTTPS, not advised.
            Defaults to True.
            session (Session, optional): Use your own Session object. Defaults to None.
            backoff (bool, optional): Optionally disable the automatic backoff,
            this is only advised if you implement your own. Defaults to True.
            user_agent (str, optional): Set your own user-agent. Defaults to None.
            parse_dates (bool, optional): Optionally parse all dates into datetime
            objects.
            It is advised to enable this. Defaults to False.
        """
        self.__set_key(key)
        self.__set_session(session)
        self.__set_parse_dates(parse_dates)
        self.__set_protocol(https)
        self.__set_backoff(backoff)
        self.__set_user_agent(user_agent)

    def __set_key(self, key: Optional[str]):
        """Set key of the class

        Args:
            key (str): The New York Times developer key

        Raises:
            ValueError: You have not set an API key, set one
            TypeError: Your API key is not a string
        """
        # Raise Error if API key is not given, or wrong type
        if key is None:
            raise ValueError(
                "API key is not set, get an API-key from "
                + "https://developer.nytimes.com."
            )

        if not isinstance(key, str):
            raise TypeError("API key needs to be str")

        # Set API key
        self.key: str = key

    def __set_session(self, session: Optional[Session]):
        # Check if session is Session, add session to class so connection
        # can be reused
        self._local_session = False
        if session is None:
            self._local_session = True
            session = Session()

        # FIXME maybe also support other types of requests compatible
        # Session instances
        if not isinstance(session, Session):
            raise TypeError("Session needs to be a Session object")

        self.session = session

    def __set_parse_dates(self, parse_dates: bool):
        # Check if parse_dates is bool, if correct set parse_dates
        if not isinstance(parse_dates, bool):
            raise TypeError("parse_dates needs to be bool")

        self.parse_dates = parse_dates

    def __set_protocol(self, https: bool):
        # Define protocol to be used
        if not isinstance(https, bool):
            raise TypeError("https needs to be bool")

        if https:
            self.protocol = "https://"
        else:
            self.protocol = "http://"

    def __set_backoff(self, backoff: bool):
        # Set strategy to prevent HTTP 429 (Too Many Requests) errors
        if not isinstance(backoff, bool):
            raise TypeError("backoff needs to be bool")

        if backoff:
            # Any to remove errors from type checker
            # FIXME maybe set this as a constant
            backoff_strategy = Retry(
                total=MAX_RETRIES,
                backoff_factor=BACKOFF_FACTOR,
                status_forcelist=RETRY_STATUS_CODES,
            )

            adapter = HTTPAdapter(max_retries=backoff_strategy)

            self.session.mount(self.protocol + BASE_URL, adapter)

    def __set_user_agent(self, user_agent: Optional[str]):
        # Set header to show that this wrapper is used
        if user_agent is None:
            user_agent = f"{__title__}/{__version__}"

        if not isinstance(user_agent, str):
            raise TypeError("user_agent needs to be str")

        self.session.headers.update({"User-Agent": user_agent})

    def __enter__(self) -> NYTAPI:
        return self

    def __load_data(
        self,
        url: str,
        options: Optional[dict[str, Any]] = None,
        location: Optional[list[str]] = None,
    ) -> Union[list[dict[str, Any]], dict[str, Any]]:
        """This function loads the data for the wrapper for most API use cases"""
        # Set API key in query parameters
        params = {"api-key": self.key}

        # Add options to query parameters
        params.update(options or {})  # add empty list if None

        # Load the data from the API, raise error if there's an invalid status
        # code
        res = self.session.get(
            f"{self.protocol}{url}",
            params=params,
            timeout=TIMEOUT,
        )

        raise_for_status(res)
        parsed_res: dict[str, Any] = res.json()
        return get_from_location(parsed_res, location)

    def __parse_dates(
        self,
        articles: list[dict[str, str]],
        date_type: Literal["rfc3339", "date-only", "date-time"],
        locations: Optional[list] = None,
    ) -> list[dict[str, Any]]:
        """Parse dates to datetime"""
        # Don't parse if parse_dates is False
        return (
            parse_dates(articles, date_type, locations)
            if self.parse_dates
            else articles
        )

    def top_stories(self, section: str = "home") -> list[dict[str, Any]]:
        """Load Top Stories

        Args:
            section (str, optional): The section to load the top stories from.
            Defaults to "home".

        Raises:
            TypeError: Section can only be a string
            ValueError: A non-existant section is given

        Returns:
            list[dict[str, Any]]: Top stories metadata
        """
        # Raise error if section is not a str
        if not isinstance(section, str):
            raise TypeError("Section can only be a str")

        # Set the URL the data can be loaded from, and load the data
        url = f"{BASE_TOP_STORIES}{section}.json"

        try:
            result: list[dict[str, Any]] = self.__load_data(
                url
            )  # type:ignore
        # If 404 error throw invalid section name error
        except RuntimeError:
            raise ValueError("Invalid section name")

        # Parse dates from string to datetime.datetime
        # FIXME probably this should be a constant
        date_locations = ["updated_date", "created_date", "published_date"]
        parsed_result = self.__parse_dates(
            result, "rfc3339", date_locations
        )  # FIXME this could just be a direct return
        return parsed_result

    def most_viewed(
        self, days: Literal[1, 7, 30] = 1
    ) -> list[dict[str, Any]]:
        """Get most viewed articles

        Args:
            days (Literal[1, 7, 30], optional): Select the period of which you
            want to get the most viewed articles. Defaults to 1.

        Returns:
            list[dict[str, Any]]: Most viewed article metadata
        """
        most_viewed_check_values(days)

        # Load the data
        url = f"{BASE_MOST_POPULAR}viewed/{days}.json"
        result: list[dict[str, Any]] = self.__load_data(url)  # type:ignore

        # Parse the dates in the results
        parsed_result = self.__parse_dates(
            self.__parse_dates(result, "date-only", ["published_date"]),
            "date-time",
            ["updated"],
        )

        return parsed_result

    def most_shared(
        self,
        days: Literal[1, 7, 30] = 1,
        method: Literal["email", "facebook"] = "email",
    ) -> list[dict[str, Any]]:
        """Get most shared articles

        Args:
            days (Literal[1, 7, 30], optional): Period of the most shared
            articles. Defaults to 1.
            method (Literal["email, "facebook"], optional): Choose the source
            of shared articles. Defaults to "email".

        Returns:
            list[dict[str, Any]]: Most shared articles
        """
        most_shared_check_days(days)
        most_shared_check_method(method)

        # Set URL of data that needs to be loaded
        url = most_shared_get_url(BASE_MOST_POPULAR, method, days)

        # Load the data
        result: list[dict[str, Any]] = self.__load_data(url)  # type:ignore

        # Parse the date_strings into datetime.datetime
        parsed_result = self.__parse_dates(
            self.__parse_dates(result, "date-only", ["published_date"]),
            "date-time",
            ["updated"],
        )

        return parsed_result

    def book_reviews(
        self,
        author: Optional[str] = None,
        isbn: Optional[Union[str, int]] = None,
        title: Optional[str] = None,
    ) -> list[dict[str, Any]]:
        """Load book reviews

        Args:
            author (Optional[str], optional): Name of author. Defaults to None.
            isbn (Optional[Union[str, int]], optional): ISBN of book. Defaults to None.
            title (Optional[str], optional): Title of book. Defaults to None.

        Returns:
            list[dict[str, Any]]: Reviews of books
        """
        # Set request options params
        options = book_reviews_extract_options(author, isbn, title)

        # Check book review input
        book_reviews_check_input(author, isbn, title)

        # Set URL, load and return data
        result: list[dict[str, Any]] = self.__load_data(
            url=BASE_BOOK_REVIEWS, options=options
        )  # type:ignore

        parsed_result = self.__parse_dates(
            result, "date-only", ["publication_dt"]
        )
        return parsed_result

    def best_sellers_lists(self) -> list[dict[str, Any]]:
        """Get all the best sellers lists (not the contents of these lists,
        but just all the lists).

        Returns:
            list[dict[str, Any]]: Bestsellers lists
        """
        # Set URL, load and return data
        result = cast(
            list[dict[str, Any]],
            self.__load_data(url=BASE_BEST_SELLERS_LISTS),
        )

        parsed_result = self.__parse_dates(
            result,
            "date-only",
            ["oldest_published_date", "newest_published_date"],
        )
        return parsed_result

    def best_sellers_list(
        self,
        date: Optional[DateType] = None,
        name: str = "combined-print-and-e-book-fiction",
    ) -> list[dict[str, Any]]:
        """Load all books on a best sellers lists

        Args:
            date (Union[datetime.date, datetime.datetime, None], optional):
            The list closest to this date. If left empty loads most recent.
            Defaults to None.
            name (str, optional): Name of the list. Defaults to
            "combined-print-and-e-book-fiction".

        Raises:
            ValueError: List does not exist

        Returns:
            list[dict[str, Any]]: Books that are on the best sellers list
        """
        _date = best_sellers_parse_date(date)

        # Set URL and include data
        url = f"{BASE_BEST_SELLERS_LIST}{_date}/{name}.json"

        # Set location in JSON of results, load and return data
        try:
            result = cast(
                list[dict[str, Any]],
                self.__load_data(url, location=["results", "books"]),
            )
        except RuntimeError:
            raise ValueError("Best sellers list name is invalid")

        return result

    def __load_movie_reviews(
        self, max_results: int, params: dict[str, Any]
    ) -> list[dict[str, Any]]:
        # Set results list
        results: list[dict[str, Any]] = []

        requests_needed = math.ceil(max_results / RESULTS_MOVIE)
        for i in range(requests_needed):
            # Set offset for second request
            offset = i * RESULTS_MOVIE
            params["offset"] = str(offset)

            # Load the data from the API and raise if there's an Error
            res = cast(
                dict[str, Any],
                self.__load_data(
                    url=BASE_MOVIE_REVIEWS,
                    options=params,
                    location=[],
                ),
            )

            results += res.get("results")  # type:ignore

            # Quit loading more data if no more data is available
            if not res.get("has_more"):
                break

        return results

    def movie_reviews(
        self,
        keyword: Optional[str] = None,
        options: Optional[MovieReviewsOptions] = None,
        dates: Optional[dict[Literal["begin", "end"], DateType]] = None,
    ) -> list[dict[str, Any]]:
        """Load movie reviews

        Args:
            keyword (Optional[str], optional): Keyword to find the movie.
            Defaults to None.
            options (Optional[dict[str, Any]], optional): Options object
            where certain requirements can be set. Check for more
            https://github.com/michadenheijer/pynytimes. Defaults to None.
            dates (Optional[MovieReviewsDateType],
            optional):
            Dates between the review was written or movie was first shown.
            Defaults to None.

        Returns:
            list[dict[str, Any]]: Movie reviews
        """
        # Set options and dates if not defined
        _options = cast(dict[str, Any], options or {})
        dates = dates or {}

        # Check input types and values
        movie_reviews_check_input(keyword, _options, dates)
        params = movie_reviews_parse_dates(dates)
        movie_reviews_parse_params(params, keyword, _options)

        max_results = _options.get("max_results", RESULTS_MOVIE)
        results = self.__load_movie_reviews(max_results, params)

        # Parse and return the results
        # FIXME this part really seems unclear
        parsed_results = self.__parse_dates(
            self.__parse_dates(
                results, "date-only", ["publication_date", "opening_date"]
            ),
            "date-time",
            ["date_updated"],
        )

        return parsed_results

    def article_metadata(self, url: str) -> list[dict[str, Any]]:
        """Load metadata of an article by url

        Args:
            url (str): URL of an New York Times article

        Returns:
            list[dict[str, Any]]: List of article metadata
        """
        options = article_metadata_set_url(url)

        # Load, parse and return the data
        result = cast(
            list[dict[str, Any]],
            self.__load_data(url=BASE_META_DATA, options=options),
        )

        article_metadata_check_valid(result)

        # FIXME this looks like it should be a constant
        date_locations = [
            "updated_date",
            "created_date",
            "published_date",
            "first_published_date",
        ]
        parsed_result = self.__parse_dates(result, "rfc3339", date_locations)
        return parsed_result

    def section_list(self) -> list[dict[str, Any]]:
        """Load all list of all sections

        Returns:
            list[dict[str, Any]]: List of sections
        """
        # Set URL, load and return the data
        return cast(
            list[dict[str, Any]],
            self.__load_data(url=BASE_SECTION_LIST),
        )

    def latest_articles(
        self,
        source: Literal["all", "nyt", "inyt"] = "all",
        section: str = "all",
    ) -> list[dict[str, Any]]:
        """Load latest articles

        Args:
            source (Literal["all", "nyt", "inyt"], optional): Select sources to get all
            articles from. Defaults to "all".
            section (str, optional): Section to get all latest articles from.
            Defaults to "all".

        Raises:
            ValueError: Section is not a valid option

        Returns:
            list[dict[str, Any]]: List of metadata of latest articles
        """
        latest_articles_check_types(source, section)

        # Set URL, load and return data
        url = f"{BASE_LATEST_ARTICLES}{source}/{section}.json"
        try:
            result = cast(list[dict[str, Any]], self.__load_data(url))
        except RuntimeError:
            raise ValueError("Section is not a valid option")

        # FIXME looks like this should be a constant
        date_locations = [
            "updated_date",
            "created_date",
            "published_date",
            "first_published_date",
        ]
        parsed_result = self.__parse_dates(result, "rfc3339", date_locations)
        return parsed_result

    def tag_query(
        self,
        query: str,
        filter_option: Optional[dict[str, Any]] = None,
        filter_options: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> list[str]:
        """Load Times Tags

        Args:
            query (str): Search query to find a tag
            filter_option (Optional[dict[str, Any]], optional): Filter the tags.
            Defaults to None.
            filter_options (Optional[str], optional): Filter options. Defaults
            to None.
            max_results (Optional[int], optional): Maximum number of results.
            None means no limit. Defaults to None.

        Returns:
            list[str]: List of tags
        """
        # Raise error for TypeError
        tag_query_check_types(query, max_results)

        _filter_options = (
            tag_query_get_filter_options(filter_options) or filter_option
        )

        # Add options to request params
        options = {"query": query, "filter": _filter_options}

        # Define amount of results wanted
        if max_results is not None:
            options["max"] = str(max_results)

        # Set URL, load and return data
        # FIXME what is this, why is this?
        return self.__load_data(url=BASE_TAGS, options=options, location=[])[
            1
        ]  # type:ignore

    def archive_metadata(self, date: DateType) -> list[dict[str, Any]]:
        """Load all article metadata from the last month

        Args:
            date (Union[datetime.datetime, datetime.date]): The month of
            which you want to load all article metadata from

        Raises:
            TypeError: Date is not a datetime or date object

        Returns:
            list[dict[str, Any]]: List of article metadata
        """
        # Raise Error if date is not defined
        if not isinstance(date, (datetime.datetime, datetime.date)):
            raise TypeError("Date has to be datetime or date")

        # Set URL, load and return data
        url = f"{BASE_ARCHIVE_METADATA}{date.year}/{date.month}.json"

        # FIXME why not return immidiatly? what it is doing is also unclear
        parsed_result = self.__parse_dates(
            self.__load_data(  # type:ignore
                url, location=["response", "docs"]
            ),
            "rfc3339",
            ["pub_date"],
        )
        return parsed_result

    # FIXME should this not be in a helper function?
    def __article_search_load_data(
        self,
        results: int,
        options: dict[str, Any],
    ) -> list[dict[str, Any]]:
        result = []
        for i in range(math.ceil(results / RESULTS_SEARCH)):
            # Set page
            options["page"] = str(i)

            location = ["response"]
            # Load data and raise error if there's and error status
            res: dict[str, Any] = self.__load_data(  # type:ignore
                url=BASE_ARTICLE_SEARCH, options=options, location=location
            )

            # Parse results and append them to results list
            result += res.get("docs")  # type:ignore

            # Stop loading if all responses are already loaded
            if res.get("meta", {}).get("hits", 0) <= i * RESULTS_SEARCH:
                break

        return result

    # FIXME this appears to try to do to much
    def article_search(
        self,
        query: Optional[str] = None,
        dates: Optional[dict[Literal["begin", "end"], DateType]] = None,
        options: Optional[ArticleSearchOptions] = None,
        results: int = 10,
    ) -> list[dict[str, Any]]:
        """Search New York Times articles

        Args:
            query (Optional[str], optional): Search query. Defaults to None.
            dates (Optional[dict[Literal["begin", "end"], DateType]],
            optional): Dictionary with "begin" and "end" of search range.
            Defaults to None.
            options (Optional[ArticleSearchOptions], optional): Options for the
            search results.
            Defaults to None.
            results (int, optional): Load at most this many articles. Defaults to 10.

        Returns:
            list[dict[str, Any]]: Article metadata
        """
        # Set if None
        dates = dates or {}
        _options = cast(dict[str, Any], options or {})

        # Check if input is valid
        article_search_check_input(query, dates, _options, results)

        # Limit results loading to 2010
        results = min(results, 2010)

        # Resolve filter options into fq
        _options = article_search_parse_options(_options)

        # Parse dates into options
        # FIXME I really don't get this error
        begin_date, end_date = article_search_parse_dates(dates)
        _options["begin_date"] = begin_date
        _options["end_date"] = end_date

        # Set query if defined
        if query is not None:
            _options["q"] = query

        # Set result list and add request as much data as needed
        result = self.__article_search_load_data(results, _options)

        # Parse and return results
        parsed_result = self.__parse_dates(result, "rfc3339", ["pub_date"])
        return parsed_result

    # Allow the option to close the session
    def close(self) -> None:
        """Close session"""
        # Close session only if it exists
        if hasattr(self, "session"):
            self.session.close()

    # Close session before delete
    def __del__(self) -> None:
        """Close session on deletion"""
        if getattr(self, "_local_session", False):
            self.close()

    def __exit__(self, *args) -> None:
        """Close session on exit"""
        if getattr(self, "_local_session", False):
            self.close()