muneebalam/scrapenhl2

View on GitHub
scrapenhl2/scrape/schedules.py

Summary

Maintainability
C
1 day
Test Coverage
"""
This module contains methods related to season schedules.
"""

import arrow
import datetime
import functools
import json
import os.path
import urllib.request

import feather
import pandas as pd

import scrapenhl2.scrape.general_helpers as helpers
import scrapenhl2.scrape.organization as organization
import scrapenhl2.scrape.team_info as team_info


def _get_current_season():
    """
    Runs at import only. Sets current season as today's year minus 1, or today's year if it's September or later

    :return: int, current season
    """
    date = arrow.now()
    season = date.year - 1
    if date.month >= 9:
        season += 1
    return season


def get_current_season():
    """
    Returns the current season.

    :return: The current season variable (generated at import from _get_current_season)
    """
    return _CURRENT_SEASON


def get_season_schedule_filename(season):
    """
    Gets the filename for the season's schedule file

    :param season: int, the season

    :return: str, /scrape/data/other/[season]_schedule.feather
    """
    return os.path.join(organization.get_other_data_folder(), '{0:d}_schedule.feather'.format(season))


def get_season_schedule(season):
    """
    Gets the the season's schedule file from memory.

    :param season: int, the season

    :return: dataframe (originally from /scrape/data/other/[season]_schedule.feather)
    """
    return _SCHEDULES[season]


def get_team_schedule(season=None, team=None, startdate=None, enddate=None):
    """
    Gets the schedule for given team in given season. Or if startdate and enddate are specified, searches between
    those dates. If season and startdate (and/or enddate) are specified, searches that season between those dates.

    :param season: int, the season
    :param team: int or str, the team
    :param startdate: str, YYYY-MM-DD
    :param enddate: str, YYYY-MM-DD

    :return: dataframe
    """
    # TODO handle case when only team and startdate, or only team and enddate, are given
    if season is not None:
        df = get_season_schedule(season).query('Status != "Scheduled"')
        if startdate is not None:
            df = df.query('Date >= "{0:s}"'.format(startdate))
        if enddate is not None:
            df = df.query('Date <= "{0:s}"'.format(enddate))
        tid = team_info.team_as_id(team)
        return df[(df.Home == tid) | (df.Road == tid)]
    if startdate is not None and enddate is not None:
        dflst = []
        startseason = helpers.infer_season_from_date(startdate)
        endseason = helpers.infer_season_from_date(enddate)
        for season in range(startseason, endseason + 1):
            df = get_team_schedule(season, team) \
                .query('Status != "Scheduled"') \
                .assign(Season=season)
            if season == startseason:
                df = df.query('Date >= "{0:s}"'.format(startdate))
            if season == endseason:
                df = df.query('Date <= "{0:s}"'.format(enddate))
            dflst.append(df)
        df = pd.concat(dflst)
        return df


def get_team_games(season=None, team=None, startdate=None, enddate=None):
    """
    Returns list of games played by team in season.

    Just calls get_team_schedule with the provided arguments, returning the series of games from that dataframe.

    :param season: int, the season
    :param team: int or str, the team
    :param startdate: str or None
    :param enddate: str or None

    :return: series of games
    """
    return get_team_schedule(season, team, startdate, enddate).Game


def _get_season_schedule(season):
    """
    Gets the the season's schedule file. Stored as a feather file for fast read/write

    :param season: int, the season

    :return: dataframe from /scrape/data/other/[season]_schedule.feather
    """
    return feather.read_dataframe(get_season_schedule_filename(season))


def write_season_schedule(df, season, force_overwrite):
    """
    A helper method that writes the season schedule file to disk (in feather format for fast read/write)

    :param df: the season schedule datafraome
    :param season: the season
    :param force_overwrite: bool. If True, overwrites entire file. If False, only redoes when not Final previously.

    :return: Nothing
    """
    if force_overwrite:  # Easy--just write it
        feather.write_dataframe(df, get_season_schedule_filename(season))
    else:  # Only write new games/previously unfinished games
        olddf = get_season_schedule(season)
        olddf = olddf.query('Status != "Final"')

        # TODO: Maybe in the future set status for games partially scraped as "partial" or something

        game_diff = set(df.Game).difference(olddf.Game)
        where_diff = df.Key.isin(game_diff)
        newdf = pd.concat(olddf, df[where_diff], ignore_index=True)

        feather.write_dataframe(newdf, get_season_schedule_filename(season))
    schedule_setup()


def clear_caches():
    """
    Clears caches for methods in this module.
    :return:
    """
    get_game_data_from_schedule.cache_clear()


@functools.lru_cache(maxsize=1024, typed=False)
def get_game_data_from_schedule(season, game):
    """
    This is a helper method that uses the schedule file to isolate information for current game
    (e.g. teams involved, coaches, venue, score, etc.)

    :param season: int, the season
    :param game: int, the game

    :return: dict of game data
    """

    schedule_item = get_season_schedule(season).query('Game == {0:d}'.format(game)).to_dict(orient='series')
    # The output format of above was {colname: np.array[vals]}. Change to {colname: val}
    schedule_item = {k: v.values[0] for k, v in schedule_item.items()}
    return schedule_item


def get_game_date(season, game):
    """
    Returns the date of this game

    :param season: int, the game
    :param game: int, the season

    :return: str
    """
    return get_game_data_from_schedule(season, game)['Date']


def get_home_team(season, game, returntype='id'):
    """
    Returns the home team from this game

    :param season: int, the game
    :param game: int, the season
    :param returntype: str, 'id' or 'name'

    :return: float or str, depending on returntype
    """
    home = get_game_data_from_schedule(season, game)['Home']
    if returntype.lower() == 'id':
        return team_info.team_as_id(home)
    else:
        return team_info.team_as_str(home)


def get_road_team(season, game, returntype='id'):
    """
    Returns the road team from this game

    :param season: int, the game
    :param game: int, the season
    :param returntype: str, 'id' or 'name'

    :return: float or str, depending on returntype
    """
    road = get_game_data_from_schedule(season, game)['Road']
    if returntype.lower() == 'id':
        return team_info.team_as_id(road)
    else:
        return team_info.team_as_str(road)


def get_home_score(season, game):
    """
    Returns the home score from this game

    :param season: int, the season
    :param game: int, the game

    :return: int, the score
    """
    return int(get_game_data_from_schedule(season, game)['HomeScore'])


def get_road_score(season, game):
    """
    Returns the road score from this game

    :param season: int, the season
    :param game: int, the game

    :return: int, the score
    """
    return int(get_game_data_from_schedule(season, game)['RoadScore'])


def get_game_status(season, game):
    """
    Returns the status of this game (e.g. Final, In Progress)

    :param season: int, the season
    :param game: int, the game

    :return: int, the score
    """
    return get_game_data_from_schedule(season, game)['Status']


def get_game_result(season, game):
    """
    Returns the result of this game for home team (e.g. W, SOL)

    :param season: int, the season
    :param game: int, the game

    :return: int, the score
    """
    return get_game_data_from_schedule(season, game)['Result']


def get_season_schedule_url(season):
    """
    Gets the url for a page containing all of this season's games (Sep 1 to Jun 26) from NHL API.

    :param season: int, the season

    :return: str, https://statsapi.web.nhl.com/api/v1/schedule?startDate=[season]-09-01&endDate=[season+1]-06-25
    """
    return 'https://statsapi.web.nhl.com/api/v1/schedule?startDate=' \
           '{0:d}-09-01&endDate={1:d}-06-25'.format(season, season + 1)


def get_teams_in_season(season):
    """
    Returns all teams that have a game in the schedule for this season

    :param season: int, the season

    :return: set of team IDs
    """

    sch = get_season_schedule(season)
    allteams = set(sch.Road).union(sch.Home)
    return set(allteams)


def check_valid_game(season, game):
    """
    Checks if gameid in season schedule.

    :param season: int, season
    :param game: int, game
    :return: bool
    """

    try:
        get_game_status(season, game)
        return True
    except IndexError:
        return False


def schedule_setup():
    """
    Reads current season and schedules into memory.

    :return: nothing
    """
    clear_caches()
    global _SCHEDULES, _CURRENT_SEASON
    _CURRENT_SEASON = _get_current_season()
    for season in range(2005, get_current_season() + 1):
        if not os.path.exists(get_season_schedule_filename(season)):
            generate_season_schedule_file(season)  # season schedule
            # There is a potential issue here for current season.
            # For current season, we'll update this as we go along.
            # But original creation first time you start up in a new season is automatic, here.
            # When we autoupdate season date, we need to make sure to re-access this file and add in new entries
    _SCHEDULES = {season: _get_season_schedule(season) for season in range(2005, _CURRENT_SEASON + 1)}


def generate_season_schedule_file(season, force_overwrite=True):
    """
    Reads season schedule from NHL API and writes to file.

    The output contains the following columns:

    - Season: int, the season
    - Date: str, the dates
    - Game: int, the game id
    - Type: str, the game type (for preseason vs regular season, etc)
    - Status: str, e.g. Final
    - Road: int, the road team ID
    - RoadScore: int, number of road team goals
    - RoadCoach str, 'N/A' when this function is run (edited later with road coach name)
    - Home: int, the home team ID
    - HomeScore: int, number of home team goals
    - HomeCoach: str, 'N/A' when this function is run (edited later with home coach name)
    - Venue: str, the name of the arena
    - Result: str, 'N/A' when this function is run (edited accordingly later from PoV of home team: W, OTW, SOL, etc)
    - PBPStatus: str, 'Not scraped' when this function is run (edited accordingly later)
    - TOIStatus: str, 'Not scraped' when this function is run (edited accordingly later)

    :param season: int, the season

    :param force_overwrite: bool. If True, generates entire file from scratch.
        If False, only redoes when not Final previously.

    :return: Nothing
    """
    page = helpers.try_url_n_times(get_season_schedule_url(season))

    page2 = json.loads(page)
    df = _create_schedule_dataframe_from_json(page2)
    df.loc[:, 'Season'] = season

    # Last step: we fill in some info from the pbp. If current schedule already exists, fill in that info.
    df = _fill_in_schedule_from_pbp(df, season)
    write_season_schedule(df, season, force_overwrite)
    clear_caches()


def _create_schedule_dataframe_from_json(jsondict):
    """
    Reads game, game type, status, visitor ID, home ID, visitor score, and home score for each game in this dict

    :param jsondict: a dictionary formed from season schedule json

    :return: pandas dataframe
    """
    dates = []
    games = []
    gametypes = []
    statuses = []
    vids = []
    vscores = []
    hids = []
    hscores = []
    venues = []
    for datejson in jsondict['dates']:
        try:
            date = datejson.get('date', None)
            for gamejson in datejson['games']:
                game = int(str(helpers.try_to_access_dict(gamejson, 'gamePk'))[-5:])
                gametype = helpers.try_to_access_dict(gamejson, 'gameType')
                status = helpers.try_to_access_dict(gamejson, 'status', 'detailedState')
                vid = helpers.try_to_access_dict(gamejson, 'teams', 'away', 'team', 'id')
                vscore = int(helpers.try_to_access_dict(gamejson, 'teams', 'away', 'score'))
                hid = helpers.try_to_access_dict(gamejson, 'teams', 'home', 'team', 'id')
                hscore = int(helpers.try_to_access_dict(gamejson, 'teams', 'home', 'score'))
                venue = helpers.try_to_access_dict(gamejson, 'venue', 'name')

                dates.append(date)
                games.append(game)
                gametypes.append(gametype)
                statuses.append(status)
                vids.append(vid)
                vscores.append(vscore)
                hids.append(hid)
                hscores.append(hscore)
                venues.append(venue)
        except KeyError:
            pass
    df = pd.DataFrame({'Date': dates,
                       'Game': games,
                       'Type': gametypes,
                       'Status': statuses,
                       'Road': vids,
                       'RoadScore': vscores,
                       'Home': hids,
                       'HomeScore': hscores,
                       'Venue': venues}).sort_values('Game')
    return df


def _fill_in_schedule_from_pbp(df, season):
    """
    Fills in columns for coaches, result, pbp status, and toi status as N/A, not scraped, etc.
    Use methods prefixed with update_schedule to actually fill in with correct values.

    :param df: dataframe, season schedule dataframe as created by _create_schedule_dataframe_from_json
    :param season: int, the season

    :return: df, with coaches, result, and status filled in
    """

    if os.path.exists(get_season_schedule_filename(season)):
        # only final games--this way pbp status and toistatus will be ok.
        cur_season = get_season_schedule(season).query('Status == "Final"')
        cur_season = cur_season[['Season', 'Game', 'HomeCoach', 'RoadCoach', 'Result', 'PBPStatus', 'TOIStatus']]
        df = df.merge(cur_season, how='left', on=['Season', 'Game'])

        # Fill in NAs
        df.loc[:, 'Season'] = df.Season.fillna(season)
        df.loc[:, 'HomeCoach'] = df.HomeCoach.fillna('N/A')
        df.loc[:, 'RoadCoach'] = df.RoadCoach.fillna('N/A')
        df.loc[:, 'Result'] = df.Result.fillna('N/A')
        df.loc[:, 'PBPStatus'] = df.PBPStatus.fillna('Not scraped')
        df.loc[:, 'TOIStatus'] = df.TOIStatus.fillna('Not scraped')
    else:
        df.loc[:, 'HomeCoach'] = 'N/A'  # Tried to set this to None earlier, but Arrow couldn't handle it, so 'N/A'
        df.loc[:, 'RoadCoach'] = 'N/A'
        df.loc[:, 'Result'] = 'N/A'
        df.loc[:, 'PBPStatus'] = 'Not scraped'
        df.loc[:, 'TOIStatus'] = 'Not scraped'
    return df


def attach_game_dates_to_dateframe(df):
    """
    Takes dataframe with Season and Game columns and adds a Date column (for that game)

    :param df: dataframe

    :return: dataframe with one more column
    """
    dflst = []
    for season in df.Season.unique():
        temp = df.query("Season == {0:d}".format(int(season))) \
            .merge(get_season_schedule(season)[['Game', 'Date']], how='left', on='Game')
        dflst.append(temp)
    df2 = pd.concat(dflst)
    return df2


_CURRENT_SEASON = None
_SCHEDULES = None
schedule_setup()