scrapenhl2/scrape/players.py
"""
This module contains methods related to individual player info.
"""
import functools
import json
import os.path
import urllib.request
from tqdm import tqdm
import feather
import pandas as pd
import scrapenhl2.scrape.general_helpers as helpers
import scrapenhl2.scrape.organization as organization
import scrapenhl2.scrape.schedules as schedules
import scrapenhl2.scrape.team_info as team_info
# In-memory copy of the player information dataframe; stays None until player_setup() assigns it.
_PLAYERS = None
# In-memory copy of the player log dataframe; stays None until player_setup() assigns it.
_PLAYER_LOG = None
def get_player_log_file():
    """
    Hands back the player log that is currently held in this module's memory.

    Nothing is read from disk here; the value is whatever player_setup() last stored.

    :return: dataframe, the player log (None if it has not been loaded yet)
    """
    cached_log = _PLAYER_LOG
    return cached_log
def _get_player_log_file():
    """
    Reads the player log from its feather file on disk.

    :return: dataframe read from the path given by get_player_log_filename()
    """
    log_path = get_player_log_filename()
    return feather.read_dataframe(log_path)
def get_player_ids_file():
    """
    Hands back the player information table that is currently held in this module's memory.

    Nothing is read from disk here; the value is whatever player_setup() last stored.

    :return: dataframe, the player information table (None if it has not been loaded yet)
    """
    cached_players = _PLAYERS
    return cached_players
def _get_player_ids_file():
    """
    Reads the player information table from its feather file on disk.

    :return: dataframe read from the path given by get_player_ids_filename()
    """
    ids_path = get_player_ids_filename()
    return feather.read_dataframe(ids_path)
def write_player_log_file(df):
    """
    Saves the given dataframe as the player log, dropping duplicate rows first.

    :param df: pandas dataframe, the player log to store
    :return: nothing
    """
    deduplicated = df.drop_duplicates()
    feather.write_dataframe(deduplicated, get_player_log_filename())
    # Reload from disk so the in-memory copies reflect what was just written
    player_setup()
def get_player_log_filename():
    """
    Builds the path of the player log file.

    :return: str, PLAYER_LOG.feather inside the folder given by organization.get_other_data_folder()
    """
    other_folder = organization.get_other_data_folder()
    return os.path.join(other_folder, 'PLAYER_LOG.feather')
def get_player_ids_filename():
    """
    Builds the path of the player information file.

    :return: str, PLAYER_INFO.feather inside the folder given by organization.get_other_data_folder()
    """
    other_folder = organization.get_other_data_folder()
    return os.path.join(other_folder, 'PLAYER_INFO.feather')
def check_default_player_id(playername):
    """
    Looks up the preferred ID for a name shared by more than one player.

    For example, "Mike Green" maps to 8471242 (WSH/DET) rather than 8468436.
    Ideally the lookup code improves so this table is no longer needed.

    :param playername: str
    :return: int, or None if the name has no default
    """
    # TODO gradually add to this
    default_ids = {
        'Mike Green': 8471242,
        'Francois Beauchemin': 8467400,
        'Erik Karlsson': 8474578,
        'Mike Hoffman': 8474884,
        'Tyler Johnson': 8474870,
        'Josh Anderson': 8476981,
        'Sebastian Aho': 8478427,
        'Trevor Lewis': 8473453,
        'Ryan Murphy': 8476465,
    }
    return helpers.try_to_access_dict(default_ids, playername)
def player_setup():
    """
    Loads the player information table and the player log into memory.

    Either file is generated first if it does not exist on disk yet.

    :return: nothing
    """
    global _PLAYERS, _PLAYER_LOG
    # Checked strictly in this order: ids file first, then the log file
    bootstrap_steps = ((get_player_ids_filename, generate_player_ids_file),
                       (get_player_log_filename, generate_player_log_file))
    for filename_getter, generator in bootstrap_steps:
        if not os.path.exists(filename_getter()):
            generator()
    _PLAYERS = _get_player_ids_file()
    _PLAYER_LOG = _get_player_log_file()
def rescrape_player(playerid):
    """
    If you notice that a player name, position, etc, is outdated, call this method on their ID. It will
    re-scrape their data from the NHL API, then reload the in-memory tables and clear cached lookups so the
    refreshed data is what later calls see.

    :param playerid: int, their ID. Also accepts str, their name.
    :return: nothing
    """
    resolved = player_as_id(playerid)
    if resolved is None:
        # player_as_id already printed why it could not resolve the input
        return
    # A name lookup can yield a numpy integer, which is not an instance of int; pass a list of plain ints
    # so update_player_ids_file does not have to recognise the scalar itself.
    update_player_ids_file([int(resolved)], True)

    # write_player_ids_file only touches disk, so reload the in-memory copies...
    player_setup()
    # ...and drop memoised answers that were computed from the outdated table.
    for cached_lookup in (player_as_id, player_as_str, get_player_position, get_player_handedness):
        cached_lookup.cache_clear()
def write_player_ids_file(df):
    """
    Saves the given dataframe as the player information file, dropping duplicate rows first.

    :param df: pandas dataframe, player ids file
    :return: nothing
    """
    deduplicated = df.drop_duplicates()
    target_path = get_player_ids_filename()
    feather.write_dataframe(deduplicated, target_path)
def get_player_url(playerid):
    """
    Builds the NHL API address of the page describing one player.

    :param playerid: int, the player ID
    :return: str, https://statsapi.web.nhl.com/api/v1/people/[playerid]
    """
    url_template = 'https://statsapi.web.nhl.com/api/v1/people/{0:s}'
    id_text = str(playerid)
    return url_template.format(id_text)
def update_player_ids_file(playerids, force_overwrite=False):
    """
    Adds these entries to player IDs file if need be.

    :param playerids: a list of IDs (ints or numeric strings). A single ID is accepted as well.
    :param force_overwrite: bool. If True, will re-scrape data for all player ids. If False, only new ones.
    :return: nothing
    """
    # In case we get just one ID (python int, numpy integer, or numeric string)
    if not pd.api.types.is_list_like(playerids):
        playerids = [playerids]

    # Work with plain ints throughout, and drop repeats while keeping first-seen order
    requested = list(dict.fromkeys(int(pid) for pid in playerids))
    if not requested:
        return

    current_players = get_player_ids_file()
    if force_overwrite:
        to_scrape = requested
        # Discard the stored rows for these players; the freshly scraped rows replace them below
        current_players = current_players[~current_players.ID.isin(requested)]
    else:
        # Pull only ones we don't have already
        known_ids = set(current_players.ID)
        to_scrape = [pid for pid in requested if pid not in known_ids]
    if not to_scrape:
        return

    scraped = [get_player_info_from_url(pid)
               for pid in tqdm(to_scrape, desc="Parsing players in play by play")]
    columns = ('ID', 'Name', 'DOB', 'Hand', 'Pos', 'Weight', 'Height', 'Nationality')
    df = pd.DataFrame({column: [info[column] for info in scraped] for column in columns})
    df['ID'] = pd.to_numeric(df.ID).astype(int)
    write_player_ids_file(pd.concat([df, current_players]))
def update_player_log_file(playerids, seasons, games, teams, statuses):
    """
    Updates the player log file with given players. The player log file notes which players played in which games
    and whether they were scratched or played.

    :param playerids: int or str or list of int
    :param seasons: int, the season, or list of int the same length as playerids
    :param games: int, the game, or list of int the same length as playerids
    :param teams: str or int, the team, or list of int the same length as playerids
    :param statuses: str, or list of str the same length as playerids
    :return: nothing
    """
    # Change everything to lists first if need be
    if isinstance(playerids, (int, str)):
        playerids = [player_as_id(playerids)]

    # Nothing to record (e.g. a team with no scratches). Returning here matters: when the log holds only
    # its seed row, the branch at the bottom writes the new frame on its own, so an empty frame would
    # replace the seed row and its column types.
    if len(playerids) == 0:
        return

    n_players = len(playerids)
    if helpers.check_number(seasons):
        seasons = [seasons] * n_players
    if helpers.check_number(games):
        games = [games] * n_players
    if helpers.check_types(teams):
        teams = [team_info.team_as_id(teams)] * n_players
    if isinstance(statuses, str):
        statuses = [statuses] * n_players

    df = pd.DataFrame({'ID': playerids,     # Player ID
                       'Team': teams,       # Team
                       'Status': statuses,  # P for played, S for scratch.
                       'Season': seasons,   # Season
                       'Game': games})      # Game

    existing_log = get_player_log_file()
    if len(existing_log) == 1:
        # In this case, the only entry is our original entry for Ovi, that sets the datatypes properly
        write_player_log_file(df)
    else:
        write_player_log_file(pd.concat([existing_log, df]))
@functools.lru_cache(maxsize=128, typed=False)
def get_player_position(player):
    """
    Looks up a player's position in the in-memory player information table.

    Results are memoised with an LRU cache keyed on the argument.

    :param player: str or int, the player name or ID
    :return: str, player position (e.g. C, D, R, L, G), or None unless exactly one row matches
    """
    players = get_player_ids_file()
    matches = players[players.ID == player_as_id(player)]
    if len(matches) != 1:
        print('Could not find position for', player)
        return None
    return matches.Pos.iloc[0]
@functools.lru_cache(maxsize=128, typed=False)
def get_player_handedness(player):
    """
    Looks up a player's handedness in the in-memory player information table.

    Results are memoised with an LRU cache keyed on the argument.

    :param player: str or int, the player name or ID
    :return: str, player hand (L or R), or None unless exactly one row matches
    """
    players = get_player_ids_file()
    matches = players[players.ID == player_as_id(player)]
    if len(matches) != 1:
        print('Could not find hand for', player)
        return None
    return matches.Hand.iloc[0]
@functools.lru_cache(maxsize=128, typed=False)
def player_as_id(playername, filterids=None, dob=None):
    """
    A helper method. If player entered is a number, returns that as int. If player is str, returns integer id of
    that player.

    Name lookups are tried in this order: exact name, literal substring of the name, then fuzzy matching.
    When several players share the exact name, the entry from check_default_player_id is used if there is one;
    otherwise the first match is returned and the candidates are printed.

    :param playername: int, or str, the player whose ID you want to retrieve
    :param filterids: a tuple of players to choose from. Needs to be tuple else caching won't work.
    :param dob: yyyy-mm-dd, use to help when multiple players have the same name
    :return: int, the player ID, or None if playername is neither a number nor a str
    """
    # Numbers are already IDs; no table lookup needed
    if helpers.check_number(playername):
        return int(playername)
    if not isinstance(playername, str):
        # '!s' rather than ':s': a type object rejects the 's' format spec with a TypeError
        print('Specified wrong type for player: {0!s}'.format(type(playername)))
        return None

    filterdf = get_player_ids_file()
    if filterids is not None:
        filterdf = filterdf.merge(pd.DataFrame({'ID': filterids}), how='inner', on='ID')
    pids = filterdf
    if dob is not None:
        pids = pids[pids.DOB == dob]

    hint = 'You can specify a tuple of acceptable IDs to scrapenhl2.scrape.players.player_as_id'

    # Boolean masks instead of query() strings, so quotes inside a name cannot break the expression
    exact = pids[pids.Name == playername]
    if len(exact) == 1:
        return int(exact.ID.iloc[0])
    if len(exact) > 1:
        default = check_default_player_id(playername)
        if default is None:
            print('Multiple results when searching for {0:s}; returning first result'.format(playername))
            print(hint)
            print(exact.to_string())
            return int(exact.ID.iloc[0])
        print('Multiple results when searching for {0:s}; returning default'.format(playername))
        print(hint)
        print(exact.to_string())
        return default

    # No exact match: try a literal substring match (no regex; rows with a missing name never match)
    partial = pids[pids.Name.str.contains(playername, regex=False, na=False)]
    if len(partial) == 0:
        # Last resort: fuzzy matching, then resolve the matched name within the same ID filter
        name = helpers.fuzzy_match_player(playername, pids.Name)
        return player_as_id(name, tuple(filterdf.ID.values))
    if len(partial) > 1:
        print('Multiple results when searching for {0:s}; returning first result'.format(playername))
        print(hint)
        print(partial.to_string())
    return int(partial.ID.iloc[0])
def playerlst_as_str(players, filterdf=None):
    """
    List-oriented counterpart of player_as_str, with less protection against bad input.

    Input that is already text is returned untouched; numeric IDs are left-joined onto the lookup table
    and the Name column of the result is returned.

    :param players: a list of int, or str, players whose names you want to retrieve
    :param filterdf: df, a dataframe of players to choose from. Defaults to all.
    :return: a pandas Series of names
    """
    lookup = get_player_ids_file() if filterdf is None else filterdf
    requested = pd.DataFrame({'ID': players})
    id_dtype = requested.ID.dtype
    if id_dtype == 'str' or id_dtype == 'O':
        return requested.ID
    joined = requested.merge(lookup, how='left', on='ID')
    return joined.Name
def playerlst_as_id(playerlst, exact=False, filterdf=None):
    """
    Similar to player_as_id, but less robust against errors, and works on a list of players.

    :param playerlst: a list of int, or str, players whose IDs you want to retrieve.
    :param exact: bool. If True, looks for exact matches. If False, does not, using player_as_id (but will be slower)
    :param filterdf: df, a dataframe of players to choose from (needs ID and Name columns). Defaults to all.
    :return: a pandas Series of int/float
    """
    if filterdf is None:
        filterdf = get_player_ids_file()
    df = pd.DataFrame({'Name': playerlst})

    # Non-text input is taken to be IDs already
    if not (df.Name.dtype == 'str' or df.Name.dtype == 'O'):
        return df.Name

    if exact is True:
        # The lookup table's identifier column is 'ID', as used by every other lookup in this module
        return df.merge(filterdf, on='Name', how='left').ID

    # Build the filter tuple once rather than once per row; player_as_id needs a tuple for its cache
    allowed_ids = tuple(filterdf.ID.values)
    df['ID'] = df.Name.apply(lambda name: player_as_id(name, allowed_ids))
    return df.ID
@functools.lru_cache(maxsize=128, typed=False)
def player_as_str(playerid, filterids=None):
    """
    A helper method. If player is a number, returns string name of that player. Else returns standardized name.

    :param playerid: int, or str, player whose name you want to retrieve
    :param filterids: a tuple of players to choose from. Needs to be tuple else caching won't work.
        Probably not needed but you can use this method to go from part of the name to full name, in which case
        it may be helpful.
    :return: str, the player name, or None if it cannot be determined
    """
    if isinstance(playerid, str):
        # Resolve the (possibly partial) name to an ID within the same filter, then look its name up
        realid = player_as_id(playerid, filterids)
        if realid is None:
            return None
        return player_as_str(realid, filterids)

    if not helpers.check_number(playerid):
        # '!s' rather than ':d': a type object cannot be formatted as an integer
        print('Specified wrong type for player: {0!s}'.format(type(playerid)))
        return None

    # Normalise once so float IDs compare and format like ints
    player = int(playerid)
    filterdf = get_player_ids_file()
    if filterids is not None:
        filterdf = filterdf.merge(pd.DataFrame({'ID': filterids}), how='inner', on='ID')

    df = filterdf[filterdf.ID == player]
    if len(df) == 0:
        print('Could not find name for {0:d}'.format(player))
        return None
    if len(df) > 1:
        print('Multiple results when searching for {0:d}; returning first result'.format(player))
        print(df.to_string())
    return df.Name.iloc[0]
def get_player_info_from_url(playerid):
    """
    Fetches one player's page from the NHL API and extracts ID, Name, Hand, Pos, DOB, Height, Weight,
    and Nationality.

    :param playerid: int, the player id
    :return: dict with player ID, name, handedness, position, etc
    """
    raw_page = helpers.try_url_n_times(get_player_url(playerid))
    parsed = json.loads(raw_page)

    # Each entry is the key path to follow inside the parsed page
    key_paths = {'ID': ['people', 0, 'id'],
                 'Name': ['people', 0, 'fullName'],
                 'Hand': ['people', 0, 'shootsCatches'],
                 'Pos': ['people', 0, 'primaryPosition', 'code'],
                 'DOB': ['people', 0, 'birthDate'],
                 'Height': ['people', 0, 'height'],
                 'Weight': ['people', 0, 'weight'],
                 'Nationality': ['people', 0, 'nationality']}
    info = {field: helpers.try_to_access_dict(parsed, *path) for field, path in key_paths.items()}

    # Remove the space in the middle of height
    height = info['Height']
    if height is not None:
        info['Height'] = height.replace(' ', '')
    return info
def generate_player_ids_file():
    """
    Creates the player information file with a single starter row (Alex Ovechkin), then loads it.

    Columns written:

    - ID: int, player ID
    - Name: str, player name
    - DOB: str, date of birth
    - Hand: char, R or L
    - Pos: char, one of C/R/L/D/G
    - Height: str
    - Weight: int
    - Nationality: str

    :return: nothing
    """
    starter_row = {'ID': 8471214,
                   'Name': 'Alex Ovechkin',
                   'DOB': '1985-09-17',
                   'Hand': 'R',
                   'Pos': 'L',
                   'Height': "6'3\"",
                   'Weight': 235,
                   'Nationality': 'RUS'}
    # One-element lists give a one-row frame with the same columns
    df = pd.DataFrame({column: [value] for column, value in starter_row.items()})
    write_player_ids_file(df)
    player_setup()
def generate_player_log_file():
    """
    Creates the player log with a single starter row so the column datatypes are set: Alex Ovechkin in
    Game 1 vs Pittsburgh in 2016-2017. Meant for when no player log file exists yet.

    :return: nothing
    """
    starter_row = {'ID': 8471214,   # Player ID (Ovi)
                   'Team': 15,      # Team (WSH)
                   'Status': 'P',   # P for played, S for scratch.  # TODO can I do healthy vs injured?
                   'Season': 2016,  # Season (2016-17)
                   'Game': 30221}   # Game (G1 vs PIT)
    df = pd.DataFrame({column: [value] for column, value in starter_row.items()})
    # NOTE: an existing log is overwritten without any warning; a warning was once sketched here
    # but never enabled.
    write_player_log_file(df)
def update_player_ids_from_page(pbp):
    """
    Collects every player listed in a game's raw play by play and adds any that are missing to the
    player IDs file.

    :param pbp: json, the raw pbp
    :return: nothing
    """
    roster = pbp['gameData']['players']  # sub-dictionary keyed by player
    # keys are format "ID[PlayerID]"; strip the two-character prefix to leave the PlayerID part
    player_ids = []
    for roster_key in roster:
        player_ids.append(roster_key[2:])
    update_player_ids_file(player_ids)
def update_player_logs_from_page(pbp, season, game):
    """
    Takes the game play by play and adds players to the master player log file, noting that they were on the roster
    for this game, which team they played for, and their status (P for played, S for scratch).

    A boxscore section that is absent from the pbp is treated as empty instead of raising.

    :param pbp: json, the pbp of the game
    :param season: int, the season
    :param game: int, the game
    :return: nothing
    """
    # Get home and road names
    gameinfo = schedules.get_game_data_from_schedule(season, game)

    # Boxscore side -> key of that team in the schedule data; home is logged before road
    for side, team_key in (('home', 'Home'), ('away', 'Road')):
        # Get players who played, and scratches, from boxscore
        played = helpers.try_to_access_dict(pbp, 'liveData', 'boxscore', 'teams', side, 'players')
        scratches = helpers.try_to_access_dict(pbp, 'liveData', 'boxscore', 'teams', side, 'scratches')
        # The helper is relied on elsewhere in this module to give None for a missing key
        if played is None:
            played = {}
        if scratches is None:
            scratches = []

        # Played is a dict keyed "ID[PlayerID]", so turn it into a list of int IDs
        played_ids = [int(pid[2:]) for pid in played]
        # Played may include scratches, so make sure to remove them
        played_ids = list(set(played_ids).difference(scratches))

        # Update player logs
        update_player_log_file(played_ids, season, game, gameinfo[team_key], 'P')
        update_player_log_file(scratches, season, game, gameinfo[team_key], 'S')

    # TODO: One issue is we do not see goalies (and maybe skaters) who dressed but did not play. How can this be fixed?
    player_setup()