# scrapenhl2/scrape/scrape_pbp.py
"""
This module contains methods for scraping pbp.
"""
import json
import os.path
import urllib.request
import zlib
from time import sleep
from scrapenhl2.scrape import organization, schedules, general_helpers as helpers, manipulate_schedules, parse_pbp
def scrape_game_pbp_from_html(season, game, force_overwrite=True):
    """
    Scrapes the html pbp for the given game and saves it to disk. Use for live games.

    :param season: int, the season
    :param game: int, the game
    :param force_overwrite: bool. If False and the html file already exists, the game is not scraped again

    :return: bool, False if not scraped, else True
    """
    filename = get_game_pbplog_filename(season, game)
    if not force_overwrite and os.path.exists(filename):
        return False

    # Fetch the HTML report page; get_game_from_url would return the JSON API feed instead,
    # which is not what gets stored in the .html file.
    page = helpers.try_url_n_times(get_game_pbplog_url(season, game))
    save_raw_html_pbp(page, season, game)
    sleep(1)  # Don't want to overload NHL servers

    # Parsing is deliberately left to a later step rather than done with the page in memory.
    return True
def scrape_game_pbp(season, game, force_overwrite=False):
    """
    Downloads the pbp feed for one game and stores it on disk.

    :param season: int, the season
    :param game: int, the game
    :param force_overwrite: bool. When False, a game whose raw pbp file already exists is skipped

    :return: bool, True if the game was scraped, False if it was skipped
    """
    target = get_game_raw_pbp_filename(season, game)

    if force_overwrite or not os.path.exists(target):
        raw_page = get_game_from_url(season, game)
        save_raw_pbp(raw_page, season, game)
        # Pause between requests so the NHL servers are not overloaded
        sleep(1)
        # Parsing happens in a separate step, not while the page is still in memory
        return True

    return False
def save_raw_html_pbp(page, season, game):
    """
    Takes the page containing html pbp information and saves it to disk as-is.

    :param page: str, the html page (the file is opened in text mode, so a str is required)
    :param season: int, the season
    :param game: int, the game

    :return: nothing
    """
    filename = get_game_pbplog_filename(season, game)
    # The context manager guarantees the handle is closed even if the write fails
    with open(filename, 'w') as writer:
        writer.write(page)
def save_raw_pbp(page, season, game):
    """
    Takes the page containing pbp information and saves it to disk, zlib-compressed.

    :param page: str, the json pbp as text. It is encoded as latin-1 before compression
    :param season: int, the season
    :param game: int, the game

    :return: nothing
    """
    # Pass the compression level positionally: the `level` keyword only exists from Python 3.6,
    # while the positional form works on every version. This also avoids a try/except TypeError
    # that could mask unrelated errors.
    compressed = zlib.compress(page.encode('latin-1'), 9)
    filename = get_game_raw_pbp_filename(season, game)
    # The context manager guarantees the handle is closed even if the write fails
    with open(filename, 'wb') as writer:
        writer.write(compressed)
def get_raw_pbp(season, game):
    """
    Reads this game's compressed play by play file from disk and parses it as json.

    :param season: int, the season
    :param game: int, the game

    :return: json, the json pbp
    """
    path = get_game_raw_pbp_filename(season, game)
    with open(path, 'rb') as compressed_file:
        compressed = compressed_file.read()

    # Files are stored as zlib-compressed latin-1 text
    text = zlib.decompress(compressed).decode('latin-1')
    return json.loads(text)
def get_raw_html_pbp(season, game):
    """
    Reads this game's html play by play file from disk.

    :param season: int, the season
    :param game: int, the game

    :return: str, the html pbp
    """
    path = get_game_pbplog_filename(season, game)
    with open(path, 'r') as html_file:
        return html_file.read()
def get_game_from_url(season, game):
    """
    Fetches the NHL API page holding information for the specified game.

    :param season: int, the season
    :param game: int, the game

    :return: str, the page at the url
    """
    url = get_game_url(season, game)
    return helpers.try_url_n_times(url)
def get_game_pbplog_url(season, game):
    """
    Builds the url of the HTML-table play by play report for the specified game.

    :param season: int, the season
    :param game: int, the game

    :return : str, e.g. http://www.nhl.com/scores/htmlreports/20072008/PL020001.HTM
    """
    # Reports live in a folder named after both years of the season, e.g. 20072008
    season_span = '{0:d}{1:d}'.format(season, season + 1)
    report_name = 'PL0{0:d}.HTM'.format(game)
    return 'http://www.nhl.com/scores/htmlreports/{0:s}/{1:s}'.format(season_span, report_name)
def get_game_url(season, game):
    """
    Builds the NHL API live-feed url for the specified game.

    :param season: int, the season
    :param game: int, the game

    :return: str, https://statsapi.web.nhl.com/api/v1/game/[season]0[game]/feed/live
    """
    # The API game id is the season, a literal 0, then the game number
    game_id = '{0:d}0{1:d}'.format(season, game)
    return 'https://statsapi.web.nhl.com/api/v1/game/{0:s}/feed/live'.format(game_id)
def get_game_raw_pbp_filename(season, game):
    """
    Returns the path of the compressed raw pbp file for a game.

    :param season: int, current season
    :param game: int, game

    :return: str, /scrape/data/raw/pbp/[season]/[game].zlib
    """
    folder = organization.get_season_raw_pbp_folder(season)
    basename = ''.join((str(game), '.zlib'))
    return os.path.join(folder, basename)
def get_game_pbplog_filename(season, game):
    """
    Returns the path of the html pbp file for a game (stored in the season's raw pbp folder).

    :param season: int, current season
    :param game: int, game

    :return: str, /scrape/data/raw/pbp/[season]/[game].html
    """
    folder = organization.get_season_raw_pbp_folder(season)
    basename = ''.join((str(game), '.html'))
    return os.path.join(folder, basename)
def scrape_season_pbp(season, force_overwrite=False):
    """
    Scrapes and parses pbp for every game marked "Final" in the given season's schedule.

    A failure on one game is reported and does not stop the remaining games from being processed.

    :param season: int, the season. If None, the current season is used
    :param force_overwrite: bool. If true, rescrapes all games. If false, only previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()

    # Progress checkpoints; the first element of each entry is compared against the loop index below
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_pbp(season, game, force_overwrite)
            manipulate_schedules.update_schedule_with_pbp_scrape(season, game)
            parse_pbp.parse_game_pbp(season, game, True)
        except Exception as e:
            # Best-effort: keep going with the other games, but do not hide the failure
            print('Error scraping pbp for {0} {1}: {2}'.format(season, game, e))

        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                season, game, round(intervals[interval_j][0] / len(games) * 100)))
            interval_j += 1
def scrape_pbp_setup():
    """
    Makes sure a raw pbp folder exists for every season from 2005 through the current one.

    :return: nothing
    """
    first_season = 2005
    last_season = schedules.get_current_season()
    for yr in range(first_season, last_season + 1):
        folder = organization.get_season_raw_pbp_folder(yr)
        organization.check_create_folder(folder)
# Runs at import time so the raw pbp folders are checked/created before any function in this module is used.
scrape_pbp_setup()