# scrapenhl2/scrape/scrape_toi.py
"""
This module contains methods for scraping TOI.
"""
import json
import os.path
import urllib.request
import zlib
from time import sleep
from scrapenhl2.scrape import organization, schedules, manipulate_schedules, general_helpers as helpers, parse_toi
def scrape_game_toi(season, game, force_overwrite=False):
    """
    Downloads the shift (toi) page for one game and saves it to disk.

    :param season: int, the season
    :param game: int, the game
    :param force_overwrite: bool. If False and the raw toi file already exists, the game is not scraped again

    :return: bool, True if the game was scraped, False if it was skipped because the file already exists
    """
    target = get_game_raw_toi_filename(season, game)
    if force_overwrite or not os.path.exists(target):
        raw_page = helpers.try_url_n_times(get_shift_url(season, game))
        save_raw_toi(raw_page, season, game)
        # Pause between requests so the NHL servers are not overloaded
        sleep(1)
        # Parsing is left to a later step rather than done here with the page still in memory
        return True
    return False
def get_home_shiftlog_filename(season, game):
    """
    Builds the path of the html file holding the home team's shift log for a game.

    :param season: int, the season
    :param game: int, the game

    :return: str, [game]H.html inside the folder given by organization.get_season_raw_toi_folder(season)
    """
    season_folder = organization.get_season_raw_toi_folder(season)
    basename = str(game) + 'H.html'
    return os.path.join(season_folder, basename)
def get_road_shiftlog_filename(season, game):
    """
    Builds the path of the html file holding the road team's shift log for a game.

    :param season: int, the season
    :param game: int, the game

    :return: str, [game]R.html inside the folder given by organization.get_season_raw_toi_folder(season)
    """
    season_folder = organization.get_season_raw_toi_folder(season)
    basename = str(game) + 'R.html'
    return os.path.join(season_folder, basename)
def scrape_game_toi_from_html(season, game, force_overwrite=True):
    """
    Scrapes the html shift logs (home and road) for the given game and saves each one to disk.

    :param season: int, the season
    :param game: int, the game
    :param force_overwrite: bool. If False, a log whose file already exists on disk is not scraped again.
        If True (the default), both logs are always downloaded.

    :return: nothing
    """
    targets = (
        ('H', get_home_shiftlog_filename(season, game), get_home_shiftlog_url(season, game)),
        ('R', get_road_shiftlog_filename(season, game), get_road_shiftlog_url(season, game)),
    )
    for homeroad, filename, url in targets:
        if not force_overwrite and os.path.exists(filename):
            # Already on disk and no rescrape requested: skip the download for this log
            continue

        page = helpers.try_url_n_times(url)
        save_raw_toi_from_html(page, season, game, homeroad)
        sleep(1)  # Don't want to overload NHL servers
        print('Scraped html toi for {0:d} {1:d}'.format(season, game))
def save_raw_toi(page, season, game):
    """
    Compresses the page containing shift information with zlib and saves it to disk.

    :param page: str or bytes, the json shifts. A str is encoded as latin-1 before compression;
        bytes are compressed as they are.
    :param season: int, the season
    :param game: int, the game

    :return: nothing
    """
    data = page if isinstance(page, (bytes, bytearray)) else page.encode('latin-1')
    # The level is passed positionally because the `level` keyword is only accepted from Python 3.6 on
    compressed = zlib.compress(data, 9)
    # The context manager closes the file even if the write fails
    with open(get_game_raw_toi_filename(season, game), 'wb') as writer:
        writer.write(compressed)
def save_raw_toi_from_html(page, season, game, homeroad):
    """
    Takes the page containing one team's html shift log and saves it to disk as html.

    :param page: str or bytes, the html shift log. Anything that is not a str is decoded as latin-1
    :param season: int, the season
    :param game: int, the game
    :param homeroad: str, 'H' for the home log or 'R' for the road log

    :return: nothing
    :raises ValueError: if homeroad is neither 'H' nor 'R'
    """
    if homeroad == 'H':
        filename = get_home_shiftlog_filename(season, game)
    elif homeroad == 'R':
        filename = get_road_shiftlog_filename(season, game)
    else:
        raise ValueError("homeroad must be 'H' or 'R', got {0!r}".format(homeroad))

    # Decode before opening the file so a page that cannot be decoded does not leave an empty file behind
    if not isinstance(page, str):
        page = page.decode('latin-1')

    with open(filename, 'w') as writer:
        writer.write(page)
def get_raw_html_toi(season, game, homeroad):
    """
    Loads the html file containing one team's shift log for this game from disk.

    :param season: int, the season
    :param game: int, the game
    :param homeroad: str, 'H' for home or 'R' for road

    :return: str, the html toi
    :raises ValueError: if homeroad is neither 'H' nor 'R'
    """
    if homeroad == 'H':
        filename = get_home_shiftlog_filename(season, game)
    elif homeroad == 'R':
        filename = get_road_shiftlog_filename(season, game)
    else:
        raise ValueError("homeroad must be 'H' or 'R', got {0!r}".format(homeroad))

    with open(filename, 'r') as reader:
        return reader.read()
def get_raw_toi(season, game):
    """
    Reads this game's zlib-compressed shift file from disk and parses it as json.

    :param season: int, the season
    :param game: int, the game

    :return: the json shifts as parsed by json.loads
    """
    source = get_game_raw_toi_filename(season, game)
    with open(source, 'rb') as reader:
        compressed = reader.read()
    # The file was written as latin-1 text compressed with zlib, so undo both steps before parsing
    text = zlib.decompress(compressed).decode('latin-1')
    return json.loads(text)
def get_home_shiftlog_url(season, game):
    """
    Builds the url of the NHL html report holding the home team's shift table for a game.

    :param season: int, the season (starting year)
    :param game: int, the game

    :return: str, e.g. http://www.nhl.com/scores/htmlreports/20072008/TH020001.HTM
    """
    template = 'http://www.nhl.com/scores/htmlreports/{0:d}{1:d}/TH0{2:d}.HTM'
    # The report folder is named with both years of the season, e.g. 20072008
    end_year = season + 1
    return template.format(season, end_year, game)
def get_road_shiftlog_url(season, game):
    """
    Builds the url of the NHL html report holding the road team's shift table for a game.

    :param season: int, the season (starting year)
    :param game: int, the game

    :return: str, e.g. http://www.nhl.com/scores/htmlreports/20072008/TV020001.HTM
    """
    template = 'http://www.nhl.com/scores/htmlreports/{0:d}{1:d}/TV0{2:d}.HTM'
    # The report folder is named with both years of the season, e.g. 20072008
    end_year = season + 1
    return template.format(season, end_year, game)
def get_shift_url(season, game):
    """
    Builds the NHL API url that serves the shift information for a game.

    :param season: int, the season
    :param game: int, the game

    :return: str, http://www.nhl.com/stats/rest/shiftcharts?cayenneExp=gameId=[season]0[game]
    """
    template = 'http://www.nhl.com/stats/rest/shiftcharts?cayenneExp=gameId={0:d}0{1:d}'
    url = template.format(season, game)
    return url
def get_game_raw_toi_filename(season, game):
    """
    Builds the path of the compressed raw toi file for a game.

    :param season: int, the season
    :param game: int, the game

    :return: str, [game].zlib inside the folder given by organization.get_season_raw_toi_folder(season)
    """
    season_folder = organization.get_season_raw_toi_folder(season)
    basename = str(game) + '.zlib'
    return os.path.join(season_folder, basename)
def scrape_season_toi(season, force_overwrite=False):
    """
    Scrapes and parses toi for every game of the given season whose schedule status is "Final".

    Games are handled in ascending game-number order. A failure on one game is printed and the loop
    moves on to the next game, so one bad game does not abort the whole season.

    :param season: int, the season. If None, the current season from schedules.get_current_season() is used
    :param force_overwrite: bool. If true, rescrapes all games. If false, only previously unscraped ones

    :return: nothing
    """
    if season is None:
        season = schedules.get_current_season()

    sch = schedules.get_season_schedule(season)
    games = sch[sch.Status == "Final"].Game.values
    games.sort()

    # Progress checkpoints; the first element of each entry is compared with the loop index below.
    # NOTE(review): exact shape of helpers.intervals output is not visible here - confirm
    intervals = helpers.intervals(games)
    interval_j = 0
    for i, game in enumerate(games):
        try:
            scrape_game_toi(season, game, force_overwrite)
            # NOTE(review): the next two calls are named for pbp although this is the toi scraper -
            # confirm that toi-specific counterparts were not intended
            manipulate_schedules.update_schedule_with_pbp_scrape(season, game)
            parse_toi.parse_game_pbp(season, game, True)
            # Fall back to the html shift logs when the parsed json toi is shorter than 3600 rows
            # (presumably one row per second of a 60-minute game - confirm)
            if len(parse_toi.get_parsed_toi(season, game)) < 3600:
                scrape_game_toi_from_html(season, game, True)
                parse_toi.parse_game_toi_from_html(season, game, True)
        except Exception as e:
            # Best effort per game: report the failure instead of hiding it, then keep going
            print('Error scraping toi for {0} {1}: {2}'.format(season, game, e))

        if interval_j < len(intervals) and i == intervals[interval_j][0]:
            print('Done scraping through {0:d} {1:d} ({2:d}%)'.format(
                season, game, round(intervals[interval_j][0] / len(games) * 100)))
            interval_j += 1
def scrape_toi_setup():
    """
    Makes sure a raw toi folder exists for every season from 2005 through the current season.

    :return: nothing
    """
    first_season = 2005
    last_season = schedules.get_current_season()
    for year in range(first_season, last_season + 1):
        folder = organization.get_season_raw_toi_folder(year)
        organization.check_create_folder(folder)
# Module-level call: runs whenever this module is imported or executed, so the raw toi folder check
# happens before any of the scraping functions above are used.
scrape_toi_setup()