research/src/fetch_foursquare_info.py
import datetime
import os.path
import re
from decouple import config
import pandas as pd
from pandas import json_normalize
import requests

DATA_DIR = 'data'
DATE = datetime.date.today().strftime('%Y-%m-%d')


def find_newest_file(name):
"""
Assuming that the files will be in the form of :
yyyy-mm-dd-type_of_file.xz we can try to find the newest file
based on the date, but if the file doesn't exist fallback to another
date until all dates are exhausted
"""
    date_regex = re.compile(r'\d{4}-\d{2}-\d{2}')
matches = (date_regex.findall(f) for f in os.listdir(DATA_DIR))
    dates = sorted({match[0] for match in matches if match}, reverse=True)
for date in dates:
filename = os.path.join(DATA_DIR, '{}-{}.xz'.format(date, name))
if os.path.isfile(filename):
return filename
return None


def load_cnpjs(subquota_description):
    """Return the unique CNPJs for the given subquota_description"""
u_cols = ['cnpj_cpf', 'subquota_description']
docs = pd.read_csv(REIMBURSEMENTS_DATASET_PATH,
low_memory=False,
usecols=u_cols,
                       dtype={'cnpj_cpf': str})
meals = docs[docs.subquota_description == subquota_description]
return meals['cnpj_cpf'].unique()


def only_numbers(string):
    """Return a string with only the digits from the given string"""
    return re.sub(r'\D', '', string)


def load_companies_dataset(cnpjs):
"""Return a DataFrame of companies from the given list of cnpjs"""
u_cols = ['cnpj', 'trade_name', 'zip_code', 'latitude', 'longitude']
all_companies = pd.read_csv(COMPANIES_DATASET_PATH,
low_memory=False,
usecols=u_cols,
                                dtype={'trade_name': str})
all_companies = all_companies.dropna(subset=['cnpj', 'trade_name'])
all_companies['clean_cnpj'] = all_companies['cnpj'].map(only_numbers)
return all_companies[all_companies['clean_cnpj'].isin(cnpjs)]


def remaining_companies(companies, fetched_companies):
    """Return rows of the first DF whose CNPJs are absent from the second"""
remaining = companies[~companies['cnpj'].isin(fetched_companies['cnpj'])]
return remaining.reset_index()


def load_foursquare_companies_dataset():
    """Return a DF with the data already collected, or an empty one"""
if FOURSQUARE_DATASET_PATH is not None:
return pd.read_csv(FOURSQUARE_DATASET_PATH)
return pd.DataFrame(columns=['cnpj'])


def get_venue(company):
"""Return a matching venue from Foursquare for the given company Series"""
venue = search(company)
if venue:
return fetch_venue(venue['id'])


def search(company):
"""Search Foursquare's API for a match for a given company Series"""
params = dict(DEFAULT_PARAMS)
params.update({'query': company['trade_name'],
'll': '%s,%s' % (company['latitude'], company['longitude']),
'zip_code': company['zip_code'],
'intent': 'match'})
url = 'https://api.foursquare.com/v2/venues/search'
response = requests.get(url, params=params)
    result = parse_search_results(response, confirmed_match=True)
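    # No exact match found: retry as a regular search and flag any result
    # as an unconfirmed match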
if not result:
params.pop('intent')
response = requests.get(url, params=params)
        result = parse_search_results(response, confirmed_match=False)
return result


def parse_search_results(response, confirmed_match):
    """Return the first venue from the given search response, if any"""
json_response = response.json()
venues = json_response.get('response', {}).get('venues')
if venues:
venue = venues[0]
venue['confirmed_match'] = confirmed_match
return venue


def fetch_venue(venue_id):
"""Return specific data from Foursquare for the given venue_id"""
url = 'https://api.foursquare.com/v2/venues/%s' % venue_id
response = requests.get(url, params=DEFAULT_PARAMS)
return parse_venue_info(response)


def parse_venue_info(response):
"""Return only venue data from the given fetch_venue response"""
json_response = response.json()
venue = json_response.get('response', {}).get('venue')
return venue


def write_fetched_companies(companies):
"""Save a compressed CSV file with the given DF"""
companies.to_csv(OUTPUT_DATASET_PATH,
compression='xz',
index=False)


# Foursquare API version. This is in YYYYMMDD format.
VERSION = '20161021'
# Required params to make a request to Foursquare's API
# You can create your own API keys at https://pt.foursquare.com/developers/register
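# decouple's config() reads the credentials from environment variables or a
# .env file, e.g. (placeholder values, not real credentials):
#   FOURSQUARE_CLIENT_ID=your_client_id
#   FOURSQUARE_CLIENT_SECRET=your_client_secret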
DEFAULT_PARAMS = {'client_id': config('FOURSQUARE_CLIENT_ID'),
'client_secret': config('FOURSQUARE_CLIENT_SECRET'),
'v': VERSION}

# Dataset paths
REIMBURSEMENTS_DATASET_PATH = find_newest_file('reimbursements')
COMPANIES_DATASET_PATH = find_newest_file('companies')
FOURSQUARE_DATASET_PATH = find_newest_file('foursquare-companies')
OUTPUT_DATASET = '{}-foursquare-companies.xz'.format(DATE)
OUTPUT_DATASET_PATH = os.path.join(DATA_DIR, OUTPUT_DATASET)


if __name__ == '__main__':
meal_cnpjs = load_cnpjs('Congressperson meal')
meal_companies = load_companies_dataset(meal_cnpjs)
fetched_companies = load_foursquare_companies_dataset()
    companies_to_fetch = remaining_companies(
        meal_companies, fetched_companies)
    for index, company in companies_to_fetch.iterrows():
print('Looking for: %s' % company['trade_name'])
fetched = get_venue(company)
if fetched:
            print('Found: %s! <=======<<<' % fetched['name'])
fetched['trade_name'] = company['trade_name']
fetched['cnpj'] = company['cnpj']
fetched['clean_cnpj'] = company['clean_cnpj']
fetched['scraped_at'] = datetime.datetime.utcnow().isoformat()
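            # Flatten the nested venue JSON into a single-row DataFrame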
fetched = json_normalize(fetched)
fetched_companies = pd.concat([fetched_companies, fetched])
else:
print('No results.')
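        # Save progress every 100 companies so an interruption doesn't lose
        # everything fetched so far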
if (index % 100) == 0 and index > 0:
print('###########################################')
print("%s companies fetched. Stopping to save." % index)
write_fetched_companies(fetched_companies)
print('###########################################')
write_fetched_companies(fetched_companies)