scraper.py
# coding=utf-8
import scraperwiki
import lxml.html
import lxml.etree as etree
import sqlite3
import re
from datetime import datetime as dt
import lcc_id_map
CURRENT_MEMBERS_URL = 'https://democracy.leeds.gov.uk/mgMemberIndex.aspx?VW=TABLE&PIC=1&FN='
def merge_two_dicts(x, y):
z = x.copy() # start with x's keys and values
z.update(y) # modifies z with y's keys and values & returns None
return z
def parse_date(date):
parts = date.split('/')
if len(parts)== 3:
return parts[2] + '-' + parts[1] + '-' + parts[0]
def cleanup(string):
# Strip any annoying whitespace
string = string.strip()
# Lose any curled apostrophies
string = string.replace(u'’', '\'')
return string
def get_content_of_label(page, label):
element = page.xpath('//span[contains(text(),\'' + label + ':\')]/parent::p')
if element:
return element[0].text_content().replace(label + ':', '').strip()
else:
return None
def scrape_member_page(id):
page_url = cleanup('https://democracy.leeds.gov.uk/mgUserInfo.aspx?UID=' + str(id))
print(' Scraping ' + page_url)
html = scraperwiki.scrape(page_url)
pageRoot = lxml.html.fromstring(html)
memberSessions = []
memberData = {}
memberData['lcc_id'] = str(id)
memberData['url'] = page_url
nameTitle = pageRoot.cssselect('#modgov h1')[0]
nameUnparsed = nameTitle.text.strip()
nameRegex = re.search('(.+?) (.+)', nameUnparsed)
memberData['honorific_string'] = nameRegex.group(1)
memberData['name'] = cleanup(nameRegex.group(2))
print(' Name: ' + memberData['name'])
party = get_content_of_label(pageRoot, 'Party')
ward = get_content_of_label(pageRoot, 'Ward')
# Check to see if the person is reconciled or not
if memberData['lcc_id'] in lcc_id_map.people_ids:
memberData['wikidata_id'] = lcc_id_map.people_ids[memberData['lcc_id']]
else:
unreconciledPeople.append(memberData['name'] + ' (' + memberData['lcc_id'] + ')')
# Check to see if the party is reconciled or not
if party in lcc_id_map.party_names:
party_id = lcc_id_map.party_names[party]
elif party is not None:
unreconciledParties.append(party)
party_id = None
# Check to see if the ward is reconciled or not
if ward in lcc_id_map.ward_names:
ward_id = lcc_id_map.ward_names[ward]
elif ward is not None:
unreconciledPeople.append(ward)
ward_id = None
terms = pageRoot.xpath('//h2[contains(text(),\'Terms of Office\')]/following::ul[1]/li')
# Sometimes this is "Term of Office"
if not terms:
terms = pageRoot.xpath('//h2[contains(text(),\'Term of Office\')]/following::ul[1]/li')
if id in current_member_ids:
needs_current_term = True
has_current_term = False
else:
needs_current_term = False
has_current_term = True
for term in terms:
# Explode it into two bits
parts = term.text.split('-')
if len(parts) == 2:
startRaw = parse_date(parts[0].strip())
start = startRaw + 'T00:00:00Z'
endRaw = parse_date(parts[1].strip())
if endRaw:
end = endRaw + 'T00:00:00Z'
else:
end = None
sessionDetails = {
'id': id + '-' + start,
'start': start
}
end_date = dt.strptime(endRaw, "%Y-%m-%d")
if end_date >= dt.now():
print(' Found current term ' + term.text + '.')
has_current_term = True
sessionDetails['current'] = True
sessionDetails['end'] = None
sessionDetails['party'] = party
sessionDetails['party_id'] = party_id
sessionDetails['ward'] = ward
sessionDetails['ward_id'] = ward_id
else:
print(' Found non-current term ' + term.text + '.')
sessionDetails['current'] = False
sessionDetails['end'] = end
memberSessions.append(merge_two_dicts(memberData, sessionDetails))
print(' Added term ' + term.text + '.')
else:
print(' Skipped "' + term.text + '", does not appear to be a date range.')
# Need a current term but don't yet have one? Inject a fake one!
if needs_current_term == True and has_current_term == False:
sessionDetails = {
'id': id + '-current',
'current': True,
}
memberSessions.append(merge_two_dicts(memberData, sessionDetails))
return memberSessions
parsedMemberships = []
unreconciledWards = []
unreconciledParties = []
unreconciledPeople = []
print('(i) Scraping from ' + CURRENT_MEMBERS_URL)
# Get the page!
html = scraperwiki.scrape(CURRENT_MEMBERS_URL)
ssRoot = lxml.html.fromstring(html)
rows = ssRoot.cssselect('#mgTable1 tr')
current_member_ids = []
# Skip the header row
for row in rows[1:]:
nameLink = row.cssselect('a')[0]
linkHref = nameLink.attrib['href']
idRegex = re.search('mgUserInfo\.aspx\?UID=([0-9]+)', linkHref)
current_member_ids.append(idRegex.group(1))
print('(i) Found {} current members'.format(len(current_member_ids)))
ids_to_scrape = set(current_member_ids + list(lcc_id_map.people_ids))
print('(i) Scraping {} members in total'.format(len(ids_to_scrape)))
for id in ids_to_scrape:
parsedMemberships = parsedMemberships + scrape_member_page(id)
nameLink = row.cssselect('a')[0]
print('(i) Done.')
print('(i) Counted {} memberships in total'.format(len(parsedMemberships)))
print('<!> {} unreconciled people:'.format(len(unreconciledPeople)))
print(unreconciledPeople)
print('<!> {} unreconciled wards:'.format(len(unreconciledWards)))
print(unreconciledWards)
print('<!> {} unreconciled parties:'.format(len(unreconciledParties)))
print(unreconciledParties)
try:
scraperwiki.sqlite.execute('DELETE FROM data')
except sqlite3.OperationalError:
pass
scraperwiki.sqlite.save(
unique_keys=['id'],
data=parsedMemberships)