# habr/user.py
from copy import deepcopy
from unittest import TestCase
from lxml import html
import requests
__author__ = 'icoz'
def get_pages(doc):
    """Return the page count advertised by a listing's pagination bar.

    The count is read from the href of the last link found under the
    ``nav-pages`` list, whose final path segment is expected to look like
    ``page<N>``.

    :param doc:
        parsed document exposing ``xpath()`` (e.g. an lxml HTML tree)
    :return:
        int, ``N`` from the last pagination link; 1 when there is no
        pagination bar or the link does not end in ``page<N>``
    """
    last_links = doc.xpath("//ul[@id='nav-pages']//a[last()]")
    if not last_links:
        return 1
    href = str(last_links[-1].attrib.get('href', ''))
    # Drop any trailing slash first so '/x/page5/' and '/x/page5' both yield
    # 'page5' as the final segment.
    segment = href.rstrip('/').rsplit('/', 1)[-1]
    number = segment[4:]
    if segment.startswith('page') and number.isdecimal():
        return int(number)
    # Unrecognised link shape: behave as if the listing had a single page.
    return 1
class TMUser(object):
    """Scraped view of one user's public pages on the site at ``domain``.

    The profile page is downloaded and parsed in the constructor. Favorites
    and the user's own posts are downloaded either eagerly (constructor
    flags) or lazily on the first call to ``favorites()`` / ``user_posts()``.
    All public accessors return deep copies, so callers cannot mutate the
    cached data.
    """

    # Heading text of a page whose access is closed.
    _ACCESS_CLOSED_TITLE = "Доступ закрыт"
    _FAVORITES_XPATH = "//div[@class='user_favorites']//a[@class='post__title_link']"
    _POSTS_XPATH = "//div[@class='posts_list']//a[@class='post_title']"

    def __init__(self, username, need_favorites=False, need_user_posts=False, domain='habrahabr.ru'):
        """Download and parse the profile page of ``username``.

        :param username:
            string of username, ex. 'some_user'
        :param need_favorites:
            when true, favorites are downloaded immediately
        :param need_user_posts:
            when true, the user's posts are downloaded immediately
        :param domain:
            host name used to build every URL
        """
        self._domain = domain
        self._username = username
        self._user = dict()
        self._user_karma = dict()
        self._user_profile = dict()
        self._user_activity = dict()
        req_data = requests.get(self._genUrlForUsername(username)).text
        self._doc = html.document_fromstring(req_data)
        self._parseUserpage()
        self._user_favorites = dict()
        self._user_favorites_loaded = need_favorites
        if need_favorites:
            self._user_favorites = self._getFavorites()
        self._user_posts = dict()
        self._user_posts_loaded = need_user_posts
        if need_user_posts:
            self._user_posts = self._getUserPosts()

    def favorites(self):
        """
        Returns a copy of the user's favorites, downloading them on first use.
        :return:
            dict(link text) = topic id
        """
        if not self._user_favorites_loaded:
            self._user_favorites = self._getFavorites()
            self._user_favorites_loaded = True
        return deepcopy(self._user_favorites)

    def user_posts(self):
        """
        Returns a copy of the user's own posts, downloading them on first use.
        :return:
            dict(link text) = topic id
        """
        if not self._user_posts_loaded:
            self._user_posts = self._getUserPosts()
            self._user_posts_loaded = True
        return deepcopy(self._user_posts)

    def profile(self):
        """Returns a copy of the parsed profile fields."""
        return deepcopy(self._user_profile)

    def activity(self):
        """Returns a copy of the followers/posts/comments counters."""
        return deepcopy(self._user_activity)

    def karma(self):
        """Returns a copy of the karma/rating values."""
        return deepcopy(self._user_karma)

    def _genFavoritesUrlByUser(self, username):
        '''
        Generates favorites URL using username
        :param username:
            string with username
        :return:
            string with URL
        '''
        return self._genUrlForUsername(username) + 'favorites/'

    def _genUrlForUsername(self, username):
        '''
        Generates user-page URL using username
        :param username:
            string with username
        :return:
            string with URL
        '''
        return 'http://{domain}/users/{username}/'.format(domain=self._domain, username=username)

    def _getUserCompanyList(self):
        """Returns a list of (link text, href) for the favorite companies."""
        cmpns = self._doc.xpath("//div[@class='user_profile']/dl[@id='favorite_companies_list']//a")
        return [(company.text, company.attrib['href']) for company in cmpns]

    def _getUserHubList(self):
        """Returns a list of (link text, href) for the user's hubs."""
        hubs = self._doc.xpath("//div[@class='user_profile']/dl[@class='hubs_list']//a[@class='cross']")
        return [(hub.text, hub.attrib['href']) for hub in hubs]

    @staticmethod
    def _stripText(node):
        """Returns node.text stripped, or '' when the node has no text."""
        return (node.text or '').strip()

    @staticmethod
    def _toFloat(text):
        """Parses a number written with a decimal comma and/or an en-dash minus."""
        return float(text.replace(',', '.').replace("–", "-"))

    @staticmethod
    def _leadingInt(text):
        """Parses the integer that precedes the first space of ``text``."""
        return int(text.split(' ')[0])

    @classmethod
    def _isAccessClosed(cls, doc):
        """Tells whether ``doc`` is the 'access closed' page."""
        headings = doc.xpath("//div[@class='main']/h1")
        return bool(headings) and cls._stripText(headings[0]) == cls._ACCESS_CLOSED_TITLE

    @staticmethod
    def _collectTopicIds(doc, query, out):
        """Adds link text -> second-to-last href segment for links matching ``query``."""
        for link in doc.xpath(query):
            out[link.text] = str(link.attrib['href']).split('/')[-2]

    @staticmethod
    def _loadCheckedDoc(url):
        """Downloads ``url`` and parses it; raises IOError unless the status is 200."""
        req = requests.get(url)
        if req.status_code != 200:
            raise IOError('doc not found. URL = {}'.format(url))
        return html.document_fromstring(req.text)

    def _parseUserpage(self):
        """Fills the karma, profile and activity dicts from the profile page.

        Leaves every dict empty when the page is the 'access closed' page.
        Values whose element is missing get a neutral default ('' / 0 / 0.0,
        and -1 for the rating place).
        """
        xpath = self._doc.xpath

        if self._isAccessClosed(self._doc):
            # maybe raise ERROR???
            return

        people_tags = xpath("//div[@class='user_profile']//ul[@id='people-tags']//a/span")

        nodes = xpath("//div[@class='user_profile']//p[@class='profile-section__invited']")
        registration = self._stripText(nodes[0]) if nodes else ''
        # Guarded: a profile without this <dl>/<dd> used to raise IndexError.
        nodes = xpath("//div[@class='user_profile']//dl[last()]/dd")
        last_login = self._stripText(nodes[0]) if nodes else ''

        nodes = xpath("//div[@class='user_header']/h2/a")
        self._user['username'] = nodes[-1].text if nodes else self._username

        nodes = xpath("//div[@class='karma']//div[@class='num']")
        self._user_karma['karma'] = self._toFloat(nodes[-1].text) if nodes else 0.0
        nodes = xpath("//div[@class='karma']/div[@class='votes']")
        self._user_karma['karma_vote'] = self._leadingInt(nodes[-1].text) if nodes else 0
        nodes = xpath("//div[@class='rating']/div[@class='num']")
        self._user_karma['rating'] = self._toFloat(nodes[-1].text) if nodes else 0.0

        nodes = xpath("//div[@class='user_profile']/div[@class='fullname']")
        self._user_profile['fullname'] = self._stripText(nodes[-1]) if nodes else ''
        # A <sup> inside the full name is what flags the account as read-only.
        nodes = xpath("//div[@class='user_profile']/div[@class='fullname']/sup")
        self._user_profile['is_read_only'] = bool(nodes)

        nodes = xpath("//div[@class='user_profile']/div[@class='rating-place']")
        try:
            self._user_karma['rating_place'] = int(nodes[-1].text.split('-')[0]) if nodes else -1
        except ValueError:
            # The text before the first '-' is not a number.
            self._user_karma['rating_place'] = -1

        simple_fields = (
            ('birthday', "//div[@class='user_profile']//dd[@class='bday']"),
            ('country', "//div[@class='user_profile']//dd/a[@class='country-name']"),
            ('region', "//div[@class='user_profile']//dd/a[@class='region']"),
            ('city', "//div[@class='user_profile']//dd/a[@class='city']"),
        )
        for key, query in simple_fields:
            nodes = xpath(query)
            self._user_profile[key] = nodes[0].text if nodes else ''

        self._user_profile['people_tags'] = [tag.text for tag in people_tags]
        # Only the first line of the "invited" paragraph is kept.
        self._user_profile['registration_date'] = registration.partition('\n')[0].strip()
        # Skips the first 27 characters -- presumably a fixed label in front
        # of the date; TODO confirm against a live page.
        self._user_profile['last_login_date'] = last_login[27:]

        counters = (
            ('followers_count', "//div[@class='stats']/div[@id='followers_count']/a"),
            ('posts_count', "//div[@class='stats']/div[@class='item posts_count']/a"),
            ('comments_count', "//div[@class='stats']/div[@class='item comments_count']/a"),
        )
        for key, query in counters:
            nodes = xpath(query)
            self._user_activity[key] = self._leadingInt(nodes[-1].text) if nodes else 0

        self._user['company_list'] = self._getUserCompanyList()
        self._user['hubs_list'] = self._getUserHubList()
        self._user['profile'] = self._user_profile
        self._user['activity'] = self._user_activity
        self._user['karma'] = self._user_karma

    def _getFavorites(self):
        """
        Downloads every favorites page of the user.
        :return:
            dict(link text) = topic id; empty when access is closed
        """
        base_url = self._genFavoritesUrlByUser(self._username)
        doc = html.document_fromstring(requests.get(base_url).text)
        # Check the favorites page that was just downloaded, not the profile
        # page, and keep the documented dict return type.
        if self._isAccessClosed(doc):
            # maybe raise ERROR???
            return dict()
        out = dict()
        pages = get_pages(doc)
        self._collectTopicIds(doc, self._FAVORITES_XPATH, out)
        # pages is the number of the last page, so the range must include it.
        for p in range(2, pages + 1):
            url = base_url + 'page{0}/'.format(p)
            doc = html.document_fromstring(requests.get(url).text)
            self._collectTopicIds(doc, self._FAVORITES_XPATH, out)
        return out

    def _getUserPosts(self):
        """
        Downloads every page of the user's own posts.
        :return:
            dict(link text) = topic id
        :raises IOError:
            when any page answers with a status other than 200
        """
        base_url = self._genUrlForUsername(self._username) + 'topics/'
        doc = self._loadCheckedDoc(base_url)
        out = dict()
        pages = get_pages(doc)
        self._collectTopicIds(doc, self._POSTS_XPATH, out)
        # pages is the number of the last page, so the range must include it.
        for p in range(2, pages + 1):
            doc = self._loadCheckedDoc(base_url + 'page{0}/'.format(p))
            self._collectTopicIds(doc, self._POSTS_XPATH, out)
        return out
class HabraUser(TMUser):
    """TMUser bound to the 'habrahabr.ru' domain."""

    def __init__(self, username, need_favorites=False, need_user_posts=False):
        """Forward every argument to TMUser with the domain fixed."""
        super().__init__(
            username=username,
            need_favorites=need_favorites,
            need_user_posts=need_user_posts,
            domain='habrahabr.ru',
        )
class GeektimesUser(TMUser):
    """TMUser bound to the 'geektimes.ru' domain."""

    def __init__(self, username, need_favorites=False, need_user_posts=False):
        """Forward every argument to TMUser with the domain fixed."""
        super().__init__(
            username=username,
            need_favorites=need_favorites,
            need_user_posts=need_user_posts,
            domain='geektimes.ru',
        )
# R.I.P.
# class MegamozgUser(TMUser):
# def __init__(self, username, need_favorites=False, need_user_posts=False):
# super().__init__(username, need_favorites, need_user_posts=need_user_posts, domain='megamozg.ru')
import pprint
class Test_HabraUser(TestCase):
    """Smoke tests: build HabraUser objects and pretty-print what they return.

    Nothing is asserted; a test fails only if constructing the user or
    calling an accessor raises.
    """

    def setUp(self):
        # Shared fixture: one user object and one printer per test.
        self.hu = HabraUser('icoz')
        self.pp = pprint.PrettyPrinter(indent=4)

    def test_parseUserpage(self):
        show = self.pp.pprint
        for accessor in (self.hu.activity, self.hu.profile, self.hu.karma):
            show(accessor())

    def test_favs(self):
        favorites = self.hu.favorites()
        self.pp.pprint(favorites)

    # Disabled check for a read-only account:
    # def test_readonly_user(self):
    #     self.pp.pprint('starting test for readonly xvitaly')
    #     hu = HabraUser('xvitaly')
    #     self.pp.pprint('date=')
    #     self.pp.pprint(hu.profile()['registration_date'])

    def test_user_posts(self):
        author = HabraUser('Zelenyikot')
        self.pp.pprint('userposts=')
        posts = author.user_posts()
        self.pp.pprint(posts)

    def test_rating_place(self):
        self.pp.pprint('starting test for lokkersp')
        ranked = HabraUser('lokkersp')
        self.pp.pprint('karma=')
        karma = ranked.karma()
        self.pp.pprint(karma)
class Test_GeektimesUser(TestCase):
    """Smoke tests: build GeektimesUser objects and pretty-print their data.

    Nothing is asserted; a test fails only if constructing the user or
    calling an accessor raises.
    """

    def setUp(self):
        # Shared fixture used by test_parseUserpage.
        self.hu = GeektimesUser('icoz')

    def test_parseUserpage(self):
        printer = pprint.PrettyPrinter(indent=4)
        for accessor in (self.hu.activity, self.hu.profile, self.hu.karma):
            printer.pprint(accessor())

    # Placeholder left by the author for a favorites test:
    # def test_favs(self):
    #     pp = pprint.PrettyPrinter(indent=4)

    def test_user_posts(self):
        author = GeektimesUser('Zelenyikot')
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint('userposts=')
        posts = author.user_posts()
        printer.pprint(posts)
#
# class Test_MegamozgUser(TestCase):
# def setUp(self):
# self.hu = MegamozgUser('icoz')
# pass
#
# def test_parseUserpage(self):
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(self.hu.activity())
# pp.pprint(self.hu.profile())
# pp.pprint(self.hu.karma())
#
# # def test_favs(self):
# # pp = pprint.PrettyPrinter(indent=4)
#
# def test_user_posts(self):
# hu = MegamozgUser('Zelenyikot')
# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint('userposts=')
# pp.pprint(hu.user_posts())
#