# scraper/get_posts.py
import csv
import json
import os
from time import strftime


def process_posts(page, status, message, status_published):
    """Build a flat post record and save it as a JSON file.

    The record is written to ``json/posts/<page>/<status id>.json``; the
    directory is created when it does not exist yet.

    Args:
        page: Identifier of the page; used (via ``str``) as directory name.
        status: Dict describing the post. Must contain ``id``, ``type``,
            one ``{'summary': {'total_count': ...}}`` entry per reaction
            kind and ``comments['data']``, a list of dicts each holding a
            ``message``. ``link`` is optional.
        message: Dict that may contain ``message`` and ``story``.
        status_published: Value stored under the ``published`` key.

    Failures while serialising or writing the file are reported on stdout
    and do not propagate (best effort). Missing keys in ``status`` still
    raise ``KeyError``.
    """
    post = pretty_post(status, message)
    post = get_reactions_info(post, status, message)
    post['published'] = status_published
    post['specific_comments'] = {
        'comment ' + str(index): comment['message']
        for index, comment in enumerate(status['comments']['data'])
    }
    try:
        path = 'json/posts/' + str(page) + '/' + status['id'] + '.json'
        # Serialise before opening so a failure here cannot leave an
        # empty, truncated file behind.
        serialized = json.dumps(post, indent=2, ensure_ascii=False)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf8') as post_file:
            post_file.write(serialized)
    except Exception as e:
        # Deliberately broad: one bad post must not abort the caller.
        print('Algo errado na escrita do post' + str(e))


def pretty_post(status, message):
    """Return a new dict with the basic, readable fields of a post.

    Args:
        status: Dict with ``id`` and ``type``; ``link`` is optional.
        message: Dict that may contain ``message``.

    Returns:
        Dict with the keys ``id``, ``type``, ``message`` and
        ``link_to_post``; the last two are ``''`` when absent from the
        input.
    """
    return {
        'id': status['id'],
        'type': status['type'],
        'message': message.get('message', ''),
        'link_to_post': status.get('link', ''),
    }


def get_reactions_info(post, status, message):
    """Add the story text and the per-kind counters to ``post``.

    For each of ``like``, ``wow``, ``sad``, ``love``, ``haha``, ``angry``,
    ``reactions`` and ``comments`` the value
    ``status[kind]['summary']['total_count']`` is copied to ``post[kind]``.

    Args:
        post: Dict to update; it is modified in place.
        status: Dict holding one summary entry per counter listed above.
        message: Dict that may contain ``story`` (``''`` is stored when it
            does not).

    Returns:
        The same ``post`` object that was passed in.

    Raises:
        KeyError: If ``status`` lacks one of the counters.
    """
    post['story'] = message.get('story', '')
    reactions = ['like', 'wow', 'sad', 'love', 'haha', 'angry',
                 'reactions', 'comments']
    for react in reactions:
        post[react] = status[react]['summary']['total_count']
    return post


def write_posts_to_csv():
    """Export every stored post to one CSV file per actor.

    Each sub-directory of ``json/posts`` is treated as an actor. All JSON
    files inside it are loaded and written to
    ``csv/<YYYY-MM-DD>/<actor>.csv`` with a header row followed by one row
    per post. The output directory is created when missing, and actors
    and posts are processed in sorted name order so the result is
    reproducible.
    """
    path = 'json/posts'
    columns = ['id', 'message', 'type', 'published', 'story', 'reactions',
               'love', 'like', 'wow', 'sad', 'angry', 'haha',
               'link_to_post']
    output_dir = 'csv/' + strftime("%Y-%m-%d")
    os.makedirs(output_dir, exist_ok=True)
    for actor in sorted(os.listdir(path)):
        actor_dir = path + '/' + actor
        rows = []
        for post in sorted(os.listdir(actor_dir)):
            with open(actor_dir + '/' + post, 'r',
                      encoding='utf8') as post_file:
                content = json.load(post_file)
            rows.append(get_info(content, columns))
        dump_to_csv(output_dir + '/' + actor + '.csv', rows, columns)


def dump_to_csv(path, list_of_content, columns):
    """Write a header row and the given data rows to a CSV file.

    Args:
        path: Destination file; it is overwritten if it exists.
        list_of_content: Iterable of rows (each an iterable of values).
        columns: Header row written before the data rows.
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires; otherwise Windows gets '\r\r\n' row endings.
    with open(path, 'w', encoding='utf8', newline='') as csv_file:
        info = csv.writer(csv_file)
        info.writerow(columns)
        info.writerows(list_of_content)


def get_info(content, keys):
    """Return the values of ``content`` for ``keys``, in the same order.

    Helper of ``write_posts_to_csv``.

    Raises:
        KeyError: If a key is missing from ``content``.
    """
    return [content[key] for key in keys]


def write_comments_to_csv():
    """Export the comments of every stored post to one CSV per actor.

    For each sub-directory of ``json/posts`` a file
    ``csv/<YYYY-MM-DD>/<actor>_comments.csv`` is written. It has no header
    row; every row holds a post id followed by that post's comment texts.
    The output directory is created when missing, and actors and posts
    are processed in sorted name order.
    """
    path = 'json/posts'
    output_dir = 'csv/' + strftime("%Y-%m-%d")
    os.makedirs(output_dir, exist_ok=True)
    for actor in sorted(os.listdir(path)):
        actor_dir = path + '/' + actor
        comt_file_name = output_dir + '/' + actor + '_comments.csv'
        # The with-block closes the CSV even if a post fails to load.
        with open(comt_file_name, 'w', encoding='utf8',
                  newline='') as csv_comt_file:
            comments_info = csv.writer(csv_comt_file)
            for post in sorted(os.listdir(actor_dir)):
                with open(actor_dir + '/' + post, 'r',
                          encoding='utf8') as post_file:
                    content = json.load(post_file)
                row = dict_to_list(content['specific_comments'],
                                   [content['id']])
                comments_info.writerow(row)


def dict_to_list(dictionary, list_of_comments):
    """Append the values of ``dictionary`` to ``list_of_comments``.

    The list is extended in place, in the dictionary's iteration order.

    Returns:
        The same list object that was passed in.
    """
    list_of_comments.extend(dictionary.values())
    return list_of_comments