tweet_display/read_data.py from gedankenstuecke/twitter-analyser

tweet_display/read_data.py
Summary

Maintainability

1 day
Test Coverage

Issues
from timezonefinder import TimezoneFinder
import tempfile
import zipfile
import json
import datetime
import pytz
import ijson
import io
import pandas as pd
import requests
import os

# Module-wide TimezoneFinder instance, created once at import time.
# convert_time() calls tzf.timezone_at() on it to look up the timezone name
# for a latitude/longitude pair.
tzf = TimezoneFinder()


# READ JSON FILES FROM TWITTER ARCHIVE!

def check_hashtag(single_tweet):
    '''
    tell whether the tweet carries at least one hashtag.
    reads single_tweet['entities']['hashtags'] and returns a bool.
    '''
    hashtags = single_tweet['entities']['hashtags']
    return len(hashtags) > 0


def check_media(single_tweet):
    '''
    tell whether the tweet has at least one media item attached.
    a tweet whose entities have no 'media' key counts as having none.
    '''
    entities = single_tweet['entities']
    if 'media' not in entities:
        return False
    return len(entities['media']) > 0


def check_url(single_tweet):
    '''
    tell whether the tweet carries at least one url.
    reads single_tweet['entities']['urls'] and returns a bool.
    '''
    urls = single_tweet['entities']['urls']
    return len(urls) > 0


def check_retweet(single_tweet):
    '''
    work out whether the tweet is a RT and of whom.

    returns a (screen_name, name) tuple:
    - if 'full_text' starts with "RT @" and there is at least one user
      mention, the first mentioned user is returned
    - otherwise, if the tweet has a 'retweeted_status', the user recorded
      there is returned
    - otherwise (None, None)
    '''
    looks_like_rt = ('full_text' in single_tweet
                     and single_tweet['full_text'].startswith("RT @"))
    if looks_like_rt:
        mentions = single_tweet['entities']['user_mentions']
        if len(mentions) > 0:
            first_mention = mentions[0]
            return (first_mention['screen_name'], first_mention['name'])
    # no usable mention: fall back to the embedded retweeted status
    if 'retweeted_status' in single_tweet:
        retweeted_user = single_tweet['retweeted_status']['user']
        return (retweeted_user['screen_name'], retweeted_user['name'])
    return (None, None)


def check_coordinates(single_tweet):
    '''
    check whether tweet has coordinates attached.
    if yes return them as a tuple of two floats, taken from the first two
    entries of single_tweet['geo']['coordinates'] in that order.
    otherwise just return nones

    a missing, null or empty 'geo' entry, as well as a missing, null or
    too short 'coordinates' list, all count as "no coordinates" and yield
    (None, None) instead of raising.
    '''
    geo = single_tweet.get('geo')
    if not geo:
        # covers an absent key, an explicit null and an empty {}
        return (None, None)
    coordinates = geo.get('coordinates')
    if not coordinates or len(coordinates) < 2:
        return (None, None)
    return (float(coordinates[0]), float(coordinates[1]))


def check_reply_to(single_tweet):
    '''
    work out whether the tweet is a reply and to whom.

    returns (screen_name, name) of the user replied to. the name is taken
    from the first user mention whose screen_name matches
    'in_reply_to_screen_name' and is None when no mention matches.
    tweets without an 'in_reply_to_screen_name' key yield (None, None).
    '''
    if 'in_reply_to_screen_name' not in single_tweet:
        return (None, None)
    replied_to = single_tweet['in_reply_to_screen_name']
    matching_names = (
        mention['name']
        for mention in single_tweet['entities']['user_mentions']
        if mention['screen_name'] == replied_to)
    # only the first matching mention matters
    return (replied_to, next(matching_names, None))


def convert_time(coordinates, time_utc):
    '''
    Does this tweet have a geo location? if yes
    we can easily convert the UTC timestamp to true local time!
    otherwise return None

    coordinates is a (latitude, longitude) pair as produced by
    check_coordinates(); time_utc is a timezone-aware datetime.
    returns time_utc converted to the timezone found for the location,
    or None when a coordinate is missing or no timezone is found.
    '''
    latitude, longitude = coordinates[0], coordinates[1]
    # compare against None explicitly: 0.0 is a perfectly valid latitude
    # (equator) or longitude (prime meridian) and must not be treated as
    # "no location" the way a plain truthiness test would
    if latitude is None or longitude is None:
        return None
    timezone_str = tzf.timezone_at(lat=latitude, lng=longitude)
    if not timezone_str:
        return None
    return time_utc.astimezone(pytz.timezone(timezone_str))


def _parse_created_at(created_at):
    '''
    parse a tweet's created_at string into a timezone-aware datetime.
    the 'Sat Jan 05 10:00:00 +0000 2019' layout is tried first, then
    '2019-01-05 10:00:00 +0000'; the ValueError of the second attempt is
    propagated if neither matches.
    '''
    try:
        return datetime.datetime.strptime(
            created_at, '%a %b %d %H:%M:%S %z %Y')
    except ValueError:
        return datetime.datetime.strptime(
            created_at, '%Y-%m-%d %H:%M:%S %z')


def create_dataframe(tweets):
    '''
    create a pandas dataframe from our tweet jsons

    tweets is any iterable of dicts that hold the actual tweet under the
    'tweet' key; it is consumed exactly once. the result has one row per
    tweet and the columns utc_time, local_time, latitude, longitude,
    hashtag, media, url, retweet_user_name, retweet_name,
    reply_user_name, reply_name and text.
    '''
    # one list per output column, filled in lockstep
    columns = {
        'utc_time': [],
        'local_time': [],
        'latitude': [],
        'longitude': [],
        'hashtag': [],
        'media': [],
        'url': [],
        'retweet_user_name': [],
        'retweet_name': [],
        'reply_user_name': [],
        'reply_name': [],
        'text': [],
    }
    # iterate over all tweets and extract data
    for single_tweet in tweets:
        tweet = single_tweet['tweet']
        # the timestamp is parsed once and reused for the local time
        created_at = _parse_created_at(tweet['created_at'])
        coordinates = check_coordinates(tweet)
        columns['utc_time'].append(created_at)
        columns['latitude'].append(coordinates[0])
        columns['longitude'].append(coordinates[1])
        columns['local_time'].append(convert_time(coordinates, created_at))
        columns['hashtag'].append(check_hashtag(tweet))
        columns['media'].append(check_media(tweet))
        columns['url'].append(check_url(tweet))
        retweet = check_retweet(tweet)
        columns['retweet_user_name'].append(retweet[0])
        columns['retweet_name'].append(retweet[1])
        reply = check_reply_to(tweet)
        columns['reply_user_name'].append(reply[0])
        columns['reply_name'].append(reply[1])
        # prefer 'full_text' when the tweet has it, else use 'text'
        if 'full_text' in tweet:
            columns['text'].append(tweet['full_text'])
        else:
            columns['text'].append(tweet['text'])
    # convert the whole shebang into a pandas dataframe
    return pd.DataFrame(data=columns)


def fetch_zip_file(zip_url):
    '''
    download zip_url and hand back something read_files() can consume.

    returns a tuple (file, filetype):
    - (zipfile.ZipFile, 'zipped') when the download is a zip archive
    - (text file object, 'json') otherwise
    raises requests.HTTPError when the server answers with an error
    status, so that an error page is not mistaken for tweet data.
    '''
    print('downloading files')
    response = requests.get(zip_url)
    response.raise_for_status()
    # the temporary file object itself is handed to the reader, which keeps
    # it alive for as long as it is needed. re-opening a still-open
    # temporary file by name is not possible on every platform.
    tf = tempfile.TemporaryFile()
    tf.write(response.content)
    tf.seek(0)
    if zipfile.is_zipfile(tf):
        return (zipfile.ZipFile(tf), 'zipped')
    # is_zipfile() moved the file position, so rewind before reading text
    tf.seek(0)
    # decode as UTF-8 rather than whatever the locale default is
    # NOTE(review): assumes the tweet data is UTF-8 encoded - confirm
    return (io.TextIOWrapper(tf, encoding='utf-8'), 'json')


def read_old_zip_archive(zf):
    '''
    read an archive that is organised around data/js/tweet_index.js.

    zf is an open zipfile.ZipFile. the index lists one 'file_name' per
    data file; each of those files is parsed and turned into its own
    dataframe via create_dataframe().
    returns the list of dataframes, in index order.
    '''
    def read_without_first_line(member):
        # the first line of each file is dropped before parsing
        # (presumably the JavaScript variable assignment - TODO confirm).
        # decode as UTF-8 explicitly instead of using the locale default
        with zf.open(member, 'r') as raw:
            lines = io.TextIOWrapper(raw, encoding='utf-8').readlines()
        return "".join(lines[1:])

    # the opening "[{" is re-added in front of the remaining index lines
    # (apparently it sits on the dropped first line)
    json_files = json.loads(
        "[{" + read_without_first_line('data/js/tweet_index.js'))
    data_frames = []
    print('iterate over individual files')
    for single_file in json_files:
        print('read ' + single_file['file_name'])
        tweets = json.loads(read_without_first_line(single_file['file_name']))
        data_frames.append(create_dataframe(tweets))
    return data_frames


def read_files(zf, filetype):
    '''
    turn a downloaded/opened archive into a list of dataframes.

    filetype 'zipped': zf is a zipfile.ZipFile. an archive containing
    data/js/tweet_index.js is delegated to read_old_zip_archive();
    otherwise tweet.js is read from it.
    filetype 'json': zf is an open text file with the content of a
    tweet.js file.

    returns a list of dataframes (a single one for tweet.js data).
    raises ValueError for an unknown filetype, for a zip archive that
    holds neither of the two expected files and for data without a
    JSON array in it.
    '''
    if filetype == 'zipped':
        members = zf.namelist()
        if 'data/js/tweet_index.js' in members:
            print('reading index')
            return read_old_zip_archive(zf)
        if 'tweet.js' not in members:
            raise ValueError(
                'archive contains neither data/js/tweet_index.js '
                'nor tweet.js')
        with zf.open('tweet.js') as f:
            # decode as UTF-8 explicitly instead of the locale default
            text_file = io.TextIOWrapper(f, encoding='utf-8')
            lines = text_file.readlines()
    elif filetype == 'json':
        lines = zf.readlines()
    else:
        raise ValueError('unknown filetype: %r' % (filetype,))

    tweet_string = "".join(line.strip() for line in lines)
    # everything in front of the first "[" is the JavaScript assignment
    # (25 characters for "window.YTD.tweet.part0 = "); locating the
    # bracket also copes with other prefixes or none at all
    array_start = tweet_string.find('[')
    if array_start == -1:
        raise ValueError('no JSON array found in tweet data')
    # parse from memory: ijson reads bytes, no temporary file required
    json_bytes = io.BytesIO(tweet_string[array_start:].encode('utf-8'))
    tweets = ijson.items(json_bytes, 'item')
    return [create_dataframe(tweets)]


def create_main_dataframe(zip_url='http://ruleofthirds.de/test_archive.zip'):
    '''
    build the complete tweet dataframe for one archive.

    zip_url is either a web address (anything starting with 'http') or
    the path of a zip file on disk; anything else raises ValueError.
    the per-file dataframes are concatenated, sorted by utc_time (newest
    first) and indexed by utc_time. in the url, hashtag and media
    columns False is replaced by None.
    '''
    is_remote = zip_url.startswith('http')
    if not is_remote and not os.path.isfile(zip_url):
        raise ValueError('zip_url is not an URL nor a file in disk')

    if is_remote:
        print('reading zip file from web')
        archive, archive_type = fetch_zip_file(zip_url)
    else:
        print('reading zip file from disk')
        archive, archive_type = zipfile.ZipFile(zip_url), 'zipped'

    frames = read_files(archive, archive_type)
    print('concatenating...')
    combined = pd.concat(frames)
    combined = combined.sort_values('utc_time', ascending=False)
    combined = combined.set_index('utc_time')
    false_to_none = {
        column: {False: None} for column in ('url', 'hashtag', 'media')}
    return combined.replace(to_replace=false_to_none)