Details for twitter-sentiment.ipynb

Published by gedankenstuecke

Description

This notebook uses data from a Twitter archive to perform a simple sentiment analysis and to explore emoji usage over time using Python.

0

Tags & Data Sources

sentiment analysis sentiment Twitter Archive Analyzer

Comments

Please log in to comment.

Notebook
Last updated 2 months, 3 weeks ago

Doing simple sentiment analyses on a Twitter archive

Here we explore how the Personal Data Notebooks can be used to get additional information out of a full Twitter archive. To use this notebook you need to have uploaded a Twitter archive into your Open Humans account through http://twarxiv.org.

In a first step we again start by loading all the needed modules.

After this follows a huge list of function declarations with def function():. Let's take these for granted for now. I basically copied & pasted these from the code of the Twitter Archive Analyzer. I only removed the local time zone calculations, as we won't need these to get started and installing them would take a rather long time.

In [1]:
# LOAD OUR MODULES 
%matplotlib inline

import os, requests, json
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tempfile
import zipfile
import pytz
import io
import sys
from textblob import TextBlob
import emoji
from ohapi import api

# THIS CODE BELOW IS COPIED FROM TWARXIV.ORG AS IT ALREADY DOES EXACTLY WHAT WE WANT FOR READING IN THE DATA

# READ JSON FILES FROM TWITTER ARCHIVE!

def check_hashtag(single_tweet):
    '''tell whether the tweet's entities list at least one hashtag'''
    hashtags = single_tweet['entities']['hashtags']
    return len(hashtags) != 0


def check_media(single_tweet):
    '''tell whether the tweet's entities list at least one media item'''
    attached_media = single_tweet['entities']['media']
    return len(attached_media) >= 1


def check_url(single_tweet):
    '''tell whether the tweet's entities list at least one url'''
    attached_urls = single_tweet['entities']['urls']
    return 0 < len(attached_urls)


def check_retweet(single_tweet):
    '''
    report who was retweeted, if the tweet is a retweet.

    returns a (screen_name, name) tuple taken from the user of the
    tweet's 'retweeted_status' entry, or (None, None) when the tweet
    has no such entry.
    '''
    if 'retweeted_status' not in single_tweet:
        return (None, None)
    original_author = single_tweet['retweeted_status']['user']
    return (original_author['screen_name'], original_author['name'])


def check_coordinates(single_tweet):
    '''
    fetch the coordinates stored in the tweet's 'geo' entry.

    returns the first two values of geo['coordinates'] as a tuple,
    or (None, None) when 'geo' has no 'coordinates' key.
    '''
    geo = single_tweet['geo']
    if 'coordinates' not in geo.keys():
        return (None, None)
    point = geo['coordinates']
    return (point[0], point[1])


def check_reply_to(single_tweet):
    '''
    report who the tweet replies to, if it is a reply.

    returns a (screen_name, name) tuple. the screen name is the tweet's
    'in_reply_to_screen_name'; the name is looked up in the tweet's user
    mentions and stays None when no mention has that screen name.
    without an 'in_reply_to_screen_name' key the result is (None, None).
    '''
    if 'in_reply_to_screen_name' not in single_tweet:
        return (None, None)
    replied_to = single_tweet['in_reply_to_screen_name']
    mentions = single_tweet['entities']['user_mentions']
    # lazily yields the names of matching mentions; only the first is used
    matching_names = (mention['name'] for mention in mentions
                      if mention['screen_name'] == replied_to)
    return (replied_to, next(matching_names, None))


def create_dataframe(tweets):
    '''
    turn a list of tweet dicts into a pandas dataframe.

    every tweet becomes one row. the columns hold the parsed creation
    time ('utc_time'), the coordinates, flags for hashtags/media/urls,
    the retweeted and replied-to users and the tweet text.
    '''
    # one list per output column, filled tweet by tweet
    columns = {
        'utc_time': [],
        'latitude': [],
        'longitude': [],
        'hashtag': [],
        'media': [],
        'url': [],
        'retweet_user_name': [],
        'retweet_name': [],
        'reply_user_name': [],
        'reply_name': [],
        'text': [],
    }
    for single_tweet in tweets:
        # 'created_at' carries its UTC offset, so the result is timezone-aware
        created = datetime.datetime.strptime(single_tweet['created_at'],
                                             '%Y-%m-%d %H:%M:%S %z')
        columns['utc_time'].append(created)

        # first coordinate is stored as latitude, second as longitude
        first_coordinate, second_coordinate = check_coordinates(single_tweet)
        columns['latitude'].append(first_coordinate)
        columns['longitude'].append(second_coordinate)

        columns['hashtag'].append(check_hashtag(single_tweet))
        columns['media'].append(check_media(single_tweet))
        columns['url'].append(check_url(single_tweet))

        retweeted_screen_name, retweeted_name = check_retweet(single_tweet)
        columns['retweet_user_name'].append(retweeted_screen_name)
        columns['retweet_name'].append(retweeted_name)

        replied_screen_name, replied_name = check_reply_to(single_tweet)
        columns['reply_user_name'].append(replied_screen_name)
        columns['reply_name'].append(replied_name)

        columns['text'].append(single_tweet['text'])
    return pd.DataFrame(data=columns)


def read_files(zip_url):
    '''
    download a zipped Twitter archive and parse its tweet files.

    zip_url: URL the zip file is downloaded from.

    returns a list with one pandas dataframe (built by create_dataframe)
    for every file listed in the archive's data/js/tweet_index.js.

    raises requests.HTTPError when the server answers with an error status.
    '''
    print('downloading files')
    response = requests.get(zip_url)
    # fail with the HTTP status right away instead of later with a
    # confusing "not a zip file" error caused by an error page
    response.raise_for_status()

    data_frames = []
    # the download is already completely in memory, so the archive is opened
    # straight from these bytes; the with-blocks guarantee that the archive
    # and every member file are closed again
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        print('reading index')
        # decode as UTF-8 explicitly rather than with the locale's default
        with io.TextIOWrapper(zf.open('data/js/tweet_index.js', 'r'),
                              encoding='utf-8') as f:
            # the first line is dropped and "[{" put in its place so that
            # the remainder can be parsed as a JSON list
            d = "[{" + "".join(f.readlines()[1:])
        json_files = json.loads(d)

        print('iterate over individual files')
        for single_file in json_files:
            print('read ' + single_file['file_name'])
            with io.TextIOWrapper(zf.open(single_file['file_name']),
                                  encoding='utf-8') as f:
                # skip the first line, everything after it is parsed as JSON
                d = "".join(f.readlines()[1:])
            tweets = json.loads(d)
            data_frames.append(create_dataframe(tweets))
    return data_frames


def create_main_dataframe(zip_url='http://ruleofthirds.de/test_archive.zip'):
    '''
    build the single dataframe holding all tweets of an archive.

    the per-file dataframes from read_files are concatenated, sorted from
    newest to oldest and indexed by 'utc_time'. in the 'url', 'hashtag' and
    'media' columns False is replaced by None.
    '''
    print('reading files')
    monthly_frames = read_files(zip_url)
    print('concatenating...')
    false_to_none = {
        column: {False: None} for column in ('url', 'hashtag', 'media')
    }
    return (
        pd.concat(monthly_frames)
        .sort_values('utc_time', ascending=False)
        .set_index('utc_time')
        .replace(to_replace=false_to_none)
    )

Once our packages are installed and we declared all of our functions we can get started with requesting the data from Open Humans. Using our access_token which we get from os.environ.get('OH_ACCESS_TOKEN') we get a json object that contains all of our data sources.

We then loop over all of them to identify which is the zipped twitter archive and save the URL. Once that's done we can just call the master function create_main_dataframe that was declared above to create a pandas dataframe out of this.

In [2]:
user = api.exchange_oauth2_member(os.environ.get('OH_ACCESS_TOKEN'))

# take the download URL of the first data file that comes from the source
# "direct-sharing-70" (the zipped Twitter archive)
twitter_data_url = None
for entry in user['data']:
    if entry['source'] == "direct-sharing-70":
        twitter_data_url = entry['download_url']
        break
if twitter_data_url is None:
    # without this check a missing archive only surfaces as a NameError
    raise ValueError('no file from source "direct-sharing-70" found in this account')

twitter_data = create_main_dataframe(zip_url=twitter_data_url)
reading files
downloading files
reading index
iterate over individual files
read data/js/tweets/2018_05.js
read data/js/tweets/2018_04.js
read data/js/tweets/2018_03.js
read data/js/tweets/2018_02.js
read data/js/tweets/2018_01.js
read data/js/tweets/2017_12.js
read data/js/tweets/2017_11.js
read data/js/tweets/2017_10.js
read data/js/tweets/2017_09.js
read data/js/tweets/2017_08.js
read data/js/tweets/2017_07.js
read data/js/tweets/2017_06.js
read data/js/tweets/2017_05.js
read data/js/tweets/2017_04.js
read data/js/tweets/2017_03.js
read data/js/tweets/2017_02.js
read data/js/tweets/2017_01.js
read data/js/tweets/2016_12.js
read data/js/tweets/2016_11.js
read data/js/tweets/2016_10.js
read data/js/tweets/2016_09.js
read data/js/tweets/2016_08.js
read data/js/tweets/2016_07.js
read data/js/tweets/2016_06.js
read data/js/tweets/2016_05.js
read data/js/tweets/2016_04.js
read data/js/tweets/2016_03.js
read data/js/tweets/2016_02.js
read data/js/tweets/2016_01.js
read data/js/tweets/2015_12.js
read data/js/tweets/2015_11.js
read data/js/tweets/2015_10.js
read data/js/tweets/2015_09.js
read data/js/tweets/2015_08.js
read data/js/tweets/2015_07.js
read data/js/tweets/2015_06.js
read data/js/tweets/2015_05.js
read data/js/tweets/2015_04.js
read data/js/tweets/2015_03.js
read data/js/tweets/2015_02.js
read data/js/tweets/2015_01.js
read data/js/tweets/2014_12.js
read data/js/tweets/2014_11.js
read data/js/tweets/2014_10.js
read data/js/tweets/2014_09.js
read data/js/tweets/2014_08.js
read data/js/tweets/2014_07.js
read data/js/tweets/2014_06.js
read data/js/tweets/2014_05.js
read data/js/tweets/2014_04.js
read data/js/tweets/2014_03.js
read data/js/tweets/2014_02.js
read data/js/tweets/2014_01.js
read data/js/tweets/2013_12.js
read data/js/tweets/2013_11.js
read data/js/tweets/2013_10.js
read data/js/tweets/2013_09.js
read data/js/tweets/2013_08.js
read data/js/tweets/2013_07.js
read data/js/tweets/2013_06.js
read data/js/tweets/2013_05.js
read data/js/tweets/2013_04.js
read data/js/tweets/2013_03.js
read data/js/tweets/2013_02.js
read data/js/tweets/2013_01.js
read data/js/tweets/2012_12.js
read data/js/tweets/2012_11.js
read data/js/tweets/2012_10.js
read data/js/tweets/2012_09.js
read data/js/tweets/2012_08.js
read data/js/tweets/2012_07.js
read data/js/tweets/2012_06.js
read data/js/tweets/2012_05.js
read data/js/tweets/2012_04.js
read data/js/tweets/2012_03.js
read data/js/tweets/2012_02.js
read data/js/tweets/2012_01.js
read data/js/tweets/2011_12.js
read data/js/tweets/2011_11.js
read data/js/tweets/2011_10.js
read data/js/tweets/2011_09.js
read data/js/tweets/2011_08.js
read data/js/tweets/2011_07.js
read data/js/tweets/2011_06.js
read data/js/tweets/2011_05.js
read data/js/tweets/2011_04.js
read data/js/tweets/2011_03.js
read data/js/tweets/2011_02.js
read data/js/tweets/2011_01.js
read data/js/tweets/2010_12.js
read data/js/tweets/2010_11.js
read data/js/tweets/2010_10.js
read data/js/tweets/2010_09.js
read data/js/tweets/2010_08.js
read data/js/tweets/2010_07.js
read data/js/tweets/2010_06.js
read data/js/tweets/2010_05.js
read data/js/tweets/2010_04.js
read data/js/tweets/2010_03.js
read data/js/tweets/2010_02.js
read data/js/tweets/2010_01.js
read data/js/tweets/2009_12.js
read data/js/tweets/2009_11.js
read data/js/tweets/2009_10.js
read data/js/tweets/2009_09.js
read data/js/tweets/2009_08.js
read data/js/tweets/2009_07.js
read data/js/tweets/2009_06.js
read data/js/tweets/2009_05.js
read data/js/tweets/2009_04.js
read data/js/tweets/2009_03.js
read data/js/tweets/2009_02.js
read data/js/tweets/2009_01.js
read data/js/tweets/2008_12.js
read data/js/tweets/2008_11.js
read data/js/tweets/2008_10.js
read data/js/tweets/2008_09.js
read data/js/tweets/2008_08.js
read data/js/tweets/2008_07.js
read data/js/tweets/2008_06.js
read data/js/tweets/2008_05.js
read data/js/tweets/2008_04.js
concatenating...

Now we have a dataframe called twitter_data which contains a lot of metadata, along with all the tweets in the column twitter_data['text'].

In [3]:
# display the first rows of the dataframe as a quick sanity check
twitter_data.head()
Out[3]:
hashtag latitude longitude media reply_name reply_user_name retweet_name retweet_user_name text url
utc_time
2018-05-11 19:58:25+00:00 NaN NaN NaN NaN Mama Hörnchen ♿️ MamsellChaos None None @MamsellChaos oh, and https://t.co/c2N4g6BcUT ... 1.0
2018-05-11 19:38:09+00:00 1.0 NaN NaN NaN None None None None The collection of personal data analysis noteb... 1.0
2018-05-11 19:19:56+00:00 NaN NaN NaN NaN Mama Hörnchen ♿️ MamsellChaos None None @MamsellChaos und ich update dich sobald der i... NaN
2018-05-11 19:19:39+00:00 NaN NaN NaN NaN Mama Hörnchen ♿️ MamsellChaos None None @MamsellChaos fuer R kann ich https://t.co/n2H... 1.0
2018-05-11 19:10:14+00:00 NaN NaN NaN NaN Mama Hörnchen ♿️ MamsellChaos None None @MamsellChaos Ah, der Fitbit Data Import funkt... NaN

Let's now add the polarity and subjectivity to the tweets with textblob. polarity values range between +1 and -1, with +1 being extremely positive, -1 extremely negative and 0 being neutral. subjectivity values range between 0 and 1, with larger numbers meaning the text is more subjective.

We add these numbers to our dataframe and for a start just remove all the 0 values, as these can also indicate a lack of data/classifications.

In [4]:
# let TextBlob analyse every tweet text once and keep the result
twitter_data['blob'] = twitter_data['text'].apply(TextBlob)

# collect both sentiment scores per tweet
polarity = []
subjectivity = []
for entry in twitter_data['blob']:
    sentiment = entry.sentiment
    polarity.append(sentiment.polarity)
    subjectivity.append(sentiment.subjectivity)
twitter_data['polarity'] = polarity
twitter_data['subjectivity'] = subjectivity

# a score of exactly 0 is treated as "no classification available".
# only the two sentiment columns are touched, so zeros in any other
# column (e.g. a coordinate of 0) are left alone
for column in ('polarity', 'subjectivity'):
    twitter_data[column] = twitter_data[column].replace(0, np.nan)

Let's now aggregate these values per day instead of looking at individual tweets. For both polarity and subjectivity we calculate the daily maximum, minimum and mean values along with the standard deviation.

In a next step we further smooth out these values by applying a 30-day rolling average to remove the impact of daily fluctuations.

In [5]:
# only the two sentiment scores are aggregated. the other columns hold text
# and other non-numeric data; newer pandas versions raise an error for them
# instead of silently leaving them out of the result
daily_sentiment = twitter_data[['polarity', 'subjectivity']].groupby(
    twitter_data.index.date)

# get 30-day averaged standard deviations for polarity & subjectivity
twitter_std = daily_sentiment.std()
# rolling() with a time window such as '30d' needs a datetime index
twitter_std.index = pd.to_datetime(twitter_std.index)
twitter_std_rolling = twitter_std.rolling('30d').mean()

# get 30-day mean averaged daily means for polarity & subjectivity
twitter_mean = daily_sentiment.mean()
twitter_mean.index = pd.to_datetime(twitter_mean.index)
twitter_mean_rolling = twitter_mean.rolling('30d').mean()

# get 30-day averaged maximum values for polarity & subjectivity
twitter_max = daily_sentiment.max()
twitter_max.index = pd.to_datetime(twitter_max.index)
twitter_max_rolling = twitter_max.rolling('30d').mean()

# get 30-day averaged minimum for polarity & subjectivity
twitter_min = daily_sentiment.min()
twitter_min.index = pd.to_datetime(twitter_min.index)
twitter_min_rolling = twitter_min.rolling('30d').mean()

We can now pack all of this in two new data frames for the subsequent plotting.

In [6]:
# the four rolling-average frames, paired with the statistic they hold
rolling_frames = (
    ("max", twitter_max_rolling),
    ("mean", twitter_mean_rolling),
    ("min", twitter_min_rolling),
    ("std", twitter_std_rolling),
)

# one frame per score, with one column per statistic (e.g. "max_polarity")
polarity = pd.DataFrame(data={
    statistic + "_polarity": frame["polarity"]
    for statistic, frame in rolling_frames
})

subjectivity = pd.DataFrame(data={
    statistic + "_subjectivity": frame["subjectivity"]
    for statistic, frame in rolling_frames
})

Visualization

Let's start with the polarity. Looking at the mean it seems that my tweets overall seem to be pretty neutral. Looking at the maximum/minimum polarity we see that positive & negative polarity are more or less balanced. Put in other words: For each mean-spirited tweet there's one full of praise ;-)

In [7]:
# plot the rolling polarity statistics; the legend swaps the raw column
# names for readable labels, given in the same order as the columns in y
pt = polarity.plot(y=['max_polarity', 'mean_polarity', 'min_polarity', 'std_polarity'],
                   figsize=(15, 10), fontsize=14)
pt.legend(['Maximum Polarity', 'Mean Polarity', 'Minimum Polarity', 'Standard Deviation of Polarity'])
Out[7]:
<matplotlib.legend.Legend at 0x7f54214639e8>
In [8]:
# columns to draw and, in the same order, the labels shown in the legend
subjectivity_columns = ['max_subjectivity', 'mean_subjectivity',
                        'min_subjectivity', 'std_subjectivity']
subjectivity_labels = ['Maximum subjectivity', 'Mean subjectivity',
                       'Minimum subjectivity', 'Standard Deviation of subjectivity']

pt = subjectivity.plot(y=subjectivity_columns, figsize=(15, 10), fontsize=14)
pt.legend(subjectivity_labels)
Out[8]:
<matplotlib.legend.Legend at 0x7f53f61b7940>

Looking at the subjectivity is a bit more interesting: It seems my tweets have grown less subjective over time. There might be plenty of reasons for this: My active political career coming to a finish, growing a larger audience and thus tweeting more responsibly, or just plain growing old.

I guess we'll never know. Unless one of you has a good idea of how to use the Twitter archives to investigate this further. If you do: Hit me up on twitter @gedankenstuecke.

Emoji usage

Let's have a look into the emoji usage next. To do this we loaded the emoji package on top. This has a dictionary of (many) emoji, but not of all of them. Especially newer ones are bound to be absent, as are the multi-character flag emojis like 🇮🇱 🇭🇰 🇬🇷. But for a first look this list should be good enough. We write a small number_emoji function that, as expected, returns the number of emoji found in a single tweet. We can then apply this function to our twitter_data dataframe.

In [9]:
def number_emoji(row):
    '''
    count the emoji in the 'text' entry of a single row.

    the text is scanned character by character; a character counts as an
    emoji when it is found in emoji.UNICODE_EMOJI.
    '''
    # NOTE(review): the layout of UNICODE_EMOJI may depend on the installed
    # version of the emoji package - confirm it is a flat character lookup
    return sum(1 for character in row['text']
               if character in emoji.UNICODE_EMOJI)

# run the counter over every row (axis=1) and keep the result as a column
twitter_data['emoji_count'] = twitter_data.apply(number_emoji, axis=1)

We can now sum up over the number of emoji per day and then again apply a rolling average to minimize the influence of daily fluctuations. Once that's done we can make our plots 🎉

In [10]:
# sum up the emoji counter per day. only this one column is aggregated: the
# frame also holds text and other non-numeric columns, which newer pandas
# versions refuse to leave out silently when summing
twitter_emoji = twitter_data[['emoji_count']].groupby(twitter_data.index.date).sum()
# rolling() with a time window such as '90d' needs a datetime index
twitter_emoji.index = pd.to_datetime(twitter_emoji.index)
twitter_emoji_rolling = twitter_emoji.rolling('90d').mean()

pt = twitter_emoji_rolling.plot(y=['emoji_count'], figsize=(15, 10), fontsize=14)
pt.legend(['daily emoji count'])
Out[10]:
<matplotlib.legend.Legend at 0x7f53f5969f60>

Let's also have a look what the most common emoji are. We iterate over all the tweets, and count the occurrences of each emoji. Ultimately we only look at those that appear at least 5 times.

In [11]:
from collections import defaultdict

# count how often every single emoji character occurs across all tweets
emojis = defaultdict(int)
for tweet in twitter_data['text']:
    for character in tweet:
        if character in emoji.UNICODE_EMOJI:
            emojis[character] += 1

# (emoji, count) pairs, most frequent first
s = sorted(emojis.items(), key=lambda item: item[1], reverse=True)
for k, v in s:
    # the list is sorted, so stop at the first emoji used fewer than 5
    # times - and do so *before* printing it
    if v < 5:
        break
    print(k, v)
😂 593
😍 265
🎉 217
💖 207
👍 155
😉 153
✈ 146
😊 121
😱 76
🐶 75
™ 69
♀ 53
🙏 41
😢 40
☺ 37
🤷 35
😭 34
✔ 34
🤔 33
👋 28
❤ 27
😘 22
🚗 22
🍆 21
🍄 21
⭐ 19
☕ 19
💩 19
😇 18
🍾 18
😔 18
🔥 18
📚 17
📊 16
🍩 16
😎 16
😴 16
🌟 16
💃 16
✅ 15
🐦 15
🍻 15
🌈 15
👌 14
🐼 14
🔬 14
👏 13
© 13
🍪 12
😀 12
🎊 11
🎈 11
🍦 11
💘 11
💓 11
☀ 10
🤦 10
🍌 10
🍰 10
🎄 10
🐧 10
🐢 10
💕 10
♥ 10
💜 9
🤓 9
🏃 9
🍨 9
🐳 9
🎸 9
💉 9
👀 9
✨ 8
🍿 8
😳 8
🍺 8
📉 8
😒 8
😞 8
🏳 8
🍜 8
😜 8
💤 8
✊ 8
🔮 8
☔ 7
🌍 7
🦄 7
🤞 7
🐻 7
🐿 7
🌊 7
🚂 7
🐌 7
😛 7
🍧 7
🙄 6
🚨 6
👇 6
🐱 6
💰 6
🐍 6
😥 6
👨 6
👓 6
🐛 6
🚙 6
🌋 6
🐴 6
☑ 6
🤘 5
🙃 5
🐈 5
👩 5
🎁 5
🍫 5
🍸 5
📈 5
🍉 5
💥 5
💸 5
🔨 5
🐰 5
🌱 5
💍 5
😋 5
🌲 5
👉 5
😕 5
💨 5
⚡ 5
🎯 4

Using this list of most common emoji I manually categorized some of them into sub-categories to see how these categories vary over time:

In [12]:
# manually curated emoji categories, picked from the most common emoji;
# each list names the emoji that are counted towards that category
emoji_love = ["💖","😍","😘","🍆","🌈","💘","💓","💕","♥","💜"]
emoji_science = ['📉','🌱','🐌','🤓','🔬','📚','📊','🍄']
emoji_joy = ['😋','😛','😜','😀','😎','☺','😊','😉','👍','😂']
emoji_celebrate = ['☑','🍺','✨','🍾','🎈','🎊','✅','🍻','✔','🎉']
emoji_sad = ['😕','🤦','😥','😳','😒','😞','😔','💩','🔥','🤷','😭','😢','😱']
emoji_travel = ['🌍','🚙','🚂','🚗','✈']

We can now apply a small function with these sub-categories to get the number of emoji for each tweet/category combination:

In [13]:
def classify_number_emoji(row, classifier):
    '''
    count how many characters of a text belong to an emoji category.

    row: the text to scan, iterated character by character.
    classifier: container holding the emoji of the category.

    returns the number of matching characters; a character that occurs
    several times is counted every time.
    '''
    return sum(1 for character in row if character in classifier)

# category name -> emoji list; the name becomes the suffix of the new column
emoji_categories = (
    ('love', emoji_love),
    ('science', emoji_science),
    ('joy', emoji_joy),
    ('celebrate', emoji_celebrate),
    ('sad', emoji_sad),
    ('travel', emoji_travel),
)

# add one counter column per category, e.g. 'emoji_count_love'
for category, category_emoji in emoji_categories:
    twitter_data['emoji_count_' + category] = twitter_data['text'].apply(
        classify_number_emoji, args=(category_emoji,))

Let's get the daily sum of emoji for each of the categories and minimize fluctuations with a 90-day rolling average:

In [14]:
# sum up only the emoji counter columns per day. the frame also holds text
# and other non-numeric columns, which newer pandas versions refuse to
# leave out silently when summing
emoji_columns = [column for column in twitter_data.columns
                 if str(column).startswith('emoji_count')]
twitter_emoji = twitter_data[emoji_columns].groupby(twitter_data.index.date).sum()
# rolling() with a time window such as '90d' needs a datetime index
twitter_emoji.index = pd.to_datetime(twitter_emoji.index)
twitter_emoji_rolling = twitter_emoji.rolling('90d').mean()

Plotting the emoji usage we can see that I'm becoming a much more joyful person over time 😂 Though I should probably check on my loving tweets. 😉

In [15]:
# categories to draw; 'emoji_count_science' and 'emoji_count_travel' are
# deliberately left out of this plot
plotted_columns = [
    'emoji_count_love',
    'emoji_count_joy',
    'emoji_count_celebrate',
    'emoji_count_sad',
]

pt = twitter_emoji_rolling.plot(y=list(plotted_columns),
                                figsize=(15, 10),
                                fontsize=14,
                                xlim=["2015-01-01", "2018-03-01"])
# the legend entries are simply the column names
pt.legend(list(plotted_columns))
Out[15]:
<matplotlib.legend.Legend at 0x7f53f125dcc0>
In [ ]: