World Cup Tweet Sentiment Analysis

Background

Sentiment analysis is a topic that has fascinated me for some time. Examples like the analysis of @realDonaldTrump's tweets, classifying them as positive or negative, captivated me. To perform my own simple analysis, I collected tweets during the England v Croatia World Cup semi-final and worked through the steps of pre-processing, processing, and sentiment analysis.

The tweets were collected by a Python script I ran on an AWS instance. I used the tweepy API to collect 376 records in a JSON file and then used that file for the following steps.

import os, json, pprint
import pandas as pd

from nltk.corpus import stopwords
from nltk import sent_tokenize

# Show full tweet text in DataFrame output instead of eliding long cells.
# None means "no limit"; the older -1 sentinel was deprecated in pandas 1.0
# and is rejected by later releases. The bare read of
# pd.options.display.max_rows that used to sit here had no effect and was
# dropped.
pd.set_option('display.max_colwidth', None)

# Read the capture file, parsing each line as its own JSON document.
tweet_data = []
# The context manager closes the file even if reading fails part-way.
# NOTE(review): assumes the capture file is UTF-8 encoded - confirm.
with open('world_cup_tweets.txt', 'r', encoding='utf-8') as tweet_file:
    for line in tweet_file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: skip blank or malformed lines rather than abort,
            # but let unrelated errors (e.g. KeyboardInterrupt) propagate.
            continue
        tweet_data.append(tweet)

len(tweet_data)

# Build one row per parsed tweet; columns come from the keys of the parsed
# JSON objects.
df = pd.DataFrame(tweet_data)
row_count, column_count = df.shape
print('There are', row_count, ' rows and', column_count, ' columns in this dataset')
df.columns

There are 376  rows and 37  columns in this dataset





Index(['contributors', 'coordinates', 'created_at', 'display_text_range',
       'entities', 'extended_entities', 'extended_tweet', 'favorite_count',
       'favorited', 'filter_level', 'geo', 'id', 'id_str',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'limit', 'place',
       'possibly_sensitive', 'quote_count', 'quoted_status',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink',
       'reply_count', 'retweet_count', 'retweeted', 'retweeted_status',
       'source', 'text', 'timestamp_ms', 'truncated', 'user'],
      dtype='object')

Drop unnecessary columns

There are a significant number of attributes in this data set that will not be used. This step removes the unnecessary columns.

# Columns that are dropped in place below because the analysis does not use them.
unused_columns = [
    'contributors', 'coordinates', 'extended_tweet', 'geo', 'created_at',
    'display_text_range', 'entities', 'extended_entities', 'favorite_count',
    'favorited', 'filter_level', 'id', 'id_str',
    'in_reply_to_screen_name', 'in_reply_to_status_id',
    'in_reply_to_status_id_str', 'in_reply_to_user_id',
    'in_reply_to_user_id_str', 'is_quote_status', 'limit',
    'possibly_sensitive', 'quote_count', 'quoted_status',
    'quoted_status_id', 'quoted_status_id_str', 'quoted_status_permalink',
    'reply_count', 'retweet_count', 'retweeted', 'retweeted_status',
    'source', 'timestamp_ms', 'truncated', 'user',
]
df.drop(columns=unused_columns, inplace=True)
df.shape

(376, 3)

df.dtypes

lang     object
place    object
text     object
dtype: object

Fill in NaNs

There are rows where the text value is NaN, and that causes the pre-processing steps to fail with a data type error. Perhaps I could have filtered those rows out, but I chose to use fillna to replace the missing values with empty strings.

# Replace missing tweet text with an empty string so the string-based
# pre-processing steps do not hit non-string values.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df.text` is a chained in-place update, which pandas warns about and which
# does not modify `df` when Copy-on-Write is active.
df['text'] = df['text'].fillna("")
df.text.dtype

dtype('O')

#df['cleaned'] = df['text'].str.replace('[^\w\s]', '')
#from nltk.corpus import stopwords
#stop = stopwords.words('english')
#df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
#df['text'].head()

def _space_separated_count(value):
    """Return how many pieces `value` splits into on single space characters."""
    pieces = str(value).split(" ")
    return len(pieces)


# Only the literal space is a separator here, so words joined by newlines
# count as one piece.
df['word_count'] = df['text'].apply(_space_separated_count)
df.head(10)

	lang	place	text	word_count
0	en	None	This. https://t.co/UNKep8L5EY	2
1	en	None	Wanna see England in the final but Croatia is clearly the better side\n\n#ENGCRO #WorldCup	14
2	en	None	C’mon #England we really can do this. Dig deep. #EnglandvsCroatia #ThreeLionsOnAShirt	11
3	en	None	RT @FlickSaudi: If England lose This is How The Streets Of London Will Look https://t.co/V8AT2RRn5p	15
4	en	None	RT @hermannkelly: .@TonightShowTV3 I’m into the breach on the TV3 Tonight Show, Wednesday 11th July at 11pm. #WorldCup⁠ ⁠⁠ ⁠ \nAs we already…	23
5	th	None	RT @mthai: เกมส์ยังไม่จบ... ต่อเวลาพิเศษ 30 นาที\n\nโครเอเชีย 🇭🇷 1 : 1 🏴󠁧󠁢󠁥󠁮󠁧󠁿 อังกฤษ\n เปริซิช 68' ⚽…	47
6	en	None	William made an England flag at nursery today. I hope he gets the need for it on Sunday. #ENGvCRO… https://t.co/Cjf76wWtPG	20
7	en	None	This is why england can never win d world cup again argue with ur ancestors #ENGCRO	16
8	en	None	RT @PurelyFootball: Hyde Park when Kieran Trippier scored for England🏴󠁧󠁢󠁥󠁮󠁧󠁿\n\nAbsolute scenes!🍻 https://t.co/SLb6dFcG0g	12
9	en	None	Honestly? I’m ashamed about the protests in England... I’m ashamed we aren’t as organized and doing as much protest… https://t.co/JBLcWtfgsX	20

Punctuation removal

import string
# Strip every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because recent pandas versions treat the
# pattern as a literal string by default, which would leave the text
# unchanged; the raw string keeps \w and \s from being read as string escapes.
df['no_punctuation'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df.head()

	lang	place	text	word_count	no_punctuation
0	en	None	This. https://t.co/UNKep8L5EY	2	This httpstcoUNKep8L5EY
1	en	None	Wanna see England in the final but Croatia is clearly the better side\n\n#ENGCRO #WorldCup	14	Wanna see England in the final but Croatia is clearly the better side\n\nENGCRO WorldCup
2	en	None	C’mon #England we really can do this. Dig deep. #EnglandvsCroatia #ThreeLionsOnAShirt	11	Cmon England we really can do this Dig deep EnglandvsCroatia ThreeLionsOnAShirt
3	en	None	RT @FlickSaudi: If England lose This is How The Streets Of London Will Look https://t.co/V8AT2RRn5p	15	RT FlickSaudi If England lose This is How The Streets Of London Will Look httpstcoV8AT2RRn5p
4	en	None	RT @hermannkelly: .@TonightShowTV3 I’m into the breach on the TV3 Tonight Show, Wednesday 11th July at 11pm. #WorldCup⁠ ⁠⁠ ⁠ \nAs we already…	23	RT hermannkelly TonightShowTV3 Im into the breach on the TV3 Tonight Show Wednesday 11th July at 11pm WorldCup \nAs we already

from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer
# Tokenizer instance created once at module level; tweet_cleaner uses it to
# split the lower-cased text into tokens before re-joining them.
tok = WordPunctTokenizer()
# @mentions. The underscore is part of the character class so that a handle
# such as "@Aylo_SA" is removed whole; without it the match stops at "_" and
# the remainder of the handle is left behind in the text.
pat1 = r'@[A-Za-z0-9_]+'
# http:// and https:// links made of letters, digits, dots and slashes.
pat2 = r'https?://[A-Za-z0-9./]+'
# One alternation so mentions and links are removed in a single re.sub pass.
combined_pat = r'|'.join((pat1, pat2))



def tweet_cleaner(text):
    """Return a lower-cased, letters-only version of a tweet.

    Steps, in order:
      1. Parse the text with BeautifulSoup (lxml parser) and keep only the
         extracted text.
      2. Remove everything matched by ``combined_pat`` (mentions and links).
      3. Replace every character outside a-z / A-Z with a space.
      4. Lower-case, tokenize with ``tok`` and re-join with single spaces so
         the runs of spaces created in step 3 collapse.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned text as a single space-separated string; empty if no
        letters remain.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    stripped = re.sub(combined_pat, '', souped)
    # The previous version called stripped.decode(...) inside try/except and
    # ran the remaining steps only in the except branch, so a successful
    # decode would have ended in a NameError on `words`. `stripped` is the
    # result of re.sub on get_text() output, and every non-letter is blanked
    # below, so the steps now run unconditionally.
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    lower_case = letters_only.lower()
    # Tokenizing and re-joining removes the unnecessary white space left by
    # the letters-only substitution.
    words = tok.tokenize(lower_case)
    return " ".join(words).strip()


# Run every raw tweet through tweet_cleaner and store the results alongside
# the original text.
result = [tweet_cleaner(raw_text) for raw_text in df.text]
df['cleaned'] = result

df.head(20)

	lang	place	text	word_count	no_punctuation	cleaned
0	en	None	This. https://t.co/UNKep8L5EY	2	This httpstcoUNKep8L5EY	this
1	en	None	Wanna see England in the final but Croatia is clearly the better side\n\n#ENGCRO #WorldCup	14	Wanna see England in the final but Croatia is clearly the better side\n\nENGCRO WorldCup	wanna see england in the final but croatia is clearly the better side engcro worldcup
2	en	None	C’mon #England we really can do this. Dig deep. #EnglandvsCroatia #ThreeLionsOnAShirt	11	Cmon England we really can do this Dig deep EnglandvsCroatia ThreeLionsOnAShirt	c mon england we really can do this dig deep englandvscroatia threelionsonashirt
3	en	None	RT @FlickSaudi: If England lose This is How The Streets Of London Will Look https://t.co/V8AT2RRn5p	15	RT FlickSaudi If England lose This is How The Streets Of London Will Look httpstcoV8AT2RRn5p	rt if england lose this is how the streets of london will look
4	en	None	RT @hermannkelly: .@TonightShowTV3 I’m into the breach on the TV3 Tonight Show, Wednesday 11th July at 11pm. #WorldCup⁠ ⁠⁠ ⁠ \nAs we already…	23	RT hermannkelly TonightShowTV3 Im into the breach on the TV3 Tonight Show Wednesday 11th July at 11pm WorldCup \nAs we already	rt i m into the breach on the tv tonight show wednesday th july at pm worldcup as we already
5	th	None	RT @mthai: เกมส์ยังไม่จบ... ต่อเวลาพิเศษ 30 นาที\n\nโครเอเชีย 🇭🇷 1 : 1 🏴󠁧󠁢󠁥󠁮󠁧󠁿 อังกฤษ\n เปริซิช 68' ⚽…	47	RT mthai เกมสยงไมจบ ตอเวลาพเศษ 30 นาท\n\nโครเอเชย 1 1 องกฤษ\n เปรซช 68	rt
6	en	None	William made an England flag at nursery today. I hope he gets the need for it on Sunday. #ENGvCRO… https://t.co/Cjf76wWtPG	20	William made an England flag at nursery today I hope he gets the need for it on Sunday ENGvCRO httpstcoCjf76wWtPG	william made an england flag at nursery today i hope he gets the need for it on sunday engvcro
7	en	None	This is why england can never win d world cup again argue with ur ancestors #ENGCRO	16	This is why england can never win d world cup again argue with ur ancestors ENGCRO	this is why england can never win d world cup again argue with ur ancestors engcro
8	en	None	RT @PurelyFootball: Hyde Park when Kieran Trippier scored for England🏴󠁧󠁢󠁥󠁮󠁧󠁿\n\nAbsolute scenes!🍻 https://t.co/SLb6dFcG0g	12	RT PurelyFootball Hyde Park when Kieran Trippier scored for England\n\nAbsolute scenes httpstcoSLb6dFcG0g	rt hyde park when kieran trippier scored for england absolute scenes
9	en	None	Honestly? I’m ashamed about the protests in England... I’m ashamed we aren’t as organized and doing as much protest… https://t.co/JBLcWtfgsX	20	Honestly Im ashamed about the protests in England Im ashamed we arent as organized and doing as much protest httpstcoJBLcWtfgsX	honestly i m ashamed about the protests in england i m ashamed we aren t as organized and doing as much protest
10	en	None	RT @SavageLord10: Snap back thou this an old video guyz help me go viral tag @Aylo_SA @tloucolt @shelm_eric @flickice #imsorrychallenge #M…	22	RT SavageLord10 Snap back thou this an old video guyz help me go viral tag Aylo_SA tloucolt shelm_eric flickice imsorrychallenge M	rt snap back thou this an old video guyz help me go viral tag sa eric imsorrychallenge m
11	en	None	RT @petertimmins3: If Croatia win by cheating tonight, I expect at least 17million people to respect the result. Especially you, @JuliaHB1	21	RT petertimmins3 If Croatia win by cheating tonight I expect at least 17million people to respect the result Especially you JuliaHB1	rt if croatia win by cheating tonight i expect at least million people to respect the result especially you
12	en	None	RT @KEEMSTAR: England wins \n\nI saw it in a dream	10	RT KEEMSTAR England wins \n\nI saw it in a dream	rt england wins i saw it in a dream
13	en	None	We’re going a bit Spursy here England 🙈	8	Were going a bit Spursy here England	we re going a bit spursy here england
14	en	None	RT @Predictionhq: correct score 1-1✔️\nCroatia over 0.5 Team Goals ✔️\nCroatia over 3.5 corners ✔️\nOver 1.5 FT goals \n100% Record https://t.c…	20	RT Predictionhq correct score 11\nCroatia over 05 Team Goals \nCroatia over 35 corners \nOver 15 FT goals \n100 Record httpstc	rt correct score croatia over team goals croatia over corners over ft goals record
15	en	None	RT @alexandramusic: Literally can’t cope. COME ON ENGLAND !!! 🏴󠁧󠁢󠁥󠁮󠁧󠁿	10	RT alexandramusic Literally cant cope COME ON ENGLAND	rt literally can t cope come on england
16	en	None	Genuinely feel sick right now #ENGCRO #WorldCup	7	Genuinely feel sick right now ENGCRO WorldCup	genuinely feel sick right now engcro worldcup
17	en	None	Okay the pace seems to be slightly better for #ENGCRO - not sure it's going to be enough at this stage though. Typi… https://t.co/CXsEmkrss4	24	Okay the pace seems to be slightly better for ENGCRO not sure its going to be enough at this stage though Typi httpstcoCXsEmkrss4	okay the pace seems to be slightly better for engcro not sure it s going to be enough at this stage though typi
18	en	None	Ok extra time let’s wrap this up and return England to their natural state; moaning into their pints… https://t.co/8qQLHyjHxN	19	Ok extra time lets wrap this up and return England to their natural state moaning into their pints httpstco8qQLHyjHxN	ok extra time let s wrap this up and return england to their natural state moaning into their pints
19	en	None	I’d love some sideline reporting from @BarstoolBigCat in the next World Cup	12	Id love some sideline reporting from BarstoolBigCat in the next World Cup	i d love some sideline reporting from in the next world cup

Stopwords Removal

#stop = set(stopwords.words('english'))
#df['final'] = df['cleaned'].apply(lambda x: [item for item in x if item not in stop])
#df['text'].apply(lambda x: [item for item in x if item not in stop])

Sentiment Analysis

TextBlob's sentiment property is used to detect the sentiment of each cleaned tweet. It returns a tuple of polarity and subjectivity. The polarity indicates the sentiment: a positive sentiment has a value closer to 1, while a negative sentiment has a value closer to -1.

from textblob import TextBlob
# First 100 cleaned tweets; only referenced by the disabled experiments below.
train = df.cleaned[:100]
#classifier = NaiveBayesClassifier(train, format=None)

#train.apply(lambda x: TextBlob(x).sentiment)

# Element 0 of the sentiment tuple is the polarity score.
df['sentiment'] = df['cleaned'].apply(lambda x: TextBlob(x).sentiment[0])

# Bucket each tweet by the sign of its polarity; exactly zero is neutral.
df['positive'] = df['sentiment'] > 0
df['negative'] = df['sentiment'] < 0
df['neutral'] = df['sentiment'] == 0

# Label order must match the order of the counts passed to DataFrame below.
# ('Positive' was previously misspelled, which showed up in the table and chart.)
labels = ['Negative', 'Positive', 'Neutral']

# Summing a boolean column counts its True values.
df_summary = pd.DataFrame([df['negative'].sum(), df['positive'].sum(), df['neutral'].sum()], index=labels)
df_summary

	0
Negative	47
Postive	124
Neutral	205

from matplotlib import pyplot as plt
%matplotlib inline
# Default figure size (width, height in inches) for the plots that follow.
fig_size = [12, 9]
plt.rcParams["figure.figsize"] = fig_size

# Bar chart of the negative / positive / neutral counts; the trailing
# semicolon suppresses the notebook's echo of the return value.
df_summary.plot(kind='bar');

png

# Resize the current figure to 12 x 9 inches before drawing on it.
fig = plt.gcf()
fig.set_size_inches((12, 9))
from matplotlib import pyplot as plt
# One point per tweet: row index on the x axis, polarity on the y axis.
plt.scatter(x=df.index.values, y=df['sentiment']);

png

from wordcloud import WordCloud, STOPWORDS

# Kept under its own name so the `stopwords` corpus imported from nltk at the
# top of the file is not rebound to a plain set.
cloud_stopwords = set(STOPWORDS)

# Join every cleaned tweet into one string. str(df['cleaned']) would pass the
# Series' printed representation instead: row labels, the "..." marker pandas
# inserts when a long Series is truncated for display, and the
# "Name: ..., dtype: ..." footer - i.e. only a handful of tweets plus noise.
all_cleaned_text = " ".join(df['cleaned'])

wordcloud = WordCloud(
    background_color='white',
    stopwords=cloud_stopwords,
    max_words=200,
    max_font_size=40,
    random_state=42,  # fixed seed so the layout is reproducible between runs
).generate(all_cleaned_text)


print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show();

<wordcloud.wordcloud.WordCloud object at 0x111d36be0>

png

blogroll

social