# Twitter account
from datetime import date
tw_user_name = "@SeguraAndres7"
print("Twitter account:", tw_user_name)
print("Analysis date:", date.today())
Twitter account: @SeguraAndres7 Analysis date: 2021-09-17
# Import util libraries
import tweepy
import random
import numpy as np
import pandas as pd
import yaml
import warnings
import calendar
import time
from PIL import Image
from collections import Counter
# Import NLP libraries
import re
import spacy.lang.es as es
import spacy.lang.en as en
from textblob import TextBlob
from wordcloud import WordCloud
# Import plot libraries
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')
# Util function - Read dict from yaml file
def get_dict_from_yaml(yaml_path):
    """Read a YAML file and return its parsed content as a dict.

    :param yaml_path: path of the YAML file to read.
    :return: dict with the file content.
    """
    # yaml.load accepts a file object directly — no need to read the
    # whole file into a string first, and the old `result = dict()`
    # initializer was dead code (always overwritten).
    with open(yaml_path) as f:
        result = yaml.load(f, Loader=yaml.FullLoader)
    return result
# Util function - Plot column chart
def plot_col_chart(df, figsize, x_var, y_var, title, color='green', legend=None, x_label=None):
    """Plot a vertical bar (column) chart of y_var against x_var from a dataframe."""
    fig, ax = plt.subplots()
    df.plot.bar(ax=ax, x=x_var, y=y_var, color=color, alpha=0.75, figsize=figsize)
    # Show the custom legend if one was given, otherwise drop the default one
    if legend:
        ax.legend(legend)
    else:
        ax.get_legend().remove()
    # Use explicit tick labels when provided, keeping the 45-degree rotation
    if x_label:
        positions = np.arange(len(x_label))
        plt.xticks(positions, x_label, rotation=45)
    else:
        plt.xticks(rotation=45)
    plt.title(title, fontsize=16)
    plt.xlabel(x_var.capitalize())
    plt.ylabel(y_var.capitalize())
    plt.show()
# Util function - Plot column chart
def plot_bar_chart(df, figsize, x_var, y_var, title, color='blue', legend=None):
    """Plot a horizontal bar chart of y_var values per x_var category.

    :param color: bar color (bug fix: this parameter was previously
                  accepted but never forwarded to the plot call).
    """
    fig, ax = plt.subplots()
    df.plot.barh(ax=ax, x=x_var, y=y_var, color=color, alpha=0.75, figsize=figsize)
    if legend:
        ax.legend(legend)
    else:
        ax.get_legend().remove()
    plt.title(title, fontsize=16)
    # Axes are swapped on a horizontal chart: values on x, categories on y
    plt.xlabel(y_var.capitalize())
    plt.ylabel(x_var.capitalize())
    plt.show()
# Util function - Get country from location
def get_country_from_loc(location):
    """Return the country from a free-text location (last comma-separated token)."""
    trimmed = location.strip()
    if not trimmed:
        return ''
    return trimmed.split(',')[-1].strip()
# Read twitter bot credentials
yaml_path = 'config/credentials.yml'
twt_login = get_dict_from_yaml(yaml_path)
# Setup bot credentials
consumer_key = twt_login['consumer_key']
consumer_secret = twt_login['consumer_secret']
access_token = twt_login['access_token']
access_token_secret = twt_login['access_token_secret']
# Authenticate to Twitter
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# Create API object
api = tweepy.API(auth)
# Verify the bot credentials. A bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and programming errors, so catch
# Exception and report the cause.
try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as e:
    print("Error during authentication")
    print('Error:', e)
Authentication OK
# Show user account details
user = api.get_user(screen_name=tw_user_name)
print("User details:")
print(user.name)
print(user.description)
print(user.location)
print(user.created_at)
User details: Andres Segura-Tinoco CS PhD student at @UAM_Madrid | My research interests are in: XAI, ML, RecSys, NLP, Argument Mining and Information Retrieval | Tweets in En & Es Bogotá, DC, Colombia 2010-09-06 04:35:36
# Show followers/Following ratio
flw_ratio = user.followers_count / user.friends_count
print('Influence ratio:', round(flw_ratio, 2))
Influence ratio: 7.78
# Util function - Fetch tweets list from a specific user
# Note: Twitter only allows access to a user's most recent 3240 tweets with this method
def get_all_tweets(api, screen_name):
    """Fetch the recent tweets of a user and return them as plain dicts.

    Note: Twitter only allows access to a user's most recent ~3240
    tweets with this method.

    :param api: authenticated tweepy.API object.
    :param screen_name: Twitter handle of the target account.
    :return: list of dicts with the relevant fields of each tweet.
    """
    all_tweets = []
    try:
        # Initial request for most recent tweets (200 is the maximum allowed count)
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended')
        all_tweets.extend(new_tweets)
        # Keep grabbing tweets until there are no tweets left to grab.
        # Guard with `new_tweets` so an empty timeline no longer raises
        # IndexError on all_tweets[-1].
        while new_tweets:
            # All subsequent requests use max_id (oldest id minus one) to prevent duplicates
            oldest = all_tweets[-1].id - 1
            new_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended', max_id=oldest)
            all_tweets.extend(new_tweets)
    except Exception as e:
        # Bug fix: the old except tuple referenced `socket` and `exceptions`
        # without importing them, so any error raised a NameError instead of
        # being handled. Catch broadly and keep whatever was fetched so far.
        print('Error:', e)
    # Transform the tweepy tweets into an array that contains the relevant fields of each tweet
    tweet_list = []
    for tweet in all_tweets:
        new_tweet = {
            'id': tweet.id_str,
            'created_at': tweet.created_at,
            'message': tweet.full_text,
            'lang': tweet.lang,
            'hashtags': [ht['text'] for ht in tweet.entities['hashtags']],
            'user_mentions': [mt['screen_name'] for mt in tweet.entities['user_mentions']],
            'retweet_count': tweet.retweet_count,
            'favorite_count': tweet.favorite_count,
            'retweeted': tweet.retweeted,
            'source': tweet.source,
            'display_text_range': tweet.display_text_range
        }
        tweet_list.append(new_tweet)
    return tweet_list
# Fetching tweet list from a specific user
raw_tweet_list = get_all_tweets(api, screen_name=tw_user_name)
len(raw_tweet_list)
1100
# Show the last tweet
raw_tweet_list[0]
{'id': '1438986291262197763', 'created_at': datetime.datetime(2021, 9, 17, 22, 0, 41), 'message': 'Hello everyone. What do you like most about Python 🐍?\n\n▪️ Its clean syntax?\n▪️ Its powerful libraries for Machine Learning and Data Science?\n▪️ The documentation available?\n▪️ Its large and active community?\n\nOr all of the above, as is my case 😁', 'lang': 'en', 'hashtags': [], 'user_mentions': [], 'retweet_count': 0, 'favorite_count': 0, 'retweeted': False, 'source': 'Twitter Web App', 'display_text_range': [0, 245]}
tweet_list = [tweet for tweet in raw_tweet_list if not tweet['retweeted'] and not tweet['message'].startswith("RT ")]
own_tweet_rate = len(tweet_list) / len(raw_tweet_list)
print("Own tweet rate: %.2f" % own_tweet_rate)
print("Total own tweets:", len(tweet_list))
Own tweet rate: 0.97 Total own tweets: 1064
# Get a list of all followers of a twitter account.
# Bug fix: rate-limit/network errors are raised while *iterating* the
# cursor, not inside `followers.extend(page)`, so the old try/except
# around extend() could never catch them. Pull pages with next() inside
# the try instead.
followers = []
pages = tweepy.Cursor(api.followers, screen_name=tw_user_name, wait_on_rate_limit=True, count=200).pages()
while True:
    try:
        page = next(pages)
    except StopIteration:
        break
    except tweepy.TweepError:
        # Back off briefly and retry on a Twitter API error
        time.sleep(10)
        continue
    followers.extend(page)
n_followers = len(followers)
print('Number of followers: %s' % n_followers)
Number of followers: 1820
# Get year the account was created
flw_creation = Counter()
for flw in followers:
created_at = flw.created_at.date()
year = created_at.year
flw_creation[year] += 1
# Cooking dataframe
df = pd.DataFrame.from_records(flw_creation.most_common(), columns = ['year', 'frequency']).sort_values(by=['year'])
# Plot creation of followers account
figsize = (12, 7)
x_var = 'year'
y_var = 'frequency'
title = 'Creation of followers account'
color = 'indigo'
plot_col_chart(df, figsize, x_var, y_var, title, color)
# Get followers location
flw_location = Counter()
for flw in followers:
location = flw.location.strip()
country = get_country_from_loc(location)
if country != '':
flw_location[country] += 1
loc_ratio = sum(flw_location.values()) / n_followers * 100
print('Number of followers with location:', round(loc_ratio, 2), '%')
Number of followers with location: 57.8 %
# Cooking dataframe
df = pd.DataFrame.from_records(flw_location.most_common(), columns=['location', 'frequency'])
df = df.sort_values(by=['frequency'], ascending=False)
df = df.loc[(df['frequency'] >= 2)]
df
location | frequency | |
---|---|---|
0 | India | 168 |
1 | Colombia | 28 |
2 | CA | 24 |
3 | Nigeria | 21 |
4 | Kenya | 16 |
... | ... | ... |
86 | Serbia | 2 |
83 | PA | 2 |
82 | Sudan | 2 |
81 | Switzerland | 2 |
88 | Cuba | 2 |
118 rows × 2 columns
# Plot top N mentioned accounts
x_var = 'location'
y_var = 'frequency'
title = 'Location of Followers'
figsize = (16, 7)
plot_col_chart(df.head(25), figsize, x_var, y_var, title)
# Get year the account was created
flw_following_count = []
flw_followers_count = []
for flw in followers:
flw_following_count.append(flw.friends_count)
flw_followers_count.append(flw.followers_count)
# Accounts followed per followers
print(">> Accounts followed per followers")
print('Average number: %.2f' % np.mean(flw_following_count))
print('Median number: %.2f' % np.median(flw_following_count))
>> Accounts followed per followers Average number: 1767.84 Median number: 573.00
# Number of followers of my followers
print(">> Number of followers of my followers")
print('Average number: %.2f' % np.mean(flw_followers_count))
print('Median number: %.2f' % np.median(flw_followers_count))
>> Number of followers of my followers Average number: 3748.61 Median number: 72.00
# Tweet length
n_tweets = len(tweet_list)
tweets_len = []
for tweet in tweet_list:
tweets_len.append(len(tweet['message']))
# Showing stats
print('Number of tweets: %s' % n_tweets)
print('Minimum length: %s' % np.min(tweets_len))
print('Maximum length: %s' % np.max(tweets_len))
print('Average length: %.2f' % np.mean(tweets_len))
print('Standard deviation: %.2f' % np.std(tweets_len))
Number of tweets: 1064 Minimum length: 15 Maximum length: 319 Average length: 204.07 Standard deviation: 71.71
# Plot histogram of the tweet length
fig = plt.figure(figsize = (12, 7))
plt.hist(tweets_len, 20, facecolor='green', alpha=0.75)
plt.title('Tweet Length Histogram', fontsize=16)
plt.xlabel('Tweet Length')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# Get retweet and favorite stats
retweet_count = []
favorite_count = []
for tweet in tweet_list:
retweet_count.append(tweet['retweet_count'])
favorite_count.append(tweet['favorite_count'])
# Showing statistics of account tweets
print('Retweets total: %s' % np.sum(retweet_count))
print('Retweets average: %.2f' % np.mean(retweet_count))
print('Favorites total: %s' % np.sum(favorite_count))
print('Favorites average: %.2f' % np.mean(favorite_count))
Retweets total: 1726 Retweets average: 1.62 Favorites total: 10783 Favorites average: 10.13
# Top 15 most popular tweets
df = pd.DataFrame(tweet_list)
df.drop('id', axis=1, inplace=True)
df.drop('source', axis=1, inplace=True)
df.drop('user_mentions', axis=1, inplace=True)
df.drop('retweeted', axis=1, inplace=True)
df.drop('display_text_range', axis=1, inplace=True)
df.sort_values(by=['retweet_count', 'favorite_count'], ascending=False).head(15)
created_at | message | lang | hashtags | retweet_count | favorite_count | |
---|---|---|---|---|---|---|
279 | 2021-07-07 20:23:53 | I have been working on 3 free, hands-on Python... | en | [] | 393 | 1594 |
88 | 2021-08-29 19:45:27 | Machine Learning Community:\n\nWhen you have t... | en | [] | 87 | 435 |
43 | 2021-09-08 20:40:34 | Today, we have in the tech world a lot of buzz... | en | [] | 60 | 307 |
534 | 2021-04-15 21:29:08 | Why is there such a difference in the results ... | en | [] | 59 | 308 |
434 | 2021-05-13 19:31:22 | Correlation is the degree and direction in whi... | en | [DataAnalytics] | 51 | 337 |
334 | 2021-06-19 19:00:15 | Let me share with you 3 definitions of what Ma... | en | [] | 47 | 277 |
291 | 2021-07-02 16:11:24 | Feature Scaling is one of the most useful and ... | en | [] | 47 | 234 |
441 | 2021-05-10 02:53:45 | 12 cool and useful NLP libraries🧵\n\n1. NLTK: ... | en | [] | 43 | 216 |
416 | 2021-05-17 19:38:09 | I am working on this new project 👨💻\n\nA prac... | en | [] | 38 | 144 |
238 | 2021-07-23 20:24:10 | It is amazing how much information can be obta... | en | [] | 35 | 191 |
541 | 2021-04-13 19:31:36 | Clustering is one of the most common applicati... | en | [] | 35 | 147 |
194 | 2021-08-04 19:22:18 | A common practice when we create NLP models is... | en | [] | 25 | 163 |
173 | 2021-08-08 14:00:14 | This is the main list of tools I use to work o... | en | [] | 23 | 232 |
162 | 2021-08-09 22:39:45 | How cool is @TensorFlow Embedding Projector!\n... | en | [] | 23 | 174 |
550 | 2021-04-07 15:48:26 | The Confusion Matrix is a performance measure ... | en | [] | 23 | 129 |
# Read iso639_2 language codes.
# Bug fix: the old path 'config\iso_lang_codes.csv' contained a literal
# backslash and only worked on Windows; use a forward slash like the
# credentials path above ('config/credentials.yml').
filename = 'config/iso_lang_codes.csv'
df = pd.read_csv(filename, names=['iso_code', 'language'], skiprows=1)
# Map iso_code -> language name for quick lookups
lang_dict = dict(zip(list(df.iso_code), list(df.language)))
print('Languages dict length:', len(lang_dict))
Languages dict length: 185
# Count the language of the tweets
lang_list = Counter()
for tweet in tweet_list:
tweet_lang = tweet['lang']
if tweet_lang in lang_dict.keys():
lang = lang_dict[tweet['lang']]
lang_list[lang] += 1
else:
print('Missing:', tweet_lang)
print('Total tweets language: %s' % len(lang_list))
Total tweets language: 3
# Cooking dataframe
df = pd.DataFrame.from_records(lang_list.most_common(), columns = ['language', 'frequency'])
# Plot the language of the tweets
x_var = 'language'
y_var = 'frequency'
title = 'Tweets by Languages'
figsize = (7, 7)
plot_col_chart(df, figsize, x_var, y_var, title)
# Count the language of the tweets
tweets_year = Counter()
tweets_day = Counter()
for tweet in tweet_list:
created_at = tweet['created_at'].date()
weekday = created_at.weekday()
year = created_at.year
tweets_day[weekday] += 1
tweets_year[year] += 1
# Cooking dataframe
df = pd.DataFrame.from_records(list(tweets_day.items()), columns = ['weekday', 'frequency']).sort_values(by=['weekday'])
x = np.arange(7)
x_label = [calendar.day_name[d] for d in x]
# Plot tweets by weekday
x_var = 'weekday'
y_var = 'frequency'
title = 'Tweets by Weekday'
figsize = (12, 7)
color = 'green'
x_label = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plot_col_chart(df, figsize, x_var, y_var, title, color, None, x_label)
# Cooking dataframe
df = pd.DataFrame.from_records(tweets_year.most_common(), columns = ['year', 'frequency']).sort_values(by=['year'])
# Plot tweets by year
x_var = 'year'
y_var = 'frequency'
title = 'Tweets by Year'
figsize = (12, 7)
plot_col_chart(df, figsize, x_var, y_var, title)
# Count the source (client application) of the tweets
tweets_source = Counter()
for tweet in tweet_list:
    source = tweet['source']
    tweets_source[source] += 1
print('Total tweeting sources: %s' % len(tweets_source))
Total tweeting sources: 7
# Cooking dataframe
df = pd.DataFrame.from_records(tweets_source.most_common(), columns = ['source', 'frequency'])
# Plot tweeting Sources
x_var = 'source'
y_var = 'frequency'
title = 'Tweets by Sources'
figsize = (12, 7)
plot_col_chart(df, figsize, x_var, y_var, title)
# Util function - Clean tweet text
def dq_clean_text(text):
    """Lower-case a tweet and collapse punctuation, dot runs and whitespace into single spaces."""
    # NOTE: relies on the module-level pattern `rx` (punctuation character
    # class built from punt_marks). Substitutions run in the same order as
    # before: punctuation, then dot sequences, then whitespace runs.
    cleaned = text.lower()
    for pattern in (rx, r'\.+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned
# Global NLP variables
n_top = 20
punt_marks = ['\n', '\'', '\\', '/', '¡', '!', '¿', '?', '.', ',', ';', ':', '_', '-', '#', '$', '%', '&', '(', ')', '"']
rx = '[' + re.escape(''.join(punt_marks)) + ']'
# Counter of used words
word_list = Counter()
# Create list of words
for tweet in tweet_list:
tweet_text = tweet['message']
# Clean tweet text
clean_text = dq_clean_text(tweet_text)
# Save mentioned accounts
for word in clean_text.split(' '):
if len(word) and not word.isnumeric():
word_list[word] += 1
print('Number of words used: %s' % len(word_list))
print('Average words per tweet: %.2f' % (len(word_list) / n_tweets))
Number of words used: 6224 Average words per tweet: 5.85
# Most mentioned accounts list
tw_accounts = Counter()
# Save mentioned accounts
for word, freq in list(word_list.items()):
if word[0] == '@' and word != tw_user_name and len(word) > 1:
tw_accounts[word] = freq
print('Total different mentioned accounts: %s' % len(tw_accounts))
print('Average mentioned accounts per tweet: %.2f' % (sum(tw_accounts.values())/n_tweets))
Total different mentioned accounts: 225 Average mentioned accounts per tweet: 0.46
# Cooking dataframe
top_accounts = tw_accounts.most_common(n_top)
df = pd.DataFrame.from_records(top_accounts, columns = ['account', 'frequency']).sort_values(by=['frequency'])
# Plot top N mentioned accounts
x_var = 'account'
y_var = 'frequency'
title = 'Top %s Mentioned Twitter Accounts' % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Count the hashtags in the tweets and return the N most repeated
hashtags = Counter()
for t in tweet_list:
for ht in t['hashtags']:
ht = '#'+ht.lower()
hashtags[ht] += 1
print('Total used different hashtags: %s' % len(hashtags))
print('Average hashtags per tweet: %.2f' % (sum(hashtags.values())/n_tweets))
Total used different hashtags: 113 Average hashtags per tweet: 0.37
# Cooking dataframe
top_hashtags = hashtags.most_common(n_top)
df = pd.DataFrame.from_records(top_hashtags, columns = ['hashtag', 'frequency']).sort_values(by=['frequency'])
# Plot the most common hashtags in tweets
x_var = 'hashtag'
y_var = 'frequency'
title = '%s Most used Hashtags' % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Add word-freq to Dataframe
df = pd.DataFrame(columns=['word', 'frequency'])
ix = 0
# Filter twitter accounts
for word, freq in word_list.most_common():
if ix == n_top:
break
elif len(word) > 2 and word[0] != '@':
df.loc[ix] = (word, freq)
ix += 1
# Sorting data by frequency
df = df.sort_values(by=['frequency'])
# Plot the most common words in tweets
x_var = 'word'
y_var = 'frequency'
title = '%s Most used Words' % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Get Spanish stopwords
stopwords_es = es.stop_words.STOP_WORDS
stopwords_en = en.stop_words.STOP_WORDS | set({'http', 'https'})
# Add word-freq to Dataframe
df = pd.DataFrame(columns=['word', 'frequency'])
ix = 0
# Filter English and Spanish stopwords
for word, freq in word_list.most_common():
if ix == n_top:
break
elif (word not in stopwords_es) and (word not in stopwords_en) and (len(word) > 2) and (word[0] != '@'):
df.loc[ix] = (word, freq)
ix += 1
# Sorting data by frequency
df = df.sort_values(by=['frequency'])
# Plot the most common words in tweets
x_var = 'word'
y_var = 'frequency'
title = '%s Most used Words w/o Stopwords' % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Calculate most common bigrams and reconstruct full text with used words
new_clean_text = user.description
bigram_list = Counter()
# Create list of words
for tweet in tweet_list:
tweet_text = tweet['message']
clean_text = dq_clean_text(tweet_text)
tokens = clean_text.split(' ')
bigram = ''
last_word = ''
for i in range(0, len(tokens)):
word = tokens[i]
if (word not in stopwords_es) and (word not in stopwords_en) and (len(word) > 2) and (word[0] != '@'):
# Reconstructing the clean text (without stop-words)
new_clean_text += ' ' + word
# Add bigrams-freq to Dataframe
if last_word != "":
bigram = last_word + '-' + word
bigram_list[bigram] += 1
last_word = word
print('Total different bigrams: %s' % len(bigram_list))
Total different bigrams: 12890
# Cooking dataframe
top_bigrams = bigram_list.most_common(n_top)
df = pd.DataFrame.from_records(top_bigrams, columns = ['bigram', 'frequency']).sort_values(by=['frequency'])
# Plot the most common words in tweets
x_var = 'bigram'
y_var = 'frequency'
title = '%s Most used Bigrams' % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Import mask
mask = np.array(Image.open('../img/twitter-mask.png'))
# Create a Word-cloud
wc = WordCloud(background_color="white", max_words=250, mask=mask, collocations=False, margin=2)
wc = wc.generate(new_clean_text)
# Plot a Word-cloud of words of tweets
plt.figure(figsize = (16, 10))
image = plt.imshow(wc, interpolation = "bilinear")
plt.title('%s WordCloud\n' % tw_user_name, fontsize=16)
plt.axis("off")
# Save the Word-cloud image
plt.savefig('../img/wordcloud.png')
plt.show()
Sentiment analysis is done with the TextBlob library. Tweets that were not written in English are previously translated.
# Functions to discretize the sentiment analysis results
def get_polarity(polarity):
    """Discretize a TextBlob polarity score into 'positive', 'negative' or 'neutral'."""
    if polarity <= -0.20:
        return 'negative'
    if polarity >= 0.20:
        return 'positive'
    return 'neutral'
def get_subjectivity(subjectivity):
    """Discretize a TextBlob subjectivity score: 'subjective' if >= 0.5, else 'objective'."""
    return 'subjective' if subjectivity >= 0.5 else 'objective'
# Calculate the sentiment of the tweet
allow_translation = True
lang_dest = 'en'
polarity_list = Counter({'positive': 0, 'negative': 0, 'neutral': 0})
subjectivity_list = Counter({'subjective': 0, 'objective': 0})
for tweet in tweet_list:
message = tweet['message']
lang_source = tweet['lang']
# If the language of the tweet is different from English, it is translated
if lang_source == lang_dest or allow_translation:
analysis = TextBlob(message)
try:
if lang_source != lang_dest:
analysis = analysis.translate(to=lang_dest)
time.sleep(0.1)
# Apply sent-analysis
polarity = get_polarity(analysis.sentiment.polarity)
subjectivity = get_subjectivity(analysis.sentiment.subjectivity)
# Save results
polarity_list[polarity] += 1
subjectivity_list[subjectivity] += 1
except:
pass
# Cooking dataframe
df = pd.DataFrame.from_records(list(polarity_list.items()), columns = ['polarity', 'frequency'])
df['perc'] = (100 * df['frequency'] / sum(df['frequency']))
df
polarity | frequency | perc | |
---|---|---|---|
0 | positive | 366 | 50.482759 |
1 | negative | 13 | 1.793103 |
2 | neutral | 346 | 47.724138 |
# Plot the polarity of tweets
x_var = 'polarity'
y_var = 'frequency'
title = 'Polarity of Tweets'
figsize = (7, 7)
plot_col_chart(df, figsize, x_var, y_var, title)
# Cooking dataframe
df = pd.DataFrame.from_records(list(subjectivity_list.items()), columns = ['subjectivity', 'frequency'])
df['perc'] = (100 * df['frequency'] / sum(df['frequency']))
df
subjectivity | frequency | perc | |
---|---|---|---|
0 | subjective | 341 | 47.034483 |
1 | objective | 384 | 52.965517 |
# Plot the subjectivity of tweets
x_var = 'subjectivity'
y_var = 'frequency'
title = 'Subjectivity of Tweets'
figsize = (7, 7)
plot_col_chart(df, figsize, x_var, y_var, title)