# Import util libraries
import numpy as np
import pandas as pd
import csv
import json
import warnings
import time
import networkx as nx
from datetime import datetime
from PIL import Image
from collections import Counter


# Import NLP libraries
import re
from wordcloud import WordCloud


# Import plot libraries
import matplotlib.pyplot as plt


# Silence every warning for the rest of the session (keeps the output clean)
warnings.filterwarnings("ignore")


# Report the installed NetworkX version
print("Networkx version:", nx.__version__)

Networkx version: 2.6.3


# Util function - Plot column chart
def plot_column_chart(df, figsize, x_var, y_var, title, color="green", legend=None, x_label=None):
    """Draw a vertical bar chart of ``y_var`` against ``x_var`` and show it.

    Args:
        df: DataFrame holding the columns named by ``x_var`` and ``y_var``.
        figsize: Figure size forwarded to ``DataFrame.plot.bar``.
        x_var: Column used for the categories; also the x-axis label (capitalized).
        y_var: Column used for the bar heights; also the y-axis label (capitalized).
        title: Chart title.
        color: Bar color.
        legend: Legend labels; when falsy the legend is removed.
        x_label: Optional replacement tick labels, placed at 0..len(x_label)-1.
    """
    _, axes = plt.subplots()
    df.plot.bar(ax=axes, x=x_var, y=y_var, color=color, alpha=0.75, figsize=figsize)

    # Relabel the legend when labels are supplied, otherwise drop it
    if not legend:
        axes.get_legend().remove()
    else:
        axes.legend(legend)

    # Tick labels are always rotated; custom labels also get explicit positions
    if not x_label:
        plt.xticks(rotation=45)
    else:
        tick_positions = np.arange(len(x_label))
        plt.xticks(tick_positions, x_label, rotation=45)

    plt.xlabel(x_var.capitalize())
    plt.ylabel(y_var.capitalize())
    plt.title(title, fontsize=16)
    plt.show()


# Util function - Plot horizontal bar chart
def plot_bar_chart(df, figsize, x_var, y_var, title, color="blue", legend=None):
    """Draw a horizontal bar chart of ``y_var`` against ``x_var`` and show it.

    Args:
        df: DataFrame holding the columns named by ``x_var`` and ``y_var``.
        figsize: Figure size forwarded to ``DataFrame.plot.barh``.
        x_var: Column used for the categories (vertical axis label, capitalized).
        y_var: Column used for the bar lengths (horizontal axis label, capitalized).
        title: Chart title.
        color: Bar color. Previously accepted but ignored; it is now applied.
        legend: Legend labels; when falsy the legend is removed.
    """
    fig, ax = plt.subplots()
    df.plot.barh(ax=ax, x=x_var, y=y_var, color=color, alpha=0.75, figsize=figsize)
    if legend:
        ax.legend(legend)
    else:
        ax.get_legend().remove()
    plt.title(title, fontsize=16)
    # The chart is horizontal, so the value column labels the x axis
    plt.xlabel(y_var.capitalize())
    plt.ylabel(x_var.capitalize())
    plt.show()


# Util function - Get country from location
def get_country_from_loc(location):
    """Return the text after the last comma of ``location``, stripped.

    A blank (or whitespace-only) location yields an empty string; a location
    without commas is returned whole.
    """
    trimmed = location.strip()
    if not trimmed:
        return ""

    # Only the piece after the final comma matters
    last_piece = trimmed.rsplit(",", 1)[-1]
    return last_piece.strip()


# Read list of tweets from CSV file
def read_tweets_from_csv(csv_filepath):
    """Load tweets from a UTF-8 CSV file into a list of dicts.

    The first row is used as the header. Each following non-empty row becomes
    a dict keyed by the header names. ``retweet_count`` and ``favorite_count``
    are converted to ``int``, and ``retweeted`` becomes ``True`` only when the
    cell is exactly ``"True"``.

    Args:
        csv_filepath: Path of the CSV file to read.

    Returns:
        A list with one dict per tweet; empty when the file has no header row.

    Raises:
        KeyError: If one of the three converted columns is missing.
        ValueError: If a count column does not hold an integer.
    """
    # newline="" lets the csv module handle line endings itself, so line breaks
    # embedded in quoted fields (multi-line tweets) are read back unchanged
    with open(csv_filepath, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, skipinitialspace=True)
        header = next(reader, None)
        if header is None:
            return []
        # Blank lines come back as empty rows; skip them
        tweet_list = [dict(zip(header, row)) for row in reader if row]

    for tweet in tweet_list:
        tweet["retweet_count"] = int(tweet["retweet_count"])
        tweet["favorite_count"] = int(tweet["favorite_count"])
        tweet["retweeted"] = tweet["retweeted"] == "True"

    return tweet_list


# Read the full list of tweets from the 2022 dataset
csv_filepath = "../data/tweets2022.csv"
raw_tweet_list = read_tweets_from_csv(csv_filepath)


# Inspect one sample record (the tweet at index 20, not the last one)
raw_tweet_list[20]

{'id': '1572345207559168000',
 'created_at': '2022-09-20 22:01:24',
 'user_name': 'aadi_joshi',
 'lang': 'en',
 'hashtags': 'recsys,recsys2022,acmrecsys,recommender,recommendersystems',
 'user_mentions': '',
 'retweet_count': 0,
 'favorite_count': 1,
 'retweeted': False,
 'message': 'A colleague at Seek wrote a blog about lack of diversity in #recsys leadership. #recsys2022 #acmrecsys #recommender #recommendersystems https://t.co/EOq00tfs7s'}


# Show the total number of tweets loaded
print("Total tweets:", len(raw_tweet_list))

Total tweets: 2289


# Keep only the tweets that are not flagged as retweets ("own" tweets)
tweet_list = [tweet for tweet in raw_tweet_list if not tweet["retweeted"]]
# Share of own tweets over everything that was loaded
own_tweet_rate = len(tweet_list) / len(raw_tweet_list)
print("Own tweet rate: %.2f" % own_tweet_rate)
print("Total own tweets:", len(tweet_list))

Own tweet rate: 0.44
Total own tweets: 1018


# Tweet length (in characters) of every own tweet
n_tweets = len(tweet_list)
tweets_len = [len(own_tweet["message"]) for own_tweet in tweet_list]


# Showing stats: each entry pairs a print template with its value
length_stats = (
    ("Number of tweets: %s", n_tweets),
    ("Minimum length: %s", np.min(tweets_len)),
    ("Maximum length: %s", np.max(tweets_len)),
    ("Average length: %.2f", np.mean(tweets_len)),
    ("Standard deviation: %.2f", np.std(tweets_len)),
)
for template, value in length_stats:
    print(template % value)

Number of tweets: 1018
Minimum length: 10
Maximum length: 332
Average length: 175.69
Standard deviation: 82.95


# Plot a histogram of the tweet lengths (20 bins)
fig = plt.figure(figsize=(12, 7))
plt.hist(tweets_len, 20, facecolor="green", alpha=0.75)
plt.title("Tweet Length Histogram", fontsize=16)
plt.xlabel("Tweet Length")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()


# Get retweet and favorite stats (one value per own tweet)
retweet_count = [own_tweet["retweet_count"] for own_tweet in tweet_list]
favorite_count = [own_tweet["favorite_count"] for own_tweet in tweet_list]


# Showing statistics of account tweets
retweets_total = np.sum(retweet_count)
favorites_total = np.sum(favorite_count)
print("Retweets total: %s" % retweets_total)
print("Retweets average per tweet: %.2f" % np.mean(retweet_count))
print("Favorites total: %s" % favorites_total)
print("Favorites average per tweet: %.2f" % np.mean(favorite_count))

Retweets total: 1286
Retweets average per tweet: 1.26
Favorites total: 9607
Favorites average per tweet: 9.44


# Top 15 most popular tweets, ranked by retweets and then by favorites
hidden_columns = ["id", "user_mentions", "retweeted"]
df = pd.DataFrame(tweet_list).drop(columns=hidden_columns)
df.sort_values(by=["retweet_count", "favorite_count"], ascending=False).head(15)


# Read ISO language codes (code -> language name)
# A forward slash works on every platform and avoids the invalid "\i" escape
# that the previous backslash-separated literal contained
filename = "config/iso_lang_codes.csv"
# The file's own header row is skipped and replaced by explicit column names
df = pd.read_csv(filename, names=["iso_code", "language"], skiprows=1)
lang_dict = dict(zip(df["iso_code"], df["language"]))
print("Languages dict length:", len(lang_dict))

Languages dict length: 185


# Count the language of the tweets
lang_list = Counter()

for tweet in tweet_list:
    tweet_lang = tweet["lang"]
    try:
        lang = lang_dict[tweet_lang]
    except KeyError:
        # Codes without an entry in the lookup table are reported, not counted
        print("Missing:", tweet_lang)
    else:
        lang_list[lang] += 1

print("Total tweets language: %s" % len(lang_list))

Total tweets language: 8


# Cooking dataframe: one row per language, most frequent first
lang_records = lang_list.most_common()
df = pd.DataFrame.from_records(lang_records, columns=["language", "frequency"])


# Plot the language of the tweets
x_var, y_var = "language", "frequency"
title = "Tweets by Languages"
figsize = (7, 7)
plot_column_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Count the tweets (retweets included) published on each calendar day
tweets_day = Counter()

for tweet in raw_tweet_list:
    # Only the date part of the timestamp is used as the counting key
    created_at = datetime.strptime(tweet["created_at"], "%Y-%m-%d %H:%M:%S").date()
    tweets_day[created_at] += 1


# Cooking dataframe: one row per day, in chronological order
day_records = list(tweets_day.items())
df = pd.DataFrame.from_records(day_records, columns=["day", "frequency"])
df = df.sort_values(by=["day"])
df


# Plot tweets by day
x_var, y_var = "day", "frequency"
title = "Tweets by day"
figsize = (12, 7)
color = "green"
plot_column_chart(df, figsize, x_var, y_var, title, color=color, legend=None, x_label=None)


# Count how many tweets (retweets included) each account published
tweets_account = Counter(entry["user_name"] for entry in raw_tweet_list)

print("Number of accounts that tweeted:", len(tweets_account))

Number of accounts that tweeted: 641


# Cooking dataframe: the n_top most active accounts, ascending so the
# largest bar ends up at the top of the horizontal chart
n_top = 20
pro_accounts = tweets_account.most_common(n_top)
df = pd.DataFrame.from_records(pro_accounts, columns=["account", "frequency"])
df = df.sort_values(by=["frequency"])
df


# Plot top N prolific accounts
x_var, y_var = "account", "frequency"
title = "Top %s Prolific Accounts" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Most mentioned accounts list
tw_accounts = Counter()
tw_user_name = "@acmrecsys"

for tweet in raw_tweet_list:
    # user_mentions is a comma-separated string; it is empty when nobody is mentioned
    for user in tweet["user_mentions"].split(","):
        # Skip blanks and the hard-coded "ACMRecSys" handle
        if user and user != "ACMRecSys":
            tw_accounts["@" + user] += 1

# The counter holds mentioned accounts, so the label says so
print("Number of mentioned accounts:", len(tw_accounts))

Number of accounts that tweeted: 325


# Cooking dataframe: the n_top most mentioned accounts, ascending by frequency
top_accounts = tw_accounts.most_common(n_top)
df = pd.DataFrame.from_records(top_accounts, columns=["account", "frequency"])
df = df.sort_values(by=["frequency"])
df


# Plot top N mentioned accounts
x_var, y_var = "account", "frequency"
title = "Top %s Mentioned Twitter Accounts" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Count the hashtags used in all tweets (retweets included).
# The main conference hashtag is tallied separately so it does not dwarf the rest.
hashtags = Counter()
recsys_ht = 0
main_ht = "recsys2022"

for tweet in raw_tweet_list:
    # hashtags is a comma-separated string; lower-casing merges case variants
    for ht in tweet["hashtags"].lower().split(","):
        if ht == main_ht:
            recsys_ht += 1
        elif ht:
            hashtags["#" + ht] += 1

# Add the main hashtag to the distinct total only if it was actually seen
total_diff_ht = len(hashtags) + (1 if recsys_ht else 0)
print("Total used different hashtags: %s" % total_diff_ht)

# Average over the list that was iterated (raw_tweet_list), not over the own
# tweets only. The numerator still leaves out the main hashtag.
n_counted_tweets = len(raw_tweet_list)
avg_hashtags = sum(hashtags.values()) / n_counted_tweets if n_counted_tweets else 0.0
print("Average hashtags per tweet: %.2f" % avg_hashtags)
print("Total RecSys2022:", recsys_ht)

Total used different hashtags: 177
Average hashtags per tweet: 0.83
Total RecSys2022: 1462


# Cooking dataframe: the n_top most used hashtags, ascending by frequency
top_hashtags = hashtags.most_common(n_top)
df = pd.DataFrame.from_records(top_hashtags, columns=["hashtag", "frequency"])
df = df.sort_values(by=["frequency"])
df


# Plot the most common hashtags in tweets
x_var, y_var = "hashtag", "frequency"
title = "%s Most used Hashtags" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Add up the likes (favorite_count) received by each account
likes_by_account = Counter()

for tweet in raw_tweet_list:
    likes_by_account[tweet["user_name"]] += tweet["favorite_count"]


# Cooking dataframe: the n_top accounts with the most likes, ascending
top_likes_by_account = likes_by_account.most_common(n_top)
df = pd.DataFrame.from_records(top_likes_by_account, columns=["account", "total_likes"])
df = df.sort_values(by=["total_likes"])
df


# Plot the accounts with more likes
x_var, y_var = "account", "total_likes"
title = "%s Accounts with more likes" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Global NLP variables: punctuation to blank out and the character class built from it
punt_marks = [
    "\n", "\"", "\\", "/", "¡", "!", "¿", "?", ".", ",", ";", ":", "_", "-",
    "#", "$", "%", "&", "(", ")", "'",
]
rx = "[" + re.escape("".join(punt_marks)) + "]"


# Util function - Clean tweet text
def dq_clean_text(text):
    """Lower-case ``text``, blank out punctuation and collapse whitespace runs.

    Each character listed in ``punt_marks`` becomes a space, then every run of
    whitespace is reduced to a single space. Leading and trailing spaces are
    not trimmed.
    """
    cleaned = text.lower()
    # Applied in order: punctuation class, runs of dots, runs of whitespace
    for pattern in (rx, r"\.+", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned


# Get tweets in English (retweets included), selected by the "lang" field
en_tweet_list = [tweet for tweet in raw_tweet_list if tweet["lang"] == "en"]
print("Tweets in English:", len(en_tweet_list))

Tweets in English: 2213


# Counter of used words, built from the English tweets that are not retweets
word_list = Counter()
n_en_own_tweets = 0

for tweet in en_tweet_list:
    if tweet["retweeted"]:
        continue
    n_en_own_tweets += 1

    # Clean tweet text
    clean_text = dq_clean_text(tweet["message"])

    # Count every non-empty, non-numeric token
    for word in clean_text.split(" "):
        if word and not word.isnumeric():
            word_list[word] += 1

print("Number of words used: %s" % len(word_list))

# Average = total counted words over the tweets that were actually processed
# (previously: distinct words over all own tweets in any language)
total_words = sum(word_list.values())
avg_words = total_words / n_en_own_tweets if n_en_own_tweets else 0.0
print("Average words per tweet: %.2f" % avg_words)

Number of words used: 4545
Average words per tweet: 4.46


# Add word-freq to Dataframe
# Keep the n_top most frequent words, filtering out twitter accounts
# ("@..." tokens) and tokens of one or two characters
top_words = [
    (word, freq)
    for word, freq in word_list.most_common()
    if len(word) > 2 and word[0] != "@"
][:n_top]

# Build the frame in one go, like the other "cooking" cells, instead of
# growing it row by row
df = pd.DataFrame.from_records(top_words, columns=["word", "frequency"])

# Sorting data by frequency
df = df.sort_values(by=["frequency"])


# Plot the most common words in tweets
x_var, y_var = "word", "frequency"
title = "%s Most used Words" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


def get_stopwords(lang: str) -> set:
    """Load the stop-word list for ``lang`` as a set.

    Reads ``../data/stopwords/<lang>.txt`` (relative to the working directory),
    one stop word per line, with trailing whitespace stripped from each line.

    Args:
        lang: Base name of the stop-word file, e.g. ``"spanish"``.

    Returns:
        The set of lines found in the file.

    Raises:
        FileNotFoundError: If no stop-word file exists for ``lang``.
    """
    filename = "../data/stopwords/" + lang + ".txt"
    # Read as UTF-8 explicitly so accented words do not depend on the platform
    # default encoding (assumes the files are UTF-8, like the tweet CSV -- confirm)
    with open(filename, encoding="utf-8") as file:
        return {line.rstrip() for line in file}


# Get Spanish and English stopwords; URL scheme tokens are treated as English stopwords
stopwords_sp = get_stopwords("spanish")
stopwords_en = get_stopwords("english").union(("http", "https"))


# Add word-freq to Dataframe
# Keep the n_top most frequent words after filtering out the main hashtag,
# English and Spanish stopwords, short tokens and twitter accounts
top_words = [
    (word, freq)
    for word, freq in word_list.most_common()
    if word != main_ht
    and (word not in stopwords_sp)
    and (word not in stopwords_en)
    and (len(word) > 2)
    and (word[0] != "@")
][:n_top]

# Build the frame in one go, like the other "cooking" cells, instead of
# growing it row by row
df = pd.DataFrame.from_records(top_words, columns=["word", "frequency"])

# Sorting data by frequency
df = df.sort_values(by=["frequency"])


# Plot the most common words in tweets (stopwords removed)
x_var, y_var = "word", "frequency"
title = "%s Most used Words w/o Stopwords" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Calculate most common bigrams and reconstruct full text with used words
kept_words = []
bigram_list = Counter()

for tweet in tweet_list:
    clean_text = dq_clean_text(tweet["message"])

    # Bigrams never span two tweets, so the previous word resets per tweet
    last_word = ""
    for word in clean_text.split(" "):
        # Drop short tokens, twitter accounts and stopwords (length is checked
        # first so word[0] is never read on an empty token)
        if (len(word) <= 2) or (word[0] == "@") or (word in stopwords_sp) or (word in stopwords_en):
            continue

        kept_words.append(word)

        # A bigram joins each kept word with the previously kept one, so
        # dropped tokens in between are skipped over
        if last_word:
            bigram_list[last_word + "-" + word] += 1

        last_word = word

# Reconstruct the clean text (without stop-words) in one pass; every word is
# preceded by a space, exactly as incremental concatenation would produce
new_clean_text = "".join(" " + word for word in kept_words)

print("Total different bigrams: %s" % len(bigram_list))

Total different bigrams: 9409


# Cooking dataframe: the n_top most frequent bigrams, ascending by frequency
top_bigrams = bigram_list.most_common(n_top)
df = pd.DataFrame.from_records(top_bigrams, columns=["bigram", "frequency"])
df = df.sort_values(by=["frequency"])


# Plot the most common bigrams in tweets
x_var, y_var = "bigram", "frequency"
title = "%s Most used Bigrams" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize=figsize, x_var=x_var, y_var=y_var, title=title)


# Load the mask image as an array (passed to WordCloud as its mask)
mask = np.array(Image.open("../img/twitter-mask.png"))

# Create a Word-cloud from the stop-word-free text built earlier
wc = WordCloud(background_color="white", max_words=250, mask=mask, collocations=False, margin=2)
wc = wc.generate(new_clean_text)

# Plot a Word-cloud of words of tweets
plt.figure(figsize=(16, 10))
image = plt.imshow(wc, interpolation = "bilinear")
plt.title("%s WordCloud\n" % tw_user_name, fontsize=16)
plt.axis("off")

# Save the Word-cloud image (before plt.show(), while the figure is still current)
plt.savefig("../img/wordcloud.png")
plt.show()


# Counting edges: one "source-target" key per (author, mentioned account) pair
edge_counter = Counter()

for tweet in raw_tweet_list:
    source = tweet["user_name"].lower()
    mentioned = tweet["user_mentions"].lower().split(",")

    # Blank entries and self-mentions do not create edges
    edge_counter.update(
        source + "-" + target
        for target in mentioned
        if target != "" and source != target
    )

print("Total edge list:", len(edge_counter))
print(edge_counter.most_common(10))

Total edge list: 2201
[('manelslokom-acmrecsys', 116), ('encodedgeek-acmrecsys', 49), ('rajamifa-acmrecsys', 37), ('seguraandres7-acmrecsys', 33), ('rajamifa-facctrec', 32), ('manelslokom-seguraandres7', 24), ('facctrec-rajamifa', 20), ('agelesschronicl-acmrecsys', 18), ('manelslokom-christine_bauer', 15), ('drch0le-acmrecsys', 15)]


node_list = []
edge_list = []
THRESHOLD = 4

# Add weighted edges: only pairs seen more than THRESHOLD times are kept
for key, weight in edge_counter.items():
    tokens = key.split("-")
    source, target = tokens[0], tokens[1]

    if weight <= THRESHOLD:
        continue

    # Add nodes (first appearance only, preserving discovery order)
    for endpoint in (source, target):
        if endpoint not in node_list:
            node_list.append(endpoint)

    # Add edges
    edge_list.append((source, target, weight))


# Create an empty Directed Weighted Graph (DWG) and fill it
dwg = nx.DiGraph()
dwg.add_nodes_from(node_list)
dwg.add_weighted_edges_from(edge_list)


# Function that calculates the graph density
def get_graph_density(g):
    """Return the density of graph ``g`` and print its node and edge counts.

    Density is ``m / (n * (n - 1))`` for a directed graph and
    ``2 * m / (n * (n - 1))`` for an undirected one, where ``n`` is the number
    of nodes and ``m`` the number of edges.

    Args:
        g: A graph exposing ``nodes()``, ``edges()`` and ``is_directed()``,
            such as a NetworkX graph.

    Returns:
        The density, or ``0.0`` when the graph has fewer than two nodes
        (no node pair exists, so the ratio is undefined).
    """
    n_nodes = len(g.nodes())
    n_edges = len(g.edges())
    print("n_nodes:", n_nodes, ", n_edges:", n_edges)

    # Guard against division by zero on empty and single-node graphs
    possible_pairs = n_nodes * (n_nodes - 1)
    if possible_pairs == 0:
        return 0.0

    if g.is_directed():
        density = n_edges / possible_pairs
    else:
        density = 2 * n_edges / possible_pairs

    return density


# Calculate the density of the directed weighted graph
# (get_graph_density also prints the node and edge counts)
density = get_graph_density(dwg)
print("graph density:", density)

n_nodes: 70 , n_edges: 101
graph density: 0.02091097308488613


# Calculate the adjacency between the nodes, converted to a dense matrix
adj_mtx = nx.adjacency_matrix(dwg).todense()
adj_mtx

matrix([[ 0,  5,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0, 13,  0, ...,  0,  0,  0],
        ...,
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0,  0,  0, ...,  0,  0,  0],
        [ 0, 12,  0, ...,  0,  0,  0]], dtype=int32)


# Plotting the Adjacency matrix as a heat map with one tick per node
plt.rcParams["figure.figsize"] = [20, 20]
fig, ax = plt.subplots()
ax.imshow(adj_mtx, cmap="Blues")
# node_list is the list the graph's nodes were added from, so it is used for the labels
ax.set_xticks(np.arange(len(node_list)))
ax.set_yticks(np.arange(len(node_list)))
ax.set_xticklabels(node_list, rotation=45)
ax.set_yticklabels(node_list)

# Write each cell's value on top of the heat map; non-zero cells are drawn in
# white text, zero cells in black
for i in range(len(node_list)):
    for j in range(len(node_list)):
        text_color = "black" if adj_mtx[i, j] == 0 else "white"
        text = ax.text(j, i, adj_mtx[i, j], ha="center", va="center", color=text_color)

ax.set_title("Adjacency matrix", fontsize=12)
plt.show()


# Calculate the incidence of the edges on the nodes, converted to a dense matrix
# NOTE(review): rows presumably follow dwg.nodes() and columns dwg.edges() -- confirm in the NetworkX docs
inc_mtx = nx.incidence_matrix(dwg).todense()
inc_mtx

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 1., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.]])


# Plotting the Incidence matrix
# incidence_matrix() orders its rows like dwg.nodes() and its columns like
# dwg.edges(), which groups edges by source node. That is not necessarily the
# insertion order of edge_list, so the tick labels are read from the graph.
inc_nodes = list(dwg.nodes())
inc_edges = list(dwg.edges(data="weight"))  # (source, target, weight) tuples

fig, ax = plt.subplots()
ax.imshow(inc_mtx, cmap="Blues")
ax.set_xticks(np.arange(len(inc_edges)))
ax.set_yticks(np.arange(len(inc_nodes)))
ax.set_xticklabels(inc_edges, rotation=45)
ax.set_yticklabels(inc_nodes)

# Write each cell's value on top of the heat map (white on the dark cells)
for i in range(len(inc_nodes)):
    for j in range(len(inc_edges)):
        text_color = "black" if inc_mtx[i, j] == 0 else "white"
        ax.text(j, i, int(inc_mtx[i, j]), ha="center", va="center", color=text_color)

ax.set_title("Incidence matrix", fontsize=12)
plt.show()


# Plot the Directed Weighted Graph (free format, default layout)
plt.rcParams["figure.figsize"] = [20, 20]
nx.draw(dwg, with_labels=True)
plt.title("Directed Weighted Graph (DWG)", fontsize=14)
plt.axis("off")
plt.show()


# Util function - save list to CSV file
def export_list_to_csv_file(csv_path:str, header:list, data:list) -> None:
    """Write ``header`` followed by every row of ``data`` to a UTF-8 CSV file.

    Args:
        csv_path: Destination path; an existing file is overwritten.
        header: Column names, written as the first row.
        data: Iterable of rows, each an iterable of cell values.
    """
    # newline="" leaves line termination to the csv writer
    with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(data)

# Util function - save json to json file
def export_json_to_json_file(json_path:str, json_data:dict) -> None:
    """Serialize ``json_data`` as JSON and write it to ``json_path``.

    Args:
        json_path: Destination path; an existing file is overwritten.
        json_data: JSON-serializable object to store.
    """
    # Create JSON file
    with open(json_path, "w") as json_file:
        serialized = json.dumps(json_data)
        json_file.write(serialized)


# Create CSV file: one [source, target, weight] row per edge (no threshold applied)
header = ["source", "target", "weight"]
csv_data = [
    edge_key.split("-")[:2] + [edge_weight]
    for edge_key, edge_weight in edge_counter.items()
]

# Save CSV file
csv_filepath = "../data/graph_data.csv"
export_list_to_csv_file(csv_filepath, header, csv_data)


# Create JSON file with "nodes" and "links" lists
json_data = {"nodes": [], "links":[]}
node_list = []

# Add weighted edges: pairs seen only once are left out
for key, weight in edge_counter.items():
    tokens = key.split("-")
    source, target = tokens[0], tokens[1]

    if weight <= 1:
        continue

    # Register both endpoints the first time they appear
    for endpoint in (source, target):
        if endpoint not in node_list:
            node_list.append(endpoint)

    # Add edges
    json_data["links"].append({"source": source, "target": target, "value": weight})

# Mentioned accounts (lower-cased, without the leading "@") go to group 1,
# every other node to group 2
source_list = [account.lower()[1:] for account in tw_accounts]
json_data["nodes"].extend(
    {"id": node, "group": 1 if node in source_list else 2} for node in node_list
)

# Save JSON file
json_filepath = "../data/graph_data.json"
export_json_to_json_file(json_filepath, json_data)

	created_at	user_name	lang	hashtags	retweet_count	favorite_count	message
873	2022-09-23 17:30:34	_arohan_	en		89	370	Today, we present our paper on Google Search A...
987	2022-09-25 14:57:41	jengolbeck	en	RecSys2022	33	242	Last week was #RecSys2022 which I co-chaired w...
241	2022-09-19 14:33:36	jengolbeck	en	RecSys2022	24	355	How does TikTok know my location when I don’t ...
347	2022-09-16 13:16:34	informor	en	recsys2022	17	71	On Monday, I will be giving the opening keynot...
335	2022-09-17 10:44:03	ir_glasgow	en	recsys2022	17	46	We are excited to be at #recsys2022 next week!...
705	2022-09-22 13:12:01	ecir2023	en	RecSys2022,ECIR2023	14	25	Congrats on a successful #RecSys2022 round, P...
272	2022-09-18 21:48:29	erishabh	en	recsys2022	13	83	#recsys2022 hands tutorial on Explainable Reco...
38	2022-09-20 17:18:52	ACMRecSys	en	WomenInRecSys,RecSys2022,WomenInRecSys	12	72	Nice to see the #WomenInRecSys come together a...
281	2022-09-18 18:46:09	jinayoon_	en	RecSys2022	10	42	WOW the tutorial room for the Hands-on Reinfor...
59	2022-09-20 15:09:49	AmazonScience	en	RecSys2022,RecommenderSystems,NLProc	10	30	Amazon scientists @maxharp3r and @vanessa_murd...
606	2022-09-21 15:44:54	eugeneyan	en	RecSys2022	9	50	I'm psyched to be giving a keynote at the @ACM...
326	2022-09-17 18:51:20	moumita_bh	en	RecSys2022,search,RecSys,IR,MachineLearning	9	49	Excited to present our work on "Augmenting Net...
679	2022-09-22 00:31:45	peterpaws	en	recsys2023,RecSys2022	9	29	First announcement about @ACMRecSys summer sch...
587	2022-09-21 01:29:30	SeguraAndres7	en	RecSys2022	9	26	New amazing repos discovered at #RecSys2022\n⭐...
485	2022-09-19 17:31:54	sannevrijenhoek	en		8	59	Our work "RADio: Rank-aware divergence metrics...

	day	frequency
8	2022-09-12	10
7	2022-09-13	41
6	2022-09-14	28
5	2022-09-15	33
4	2022-09-16	56
3	2022-09-17	53
2	2022-09-18	148
1	2022-09-19	466
0	2022-09-20	284
9	2022-09-21	203
10	2022-09-22	454
11	2022-09-23	333
13	2022-09-24	122
12	2022-09-25	58

	account	frequency
19	@DrCh0le	37
18	@mdekstrand	38
17	@McWillemsen	39
16	@mvlacho1	40
15	@alansaid	43
14	@maxharp3r	43
13	@vanessa_murdock	45
12	@JavierSanzCruza	46
11	@ManelSlokom	47
10	@jengolbeck	49
9	@lienmichiels	52
7	@christine_bauer	53
8	@KarsWorkshop	53
6	@CONSEQUENCES_ws	54
5	@IntRSworkshop	56
4	@RajAmifa	78
3	@_arohan_	86
2	@SeguraAndres7	91
1	@craig_macdonald	116
0	@FAccTRec	131

	account	total_likes
19	ylyuliang	105
17	peterpaws	115
18	informor	115
16	mdekstrand	117
15	eugeneyan	118
14	himan_abd	126
13	olivierjeunen	140
12	McWillemsen	165
11	HarrieOos	168
10	erishabh	187
9	ManelSlokom	249
7	alansaid	272
8	RajAmifa	272
6	lienmichiels	325
5	DrCh0le	341
4	_arohan_	371
3	craig_macdonald	386
2	SeguraAndres7	401
1	ACMRecSys	600
0	jengolbeck	758

RecSys 2022 Twitter Interaction Analysis

Definition of Utility Functions

1. Read tweets from CSV file

2. Tweets Basic Analytics

2.1. Show basic stats

2.2. Calculate own tweet rate

2.3. Length of Tweets

2.4. Stats of Retweets and Favorites

2.5. Tweets Language

2.6. Tweets by Days

2.7. Tweets by Accounts

2.8. Most mentioned User Accounts

2.9. Most used Hashtags in Tweets

2.10. Accounts with more likes

3. NLP Analytics

3.1. Number of words per tweet in English

3.2. Most common Words in Tweets

3.3. Most common Words w/o Stopwords

3.4. Most common Bigrams w/o Stopwords

3.5. Tweet WordCloud

4. Network Analytics

4.1. Create Network

4.2. Graph density

4.3. Plot Graph data

Adjacency matrix

Incidence matrix

Plotting Directed Weighted Graph

4.4. Export data

	account	frequency
19	emiliagogu	18
17	alainstarke	18
18	christine_bauer	18
16	get_sphere	20
15	olivierjeunen	20
14	yashonwu	22
13	peterpaws	23
12	craig_macdonald	23
11	Agelesschronicl	25
10	lienmichiels	27
9	DrCh0le	28
8	totopampin	28
7	alansaid	29
6	mdekstrand	55
5	ACMRecSys	56
4	FAccTRec	60
3	RajAmifa	65
2	SeguraAndres7	70
1	encodedgeek	74
0	ManelSlokom	121

	hashtag	frequency
19	#recsysingfromafar	8
17	#flowmoods	9
18	#doctoralsymposium	9
16	#fairness	10
15	#facctrec	11
14	#rs_c	11
13	#recommenders	11
12	#perspectives	12
11	#ecir2023	13
10	#mors2022	13
9	#facctrec2022	14
8	#recommender	15
7	#seattle	18
6	#finrec	19
5	#ai	20
4	#womeninrecsys	24
3	#perspectives2022	38
2	#recsys2023	54
1	#recsys22	78
0	#recsys	133