Data collected with the Twitter API over two weeks (from 09/12/2022 to 09/25/2022).
# Import util libraries
import numpy as np
import pandas as pd
import csv
import json
import warnings
import time
import networkx as nx
from datetime import datetime
from PIL import Image
from collections import Counter
# Import NLP libraries
import re
from wordcloud import WordCloud
# Import plot libraries
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
print("Networkx version:", nx.__version__)
Networkx version: 2.6.3
# Util function - Plot column chart
def plot_column_chart(df, figsize, x_var, y_var, title, color="green", legend=None, x_label=None):
    """Draw a vertical bar (column) chart of ``df[y_var]`` against ``df[x_var]``.

    Args:
        df: DataFrame holding the data to plot.
        figsize: (width, height) passed to the pandas plot call.
        x_var: Column used for the categories; also the x-axis label.
        y_var: Column used for the bar heights; also the y-axis label.
        title: Chart title.
        color: Bar color.
        legend: Legend labels; when falsy the default legend is removed.
        x_label: Optional explicit tick labels, placed at positions 0..len-1.
    """
    _, axes = plt.subplots()
    df.plot.bar(ax=axes, x=x_var, y=y_var, color=color, alpha=0.75, figsize=figsize)

    # Either show the caller's legend labels or drop the automatic legend.
    if not legend:
        axes.get_legend().remove()
    else:
        axes.legend(legend)

    # Custom tick labels replace the defaults; both variants are rotated.
    if not x_label:
        plt.xticks(rotation=45)
    else:
        plt.xticks(np.arange(len(x_label)), x_label, rotation=45)

    plt.title(title, fontsize=16)
    plt.xlabel(x_var.capitalize())
    plt.ylabel(y_var.capitalize())
    plt.show()
# Util function - Plot horizontal bar chart
def plot_bar_chart(df, figsize, x_var, y_var, title, color="blue", legend=None):
    """Draw a horizontal bar chart of ``df[y_var]`` for each ``df[x_var]``.

    Args:
        df: DataFrame holding the data to plot.
        figsize: (width, height) passed to the pandas plot call.
        x_var: Column used for the categories (shown on the vertical axis).
        y_var: Column used for the bar lengths (shown on the horizontal axis).
        title: Chart title.
        color: Bar color.
        legend: Legend labels; when falsy the default legend is removed.
    """
    fig, ax = plt.subplots()
    # Forward `color` so the parameter actually takes effect.
    df.plot.barh(ax=ax, x=x_var, y=y_var, color=color, alpha=0.75, figsize=figsize)
    if legend:
        ax.legend(legend)
    else:
        ax.get_legend().remove()
    plt.title(title, fontsize=16)
    # Bars are horizontal, so the value column labels the x-axis.
    plt.xlabel(y_var.capitalize())
    plt.ylabel(x_var.capitalize())
    plt.show()
# Util function - Get country from location
def get_country_from_loc(location):
    """Return the last comma-separated token of a free-text location.

    Args:
        location: Location text such as ``"Seattle, WA, USA"``.

    Returns:
        The trimmed text after the last comma (the whole trimmed text when
        there is no comma), or ``""`` when ``location`` is empty, blank,
        ``None`` or not a string.
    """
    # Missing values (None, NaN, ...) carry no location information.
    if not isinstance(location, str):
        return ""
    return location.strip().split(",")[-1].strip()
# Read list of tweets from CSV file
def read_tweets_from_csv(csv_filepath):
    """Load tweets from a CSV file into a list of dicts keyed by the header row.

    ``retweet_count`` and ``favorite_count`` are converted to ``int`` and
    ``retweeted`` to ``bool`` (``True`` only for the literal text ``"True"``).

    Args:
        csv_filepath: Path of the UTF-8 encoded CSV file (first row = header).

    Returns:
        A list with one dict per non-empty data row; ``[]`` for an empty file.
    """
    # newline="" lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks (multi-line tweets) are read unchanged.
    with open(csv_filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f, skipinitialspace=True)
        header = next(reader, None)
        if header is None:
            return []
        # Blank lines come back as empty rows; they hold no tweet.
        tweet_list = [dict(zip(header, row)) for row in reader if row]

    for tweet in tweet_list:
        tweet["retweet_count"] = int(tweet["retweet_count"])
        tweet["favorite_count"] = int(tweet["favorite_count"])
        tweet["retweeted"] = tweet["retweeted"] == "True"
    return tweet_list
# Read list of tweets
csv_filepath = "../data/tweets2022.csv"
raw_tweet_list = read_tweets_from_csv(csv_filepath)
# Inspect one sample tweet (the entry at index 20 of the loaded list)
raw_tweet_list[20]
{'id': '1572345207559168000', 'created_at': '2022-09-20 22:01:24', 'user_name': 'aadi_joshi', 'lang': 'en', 'hashtags': 'recsys,recsys2022,acmrecsys,recommender,recommendersystems', 'user_mentions': '', 'retweet_count': 0, 'favorite_count': 1, 'retweeted': False, 'message': 'A colleague at Seek wrote a blog about lack of diversity in #recsys leadership. #recsys2022 #acmrecsys #recommender #recommendersystems https://t.co/EOq00tfs7s'}
# Show the total number of loaded tweets
print("Total tweets:", len(raw_tweet_list))
Total tweets: 2289
# Keep only the tweets whose "retweeted" flag is False (own tweets)
tweet_list = [tweet for tweet in raw_tweet_list if not tweet["retweeted"]]
# Fraction of all loaded tweets that are own tweets
own_tweet_rate = len(tweet_list) / len(raw_tweet_list)
print("Own tweet rate: %.2f" % own_tweet_rate)
print("Total own tweets:", len(tweet_list))
Own tweet rate: 0.44 Total own tweets: 1018
# Tweet length: number of characters of each message in tweet_list
n_tweets = len(tweet_list)
tweets_len = [len(tweet["message"]) for tweet in tweet_list]
# Showing stats (min, max, mean and standard deviation of the lengths)
print("Number of tweets: %s" % n_tweets)
print("Minimum length: %s" % np.min(tweets_len))
print("Maximum length: %s" % np.max(tweets_len))
print("Average length: %.2f" % np.mean(tweets_len))
print("Standard deviation: %.2f" % np.std(tweets_len))
Number of tweets: 1018 Minimum length: 10 Maximum length: 332 Average length: 175.69 Standard deviation: 82.95
# Plot a histogram of the tweet lengths (20 bins)
fig = plt.figure(figsize=(12, 7))
plt.hist(tweets_len, 20, facecolor="green", alpha=0.75)
plt.title("Tweet Length Histogram", fontsize=16)
plt.xlabel("Tweet Length")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()
# Get retweet and favorite stats (one value per tweet in tweet_list)
retweet_count = [tweet["retweet_count"] for tweet in tweet_list]
favorite_count = [tweet["favorite_count"] for tweet in tweet_list]
# Showing statistics of account tweets
print("Retweets total: %s" % np.sum(retweet_count))
print("Retweets average per tweet: %.2f" % np.mean(retweet_count))
print("Favorites total: %s" % np.sum(favorite_count))
print("Favorites average per tweet: %.2f" % np.mean(favorite_count))
Retweets total: 1286 Retweets average per tweet: 1.26 Favorites total: 9607 Favorites average per tweet: 9.44
# Top 15 most popular tweets (ranked by retweets, then by favorites)
df = pd.DataFrame(tweet_list)
# Drop the columns that are not shown in the ranking
df.drop(columns=["id", "user_mentions", "retweeted"], inplace=True)
df.sort_values(by=["retweet_count", "favorite_count"], ascending=False).head(15)
created_at | user_name | lang | hashtags | retweet_count | favorite_count | message | |
---|---|---|---|---|---|---|---|
873 | 2022-09-23 17:30:34 | _arohan_ | en | 89 | 370 | Today, we present our paper on Google Search A... | |
987 | 2022-09-25 14:57:41 | jengolbeck | en | RecSys2022 | 33 | 242 | Last week was #RecSys2022 which I co-chaired w... |
241 | 2022-09-19 14:33:36 | jengolbeck | en | RecSys2022 | 24 | 355 | How does TikTok know my location when I don’t ... |
347 | 2022-09-16 13:16:34 | informor | en | recsys2022 | 17 | 71 | On Monday, I will be giving the opening keynot... |
335 | 2022-09-17 10:44:03 | ir_glasgow | en | recsys2022 | 17 | 46 | We are excited to be at #recsys2022 next week!... |
705 | 2022-09-22 13:12:01 | ecir2023 | en | RecSys2022,ECIR2023 | 14 | 25 | Congrats on a successful #RecSys2022 round, P... |
272 | 2022-09-18 21:48:29 | erishabh | en | recsys2022 | 13 | 83 | #recsys2022 hands tutorial on Explainable Reco... |
38 | 2022-09-20 17:18:52 | ACMRecSys | en | WomenInRecSys,RecSys2022,WomenInRecSys | 12 | 72 | Nice to see the #WomenInRecSys come together a... |
281 | 2022-09-18 18:46:09 | jinayoon_ | en | RecSys2022 | 10 | 42 | WOW the tutorial room for the Hands-on Reinfor... |
59 | 2022-09-20 15:09:49 | AmazonScience | en | RecSys2022,RecommenderSystems,NLProc | 10 | 30 | Amazon scientists @maxharp3r and @vanessa_murd... |
606 | 2022-09-21 15:44:54 | eugeneyan | en | RecSys2022 | 9 | 50 | I'm psyched to be giving a keynote at the @ACM... |
326 | 2022-09-17 18:51:20 | moumita_bh | en | RecSys2022,search,RecSys,IR,MachineLearning | 9 | 49 | Excited to present our work on "Augmenting Net... |
679 | 2022-09-22 00:31:45 | peterpaws | en | recsys2023,RecSys2022 | 9 | 29 | First announcement about @ACMRecSys summer sch... |
587 | 2022-09-21 01:29:30 | SeguraAndres7 | en | RecSys2022 | 9 | 26 | New amazing repos discovered at #RecSys2022\n⭐... |
485 | 2022-09-19 17:31:54 | sannevrijenhoek | en | 8 | 59 | Our work "RADio: Rank-aware divergence metrics... |
# Read iso639_2 languages codes
# Forward slash: portable across operating systems and avoids the invalid
# "\i" escape sequence of a backslash inside a normal string literal.
filename = "config/iso_lang_codes.csv"
df = pd.read_csv(filename, names=["iso_code", "language"], skiprows=1)
# Map each code to its language name
lang_dict = dict(zip(df.iso_code, df.language))
print("Languages dict length:", len(lang_dict))
Languages dict length: 185
# Count the language of the tweets (by language name)
lang_list = Counter()
for tweet in tweet_list:
    tweet_lang = tweet["lang"]
    # Codes without an entry in lang_dict are reported and not counted
    if tweet_lang not in lang_dict:
        print("Missing:", tweet_lang)
        continue
    lang_list[lang_dict[tweet_lang]] += 1
print("Total tweets language: %s" % len(lang_list))
Total tweets language: 8
# Cooking dataframe: one row per language, most frequent first
df = pd.DataFrame.from_records(lang_list.most_common(), columns=["language", "frequency"])
# Plot the language of the tweets
x_var = "language"
y_var = "frequency"
title = "Tweets by Languages"
figsize = (7, 7)
plot_column_chart(df, figsize, x_var, y_var, title)
# Count the tweets created on each calendar day (all loaded tweets)
tweets_day = Counter()
for tweet in raw_tweet_list:
    # Drop the time part so that tweets are grouped by date
    created_at = datetime.strptime(tweet["created_at"], "%Y-%m-%d %H:%M:%S").date()
    tweets_day[created_at] += 1
# Cooking dataframe: one row per day, in chronological order
df = pd.DataFrame.from_records(list(tweets_day.items()), columns=["day", "frequency"]).sort_values(by=["day"])
df
day | frequency | |
---|---|---|
8 | 2022-09-12 | 10 |
7 | 2022-09-13 | 41 |
6 | 2022-09-14 | 28 |
5 | 2022-09-15 | 33 |
4 | 2022-09-16 | 56 |
3 | 2022-09-17 | 53 |
2 | 2022-09-18 | 148 |
1 | 2022-09-19 | 466 |
0 | 2022-09-20 | 284 |
9 | 2022-09-21 | 203 |
10 | 2022-09-22 | 454 |
11 | 2022-09-23 | 333 |
13 | 2022-09-24 | 122 |
12 | 2022-09-25 | 58 |
# Plot tweets by day
x_var = "day"
y_var = "frequency"
title = "Tweets by day"
figsize = (12, 7)
color = "green"
# legend and x_label are passed as None: no legend, default tick labels
plot_column_chart(df, figsize, x_var, y_var, title, color, None, None)
# Count the tweets posted by each account (all loaded tweets)
tweets_account = Counter(tweet["user_name"] for tweet in raw_tweet_list)
print("Number of accounts that tweeted:", len(tweets_account))
Number of accounts that tweeted: 641
# Cooking dataframe: the n_top accounts with most tweets, sorted ascending by frequency
n_top = 20
pro_accounts = tweets_account.most_common(n_top)
df = pd.DataFrame.from_records(pro_accounts, columns=["account", "frequency"]).sort_values(by=["frequency"])
df
account | frequency | |
---|---|---|
19 | emiliagogu | 18 |
17 | alainstarke | 18 |
18 | christine_bauer | 18 |
16 | get_sphere | 20 |
15 | olivierjeunen | 20 |
14 | yashonwu | 22 |
13 | peterpaws | 23 |
12 | craig_macdonald | 23 |
11 | Agelesschronicl | 25 |
10 | lienmichiels | 27 |
9 | DrCh0le | 28 |
8 | totopampin | 28 |
7 | alansaid | 29 |
6 | mdekstrand | 55 |
5 | ACMRecSys | 56 |
4 | FAccTRec | 60 |
3 | RajAmifa | 65 |
2 | SeguraAndres7 | 70 |
1 | encodedgeek | 74 |
0 | ManelSlokom | 121 |
# Plot top N prolific accounts
x_var = "account"
y_var = "frequency"
title = "Top %s Prolific Accounts" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Most mentioned accounts list
tw_accounts = Counter()
tw_user_name = "@acmrecsys"
for tweet in raw_tweet_list:
    for user in tweet["user_mentions"].split(","):
        # Skip empty fields and mentions of the ACMRecSys account itself
        if user != "" and user != "ACMRecSys":
            tw_accounts["@" + user] += 1
# The counter holds mentioned accounts, so label the figure accordingly
print("Number of mentioned accounts:", len(tw_accounts))
Number of accounts that tweeted: 325
# Cooking dataframe: the n_top most mentioned accounts, sorted ascending by frequency
top_accounts = tw_accounts.most_common(n_top)
df = pd.DataFrame.from_records(top_accounts, columns=["account", "frequency"]).sort_values(by=["frequency"])
df
account | frequency | |
---|---|---|
19 | @DrCh0le | 37 |
18 | @mdekstrand | 38 |
17 | @McWillemsen | 39 |
16 | @mvlacho1 | 40 |
15 | @alansaid | 43 |
14 | @maxharp3r | 43 |
13 | @vanessa_murdock | 45 |
12 | @JavierSanzCruza | 46 |
11 | @ManelSlokom | 47 |
10 | @jengolbeck | 49 |
9 | @lienmichiels | 52 |
7 | @christine_bauer | 53 |
8 | @KarsWorkshop | 53 |
6 | @CONSEQUENCES_ws | 54 |
5 | @IntRSworkshop | 56 |
4 | @RajAmifa | 78 |
3 | @_arohan_ | 86 |
2 | @SeguraAndres7 | 91 |
1 | @craig_macdonald | 116 |
0 | @FAccTRec | 131 |
# Plot top N mentioned accounts (horizontal bars, one per account)
x_var = "account"
y_var = "frequency"
title = "Top %s Mentioned Twitter Accounts" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Count the hashtags in the tweets (case-insensitive); the main conference
# hashtag is tallied separately so it does not dominate the ranking
hashtags = Counter()
recsys_ht = 0
main_ht = "recsys2022"
for tweet in raw_tweet_list:
    for ht in tweet["hashtags"].lower().split(","):
        if ht == main_ht:
            recsys_ht += 1
        elif len(ht) > 0:
            hashtags["#" + ht] += 1
# +1 accounts for the main hashtag, which is not stored in the counter
total_diff_ht = len(hashtags) + 1
print("Total used different hashtags: %s" % total_diff_ht)
# Hashtags were counted over raw_tweet_list, so average over that same list
# (the main hashtag is not part of this average)
print("Average hashtags per tweet: %.2f" % (sum(hashtags.values()) / len(raw_tweet_list)))
print("Total RecSys2022:", recsys_ht)
Total used different hashtags: 177 Average hashtags per tweet: 0.83 Total RecSys2022: 1462
# Cooking dataframe: the n_top most used hashtags, sorted ascending by frequency
top_hashtags = hashtags.most_common(n_top)
df = pd.DataFrame.from_records(top_hashtags, columns=["hashtag", "frequency"]).sort_values(by=["frequency"])
df
hashtag | frequency | |
---|---|---|
19 | #recsysingfromafar | 8 |
17 | #flowmoods | 9 |
18 | #doctoralsymposium | 9 |
16 | #fairness | 10 |
15 | #facctrec | 11 |
14 | #rs_c | 11 |
13 | #recommenders | 11 |
12 | #perspectives | 12 |
11 | #ecir2023 | 13 |
10 | #mors2022 | 13 |
9 | #facctrec2022 | 14 |
8 | #recommender | 15 |
7 | #seattle | 18 |
6 | #finrec | 19 |
5 | #ai | 20 |
4 | #womeninrecsys | 24 |
3 | #perspectives2022 | 38 |
2 | #recsys2023 | 54 |
1 | #recsys22 | 78 |
0 | #recsys | 133 |
# Plot the most common hashtags in tweets (horizontal bars, one per hashtag)
x_var = "hashtag"
y_var = "frequency"
title = "%s Most used Hashtags" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Count the likes in the tweets by accounts (sum of favorite_count per user)
likes_by_account = Counter()
for tweet in raw_tweet_list:
    likes_by_account[tweet["user_name"]] += tweet["favorite_count"]
# Cooking dataframe: the n_top accounts with most likes, sorted ascending
top_likes_by_account = likes_by_account.most_common(n_top)
df = pd.DataFrame.from_records(top_likes_by_account, columns=["account", "total_likes"]).sort_values(by=["total_likes"])
df
account | total_likes | |
---|---|---|
19 | ylyuliang | 105 |
17 | peterpaws | 115 |
18 | informor | 115 |
16 | mdekstrand | 117 |
15 | eugeneyan | 118 |
14 | himan_abd | 126 |
13 | olivierjeunen | 140 |
12 | McWillemsen | 165 |
11 | HarrieOos | 168 |
10 | erishabh | 187 |
9 | ManelSlokom | 249 |
7 | alansaid | 272 |
8 | RajAmifa | 272 |
6 | lienmichiels | 325 |
5 | DrCh0le | 341 |
4 | _arohan_ | 371 |
3 | craig_macdonald | 386 |
2 | SeguraAndres7 | 401 |
1 | ACMRecSys | 600 |
0 | jengolbeck | 758 |
# Plot the accounts with more likes (horizontal bars, one per account)
x_var = "account"
y_var = "total_likes"
title = "%s Accounts with more likes" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Global NLP variables
punt_marks = ["\n", "\"", "\\", "/", "¡", "!", "¿", "?", ".", ",", ";", ":", "_", "-", "#", "$", "%", "&", "(", ")", "'"]
# Character class matching any single punctuation mark listed above
rx = "[" + re.escape("".join(punt_marks)) + "]"
# Compiled once so that every call reuses the same pattern objects
_PUNCT_RE = re.compile(rx)
_SPACES_RE = re.compile(r"\s+")


# Util function - Clean tweet text
def dq_clean_text(text):
    """Lower-case ``text`` and replace punctuation with single spaces.

    Every character listed in ``punt_marks`` becomes a space, then each run
    of whitespace is collapsed into one space. Leading and trailing spaces
    are not stripped.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lower-cased text.
    """
    # "." is part of punt_marks, so no separate pass for dots is needed.
    clean_text = _PUNCT_RE.sub(" ", text.lower())
    return _SPACES_RE.sub(" ", clean_text)
# Get tweets in English (selected from all loaded tweets by their "lang" code)
en_tweet_list = [tweet for tweet in raw_tweet_list if tweet["lang"] == "en"]
print("Tweets in English:", len(en_tweet_list))
Tweets in English: 2213
# Counter of used words
word_list = Counter()
# Number of tweets that actually contribute words (English, not retweeted)
n_word_tweets = 0
# Create list of words
for tweet in en_tweet_list:
    if tweet["retweeted"]:
        continue
    n_word_tweets += 1
    # Clean tweet text
    clean_text = dq_clean_text(tweet["message"])
    # Tally every non-empty, non-numeric token
    for word in clean_text.split(" "):
        if len(word) and not word.isnumeric():
            word_list[word] += 1
print("Number of words used: %s" % len(word_list))
# Average = counted word occurrences / tweets they were counted in
avg_words = sum(word_list.values()) / n_word_tweets if n_word_tweets else 0.0
print("Average words per tweet: %.2f" % avg_words)
Number of words used: 4545 Average words per tweet: 4.46
# Add word-freq to Dataframe
df = pd.DataFrame(columns=["word", "frequency"])
ix = 0
# Keep the n_top most frequent words, skipping short words and twitter accounts
for word, freq in word_list.most_common():
    if ix == n_top:
        break
    if len(word) <= 2 or word[0] == "@":
        continue
    df.loc[ix] = (word, freq)
    ix += 1
# Sorting data by frequency
df = df.sort_values(by=["frequency"])
# Plot the most common words in tweets
x_var = "word"
y_var = "frequency"
title = "%s Most used Words" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
def get_stopwords(lang: str) -> set:
    """Load the stopword list of a language, one word per line.

    Args:
        lang: Language name; the words are read from
            ``../data/stopwords/<lang>.txt``.

    Returns:
        The set of stopwords found in the file (blank lines are skipped).
    """
    filename = "../data/stopwords/" + lang + ".txt"
    # Explicit encoding: accented words must not depend on the OS default.
    with open(filename, encoding="utf-8") as file:
        stripped = (line.rstrip() for line in file)
        return {word for word in stripped if word}
# Get Spanish and English stopwords ("http"/"https" are added to the English set)
stopwords_sp = get_stopwords("spanish")
stopwords_en = get_stopwords("english") | {"http", "https"}
# Add word-freq to Dataframe
df = pd.DataFrame(columns=["word", "frequency"])
ix = 0
# Filter English and Spanish stopwords
for word, freq in word_list.most_common():
    # The main hashtag is left out of the ranking
    if word == main_ht:
        continue
    if ix == n_top:
        break
    is_stopword = (word in stopwords_sp) or (word in stopwords_en)
    if not is_stopword and (len(word) > 2) and (word[0] != "@"):
        df.loc[ix] = (word, freq)
        ix += 1
# Sorting data by frequency
df = df.sort_values(by=["frequency"])
# Plot the most common words in tweets
x_var = "word"
y_var = "frequency"
title = "%s Most used Words w/o Stopwords" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Calculate most common bigrams and reconstruct full text with used words
bigram_list = Counter()
# Words kept after filtering, in reading order across all tweets
kept_words = []
for tweet in tweet_list:
    clean_text = dq_clean_text(tweet["message"])
    # Bigrams never span two tweets: the previous word is reset per tweet
    last_word = ""
    for word in clean_text.split(" "):
        # Skip stopwords, short tokens and twitter accounts
        if (word in stopwords_sp) or (word in stopwords_en) or (len(word) <= 2) or (word[0] == "@"):
            continue
        kept_words.append(word)
        # A bigram joins the previous kept word with the current one
        if last_word != "":
            bigram_list[last_word + "-" + word] += 1
        last_word = word
# Reconstructing the clean text (without stop-words) in one pass; every word
# is preceded by a space, as before
new_clean_text = "".join(" " + word for word in kept_words)
print("Total different bigrams: %s" % len(bigram_list))
Total different bigrams: 9409
# Cooking dataframe: the n_top most frequent bigrams, sorted ascending by frequency
top_bigrams = bigram_list.most_common(n_top)
df = pd.DataFrame.from_records(top_bigrams, columns=["bigram", "frequency"]).sort_values(by=["frequency"])
# Plot the most common bigrams in tweets
x_var = "bigram"
y_var = "frequency"
title = "%s Most used Bigrams" % n_top
figsize = (8, 10)
plot_bar_chart(df, figsize, x_var, y_var, title)
# Load mask (image converted to an array and handed to WordCloud as its mask)
mask = np.array(Image.open("../img/twitter-mask.png"))
# Create a Word-cloud from the reconstructed text (at most 250 words)
wc = WordCloud(background_color="white", max_words=250, mask=mask, collocations=False, margin=2)
wc = wc.generate(new_clean_text)
# Plot a Word-cloud of words of tweets
plt.figure(figsize=(16, 10))
image = plt.imshow(wc, interpolation = "bilinear")
plt.title("%s WordCloud\n" % tw_user_name, fontsize=16)
plt.axis("off")
# Save the Word-cloud image (done before the figure is shown)
plt.savefig("../img/wordcloud.png")
plt.show()
# Counting nodes and edges: one "source-target" key per (author, mentioned user)
edge_counter = Counter()
for tweet in raw_tweet_list:
    source = tweet["user_name"].lower()
    for target in tweet["user_mentions"].lower().split(","):
        # Ignore empty fields and self-mentions
        if target == "" or target == source:
            continue
        edge_counter[source + "-" + target] += 1
print("Total edge list:", len(edge_counter))
print(edge_counter.most_common(10))
Total edge list: 2201 [('manelslokom-acmrecsys', 116), ('encodedgeek-acmrecsys', 49), ('rajamifa-acmrecsys', 37), ('seguraandres7-acmrecsys', 33), ('rajamifa-facctrec', 32), ('manelslokom-seguraandres7', 24), ('facctrec-rajamifa', 20), ('agelesschronicl-acmrecsys', 18), ('manelslokom-christine_bauer', 15), ('drch0le-acmrecsys', 15)]
node_list = []
edge_list = []
THRESHOLD = 4
# Add weighted edges (only those whose weight exceeds THRESHOLD)
for key, weight in edge_counter.items():
    tokens = key.split("-")
    source, target = tokens[0], tokens[1]
    if weight <= THRESHOLD:
        continue
    # Add nodes, keeping first-seen order and no duplicates
    for endpoint in (source, target):
        if endpoint not in node_list:
            node_list.append(endpoint)
    # Add edges
    edge_list.append((source, target, weight))
# Create an empty Directed Weighted Graph (DWG) and fill it
dwg = nx.DiGraph()
dwg.add_nodes_from(node_list)
dwg.add_weighted_edges_from(edge_list)
# Function that calculates the graph density
def get_graph_density(g):
    """Return the density of graph ``g`` and print its node and edge counts.

    Density is ``m / (n * (n - 1))`` for a directed graph and
    ``2 * m / (n * (n - 1))`` for an undirected one, with ``n`` nodes and
    ``m`` edges.

    Args:
        g: Graph object exposing ``nodes()``, ``edges()`` and
            ``is_directed()`` (e.g. a networkx graph).

    Returns:
        The density as a float; ``0.0`` when the graph has fewer than two
        nodes, because no edge between distinct nodes is possible.
    """
    n_nodes = len(g.nodes())
    n_edges = len(g.edges())
    print("n_nodes:", n_nodes, ", n_edges:", n_edges)
    # Guard against division by zero for empty and single-node graphs.
    if n_nodes < 2:
        return 0.0
    max_directed_edges = n_nodes * (n_nodes - 1)
    if g.is_directed():
        return n_edges / max_directed_edges
    return 2 * n_edges / max_directed_edges
# Calculate the graph density of the directed weighted graph
density = get_graph_density(dwg)
print("graph density:", density)
n_nodes: 70 , n_edges: 101 graph density: 0.02091097308488613
# Calculate the adjacency between the nodes (converted to a dense matrix)
adj_mtx = nx.adjacency_matrix(dwg).todense()
adj_mtx
matrix([[ 0, 5, 0, ..., 0, 0, 0], [ 0, 0, 0, ..., 0, 0, 0], [ 0, 13, 0, ..., 0, 0, 0], ..., [ 0, 0, 0, ..., 0, 0, 0], [ 0, 0, 0, ..., 0, 0, 0], [ 0, 12, 0, ..., 0, 0, 0]], dtype=int32)
# Plotting the Adjacency matrix
plt.rcParams["figure.figsize"] = [20, 20]
fig, ax = plt.subplots()
ax.imshow(adj_mtx, cmap="Blues")
# Same node labels on both axes
tick_positions = np.arange(len(node_list))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(node_list, rotation=45)
ax.set_yticklabels(node_list)
# Write every cell value: black for zeros, white for the other cells
for i in range(len(node_list)):
    for j in range(len(node_list)):
        cell = adj_mtx[i, j]
        ax.text(j, i, cell, ha="center", va="center", color="black" if cell == 0 else "white")
ax.set_title("Adjacency matrix", fontsize=12)
plt.show()
# Calculate the incidence of the edges on the nodes (converted to a dense matrix)
# NOTE(review): no nodelist/edgelist is passed, so row/column order is whatever
# networkx derives from the graph itself — confirm before labelling the axes
inc_mtx = nx.incidence_matrix(dwg).todense()
inc_mtx
matrix([[1., 0., 0., ..., 0., 0., 0.], [1., 1., 0., ..., 0., 1., 0.], [0., 1., 1., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 1., 0., 0.], [0., 0., 0., ..., 0., 1., 1.]])
# Plotting the Incidence matrix
fig, ax = plt.subplots()
ax.imshow(inc_mtx, cmap="Blues")
# Take the labels from the graph itself: without an explicit nodelist/edgelist
# the incidence matrix orders its rows like dwg.nodes() and its columns like
# dwg.edges(), which is not the order in which the edges were inserted
node_labels = list(dwg.nodes())
edge_labels = list(dwg.edges(data="weight"))
ax.set_xticks(np.arange(len(edge_labels)))
ax.set_yticks(np.arange(len(node_labels)))
ax.set_xticklabels(edge_labels, rotation=45)
ax.set_yticklabels(node_labels)
# Write every cell value: black for zeros, white for the other cells
for i in range(len(node_labels)):
    for j in range(len(edge_labels)):
        text_color = "black" if inc_mtx[i, j] == 0 else "white"
        ax.text(j, i, int(inc_mtx[i, j]), ha="center", va="center", color=text_color)
ax.set_title("Incidence matrix", fontsize=12)
plt.show()
# Plot the Directed Weighted Graph (free format, default layout)
plt.rcParams["figure.figsize"] = [20, 20]
nx.draw(dwg, with_labels=True)
plt.title("Directed Weighted Graph (DWG)", fontsize=14)
plt.axis("off")
plt.show()
# Util function - save list to CSV file
def export_list_to_csv_file(csv_path:str, header:list, data:list) -> None:
    """Write ``header`` followed by every row of ``data`` to a UTF-8 CSV file.

    Args:
        csv_path: Destination path; an existing file is overwritten.
        header: Column names written as the first row.
        data: Iterable of rows, each an iterable of cell values.
    """
    # newline="" leaves line-ending handling to the csv writer
    with open(csv_path, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        writer.writerows(data)
# Util function - save json to json file
def export_json_to_json_file(json_path:str, json_data:dict) -> None:
    """Serialize ``json_data`` as JSON into the file at ``json_path``.

    Args:
        json_path: Destination path; an existing file is overwritten.
        json_data: JSON-serializable data to write.
    """
    # Explicit UTF-8, matching the CSV exporter, so the output does not
    # depend on the platform's default encoding.
    with open(json_path, "w", encoding="utf-8") as outfile:
        json.dump(json_data, outfile)
# Create CSV file with one [source, target, weight] row per counted edge
header = ["source", "target", "weight"]
# Edge keys have the form "source-target"
split_edges = ((key.split("-"), weight) for key, weight in edge_counter.items())
csv_data = [[tokens[0], tokens[1], weight] for tokens, weight in split_edges]
# Save CSV file
csv_filepath = "../data/graph_data.csv"
export_list_to_csv_file(csv_filepath, header, csv_data)
# Create JSON file
json_data = {"nodes": [], "links": []}
node_list = []
# Set twin of node_list for constant-time "already added?" checks
seen_nodes = set()
# Add weighted edges (only those counted more than once)
for key, weight in edge_counter.items():
    tokens = key.split("-")
    source = tokens[0]
    target = tokens[1]
    if weight > 1:
        # Add nodes in first-seen order, without duplicates
        for endpoint in (source, target):
            if endpoint not in seen_nodes:
                seen_nodes.add(endpoint)
                node_list.append(endpoint)
        # Add edges
        json_data["links"].append({"source": source, "target": target, "value": weight})
# Keys of tw_accounts, lower-cased and without their leading "@"
source_list = {key.lower()[1:] for key in tw_accounts}
for node in node_list:
    # Group 1: node appears among the tw_accounts keys; group 2: it does not
    json_data["nodes"].append({"id": node, "group": 1 if node in source_list else 2})
# Save JSON file
json_filepath = "../data/graph_data.json"
export_json_to_json_file(json_filepath, json_data)