Word embedding is the collective name for a set of language modeling and feature learning techniques in natural language processing (NLP) where words or phrases from the vocabulary are mapped to vectors of real numbers. Conceptually, it involves a mathematical embedding from a space with many dimensions per word to a continuous vector space of much lower dimension. [1]
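As a toy illustration (made-up 3-dimensional vectors, purely for intuition), words can be stored as dense vectors and compared with cosine similarity:
import numpy as np
# Hypothetical 3-dimensional embeddings; real models use hundreds of dimensions
toy_embedding = {
    'holmes': np.array([0.9, 0.1, 0.3]),
    'watson': np.array([0.8, 0.2, 0.4]),
    'london': np.array([0.1, 0.9, 0.7]),
}
def cosine_similarity(u, v):
    # Cosine of the angle between two vectors: 1.0 means identical direction
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
print(cosine_similarity(toy_embedding['holmes'], toy_embedding['watson']))  # high
print(cosine_similarity(toy_embedding['holmes'], toy_embedding['london']))  # lower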
Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora. Its target audience is the natural language processing (NLP) and information retrieval (IR) community. [2]
# Load Python libraries
import io
import re
import pandas as pd
import random
import numpy as np
import timeit
from collections import Counter
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# Load Plot libraries
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
# Load NLP libraries from gensim and spacy
import spacy.lang.en as en
import gensim
from gensim.models import Word2Vec
gensim.__version__
'4.0.1'
# Util function to read a plain text file
def read_text_file(file_path, encoding='ISO-8859-1'):
    text = ""
    with io.open(file_path, 'r', encoding=encoding) as f:
        text = f.read()
    return text
# Get text sample
file_path = "../data/en/The Adventures of Sherlock Holmes - Arthur Conan Doyle.txt"
plain_text = read_text_file(file_path)
len(plain_text)
576467
# Show first 1000 characters of document
plain_text[:1000]
"\nProject Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever. You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.net\n\n\nTitle: The Adventures of Sherlock Holmes\n\nAuthor: Arthur Conan Doyle\n\nRelease Date: November 29, 2002 [EBook #1661]\nLast Updated: May 20, 2019\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***\n\n\n\nProduced by an anonymous Project Gutenberg volunteer and Jose Menendez\n\n\n\ncover\n\n\n\nThe Adventures of Sherlock Holmes\n\n\n\nby Arthur Conan Doyle\n\n\n\nContents\n\n\n I. A Scandal in Bohemia\n II. The Red-Headed League\n III. A Case of Identity\n IV. The Boscombe Valley Mystery\n V. The Five Orange Pips\n VI. The Man with the Twisted Lip\n VII. The Adventure of the Blue C"
Text cleaning refers to the process of normalizing the raw input data so that it has meaning and value for the steps that follow.
# Cleaning the text with RegEx
clean_text = plain_text.lower()
clean_text = clean_text.replace('\n', '.')
clean_text = re.sub('[^a-zA-Z.]', ' ', clean_text)
clean_text = re.sub(r'\s+', ' ', clean_text)
clean_text = re.sub(r'\.+', ".", clean_text)
clean_text[:1000]
'.project gutenberg s the adventures of sherlock holmes by arthur conan doyle.this ebook is for the use of anyone anywhere at no cost and with.almost no restrictions whatsoever. you may copy it give it away or.re use it under the terms of the project gutenberg license included.with this ebook or online at www.gutenberg.net.title the adventures of sherlock holmes.author arthur conan doyle.release date november ebook .last updated may .language english.character set encoding utf . start of this project gutenberg ebook the adventures of sherlock holmes .produced by an anonymous project gutenberg volunteer and jose menendez.cover.the adventures of sherlock holmes.by arthur conan doyle.contents. i. a scandal in bohemia. ii. the red headed league. iii. a case of identity. iv. the boscombe valley mystery. v. the five orange pips. vi. the man with the twisted lip. vii. the adventure of the blue carbuncle. viii. the adventure of the speckled band. ix. the adventure of the engineer s thumb. x. th'
# Tokenize the text into sentences
sentence_list = clean_text.split('.')
len(sentence_list)
14592
# Tokenize the sentences into words
word_list = [sentence.split() for sentence in sentence_list if len(sentence.split()) > 0]
word_list[:10]
[['project', 'gutenberg', 's', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'by', 'arthur', 'conan', 'doyle'], ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with'], ['almost', 'no', 'restrictions', 'whatsoever'], ['you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or'], ['re', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included'], ['with', 'this', 'ebook', 'or', 'online', 'at', 'www'], ['gutenberg'], ['net'], ['title', 'the', 'adventures', 'of', 'sherlock', 'holmes'], ['author', 'arthur', 'conan', 'doyle']]
# Count the words in a document and return the N most frequent
def count_words(sentences, n):
    words = Counter()
    for sent in sentences:
        for word in sent:
            words[word] += 1
    return words.most_common(n)
# Get the most common words in the document
n_words = count_words(word_list, 50)
df = pd.DataFrame.from_records(n_words, columns=['word', 'quantity'])
df.head(10)
 | word | quantity |
---|---|---|
0 | the | 5636 |
1 | i | 3038 |
2 | and | 3020 |
3 | to | 2744 |
4 | of | 2661 |
5 | a | 2643 |
6 | in | 1766 |
7 | that | 1752 |
8 | it | 1737 |
9 | you | 1503 |
# Plot the most common words in the document
fig, ax = plt.subplots()
df_desc = df.sort_values(by='quantity', ascending=True)
df_desc.plot.barh(ax=ax, x="word", y="quantity", color="green", alpha=0.75, figsize=(8, 14))
ax.get_legend().remove()
plt.title("The 50 Most Common Words in document", fontsize=16)
plt.xlabel("Quantity", fontsize=12)
plt.ylabel("Words", fontsize=12)
plt.show()
Stopwords are the most common words in a language, which do not significantly affect the meaning of the text.
# Get English stopwords
stopwords_en = en.stop_words.STOP_WORDS
print(stopwords_en)
{'thereafter', 'in', 'fifteen', 'becomes', 'wherever', 'must', '’re', 'what', 'please', 'show', 'yourself', 'at', 'their', 'below', 'beforehand', 'when', 'hereby', 'how', 'anywhere', 'rather', 'already', 'five', 'beside', 'except', 'mine', 'n’t', 'ten', 'his', '‘m', 'wherein', 'seems', 'latterly', 'afterwards', 'they', 'hundred', '‘ll', 'never', 'who', 'yet', 'twelve', 'unless', 'again', 'less', 'very', 'hence', 'empty', 'however', 'make', 'somewhere', 'this', 'whatever', 'down', 'alone', 'often', 'whoever', 'during', 'former', 'whose', 'name', 'put', 'throughout', 'has', 'some', 'was', 'off', 'still', 'anyone', 'nine', 'sometimes', 'many', 'call', 'therein', 'without', 'everything', 'else', 'any', 'after', "'ve", 'where', 'always', "'re", 'keep', 'really', '‘re', 'we', 'ca', 'nevertheless', 'none', 'mostly', 'now', 'nobody', 'bottom', 'these', 'along', 'too', 'for', 'get', 'therefore', 'go', 'herein', 'eleven', 'whole', 'and', 'by', 'last', 'all', 'four', 'yourselves', 'while', 'sixty', 'via', 'give', 'ours', 'side', 'back', 'us', 'whither', 'serious', 'see', 'since', 'then', 'each', 'everyone', 'also', 'well', 'other', 'top', 'cannot', 'every', 'the', 'three', 'but', 'nothing', 're', 'n‘t', 'thus', '’ve', 'forty', "n't", 'whether', 'above', 'than', 'whereupon', 'someone', 'with', 'our', 'move', 'due', 'either', 'may', 'against', 'although', 'take', 'indeed', 'anyhow', 'before', 'anything', 'full', 'of', 'seeming', '’ll', "'ll", 'should', 'even', 'upon', 'herself', 'besides', 'she', 'thru', 'more', 'would', 'am', 'one', 'seem', 'have', 'front', 'only', 'whom', 'done', 'beyond', 'namely', 'used', 'does', 'just', 'towards', 'eight', 'here', 'hereupon', 'because', 'can', 'various', 'first', 'further', '’d', 'under', 'will', 'be', '‘s', 'whenever', 'you', 'least', '’s', 'own', 'though', "'d", 'did', "'m", 'across', 'it', 'itself', 'both', 'or', 'formerly', 'into', 'its', 'up', 'noone', 'ever', 'my', 'whereas', 'if', 'over', 'myself', 'behind', '’m', 'nowhere', 'as', 'whereafter', 'yours', 'could', 'doing', 'among', 'he', 'next', 'those', 'amongst', 'became', 'part', 'there', 'latter', 'six', 'perhaps', 'become', 'within', 'her', 'quite', 'toward', 'using', 'otherwise', 'most', 'between', 'your', 'himself', 'something', 'out', 'together', 'seemed', 'anyway', 'such', 'is', 'others', 'few', 'third', '‘d', 'almost', 'them', 'around', 'ourselves', 'about', '‘ve', 'neither', 'onto', 'an', 'twenty', 'through', 'might', 'becoming', 'do', 'several', 'not', 'thereby', 'were', 'elsewhere', 'same', "'s", 'enough', 'somehow', 'until', 'sometime', 'two', 'a', 'why', 'thence', 'meanwhile', 'moreover', 'another', 'no', 'been', 'made', 'everywhere', 'from', 'nor', 'to', 'fifty', 'me', 'once', 'regarding', 'are', 'that', 'i', 'thereupon', 'so', 'hereafter', 'themselves', 'much', 'had', 'which', 'per', 'being', 'amount', 'say', 'whereby', 'him', 'hers', 'on', 'whence'}
# Remove stopwords
all_words = []
for ix in range(len(word_list)):
    all_words.append([word for word in word_list[ix] if (word not in stopwords_en and len(word) > 2)])
all_words[:10]
[['project', 'gutenberg', 'adventures', 'sherlock', 'holmes', 'arthur', 'conan', 'doyle'], ['ebook', 'use', 'cost'], ['restrictions', 'whatsoever'], ['copy', 'away'], ['use', 'terms', 'project', 'gutenberg', 'license', 'included'], ['ebook', 'online', 'www'], ['gutenberg'], ['net'], ['title', 'adventures', 'sherlock', 'holmes'], ['author', 'arthur', 'conan', 'doyle']]
# Get the most common words in the document after removing the stopwords
n_words = count_words(all_words, 50)
df = pd.DataFrame.from_records(n_words, columns = ['word', 'quantity'])
df.head(10)
 | word | quantity |
---|---|---|
0 | said | 486 |
1 | holmes | 465 |
2 | man | 305 |
3 | little | 269 |
4 | think | 174 |
5 | room | 171 |
6 | know | 170 |
7 | shall | 169 |
8 | come | 161 |
9 | time | 151 |
# Plot the most common words in the document
fig, ax = plt.subplots()
df_desc = df.sort_values(by='quantity', ascending=True)
df_desc.plot.barh(ax=ax, x="word", y="quantity", color="indigo", alpha=0.75, figsize=(8, 14))
ax.get_legend().remove()
plt.title("The 50 Most Common Words in document (without Stopwords)", fontsize=16)
plt.xlabel("Quantity", fontsize=12)
plt.ylabel("Words", fontsize=12)
plt.show()
# Reconstructing the clean text (without stop-words)
new_clean_text = ' '.join(word for sent in all_words for word in sent)
# Custom color function
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # HSL saturation and lightness must lie in the 0-100% range; pick a random light shade
    return "hsl(45, 80%%, %d%%)" % random.randint(60, 100)
# Create a Word cloud
wc = WordCloud(max_font_size=60, min_font_size=5, max_words=150, background_color="black", margin=2)
wc = wc.generate(new_clean_text)
# Plot a Word cloud
plt.figure(figsize = (12, 12))
plt.imshow(wc.recolor(color_func = color_func, random_state=3), interpolation = "bilinear")
plt.axis("off")
plt.show()
Word2Vec consists of models for generating word embeddings. These models are shallow, two-layer neural networks with one input layer, one hidden layer and one output layer. Word2Vec offers two architectures: CBOW (Continuous Bag of Words) and Skip-Gram; a CBOW model is trained below, and a Skip-Gram variant is sketched right after it. [3]
# Algorithm params
min_count = 5 # Minimum frequency count of words; the model ignores words that occur fewer than min_count times
size = 150 # The size of the dense vector to represent each token or word
window = 3 # The maximum distance between the target word and its neighboring word
sg = 0 # The training algorithm, either CBOW (0) or skip-gram (1)
epochs = 50 # Number of iterations (epochs) over the corpus
# Create Word2Vec model with CBOW algorithm approach
w2v_model = Word2Vec(all_words, min_count=min_count, compute_loss=True, vector_size=size, window=window, sg=sg, epochs=epochs)
# Getting the training loss value
training_loss = w2v_model.get_latest_training_loss()
print(training_loss)
1881113.5
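For comparison, the Skip-Gram architecture is selected simply by setting sg=1 (see the algorithm params above). The sketch below reuses the same hyperparameters and is illustrative only, since training time and the learned similarities would differ:
# Skip-Gram variant of the same model (sg=1); other hyperparameters unchanged
w2v_sg_model = Word2Vec(all_words, min_count=min_count, compute_loss=True, vector_size=size, window=window, sg=1, epochs=epochs)
print(w2v_sg_model.get_latest_training_loss())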
The vocabulary is the set of unique words in the document, restricted to those occurring at least min_count times.
# Show vocabulary size: unique words occurring at least min_count times
vocabulary = w2v_model.wv.index_to_key
len(vocabulary)
1780
# Show 'holmes' vector
w2v_model.wv['holmes']
array([ 0.39678112, -0.70309114, -0.40441296, 0.41272703, -0.39327854, 0.3943475 , 1.0194279 , 0.5290096 , -0.3295435 , 0.5167051 , -0.2100004 , 0.05697219, -0.6289016 , -0.5468892 , 0.07339276, 0.25482133, 0.35808587, -0.37766075, 0.30003628, 0.69528216, -0.5203084 , -0.56655407, 0.6395985 , 0.48054895, -0.14689611, -0.87779665, -0.33620927, -0.41077298, 0.44543254, -0.31105122, -0.6248907 , 0.4354348 , 0.00322383, -0.5232826 , 0.42328218, -0.354256 , 0.30290315, -0.6596066 , 0.21340986, -0.19804758, 0.23923375, 0.3380928 , 0.14989339, -0.5060465 , 0.1368444 , 0.0837866 , 0.09099548, -0.09379155, -0.49251 , 0.47424227, -0.31569317, 0.37731454, -0.4209911 , -0.04227128, 1.1047065 , 0.67135966, -0.01516947, 0.12846494, -0.30910513, 0.21949199, -0.24549896, -0.20272876, 0.48747998, -0.05098846, 0.51575536, -0.4784055 , 0.17526303, 0.5846486 , -1.0175432 , -0.6835881 , -0.49530125, -0.00624014, -0.16789956, -0.5730044 , 0.61781263, -0.08208855, 0.2954704 , 0.34314203, -0.6385965 , 0.48848748, 0.3607169 , -0.70421195, -0.61902785, 0.2587055 , -0.43062556, 0.13585067, 0.15164214, -0.01356333, 0.38123044, -0.08722676, -0.33507347, -0.04237956, -0.05416336, 0.47078547, 0.33078003, 0.07455686, -0.12697284, -0.34902504, -0.6440862 , 0.1371734 , 0.33433244, -0.3309382 , -0.34492585, -0.525152 , -0.5974499 , -0.35576165, 0.10244957, -0.3423658 , -0.5629105 , -0.42280748, 0.15419073, -0.21639544, 0.354176 , 0.14280857, -0.5938123 , 0.66370696, 0.3069274 , 0.33340967, -0.45068938, 0.19979353, -0.04969786, 0.27417824, 0.21661213, 0.01481856, -0.11360772, 0.6402411 , 0.8784679 , 0.19794293, -0.41630414, 0.13313097, 0.4308818 , -0.4056247 , -0.804949 , -0.05049953, 0.09105106, -0.07421248, -0.40570074, -0.30800608, 0.68268174, 0.14379938, -0.4121782 , 0.73899627, 0.6005977 , -0.1266218 , 0.85023427, 0.5550141 , 0.02638014, 0.11194157, 0.891131 , -0.67984074], dtype=float32)
vectors = []
for vector in w2v_model.wv.vectors:
    vectors.append(list(vector))
# Save word embeddings
with open("../data/embeddings/vectors.tsv", "w") as f_output:
for row in vectors:
line = '\t'.join(map(str, row))
f_output.write(line + "\n")
# Save metadata
with open("../data/embeddings/metadata.tsv", "w") as f_output:
f_output.write("word\tlen\n")
for word in vocabulary:
f_output.write(word + "\t" + str(len(word)) + "\n")
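As a quick sanity check, the exported files can be read back with pandas; this is a minimal sketch assuming the same paths and tab separator used above (vectors.tsv was written without a header row):
# Reload the exported embeddings and metadata to verify the files
vectors_df = pd.read_csv("../data/embeddings/vectors.tsv", sep="\t", header=None)
metadata_df = pd.read_csv("../data/embeddings/metadata.tsv", sep="\t")
print(vectors_df.shape)   # expected: (vocabulary size, vector size)
print(metadata_df.head())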
The trained model can now be queried for the words that are most similar to a given word in terms of meaning and context.
# Finding Positive Similar Words
w2v_model.wv.most_similar(positive=['holmes'], topn=10)
[('adventures', 0.8516415357589722), ('glancing', 0.7797467112541199), ('laughed', 0.7691915035247803), ('laughing', 0.7643370032310486), ('staggered', 0.7595707178115845), ('smiling', 0.7572908401489258), ('visitor', 0.7556211352348328), ('silent', 0.7444738745689392), ('gutenberg', 0.7365286946296692), ('cases', 0.7255791425704956)]
# Finding Negative Similar Words
w2v_model.wv.most_similar(negative=['holmes'], topn=10)
[('carried', 0.177780881524086), ('went', 0.17299138009548187), ('strong', 0.16780786216259003), ('taken', 0.10628341883420944), ('cut', 0.10017789155244827), ('save', 0.09637609124183655), ('place', 0.08977862447500229), ('hair', 0.08958049863576889), ('brought', 0.07890144735574722), ('son', 0.0758872702717781)]
# Calculate the similarity between 2 words
w2v_model.wv.similarity(w1='holmes', w2='lestrade')
0.70907706
# Calculate similarity: sim(w1, w2) = sim(w2, w1)
w2v_model.wv.similarity(w1='lestrade', w2='holmes')
0.70907706
# Show word that doesn't belong to the list
w2v_model.wv.doesnt_match(['holmes', 'watson', 'mycroft'])
'holmes'
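The same most_similar API also accepts positive and negative words together, which allows analogy-style queries. On a corpus of a single novel the results are noisy, so the sketch below is illustrative only:
# Analogy-style query: words close to 'holmes' and 'watson' but far from 'lestrade'
w2v_model.wv.most_similar(positive=['holmes', 'watson'], negative=['lestrade'], topn=5)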
# Set the target word and number of neighbors
target_word = 'sherlock'
top_n = 25
# Calculate the most and least similar words
most_similar = w2v_model.wv.most_similar(positive = [target_word], topn = top_n)
less_similar = w2v_model.wv.most_similar(negative = [target_word], topn = top_n)
# Save them
neighbors = [(target_word, 1, 'current')]
neighbors += [(*row, 'most') for row in most_similar]
neighbors += [(*row, 'less') for row in less_similar]
# Get the neighbors' vectors
neigh_word = [row[0] for row in neighbors]
X = w2v_model.wv[neigh_word]
len(X)
51
# Perform PCA with 3 components
pca = PCA(n_components=3)
pca_data = pca.fit_transform(X)
# The explained variance of each principal component
print(list(pca.explained_variance_ratio_))
print(sum(pca.explained_variance_ratio_))
[0.42086148, 0.1456058, 0.07055229]
0.6370195746421814
# Create and show principal components DataFrame
pca_df = pd.DataFrame(data=pca_data, columns=["PC1", "PC2", "PC3"])
pca_df['Name'] = neigh_word
pca_df.head(10)
 | PC1 | PC2 | PC3 | Name |
---|---|---|---|---|
0 | 5.340423 | -0.646308 | -0.963914 | sherlock |
1 | 2.565961 | 0.041457 | 0.005309 | remarked |
2 | 1.090095 | -0.276063 | -0.419835 | observed |
3 | 0.712601 | -0.024503 | 0.282124 | murmured |
4 | 0.682104 | -0.265439 | -0.061924 | smiled |
5 | 1.067823 | -0.348628 | -0.098575 | heartily |
6 | 0.848729 | -0.340285 | 0.058723 | chuckled |
7 | 2.484412 | -0.281194 | -0.326231 | lestrade |
8 | 1.115612 | 0.112329 | 0.243798 | rising |
9 | 2.190557 | -1.440016 | -1.332575 | rose |
# Create a scatter plot of the projection
fig, ax = plt.subplots(figsize = (14, 14))
gap = 0.01
colors = dict()
colors['current'] = 'royalblue'
colors['most'] = 'forestgreen'
colors['less'] = 'orange'
# Add points one by one with a loop
for i, word in enumerate(neigh_word):
    node_col = colors[neighbors[i][2]]
    if word == target_word:
        node_size = 100
        text = word.upper()
    else:
        node_size = 50
        text = word + ': ' + str(round(neighbors[i][1], 3))
    plt.scatter(pca_data[i, 0], pca_data[i, 1], c=node_col, s=node_size)
    plt.annotate(text, xy=(pca_data[i, 0] + gap*10, pca_data[i, 1] - gap*3))
# Plot setup
ax.set_xlabel("PC 1", fontsize=12)
ax.set_ylabel("PC 2", fontsize=12)
ax.set_title("Most Similar Words to " + target_word, fontsize=20)
ax.legend(["Similar Words"])
ax.grid()
# Create Word2Vec model
w2v_model = Word2Vec(all_words, min_count=5, vector_size=150, window=3, sg=0, epochs=50)
vocabulary = w2v_model.wv.index_to_key
len(vocabulary)
1780
# Returns the dense similarity between all the words in the document
def get_dense_similarity(model, precision=3):
    vocabulary = list(model.wv.index_to_key)
    n_words = len(vocabulary)
    matrix = np.zeros((n_words, n_words))
    for i in range(n_words):
        for j in range(n_words):
            if i == j:
                matrix[i][j] = 1
            elif i > j:
                word_sim = round(model.wv.similarity(w1=vocabulary[i], w2=vocabulary[j]), precision)
                matrix[i][j] = word_sim
                matrix[j][i] = word_sim
    return matrix
# Compute the similarity between all the words in the document and time it
start_time = timeit.default_timer()
words_sim = get_dense_similarity(w2v_model, 2)
elapsed = timeit.default_timer() - start_time
elapsed
57.315067199999994
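The nested Python loops above take close to a minute for 1,780 words. As a sketch of a faster alternative (assuming the rows of w2v_model.wv.vectors follow the index_to_key order, which is how gensim stores them), the whole cosine-similarity matrix can be obtained by normalizing the embedding matrix once and taking a single matrix product:
# Vectorized cosine-similarity matrix: normalize rows, then one matrix product
vecs = w2v_model.wv.vectors
norm_vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
words_sim_fast = np.round(norm_vecs @ norm_vecs.T, 2)
words_sim_fast.shape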
# Show dense similarity matrix as a dataframe
df_dense = pd.DataFrame.from_records(words_sim, columns=vocabulary)
print(df_dense.shape)
df_dense.iloc[:18, :18]
(1780, 1780)
 | said | holmes | man | little | think | room | know | shall | come | time | came | door | good | face | matter | yes | hand | house |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.00 | 0.38 | 0.23 | 0.24 | 0.64 | 0.07 | 0.58 | 0.38 | 0.48 | 0.17 | 0.12 | 0.09 | 0.41 | 0.12 | 0.21 | 0.36 | 0.31 | 0.08 |
1 | 0.38 | 1.00 | 0.17 | 0.20 | 0.35 | 0.16 | 0.13 | -0.05 | 0.34 | 0.27 | -0.02 | 0.22 | 0.10 | 0.16 | 0.16 | 0.21 | 0.39 | -0.07 |
2 | 0.23 | 0.17 | 1.00 | 0.31 | 0.16 | 0.16 | 0.12 | -0.10 | 0.18 | 0.24 | 0.07 | 0.25 | 0.07 | 0.57 | 0.26 | 0.17 | 0.14 | 0.15 |
3 | 0.24 | 0.20 | 0.31 | 1.00 | 0.26 | 0.02 | 0.33 | 0.21 | 0.34 | 0.31 | 0.08 | -0.02 | 0.38 | 0.41 | 0.52 | 0.24 | 0.27 | 0.09 |
4 | 0.64 | 0.35 | 0.16 | 0.26 | 1.00 | 0.11 | 0.64 | 0.50 | 0.61 | 0.46 | 0.27 | 0.13 | 0.45 | -0.06 | 0.31 | 0.54 | 0.03 | 0.26 |
5 | 0.07 | 0.16 | 0.16 | 0.02 | 0.11 | 1.00 | -0.09 | 0.04 | 0.13 | 0.25 | 0.34 | 0.67 | 0.05 | 0.13 | -0.13 | 0.08 | 0.20 | 0.60 |
6 | 0.58 | 0.13 | 0.12 | 0.33 | 0.64 | -0.09 | 1.00 | 0.63 | 0.53 | 0.22 | 0.33 | -0.01 | 0.42 | 0.12 | 0.58 | 0.46 | 0.16 | 0.17 |
7 | 0.38 | -0.05 | -0.10 | 0.21 | 0.50 | 0.04 | 0.63 | 1.00 | 0.53 | 0.31 | 0.39 | -0.06 | 0.52 | -0.12 | 0.50 | 0.52 | -0.00 | 0.27 |
8 | 0.48 | 0.34 | 0.18 | 0.34 | 0.61 | 0.13 | 0.53 | 0.53 | 1.00 | 0.52 | 0.36 | 0.11 | 0.59 | -0.03 | 0.45 | 0.42 | 0.13 | 0.17 |
9 | 0.17 | 0.27 | 0.24 | 0.31 | 0.46 | 0.25 | 0.22 | 0.31 | 0.52 | 1.00 | 0.30 | 0.17 | 0.37 | -0.03 | 0.31 | 0.45 | 0.07 | 0.34 |
10 | 0.12 | -0.02 | 0.07 | 0.08 | 0.27 | 0.34 | 0.33 | 0.39 | 0.36 | 0.30 | 1.00 | 0.20 | 0.41 | -0.03 | 0.12 | 0.30 | 0.09 | 0.52 |
11 | 0.09 | 0.22 | 0.25 | -0.02 | 0.13 | 0.67 | -0.01 | -0.06 | 0.11 | 0.17 | 0.20 | 1.00 | -0.08 | 0.26 | -0.11 | 0.00 | 0.19 | 0.36 |
12 | 0.41 | 0.10 | 0.07 | 0.38 | 0.45 | 0.05 | 0.42 | 0.52 | 0.59 | 0.37 | 0.41 | -0.08 | 1.00 | 0.04 | 0.32 | 0.47 | 0.04 | 0.25 |
13 | 0.12 | 0.16 | 0.57 | 0.41 | -0.06 | 0.13 | 0.12 | -0.12 | -0.03 | -0.03 | -0.03 | 0.26 | 0.04 | 1.00 | 0.23 | 0.01 | 0.45 | 0.03 |
14 | 0.21 | 0.16 | 0.26 | 0.52 | 0.31 | -0.13 | 0.58 | 0.50 | 0.45 | 0.31 | 0.12 | -0.11 | 0.32 | 0.23 | 1.00 | 0.50 | 0.06 | -0.01 |
15 | 0.36 | 0.21 | 0.17 | 0.24 | 0.54 | 0.08 | 0.46 | 0.52 | 0.42 | 0.45 | 0.30 | 0.00 | 0.47 | 0.01 | 0.50 | 1.00 | 0.02 | 0.41 |
16 | 0.31 | 0.39 | 0.14 | 0.27 | 0.03 | 0.20 | 0.16 | -0.00 | 0.13 | 0.07 | 0.09 | 0.19 | 0.04 | 0.45 | 0.06 | 0.02 | 1.00 | 0.18 |
17 | 0.08 | -0.07 | 0.15 | 0.09 | 0.26 | 0.60 | 0.17 | 0.27 | 0.17 | 0.34 | 0.52 | 0.36 | 0.25 | 0.03 | -0.01 | 0.41 | 0.18 | 1.00 |
# Plot dense similarity matrix
fig, ax = plt.subplots(figsize = (14, 14))
sns.heatmap(words_sim, ax = ax)
ax.set_title("Dense Similarity Matrix", fontsize=16)
ax.set_xlabel("vocabulary", fontsize=12)
ax.set_ylabel("vocabulary", fontsize=12)
plt.show()
# Exporting dense word similarity matrix
file_path = "../data/network/dense_similarity.csv"
df_dense.to_csv(file_path, index = False, sep = ',')
# Returns the sparse similarity between all the words in the document
def get_sparse_similarity(model, precision=3, top_n=10):
    matrix = []
    vocabulary = list(model.wv.index_to_key)
    n_words = len(vocabulary)
    # Calculate sparse similarity, keeping only the top_n neighbors of each word
    for word in vocabulary:
        row_sim = np.zeros(n_words)
        best_sim = model.wv.most_similar(positive=[word], topn=top_n)
        for neighbor in best_sim:
            nei_name = neighbor[0]
            nei_ix = vocabulary.index(nei_name)
            nei_sim = round(neighbor[1], precision)
            row_sim[nei_ix] = nei_sim
        matrix.append(row_sim)
    return matrix, vocabulary
# Compute the similarity between the nearest words and time it
start_time = timeit.default_timer()
words_sim, vocabulary = get_sparse_similarity(w2v_model, 2, 50)
elapsed = timeit.default_timer() - start_time
elapsed
3.117964999999998
# Show sparse similarity matrix as a dataframe
df_sparse = pd.DataFrame.from_records(words_sim, columns=vocabulary)
print(df_sparse.shape)
df_sparse.iloc[:18, :18]
(1780, 1780)
 | said | holmes | man | little | think | room | know | shall | come | time | came | door | good | face | matter | yes | hand | house |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.57 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
8 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.67 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.59 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
13 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
14 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
15 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
16 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
17 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.60 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
# Plot sparse similarity matrix
fig, ax = plt.subplots(figsize = (14, 14))
sns.heatmap(words_sim, ax = ax)
ax.set_title("Sparse Similarity Matrix", fontsize=16)
ax.set_xlabel("vocabulary", fontsize=12)
ax.set_ylabel("vocabulary", fontsize=12)
plt.show()
# Exporting sparse word similarity matrix
file_path = "../data/network/sparse_similarity.csv"
df_sparse.to_csv(file_path, index=False, sep=',')
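Since the sparse matrix is exported for network analysis, it can also be convenient to derive an explicit edge list (source word, target word, similarity), which many graph tools ingest directly. A sketch, with a hypothetical output file name:
# Build an edge list (word, neighbor, similarity) from the sparse similarity matrix
edges = []
for i, word in enumerate(vocabulary):
    for j in np.nonzero(words_sim[i])[0]:
        edges.append((word, vocabulary[j], words_sim[i][j]))
df_edges = pd.DataFrame(edges, columns=['source', 'target', 'weight'])
df_edges.to_csv("../data/network/sparse_edges.csv", index=False)  # hypothetical file name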