Word embedding is the collective name for a set of language modeling and feature learning techniques in natural language processing (NLP) where words or phrases from the vocabulary are mapped to vectors of real numbers. Conceptually, it involves a mathematical embedding from a space with many dimensions per word to a continuous vector space of much lower dimension. [1]
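As a toy illustration (made-up 3-dimensional vectors, purely for intuition), words can be stored as dense vectors and compared with cosine similarity:
import numpy as np
# Hypothetical 3-dimensional embeddings; real models use hundreds of dimensions
toy_embedding = {
    'holmes': np.array([0.9, 0.1, 0.3]),
    'watson': np.array([0.8, 0.2, 0.4]),
    'london': np.array([0.1, 0.9, 0.7]),
}
def cosine_similarity(u, v):
    # Cosine of the angle between two vectors: 1.0 means identical direction
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
print(cosine_similarity(toy_embedding['holmes'], toy_embedding['watson']))  # high
print(cosine_similarity(toy_embedding['holmes'], toy_embedding['london']))  # lower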
Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora. Its target audience is the natural language processing (NLP) and information retrieval (IR) community. [2]
# Load Python libraries
import io
import re
import pandas as pd
import random
import numpy as np
import timeit
from collections import Counter
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
# Load Plot libraries
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
# Load NLP libraries from gensim and spacy
import spacy.lang.en as en
import gensim
from gensim.models import Word2Vec
gensim.__version__
'4.0.1'
# Util function to read a plain text file
def read_text_file(file_path, encoding='ISO-8859-1'):
    text = ""
    with io.open(file_path, 'r', encoding=encoding) as f:
        text = f.read()
    return text
# Get text sample
file_path = "../data/en/The Adventures of Sherlock Holmes - Arthur Conan Doyle.txt"
plain_text = read_text_file(file_path)
len(plain_text)
576467
# Show first 1000 characters of document
plain_text[:1000]
"\nProject Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever. You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.net\n\n\nTitle: The Adventures of Sherlock Holmes\n\nAuthor: Arthur Conan Doyle\n\nRelease Date: November 29, 2002 [EBook #1661]\nLast Updated: May 20, 2019\n\nLanguage: English\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***\n\n\n\nProduced by an anonymous Project Gutenberg volunteer and Jose Menendez\n\n\n\ncover\n\n\n\nThe Adventures of Sherlock Holmes\n\n\n\nby Arthur Conan Doyle\n\n\n\nContents\n\n\n I. A Scandal in Bohemia\n II. The Red-Headed League\n III. A Case of Identity\n IV. The Boscombe Valley Mystery\n V. The Five Orange Pips\n VI. The Man with the Twisted Lip\n VII. The Adventure of the Blue C"
Text cleaning refers to the process of normalizing the raw input data so that it has meaning and value for the steps that follow.
# Cleaning the text with RegEx
clean_text = plain_text.lower()
clean_text = clean_text.replace('\n', '.')
clean_text = re.sub('[^a-zA-Z.]', ' ', clean_text)
clean_text = re.sub(r'\s+', ' ', clean_text)
clean_text = re.sub(r'\.+', ".", clean_text)
clean_text[:1000]
'.project gutenberg s the adventures of sherlock holmes by arthur conan doyle.this ebook is for the use of anyone anywhere at no cost and with.almost no restrictions whatsoever. you may copy it give it away or.re use it under the terms of the project gutenberg license included.with this ebook or online at www.gutenberg.net.title the adventures of sherlock holmes.author arthur conan doyle.release date november ebook .last updated may .language english.character set encoding utf . start of this project gutenberg ebook the adventures of sherlock holmes .produced by an anonymous project gutenberg volunteer and jose menendez.cover.the adventures of sherlock holmes.by arthur conan doyle.contents. i. a scandal in bohemia. ii. the red headed league. iii. a case of identity. iv. the boscombe valley mystery. v. the five orange pips. vi. the man with the twisted lip. vii. the adventure of the blue carbuncle. viii. the adventure of the speckled band. ix. the adventure of the engineer s thumb. x. th'
# Tokenize the text into sentences
sentence_list = clean_text.split('.')
len(sentence_list)
14592
# Tokenize the sentences into words
word_list = [sentence.split() for sentence in sentence_list if len(sentence.split()) > 0]
word_list[:10]
[['project', 'gutenberg', 's', 'the', 'adventures', 'of', 'sherlock', 'holmes', 'by', 'arthur', 'conan', 'doyle'], ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with'], ['almost', 'no', 'restrictions', 'whatsoever'], ['you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or'], ['re', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included'], ['with', 'this', 'ebook', 'or', 'online', 'at', 'www'], ['gutenberg'], ['net'], ['title', 'the', 'adventures', 'of', 'sherlock', 'holmes'], ['author', 'arthur', 'conan', 'doyle']]
# Count the words in a document and return the N most frequent
def count_words(sentences, n):
    words = Counter()
    for sent in sentences:
        for word in sent:
            words[word] += 1
    return words.most_common(n)
# Get the most common words in the document
n_words = count_words(word_list, 50)
df = pd.DataFrame.from_records(n_words, columns=['word', 'quantity'])
df.head(10)
 | word | quantity |
---|---|---|
0 | the | 5636 |
1 | i | 3038 |
2 | and | 3020 |
3 | to | 2744 |
4 | of | 2661 |
5 | a | 2643 |
6 | in | 1766 |
7 | that | 1752 |
8 | it | 1737 |
9 | you | 1503 |
# Plot the most common words in the document
fig, ax = plt.subplots()
df_desc = df.sort_values(by='quantity', ascending=True)
df_desc.plot.barh(ax=ax, x="word", y="quantity", color="green", alpha=0.75, figsize=(8, 14))
ax.get_legend().remove()
plt.title("The 50 Most Common Words in document", fontsize=16)
plt.xlabel("Quantity", fontsize=12)
plt.ylabel("Words", fontsize=12)
plt.show()
Stopwords are the most common words in a language, which do not significantly affect the meaning of the text.
# Get English stopwords
stopwords_en = en.stop_words.STOP_WORDS
print(stopwords_en)
{'thereafter', 'in', 'fifteen', 'becomes', 'wherever', 'must', '’re', 'what', 'please', 'show', 'yourself', 'at', 'their', 'below', 'beforehand', 'when', 'hereby', 'how', 'anywhere', 'rather', 'already', 'five', 'beside', 'except', 'mine', 'n’t', 'ten', 'his', '‘m', 'wherein', 'seems', 'latterly', 'afterwards', 'they', 'hundred', '‘ll', 'never', 'who', 'yet', 'twelve', 'unless', 'again', 'less', 'very', 'hence', 'empty', 'however', 'make', 'somewhere', 'this', 'whatever', 'down', 'alone', 'often', 'whoever', 'during', 'former', 'whose', 'name', 'put', 'throughout', 'has', 'some', 'was', 'off', 'still', 'anyone', 'nine', 'sometimes', 'many', 'call', 'therein', 'without', 'everything', 'else', 'any', 'after', "'ve", 'where', 'always', "'re", 'keep', 'really', '‘re', 'we', 'ca', 'nevertheless', 'none', 'mostly', 'now', 'nobody', 'bottom', 'these', 'along', 'too', 'for', 'get', 'therefore', 'go', 'herein', 'eleven', 'whole', 'and', 'by', 'last', 'all', 'four', 'yourselves', 'while', 'sixty', 'via', 'give', 'ours', 'side', 'back', 'us', 'whither', 'serious', 'see', 'since', 'then', 'each', 'everyone', 'also', 'well', 'other', 'top', 'cannot', 'every', 'the', 'three', 'but', 'nothing', 're', 'n‘t', 'thus', '’ve', 'forty', "n't", 'whether', 'above', 'than', 'whereupon', 'someone', 'with', 'our', 'move', 'due', 'either', 'may', 'against', 'although', 'take', 'indeed', 'anyhow', 'before', 'anything', 'full', 'of', 'seeming', '’ll', "'ll", 'should', 'even', 'upon', 'herself', 'besides', 'she', 'thru', 'more', 'would', 'am', 'one', 'seem', 'have', 'front', 'only', 'whom', 'done', 'beyond', 'namely', 'used', 'does', 'just', 'towards', 'eight', 'here', 'hereupon', 'because', 'can', 'various', 'first', 'further', '’d', 'under', 'will', 'be', '‘s', 'whenever', 'you', 'least', '’s', 'own', 'though', "'d", 'did', "'m", 'across', 'it', 'itself', 'both', 'or', 'formerly', 'into', 'its', 'up', 'noone', 'ever', 'my', 'whereas', 'if', 'over', 'myself', 'behind', '’m', 'nowhere', 'as', 'whereafter', 'yours', 'could', 'doing', 'among', 'he', 'next', 'those', 'amongst', 'became', 'part', 'there', 'latter', 'six', 'perhaps', 'become', 'within', 'her', 'quite', 'toward', 'using', 'otherwise', 'most', 'between', 'your', 'himself', 'something', 'out', 'together', 'seemed', 'anyway', 'such', 'is', 'others', 'few', 'third', '‘d', 'almost', 'them', 'around', 'ourselves', 'about', '‘ve', 'neither', 'onto', 'an', 'twenty', 'through', 'might', 'becoming', 'do', 'several', 'not', 'thereby', 'were', 'elsewhere', 'same', "'s", 'enough', 'somehow', 'until', 'sometime', 'two', 'a', 'why', 'thence', 'meanwhile', 'moreover', 'another', 'no', 'been', 'made', 'everywhere', 'from', 'nor', 'to', 'fifty', 'me', 'once', 'regarding', 'are', 'that', 'i', 'thereupon', 'so', 'hereafter', 'themselves', 'much', 'had', 'which', 'per', 'being', 'amount', 'say', 'whereby', 'him', 'hers', 'on', 'whence'}
# Remove stopwords
all_words = []
for ix in range(len(word_list)):
    all_words.append([word for word in word_list[ix] if (word not in stopwords_en and len(word) > 2)])
all_words[:10]
[['project', 'gutenberg', 'adventures', 'sherlock', 'holmes', 'arthur', 'conan', 'doyle'], ['ebook', 'use', 'cost'], ['restrictions', 'whatsoever'], ['copy', 'away'], ['use', 'terms', 'project', 'gutenberg', 'license', 'included'], ['ebook', 'online', 'www'], ['gutenberg'], ['net'], ['title', 'adventures', 'sherlock', 'holmes'], ['author', 'arthur', 'conan', 'doyle']]
# Get the most common words in the document after removing the stopwords
n_words = count_words(all_words, 50)
df = pd.DataFrame.from_records(n_words, columns = ['word', 'quantity'])
df.head(10)
 | word | quantity |
---|---|---|
0 | said | 486 |
1 | holmes | 465 |
2 | man | 305 |
3 | little | 269 |
4 | think | 174 |
5 | room | 171 |
6 | know | 170 |
7 | shall | 169 |
8 | come | 161 |
9 | time | 151 |
# Plot the most common words in the document
fig, ax = plt.subplots()
df_desc = df.sort_values(by='quantity', ascending=True)
df_desc.plot.barh(ax=ax, x="word", y="quantity", color="indigo", alpha=0.75, figsize=(8, 14))
ax.get_legend().remove()
plt.title("The 50 Most Common Words in document (without Stopwords)", fontsize=16)
plt.xlabel("Quantity", fontsize=12)
plt.ylabel("Words", fontsize=12)
plt.show()
# Reconstructing the clean text (without stop-words)
new_clean_text = ' '.join(word for sent in all_words for word in sent)
# Custom color function
def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    # HSL saturation and lightness must lie in the 0-100% range; pick a random light shade
    return "hsl(45, 80%%, %d%%)" % random.randint(60, 100)
# Create a Word cloud
wc = WordCloud(max_font_size=60, min_font_size=5, max_words=150, background_color="black", margin=2)
wc = wc.generate(new_clean_text)
# Plot a Word cloud
plt.figure(figsize = (12, 12))
plt.imshow(wc.recolor(color_func = color_func, random_state=3), interpolation = "bilinear")
plt.axis("off")
plt.show()
Word2Vec consists of models for generating word embeddings. These models are shallow, two-layer neural networks with one input layer, one hidden layer and one output layer. Word2Vec offers two architectures: CBOW (Continuous Bag of Words) and Skip-Gram; a CBOW model is trained below, and a Skip-Gram variant is sketched right after it. [3]
# Algorithm params
min_count = 5 # Minimum frequency count of words; the model ignores words that occur fewer than min_count times
size = 150 # The size of the dense vector to represent each token or word
window = 3 # The maximum distance between the target word and its neighboring word
sg = 0 # The training algorithm, either CBOW (0) or skip-gram (1)
epochs = 50 # Number of iterations (epochs) over the corpus
# Create Word2Vec model with CBOW algorithm approach
w2v_model = Word2Vec(all_words, min_count=min_count, compute_loss=True, vector_size=size, window=window, sg=sg, epochs=epochs)
# Getting the training loss value
training_loss = w2v_model.get_latest_training_loss()
print(training_loss)
1881113.5
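For comparison, the Skip-Gram architecture is selected simply by setting sg=1 (see the algorithm params above). The sketch below reuses the same hyperparameters and is illustrative only, since training time and the learned similarities would differ:
# Skip-Gram variant of the same model (sg=1); other hyperparameters unchanged
w2v_sg_model = Word2Vec(all_words, min_count=min_count, compute_loss=True, vector_size=size, window=window, sg=1, epochs=epochs)
print(w2v_sg_model.get_latest_training_loss())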
The vocabulary is the set of unique words in the document, restricted to those occurring at least min_count times.
# Show vocabulary size: unique words occurring at least min_count times
vocabulary = w2v_model.wv.index_to_key
len(vocabulary)
1780
# Show 'holmes' vector
w2v_model.wv['holmes']
array([ 0.39678112, -0.70309114, -0.40441296, 0.41272703, -0.39327854, 0.3943475 , 1.0194279 , 0.5290096 , -0.3295435 , 0.5167051 , -0.2100004 , 0.05697219, -0.6289016 , -0.5468892 , 0.07339276, 0.25482133, 0.35808587, -0.37766075, 0.30003628, 0.69528216, -0.5203084 , -0.56655407, 0.6395985 , 0.48054895, -0.14689611, -0.87779665, -0.33620927, -0.41077298, 0.44543254, -0.31105122, -0.6248907 , 0.4354348 , 0.00322383, -0.5232826 , 0.42328218, -0.354256 , 0.30290315, -0.6596066 , 0.21340986, -0.19804758, 0.23923375, 0.3380928 , 0.14989339, -0.5060465 , 0.1368444 , 0.0837866 , 0.09099548, -0.09379155, -0.49251 , 0.47424227, -0.31569317, 0.37731454, -0.4209911 , -0.04227128, 1.1047065 , 0.67135966, -0.01516947, 0.12846494, -0.30910513, 0.21949199, -0.24549896, -0.20272876, 0.48747998, -0.05098846, 0.51575536, -0.4784055 , 0.17526303, 0.5846486 , -1.0175432 , -0.6835881 , -0.49530125, -0.00624014, -0.16789956, -0.5730044 , 0.61781263, -0.08208855, 0.2954704 , 0.34314203, -0.6385965 , 0.48848748, 0.3607169 , -0.70421195, -0.61902785, 0.2587055 , -0.43062556, 0.13585067, 0.15164214, -0.01356333, 0.38123044, -0.08722676, -0.33507347, -0.04237956, -0.05416336, 0.47078547, 0.33078003, 0.07455686, -0.12697284, -0.34902504, -0.6440862 , 0.1371734 , 0.33433244, -0.3309382 , -0.34492585, -0.525152 , -0.5974499 , -0.35576165, 0.10244957, -0.3423658 , -0.5629105 , -0.42280748, 0.15419073, -0.21639544, 0.354176 , 0.14280857, -0.5938123 , 0.66370696, 0.3069274 , 0.33340967, -0.45068938, 0.19979353, -0.04969786, 0.27417824, 0.21661213, 0.01481856, -0.11360772, 0.6402411 , 0.8784679 , 0.19794293, -0.41630414, 0.13313097, 0.4308818 , -0.4056247 , -0.804949 , -0.05049953, 0.09105106, -0.07421248, -0.40570074, -0.30800608, 0.68268174, 0.14379938, -0.4121782 , 0.73899627, 0.6005977 , -0.1266218 , 0.85023427, 0.5550141 , 0.02638014, 0.11194157, 0.891131 , -0.67984074], dtype=float32)
vectors = []
for vector in w2v_model.wv.vectors:
    vectors.append(list(vector))
# Save word embeddings
with open("../data/embeddings/vectors.tsv", "w") as f_output:
for row in vectors:
line = '\t'.join(map(str, row))
f_output.write(line + "\n")
# Save metadata
with open("../data/embeddings/metadata.tsv", "w") as f_output:
f_output.write("word\tlen\n")
for word in vocabulary:
f_output.write(word + "\t" + str(len(word)) + "\n")
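As a quick sanity check, the exported files can be read back with pandas; this is a minimal sketch assuming the same paths and tab separator used above (vectors.tsv was written without a header row):
# Reload the exported embeddings and metadata to verify the files
vectors_df = pd.read_csv("../data/embeddings/vectors.tsv", sep="\t", header=None)
metadata_df = pd.read_csv("../data/embeddings/metadata.tsv", sep="\t")
print(vectors_df.shape)   # expected: (vocabulary size, vector size)
print(metadata_df.head())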
The trained model can now be queried for the words that are most similar to a given word in terms of meaning and context.
# Finding Positive Similar Words
w2v_model.wv.most_similar(positive=['holmes'], topn=10)
[('adventures', 0.8516415357589722), ('glancing', 0.7797467112541199), ('laughed', 0.7691915035247803), ('laughing', 0.7643370032310486), ('staggered', 0.7595707178115845), ('smiling', 0.7572908401489258), ('visitor', 0.7556211352348328), ('silent', 0.7444738745689392), ('gutenberg', 0.7365286946296692), ('cases', 0.7255791425704956)]
# Finding Negative Similar Words
w2v_model.wv.most_similar(negative=['holmes'], topn=10)
[('carried', 0.177780881524086), ('went', 0.17299138009548187), ('strong', 0.16780786216259003), ('taken', 0.10628341883420944), ('cut', 0.10017789155244827), ('save', 0.09637609124183655), ('place', 0.08977862447500229), ('hair', 0.08958049863576889), ('brought', 0.07890144735574722), ('son', 0.0758872702717781)]
# Calculate the similarity between 2 words
w2v_model.wv.similarity(w1='holmes', w2='lestrade')
0.70907706
# Calculate similarity: sim(w1, w2) = sim(w2, w1)
w2v_model.wv.similarity(w1='lestrade', w2='holmes')
0.70907706
# Show word that doesn't belong to the list
w2v_model.wv.doesnt_match(['holmes', 'watson', 'mycroft'])
'holmes'
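The same most_similar API also accepts positive and negative words together, which allows analogy-style queries. On a corpus of a single novel the results are noisy, so the sketch below is illustrative only:
# Analogy-style query: words close to 'holmes' and 'watson' but far from 'lestrade'
w2v_model.wv.most_similar(positive=['holmes', 'watson'], negative=['lestrade'], topn=5)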
# Set the target word and number of neighbors
target_word = 'sherlock'
top_n = 25
# Calculate the most and least similar words
most_similar = w2v_model.wv.most_similar(positive = [target_word], topn = top_n)
less_similar = w2v_model.wv.most_similar(negative = [target_word], topn = top_n)
# Save them
neighbors = [(target_word, 1, 'current')]
neighbors += [(*row, 'most') for row in most_similar]
neighbors += [(*row, 'less') for row in less_similar]
# Get the neighbors' vectors
neigh_word = [row[0] for row in neighbors]
X = w2v_model.wv[neigh_word]
len(X)
51
# Perform PCA with 3 components
pca = PCA(n_components=3)
pca_data = pca.fit_transform(X)
# The explained variance of each principal component
print(list(pca.explained_variance_ratio_))
print(sum(pca.explained_variance_ratio_))
[0.42086148, 0.1456058, 0.07055229]
0.6370195746421814
# Create and show principal components DataFrame
pca_df = pd.DataFrame(data=pca_data, columns=["PC1", "PC2", "PC3"])
pca_df['Name'] = neigh_word
pca_df.head(10)
 | PC1 | PC2 | PC3 | Name |
---|---|---|---|---|
0 | 5.340423 | -0.646308 | -0.963914 | sherlock |
1 | 2.565961 | 0.041457 | 0.005309 | remarked |
2 | 1.090095 | -0.276063 | -0.419835 | observed |
3 | 0.712601 | -0.024503 | 0.282124 | murmured |
4 | 0.682104 | -0.265439 | -0.061924 | smiled |
5 | 1.067823 | -0.348628 | -0.098575 | heartily |
6 | 0.848729 | -0.340285 | 0.058723 | chuckled |
7 | 2.484412 | -0.281194 | -0.326231 | lestrade |
8 | 1.115612 | 0.112329 | 0.243798 | rising |
9 | 2.190557 | -1.440016 | -1.332575 | rose |
# Create a scatter plot of the projection
fig, ax = plt.subplots(figsize = (14, 14))
gap = 0.01
colors = dict()
colors['current'] = 'royalblue'
colors['most'] = 'forestgreen'
colors['less'] = 'orange'
# Add points one by one with a loop
for i, word in enumerate(neigh_word):
    node_col = colors[neighbors[i][2]]
    if word == target_word:
        node_size = 100
        text = word.upper()
    else:
        node_size = 50
        text = word + ': ' + str(round(neighbors[i][1], 3))
    plt.scatter(pca_data[i, 0], pca_data[i, 1], c=node_col, s=node_size)
    plt.annotate(text, xy=(pca_data[i, 0] + gap*10, pca_data[i, 1] - gap*3))
# Plot setup
ax.set_xlabel("PC 1", fontsize=12)
ax.set_ylabel("PC 2", fontsize=12)
ax.set_title("Most Similar Words to " + target_word, fontsize=20)
ax.legend(["Similar Words"])
ax.grid()
# Create Word2Vec model
w2v_model = Word2Vec(all_words, min_count=5, vector_size=150, window=3, sg=0, epochs=50)
vocabulary = w2v_model.wv.index_to_key
len(vocabulary)
1780
# Returns the dense similarity between all the words in the document
def get_dense_similarity(model, precision=3):
    vocabulary = list(model.wv.index_to_key)
    n_words = len(vocabulary)
    matrix = np.zeros((n_words, n_words))
    for i in range(n_words):
        for j in range(n_words):
            if i == j:
                matrix[i][j] = 1
            elif i > j:
                word_sim = round(model.wv.similarity(w1=vocabulary[i], w2=vocabulary[j]), precision)
                matrix[i][j] = word_sim
                matrix[j][i] = word_sim
    return matrix
# Compute the similarity between all the words in the document and time it
start_time = timeit.default_timer()
words_sim = get_dense_similarity(w2v_model, 2)
elapsed = timeit.default_timer() - start_time
elapsed
57.315067199999994
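The nested Python loops above take close to a minute for 1,780 words. As a sketch of a faster alternative (assuming the rows of w2v_model.wv.vectors follow the index_to_key order, which is how gensim stores them), the whole cosine-similarity matrix can be obtained by normalizing the embedding matrix once and taking a single matrix product:
# Vectorized cosine-similarity matrix: normalize rows, then one matrix product
vecs = w2v_model.wv.vectors
norm_vecs = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
words_sim_fast = np.round(norm_vecs @ norm_vecs.T, 2)
words_sim_fast.shape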
# Show dense similarity matrix as a dataframe
df_dense = pd.DataFrame.from_records(words_sim, columns=vocabulary)
print(df_dense.shape)
df_dense.iloc[:18, :18]
(1780, 1780)
 | said | holmes | man | little | think | room | know | shall | come | time | came | door | good | face | matter | yes | hand | house |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.00 | 0.38 | 0.23 | 0.24 | 0.64 | 0.07 | 0.58 | 0.38 | 0.48 | 0.17 | 0.12 | 0.09 | 0.41 | 0.12 | 0.21 | 0.36 | 0.31 | 0.08 |
1 | 0.38 | 1.00 | 0.17 | 0.20 | 0.35 | 0.16 | 0.13 | -0.05 | 0.34 | 0.27 | -0.02 | 0.22 | 0.10 | 0.16 | 0.16 | 0.21 | 0.39 | -0.07 |
2 | 0.23 | 0.17 | 1.00 | 0.31 | 0.16 | 0.16 | 0.12 | -0.10 | 0.18 | 0.24 | 0.07 | 0.25 | 0.07 | 0.57 | 0.26 | 0.17 | 0.14 | 0.15 |
3 | 0.24 | 0.20 | 0.31 | 1.00 | 0.26 | 0.02 | 0.33 | 0.21 | 0.34 | 0.31 | 0.08 | -0.02 | 0.38 | 0.41 | 0.52 | 0.24 | 0.27 | 0.09 |
4 | 0.64 | 0.35 | 0.16 | 0.26 | 1.00 | 0.11 | 0.64 | 0.50 | 0.61 | 0.46 | 0.27 | 0.13 | 0.45 | -0.06 | 0.31 | 0.54 | 0.03 | 0.26 |
5 | 0.07 | 0.16 | 0.16 | 0.02 | 0.11 | 1.00 | -0.09 | 0.04 | 0.13 | 0.25 | 0.34 | 0.67 | 0.05 | 0.13 | -0.13 | 0.08 | 0.20 | 0.60 |
6 | 0.58 | 0.13 | 0.12 | 0.33 | 0.64 | -0.09 | 1.00 | 0.63 | 0.53 | 0.22 | 0.33 | -0.01 | 0.42 | 0.12 | 0.58 | 0.46 | 0.16 | 0.17 |
7 | 0.38 | -0.05 | -0.10 | 0.21 | 0.50 | 0.04 | 0.63 | 1.00 | 0.53 | 0.31 | 0.39 | -0.06 | 0.52 | -0.12 | 0.50 | 0.52 | -0.00 | 0.27 |
8 | 0.48 | 0.34 | 0.18 | 0.34 | 0.61 | 0.13 | 0.53 | 0.53 | 1.00 | 0.52 | 0.36 | 0.11 | 0.59 | -0.03 | 0.45 | 0.42 | 0.13 | 0.17 |
9 | 0.17 | 0.27 | 0.24 | 0.31 | 0.46 | 0.25 | 0.22 | 0.31 | 0.52 | 1.00 | 0.30 | 0.17 | 0.37 | -0.03 | 0.31 | 0.45 | 0.07 | 0.34 |
10 | 0.12 | -0.02 | 0.07 | 0.08 | 0.27 | 0.34 | 0.33 | 0.39 | 0.36 | 0.30 | 1.00 | 0.20 | 0.41 | -0.03 | 0.12 | 0.30 | 0.09 | 0.52 |
11 | 0.09 | 0.22 | 0.25 | -0.02 | 0.13 | 0.67 | -0.01 | -0.06 | 0.11 | 0.17 | 0.20 | 1.00 | -0.08 | 0.26 | -0.11 | 0.00 | 0.19 | 0.36 |
12 | 0.41 | 0.10 | 0.07 | 0.38 | 0.45 | 0.05 | 0.42 | 0.52 | 0.59 | 0.37 | 0.41 | -0.08 | 1.00 | 0.04 | 0.32 | 0.47 | 0.04 | 0.25 |
13 | 0.12 | 0.16 | 0.57 | 0.41 | -0.06 | 0.13 | 0.12 | -0.12 | -0.03 | -0.03 | -0.03 | 0.26 | 0.04 | 1.00 | 0.23 | 0.01 | 0.45 | 0.03 |
14 | 0.21 | 0.16 | 0.26 | 0.52 | 0.31 | -0.13 | 0.58 | 0.50 | 0.45 | 0.31 | 0.12 | -0.11 | 0.32 | 0.23 | 1.00 | 0.50 | 0.06 | -0.01 |
15 | 0.36 | 0.21 | 0.17 | 0.24 | 0.54 | 0.08 | 0.46 | 0.52 | 0.42 | 0.45 | 0.30 | 0.00 | 0.47 | 0.01 | 0.50 | 1.00 | 0.02 | 0.41 |
16 | 0.31 | 0.39 | 0.14 | 0.27 | 0.03 | 0.20 | 0.16 | -0.00 | 0.13 | 0.07 | 0.09 | 0.19 | 0.04 | 0.45 | 0.06 | 0.02 | 1.00 | 0.18 |
17 | 0.08 | -0.07 | 0.15 | 0.09 | 0.26 | 0.60 | 0.17 | 0.27 | 0.17 | 0.34 | 0.52 | 0.36 | 0.25 | 0.03 | -0.01 | 0.41 | 0.18 | 1.00 |
# Plot dense similarity matrix
fig, ax = plt.subplots(figsize = (14, 14))
sns.heatmap(words_sim, ax = ax)
ax.set_title("Dense Similarity Matrix", fontsize=16)
ax.set_xlabel("vocabulary", fontsize=12)
ax.set_ylabel("vocabulary", fontsize=12)
plt.show()
# Exporting dense word similarity matrix
file_path = "../data/network/dense_similarity.csv"
df_dense.to_csv(file_path, index = False, sep = ',')
# Returns the sparse similarity between all the words in the document
def get_sparse_similarity(model, precision=3, top_n=10):
    matrix = []
    vocabulary = list(model.wv.index_to_key)
    n_words = len(vocabulary)
    # Calculate sparse similarity, keeping only the top_n neighbors of each word
    for word in vocabulary:
        row_sim = np.zeros(n_words)
        best_sim = model.wv.most_similar(positive=[word], topn=top_n)
        for neighbor in best_sim:
            nei_name = neighbor[0]
            nei_ix = vocabulary.index(nei_name)
            nei_sim = round(neighbor[1], precision)
            row_sim[nei_ix] = nei_sim
        matrix.append(row_sim)
    return matrix, vocabulary
# Compute the similarity between the nearest words and time it
start_time = timeit.default_timer()
words_sim, vocabulary = get_sparse_similarity(w2v_model, 2, 50)
elapsed = timeit.default_timer() - start_time
elapsed
3.117964999999998
# Show sparse similarity matrix as a dataframe
df_sparse = pd.DataFrame.from_records(words_sim, columns=vocabulary)
print(df_sparse.shape)
df_sparse.iloc[:18, :18]
(1780, 1780)
 | said | holmes | man | little | think | room | know | shall | come | time | came | door | good | face | matter | yes | hand | house |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.57 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
5 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
6 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
7 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
8 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
9 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
10 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.67 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.59 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
13 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
14 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
15 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
16 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
17 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.60 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
# Plot sparse similarity matrix
fig, ax = plt.subplots(figsize = (14, 14))
sns.heatmap(words_sim, ax = ax)
ax.set_title("Sparse Similarity Matrix", fontsize=16)
ax.set_xlabel("vocabulary", fontsize=12)
ax.set_ylabel("vocabulary", fontsize=12)
plt.show()
# Exporting sparse word similarity matrix
file_path = "../data/network/sparse_similarity.csv"
df_sparse.to_csv(file_path, index=False, sep=',')
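Since the sparse matrix is exported for network analysis, it can also be convenient to derive an explicit edge list (source word, target word, similarity), which many graph tools ingest directly. A sketch, with a hypothetical output file name:
# Build an edge list (word, neighbor, similarity) from the sparse similarity matrix
edges = []
for i, word in enumerate(vocabulary):
    for j in np.nonzero(words_sim[i])[0]:
        edges.append((word, vocabulary[j], words_sim[i][j]))
df_edges = pd.DataFrame(edges, columns=['source', 'target', 'weight'])
df_edges.to_csv("../data/network/sparse_edges.csv", index=False)  # hypothetical file name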