# Load Python libraries
import re
import codecs
import numpy as np
import pandas as pd
from collections import Counter
# Import plot libraries
import matplotlib.pyplot as plt
# Util function - Read a plain text file
def read_file_lines(file_path):
    lines = []
    with codecs.open(file_path, encoding='utf-8') as f:
        for line in f:
            lines.append(line)
    return lines
# Util function - Apply data-quality rules to a raw dictionary entry
def apply_dq_word(word):
    new_word = word.replace('\n', '')
    # Keep only the first token (the headword) of comma-separated entries
    if ',' in new_word:
        new_word = new_word.split(',')[0]
    # Remove extra whitespace
    new_word = new_word.strip()
    # Strip trailing characters while any digit remains (e.g. sense numbers)
    while re.search(r"\d", new_word):
        new_word = new_word[:-1]
    return new_word
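# Quick sanity check of the cleaning rules (the raw lines below are
# hypothetical; the actual dictionary files may format entries differently)
apply_dq_word('abanico, m. Instrumento ...\n')  # -> 'abanico' (keeps the headword)
apply_dq_word('acepción2\n')                    # -> 'acepción' (drops the trailing sense number)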
# Util function - Plot column chart
def plot_col_chart(df, figsize, x_var, y_var, title, color='#1f77b4', legend=None, x_label=None):
    fig, ax = plt.subplots()
    df.plot.bar(ax=ax, x=x_var, y=y_var, color=color, figsize=figsize)
    if legend:
        ax.legend(legend)
    else:
        ax.get_legend().remove()
    if x_label:
        x = np.arange(len(x_label))
        plt.xticks(x, x_label, rotation=0)
    else:
        plt.xticks(rotation=0)
    plt.title(title, fontsize=16)
    plt.xlabel(x_var.capitalize())
    plt.ylabel(y_var.capitalize())
    plt.show()
# Util function - Plot horizontal bar chart
def plot_bar_chart(df, figsize, x_var, y_var, title, color='#1f77b4', legend=None):
    fig, ax = plt.subplots()
    df.plot.barh(ax=ax, x=x_var, y=y_var, color=color, figsize=figsize)
    if legend:
        ax.legend(legend)
    else:
        ax.get_legend().remove()
    plt.title(title, fontsize=16)
    plt.xlabel(y_var.capitalize())
    plt.ylabel(x_var.capitalize())
    plt.show()
# One dictionary file per initial letter of the word: a-z plus ñ
letter_list = list(map(chr, range(97, 123)))
letter_list.append('ñ')
len(letter_list)
27
# Read words by letter [a-z, ñ]
word_dict = Counter()
file_path = '../data/dics/'
# Read data (this only needs to run once)
for letter in letter_list:
    filename = file_path + letter + '.txt'
    word_list = read_file_lines(filename)
    for word in word_list:
        word = apply_dq_word(word)
        word_dict[word] += 1
# Show results
n_words = len(word_dict)
print('Total of different words: %d' % n_words)
Total of different words: 88190
# Counting words with an acute accent
aa_freq = Counter()
regexp = re.compile('[áéíóúÁÉÍÓÚ]')
for word in word_dict.keys():
    match = regexp.search(word.lower())
    if match:
        vowel = match.group(0)
        aa_freq[vowel] += 1
# Show results
count = sum(aa_freq.values())
perc_words = 100.0 * count / n_words
print('Total words with acute accent: %d (%0.2f %s)' % (count, perc_words, '%'))
Total words with acute accent: 16334 (18.52 %)
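# Note that re.search reports only the first accented vowel found in each
# word, so every word contributes at most one count:
regexp.search('análisis').group(0)  # -> 'á'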
# Cooking dataframe
df = pd.DataFrame.from_records(aa_freq.most_common(), columns = ['vowel', 'frequency']).sort_values(by=['vowel'])
df['perc'] = round(100.0 * df['frequency'] / count, 2)
df
|   | vowel | frequency | perc |
|---|---|---|---|
| 2 | á | 2514 | 15.39 |
| 3 | é | 1748 | 10.70 |
| 1 | í | 4915 | 30.09 |
| 0 | ó | 6599 | 40.40 |
| 4 | ú | 558 | 3.42 |
# Plotting data
figsize = (12, 6)
x_var = 'vowel'
y_var = 'perc'
title = 'Frequency of accented vowels'
plot_col_chart(df, figsize, x_var, y_var, title)
# Processing - distribution of word lengths
word_size = Counter()
for word in word_dict.keys():
    size = len(word)
    word_size[size] += 1
# Cooking dataframe
df = pd.DataFrame.from_records(word_size.most_common(), columns = ['size', 'frequency']).sort_values(by=['size'])
df['perc'] = 100.0 * df['frequency'] / n_words
df
|   | size | frequency | perc |
|---|---|---|---|
| 18 | 1 | 27 | 0.030616 |
| 16 | 2 | 105 | 0.119061 |
| 13 | 3 | 466 | 0.528405 |
| 9 | 4 | 2102 | 2.383490 |
| 6 | 5 | 5323 | 6.035832 |
| 4 | 6 | 9097 | 10.315228 |
| 2 | 7 | 12569 | 14.252183 |
| 0 | 8 | 14129 | 16.021091 |
| 1 | 9 | 13486 | 15.291983 |
| 3 | 10 | 10983 | 12.453793 |
| 5 | 11 | 7777 | 8.818460 |
| 7 | 12 | 4990 | 5.658238 |
| 8 | 13 | 3165 | 3.588842 |
| 10 | 14 | 1865 | 2.114752 |
| 11 | 15 | 1082 | 1.226896 |
| 12 | 16 | 551 | 0.624787 |
| 14 | 17 | 254 | 0.288015 |
| 15 | 18 | 136 | 0.154212 |
| 17 | 19 | 46 | 0.052160 |
| 19 | 20 | 22 | 0.024946 |
| 20 | 21 | 8 | 0.009071 |
| 21 | 22 | 4 | 0.004536 |
| 24 | 23 | 1 | 0.001134 |
| 23 | 24 | 1 | 0.001134 |
| 22 | 26 | 1 | 0.001134 |
# Plotting data
figsize = (12, 6)
x_var = 'size'
y_var = 'frequency'
title = 'Frequency of words per size'
plot_col_chart(df, figsize, x_var, y_var, title)
# Processing - collect words of 21 or more characters
top_size = Counter()
threshold = 21
for word in word_dict.keys():
    size = len(word)
    if size >= threshold:
        top_size[word] = size
# Top 15 longest words
top_size.most_common()
[('contencioso-administrativo', 26), ('contradictio in terminis', 24), ('electroencefalografista', 23), ('anticonstitucionalidad', 22), ('electroencefalográfico', 22), ('esternocleidomastoideo', 22), ('in partibus infidelium', 22), ('antinorteamericanismo', 21), ('constitucionalización', 21), ('contrarrevolucionario', 21), ('corresponsabilización', 21), ('electroencefalografía', 21), ('interdisciplinariedad', 21), ('otorrinolaringológico', 21), ('preterintencionalidad', 21)]
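# Counter.most_common() sorts by the stored value, so mapping each word to
# its length returns the words ordered from longest to shortest:
Counter({'ab': 2, 'abc': 3}).most_common()  # -> [('abc', 3), ('ab', 2)]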
# Processing - letter frequency across all words
letter_freq = Counter()
for word in word_dict.keys():
    word = word.lower()
    for letter in word:
        letter_freq[letter] += 1
n_total = sum(letter_freq.values())
n_total
769596
# Cooking dataframe
df = pd.DataFrame.from_records(letter_freq.most_common(), columns = ['letter', 'frequency']).sort_values(by=['letter'])
df['perc'] = 100.0 * df['frequency'] / n_total
df
|   | letter | frequency | perc |
|---|---|---|---|
| 31 | (space) | 316 | 0.041061 |
| 29 | - | 602 | 0.078223 |
| 0 | a | 107146 | 13.922370 |
| 14 | b | 14301 | 1.858248 |
| 6 | c | 42924 | 5.577472 |
| 10 | d | 28937 | 3.760025 |
| 1 | e | 72597 | 9.433131 |
| 16 | f | 8321 | 1.081217 |
| 15 | g | 13338 | 1.733117 |
| 17 | h | 8207 | 1.066404 |
| 4 | i | 61657 | 8.011606 |
| 22 | j | 4849 | 0.630071 |
| 32 | k | 272 | 0.035343 |
| 8 | l | 37564 | 4.881003 |
| 11 | m | 24639 | 3.201550 |
| 5 | n | 47804 | 6.211571 |
| 3 | o | 69956 | 9.089964 |
| 13 | p | 19032 | 2.472986 |
| 23 | q | 3201 | 0.415933 |
| 2 | r | 70502 | 9.160910 |
| 9 | s | 32322 | 4.199866 |
| 7 | t | 41887 | 5.442726 |
| 12 | u | 24321 | 3.160230 |
| 18 | v | 7141 | 0.927889 |
| 34 | w | 66 | 0.008576 |
| 28 | x | 1350 | 0.175417 |
| 27 | y | 1430 | 0.185812 |
| 20 | z | 6319 | 0.821080 |
| 24 | á | 2514 | 0.326665 |
| 36 | è | 1 | 0.000130 |
| 26 | é | 1748 | 0.227132 |
| 21 | í | 4916 | 0.638777 |
| 35 | î | 2 | 0.000260 |
| 25 | ñ | 2028 | 0.263515 |
| 19 | ó | 6601 | 0.857723 |
| 30 | ú | 558 | 0.072506 |
| 33 | ü | 227 | 0.029496 |
# Plotting data
figsize = (12, 6)
x_var = 'letter'
y_var = 'frequency'
title = 'Letter frequency in DSL words'
plot_col_chart(df, figsize, x_var, y_var, title)
# Plotting sorted data
figsize = (12, 6)
x_var = 'letter'
y_var = 'perc'
title = 'Letter frequency in DSL words (Sorted)'
color = '#2ca02c'
plot_col_chart(df.sort_values(by='perc', ascending=False), figsize, x_var, y_var, title, color)
vowel_list = 'aeiouáéíóúèîü'
vowel_total = 0
consonant_total = 0
for ix, row in df.iterrows():
    letter = str(row['letter'])
    freq = int(row['frequency'])
    if letter in vowel_list:
        vowel_total += freq
    elif letter.isalpha():
        consonant_total += freq
letter_total = vowel_total + consonant_total
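# The isalpha() check keeps non-letter characters (the space and hyphen
# rows in the table above) out of both totals:
'-'.isalpha()  # -> False
'ñ'.isalpha()  # -> True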
# Initialize list of lists
data = [['vowels', vowel_total, (100.0 * vowel_total / letter_total)],
['consonant', consonant_total, (100.0 * consonant_total / letter_total)]]
# Create the pandas DataFrame
df = pd.DataFrame(data, columns = ['type', 'frequency', 'perc'])
df
|   | type | frequency | perc |
|---|---|---|---|
| 0 | vowels | 352244 | 45.82465 |
| 1 | consonant | 416434 | 54.17535 |
# Plotting data
figsize = (6, 6)
x_var = 'type'
y_var = 'perc'
title = 'Vowel and consonant ratio'
plot_col_chart(df, figsize, x_var, y_var, title)
# Map accented vowels to their plain forms
norm_dict = {'á':'a', 'é':'e', 'í':'i', 'ó':'o', 'ú':'u'}
# Processing - frequency of words by their first letter
first_letter_freq = Counter()
for word in word_dict.keys():
    first_letter = word[0].lower()
    if first_letter.isalpha():
        if first_letter in norm_dict.keys():
            first_letter = norm_dict[first_letter]
        first_letter_freq[first_letter] += 1
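# Accented initials are folded into their base letter, so words like
# 'árbol' are counted under 'a':
first_letter_freq['a']  # -> 10747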
# Cooking dataframe
df = pd.DataFrame.from_records(first_letter_freq.most_common(), columns = ['letter', 'frequency']).sort_values(by=['letter'])
df['perc'] = 100.0 * df['frequency'] / n_words
df
|   | letter | frequency | perc |
|---|---|---|---|
| 1 | a | 10747 | 12.186189 |
| 9 | b | 3688 | 4.181880 |
| 0 | c | 12194 | 13.826965 |
| 4 | d | 5776 | 6.549495 |
| 3 | e | 7124 | 8.078013 |
| 11 | f | 2881 | 3.266810 |
| 12 | g | 2873 | 3.257739 |
| 14 | h | 2172 | 2.462864 |
| 10 | i | 3211 | 3.641002 |
| 18 | j | 950 | 1.077220 |
| 23 | k | 116 | 0.131534 |
| 13 | l | 2419 | 2.742941 |
| 5 | m | 5357 | 6.074385 |
| 17 | n | 1329 | 1.506974 |
| 16 | o | 1420 | 1.610160 |
| 2 | p | 7676 | 8.703935 |
| 20 | q | 509 | 0.577163 |
| 8 | r | 4469 | 5.067468 |
| 7 | s | 4593 | 5.208073 |
| 6 | t | 4779 | 5.418982 |
| 21 | u | 482 | 0.546547 |
| 15 | v | 1960 | 2.222474 |
| 26 | w | 28 | 0.031750 |
| 25 | x | 48 | 0.054428 |
| 22 | y | 259 | 0.293684 |
| 19 | z | 746 | 0.845901 |
| 24 | ñ | 82 | 0.092981 |
# Plotting data
figsize = (12, 6)
x_var = 'letter'
y_var = 'frequency'
title = 'Frequency of words per letter of the alphabet'
plot_col_chart(df, figsize, x_var, y_var, title)
# Plotting sorted data
figsize = (12, 6)
x_var = 'letter'
y_var = 'perc'
title = 'Frequency of words per letter of the alphabet (Sorted)'
color = '#2ca02c'
plot_col_chart(df.sort_values(by='perc', ascending=False), figsize, x_var, y_var, title, color)
# Processing - extract character bi-grams and tri-grams
top_ngrams = 25
bi_grams = Counter()
tri_grams = Counter()
for word in word_dict.keys():
    word = word.lower()
    n = len(word)
    size = 2
    for i in range(size, n+1):
        n_grams = word[i-size:i]
        bi_grams[n_grams] += 1
    size = 3
    for i in range(size, n+1):
        n_grams = word[i-size:i]
        tri_grams[n_grams] += 1
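# The sliding window word[i-size:i] yields every run of consecutive
# characters; e.g. the bi-grams of 'gato':
['gato'[i-2:i] for i in range(2, len('gato')+1)]  # -> ['ga', 'at', 'to']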
# Cooking dataframe
df_bi = pd.DataFrame.from_records(bi_grams.most_common(top_ngrams), columns=['bi-grams', 'frequency'])
df_tri = pd.DataFrame.from_records(tri_grams.most_common(top_ngrams), columns=['tri-grams', 'frequency'])
# Plotting sorted data
figsize = (8, 10)
x_var = 'bi-grams'
y_var = 'frequency'
title = str(top_ngrams) + ' most frequent bi-grams in Spanish'
plot_bar_chart(df_bi.sort_values(by=['frequency']), figsize, x_var, y_var, title)
# Plotting sorted data
figsize = (8, 10)
x_var = 'tri-grams'
y_var = 'frequency'
title = str(top_ngrams) + ' most frequent tri-grams in Spanish'
plot_bar_chart(df_tri.sort_values(by=['frequency']), figsize, x_var, y_var, title)