Natural language processing (NLP) is a discipline where computer science, artificial intelligence and linguistics intersect, with the objective that machines can read and understand our language for decision making [1].
# Load Python libraries
import io
import os
import pandas as pd
from collections import Counter
# Load Plot libraries
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Below is the proposed procedure, which applies an NLP approach to text preprocessing:
# Set local file encoding
file_encoding = 'utf8'
# Util function to read a plain text file
def read_text_file(file_path):
    text = ""
    if os.path.isfile(file_path):
        print('Read the following file:', file_path)
        with io.open(file_path, 'r', encoding = file_encoding) as f:
            text = f.read()
    return text
# Util function to save a plain text file
def save_text_file(file_path, text):
    try:
        with open(file_path, 'w', encoding = file_encoding) as f:
            f.write(text)
    except Exception:
        print("Error saving text file:", file_path)
        return False
    return True
# Loading target text book
file_path = "../data/text/book1-en.txt"
file_text1 = read_text_file(file_path)
# Weight of the original file
file_size = os.path.getsize(file_path)
print('Original file size:', round(file_size / 1024, 2), 'KB')
# Calculate occurrences of stop words
stop_words = ['\n', ', ', '. ', '; ', '! ', '? ', ',* ', '.* ']
sw_occur = dict()
for sword in stop_words:
    sw_occur[sword] = file_text1.count(sword)
sw_occur
# Plot occurrences of stop words
fig = plt.figure(figsize = (14, 6))
plt.bar(range(len(sw_occur)), list(sw_occur.values()), alpha = 0.9, label = 'Number Stopword', edgecolor = 'black')
plt.xticks(range(len(sw_occur)), list(sw_occur.keys()))
plt.grid(True, alpha = 0.8)
plt.title('Histogram of the number of Stopwords')
plt.xlabel('Stopword', fontsize = 10)
plt.ylabel('Number', fontsize = 10)
plt.legend(loc = 'upper right')
plt.show()
# Cleaning the plain text
clean_text = file_text1
for sword in stop_words:
    clean_text = clean_text.replace(sword, ' ')
# Calculate base compression
print('Original plain text size:', len(file_text1))
print('Cleaned plain text size:', len(clean_text))
print('Base compression:', round((len(file_text1) - len(clean_text)) * 100 / len(file_text1), 2), '%')
Note: it was found experimentally that passing the plain text through a data-quality process adds approximately 2% more compression.
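As an illustration of what such a data-quality pass might look like (the `normalize_text` helper and its rules are assumptions for this sketch, not the process used to obtain the 2% figure), one could collapse runs of whitespace and drop blank lines before the stop-word cleaning:
# Hypothetical data-quality pass (assumption): collapse runs of spaces/tabs and drop blank lines
import re
def normalize_text(raw_text):
    lines = []
    for line in raw_text.split('\n'):
        line = re.sub(r'[ \t]+', ' ', line).strip()
        if line:
            lines.append(line)
    return '\n'.join(lines)
# Example usage (not applied to the pipeline above)
# normalized_text = normalize_text(file_text1)
# print('Size before/after normalization:', len(file_text1), '/', len(normalized_text))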
# Tokenize text in a words list
word_list = clean_text.split(' ')
print('Number of words:', len(word_list))
# Showing first 100 words
print(word_list[:100])
# Get the most common words in the document
n_words = Counter(word_list).most_common(1000)
df_words = pd.DataFrame.from_records(n_words, columns = ['word', 'quantity'])
df_words['length'] = df_words.word.str.len()
df_words['weight'] = df_words.quantity * df_words.length
df_words = df_words.sort_values(by=['weight'], ascending=False)
df_words.head(10)
Show the 50 most common words in the document.
# Plot the most common words in the document
fig = plt.figure(figsize = (18, 6))
sns.barplot(x = 'word', y = 'weight', data = df_words[0:50])
plt.title('The 50 Most Common Words in the Document')
plt.show()
# Plot Histogram of the number of Words by Sizes
hist_bins = len(df_words["length"].unique())
fig = plt.figure(figsize = (14, 6))
plt.hist(df_words.length, bins = hist_bins, alpha = 0.9, label = 'Word Size', edgecolor = 'black')
plt.grid(True, alpha = 0.8)
plt.title('Histogram of the number of Words by Sizes')
plt.xlabel('Size', fontsize = 10)
plt.ylabel('Number', fontsize = 10)
plt.legend(loc = 'upper right')
plt.show()
Once the words to be replaced have been identified, the unused symbols in the alphabet must be found, so they can be used as substitutes.
# Read the file at a low level (bytes)
def get_file_bytes(file_path):
    with open(file_path, 'rb') as f:
        return bytearray(f.read())
# Loading the target text book (as bytes)
file_byte_list = get_file_bytes(file_path)
# Calculate code frequency
term_freq = Counter(file_byte_list)
n = len(term_freq)
print('Unique symbols used:', n)
# Normalize term frequency
max_symbols = 256
total = sum(term_freq.values())
for ix in range(max_symbols):
    if ix in term_freq:
        term_freq[ix] = term_freq[ix] / total
    else:
        term_freq[ix] = 0
# Create dataframe: unused byte list
df_ubytes = pd.DataFrame.from_records(term_freq.most_common(max_symbols), columns = ['byte', 'frequency'])
df_ubytes['symbol'] = [chr(b) for b in df_ubytes.byte]
df_ubytes = df_ubytes[['byte', 'symbol', 'frequency']]
df_ubytes.head(10)
# Create pretty x axis labels
def get_x_labels():
    x_labels = []
    for ix in range(max_symbols):
        if ix % 5 == 0:
            x_labels.append(str(ix))
        else:
            x_labels.append('')
    return x_labels
# Probability of each symbol by default
p_x = 1 / max_symbols
# Plot the frequency of the bytes in the file
fig = plt.figure(figsize = (18, 6))
ax = sns.barplot(x = 'byte', y = 'frequency', data = df_ubytes.sort_values(by=['byte']), palette=("Blues_d"))
ax.set_xticklabels(labels = get_x_labels(), fontsize = 10, rotation = 50)
plt.axhline(y = p_x, color = "#8b0000", linestyle = "--")
plt.title('Bytes Frequency of the Original Text File')
plt.show()
# Get locked symbols
locked_symbols = []
two_bytes_symb = [chr(c) for c in range(128, max_symbols)]
for spe_sym in two_bytes_symb:
    if spe_sym in file_text1:
        locked_symbols.append(spe_sym)
locked_symbols
# Save unused symbols
unused_symbols = list(df_ubytes[df_ubytes['frequency'] == 0]["byte"])
unused_symbols = list(set(unused_symbols) - set([ord(ls) for ls in locked_symbols]))
len(unused_symbols)
Now, the heaviest words in the file can be replaced with the unused symbols.
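As a toy illustration of this substitution (the sample sentence, the word 'example' and the code point 1 are all made up for this sketch and are not taken from the book), mapping a frequent word to a single unused symbol and restoring it later is just a pair of replacements:
# Toy example (assumption): map the frequent word 'example' to the unused code point 1
toy_table = {'example': 1}
sample_text = 'this example repeats the word example twice'
toy_comp = sample_text
for word, code in toy_table.items():
    toy_comp = toy_comp.replace(word, chr(code))
toy_orig = toy_comp
for word, code in toy_table.items():
    toy_orig = toy_orig.replace(chr(code), word)
print('Lengths (original / compressed):', len(sample_text), '/', len(toy_comp))
print('Round trip OK:', toy_orig == sample_text)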
# Returns the semantic table with the new symbols to replace the heaviest words
def get_semantic_table(df_words, unused_symbols):
    new_symbols = dict()
    n_symb_replaced = min(150, len(unused_symbols))
    ix = 0
    for index, row in df_words.iterrows():
        if len(row['word']) > 1:
            key = row['word']
            value = int(unused_symbols[ix])
            new_symbols[key] = value
            ix += 1
            if ix == n_symb_replaced:
                break
    return new_symbols
# Show semantic table
semantic_table = get_semantic_table(df_words, unused_symbols)
print(semantic_table)
# Function that applies a semantic compression approach
def compress_text(curr_text, codes, forward = True):
    symbol_list = sorted([(len(k), k, v) for k, v in codes.items()], key = lambda x: x[0], reverse = True)
    new_text = curr_text
    for l, key, value in symbol_list:
        c_value = chr(value)
        if forward:
            new_text = new_text.replace(key, c_value)
        else:
            new_text = new_text.replace(c_value, key)
    return new_text
# Compressing the text of the file
file_text_comp = compress_text(file_text1, semantic_table)
file_path_comp = file_path.replace('.txt', '_comp.txt')
save_text_file(file_path_comp, file_text_comp)
# Weight of the compressed file
file_size = os.path.getsize(file_path_comp)
print('Compressed file size:', round(file_size / 1024, 2), 'KB')
# Original file
print('Number of characters:', len(file_text1))
# Compressed file
print('Number of characters:', len(file_text_comp))
# Semantic compression percentage
compress_rate = (len(file_text1) - len(file_text_comp)) / len(file_text1)
print('Compression Rate:', round(compress_rate * 100, 2), '%')
Click on the following link to download the compressed text file:
https://raw.githubusercontent.com/ansegura7/DataCompression/master/data/text/book1-en_comp.txt
# Loading the compressed text file (as bytes)
file_byte_list = get_file_bytes(file_path_comp)
# Calculate code frequency
term_freq = Counter(file_byte_list)
n = len(term_freq)
print('Unique symbols used:', n)
# Normalize term frequency
total = sum(term_freq.values())
for ix in range(max_symbols):
    if ix in term_freq:
        term_freq[ix] = term_freq[ix] / total
    else:
        term_freq[ix] = 0
# Create dataframe: unused byte list
df_ubytes = pd.DataFrame.from_records(term_freq.most_common(max_symbols), columns = ['byte', 'frequency'])
df_ubytes['symbol'] = [chr(b) for b in df_ubytes.byte]
df_ubytes = df_ubytes[['byte', 'symbol', 'frequency']]
df_ubytes.head(10)
# Plot the frequency of the bytes in the file
fig = plt.figure(figsize = (18, 6))
ax = sns.barplot(x = 'byte', y = 'frequency', data = df_ubytes.sort_values(by=['byte']), palette=("Greens_d"))
ax.set_xticklabels(labels = get_x_labels(), fontsize = 10, rotation = 50)
plt.axhline(y = p_x, color = "#8b0000", linestyle = "--")
plt.title('Bytes Frequency of the Compressed Text File')
plt.show()
Note: symbol 32 (in decimal) is the whitespace (space) character of the ASCII table.
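As a quick sanity check (the sorting call below is an addition; it reuses the `df_ubytes` frame built above), one can confirm that byte 32 is still the most frequent symbol in the compressed file:
# Most frequent byte in the compressed file (expected: 32, the space character)
top_byte = df_ubytes.sort_values(by = ['frequency'], ascending = False).iloc[0]
print('Most frequent byte:', int(top_byte['byte']), '->', repr(chr(int(top_byte['byte']))))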
# Loading the compressed text file
file_text_comp = read_text_file(file_path_comp)
# Function that applies a semantic decompression approach
def decompress_text(comp_text, codes):
    new_text = compress_text(comp_text, codes, False)
    return new_text
# Decompression
file_text2 = decompress_text(file_text_comp, semantic_table)
# Decompressed file
print('Number of characters:', len(file_text2))
Validation
# Comparing size
len(file_text1) - len(file_text2)
# Comparing content
file_text1 == file_text2
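As an additional check (the use of `hashlib` here is an addition, not part of the original notebook), comparing cryptographic digests of the two texts also confirms that the round trip is lossless:
# Compare SHA-256 digests of the original and the decompressed texts
import hashlib
hash1 = hashlib.sha256(file_text1.encode('utf-8')).hexdigest()
hash2 = hashlib.sha256(file_text2.encode('utf-8')).hexdigest()
print('Hashes match:', hash1 == hash2)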
[1] Wikipedia - Natural language processing.