In computer science and information theory, a Huffman Code is a particular type of optimal prefix code that is commonly used for lossless data compression. The output from Huffman's algorithm can be viewed as a variable-length code table for encoding a source symbol (such as a character in a file). The algorithm derives this table from the estimated probability or frequency of occurrence (weight) for each possible value of the source symbol. [1]
# Load Python libraries
import numpy as np
import timeit
import pandas as pd
import statistics as stats
from collections import Counter
# Load Plot libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib import gridspec
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Huffman coding uses a specific method for choosing the representation for each symbol, resulting in a prefix code (sometimes called "prefix-free codes", that is, the bit string representing some particular symbol is never a prefix of the bit string representing any other symbol). Huffman coding is such a widespread method for creating prefix codes that the term "Huffman code" is widely used as a synonym for "prefix code" even when such a code is not produced by Huffman's algorithm. [1]
# Class HuffmanCode from scratch
# Class HuffmanCode from scratch
class HuffmanCode:
    """Build an optimal prefix (Huffman) code from a symbol->weight mapping.

    Symbols must be strings: the algorithm merges the two least probable
    symbols by concatenating their keys (s1 + s2) into a temporary symbol.
    """

    def get_code(self, p_symbols):
        """Return a Huffman code for an ensemble with distribution p_symbols.

        Parameters
        ----------
        p_symbols : dict mapping symbol (str) to weight/probability.
                    Weights need not sum to 1; they are normalized internally.

        Returns
        -------
        dict mapping each symbol to its binary code string.

        The caller's dict is left unmodified (the original implementation
        normalized it in place as a side effect — fixed here).
        """
        n = len(p_symbols)
        if n == 0:
            return dict()
        elif n == 1:
            # Degenerate alphabet: a single symbol gets the 1-bit code '1'.
            return dict(zip(p_symbols.keys(), ['1']))
        # Normalize a copy so the input dict is not mutated.
        p = self._normalize_weights(dict(p_symbols))
        return self._get_code(p)

    def _get_code(self, p):
        """(Private) Recursively calculate the Huffman code for distribution p."""
        # Base case of only two symbols: assign '0'/'1' arbitrarily
        # (insertion order of the dict decides which gets which).
        if len(p) == 2:
            return dict(zip(p.keys(), ['0', '1']))
        # Create a new distribution by merging the lowest-probability pair.
        p_prime = p.copy()
        s1, s2 = self._get_lowest_prob_pair(p)
        p1, p2 = p_prime.pop(s1), p_prime.pop(s2)
        p_prime[s1 + s2] = p1 + p2
        # Recurse on the smaller distribution, then split the merged
        # symbol's code back into its two constituents.
        code = self._get_code(p_prime)
        s1s2 = code.pop(s1 + s2)
        code[s1], code[s2] = s1s2 + '0', s1s2 + '1'
        return code

    def _get_lowest_prob_pair(self, p):
        """(Private) Return the two symbols of p with the lowest probabilities.

        Returns (None, None) if p has fewer than 2 symbols.
        """
        if len(p) >= 2:
            sorted_p = sorted(p.items(), key=lambda x: x[1])
            return sorted_p[0][0], sorted_p[1][0]
        return (None, None)

    def _normalize_weights(self, p_symbols, t_weight=1.0):
        """(Private) Scale the weights of p_symbols so they sum to t_weight.

        Mutates and returns p_symbols; get_code passes in a copy, so the
        caller-supplied dict is never touched.
        """
        n = sum(p_symbols.values())
        if n != t_weight:
            for s in p_symbols:
                p_symbols[s] = p_symbols[s] / n
        return p_symbols
Input: $$ A = \{ a_1, a_2, a_3, ..., a_n \} \tag{1} $$
Weights: $$ W = \{ w_1, w_2, w_3, ..., w_n \} \tag{2} $$
Output: $$ C(A, W) = \{ c_1, c_2, c_3, ..., c_n \} \tag{3} $$
Target: $$ L(C) = \sum_{i=1}^n{w_i \cdot \mathrm{length}(c_i)} \tag{4} $$
# Instantiate the encoder used throughout the rest of the notebook
hc = HuffmanCode()

# Degenerate alphabet: a single symbol
sample_1 = dict(a=1.0)
hc.get_code(sample_1)

# Three symbols whose weights sum to less than 1 (0.95)
sample_2 = dict(a=0.6, b=0.25, c=0.1)
hc.get_code(sample_2)

# Five symbols whose weights sum to exactly 1.0
sample_3 = dict(a=0.10, b=0.15, c=0.30, d=0.16, e=0.29)
hc.get_code(sample_3)
This example uses a PNG image. Experimentally, the distribution of the bytes of a compressed image such as PNG tends to be close to uniform.
# Read file in low level (Bytes)
def get_file_bytes(file_path):
    """Return the entire contents of file_path as a bytearray.

    Raises OSError if the file cannot be opened or read.
    """
    # The `with` block guarantees the file is closed; the original
    # trailing `return None` after this return was unreachable dead code.
    with open(file_path, 'rb') as f:
        return bytearray(f.read())
# Load the target image as raw bytes
# NOTE(review): path is relative to the notebook's location — confirm the file exists
file_path = "../data/img/example-3.png"
image_byte_list = get_file_bytes(file_path)
# Calculate code frequency
def get_term_freq(term_list):
    """Count occurrences of every term in term_list.

    Integer terms are re-keyed by their string form (so byte value 97
    becomes '97'); terms of any other type pass through unchanged.
    Returns a dict mapping term -> count.
    """
    return {
        (str(term) if isinstance(term, int) else term): count
        for term, count in Counter(term_list).items()
    }
# The image bytes form an alphabet of up to 256 distinct symbols
term_freq = get_term_freq(image_byte_list)
len(term_freq)

# Rescale the raw counts into relative frequencies summing to 1
n = sum(term_freq.values())
term_freq = {term: count / n for term, count in term_freq.items()}
sum(term_freq.values())

# Derive the Huffman code table for the byte alphabet
hf_code = hc.get_code(term_freq)
# Creates a huffman dataframe with the codes and frequency of each of them
def create_huffman_df(code_list, term_freq):
codes = pd.DataFrame([code_list, term_freq]).T
codes.reset_index(level=0, inplace=True)
codes.columns = ["byte", "code", "frequency"]
codes['symbol'] = [chr(int(b)) for b in codes["byte"]]
codes = codes[["byte", "symbol", "code", "frequency"]]
return codes
# Build the final code table and preview it
codes = create_huffman_df(hf_code, term_freq)
# Display the first 20 rows (notebook cell output)
codes.head(20)