commit 7cc823f37ad4a7e3fad7d87f743b7f2501e1708b
Author: Edgar Luque
Date:   Tue Oct 25 13:08:26 2022 +0200

    initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..869df07
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..eb8b112
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "huffman"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+bit-vec = "0.6.3"
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..936dbb1
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,313 @@
+use std::{
+    cell::RefCell,
+    collections::{BinaryHeap, HashMap},
+    rc::Rc,
+};
+
+use bit_vec::BitVec;
+
+// Created with help from:
+// - https://en.wikipedia.org/wiki/Huffman_coding
+// - https://aquarchitect.github.io/swift-algorithm-club/Huffman%20Coding/
+
+/// A node of the Huffman tree, stored by index inside a flat `Vec<Node>`.
+#[derive(Debug, Clone, Copy)]
+pub struct Node {
+    /// The byte this node stands for; `None` for internal (merged) nodes.
+    pub data: Option<u8>,
+    /// Occurrences of `data` for a leaf, the sum of both children otherwise.
+    pub count: usize,
+    /// Position of this node in the tree vector, once it has been placed there.
+    pub index: Option<usize>,
+    /// Index of the parent node; `None` for the root.
+    pub parent: Option<usize>,
+    /// Index of the left child; `None` for leaves.
+    pub left: Option<usize>,
+    /// Index of the right child; `None` for leaves.
+    pub right: Option<usize>,
+}
+
+impl Node {
+    /// Creates an unplaced leaf for `data` that occurs `count` times.
+    fn new(data: u8, count: usize) -> Self {
+        Self {
+            data: Some(data),
+            count,
+            index: None,
+            parent: None,
+            left: None,
+            right: None,
+        }
+    }
+}
+
+// Nodes compare by `count` only, and in reverse: `BinaryHeap` is a max-heap, so
+// the reversed order makes it pop the least frequent node first.
+impl PartialEq for Node {
+    fn eq(&self, other: &Self) -> bool {
+        other.count.eq(&self.count)
+    }
+}
+
+impl Eq for Node {}
+
+impl PartialOrd for Node {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for Node {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        other.count.cmp(&self.count)
+    }
+}
+
+/// A Huffman coder built from a byte frequency table.
+#[derive(Debug, Clone)]
+pub struct Huffman {
+    // Flat tree; the root is the last element.
+    tree: Vec<Node>,
+    // index lookup table for the leaf nodes.
+    indexes: HashMap<u8, usize>,
+}
+
+impl Huffman {
+    /// Builds the coder from a table mapping each byte to its number of occurrences.
+    ///
+    /// Equal tables always produce equal codes, so a decoder can be rebuilt from the
+    /// table alone.
+    pub fn new(frequency_table: &HashMap<u8, usize>) -> Self {
+        let tree = Self::build_tree(frequency_table);
+        let indexes = tree
+            .iter()
+            .filter_map(|node| {
+                node.data
+                    .map(|byte| (byte, node.index.expect("should have index")))
+            })
+            .collect();
+
+        Self { tree, indexes }
+    }
+
+    /// Creates the Huffman frequency table from the provided data.
+    pub fn new_from_data(data: &[u8]) -> Self {
+        Self::new(&Self::calculate_freq_table(data))
+    }
+
+    /// Counts how many times each byte occurs in `data`.
+    pub fn calculate_freq_table(data: &[u8]) -> HashMap<u8, usize> {
+        // At most 256 distinct bytes exist, and never more than `data.len()`.
+        let mut table: HashMap<u8, usize> = HashMap::with_capacity(data.len().min(256));
+
+        for byte in data {
+            *table.entry(*byte).or_insert(0) += 1;
+        }
+
+        table
+    }
+
+    /// Builds the tree as a flat vector whose last element is the root.
+    ///
+    /// Returns an empty vector for an empty table, and a single leaf when the table
+    /// holds exactly one byte.
+    fn build_tree(table: &HashMap<u8, usize>) -> Vec<Node> {
+        // `HashMap` iteration order is unspecified, so sort the leaves first: the tree
+        // shape (and therefore every code) must depend only on the table contents.
+        let mut leaves: Vec<(u8, usize)> = table.iter().map(|(c, v)| (*c, *v)).collect();
+        leaves.sort_unstable();
+
+        let mut priority_queue: BinaryHeap<Rc<RefCell<Node>>> = leaves
+            .into_iter()
+            .map(|(c, v)| Rc::new(RefCell::new(Node::new(c, v))))
+            .collect();
+
+        let mut tree: Vec<Rc<RefCell<Node>>> = Vec::with_capacity(priority_queue.len() * 2);
+
+        while priority_queue.len() > 1 {
+            let shared_node1 = priority_queue.pop().unwrap();
+            let shared_node2 = priority_queue.pop().unwrap();
+
+            let mut node1 = shared_node1.borrow_mut();
+            if node1.index.is_none() {
+                node1.index = Some(tree.len());
+                tree.push(shared_node1.clone());
+            }
+
+            let mut node2 = shared_node2.borrow_mut();
+            if node2.index.is_none() {
+                node2.index = Some(tree.len());
+                tree.push(shared_node2.clone());
+            }
+
+            let parent_index = tree.len();
+
+            node1.parent = Some(parent_index);
+            node2.parent = Some(parent_index);
+
+            let parent = Node {
+                data: None,
+                count: node1.count + node2.count,
+                left: node1.index,
+                right: node2.index,
+                parent: None,
+                index: Some(parent_index),
+            };
+
+            let parent = Rc::new(RefCell::new(parent));
+            tree.push(parent.clone());
+            priority_queue.push(parent);
+        }
+
+        // A table with a single byte never enters the loop above; keep that byte as a
+        // lone root leaf so it still gets an index (its code is zero bits long).
+        if tree.is_empty() {
+            if let Some(only) = priority_queue.pop() {
+                only.borrow_mut().index = Some(0);
+                tree.push(only);
+            }
+        }
+
+        tree.into_iter().map(|x| *x.borrow()).collect()
+    }
+
+    // Recursively walk to the root and back to calculate the bits.
+    // The recursion is bounded by the tree depth (there are at most 256 leaves).
+    fn traverse(&self, bits: &mut BitVec, index: usize, child_index: Option<usize>) {
+        // First walk up to the root
+        if let Some(parent) = self.tree[index].parent {
+            self.traverse(bits, parent, Some(index));
+        }
+
+        // Then walk down back while pushing the bits.
+        if let Some(child_index) = child_index {
+            if Some(child_index) == self.tree[index].left {
+                bits.push(true);
+            } else if Some(child_index) == self.tree[index].right {
+                bits.push(false);
+            }
+        }
+    }
+
+    /// Encodes `data` and returns the resulting bits packed into bytes.
+    ///
+    /// The last byte is padded, so the output does not record its own bit length.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `data` contains a byte that is missing from the frequency table.
+    pub fn compress(&self, data: &[u8]) -> Vec<u8> {
+        let mut bits = BitVec::new();
+
+        for byte in data {
+            let leaf = *self
+                .indexes
+                .get(byte)
+                .expect("frequency table did not contain this byte");
+            self.traverse(&mut bits, leaf, None);
+        }
+
+        bits.to_bytes()
+    }
+
+    /// Decodes `data` produced by [`Huffman::compress`] with the same frequency table.
+    ///
+    /// Decodes exactly as many bytes as the frequency table counts in total; any
+    /// bits left over (such as padding) are ignored.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `data` runs out of bits before that many bytes were decoded.
+    pub fn decompress(&self, data: &[u8]) -> Vec<u8> {
+        // An empty frequency table yields an empty tree: there is nothing to decode.
+        let root_index = match self.tree.len().checked_sub(1) {
+            Some(index) => index,
+            None => return Vec::new(),
+        };
+        let byte_count = self.tree[root_index].count;
+
+        let bits = BitVec::from_bytes(data);
+        // Only a capacity hint, kept bounded by the size of the input.
+        let mut decompressed = Vec::with_capacity(byte_count.min(bits.len()));
+        let mut bits_iter = bits.iter();
+
+        for _ in 0..byte_count {
+            let mut index = root_index;
+
+            // Only internal nodes have children; a `true` bit selects the left one.
+            while self.tree[index].right.is_some() {
+                let bit = bits_iter.next().expect("missing data");
+                if bit {
+                    index = self.tree[index].left.expect("should have left index");
+                } else {
+                    index = self.tree[index].right.expect("should have right index")
+                }
+            }
+
+            decompressed.push(self.tree[index].data.expect("should have data"));
+        }
+
+        decompressed
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn compress_decompress() {
+        let payload = b"so much words wow many compression";
+
+        let huffman = Huffman::new_from_data(payload);
+        let compressed = huffman.compress(payload);
+        let decompressed = huffman.decompress(&compressed);
+
+        assert!(compressed.len() < payload.len());
+        assert_eq!(&payload[..], decompressed)
+    }
+
+    #[test]
+    fn compress_decompress_lorem_ipsum() {
+        let payload = b"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.";
+
+        let huffman = Huffman::new_from_data(payload);
+        let compressed = huffman.compress(payload);
+        let decompressed = huffman.decompress(&compressed);
+
+        assert!(compressed.len() < payload.len());
+        assert_eq!(&payload[..], decompressed)
+    }
+
+    #[test]
+    fn compress_decompress_single_symbol() {
+        let payload = b"aaaa";
+
+        let huffman = Huffman::new_from_data(payload);
+        let compressed = huffman.compress(payload);
+        let decompressed = huffman.decompress(&compressed);
+
+        assert_eq!(&payload[..], decompressed)
+    }
+
+    #[test]
+    fn compress_decompress_empty() {
+        let huffman = Huffman::new_from_data(b"");
+        let compressed = huffman.compress(b"");
+
+        assert!(compressed.is_empty());
+        assert!(huffman.decompress(&compressed).is_empty());
+    }
+
+    #[test]
+    fn decoder_rebuilt_from_same_data() {
+        let payload = b"so much words wow many compression";
+
+        // Two separately built tables iterate in unrelated orders.
+        let encoder = Huffman::new_from_data(payload);
+        let decoder = Huffman::new_from_data(payload);
+        let decompressed = decoder.decompress(&encoder.compress(payload));
+
+        assert_eq!(&payload[..], decompressed)
+    }
+}
\ No newline at end of file