Implementation
Data structures:
Priority queue
Prefix tree
English
Place every character's frequency into a priority queue, ordered ascending (a min-heap).
Remove the two minimum frequencies (delete-min twice), sum them together, and insert a new node with the combined frequency.
Repeating this builds the internal nodes of a binary tree bottom up, adding one more bit to every leaf in the two subtrees involved. The leaf nodes at the bottom hold the original characters.
Stop once the priority queue holds a single node: the root has been reached (sketched below).
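As a minimal sketch of just the queue mechanics, using made-up frequencies (5, 2, 1, 1) and plain ints instead of tree nodes:

#include <functional>
#include <iostream>
#include <queue>
#include <vector>

int main()
{
    // hypothetical frequencies for four characters: a=5, b=2, c=1, d=1
    std::priority_queue<int, std::vector<int>, std::greater<int>> pq;
    for (int f : {5, 2, 1, 1}) pq.push(f);
    // repeatedly merge the two smallest frequencies until one remains
    while (pq.size() > 1) {
        int min1 = pq.top(); pq.pop();
        int min2 = pq.top(); pq.pop();
        std::cout << min1 << " + " << min2 << " -> " << (min1 + min2) << "\n";
        pq.push(min1 + min2);
    }
    // prints: 1 + 1 -> 2, then 2 + 2 -> 4, then 4 + 5 -> 9 (the root)
}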
Self Notes
Node represents both an internal node within the tree and a base (leaf) node for an original character. I used two different constructors to distinguish the two.
Analyzing the file just meant finding the frequency of each character, then doing some math to get the total number of bits used. I used the following calculations to analyze the compression (a small numeric sketch follows them).
// original
bit_length = ceil(log_2(#unique_chars))
bit_size_per_char = bit_length * freq
total_bits = sum(bit_size_per_char)
// compressed
count_bits = # of bits on the path from a base node up to the root
bit_size_per_char = count_bits * freq
total_bits = sum(bit_size_per_char)
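A tiny sketch of that arithmetic, reusing the made-up frequencies from the trace above (the Huffman code lengths 1, 2, 3, 3 are the depths that trace produces):

#include <cmath>
#include <cstdio>

int main()
{
    // hypothetical frequencies: a=5, b=2, c=1, d=1 (4 unique chars)
    int freqs[] = {5, 2, 1, 1};
    // hypothetical Huffman code lengths for those characters
    int code_lens[] = {1, 2, 3, 3};
    int bit_length = (int)std::ceil(std::log2(4)); // 2 bits per char originally
    int total_ori = 0, total_new = 0;
    for (int i = 0; i < 4; ++i) {
        total_ori += bit_length * freqs[i];   // fixed-length cost
        total_new += code_lens[i] * freqs[i]; // Huffman cost
    }
    std::printf("%d bits -> %d bits\n", total_ori, total_new); // 18 bits -> 15 bits
}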
Initially I defined the base characters, the leaves of the tree, in the base_nodes hash set. Inserting means having the two merged nodes point to their combined parent node.
With id_char_bit_seq, I decided not to go the extra mile of hashing the user id. This is OK since the number of unique characters stays small (at most 256 for extended ASCII), so a linear scan over the base nodes is cheap. A hypothetical hashed alternative is sketched below.
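If constant-time lookup were ever needed, a map keyed by character could replace the scan. This is only a sketch under that assumption; node_index and find_base_node are my names, not part of the implementation below:

#include <unordered_map>

class Node; // the node type from the implementation below

// hypothetical index, built once after the base nodes are created
std::unordered_map<char, Node*> node_index;

// O(1) average lookup instead of a linear scan over base_nodes
Node* find_base_node(char user_id)
{
    auto it = node_index.find(user_id);
    return it == node_index.end() ? nullptr : it->second;
}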
// headers needed for a single-file build
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <vector>
using namespace std;

using Data = tuple<char, unsigned, unsigned>;
class Node {
public:
Node(const int _id, const int _freq, const char _usr_id) : // base (leaf) nodes
hash_id(_id), freq(_freq), user_id(_usr_id), right(false), parent(nullptr) {};
Node(const int _id, const int _freq) : // internal nodes carry no character
hash_id(_id), freq(_freq), user_id(0), right(false), parent(nullptr) {};
int hash_id, freq;
char user_id;
bool right; // true if this node is its parent's right child
Node *parent;
};
struct pq_Cmptor
{
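// greater-than comparison makes the priority_queue a min-heap (smallest freq on top)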
bool operator()(const Node *lhs, const Node *rhs) const
{
return lhs->freq > rhs->freq;
}
};
class HuffmanTree
{
public:
HuffmanTree(const string &filepath) {
base_file_info = analyze_file(filepath);
base_nodes.reserve(base_file_info.size());
}
void huffman_compression()
{
// add base elements
for (auto i : base_file_info) {
// hash id = char + frequency
int identifier = (int)get<0>(i) + (int)get<1>(i);
base_nodes.insert(new Node(identifier, get<1>(i), get<0>(i)));
}
// add pq base elements
priority_queue<Node*, vector<Node*>, pq_Cmptor> pq;
for (auto i : base_nodes) {
pq.push(i);
}
// begin main algorithm
while (pq.size() > 1) {
Node *min1 = pq.top(); pq.pop();
Node *min2 = pq.top(); pq.pop();
// new hash id = (char1 + freq1) + (char2 + freq2)
int identifier = (int)min1->user_id + min1->freq
+ (int)min2->user_id + min2->freq;
// constructor for internal node
Node *newNode = new Node(identifier, min1->freq + min2->freq);
pq.push(newNode);
insert(min1, min2, newNode);
}
}
void analyze_compression()
{
cout << "Original File Statistics" << endl;
print_base_fileinfo(); cout << endl;
cout << "All Data Statistics Compressed" << endl;
print_cmprsd_fileinfo(); cout << endl;
cout << "Compression File Reduction" << endl;
float cmp_r = 1 - ceilf(((float)total_bits_new / (float)total_bits_ori) * 100) / 100;
cout << total_bits_new << "/" << total_bits_ori << "=" << cmp_r << endl;
}
vector<bool> id_char_bit_seq(char user_id)
{
// linear scan is fine here: the alphabet is small (see note above)
Node *baseNode = nullptr;
for (auto &i : base_nodes) {
if (user_id == i->user_id) {
baseNode = i;
break;
}
}
return get_bit_seq(baseNode);
}
string bit_seq_to_string(const vector<bool> &seq) // const ref lets temporaries bind
{
string str_seq = "";
for (bool i : seq) {
if (i) str_seq += "1";
else str_seq += "0";
}
return str_seq;
}
void print_all_bit_seq()
{
for (auto &i : base_nodes)
cout << i->user_id << ": " << setw(15) <<
bit_seq_to_string(get_bit_seq(i)) << endl;
}
private:
vector<Data> base_file_info;
unordered_set<Node*> base_nodes;
int total_bits_new = 0, total_bits_ori = 0; // accumulators must start at zero
void insert(Node *left, Node *right, Node *newNode)
{
left->right = false;
right->right = true;
left->parent = newNode;
right->parent = newNode;
}
void print_base_fileinfo()
{
cout << setw(10) << "char" << setw(10) << "freq" <<
setw(15) << "total_bits" << endl;
for (auto i : base_file_info) {
total_bits_ori += get<2>(i);
cout << setw(10) << get<0>(i) << setw(10) << get<1>(i) <<
setw(15) << get<2>(i) << endl;
}
cout << endl;
}
void print_cmprsd_fileinfo()
{
cout << setw(10) << "user_id" << setw(10) << "hash_id"
<< setw(10) << "freq" << setw(15) << "total_bits" << endl;
for (auto &i : base_nodes) {
int total_bits = count_bits(i) * i->freq;
total_bits_new += total_bits;
cout << setw(10) << i->user_id << setw(10) << i->hash_id
<< setw(10) << i->freq << setw(15) << total_bits << endl;
}
cout << endl;
}
int count_bits(Node *baseNode) {
Node *temp = baseNode;
int count = 0;
while (temp->parent != nullptr)
{
count++;
temp = temp->parent;
}
return count;
}
vector<bool> get_bit_seq(Node *baseNode)
{
if (baseNode == nullptr) {
cout << "Invalid Node" << endl;
exit(1);
}
vector<bool> bit_seq;
// walk up to the root; the root itself contributes no bit
while (baseNode->parent != nullptr)
{
bit_seq.push_back(baseNode->right);
baseNode = baseNode->parent;
}
// bits were collected leaf-to-root; reverse to get the usual root-to-leaf code
reverse(bit_seq.begin(), bit_seq.end());
return bit_seq;
}
vector<Data> analyze_file(const string &filepath)
{
ifstream infile(filepath, ifstream::in);
unordered_map<char, int> char_freq;
// count the frequency of every character in the file
int cur_char;
while ((cur_char = infile.get()) != EOF) {
char_freq[(char)cur_char]++; // operator[] value-initializes new counts to 0
}
vector<Data> info;
// a fixed-length encoding needs ceil(log2(#unique chars)) bits per character
int bit_length = ceil(log2(char_freq.size()));
for (auto i : char_freq) {
info.push_back(make_tuple(i.first, i.second, bit_length * i.second));
}
infile.close();
return info;
}
};
int main()
{
HuffmanTree compr("../InFile/random.txt");
compr.huffman_compression();
compr.analyze_compression();
compr.print_all_bit_seq();
}
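For reference, here is a small usage sketch showing how the class above could encode an entire string one character at a time; encode_string is a hypothetical helper of mine, not part of the implementation:

// hypothetical helper: encode a whole string by concatenating
// the per-character Huffman bit sequences from the tree
string encode_string(HuffmanTree &tree, const string &text)
{
    string out;
    for (char c : text) {
        vector<bool> bits = tree.id_char_bit_seq(c);
        out += tree.bit_seq_to_string(bits);
    }
    return out;
}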