--[[
This is a reconstruction of the Huffman compression method included in
LibCompress. The original was creating too much garbage, so I have
included this modified algorithm. The compressed data is also escaped
for safe transmission through the AddonMessage channel. (Thanks to Stewarta.)
]]

-- includes
local WIM = WIM;
local _G = _G;
local table = table;
local pairs = pairs;
local string = string;
local type = type;
local unpack = unpack;
local math = math;
local bit = bit;
local tostring = tostring;
local setmetatable = setmetatable;
local rawset = rawset;
local rawget = rawget;
local next = next;
local select = select;

-- set namespace
setfenv(1, WIM);

-- clears all data found within the passed tables.
local function clearTables(...)
    for i=1, select("#", ...) do
        local tbl = select(i, ...);
        if(type(tbl) == "table") then
            setmetatable(tbl, nil);
            for i=#tbl, 1, -1 do
                table.remove(tbl, i);
            end
            for k, v in pairs(tbl) do
                tbl[k] = nil;
            end
        end
    end
end

-- Table Recycling
tblInUse = {};
tblAvailable = {};

local function getNewTable()
    if(#tblAvailable > 0) then
        local tbl = table.remove(tblAvailable, #tblAvailable);
        table.insert(tblInUse, tbl);
        return tbl;
    else
        local tbl = {};
        table.insert(tblInUse, tbl);
        return tbl;
    end
end

local function freeTables()
    clearTables(unpack(tblInUse));
    for i=#tblInUse, 1, -1 do
        table.insert(tblAvailable, table.remove(tblInUse, i));
    end
end
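
-- Illustration only (added for clarity, not in the original file): the intended
-- lifecycle of the recycled tables, as used by CompressHuffman/cleanCompress and
-- DecompressHuffman/cleanDecompress below. Tables are borrowed with getNewTable()
-- while an operation runs and returned in bulk with freeTables() when it finishes.
--[[
    local node = getNewTable();   -- borrowed from tblAvailable, or freshly created
    node.weight = 42;             -- use it like any other table
    freeTables();                 -- wipes every borrowed table and makes it reusable
]]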

-- thank you stewarta
local function huff_encode(thestr)
    thestr=thestr:gsub("\001","\002\003");
    thestr=thestr:gsub("%z","\002\004");
    return thestr;
end

local function huff_decode(thestr)
    thestr=thestr:gsub("\002\004","\000");
    thestr=thestr:gsub("\002\003","\001");
    return thestr;
end
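
-- Worked example (added for clarity, not in the original): the escaping above makes the
-- compressed blob safe to send through the AddonMessage channel, which cannot carry the
-- bytes "\000" and "\001". huff_encode("a\001b\000c") yields "a\002\003b\002\004c", and
-- huff_decode() restores the original byte string.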

--------------------------------------------------------------------------------
-- Huffman codec
-- implemented by Galmok of European Stormrage (Horde), galmok@gmail.com

local function addCode(tree, bcode, len)
    if tree then
        tree.bcode = bcode;
        tree.blength = len;
        if tree.c1 then
            addCode(tree.c1, bit.bor(bcode, bit.lshift(1,len)), len+1);
        end
        if tree.c2 then
            addCode(tree.c2, bcode, len+1);
        end
    end
end

local function escape_code(code, len)
    local escaped_code = 0;
    local b;
    local l = 0;
    for i = len-1, 0, -1 do
        b = bit.band(code, bit.lshift(1,i))==0 and 0 or 1;
        escaped_code = bit.lshift(escaped_code, 1+b) + b;
        l = l + b;
    end
    return escaped_code, len+l;
end
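
-- Worked example (added for clarity, not in the original): escape_code() rewrites a code
-- word so that it never contains two adjacent 1-bits; a "11" pair can then be appended as
-- an unambiguous terminator (see the addBits(compressed, 3, 2) call in CompressHuffman).
-- For instance escape_code(5, 3) returns 9, 5: binary 101 becomes 01001, and
-- unescape_code(9, 5) further below returns 5, 3 again.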

local Huffman_compressed = {};
local Huffman_large_compressed = {};

local compressed_size = 0
local remainder;
local remainder_length;
local function addBits(tbl, code, len)
    remainder = remainder + bit.lshift(code, remainder_length);
    remainder_length = len + remainder_length;
    if remainder_length > 32 then
        return true; -- Bits lost due to too long code-words.
    end
    while remainder_length >= 8 do
        compressed_size = compressed_size + 1;
        tbl[compressed_size] = string.char(bit.band(remainder, 255));
        remainder = bit.rshift(remainder, 8);
        remainder_length = remainder_length - 8;
    end
end
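
-- Worked example (added for clarity, not in the original): addBits() packs bits
-- least-significant-first into whole bytes. Starting from remainder/remainder_length of
-- zero, addBits(t, 1, 3) buffers the 3 bits 001; a following addBits(t, 31, 5) completes
-- a byte and emits string.char(0xF9) into t.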

local hist = {};
local leafs = {};
local symbols = {};
local huff = {};

local function cleanCompress()
    compressed_size = 0;
    freeTables();
    clearTables(Huffman_compressed, Huffman_large_compressed, hist, leafs, symbols, huff);
end

-- word size for this huffman algorithm is 8 bits (1 byte). This means the best compression is representing 1 byte with 1 bit, i.e. compress to 0.125 of original size.
local function CompressHuffman(uncompressed)
    compressed_size = 0;
    if type(uncompressed) ~= "string" then
        cleanCompress();
        return nil, "Can only compress strings";
    end

    -- make histogram
    local n = 0;
    -- don't have to use all data to make the histogram
    local uncompressed_size = string.len(uncompressed)
    local c;
    for i = 1, uncompressed_size do
        c = string.byte(uncompressed, i)
        hist[c] = (hist[c] or 0) + 1
    end

    -- Start with as many leaves as there are symbols.
    local leaf;

    for symbol, weight in pairs(hist) do
        leaf = getNewTable();
        leaf.symbol = string.char(symbol);
        leaf.weight = weight;
        symbols[symbol] = leaf;
        table.insert(leafs, leaf);
    end
    -- Enqueue all leaf nodes into the first queue (by probability in increasing order so that the least likely item is in the head of the queue).
    table.sort(leafs, function(a,b) return a.weight < b.weight end)

    local nLeafs = #leafs

    -- While there is more than one node in the queues:
    local l, h, li, hi, leaf1, leaf2
    local newNode;
    while (#leafs + #huff > 1) do
        -- Dequeue the two nodes with the lowest weight.
        -- Dequeue first
        if not next(huff) then
            li, leaf1 = next(leafs)
            table.remove(leafs, li)
        elseif not next(leafs) then
            hi, leaf1 = next(huff)
            table.remove(huff, hi)
        else
            li, l = next(leafs);
            hi, h = next(huff);
            if l.weight <= h.weight then
                leaf1 = l;
                table.remove(leafs, li);
            else
                leaf1 = h;
                table.remove(huff, hi);
            end
        end
        -- Dequeue second
        if not next(huff) then
            li, leaf2 = next(leafs);
            table.remove(leafs, li);
        elseif not next(leafs) then
            hi, leaf2 = next(huff);
            table.remove(huff, hi);
        else
            li, l = next(leafs);
            hi, h = next(huff);
            if l.weight <= h.weight then
                leaf2 = l;
                table.remove(leafs, li);
            else
                leaf2 = h;
                table.remove(huff, hi);
            end
        end

        -- Create a new internal node, with the two just-removed nodes as children (either node can be either child) and the sum of their weights as the new weight.
        newNode = getNewTable();
        newNode.c1 = leaf1;
        newNode.c2 = leaf2;
        newNode.weight = leaf1.weight + leaf2.weight;
        table.insert(huff, newNode);
    end
    if #leafs > 0 then
        li, l = next(leafs);
        table.insert(huff, l);
        table.remove(leafs, li);
    end
    local _huff = huff[1];

    -- assign codes to each symbol
    -- c1 = "1", c2 = "0" (see addCode above)
    -- As a common convention, bit '0' represents following the left child and bit '1' represents following the right child.
    -- c1 = left, c2 = right

    addCode(_huff, 0, 0);
    if _huff then
        _huff.bcode = 0
        _huff.blength = 1
    end

    -- WRITING
    remainder = 0;
    remainder_length = 0;

    local compressed = Huffman_compressed;
    --compressed_size = 0

    -- first byte is version info. 1 = uncompressed, 3 = 8-bit word huffman compressed
    compressed[1] = "\003";

    -- Header: byte 0=#leafs, byte 1-3=size of uncompressed data
    -- max 2^24 bytes
    local l = string.len(uncompressed);
    compressed[2] = string.char(bit.band(nLeafs-1, 255)); -- number of leafs
    compressed[3] = string.char(bit.band(l, 255)); -- bit 0-7
    compressed[4] = string.char(bit.band(bit.rshift(l, 8), 255)); -- bit 8-15
    compressed[5] = string.char(bit.band(bit.rshift(l, 16), 255)); -- bit 16-23
    compressed_size = 5;

    -- create symbol/code map
    for symbol, leaf in pairs(symbols) do
        addBits(compressed, symbol, 8);
        if addBits(compressed, escape_code(leaf.bcode, leaf.blength)) then
            -- code word too long. Needs new revision to be able to handle more than 32 bits
            cleanCompress();
            return string.char(1)..uncompressed; -- store uncompressed; the leading "\001" is stripped by DecompressHuffman
        end
        addBits(compressed, 3, 2);
    end

    -- create huffman code
    local large_compressed = Huffman_large_compressed;
    local large_compressed_size = 0;
    local ulimit;
    for i = 1, l, 200 do
        ulimit = l < (i+199) and l or (i+199);
        for sub_i = i, ulimit do
            c = string.byte(uncompressed, sub_i);
            addBits(compressed, symbols[c].bcode, symbols[c].blength);
        end
        large_compressed_size = large_compressed_size + 1;
        large_compressed[large_compressed_size] = table.concat(compressed, "", 1, compressed_size);
        compressed_size = 0;
    end

    -- add remaining bits (if any)
    if remainder_length > 0 then
        large_compressed_size = large_compressed_size + 1;
        large_compressed[large_compressed_size] = string.char(remainder);
    end
    local compressed_string = table.concat(large_compressed, "", 1, large_compressed_size);

    -- is compression worth it? If not, return uncompressed data.
    if (#uncompressed+1) <= #compressed_string then
        cleanCompress();
        --return "\001"..uncompressed;
        return uncompressed;
    end
    cleanCompress();
    return compressed_string;
end
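
--[[
Layout of the compressed string produced above (summary added for clarity, derived from
the code; not in the original file):
    byte 1      "\003" version/format marker (8-bit word huffman)
    byte 2      number of leaf nodes minus one
    bytes 3-5   original length, little-endian (so at most 2^24-1 bytes)
    then        the symbol/code map: for each symbol an 8-bit value followed by its
                escaped code word and the "11" stop pair
    then        the huffman-coded data itself, packed least-significant-bit first
]]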

-- lookuptable (cached between calls)
local lshiftMask = {};
local lshiftMask_metamap = {
    __index = function(t, k)
        local v = bit.lshift(1, k);
        rawset(t, k, v);
        return v;
    end
}

-- lookuptable (cached between calls)
local lshiftMinusOneMask = {};
local lshiftMinusOneMask_metamap = {
    __index = function(t, k)
        local v = bit.lshift(1, k) - 1;
        rawset(t, k, v);
        return v;
    end
};
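
-- Note (added for clarity, not in the original): once the metatables are attached in
-- DecompressHuffman, lshiftMask[3] lazily computes and caches 2^3 = 8, and
-- lshiftMinusOneMask[3] caches 2^3 - 1 = 7, so repeated lookups cost only a table read.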

local function getCode(bitfield, field_len)
    if field_len >= 2 then
        local b;
        local p = 0;
        for i = 0, field_len-1 do
            b = bit.band(bitfield, lshiftMask[i]);
            if not (p==0) and not (b==0) then
                -- found 2 bits set right after each other (stop bits)
                return bit.band(bitfield, lshiftMinusOneMask[i-1]), i-1,
                    bit.rshift(bitfield, i+1), field_len-i-1;
            end
            p = b;
        end
    end
    return nil;
end
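
-- Worked example (added for clarity, not in the original): with the escaped code 01001
-- from the escape_code() example followed by its "11" stop pair, the buffered bits form
-- the value 105 (binary 1101001). getCode(105, 7) returns 9, 5, 0, 0: the escaped code
-- and its length, plus whatever bits remain after the stop pair.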

local function unescape_code(code, code_len)
    local unescaped_code = 0;
    local b;
    local l = 0;
    local i = 0
    while i < code_len do
        b = bit.band(code, lshiftMask[i]);
        if not (b==0) then
            unescaped_code = bit.bor(unescaped_code, lshiftMask[l]);
            i = i + 1;
        end
        i = i + 1;
        l = l + 1;
    end
    return unescaped_code, l;
end
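
-- Worked example (added for clarity, not in the original): unescape_code(9, 5) returns
-- 5, 3, reversing escape_code(5, 3) above; each 1-bit in the escaped form is followed by
-- a padding 0-bit, which unescape_code skips.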

local Huffman_uncompressed = {};
local Huffman_large_uncompressed = {}; -- will always be as big as the largest string ever decompressed. Bad, but clearing it every time takes precious time.
local map = {}; -- only table not reused in Huffman decode.
local mt = {};

local function cleanDecompress()
    freeTables();
    clearTables(Huffman_uncompressed, Huffman_large_uncompressed, map, lshiftMask, lshiftMinusOneMask);
end

local map_metatable1 = {
    __index = function(t, k)
        local v = getNewTable();
        rawset(t, k, v)
        return v
    end
};

local map_metatable2 = {
    __index = function(t, k)
        return mt
    end
};
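
-- Note (added for clarity, not in the original): while the code->symbol map is being
-- built, map_metatable1 auto-creates one subtable per code length, so entries look like
-- map[codeLength][code] = symbolCharacter. Afterwards map_metatable2 is swapped in so
-- that unknown code lengths return the shared empty table mt instead of allocating.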

local function DecompressHuffman(compressed)
    setmetatable(lshiftMask, lshiftMask_metamap);
    setmetatable(lshiftMinusOneMask, lshiftMinusOneMask_metamap);
    if type(compressed) ~= "string" then
        cleanDecompress();
        return nil, "Can only uncompress strings"
    end

    local compressed_size = string.len(compressed);
    -- decode header
    local info_byte = string.byte(compressed)
    -- is data compressed
    if info_byte == 1 then
        cleanDecompress();
        return compressed:sub(2) -- return uncompressed data
    end
    if not (info_byte == 3) then
        cleanDecompress();
        return compressed --, "Can only decompress Huffman compressed data ("..tostring(info_byte)..")"
    end

    local num_symbols = string.byte(string.sub(compressed, 2, 2)) + 1
    local c0 = string.byte(string.sub(compressed, 3, 3))
    local c1 = string.byte(string.sub(compressed, 4, 4))
    local c2 = string.byte(string.sub(compressed, 5, 5))
    local orig_size = c2*65536 + c1*256 + c0
    if orig_size == 0 then
        cleanDecompress();
        return "";
    end

    -- decode code->symbol map
    local bitfield = 0;
    local bitfield_len = 0;

    setmetatable(map, map_metatable1)

    local i = 6; -- byte 1-5 are header bytes
    local c, cl;
    local minCodeLen = 1000;
    local maxCodeLen = 0;
    local symbol, code, code_len, _bitfield, _bitfield_len;
    local n = 0;
    local state = 0; -- 0 = get symbol (8 bits), 1 = get code (varying bits, ends with 2 bits set)
    while n < num_symbols do
        if i > compressed_size then
            cleanDecompress();
            return nil, "Cannot decode map"
        end

        c = string.byte(compressed, i)
        bitfield = bit.bor(bitfield, bit.lshift(c, bitfield_len))
        bitfield_len = bitfield_len + 8

        if state == 0 then
            symbol = bit.band(bitfield, 255)
            bitfield = bit.rshift(bitfield, 8)
            bitfield_len = bitfield_len - 8
            state = 1 -- search for code now
        else
            code, code_len, _bitfield, _bitfield_len = getCode(bitfield, bitfield_len)
            if code then
                bitfield, bitfield_len = _bitfield, _bitfield_len
                c, cl = unescape_code(code, code_len)
                map[cl][c] = string.char(symbol)
                minCodeLen = math.min(minCodeLen, cl); --cl<minCodeLen and cl or minCodeLen
                maxCodeLen = math.max(maxCodeLen, cl); --cl>maxCodeLen and cl or maxCodeLen
                --print("symbol: "..string_char(symbol).." code: "..tobinary(c, cl))
                n = n + 1
                state = 0 -- search for next symbol (if any)
            end
        end
        i = i + 1
    end

    -- don't create new subtables for entries not in the map. Waste of space.
    -- But do return an empty table to prevent runtime errors (instead of returning nil).
    setmetatable(map, map_metatable2);

    local uncompressed = Huffman_uncompressed
    local large_uncompressed = Huffman_large_uncompressed
    local uncompressed_size = 0
    local large_uncompressed_size = 0
    local test_code
    local test_code_len = minCodeLen;
    local symbol;
    local dec_size = 0;
    compressed_size = compressed_size + 1
    local temp_limit = 200; -- first limit of uncompressed data. large_uncompressed will hold strings of length 200
    while true do
        if test_code_len <= bitfield_len then
            test_code = bit.band(bitfield, lshiftMinusOneMask[test_code_len])
            symbol = map[test_code_len][test_code]
            if symbol then
                uncompressed_size = uncompressed_size + 1
                uncompressed[uncompressed_size] = symbol
                dec_size = dec_size + 1
                if dec_size >= orig_size then -- checked here for speed reasons
                    break;
                end
                if dec_size >= temp_limit then
                    -- process compressed bytes in smaller chunks
                    large_uncompressed_size = large_uncompressed_size + 1
                    large_uncompressed[large_uncompressed_size] = table.concat(uncompressed, "", 1, uncompressed_size)
                    uncompressed_size = 0
                    temp_limit = temp_limit + 200 -- repeated chunk size is 200 uncompressed bytes
                    temp_limit = temp_limit > orig_size and orig_size or temp_limit
                end
                bitfield = bit.rshift(bitfield, test_code_len)
                bitfield_len = bitfield_len - test_code_len
                test_code_len = minCodeLen
            else
                test_code_len = test_code_len + 1
                if test_code_len > maxCodeLen then
                    cleanDecompress();
                    return nil, "Decompression error at "..tostring(i).."/"..tostring(#compressed)
                end
            end
        else
            c = string.byte(compressed, i)
            bitfield = bitfield + bit.lshift(c or 0, bitfield_len)
            bitfield_len = bitfield_len + 8
            i = i + 1
            if i > temp_limit then
                if i > compressed_size then
                    break;
                end
            end
        end
    end
    local out = table.concat(large_uncompressed, "", 1, large_uncompressed_size)..table.concat(uncompressed, "", 1, uncompressed_size);
    cleanDecompress();
    return out;
end

function Compress(uncompressed)
    if(not uncompressed) then
        return uncompressed;
    end
    local str, err = CompressHuffman(uncompressed);
    return huff_encode(str);
end

function Decompress(compressed)
    return DecompressHuffman(huff_decode(compressed));
end
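
-- Usage sketch (added for clarity, not in the original): these two functions are the
-- public entry points; because of setfenv(1, WIM) above they are defined in the WIM
-- namespace, so callers outside this file would access them through the WIM table.
--[[
    local packed = WIM.Compress("some chat message");   -- escaped for the AddonMessage channel
    local original = WIM.Decompress(packed);            -- original == "some chat message"
]]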