--[[
    SpamBayes for WoW, a lightweight bayesian antispam for World of Warcraft

    Algorithm of probabilities converted to Lua from SpamBayes (Python),
    built on a combination of work by Gary Robinson, Tim Peters and many
    contributors.

    Copyright @ 2010 Arnaud Dovi (Merah@AthelLoren, ad@heapoverflow.com)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
]]

--[[
    $Rev:: 41                    $:  Revision of last commit
    $Author:: arnaud             $:  Author of last commit
    $Date:: 2012-09-02 23:53:50 #$:  Date of last commit
    $WoW:: 5.0.5.16030           $:  Client of the last tests
]]

--[[ BEGIN STANDARD HEADER ]]--

-- Imports
local ipairs = ipairs
local math_exp = math.exp
local math_frexp = math.frexp
local math_floor = math.floor
local math_min = math.min
local math_sqrt = math.sqrt
local math_log = math.log
local math_abs = math.abs
local pairs = pairs

local LN2 = math.log(2) -- used frequently by chi-combining

-- Isolate the environment
setfenv(1, select(2, ...))

--[[ END STANDARD HEADER ]]--

local dbg = function() end
--[==[@debug@
--dbg = function(...) PrintLiteral(nil, ...) end
--@end-debug@]==]

local function chi2Q(x2, v)
    -- Return prob(chisq >= x2, with v degrees of freedom).
    -- v must be even.
    -- XXX If x2 is very large, exp(-m) will underflow to 0.
    local m = x2 / 2.0
    local sum = math_exp(-m)
    local term = math_exp(-m)
    for i = 1, math_floor(v / 2) do
        term = term * (m / i)
        sum = sum + term
    end
    -- With small x2 and large v, accumulated roundoff error, plus error in
    -- the platform exp(), can cause this to spill a few ULP above 1.0. For
    -- example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this
    -- point. Returning a value even a teensy bit over 1.0 is no good.
    return math_min(sum, 1.0)
end
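
-- Worked example (illustration only, not part of the addon): with x2 = 4.0
-- and v = 2 degrees of freedom, chi2Q computes m = 2 and
-- sum = exp(-2) * (1 + 2/1) ~= 0.406, i.e. a ~41% chance of seeing a
-- chi-squared statistic >= 4.0 by chance alone.
--[==[
local p = chi2Q(4.0, 2)
assert(math_abs(p - 0.40601) < 0.0001)
--]==]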
local function probability(db, record)
    -- Compute, store, and return prob(msg is spam | msg contains word).
    -- This is the Graham calculation, but stripped of biases, and
    -- stripped of clamping into 0.01 thru 0.99. The Bayesian
    -- adjustment following keeps them in a sane range, and one
    -- that naturally grows the more evidence there is to back up
    -- a probability.
    local spamcount = record.spamcount
    local hamcount = record.hamcount

    -- Try the cache first
    local pcache = db.probcache[spamcount] and db.probcache[spamcount][hamcount] or nil
    if pcache then return pcache end

    local nham = db.nham ~= 0 and db.nham or 1
    local nspam = db.nspam ~= 0 and db.nspam or 1

    local hamratio = hamcount / nham
    --assert(hamcount <= nham, "Token seen in more ham than ham trained.")
    local spamratio = spamcount / nspam
    --assert(spamcount <= nspam, "Token seen in more spam than spam trained.")

    local prob = spamratio / (hamratio + spamratio)

    local S = 0.45      --local S = options["Classifier", "unknown_word_strength"]
    local StimesX = S * 0.5 --StimesX = S * options["Classifier", "unknown_word_prob"]

    -- Now do Robinson's Bayesian adjustment.
    --
    --         s*x + n*p(w)
    -- f(w) = --------------
    --            s + n
    --
    -- I find this easier to reason about like so (equivalent when s != 0):
    --
    --        x - p
    -- p + ---------
    --      1 + n/s
    --
    -- IOW, it moves p a fraction of the distance from p to x, and
    -- less so the larger n is, or the smaller s is. For example, a
    -- word seen only once, and only in spam (n = 1, p = 1.0), is
    -- softened to (0.225 + 1.0) / 1.45 ~= 0.845 rather than a hard
    -- 1.0, leaving room for future ham evidence.
    local n = hamcount + spamcount
    prob = (StimesX + n * prob) / (S + n)

    -- Update the cache
    db.probcache[spamcount] = db.probcache[spamcount] or {}
    db.probcache[spamcount][hamcount] = prob
    return prob
end

local function wordinfoget(db, word)
    return db.worddb["-"..word] or nil
end

local function wordinfoset(db, word, record)
    -- "-" forces quoting in SavedVariables in Warhammer Online; left as-is for WoW
    if record.hamcount == 0 and record.spamcount == 0 then
        db.worddb["-"..word] = nil
        return
    end
    db.worddb["-"..word] = record
end

local function worddistanceget(db, word)
    local record = wordinfoget(db, word)
    local prob = record == nil and 0.5 or probability(db, record) --prob = options["Classifier", "unknown_word_prob"]
    local distance = math_abs(prob - 0.5)
    return distance, prob, word, record
end

local function _getclues(db, wordstream)
    local minimum_prob_strength = 0.1

    -- The all-unigram scheme just scores the tokens as-is. A set()
    -- is used to weed out duplicates at high speed.
    dbg(nil, "_getclues", wordstream)
    local clues = {}
    for _, word in ipairs(wordstream) do
        local tup = { worddistanceget(db, word) }
        --d(tup)
        if tup[1] >= minimum_prob_strength then
            clues[#clues + 1] = tup
        end
    end
    --d(clues)
    --table_sort(clues, function(a, b) return a[1] < b[1] end)
    -- Keep at most 150 clues (max_discriminators); drop the excess from
    -- the tail so the array stays contiguous for ipairs.
    --del clues[0 : -options["Classifier", "max_discriminators"]]
    for i = #clues, 151, -1 do
        clues[i] = nil
    end
    for _, tup in ipairs(clues) do
        tup[1] = nil
    end
    return clues
end

local function add_msg(db, wordstream, is_spam)
    dbg(nil, "add", wordstream, is_spam)
    db.probcache = {} -- nuke the prob cache
    if is_spam then
        db.nspam = db.nspam + 1
    else
        db.nham = db.nham + 1
    end
    for _, word in ipairs(wordstream) do
        local record = wordinfoget(db, word) or { spamcount = 0, hamcount = 0 }
        if is_spam then
            record.spamcount = record.spamcount + 1
        else
            record.hamcount = record.hamcount + 1
        end
        wordinfoset(db, word, record)
    end
end

local function verify_cache(db, wordstream, is_spam)
    local tempcache = {}
    local err = false
    for _, word in ipairs(wordstream) do
        local record = wordinfoget(db, word)
        if record ~= nil then
            if is_spam then
                if record.spamcount == 0 then
                    -- the phrase has not been trained, or has changed slightly
                    err = true
                    break
                end
                tempcache[word] = record
            else
                if record.hamcount == 0 then
                    err = true
                    break
                end
                tempcache[word] = record
            end
        else
            err = true
            break
        end
    end
    return err == false and tempcache or nil
end

local function remove_msg(db, wordstream, is_spam)
    db.probcache = {} -- nuke the prob cache
    dbg(nil, "remove", wordstream, is_spam)
    -- if one token does not exist, the phrase has never been trained
    -- and no tokens will be deleted
    local vcache = verify_cache(db, wordstream, is_spam)
    if vcache == nil then return 0 end
    for word, record in pairs(vcache) do
        if is_spam then
            record.spamcount = record.spamcount - 1
        else
            record.hamcount = record.hamcount - 1
        end
        wordinfoset(db, word, record)
    end
    if is_spam then
        if db.nspam > 0 then db.nspam = db.nspam - 1 end
        if db.cspam > 0 then db.cspam = db.cspam - 1 end
    else
        if db.nham > 0 then db.nham = db.nham - 1 end
        if db.cham > 0 then db.cham = db.cham - 1 end
    end
end
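
-- Usage sketch (illustration only, not part of the addon; the token lists
-- are hypothetical and would normally come from a message tokenizer):
--[==[
local db = { nham = 0, nspam = 0, cspam = 0, cham = 0, cunsure = 0, probcache = {}, worddb = {} }
add_msg(db, { "free", "gold", "visit" }, true)     -- train the phrase as spam
remove_msg(db, { "free", "gold", "visit" }, true)  -- untrain it; all counts return to zero
--]==]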
local function chi2_spamprob(db, wordstream, evidence)
    -- Return best-guess probability that wordstream is spam.
    -- wordstream is an iterable object producing words.
    -- The return value is a float in [0.0, 1.0].
    -- If optional arg evidence is true, the return value is a pair
    -- (probability, evidence) where evidence is a list of
    -- (word, probability) pairs.

    -- We compute two chi-squared statistics, one for ham and one for
    -- spam. The sum-of-the-logs business is more sensitive to probs
    -- near 0 than to probs near 1, so the spam measure uses 1-p (so
    -- that high-spamprob words have greatest effect), and the ham
    -- measure uses p directly (so that lo-spamprob words have greatest
    -- effect).
    --
    -- For optimization, sum-of-logs == log-of-product, and f.p.
    -- multiplication is a lot cheaper than calling ln(). It's easy
    -- to underflow to 0.0, though, so we simulate unbounded dynamic
    -- range via frexp. The real product H = this H * 2**Hexp, and
    -- likewise the real product S = this S * 2**Sexp.
    local H, S = 1.0, 1.0
    local Hexp, Sexp = 0, 0
    local ln = math_log
    local prob, word, record
    dbg(nil, "spamprob", wordstream, evidence)
    local clues = _getclues(db, wordstream)
    for _, data in ipairs(clues) do
        local e
        prob, word, record = data[2], data[3], data[4]
        S = S * (1.0 - prob)
        H = H * prob
        if S < 1e-200 then -- prevent underflow
            S, e = math_frexp(S)
            Sexp = Sexp + e
        end
        if H < 1e-200 then -- prevent underflow
            H, e = math_frexp(H)
            Hexp = Hexp + e
        end
    end
    -- Compute the natural log of the product = sum of the logs:
    -- ln(x * 2**i) = ln(x) + i * ln(2).
    S = ln(S) + Sexp * LN2
    H = ln(H) + Hexp * LN2
    local n = #clues
    if n > 0 then
        S = 1.0 - chi2Q(-2.0 * S, 2 * n)
        H = 1.0 - chi2Q(-2.0 * H, 2 * n)
        -- How to combine these into a single spam score? We originally
        -- used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
        -- systematic problem is that we could end up being near-certain
        -- a thing was (for example) spam, even if S was small, provided
        -- that H was much smaller.
        -- Rob Hooft stared at these problems and invented the measure
        -- we use now, the simpler S-H, scaled into [0., 1.].
        prob = (S - H + 1.0) / 2.0
    else
        prob = 0.5
    end
    if evidence then
        -- Translated from the Python original (sorting by probability is
        -- omitted since table.sort is not imported into this environment):
        --   clues = [(w, p) for p, w, _r in clues]
        --   clues.insert(0, ('*S*', S))
        --   clues.insert(0, ('*H*', H))
        local out = { { "*H*", H }, { "*S*", S } }
        for _, data in ipairs(clues) do
            out[#out + 1] = { data[3], data[2] } -- (word, probability)
        end
        return prob, out
    else
        return prob
    end
end

local defaults = { nham = 0, nspam = 0, cspam = 0, cham = 0, cunsure = 0, probcache = {}, worddb = {} }

function GetClassifier(db)
    for k, v in pairs(defaults) do
        db[k] = db[k] or v
    end
    return {
        getprob = function(...) return chi2_spamprob(db, ...) end,
        learn   = function(...) return add_msg(db, ...) end,
        unlearn = function(...) return remove_msg(db, ...) end,
    }
end
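
-- End-to-end usage sketch (illustration only, not part of the addon; the
-- database table would normally live in SavedVariables and the tokens would
-- come from a message tokenizer):
--[==[
local db = {}
local classifier = GetClassifier(db)
classifier.learn({ "free", "gold", "visit" }, true)       -- train as spam
classifier.learn({ "raid", "tonight", "invite" }, false)  -- train as ham
local score = classifier.getprob({ "free", "gold" })
-- score near 1.0 means spam, near 0.0 means ham, near 0.5 means unsure
--]==]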