--[[
SpamBayes for WoW, a lightweight Bayesian antispam for World of Warcraft
The probability algorithm is converted to Lua from the SpamBayes Python
project, built from a combination of work by Gary Robinson, Tim Peters
and many contributors
Copyright © 2010 Arnaud Dovi (Merah@AthelLoren, ad@heapoverflow.com)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
]]
--[[
$Rev:: 41 $: Revision of last commit
$Author:: arnaud $: Author of last commit
$Date:: 2012-09-02 23:53:50 #$: Date of last commit
$WoW:: 5.0.5.16030 $: Client of the last tests
]]
--[[ BEGIN STANDARD HEADER ]] --
-- Imports
local ipairs = ipairs
local math_exp = math.exp
local math_frexp = math.frexp
local math_floor = math.floor
local math_min = math.min
local math_sqrt = math.sqrt
local math_log = math.log
local math_abs = math.abs
local pairs = pairs
local table_remove = table.remove -- used when trimming clues to max_discriminators
local LN2 = math.log(2) -- used frequently by chi-combining
-- Isolate the environment
setfenv(1, select(2, ...))
--[[ END STANDARD HEADER ]] --
local dbg = function() end
--[==[@debug@
--dbg = function(...) PrintLiteral(nil, ...) end
--@end-debug@]==]
local function chi2Q(x2, v)
-- Return prob(chisq >= x2, with v degrees of freedom).
-- v must be even.
-- XXX If x2 is very large, exp(-m) will underflow to 0.
local m = x2 / 2.0
local sum = math_exp(-m)
local term = sum
for i = 1, math_floor(v / 2) - 1 do -- Python original: for i in range(1, v//2)
term = term * (m / i)
sum = sum + term
end
-- With small x2 and large v, accumulated roundoff error, plus error in
-- the platform exp(), can cause this to spill a few ULP above 1.0. For
-- example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this
-- point. Returning a value even a teensy bit over 1.0 is no good.
return math_min(sum, 1.0)
end
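-- chi2Q evaluates the closed form of the chi-squared survival function for
-- even v:  Q(x2, v) = exp(-x2/2) * sum over i = 0 .. v/2 - 1 of (x2/2)^i / i!
-- Sanity check: chi2Q(2 * LN2, 2) reduces to exp(-LN2) == 0.5.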
local function probability(db, record)
-- Compute, store, and return prob(msg is spam | msg contains word).
-- This is the Graham calculation, but stripped of biases, and
-- stripped of clamping into 0.01 thru 0.99. The Bayesian
-- adjustment following keeps them in a sane range, and one
-- that naturally grows the more evidence there is to back up
-- a probability.
local spamcount = record.spamcount
local hamcount = record.hamcount
local pcache = db.probcache[spamcount] and db.probcache[spamcount][hamcount] or nil -- Try the cache first
if pcache then return pcache end
local nham = db.nham ~= 0 and db.nham or 1
local nspam = db.nspam ~= 0 and db.nspam or 1
local hamratio = hamcount / nham --assert hamcount <= nham, "Token seen in more ham than ham trained."
local spamratio = spamcount / nspam --assert spamcount <= nspam, "Token seen in more spam than spam trained."
local prob = spamratio / (hamratio + spamratio)
local S = 0.45 --local S = options["Classifier", "unknown_word_strength"]
local StimesX = S * 0.5 --StimesX = S * options["Classifier", "unknown_word_prob"]
-- Now do Robinson's Bayesian adjustment.
--
-- s*x + n*p(w)
-- f(w) = --------------
-- s + n
--
-- I find this easier to reason about like so (equivalent when
-- s != 0):
--
-- x - p
-- p + -------
-- 1 + n/s
--
-- IOW, it moves p a fraction of the distance from p to x, and
-- less so the larger n is, or the smaller s is.
local n = hamcount + spamcount
prob = (StimesX + n * prob) / (S + n)
db.probcache[spamcount] = db.probcache[spamcount] or {} -- Update the cache
db.probcache[spamcount][hamcount] = prob
return prob
end
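-- Worked example with hypothetical counts: with S = 0.45 and x = 0.5, a token
-- seen in 3 of 10 spam messages and 0 of 10 ham messages gives
--   prob = 0.3 / (0.0 + 0.3)              = 1.0
--   f(w) = (0.225 + 3 * 1.0) / (0.45 + 3) ~ 0.93
-- so three spam-only sightings are strong, but not absolute, evidence.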
local function wordinfoget(db, word)
return db.worddb["-"..word]
end
local function wordinfoset(db, word, record) -- the "-" key prefix forces quoting in SavedVariables (needed in Warhammer Online, kept as-is for WoW)
if record.hamcount == 0 and record.spamcount == 0 then
db.worddb["-"..word] = nil
return
end
db.worddb["-"..word] = record
end
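-- The resulting SavedVariables layout looks like this (tokens hypothetical):
--   db.worddb["-gold"]  = { spamcount = 12, hamcount = 0 }
--   db.worddb["-guild"] = { spamcount = 1,  hamcount = 9 }
-- and a record whose two counts both drop to zero is removed entirely.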
local function worddistanceget(db, word)
local record = wordinfoget(db, word)
local prob = record == nil and 0.5 or probability(db, record) --prob = options["Classifier", "unknown_word_prob"]
local distance = math_abs(prob - 0.5)
return distance, prob, word, record
end
local function _getclues(db, wordstream)
local minimum_prob_strength = 0.1
-- The all-unigram scheme just scores the tokens as-is. (The Python
-- original weeded out duplicates with a set(); this port scores every
-- token it is handed.)
dbg(nil, "_getclues", wordstream)
local clues = {}
for _, word in ipairs(wordstream) do
local tup = { worddistanceget(db, word) }
--d(tup)
if tup[1] >= minimum_prob_strength then
clues[#clues + 1] = tup
end
end
--d(clues)
--table_sort(clues, function(a, b) return a[1] < b[1] end)
if #clues > 150 then --if len(clues) > options["Classifier", "max_discriminators"]
for _ = 1, #clues - 150 do --del clues[0 : -options["Classifier", "max_discriminators"]]
table_remove(clues, 1) -- drop from the front; nil'ing entries would leave holes and break ipairs
end
end
for _, tup in ipairs(clues) do tup[1] = nil end
return clues
end
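-- Each clue kept above is an array of the form
--   { [2] = prob, [3] = word, [4] = record }
-- (index 1 held the distance used only for filtering and is nil'ed out);
-- chi2_spamprob below reads fields 2-4 positionally.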
local function add_msg(db, wordstream, is_spam)
dbg(nil, "add", wordstream, is_spam)
db.probcache = {} -- nuke the prob cache
if is_spam then
db.nspam = db.nspam + 1
else
db.nham = db.nham + 1
end
for _, word in ipairs(wordstream) do
local record = wordinfoget(db, word) or { spamcount = 0, hamcount = 0 }
if is_spam then
record.spamcount = record.spamcount + 1
else
record.hamcount = record.hamcount + 1
end
wordinfoset(db, word, record)
end
end
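-- Minimal training sketch (token lists hypothetical):
--   add_msg(db, { "wts", "gold", "cheap" }, true)  -- counts the message as spam
--   add_msg(db, { "raid", "tonight" }, false)      -- counts the message as ham
-- Each call bumps nspam or nham once and each token's per-class count once.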
local function verify_cache(db, wordstream, is_spam)
local tempcache = {}
local err = false
for _, word in ipairs(wordstream) do
local record = wordinfoget(db, word)
if record ~= nil then
if is_spam then
if record.spamcount == 0 then -- the phrase was never trained as spam, or has changed since training
err = true
break
end
tempcache[word] = record
else
if record.hamcount == 0 then
err = true
break
end
tempcache[word] = record
end
else
err = true
break
end
end
return err == false and tempcache or nil
end
local function remove_msg(db, wordstream, is_spam)
db.probcache = {} -- nuke the prob cache
dbg(nil, "remove", wordstream, is_spam)
local vcache = verify_cache(db, wordstream, is_spam) -- if any token is missing, the phrase was never trained and no counts are touched
if vcache == nil then return 0 end
for word, record in pairs(vcache) do
if is_spam then
record.spamcount = record.spamcount - 1
else
record.hamcount = record.hamcount - 1
end
wordinfoset(db, word, record)
end
if is_spam then
if db.nspam > 0 then db.nspam = db.nspam - 1 end
if db.cspam > 0 then db.cspam = db.cspam - 1 end
else
if db.nham > 0 then db.nham = db.nham - 1 end
if db.cham > 0 then db.cham = db.cham - 1 end
end
end
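-- E.g. unlearning { "wts", "gold" } as spam only proceeds when both tokens
-- exist with nonzero spamcounts; otherwise remove_msg returns 0 and the
-- database is left untouched.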
local function chi2_spamprob(db, wordstream, evidence)
-- Return best-guess probability that wordstream is spam.
-- wordstream is an array of words (tokens).
-- The return value is a float in [0.0, 1.0].
-- If the optional arg evidence is true, the return value is a pair
-- (probability, evidence) where evidence is a list of
-- (word, probability) pairs.
-- We compute two chi-squared statistics, one for ham and one for
-- spam. The sum-of-the-logs business is more sensitive to probs
-- near 0 than to probs near 1, so the spam measure uses 1-p (so
-- that high-spamprob words have greatest effect), and the ham
-- measure uses p directly (so that lo-spamprob words have greatest
-- effect).
-- For optimization, sum-of-logs == log-of-product, and f.p.
-- multiplication is a lot cheaper than calling ln(). It's easy
-- to underflow to 0.0, though, so we simulate unbounded dynamic
-- range via frexp. The real product H = this H * 2**Hexp, and
-- likewise the real product S = this S * 2**Sexp.
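-- In formula form (Fisher's method, one tail per class):
--   Hstat = 1 - chi2Q(-2 * sum(ln p_i),      2n)
--   Sstat = 1 - chi2Q(-2 * sum(ln(1 - p_i)), 2n)
-- frexp splits x into mantissa and exponent so that x == mant * 2^exp with
-- 0.5 <= |mant| < 1 (e.g. math_frexp(8) returns 0.5, 4), letting the running
-- products be renormalized before they underflow.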
local H, S = 1.0, 1.0
local Hexp, Sexp = 0, 0
local ln = math_log
local prob, word, record
dbg(nil, "spamprob", wordstream, evidence)
local clues = _getclues(db, wordstream)
for _, data in ipairs(clues) do
local e
prob, word, record = data[2], data[3], data[4]
S = S * ( 1.0 - prob)
H = H * prob
if S < 1e-200 then -- prevent underflow
S, e = math_frexp(S)
Sexp = Sexp + e
end
if H < 1e-200 then -- prevent underflow
H, e = math_frexp(H)
Hexp = Hexp + e
end
end
-- Compute the natural log of the product = sum of the logs: ln(x * 2**i) = ln(x) + i * ln(2).
S = ln(S) + Sexp * LN2
H = ln(H) + Hexp * LN2
local n = #clues
if n > 0 then -- note: Lua treats 0 as true, so Python's "if n:" needs an explicit test
S = 1.0 - chi2Q(-2.0 * S, 2*n)
H = 1.0 - chi2Q(-2.0 * H, 2*n)
-- How to combine these into a single spam score? We originally
-- used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
-- systematic problem is that we could end up being near-certain
-- a thing was (for example) spam, even if S was small, provided
-- that H was much smaller.
-- Rob Hooft stared at these problems and invented the measure
-- we use now, the simpler S-H, scaled into [0., 1.].
prob = (S-H + 1.0) / 2.0
else prob = 0.5 end
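-- E.g. S = 0.999 and H = 0.001 give prob = (0.998 + 1.0) / 2 = 0.999 (spam),
-- while S ~ H, i.e. equally unlikely under both hypotheses, lands near 0.5.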
if evidence then
-- Build the promised evidence list of (word, probability) pairs.
-- (The Python original also sorted the pairs by probability and
-- prepended the '*S*' and '*H*' statistics.)
local ev = {}
for _, data in ipairs(clues) do
ev[#ev + 1] = { data[3], data[2] }
end
return prob, ev
end
return prob
end
local defaults = { nham = 0, nspam = 0, cspam = 0, cham = 0, cunsure = 0, probcache = {}, worddb = {}}
function GetClassifier(db)
for k,v in pairs(defaults) do
db[k] = db[k] or v
end
return {
getprob = function (...) return chi2_spamprob(db, ...) end,
learn = function (...) return add_msg(db, ...) end,
unlearn = function (...) return remove_msg(db, ...) end,
}
end
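--[[ Usage sketch, assuming the host addon supplies a persistent SavedVariables
table and a tokenized message (both names below are hypothetical):
local classifier = GetClassifier(MySpamDB)
classifier.learn(tokens, true)          -- train the token list as spam
local prob = classifier.getprob(tokens) -- best-guess spam probability in [0, 1]
classifier.unlearn(tokens, true)        -- revert the training
]]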