--[[
SpamBayes for WoW, a lightweight Bayesian antispam for World of Warcraft

The probability algorithm was converted to Lua from SpamBayes (Python),
itself built from a combination of work by Gary Robinson, Tim Peters and
many contributors

Copyright © 2010 Arnaud Dovi (Merah@AthelLoren, ad@heapoverflow.com)

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
]]

--[[
$Rev:: 41 $: Revision of last commit
$Author:: arnaud $: Author of last commit
$Date:: 2012-09-02 23:53:50 #$: Date of last commit
$WoW:: 5.0.5.16030 $: Client of the last tests
]]

--[[ BEGIN STANDARD HEADER ]] --

-- Imports
local ipairs = ipairs
local pairs = pairs
local math_exp = math.exp
local math_frexp = math.frexp
local math_floor = math.floor
local math_min = math.min
local math_sqrt = math.sqrt
local math_log = math.log
local math_abs = math.abs
local table_sort = table.sort -- needed by _getclues and chi2_spamprob
local table_insert = table.insert -- needed by chi2_spamprob's evidence list
local LN2 = math.log(2) -- used frequently by chi-combining

-- Isolate the environment
setfenv(1, select(2, ...))

--[[ END STANDARD HEADER ]] --

local dbg = function() end
--[==[@debug@
--dbg = function(...) PrintLiteral(nil, ...) end
--@end-debug@]==]

local function chi2Q(x2, v)
	-- Return prob(chisq >= x2, with v degrees of freedom).
	-- v must be even.
	-- XXX If x2 is very large, exp(-m) will underflow to 0.
	local m = x2 / 2.0
	local sum = math_exp(-m)
	local term = sum
	-- Sum the first v/2 terms (i = 0 .. v/2-1) of the Poisson series,
	-- matching the Python original's range(1, v//2).
	for i = 1, math_floor(v / 2) - 1 do
		term = term * (m / i)
		sum = sum + term
	end
	-- With small x2 and large v, accumulated roundoff error, plus error in
	-- the platform exp(), can cause this to spill a few ULP above 1.0. For
	-- example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this
	-- point. Returning a value even a teensy bit over 1.0 is no good.
	return math_min(sum, 1.0)
end

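--[[ A quick sanity sketch (illustration only, not executed): with v = 2 the
     series above has a single term, so chi2Q reduces to exp(-x2/2):

         chi2Q(0, 2)  --> 1.0
         chi2Q(2, 2)  --> math.exp(-1), about 0.3679

     and, for fixed v, a growing x2 drives the survival probability toward 0. ]]
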
local function probability(db, record)
	-- Compute, store, and return prob(msg is spam | msg contains word).
	-- This is the Graham calculation, but stripped of biases, and
	-- stripped of clamping into 0.01 thru 0.99. The Bayesian
	-- adjustment following keeps them in a sane range, and one
	-- that naturally grows the more evidence there is to back up
	-- a probability.
	local spamcount = record.spamcount
	local hamcount = record.hamcount
	-- Try the cache first
	local pcache = db.probcache[spamcount] and db.probcache[spamcount][hamcount] or nil
	if pcache then return pcache end
	local nham = db.nham ~= 0 and db.nham or 1
	local nspam = db.nspam ~= 0 and db.nspam or 1
	local hamratio = hamcount / nham --assert hamcount <= nham, "Token seen in more ham than ham trained."
	local spamratio = spamcount / nspam --assert spamcount <= nspam, "Token seen in more spam than spam trained."
	local prob = spamratio / (hamratio + spamratio)
	local S = 0.45 --local S = options["Classifier", "unknown_word_strength"]
	local StimesX = S * 0.5 --StimesX = S * options["Classifier", "unknown_word_prob"]
	-- Now do Robinson's Bayesian adjustment.
	--
	--         s*x + n*p(w)
	-- f(w) = --------------
	--            s + n
	--
	-- I find this easier to reason about like so (equivalent when
	-- s != 0):
	--
	--        x - p
	--  p + ---------
	--       1 + n/s
	--
	-- IOW, it moves p a fraction of the distance from p to x, and
	-- less so the larger n is, or the smaller s is.
	local n = hamcount + spamcount
	prob = (StimesX + n * prob) / (S + n)
	-- Update the cache
	db.probcache[spamcount] = db.probcache[spamcount] or {}
	db.probcache[spamcount][hamcount] = prob
	return prob
end

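--[[ Worked example with hypothetical counts: a token seen in 3 of 100 spam
     messages and 0 of 100 ham gives spamratio = 0.03 and hamratio = 0, so the
     raw Graham prob is 1.0. With s = 0.45, x = 0.5 and n = 3, the Robinson
     adjustment pulls that back to

         (0.45 * 0.5 + 3 * 1.0) / (0.45 + 3) = 3.225 / 3.45 ~= 0.935

     i.e. strong evidence, but no longer absolute certainty from only three
     sightings. ]]
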
local function wordinfoget(db, word)
	return db.worddb["-"..word] or nil
end

-- The "-" prefix forces quoting in SavedVariables in Warhammer Online and is
-- left as-is for WoW - <Mera>
local function wordinfoset(db, word, record)
	if record.hamcount == 0 and record.spamcount == 0 then
		db.worddb["-"..word] = nil
		return
	end
	db.worddb["-"..word] = record
end

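--[[ For example, wordinfoset(db, "gold", { spamcount = 2, hamcount = 0 })
     stores the record under db.worddb["-gold"]; writing a record whose two
     counts are both 0 removes the key instead. ]]
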
local function worddistanceget(db, word)
	local record = wordinfoget(db, word)
	local prob = record == nil and 0.5 or probability(db, record) --prob = options["Classifier", "unknown_word_prob"]
	local distance = math_abs(prob - 0.5)
	return distance, prob, word, record
end

local function _getclues(db, wordstream)
	local minimum_prob_strength = 0.1 --options["Classifier", "minimum_prob_strength"]
	-- The all-unigram scheme just scores the tokens as-is. A set()
	-- is used to weed out duplicates at high speed.
	dbg(nil, "_getclues", wordstream)

	local clues = {}
	for _, word in ipairs(wordstream) do
		local tup = { worddistanceget(db, word) }
		if tup[1] >= minimum_prob_strength then
			clues[#clues + 1] = tup
		end
	end
	if #clues > 150 then
		-- Keep only the 150 strongest clues, mirroring the Python original's
		-- sort plus "del clues[0 : -max_discriminators]". The array must be
		-- shifted down and stay contiguous, or ipairs() stops early.
		table_sort(clues, function(a, b) return a[1] < b[1] end) -- weakest first
		local excess = #clues - 150
		for i = 1, 150 do
			clues[i] = clues[i + excess]
		end
		for i = 151, 150 + excess do
			clues[i] = nil
		end
	end
	-- Drop the distance; consumers read prob, word and record at 2, 3, 4.
	for _, tup in ipairs(clues) do tup[1] = nil end
	return clues
end

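--[[ Shape note: each clue starts life as the full tuple returned by
     worddistanceget, { distance, prob, word, record }; once the distance is
     dropped above, chi2_spamprob reads prob/word/record at indices 2/3/4. ]]
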
local function add_msg(db, wordstream, is_spam)
	dbg(nil, "add", wordstream, is_spam)
	db.probcache = {} -- nuke the prob cache
	if is_spam then
		db.nspam = db.nspam + 1
	else
		db.nham = db.nham + 1
	end
	for _, word in ipairs(wordstream) do
		local record = wordinfoget(db, word) or { spamcount = 0, hamcount = 0 }
		if is_spam then
			record.spamcount = record.spamcount + 1
		else
			record.hamcount = record.hamcount + 1
		end
		wordinfoset(db, word, record)
	end
end

local function verify_cache(db, wordstream, is_spam)
	-- Collect the records for every token in the phrase. If any token is
	-- missing, or was never counted on the requested side, the phrase has
	-- not been trained (or has changed slightly) - <Mera>
	local tempcache = {}
	for _, word in ipairs(wordstream) do
		local record = wordinfoget(db, word)
		if record == nil then return nil end
		local count = is_spam and record.spamcount or record.hamcount
		if count == 0 then return nil end
		tempcache[word] = record
	end
	return tempcache
end

local function remove_msg(db, wordstream, is_spam)
	db.probcache = {} -- nuke the prob cache
	dbg(nil, "remove", wordstream, is_spam)
	-- If one token does not exist, the phrase has never been trained and no
	-- tokens will be deleted - <Mera>
	local vcache = verify_cache(db, wordstream, is_spam)
	if vcache == nil then return 0 end
	for word, record in pairs(vcache) do
		if is_spam then
			record.spamcount = record.spamcount - 1
		else
			record.hamcount = record.hamcount - 1
		end
		wordinfoset(db, word, record)
	end
	if is_spam then
		if db.nspam > 0 then db.nspam = db.nspam - 1 end
		if db.cspam > 0 then db.cspam = db.cspam - 1 end
	else
		if db.nham > 0 then db.nham = db.nham - 1 end
		if db.cham > 0 then db.cham = db.cham - 1 end
	end
end

local function chi2_spamprob(db, wordstream, evidence)
	-- Return best-guess probability that wordstream is spam.
	-- wordstream is an iterable object producing words.
	-- The return value is a float in [0.0, 1.0].
	-- If the optional arg evidence is true, the return value is a pair
	-- probability, evidence where evidence is a list of
	-- (word, probability) pairs.

	-- We compute two chi-squared statistics, one for ham and one for
	-- spam. The sum-of-the-logs business is more sensitive to probs
	-- near 0 than to probs near 1, so the spam measure uses 1-p (so
	-- that high-spamprob words have greatest effect), and the ham
	-- measure uses p directly (so that low-spamprob words have greatest
	-- effect).

	-- For optimization, sum-of-logs == log-of-product, and f.p.
	-- multiplication is a lot cheaper than calling ln(). It's easy
	-- to underflow to 0.0, though, so we simulate unbounded dynamic
	-- range via frexp. The real product H = this H * 2**Hexp, and
	-- likewise the real product S = this S * 2**Sexp.
	local H, S = 1.0, 1.0
	local Hexp, Sexp = 0, 0
	local ln = math_log
	local prob, word, record
	dbg(nil, "spamprob", wordstream, evidence)

	local clues = _getclues(db, wordstream)
	for _, data in ipairs(clues) do
		local e
		prob, word, record = data[2], data[3], data[4]
		S = S * (1.0 - prob)
		H = H * prob
		if S < 1e-200 then -- prevent underflow
			S, e = math_frexp(S)
			Sexp = Sexp + e
		end
		if H < 1e-200 then -- prevent underflow
			H, e = math_frexp(H)
			Hexp = Hexp + e
		end
	end
	-- Compute the natural log of the product = sum of the logs:
	-- ln(x * 2**i) = ln(x) + i * ln(2).
	S = ln(S) + Sexp * LN2
	H = ln(H) + Hexp * LN2
	local n = #clues
	if n > 0 then -- note: a bare "if n then" is always true in Lua, even for 0
		S = 1.0 - chi2Q(-2.0 * S, 2 * n)
		H = 1.0 - chi2Q(-2.0 * H, 2 * n)
		-- How to combine these into a single spam score? We originally
		-- used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
		-- systematic problem is that we could end up being near-certain
		-- a thing was (for example) spam, even if S was small, provided
		-- that H was much smaller.
		-- Rob Hooft stared at these problems and invented the measure
		-- we use now, the simpler S-H, scaled into [0., 1.].
		prob = (S - H + 1.0) / 2.0
	else
		prob = 0.5
	end
	if evidence then
		-- Build the (word, prob) evidence list the way the Python original
		-- does, with the combined *H* and *S* measures prepended.
		local ev = {}
		for _, data in ipairs(clues) do
			ev[#ev + 1] = { data[3], data[2] }
		end
		table_sort(ev, function(a, b) return a[2] < b[2] end)
		table_insert(ev, 1, { "*S*", S })
		table_insert(ev, 1, { "*H*", H })
		return prob, ev
	else
		return prob
	end
end

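--[[ Why the frexp bookkeeping above works (illustration only): math.frexp(x)
     returns m, e with x == m * 2^e and m in [0.5, 1), so the log of the true
     product is always recoverable as math.log(m) + e * LN2, even when the
     product itself would have underflowed to 0 long ago. ]]
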
local defaults = { nham = 0, nspam = 0, cspam = 0, cham = 0, cunsure = 0 }

function GetClassifier(db)
	for k, v in pairs(defaults) do
		db[k] = db[k] or v
	end
	-- Table-valued fields get a fresh table per database, so two databases
	-- never share a word store or probability cache.
	db.probcache = db.probcache or {}
	db.worddb = db.worddb or {}
	return {
		getprob = function(...) return chi2_spamprob(db, ...) end,
		learn   = function(...) return add_msg(db, ...) end,
		unlearn = function(...) return remove_msg(db, ...) end,
	}
end
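
--[[ Usage sketch (hypothetical; assumes the caller tokenizes messages into a
     plain array of words and persists the db table, e.g. via SavedVariables):

         local store = {}
         local c = GetClassifier(store)
         c.learn({ "buy", "cheap", "gold" }, true)        -- train as spam
         c.learn({ "lf1m", "healer", "dungeon" }, false)  -- train as ham
         local p = c.getprob({ "cheap", "gold" })         -- spamminess in [0, 1]
]]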