关于敏感词过滤问题,这里先占个坑,后续再写一篇由浅入深的文章。
下面贴出的是用 Lua 实现的前缀树(Trie)法。注意:系统字符集需要设置为 UTF-8,Lua 脚本文件的编码也要设为 UTF-8,否则会出现乱码。另外还有一种不区分编码的实现,即按字节构建前缀树,但这样打码所用的符号个数就可能与实际敏感词的字符数不一致(一个多字节字符会被替换成多个符号)。
local path = "data.NGWords"
local db = require(path)
local NG = {}
--- Build the sensitive-word trie and mark the module as ready.
-- Stores the trie returned by NG.parse() in NG.root, then sets
-- NG.initFinished to true. Progress is reported on stdout via print.
function NG.init()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"
    print(banner, "begin init")
    local trie = NG.parse()
    NG.root = trie
    NG.initFinished = true
    print(banner, "init finish")
end
--- Build a prefix tree (trie) from the word table `db`.
-- Every entry whose `str` field is a non-empty string is split into
-- lowercased UTF-8 characters and inserted character by character; the node
-- reached by the last character is flagged as the end of a word.
-- @treturn table root node of the freshly built trie
function NG.parse()
    local root = {}
    for _, entry in pairs(db) do
        local word = entry.str
        if word and "" ~= word then
            local node = root
            -- ipairs, not pairs: the characters must be consumed in sequence
            -- order, and pairs() gives no ordering guarantee.
            for _, c in ipairs(NG.toLowerCharArray(word)) do
                local child = NG.getSubNode(node, c)
                if not child then
                    child = NG.createNode()
                    NG.addSubNode(node, c, child)
                end
                node = child
            end
            -- Never flag the root itself: that would make the empty prefix
            -- count as a word.
            if node ~= root then
                NG.setNodeIsEnd(node)
            end
        end
    end
    return root
end
--- Re-read the word table from disk and rebuild the trie.
-- Clears the cached module in package.loaded so require() loads it again,
-- then replaces NG.root. NG.initFinished is nil while the rebuild runs and
-- true once it has completed.
function NG.reload()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"
    print(banner, "begin reload")
    -- Drop the cached copy, otherwise require() would return the old table.
    package.loaded[path] = nil
    db = require(path)
    NG.initFinished = nil
    local rebuilt = NG.parse()
    NG.root = rebuilt
    NG.initFinished = true
    print(banner, "reload finish")
end
--- Lowercase the leading ASCII letter of `c`.
-- Intended for a single UTF-8 character. Only the bytes 'A'..'Z' are mapped;
-- multi-byte characters are returned unchanged.
-- @param c character (string) to convert
-- @return `c` with a leading 'A'..'Z' replaced by 'a'..'z'; `c` itself when
--         it is nil, not a string, empty, or does not start with such a letter
function NG.toLower(c)
    -- Empty or non-string input has no first byte to inspect; hand it back
    -- instead of raising on a nil comparison.
    if type(c) ~= "string" or "" == c then
        return c
    end
    local byte = string.byte(c, 1)
    -- 'A'..'Z' are all below 128, so such a byte is always a complete
    -- single-byte character and never the lead byte of a longer sequence.
    if byte >= string.byte('A') and byte <= string.byte('Z') then
        -- ASCII upper and lower case letters are 32 apart. Keep whatever
        -- follows the first byte so longer input is not silently truncated.
        return string.char(byte + 32) .. string.sub(c, 2)
    end
    return c
end
--- Create an empty trie node.
-- A node may later gain two fields: `isEnd` (true when a word ends here) and
-- `subNodes` (map from character to child node). Both start out absent.
-- @treturn table new node
function NG.createNode()
    local node = {}
    return node
end
--- Attach `child` to `node` under the character `c`.
-- The child map is created on first use; an existing child stored under the
-- same character is replaced.
-- @param node parent node
-- @param c character used as the key
-- @param child node to attach
function NG.addSubNode(node, c, child)
    local children = node.subNodes
    if not children then
        children = {}
        node.subNodes = children
    end
    children[c] = child
end
--- Flag `node` as the last character of a complete word.
-- @param node trie node to mark
function NG.setNodeIsEnd(node)
    node.isEnd = true
end
--- Tell whether a complete word ends at `node`.
-- The flag is stored explicitly instead of being derived from "has no
-- children": a node can end one word and still continue into a longer one
-- (e.g. "fuck" and "fuck you").
-- @param node trie node to inspect
-- @return true when a word ends here, nil otherwise
function NG.getNodeIsEnd(node)
    local ended = node.isEnd
    return ended
end
--- Look up the child of `node` stored under the character `c`.
-- @param node parent node
-- @param c character key
-- @return the child node, or nil when the node has no children or none for `c`
function NG.getSubNode(node, c)
    local children = node.subNodes
    if children then
        return children[c]
    end
    return nil
end
--- Decide how many bytes the UTF-8 character starting with `byte` occupies.
-- Always returns a positive integer so callers that advance by this amount
-- neither stall nor crash on malformed input.
-- This only inspects the lead byte; it does not verify that the following
-- bytes are valid continuation bytes.
-- @param byte numeric value (0..255) of the first byte of a character
-- @treturn number byte length of the character (1..6)
function NG.judgeByteCountByFirstUTF8Byte(byte)
    if byte < 0x80 then
        return 1 -- 0xxxxxxx: ASCII
    elseif byte < 0xC0 then
        -- 10xxxxxx: a continuation byte can never start a character. Consume
        -- it alone so it cannot swallow the character that follows it.
        return 1
    elseif byte < 0xE0 then
        return 2 -- 110xxxxx
    elseif byte < 0xF0 then
        return 3 -- 1110xxxx
    elseif byte < 0xF8 then
        return 4 -- 11110xxx
    elseif byte < 0xFC then
        return 5 -- 111110xx: legacy 5-byte form, kept for compatibility
    elseif byte < 0xFE then
        return 6 -- 1111110x: legacy 6-byte form, kept for compatibility
    end
    -- 0xFE and 0xFF never appear in UTF-8; treat them as one-byte junk
    -- instead of returning nil, which made every caller fail on `pos + nil`.
    return 1
end
--- Count the UTF-8 characters in `str`.
-- @param str string to measure (nil and "" count as empty)
-- @treturn number number of characters
-- @treturn number length of `str` in bytes (0 for nil or "")
function NG.charCount(str)
    if not str or "" == str then
        return 0, 0
    end
    local total = #str
    local chars = 0
    local offset = 1
    while offset <= total do
        -- Jump over the whole character whose lead byte sits at `offset`.
        offset = offset + NG.judgeByteCountByFirstUTF8Byte(string.byte(str, offset))
        chars = chars + 1
    end
    return chars, total
end
--- Extract the UTF-8 character that starts at byte offset `pos`.
-- @param str source string
-- @param pos byte offset of the character's first byte (defaults to 1)
-- @return the character, or nil when `str` is nil/"" or `pos` is past the end
-- @return byte length reported for that character, or nil in the same cases
function NG.charAt(str, pos)
    if not str or "" == str then
        return nil, nil
    end
    pos = pos or 1
    if pos > #str then
        return nil, nil
    end
    local width = NG.judgeByteCountByFirstUTF8Byte(string.byte(str, pos))
    return string.sub(str, pos, pos + width - 1), width
end
--- Split `str` into an array of UTF-8 characters.
-- @param str string to split (nil and "" yield an empty array)
-- @treturn table array of characters, one string per character
-- @treturn number number of characters in the array
function NG.toCharArray(str)
    local chars = {}
    local n = 0
    if not str or "" == str then
        return chars, n
    end
    local total = #str
    local offset = 1
    while offset <= total do
        local width = NG.judgeByteCountByFirstUTF8Byte(string.byte(str, offset))
        n = n + 1
        chars[n] = string.sub(str, offset, offset + width - 1)
        offset = offset + width
    end
    return chars, n
end
--- Split `str` into UTF-8 characters and lowercase the ASCII letters.
-- @param str string to split (nil and "" yield an empty array)
-- @treturn table array of characters with 'A'..'Z' mapped to 'a'..'z'
-- @treturn number number of characters in the array
function NG.toLowerCharArray(str)
    local chars, n = NG.toCharArray(str)
    for i = 1, n do
        local c = chars[i]
        -- Only a one-byte element can be an ASCII letter.
        if 1 == #c then
            chars[i] = NG.toLower(c)
        end
    end
    return chars, n
end
--- Tell whether a match must stop at `node`.
-- True only when a word ends at `node` and the trie has no continuation for
-- the next character.
-- @param node current trie node
-- @param nextC the character that follows in the input
-- @treturn boolean
function NG.shouldEnd(node, nextC)
    if NG.getNodeIsEnd(node) then
        return NG.getSubNode(node, nextC) == nil
    end
    return false
end
--- Find the next sensitive word, preferring the longest match.
-- Scans start positions from `startIndex` upward. At the first position where
-- any word begins, it reports the longest word that starts there.
-- @param charArray array of UTF-8 characters
-- @param arrayLength number of characters in `charArray`
-- @param startIndex first character index to try
-- @return index of the first matched character, or nil when nothing matches
-- @return number of matched characters, or nil when nothing matches
function NG.findMore(charArray, arrayLength, startIndex)
    local root = NG.root
    for first = startIndex, arrayLength do
        local node = root
        local cursor = first
        local lastHit = nil
        while node and cursor <= arrayLength do
            local nextNode = NG.getSubNode(node, NG.toLower(charArray[cursor]))
            if not nextNode then
                break
            end
            -- Remember the hit but keep walking: a longer word may follow.
            if NG.getNodeIsEnd(nextNode) then
                lastHit = cursor
            end
            node = nextNode
            cursor = cursor + 1
        end
        if lastHit then
            return first, lastHit - first + 1
        end
    end
    return nil, nil
end
--- Find the next sensitive word, preferring the shortest match.
-- Scans start positions from `startIndex` upward and returns as soon as any
-- word is completed.
-- @param charArray array of UTF-8 characters
-- @param arrayLength number of characters in `charArray`
-- @param startIndex first character index to try
-- @return index of the first matched character, or nil when nothing matches
-- @return number of matched characters, or nil when nothing matches
function NG.findLess(charArray, arrayLength, startIndex)
    local root = NG.root
    -- No trie yet (NG.init not called): report "no match", as NG.findMore
    -- does, instead of failing on a nil node.
    if not root then
        return nil, nil
    end
    for i = startIndex, arrayLength do
        local node = root
        for pos = i, arrayLength do
            node = NG.getSubNode(node, NG.toLower(charArray[pos]))
            if not node then
                break
            end
            if NG.getNodeIsEnd(node) then
                return i, pos - i + 1
            end
        end
    end
    return nil, nil
end
--- Find the next sensitive word at or after `startIndex`.
-- @param charArray array of UTF-8 characters
-- @param arrayLength number of characters in `charArray`
-- @param startIndex first character index to try
-- @param more truthy to prefer the longest match (NG.findMore), otherwise
--        the shortest one (NG.findLess)
-- @return start index and character count of the match, or nil, nil
function NG.findNext(charArray, arrayLength, startIndex, more)
    local finder = NG.findLess
    if more then
        finder = NG.findMore
    end
    return finder(charArray, arrayLength, startIndex)
end
--- Check whether `str` contains any sensitive word.
-- @param str text to check (nil and "" are reported as clean)
-- @treturn boolean true when at least one word is found
function NG.test(str)
    if not str or "" == str then
        return false
    end
    local chars, count = NG.toCharArray(str)
    -- Shortest-match search is enough: only the existence of a hit matters.
    local hit = NG.findNext(chars, count, 1)
    return hit and true or false
end
--- Replace every sensitive word in `str` with asterisks.
-- One "*" is written per matched character, not per byte.
-- @param str text to filter (nil and "" are returned unchanged)
-- @param more truthy to mask the longest match at each position, otherwise
--        the shortest one
-- @return the filtered text
function NG.mask(str, more)
    if not str or "" == str then
        return str
    end
    local chars, count = NG.toCharArray(str)
    local out = {}
    local cursor = 1
    local hitStart, hitLen = NG.findNext(chars, count, cursor, more)
    while hitStart and cursor <= count do
        -- Copy the clean stretch in front of the hit verbatim.
        for k = cursor, hitStart - 1 do
            out[#out + 1] = chars[k]
        end
        out[#out + 1] = string.rep("*", hitLen)
        cursor = hitStart + hitLen
        hitStart, hitLen = NG.findNext(chars, count, cursor, more)
    end
    -- Whatever follows the last hit is clean.
    for k = cursor, count do
        out[#out + 1] = chars[k]
    end
    return table.concat(out)
end
--- Run a small benchmark of NG.test and NG.mask over the word table.
-- Each non-empty word in `db` is fed to both functions `loop` times.
-- @param loop number of passes over the word table (defaults to 1)
-- @treturn table { tag = "ngtrie", items = { test = {n, t}, mask = {n, t} },
--          mem0, mem1 }, where `n` is the number of calls made, `t` the
--          os.clock() time spent, and mem0/mem1 the collectgarbage("count")
--          readings before and after; all measurements are rounded to three
--          decimals
function NG.stat(loop)
    loop = loop or 1
    local format = "%.3f"

    -- Round a measurement to three decimals.
    local function round3(value)
        return tonumber(string.format(format, value))
    end

    -- Time `fn` over every non-empty word, `loop` times.
    local function bench(fn)
        local item = { n = 0 }
        local started = os.clock()
        for _ = 1, loop do
            for _, v in pairs(db) do
                if v.str and "" ~= v.str then
                    item.n = item.n + 1
                    fn(v.str)
                end
            end
        end
        item.t = round3(os.clock() - started)
        return item
    end

    local items = {}
    local stat = { tag = "ngtrie", items = items }
    collectgarbage("collect")
    -- Rounded like mem1 so both readings are directly comparable.
    stat.mem0 = round3(collectgarbage("count"))
    items["test"] = bench(NG.test)
    items["mask"] = bench(NG.mask)
    stat.mem1 = round3(collectgarbage("count"))
    return stat
end
--- Demo driver: dump the word table, build the trie, then print the results
-- of NG.test and NG.mask for a handful of sample inputs.
local function test()
    print("========:ngwords")
    for id, v in pairs(db) do
        print(string.format("\t%s:%s", id, v.str or "nil"))
    end
    NG.init()

    print("========:test")
    for _, input in ipairs({ "fuck", "hi,boy" }) do
        print(string.format("\ttest(\"%s\"):%s", input, NG.test(input) and "true" or "false"))
    end

    print("========:mask")
    -- Each case is { input, more }.
    local cases = {
        { "fuck you", false },
        { "fuck you", true },
        { "hi,boy", true },
        { "hi,boy,fuck you", true },
        { "fuck you,boy", true },
        { "hi,fuck you,boy", true },
        { "草你妈", false },
        { "草你妈", true },
    }
    for _, case in ipairs(cases) do
        local input, more = case[1], case[2]
        print(string.format("\tmask(\"%s\", %s):%s", input, more and "true" or "false", NG.mask(input, more)))
    end
end
test()
敏感词库文件 NGWords.lua(即上面代码中通过 require("data.NGWords") 加载的模块)的内容类似下面这样:
-- Sensitive-word table; the trie script loads it with require("data.NGWords").
-- Each entry's `str` is one word to filter. NG.parse skips entries whose
-- `str` is nil or "".
-- NOTE(review): every entry carries id = 1 while the keys run 1..5 - confirm
-- whether `id` is meant to mirror the key; the trie code never reads it.
local Items = {
[1] = { id = 1, str = "fuck" },
[2] = { id = 1, str = "fu(k" },
[3] = { id = 1, str = "fuck you" },
[4] = { id = 1, str = "草你" },
[5] = { id = 1, str = "草你妈" },
}
return Items
测试输出结果:
========:ngwords
2:fu(k
3:fuck you
1:fuck
4:草你
5:草你妈
++++++++++++++++++++===========================+++++++++++++++++++++ begin init
++++++++++++++++++++===========================+++++++++++++++++++++ init finish
========:test
test("fuck"):true
test("hi,boy"):false
========:mask
mask("fuck you", false):**** you
mask("fuck you", true):********
mask("hi,boy", true):hi,boy
mask("hi,boy,fuck you", true):hi,boy,********
mask("fuck you,boy", true):********,boy
mask("hi,fuck you,boy", true):hi,********,boy
mask("草你妈", false):**妈
mask("草你妈", true):***