對於敏感詞問題,先佔個坑,後續再寫一篇由初階到進階的文章。
下面貼出的是以 Lua 實作的字首樹(Trie)法。注意:系統字元集需要設定為 UTF-8,Lua 腳本檔案的編碼也要設為 UTF-8,否則會出現亂碼。也有一種不區分編碼的實作方式,就是按位元組建構字首樹,但這樣一來,遮蔽用的符號個數與實際敏感詞的字元數就可能不一致了。
-- Module name of the sensitive-word list. Kept in a variable because
-- NG.reload() uses it both to evict the cached module and to require it again.
local path = "data.NGWords"
-- The loaded word list; the code below iterates it with pairs() and reads
-- each entry's `str` field.
local db = require(path)
-- Namespace table that holds every function of the word filter.
local NG = {}
--- Build the trie from the currently loaded word list.
-- Stores the trie in NG.root and sets NG.initFinished to true once done.
-- Progress is reported on stdout before and after the build.
function NG.init()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"
    print(banner, "begin init")
    NG.root = NG.parse()
    NG.initFinished = true
    print(banner, "init finish")
end
--- Build a trie (prefix tree) from every non-empty `str` in the word list.
-- Each word is split into lower-cased UTF-8 characters; every character
-- becomes one edge of the trie and the node reached by the last character
-- is flagged as the end of a word.
-- @return table root node of the freshly built trie
function NG.parse()
    local root = {}
    for _, entry in pairs(db) do
        local word = entry.str
        if word and "" ~= word then
            local node = root
            local chars, count = NG.toLowerCharArray(word)
            -- Walk the characters with a numeric loop: the trie path depends
            -- on character order, and pairs() gives no ordering guarantee.
            for i = 1, count do
                local c = chars[i]
                local child = NG.getSubNode(node, c)
                if not child then
                    child = NG.createNode()
                    NG.addSubNode(node, c, child)
                end
                node = child
            end
            -- Never flag the root itself; a word always has at least one
            -- character, so this only guards against a degenerate split.
            if node ~= root then
                NG.setNodeIsEnd(node)
            end
        end
    end
    return root
end
--- Reload the word list from disk and rebuild the trie.
-- The cached module is evicted from package.loaded so that require() reads
-- it again. NG.initFinished is cleared while the trie is being rebuilt and
-- set back to true afterwards.
function NG.reload()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"
    print(banner, "begin reload")

    -- Force require() to re-execute the word-list module.
    package.loaded[path] = nil
    db = require(path)

    NG.initFinished = nil
    NG.root = NG.parse()
    NG.initFinished = true

    print(banner, "reload finish")
end
--- Lower-case a single character if it is an ASCII upper-case letter.
-- Only the first byte of `c` is inspected; anything that is not a one-byte
-- character in the range 'A'..'Z' is returned unchanged.
-- @param c string one UTF-8 character
-- @return string the lower-cased letter, or `c` itself
function NG.toLower(c)
    local first = string.byte(c, 1)
    if NG.judgeByteCountByFirstUTF8Byte(first) ~= 1 then
        return c
    end
    if first < string.byte('A') or first > string.byte('Z') then
        return c
    end
    -- 'a' - 'A' == 32 in ASCII.
    return string.char(first + 32)
end
--- Create an empty trie node.
-- A node may later receive two fields: `isEnd` (true when a word ends here)
-- and `subNodes` (map from character to child node). Both start out absent.
-- @return table new node
function NG.createNode()
    return {}
end
--- Attach `child` to `node` under the character `c`.
-- The child map is created lazily on first use.
-- @param node table parent node
-- @param c string character labelling the edge
-- @param child table node to attach
function NG.addSubNode(node, c, child)
    node.subNodes = node.subNodes or {}
    node.subNodes[c] = child
end
--- Flag `node` as the last character of a complete word.
-- @param node table trie node to flag
function NG.setNodeIsEnd(node)
    node["isEnd"] = true
end
--- Tell whether a complete word ends at `node`.
-- The explicit flag is used (rather than "node has no children") because a
-- word can be a prefix of a longer word.
-- @param node table trie node
-- @return true when a word ends here, nil otherwise
function NG.getNodeIsEnd(node)
    local ended = node.isEnd
    return ended
end
--- Look up the child of `node` reached through character `c`.
-- @param node table trie node
-- @param c string character labelling the edge
-- @return table|nil the child node, or nil when there is none
function NG.getSubNode(node, c)
    local children = node.subNodes
    if children then
        return children[c]
    end
    return nil
end
--- Determine how many bytes a UTF-8 character occupies from its first byte.
-- Always returns a positive integer so that callers stepping through a
-- string with `pos = pos + count` can neither crash nor stall, even on
-- malformed input. Bytes that cannot start a character (a stray continuation
-- byte 0x80-0xBF, or 0xFE / 0xFF) count as a one-byte character of their own;
-- this keeps them from swallowing the characters that follow, which would
-- let a sensitive word slip past the filter.
-- @param byte number value of the first byte (0-255)
-- @return number byte length of the character starting with `byte`
function NG.judgeByteCountByFirstUTF8Byte(byte)
    if byte < 0x80 then
        return 1 -- ASCII
    elseif byte < 0xC0 then
        return 1 -- stray continuation byte: not a valid lead byte
    elseif byte < 0xE0 then
        return 2
    elseif byte < 0xF0 then
        return 3
    elseif byte < 0xF8 then
        return 4
    elseif byte < 0xFC then
        return 5 -- legacy 5-byte form, kept for backward compatibility
    elseif byte < 0xFE then
        return 6 -- legacy 6-byte form, kept for backward compatibility
    end
    -- 0xFE and 0xFF never appear in UTF-8; treat them as a single byte.
    return 1
end
--- Count the UTF-8 characters in a string.
-- @param str string|nil text to measure
-- @return number character count (0 for nil or empty input)
-- @return number byte length of `str` (0 for nil or empty input)
function NG.charCount(str)
    if not str or str == "" then
        return 0, 0
    end
    local byteLength = #str
    local chars = 0
    local offset = 1
    while offset <= byteLength do
        -- Jump over one whole character per iteration.
        offset = offset + NG.judgeByteCountByFirstUTF8Byte(string.byte(str, offset))
        chars = chars + 1
    end
    return chars, byteLength
end
--- Extract the UTF-8 character that starts at byte offset `pos`.
-- @param str string|nil text to read from
-- @param pos number|nil 1-based byte offset (defaults to 1)
-- @return string|nil the character, or nil for empty input / offset past the end
-- @return number|nil byte length of that character, or nil
function NG.charAt(str, pos)
    if not str or str == "" then
        return nil, nil
    end
    local start = pos or 1
    if start > #str then
        return nil, nil
    end
    local width = NG.judgeByteCountByFirstUTF8Byte(string.byte(str, start))
    return string.sub(str, start, start + width - 1), width
end
--- Split a string into an array of UTF-8 characters.
-- @param str string|nil text to split
-- @return table array of characters (empty for nil or empty input)
-- @return number number of characters in the array
function NG.toCharArray(str)
    local chars = {}
    local n = 0
    if not str or str == "" then
        return chars, n
    end
    local byteLength = #str
    local offset = 1
    while offset <= byteLength do
        local width = NG.judgeByteCountByFirstUTF8Byte(string.byte(str, offset))
        local stop = offset + width - 1
        n = n + 1
        chars[n] = string.sub(str, offset, stop)
        offset = stop + 1
    end
    return chars, n
end
--- Split a string into UTF-8 characters, lower-casing ASCII letters.
-- Multi-byte characters are left untouched.
-- @param str string|nil text to split
-- @return table array of characters (empty for nil or empty input)
-- @return number number of characters in the array
function NG.toLowerCharArray(str)
    local chars, n = NG.toCharArray(str)
    for i = 1, n do
        -- NG.toLower only alters one-byte 'A'..'Z' characters.
        chars[i] = NG.toLower(chars[i])
    end
    return chars, n
end
--- Tell whether a match must stop at `node`.
-- That is the case when a word ends at `node` and `nextC` does not continue
-- any longer word from it.
-- @param node table current trie node
-- @param nextC string the character that follows in the input
-- @return boolean
function NG.shouldEnd(node, nextC)
    if NG.getNodeIsEnd(node) then
        return NG.getSubNode(node, nextC) == nil
    end
    return false
end
--- Find the next sensitive word, preferring the longest match.
-- Scans start positions from `startIndex`; at the first position where any
-- word matches, the longest word starting there is reported.
-- @param charArray table array of UTF-8 characters
-- @param arrayLength number number of characters in `charArray`
-- @param startIndex number first index to try
-- @return number|nil index where the match starts, or nil when none is found
-- @return number|nil length of the match in characters, or nil
function NG.findMore(charArray, arrayLength, startIndex)
    local root = NG.root
    for first = startIndex, arrayLength do
        local node = root
        local cursor = first
        local lastEnd = nil -- index of the last character that completed a word
        while node and cursor <= arrayLength do
            node = NG.getSubNode(node, NG.toLower(charArray[cursor]))
            if not node then
                break
            end
            if NG.getNodeIsEnd(node) then
                -- Remember it, but keep walking in case a longer word matches.
                lastEnd = cursor
            end
            cursor = cursor + 1
        end
        if lastEnd then
            return first, lastEnd - first + 1
        end
    end
    return nil, nil
end
--- Find the next sensitive word, preferring the shortest match.
-- Scans start positions from `startIndex` and returns as soon as any word is
-- completed.
-- @param charArray table array of UTF-8 characters
-- @param arrayLength number number of characters in `charArray`
-- @param startIndex number first index to try
-- @return number|nil index where the match starts, or nil when none is found
-- @return number|nil length of the match in characters, or nil
function NG.findLess(charArray, arrayLength, startIndex)
    local root = NG.root
    for first = startIndex, arrayLength do
        local current = root
        for cursor = first, arrayLength do
            current = NG.getSubNode(current, NG.toLower(charArray[cursor]))
            if not current then
                break
            end
            if NG.getNodeIsEnd(current) then
                return first, cursor - first + 1
            end
        end
    end
    return nil, nil
end
--- Find the next sensitive word using the requested matching strategy.
-- @param charArray table array of UTF-8 characters
-- @param arrayLength number number of characters in `charArray`
-- @param startIndex number first index to try
-- @param more boolean|nil truthy selects longest match, otherwise shortest
-- @return number|nil index where the match starts, or nil when none is found
-- @return number|nil length of the match in characters, or nil
function NG.findNext(charArray, arrayLength, startIndex, more)
    local finder = NG.findLess
    if more then
        finder = NG.findMore
    end
    return finder(charArray, arrayLength, startIndex)
end
--- Check whether a string contains any sensitive word.
-- Uses shortest-match search, since only the existence of a hit matters.
-- @param str string|nil text to check
-- @return boolean true when at least one sensitive word occurs
function NG.test(str)
    if not str or str == "" then
        return false
    end
    local chars, n = NG.toCharArray(str)
    local hitStart = NG.findNext(chars, n, 1)
    return hitStart ~= nil
end
--- Replace every sensitive word in a string with asterisks.
-- One "*" is emitted per masked character, so the character count of the
-- result equals that of the input.
-- @param str string|nil text to mask (nil or empty input is returned as is)
-- @param more boolean|nil truthy masks the longest match, otherwise the shortest
-- @return string|nil masked text
function NG.mask(str, more)
    if not str or str == "" then
        return str
    end
    local chars, n = NG.toCharArray(str)
    local out = {}
    local cursor = 1
    local hitStart, hitLen = NG.findNext(chars, n, cursor, more)
    while hitStart and cursor <= n do
        -- Copy the clean text that precedes the hit.
        for k = cursor, hitStart - 1 do
            out[#out + 1] = chars[k]
        end
        -- Mask the hit itself.
        out[#out + 1] = string.rep("*", hitLen)
        cursor = hitStart + hitLen
        hitStart, hitLen = NG.findNext(chars, n, cursor, more)
    end
    -- Copy whatever follows the last hit.
    for k = cursor, n do
        out[#out + 1] = chars[k]
    end
    return table.concat(out)
end
--- Run a small benchmark of NG.test and NG.mask over the word list.
-- Every non-empty word is fed to each function `loop` times.
-- @param loop number how many passes to make over the word list
-- @return table { tag = "ngtrie", mem0 = KB in use before the run,
--   mem1 = KB in use after the run (3 decimals),
--   items = { test = {n, t}, mask = {n, t} } } where `n` is the number of
--   calls made and `t` the CPU seconds spent (3 decimals)
function NG.stat(loop)
    local format = "%.3f"
    local function round3(value)
        return tonumber(string.format(format, value))
    end

    local items = {}
    local stat = { tag = "ngtrie", items = items }
    collectgarbage("collect")
    stat.mem0 = collectgarbage("count")

    -- Time `fn` over the whole word list and record the result under `name`.
    local function bench(name, fn)
        local item = { n = 0 }
        items[name] = item
        local started = os.clock()
        for _ = 1, loop do
            for _, v in pairs(db) do
                if v.str and "" ~= v.str then
                    item.n = item.n + 1
                    fn(v.str)
                end
            end
        end
        item.t = round3(os.clock() - started)
    end

    bench("test", NG.test)
    bench("mask", NG.mask)

    stat.mem1 = round3(collectgarbage("count"))
    return stat
end
--- Smoke test: dump the word list, build the trie, then print the result of
-- NG.test and NG.mask for a handful of sample inputs.
local function test()
    print("========:ngwords")
    for id, v in pairs(db) do
        print(string.format("\t%s:%s", id, v.str or "nil"))
    end
    NG.init()

    print("========:test")
    for _, input in ipairs({ "fuck", "hi,boy" }) do
        print(string.format("\ttest(\"%s\"):%s", input, NG.test(input) and "true" or "false"))
    end

    print("========:mask")
    -- Each case is { input, more }.
    local cases = {
        { "fuck you", false },
        { "fuck you", true },
        { "hi,boy", true },
        { "hi,boy,fuck you", true },
        { "fuck you,boy", true },
        { "hi,fuck you,boy", true },
        { "草你媽", false },
        { "草你媽", true },
    }
    for _, case in ipairs(cases) do
        local input, more = case[1], case[2]
        print(string.format("\tmask(\"%s\", %s):%s", input or "nil", more and "true" or "false", NG.mask(input, more)))
    end
end
test()
敏感詞庫,NGWords.lua檔案類似下面這樣:
-- Sample sensitive-word list. Each entry's `str` field is the word or phrase
-- to be filtered.
-- NOTE(review): every entry carries id = 1 although the table keys run 1..5;
-- this looks like a copy-paste slip — confirm whether `id` should mirror the key.
local Items = {
[1] = { id = 1, str = "fuck" },
[2] = { id = 1, str = "fu(k" },
[3] = { id = 1, str = "fuck you" },
[4] = { id = 1, str = "草你" },
[5] = { id = 1, str = "草你媽" },
}
return Items
測試輸出結果:
========:ngwords
2:fu(k
3:fuck you
1:fuck
4:草你
5:草你媽
++++++++++++++++++++===========================+++++++++++++++++++++ begin init
++++++++++++++++++++===========================+++++++++++++++++++++ init finish
========:test
test("fuck"):true
test("hi,boy"):false
========:mask
mask("fuck you", false):**** you
mask("fuck you", true):********
mask("hi,boy", true):hi,boy
mask("hi,boy,fuck you", true):hi,boy,********
mask("fuck you,boy", true):********,boy
mask("hi,fuck you,boy", true):hi,********,boy
mask("草你媽", false):**媽
mask("草你媽", true):***