天天看点

lua 前缀树法敏感词测试和打码

关于敏感词过滤问题,这里先占个坑,后续再写一篇由浅入深的文章。

下面贴出的是用 Lua 实现的前缀树(Trie)法。注意:系统字符集需要设置为 UTF-8,Lua 脚本文件的编码也要设为 UTF-8,否则会出现乱码。另有一种不区分编码的实现方式,即按字节构建前缀树,但这样打码时"*"的个数就可能与敏感词的实际字符数不一致。

-- Module name of the sensitive-word list. Kept in a variable because
-- NG.reload clears this entry from package.loaded and requires it again.
local path = "data.NGWords"
-- Word database: a table whose values are records; the code below reads the
-- `str` field of each record as one sensitive word.
local db = require(path)

-- Namespace table that holds every function of the sensitive-word filter.
local NG = {}

-- Build the prefix tree from the word database.
-- Stores the tree in NG.root and sets NG.initFinished to true when done.
-- Prints a banner line before and after the build.
function NG.init()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"
    print(banner, "begin init")
    local trie = NG.parse()
    NG.root = trie
    NG.initFinished = true
    print(banner, "init finish")
end

-- Build a prefix tree (trie) from every non-empty `str` in the word database.
--
-- Each word is lower-cased (ASCII only) and split into UTF-8 characters; one
-- trie level is created per character, and the node of the last character is
-- flagged as a word end.
--
-- Returns the root node (a plain table; children live under `subNodes`).
function NG.parse()
    local root = {}
    -- The order in which words are inserted does not matter, so pairs is fine here.
    for _, v in pairs(db) do
        local word = v.str
        if word and "" ~= word then
            local node = root
            local chars, count = NG.toLowerCharArray(word)
            -- The characters of a word MUST be visited in sequence order.
            -- pairs() gives no ordering guarantee, so use an indexed loop.
            for i = 1, count do
                local c = chars[i]
                local child = NG.getSubNode(node, c)
                if not child then
                    child = NG.createNode()
                    NG.addSubNode(node, c, child)
                end
                node = child
            end
            -- Never flag the root itself (would only happen if no character
            -- was produced for the word).
            if node ~= root then
                NG.setNodeIsEnd(node)
            end
        end
    end
    return root
end

-- Reload the word database from disk and rebuild the prefix tree.
-- NG.initFinished is cleared while the tree is rebuilt and set back to true
-- afterwards. Prints a banner line before and after the reload.
function NG.reload()
    local banner = "++++++++++++++++++++===========================+++++++++++++++++++++"
    print(banner, "begin reload")

    -- Drop the cached module so that require() executes the file again.
    package.loaded[path] = nil
    db = require(path)

    NG.initFinished = nil
    local trie = NG.parse()
    NG.root = trie
    NG.initFinished = true

    print(banner, "reload finish")
end

-- Lower-case a single character if it is an ASCII upper-case letter.
-- c: a string; only its first byte is inspected.
-- Returns the one-byte lower-case letter for 'A'..'Z', otherwise c unchanged
-- (multi-byte characters and all other single bytes pass through).
function NG.toLower(c)
    local first = string.byte(c, 1)
    if NG.judgeByteCountByFirstUTF8Byte(first) ~= 1 then
        return c
    end
    if first < string.byte('A') or first > string.byte('Z') then
        return c
    end
    -- 'a' - 'A' == 32 in ASCII.
    return string.char(first + 32)
end
-- Create an empty trie node.
-- A node may later receive two fields: `isEnd` (true when a word ends here)
-- and `subNodes` (table mapping a character to its child node). Both start
-- out absent.
function NG.createNode()
    return {}
end
-- Attach `child` to `node` under the character key `c`.
-- The `subNodes` table is created on first use.
function NG.addSubNode(node, c, child)
    local children = node.subNodes
    if not children then
        children = {}
        node.subNodes = children
    end
    children[c] = child
end
-- Flag `node` as the last character of a complete word.
function NG.setNodeIsEnd(node)
    node.isEnd = true
end
-- Tell whether a complete word ends at `node`.
-- Returns true when the node was flagged by NG.setNodeIsEnd, nil otherwise.
function NG.getNodeIsEnd(node)
    return node.isEnd
    -- Alternative kept for reference: treat a node without children as an end.
    -- It is not equivalent: NG.parse flags the last node of every word, and
    -- that node can also have children when a longer word shares the prefix.
    --return not node.subNodes
end
-- Look up the child of `node` stored under character `c`.
-- Returns the child node, or nil when the node has no children or no child
-- for that character.
function NG.getSubNode(node, c)
    local children = node.subNodes
    if children then
        return children[c]
    end
    return nil
end

-- Return how many bytes the UTF-8 character starting with `byte` occupies.
--
-- byte: numeric value (0..255) of the first byte of a character.
-- Returns an integer >= 1; the result is never nil, so callers that advance a
-- position by it always make progress.
--
-- Lead-byte ranges:
--   0..127    -> 1  (ASCII)
--   192..223  -> 2
--   224..239  -> 3
--   240..247  -> 4
--   248..251  -> 5  (legacy form, kept for backward compatibility)
--   252..253  -> 6  (legacy form, kept for backward compatibility)
-- Bytes that can never start a character -- stray continuation bytes
-- (128..191) and 254/255 -- are reported as a 1-byte unit. This lets
-- malformed input be skipped one byte at a time instead of swallowing the
-- following byte or returning nil (which made callers fail on `pos + nil`).
function NG.judgeByteCountByFirstUTF8Byte(byte)
    if byte < 128 then
        return 1
    elseif byte < 192 then
        -- Continuation byte without a lead byte: treat as one opaque byte.
        return 1
    elseif byte < 224 then
        return 2
    elseif byte < 240 then
        return 3
    elseif byte < 248 then
        return 4
    elseif byte < 252 then
        return 5
    elseif byte < 254 then
        return 6
    end
    -- 254 and 255 never appear in UTF-8.
    return 1
end

-- Count the UTF-8 characters in `str`.
-- Returns two values: the character count and the byte length.
-- For nil or the empty string both values are 0.
function NG.charCount(str)
    if not str or "" == str then
        return 0, 0
    end
    local total = #str
    local chars = 0
    local offset = 1
    while offset <= total do
        -- Jump over one whole character, whatever its byte width.
        offset = offset + NG.judgeByteCountByFirstUTF8Byte(string.byte(str, offset))
        chars = chars + 1
    end
    return chars, total
end

-- Extract the UTF-8 character that starts at byte position `pos`.
-- str: the source string.
-- pos: 1-based byte offset; defaults to 1.
-- Returns the character and its byte width, or nil, nil when `str` is
-- nil/empty or `pos` lies beyond the end of the string.
function NG.charAt(str, pos)
    if not str or "" == str then
        return nil, nil
    end
    pos = pos or 1
    if pos > #str then
        return nil, nil
    end
    local width = NG.judgeByteCountByFirstUTF8Byte(string.byte(str, pos))
    return string.sub(str, pos, pos + width - 1), width
end

-- Split `str` into an array of UTF-8 characters.
-- Returns two values: the array (one string per character) and its length.
-- For nil or the empty string an empty array and 0 are returned.
function NG.toCharArray(str)
    local chars = {}
    local n = 0
    if not str or "" == str then
        return chars, n
    end
    local offset, total = 1, #str
    while offset <= total do
        local width = NG.judgeByteCountByFirstUTF8Byte(string.byte(str, offset))
        n = n + 1
        chars[n] = string.sub(str, offset, offset + width - 1)
        offset = offset + width
    end
    return chars, n
end

-- Split `str` into an array of UTF-8 characters with ASCII letters
-- lower-cased.
-- Returns two values: the array and its length ({} and 0 for nil or "").
function NG.toLowerCharArray(str)
    local chars, n = NG.toCharArray(str)
    for i = 1, n do
        -- NG.toLower only rewrites one-byte 'A'..'Z'; multi-byte characters
        -- come back unchanged.
        chars[i] = NG.toLower(chars[i])
    end
    return chars, n
end

-- Tell whether a match must stop at `node`.
-- Returns true only when a word ends at `node` AND `node` has no child for
-- the next character `nextC`; false otherwise.
function NG.shouldEnd(node, nextC)
    if NG.getNodeIsEnd(node) then
        return NG.getSubNode(node, nextC) == nil
    end
    return false
end

-- Find the next sensitive word, preferring the LONGEST match at the earliest
-- starting position.
-- charArray:   array of characters (as produced by NG.toCharArray).
-- arrayLength: number of entries in charArray.
-- startIndex:  first index to try as the start of a match.
-- Returns the start index and the match length in characters, or nil, nil
-- when nothing matches.
function NG.findMore(charArray, arrayLength, startIndex)
    local trie = NG.root
    for first = startIndex, arrayLength do
        local lastEnd = nil
        local node = trie
        local cursor = first
        -- Walk as deep as the input allows, remembering the furthest position
        -- at which a complete word ended.
        while node and cursor <= arrayLength do
            node = NG.getSubNode(node, NG.toLower(charArray[cursor]))
            if node and NG.getNodeIsEnd(node) then
                lastEnd = cursor
            end
            cursor = cursor + 1
        end
        if lastEnd then
            return first, lastEnd - first + 1
        end
    end
    return nil, nil
end

-- Find the next sensitive word, preferring the SHORTEST match at the earliest
-- starting position.
-- charArray:   array of characters (as produced by NG.toCharArray).
-- arrayLength: number of entries in charArray.
-- startIndex:  first index to try as the start of a match.
-- Returns the start index and the match length in characters, or nil, nil
-- when nothing matches.
function NG.findLess(charArray, arrayLength, startIndex)
    local trie = NG.root
    for first = startIndex, arrayLength do
        local current = trie
        local cursor = first
        while cursor <= arrayLength do
            current = NG.getSubNode(current, NG.toLower(charArray[cursor]))
            if not current then
                break
            end
            -- Stop at the first complete word met on the way down.
            if NG.getNodeIsEnd(current) then
                return first, cursor - first + 1
            end
            cursor = cursor + 1
        end
    end
    return nil, nil
end

-- Find the next sensitive word starting at or after `startIndex`.
-- more: truthy selects longest-match search (NG.findMore); falsy selects
--       shortest-match search (NG.findLess).
-- Returns whatever the chosen search returns: start index and length in
-- characters, or nil, nil.
function NG.findNext(charArray, arrayLength, startIndex, more)
    local finder = NG.findLess
    if more then
        finder = NG.findMore
    end
    return finder(charArray, arrayLength, startIndex)
end

-- Check whether `str` contains at least one sensitive word.
-- Returns true when a word is found, false otherwise (including for nil or
-- the empty string). Uses the shortest-match search.
function NG.test(str)
    if not str or "" == str then
        return false
    end
    local chars, n = NG.toCharArray(str)
    local hit = NG.findNext(chars, n, 1)
    return hit ~= nil
end

-- Replace every sensitive word in `str` with asterisks, one "*" per
-- character of the matched word.
-- more: truthy masks the longest match at each position, falsy the shortest.
-- Returns the masked string; nil or "" is returned unchanged.
function NG.mask(str, more)
    if not str or "" == str then
        return str
    end
    local chars, total = NG.toCharArray(str)
    local out = {}
    local cursor = 1
    while cursor <= total do
        local hitStart, hitLen = NG.findNext(chars, total, cursor, more)
        if not hitStart then
            break
        end
        -- Copy the clean text that precedes the match.
        for k = cursor, hitStart - 1 do
            out[#out + 1] = chars[k]
        end
        for _ = 1, hitLen do
            out[#out + 1] = "*"
        end
        cursor = hitStart + hitLen
    end
    -- Copy whatever remains after the last match.
    for k = cursor, total do
        out[#out + 1] = chars[k]
    end
    return table.concat(out, "")
end

-- Run a small benchmark of NG.test and NG.mask over every word in the
-- database.
-- loop: number of passes over the database for each benchmark.
-- Returns a table:
--   tag   = "ngtrie"
--   mem0  = collectgarbage("count") right after a full collection
--   mem1  = collectgarbage("count") after both benchmarks, rounded to 3 decimals
--   items = { test = { n = calls, t = CPU seconds }, mask = { n = ..., t = ... } }
function NG.stat(loop)
    local format = "%.3f"
    local items = {}
    local stat = { tag = "ngtrie", items = items }

    -- Time `fn` over `loop` passes of the word list and record it under `name`.
    local function bench(name, fn)
        local item = { n = 0 }
        items[name] = item
        local started = os.clock()
        for _ = 1, loop do
            for _, v in pairs(db) do
                if v.str and "" ~= v.str then
                    item.n = item.n + 1
                    fn(v.str)
                end
            end
        end
        item.t = tonumber(string.format(format, os.clock() - started))
    end

    collectgarbage("collect")
    stat.mem0 = collectgarbage("count")

    bench("test", NG.test)
    bench("mask", NG.mask)

    stat.mem1 = tonumber(string.format(format, collectgarbage("count")))

    return stat
end


-- Demo driver: dumps the word list, builds the trie, then prints the result
-- of NG.test and NG.mask for a fixed set of sample inputs.
local function test()
	print("========:ngwords")
	for id, v in pairs(db) do
		print(string.format("\t%s:%s", id, v.str or "nil"))
	end
	NG.init()

	print("========:test")
	local testInputs = { "fuck", "hi,boy" }
	for _, input in ipairs(testInputs) do
		print(string.format("\ttest(\"%s\"):%s", input, NG.test(input) and "true" or "false"))
	end

	print("========:mask")
	-- Each case is { input, more }.
	local maskCases = {
		{ "fuck you", false },
		{ "fuck you", true },
		{ "hi,boy", true },
		{ "hi,boy,fuck you", true },
		{ "fuck you,boy", true },
		{ "hi,fuck you,boy", true },
		{ "草你妈", false },
		{ "草你妈", true },
	}
	for _, case in ipairs(maskCases) do
		local input, more = case[1], case[2]
		print(string.format("\tmask(\"%s\", %s):%s", input or "nil", more and "true" or "false", NG.mask(input, more)))
	end
end
test()
           

敏感词库,NGWords.lua文件类似下面这样:

-- Sample sensitive-word list. Each entry is a record whose `str` field is
-- one word to detect; the filter code reads only `str`.
-- NOTE(review): every record carries id = 1 while the table keys run 1..5 —
-- looks like a copy-paste leftover; confirm whether `id` should match the key.
local Items = {
	[1] = { id = 1, str = "fuck" },
	[2] = { id = 1, str = "fu(k" },
	[3] = { id = 1, str = "fuck you" },
	[4] = { id = 1, str = "草你" },
	[5] = { id = 1, str = "草你妈" },
}

return Items
           

测试输出结果:

========:ngwords

        2:fu(k

        3:fuck you

        1:fuck

        4:草你

        5:草你妈

++++++++++++++++++++===========================+++++++++++++++++++++    begin init

++++++++++++++++++++===========================+++++++++++++++++++++    init finish

========:test

        test("fuck"):true

        test("hi,boy"):false

========:mask

        mask("fuck you", false):**** you

        mask("fuck you", true):********

        mask("hi,boy", true):hi,boy

        mask("hi,boy,fuck you", true):hi,boy,********

        mask("fuck you,boy", true):********,boy

        mask("hi,fuck you,boy", true):hi,********,boy

        mask("草你妈", false):**妈

        mask("草你妈", true):***