天天看點

lua之屏蔽字替換為 '*'

local socket = require "socket"

-- Return the number of bytes in a UTF-8 sequence, judged from its lead byte.
-- The legacy 1..6 byte layout is accepted (lead bytes up to 0xFD).
-- ch: numeric value of the lead byte (may be nil).
-- Returns 1..6 for a valid lead byte, or -1 when ch is nil/false, is a
-- continuation byte (0x80..0xBF), or is 0xFE/0xFF.
local function utf8len(ch)
	if not ch then
		return -1
	end
	-- Plain ASCII is the common case, so answer it first.
	if ch < 0x80 then
		return 1
	end
	-- 0xFE and 0xFF never start a sequence.
	if ch >= 0xFE then
		return -1
	end
	-- Walk the lead-byte ranges from the longest form down to the shortest.
	if ch >= 0xFC then
		return 6
	end
	if ch >= 0xF8 then
		return 5
	end
	if ch >= 0xF0 then
		return 4
	end
	if ch >= 0xE0 then
		return 3
	end
	if ch >= 0xC0 then
		return 2
	end
	-- What is left is 0x80..0xBF: a continuation byte, not a lead byte.
	return -1
end

-- Split a byte string into its UTF-8 characters.
-- input: the string to split (may be nil).
-- Returns two parallel arrays: the character substrings and the byte length
-- of each one. Returns nil, nil when input is nil/false, when utf8len rejects
-- a lead byte, or when a sequence would run past the end of the string.
-- Only the lead byte of each sequence is inspected; continuation bytes are
-- taken as-is.
local function getutf8tbl(input)
	if not input then
		return nil, nil
	end
	local chars, sizes = {}, {}
	local total = #input
	local pos = 1
	while pos <= total do
		local size = utf8len(string.byte(input, pos))
		local last = pos + size - 1
		if size <= 0 or last > total then
			return nil, nil
		end
		-- Both arrays grow in lockstep, so one index serves both.
		local n = #chars + 1
		chars[n] = string.sub(input, pos, last)
		sizes[n] = size
		pos = last + 1
	end
	return chars, sizes
end

-- Load the forbidden-word list (one word per line) and build the lookup
-- structures used by the filter, then print how long loading took.
--   data[n][word] = true : every forbidden word whose byte length is n
--   maxlen               : byte length of the longest forbidden word
--   firstword[ch] = true : every UTF-8 character that starts a forbidden word
local f0 = socket.gettime()
local data = {}
local maxlen = 0
local firstword = {}
for line in io.lines("forbidden_words.txt") do
	-- A CRLF-terminated list read on a non-Windows platform keeps the "\r";
	-- left in place it becomes part of the word and the word never matches.
	local word = (string.gsub(line, "\r$", ""))
	local len = #word
	-- Blank lines carry no word; registering them would only add a dead entry.
	if len > 0 then
		local bucket = data[len]
		if bucket == nil then
			bucket = {}
			data[len] = bucket
		end
		bucket[word] = true
		if len > maxlen then
			maxlen = len
		end
		-- Remember the leading character so the filter can cheaply skip
		-- positions that cannot start any forbidden word.
		local wordlen = utf8len(string.byte(word, 1))
		if wordlen > 0 then
			firstword[string.sub(word, 1, wordlen)] = true
		end
	end
end
local f1 = socket.gettime()
print(f1 - f0)

-- Filter every line of test.txt: each forbidden word found is overwritten
-- with one '*' per character. The masked line, immediately followed by the
-- elapsed time for that line, is written to out.txt; the number of dictionary
-- lookups and the elapsed time are printed to stdout.
-- Lines that are not splittable by getutf8tbl are reported and skipped.
local fout = assert(io.open("out.txt", "w"))
for str in io.lines("test.txt") do
	local t0 = socket.gettime()
	local tbl, tbllen = getutf8tbl(str)
	if not tbl then
		-- Nothing can be scanned without a character table; report the line
		-- and move on instead of failing on a nil table below.
		print(str .. " input is invalid")
	else
		local count = 0 -- number of candidate substrings looked up
		local len = #tbl
		for i = 1, len do
			-- Only try positions that are not already masked and whose
			-- character starts at least one forbidden word.
			if tbl[i] ~= '*' and firstword[tbl[i]] then
				local wordlen = 0 -- byte length of the candidate tbl[i..last]
				for last = i, len do
					wordlen = wordlen + tbllen[last]
					if wordlen > maxlen then --optimization: no word is this long
						break
					end
					local t = data[wordlen]
					if t then
						local word = table.concat(tbl, "", i, last)
						count = count + 1
						if t[word] then
							-- Mask the shortest match starting at i, then stop
							-- extending from this position.
							for k = i, last do
								tbl[k] = '*'
							end
							break
						end
					end
				end
			end
		end
		local t1 = socket.gettime()
		fout:write(table.concat(tbl), t1 - t0, '\n')
		print(count, t1 - t0)
	end
end
fout:close()
           

繼續閱讀