今天有點空閑,想想用ruby寫個nfa試試。從正規表達式構造nfa采用經典的thompson算法:正規表達式 -> 字尾表達式 -> 構造nfa。構造了nfa後,用之比對字元串。一句話,寫了個玩具的正規表達式引擎,支援concatenation、alternation以及*、?、+量詞,不支援反向引用和轉義符。測試了下與ruby自帶的正規表達式引擎的性能對比,慢了3倍
。構造nfa沒什麼問題,主要是比對運作寫的爛,有空再改改。
nfa.rb
module nfa
class nfa
def initialize(state)
@state=state
end
def step(clist,c)
return clist if clist.size==0;
nlist=[]
allnull = true
matched = false
clist.each do |t|
if !t.nil?
allnull = false if t.c!=-1
if t.c == c && t.end.type ==1 then
matched = true
nlist.push(t.end.out1) if !t.end.out1.end.nil?
nlist.push(t.end.out2) if !t.end.out2.end.nil?
elsif (t.c == c && t.end.type == 0) then
matched = true;
return listuitls.new_list(t);
elsif (t.c == -1 && !t.end.nil?) then
nlist.push(t.end.out1);
nlist.push(t.end.out2);
end
end
end
return step(nlist, c) if (allnull)
return step(nlist, c) if (!matched)
nlist
def test?(s)
match(@state,s)
def match(state,s)
clist =[]
clist.push(state.out1);
clist.push(state.out2);
s.each_byte do |c|
c =c&0xff;
clist = step(clist, c);
return false if clist.size==0
end
return is_match?(clist)
def is_match?(clist)
clist.each do |t|
return true if !t.nil? and t.c==-1 and t.end and t.end.is_matched?
false
end
class paren
attr_accessor:n_alt,:n_atom
class state
attr_accessor :out1,:out2,:type
def initialize(out1,out2)
@out1=out1
@out2=out2
@type=1
def is_matched?
return @type==0
class transition
attr_accessor :c,:end
def initialize(c)
@c=c
end
class frame
attr_accessor :start,:outs
def initialize(start,outs)
@start=start
@outs=outs
class listuitls
def self.link(list,state)
list.each{|t| t.end=state}
def self.append(list1,list2)
list1+list2
def self.new_list(out)
result=[]
result.push(out)
result
def self.compile(re)
post = re2post(re)
raise argumenterror.new,"bad regexp!" if post.nil?
state = post2nfa(post);
raise runtimeerror.new,"construct nfa from postfix fail!" if state.nil?
return nfa.new(state);
def self.post2nfa(postfix)
stack=[]
s=nil
t=t1=t2=nil
e1=e2=e=nil
return nil if postfix.nil?
postfix.each_byte do |p|
case p.chr
when '.':
e2 = stack.pop()
e1 = stack.pop()
listuitls.link(e1.outs, e2.start)
stack.push(frame.new(e1.start, e2.outs))
when '|':
t1 = transition.new(-1)
t2 = transition.new(-1)
t1.end = e1.start
t2.end = e2.start
s = state.new(t1, t2)
stack.push(frame.new(s, listuitls.append(e1.outs, e2.outs)))
when '?':
e = stack.pop()
t1.end = e.start
stack.push(frame.new(s, listuitls.append(e.outs, listuitls.new_list(t2))))
when '*':
t2 = transition.new(-1)
listuitls.link(e.outs, s)
stack.push(frame.new(s, listuitls.new_list(s.out2)))
when '+':
t1 = transition.new(-1)
stack.push(frame.new(e.start, listuitls.new_list(t2)))
else
t = transition.new(p)
s = state.new(t, transition.new(-1))
stack.push(frame.new(s, listuitls.new_list(s.out1)))
e = stack.pop()
return nil if stack.size()>0
end_state = state.new(nil, nil)
end_state.type=0
e.outs.each do |tran|
if tran.c!=-1
s=state.new(t1,t2)
tran.end=s
s.out1.end=end_state
s.out2.end=end_state
tran.end=end_state
start = e.start
return start
def self.re2post(re)
n_alt = n_atom = 0
result=""
paren=[]
re.each_byte do |c|
case c.chr
when '(' then
if (n_atom > 1) then
n_atom-=1
result<<"."
p =paren.new
p.n_alt = n_alt
p.n_atom = n_atom
paren.push(p)
n_alt = n_atom = 0
when '|' then
if (n_atom == 0)
return nil
while (n_atom-=1) > 0
n_alt+=1
when ')' then
if (paren.size() == 0)
end
return nil
while (n_atom-=1)>0
result<<"."
while(n_alt>0)
result<<"|"
n_alt-=1
p = paren.pop()
n_alt = p.n_alt
n_atom = p.n_atom
n_atom+=1
when '*','+','?':
result<<c
else
if (n_atom > 1)
return nil if paren.size()>0
while ( (n_atom-=1)> 0)
result<<"."
while(n_alt>0)
n_alt-=1
result<<"|"
result
end
使用:
nfa = nfa::compile("a(bb)+a(cdf)*")
assert nfa.test?("abba")
assert nfa.test?("abbbba")
assert !nfa.test?("a")
assert !nfa.test?("aa")
assert nfa.test?("abbacdf")
assert nfa.test?("abbbbacdfcdf")
assert !nfa.test?("bbbbacdfcdf")
assert !nfa.test?("abbbacdfcdf")
assert !nfa.test?("abbbbacdfdf")
assert !nfa.test?("abbbbacdfdfg")
文章轉自莊周夢蝶 ,原文釋出時間2008-02-25