思路来源于宗成庆老师PPT第9章线图分析法。在这里仅把算法描述放到这里,宗成庆老师的PPT上的例子对于实现该算法非常有帮助,由于例子较长,在这里就不放了,若有需要,自行查看。
PPT链接:http://www.nlpr.ia.ac.cn/cip/ZongReportandLecture/ReportandLectureIndex.htm
该算法在第九章第一部分
该算法需要三个容器:1.agenda代理表,一般用列表或数组存储即可。2.Activate_Arc:活性边,用队列或栈存储,本人用队列存储在本文中。3.chart:为一个句子长度+1乘以句子长度+1的矩阵,用以存储最后的结果
算法描述如下:
代码如下:
1.队列
class Queue:
def __init__(self):
self.first = None
self.last = None
self.N = 0
def enqueue(self,item):
oldLast = self.last
self.last = Node(item)
self.last.next = None
if self.isEmpty():
self.first = self.last
else:
oldLast.next = self.last
self.N += 1
def unqueue(self):
if self.isEmpty():
raise QueueIsEmptyException('队列已经没有元素')
item = self.first
self.first = self.first.next
if self.isEmpty():
self.last = None
self.N -= 1
return item
def isEmpty(self):
if self.N == 0:
return True
class Node:
def __init__(self,item):
self.item = item
self.next = None
class QueueIsEmptyException(Exception):
def __init__(self,errInfo):
super.__init__(self)
self.errorinfo = errInfo
def __str__(self):
return self.errorinfo
if __name__ == '__main__':
queue = Queue()
queue.enqueue(1)
queue.enqueue(2)
# print(queue)
print(queue.N)
item = queue.unqueue()
print(item.item)
print(queue.N)
queue.unqueue()
queue.unqueue()
2.线图分析法parsing-chart
from Parsing.Chart.LinearQueue import Queue
class parsing_chart:
def __init__(self,voca_tag):
queue = Queue()
self.agenda = queue
self.voca_tag = voca_tag
self.ActiveArc = []
self.chart = []
self.rules = [['S','NP','VP',0,0],['NP','Det','N',0,0],['VP','VP','PP',0,0],['VP','V','NP',0,0],['PP','Prep','NP',0,0]]
self.changed_rules = self.rules.copy()
self.rowFirst = None
self.columnFirst = None
def parsing(self,sentence):
print('=======开始分析句子结构=======')
# self.agenda = LinearQueue()
s_length = len(sentence)
count = -1
for i in range(s_length+1):
self.chart.append([0]*(s_length+1))
while(count < len(sentence)):
if self.agenda.isEmpty():
# word_pos = self.voca_tag[i] + '('+str(i)+','+str(i+1)+')'
count += 1
if count == len(sentence):
break
word_pos = [self.voca_tag[count],count, count + 1]
self.agenda.enqueue(word_pos)
else:
#如果符合tag X符合规则 A ->X r 则添加标记A ->X。r(i,j),并添加至活性边列表中
word_pos = self.agenda.unqueue().item
for i in range(len(self.rules)):
rule = self.changed_rules[i]
if word_pos[0] == rule[1]:
self.add_ActiveArc(rule,word_pos)
#扩展活性边
self.extend_arc(word_pos,s_length,count)
# self.get_text_struc(sentence)
def add_ActiveArc(self,rule,word_pos):
word = word_pos[0]
if '。' in rule:
rule.remove('。')
# 有些规则如VP -> VP PP 需要将。移动到后面的VP处
index = [i for i,x in enumerate(rule) if x == word]
if len(index) > 1:
real_index = index[len(index)-1]
else:
real_index = index[0]
rule.insert(real_index+1,'。')
rule[4] = word_pos[1]
rule[5] = word_pos[2]
self.ActiveArc.append(rule)
def extend_arc(self,word_pos,s_length,count):
self.chart[word_pos[1]][word_pos[2]] = word_pos[0]
for i in range(len(self.ActiveArc)):
#只有tag X符合 A -> u。X才执行以下程序
if word_pos[0] == self.ActiveArc[i][3]:
if 'S' == self.ActiveArc[i][0]:
self.chart[0][s_length] = 'S'
else:
self.ActiveArc[i] =[self.ActiveArc[i][0],self.ActiveArc[i][1],self.ActiveArc[i][3],self.ActiveArc[i][2],self.ActiveArc[i][4],self.ActiveArc[i][5]]
self.agenda.enqueue([self.ActiveArc[i][0],self.ActiveArc[i][4],word_pos[2]])
def get_text_struc(self,sentence):
print('=======句子结构如下=======')
# print(self.chart)
for i in range(len(self.chart)-1):
for j in range(len(self.chart)):
if self.chart[i][j] != 0:
print(str(sentence[i:j])+ ':' + self.chart[i][j])
def get_structure_tree(self,sentence,row,column):
col_value = None
row_value = None
n_row1 = None
n_col1 = None
n_row2 = None
n_col2 =None
if row is None or column is None:
return
curr_tag = self.chart[row][column]
for i in range(column-1,-1,-1):
if self.chart[row][i] != 0:
row_value = self.chart[row][i]
# print(row_value)
n_row1 = row
n_col1 = i
break
for r in range(row+1,len(self.chart) - 1):
if self.chart[r][column] != 0:
col_value = self.chart[r][column]
# print(col_value)
n_row2 = r
n_col2 = column
break
if col_value is not None and row_value is not None:
print(' '.join(sentence[row:column])+'/'+curr_tag + ' -> ' + ' '.join(sentence[n_row1:n_col1]) + '/' + row_value + ' ' + ' '.join(sentence[n_row2:n_col2]) + '/' + col_value)
self.get_structure_tree(sentence,n_row1,n_col1)
self.get_structure_tree(sentence,n_row2,n_col2)
if __name__ == '__main__':
sentence = ['the', 'boy', 'hits', 'the', 'dog', 'with', 'a', 'rod']
voca_tag = ['Det','N','V','Det','N','Prep','Det','N']
pc = parsing_chart(voca_tag)
pc.parsing(sentence)
nrow = 0
ncol = len(pc.chart)-1
pc.get_structure_tree(sentence,nrow,ncol)
3.结果
4.总结
此次代码实现浪费太多时间。。因为没有太研究例子,而仅仅根据算法写代码,写的总是有问题,之后详细做了下PPT上的例子,问题便迎刃而解,以前看过一个童鞋说过:多读书、多总结、少写代码十分有道理,先把思路理清,代码二点实现便水到渠成了