天天看点

通过parsing-chart实现句法分析

思路来源于宗成庆老师PPT第9章线图分析法。在这里仅把算法描述放到这里,宗成庆老师的PPT上的例子对于实现该算法非常有帮助,由于例子较长,在这里就不放了,若有需要,自行查看。

PPT链接:http://www.nlpr.ia.ac.cn/cip/ZongReportandLecture/ReportandLectureIndex.htm

该算法在第九章第一部分

该算法需要三个容器:1.agenda代理表,一般用列表或数组存储即可。2.Activate_Arc:活性边,用队列或栈存储,本人用队列存储在本文中。3.chart:为一个句子长度+1乘以句子长度+1的矩阵,用以存储最后的结果

算法描述如下:

通过parsing-chart实现句法分析
通过parsing-chart实现句法分析

代码如下:

1.队列

class Queue:
    def __init__(self):
        self.first = None
        self.last = None
        self.N = 0

    def enqueue(self,item):
        oldLast = self.last
        self.last = Node(item)
        self.last.next = None
        if self.isEmpty():
            self.first = self.last
        else:
            oldLast.next = self.last

        self.N += 1

    def unqueue(self):
        if self.isEmpty():
            raise QueueIsEmptyException('队列已经没有元素')
        item = self.first
        self.first = self.first.next
        if self.isEmpty():
            self.last = None

        self.N -= 1
        return item

    def isEmpty(self):
        if self.N == 0:
            return True


class Node:
    def __init__(self,item):
        self.item = item
        self.next = None

class QueueIsEmptyException(Exception):
    def __init__(self,errInfo):
        super.__init__(self)
        self.errorinfo = errInfo
    def __str__(self):
        return self.errorinfo

if __name__ == '__main__':
    queue = Queue()
    queue.enqueue(1)
    queue.enqueue(2)
    # print(queue)
    print(queue.N)
    item = queue.unqueue()
    print(item.item)
    print(queue.N)
    queue.unqueue()
    queue.unqueue()
           

2.线图分析法parsing-chart

from Parsing.Chart.LinearQueue import Queue


class parsing_chart:
    def __init__(self,voca_tag):
        queue = Queue()
        self.agenda = queue
        self.voca_tag = voca_tag
        self.ActiveArc = []
        self.chart = []
        self.rules = [['S','NP','VP',0,0],['NP','Det','N',0,0],['VP','VP','PP',0,0],['VP','V','NP',0,0],['PP','Prep','NP',0,0]]
        self.changed_rules = self.rules.copy()
        self.rowFirst = None
        self.columnFirst = None

    def parsing(self,sentence):
        print('=======开始分析句子结构=======')
        # self.agenda = LinearQueue()
        s_length = len(sentence)
        count = -1
        for i in range(s_length+1):
            self.chart.append([0]*(s_length+1))
        while(count < len(sentence)):
            if self.agenda.isEmpty():
                # word_pos = self.voca_tag[i] + '('+str(i)+','+str(i+1)+')'
                count += 1
                if count == len(sentence):
                    break
                word_pos = [self.voca_tag[count],count, count + 1]
                self.agenda.enqueue(word_pos)
            else:
                #如果符合tag X符合规则 A ->X r  则添加标记A ->X。r(i,j),并添加至活性边列表中
                word_pos = self.agenda.unqueue().item
                for i in range(len(self.rules)):
                    rule = self.changed_rules[i]
                    if word_pos[0] == rule[1]:
                        self.add_ActiveArc(rule,word_pos)
                #扩展活性边
                self.extend_arc(word_pos,s_length,count)

        # self.get_text_struc(sentence)


    def add_ActiveArc(self,rule,word_pos):
        word = word_pos[0]
        if '。' in rule:
            rule.remove('。')

        # 有些规则如VP -> VP PP 需要将。移动到后面的VP处
        index = [i for i,x in enumerate(rule) if x == word]
        if len(index) > 1:
            real_index = index[len(index)-1]
        else:
            real_index = index[0]
        rule.insert(real_index+1,'。')
        rule[4] = word_pos[1]
        rule[5] = word_pos[2]
        self.ActiveArc.append(rule)

    def extend_arc(self,word_pos,s_length,count):
        self.chart[word_pos[1]][word_pos[2]] = word_pos[0]
 
        for i in range(len(self.ActiveArc)):
            #只有tag X符合 A -> u。X才执行以下程序
            if word_pos[0] == self.ActiveArc[i][3]:
                if 'S' == self.ActiveArc[i][0]:
                    self.chart[0][s_length] = 'S'
                else:
                    self.ActiveArc[i] =[self.ActiveArc[i][0],self.ActiveArc[i][1],self.ActiveArc[i][3],self.ActiveArc[i][2],self.ActiveArc[i][4],self.ActiveArc[i][5]]
                    self.agenda.enqueue([self.ActiveArc[i][0],self.ActiveArc[i][4],word_pos[2]])

    def get_text_struc(self,sentence):
        print('=======句子结构如下=======')
        # print(self.chart)
        for i in range(len(self.chart)-1):
            for j in range(len(self.chart)):
                if self.chart[i][j] != 0:
                    print(str(sentence[i:j])+ ':' + self.chart[i][j])

    def get_structure_tree(self,sentence,row,column):
        col_value = None
        row_value = None
        n_row1 = None
        n_col1 = None
        n_row2 = None
        n_col2 =None
        if row is None or column is None:
            return

        curr_tag = self.chart[row][column]

        for i in range(column-1,-1,-1):
            if self.chart[row][i] != 0:
                row_value = self.chart[row][i]
                # print(row_value)
                n_row1 = row
                n_col1 = i
                break

        for r in range(row+1,len(self.chart) - 1):
            if self.chart[r][column] != 0:
                col_value = self.chart[r][column]
                # print(col_value)
                n_row2 = r
                n_col2 = column
                break

        if col_value is not None and row_value is not None:
            print(' '.join(sentence[row:column])+'/'+curr_tag + '  ->  ' + ' '.join(sentence[n_row1:n_col1]) + '/' + row_value + '   ' + ' '.join(sentence[n_row2:n_col2]) + '/' + col_value)

        self.get_structure_tree(sentence,n_row1,n_col1)
        self.get_structure_tree(sentence,n_row2,n_col2)




if __name__ == '__main__':
    sentence = ['the', 'boy', 'hits', 'the', 'dog', 'with', 'a', 'rod']
    voca_tag = ['Det','N','V','Det','N','Prep','Det','N']
    pc = parsing_chart(voca_tag)
    pc.parsing(sentence)
    nrow = 0
    ncol = len(pc.chart)-1
    pc.get_structure_tree(sentence,nrow,ncol)
           

3.结果

通过parsing-chart实现句法分析

4.总结

此次代码实现浪费太多时间。。因为没有太研究例子,而仅仅根据算法写代码,写的总是有问题,之后详细做了下PPT上的例子,问题便迎刃而解,以前看过一个童鞋说过:多读书、多总结、少写代码十分有道理,先把思路理清,代码二点实现便水到渠成了

继续阅读