import paddle
import paddle.nn as nn
import paddlenlp
from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.layers import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss
from paddlenlp.metrics import ChunkEvaluator
(2) Dataset Loading
from paddlenlp.datasets import load_dataset
# MSRA_NER has no dev split, so we load the test split twice and use one copy as dev_ds.
train_ds, dev_ds, test_ds = load_dataset(
    'msra_ner', splits=('train', 'test', 'test'), lazy=False)
# Map each tag label to an integer id, e.g. 'B-PER' -> 0.
label_vocab = {label: label_id for label_id, label in enumerate(train_ds.label_list)}
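For MSRA_NER the tag set is BIO-style, so the mapping should come out as below; a quick sanity check (the exact order is an assumption about this dataset version, echoed later in a comment inside parse_decodes):
print(label_vocab)
# Expected (assumed): {'B-PER': 0, 'I-PER': 1, 'B-ORG': 2, 'I-ORG': 3, 'B-LOC': 4, 'I-LOC': 5, 'O': 6}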
# Build the word vocabulary from the training tokens;
# ids 0 and 1 are reserved for padding and out-of-vocabulary words.
word_set = set()
for item in train_ds:
    word_set.update(item['tokens'])
word_vocab = {word: word_id + 2 for word_id, word in enumerate(word_set)}
word_vocab['PAD'] = 0
word_vocab['OOV'] = 1
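The two vocabularies are what turn raw samples into id sequences for the model. A minimal sketch of that conversion, assuming each sample carries 'tokens' and 'labels' fields as above; the helper name convert_example is illustrative, not from the original:
def convert_example(example):
    # Map tokens to ids, falling back to the reserved OOV id for unseen words.
    token_ids = [word_vocab.get(token, word_vocab['OOV']) for token in example['tokens']]
    # Map BIO labels to ids; 'O' is assumed here as the fallback label.
    label_ids = [label_vocab.get(label, label_vocab['O']) for label in example['labels']]
    return token_ids, len(token_ids), label_ids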
# dev_ds is a copy of test_ds, so count each distinct split once.
total = len(train_ds) + len(dev_ds)
print(len(train_ds) / total, len(dev_ds) / total, len(test_ds) / total)
def parse_decodes(ds, decodes, lens, label_vocab):
    # Flatten the per-batch predictions and sequence lengths.
    decodes = [x for batch in decodes for x in batch]
    lens = [x for batch in lens for x in batch]
    # Sanity check: one predicted sequence per length entry.
    print(len(decodes), len(lens))
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))
    outputs = []
    for idx, end in enumerate(lens):
        # Trim tokens and predicted tags to the true sequence length.
        sent = ds.data[idx]['tokens'][:end]
        tags = [id_label[x] for x in decodes[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            # label_vocab: {'B-PER': 0, 'I-PER': 1, 'B-ORG': 2, 'I-ORG': 3, 'B-LOC': 4, 'I-LOC': 5, 'O': 6}
            if t.startswith('B-') or t == 'O':
                if len(words):
                    sent_out.append(words)  # flush the previous entity
                tags_out.append(t.split('-')[-1])  # record this entity's type
                words = s
            else:  # an 'I-' tag continues the current entity
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)  # flush the trailing entity
        if len(sent_out) != len(tags_out):
            # Skip malformed sequences where entities and types fail to align.
            print(len(sent_out), len(tags_out))
            continue
        outputs.append(''.join(str((s, t)) for s, t in zip(sent_out, tags_out)))
    return outputs
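A sketch of how parse_decodes is typically driven, assuming a trained model whose forward pass returns Viterbi-decoded tag ids (consistent with the ViterbiDecoder import at the top); model and test_loader are hypothetical names, not from this section:
all_preds, all_lens = [], []
model.eval()
for token_ids, lengths, _ in test_loader:  # hypothetical DataLoader yielding (ids, lens, labels)
    decoded_tags = model(token_ids, lengths)  # assumed to return decoded tag ids per token
    all_preds.append(decoded_tags.numpy())
    all_lens.append(lengths.numpy())
results = parse_decodes(test_ds, all_preds, all_lens, label_vocab)
print('\n'.join(results[:3]))  # inspect the first few decoded sentences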