import paddle
import paddle.nn as nn
import paddlenlp
from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.layers import LinearChainCrf, ViterbiDecoder, LinearChainCrfLoss
from paddlenlp.metrics import ChunkEvaluator
(2) Dataset Loading
from paddlenlp.datasets import load_dataset
# The MSRA_NER dataset has no dev split, so we load the test split a second time as dev_ds.
train_ds, dev_ds, test_ds = load_dataset(
    'msra_ner', splits=('train', 'test', 'test'), lazy=False)
label_vocab = {label:label_id for label_id, label in enumerate(train_ds.label_list)}
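As a quick sanity check, we can peek at one sample and the label mapping (a minimal sketch; the 'labels' field name follows the standard msra_ner schema, alongside the 'tokens' field used below):

# Inspect one training example; msra_ner carries character-level tokens
# with labels aligned one-to-one.
print(train_ds[0]['tokens'][:10])
print(train_ds[0]['labels'][:10])
print(label_vocab)  # e.g. {'B-PER': 0, 'I-PER': 1, ..., 'O': 6}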
# Build the character vocabulary from the training split; ids 0 and 1
# are reserved for the padding and out-of-vocabulary tokens.
word_vocab = []
for item in train_ds:
    word_vocab += item['tokens']
word_vocab = {k: v + 2 for v, k in enumerate(set(word_vocab))}
word_vocab['PAD'] = 0
word_vocab['OOV'] = 1
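Before batching, each example has to be mapped to ids through these vocabularies; a minimal sketch of such a converter (the helper name convert_example is ours, not a PaddleNLP API):

def convert_example(example, word_vocab, label_vocab):
    # Map characters to ids; characters unseen at training time fall
    # back to the reserved OOV id.
    token_ids = [word_vocab.get(tok, word_vocab['OOV']) for tok in example['tokens']]
    # msra_ner labels may already be integer ids; map strings through
    # label_vocab only when necessary.
    labels = [label_vocab[l] if isinstance(l, str) else l for l in example['labels']]
    return token_ids, len(token_ids), labels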
# dev_ds and test_ds are the same split, so count it only once in the total.
lens = len(train_ds) + len(dev_ds)
print(len(train_ds) / lens, len(dev_ds) / lens, len(test_ds) / lens)
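The Pad/Stack/Tuple helpers imported earlier are typically composed into a batchify_fn for the DataLoader; a sketch assuming each converted sample is (token_ids, length, label_ids):

batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=word_vocab['PAD']),  # pad token ids to the batch max length
    Stack(),                                 # stack the sequence lengths
    Pad(axis=0, pad_val=label_vocab['O'])    # pad label ids with the 'O' tag
): fn(samples)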
def parse_decodes(ds, decodes, lens, label_vocab):
    # Flatten the per-batch predictions and sequence lengths.
    decodes = [x for batch in decodes for x in batch]
    lens = [x for batch in lens for x in batch]
    print(len(decodes), len(lens))
    id_label = dict(zip(label_vocab.values(), label_vocab.keys()))
    outputs = []
    for idx, end in enumerate(lens):
        sent = ds.data[idx]['tokens'][:end]
        tags = [id_label[x] for x in decodes[idx][:end]]
        sent_out = []
        tags_out = []
        words = ""
        for s, t in zip(sent, tags):
            # label_vocab: {'B-PER': 0, 'I-PER': 1, 'B-ORG': 2, 'I-ORG': 3,
            #               'B-LOC': 4, 'I-LOC': 5, 'O': 6}
            if t.startswith('B-') or t == 'O':
                if len(words):
                    sent_out.append(words)  # flush the previous entity
                tags_out.append(t.split('-')[-1])  # record this entity's type
                words = s
            else:  # keep accumulating characters of the current entity
                words += s
        if len(sent_out) < len(tags_out):
            sent_out.append(words)
        if len(sent_out) != len(tags_out):
            # Skip sequences where entities and tags fail to line up.
            print(len(sent_out), len(tags_out))
            continue
        cs = [str((s, t)) for s, t in zip(sent_out, tags_out)]
        ss = ''.join(cs)
        outputs.append(ss)
    return outputs
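parse_decodes expects the Viterbi paths and sequence lengths collected batch by batch during evaluation; a hedged usage sketch (model, its forward signature, and test_loader are assumptions, not defined above):

all_preds, all_lens = [], []
model.eval()
for token_ids, lengths, _ in test_loader:
    # Assumed forward: the BiGRU-CRF returns (decoded tag ids, lengths)
    # from its ViterbiDecoder head.
    preds, seq_lens = model(token_ids, lengths)
    all_preds.append(preds.numpy())
    all_lens.append(seq_lens.numpy())
results = parse_decodes(test_ds, all_preds, all_lens, label_vocab)
print('\n'.join(results[:3]))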