文章目錄
- 1、簡述
- 2、 CountVectorizer 和 Transformer儲存和加載
- 2.1、TF-IDF詞典的儲存
- 2.2、TF-IDF加載,測試新資料
- 3、模型的儲存和加載
- 3.1、模型的儲存
- 3.2、模型的加載
- 4、例子
1、簡述
如果用到TF-IDF,sklearn中經常會用CountVectorizer與TfidfTransformer兩個類。我們總是需要儲存TF-IDF的詞典,然後計算測試集的TF-IDF,這裡要注意sklearn中儲存有兩種方法:pickle與joblib。這裡,我們可以用pickle儲存特徵,用joblib儲存模型。
2、 CountVectorizer 和 Transformer儲存和加載
2.1、TF-IDF詞典的儲存
train_content = segmentWord(X_train)
test_content = segmentWord(X_test)
# decode_error="replace" is required so the vectorizer tolerates undecodable
# bytes; this snippet persists the training-set features.
vectorizer = CountVectorizer(decode_error="replace")
tfidftransformer = TfidfTransformer()
# During training you must call vectorizer.fit_transform / tfidftransformer.fit_transform;
# at prediction time you must call vectorizer.transform / tfidftransformer.transform.
vec_train = vectorizer.fit_transform(train_content)
tfidf = tfidftransformer.fit_transform(vec_train)
# Persist the fitted vectorizer's vocabulary and the fitted tfidftransformer
# so prediction can reuse them.
feature_path = 'models/feature.pkl'
with open(feature_path, 'wb') as fw:
    pickle.dump(vectorizer.vocabulary_, fw)
tfidftransformer_path = 'models/tfidftransformer.pkl'
with open(tfidftransformer_path, 'wb') as fw:
    pickle.dump(tfidftransformer, fw)
2.2、TF-IDF加載,測試新資料
# Load the fitted CountVectorizer vocabulary saved during training.
# Use context managers so the pickle file handles are closed promptly —
# the previous open(...) calls without close() leaked file descriptors.
feature_path = 'models/feature.pkl'
with open(feature_path, "rb") as fr:
    loaded_vec = CountVectorizer(decode_error="replace", vocabulary=pickle.load(fr))
# Load the fitted TfidfTransformer (it carries the learned IDF weights).
tfidftransformer_path = 'models/tfidftransformer.pkl'
with open(tfidftransformer_path, "rb") as fr:
    tfidftransformer = pickle.load(fr)
# At test time call transform (NOT fit_transform); test_content is a list of strings.
test_tfidf = tfidftransformer.transform(loaded_vec.transform(test_content))
3、模型的儲存和加載
3.1、模型的儲存
# clf_model is the trained model; persist it with joblib.dump.
clf_model = trainModel()
joblib.dump(clf_model, "model_"+path)
3.2、模型的加載
# Reload the persisted model with joblib.load.
clf_model = joblib.load(model_path)
4、例子
"""
Author:沙振宇
Time:20191112
Info:簡單的情緒識别
"""
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score
import joblib
import os
import jieba
import datetime
import warnings # 忽略警告
import pickle
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=196)
m_list_allText = []# 全部内容(包含重複标簽,順序)
m_list_allL4ID = []# 全部标簽(包含重複标簽,順序)
m_list_allLabel = [] # 模型全部标簽(不包含重複标簽,順序)
m_map_id_score = {} # id對應的分值
# Read the corpus file and populate the module-level label/text lists.
def getFile(filename, count = -1):
    """Parse a labeled corpus and rebuild the module-level lists.

    Each useful line has the form ``<label>:<message>``: the label is the
    second-to-last ':'-separated field, the message is the last one.
    Resets ``m_list_allL4ID`` / ``m_list_allText`` and appends any unseen
    label to ``m_list_allLabel``.

    Args:
        filename: path to the UTF-8 training file.
        count: unused; kept for backward compatibility with existing callers.
    """
    global m_list_allL4ID, m_list_allText
    m_list_allL4ID = []
    m_list_allText = []
    with open(filename, 'r', encoding='utf-8') as fp:
        # Iterate the file lazily instead of readlines() + index loop.
        for text in fp:
            # Lines without a ':' separator carry no label and are skipped.
            if ":" in text:
                parts = text.split(":")
                L4ID = parts[-2]
                Msg = parts[-1]
                m_list_allL4ID.append(L4ID)
                m_list_allText.append(Msg)
                # Track each distinct label once, preserving first-seen order.
                if L4ID not in m_list_allLabel:
                    m_list_allLabel.append(L4ID)
# jieba word segmentation + vectorization
def jiabaToVector(list, isTest, isTFIDF = False):
    """Segment each sentence with jieba and vectorize the batch through the
    module-global `vectorizer`.

    NOTE(review): the parameter name `list` shadows the builtin.
    NOTE(review): a fresh TfidfTransformer is fitted on EVERY call — so for
    test data (isTest=True, isTFIDF=True) the IDF weights are re-learned from
    the test batch instead of reusing training-time statistics; confirm this
    is intended.
    """
    tmp_list = []
    # Join jieba tokens with spaces so CountVectorizer can tokenize them.
    for sentence in list:
        tmp_list.append(" ".join(jieba.cut(sentence.strip())))
    # Build the TF-IDF matrix from the count vectors.
    transformer = TfidfTransformer()
    if isTest:
        if isTFIDF:
            tfidf = transformer.fit_transform(vectorizer.transform(tmp_list))
        else:
            tfidf = vectorizer.transform(tmp_list)
    else:
        if isTFIDF:
            tfidf = transformer.fit_transform(vectorizer.fit_transform(tmp_list))
        else:
            tfidf = vectorizer.fit_transform(tmp_list)
    return tfidf
# Build a LinearSVC with default hyper-parameters and fit it.
def predict_4(X, Y):
    """Train a linear SVM on feature matrix X with labels Y; return the fitted model."""
    return svm.LinearSVC().fit(X, Y)
# Convert prefixed label strings (e.g. "L4000447") to plain ints.
def L4ToInt(m_label_l4):
    """Strip the first character of each label and parse the rest as int.

    Args:
        m_label_l4: iterable of label strings such as "L4000447".
    Returns:
        list of ints, e.g. ["L4000447"] -> [4000447].
    """
    # Comprehension replaces the manual index loop + append.
    return [int(label[1:]) for label in m_label_l4]
# Train the SVM model on the corpus at `path`.
def trainSVM(path, linecount = -1):
    """Load the corpus, fit the global CountVectorizer and a linear SVM.

    Returns:
        (classifier, fitted CountVectorizer).
    """
    # jiabaToVector reads the module-global `vectorizer`; without this
    # declaration the assignment below creates a LOCAL and jiabaToVector
    # raises NameError when training from scratch (the original bug).
    global vectorizer
    getFile(path, linecount)
    # decode_error="replace" is required so the saved vocabulary keeps
    # working on arbitrary bytes at prediction time.
    vectorizer = CountVectorizer(decode_error="replace")
    vector_train = jiabaToVector(m_list_allText, False, True)  # training TF-IDF matrix
    lenall = len(m_list_allText)
    print("總集大小:", lenall)
    # Time only the classifier fit.
    startT_Train = datetime.datetime.now()
    clf = predict_4(vector_train, m_list_allL4ID)
    endT_Train = datetime.datetime.now()
    print("訓練Time:", (endT_Train - startT_Train).microseconds)
    return clf, vectorizer
# Exact-match lookup of `query` against the labeled data file.
def completeLabelDataMatch(path , query):
    """Scan `path` for a line whose message exactly equals `query`.

    Returns:
        {"conditionId": <label>, "Score": 1} on the first exact hit,
        False when no line matches.
    """
    file_train = os.path.join(path)
    with open(file_train, 'r', encoding='UTF-8') as fp:
        for line in fp:
            # Lines without a ':' separator carry no label.
            if ":" not in line:
                continue
            fields = line.split(":")
            condition_id = fields[-2]
            message = fields[-1].strip("\n")
            if query == message:
                print("Complete labelData match work: %s:%s"%(condition_id,message))
                return {"conditionId": condition_id, "Score": 1}
    return False
# Score a query: exact dictionary match first, then the SVM model.
def SVMMain(path, clf, query, score):
    """Classify `query` and map the winning label to a sentiment string.

    Returns emotionAnalysis(<label>) when the best score exceeds `score`,
    otherwise "normal".
    """
    matched = completeLabelDataMatch(path, query)
    if matched:
        print("outList[\"conditionId\"]:", matched["conditionId"])
        print("outList[\"Score\"]:", matched["Score"])
    else:
        # No exact match — fall back to the trained model.
        matched = useSVM(clf, query)
    if matched["Score"] > score:
        return emotionAnalysis(matched["conditionId"])
    return "normal"
# Classify a single query with the trained SVM.
def useSVM(clf, query):
    """Run `clf` on `query`; return {"conditionId": best label, "Score": best margin}.

    Reads module globals: `m_list_allLabel` (label order from training) and
    `m_map_id_score` (label -> decision score, refreshed here as a side effect).
    """
    outList = {}
    querylist = []
    querylist.append(query)
    # TF-IDF vector for the single query (isTest=True, isTFIDF=True).
    vector_test = jiabaToVector(querylist, True, True) # 生成測試向量
    startT = datetime.datetime.now()
    # One decision margin per class for the single sample.
    # NOTE(review): with only two classes decision_function returns shape (n,),
    # so percent[0] is a scalar and len(percent[0]) raises — this assumes the
    # corpus always has more than two labels; confirm.
    percent = clf.decision_function(vector_test)
    scorelist = []
    if len(percent[0]):
        scorelist = percent[0]
    # Pair labels with scores, assuming the decision_function column order
    # matches m_list_allLabel — TODO confirm against clf.classes_.
    if len(m_list_allLabel) == len(scorelist):
        for i in range(len(scorelist)):
            m_map_id_score[m_list_allLabel[i]] = scorelist[i]
    pVallist = sorted(scorelist,reverse=True)
    # Best margin wins.
    percent = max(pVallist)
    conditionID = ""
    # Recover the label whose score equals the best margin.
    for item in range (len(m_map_id_score)):
        if m_map_id_score[m_list_allLabel[item]] == percent:
            conditionID = m_list_allLabel[item]
    endT = datetime.datetime.now()
    print("測試Time:", (endT - startT).microseconds)
    outList["conditionId"] = conditionID
    outList["Score"] = percent
    print("outList[\"conditionId\"]:", outList["conditionId"])
    print("outList[\"Score\"]:", outList["Score"])
    return outList
# Map a label id to a sentiment category.
def emotionAnalysis(label):
    """Return "negtive", "positive" or "normal" depending on which id list
    contains `label`. (Spelling of "negtive" kept: callers compare on it.)
    """
    negative_ids = ['4000447','4000448','4000453','4000449','4000450','4000451', '4000452','4000454','4000459','4000458','4002227','4000461','4000460','4000465','4000464','4000803','4000468']
    positive_ids = ['4000439','4000440','4000441','4000442','4000462','4000467','4000469','4000496','4000497']
    print("negtiveId:",negative_ids)
    print("positiveId:",positive_ids)
    if label in negative_ids:
        return "negtive"
    if label in positive_ids:
        return "positive"
    return "normal"
# Persist the trained classifier (joblib) and its vocabulary (pickle).
def saveModel(path):
    """Train on the corpus at `path`, save the classifier as
    "model_<basename>" and the vocabulary as "feature_<stem>.pkl".

    Falls back to writing the vocabulary to `path` itself when the corpus
    is not a .txt file (preserves the original fallback behavior).
    """
    clf, vectorizer = trainSVM(path, -1)
    # os.path.basename handles any number of path separators; the previous
    # path.split("/")[1] silently picked the wrong component for nested
    # paths like "a/b/c.txt".
    base = os.path.basename(path)
    joblib.dump(clf, "model_" + base)
    stem, ext = os.path.splitext(base)
    feature_path = 'feature_' + stem + '.pkl' if ext == ".txt" else path
    print("模型已經儲存,開始儲存特征")
    with open(feature_path, 'wb') as fw:
        pickle.dump(vectorizer.vocabulary_, fw)
    print("特征已經儲存。。。")
# Load a persisted classifier and rebuild its CountVectorizer.
def useModel(model_path, feature_path):
    """Return (classifier, CountVectorizer rebuilt from the saved vocabulary).

    Args:
        model_path: joblib dump produced by saveModel.
        feature_path: pickled vocabulary_ dict produced by saveModel.
    """
    # Load the model.
    clf = joblib.load(model_path)
    # Context manager closes the pickle file promptly — the previous
    # open(...) without close() leaked the file descriptor.
    with open(feature_path, "rb") as fr:
        vocabulary = pickle.load(fr)
    loaded_vec = CountVectorizer(decode_error = "replace", vocabulary = vocabulary)
    return clf, loaded_vec
if __name__ =="__main__":
path = "../rg_train_20171230_1000008.txt"
# 儲存模型
# saveModel(path)
# 加載模型
global vectorizer
clf, vectorizer = useModel("model_rg_train_20171230_1000008.txt","feature_rg_train_20171230_1000008.pkl")
source = "我很開心"
source = source.replace("\r", "")
source = source.replace("\n", "")
source = source.lower()
source = source[0:256]
print("開始比對")
result = SVMMain(path , clf, source, 0.6)
if result == "normal":
print("中性情緒")
elif result == "negtive":
print("負向情緒")
elif result == "positive":
print("正向情緒")