支持向量机
在本练习中,我们将使用高斯核函数的支持向量机(SVM)来构建垃圾邮件分类器。
数据集1
先在二维(2D)数据集上进行实验。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
# Load the first example data set and wrap it in a DataFrame:
# the 'X' array supplies the feature columns X1/X2, the 'y' array the label column.
raw_data = loadmat('ex6data1.mat')
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')
# Preview the first rows (notebook display value).
data.head()
| | X1 | X2 | y |
|---|---|---|---|
| 0 | 1.9643 | 4.5957 | 1 |
| 1 | 2.2753 | 3.8589 | 1 |
| 2 | 2.9781 | 4.5651 | 1 |
| 3 | 2.9320 | 3.5519 | 1 |
| 4 | 3.5772 | 2.8560 | 1 |
# Visualization
def plot_init_data(data, fig, ax):
    """Scatter-plot the labelled samples of ``data`` onto ``ax``.

    Rows with ``y == 1`` are drawn as red circles labelled 'positive';
    rows with ``y == 0`` are drawn as blue crosses labelled 'negative'.

    Args:
        data: DataFrame with feature columns 'X1', 'X2' and label column 'y'.
        fig: Unused; kept so existing ``plot_init_data(data, fig, ax)``
            calls keep working.
        ax: Axes-like object providing ``scatter``.

    Returns:
        None. The points are added to ``ax``; the caller is responsible for
        the legend and for showing the figure.
    """
    positive = data[data['y'] == 1]
    negative = data[data['y'] == 0]
    ax.scatter(positive['X1'], positive['X2'],
               s=50, marker='o', c='r', label='positive')
    ax.scatter(negative['X1'], negative['X2'],
               s=50, marker='x', c='b', label='negative')
# Draw the labelled samples of the current `data` frame on a 12x8 figure.
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
可以看出左上角有一个异常点,但是整体依然呈现线性分布,所以可以调用线性支持向量机来学习类边界。
令C=1
from sklearn import svm
# Linear SVM with hinge loss and regularisation parameter C=1,
# limited to 1000 solver iterations.
svc = svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
# Display the estimator's parameters (notebook display value).
svc
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
penalty='l2', random_state=None, tol=0.0001, verbose=0)
# Fit on the two feature columns, then score on the same training data
# (so the value shown is training accuracy, not a held-out estimate).
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
0.9803921568627451
# Visualize the classification boundary
def find_decision_boundary(svc, x1min, x2min, x1max, x2max, diff, num=1000):
    """Return the grid points that lie approximately on the decision boundary.

    A ``num`` x ``num`` grid is laid over the rectangle
    ``[x1min, x1max] x [x2min, x2max]``; every grid point is scored with
    ``svc.decision_function`` and the points whose absolute score is below
    ``diff`` are kept.

    Args:
        svc: Fitted classifier exposing ``decision_function`` for
            two-column input.
        x1min, x2min, x1max, x2max: Bounds of the rectangle to scan.
        diff: Tolerance; a point is kept when ``abs(score) < diff``.
        num: Number of grid steps per axis (default 1000, the value that
            used to be hard-coded).

    Returns:
        Tuple ``(x1, x2)`` of pandas Series with the coordinates of the
        retained points, ordered with ``x1`` as the slowest-varying axis.
    """
    x1 = np.linspace(x1min, x1max, num)
    x2 = np.linspace(x2min, x2max, num)
    # indexing='ij' makes x1 the slow axis, i.e. the same point order as the
    # nested loop "for x in x1 for y in x2", without building a million
    # Python tuples first.
    grid_x1, grid_x2 = np.meshgrid(x1, x2, indexing='ij')
    c_val = pd.DataFrame({'x1': grid_x1.ravel(), 'x2': grid_x2.ravel()})
    # Score a plain ndarray rather than the DataFrame: the columns here are
    # named x1/x2, which differ from the names a model may have been fitted
    # with (e.g. X1/X2), and recent scikit-learn versions reject a DataFrame
    # whose feature names do not match those seen during fit.
    c_val['cval'] = svc.decision_function(c_val[['x1', 'x2']].to_numpy())
    decision = c_val[np.abs(c_val['cval']) < diff]
    return decision.x1, decision.x2
# Collect grid points in [0, 4] x [1.5, 5] whose decision value is within
# 2e-3 of zero, and draw them together with the training samples.
x1,x2 = find_decision_boundary(svc,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=1) decision boundary')
ax.legend()
plt.show()
# Increase C and observe how the result changes.
# NOTE(review): the recorded training accuracy is lower here (0.941) than
# with C=1 (0.980); with max_iter=1000 the solver may not have converged —
# check for a ConvergenceWarning.
svc2 = svm.LinearSVC(C=100,loss='hinge',max_iter=1000)
svc2.fit(data[['X1','X2']],data['y'])
svc2.score(data[['X1','X2']],data['y'])
0.9411764705882353
# Same boundary plot as for C=1, this time using the C=100 model `svc2`.
x1,x2 = find_decision_boundary(svc2,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=100) decision boundary')
ax.legend()
plt.show()
高斯内核的SVM
为了能更好的理解非线性SVM分类器,我们从头开始实现高斯内核,并不使用sklearn中已经封装好的包。
高斯内核
高斯内核是一个衡量一对数据间“距离”的相似度函数:$K(x_1, x_2) = \exp\left(-\dfrac{\lVert x_1 - x_2 \rVert^2}{2\sigma^2}\right)$。它有一个参数 $\sigma$,决定了相似性下降至0有多快($\sigma$ 越小,下降越快;$\sigma$ 越大,下降越慢)。
def gaussian_kernel(x1, x2, sigma):
    """Return the Gaussian (RBF) similarity between two vectors.

    Computes ``exp(-||x1 - x2||^2 / (2 * sigma^2))``.

    Args:
        x1, x2: Array-likes of equal shape (NumPy arrays or plain sequences).
        sigma: Bandwidth of the kernel.

    Returns:
        The similarity as a NumPy float; 1.0 for identical inputs.
    """
    # Cast to float before subtracting: with unsigned integer arrays
    # (e.g. uint8 features) the difference would wrap around instead of
    # going negative, corrupting the squared distance.
    delta = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.exp(-(np.sum(delta ** 2) / (2 * (sigma ** 2))))
# Sanity check: the squared distance between the two vectors is 1 + 4 + 4 = 9,
# so with sigma = 2 the expected value is exp(-9/8) ~= 0.3247.
x1 = np.array([1,2,1])
x2 = np.array([0,4,-1])
sigma = 2
gaussian_kernel(x1,x2,sigma)
0.32465246735834974
数据集2
在这个数据集中,我们使用高斯内核实现非线性分类器。这里将直接调用sklearn中的svm包。
# Load the second data set into the same X1/X2/y DataFrame layout and plot it.
# NOTE(review): unlike the other loadmat calls, the file name has no '.mat'
# suffix — this relies on loadmat's default suffix handling; confirm.
raw_data = loadmat('ex6data2')
data = pd.DataFrame(raw_data['X'],columns=['X1','X2'])
data['y'] = raw_data['y']
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
# Kernel SVM with C=100 and gamma=10; the repr printed below shows the
# kernel in use is 'rbf'. probability=True additionally enables
# probability estimates.
svc = svm.SVC(C=100,gamma=10,probability=True)
# Display the estimator's parameters (notebook display value).
svc
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
verbose=False)
# Fit the RBF SVM on data set 2 and report accuracy on the same training data.
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
0.9698725376593279
# Scan [0, 1] x [0.4, 1] for points whose decision value is within 0.01 of
# zero and overlay them (green) on the training samples.
x1,x2 = find_decision_boundary(svc,0,0.4,1,1,0.01)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
plt.show()
数据集3
这个数据集已经把X和y按照训练集和验证集分类好了,所以只需要找到最优的超参数即可。即,寻找最优的 $C$ 和 $\sigma$(代码中通过 sklearn 的 `gamma` 参数来搜索,对于 RBF 核有 $\gamma = 1/(2\sigma^2)$)。我们给定数值范围:[0.01,0.03,0.1,0.3,1,3,10,30,100]
# Load the third data set, which ships separate training (X, y) and
# validation (Xval, yval) arrays.
raw_data = loadmat('ex6data3.mat')
X = raw_data['X']
Xval = raw_data['Xval']
y = raw_data['y']
yval = raw_data['yval']
# Plot the training split only, using the same X1/X2/y DataFrame layout.
fig,ax = plt.subplots(figsize=(12,8))
data = pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
# Exhaustive search over (C, gamma): fit on the training split and keep the
# pair with the highest accuracy on the validation split.
C_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]
gamma_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]
best_score = 0
finalC = 0
finalgamma = 0
for C in C_candidates:
    for gamma in gamma_candidates:
        svc = svm.SVC(C=C, gamma=gamma)
        # scikit-learn expects 1-D labels. ravel() flattens an (m, 1) column
        # vector (the layout shown for the spam labels later in this file)
        # and is a no-op if the labels are already 1-D.
        svc.fit(X, y.ravel())
        score = svc.score(Xval, yval.ravel())
        # Strict '>' keeps the first pair that reaches the best score.
        if score > best_score:
            best_score = score
            finalC = C
            finalgamma = gamma
# Notebook display value: best validation accuracy and the winning pair.
best_score,finalC,finalgamma
(0.965, 0.3, 100)
# Refit with the selected hyper-parameters and plot the resulting boundary.
svc = svm.SVC(C=finalC, gamma=finalgamma)
# scikit-learn expects 1-D labels; ravel() flattens a column vector and is a
# no-op for labels that are already 1-D.
svc.fit(X, y.ravel())
# Scan [-0.6, 0.3] x [-0.7, 0.6] for points whose decision value is within
# 0.005 of zero and overlay them (green) on the training samples.
x1,x2 = find_decision_boundary(svc,-0.6,-0.7,0.3,0.6,0.005)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
ax.legend()
plt.show()
垃圾邮件分类
我们通过构建SVM来构建垃圾邮件分类器
训练SVM
# Load the pre-processed spam training and test sets.
spam_train = loadmat('spamTrain.mat')
spam_test = loadmat('spamTest.mat')
# Show the raw loadmat dictionary (notebook display value).
spam_train
{'X': array([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 1, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
'__globals__': [],
'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13 14:27:25 2011',
'__version__': '1.0',
'y': array([[1],
[1],
[0],
...,
[1],
[0],
[0]], dtype=uint8)}
# Pull out feature matrices and labels; the labels are stored as column
# vectors (see the dictionary printed above), so flatten them to 1-D.
X = spam_train['X']
Xtest = spam_test['Xtest']
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()
# Confirm the shapes (notebook display value).
X.shape,y.shape,Xtest.shape,ytest.shape
((4000, 1899), (4000,), (1000, 1899), (1000,))
这里可以理解为每个文档都已表示为一个向量:1899个维度对应1899个单词,每个维度的取值仅为0或1,用于表示该文档是否包含对应的单词。
# Train an SVC with default parameters on the spam training set; the repr
# printed below shows the defaults in effect (C=1.0, kernel='rbf',
# gamma='scale').
svc = svm.SVC()
svc.fit(X,y)
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# 注:产生下面这个输出的代码单元在导出时缺失,推测为分别计算训练集和测试集上的准确率:
# svc.score(X,y),svc.score(Xtest,ytest)
(0.99325, 0.987)
可视化结果
# Score every vocabulary entry in isolation: row i of the identity matrix is
# a document that contains only word i, so its decision value shows how
# strongly that single word pushes the classifier towards "spam".
# X.shape[1] is the number of words (1899 for the matrix loaded above).
rang = np.eye(X.shape[1])
# `spam_val` was used without being created; build it with an 'idx' column
# holding the 0-based word position, matching the table printed further down.
spam_val = pd.DataFrame({'idx': range(X.shape[1])})
spam_val['isspam'] = svc.decision_function(rang)
# NOTE(review): the summary statistics printed below look like the output
# of describe() on this column; the call was missing from the export.
spam_val['isspam'].describe()
count 1899.000000
mean -0.110039
std 0.049094
min -0.428396
25% -0.131213
50% -0.111985
75% -0.091973
max 0.396286
Name: isspam, dtype: float64
# Keep only the words whose single-word document gets a positive decision value.
decision = spam_val[spam_val['isspam']>0]
decision
| | idx | isspam |
|---|---|---|
155 | 155 | 0.095529 |
173 | 173 | 0.066666 |
297 | 297 | 0.396286 |
351 | 351 | 0.023785 |
382 | 382 | 0.030317 |
476 | 476 | 0.042474 |
478 | 478 | 0.057344 |
529 | 529 | 0.060692 |
537 | 537 | 0.008558 |
680 | 680 | 0.109643 |
697 | 697 | 0.003269 |
738 | 738 | 0.092561 |
774 | 774 | 0.181496 |
791 | 791 | 0.040396 |
1008 | 1008 | 0.012187 |
1088 | 1088 | 0.132633 |
1101 | 1101 | 0.002832 |
1120 | 1120 | 0.003076 |
1163 | 1163 | 0.072045 |
1178 | 1178 | 0.012122 |
1182 | 1182 | 0.015656 |
1190 | 1190 | 0.232788 |
1263 | 1263 | 0.160806 |
1298 | 1298 | 0.044018 |
1372 | 1372 | 0.019640 |
1397 | 1397 | 0.218337 |
1399 | 1399 | 0.018762 |
1460 | 1460 | 0.001859 |
1467 | 1467 | 0.002822 |
1519 | 1519 | 0.001654 |
1661 | 1661 | 0.003775 |
1721 | 1721 | 0.057241 |
1740 | 1740 | 0.034107 |
1795 | 1795 | 0.125143 |
1823 | 1823 | 0.002071 |
1829 | 1829 | 0.002630 |
1851 | 1851 | 0.030662 |
1892 | 1892 | 0.052786 |
1894 | 1894 | 0.101613 |
# Read the tab-separated vocabulary list (no header row) into columns
# 'idx' and 'vocabulary'.
path = 'vocab.txt'
vocab = pd.read_csv(path,header=None,names=['idx','vocabulary'],sep='\t')
vocab.head()
| | idx | vocabulary |
|---|---|---|
| 0 | 1 | aa |
1 | 2 | ab |
2 | 3 | abil |
3 | 4 | abl |
4 | 5 | about |
# Look up the selected words by row label. decision['idx'] holds 0-based
# positions, which line up with vocab's default row labels; the vocab 'idx'
# column itself is 1-based (row label 155 -> idx 156 in the output below).
spamvocabulary = vocab.loc[list(decision['idx'])]
spamvocabulary
| | idx | vocabulary |
|---|---|---|
155 | 156 | basenumb |
173 | 174 | below |
297 | 298 | click |
351 | 352 | contact |
382 | 383 | credit |
476 | 477 | dollar |
478 | 479 | dollarnumb |
529 | 530 | |
537 | 538 | encod |
680 | 681 | free |
697 | 698 | futur |
738 | 739 | guarante |
774 | 775 | here |
791 | 792 | hour |
1008 | 1009 | market |
1088 | 1089 | nbsp |
1101 | 1102 | nextpart |
1120 | 1121 | numbera |
1163 | 1164 | offer |
1178 | 1179 | opt |
1182 | 1183 | order |
1190 | 1191 | our |
1263 | 1264 | pleas |
1298 | 1299 | price |
1372 | 1373 | receiv |
1397 | 1398 | remov |
1399 | 1400 | repli |
1460 | 1461 | se |
1467 | 1468 | see |
1519 | 1520 | sincer |
1661 | 1662 | text |
1721 | 1722 | transfer |
1740 | 1741 | type |
1795 | 1796 | visit |
1823 | 1824 | websit |
1829 | 1830 | welcom |
1851 | 1852 | will |
1892 | 1893 | you |
1894 | 1895 | your |