
Andrew Ng Machine Learning ex6: A Python Implementation

Support Vector Machines

In this exercise, we will use support vector machines (SVMs) with Gaussian kernels to build a spam classifier.

Dataset 1

We start by experimenting on a 2D dataset.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
           
raw_data = loadmat('ex6data1.mat')
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')
data.head()
           
       X1      X2  y
0  1.9643  4.5957  1
1  2.2753  3.8589  1
2  2.9781  4.5651  1
3  2.9320  3.5519  1
4  3.5772  2.8560  1
# Visualize the data: positives as red circles, negatives as blue crosses
def plot_init_data(data,fig,ax):
    positive = data[data['y']==1]
    negative = data[data['y']==0]
    ax.scatter(positive['X1'],positive['X2'],s=50,marker='o',c='r',label='positive')
    ax.scatter(negative['X1'],negative['X2'],s=50,marker='x',c='b',label='negative')
           
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
           
[Figure: scatter plot of dataset 1]

There is one outlier in the upper left, but the data is otherwise roughly linearly separable, so we can call a linear SVM to learn the class boundary.

Set C = 1. In sklearn's formulation, $C$ acts as an inverse regularization strength: the larger $C$ is, the more heavily misclassified points are penalized.
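For reference, LinearSVC with loss='hinge' essentially minimizes the soft-margin objective (with labels $y \in \{-1, 1\}$):

$$\min_{w,b}\;\frac{1}{2}\lVert w\rVert^2 + C\sum_{i=1}^{m}\max\!\left(0,\;1 - y^{(i)}\left(w^\top x^{(i)} + b\right)\right)$$

so a small $C$ tolerates margin violations (strong regularization), while a large $C$ tries to classify every training point correctly.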

from sklearn import svm
svc = svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
svc
           
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)
           
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
           
0.9803921568627451
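Dataset 1 has 51 examples, so this score is exactly 50/51 ≈ 0.9804: every point is classified correctly except one (the outlier).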
           
# Visualize the decision boundary by sampling a dense grid and keeping
# the points where the decision function is close to zero
def find_decision_boundary(svc, x1min, x2min, x1max, x2max, diff):
    x1 = np.linspace(x1min, x1max, 1000)
    x2 = np.linspace(x2min, x2max, 1000)

    coordinates = [(x, y) for x in x1 for y in x2]
    x_cord, y_cord = zip(*coordinates)  # unzip the pairs into two tuples
    c_val = pd.DataFrame({'x1': x_cord, 'x2': y_cord})
    c_val['cval'] = svc.decision_function(c_val[['x1', 'x2']])

    # keep only grid points whose decision value is within `diff` of zero
    decision = c_val[np.abs(c_val['cval']) < diff]

    return decision.x1, decision.x2
           
x1,x2 = find_decision_boundary(svc,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=1) decision boundary')
ax.legend()
plt.show()
           
[Figure: SVM (C=1) decision boundary on dataset 1]
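As an aside, a more common way to draw the boundary is to evaluate the decision function on a meshgrid and plot its zero-level contour. A minimal sketch (plot_boundary_contour is my own helper name, not part of the exercise):

# Sketch: draw the boundary as the zero-level contour of the decision
# function, instead of scattering near-zero grid points
def plot_boundary_contour(svc, x1min, x1max, x2min, x2max, ax):
    xx1, xx2 = np.meshgrid(np.linspace(x1min, x1max, 500),
                           np.linspace(x2min, x2max, 500))
    zz = svc.decision_function(np.c_[xx1.ravel(), xx2.ravel()]).reshape(xx1.shape)
    ax.contour(xx1, xx2, zz, levels=[0], colors='r')  # zero level = class boundary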
# Increase C and observe how the boundary changes
svc2 = svm.LinearSVC(C=100,loss='hinge',max_iter=1000)
svc2.fit(data[['X1','X2']],data['y'])
svc2.score(data[['X1','X2']],data['y'])
           
0.9411764705882353

With $C=100$ only 48 of the 51 training points are classified correctly. (LinearSVC with loss='hinge' may also raise a ConvergenceWarning at max_iter=1000 here, so the exact score can vary slightly between runs.)
           
x1,x2 = find_decision_boundary(svc2,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=100) decision boundary')
ax.legend()
plt.show()
           
[Figure: SVM (C=100) decision boundary on dataset 1]

SVM with a Gaussian Kernel

To better understand nonlinear SVM classifiers, we implement the Gaussian kernel from scratch instead of using the version already packaged in sklearn.

The Gaussian Kernel

The Gaussian kernel is a function that measures the "distance" between a pair of examples. It has a single parameter $\sigma$, which determines how quickly the similarity falls to 0 as the examples move apart (the smaller $\sigma$, the faster it falls).
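Concretely, the kernel implemented below is

$$K\!\left(x^{(1)}, x^{(2)}\right) = \exp\!\left(-\frac{\lVert x^{(1)} - x^{(2)} \rVert^2}{2\sigma^2}\right)$$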

def gaussian_kernel(x1,x2,sigma):
    return np.exp(-(np.sum((x1-x2)**2)/(2*(sigma**2))))
           
x1 = np.array([1,2,1])
x2 = np.array([0,4,-1])
sigma = 2

gaussian_kernel(x1,x2,sigma)
           
0.32465246735834974
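As a quick sanity check: $\lVert x_1 - x_2 \rVert^2 = 1^2 + (-2)^2 + 2^2 = 9$, and $\exp\!\left(-\frac{9}{2 \cdot 2^2}\right) = e^{-9/8} \approx 0.3247$, matching the output above.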
           

Dataset 2

On this dataset we build a nonlinear classifier using the Gaussian kernel. This time we call sklearn's svm package directly.
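Note that sklearn parameterizes the RBF kernel as $\exp(-\gamma\lVert x - x'\rVert^2)$ rather than with $\sigma$; the two are related by

$$\gamma = \frac{1}{2\sigma^2}$$

so the gamma=10 used below corresponds to $\sigma \approx 0.22$.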

raw_data = loadmat('ex6data2.mat')
data = pd.DataFrame(raw_data['X'],columns=['X1','X2'])
data['y'] = raw_data['y']

fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
           
[Figure: scatter plot of dataset 2]
svc = svm.SVC(C=100,gamma=10,probability=True)
svc
           
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
           
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
           
0.9698725376593279
           
x1,x2 = find_decision_boundary(svc,0,0.4,1,1,0.01)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
plt.show()
           
[Figure: Gaussian-kernel SVM decision boundary on dataset 2]

Dataset 3

In this dataset, X and y are already split into a training set and a validation set, so we only need to find the optimal hyperparameters, i.e. search for the best $C$ and $\sigma$. Both are drawn from the candidate values [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100].

raw_data = loadmat('ex6data3.mat')

X = raw_data['X']
Xval = raw_data['Xval']
y = raw_data['y']
yval = raw_data['yval']

fig,ax = plt.subplots(figsize=(12,8))
data = pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')

plot_init_data(data,fig,ax)
ax.legend()
plt.show()


           
[Figure: scatter plot of dataset 3]
C_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]
gamma_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]

best_score = 0
finalC = 0
finalgamma = 0

for C in C_candidates:
    for gamma in gamma_candidates:
        svc = svm.SVC(C=C,gamma=gamma)
        svc.fit(X, y.ravel())  # ravel: sklearn expects a 1-D label array
        score = svc.score(Xval,yval)
        
        if score>best_score:
            best_score = score
            finalC = C
            finalgamma = gamma

best_score,finalC,finalgamma
            
           
(0.965, 0.3, 100)
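The manual double loop above can also be expressed with sklearn's GridSearchCV, using a PredefinedSplit so that scoring uses the given validation set rather than cross-validation folds. A minimal sketch, assuming the variables defined above:

from sklearn.model_selection import GridSearchCV, PredefinedSplit

# -1 marks rows used only for training; 0 marks the single validation fold
test_fold = np.r_[-np.ones(len(X)), np.zeros(len(Xval))]
search = GridSearchCV(svm.SVC(),
                      {'C': C_candidates, 'gamma': gamma_candidates},
                      cv=PredefinedSplit(test_fold))
search.fit(np.vstack([X, Xval]), np.r_[y.ravel(), yval.ravel()])
search.best_score_, search.best_params_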
           
svc = svm.SVC(C=finalC, gamma=finalgamma)
svc.fit(X, y.ravel())  # refit with the best hyperparameters found above

x1,x2 = find_decision_boundary(svc,-0.6,-0.7,0.3,0.6,0.005)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
ax.legend()
plt.show()
           
[Figure: decision boundary of the tuned SVM on dataset 3]

Spam Classification

We now build the spam classifier by training an SVM.

Training the SVM

spam_train = loadmat('spamTrain.mat')
spam_test = loadmat('spamTest.mat')

spam_train
           
{'X': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13 14:27:25 2011',
 '__version__': '1.0',
 'y': array([[1],
        [1],
        [0],
        ...,
        [1],
        [0],
        [0]], dtype=uint8)}
           
X = spam_train['X']
Xtest = spam_test['Xtest']
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()

X.shape,y.shape,Xtest.shape,ytest.shape
           
((4000, 1899), (4000,), (1000, 1899), (1000,))
           

At this point each document has been encoded as a vector: the 1899 dimensions correspond to the 1899 words in the vocabulary, and each dimension is a 0/1 flag indicating whether the document contains that word.
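For intuition, a preprocessed email could be mapped to such a vector along these lines (a sketch; email_to_feature_vector and word_indices are hypothetical names, not part of the exercise files, and word_indices is assumed to hold the 1-based vocabulary indices found in the email):

# Sketch: turn a list of 1-based vocabulary indices into a 0/1 feature vector
def email_to_feature_vector(word_indices, n_words=1899):
    x = np.zeros(n_words)
    x[np.asarray(word_indices) - 1] = 1  # vocab idx is 1-based, columns 0-based
    return x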

svc = svm.SVC()
svc.fit(X,y)
           
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
           
svc.score(X,y), svc.score(Xtest,ytest)

(0.99325, 0.987)

The classifier reaches 99.3% accuracy on the training set and 98.7% on the test set.
           

Visualizing the Results

rang = np.eye(1899)  # row i = a "document" containing only word i
spam_val = pd.DataFrame({'idx': range(1899)})
spam_val['isspam'] = svc.decision_function(rang)  # per-word spam signal
spam_val['isspam'].describe()
           
count    1899.000000
mean       -0.110039
std         0.049094
min        -0.428396
25%        -0.131213
50%        -0.111985
75%        -0.091973
max         0.396286
Name: isspam, dtype: float64
           
# Keep the words with a positive decision value, i.e. spam indicators
decision = spam_val[spam_val['isspam']>0]
decision
           
idx isspam
155 155 0.095529
173 173 0.066666
297 297 0.396286
351 351 0.023785
382 382 0.030317
476 476 0.042474
478 478 0.057344
529 529 0.060692
537 537 0.008558
680 680 0.109643
697 697 0.003269
738 738 0.092561
774 774 0.181496
791 791 0.040396
1008 1008 0.012187
1088 1088 0.132633
1101 1101 0.002832
1120 1120 0.003076
1163 1163 0.072045
1178 1178 0.012122
1182 1182 0.015656
1190 1190 0.232788
1263 1263 0.160806
1298 1298 0.044018
1372 1372 0.019640
1397 1397 0.218337
1399 1399 0.018762
1460 1460 0.001859
1467 1467 0.002822
1519 1519 0.001654
1661 1661 0.003775
1721 1721 0.057241
1740 1740 0.034107
1795 1795 0.125143
1823 1823 0.002071
1829 1829 0.002630
1851 1851 0.030662
1892 1892 0.052786
1894 1894 0.101613
path = 'vocab.txt'
vocab = pd.read_csv(path,header=None,names=['idx','vocabulary'],sep='\t')
vocab.head()
           
   idx vocabulary
0    1         aa
1    2         ab
2    3       abil
3    4        abl
4    5      about
# decision['idx'] holds 0-based feature columns; vocab's row labels are also
# 0-based (row i stores the word with 1-based idx i+1), so .loc aligns them
spamvocabulary = vocab.loc[list(decision['idx'])]
spamvocabulary
           
idx vocabulary
155 156 basenumb
173 174 below
297 298 click
351 352 contact
382 383 credit
476 477 dollar
478 479 dollarnumb
529 530 email
537 538 encod
680 681 free
697 698 futur
738 739 guarante
774 775 here
791 792 hour
1008 1009 market
1088 1089 nbsp
1101 1102 nextpart
1120 1121 numbera
1163 1164 offer
1178 1179 opt
1182 1183 order
1190 1191 our
1263 1264 pleas
1298 1299 price
1372 1373 receiv
1397 1398 remov
1399 1400 repli
1460 1461 se
1467 1468 see
1519 1520 sincer
1661 1662 text
1721 1722 transfer
1740 1741 type
1795 1796 visit
1823 1824 websit
1829 1830 welcom
1851 1852 will
1892 1893 you
1894 1895 your
