吴恩达机器学习ex6 python实现

支持向量机

在本练习中，我们将使用高斯核函数的支持向量机（SVM）来构建垃圾邮件分类器。

数据集1

先在2D数据集上实验

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat

# Load dataset 1 from the MATLAB file and collect it into one DataFrame:
# the 'X' matrix supplies the two feature columns, 'y' the label column.
raw_data = loadmat('ex6data1.mat')
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')
# Notebook-style display of the first rows.
data.head()

	X1	X2	y
0	1.9643	4.5957	1
1	2.2753	3.8589	1
2	2.9781	4.5651	1
3	2.9320	3.5519	1
4	3.5772	2.8560	1

# Visualization helper
def plot_init_data(data,fig,ax):
    """Scatter the two classes of ``data`` onto ``ax``.

    Rows with ``y == 1`` are drawn first as red circles labelled 'positive',
    then rows with ``y == 0`` as blue crosses labelled 'negative'. Both use
    the 'X1' column for the horizontal axis and 'X2' for the vertical axis.

    Args:
        data: DataFrame with columns 'X1', 'X2' and 'y'.
        fig: Accepted for call-site symmetry; not used by this function.
        ax: Object exposing a matplotlib-style ``scatter`` method.
    """
    class_styles = (
        (1, 'o', 'r', 'positive'),
        (0, 'x', 'b', 'negative'),
    )
    for target, marker, colour, name in class_styles:
        subset = data.loc[data['y'] == target]
        ax.scatter(subset['X1'], subset['X2'], s=50, marker=marker, c=colour, label=name)

# Draw the current `data` frame on a 12x8 figure and show the class legend.
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()

吴恩达机器学习ex6 python实现

可以看出左上角有一个异常点，但是整体依然呈现线性分布，所以可以调用线性支持向量机来学习类边界。

令C=1

from sklearn import svm
# Linear SVM with hinge loss, C=1, capped at 1000 solver iterations.
svc = svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
# Bare name: notebook-style display of the estimator's parameters.
svc

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)

# Fit on all of dataset 1, then score on the very same rows, so the value
# shown is training accuracy (no held-out data is involved here).
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])

0.9803921568627451

# Visualize the decision boundary
def find_decision_boundary(svc,x1min,x2min,x1max,x2max,diff,num=1000):
    """Locate grid points lying approximately on a classifier's decision boundary.

    A ``num`` x ``num`` grid spanning [x1min, x1max] x [x2min, x2max] is scored
    with ``svc.decision_function``; points whose absolute score is below
    ``diff`` are returned.

    Args:
        svc: Fitted object exposing ``decision_function`` for two-feature input.
        x1min, x2min: Lower bounds of the grid along the first/second feature.
        x1max, x2max: Upper bounds of the grid along the first/second feature.
        diff: Half-width of the band around zero that counts as "on the boundary".
        num: Number of grid samples per axis (default 1000, the original
            hard-coded resolution).

    Returns:
        Tuple ``(x1, x2)`` of pandas Series named 'x1' and 'x2' holding the
        coordinates of the selected points. Their index is each point's
        position in the flattened grid (x1 varies slowest, x2 fastest).
    """
    x1 = np.linspace(x1min,x1max,num)
    x2 = np.linspace(x2min,x2max,num)

    # indexing='ij' followed by ravel() enumerates the grid with x1 as the
    # outer loop and x2 as the inner loop, without building Python tuples.
    g1, g2 = np.meshgrid(x1, x2, indexing='ij')
    grid = np.column_stack((g1.ravel(), g2.ravel()))

    # Score a plain ndarray rather than a DataFrame with columns 'x1'/'x2':
    # recent scikit-learn versions compare DataFrame column names with the
    # names seen during fit, and callers fit on 'X1'/'X2' or on bare arrays.
    cval = np.asarray(svc.decision_function(grid)).ravel()

    on_boundary = np.flatnonzero(np.abs(cval) < diff)

    return (pd.Series(grid[on_boundary, 0], index=on_boundary, name='x1'),
            pd.Series(grid[on_boundary, 1], index=on_boundary, name='x2'))

# Sample the C=1 model over x1 in [0, 4], x2 in [1.5, 5] and keep the grid
# points whose absolute decision value is below 2e-3.
x1,x2 = find_decision_boundary(svc,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
# The boundary points are drawn before the data points.
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=1) decision boundary')
ax.legend()
plt.show()

吴恩达机器学习ex6 python实现

# Increase C to 100 and observe how the result changes.
# NOTE(review): the solver may not converge within max_iter=1000 at this C;
# check for a ConvergenceWarning before trusting the reported score.
svc2 = svm.LinearSVC(C=100,loss='hinge',max_iter=1000)
svc2.fit(data[['X1','X2']],data['y'])
# Accuracy on the same rows used for fitting (training accuracy).
svc2.score(data[['X1','X2']],data['y'])

0.9411764705882353

# Same grid and tolerance as the C=1 plot, now scored with the C=100 model.
x1,x2 = find_decision_boundary(svc2,0,1.5,4,5,2*10**-3)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(x1,x2,s=10,c='r',label='bound')
plot_init_data(data,fig,ax)
ax.set_title('SVM(C=100) decision boundary')
ax.legend()
plt.show()

吴恩达机器学习ex6 python实现

高斯内核的SVM

为了能更好的理解非线性SVM分类器，我们从头开始实现高斯内核，并不使用sklearn中已经封装好的包。

高斯内核

高斯内核是一个衡量一对数据间“距离”（相似度）的函数：$K(x_1,x_2)=\exp\left(-\frac{\lVert x_1-x_2\rVert^2}{2\sigma^2}\right)$。它有一个参数 $\sigma$，决定了相似性下降至0有多快（$\sigma$ 越小，下降越快；$\sigma$ 越大，下降越慢）。

def gaussian_kernel(x1,x2,sigma):
    """Return the Gaussian (RBF) similarity between ``x1`` and ``x2``.

    Computes ``exp(-sum((x1 - x2)**2) / (2 * sigma**2))``. The result is 1 for
    identical inputs and decays toward 0 as the squared distance grows.

    Args:
        x1, x2: NumPy arrays (or scalars) that support element-wise subtraction.
        sigma: Kernel bandwidth.
    """
    delta = x1 - x2
    squared_distance = np.sum(delta ** 2)
    bandwidth = 2 * (sigma ** 2)
    return np.exp(-(squared_distance / bandwidth))

# Sanity check of gaussian_kernel on two 3-element vectors with sigma = 2.
# Note: this rebinds the module-level names x1 and x2 used by the plots above.
x1 = np.array([1,2,1])
x2 = np.array([0,4,-1])
sigma = 2

gaussian_kernel(x1,x2,sigma)

0.32465246735834974

数据集2

在这个数据集中，我们使用高斯内核实现非线性分类器。这里将直接调用sklearn中的svm包。

# Load dataset 2. The file name is given without an extension; scipy's
# loadmat appends '.mat' by default (appendmat=True).
raw_data = loadmat('ex6data2')
data = pd.DataFrame(raw_data['X'],columns=['X1','X2'])
data['y'] = raw_data['y']

# Plot both classes with a legend.
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()

吴恩达机器学习ex6 python实现

# SVC with C=100 and gamma=10; no kernel is passed, so the library default
# applies. NOTE(review): probability=True is not used by any code visible
# here (no predict_proba call) -- confirm it is needed.
svc = svm.SVC(C=100,gamma=10,probability=True)
# Bare name: notebook-style display of the estimator's parameters.
svc

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

# Fit on all of dataset 2 and report accuracy on the same rows.
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])

0.9698725376593279

# Grid over x1 in [0, 1], x2 in [0.4, 1]; keep points with |decision value| < 0.01.
x1,x2 = find_decision_boundary(svc,0,0.4,1,1,0.01)
fig,ax = plt.subplots(figsize=(12,8))
# Here the data points are drawn first and the boundary (green) on top.
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
plt.show()

吴恩达机器学习ex6 python实现

数据集3

这个数据集已经把X和y按照训练集和验证集划分好了，所以只需要找到最优的超参数即可。即，寻找最优的 $C$ 和 $\sigma$。我们给定数值范围：[0.01,0.03,0.1,0.3,1,3,10,30,100]。注意：下面的代码直接搜索 sklearn 的 gamma 参数，它与 $\sigma$ 的关系为 $\gamma = 1/(2\sigma^2)$。

# Load dataset 3, which ships with separate training ('X', 'y') and
# validation ('Xval', 'yval') arrays.
raw_data = loadmat('ex6data3.mat')

X = raw_data['X']
Xval = raw_data['Xval']
y = raw_data['y']
yval = raw_data['yval']

# Build a plotting frame from the training part only.
fig,ax = plt.subplots(figsize=(12,8))
data = pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
data['y'] = raw_data.get('y')

plot_init_data(data,fig,ax)
ax.legend()
plt.show()

吴恩达机器学习ex6 python实现

# Candidate values tried for each of the two hyper-parameters.
C_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]
gamma_candidates = [0.01,0.03,0.1,0.3,1,3,10,30,100]

# Best validation accuracy seen so far and the parameters that produced it.
best_score = 0
finalC = 0
finalgamma = 0

# scikit-learn expects 1-D label vectors. The labels come straight from
# loadmat (presumably as (m, 1) column vectors), so flatten them once here;
# np.ravel leaves already-1-D input unchanged.
y_train = np.ravel(y)
y_valid = np.ravel(yval)

for C in C_candidates:
    for gamma in gamma_candidates:
        # Train on the training split, evaluate on the validation split.
        svc = svm.SVC(C=C,gamma=gamma)
        svc.fit(X,y_train)
        score = svc.score(Xval,y_valid)

        # Strict '>' keeps the first combination that reaches the best score.
        if score>best_score:
            best_score = score
            finalC = C
            finalgamma = gamma

# Notebook-style display of the winning score and parameters.
best_score,finalC,finalgamma

(0.965, 0.3, 100)

# Refit with the selected hyper-parameters. The labels are flattened because
# scikit-learn expects a 1-D target (np.ravel is a no-op on 1-D input).
svc = svm.SVC(C=finalC,gamma=finalgamma)
svc.fit(X,np.ravel(y))

# Grid over x1 in [-0.6, 0.3], x2 in [-0.7, 0.6]; keep points with
# |decision value| < 0.005 and draw them (green) over the data.
x1,x2 = find_decision_boundary(svc,-0.6,-0.7,0.3,0.6,0.005)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(x1,x2,s=10,c='g')
ax.legend()
plt.show()

吴恩达机器学习ex6 python实现

垃圾邮件分类

我们通过构建SVM来构建垃圾邮件分类器

训练SVM

# Load the spam training and test sets from their MATLAB files.
spam_train = loadmat('spamTrain.mat')
spam_test = loadmat('spamTest.mat')

# Notebook-style display of the loaded dictionary (keys and arrays).
spam_train

{'X': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 '__globals__': [],
 '__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13 14:27:25 2011',
 '__version__': '1.0',
 'y': array([[1],
        [1],
        [0],
        ...,
        [1],
        [0],
        [0]], dtype=uint8)}

# Pull out features and labels; ravel() flattens each label array to 1-D.
X = spam_train['X']
Xtest = spam_test['Xtest']
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()

# Display the four shapes as a quick consistency check.
X.shape,y.shape,Xtest.shape,ytest.shape

((4000, 1899), (4000,), (1000, 1899), (1000,))

这里可以理解为每个文档都是一个向量，1899个维度对应1899个单词，每个维度的取值仅为0或1，用于表示这个文档中是否出现了这个单词

# Train an SVC with all-default parameters on the spam training set.
svc = svm.SVC()
svc.fit(X,y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

svc.score(X,y),svc.score(Xtest,ytest)

(0.99325, 0.987)

可视化结果

# Score every vocabulary word in isolation: row i of the identity matrix is
# a feature vector in which only word i is present.
rang = np.eye(1899)
# The results frame was never created in the original cell (NameError);
# 'idx' holds the 0-based feature position of each word.
spam_val = pd.DataFrame({'idx': range(rang.shape[0])})
spam_val['isspam'] = svc.decision_function(rang)
# Summary statistics of the per-word decision values.
spam_val['isspam'].describe()

count    1899.000000
mean       -0.110039
std         0.049094
min        -0.428396
25%        -0.131213
50%        -0.111985
75%        -0.091973
max         0.396286
Name: isspam, dtype: float64

# Keep only the words whose isolated decision value is positive
# (presumably the spam side of the boundary, i.e. label 1 -- confirm).
decision = spam_val[spam_val['isspam']>0]
decision

idx	isspam
155	155	0.095529
173	173	0.066666
297	297	0.396286
351	351	0.023785
382	382	0.030317
476	476	0.042474
478	478	0.057344
529	529	0.060692
537	537	0.008558
680	680	0.109643
697	697	0.003269
738	738	0.092561
774	774	0.181496
791	791	0.040396
1008	1008	0.012187
1088	1088	0.132633
1101	1101	0.002832
1120	1120	0.003076
1163	1163	0.072045
1178	1178	0.012122
1182	1182	0.015656
1190	1190	0.232788
1263	1263	0.160806
1298	1298	0.044018
1372	1372	0.019640
1397	1397	0.218337
1399	1399	0.018762
1460	1460	0.001859
1467	1467	0.002822
1519	1519	0.001654
1661	1661	0.003775
1721	1721	0.057241
1740	1740	0.034107
1795	1795	0.125143
1823	1823	0.002071
1829	1829	0.002630
1851	1851	0.030662
1892	1892	0.052786
1894	1894	0.101613

# Read the vocabulary list: tab-separated, no header row, two columns that
# are named 'idx' and 'vocabulary' here.
path = 'vocab.txt'
vocab = pd.read_csv(path,header=None,names=['idx','vocabulary'],sep='\t')
vocab.head()

	idx	vocabulary
0	1	aa
1	2	ab
2	3	abil
3	4	abl
4	5	about

# Look up the selected words by row label. The values in decision['idx'] are
# used as labels into vocab's row index, not matched against vocab's own
# 'idx' column (which appears to be 1-based in the displayed head -- confirm).
spamvocabulary = vocab.loc[list(decision['idx'])]
spamvocabulary

idx	vocabulary
155	156	basenumb
173	174	below
297	298	click
351	352	contact
382	383	credit
476	477	dollar
478	479	dollarnumb
529	530	email
537	538	encod
680	681	free
697	698	futur
738	739	guarante
774	775	here
791	792	hour
1008	1009	market
1088	1089	nbsp
1101	1102	nextpart
1120	1121	numbera
1163	1164	offer
1178	1179	opt
1182	1183	order
1190	1191	our
1263	1264	pleas
1298	1299	price
1372	1373	receiv
1397	1398	remov
1399	1400	repli
1460	1461	se
1467	1468	see
1519	1520	sincer
1661	1662	text
1721	1722	transfer
1740	1741	type
1795	1796	visit
1823	1824	websit
1829	1830	welcom
1851	1852	will
1892	1893	you
1894	1895	your

吴恩达机器学习ex6 python实现

支持向量机

数据集1

高斯内核的SVM

高斯内核

数据集2

数据集3

垃圾邮件分类

训练SVM

可视化结果

继续阅读

简单文档分类——朴素贝叶斯算法朴素贝叶斯算法简单文档分类实例步骤总结朴素贝叶斯分类调用(sklearn)

【分类算法】什么是分类算法定义分类与聚类分类过程方法

分类算法的评价指标

K-近邻算法以及图像分类应用

weka之NB算法

使用weka的select attribute

weka中分类器算法

在weka中集成自己的算法

【多变量线性回归】学习记录序思路实现终

申请评分模型拒绝推断（RI）方法申请评分模型拒绝推断（RI）方法

【人工智能行业大师访谈1】吴恩达采访 Geoffery Hinton

【趋高机器视觉】机器视觉技术原理解析及解决方案

吴恩达 coursera ML 第七课总结+作业答案前言目录正文模型表示作业答案

XGBoost Plotting API以及GBDT组合特征实践 XGBoost Plotting API以及GBDT组合特征实践

解码器用于语义分割：数据依赖的解码可以实现灵活的特征聚合

2021-2025年中国运动疗法（KT）带行业市场供需与战略研究报告