# 导入所需要的包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split

1 获取数据

# Load the training CSV into a DataFrame and preview its first rows
train = pd.read_csv("./train.csv")
train.head()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Inspect the (rows, columns) dimensions of the training frame
train.shape

(42000, 785)

# 1.1  Separate the feature values from the target values
# Every column after the first one is kept as the feature frame
train_image = train.iloc[:, 1:]
train_image.head()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# The first column is taken as the target (label) series
train_label = train.iloc[:, 0]
train_label.head()

0 1

1 0

2 1

3 4

4 0

Name: label, dtype: int64

# 1.2  Look at one concrete image
# Reshape the first row of features into a 28x28 grid and render it without axes
num = train_image.iloc[0,].values.reshape(28, 28)
plt.imshow(num)
plt.axis("off")
plt.show()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Helper for drawing a single training image
def to_plot(n):
    """Render row ``n`` of the module-level ``train_image`` frame as a 28x28 image.

    Args:
        n: Positional row index into ``train_image``.
    """
    row = train_image.iloc[n,]
    pixels = row.values.reshape(28, 28)
    plt.imshow(pixels)
    plt.axis("off")
    plt.show()

# Show the training sample at row position 100
to_plot(n=100)

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

2 数据基本处理

# 2.1  Normalise the data
# Divide every feature value by 255 (assumes 8-bit pixel intensities — TODO confirm);
# taking .values also turns the DataFrame into a NumPy array
train_image = train_image.values / 255

# Convert the label Series to a NumPy array as well
train_label = train_label.values

# 2.2  Split the data set

# 80% of the samples go to training, the rest to validation; the seed is fixed
x_train, x_val, y_train, y_val = train_test_split(
    train_image,
    train_label,
    train_size=0.8,
    random_state=0,
)
print(x_train.shape, x_val.shape)

(33600, 784) (8400, 784)

3 特征降维和模型训练

import time
from sklearn.decomposition import PCA

# Run PCA with several settings to settle on the best final model

def n_components_analysis(n, x_train, y_train, x_val, y_val):
    """Reduce the data with PCA, train an SVC on it and time the whole run.

    Args:
        n: Value forwarded to ``PCA(n_components=...)``.
        x_train: Training features; the PCA is fitted on these only.
        y_train: Training labels.
        x_val: Validation features, projected with the fitted PCA.
        y_val: Validation labels used for scoring.

    Returns:
        Tuple ``(accuracy, elapsed)``: the SVC score on the validation split
        and the wall-clock seconds spent on PCA, training and scoring.
    """
    # Start the clock before the PCA so that reduction time is included
    started_at = time.time()

    # Fit the reduction on the training split only
    reducer = PCA(n_components=n)
    print("特征降维,传递的参数为:{}".format(n))
    reducer.fit(x_train)

    # Project both splits into the same reduced space
    train_reduced = reducer.transform(x_train)
    val_reduced = reducer.transform(x_val)

    # Train an SVC with default settings on the reduced training data
    print("开始使用svc进行训练")
    classifier = svm.SVC()
    classifier.fit(train_reduced, y_train)

    # Score on the reduced validation data
    accuracy = classifier.score(val_reduced, y_val)

    # Stop the clock; the printed value is truncated to whole seconds
    elapsed = time.time() - started_at
    print("准确率是:{}, 消耗时间是:{}s".format(accuracy, int(elapsed)))

    return accuracy, elapsed

# Try several n_components values to find a reasonable one:

# Five evenly spaced values from 0.70 to 0.90
n_s = np.linspace(0.70, 0.90, num=5)

# Each run returns (accuracy, elapsed seconds); collect them in order
results = [n_components_analysis(n, x_train, y_train, x_val, y_val) for n in n_s]
accuracy = [score for score, _ in results]
times = [elapsed for _, elapsed in results]

特征降维,传递的参数为:0.7

开始使用svc进行训练

准确率是:0.9761904761904762, 消耗时间是:16s

特征降维,传递的参数为:0.75

开始使用svc进行训练

准确率是:0.9785714285714285, 消耗时间是:20s

特征降维,传递的参数为:0.8

开始使用svc进行训练

准确率是:0.979047619047619, 消耗时间是:25s

特征降维,传递的参数为:0.85

开始使用svc进行训练

准确率是:0.9803571428571428, 消耗时间是:31s

特征降维,传递的参数为:0.9

开始使用svc进行训练

准确率是:0.9805952380952381, 消耗时间是:44s

# Visualise accuracy against the n_components value (red line)
plt.plot(n_s, np.array(accuracy), "r")
plt.show()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Visualise elapsed time against the n_components value (red line)
plt.plot(n_s, np.array(times), "r")
plt.show()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Plot elapsed time (y axis) against accuracy (x axis), one triangle marker per run
plt.plot( np.array(accuracy),np.array(times), "r", marker='^')
plt.xlabel('Accuracy')
plt.ylabel('Loss Time')
plt.show()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

4 确定最优模型

# Fit the final PCA on the training split; fit() returns the estimator itself
pca = PCA(n_components=0.90).fit(x_train)
pca.n_components_

# Project both splits with the same fitted PCA
x_train_pca, x_val_pca = pca.transform(x_train), pca.transform(x_val)

print(x_train_pca.shape, x_val_pca.shape)

(33600, 87) (8400, 87)

# Train the selected model and compute its accuracy on the validation split.
# The estimator is bound to `ss1`, not `svm`: reusing the name `svm` would
# shadow the imported sklearn `svm` module, so any later `svm.SVC()` call
# (for example re-running n_components_analysis) would fail. The prediction
# step further down also reads the classifier as `ss1`.

ss1 = svm.SVC()

ss1.fit(x_train_pca, y_train)

ss1.score(x_val_pca, y_val)

0.9805952380952381

# Load the test set and preview its first rows
test = pd.read_csv("./test.csv")
test.head()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Inspect the (rows, columns) dimensions of the test frame
test.shape

(28000, 784)

# Look at the second test image (row position 1), reshaped to 28x28
num1 = test.iloc[1,].values.reshape(28, 28)
plt.imshow(num1)
plt.axis("off")
plt.show()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Look at the last test image (row position -1), reshaped to 28x28
num2 = test.iloc[-1,].values.reshape(28, 28)
# Bare expression: has no effect when run as a script
num2
plt.imshow(num2)
plt.axis("off")
plt.show()

SVM手写字体识别1 获取数据2 数据基本处理3 特征降维和模型训练4 确定最优模型

# Normalise the test features exactly like the training features
test = test.values / 255

# Project the test set with the PCA that was fitted on x_train. Fitting a
# fresh PCA on the test data would give a different basis (and possibly a
# different number of components) from the one the classifier was trained
# in, so its predictions would not be meaningful.
pca.n_components_

test_pca = pca.transform(test)
test_pca.shape

(28000, 87)

# Predict a label for every PCA-projected test sample
# NOTE(review): `ss1` must be the SVC fitted on x_train_pca — confirm it is defined before this line
test_pred = ss1.predict(test_pca)
test_pred

array([2, 0, 8, …, 7, 3, 2], dtype=int64)

SVM手写字体识别：1 获取数据 · 2 数据基本处理 · 3 特征降维和模型训练 · 4 确定最优模型

1 获取数据

2 数据基本处理

3 特征降维和模型训练

4 确定最优模型

继续阅读

XGBoost Plotting API以及GBDT组合特征实践

解码器用于语义分割：数据依赖的解码可以实现灵活的特征聚合

YAML简介和PyYAML安全操作YAML支持的类型YAML的优点：yaml的基本语法python操作

2021-2025年中国运动疗法（KT）带行业市场供需与战略研究报告

Small tricks

libsvm for python 安装

学习软件测试基础测试第七天

Zeppelin 配置访问 REST APIApache Zeppelin Configuration REST API

【Torch】最简洁logging使用指南

27. Remove Element(列表)题目代码

Cloud Studio初体验

使用 ctypes 进行 Python 和 C 的混合编程

【python】【数据处理】画多维数据分布图

【python】netconf协议对接管理设备

「Python 网络自动化」NETCONF —— Python 使用 NETCONF 管理配置 H3C 网络设备

在python中创建excel并写入