C++實作KNN算法
/*
* @Description: C++實作KNN算法
* @Author: szq
* @Github: https://github.com/MrQqqq
* @Date: 2020-07-08 19:13:25
* @LastEditors: szq
* @LastEditTime: 2020-07-09 16:50:55
* @FilePath: \cpp\src\KNN\KNN.cpp
*/
#include <time.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <random>
#include <string>
#include <utility>
#include <vector>
using namespace std;
/**
 * @brief Split a string into the fields separated by a delimiter character.
 *
 * The delimiter itself is never part of a returned field. Two adjacent
 * delimiters yield an empty field between them. A delimiter at the very end of
 * the string does not add a trailing empty field, and an empty input yields an
 * empty result.
 *
 * @param s    Source string. It is only read; the parameter stays a non-const
 *             reference so that existing callers compile unchanged.
 * @param mode Delimiter character.
 * @return The fields of s in order of appearance.
 */
std::vector<std::string> split(std::string &s,char mode){
    std::vector<std::string> res;
    std::string::size_type start = 0;
    // Walk the string with a moving offset instead of repeatedly chopping the
    // front off, so the input is left intact and no tail is re-copied.
    while(start < s.size()){
        const std::string::size_type index = s.find(mode,start);
        if(index == std::string::npos){
            // No further delimiter: the remainder is the last field.
            res.push_back(s.substr(start));
            break;
        }
        // Copy up to, but not including, the delimiter.
        res.push_back(s.substr(start,index - start));
        start = index + 1;
    }
    return res;
}
/**
 * @brief Read a stream line by line and store each line as ','-separated fields.
 *
 * Every non-empty line is passed to split() with ',' as the delimiter and the
 * resulting field list is appended to lines. A trailing '\r' is removed first,
 * so files with CRLF line endings read the same as LF files even when the
 * stream was opened in binary mode. Empty lines are skipped and not counted.
 *
 * @param in    Input stream to read from. If it is already in a failed state
 *              (for example the file could not be opened) nothing is read.
 * @param lines Receives one field list per stored line; existing content is
 *              kept and new rows are appended after it.
 * @return Number of rows appended to lines by this call.
 */
int getFileRows(std::ifstream &in,std::vector<std::vector<std::string>> &lines){
    int rows = 0;
    std::string line;
    // Loop on the result of getline itself: this stops on end-of-file AND on
    // any stream failure, and never produces a phantom row after the final
    // newline the way a `while(!in.eof())` loop does.
    while(std::getline(in,line)){
        if(!line.empty() && line.back() == '\r'){
            line.pop_back();
        }
        if(line.empty()){
            continue;
        }
        lines.push_back(split(line,','));
        rows++;
    }
    return rows;
}
/**
 * @brief Load a comma-separated data file and split its rows at random into a
 *        training set and a test set.
 *
 * Each usable row must have at least five fields; the first five are converted
 * with atof and stored as one sample (the callers in this file treat index 4
 * as the class label). Rows with fewer than five fields are skipped. For every
 * sample a pseudo-random number in [0,1] is drawn: if it is below rate the
 * sample goes to trainingSet, otherwise to testSet.
 *
 * The C random generator is re-seeded from the current time on every call.
 * If the file cannot be opened, a message is written to std::cerr and both
 * output sets are left untouched.
 *
 * @param filepath    Path of the data file to read.
 * @param rate        Approximate fraction of samples placed in the training set.
 * @param trainingSet Receives the training samples (appended).
 * @param testSet     Receives the test samples (appended).
 */
void loadDataset(std::string &filepath,double &rate,std::vector<std::vector<double>> &trainingSet,std::vector<std::vector<double>> &testSet){
    const std::size_t kColumns = 5; // four features followed by the class label

    // Open the file the caller asked for, in binary mode as before.
    std::ifstream input(filepath.c_str(),std::ios::in | std::ios::binary);
    if(!input.is_open()){
        std::cerr << "loadDataset: cannot open " << filepath << std::endl;
        return;
    }

    std::vector<std::vector<std::string>> lines;
    const int rows = getFileRows(input,lines);

    srand(static_cast<unsigned int>(time(nullptr)));
    for(int i = 0;i < rows;i++){
        const std::vector<std::string> &fields = lines[i];
        // Guard the fixed-width indexing below against blank or short rows.
        if(fields.size() < kColumns){
            continue;
        }
        std::vector<double> sample(kColumns);
        for(std::size_t j = 0;j < kColumns;j++){
            sample[j] = atof(fields[j].c_str());
        }
        if(rand() / double(RAND_MAX) < rate){
            trainingSet.push_back(sample);
        }
        else{
            testSet.push_back(sample);
        }
    }
    // The stream is closed by its destructor.
}
/**
 * @brief Euclidean distance between two instances over their leading components.
 *
 * Only indices 0 .. length-1 of both vectors are read; no bounds check is
 * performed. A length of zero or less yields 0.
 *
 * @param instance1 First instance.
 * @param instance2 Second instance.
 * @param length    Number of leading components (features) to compare.
 * @return Square root of the summed squared component differences.
 */
double calculateDistance(vector<double> &instance1,vector<double> &instance2,int length){
    double sumOfSquares = 0;
    int feature = 0;
    while(feature < length){
        const double gap = instance1[feature] - instance2[feature];
        sumOfSquares += pow(gap,2);
        ++feature;
    }
    return sqrt(sumOfSquares);
}
/**
 * @brief Pick the k training samples closest to a test instance.
 *
 * Distances are computed with calculateDistance() over the first
 * testInstance.size() - 1 components, i.e. the last component of the test
 * instance is not used as a feature. Samples at equal distance are ordered by
 * their position in trainingSet.
 *
 * @param trainingSet  Training samples to search.
 * @param testInstance Instance whose neighbours are wanted.
 * @param k            Number of neighbours requested. If it exceeds the number
 *                     of training samples, all samples are returned; if it is
 *                     zero or negative, the result is empty.
 * @return Copies of the selected samples, nearest first.
 */
std::vector<std::vector<double>> getNeighbors(std::vector<std::vector<double>> &trainingSet,std::vector<double> &testInstance,int k){
    std::vector<std::vector<double>> neighbors;
    if(k <= 0 || trainingSet.empty()){
        return neighbors;
    }

    // Convert before subtracting so an empty instance gives -1, not a huge
    // unsigned value.
    const int featureCount = static_cast<int>(testInstance.size()) - 1;

    // Rank (distance, index) pairs rather than copies of the samples; the
    // index doubles as a deterministic tie-breaker.
    std::vector<std::pair<double,std::size_t>> ranked;
    ranked.reserve(trainingSet.size());
    for(std::size_t i = 0;i < trainingSet.size();++i){
        ranked.emplace_back(calculateDistance(testInstance,trainingSet[i],featureCount),i);
    }

    // Never read past the end when fewer than k samples are available.
    const std::size_t take = std::min(static_cast<std::size_t>(k),ranked.size());
    // Only the first `take` entries need to be in order.
    std::partial_sort(ranked.begin(),ranked.begin() + static_cast<std::ptrdiff_t>(take),ranked.end());

    neighbors.reserve(take);
    for(std::size_t i = 0;i < take;++i){
        neighbors.push_back(trainingSet[ranked[i].second]);
    }
    return neighbors;
}
/**
 * @brief Majority vote over the class labels of the selected neighbours.
 *
 * The label of each neighbour is read from index 4 and converted to int before
 * counting. When several labels share the highest count, the numerically
 * smallest one wins, because the counts are visited in ascending key order and
 * only a strictly larger count replaces the current winner.
 *
 * @param neighbors The selected neighbours; each must have an element at index 4.
 * @return The winning label, or 0 when neighbors is empty.
 */
double getResponse(vector<vector<double>> &neighbors){
    // Tally how often each (integer) label occurs.
    map<int,int> tally;
    for(const vector<double> &neighbor : neighbors){
        tally[neighbor[4]] += 1;
    }

    // Scan the tally for the label with the highest count.
    double winner = 0;
    int bestCount = 0;
    for(map<int,int>::const_iterator entry = tally.begin();entry != tally.end();++entry){
        if(entry->second <= bestCount){
            continue;
        }
        bestCount = entry->second;
        winner = entry->first;
    }
    return winner;
}
/**
 * @brief Percentage of test samples whose predicted label equals the true one.
 *
 * The true label of a sample is read from index 4 and compared for exact
 * equality with the prediction at the same position. Test samples that have no
 * matching entry in predictions count as wrong.
 *
 * @param testSet     Test samples; each must have an element at index 4.
 * @param predictions Predicted labels, position-aligned with testSet.
 * @return Accuracy in percent (0 to 100); 0 when testSet is empty.
 */
double getAccuracy(std::vector<std::vector<double>> &testSet,std::vector<double> &predictions){
    // An empty test set would otherwise evaluate 0 / 0.0 and return NaN.
    if(testSet.empty()){
        return 0.0;
    }

    // Compare only positions present in both containers.
    const std::size_t compared = std::min(testSet.size(),predictions.size());
    int correct = 0;
    for(std::size_t i = 0;i < compared;++i){
        if(testSet[i][4] == predictions[i]){
            correct++;
        }
    }
    return correct / static_cast<double>(testSet.size()) * 100.0;
}
/**
 * @brief Predict a class label for every sample of the test set.
 *
 * For each test sample the k nearest training samples are selected with
 * getNeighbors() and their majority label is obtained from getResponse().
 *
 * @param trainSet Training samples.
 * @param testSet  Samples to classify.
 * @param k        Number of neighbours that vote on each prediction. Defaults
 *                 to 3, the value that used to be fixed inside the function.
 * @return One predicted label per test sample, in test-set order.
 */
std::vector<double> pridict(std::vector<std::vector<double>> &trainSet,std::vector<std::vector<double>> &testSet,int k = 3){
    std::vector<double> predictions;
    predictions.reserve(testSet.size());
    for(std::size_t i = 0;i < testSet.size();++i){
        std::vector<std::vector<double>> neighbors = getNeighbors(trainSet,testSet[i],k);
        predictions.push_back(getResponse(neighbors));
    }
    return predictions;
}
int main(){
vector<vector<double>> trainSet;
vector<vector<double>> testSet;
double rate = 0.8;
string filepath = "./irisdata.txt";
loadDataset(filepath,rate,trainSet,testSet);
cout << "------------trainSet:--------------" << endl;
for(auto traindata : trainSet){
for(double num : traindata){
cout << num << " ";
}
cout << endl;
}
cout << "------------testSet:--------------" << endl;
for(auto testdata : testSet){
for(double num : testdata){
cout << num << " ";
}
cout << endl;
}
vector<double> predictions;
predictions = pridict(trainSet,testSet);
cout << "------------測試結果為::--------------" << endl;
for(int i = 0;i < testSet.size();i++){
cout << "測試資料" << i << ":";
for(int j = 0;j < 4;j++){
cout << testSet[i][j] << " ";
}
cout << "預測值:" << predictions[i] << " " << "真實值:" << testSet[i][4] << endl;
}
double accuracy = getAccuracy(testSet,predictions);
cout << "準确率為:" << accuracy << endl;
}