天天看點

C++實作KNN算法

C++實作KNN算法

/*
 * @Description: C++實作KNN算法
 * @Author: szq
 * @Github: https://github.com/MrQqqq
 * @Date: 2020-07-08 19:13:25
 * @LastEditors: szq
 * @LastEditTime: 2020-07-09 16:50:55
 * @FilePath: \cpp\src\KNN\KNN.cpp
 */ 

#include<time.h>

#include<algorithm>
#include<cmath>
#include<cstdlib>
#include<fstream>
#include<iostream>
#include<map>
#include<random>
#include<string>
#include<utility>
#include<vector>
using namespace std;

/**
 * @description: Split a string into tokens on a single-character delimiter.
 *
 * The delimiter itself is not part of any token. An empty token between two
 * adjacent delimiters is kept. A delimiter at the very end of the string does
 * not produce a trailing empty token, and an empty input yields no tokens.
 *
 * @param s: source string; it is only read, never modified
 * @param mode: delimiter character
 * @return: the tokens, in order of appearance
 */
vector<string> split(string &s,char mode){
    vector<string> res;
    string::size_type start = 0;
    while(start < s.size()){
        // find() returns npos (an unsigned value) when nothing is found, so
        // the result is kept in size_type instead of being squeezed into int.
        const string::size_type index = s.find(mode,start);
        if(index == string::npos){
            // No further delimiter: the remainder is the last token.
            res.push_back(s.substr(start));
            break;
        }
        // Copy [start, index) only, leaving the delimiter out of the token.
        res.push_back(s.substr(start,index - start));
        start = index + 1;
    }
    return res;
}

/**
 * @description: Read the remaining lines of a stream, split each non-empty
 *               line on ',' and append the resulting fields to `lines`.
 *
 * The loop is driven by the result of getline, so a stream that failed to
 * open or that hits a read error simply ends the loop. Testing eof() instead
 * would spin forever in those cases, and would also yield a bogus empty row
 * after a final newline. Lines of any length are accepted.
 *
 * @param in: input stream to read from
 * @param lines: receives one vector of fields per non-empty line (appended)
 * @return: number of rows appended to `lines` by this call
 */
int getFileRows(ifstream &in,vector<vector<string>> &lines){
    int rows = 0;
    std::string line;
    while(std::getline(in,line)){
        // The caller may open the file in binary mode, in which case CRLF
        // line endings leave a '\r' at the end of every line.
        if(!line.empty() && line.back() == '\r'){
            line.pop_back();
        }
        // Blank lines carry no data; skipping them keeps every stored row
        // non-empty.
        if(line.empty()){
            continue;
        }
        lines.push_back(split(line,','));
        rows++;
    }
    return rows;
}

/**
 * @description: Load a comma-separated data file and randomly distribute its
 *               rows between a training set and a test set.
 *
 * Each row is expected to hold at least five fields; the first five are
 * converted with atof and stored as one sample. Rows with fewer fields are
 * skipped. NOTE(review): atof yields 0 for non-numeric text, so the fifth
 * (label) column is presumably numeric in the data file - confirm.
 *
 * If the file cannot be opened a message is written to stderr and both sets
 * are left untouched.
 *
 * @param filepath: path of the data file to read
 * @param rate: probability (0..1) that a row is put into the training set
 * @param trainingSet: receives the training rows (appended)
 * @param testSet: receives the test rows (appended)
 * @return: nothing; results are stored in the output parameters
 */
void loadDataset(string &filepath,double &rate,vector<vector<double>> &trainingSet,vector<vector<double>> &testSet){
    const std::size_t columns = 5;//values stored per sample

    //Open the file the caller asked for, in binary mode as before.
    std::ifstream input(filepath,std::ios::in | std::ios::binary);
    if(!input.is_open()){
        std::cerr << "loadDataset: cannot open '" << filepath << "'" << std::endl;
        return;
    }

    std::vector<std::vector<std::string>> lines;//fields of every row
    getFileRows(input,lines);
    input.close();

    std::srand((unsigned int)time(NULL));//seed the generator used for the split
    for(std::size_t i = 0;i < lines.size();i++){
        const std::vector<std::string> &fields = lines[i];
        //A short row cannot be indexed safely, so it is ignored.
        if(fields.size() < columns){
            continue;
        }
        std::vector<double> sample(columns);
        for(std::size_t j = 0;j < columns;j++){
            sample[j] = std::atof(fields[j].c_str());
        }
        //Uniform draw in [0,1]: below `rate` goes to training, the rest to test.
        if(std::rand()/double(RAND_MAX) < rate){
            trainingSet.push_back(sample);
        }
        else{
            testSet.push_back(sample);
        }
    }
}

/**
 * @description: Euclidean distance between two instances, measured over
 *               their leading elements.
 * @param instance1: first instance
 * @param instance2: second instance
 * @param length: how many leading elements of each instance are compared
 * @return: square root of the sum of squared element differences
 */
double calculateDistance(vector<double> &instance1,vector<double> &instance2,int length){
    double squaredSum = 0;
    int feature = 0;
    while(feature < length){
        const double delta = instance1[feature] - instance2[feature];
        squaredSum += pow(delta,2);
        ++feature;
    }
    return sqrt(squaredSum);
}

/**
 * @description: Select the k training instances closest to a test instance.
 *
 * The distance is computed by calculateDistance over every element of the
 * test instance except the last one. Only (distance, index) pairs are
 * ordered, so no training row is copied until it is actually selected.
 * Equal distances are resolved in favour of the row that appears earlier in
 * the training set, which makes the result deterministic.
 *
 * @param trainingSet: training instances
 * @param testInstance: instance whose neighbours are wanted
 * @param k: number of neighbours requested
 * @return: up to k nearest training instances, closest first; fewer than k
 *          when the training set is smaller, empty when k <= 0
 */
vector<vector<double>> getNeighbors(vector<vector<double>> &trainingSet,vector<double> &testInstance,int k){
    vector<vector<double>> neighbors;
    if(k <= 0 || trainingSet.empty()){
        return neighbors;
    }

    //Number of features: every element but the last.
    const int len = static_cast<int>(testInstance.size()) - 1;

    //Distance of every usable training row, tagged with the row's index.
    std::vector<std::pair<double,std::size_t>> distances;
    distances.reserve(trainingSet.size());
    for(std::size_t i = 0;i < trainingSet.size();++i){
        //A row shorter than the feature count cannot be compared safely.
        if(len > 0 && trainingSet[i].size() < static_cast<std::size_t>(len)){
            continue;
        }
        distances.push_back(std::make_pair(calculateDistance(testInstance,trainingSet[i],len),i));
    }

    //Never ask for more neighbours than there are candidates.
    const std::size_t count = std::min(static_cast<std::size_t>(k),distances.size());

    //Only the first `count` positions need to be in order. pair's operator<
    //compares the distance first and the index second.
    std::partial_sort(distances.begin(),distances.begin() + count,distances.end());

    neighbors.reserve(count);
    for(std::size_t i = 0;i < count;++i){
        neighbors.push_back(trainingSet[distances[i].second]);
    }
    return neighbors;
}

/**
 * @description: Majority vote over the selected neighbours.
 *
 * The element at index 4 of each neighbour is truncated to int and used as
 * its class. The class with the most votes wins; on a tie the numerically
 * smallest class wins, because the tally is walked in ascending key order
 * and only a strictly larger count replaces the current winner.
 *
 * @param neighbors: the selected neighbours
 * @return: the winning class, or 0 when `neighbors` is empty
 */
double getResponse(vector<vector<double>> &neighbors){
    //class -> number of neighbours carrying that class
    map<int,int> tally;
    for(const vector<double> &neighbor : neighbors){
        const int label = static_cast<int>(neighbor[4]);
        ++tally[label];
    }

    double winner = 0;
    int best = 0;
    for(map<int,int>::const_iterator it = tally.begin();it != tally.end();++it){
        if(it->second > best){
            best = it->second;
            winner = it->first;
        }
    }
    return winner;
}

/**
 * @description: Percentage of test instances whose prediction matches the
 *               value stored at index 4 of the instance.
 *
 * An empty test set gives 0 instead of the NaN that 0/0 would produce.
 * Test instances without a corresponding prediction, or with fewer than
 * five elements, are counted as wrong rather than read out of bounds.
 *
 * @param testSet: test instances
 * @param predictions: predicted class per test instance, in the same order
 * @return: accuracy in percent, in the range 0..100
 */
double getAccuracy(vector<vector<double>> &testSet,vector<double> &predictions){
    if(testSet.empty()){
        return 0.0;
    }
    //Only positions present in both containers can be compared.
    const std::size_t comparable = std::min(testSet.size(),predictions.size());
    int correct = 0;
    for(std::size_t i = 0;i < comparable;++i){
        //Exact comparison, unchanged from before: the prediction is compared
        //bit-for-bit with the stored value.
        if(testSet[i].size() > 4 && testSet[i][4] == predictions[i]){
            ++correct;
        }
    }
    //The denominator stays the full test-set size.
    return correct / static_cast<double>(testSet.size()) * 100.0;
}

/**
 * @description: Predict a class for every instance of the test set.
 *
 * For each test instance the k nearest training instances are fetched with
 * getNeighbors and their vote is resolved with getResponse.
 *
 * @param trainSet: training instances
 * @param testSet: test instances
 * @param k: number of neighbours that vote; defaults to 3, the value that
 *           used to be hard-coded, so existing two-argument calls behave
 *           as before
 * @return: one predicted class per test instance, in test-set order
 */
vector<double> pridict(vector<vector<double>> &trainSet,vector<vector<double>> &testSet,int k = 3){
    vector<double> predictions;
    predictions.reserve(testSet.size());//exactly one prediction per instance
    for(std::size_t i = 0;i < testSet.size();++i){
        vector<vector<double>> neighbors = getNeighbors(trainSet,testSet[i],k);
        predictions.push_back(getResponse(neighbors));
    }
    return predictions;
}

// Entry point: loads ./irisdata.txt with a 0.8 training rate, prints both
// sets, predicts a class for every test instance and reports the accuracy.
int main(){
    vector<vector<double>> trainSet;
    vector<vector<double>> testSet;
    string filepath = "./irisdata.txt";
    double rate = 0.8;
    loadDataset(filepath,rate,trainSet,testSet);

    // Prints one instance per line, every value followed by a space.
    auto dumpRows = [](const vector<vector<double>> &rows){
        for(std::size_t r = 0;r < rows.size();++r){
            for(std::size_t c = 0;c < rows[r].size();++c){
                cout << rows[r][c] << " ";
            }
            cout << endl;
        }
    };

    cout << "------------trainSet:--------------" << endl;
    dumpRows(trainSet);

    cout << "------------testSet:--------------" << endl;
    dumpRows(testSet);

    vector<double> predictions = pridict(trainSet,testSet);
    cout << "------------測試結果為::--------------" << endl;
    for(std::size_t idx = 0;idx < testSet.size();++idx){
        const vector<double> &sample = testSet[idx];
        cout << "測試資料" << idx << ":";
        // The first four values are shown here; index 4 is printed below
        // next to the prediction.
        for(int col = 0;col < 4;++col){
            cout << sample[col] << " ";
        }
        cout << "預測值:" << predictions[idx] << " " << "真實值:" << sample[4] << endl;
    }

    const double accuracy = getAccuracy(testSet,predictions);
    cout << "準确率為:" << accuracy << endl;
    return 0;
}
           

繼續閱讀