概覽
本文給出了下面9種學習器的matlab代碼(代碼主要來自前人整理,外加自己編寫),應用的例程(附帶iris資料集),以及介紹算法原理優秀的連結。
- 支援向量機
- k-近鄰
- 線性判別分析
- 隨機森林
- RSS
- Bagging
- Boosting
- 邏輯回歸
- 樸素貝葉斯
程式包
連結:https://pan.baidu.com/s/1M_lzlJ5LqyGY96KTAYDakw
提取碼:oom7
應用例程
%% Machine learning - example script
%% Overview
%
%
% Purpose: build a prediction model from training data with a chosen learner.
%
% Input: training data (features + labels), test data (features + labels).
%
% Output: predicted labels.
%
% Choosing a learner: prefer SVM among stable learners, ELM among unstable ones. (You can also visualize the data and pick the classifier whose bias matches the observed distribution.)
%
%
% Note: in the examples below the test data is 'features + labels', and the outputs are the test accuracy and the predicted labels. In a real application you can set the test-sample labels to 0 and simply predict.
%% Data preparation
load iris.mat % load the data set: features first, label last; one row per sample, one column per feature
tr=iris(1:100,:); % first 100 rows of iris as the training set
te=iris(101:end,:); % remaining 50 rows as the test set
%% Learners
% 1. SVM: support vector machine
[A_SVM,SVM_label] = f_SVM(tr,te);
% requires libsvm to be installed
% theory: https://zhuanlan.zhihu.com/p/77750026
% 2. KNN: k-nearest neighbours
k = 3;
[A_kNN,label_knn] = f_knn(tr,te,k);
[A_kNN_1,label_knn_1,Probability] = knn_ave(tr,te,k); % modified kNN (mean-distance rule)
% theory: https://zhuanlan.zhihu.com/p/61341071
% 3. LDA: linear discriminant analysis
[A_LDA,label_LDA] = f_LDA(tr,te);
% theory: https://zhuanlan.zhihu.com/p/32658341
% 4. RF: random forest
NTrees = 5;
[A_RF,label_RF]=f_RF(tr,te,NTrees);
% theory: https://blog.csdn.net/qq_34106574/article/details/82016442
% 5. RSS: random subspace method
[A_RSS,label_RSS]=f_RSS(tr,te);
% 6. Bagging
% ensemble via bootstrap resampling of the training samples
NTrees = 5;
[A_Bagging,label_Bagging]=f_Bagging(tr,te,NTrees);
% theory: https://www.cnblogs.com/cgmcoding/p/13567288.html
% 7. Boosting
% ensemble via adaptive reweighting of training samples (AdaBoost);
% NOTE(review): the source comment said "feature perturbation", which describes RSS rather than boosting
NTrees = 5;
[A_Boosting,label_Boosting]=f_Boosting(tr,te,NTrees);
% theory: https://www.cnblogs.com/lyfruit/articles/3011429.html
% 8. LR: logistic regression
[A_LR,label_LR]=f_LR(tr,te);
% theory: https://zhuanlan.zhihu.com/p/74874291
% 9. NB: naive Bayes
[A_NB,label_NB] = f_NB(tr,te);
% theory: https://zhuanlan.zhihu.com/p/26262151
子程式
function [A,svmlabel]=f_SVM(tr,te)
% f_SVM  Train and evaluate a linear SVM via libsvm.
%   tr, te   : training/test sets, one sample per row, label in last column
%   A        : test accuracy in [0,1]
%   svmlabel : predicted labels for te
% Requires libsvm's svmtrain/svmpredict on the MATLAB path.
trainX = tr(:,1:end-1);          % training features
trainY = tr(:,end);              % training labels
testX  = te(:,1:end-1);          % test features
testY  = te(:,end);              % test labels
model = svmtrain(trainY, trainX, ' -c 1 -g 0.07 -t 0 -q');
[svmlabel, acc, ~] = svmpredict(testY, testX, model);
A = acc(1)/100;                  % svmpredict reports accuracy in percent
end
%% svmtrain 參數介紹
% -s svm類型:SVM設定類型(預設0)
% 0 — C-SVC; 1 — v-SVC; 2 — 一類SVM; 3 — e-SVR; 4 — v-SVR
% -t 核函數類型:核函數設定類型(預設2)
% 0 — 線性核函數:u'v
% 1 — 多項式核函數:(r*u'v + coef0)^degree
% 2 — RBF(徑向基)核函數:exp(-r*|u-v|^2)
% 3 — sigmoid核函數:tanh(r*u'v + coef0)
% -d degree:核函數中的degree設定(針對多項式核函數)(預設3)
% -g r(gamma):核函數中的gamma設定(針對多項式/rbf/sigmoid核函數)(預設1/k,k為特徵數)
% -r coef0:核函數中的coef0設定(針對多項式/sigmoid核函數)(預設0)
% -c cost:設定C-SVC、e-SVR和v-SVR的參數(損失函數)(預設1)
% -n nu:設定v-SVC、一類SVM和v-SVR的參數(預設0.5)
% -p p:設定e-SVR中損失函數p的值(預設0.1)
% -m cachesize:設定cache記憶體大小,以MB為單位(預設40)
% -e eps:設定允許的終止判據(預設0.001)
% -h shrinking:是否使用啟發式,0或1(預設1)
% -wi weight:設定第i類的參數C為weight*C(C-SVC中的C)(預設1)
% -v n:n-fold交叉驗證模式,n為fold的個數,必須大於等於2
function [A,knnlabel]=f_knn(tr,te,k)
% f_knn  k-nearest-neighbour classifier (Euclidean distance, majority vote).
%   tr : training data, one sample per row, label in the last column
%   te : test data, same layout
%   k  : neighbour count (default 3)
%   A        : test accuracy in [0,1]
%   knnlabel : predicted label for each test sample (column vector)
if ~exist('k', 'var')
    k = 3;                        % default neighbour count when k is omitted
end
n  = size(tr,2);                  % columns: n-1 features + 1 label
m2 = size(te,1);                  % number of test samples
trd = tr(:,1:n-1);  trl = tr(:,n);
ted = te(:,1:n-1);  tel = te(:,n);
knnlabel = zeros(m2,1);
for j = 1:m2
    % Euclidean distance from test sample j to every training sample
    % (vectorized; replaces the original per-row norm() loop)
    d = sqrt(sum(bsxfun(@minus, trd, ted(j,:)).^2, 2));
    [~, idx] = sort(d);
    % majority label among the k nearest; mode breaks ties by the smallest
    % label value, matching the original behaviour
    knnlabel(j,1) = mode(trl(idx(1:k)));
end
A = nnz(knnlabel == tel)/m2;      % fraction of correct predictions
end
function [A_1,knnlabel_1,Probability]=knn_ave(tr,te,k)
% knn_ave  Modified kNN: for each class take its k nearest training samples
% and predict the class with the smallest mean distance.
%   tr, te : data sets, one sample per row, label in the last column
%   k      : neighbour count per class (default 3)
%   A_1        : test accuracy in [0,1]
%   knnlabel_1 : predicted labels (column vector)
%   Probability: per-class scores (inverse mean distance, scaled so the
%                maximum is 1) with the predicted label appended as the
%                last column
% NOTE(review): assumes class labels are the integers 1..C — confirm with callers.
% Fixes vs. original: adds the missing terminating `end` (the other
% functions in this file use function...end, so omitting it is a parse
% error), and guards against a class having fewer than k training samples.
if ~exist('k', 'var')
    k = 3;                        % default neighbour count when k is omitted
end
n  = size(tr,2);                  % columns: n-1 features + 1 label
m2 = size(te,1);                  % number of test samples
trd = tr(:,1:n-1);  trl = tr(:,n);
ted = te(:,1:n-1);  tel = te(:,n);
num_label   = numel(unique(trl)); % number of classes
probability = zeros(m2, num_label);
knnlabel_1  = zeros(m2,1);
for j = 1:m2
    % Euclidean distance from test sample j to every training sample
    d = sqrt(sum(bsxfun(@minus, trd, ted(j,:)).^2, 2));
    [ds, idx] = sort(d);
    lbl = trl(idx);               % training labels ordered by increasing distance
    di = zeros(num_label,2);      % [mean distance, class] per class
    for w = 1:num_label
        pos = find(lbl == w);
        kk  = min(k, numel(pos)); % guard: class may have fewer than k samples
        di(w,1) = sum(ds(pos(1:kk)))/kk;  % mean distance of the kk nearest of class w
        di(w,2) = w;
    end
    % score: total distance divided by each class distance, scaled to max 1
    c = sum(di(:,1))./di(:,1)';
    probability(j,:) = c/max(c,[],2);
    b = sortrows(di,1);           % ascending by mean distance
    knnlabel_1(j,1) = b(1,2);     % class with the smallest mean distance wins
end
Probability = [probability, knnlabel_1];
A_1 = nnz(knnlabel_1 == tel)/m2;  % fraction of correct predictions
end
function [A,predict_label]=f_LDA(tr,te)
% f_LDA  Linear discriminant analysis classifier.
%   tr, te : data sets, one sample per row, label in the last column
%   A             : test accuracy in [0,1]
%   predict_label : predicted labels for te
n  = size(tr,2);                  % columns: n-1 features + 1 label
m2 = size(te,1);                  % number of test samples
trd = tr(:,1:n-1);  trl = tr(:,n);
ted = te(:,1:n-1);  tel = te(:,n);
% fitcdiscr is the current API; ClassificationDiscriminant.fit (used by the
% original) is deprecated and fitcdiscr is its documented replacement
obj = fitcdiscr(trd, trl);
predict_label = predict(obj, ted);
A = nnz(predict_label == tel)/m2; % fraction of correct predictions
end
function [A,Predict_label]=f_RF(tr,te,NTrees)
% f_RF  Random-forest classifier (TreeBagger).
%   tr, te : data sets, one sample per row, label in the last column
%   NTrees : number of trees in the forest (default 50)
%   A             : test accuracy in [0,1]
%   Predict_label : predicted labels, converted back to numeric
% Fixes vs. original: preallocates the label vector (was grown in a loop),
% uses str2double instead of str2num (no eval), removes the stray ';' after
% the if condition, and drops the unused Scores output.
if ~exist('NTrees', 'var')
    NTrees = 50;                  % default forest size when NTrees is omitted
end
n  = size(tr,2);                  % columns: n-1 features + 1 label
m2 = size(te,1);                  % number of test samples
trd = tr(:,1:n-1);  trl = tr(:,n);
ted = te(:,1:n-1);  tel = te(:,n);
Factor = TreeBagger(NTrees, trd, trl);
cellLabels = predict(Factor, ted);              % TreeBagger returns labels as strings
Predict_label = cellfun(@str2double, cellLabels); % back to a numeric column
A = nnz(Predict_label == tel)/m2; % fraction of correct predictions
end
function [A,predict_label]=f_RSS(tr,te)
% f_RSS  Random-subspace ensemble of discriminant classifiers.
%   tr, te : data sets, one sample per row, label in the last column
%   A             : test accuracy in [0,1]
%   predict_label : predicted labels for te
lastCol  = size(tr,2);            % index of the label column
trainX = tr(:,1:lastCol-1);  trainY = tr(:,lastCol);
testX  = te(:,1:lastCol-1);  testY  = te(:,lastCol);
% train one discriminant learner per predictor combination
ens = fitensemble(trainX,trainY,'Subspace' ,'AllPredictorCombinations','Discriminant','type','classification');
predict_label = predict(ens, testX);
A = nnz(predict_label == testY)/size(te,1);  % fraction of correct predictions
end
function [A,predict_label]=f_Bagging(tr,te,NTrees)
% f_Bagging  Bagged decision-tree ensemble classifier.
%   tr, te : data sets, one sample per row, label in the last column
%   NTrees : number of bagged trees (default 50)
%   A             : test accuracy in [0,1]
%   predict_label : predicted labels for te
if ~exist('NTrees', 'var')
    NTrees = 50;                  % default ensemble size when NTrees is omitted
end
lastCol = size(tr,2);             % index of the label column
trainX = tr(:,1:lastCol-1);  trainY = tr(:,lastCol);
testX  = te(:,1:lastCol-1);  testY  = te(:,lastCol);
ens = fitensemble(trainX,trainY,'Bag' ,NTrees,'tree','type','classification');
predict_label = predict(ens, testX);
A = nnz(predict_label == testY)/size(te,1);  % fraction of correct predictions
end
function [A,predict_label]=f_Boosting(tr,te,NTrees)
% f_Boosting  AdaBoost ensemble of decision trees.
%   tr, te : data sets, one sample per row, label in the last column
%   NTrees : number of boosting rounds (default 50)
%   A             : test accuracy in [0,1]
%   predict_label : predicted labels for te
if ~exist('NTrees', 'var')
    NTrees = 50;                  % default ensemble size when NTrees is omitted
end
lastCol = size(tr,2);             % index of the label column
trainX = tr(:,1:lastCol-1);  trainY = tr(:,lastCol);
testX  = te(:,1:lastCol-1);  testY  = te(:,lastCol);
% pick the AdaBoost variant by how many classes appear in train + test
classCount = numel(unique([tr(:,end); te(:,end)]));
if classCount == 2
    str = 'AdaBoostM1';           % binary boosting
else
    str = 'AdaBoostM2';           % multiclass boosting
end
ens = fitensemble(trainX,trainY,str ,NTrees,'tree','type','classification');
predict_label = predict(ens, testX);
A = nnz(predict_label == testY)/size(te,1);  % fraction of correct predictions
end
function [A,Predict_label]=f_LR(tr,te)
% f_LR  Multinomial logistic regression classifier (mnrfit/mnrval).
%   tr, te : data sets, one sample per row, label in the last column
%   A             : test accuracy in [0,1]
%   Predict_label : predicted labels (column vector)
% NOTE(review): assumes labels are the integers 1..C (dummyvar requires
% positive integers) — confirm with callers.
% Fixes vs. original: vectorized argmax replaces the growing-array loop
% (max returns the first index on ties, matching the loop's behaviour);
% unused mnrfit outputs dropped; stray ';' in the for header removed.
n  = size(tr,2);                  % columns: n-1 features + 1 label
m2 = size(te,1);                  % number of test samples
trd = tr(:,1:n-1);  trl = tr(:,n);
ted = te(:,1:n-1);  tel = te(:,n);
tr_l = dummyvar(trl);             % one-hot response matrix for mnrfit
B1 = mnrfit(trd, tr_l);
pihat1 = mnrval(B1, ted);         % class posterior probabilities per test row
[~, Predict_label] = max(pihat1, [], 2);  % most probable class per row
A = nnz(Predict_label == tel)/m2; % fraction of correct predictions
end
function [accuracy,nblabel] = f_NB(Train, Test)
% f_NB  Gaussian naive Bayes classifier.
%   Train, Test : data sets, one sample per row, label in the last column
%   accuracy : test accuracy in [0,1]
%   nblabel  : predicted labels (column vector)
% NOTE(review): assumes class labels are the integers 1..C (labels are used
% directly as cell indices) — confirm with callers.
% Fixes vs. original: nblabel is preallocated (was grown in a loop) and the
% zero-deviation patch is hoisted out of the scoring loop (idempotent, so
% behaviour is unchanged).
Train_sample = Train(:,1:end-1);
Train_label  = Train(:,end);
Test_sample  = Test(:,1:end-1);
Test_label   = Test(:,end);
Class_num   = length(unique(Train_label));
Feature_num = size(Train_sample,2);
Para_mean = cell(1,Class_num);    % per-class feature means
Para_dev  = cell(1,Class_num);    % per-class feature standard deviations
Sample_byclass = cell(1,Class_num); % training rows grouped by class
Prior_prob = zeros(1,Class_num);  % class priors (relative frequencies)
% group samples by class and count them for the priors
for i = 1:size(Train_sample,1)
    c = Train_label(i,1);
    Sample_byclass{1,c} = [Sample_byclass{1,c}; Train_sample(i,:)];
    Prior_prob(1,c) = Prior_prob(1,c) + 1;
end
Prior_prob = Prior_prob/size(Train_sample,1);
% per-class Gaussian parameters
for i = 1:Class_num
    Para_mean{1,i} = mean(Sample_byclass{1,i});
    Para_dev{1,i}  = std(Sample_byclass{1,i});
    % a zero deviation would divide by zero in the log-likelihood below;
    % substitute the same small constant the original used
    Para_dev{1,i}(Para_dev{1,i} == 0) = 0.1667;
end
nblabel = zeros(size(Test_sample,1),1);
for i = 1:size(Test_sample,1)
    % log-posterior (up to an additive constant) under independent Gaussians
    prob = log(Prior_prob);
    for j = 1:Class_num
        for k = 1:Feature_num
            prob(1,j) = prob(1,j) - (Test_sample(i,k)-Para_mean{1,j}(1,k))^2/(2*Para_dev{1,j}(1,k)^2) - log(Para_dev{1,j}(1,k));
        end
    end
    [~,index] = max(prob);        % most probable class wins
    nblabel(i,1) = index;
end
accuracy = nnz(nblabel == Test_label)/length(Test_label);
end