
MATLAB Code Collection for Machine Learning: Non-Neural-Network Learners

Overview

This post collects MATLAB code for the nine learners below (mostly gathered from earlier work, plus some written by myself), an example routine (with the iris dataset included), and links to good introductions to each algorithm's theory.

  1. Support vector machine (SVM)
  2. k-nearest neighbors (kNN)
  3. Linear discriminant analysis (LDA)
  4. Random forest (RF)
  5. Random subspace method (RSS)
  6. Bagging
  7. Boosting
  8. Logistic regression (LR)
  9. Naive Bayes (NB)

Code Package

Link: https://pan.baidu.com/s/1M_lzlJ5LqyGY96KTAYDakw

Extraction code: oom7

Example Routine

%% Machine learning - example routine
%% Brief introduction
% 
% 
% Purpose: build a prediction model from training data using a learner.
% 
% Input: training data (features + labels), test data (features + labels).
% 
% Output: predicted labels.
% 
% Choosing a learner: SVM is the usual first choice among stable learners, ELM among unstable ones. (You can also visualize the data and pick a classifier that suits its distribution.)
% 
% 
% Note: in the routine below, the test data is 'features + labels', and the outputs are the test accuracy and the predicted labels. In practice, you can simply set the test labels to 0 and predict.
%% Data preparation

    load iris.mat       % load the dataset: features first, label in the last column; each row is a sample, each column a feature
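    % If the rows of iris.mat are ordered by class (as in the standard iris
    % data), a fixed front/back split would leave classes out of the training
    % set. A minimal sketch of a random shuffle before splitting (assumption:
    % shuffling is acceptable for your experiment):
    rng(1);                                  % fix the seed for reproducibility
    iris = iris(randperm(size(iris,1)),:);   % shuffle the sample rows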
    tr=iris(1:100,:);   % first 100 rows of iris as training samples
    te=iris(101:end,:); % remaining 50 rows as test samples
%% Learners
% 1. SVM: support vector machine

    [A_SVM,SVM_label] = f_SVM(tr,te);
    % This routine requires libsvm
    % Theory: https://zhuanlan.zhihu.com/p/77750026

% 2. kNN: k-nearest neighbors

    k = 3;
    [A_kNN,label_knn] = f_knn(tr,te,k);
    [A_kNN_1,label_knn_1,Probability] = knn_ave(tr,te,k); % improved kNN (average-distance variant)
    % Theory: https://zhuanlan.zhihu.com/p/61341071
    
% 3. LDA: linear discriminant analysis

    [A_LDA,label_LDA] = f_LDA(tr,te);
    % Theory: https://zhuanlan.zhihu.com/p/32658341

% 4. RF: random forest

    NTrees = 5;
    [A_RF,label_RF]=f_RF(tr,te,NTrees);
    % Theory: https://blog.csdn.net/qq_34106574/article/details/82016442
    
% 5. RSS: random subspace method

    [A_RSS,label_RSS]=f_RSS(tr,te);

% 6. Bagging
    % Ensemble via sample perturbation (bootstrap resampling)

    NTrees = 5;
    [A_Bagging,label_Bagging]=f_Bagging(tr,te,NTrees);
    % Theory: https://www.cnblogs.com/cgmcoding/p/13567288.html

% 7. Boosting
    % Ensemble via reweighting of training samples

    NTrees = 5;
    [A_Boosting,label_Boosting]=f_Boosting(tr,te,NTrees);
    % Theory: https://www.cnblogs.com/lyfruit/articles/3011429.html
    
% 8. LR: logistic regression

    [A_LR,label_LR]=f_LR(tr,te);
    % Theory: https://zhuanlan.zhihu.com/p/74874291

% 9. NB: naive Bayes

    [A_NB,label_NB] = f_NB(tr,te);
    % Theory: https://zhuanlan.zhihu.com/p/26262151
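
    % A minimal sketch (assumption: all of the calls above have run) that
    % gathers the test accuracies into one table for side-by-side comparison:
    names = {'SVM';'kNN';'LDA';'RF';'RSS';'Bagging';'Boosting';'LR';'NB'};
    acc = [A_SVM;A_kNN;A_LDA;A_RF;A_RSS;A_Bagging;A_Boosting;A_LR;A_NB];
    disp(table(names,acc,'VariableNames',{'Learner','Accuracy'}))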
           

Subroutines

function [A,svmlabel]=f_SVM(tr,te)

Trd = tr(:,1:end-1);
Trl = tr(:,end);    % training samples and labels
Ted = te(:,1:end-1);
Tel = te(:,end);    % test samples and labels
model = svmtrain(Trl, Trd, ' -c 1 -g 0.07 -t 0 -q');
[svmlabel, TestingAccuracy,~] = svmpredict(Tel, Ted, model);
A=TestingAccuracy(1)/100;   % libsvm reports accuracy as a percentage
end
%% svmtrain parameters
% -s svm_type: type of SVM (default 0)
%     0 -- C-SVC; 1 -- nu-SVC; 2 -- one-class SVM; 3 -- epsilon-SVR; 4 -- nu-SVR
% -t kernel_type: type of kernel function (default 2)
%     0 -- linear: u'*v
%     1 -- polynomial: (gamma*u'*v + coef0)^degree
%     2 -- RBF (radial basis): exp(-gamma*|u-v|^2)
%     3 -- sigmoid: tanh(gamma*u'*v + coef0)
% -d degree: degree in the kernel function (polynomial kernel) (default 3)
% -g gamma: gamma in the kernel function (polynomial/RBF/sigmoid kernels) (default 1/num_features)
% -r coef0: coef0 in the kernel function (polynomial/sigmoid kernels) (default 0)
% -c cost: parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)
% -n nu: parameter nu of nu-SVC, one-class SVM, and nu-SVR (default 0.5)
% -p epsilon: epsilon in the loss function of epsilon-SVR (default 0.1)
% -m cachesize: cache memory size in MB (default 100 in recent libsvm versions)
% -e epsilon: tolerance of the termination criterion (default 0.001)
% -h shrinking: whether to use the shrinking heuristics, 0 or 1 (default 1)
% -wi weight: set the parameter C of class i to weight*C (for C-SVC) (default 1)
% -v n: n-fold cross-validation mode; n must be >= 2
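
With the -v option, svmtrain returns the cross-validation accuracy instead of a model, which makes a simple grid search over C and gamma easy to write. A minimal sketch (assumption: libsvm's MATLAB interface is on the path and tr is the training matrix from the example routine):

    Trd = tr(:,1:end-1); Trl = tr(:,end);
    best = 0;
    for c = 2.^(-1:2:5)                       % candidate values for -c
        for g = 2.^(-7:2:1)                   % candidate values for -g
            opt = sprintf('-t 2 -c %g -g %g -v 5 -q', c, g);
            acc = svmtrain(Trl, Trd, opt);    % returns 5-fold CV accuracy
            if acc > best
                best = acc; bestc = c; bestg = g;
            end
        end
    end
    fprintf('Best CV accuracy %.2f%% at c=%g, g=%g\n', best, bestc, bestg);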

           
function [A,knnlabel]=f_knn(tr,te,k)

%knn:        find the k training samples nearest to each test sample; the most
%            frequent label among those k samples is the predicted label.
%Input:      tr: training data (label in the last column)
%            te: test data (label in the last column)
%            k: number of neighbors 

%Output:     A: test accuracy
%            knnlabel: predicted labels


if ~exist('k', 'var')
     k = 3;
end               % default to k = 3 if k is not supplied
    
data=[tr;te];
n=size(data,2);
m1=size(tr,1);
m2=size(te,1);    % m1: number of training samples, m2: number of test samples

trd=tr(:,1:n-1);
trl=tr(:,n);
ted=te(:,1:n-1);
tel=te(:,n);      % *d holds the features, *l the labels

knnlabel=zeros(m2,1);
for j=1:m2
    distance=zeros(m1,1);
    for i=1:m1
        distance(i)=norm(ted(j,:)-trd(i,:));    % Euclidean distance to every training sample
    end

    [~,index]=sort(distance); 
    label=trl(index(1:k));      % labels of the k nearest training samples
    knnlabel(j,1)=mode(label);  % the most frequent label is the prediction
end

bj=(knnlabel==tel);
a=nnz(bj);
A=a/m2; % test accuracy
end
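
The double loop over test and training samples can be replaced by one vectorized distance computation. A minimal sketch (assumption: pdist2 from the Statistics and Machine Learning Toolbox is available):

    D = pdist2(ted, trd);                 % m2-by-m1 Euclidean distance matrix
    [~, idx] = sort(D, 2);                % sort neighbors per test sample
    knnlabel = mode(trl(idx(:,1:k)), 2);  % majority label among the k nearest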
           
function [A_1,knnlabel_1,Probability]=knn_ave(tr,te,k)
%knn_ave:    for each test sample, find the k nearest training samples of each
%            class; the class with the smallest average distance is predicted.
%Input:      tr: training data (label in the last column)
%            te: test data (label in the last column)
%            k: number of neighbors 

%Output:     A_1: test accuracy
%            knnlabel_1: predicted labels
%            Probability: per-class pseudo-probabilities plus the predicted label
if ~exist('k', 'var')
     k = 3;
end               % default to k = 3 if k is not supplied


    
data=[tr;te];
n=size(data,2);
m1=size(tr,1);
m2=size(te,1);    % m1: number of training samples, m2: number of test samples

trd=tr(:,1:n-1);
trl=tr(:,n);
ted=te(:,1:n-1);
tel=te(:,n);      % *d holds the features, *l the labels
num_label = size(unique(trl),1);

probability=zeros(size(te,1),num_label);

knnlabel_1=zeros(m2,1);
for j=1:m2
    distance=zeros(m1,1);
    for i=1:m1
        distance(i)=norm(ted(j,:)-trd(i,:));    % Euclidean distance to every training sample
    end

 [distance1,index]=sort(distance); 
 x1=trl(index);
 distance1(:,2)=x1;      % column 1 of distance1 is the distance, column 2 the label
 di=zeros(num_label,2);
for w=1:num_label
    x2=find(distance1(:,2)==w);
    x2=x2(1:min(k,numel(x2)));   % guard against classes with fewer than k samples
    dis=mean(distance1(x2,1));
    di(w,1)=dis;
    di(w,2)=w;
end                      % for each class, average the distances of its k nearest samples

c=sum(di(:,1))./di(:,1)';
c=c/max(c,[],2);
probability(j,:)=c;      % pseudo-probability: total average distance over each class's distance, scaled by the maximum

b=sortrows(di,1);
knnlabel_1(j,1)=b(1,2);  % the class with the smallest average distance is the prediction
end
Probability=[probability,knnlabel_1];
bj=(knnlabel_1==tel);
a=nnz(bj);
A_1=a/m2;                % test accuracy
end

function [A,predict_label]=f_LDA(tr,te)

%Input:      tr: training set
%            te: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%Output:     A: test accuracy
%            predict_label: predicted labels from LDA for the test data

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

obj = ClassificationDiscriminant.fit(trd, trl);  
predict_label   =       predict(obj, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy

end
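
ClassificationDiscriminant.fit is the older form of this API; recent MATLAB releases expose the same model through fitcdiscr. A minimal sketch (assumption: R2014a or later):

    obj = fitcdiscr(trd, trl);          % same discriminant model, current API
    predict_label = predict(obj, ted);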
           
function [A,Predict_label]=f_RF(tr,te,NTrees)

%Input:      tr: training set
%            te: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%            NTrees: the number of decision trees
%Output:     A: test accuracy
%            Predict_label: predicted labels from the random forest for the test data
    if ~exist('NTrees', 'var')
        NTrees = 50;
    end

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

Factor = TreeBagger(NTrees, trd, trl);
[Predict_label,Scores] = predict(Factor, ted);
a=0;
ssr=zeros(m2,1);                         % numeric copy of the predicted labels
for i=1:m2
    cla=str2double(Predict_label{i,1});  % TreeBagger returns labels as strings
    ssr(i,1)=cla;
    if cla==tel(i)
        a=a+1;
    end
end
A=a/m2; % test accuracy
Predict_label=ssr;
end
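
TreeBagger can also estimate generalization error without a held-out set, using the out-of-bag samples of each bootstrap draw. A minimal sketch (assumption: a plot of the error curve is wanted):

    Factor = TreeBagger(NTrees, trd, trl, 'OOBPrediction', 'on');
    oobErr = oobError(Factor);    % OOB misclassification rate vs. number of trees
    plot(oobErr); xlabel('Number of grown trees'); ylabel('OOB error');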
           
function [A,predict_label]=f_RSS(tr,te)

% Random subspace method (RSS) classification
%Input:      tr: training set
%            te: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%Output:     A: test accuracy
%            predict_label: predicted labels from the subspace ensemble for the test data

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

ens = fitensemble(trd,trl,'Subspace' ,'AllPredictorCombinations','Discriminant','type','classification');  
predict_label   =       predict(ens, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy

end 
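
'AllPredictorCombinations' trains one discriminant learner for every possible subset of predictors, which grows combinatorially with the feature count. A minimal sketch of the cheaper form with a fixed number of random subspaces (assumption: 50 subspaces of half the features is acceptable):

    npred = max(1, floor((n-1)/2));     % subspace dimension: half the features
    ens = fitensemble(trd, trl, 'Subspace', 50, 'Discriminant', ...
                      'NPredToSample', npred, 'type', 'classification');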
           
function [A,predict_label]=f_Bagging(tr,te,NTrees)

%Input:      tr: training set
%            te: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%            NTrees: the number of decision trees
%Output:     A: test accuracy
%            predict_label: predicted labels from bagging for the test data

    if ~exist('NTrees', 'var')
        NTrees = 50;
    end


data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

ens = fitensemble(trd,trl,'Bag' ,NTrees,'tree','type','classification');  
predict_label   =       predict(ens, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy

end
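
fitensemble is the older ensemble interface; recent releases provide fitcensemble for classification. A minimal sketch (assumption: R2016b or later):

    ens = fitcensemble(trd, trl, 'Method', 'Bag', 'NumLearningCycles', NTrees);
    predict_label = predict(ens, ted);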
           
function [A,predict_label]=f_Boosting(tr,te,NTrees)

%Input:      tr: training set
%            te: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%            NTrees: the number of decision trees
%Output:     A: test accuracy
%            predict_label: predicted labels from boosting for the test data

    if ~exist('NTrees', 'var')
        NTrees = 50;
    end

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);
ted=te(:,1:n-1);tel=te(:,n);

L=unique(data(:,end));      % distinct class labels
ls=length(L(:));            % number of classes
if ls==2
    str='AdaBoostM1';       % binary classification
else
    str='AdaBoostM2';       % multiclass classification
end

ens = fitensemble(trd,trl,str ,NTrees,'tree','type','classification');  
predict_label   =       predict(ens, ted);  

bj=(predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy

end
           
function [A,Predict_label]=f_LR(tr,te)

%Input:      tr: training set
%            te: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%Output:     A: test accuracy
%            Predict_label: predicted labels from logistic regression for the test data

data=[tr;te];
n=size(data,2);
m1=size(tr,1);m2=size(te,1);

trd=tr(:,1:n-1);trl=tr(:,n);    tr_l=dummyvar(trl);   % one-hot (indicator) coding of the labels
ted=te(:,1:n-1);tel=te(:,n);

B1 = mnrfit(trd,tr_l);                 % multinomial logistic regression coefficients
pihat1 = mnrval(B1,ted);               % class probabilities for the test samples
[~,Predict_label] = max(pihat1,[],2);  % pick the most probable class per sample

bj=(Predict_label==tel);a=nnz(bj);
A=a/m2; % test accuracy

end
           
function [accuracy,nblabel] = f_NB(Train, Test)

%Input:      Train: training set
%            Test: test set
%            Note that: each row is an instance; the last column is the label, starting from 1
%Output:     accuracy: test accuracy
%            nblabel: predicted labels from naive Bayes for the test data

Train_sample = Train(:,1:end-1);
Train_label = Train(:,end);
Test_sample = Test(:,1:end-1);
Test_label = Test(:,end);
Class_num = length(unique(Train_label));
Feature_num = size(Train_sample,2);
Para_mean =   cell(1,Class_num);%Mean for each feature and class
Para_dev = cell(1,Class_num);%Dev for each feature and class
Sample_byclass = cell(1,Class_num);%Reorder the data set by class
Prior_prob = zeros(1,Class_num);%Prior probability of each class
for i=1:1:size(Train_sample,1)
    Sample_byclass{1,Train_label(i,1)} = [Sample_byclass{1,Train_label(i,1)}; Train_sample(i,:)];
    Prior_prob(1,Train_label(i,1)) = Prior_prob(1,Train_label(i,1)) + 1;
end
Prior_prob = Prior_prob/size(Train_sample,1);
for i=1:1:Class_num
     miu = mean(Sample_byclass{1,i});
     delta = std(Sample_byclass{1,i});   
     Para_mean{1,i} = miu;
     Para_dev{1,i} = delta;
end
nblabel = [];
for i=1:1:size(Test_sample,1)
     prob = log(Prior_prob);                    % start from the log prior of each class
     for j=1:1:Class_num
         for k=1:1:Feature_num
             if Para_dev{1,j}(1,k) == 0
                 Para_dev{1,j}(1,k) = 0.1667;   % floor zero std to avoid division by zero
             end
             % add log N(x | mu, sigma) up to an additive constant
             prob(1,j) = prob(1,j) - (Test_sample(i,k)-Para_mean{1,j}(1,k))^2/(2*Para_dev{1,j}(1,k)^2) - log(Para_dev{1,j}(1,k));
         end
     end
     [value,index] = max(prob);
     nblabel = [nblabel ; index];
end
accuracy = length(find(nblabel - Test_label ==0))/length(Test_label);
end
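
The hand-rolled Gaussian model above matches what MATLAB's built-in naive Bayes classifier fits by default. A minimal sketch (assumption: fitcnb from the Statistics and Machine Learning Toolbox, R2014b or later):

    Mdl = fitcnb(Train(:,1:end-1), Train(:,end));  % Gaussian NB per feature and class
    nblabel = predict(Mdl, Test(:,1:end-1));
    accuracy = mean(nblabel == Test(:,end));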
           
