K-means clustering and K-nearest neighbourhood classifier

K-means clustering 是一個十分簡單而又實用的聚類算法，其是一種無監督聚類學習，隻需告訴分類器一共有幾類，即可實作分類。K-nearest neighbourhood是一個有監督分類算法。

K-means clustering 算法思想：

随機給定k個觀測作為初始種子點， m(1)1,m(1)2,...,m(1)k ，

配置設定步：

for i = 1：K

S(t)i={xp:||xp−m(t)i||2≤||xp−m(t)j||2,∀j,1≤j≤k} ,

其中 xp 是第p個要聚類的點，将其分給2範數最小的i類。

更新步：

for i = 1 : K

m(t+1)i=1|S(t)i|∑xj∈S(t)ixj

m(t+1)i 是第t次聚類後，i類用于計算t+1次聚類的中心。

舉個例子：

K-means clustering and K-nearest neighbourhood classifier

圖一：選擇初始點

K-means clustering and K-nearest neighbourhood classifier

圖二：根據初始點産生一組聚類

K-means clustering and K-nearest neighbourhood classifier

圖三重新計算聚類後的中心，作為種子點。

K-means clustering and K-nearest neighbourhood classifier

疊代2和3，産生最終的聚類

K-means 算法的K需要事先确定，而且初始的種子點随機選擇。初始種子點的選擇可采用K++算法改善。

1、從輸入的資料點集合中随機選擇一個點作為第一個聚類中心

2、對于資料集中的每一個點x，計算它與最近聚類中心(指已選擇的聚類中心)的距離D(x)

3、選擇一個新的資料點作為新的聚類中心，選擇的原則是：D(x)較大的點，被選取作為聚類中心的機率較大

4、重複2和3直到k個聚類中心被選出來

5、利用這k個初始的聚類中心來運作标準的k-means算法

K-nearest neighbourhood classifier

K近鄰分類，指的是對于某個點，找出其最近的k個點，這k個點進行投票，選取相同點數最多的類别作為該點的label。

如果K=3，那麼離綠色點最近的有2個紅色三角形和1個藍色的正方形，這3個點投票，于是綠色的這個待分類點屬于紅色的三角形

如果K=5，那麼離綠色點最近的有2個紅色三角形和3個藍色的正方形，這5個點投票，于是綠色的這個待分類點屬于藍色的正方形

另一種變形：本人想的。對于某個點，找出其最近的k個相同的點，這k個相同的點屬于哪一類，就将其作為該類的label。

如果K=1，則綠色的點屬于紅色。

如果K=2，則綠色的點屬于紅色。

如果K=3，則綠色的點屬于藍色。

這兩種方法應用上有什麼差別呢？待思考。

K-means 和 KNN的差別：

K-means

1、聚類算法

2、非監督

3、K是人為定的，需要一定先驗知識

KNN

1、分類

2、監督

3、K個最近的點

K-means clustering and K-nearest neighbourhood classifier

K-means matlab 代碼

RGB= imread ('test.jpg'); %讀入
img=rgb2gray(RGB);
[m,n]=size(img);
subplot(,,),imshow(img);title(' 圖一 原圖像')
subplot(,,),imhist(img);title(' 圖二 原圖像的灰階直方圖')
hold off;
img=double(img);
c1()=;
c2()=;
c3()=;%選擇三個初始聚類中心
for i=:
    r=abs(img-c1(i));
    g=abs(img-c2(i));
    b=abs(img-c3(i));%計算各像素灰階與聚類中心的距離
    r_g=r-g;
    g_b=g-b;
    r_b=r-b;
    n_r=find(r_g<=&r_b<=);%尋找最小的聚類中心
    n_g=find(r_g>&g_b<=);%尋找中間的一個聚類中心
    n_b=find(g_b>&r_b>);%尋找最大的聚類中心
    i=i+;
    c1(i)=sum(img(n_r))/length(n_r);%将所有低灰階求和取平均，作為下一個低灰階中心
    c2(i)=sum(img(n_g))/length(n_g);%将所有低灰階求和取平均，作為下一個中間灰階中心
    c3(i)=sum(img(n_b))/length(n_b);%将所有低灰階求和取平均，作為下一個高灰階中心
    d1(i)=abs(c1(i)-c1(i-));
    d2(i)=abs(c2(i)-c2(i-));
    d3(i)=abs(c3(i)-c3(i-));
    if d1(i)<=&&d2(i)<=&&d3(i)<=
        R=c1(i);
        G=c2(i);
        B=c3(i);
        k=i;
        break;
    end
end
R
G
B
img=uint8(img);
img(find(img<R))=;
img(find(img>R&img<G))=;
img(find(img>G))=;

subplot(,,),imshow(img);title(' 圖三 聚類後的圖像')
subplot(,,),imhist(img);title(' 圖四 聚類後的圖像直方圖'

K-means C++代碼，轉自wiki

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef struct { double x, y; int group; } point_t, *point;

double randf(double m)
{
    return m * rand() / (RAND_MAX - );
}

point gen_xy(int count, double radius)
{
    double ang, r;
    point p, pt = malloc(sizeof(point_t) * count);

    /* note: this is not a uniform 2-d distribution */
    for (p = pt + count; p-- > pt;) {
        ang = randf( * M_PI);
        r = randf(radius);
        p->x = r * cos(ang);
        p->y = r * sin(ang);
    }

    return pt;
}

inline double dist2(point a, point b)
{
    double x = a->x - b->x, y = a->y - b->y;
    return x*x + y*y;
}

inline int
nearest(point pt, point cent, int n_cluster, double *d2)
{
    int i, min_i;
    point c;
    double d, min_d;

#   define for_n for (c = cent, i = 0; i < n_cluster; i++, c++)
    for_n {
        min_d = HUGE_VAL;
        min_i = pt->group;
        for_n {
            if (min_d > (d = dist2(c, pt))) {
                min_d = d; min_i = i;
            }
        }
    }
    if (d2) *d2 = min_d;
    return min_i;
}

void kpp(point pts, int len, point cent, int n_cent)
{
#   define for_len for (j = 0, p = pts; j < len; j++, p++)
    int i, j;
    int n_cluster;
    double sum, *d = malloc(sizeof(double) * len);

    point p, c;
    cent[] = pts[ rand() % len ];
    for (n_cluster = ; n_cluster < n_cent; n_cluster++) {
        sum = ;
        for_len {
            nearest(p, cent, n_cluster, d + j);
            sum += d[j];
        }
        sum = randf(sum);
        for_len {
            if ((sum -= d[j]) > ) continue;
            cent[n_cluster] = pts[j];
            break;
        }
    }
    for_len p->group = nearest(p, cent, n_cluster, );
    free(d);
}

point lloyd(point pts, int len, int n_cluster)
{
    int i, j, min_i;
    int changed;

    point cent = malloc(sizeof(point_t) * n_cluster), p, c;

    /* assign init grouping randomly */
    //for_len p->group = j % n_cluster;

    /* or call k++ init */
    kpp(pts, len, cent, n_cluster);

    do {
        /* group element for centroids are used as counters */
        for_n { c->group = ; c->x = c->y = ; }
        for_len {
            c = cent + p->group;
            c->group++;
            c->x += p->x; c->y += p->y;
        }
        for_n { c->x /= c->group; c->y /= c->group; }

        changed = ;
        /* find closest centroid of each point */
        for_len {
            min_i = nearest(p, cent, n_cluster, );
            if (min_i != p->group) {
                changed++;
                p->group = min_i;
            }
        }
    } while (changed > (len >> )); /* stop when 99.9% of points are good */

    for_n { c->group = i; }

    return cent;
}

void print_eps(point pts, int len, point cent, int n_cluster)
{
#   define W 400
#   define H 400
    int i, j;
    point p, c;
    double min_x, max_x, min_y, max_y, scale, cx, cy;
    double *colors = malloc(sizeof(double) * n_cluster * );

    for_n {
        colors[*i + ] = ( * (i + ) % )/;
        colors[*i + ] = ( * i % )/;
        colors[*i + ] = ( * i % )/;
    }

    max_x = max_y = -(min_x = min_y = HUGE_VAL);
    for_len {
        if (max_x < p->x) max_x = p->x;
        if (min_x > p->x) min_x = p->x;
        if (max_y < p->y) max_y = p->y;
        if (min_y > p->y) min_y = p->y;
    }
    scale = W / (max_x - min_x);
    if (scale > H / (max_y - min_y)) scale = H / (max_y - min_y);
    cx = (max_x + min_x) / ;
    cy = (max_y + min_y) / ;

    printf("%%!PS-Adobe-3.0\n%%%%BoundingBox: -5 -5 %d %d\n", W + , H + );
    printf( "/l {rlineto} def /m {rmoveto} def\n"
        "/c { .25 sub exch .25 sub exch .5 0 360 arc fill } def\n"
        "/s { moveto -2 0 m 2 2 l 2 -2 l -2 -2 l closepath "
        "   gsave 1 setgray fill grestore gsave 3 setlinewidth"
        " 1 setgray stroke grestore 0 setgray stroke }def\n"
    );
    for_n {
        printf("%g %g %g setrgbcolor\n",
            colors[*i], colors[*i + ], colors[*i + ]);
        for_len {
            if (p->group != i) continue;
            printf("%.3f %.3f c\n",
                (p->x - cx) * scale + W / ,
                (p->y - cy) * scale + H / );
        }
        printf("\n0 setgray %g %g s\n",
            (c->x - cx) * scale + W / ,
            (c->y - cy) * scale + H / );
    }
    printf("\n%%%%EOF");
    free(colors);
#   undef for_n
#   undef for_len
}

#define PTS 100000
#define K 11
int main()
{
    int i;
    point v = gen_xy(PTS, );
    point c = lloyd(v, PTS, K);
    print_eps(v, PTS, c, K);
    // free(v); free(c);
    return ;
}

KNN 代碼

轉自（http://www.cppblog.com/unixfy/archive/2012/02/14/165537.aspx）

#include <iostream>  
#include <string>  
#include <vector>  
#include <set>  
#include <map>  
#include <fstream>  
#include <sstream>  
#include <cassert>  
#include <cmath>  
using namespace std;  

//樣例結構體，所屬類型和特征向量  
struct sample  
{  
    string type;  
    vector<double> features;  
};  

// 類型和距離結構體，未用到  
struct typeDistance  
{  
    string type;  
    double distance;  
};  

bool operator < (const typeDistance& lhs, const typeDistance& rhs)  
{  
    return lhs.distance < rhs.distance;  
}  

// 讀取訓練樣本  
// 訓練樣本的格式是：每行代表一個樣例  
// 每行的第一個元素是類型名，後面的是樣例的特征向量  
// 例如：  
/* 
a    1 2 3 4 5 
b    5 4 3 2 1 
c    3 3 3 3 3 
d    -3 -3 -3 -3 -3 
a    1 2 3 4 4 
b    4 4 3 2 1 
c    3 3 3 2 4 
d    0 0 1 1 -2 
*/  
void readTrain(vector<sample>& train, const string& file)  
{  
    ifstream fin(file.c_str());  
    if (!fin)  
    {  
        cerr << "File error!" << endl;  
        exit();  
    }  
    string line;  
    double d = ;  
    while (getline(fin, line))  
    {  
        istringstream sin(line);  
        sample ts;  
        sin >> ts.type;  
        while (sin >> d)  
        {  
            ts.features.push_back(d);  
        }  
        train.push_back(ts);  
    }  
    fin.close();  
}  

// 讀取測試樣本  
// 每行代表一個樣例  
// 每一行是一個樣例的特征向量  
// 例如：  
/* 
1 2 3 2 4 
2 3 4 2 1 
8 7 2 3 5 
-3 -2 2 4 0 
-4 -4 -4 -4 -4 
1 2 3 4 4 
4 4 3 2 1 
3 3 3 2 4 
0 0 1 1 -2 
*/  
void readTest(vector<sample>& test, const string& file)  
{  
    ifstream fin(file.c_str());  
    if (!fin)  
    {  
        cerr << "File error!" << endl;  
        exit();  
    }  
    double d = ;  
    string line;  
    while (getline(fin, line))  
    {  
        istringstream sin(line);  
        sample ts;  
        while (sin >> d)  
        {  
            ts.features.push_back(d);  
        }  
        test.push_back(ts);  
    }  
    fin.close();  
}  

// 計算歐氏距離  
double euclideanDistance(const vector<double>& v1, const vector<double>& v2)  
{  
    assert(v1.size() == v2.size());  
    double ret = ;  
    /* 
    size_type由string類類型和vector類類型定義的類型，用以儲存任意string對象或vector對象的長度，标準庫類型将size_type定義為unsigned類型 
    */  
    for (vector<double>::size_type i = ; i != v1.size(); ++i)  
    {  
        ret += (v1[i] - v2[i]) * (v1[i] - v2[i]);  
    }  
    return sqrt(ret);  
}  

// 初始化距離矩陣  
// 該矩陣是根據訓練樣本和測試樣本而得  
// 矩陣的行數為測試樣本的數目，列數為訓練樣本的數目  
// 每一行為一個測試樣本到各個訓練樣本之間的歐式距離組成的數組  
void initDistanceMatrix(vector<vector<double> >& dm, const vector<sample>& train, const vector<sample>& test)  
{  
    for (vector<sample>::size_type i = ; i != test.size(); ++i)  
    {  
        vector<double> vd;  
        for (vector<sample>::size_type j = ; j != train.size(); ++j)  
        {  
            vd.push_back(euclideanDistance(test[i].features, train[j].features));  
        }  
        dm.push_back(vd);  
    }  
}  

// K-近鄰法的實作  
// 設定不同的 k 值，給每個測試樣例予以一個類型  
// 距離和權重成反比  
void knnProcess(vector<sample>& test, const vector<sample>& train, const vector<vector<double> >& dm, unsigned int k)  
{  
    for (vector<sample>::size_type i = ; i != test.size(); ++i)  
    {  
        multimap<double, string> dts;  //儲存與測試樣本i距離最近的k個點  
        for (vector<double>::size_type j = ; j != dm[i].size(); ++j)  
        {  
            if (dts.size() < k) //把前面k個插入dts中  
            {  
                dts.insert(make_pair(dm[i][j], train[j].type)); //插入時會自動排序，按dts中的double排序，最小的排在最後  
            }  
            else  
            {  
                multimap<double, string>::iterator it = dts.end();  
                --it;  
                if (dm[i][j] < it->first) //把目前測試樣本i到目前訓練樣本之間的歐氏距離與dts中最小距離比較，若更小就更新dts  
                {  
                    dts.erase(it);  
                    dts.insert(make_pair(dm[i][j], train[j].type));  
                }  
            }  
        }  
        map<string, double> tds;  
        string type = "";  
        double weight = ;  
        //下面for循環主要是求出與測試樣本i最鄰近的k個樣本點中大多數屬于的類别，即将其作為測試樣本點i的類别  
        for (multimap<double, string>::const_iterator cit = dts.begin(); cit != dts.end(); ++cit)  
        {  
            // 不考慮權重的情況，在 k 個樣例中隻要出現就加 1  
            // ++tds[cit->second];  

            // 這裡是考慮距離與權重的關系，距離越大權重越小  
            tds[cit->second] +=  / cit->first;  
            if (tds[cit->second] > weight)  
            {  
                weight = tds[cit->second];  
                type = cit->second;  //儲存一下類别  
            }  
        }  
        test[i].type = type;  
    }  
}  

// 輸出結果  
// 輸出的格式和訓練樣本的格式一樣  
// 每行表示一個樣例，第一個元素是該樣例的類型，後面是該樣例的特征向量  
// 例如：  
/* 
a    1 2 3 2 4  
b    2 3 4 2 1  
b    8 7 2 3 5  
a    -3 -2 2 4 0  
d    -4 -4 -4 -4 -4  
a    1 2 3 4 4  
b    4 4 3 2 1  
c    3 3 3 2 4  
d    0 0 1 1 -2  
*/  
void writeTest(const vector<sample>& test, const string& file)  
{  
    ofstream fout(file.c_str());  
    if (!fout)  
    {  
        cerr << "File error!" << endl;  
        exit();  
    }  
    for (vector<sample>::size_type i = ; i != test.size(); ++i)  
    {  
        fout << test[i].type << '\t';  
        for (vector<double>::size_type j = ; j != test[i].features.size(); ++j)  
        {  
            fout << test[i].features[j] << ' ';  
        }  
        fout << endl;  
    }  
}  

// 封裝  
void knn(const string& file1, const string& file2, const string& file3, int k)  
{  
    vector<sample> train, test;  
    readTrain(train, file1.c_str());  
    readTest(test, file2.c_str());  
    vector<vector<double> > dm;  
    initDistanceMatrix(dm, train, test);  
    knnProcess(test, train, dm, k);  
    writeTest(test, file3.c_str());  
}  

// 測試  
int main()  
{  
    knn("train.txt", "test.txt", "result.txt", );  
    return ;  
}

train.txt:

a 1 2 3 4 5

b 5 4 3 2 1

c 3 3 3 3 3

d -3 -3 -3 -3 -3

a 1 2 3 4 4

b 4 4 3 2 1

c 3 3 3 2 4

d 0 0 1 1 -2

test.txt:

1 2 3 2 4

2 3 4 2 1

8 7 2 3 5

-3 -2 2 4 0

-4 -4 -4 -4 -4

1 2 3 4 4

4 4 3 2 1

3 3 3 2 4

0 0 1 1 -2

result.txt:

a 1 2 3 2 4

b 2 3 4 2 1

b 8 7 2 3 5

a -3 -2 2 4 0

d -4 -4 -4 -4 -4

a 1 2 3 4 4

b 4 4 3 2 1

c 3 3 3 2 4

d 0 0 1 1 -2

K-means clustering and K-nearest neighbourhood classifier

繼續閱讀

Flash AS3 連續加載外部若幹圖檔

JBPM資料庫表說明 (1)

java.util.PropertyPermission翻譯

翻譯TIPatterns--模式重構(Pattern refactoring)

System.out.println()方法

object-c 記憶體管理學習筆記

Replace Data Value with Object(以對象取代資料值)

《JAVA程式設計思想》第四版學習需要我記住的something –持有對象

《JAVA程式設計思想》第四版學習需要我記住的something –複用類

淺談自制MFC控件淺談自制MFC控件

（C# 程式設計指南）

C#多線程——前台線程和背景線程

Android – ListView 中添加按鈕，動态删除添加ItemView的操作

IE8 CSS設定DIV居中，添加“margin:0 auto”

Small tricks

【python】【資料處理】畫多元資料分布圖