本文轉載自:http://blog.csdn.net/caiye917015406/article/details/7887221,謝謝原作者!
==============================================================================
第一個是用 C 語言做的關于文本的分類,主要是對待分類文本所有單詞在模闆中機率的後驗計算。算法比較簡單,從網上下的(沒記下位址,若不願意公開,請留言,自當處理),稍作了一點修改。等有時間可以實作垃圾郵件的分類,利用斯坦福機器學習公開課中方法,統計高頻詞,利用樸素貝葉斯。等有時間和大家分享。
[cpp] view plain copy
- #include <stdio.h>
- #include <string.h>
- #include <direct.h> //_getcwd(), _chdir()
- #include <stdlib.h> //_MAX_PATH, system()
- #include <io.h> //_finddata_t, _findfirst(), _findnext(), _findclose()
- #include<iostream>
- using namespace std;
- //#include<fstream>
// Capacity of the global word table: at most kMaxWords entries, each holding
// up to kMaxWordLen - 1 characters plus the terminating '\0'.
enum { kMaxWords = 1000, kMaxWordLen = 20 };

// Words of the text being classified; filled by SplitToWord().
char vocabulary[kMaxWords][kMaxWordLen];

// Splits `text` into words and stores them in the global `vocabulary` table.
//
// Words are separated by any of the characters ',', ' ', '.' and '\n'.
// `text` is modified in place (strtok overwrites separators with '\0').
//
// @param text  NUL-terminated text to classify; may be NULL.
// @return      Number of words stored in `vocabulary` (0 for NULL/empty text).
//              At most kMaxWords words are stored; the rest are ignored.
//              Words longer than kMaxWordLen - 1 characters are truncated.
int SplitToWord(char text[])
{
    if (text == NULL)
        return 0;

    const char seps[] = ", .\n";
    int count = 0;

    // Stop once the table is full instead of writing past its last row.
    for (char *word = strtok(text, seps);
         word != NULL && count < kMaxWords;
         word = strtok(NULL, seps))
    {
        // Bounded copy: strncpy does not terminate when the source fills the
        // whole buffer, so the terminator is written explicitly.
        strncpy(vocabulary[count], word, kMaxWordLen - 1);
        vocabulary[count][kMaxWordLen - 1] = '\0';
        ++count;
    }
    return count;
}
// Counts the entries in the current working directory whose names match
// the pattern "*.txt".
//
// @return  Number of matching entries; 0 when nothing matches or the search
//          cannot be started.
int CountDirectory()
{
    struct _finddata_t fileinfo;

    // _findfirst returns intptr_t; keeping it in a `long` would truncate the
    // handle on 64-bit Windows, where long is 32 bits wide.
    const intptr_t hFile = _findfirst("*.txt", &fileinfo);
    if (hFile == -1)
        return 0;

    int count = 0;
    do
    {
        ++count;
    } while (_findnext(hFile, &fileinfo) == 0);

    // Release the search handle; it was previously leaked on every call.
    _findclose(hFile);
    return count;
}
// Reads the next word from `fp` into `word`.
//
// Words are delimited by ' ', '\t', '\n' or end of file; leading delimiters
// are skipped so that runs of blanks do not produce bogus tokens.
// Characters that do not fit into `capacity` bytes (terminator included) are
// consumed but dropped.
//
// @return 1 when a word was read, 0 when end of file was reached first.
static int ReadNextWord(FILE *fp, char *word, size_t capacity)
{
    // fgetc returns int so that EOF can be told apart from every valid byte.
    int ch = fgetc(fp);
    while (ch == ' ' || ch == '\t' || ch == '\n')
        ch = fgetc(fp);
    if (ch == EOF)
        return 0;

    size_t len = 0;
    while (ch != EOF && ch != ' ' && ch != '\t' && ch != '\n')
    {
        if (len + 1 < capacity)
            word[len++] = (char)ch;
        ch = fgetc(fp);
    }
    word[len] = '\0';
    return 1;
}

// Computes the product over all words of the text being classified of the
// smoothed, scaled word likelihood for the class whose training files
// ("*.txt") are in the current working directory:
//
//     product_j  (countSame_j + 1) / (wordCount + countAll) * 300
//
// where countSame_j is how often word j (vocabulary[j]) occurs in the
// training files and countAll is the total number of words in them.
// The factor 300 only scales every term so the product stays in a more
// readable range.
//
// All training files are read once; occurrences of every vocabulary word are
// tallied in the same pass instead of re-reading the corpus for each word.
//
// @param wordCount  Number of words of the text being classified that are
//                   stored in `vocabulary`.
// @return           The product described above; 1 when wordCount <= 0.
//
// Prints a message and terminates the process if a training file cannot be
// opened.
float CalculateWordProbability(int wordCount)
{
    const int capacity = (int)(sizeof(vocabulary) / sizeof(vocabulary[0]));
    // Never index past the end of the vocabulary table.
    const int words = wordCount < capacity ? wordCount : capacity;

    int countSame[sizeof(vocabulary) / sizeof(vocabulary[0])] = {0};
    int countAll = 0; // total number of words in the training files
    int j;

    struct _finddata_t fileinfo;
    // intptr_t, not long: the handle is pointer-sized on 64-bit Windows.
    const intptr_t hFile = _findfirst("*.txt", &fileinfo);
    if (hFile != -1)
    {
        do
        {
            FILE *fp = fopen(fileinfo.name, "r");
            if (fp == NULL)
            {
                printf("Sorry!Cannot open the file!\n");
                _findclose(hFile);
                exit(EXIT_FAILURE); // report the failure to the caller's shell
            }

            char keyword[1024];
            while (ReadNextWord(fp, keyword, sizeof(keyword)))
            {
                ++countAll;
                for (j = 0; j < words; ++j)
                {
                    if (strcmp(keyword, vocabulary[j]) == 0)
                        ++countSame[j];
                }
            }
            fclose(fp);
        } while (_findnext(hFile, &fileinfo) == 0);
        _findclose(hFile);
    }

    float wordProbability = 1; // neutral element for the running product
    for (j = 0; j < words; ++j)
        wordProbability *= (float)(countSame[j] + 1) / (float)(wordCount + countAll) * 300;
    return wordProbability;
}
// Limits of the class table used by CalculateProbability(): number of class
// directories scanned and size of one class-name buffer (terminator included).
enum { kClassCapacity = 10, kClassNameLen = 20 };

// Loads class names, one per line, from "ClassList.txt" in the current
// working directory into `classList`.
//
// At most kClassCapacity names are read; names longer than
// kClassNameLen - 1 characters are truncated. A last line without a trailing
// newline is still stored and terminated.
//
// @return Number of names read, or -1 if the file cannot be opened.
static int LoadClassList(char classList[][kClassNameLen])
{
    FILE *fp = fopen("ClassList.txt", "r");
    if (fp == NULL)
    {
        printf("Failed to open the file: ClassList.txt.\n");
        return -1;
    }

    int row = 0; // index of the name being filled
    int col = 0; // next free position inside that name
    int ch;      // int, so EOF is distinguishable from every valid byte
    while (row < kClassCapacity && (ch = fgetc(fp)) != EOF)
    {
        if (ch == '\n')
        {
            classList[row][col] = '\0';
            ++row;
            col = 0;
        }
        else if (col < kClassNameLen - 1)
        {
            classList[row][col++] = (char)ch;
        }
    }
    if (row < kClassCapacity && col > 0)
    {
        classList[row][col] = '\0';
        ++row;
    }

    fclose(fp);
    return row;
}

// Scores the text held in `vocabulary` against every class and prints the
// per-class score followed by the best-scoring class.
//
// For each of the kClassCapacity training directories ("...\\example\\1" to
// "...\\example\\10") the function changes into the directory, counts its
// "*.txt" entries and computes the word likelihood product. A directory that
// cannot be entered is reported and contributes a count and likelihood of 0.
// The score of class i is (txtCount[i] / total txt count) * likelihood[i].
//
// The process working directory is left at the last directory entered.
//
// @param wordCount  Number of words of the text being classified.
// @param num        Number of classes to score and print; values above
//                   kClassCapacity are clamped.
void CalculateProbability(int wordCount,int num)
{
    char classList[kClassCapacity][kClassNameLen] = {{0}};
    if (LoadClassList(classList) < 0)
        return; // nothing sensible can be printed without the class names

    if (num > kClassCapacity)
        num = kClassCapacity;

    int txtCount[kClassCapacity] = {0};          // training texts per class
    float wordProbability[kClassCapacity] = {0}; // likelihood product per class
    int countAll = 0;                            // training texts over all classes

    for (int c = 0; c < kClassCapacity; ++c)
    {
        // Fixed prefix plus a small integer: 512 bytes cannot overflow.
        char path[512];
        sprintf(path, "D:\\openCV\\openCVProject\\openCVtext\\貝葉斯(文本分類)—c語言\\example\\%d", c + 1);

        if (_chdir(path))
        {
            printf("系統找不到指定路徑!\n");
            continue;
        }
        txtCount[c] = CountDirectory();
        countAll += txtCount[c];
        wordProbability[c] = CalculateWordProbability(wordCount);
    }

    float max = 0;
    int classNo = 0;
    for (int i = 0; i < num; ++i)
    {
        // Prior is 0 when no training text exists at all (avoids 0/0).
        const float priorProbability =
            countAll > 0 ? (float)txtCount[i] / (float)countAll : 0;
        const float finalProbability = priorProbability * wordProbability[i];
        if (finalProbability > max)
        {
            max = finalProbability;
            classNo = i;
        }
        printf("該文本為類别%s的機率為:%.5e\n", classList[i], finalProbability);
    }
    printf("\n經分析,該文本最有可能為%s類文本!\n", classList[classNo]);
}
- //@輸入參數:分類文本
- void NaiveBayesClassifier(char text[],int num)
- {
- int vocabularyCount;//分類樣本中單詞數
- vocabularyCount=SplitToWord(text); //對要分類的文本進行單詞分割,結果存儲在vocabulary數組中,傳回分類樣本中單詞數
- CalculateProbability(vocabularyCount,num); //計算最終機率
- }
- int main()
- {
- FILE *fp;
- if((fp=fopen("text.txt","r"))==NULL)
- {
- printf("Failed to open the file: ClassList.txt.\n");
- }
- char ch = fgetc(fp);
- int i=0;
- while(ch!=EOF)
- {
- ch = fgetc(fp);
- i++;
- }
- char *text=new char(i+1);
- fseek(fp,0,SEEK_SET);//
- ch = fgetc(fp);
- int j=0;
- while(ch!=EOF)
- {
- ch = fgetc(fp);
- cout<<ch;
- text[j]=ch;
- j++;
- }
- // char text[]=new char(i);;
- int num = 2;
- NaiveBayesClassifier(text,num);
- return 1;
- }