天天看點

詞頻統計

請編寫程式,對一段英文文本,統計其中所有不同單詞的個數,以及詞頻最大的前10%的單詞。

所謂“單詞”,是指由不超過80個單詞字元組成的連續字元串,但長度超過15的單詞將只截取保留前15個單詞字元。而合法的“單詞字元”為大小寫字母、數字和下劃線,其它字元均認為是單詞分隔符。

輸入格式:

輸入給出一段非空文本,最後以符號

#

結尾。輸入保證存在至少10個不同的單詞。

輸出格式:

在第一行中輸出文本中所有不同單詞的個數。注意“單詞”不區分英文大小寫,例如“PAT”和“pat”被認為是同一個單詞。

隨後按照詞頻遞減的順序,按照

詞頻:單詞

的格式輸出詞頻最大的前10%的單詞。若有并列,則按遞增字典序輸出。

輸入樣例:

This is a test.

The word "this" is the word with the highest frequency.

Longlonglonglongword should be cut off, so is considered as the same as longlonglonglonee.  But this_8 is different than this, and this, and this...#
this line should be ignored.
           

輸出樣例:(注意:雖然單詞

the

也出現了4次,但因為我們只要輸出前10%(即23個單詞中的前2個)單詞,而按照字母序,

the

排第3位,所以不輸出。)

23
5:this
4:is
                

map映射到數組,統計并排序。      
#include <iostream>
#include <algorithm>
#include <map>
#include <cstring>
using namespace std;
// One row of the word table: the stored word and how often it occurred.
struct str
{
    char s[16];  // up to 15 word characters plus the terminating '\0'
    int num;     // number of occurrences
};
// Word table; main fills it starting at index 1.
str ans[10000];
// Index of the next free slot in `ans`; starts at 1, so slot 0 is never used.
int no = 1;
// Ranking order for the output: higher count first; equal counts are ordered
// by ascending byte-wise comparison of the words (strcmp).
bool cmp(str a,str b)
{
    if(a.num != b.num)return a.num > b.num;
    return strcmp(a.s,b.s) < 0;
}
/**
 * Reads text from standard input up to the first '#' (or the end of input),
 * counts case-insensitive words made of letters, digits and '_' (only the
 * first 15 characters of a word are kept), then prints the number of distinct
 * words followed by the top 10% of them (rounded down) as "count:word", in
 * the order defined by `cmp`.
 *
 * Fills the file-level table `ans` (slots 1 .. no-1) and advances `no`.
 *
 * @return 0 on success, 1 if the word table `ans` has no free slot left.
 */
int main()
{
    // Number of slots in `ans`; slot 0 stays unused because index 0 in the
    // map below means "word not seen yet".
    const int capacity = static_cast<int>(sizeof(ans) / sizeof(ans[0]));
    char ch = '\0';
    char s[16];
    int c = 0;
    bool finished = false;
    map<string,int> p;
    while(!finished)
    {
        // get(char&) reports end of input through the stream state, so a text
        // without the closing '#' ends the loop instead of spinning forever.
        if(!cin.get(ch) || ch == '#')
        {
            finished = true;
            ch = ' ';  // act as a separator so a word right before '#' is flushed
        }
        // The <cctype> functions need a value representable as unsigned char.
        const unsigned char uc = static_cast<unsigned char>(ch);
        if(ch == '_' || isdigit(uc) || isalpha(uc))
        {
            // Characters beyond the 15th are consumed but not stored.
            if(c < 15)s[c ++] = static_cast<char>(tolower(uc));
            continue;
        }
        if(c == 0)continue;  // separator with no pending word
        s[c] = '\0';
        c = 0;
        int &slot = p[s];
        if(slot != 0)
        {
            ans[slot].num ++;
            continue;
        }
        if(no >= capacity)
        {
            cerr<<"too many distinct words"<<endl;
            return 1;
        }
        strcpy(ans[no].s,s);
        ans[no].num = 1;
        slot = no ++;
    }
    const int distinct = no - 1;
    sort(ans + 1,ans + no,cmp);
    cout<<distinct<<endl;
    // Top 10% of the distinct words, rounded down.
    for(int i = 1;i <= distinct / 10;i ++)
    {
        cout<<ans[i].num<<':'<<ans[i].s<<endl;
    }
    return 0;
}
#include <iostream>
#include <cstdlib>
#include <map>
#include <algorithm>
#include <sstream>
#include <vector>
using namespace std;
// Occurrence count of every distinct word recorded by check().
map<string,int> mp;
// Each distinct word once, in first-seen order; main sorts it with cmp before printing.
vector<string> vec;
// Ranking order for the output: the word with the larger count in `mp` comes
// first; words with equal counts are ordered by ascending string comparison.
bool cmp(string a,string b) {
    const int countA = mp[a];
    const int countB = mp[b];
    return countA != countB ? countA > countB : a < b;
}
void check(string t) {
    int s = 0;
    for(int i = 0;i <= t.size();i ++) {
        if(!isalnum(t[i]) && t[i] != '_') {
            string temp = "";
            if(i - s > 15) temp = t.substr(s,15);
            else if(i != s) temp = t.substr(s,i - s);
            s = i + 1;
            if(temp == "") continue;
            if(!mp[temp] ++) vec.push_back(temp);
        }
        else t[i] = tolower(t[i]);
    }
}
// Reads lines from standard input until the first '#' (or the end of input),
// passes the text before it to check(), then prints the number of distinct
// words followed by the top 10% of them (rounded down) as "count:word", in the
// order defined by cmp.
int main() {
    string s;
    while(getline(cin,s)) {
        // Everything from the first '#' on is ignored, wherever it sits in the
        // line, not only when it is the line's last character.
        const string::size_type hash = s.find('#');
        const bool last = (hash != string::npos);
        if(last) s.erase(hash);
        // check() already treats whitespace as a separator, so the line can be
        // handed over whole.
        check(s);
        if(last) break;
    }
    sort(vec.begin(),vec.end(),cmp);
    cout<<vec.size()<<'\n';
    const vector<string>::size_type top = vec.size() / 10;
    for(vector<string>::size_type i = 0;i < top;i ++) {
        cout<<mp[vec[i]]<<':'<<vec[i]<<'\n';
    }
    return 0;
}
import re
import sys

# A word is a maximal run of ASCII letters, digits and underscores.
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")
# Only this many leading characters of a word are kept.
_MAX_WORD_LEN = 15


def count_words(text):
    """Count the case-insensitive words in ``text`` up to the first '#'.

    Everything from the first '#' onward is ignored; if there is no '#', the
    whole text is used. Each word is cut to its first 15 characters and
    lower-cased before counting.

    Returns a dict mapping each word to its number of occurrences.
    """
    body = text.split("#", 1)[0]
    counts = {}
    for run in _WORD_RE.findall(body):
        word = run[:_MAX_WORD_LEN].lower()
        counts[word] = counts.get(word, 0) + 1
    return counts


def top_words(counts):
    """Return the top 10% (rounded down) of ``counts`` as (word, count) pairs.

    Pairs are ordered by descending count, ties by ascending word.
    """
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return ranked[: len(ranked) // 10]


def main():
    """Read the text from stdin and print the distinct-word count and ranking."""
    counts = count_words(sys.stdin.read())
    print(len(counts))
    for word, freq in top_words(counts):
        print("%d:%s" % (freq, word))


if __name__ == "__main__":
    main()