請編寫程式,對一段英文文本,統計其中所有不同單詞的個數,以及詞頻最大的前10%的單詞。
所謂“單詞”,是指由不超過80個單詞字元組成的連續字元串,但長度超過15的單詞將只截取保留前15個單詞字元。而合法的“單詞字元”為大小寫字母、數字和下劃線,其它字元均認為是單詞分隔符。
輸入格式:
輸入給出一段非空文本,最後以符號 `#` 結尾。輸入保證存在至少10個不同的單詞。
輸出格式:
在第一行中輸出文本中所有不同單詞的個數。注意“單詞”不區分英文大小寫,例如“PAT”和“pat”被認為是同一個單詞。
隨後按照詞頻遞減的順序,按照 `詞頻:單詞` 的格式輸出詞頻最大的前10%的單詞。若有並列,則按遞增字典序輸出。
輸入樣例:
This is a test.
The word "this" is the word with the highest frequency.
Longlonglonglongword should be cut off, so is considered as the same as longlonglonglonee. But this_8 is different than this, and this, and this...#
this line should be ignored.
輸出樣例:(注意:雖然單詞 `the` 也出現了4次,但因為我們只要輸出前10%(即23個單詞中的前2個)單詞,而按照字母序,`the` 排第3位,所以不輸出。)
23
5:this
4:is
map映射到數組,統計并排序。
#include <algorithm>
#include <cctype>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
using namespace std;
// One distinct word together with the number of times it has been seen.
struct str
{
    char s[16]; // word text; 16 bytes of storage including the terminating NUL
    int num;    // occurrence count
};

// Table of distinct words. main() fills it starting at index 1, so index 0 is
// never used for a word.
str ans[10000];

// Index of the next free entry in `ans`; the number of stored words is no - 1.
int no = 1;
bool cmp(str a,str b)
{
if(a.num == b.num)return strcmp(a.s,b.s) < 0;
return a.num > b.num;
}
// Records one occurrence of `word` in the global table `ans`.
// `slots` maps a word to its position in `ans`; a stored value of 0 means the
// word has not been seen yet, which works because positions start at 1.
static void record_word(std::map<std::string, int>& slots, const char* word)
{
    int& slot = slots[word];
    if (slot == 0)
    {
        std::strcpy(ans[no].s, word);
        ans[no].num = 1;
        slot = no++;
    }
    else
    {
        ans[slot].num++;
    }
}

// Reads text from standard input up to the first '#' (or end of input),
// counts case-insensitive words made of letters, digits and '_' (truncated to
// 15 characters), then prints the number of distinct words followed by the
// top 10% of them as "count:word", ordered by cmp.
int main()
{
    char ch, s[16];
    int c = 0; // number of characters currently buffered in s
    std::map<std::string, int> p;

    // get(ch) fails at end of input, so a missing '#' cannot loop forever.
    while (std::cin.get(ch) && ch != '#')
    {
        // <cctype> functions require a value representable as unsigned char.
        const unsigned char uc = static_cast<unsigned char>(ch);
        if (ch == '_' || std::isalnum(uc))
        {
            // Characters past the 15th are dropped but still belong to the word.
            if (c < 15) s[c++] = static_cast<char>(std::tolower(uc));
        }
        else if (c)
        {
            s[c] = '\0';
            c = 0;
            record_word(p, s);
        }
    }

    // A word that directly touches '#' (or end of input) has no trailing
    // separator, so it must be flushed here or it would be lost.
    if (c)
    {
        s[c] = '\0';
        record_word(p, s);
    }

    // `no` is one past the last used slot, so the distinct count is no - 1;
    // the 10% cut-off must be computed from that count, not from `no`.
    const int distinct = no - 1;
    std::sort(ans + 1, ans + no, cmp);
    std::cout << distinct << '\n';
    for (int i = 1; i <= distinct / 10; i++)
    {
        std::cout << ans[i].num << ':' << ans[i].s << '\n';
    }
}
#include <iostream>
#include <cstdlib>
#include <map>
#include <algorithm>
#include <sstream>
#include <vector>
using namespace std;
// Occurrence count of every word seen so far.
std::map<std::string, int> mp;

// The distinct words, each stored once; sorted into ranking order before output.
std::vector<std::string> vec;
// Ranking order for words: higher count in `mp` first; equal counts are
// ordered by ascending string comparison.
// Arguments are taken by const reference and each count is looked up once, so
// a comparison neither copies the strings nor repeats map lookups.
bool cmp(const std::string& a, const std::string& b) {
    const int freq_a = mp[a];
    const int freq_b = mp[b];
    if (freq_a == freq_b) return a < b;
    return freq_a > freq_b;
}
void check(string t) {
int s = 0;
for(int i = 0;i <= t.size();i ++) {
if(!isalnum(t[i]) && t[i] != '_') {
string temp = "";
if(i - s > 15) temp = t.substr(s,15);
else if(i != s) temp = t.substr(s,i - s);
s = i + 1;
if(temp == "") continue;
if(!mp[temp] ++) vec.push_back(temp);
}
else t[i] = tolower(t[i]);
}
}
// Reads lines from standard input until the first '#', feeds the text before
// it to check(), then prints the number of distinct words followed by the top
// 10% of them as "count:word", ordered by cmp.
int main() {
    std::string s;
    while (std::getline(std::cin, s)) {
        // The terminator may appear anywhere in a line; everything from '#'
        // onwards must be ignored, not only when '#' is the last character.
        const std::string::size_type stop = s.find('#');
        const bool is_last = (stop != std::string::npos);
        if (is_last) s.erase(stop);
        // check() already treats whitespace as a separator, so the whole line
        // can be passed without splitting it into tokens first.
        check(s);
        if (is_last) break;
    }
    std::sort(vec.begin(), vec.end(), cmp);
    std::cout << vec.size() << '\n';
    const std::vector<std::string>::size_type top = vec.size() / 10;
    for (std::vector<std::string>::size_type i = 0; i < top; i++) {
        std::cout << mp[vec[i]] << ':' << vec[i] << '\n';
    }
}
import sys
# Characters that may appear in a word: ASCII letters, digits and underscore.
# An explicit set is used because str.isalnum() also accepts non-ASCII
# letters and digits.
_WORD_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789_"
)

# Words longer than this keep only their first characters.
_MAX_WORD_LEN = 15


def count_words(text):
    """Count the words in ``text`` that appear before the first ``#``.

    A word is a maximal run of ASCII letters, digits and underscores. It is
    lower-cased and truncated to its first 15 characters before counting.
    If ``text`` contains no ``#`` the whole text is used.

    Returns a dict mapping each word to its number of occurrences.
    """
    end = text.find("#")
    if end != -1:
        # Everything from the terminator onwards is ignored, including the
        # remainder of the line that contains it.
        text = text[:end]

    counts = {}
    current = []
    # The trailing space is a sentinel separator that flushes the last word.
    for ch in text + " ":
        if ch in _WORD_CHARS:
            # Extra characters are dropped but still belong to the same word.
            if len(current) < _MAX_WORD_LEN:
                current.append(ch)
        elif current:
            word = "".join(current).lower()
            counts[word] = counts.get(word, 0) + 1
            current = []
    return counts


def top_words(counts):
    """Return the top 10% of words as a list of ``(word, count)`` tuples.

    Words are ordered by descending count, ties broken by ascending word.
    The number of entries returned is ``len(counts) // 10``.
    """
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return ranked[:len(ranked) // 10]


def main():
    """Read stdin up to the line containing ``#`` and print the report."""
    lines = []
    for line in sys.stdin:
        lines.append(line)
        if "#" in line:
            break
    counts = count_words("".join(lines))
    print(len(counts))
    for word, freq in top_words(counts):
        print("%d:%s" % (freq, word))


if __name__ == "__main__":
    main()