統計英文單詞詞頻–Java
思想:首先,英文文章中有很多標點符號,所以我們不妨把所有的標點或者與英文單詞無關的字元變成空格,之後把所有的單詞按空格切分裝進數組裡,然後用集合統計單詞的詞頻,最後按詞頻排序。廢話不多說,直接上代碼。
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
/**
 * Counts how often each word occurs in an English text file and prints the
 * words ordered by ascending frequency.
 *
 * <p>Pipeline: {@link #Read_File(String)} loads the file into one string,
 * {@link #Char_to_Space(String)} tokenizes it and builds the frequency table,
 * and {@link #Sort(Map)} prints the table sorted by count.
 */
public class WordFrequency {

    /**
     * One or more characters that cannot be part of a word. Letters, digits and
     * the apostrophe (so contractions such as "don't" stay intact) are kept;
     * everything else (punctuation, whitespace, symbols) separates words.
     */
    private static final Pattern WORD_SEPARATOR = Pattern.compile("[^\\p{L}\\p{N}']+");

    /**
     * Reads a whole text file into a single string.
     *
     * <p>Every line is terminated with {@code "\n"} in the result, whatever line
     * separator the file used. Reading is best-effort: on an I/O failure the
     * stack trace is printed and whatever was read so far (possibly the empty
     * string) is returned.
     *
     * @param filename path of the file to read
     * @return the file content with normalized line endings; never {@code null}
     */
    public static String Read_File(String filename) {
        StringBuilder content = new StringBuilder();
        // try-with-resources guarantees the reader is closed even when reading fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append('\n');
            }
        } catch (IOException e) {
            // Best-effort: report the problem and return what has been read so far.
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Splits an article into lower-cased words and counts each word.
     *
     * <p>Any run of characters that are not letters, digits or apostrophes is
     * treated as a word separator, so punctuation never sticks to a word.
     * Empty tokens (for example from leading punctuation) are not counted.
     *
     * @param article the text to analyse; {@code null} is treated as empty text
     * @return a map from each distinct lower-cased word to its occurrence count
     */
    public static HashMap<String, Integer> Char_to_Space(String article) {
        HashMap<String, Integer> counts = new HashMap<>();
        if (article == null) {
            return counts;
        }
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String[] words = WORD_SEPARATOR.split(article.toLowerCase(Locale.ROOT));
        for (String word : words) {
            if (word.isEmpty()) {
                // split() yields a leading "" when the text starts with a separator.
                continue;
            }
            Integer seen = counts.get(word);
            counts.put(word, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /**
     * Prints the entries of a frequency table sorted by ascending count.
     *
     * <p>The sorted entry list is written to standard output in the default
     * {@code List} format, e.g. {@code [world=2, hello=3]}. The map itself is
     * not modified.
     *
     * @param hm word-to-count table to print
     */
    public static void Sort(Map<String, Integer> hm) {
        List<Map.Entry<String, Integer>> entries = new ArrayList<>(hm.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) {
                // Integer.compare avoids the overflow that plain subtraction can cause.
                return Integer.compare(o1.getValue(), o2.getValue());
            }
        });
        System.out.println(entries);
    }

    /**
     * Counts the words of a text file and prints them by ascending frequency.
     *
     * @param args optional: {@code args[0]} is the file to analyse; when absent
     *             the default path {@code D:\word.txt} is used
     */
    public static void main(String[] args) {
        String filename = args.length > 0 ? args[0] : "D:\\word.txt";
        String article = Read_File(filename);
        HashMap<String, Integer> hm = Char_to_Space(article);
        Sort(hm);
    }
}