天天看點

統計英文單詞詞頻–Java

統計英文單詞詞頻–Java

思想:首先,英文文章中有很多標點符號,所以我們不妨把所有的標點或者與英文單詞無關的字元變成空格;之後把所有的單詞按空格切分裝進數組裡,然後用集合(HashMap)統計單詞的詞頻,最後按詞頻排序。廢話不多說,直接上代碼。

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;


/**
 * Counts how often each word occurs in an English text file and prints the
 * counts sorted by ascending frequency.
 *
 * <p>Words are lower-cased and split on a small, fixed set of separators
 * (double quote, comma, period, colon, question mark and whitespace). Any other
 * character, such as an apostrophe or hyphen, stays part of the word.
 */
public class WordFrequency {

    /** Input file read by {@link #main} when no path is passed on the command line. */
    private static final String DEFAULT_FILE = "D:\\word.txt";

    /** One or more consecutive separator characters: {@code " , . : ?} or whitespace. */
    private static final Pattern SEPARATORS = Pattern.compile("[\",.:?\\s]+");

    /**
     * Reads a whole text file into a single string, terminating every line
     * with {@code '\n'}.
     *
     * <p>This is best-effort: if the file cannot be opened or read, the error
     * is reported on standard error and whatever was read so far (possibly the
     * empty string) is returned. The file is decoded with the platform default
     * charset, as {@link FileReader} does.
     *
     * @param filename path of the file to read
     * @return the file content, or the part read before a failure occurred
     */
    public static String Read_File(String filename) {
        StringBuilder sb = new StringBuilder();
        if (filename == null) {
            System.err.println("Read_File: filename is null");
            return sb.toString();
        }
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(filename));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        } catch (IOException e) {
            System.err.println("Read_File: failed to read " + filename);
            e.printStackTrace();
        } finally {
            // Always release the file handle, even when reading failed midway.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Splits an article into lower-case words and counts each word.
     *
     * @param article text to analyse; {@code null} is treated as empty
     * @return a new map from word to number of occurrences; never {@code null}
     *         and never containing the empty string as a key
     */
    public static HashMap<String, Integer> Char_to_Space(String article) {
        HashMap<String, Integer> hm = new HashMap<String, Integer>();
        if (article == null) {
            return hm;
        }
        // Locale.ROOT keeps the lower-casing independent of the machine's
        // default locale (e.g. Turkish dotless i).
        String[] wordlist = SEPARATORS.split(article.toLowerCase(Locale.ROOT));
        for (String word : wordlist) {
            // A leading separator yields an empty first token; it is not a word.
            if (word.isEmpty()) {
                continue;
            }
            Integer count = hm.get(word);
            hm.put(word, count == null ? 1 : count + 1);
        }
        return hm;
    }

    /**
     * Prints the entries of the map to standard output, sorted by ascending
     * count. Entries with equal counts are ordered alphabetically by word so
     * that the output is deterministic.
     *
     * @param hm word-to-count map; it is not modified
     */
    public static void Sort(HashMap<String, Integer> hm) {
        ArrayList<Map.Entry<String, Integer>> list =
                new ArrayList<Map.Entry<String, Integer>>(hm.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {

            @Override
            public int compare(Entry<String, Integer> o1,
                    Entry<String, Integer> o2) {
                int byCount = o1.getValue().compareTo(o2.getValue());
                if (byCount != 0) {
                    return byCount;
                }
                return o1.getKey().compareTo(o2.getKey());
            }

        });

        System.out.println(list);
    }

    /**
     * Reads the file named by the first command-line argument (or the default
     * file when none is given), counts its words and prints the sorted counts.
     *
     * @param args optional: {@code args[0]} is the path of the file to analyse
     */
    public static void main(String[] args) {
        String filename = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE;
        String article = Read_File(filename);
        HashMap<String, Integer> hm = Char_to_Space(article);
        Sort(hm);
    }

}
           

另外,更多HashMap排序請戳這裡