
Plagiarism Detection by Comparing Text Similarity with Cosine Similarity
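The idea: segment each document into words, build a term-frequency vector per document, and flag a pair as suspicious when the angle between their vectors is small. The standard cosine-similarity formula, which calculateCos in step 5 implements, is

\cos\theta = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^{2}} \cdot \sqrt{\sum_{i=1}^{n} B_i^{2}}}

where A and B are the two term-frequency vectors over a shared vocabulary; a value near 1 means the same relative word mix, a value near 0 means almost no shared words.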

1. Add the tokenizer and Word-reading dependencies to pom.xml

<!-- IK Chinese analyzer -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>
<!-- Lucene dependency -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>4.7.2</version>
</dependency>
<!-- Word (.doc) reading -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14-beta1</version>
</dependency>
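With those three dependencies in place, a quick sanity check that the IK analyzer resolves and segments Chinese text (a throwaway sketch; the class name is ours):

import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IkSmokeTest {
    public static void main(String[] args) throws Exception {
        // true = smart segmentation mode
        IKSegmenter iks = new IKSegmenter(new StringReader("餘弦定理對比文本相似度"), true);
        Lexeme lexeme;
        while ((lexeme = iks.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}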
           

2. JSP usage

<button type="button" style="width: 8%;outline: none;margin-left: 1.5%;margin-bottom: 15px" onclick="ExerciseCheck()" class="btn btn-primary">Check Assignments</button>
<script type="text/javascript">
function ExerciseCheck() {
    var CourseID = $("#ds_course").val();
    var ChapterID = $("#ds_cnumber").val();
    var MinChapterID = $("#ds_snumber").val();
    $.ajax({
        type:"POST",
        url:"/exercise/ExerciseRecheck",
        dataType:"json",
        data:{
            CourseID:CourseID,
            ChapterID:ChapterID,
            MinChapterID:MinChapterID
        },
        async:false,
        success: function(data){
            // Replace (not append) the export link so repeated clicks don't duplicate it
            $("#listExport").html(
                "<a style=\"width: 100%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px\" href=\"/excel/ListExports?CourseID="+CourseID+"&ChapterID="+ChapterID+"&MinChapterID="+MinChapterID+"\" class=\"btn btn-primary\">Export List</a>"
            );
            openCheckWin();
            document.getElementById('checkView').innerHTML = "";
            if (data == null || data == ""){
                $("#checkView").html($("#checkView").html()
                    + "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
                    + "<span>No results</span>"
                    + "</li>"
                );
            }else{
                // dataType:"json" means jQuery has already parsed the response,
                // so eval() is both unnecessary and unsafe here
                var json = data;
                $.each(json, function (index) {
                    var DetectionUserID = json[index].detectionUserID;
                    var DetectionUserName = json[index].detectionUserName;
                    var MeasuredUserID = json[index].measuredUserID;
                    var MeasuredUserName = json[index].measuredUserName;
                    var Similarity = json[index].similarity;
                    $("#checkView").html($("#checkView").html()
                        + "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
                        + "<span>" + "Student ID: " + DetectionUserID + "&nbsp;&nbsp;" + "Name: " + DetectionUserName
                        + "&nbsp;&nbsp;&harr;&nbsp;&nbsp;" + "Student ID: " + MeasuredUserID + "&nbsp;&nbsp;" + "Name: " + MeasuredUserName
                        + "&nbsp;&nbsp;" + "Similarity: " + Similarity + "</span>"
                        + "</li>"
                    );
                });
            }
        }
    });
}
function openCheckWin(){
    document.getElementById("CheckWin").style.display = "block";
}
function closeCheckWin(){
    // Referenced by the Close button in the panel below
    document.getElementById("CheckWin").style.display = "none";
}
</script>
<div class="floatingWin" style="border-radius: 5px;margin-left: 28%;width: 40%;display: none;position: absolute;background: #FFFFFF;height: 450px;z-index: 9999" id="CheckWin">
    <div id="listExport" style="width: 13%;float: left;margin-left: 1.5%">

    </div>
    <button type="button" style="width: 14%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px" onclick="closeCheckWin()" class="btn btn-primary">Close</button>


    <div class="form-group">
        <span class="text-muted" style="margin-left: 1.5%">Suspected plagiarism list</span>
        <ul class="list-group" id="checkView" style="overflow: auto">

        </ul>
    </div>
</div>
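For reference, the success callback above assumes the controller in step 3 serializes its List<ExerciseCheck> into a JSON array along these lines (the values are purely illustrative):

[
  {
    "detectionUserID": "2016001",
    "detectionUserName": "Student A",
    "measuredUserID": "2016002",
    "measuredUserName": "Student B",
    "similarity": "0.7321"
  }
]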
           

3. Controller

@ResponseBody
@RequestMapping("/ExerciseRecheck")
public List<ExerciseCheck> ExerciseRecheck(String CourseID, String ChapterID, String MinChapterID, HttpServletRequest request) throws Exception {
    List<Exercise> list1 = exerciseService.QuerySectionExercise(CourseID, ChapterID, MinChapterID);
    List<Exercise> list2 = list1;
    List<ExerciseCheck> exerciseChecks = new ArrayList<ExerciseCheck>();
    if (list1.size() < 2) {
        System.out.println("Fewer than 2 submissions; nothing to compare!");
    } else {
        for (int i = 0; i < list1.size(); i++) {
            String file1 = WordRead.readWord(list1.get(i).getChapterExercise(), request).replaceAll("\r|\n", "");
            for (int j = 0; j < list2.size(); j++) {
                if (i != j) {
                    String file2 = WordRead.readWord(list2.get(j).getChapterExercise(), request).replaceAll("\r|\n", "");
                    Double f = CosineSimilarAlgorithm.cosSimilarityByString(file1, file2);
                    // Flag the pair once the similarity exceeds the 0.6 threshold
                    if (f > 0.6) {
                        ExerciseCheck ec = new ExerciseCheck();
                        ec.setDetectionUserID(list1.get(i).getUserID());
                        ec.setDetectionUserName(list1.get(i).getUserName());
                        ec.setMeasuredUserID(list2.get(j).getUserID());
                        ec.setMeasuredUserName(list2.get(j).getUserName());
                        ec.setSimilarity(f.toString());
                        exerciseChecks.add(ec);
                    }
                }
            }
        }
    }
    return exerciseChecks;
}
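A note on cost: the nested loop above scores every ordered pair twice (i,j and j,i) and re-extracts each Word file on every inner iteration. A minimal sketch of a leaner variant, under the same beans and helpers, that caches each text once and starts the inner loop at i + 1 so each unordered pair is scored exactly once:

List<String> texts = new ArrayList<String>();
for (Exercise e : list1) {
    // extract each submission's text exactly once
    texts.add(WordRead.readWord(e.getChapterExercise(), request).replaceAll("\r|\n", ""));
}
for (int i = 0; i < texts.size(); i++) {
    for (int j = i + 1; j < texts.size(); j++) { // j > i: each pair scored once
        Double f = CosineSimilarAlgorithm.cosSimilarityByString(texts.get(i), texts.get(j));
        if (f > 0.6) {
            // build and collect the ExerciseCheck exactly as above
        }
    }
}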
           

4. Reading the Word file content

package com.graduation.util;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import java.util.UUID;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import javax.servlet.http.HttpServletRequest;

public class WordRead {

    public static String readWord(String filename, HttpServletRequest request) throws Exception {
        String path = request.getServletContext().getRealPath("");
        System.out.println(path);
        // Read from our upload directory
        String FilePath = path + "\\static\\exercises\\";
        String BASE_PATH = FilePath;
        filename = filename + ".doc";
        File file = new File(BASE_PATH + filename);
        System.out.println(BASE_PATH + filename);
        HWPFDocument doc = new HWPFDocument(new FileInputStream(file));

        // Get the text directly from the document object
        StringBuilder sb = doc.getText();
//        System.out.println("Text: " + sb.toString());

        // Get the text through a Range object
        Range range = doc.getRange();
        String text = range.text();
//        System.out.println(text);

        // Number of paragraphs (in Word, every carriage return ends a paragraph)
        int nums = range.numParagraphs();
//        System.out.println("Paragraph count: " + nums);

        // Pictures embedded in the .doc
        List<Picture> pics = doc.getPicturesTable().getAllPictures();

        for (Picture pic : pics) {
            // Offset of the picture inside the .doc; needed when converting the doc to other formats
            int start = pic.getStartOffset();
            int width = pic.getWidth();
            int height = pic.getHeight();
            String mimeType = pic.getMimeType();

            System.out.printf("offset %d\tpicture width %d, height %d,\ttype %s\r\n", start, width, height, mimeType);
        }
        // 1. Write the files out via Picture.writeImageContent
        // 2. Or get the Picture's bytes and write them yourself
        copyPic2Disk(pics, new File(BASE_PATH));

        // Iterate over the tables within the range
        TableIterator tableIter = new TableIterator(range);
        while (tableIter.hasNext()) {
            Table table = tableIter.next();
            // Start offset
            int start = table.getStartOffset();
            // End offset
            int end = table.getEndOffset();
            System.out.printf("start offset %d, end offset %d\r\n", start, end);

            // Number of rows
            int rowNum = table.numRows();
            for (int j = 0; j < rowNum; j++) {
                // Each row
                TableRow row = table.getRow(j);
                int cellNum = row.numCells();
                for (int k = 0; k < cellNum; k++) {
                    // Each cell
                    TableCell cell = row.getCell(k);
                    // Print the cell text
                    System.out.println(cell.text().trim());
                }
            }
        }
        return text;
    }

    /**
     * You can also write the bytes out yourself
     * @param imgByte
     * @throws Exception
     */
    public static void copyByteToFile(byte[] imgByte, String path) throws Exception {

        // ByteArrayInputStream replaces the non-portable com.sun internal ByteInputStream
        InputStream in = new ByteArrayInputStream(imgByte, 0, imgByte.length);
        byte[] buff = new byte[1024];
        String fileName = UUID.randomUUID().toString().substring(0, 6);
        OutputStream out = new FileOutputStream(new File(path + fileName + ".jpg"));

        int len = 0;
        while ((len = in.read(buff)) > 0) {
            out.write(buff, 0, len);
        }

        out.flush();
        out.close();
        in.close();
    }

    /**
     * Write pictures out using POI's own methods on the Picture class
     * @param pics
     * @param path
     */
    public static void copyPic2Disk(List<Picture> pics, File path) {
        if (pics == null || pics.size() <= 0) {
            return;
        }
        // Create the directory first if it does not exist; checking isDirectory() before
        // mkdirs() would wrongly reject a path that simply has not been created yet
        if (!path.exists()) {
            path.mkdirs();
        }
        if (!path.isDirectory()) {
            throw new RuntimeException("The path is not a directory");
        }

        try {
            for (Picture pic : pics) {
                // Write the data out with Picture's own POI method
                pic.writeImageContent(new FileOutputStream(new File(path, pic.suggestFullFileName())));
                /*byte[] picBytes = pic.getContent(); // or get the bytes and write them yourself
                copyByteToFile(picBytes, path.getPath());*/
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
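HWPFDocument only understands the legacy binary .doc format; a .docx upload will make its constructor throw. If .docx support is ever needed, the usual route is POI's XWPF classes. A sketch, assuming the extra poi-ooxml artifact is added to the pom:

import java.io.FileInputStream;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

public class DocxRead {
    public static String readDocx(String path) throws Exception {
        // XWPFWordExtractor pulls all paragraph text, tables included
        try (XWPFWordExtractor extractor =
                 new XWPFWordExtractor(new XWPFDocument(new FileInputStream(path)))) {
            return extractor.getText();
        }
    }
}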
           

5. CosineSimilarAlgorithm: computing the similarity of two texts

package com.graduation.util;

import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

public class CosineSimilarAlgorithm {

    /**
     *
     * @Title: cosSimilarityByFile
     * @Description: compute the similarity of two files
     * @param firstFile
     * @param secondFile
     * @return Double
     * @throws
     */
    public static Double cosSimilarityByFile(String firstFile, String secondFile) {
        try {
            Map<String, Map<String, Integer>> firstTfMap = TfIdfAlgorithm.wordSegCount(firstFile);
            Map<String, Map<String, Integer>> secondTfMap = TfIdfAlgorithm.wordSegCount(secondFile);
            if (firstTfMap == null || firstTfMap.size() == 0) {
                throw new IllegalArgumentException("firstFile not found or firstFile is empty!");
            }
            if (secondTfMap == null || secondTfMap.size() == 0) {
                throw new IllegalArgumentException("secondFile not found or secondFile is empty!");
            }
            Map<String, Integer> firstWords = firstTfMap.get(firstFile);
            Map<String, Integer> secondWords = secondTfMap.get(secondFile);
            if (firstWords.size() < secondWords.size()) {
                Map<String, Integer> temp = firstWords;
                firstWords = secondWords;
                secondWords = temp;
            }
            return calculateCos((LinkedHashMap<String, Integer>) firstWords, (LinkedHashMap<String, Integer>) secondWords);

        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0d;
    }

    /**
     *
     * @Title: cosSimilarityByString
     * @Description: compute the similarity of two strings
     * @param first
     * @param second
     * @return Double
     * @throws
     */
    public static Double cosSimilarityByString(String first, String second) {
        try {
            Map<String, Integer> firstTfMap = TfIdfAlgorithm.segStr(first);
            Map<String, Integer> secondTfMap = TfIdfAlgorithm.segStr(second);
            if (firstTfMap.size() < secondTfMap.size()) {
                Map<String, Integer> temp = firstTfMap;
                firstTfMap = secondTfMap;
                secondTfMap = temp;
            }
            return calculateCos((LinkedHashMap<String, Integer>) firstTfMap, (LinkedHashMap<String, Integer>) secondTfMap);

        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0d;
    }

    /**
     *
     * @Title: calculateCos
     * @Description: compute the cosine similarity of two term-frequency vectors
     * @param first
     * @param second
     * @return Double
     * @throws
     */
    private static Double calculateCos(LinkedHashMap<String, Integer> first, LinkedHashMap<String, Integer> second) {
        double vectorFirstModulo = 0.00;  // norm of vector 1
        double vectorSecondModulo = 0.00; // norm of vector 2
        double vectorProduct = 0.00;      // dot product
        // The two vectors must be aligned by word, not by insertion index:
        // iterate over the union of both vocabularies, treating a missing word as count 0
        Set<String> allWords = new HashSet<String>(first.keySet());
        allWords.addAll(second.keySet());
        for (String word : allWords) {
            double f = first.containsKey(word) ? first.get(word).doubleValue() : 0d;
            double s = second.containsKey(word) ? second.get(word).doubleValue() : 0d;
            vectorProduct += f * s;
            vectorFirstModulo += f * f;
            vectorSecondModulo += s * s;
        }
        return vectorProduct / (Math.sqrt(vectorFirstModulo) * Math.sqrt(vectorSecondModulo));
    }

    public static void main(String[] args) {
        Double result = cosSimilarityByString("三網融合又可被稱為“數位彙流”,是将電信網、計算機網際網路和有線電視網三者互聯互通,融合發展,進而為使用者提供語音、資料和廣播電視等服務, 伴随着通信行業加快發展,傳統的三網融合已逐漸成為目前網際網路發展的趨勢。"
                , "三網融合是指電信網、廣播電視網、網際網路在向寬帶通信網、數字電視網、下一代網際網路演進過程中,三大網絡通過技術改造,其技術功能趨于一緻,業務範圍趨于相同,網絡互聯互通、資源共享,能為使用者提供語音、資料和廣播電視等多種服務。三合并不意味着三大網絡的實體合一,而主要是指高層業務應用的融合。三網融合應用廣泛,遍及智能交通、環境保護、政府工作、公共安全、平安家居等多個領域。以後的手機可以看電視、上網,電視可以打電話、上網,電腦也可以打電話、看電視。三者之間互相交叉,形成你中有我、我中有你的格局。");
        System.out.println(result);
    }
}
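As a worked check of calculateCos, take two tiny term-frequency vectors (the words are hypothetical): first = {網絡: 2, 融合: 1} and second = {網絡: 1, 電視: 1}. Over the vocabulary union {網絡, 融合, 電視} the vectors are (2, 1, 0) and (1, 0, 1), so

\cos\theta = \frac{2 \cdot 1 + 1 \cdot 0 + 0 \cdot 1}{\sqrt{2^{2} + 1^{2}} \cdot \sqrt{1^{2} + 1^{2}}} = \frac{2}{\sqrt{5}\,\sqrt{2}} \approx 0.632

which sits just above the 0.6 threshold used by the controller in step 3.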
           

6. TfIdfAlgorithm: computing each word's TF-IDF

package com.graduation.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class TfIdfAlgorithm {
    /**
     * Collected file names
     */
    private static List<String> fileList = new ArrayList<String>();
    /**
     * tf results for all files. key: file name, value: that file's tf map
     */
    private static Map<String, Map<String, Double>> allTfMap = new HashMap<String, Map<String, Double>>();

    /**
     * Segmentation results for all files. key: file name, value: that file's word counts
     */
    private static Map<String, Map<String, Integer>> allSegsMap = new HashMap<String, Map<String, Integer>>();

    /**
     * idf results. key: word, value: the word w's inverse document frequency (IDF) over the whole
     * document set D, i.e. the log of the ratio between the total number of documents n and the
     * number of documents docs(w, D) that contain w
     */
    private static Map<String, Double> idfMap = new HashMap<String, Double>();

    /**
     * Number of documents containing each word. key: word, value: number of documents containing it
     */
    private static Map<String, Integer> containWordOfAllDocNumberMap = new HashMap<String, Integer>();

    /**
     * TF-IDF results
     * key: file name, value: that file's tf-idf map
     */
    private static Map<String, Map<String, Double>> tfIdfMap = new HashMap<String, Map<String, Double>>();


    /**
     *
     * @Title: readDirs
     * @Description: recursively collect files
     * @param filepath
     * @return List<String>
     * @throws FileNotFoundException
     * @throws IOException
     */
    private static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("The argument should be a directory name");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else if (file.isDirectory()) {
                String[] filelist = file.list();
                for (int i = 0; i < filelist.length; i++) {
                    File readfile = new File(filepath + File.separator + filelist[i]);
                    if (!readfile.isDirectory()) {
                        fileList.add(readfile.getAbsolutePath());
                    } else if (readfile.isDirectory()) {
                        readDirs(filepath + File.separator + filelist[i]);
                    }
                }
            }

        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        return fileList;
    }

    /**
     *
     * @Title: readFile
     * @Description: read a file into a string
     * @param file
     * @return String
     * @throws FileNotFoundException
     * @throws IOException
     */
    private static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuilder sb = new StringBuilder();
        InputStreamReader is = new InputStreamReader(new FileInputStream(file), "UTF-8");
        BufferedReader br = new BufferedReader(is);
        String line = br.readLine();
        while (line != null) {
            sb.append(line).append("\r\n");
            line = br.readLine();
        }
        br.close();
        return sb.toString();
    }


    /**
     *
     * @Title: segString
     * @Description: segment a string with IK and count each word's occurrences
     * @param content
     * @return Map<String, Integer>
     * @throws
     */
    private static Map<String, Integer> segString(String content){
        // Segmentation
        Reader input = new StringReader(content);
        // Smart segmentation enabled (the useSmart flag strongly affects segmentation quality)
        IKSegmenter iks = new IKSegmenter(input, true);
        Lexeme lexeme = null;
        Map<String, Integer> words = new HashMap<String, Integer>();
        try {
            while ((lexeme = iks.next()) != null) {
                if (words.containsKey(lexeme.getLexemeText())) {
                    words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
                } else {
                    words.put(lexeme.getLexemeText(), 1);
                }
            }
        }catch(IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    /**
     *
     * @Title: segStr
     * @Description: segmentation result returned as a LinkedHashMap (preserves insertion order)
     * @param content
     * @return Map<String, Integer>
     * @throws
     */
    public static Map<String, Integer> segStr(String content){
        // Segmentation
        Reader input = new StringReader(content);
        // Smart segmentation enabled (the useSmart flag strongly affects segmentation quality)
        IKSegmenter iks = new IKSegmenter(input, true);
        Lexeme lexeme = null;
        Map<String, Integer> words = new LinkedHashMap<String, Integer>();
        try {
            while ((lexeme = iks.next()) != null) {
                if (words.containsKey(lexeme.getLexemeText())) {
                    words.put(lexeme.getLexemeText(), words.get(lexeme.getLexemeText()) + 1);
                } else {
                    words.put(lexeme.getLexemeText(), 1);
                }
            }
        }catch(IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    public static Map<String, Integer> getMostFrequentWords(int num,Map<String, Integer> words){

        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        int count=0;
        // Word-frequency statistics
        List<Map.Entry<String, Integer>> info = new ArrayList<Map.Entry<String, Integer>>(words.entrySet());
        Collections.sort(info, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
                return obj2.getValue() - obj1.getValue();
            }
        });

        // Emit the high-frequency words
        for (int j = 0; j < info.size(); j++) {
            // word --> frequency
            if(info.get(j).getKey().length()>1){
                if(num>count){
                    keywords.put(info.get(j).getKey(), info.get(j).getValue());
                    count++;
                }else{
                    break;
                }
            }
        }
        return keywords;
    }

    /**
     *
     * @Title: tf
     * @Description: turn word counts into tf, where tf(w, d) = count(w, d) / size(d),
     * i.e. the ratio between the number of occurrences count(w, d) of word w in document d
     * and the total number of words size(d) in d
     * @param segWordsResult
     * @return HashMap<String, Double>
     * @throws
     */
    private static HashMap<String, Double> tf(Map<String, Integer> segWordsResult) {

        HashMap<String, Double> tf = new HashMap<String, Double>();// normalized
        if(segWordsResult==null || segWordsResult.size()==0){
            return tf;
        }
        // size(d) is the document's total word count (the sum of all counts),
        // not the number of distinct words
        double size = 0d;
        for (Integer count : segWordsResult.values()) {
            size += count;
        }
        Set<String> keys=segWordsResult.keySet();
        for(String key: keys){
            Integer value=segWordsResult.get(key);
            tf.put(key, Double.valueOf(value)/size);
        }
        return tf;
    }

    /**
     *
     * @Title: allTf
     * @Description: compute tf for every file
     * @param dir
     * @return Map<String, Map<String, Double>>
     * @throws
     */
    public static Map<String, Map<String, Double>> allTf(String dir){
        try{
            fileList=readDirs(dir);
            for(String filePath : fileList){
                String content=readFile(filePath);
                Map<String, Integer> segs=segString(content);
                allSegsMap.put(filePath, segs);
                allTfMap.put(filePath, tf(segs));
            }
        }catch(FileNotFoundException ffe){
            ffe.printStackTrace();
        }catch(IOException io){
            io.printStackTrace();
        }
        return allTfMap;
    }

    /**
     *
     * @Title: wordSegCount
     * @Description: return the segmentation results, stored as LinkedHashMaps
     * @param dir
     * @return Map<String, Map<String, Integer>>
     * @throws
     */
    public static Map<String, Map<String, Integer>> wordSegCount(String dir){
        try{
            fileList=readDirs(dir);
            for(String filePath : fileList){
                String content=readFile(filePath);
                Map<String, Integer> segs=segStr(content);
                allSegsMap.put(filePath, segs);
            }
        }catch(FileNotFoundException ffe){
            ffe.printStackTrace();
        }catch(IOException io){
            io.printStackTrace();
        }
        return allSegsMap;
    }


    /**
     *
     * @Title: containWordOfAllDocNumber
     * @Description: count the documents containing each word. key: word, value: number of documents containing it
     * @param allSegsMap
     * @return Map<String, Integer>
     * @throws
     */
    private static Map<String, Integer> containWordOfAllDocNumber(Map<String, Map<String, Integer>> allSegsMap){
        if(allSegsMap==null || allSegsMap.size()==0){
            return containWordOfAllDocNumberMap;
        }

        Set<String> fileList=allSegsMap.keySet();
        for(String filePath: fileList){
            Map<String, Integer> fileSegs=allSegsMap.get(filePath);
            // Skip to the next file if this file's segmentation is null or empty
            if(fileSegs==null || fileSegs.size()==0){
                continue;
            }
            // Count each word's document frequency
            Set<String> segs=fileSegs.keySet();
            for(String seg : segs){
                if (containWordOfAllDocNumberMap.containsKey(seg)) {
                    containWordOfAllDocNumberMap.put(seg, containWordOfAllDocNumberMap.get(seg) + 1);
                } else {
                    containWordOfAllDocNumberMap.put(seg, 1);
                }
            }

        }
        return containWordOfAllDocNumberMap;
    }

    /**
     *
     * @Title: idf
     * @Description: idf = log(n / docs(w, D))
     * @param allSegsMap
     * @return Map<String, Double>
     * @throws
     */
    public static Map<String, Double> idf(Map<String, Map<String, Integer>> allSegsMap){
        if(allSegsMap==null || allSegsMap.size()==0){
            return idfMap;
        }
        containWordOfAllDocNumberMap=containWordOfAllDocNumber(allSegsMap);
        Set<String> words=containWordOfAllDocNumberMap.keySet();
        // n in the formula is the total number of documents, not the vocabulary size
        Double docSize=Double.valueOf(allSegsMap.size());
        for(String word: words){
            Double number=Double.valueOf(containWordOfAllDocNumberMap.get(word));
            // +1 smoothing keeps the denominator stable for very common words
            idfMap.put(word, Math.log(docSize/(number+1.0d)));
        }
        return idfMap;
    }

    /**
     *
     * @Title: tfIdf
     * @Description: tf-idf
     * @param allTfMap
     * @param idf
     * @return Map<String, Map<String, Double>>
     * @throws
     */
    public static Map<String, Map<String, Double>> tfIdf(Map<String, Map<String, Double>> allTfMap,Map<String, Double> idf){

        Set<String> fileList=allTfMap.keySet();
        for(String filePath : fileList){
            Map<String, Double> tfMap=allTfMap.get(filePath);
            Map<String, Double> docTfIdf=new HashMap<String,Double>();
            Set<String> words=tfMap.keySet();
            for(String word: words){
                Double tfValue=Double.valueOf(tfMap.get(word));
                Double idfValue=idf.get(word);
                docTfIdf.put(word, tfValue*idfValue);
            }
            tfIdfMap.put(filePath, docTfIdf);
        }
        return tfIdfMap;
    }


    public static void main(String[] args){

        System.out.println("tf--------------------------------------");
        Map<String, Map<String, Double>> allTfMap=TfIdfAlgorithm.allTf("d://dir");
        Set<String> fileList=allTfMap.keySet();
        for(String filePath : fileList){
            Map<String, Double> tfMap=allTfMap.get(filePath);
            Set<String> words=tfMap.keySet();
            for(String word: words){
                System.out.println("fileName:"+filePath+"     word:"+word+"      tf:"+tfMap.get(word));
            }
        }

        System.out.println("idf--------------------------------------");
        Map<String, Double> idfMap=TfIdfAlgorithm.idf(allSegsMap);
        Set<String> words=idfMap.keySet();
        for(String word : words){
            System.out.println("word:"+word+"     tf:"+idfMap.get(word));
        }

        System.out.println("tf-idf--------------------------------------");
        Map<String, Map<String, Double>> tfIdfMap=TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
        Set<String> files=tfIdfMap.keySet();
        for(String filePath : files){
            Map<String, Double> tfIdf=tfIdfMap.get(filePath);
            Set<String> segs=tfIdf.keySet();
            for(String word: segs){
                System.out.println("fileName:"+filePath+"     word:"+word+"        tf-idf:"+tfIdf.get(word));
            }
        }
    }
}
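To sanity-check the pipeline with a worked example, assume a corpus of 10 documents in which document d contains 100 words in total, the word 融合 occurs 5 times in d, and 2 documents contain it. With the tf and smoothed idf defined above:

tf(融合, d) = 5 / 100 = 0.05
idf(融合) = ln(10 / (2 + 1)) ≈ 1.204
tf-idf(融合, d) ≈ 0.05 × 1.204 ≈ 0.060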