1、在pom.xml中添加分詞器與word讀取依賴
<!-- IK Chinese word-segmentation (analyzer) dependency -->
<dependency>
<groupId>com.janeluo</groupId>
<artifactId>ikanalyzer</artifactId>
<version>2012_u6</version>
</dependency>
<!-- Lucene dependency (required by the IK analyzer) -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>4.7.2</version>
</dependency>
<!-- Apache POI scratchpad: reading legacy Word (.doc) files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.14-beta1</version>
</dependency>
2、jsp 使用
<button type="button" style="width: 8%;outline: none;margin-left: 1.5%;margin-bottom: 15px" onclick="ExerciseCheck()" class="btn btn-primary">作業查重</button>
<script type="text/javascript">
// Runs the plagiarism check for the selected course/chapter/section,
// adds an export link, and renders the suspected-copy list in the
// floating window (#CheckWin).
function ExerciseCheck() {
    var CourseID = $("#ds_course").val();
    var ChapterID = $("#ds_cnumber").val();
    var MinChapterID = $("#ds_snumber").val();
    $.ajax({
        type: "POST",
        url: "/exercise/ExerciseRecheck",
        dataType: "json",
        data: {
            CourseID: CourseID,
            ChapterID: ChapterID,
            MinChapterID: MinChapterID
        },
        async: false,
        success: function (data) {
            $("#listExport").append(
                "<a style=\"width: 100%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px\" href=\"/excel/ListExports?CourseID=" + CourseID + "&ChapterID=" + ChapterID + "&MinChapterID=" + MinChapterID + "\" class=\"btn btn-primary\">導出名單</a>"
            );
            openCheckWin();
            var $view = $("#checkView");
            $view.empty();
            if (data == null || data.length === 0) {
                $view.append(
                    "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
                    + "<span>暫無内容</span>"
                    + "</li>"
                );
            } else {
                // dataType:"json" means jQuery has already parsed the response;
                // the previous eval(data) call was redundant and unsafe.
                // Build the whole list once instead of re-parsing #checkView's
                // HTML on every iteration (the old .html(.html()+...) pattern).
                var html = "";
                $.each(data, function (index, item) {
                    // measuredUserID/measuredUserName/similarity are also
                    // returned but were never displayed in the original UI.
                    html += "<li class=\"list-group-item\" style=\"height: 50px;padding-right: 0;margin-left: 1.5%;width: 97%;\">"
                        + "<span>" + "學号:" + item.detectionUserID + " " + "姓名:" + item.detectionUserName + "</span>"
                        + "</li>";
                });
                $view.append(html);
            }
        }
    });
}
// Shows the plagiarism-check floating window.
function openCheckWin() {
    var win = document.getElementById("CheckWin");
    win.style.display = "block";
}
</script>
<!-- Floating result window. The original z-index was a 30-digit number,
     which overflows the CSS integer range and gets clamped unpredictably;
     use the 32-bit maximum instead. -->
<div class="floatingWin" style="border-radius: 5px;margin-left: 28%;width: 40%;display: none;position: absolute;background: #FFFFFF;height: 450px;z-index: 2147483647" id="CheckWin">
<div id="listExport" style="width: 13%;float: left;margin-left: 1.5%">
</div>
<button type="button" style="width: 14%;outline: none;margin-top: 14px;margin-left: 1.5%;margin-bottom: 15px" onclick="closeCheckWin()" class="btn btn-primary">關閉</button>
<div class="form-group">
<span class="text-muted" style="margin-left: 1.5%">疑似抄襲名單</span>
<ul class="list-group" id="checkView" style="overflow: auto">
</ul>
</div>
</div>
3、controller
/**
 * Pairwise plagiarism check over all submissions of one section.
 * For every ordered pair (i, j), i != j, whose cosine similarity exceeds 0.6,
 * an ExerciseCheck entry is returned (so each suspicious pair is reported in
 * both directions, matching the original behavior).
 *
 * @param CourseID     course identifier
 * @param ChapterID    chapter identifier
 * @param MinChapterID section identifier
 * @param request      used by WordRead to resolve the upload folder
 * @return list of suspected-copy pairs; empty when fewer than 2 submissions
 * @throws Exception if a submission's .doc file cannot be read
 */
@ResponseBody
@RequestMapping("/ExerciseRecheck")
public List<ExerciseCheck> ExerciseRecheck(String CourseID, String ChapterID, String MinChapterID, HttpServletRequest request) throws Exception {
    List<Exercise> exercises = exerciseService.QuerySectionExercise(CourseID, ChapterID, MinChapterID);
    List<ExerciseCheck> exerciseChecks = new ArrayList<ExerciseCheck>();
    if (exercises.size() < 2) {
        System.out.println("作業數小于2無法查重!");
        return exerciseChecks;
    }
    // Read and normalize every document exactly once; the original re-read
    // and re-parsed each .doc inside the inner loop (O(n^2) file parses).
    String[] texts = new String[exercises.size()];
    for (int i = 0; i < exercises.size(); i++) {
        texts[i] = WordRead.readWord(exercises.get(i).getChapterExercise(), request).replaceAll("\r|\n", "");
    }
    for (int i = 0; i < exercises.size(); i++) {
        for (int j = 0; j < exercises.size(); j++) {
            if (i == j) {
                continue;
            }
            Double f = CosineSimilarAlgorithm.cosSimilarityByString(texts[i], texts[j]);
            if (f > 0.6) {
                ExerciseCheck ec = new ExerciseCheck();
                ec.setDetectionUserID(exercises.get(i).getUserID());
                ec.setDetectionUserName(exercises.get(i).getUserName());
                ec.setMeasuredUserID(exercises.get(j).getUserID());
                ec.setMeasuredUserName(exercises.get(j).getUserName());
                ec.setSimilarity(f.toString());
                exerciseChecks.add(ec);
            }
        }
    }
    return exerciseChecks;
}
4、讀取word檔案内容
package com.graduation.util;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.List;
import java.util.UUID;

import javax.servlet.http.HttpServletRequest;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import com.sun.xml.internal.messaging.saaj.util.ByteInputStream;
public class WordRead {

    /**
     * Reads the plain text of an uploaded Word (.doc) exercise file from the
     * webapp's upload folder, and copies any embedded pictures to that folder
     * (side effect kept from the original implementation).
     *
     * @param filename exercise file name WITHOUT the ".doc" suffix
     * @param request  used to resolve the webapp's real path
     * @return the full document text (HWPF Range text)
     * @throws Exception if the file is missing or is not a valid .doc
     */
    public static String readWord(String filename, HttpServletRequest request) throws Exception {
        String path = request.getServletContext().getRealPath("");
        // Files come from our upload folder. NOTE(review): Windows-style
        // separators kept from the original — confirm the deployment OS.
        String basePath = path + "\\static\\exercises\\";
        File file = new File(basePath + filename + ".doc");
        // try-with-resources: the original never closed the FileInputStream.
        try (FileInputStream in = new FileInputStream(file)) {
            HWPFDocument doc = new HWPFDocument(in);
            // Range covers the whole document body; its text is the value
            // callers feed into the similarity check.
            Range range = doc.getRange();
            String text = range.text();
            // Copy embedded pictures to disk, as the original did.
            List<Picture> pics = doc.getPicturesTable().getAllPictures();
            copyPic2Disk(pics, new File(basePath));
            return text;
        }
    }

    /**
     * Writes raw image bytes to a randomly named .jpg under the given path.
     *
     * @param imgByte image content
     * @param path    target directory prefix (must end with a separator)
     * @throws Exception on I/O failure
     */
    public static void copyByteToFile(byte[] imgByte, String path) throws Exception {
        String fileName = UUID.randomUUID().toString().substring(0, 6);
        // ByteArrayInputStream replaces the non-portable internal
        // com.sun.xml...ByteInputStream; try-with-resources closes both
        // streams even when a write fails (the original leaked them).
        try (InputStream in = new ByteArrayInputStream(imgByte, 0, imgByte.length);
             OutputStream out = new FileOutputStream(new File(path + fileName + ".jpg"))) {
            byte[] buff = new byte[1024];
            int len;
            while ((len = in.read(buff)) > 0) {
                out.write(buff, 0, len);
            }
            out.flush();
        }
    }

    /**
     * Writes every embedded picture to the given directory using POI's own
     * Picture.writeImageContent.
     *
     * @param pics pictures extracted from the document (may be null or empty)
     * @param path target directory; created if it does not exist
     */
    public static void copyPic2Disk(List<Picture> pics, File path) {
        if (pics == null || pics.isEmpty()) {
            return;
        }
        // Create the directory first. The original checked isDirectory()
        // BEFORE exists(), so a missing folder always threw and the mkdirs()
        // branch was unreachable.
        if (!path.exists() && !path.mkdirs()) {
            throw new RuntimeException("路徑填寫不正确");
        }
        if (!path.isDirectory()) {
            throw new RuntimeException("路徑填寫不正确");
        }
        try {
            for (Picture pic : pics) {
                // Close each output stream; the original left them open.
                try (FileOutputStream out = new FileOutputStream(new File(path, pic.suggestFullFileName()))) {
                    pic.writeImageContent(out);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
5、CosineSimilarAlgorithm 擷取兩個檔案相似性
package com.graduation.util;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class CosineSimilarAlgorithm {

    /**
     * Cosine similarity of two files' word-frequency vectors.
     *
     * NOTE(review): TfIdfAlgorithm.wordSegCount expects a DIRECTORY path; when
     * given a plain file the per-file map lookup below may be null — the
     * null-guard in calculateCos then yields 0. Confirm intended usage.
     *
     * @param firstFile  first file path (key into wordSegCount's result)
     * @param secondFile second file path
     * @return similarity in [0,1]; 0d on any failure
     */
    public static Double cosSimilarityByFile(String firstFile, String secondFile) {
        try {
            Map<String, Map<String, Integer>> firstTfMap = TfIdfAlgorithm.wordSegCount(firstFile);
            Map<String, Map<String, Integer>> secondTfMap = TfIdfAlgorithm.wordSegCount(secondFile);
            if (firstTfMap == null || firstTfMap.size() == 0) {
                throw new IllegalArgumentException("firstFile not found or firstFile is empty! ");
            }
            if (secondTfMap == null || secondTfMap.size() == 0) {
                throw new IllegalArgumentException("secondFile not found or secondFile is empty! ");
            }
            Map<String, Integer> firstWords = firstTfMap.get(firstFile);
            Map<String, Integer> secondWords = secondTfMap.get(secondFile);
            return calculateCos(firstWords, secondWords);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0d;
    }

    /**
     * Cosine similarity of two strings after IK word segmentation.
     *
     * @param first  first text
     * @param second second text
     * @return similarity in [0,1]; 0d on any failure
     */
    public static Double cosSimilarityByString(String first, String second) {
        try {
            Map<String, Integer> firstTfMap = TfIdfAlgorithm.segStr(first);
            Map<String, Integer> secondTfMap = TfIdfAlgorithm.segStr(second);
            return calculateCos(firstTfMap, secondTfMap);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0d;
    }

    /**
     * Cosine similarity over word-count vectors, aligned BY WORD.
     *
     * The original implementation paired entries by their POSITION in the two
     * maps, multiplying counts of unrelated words — the dot product must use
     * the counts of the SAME word in both documents. It also returned NaN when
     * either vector was empty; that now yields 0.
     *
     * @param first  word -> count of the first document
     * @param second word -> count of the second document
     * @return cos(first, second) in [0,1]
     */
    private static Double calculateCos(Map<String, Integer> first, Map<String, Integer> second) {
        if (first == null || second == null || first.isEmpty() || second.isEmpty()) {
            return 0d;
        }
        double vectorProduct = 0.00;      // dot product over shared words
        double vectorFirstModulo = 0.00;  // |first|^2
        double vectorSecondModulo = 0.00; // |second|^2
        for (Map.Entry<String, Integer> entry : first.entrySet()) {
            double v = entry.getValue().doubleValue();
            vectorFirstModulo += v * v;
            Integer match = second.get(entry.getKey());
            if (match != null) {
                vectorProduct += v * match.doubleValue();
            }
        }
        for (Integer count : second.values()) {
            double v = count.doubleValue();
            vectorSecondModulo += v * v;
        }
        double denominator = Math.sqrt(vectorFirstModulo) * Math.sqrt(vectorSecondModulo);
        return denominator == 0.0 ? 0d : vectorProduct / denominator;
    }

    public static void main(String[] args) {
        Double result = cosSimilarityByString("三網融合又可被稱為“數位彙流”,是将電信網、計算機網際網路和有線電視網三者互聯互通,融合發展,進而為使用者提供語音、資料和廣播電視等服務, 伴随着通信行業加快發展,傳統的三網融合已逐漸成為目前網際網路發展的趨勢。"
                , "三網融合是指電信網、廣播電視網、網際網路在向寬帶通信網、數字電視網、下一代網際網路演進過程中,三大網絡通過技術改造,其技術功能趨于一緻,業務範圍趨于相同,網絡互聯互通、資源共享,能為使用者提供語音、資料和廣播電視等多種服務。三合并不意味着三大網絡的實體合一,而主要是指高層業務應用的融合。三網融合應用廣泛,遍及智能交通、環境保護、政府工作、公共安全、平安家居等多個領域。以後的手機可以看電視、上網,電視可以打電話、上網,電腦也可以打電話、看電視。三者之間互相交叉,形成你中有我、我中有你的格局。");
        System.out.println(result);
    }
}
6、TfIdfAlgorithm 統計單詞的TF-IDF
package com.graduation.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
public class TfIdfAlgorithm {

    /** Absolute paths of every file found under the scanned directory. */
    private static List<String> fileList = new ArrayList<String>();

    /** tf per file. key: file path, value: word -> term frequency. */
    private static Map<String, Map<String, Double>> allTfMap = new HashMap<String, Map<String, Double>>();

    /** Raw word counts per file. key: file path, value: word -> occurrences. */
    private static Map<String, Map<String, Integer>> allSegsMap = new HashMap<String, Map<String, Integer>>();

    /** idf per word: log(document count / (documents containing the word + 1)). */
    private static Map<String, Double> idfMap = new HashMap<String, Double>();

    /** key: word, value: number of documents containing that word. */
    private static Map<String, Integer> containWordOfAllDocNumberMap = new HashMap<String, Integer>();

    /** tf-idf per file. key: file path, value: word -> tf-idf weight. */
    private static Map<String, Map<String, Double>> tfIdfMap = new HashMap<String, Map<String, Double>>();

    /**
     * Recursively collects all file paths under the given directory into the
     * static fileList (appending; callers clear it first to stay idempotent).
     *
     * @param filepath directory to scan
     * @return the accumulated file list
     */
    private static List<String> readDirs(String filepath) throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("輸入的參數應該為[檔案夾名]");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else {
                String[] filelist = file.list();
                for (int i = 0; i < filelist.length; i++) {
                    File readfile = new File(filepath + File.separator + filelist[i]);
                    if (!readfile.isDirectory()) {
                        fileList.add(readfile.getAbsolutePath());
                    } else {
                        readDirs(filepath + File.separator + filelist[i]);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        return fileList;
    }

    /**
     * Reads a whole file as a UTF-8 string (lines joined with CRLF).
     *
     * @param file file path
     * @return file content
     */
    private static String readFile(String file) throws FileNotFoundException, IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources: the original leaked the reader on exception.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\r\n");
            }
        }
        return sb.toString();
    }

    /**
     * Segments content with IK and counts word occurrences (unordered map).
     */
    private static Map<String, Integer> segString(String content) {
        return segInto(content, new HashMap<String, Integer>());
    }

    /**
     * Segments content with IK and counts word occurrences, preserving the
     * first-seen order of words (LinkedHashMap).
     */
    public static Map<String, Integer> segStr(String content) {
        return segInto(content, new LinkedHashMap<String, Integer>());
    }

    /**
     * Shared tokenize-and-count logic for segString/segStr, which were
     * copy-paste duplicates differing only in map type.
     *
     * @param content text to segment
     * @param words   target map (its type decides iteration order)
     * @return the filled map
     */
    private static Map<String, Integer> segInto(String content, Map<String, Integer> words) {
        Reader input = new StringReader(content);
        // Smart segmentation ON (second argument) — strongly affects precision.
        IKSegmenter iks = new IKSegmenter(input, true);
        Lexeme lexeme;
        try {
            while ((lexeme = iks.next()) != null) {
                String text = lexeme.getLexemeText();
                Integer count = words.get(text);
                words.put(text, count == null ? 1 : count + 1);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return words;
    }

    /**
     * Returns up to num highest-frequency words of length > 1, ordered by
     * descending count.
     *
     * @param num   maximum number of keywords to return
     * @param words word -> count map
     * @return insertion-ordered map of the top words
     */
    public static Map<String, Integer> getMostFrequentWords(int num, Map<String, Integer> words) {
        Map<String, Integer> keywords = new LinkedHashMap<String, Integer>();
        int count = 0;
        List<Map.Entry<String, Integer>> info = new ArrayList<Map.Entry<String, Integer>>(words.entrySet());
        Collections.sort(info, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> obj1, Map.Entry<String, Integer> obj2) {
                return obj2.getValue() - obj1.getValue();
            }
        });
        for (int j = 0; j < info.size(); j++) {
            // Skip single-character tokens; stop once num words are collected.
            if (info.get(j).getKey().length() > 1) {
                if (num > count) {
                    keywords.put(info.get(j).getKey(), info.get(j).getValue());
                    count++;
                } else {
                    break;
                }
            }
        }
        return keywords;
    }

    /**
     * Converts word counts to term frequencies: tf(w,d) = count(w,d) / size(d).
     * NOTE(review): size(d) here is the number of DISTINCT words, not total
     * tokens, as in the original — relative ranking within a document is
     * unaffected; confirm before changing.
     *
     * @param segWordsResult word -> count
     * @return word -> tf (empty map for null/empty input)
     */
    private static HashMap<String, Double> tf(Map<String, Integer> segWordsResult) {
        HashMap<String, Double> tf = new HashMap<String, Double>();
        if (segWordsResult == null || segWordsResult.size() == 0) {
            return tf;
        }
        Double size = Double.valueOf(segWordsResult.size());
        for (String key : segWordsResult.keySet()) {
            Integer value = segWordsResult.get(key);
            tf.put(key, Double.valueOf(value) / size);
        }
        return tf;
    }

    /**
     * Computes tf for every file under dir; also fills allSegsMap.
     *
     * @param dir directory to scan recursively
     * @return file path -> (word -> tf)
     */
    public static Map<String, Map<String, Double>> allTf(String dir) {
        try {
            // Clear first: readDirs appends to the static list, so repeated
            // calls previously accumulated duplicate paths.
            fileList.clear();
            fileList = readDirs(dir);
            for (String filePath : fileList) {
                String content = readFile(filePath);
                Map<String, Integer> segs = segString(content);
                allSegsMap.put(filePath, segs);
                allTfMap.put(filePath, tf(segs));
            }
        } catch (FileNotFoundException ffe) {
            ffe.printStackTrace();
        } catch (IOException io) {
            io.printStackTrace();
        }
        return allTfMap;
    }

    /**
     * Segments every file under dir, keeping word order (LinkedHashMap).
     *
     * @param dir directory to scan recursively
     * @return file path -> (word -> count)
     */
    public static Map<String, Map<String, Integer>> wordSegCount(String dir) {
        try {
            // Same idempotence fix as allTf.
            fileList.clear();
            fileList = readDirs(dir);
            for (String filePath : fileList) {
                String content = readFile(filePath);
                Map<String, Integer> segs = segStr(content);
                allSegsMap.put(filePath, segs);
            }
        } catch (FileNotFoundException ffe) {
            ffe.printStackTrace();
        } catch (IOException io) {
            io.printStackTrace();
        }
        return allSegsMap;
    }

    /**
     * Counts, for every word, how many documents contain it.
     *
     * @param allSegsMap file path -> (word -> count)
     * @return word -> document frequency
     */
    private static Map<String, Integer> containWordOfAllDocNumber(Map<String, Map<String, Integer>> allSegsMap) {
        if (allSegsMap == null || allSegsMap.size() == 0) {
            return containWordOfAllDocNumberMap;
        }
        for (String filePath : allSegsMap.keySet()) {
            Map<String, Integer> fileSegs = allSegsMap.get(filePath);
            // Skip files whose segmentation is empty.
            if (fileSegs == null || fileSegs.size() == 0) {
                continue;
            }
            for (String seg : fileSegs.keySet()) {
                Integer count = containWordOfAllDocNumberMap.get(seg);
                containWordOfAllDocNumberMap.put(seg, count == null ? 1 : count + 1);
            }
        }
        return containWordOfAllDocNumberMap;
    }

    /**
     * idf(w) = log(n / (docs(w, D) + 1)), where n is the number of documents.
     *
     * The original used containWordOfAllDocNumberMap.size() — the VOCABULARY
     * size — as the numerator, which does not match the formula in this very
     * Javadoc; n must be the document count.
     *
     * @param allSegsMap file path -> (word -> count)
     * @return word -> idf
     */
    public static Map<String, Double> idf(Map<String, Map<String, Integer>> allSegsMap) {
        if (allSegsMap == null || allSegsMap.size() == 0) {
            return idfMap;
        }
        containWordOfAllDocNumberMap = containWordOfAllDocNumber(allSegsMap);
        Double docCount = Double.valueOf(allSegsMap.size());
        for (String word : containWordOfAllDocNumberMap.keySet()) {
            Double number = Double.valueOf(containWordOfAllDocNumberMap.get(word));
            idfMap.put(word, Math.log(docCount / (number + 1.0d)));
        }
        return idfMap;
    }

    /**
     * tf-idf(w,d) = tf(w,d) * idf(w).
     *
     * @param allTfMap file path -> (word -> tf)
     * @param idf      word -> idf
     * @return file path -> (word -> tf-idf)
     */
    public static Map<String, Map<String, Double>> tfIdf(Map<String, Map<String, Double>> allTfMap, Map<String, Double> idf) {
        for (String filePath : allTfMap.keySet()) {
            Map<String, Double> tfMap = allTfMap.get(filePath);
            Map<String, Double> docTfIdf = new HashMap<String, Double>();
            for (String word : tfMap.keySet()) {
                Double tfValue = tfMap.get(word);
                Double idfValue = idf.get(word);
                // Guard: a word absent from the idf map previously caused an
                // auto-unboxing NullPointerException.
                docTfIdf.put(word, idfValue == null ? 0.0d : tfValue * idfValue);
            }
            tfIdfMap.put(filePath, docTfIdf);
        }
        return tfIdfMap;
    }

    public static void main(String[] args) {
        System.out.println("tf--------------------------------------");
        Map<String, Map<String, Double>> allTfMap = TfIdfAlgorithm.allTf("d://dir");
        for (String filePath : allTfMap.keySet()) {
            Map<String, Double> tfMap = allTfMap.get(filePath);
            for (String word : tfMap.keySet()) {
                System.out.println("fileName:" + filePath + " word:" + word + " tf:" + tfMap.get(word));
            }
        }
        System.out.println("idf--------------------------------------");
        Map<String, Double> idfMap = TfIdfAlgorithm.idf(allSegsMap);
        for (String word : idfMap.keySet()) {
            System.out.println("word:" + word + " tf:" + idfMap.get(word));
        }
        System.out.println("tf-idf--------------------------------------");
        Map<String, Map<String, Double>> tfIdfMap = TfIdfAlgorithm.tfIdf(allTfMap, idfMap);
        for (String filePath : tfIdfMap.keySet()) {
            Map<String, Double> tfIdf = tfIdfMap.get(filePath);
            for (String word : tfIdf.keySet()) {
                System.out.println("fileName:" + filePath + " word:" + word + " tf-idf:" + tfIdf.get(word));
            }
        }
    }
}