軟體工程 | |
作業要求 | |
作業目標 | 論文的查重算法+單元測試 |
代碼連結
- github:https://github.com/ywh456/3119005484/tree/main/lunwenchachong
整體流程:
- 1.輸入本地檔案的絕對路徑,讀取檔案,傳回String變量filestring
- 2.将filestring分詞,并統計詞組的詞頻
- 3.根據詞的MD5值,求出每個詞的hash值,進而求出文本的simhash值
- 4.根據simhash求出海明距離并求出相似度
- 5.将結果儲存在本地
項目結構:

單元測試
測試覆蓋率
單元測試代碼
public class cipin_and_simhashTest {
//測試分詞器文本檢視能否成功對文本分詞;
@Test
public void getTextDef() throws IOException {
String s = "測試分詞器文本//.檢視能否成功對文本分詞";
Map<String,Integer> wF1= cipin_and_simhash.getTextDef(s);
System.out.println(wF1);
}
//多重複詞句分詞測試
@Test
public void getTextDef2() throws IOException {
String s = "重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本。";
Map<String,Integer> wF1= cipin_and_simhash.getTextDef(s);
System.out.println(wF1);
}
//擷取SIMHASH值
@Test
public void simHash() throws IOException {
String[] s = {"測試","計","算si","m","hash","的文本//",".檢視","能否","成功求","出文本","的","simhash值;"};
for(String s1 :s) {
Map<String, Integer> wF1 = cipin_and_simhash.getTextDef(s1);
String sim = cipin_and_simhash.simHash(wF1);
System.out.println(sim);
}
}
/**
 * Tests for {@code HaiMing}: Hamming distance between two bit strings and the similarity
 * score derived from it.
 *
 * <p>Checks are written as plain {@link AssertionError} throws so they do not depend on
 * a particular JUnit assertion API.
 */
public class HaiMingTest {

    /** Two equal-length bit strings that differ in exactly 7 positions. */
    @Test
    public void getHaiMing() {
        String str1 = "11000111001";
        String str2 = "00110011100";
        int d = HaiMing.getHaiMing(str1,str2);
        System.out.println("海明距離為:"+d);
        if (d != 7) {
            throw new AssertionError("expected Hamming distance 7 but was " + d);
        }
    }

    /** Bit strings of different length: the distance is reported as the sentinel -1. */
    @Test
    public void getHaiMing2() {
        String str1 = "11000111001";
        String str2 = "0011001110";
        int d = HaiMing.getHaiMing(str1,str2);
        System.out.println("海明距離為:"+d);
        if (d != -1) {
            throw new AssertionError("expected -1 for inputs of different length but was " + d);
        }
    }

    /** Similarity of the two strings above: (11 - 7) / (11 + 7), i.e. about 22.2 percent. */
    @Test
    public void getSSIM() {
        String str1 = "11000111001";
        String str2 = "00110011100";
        double s = HaiMing.getSSIM(str1,str2);
        System.out.println("相似度為:"+s);
        // Accepts both an exact (22.22...) and an integer-truncated (22.0) percentage.
        if (s < 22.0 || s >= 23.0) {
            throw new AssertionError("expected a similarity of about 22.2 but was " + s);
        }
    }
}
/**
 * Tests for {@code Hash.getHash}, which turns a word into a string of binary digits.
 *
 * <p>Checks are written as plain {@link AssertionError} throws so they do not depend on
 * a particular JUnit assertion API.
 */
public class HashTest {

    /** Passes a null word; the test only verifies that no exception escapes the call. */
    @Test
    public void getHash1() throws FileException {
        String str = null;
        String str2 = Hash.getHash(str);
        System.out.println(str2);
    }

    /** Hashes several words and checks every result is a binary string of at most 128 digits. */
    @Test
    public void getHash2() {
        String[] str = {"這是", "一次", "獲得", "哈希", "值", "的", "測試"};
        for (String s : str) {
            String hash = Hash.getHash(s);
            if (hash == null || !hash.matches("[01]{1,128}")) {
                throw new AssertionError("expected up to 128 binary digits for \"" + s + "\" but was: " + hash);
            }
            System.out.println(hash.length());
            System.out.println(hash);
        }
    }
}
/**
 * Smoke tests for {@code wenjian_IO}: each test calls {@code read} or {@code writeFile}
 * with a hard-coded path (or null) and prints whatever the call returns.
 */
public class wenjian_IOTest {

    /** Reads from an absolute Windows path and prints the returned text. */
    @Test
    public void read() throws FileException {
        final String path = "C:\\Users\\MZ\\Desktop\\Testfile\\orig.txt";
        final String content = wenjian_IO.read(path);
        System.out.println(content);
    }

    /** Reads from a path intended to be invalid and prints the returned text. */
    @Test
    public void read2() throws FileException {
        final String path = "C:/Users/MZ/Desktop//Desktop//null.txt";
        final String content = wenjian_IO.read(path);
        System.out.println(content);
    }

    /** Calls {@code read} with a null path and prints the returned text. */
    @Test
    public void read3() throws FileException {
        final String path = null;
        final String content = wenjian_IO.read(path);
        System.out.println(content);
    }

    /** Writes a similarity of 1 to a path on the desktop and prints the returned flag. */
    @Test
    public void writeFile() throws IOException {
        final String target = "C:/Users/MZ/Desktop//Testfile/writefiletest.txt";
        final double similarity = 1;
        final boolean written = wenjian_IO.writeFile(target, similarity);
        System.out.println(written);
    }

    /** Writes a similarity of 1 to a path on drive {@code p:} and prints the returned flag. */
    @Test
    public void writeFile2() throws IOException {
        final String target = "p:/null.text";
        final double similarity = 1;
        final boolean written = wenjian_IO.writeFile(target, similarity);
        System.out.println(written);
    }

    /** Calls {@code writeFile} with a null path and prints the returned flag. */
    @Test
    public void writeFile3() throws IOException {
        final String target = null;
        final double similarity = 1;
        final boolean written = wenjian_IO.writeFile(target, similarity);
        System.out.println(written);
    }
}
/**
 * End-to-end tests: read the original text and one modified variant, compute the SimHash
 * similarity of the two, print it and write it to a result file.
 *
 * <p>All paths are absolute and machine-specific.
 */
public class mainTest {

    /** Similarity between the original file and the "add" variant. */
    @Test
    public void maintestADD() throws FileException, IOException {
        // The contents are now read from disk before segmentation, like every other test
        // here; previously the path strings themselves were segmented. The comparison file
        // name follows the orig_0.8_<variant>.txt pattern used by the sibling tests.
        compareAndSave(
                "C:\\Users\\MZ\\Desktop\\Testfile\\orig.txt",
                "C:\\Users\\MZ\\Desktop\\Testfile\\orig_0.8_add.txt",
                "C:\\Users\\MZ\\Desktop\\Testfile\\testadd.txt");
    }

    /** Similarity between the original file and the "del" variant. */
    @Test
    public void maintestDEL() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop/Testfile/orig.txt",
                "C:/Users/MZ/Desktop/Testfile/orig_0.8_del.txt",
                "C:/Users/MZ/Desktop/Testfile/testdel.txt");
    }

    /** Similarity between the original file and the "dis_1" variant. */
    @Test
    public void mainteestDIS_1() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop/Testfile/orig.txt",
                "C:/Users/MZ/Desktop/Testfile/orig_0.8_dis_1.txt",
                "C:/Users/MZ/Desktop/Testfile/testdis_1.txt");
    }

    /** Similarity between the original file and the "dis_10" variant. */
    @Test
    public void mainteestDIS_10() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop/Testfile/orig.txt",
                "C:/Users/MZ/Desktop/Testfile/orig_0.8_dis_10.txt",
                "C:/Users/MZ/Desktop/Testfile/testdis_10.txt");
    }

    /** Similarity between the original file and the "dis_15" variant. */
    @Test
    public void mainteestDIS_15() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop//Testfile//orig.txt",
                "C:/Users/MZ/Desktop//Testfile//orig_0.8_dis_15.txt",
                "C:/Users/MZ/Desktop/Testfile/testdis_15.txt");
    }

    /**
     * Runs the full pipeline for one pair of files: read both, segment, hash, compare,
     * print the similarity and write it to {@code resultPath}.
     *
     * @param origPath   path of the original text
     * @param otherPath  path of the text compared against the original
     * @param resultPath path of the file that receives the similarity value
     */
    private static void compareAndSave(String origPath, String otherPath, String resultPath)
            throws FileException, IOException {
        String a = wenjian_IO.read(origPath);
        String b = wenjian_IO.read(otherPath);
        Map<String,Integer> wF1 = cipin_and_simhash.getTextDef(a);
        Map<String,Integer> wF2 = cipin_and_simhash.getTextDef(b);
        // Similarity
        double s = HaiMing.getSSIM(cipin_and_simhash.simHash(wF1), cipin_and_simhash.simHash(wF2));
        System.out.println(s);
        wenjian_IO.writeFile(resultPath, s);
    }
}
算法分析
相似度計算公式
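用Jaccard係數計算文本的相似度:$J(A,B)=\dfrac{|A\cap B|}{|A\cup B|}=\dfrac{L-d}{L+d}$
其中 $L$ 為simhash二進位字串的長度,$d$ 為海明距離;即 $|A\cup B|=L+d$(simhash的布爾值長+海明距離),$|A\cap B|=L-d$(simhash的布爾值長-海明距離)。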
算法設計思路
- 處理文本,分詞,計算詞頻;
- 計算simhash,計算中的權重就為詞頻;
- 求出相似度
具體實作
/**
 * Splits {@code text} into words with {@code IKSegmenter} and counts how often each word
 * occurs.
 *
 * <p>Only tokens longer than one character are counted; single-character tokens are
 * dropped.
 *
 * @param text the text to segment
 * @return a mutable map from word to its number of occurrences; empty when nothing qualifies
 * @throws IOException declared because the segmenter reads {@code text} through a reader
 */
public static Map<String, Integer> getTextDef(String text) throws IOException {
    Map<String, Integer> wordsFren = new HashMap<String, Integer>();
    // NOTE(review): the boolean flag presumably selects IK's "smart" segmentation mode — confirm.
    IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(text), true);
    Lexeme lexeme;
    while ((lexeme = ikSegmenter.next()) != null) {
        String word = lexeme.getLexemeText();
        if (word.length() > 1) {
            // Inserts 1 for a new word, otherwise adds 1 to the stored count.
            wordsFren.merge(word, 1, Integer::sum);
        }
    }
    return wordsFren;
}
/**
 * Returns the MD5 digest of {@code str} (encoded as UTF-8) as a string of binary digits.
 *
 * <p>The result is always exactly 128 characters long. {@code BigInteger.toString(2)}
 * omits leading zero bits, so the digits are left-padded with {@code '0'} to keep every
 * bit at its true position.
 *
 * @param str the word to hash; may be {@code null}
 * @return the 128-character binary representation of the digest, or {@code null} when
 *         {@code str} is {@code null}
 * @throws IllegalStateException if the runtime provides no MD5 implementation
 */
public static String getHash(String str)
{
    if (str == null) {
        // Kept from the previous behaviour: a null word yields a null hash.
        return null;
    }
    final byte[] digest;
    try {
        digest = MessageDigest.getInstance("MD5")
                .digest(str.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    } catch (java.security.NoSuchAlgorithmException e) {
        // Returning the raw word here would silently feed a non-hash into the SimHash step.
        throw new IllegalStateException("MD5 digest is not available", e);
    }
    final int width = digest.length * 8;
    final String bits = new BigInteger(1, digest).toString(2);
    StringBuilder padded = new StringBuilder(width);
    for (int i = bits.length(); i < width; i++) {
        padded.append('0');
    }
    return padded.append(bits).toString();
}
/**
 * Computes the 128-bit SimHash of a text from its word-frequency map.
 *
 * <p>Each word is hashed with {@code Hash.getHash}. For every bit position the word's
 * frequency is added when the bit is {@code '1'} and subtracted otherwise. A position
 * whose total is positive becomes {@code '1'} in the result, any other becomes {@code '0'}.
 *
 * @param wordsFrenMaps map from word to its frequency, used as the word's weight
 * @return a string of exactly 128 characters, each {@code '0'} or {@code '1'}
 */
public static String simHash(Map<String,Integer> wordsFrenMaps){
    final int bitCount = 128;
    int[] a = new int[bitCount];
    for (Map.Entry<String, Integer> wordsFrenEntry : wordsFrenMaps.entrySet()) {
        String WordHash = Hash.getHash(wordsFrenEntry.getKey());
        if (WordHash.length() < bitCount) {
            // A binary string shorter than 128 digits has lost its leading zero bits, so
            // the zeros go in front; appending them would shift every bit of the hash.
            StringBuilder padded = new StringBuilder(bitCount);
            for (int j = WordHash.length(); j < bitCount; j++) {
                padded.append('0');
            }
            WordHash = padded.append(WordHash).toString();
        }
        // Weighting and merging: the weight of a word is its frequency.
        int weight = wordsFrenEntry.getValue();
        for (int j = 0; j < bitCount; j++) {
            if (WordHash.charAt(j) == '1') {
                a[j] += weight;
            } else {
                a[j] -= weight;
            }
        }
    }
    // Dimension reduction: keep only the sign of each accumulated weight. The result
    // starts empty so it holds exactly 128 digits, with no leading blank.
    StringBuilder simhash = new StringBuilder(bitCount);
    for (int j = 0; j < bitCount; j++) {
        simhash.append(a[j] > 0 ? '1' : '0');
    }
    return simhash.toString();
}
/**
 * Counts the positions at which two strings of equal length hold different characters
 * (the Hamming distance).
 *
 * @param SH1 first string
 * @param SH2 second string
 * @return the number of differing positions, or {@code -1} when the lengths differ
 */
public static int getHaiMing(String SH1 , String SH2)
{
    final int length = SH1.length();
    // The distance is only defined for strings of the same length.
    if (length != SH2.length()) {
        return -1;
    }
    int mismatches = 0;
    int pos = 0;
    while (pos < length) {
        if (SH1.charAt(pos) != SH2.charAt(pos)) {
            mismatches += 1;
        }
        pos += 1;
    }
    return mismatches;
}
/**
 * Returns the similarity of two simhash strings as a percentage, using the Jaccard-style
 * ratio {@code (length - distance) / (length + distance)}, where {@code distance} is the
 * Hamming distance reported by {@code getHaiMing}.
 *
 * @param SH1 first simhash string
 * @param SH2 second simhash string, of the same length as {@code SH1}
 * @return the similarity from 0.0 to 100.0; 100.0 when both strings are empty
 * @throws IllegalArgumentException if the two strings differ in length
 */
public static double getSSIM(String SH1 , String SH2){
    int distance = getHaiMing(SH1,SH2);
    if (distance < 0) {
        // getHaiMing signals a length mismatch with -1; using it in the ratio would give
        // a score above 100 or a division by zero.
        throw new IllegalArgumentException(
                "simhash lengths differ: " + SH1.length() + " vs " + SH2.length());
    }
    int length = SH1.length();
    if (length == 0) {
        // Two empty strings are identical; this also avoids 0/0.
        return 100.0;
    }
    // Floating-point division keeps the fractional part that int division discarded.
    return 100.0 * (length - distance) / (length + distance);
}
運作結果
指令行運作結果
各檔案與原檔案相似度對比結果
JProfiler進行性能分析
uploading-image-367782.png
類的記憶體消耗
堆記憶體情況
PSP表格
PSP各個階段 | 自己預估時間(分鐘) | 實際的記錄時間(分鐘) |
計劃: 明确需求和其他因素,估計以下的各個任務需要多少時間 | 30 | 20 |
開發 (包括下面 8 項子任務) | 600 | 660 |
需求分析 (包括學習新技術、新工具的時間) | 50 | 120 |
生成設計文檔 | ||
設計複審 | ||
代碼規範 (為目前的開發制定或選擇合适的規範) | ||
具體設計 | 40 | 70 |
具體編碼 | 220 | 260 |
代碼複審 | 10 | |
測試(自我測試,修改代碼,送出修改) | 130 | |
報告 | 60 | 80 |
測試報告 | ||
計算工作量 | ||
事後總結, 并提出過程改進計劃 | ||
合計 |