天天看點

個人程式設計作業

軟體工程
作業要求
作業目标 論文的查重算法+單元測試

代碼連結

  • github: https://github.com/ywh456/3119005484/tree/main/lunwenchachong

整體流程:

  • 1.輸入本地檔案的絕對路徑,讀取檔案,傳回String變量filestring
  • 2.将filestring分詞,并統計詞組的詞頻
  • 3.根據詞的MD5值,求出每個詞的hash值,進而求出文本的simhash值
  • 4.根據simhash求出海明距離并求出相似度
  • 5.将結果儲存在本地

項目結構:

個人程式設計作業

單元測試

測試覆寫率

個人程式設計作業

單元測試代碼

public class cipin_and_simhashTest {
//測試分詞器文本檢視能否成功對文本分詞;
@Test
public void getTextDef() throws IOException {
String s = "測試分詞器文本//.檢視能否成功對文本分詞";
Map<String,Integer> wF1= cipin_and_simhash.getTextDef(s);
System.out.println(wF1);
    }
//多重複詞句分詞測試
@Test
public void getTextDef2() throws IOException {
String s = "重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本,重複的測試文本。";
Map<String,Integer> wF1= cipin_and_simhash.getTextDef(s);
System.out.println(wF1);
    }
//擷取SIMHASH值
@Test
public void simHash() throws IOException {
String[] s = {"測試","計","算si","m","hash","的文本//",".檢視","能否","成功求","出文本","的","simhash值;"};
for(String s1 :s) {
Map<String, Integer> wF1 = cipin_and_simhash.getTextDef(s1);
String sim = cipin_and_simhash.simHash(wF1);
System.out.println(sim);
        }
    }
      
/**
 * Tests for {@code HaiMing}: Hamming distance between two simhash strings and
 * the similarity score derived from it.
 *
 * Results are still printed, but each test now also checks the value so that a
 * regression makes the test fail instead of only changing the console output.
 */
public class HaiMingTest {

    /** Equal-length inputs that differ at indices 0, 1, 2, 3, 5, 8 and 10, i.e. in 7 positions. */
    @Test
    public void getHaiMing() {
        String str1 = "11000111001";
        String str2 = "00110011100";
        int d = HaiMing.getHaiMing(str1, str2);
        System.out.println("海明距離為:" + d);
        if (d != 7) {
            throw new AssertionError("expected Hamming distance 7 but was " + d);
        }
    }

    /** Inputs of different lengths: getHaiMing signals this case with -1. */
    @Test
    public void getHaiMing2() {
        String str1 = "11000111001";
        String str2 = "0011001110";
        int d = HaiMing.getHaiMing(str1, str2);
        System.out.println("海明距離為:" + d);
        if (d != -1) {
            throw new AssertionError("expected -1 for inputs of different length but was " + d);
        }
    }

    /** Similarity of two equal-length inputs; the score is a percentage, so it must lie in [0, 100]. */
    @Test
    public void getSSIM() {
        String str1 = "11000111001";
        String str2 = "00110011100";
        double s = HaiMing.getSSIM(str1, str2);
        System.out.println("相似度為:" + s);
        if (s < 0 || s > 100) {
            throw new AssertionError("similarity must be within [0, 100] but was " + s);
        }
    }
}
      
/**
 * Smoke tests for {@code Hash.getHash}; results are printed for manual inspection.
 */
public class HashTest {

    /** Passes a null reference to getHash and prints whatever comes back. */
    @Test
    public void getHash1() throws FileException {
        final String input = null;
        final String result = Hash.getHash(input);
        System.out.println(result);
    }

    /** Hashes several short words and prints, for each one, the hash length followed by the hash itself. */
    @Test
    public void getHash2() {
        final String[] words = {"這是", "一次", "獲得", "哈希", "值", "的", "測試"};
        for (int i = 0; i < words.length; i++) {
            final String digest = Hash.getHash(words[i]);
            System.out.println(digest.length());
            System.out.println(digest);
        }
    }
}
      
/**
 * Smoke tests for {@code wenjian_IO}: reading a text file and writing a
 * similarity value to a file. Results are printed for manual inspection; the
 * paths are fixed locations on the author's machine.
 */
public class wenjian_IOTest {

    /** Reads the sample file through a backslash-separated absolute path. */
    @Test
    public void read() throws FileException {
        final String path = "C:\\Users\\MZ\\Desktop\\Testfile\\orig.txt";
        final String content = wenjian_IO.read(path);
        System.out.println(content);
    }

    /** Reads from a path that is meant to be wrong. */
    @Test
    public void read2() throws FileException {
        final String path = "C:/Users/MZ/Desktop//Desktop//null.txt";
        final String content = wenjian_IO.read(path);
        System.out.println(content);
    }

    /** Calls read with a null path. */
    @Test
    public void read3() throws FileException {
        final String path = null;
        final String content = wenjian_IO.read(path);
        System.out.println(content);
    }

    /** Writes a similarity of 1 to a file under the test folder and prints the returned flag. */
    @Test
    public void writeFile() throws IOException {
        final String target = "C:/Users/MZ/Desktop//Testfile/writefiletest.txt";
        final double similarity = 1;
        final boolean written = wenjian_IO.writeFile(target, similarity);
        System.out.println(written);
    }

    /** Writes to a path on drive "p:" and prints the returned flag. */
    @Test
    public void writeFile2() throws IOException {
        final String target = "p:/null.text";
        final double similarity = 1;
        final boolean written = wenjian_IO.writeFile(target, similarity);
        System.out.println(written);
    }

    /** Calls writeFile with a null path and prints the returned flag. */
    @Test
    public void writeFile3() throws IOException {
        final String target = null;
        final double similarity = 1;
        final boolean written = wenjian_IO.writeFile(target, similarity);
        System.out.println(written);
    }
}

      
/**
 * End-to-end tests: read the original text and one modified variant, compute
 * the similarity of their simhashes, print it and write it to a result file.
 * All paths are fixed locations on the author's machine.
 */
public class mainTest {

    /**
     * Runs the whole pipeline for one pair of files.
     *
     * @param origPath   path of the original text
     * @param otherPath  path of the text to compare against the original
     * @param resultPath path of the file the similarity is written to
     */
    private static void compareAndSave(String origPath, String otherPath, String resultPath)
            throws FileException, IOException {
        String a = wenjian_IO.read(origPath);
        String b = wenjian_IO.read(otherPath);
        Map<String, Integer> wF1 = cipin_and_simhash.getTextDef(a);
        Map<String, Integer> wF2 = cipin_and_simhash.getTextDef(b);
        // Similarity
        double s = HaiMing.getSSIM(cipin_and_simhash.simHash(wF1), cipin_and_simhash.simHash(wF2));
        System.out.println(s);
        wenjian_IO.writeFile(resultPath, s);
    }

    /**
     * Similarity between the original file and the "add" variant.
     *
     * Previously the two path strings themselves were segmented and compared;
     * the files are now read first, like in every other test of this class.
     */
    @Test
    public void maintestADD() throws FileException, IOException {
        // NOTE(review): the input was spelled "orig_0.8.add.txt"; changed to match the
        // "orig_0.8_<kind>.txt" pattern of the other inputs -- confirm the real file name.
        compareAndSave(
                "C:\\Users\\MZ\\Desktop\\Testfile\\orig.txt",
                "C:\\Users\\MZ\\Desktop\\Testfile\\orig_0.8_add.txt",
                "C:\\Users\\MZ\\Desktop\\Testfile\\testadd.txt");
    }

    /** Similarity between the original file and the "del" variant. */
    @Test
    public void maintestDEL() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop/Testfile/orig.txt",
                "C:/Users/MZ/Desktop/Testfile/orig_0.8_del.txt",
                "C:/Users/MZ/Desktop/Testfile/testdel.txt");
    }

    /** Similarity between the original file and the "dis_1" variant. */
    @Test
    public void mainteestDIS_1() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop/Testfile/orig.txt",
                "C:/Users/MZ/Desktop/Testfile/orig_0.8_dis_1.txt",
                "C:/Users/MZ/Desktop/Testfile/testdis_1.txt");
    }

    /** Similarity between the original file and the "dis_10" variant. */
    @Test
    public void mainteestDIS_10() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop/Testfile/orig.txt",
                "C:/Users/MZ/Desktop/Testfile/orig_0.8_dis_10.txt",
                "C:/Users/MZ/Desktop/Testfile/testdis_10.txt");
    }

    /** Similarity between the original file and the "dis_15" variant. */
    @Test
    public void mainteestDIS_15() throws FileException, IOException {
        compareAndSave(
                "C:/Users/MZ/Desktop//Testfile//orig.txt",
                "C:/Users/MZ/Desktop//Testfile//orig_0.8_dis_15.txt",
                "C:/Users/MZ/Desktop/Testfile/testdis_15.txt");
    }
}

      

算法分析

相似度計算公式

用Jaccard計算文本的相似性

個人程式設計作業

$$J(A,B)=\frac{|A\cap B|}{|A\cup B|}=\frac{L-d}{L+d}$$

其中 $L$ 為simhash的布爾值長,$d$ 為海明距離,即(A ∪ B)=simhash的布爾值長+海明距離;(A ∩ B)=simhash的布爾值長-海明距離;

算法設計思路

  • 處理文本,分詞,計算詞頻;
  • 計算simhash,計算中的權重就為詞頻;
  • 求出相似度

具體實作

//Word segmentation and word frequency statistics are saved in HashMap
/**
 * Splits {@code text} into words with the IK segmenter and counts how often
 * each word occurs.
 *
 * Tokens whose text is a single character are ignored.
 *
 * @param text the text to segment
 * @return a map from word to its number of occurrences in {@code text}
 * @throws IOException if the segmenter fails while reading the text
 */
public static Map<String, Integer> getTextDef(String text) throws IOException {
    Map<String, Integer> wordsFren = new HashMap<String, Integer>();
    // NOTE(review): the second argument presumably switches on IK's "smart" segmentation mode -- confirm.
    IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(text), true);
    Lexeme lexeme;
    while ((lexeme = ikSegmenter.next()) != null) {
        // Fetch the token text once instead of once per use.
        String word = lexeme.getLexemeText();
        if (word.length() > 1) {
            Integer seen = wordsFren.get(word);
            wordsFren.put(word, seen == null ? 1 : seen + 1);
        }
    }
    return wordsFren;
}

// Compute the hash value of a word
/**
 * Returns the MD5 digest of {@code str} (encoded as UTF-8) as a string of
 * binary digits.
 *
 * The result always has one character per digest bit (128 for MD5): leading
 * zero bits, which {@code BigInteger.toString(2)} leaves out, are restored so
 * that bit positions line up between different hashes.
 *
 * @param str the text to hash; may be {@code null}
 * @return the digest as '0'/'1' characters, or {@code null} if {@code str} is {@code null}
 * @throws IllegalStateException if the runtime provides no MD5 implementation
 */
public static String getHash(String str)
{
    if (str == null) {
        // Same result as before for null input, but without going through a caught NullPointerException.
        return null;
    }
    try {
        MessageDigest md = MessageDigest.getInstance("MD5");
        byte[] digest = md.digest(str.getBytes(java.nio.charset.StandardCharsets.UTF_8));
        String bits = new BigInteger(1, digest).toString(2);
        int width = digest.length * 8;
        StringBuilder padded = new StringBuilder(width);
        for (int i = bits.length(); i < width; i++) {
            padded.append('0');
        }
        return padded.append(bits).toString();
    } catch (java.security.NoSuchAlgorithmException e) {
        // Handing back the unhashed input here would silently corrupt every simhash built from it.
        throw new IllegalStateException("MD5 digest algorithm is not available", e);
    }
}


//Calculate the simhash value of the text
/**
 * Computes the 128-bit simhash of a text from its word frequencies.
 *
 * For every word, each bit position of the word's hash adds the word's
 * frequency to that position's score when the bit is '1' and subtracts it
 * otherwise. A position whose final score is positive becomes '1' in the
 * result, any other position becomes '0'.
 *
 * The result holds exactly 128 characters; earlier versions put a stray space
 * in front of them.
 *
 * @param wordsFrenMaps map from word to its frequency, used as the word's weight
 * @return the simhash as a string of 128 '0'/'1' characters
 */
public static String simHash(Map<String,Integer> wordsFrenMaps){
    final int bits = 128;
    int[] scores = new int[bits];

    for (Map.Entry<String, Integer> wordsFrenEntry : wordsFrenMaps.entrySet()) {
        String wordHash = Hash.getHash(wordsFrenEntry.getKey());
        int weight = wordsFrenEntry.getValue();

        // A hash shorter than 128 digits has lost its leading zeros, so the missing
        // digits belong in front; appending them would shift every bit of the hash.
        int missing = Math.max(0, bits - wordHash.length());

        // Weighting and merging: +weight for a '1' bit, -weight for a '0' bit.
        for (int j = 0; j < bits; j++) {
            boolean one = j >= missing && wordHash.charAt(j - missing) == '1';
            scores[j] += one ? weight : -weight;
        }
    }

    // Dimension reduction: keep only the sign of each score.
    StringBuilder simhash = new StringBuilder(bits);
    for (int j = 0; j < bits; j++) {
        simhash.append(scores[j] > 0 ? '1' : '0');
    }
    return simhash.toString();
}

//Calculate the Hamming distance of two simhashes
/**
 * Counts the positions at which two strings hold different characters.
 *
 * @param SH1 first simhash string
 * @param SH2 second simhash string
 * @return the number of differing positions, or -1 when the two strings do
 *         not have the same length
 */
public static  int getHaiMing(String SH1  , String SH2)
{
    final int length = SH1.length();
    // A Hamming distance only exists for strings of equal length.
    if (length != SH2.length()) {
        return -1;
    }
    int mismatches = 0;
    int pos = 0;
    while (pos < length) {
        if (SH1.charAt(pos) != SH2.charAt(pos)) {
            mismatches += 1;
        }
        pos++;
    }
    return mismatches;
}
//Output the similarity of two simhash and the corresponding simhash value;
/**
 * Returns the similarity of two simhash strings as a percentage.
 *
 * The score is the Jaccard coefficient {@code (length - distance) / (length + distance)}
 * scaled to 0..100, where {@code distance} is the Hamming distance of the two
 * strings. The division is done in floating point so the fractional part is
 * kept. Two empty strings count as identical (100.0).
 *
 * @param SH1 first simhash string
 * @param SH2 second simhash string, of the same length as {@code SH1}
 * @return the similarity in the range 0.0 to 100.0
 * @throws IllegalArgumentException if the two strings differ in length
 */
public static double getSSIM(String SH1  , String SH2){
    int distance = getHaiMing(SH1,SH2);
    if (distance < 0) {
        // getHaiMing reports a length mismatch as -1; feeding that into the formula gives a meaningless score.
        throw new IllegalArgumentException(
                "simhash lengths differ: " + SH1.length() + " vs " + SH2.length());
    }
    int length = SH1.length();
    if (length == 0) {
        // Avoids 0/0 for two empty inputs.
        return 100.0;
    }
    return 100.0 * (length - distance) / (length + distance);//Jaccard coefficient as similarity
}

      

運作結果

指令行運作結果

個人程式設計作業
各檔案與原檔案相似度對比結果
個人程式設計作業

JProfiler進行性能分析

uploading-image-367782.png

類的記憶體消耗

個人程式設計作業

堆記憶體情況

個人程式設計作業

PSP表格

PSP各個階段 自己預估時間(分鐘) 實際的記錄時間(分鐘)
計劃: 明确需求和其他因素,估計以下的各個任務需要多少時間 30 20
開發 (包括下面 8 項子任務) 600 660
需求分析 (包括學習新技術、新工具的時間) 50 120
生成設計文檔
設計複審
代碼規範 (為目前的開發制定或選擇合适的規範)
具體設計 40 70
具體編碼 220 260
代碼複審 10
測試(自我測試,修改代碼,送出修改) 130
報告 60 80
測試報告
計算工作量
事後總結, 并提出過程改進計劃
合計

總結

繼續閱讀