1.爬蟲URL去重實戰-SpringBoot2.x+Guava布隆過濾器
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.12.0</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>31.1-jre</version>
</dependency>
/**
 * Generates 5,000,000 sample URLs and appends them, one per line, to the
 * {@code url.txt} data file.
 *
 * <p>Each line has the form {@code https://www.<5 random letters>.com<index>}.
 * Lines are streamed straight to a buffered writer instead of being collected
 * in memory first, and the file is written as UTF-8.
 *
 * <p>The file is opened in append mode, so running the test again adds another
 * 5,000,000 lines to the existing file.
 *
 * @throws java.io.UncheckedIOException if the file cannot be opened or written,
 *         so that the test fails instead of silently producing no data
 */
@Test
public void testGeneUrl() {
    final int urlCount = 5000000;
    // NOTE: change this to a path on your own machine. It must point at the
    // url.txt FILE (not just the resources directory).
    File file = new File("D:\\ideaworkspace\\bloomfilter-test\\src\\main\\resources\\url.txt");

    // FileOutputStream in append mode creates the file when it does not exist,
    // and try-with-resources closes the whole writer chain even on failure.
    try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
            new FileOutputStream(file, true), java.nio.charset.StandardCharsets.UTF_8))) {
        for (int i = 0; i < urlCount; i++) {
            String name = RandomStringUtils.randomAlphabetic(5);
            bw.write("https://www." + name + ".com" + i + "\n");
        }
    } catch (IOException e) {
        throw new java.io.UncheckedIOException("Failed to write URL file: " + file, e);
    }
}
// Argument 1: the funnel that specifies what type of data the Bloom filter stores, e.g. IntegerFunnel, LongFunnel, StringCharsetFunnel
// Argument 2: the expected number of elements to be stored
// Argument 3: the false-positive rate; the default is 0.03
BloomFilter.create(Funnels.stringFunnel(Charset.forName("UTF-8")), 5000000, 0.01);
- 以 @Bean 的方式將檔案的內容載入到 BloomFilter 中
/**
 * Creates the Bloom filter bean and loads the contents of the URL file into it.
 *
 * <p>The filter is created for 5,000,000 expected insertions with a target
 * false-positive probability of 0.01. Every non-empty line of the file is
 * inserted as one element. The file is decoded with the same UTF-8 charset
 * that the filter's string funnel uses.
 *
 * @return the populated Bloom filter
 * @throws IOException if the URL file cannot be opened or read
 */
@Bean
public BloomFilter<String> bloomFilter() throws IOException {
    Charset utf8 = Charset.forName("UTF-8");
    BloomFilter<String> bloomFilter = BloomFilter.create(Funnels.stringFunnel(utf8), 5000000, 0.01);

    // try-with-resources closes the reader chain even if readLine() throws.
    try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
            new FileInputStream(new File("D:\\ideaworkspace\\bloomfilter-test\\src\\main\\resources\\url.txt")),
            utf8))) {
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            // Skip blank lines so the empty string is never registered as a URL.
            if (!line.isEmpty()) {
                bloomFilter.put(line);
            }
        }
    }
    return bloomFilter;
}
/**
 * REST endpoint that checks a sample URL against the injected Bloom filter.
 */
@RestController
@RequestMapping("/api")
public class BloomFilterController {

    /** Bloom filter of URL strings, injected by Spring. */
    @Autowired
    private BloomFilter<String> bloomFilter;

    /**
     * Reports whether the hard-coded sample URL might be contained in the filter.
     *
     * @return {@code true} if the filter reports the URL as possibly present
     *         (a Bloom filter can yield false positives); {@code false} if the
     *         URL was definitely never added
     */
    @RequestMapping("/bloomFilter")
    public boolean bloomFilter() {
        String url = "https://www.TpxVs.com10";
        // Ask the filter whether it might contain this value.
        return bloomFilter.mightContain(url);
    }
}
- 如果改用 Set 集合,在資料量很大的情況下會發生堆記憶體溢出(OutOfMemoryError)的錯誤。