本来打算用某鱼抓取并整理网站的 URL,结果超过一万条就要付费充会员才能导出,有点郁闷。怎么办?因为 Java 爬虫还不太会写,只能自己拼接 URL:思路就是先查库,再把结果转换成 XML,勤快点自己动手。
获取网站url的后缀地址,一般都是id主键,先获取id,然后进行字符串拼接,最后输出成xml,这里采用springboot+mybatis+xStream。
引入依赖:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.booy</groupId>
<artifactId>url</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>war</packaging>
<!-- Inherit from the Spring Boot parent POM -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.2.RELEASE</version>
<relativePath/>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- MyBatis Spring Boot starter -->
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>1.3.1</version>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.29</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<!-- XStream, used by the service to serialize the URL objects to XML -->
<dependency>
<groupId>com.thoughtworks.xstream</groupId>
<artifactId>xstream</artifactId>
<version>1.4.11.1</version>
</dependency>
</dependencies>
<!-- Configure resource scanning; otherwise the Mapper XML files kept under src/main/java are not packaged -->
<build>
<!-- Plugin that packages the Spring Boot application as a fat jar -->
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
<resources>
<!-- Pick up the XML files that live next to the Java sources -->
<resource>
<directory>src/main/java</directory>
<includes>
<include>**/*.xml</include>
</includes>
<filtering>true</filtering>
</resource>
<!-- Keep the regular resources directory on the classpath as well -->
<resource>
<directory>src/main/resources</directory>
<includes>
<include>**/*.*</include>
</includes>
</resource>
</resources>
</build>
</project>
要查库,需要先配置下数据源
# Basic data source settings
spring.datasource.url=jdbc:mysql://localhost:3306/test?characterEncoding=utf8
spring.datasource.username=test
spring.datasource.password=123456
spring.datasource.driverClassName = com.mysql.jdbc.Driver
# Location of the MyBatis mapper XML files
mybatis.mapper-locations=classpath*:com/booy/url/dao/mapper/*.xml
# Type-alias package, so resultType in the mapper XML can omit the fully qualified class name
mybatis.type-aliases-package=com.booy.url.pojo
# View resolver: suffix of the resources that views resolve to
spring.mvc.view.suffix=.html
接口
package com.booy.url.dao;
import java.util.List;
/**
 * Data-access interface for the ids used to build page URLs.
 * Its SQL comes from the MyBatis mapper whose namespace is {@code com.booy.url.dao.UrlDao}.
 */
public interface UrlDao {
/**
 * Fetches the id of every record (the mapper selects {@code web_id} from {@code test_website}).
 *
 * @return one id per row
 */
List<Integer> getAllId();
}
mapper查询
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL mapping for the com.booy.url.dao.UrlDao interface -->
<mapper namespace="com.booy.url.dao.UrlDao">
<!-- Returns the web_id column of every row in test_website; there is no filter or ordering -->
<select id="getAllId" resultType="int">
select web_id
from test_website
</select>
</mapper>
service接口
package com.booy.url.service;
import java.util.List;
/**
 * Service that turns the stored ids into full page URLs.
 */
public interface UrlService {
/**
 * Builds one URL per id returned by the DAO.
 * The implementation in this project ({@code UrlServiceImpl}) also writes the URLs to an XML
 * file as a side effect.
 *
 * @return the generated URLs, one {@link StringBuilder} per id
 */
List<StringBuilder> getAllId();
}
业务逻辑实现。如果一个表的数据超过 5 万条(单个 sitemap 文件的 URL 数量上限),可以加个判断,把超出的部分写入第二个 XML 文件;如果需要把多个表的数据写入同一个 XML,就不要覆盖文件,直接追加数据即可。
package com.booy.url.service.Impl;
import com.booy.url.dao.UrlDao;
import com.booy.url.pojo.Url;
import com.booy.url.service.UrlService;
import com.thoughtworks.xstream.XStream;
import com.thoughtworks.xstream.io.xml.Xpp3Driver;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Builds the page URLs of the site from the ids stored in the database and writes them to a
 * sitemap-style XML file.
 */
@Service
public class UrlServiceImpl implements UrlService {
    /** Text placed in front of every id when building a page URL. */
    private static final String URL_PREFIX = "http://www.zhuangyi.net/w/";
    /** Text placed after every id when building a page URL. */
    private static final String URL_SUFFIX = ".html";
    /** Destination file; an existing file is overwritten on every run. */
    private static final String SITEMAP_PATH = "D:/xml/sitemap.xml";
    /** XML declaration written before the serialized list (XStream does not emit one). */
    private static final String XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";

    @Resource
    private UrlDao urlDao;

    /**
     * Builds one URL per id returned by the DAO and, as a side effect, writes all of them to
     * the sitemap file.
     *
     * @return the generated URLs, in the order the ids were returned
     */
    @Override
    public List<StringBuilder> getAllId() {
        List<Integer> allId = urlDao.getAllId();
        // Plain URLs returned to the caller.
        List<StringBuilder> urls = new ArrayList<>(allId.size());
        // The same URLs wrapped as XML entries.
        List<Url> urlsXml = new ArrayList<>(allId.size());
        for (Integer id : allId) {
            StringBuilder sb = new StringBuilder(40);
            sb.append(URL_PREFIX).append(id).append(URL_SUFFIX);
            urls.add(sb);
            urlsXml.add(simpleObject(sb));
        }
        outXml(urlsXml);
        return urls;
    }

    /**
     * Wraps a single URL into an XML entry with fixed priority and change frequency and
     * today's date as the last-modified value.
     *
     * @param sb the complete URL
     * @return the populated entry
     */
    public Url simpleObject(StringBuilder sb) {
        Url url = new Url();
        url.setLoc(sb.toString());
        url.setPriority("0.6");
        url.setChangefreq("always");
        // LocalDate#toString yields the ISO-8601 form yyyy-MM-dd, without allocating a
        // formatter for every URL.
        url.setLastmod(LocalDate.now().toString());
        return url;
    }

    /**
     * Serializes the entries to XML, writes the document to the sitemap file (overwriting it)
     * and echoes it to standard output.
     * A failure to open or write the file is reported on standard error and does not
     * propagate to the caller.
     *
     * @param urlsXml the entries to serialize
     */
    public void outXml(List<Url> urlsXml) {
        XStream xStream = new XStream(new Xpp3Driver());
        // Element names: each entry becomes <url>, the enclosing list becomes <urlset>.
        xStream.alias("url", Url.class);
        xStream.alias("urlset", List.class);
        String s = XML_HEADER + xStream.toXML(urlsXml);
        // try-with-resources closes the stream even when the write fails; a file that cannot
        // be opened no longer ends in a NullPointerException.
        try (FileOutputStream out = new FileOutputStream(SITEMAP_PATH)) {
            // Encode explicitly as UTF-8 so the bytes agree with the XML declaration.
            out.write(s.getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            System.err.println("Failed to write sitemap to " + SITEMAP_PATH);
            e.printStackTrace();
        }
        System.out.println(s);
    }
}
xml元素实体
package com.booy.url.pojo;
/**
 * One {@code <url>} entry of the generated XML document.
 * The field declaration order is kept as is because field-based serializers emit child
 * elements in that order.
 */
public class Url {
    /** Full address of the page. */
    private String loc;
    /** Priority value, kept as text. */
    private String priority;
    /** Last-modified date, kept as text. */
    private String lastmod;
    /** Change frequency, kept as text. */
    private String changefreq;

    /** @return the page address */
    public String getLoc() {
        return loc;
    }

    /** @param loc the page address */
    public void setLoc(String loc) {
        this.loc = loc;
    }

    /** @return the priority value */
    public String getPriority() {
        return priority;
    }

    /** @param priority the priority value */
    public void setPriority(String priority) {
        this.priority = priority;
    }

    /** @return the last-modified date */
    public String getLastmod() {
        return lastmod;
    }

    /** @param lastmod the last-modified date */
    public void setLastmod(String lastmod) {
        this.lastmod = lastmod;
    }

    /** @return the change frequency */
    public String getChangefreq() {
        return changefreq;
    }

    /** @param changefreq the change frequency */
    public void setChangefreq(String changefreq) {
        this.changefreq = changefreq;
    }
}
在页面上看看url
/**
 * REST endpoint used to look at the generated URLs in a browser.
 */
@RestController
public class resultcontroller {
    @Resource
    private UrlService urlService;

    /**
     * Delegates to the URL service and returns whatever it produced.
     *
     * @return the generated URLs
     */
    @RequestMapping
    public List<StringBuilder> test() {
        List<StringBuilder> generatedUrls = urlService.getAllId();
        return generatedUrls;
    }
}
启动类
/**
 * Application entry point; MyBatis mapper interfaces are scanned from {@code com.booy.url.dao}.
 */
@SpringBootApplication
@MapperScan("com.booy.url.dao")
public class UrlApplication {
    /**
     * Boots the Spring application with this class as its primary source.
     *
     * @param args command-line arguments handed on to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(UrlApplication.class);
        application.run(args);
    }
}
控制台输出(实际使用时,就不要再在控制台和页面上输出了)
