天天看点

使用 WebMagic 编写 Java 网络爬虫

写这个爬虫的目的是抓取歌词:我平时喜欢听歌,遇到喜欢的歌就想把歌词下载下来保存。

WebMagic 教程地址

http://webmagic.io/docs/zh/posts/ch1-overview/

使用 IDEA 创建 Maven 工程

下面为工程目录结构

使用WebMagic 编写 java 网络爬虫

下面为源代码

package bean;

import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.TargetUrl;

import java.sql.Timestamp;
import java.util.Date;
import java.util.List;

/**
 * Page model for a single song page, bound to URLs matching
 * {@code http://www.kuwo.cn/yinyue/*}. The annotated fields are filled from the
 * page by the XPath expressions given in their {@code @ExtractBy} annotations.
 *
 * <p>{@link #getLyric()} and {@link #getRecordTime()} fall back to derived values
 * (joined lyric lines, current time) when nothing has been set explicitly, so the
 * object can be persisted directly after extraction.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:12:34
 */
@TargetUrl("http://www.kuwo.cn/yinyue/*")
public class KuWoMusic {
    /**
     * Song title.
     */
    // Alternative CSS selector:
//    @ExtractBy(value="div.tit em.f-ff2",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@id='lrcName']/text()")
    private String name;

    /**
     * Singer.
     */
    // Alternative CSS selector:
//    @ExtractBy(value="p.des span a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='artist']/span/a/text()")
    private String singer;

    /**
     * Lyric lines, one entry per matched {@code <p class="lrcItem">} node.
     */
    // Alternative CSS selector:
//    @ExtractBy(value="div.mCSB_container p",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='lrcItem']")
    private List<String> lyrics;

    /**
     * Full lyric text. When {@code null}, {@link #getLyric()} derives the text
     * from {@link #lyrics}.
     */
    private String lyric;

    /**
     * Album the song belongs to.
     */
    // Alternative CSS selector:
//    @ExtractBy(value="p.des a",type = ExtractBy.Type.Css)
    @ExtractBy("//p[@class='album']/span/a/text()")
    private String album;

    /**
     * Time the record was captured. When {@code null}, {@link #getRecordTime()}
     * reports the current time.
     */
    private Timestamp recordTime;

    // Whole page body (disabled, kept for debugging selectors):
//    @ExtractBy(value="body",type = ExtractBy.Type.Css)
//    private String body;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSinger() {
        return singer;
    }

    public void setSinger(String singer) {
        this.singer = singer;
    }

    public List<String> getLyrics() {
        return lyrics;
    }

    public void setLyrics(List<String> lyrics) {
        this.lyrics = lyrics;
    }

    /**
     * Returns the full lyric text.
     *
     * @return the value passed to {@link #setLyric(String)} if one was set;
     *         otherwise all entries of {@link #getLyrics()} concatenated without
     *         a separator, or an empty string when no lyric lines are available
     */
    public String getLyric() {
        if (lyric != null) {
            return lyric;
        }
        return joinLyrics();
    }

    /**
     * Sets the full lyric text explicitly.
     *
     * @param lyric lyric text; {@code null} restores the derived value built
     *              from the lyric lines
     */
    public void setLyric(String lyric) {
        // Previously the argument was ignored and the field was rebuilt from
        // the lyric lines, which also failed when no lines were present.
        this.lyric = lyric;
    }

    public String getAlbum() {
        return album;
    }

    public void setAlbum(String album) {
        this.album = album;
    }

    /**
     * Returns the record time.
     *
     * @return the value passed to {@link #setRecordTime(Timestamp)} if one was
     *         set; otherwise a timestamp for the current instant
     */
    public Timestamp getRecordTime() {
        if (recordTime != null) {
            return recordTime;
        }
        return new Timestamp(new Date().getTime());
    }

    public void setRecordTime(Timestamp recordTime) {
        this.recordTime = recordTime;
    }

    @Override
    public String toString() {
        // Use the getter so the derived lyric text is shown instead of a bare null field.
        return "[name:" + name + ",singer=" + singer + ",album=" + album + ",lyric=" + getLyric() + "]";
    }

    /** Concatenates the lyric lines in order; tolerates a missing list. */
    private String joinLyrics() {
        if (lyrics == null) {
            return "";
        }
        StringBuilder joined = new StringBuilder();
        for (String line : lyrics) {
            joined.append(line);
        }
        return joined.toString();
    }
}
           
package dao;

import bean.KuWoMusic;
import org.apache.ibatis.annotations.Insert;

/**
 * MyBatis mapper that stores a crawled song in the {@code lyric} table.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:37:57
 */
public interface KuWoMusicDao {

    /**
     * Inserts one row built from the given song. The {@code source},
     * {@code recorder} and {@code curStatus} columns are written as the fixed
     * literals embedded in the statement; the remaining columns are bound from
     * the song's {@code name}, {@code lyric}, {@code singer}, {@code album} and
     * {@code recordTime} properties.
     *
     * @param kuWoMusic song to persist
     * @return the result MyBatis reports for the insert (presumably the
     *         affected row count; confirm against the MyBatis docs)
     */
    @Insert("insert into lyric "
            + "(`title`,`content`,`source`,`singer`,`album`,`recorder`,`recordTime`,`curStatus`) "
            + "values (#{name},#{lyric},'酷我',#{singer},#{album},'admin',#{recordTime},'2')")
    int add(KuWoMusic kuWoMusic);
}
           
package dao.pipeline;

import dao.KuWoMusicDao;
import bean.KuWoMusic;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.PageModelPipeline;

/**
 * WebMagic pipeline that prints each extracted {@link KuWoMusic} and stores it
 * through {@link KuWoMusicDao}.
 *
 * <p>The class is instantiated directly with {@code new} by the crawler entry
 * point, so it cannot rely on injection; each instance loads
 * {@code root-context.xml} itself and looks the DAO up from that context.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-19 00:42:41
 */
@Component("KuWoMusicDaoPipeline")
public class KuWoMusicDaoPipeline implements PageModelPipeline<KuWoMusic> {

    ApplicationContext context;
    KuWoMusicDao kuWoMusicDao;
//    @Resource
//    private KuWoMusicDao kuWoMusicDao;

    /**
     * Loads the Spring context from the classpath and resolves the DAO bean.
     */
    public KuWoMusicDaoPipeline() {
        ClassPathXmlApplicationContext xmlContext = new ClassPathXmlApplicationContext("root-context.xml");
        // The context was never closed before, so bean destroy callbacks
        // (e.g. the data source's close method) never ran. Close it on JVM exit.
        xmlContext.registerShutdownHook();
        this.context = xmlContext;
        // Typed lookup fails with a clear Spring exception instead of a ClassCastException.
        this.kuWoMusicDao = xmlContext.getBean("kuWoMusicDao", KuWoMusicDao.class);
    }

    /**
     * Prints the extracted song and inserts it via the DAO.
     *
     * @param kuWoMusic song extracted from the current page
     * @param task      crawler task the page belongs to (unused)
     */
    @Override
    public void process(KuWoMusic kuWoMusic, Task task) {
        // Print the extracted song information
        System.out.println(kuWoMusic.toString());
        kuWoMusicDao.add(kuWoMusic);
    }
}
           
package execute;

import dao.pipeline.KuWoMusicDaoPipeline;
import bean.KuWoMusic;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.ConsolePageModelPipeline;
import us.codecraft.webmagic.model.OOSpider;

/**
 * Command-line entry point of the lyric crawler.
 *
 * @author zhaoshenjiao
 * @Date 2017-04-18 23:23:43
 */
public class LyricCrawlerExecutor {

    /**
     * Starts a crawl from a single seed song page and hands every extracted
     * {@link KuWoMusic} to {@link KuWoMusicDaoPipeline}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String seedUrl = "http://www.kuwo.cn/yinyue/492211?catalog=yueku2016";
        final int workerThreads = 2;

        // Store results in the database
        OOSpider.create(Site.me(), new KuWoMusicDaoPipeline(), KuWoMusic.class)
                .addUrl(seedUrl)
                .thread(workerThreads)
                .run();

        // Alternative: print results to the console instead
//        OOSpider.create(
//                Site.me(),
//                new ConsolePageModelPipeline(), KuWoMusic.class)
//                .addUrl("http://www.kuwo.cn/yinyue/492211?catalog=yueku2016")
//                .thread(2)
//                .run();

        // Alternative: check that the DAO bean can be obtained from the Spring context
//        ApplicationContext context = new ClassPathXmlApplicationContext("root-context.xml");
//        KuWoMusicDao kuWoMusicDao = (KuWoMusicDao)context.getBean("kuWoMusicDao");
//
//        // Argument is the classpath location of the XML configuration file
//        ClassPathXmlApplicationContext cpx=new ClassPathXmlApplicationContext ("root-context.xml");
//
//        System.out.println(cpx.getBean("kuWoMusicDao"));
    }
}
           
# Root logger: a single level followed by the appender names.
# The previous value "INFO,DEBUG,stdout" made log4j treat DEBUG as an appender
# name, and no appender with that name is configured.
log4j.rootLogger=INFO,stdout

# Console appender used by the root logger.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] -%m%n


# Uncomment the loggers below to trace SQL statements.
#log4j.logger.com.ibatis=debug
#log4j.logger.com.ibatis.common.jdbc.SimpleDataSource=debug
#log4j.logger.com.ibatis.common.jdbc.ScriptRunner=debug
#log4j.logger.com.ibatis.sqlmap.engine.impl.SqlMapClientDelegate=debug
#log4j.logger.java.sql.Connection=debug
#log4j.logger.java.sql.Statement=debug
#log4j.logger.java.sql.PreparedStatement=debug,stdout
           
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring application context loaded by the crawler: data source, MyBatis
     session factory, transaction manager and the KuWoMusicDao mapper bean. -->
<beans xmlns="http://www.springframework.org/schema/beans"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:tx="http://www.springframework.org/schema/tx"
	xmlns:task="http://www.springframework.org/schema/task"
	xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.2.xsd  
                http://www.springframework.org/schema/tx http://www.springframework.org/schema/tx/spring-tx-3.2.xsd
                http://www.springframework.org/schema/task http://www.springframework.org/schema/task/spring-task.xsd ">

	<!-- DataSource: pooled MySQL connections. Fill in the database name in the
	     URL and the username/password before running. -->
	<bean id="dataSource" class="org.apache.commons.dbcp.BasicDataSource" destroy-method="close">
		<property name="driverClassName" value="com.mysql.jdbc.Driver" />
		<property name="url" value="jdbc:mysql://localhost:3306/dbname?characterEncoding=utf-8" />
		<property name="username" value="" />
		<property name="password" value="" />
		<property name="maxActive" value="5" />
		<property name="maxIdle" value="3" />
		<property name="maxWait" value="1000" />
		<property name="defaultAutoCommit" value="true" />
		<property name="removeAbandoned" value="true" />
		<property name="removeAbandonedTimeout" value="60" />
	</bean>

	<!-- MyBatis SqlSessionFactory, bound to the data source above -->
	<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
		<property name="dataSource" ref="dataSource" />
	</bean>

	<!-- Spring transaction manager for the same data source -->
	<bean id="transactionManager"
		class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
		<property name="dataSource" ref="dataSource" />
	</bean>

	<!-- Mapper bean for dao.KuWoMusicDao; looked up by id "kuWoMusicDao" from the pipeline -->
	<bean id="kuWoMusicDao" class="org.mybatis.spring.mapper.MapperFactoryBean">
		<property name="mapperInterface" value="dao.KuWoMusicDao" />
		<property name="sqlSessionFactory" ref="sqlSessionFactory" />
	</bean>

	<tx:annotation-driven transaction-manager="transactionManager" />
	<!-- Enable @Scheduled annotation processing, using a scheduler with a pool size of 5 -->
	<task:annotation-driven scheduler="qbScheduler"	mode="proxy" />
	<task:scheduler id="qbScheduler" pool-size="5" />
</beans>
           

pom.xml文件

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the lyric crawler: WebMagic + Spring + MyBatis + MySQL. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>lyric.crawler</groupId>
    <artifactId>lyric-crawler</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- The sources contain non-ASCII string literals, so the encoding must
             not depend on the platform default of the build machine. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- Spring version -->
        <spring.version>4.2.0.RELEASE</spring.version>
        <!-- MyBatis version -->
        <mybatis.version>3.3.0</mybatis.version>
        <!-- MySQL connector version -->
        <mysql.version>5.1.29</mysql.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.6.1</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.6.1</version>
        </dependency>
        <!-- Spring core modules -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-context-support</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-oxm</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-tx</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>
        <!-- MyBatis core -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>${mybatis.version}</version>
        </dependency>
        <!-- MyBatis-Spring integration -->
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>1.2.3</version>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>
        <!-- Commons DBCP connection pool -->
        <dependency>
            <groupId>commons-dbcp</groupId>
            <artifactId>commons-dbcp</artifactId>
            <version>1.4</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>lyriccrawler</finalName>
        <resources>
            <!-- Declaring <resources> replaces Maven's default resource directory,
                 so the standard location is listed explicitly to keep it on the classpath. -->
            <resource>
                <directory>src/main/resources</directory>
            </resource>
            <!-- Configuration files kept next to the Java sources -->
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>*.xml</include>
                    <include>*.properties</include>
                    <include>*.tld</include>
                    <include>*.txt</include>
                    <include>*.cfg</include>
                    <include>**/**/**/*.xml</include>
                    <include>**/**/**/**/*.xml</include>
                </includes>
            </resource>
        </resources>
    </build>

</project>
           

工程源代码下载地址

https://github.com/airujingye/lyriccrawler