天天看點

Maven_Webmagic 執行個體

webmagic中文文檔

Maven_Webmagic 執行個體

項目目錄

Maven_Webmagic 執行個體

Device.java

package com.demo.webmagic.bean;

import java.util.Date;

/**
 * Plain mutable bean describing one instrument/device record collected by the crawler.
 *
 * <p>All properties are nullable and exposed through conventional getters and setters.
 * Field descriptions are inferred from the field names and from how the crawler in this
 * listing fills them; confirm against the persistence layer before relying on them.
 */
public class Device {

    /** Numeric identifier. */
    private Integer id;
    /** Device code. */
    private String code;
    /** Device name. */
    private String name;
    /** Device model. */
    private String model;
    /** Manufacturer name. */
    private String manufacturer;
    /** Country (presumably of origin or manufacture). */
    private String country;
    /** Contact person. */
    private String contact;
    /** Contact telephone number. */
    private String contactNumber;
    /** Contact e-mail address. */
    private String email;
    /** Owning institute. */
    private String institute;
    /** Location of the device. */
    private String location;
    /** Specification text; typed as Object, so any value is accepted. */
    private Object specification;
    /** Performance description; typed as Object, so any value is accepted. */
    private Object performance;
    /** Application / usage description; typed as Object, so any value is accepted. */
    private Object application;
    /** Free-form description; typed as Object, so any value is accepted. */
    private Object description;
    /** Purchase date kept as unparsed text. */
    private String purchaseDate;
    /** Price kept as unparsed text. */
    private String price;
    /** Fee standard; typed as Object, so any value is accepted. */
    private Object feeStandard;
    /** Remote URL of the device image. */
    private String imageUrl;
    /** Local path of the downloaded device image. */
    private String imageLocal;
    /** Province the record belongs to. */
    private String province;
    /** Source the record was collected from. */
    private String dataSource;
    /** Name of whoever recorded the entry. */
    private String recorder;
    /** Timestamp at which the entry was recorded. */
    private Date recordDateTime;
    /** English name of the device. */
    private String nameEn;
    /** Postal code. */
    private String postCode;

    // ---- identity ----------------------------------------------------------

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getCode() {
        return code;
    }

    public void setCode(String code) {
        this.code = code;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getNameEn() {
        return nameEn;
    }

    public void setNameEn(String nameEn) {
        this.nameEn = nameEn;
    }

    public String getModel() {
        return model;
    }

    public void setModel(String model) {
        this.model = model;
    }

    // ---- origin ------------------------------------------------------------

    public String getManufacturer() {
        return manufacturer;
    }

    public void setManufacturer(String manufacturer) {
        this.manufacturer = manufacturer;
    }

    public String getCountry() {
        return country;
    }

    public void setCountry(String country) {
        this.country = country;
    }

    // ---- contact -----------------------------------------------------------

    public String getContact() {
        return contact;
    }

    public void setContact(String contact) {
        this.contact = contact;
    }

    public String getContactNumber() {
        return contactNumber;
    }

    public void setContactNumber(String contactNumber) {
        this.contactNumber = contactNumber;
    }

    public String getEmail() {
        return email;
    }

    public void setEmail(String email) {
        this.email = email;
    }

    public String getInstitute() {
        return institute;
    }

    public void setInstitute(String institute) {
        this.institute = institute;
    }

    public String getLocation() {
        return location;
    }

    public void setLocation(String location) {
        this.location = location;
    }

    public String getPostCode() {
        return postCode;
    }

    public void setPostCode(String postCode) {
        this.postCode = postCode;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    // ---- descriptive content ----------------------------------------------

    public Object getSpecification() {
        return specification;
    }

    public void setSpecification(Object specification) {
        this.specification = specification;
    }

    public Object getPerformance() {
        return performance;
    }

    public void setPerformance(Object performance) {
        this.performance = performance;
    }

    public Object getApplication() {
        return application;
    }

    public void setApplication(Object application) {
        this.application = application;
    }

    public Object getDescription() {
        return description;
    }

    public void setDescription(Object description) {
        this.description = description;
    }

    // ---- purchase / fees ---------------------------------------------------

    public String getPurchaseDate() {
        return purchaseDate;
    }

    public void setPurchaseDate(String purchaseDate) {
        this.purchaseDate = purchaseDate;
    }

    public String getPrice() {
        return price;
    }

    public void setPrice(String price) {
        this.price = price;
    }

    public Object getFeeStandard() {
        return feeStandard;
    }

    public void setFeeStandard(Object feeStandard) {
        this.feeStandard = feeStandard;
    }

    // ---- image -------------------------------------------------------------

    public String getImageUrl() {
        return imageUrl;
    }

    public void setImageUrl(String imageUrl) {
        this.imageUrl = imageUrl;
    }

    public String getImageLocal() {
        return imageLocal;
    }

    public void setImageLocal(String imageLocal) {
        this.imageLocal = imageLocal;
    }

    // ---- record bookkeeping ------------------------------------------------

    public String getDataSource() {
        return dataSource;
    }

    public void setDataSource(String dataSource) {
        this.dataSource = dataSource;
    }

    public String getRecorder() {
        return recorder;
    }

    public void setRecorder(String recorder) {
        this.recorder = recorder;
    }

    public Date getRecordDateTime() {
        return recordDateTime;
    }

    public void setRecordDateTime(Date recordDateTime) {
        this.recordDateTime = recordDateTime;
    }

    /**
     * Renders every property as {@code Device [name=value, ...]} in field declaration order;
     * unset properties appear as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Device [");
        text.append("id=").append(id);
        text.append(", code=").append(code);
        text.append(", name=").append(name);
        text.append(", model=").append(model);
        text.append(", manufacturer=").append(manufacturer);
        text.append(", country=").append(country);
        text.append(", contact=").append(contact);
        text.append(", contactNumber=").append(contactNumber);
        text.append(", email=").append(email);
        text.append(", institute=").append(institute);
        text.append(", location=").append(location);
        text.append(", specification=").append(specification);
        text.append(", performance=").append(performance);
        text.append(", application=").append(application);
        text.append(", description=").append(description);
        text.append(", purchaseDate=").append(purchaseDate);
        text.append(", price=").append(price);
        text.append(", feeStandard=").append(feeStandard);
        text.append(", imageUrl=").append(imageUrl);
        text.append(", imageLocal=").append(imageLocal);
        text.append(", province=").append(province);
        text.append(", dataSource=").append(dataSource);
        text.append(", recorder=").append(recorder);
        text.append(", recordDateTime=").append(recordDateTime);
        text.append(", nameEn=").append(nameEn);
        text.append(", postCode=").append(postCode);
        return text.append("]").toString();
    }

}
           

ShanxiProcessor.java

package com.demo.webmagic.processor;

import java.util.Date;
import java.util.List;

import com.demo.webmagic.bean.Device;
import com.demo.webmagic.util.ImageDownloader;
import com.demo.webmagic.util.ImageDownloader.ImgNameType;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

/**
 * WebMagic {@link PageProcessor} that walks the paginated device list on www.tydxyq.cn,
 * follows the links to device detail pages and turns each detail page into a {@link Device}.
 *
 * <p>NOTE(review): this listing has lost its numeric literals (empty initialisers, empty
 * {@code substring()} / {@code setRetryTimes()} / {@code thread()} argument lists, a dangling
 * {@code +}). The class does not compile as shown; the original values must be restored from
 * the real project before use.
 */
public class ShanxiProcessor implements PageProcessor {
    // Number of the next list page to request; shared by all instances and incremented on use.
    // NOTE(review): initial value missing. It is a plain static int mutated from process(),
    // which is not safe if the spider runs more than one thread — confirm the thread count.
    private static int currentPage = ;
    // Highest list page to crawl. NOTE(review): value missing from this listing.
    private static final int LAST_PAGE = ;

    // Folder passed to ImageDownloader as the save location for device images.
    public static final String SAVE_PATH = "D:/image/shanxi/";
    // Stored on every Device as its data source.
    public static final String DOMAIN = "http://www.tydxyq.cn";

    // Regular expressions used to classify a page URL as list page or detail page.
    // NOTE(review): both end in the regex `\.*` (zero or more literal dots), not `.*`;
    // looks unintended — confirm how Selectable.regex(...).match() treats a partial match.
    // URL_POST is not referenced anywhere in this class.
    public static final String URL_LIST = "http://www\\.tydxyq\\.cn/yqsb/list.asp\\?page=\\.*";
    public static final String URL_POST = "http://www\\.tydxyq\\.cn/yqsb/detail.asp\\?ID=\\.*";

    // Literal URL prefixes: PREFIX_LIST + page number builds a list-page URL,
    // PREFIX_POST is used to recognise detail-page links.
    public static final String PREFIX_LIST = "http://www.tydxyq.cn/yqsb/list.asp?page=";
    public static final String PREFIX_POST = "http://www.tydxyq.cn/yqsb/detail.asp?ID=";


    // Crawl settings for the target site: GBK charset, fixed domain and a desktop Chrome user agent.
    // NOTE(review): the retry count, timeout and sleep-time arguments are missing from this listing.
    private Site site = Site.me()
            .setRetryTimes()
            .setTimeOut()
            .setCharset("GBK")
            .setDomain("www.tydxyq.cn")
            .setSleepTime()
            .setUserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

    /**
     * Handles one downloaded page.
     *
     * <p>List pages (URL matches {@link #URL_LIST}) only enqueue further requests, and do
     * nothing once {@code currentPage} exceeds {@code LAST_PAGE}. Every other page is treated
     * as a device detail page: a {@link Device} is built and printed to standard output.
     * Persisting the device is commented out.
     *
     * @param page the page handed over by the WebMagic spider
     */
    @Override
    public void process(Page page) {
//       System.out.println(page.getHtml());

        if (page.getUrl().regex(URL_LIST).match()) {
            if (currentPage > LAST_PAGE) {
                return;
            }

            addTargetRequests(page);
            return;
        } 

        // Always true at this point because the list-page branch above returns.
        if (!page.getUrl().regex(URL_LIST).match()) {
            Device device = null;
            try {
                device = createDevice(page);
            } catch (Exception e1) {
                // Extraction failures are only printed; device stays null and is printed as "null" below.
                e1.printStackTrace();
            }

            System.out.println(device);

            // Placeholder for persistence; the try block is currently empty.
            try {
//              deviceService.add(device);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Enqueues every detail-page link found in the list page's {@code <tr align="center">} rows,
     * then enqueues the next list page and advances {@code currentPage}.
     *
     * @param page a list page
     */
    private void addTargetRequests(Page page) {
        List<String> urlList = page.getHtml().xpath("//tr[@align='center']").links().all();
        for (String urlString : urlList) {
            // Keep only links that point at a device detail page.
            if (urlString.contains(PREFIX_POST)) {
                page.addTargetRequest(urlString);
            }
        }
        page.addTargetRequest(PREFIX_LIST + currentPage++);
    }

    /**
     * Builds a {@link Device} from a detail page using fixed, position-based XPath expressions.
     *
     * <p>NOTE(review): every {@code substring()} call below has lost its offset argument in this
     * listing; presumably each one skipped a label prefix in the cell text — confirm against the
     * original project. The chained {@code toString().trim()} / {@code toString().substring(..)}
     * calls will throw a NullPointerException if {@code toString()} yields null for an XPath
     * that selects nothing — TODO confirm WebMagic's behaviour; the caller catches and prints it.
     *
     * @param page a device detail page
     * @return the populated device
     */
    private Device createDevice(Page page) {
        Html html = page.getHtml();
        Device device = new Device();

        device.setCode(createCode(page));
        String imageUrl = html.xpath("//table[4]//tr/td[2]/table[2]//a/@href").toString();
        device.setImageUrl(imageUrl);
        try {
            // Image download is best-effort: a failure is printed and imageLocal stays unset.
            String imageLocal = ImageDownloader.download(imageUrl, SAVE_PATH, ImgNameType.OBTAIN);
            device.setImageLocal(imageLocal);
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Values read from the first table inside the second cell.
        device.setName(html.xpath("//table[4]//tr/td[2]/table[1]//tr[1]/td//strong/text()").toString().trim());
        device.setModel(html.xpath("//table[4]//tr/td[2]/table[1]//tr[1]/td/text()").toString().substring());
//      device.setUnivercity(html.xpath("//table[4]//tr/td[2]/table[1]//tr[4]/td[2]/allText()").toString().trim());
        device.setPurchaseDate(html.xpath("//table[4]//tr/td[2]/table[1]//tr[9]/td[2]/text()").toString().trim());
        device.setPrice(html.xpath("//table[4]//tr/td[2]/table[1]//tr[10]/td[2]/text()").toString().trim());
        device.setApplication(html.xpath("//table[4]//tr/td[2]/table[1]//tr[14]/td[2]/allText()").toString().trim());

        // Values read from the nested table with id "tb1".
        device.setFeeStandard(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[3]/td/allText()").toString());
        device.setInstitute(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[4]/td/text()").toString().substring());
        device.setContact(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[5]/td/text()").toString().substring());
        device.setEmail(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[6]/td/text()").toString().substring());
        device.setContactNumber(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb1']//tr[7]/td/allText()").toString().substring());

        // Values read from the nested tables with ids "tb2" and "tb4".
        device.setSpecification(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb2']//td/allText()").toString().substring());
        device.setCountry(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb4']//tr[1]/td/text()").toString().substring());
        device.setManufacturer(html.xpath("//table[4]//tr/td[2]/table[3]//table[@id='tb4']//tr[2]/td/text()").toString().substring());

        // Constant bookkeeping values for every record produced by this processor.
        device.setProvince("山西省");
        device.setDataSource(DOMAIN);
        device.setRecorder("liuzhiguo");
        device.setRecordDateTime(new Date());

        return device;
    }

    /**
     * Derives the device code from the page URL by cutting at the last {@code '='}.
     *
     * <p>NOTE(review): the offset added to the index is missing from this listing.
     *
     * @param page a device detail page
     * @return the URL tail used as device code
     */
    private String createCode(Page page) {
        String urlString = page.getUrl().toString();
        return urlString.substring(urlString.lastIndexOf("=") + );
    }

    /** Returns the crawl settings configured in the {@code site} field. */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts the crawl from the list page numbered {@code currentPage} and advances the counter.
     *
     * <p>NOTE(review): the thread-count argument of {@code thread()} is missing from this listing.
     */
    public static void main(String[] args) {
        Spider.create(new ShanxiProcessor()).addUrl(PREFIX_LIST + currentPage++).thread().run();
    }
}
           

ImageDownloader.java

package com.demo.webmagic.util;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.UUID;

/**
 * Downloads a remote image into a local folder and returns the local file path.
 *
 * <p>Known limitation carried over from the original: image URLs containing non-ASCII
 * (for example Chinese) characters are passed to {@link URL} unencoded and may fail.
 */
public class ImageDownloader {
    /** Strategy used to pick the local file name. */
    public enum ImgNameType {
        /** Derive the name from the image URL. */
        OBTAIN,
        /** Use a freshly generated random UUID plus {@code .jpg}. */
        UUID
    }

    // NOTE(review): the published listing lost its numeric literals. The two values below are
    // assumptions (the listing only shows "<n> * <m>" and "new byte[<n>]"); adjust if the
    // original project used different ones.
    private static final int CONNECT_TIMEOUT_MILLIS = 5 * 1000;
    private static final int BUFFER_SIZE = 1024;

    /**
     * Downloads {@code imageUrl} into {@code savePath} unless the target file already exists.
     *
     * @param imageUrl    absolute URL of the image; {@code null} is tolerated
     * @param savePath    folder to store the image in; created if missing
     * @param imgNameType how the local file name is chosen
     * @return the local file path, or {@code null} when {@code imageUrl} is {@code null}
     * @throws Exception if the folder cannot be created or the transfer fails
     */
    public static String download(String imageUrl, String savePath, ImgNameType imgNameType) throws Exception {
        if (imageUrl == null) {
            return null;
        }

        String imageName = obtainImageName(imageUrl, imgNameType);
        String imgSavePath = createImgSavePath(savePath, imageName);
        if (new File(imgSavePath).exists()) {
            // Already downloaded earlier: skip the network round trip.
            System.out.println("圖檔已存在:" + imgSavePath);
            return imgSavePath;
        }

        downloadImage(imageUrl, imgSavePath);
        return imgSavePath;
    }

    /**
     * Streams the remote resource into {@code imgSavePath}.
     *
     * <p>Both streams are closed on every path. If the transfer fails, the partially written
     * file is removed so that a later call does not mistake it for a finished download.
     */
    private static void downloadImage(String imageUrl, String imgSavePath) throws IOException {
        URLConnection con = new URL(imageUrl).openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);

        File target = new File(imgSavePath);
        boolean completed = false;
        try (InputStream is = con.getInputStream();
             OutputStream os = new FileOutputStream(target)) {
            byte[] bs = new byte[BUFFER_SIZE];
            int len;
            while ((len = is.read(bs)) != -1) {
                os.write(bs, 0, len);
            }
            completed = true;
        } finally {
            if (!completed) {
                // Best effort: nothing useful can be done if the delete itself fails.
                target.delete();
            }
        }

        System.out.println("下載下傳完成");
    }

    /** Ensures the folder exists and joins it with the file name using the platform separator. */
    private static String createImgSavePath(String savePath, String imageName) throws IOException {
        File sf = createFolder(savePath);
        return new File(sf, imageName).getPath();
    }

    /**
     * Chooses the local file name.
     *
     * <p>For {@link ImgNameType#UUID} a random name is generated. Otherwise, a URL with a query
     * string yields the text after its last {@code '='} plus {@code .jpg}, and any other URL
     * yields its last path segment unchanged. URL-derived names are sanitised because they come
     * from scraped pages and must not be able to escape the save folder.
     */
    private static String obtainImageName(String urlString, ImgNameType imgNameType) {
        if (imgNameType == ImgNameType.UUID) {
            return UUID.randomUUID().toString() + ".jpg";
        }

        if (urlString.contains("?")) {
            return sanitizeFileName(urlString.substring(urlString.lastIndexOf("=") + 1)) + ".jpg";
        }

        return sanitizeFileName(urlString.substring(urlString.lastIndexOf("/") + 1));
    }

    /** Replaces path separators and other characters that are illegal in file names with '_'. */
    private static String sanitizeFileName(String rawName) {
        return rawName.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /** Returns {@code savePath} as a directory, creating it (and parents) when necessary. */
    private static File createFolder(String savePath) throws IOException {
        File sf = new File(savePath);
        // Re-check isDirectory() so a concurrent creator does not turn into a spurious failure.
        if (!sf.exists() && !sf.mkdirs() && !sf.isDirectory()) {
            throw new IOException("Cannot create image folder: " + savePath);
        }

        return sf;
    }

}
           

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Maven build descriptor for the WebMagic crawler demo. -->
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates; the build produces a plain jar. -->
    <groupId>com.demo</groupId>
    <artifactId>maven-webmagic</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>maven-webmagic</name>
    <url>http://maven.apache.org</url>

    <properties>
        <!-- Source files are read as UTF-8. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <!-- WebMagic core and extension modules, both pinned to 0.5.3. -->
    <dependencies>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.5.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.5.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Compile for Java 1.7 with UTF-8 sources. No groupId is given here, so Maven's
                 default plugin group (org.apache.maven.plugins) applies. -->
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Runs the source plugin's "jar" goal (execution id attach-sources) during the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
                <version>3.0.1</version>
                <executions>
                    <execution>
                        <id>attach-sources</id>
                        <goals>
                            <goal>jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

</project>
           

maven-webmagic 執行個體源碼

補充傳回資料為JSON格式

如果console輸出有轉義字元，那麼就不能直接使用JSONPath，需要將其轉化為JSONObject對象處理。

Maven_Webmagic 執行個體