Crawling 01fy.cn property listings with WebMagic, using a pool of free proxy IPs

1.开机自启动爬虫

@CrossOrigin
@RestController
@RequestMapping("/property")
public class PropertyController implements CommandLineRunner {

    @Autowired
    private PropertyDaoPipeLine01 diYiPropertyDaoPipeLine;

    @Override
    public void run(String... args) throws Exception {
        property01();
    }

    @GetMapping("/start01")
    public void property01() {
        Spider.create(new PropertyPageProcessor01())
                .addUrl("http://wh.01fy.cn/sale/list_2_0_0_0-0_0_0-0_0_0_0-0_0_0-0_2_0_1_.html")
                .addPipeline(diYiPropertyDaoPipeLine)
                .thread(1)
                .setExitWhenComplete(true)
                .setDownloader(Downloader.newIpDownloader())
                .runAsync();
    }
2.实体类

import org.springframework.data.annotation.Id;

/**
 * One scraped property listing, as filled in by the page processor and handed to the
 * persistence pipeline. Accessors are presumably generated by Lombok's {@code @Data}.
 */
@Data
public class Property {

    @Id
    private Long id;

    /**
     * Contact person (user_name)
     */
    private String userName;

    /**
     * Contact phone number (user_mobile)
     */
    private String userMobile;

    /**
     * Name of the estate / development (estate_name)
     */
    private String estateName;

    /**
     * Listing title (property_title)
     */
    private String propertyTitle;

    /**
     * District / area (area_name)
     */
    private String areaName;

    /**
     * Layout: number of bedrooms, living rooms and bathrooms (house_type)
     */
    private String houseType;

    /**
     * Floor area (size)
     */
    private String size;

    /**
     * Total price (price)
     */
    private String price;

    /**
     * Unit price (ava_price)
     */
    private String avaPrice;

    /**
     * URL of the listing's detail page (source_url)
     */
    private String sourceUrl;

    /**
     * Status code. The original note lists, in order: "not viewed", "1 viewed",
     * "2 not synced", "3 synced", "4 deleted".
     * NOTE(review): the numeric code for "not viewed" is not stated (possibly 0) — confirm.
     */
    private Integer status;
}

3.自定义Agent 

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Supplies User-Agent header values picked at random from the classpath resource
 * {@code user-agent/User-Agents.txt}.
 *
 * <p>The resource is read once, when the class is initialised. Blank lines and lines starting
 * with {@code #} are ignored. If the resource is missing, unreadable or contains no usable
 * entry, a built-in default User-Agent is returned instead.
 *
 * <p>The loaded list is never modified after class initialisation, so lookups need no locking.
 *
 * @author wongH
 * @date 2019/5/7 9:51
 * @Version 1.0
 */
public class Agent {
    private static final String AGENT_FILE_PATH = "user-agent/User-Agents.txt";

    /** Returned whenever no User-Agent could be loaded from the resource file. */
    private static final String DEFAULT_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";

    /** Unmodifiable snapshot of the usable lines of the resource file. */
    private static final List<String> AGENTS = loadAgents();

    /**
     * Returns a randomly chosen User-Agent and echoes it to {@code System.err}.
     *
     * @return one loaded User-Agent, or the built-in default when none are available
     */
    public static String getRandom() {
        String random = getRandom(null);
        System.err.println("Agent======================>" + random);
        return random;
    }

    /**
     * Picks a User-Agent.
     *
     * @param agent a fixed value to return instead of a random one; {@code null} means "random"
     * @return the default when nothing was loaded, otherwise {@code agent} if non-null,
     *         otherwise a random loaded entry
     */
    private static String getRandom(String agent) {
        if (AGENTS.isEmpty()) {
            return DEFAULT_AGENT;
        }
        if (agent != null) {
            return agent;
        }
        return AGENTS.get(ThreadLocalRandom.current().nextInt(AGENTS.size()));
    }

    /**
     * Reads the resource file into an unmodifiable list, skipping blank and comment lines.
     *
     * @return the usable entries; empty when the resource is missing or cannot be read
     */
    private static List<String> loadAgents() {
        List<String> loaded = new ArrayList<>();
        InputStream resource = Agent.class.getClassLoader().getResourceAsStream(AGENT_FILE_PATH);
        if (resource == null) {
            System.err.println("User-Agent resource not found on classpath: " + AGENT_FILE_PATH);
            return Collections.unmodifiableList(loaded);
        }
        // Closing the reader also closes the wrapped streams.
        try (BufferedReader reader =
                     new BufferedReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String candidate = line.trim();
                // An empty entry would be sent as an empty User-Agent header, so drop it.
                if (!candidate.isEmpty() && !candidate.startsWith("#")) {
                    loaded.add(candidate);
                }
            }
        } catch (IOException e) {
            // Keep whatever was read before the failure; callers fall back to the default if empty.
            e.printStackTrace();
        }
        return Collections.unmodifiableList(loaded);
    }
}


附上User-Agents.txt内容
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60
Opera/8.0 (Windows NT 5.1; U; en)
Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0
Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36
4.爬免费ip 并检测可用性
 
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.util.List;

/**
 * Keeps the Redis list {@code "ip"} stocked with working free HTTP proxies.
 *
 * <p>Two scheduled jobs cooperate: {@link #ips()} scrapes candidate {@code host:port} pairs and
 * stores those that pass {@link #ifUseless(String)}, while {@link #update()} re-tests the stored
 * entries and removes the ones that stopped working.
 *
 * @author wongH
 * @date 2019/5/7 11:17
 * @Version 1.0
 */
@Component
public class UpdateIp {

    /** Connect timeout (ms) when probing a proxy. */
    private static final int CONNECT_TIMEOUT_MS = 2000;

    /** Read timeout (ms) when probing a proxy, so a silent proxy cannot block the scheduler. */
    private static final int READ_TIMEOUT_MS = 3000;

    @Autowired
    private RedisTemplate redisTemplate;

    /**
     * Every 20 seconds: re-tests each stored proxy and removes those that no longer work.
     */
    @Scheduled(cron = "*/20 * * * * ?")
    void update() {
        List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
        if (range == null) {
            return;
        }
        for (String ip : range) {
            if (ifUseless(ip)) {
                System.err.println(ip + "  从redis移除");
                // count 0 removes every occurrence of this value
                redisTemplate.opsForList().remove("ip", 0, ip);
            }
        }
    }

    /**
     * Every 15 seconds: scrapes the proxy listing page and stores each working proxy that is not
     * already in the pool. Once the pool holds more than 100 entries, the entry at the right end
     * of the list is evicted before the new one is pushed on the left.
     */
    @Scheduled(cron = "*/15 * * * * ?")
    void ips() {
        try {
            Document document = Jsoup.connect("https://www.xicidaili.com/nn").timeout(3000).get();
            Elements tags = document.select("#ip_list > tbody > tr");
            for (Element element : tags) {
                // second column: IP address; third column: port
                String host = element.select("tr > td:nth-child(2)").text();
                String port = element.select("tr > td:nth-child(3)").text();
                if (StringUtils.isBlank(host) || StringUtils.isBlank(port)) {
                    continue;
                }
                String candidate = host + ":" + port;
                if (ifUseless(candidate)) {
                    continue;
                }
                List<String> range = redisTemplate.opsForList().range("ip", 0, -1);
                if (range != null && range.contains(candidate)) {
                    continue;
                }
                System.err.println(candidate + "  存进redis");
                Long size = redisTemplate.opsForList().size("ip");
                if (size != null && size > 100) {
                    // rightPopAndLeftPush(source, destination) takes a destination KEY, not a
                    // value, so it cannot be used to insert the candidate; evict then push.
                    redisTemplate.opsForList().rightPop("ip");
                }
                redisTemplate.opsForList().leftPush("ip", candidate);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Probes a proxy by fetching {@code http://www.baidu.com} through it.
     *
     * @param ip proxy in {@code host:port} form
     * @return {@code true} when the proxy is unusable (malformed entry, connection or read
     *         failure, or a response that does not contain "baidu"); {@code false} when it works
     */
    boolean ifUseless(String ip) {
        if (ip == null) {
            return true;
        }
        String[] split = ip.split(":");
        if (split.length < 2) {
            return true;
        }
        try {
            URL url = new URL("http://www.baidu.com");
            InetSocketAddress addr = new InetSocketAddress(split[0], Integer.parseInt(split[1]));
            Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);
            URLConnection conn = url.openConnection(proxy);
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            // try-with-resources: the response stream is closed whether or not the read succeeds
            try (InputStream in = conn.getInputStream()) {
                String body = IOUtils.toString(in, "UTF-8");
                return !body.contains("baidu");
            }
        } catch (Exception e) {
            // Any failure (bad port, refused connection, timeout, ...) means "do not use".
            return true;
        }
    }
}

5.自定义 HttpClientDownloader


import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;

import java.util.Random;


/**
 * @author wongH
 * @date 2019/5/7 9:21
 * @Version 1.0
 */
@Component
public class Downloader{

    private static RedisTemplate redisTemplate;

    @Autowired
    IProxyPool(RedisTemplate redisTemplate) {
        this.redisTemplate = redisTemplate;
    }

    public static HttpClientDownloader newIpDownloader() {
        HttpClientDownloader downloader = new HttpClientDownloader() {
            @Override
            protected void onError(Request request) {
                String[] ips = newIp();
                setProxyProvider(SimpleProxyProvider.from(new Proxy(ips[0], Integer.parseInt(ips[1]))));
            }
        };
        return downloader;
    }

    static String[] newIp() {
        Long size = redisTemplate.opsForList().size("ip");
        String ip = redisTemplate.opsForList().index("ip", new Random().nextInt(size.intValue())).toString();
        System.err.println("获取ip===========>" + ip);
        String[] ips = ip.split(":");
        return ips;
    }
}

6. 页面处理

import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import tech.liveeasy.spider.property.util.Agent;
import tech.liveeasy.spider.property.service.bean.Property;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic page processor for the wh.01fy.cn sale listings.
 *
 * <p>URLs starting with {@link #LIST} are treated as listing pages: their detail links and the
 * next page are queued. Every other URL is treated as a detail page and parsed into a
 * {@link Property}, which is published under the result field {@code "property"}.
 */
@Component
public class PropertyPageProcessor01 implements PageProcessor {

    private String agent = Agent.getRandom();
    // Site settings: charset, retry counts, delay between requests (ms) and User-Agent
    private Site site = Site.me().setCharset("UTF-8").setRetryTimes(3).setCycleRetryTimes(3).setSleepTime(30000).setUserAgent(agent);

    private static final String LIST = "http://wh.01fy.cn/sale/list";

    private static final String startList = "http://wh.01fy.cn/sale/list_2_0_0_0-0_0_0-0_0_0_0-0_0_0-0_2_0_1_.html";

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Dispatches the page to the listing or the detail handler based on its URL prefix.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        if (page.getUrl().toString().startsWith(LIST)) {
            processListPage(page);
        } else {
            processDetailPage(page);
        }
    }

    /** Queues every detail link on a listing page plus the next page (or the first page again). */
    private void processListPage(Page page) {
        page.addTargetRequests(page.getHtml().xpath("//*[@id=\"list\"]/ul/li/div[2]/div[1]/a/@href").all());
        String next = text(page, "//*[@id=\"page\"]/div/ul/ul/li[11]/a/@href");
        if (StringUtils.isNotBlank(next)) {
            page.addTargetRequest(next);
        } else {
            // No next link: queue the first listing page again.
            page.addTargetRequest(startList);
        }
        // To export list-level data as well, put a custom "propertys" field here.
    }

    /** Extracts one listing from a detail page and publishes it as the "property" field. */
    private void processDetailPage(Page page) {
        Property property = new Property();
        property.setPropertyTitle(text(page, "//*[@id=\"content\"]/h1/text()"));
        property.setPrice(text(page, "//div[@class=cr_left]/dl[1]/dd/span[@class=price]/b/text()"));
        property.setAvaPrice(stripFirstAndLast(text(page, "//div[@class=cr_left]/dl[1]/dd/text()")));

        fillHouseTypeAndSize(property, text(page, "//*[@id=\"content\"]/div[3]/div[2]/dl[2]/dd/text()"));

        property.setSourceUrl(page.getUrl().toString());
        property.setEstateName(text(page, "//*[@id=\"content\"]/div[3]/div[2]/dl[3]/dd/text()"));

        String areaName = text(page, "//*[@id=\"content\"]/div[3]/div[2]/dl[4]/dd/text()");
        if (StringUtils.isNotBlank(areaName)) {
            areaName = areaName.replace(" ", "");
        }
        property.setAreaName(areaName);

        // The contact name sits in the second-to-last <dl> of the block.
        int dlSize = page.getHtml().xpath("//*[@id=\"content\"]/div[3]/div[@class=cr_left]/dl").nodes().size() - 1;
        String userName = null;
        if (dlSize > 0) {
            userName = text(page, "//*[@id=\"content\"]/div[3]/div[2]/dl[" + dlSize + "]/dd/text()");
        }
        if (StringUtils.isNotBlank(userName)) {
            // Literal replace: as a regex, "(个人)" is a capture group and would leave the
            // parentheses behind. Both ASCII and full-width parentheses are stripped.
            userName = userName.replace("(个人)", "").replace("（个人）", "");
        }
        property.setUserName(userName);

        String mobile1 = text(page, "//div[@class=telephone]/span[@class=redtelphone]/text()");
        String mobile2 = text(page, "//div[@class=telephone]/span[@class=redtelphone]/span[@class=mobile-split]/text()");
        // The nested span's text is re-inserted after the first three characters of the outer text.
        if (StringUtils.isNotBlank(mobile1) && StringUtils.isNotBlank(mobile2) && mobile1.length() >= 3) {
            property.setUserMobile(mobile1.substring(0, 3) + mobile2 + mobile1.substring(3));
        }
        page.putField("property", property);
    }

    /** Evaluates an XPath on the page and returns its first match as text, or null if none. */
    private static String text(Page page, String xpath) {
        return page.getHtml().xpath(xpath).toString();
    }

    /**
     * Removes spaces and then the first and last character (presumably wrapping brackets around
     * the unit price — confirm against the page). Returns null for blank input; a value shorter
     * than two characters is returned unchanged.
     */
    private static String stripFirstAndLast(String raw) {
        if (StringUtils.isBlank(raw)) {
            return null;
        }
        String compact = raw.replace(" ", "");
        if (compact.length() < 2) {
            return compact;
        }
        return compact.substring(1, compact.length() - 1);
    }

    /**
     * Splits the combined "layout + area" text into the house type and the size.
     * Leaves both fields unset when the text is blank or has no recognisable separator.
     */
    private static void fillHouseTypeAndSize(Property property, String raw) {
        if (StringUtils.isBlank(raw)) {
            return;
        }
        String compact = raw.replace(" ", "");
        // NOTE(review): "?" may stand in for a character lost to an encoding problem — confirm.
        int question = compact.indexOf("?");
        if (question >= 0) {
            property.setHouseType(compact.substring(0, question));
            property.setSize(compact.substring(question + 1));
            return;
        }
        int bath = compact.indexOf("卫");
        if (bath < 0) {
            return;
        }
        property.setHouseType(compact.substring(0, bath + 1));
        // Skip the single separator character after "卫", without running past the end.
        int sizeStart = Math.min(bath + 2, compact.length());
        property.setSize(compact.substring(sizeStart).replace("㎡", ""));
    }

}
7.数据处理存数据库
 

import org.springframework.stereotype.Component;
import tech.liveeasy.spider.property.service.bean.Property;
import tech.liveeasy.spider.property.service.bean.PropertyMapper;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.utils.FilePersistentBase;

import javax.annotation.Resource;

/**
 * WebMagic pipeline that persists each scraped {@link Property} through {@link PropertyMapper}.
 */
@Component
public class PropertyDaoPipeLine01 extends FilePersistentBase implements Pipeline {

    @Resource
    private PropertyMapper propertyMapper;

    /**
     * Saves the listing stored under the {@code "property"} result field, if there is one.
     *
     * @param resultItems fields published by the page processor for one page
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        // For list-level data, read a "propertys" field here and batch-save it instead.
        Property property = resultItems.get("property");
        if (property == null) {
            // Pages that publish no "property" field (e.g. listing pages) have nothing to save.
            return;
        }
        propertyMapper.save(property);
    }
}

Here Insert Picture Description

Published 97 original articles · won praise 44 · Views 300,000 +

Guess you like

Origin blog.csdn.net/wangh92/article/details/89944794