webmagic学习之路-2:采集安居客经纪人列表

相比第 1 篇,这次稍微成熟了一点,会用的东西也多了一些。
正则用得不好,还有很多东西不会,大神轻喷!



package com.action;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.management.JMException;
import javax.swing.plaf.synth.SynthSpinnerUI;

import org.bson.Document;

import com.model.AgentListModel;
import com.model.Model_AnjukeList;
import com.mongodb.BasicDBObject;
import com.util.Constants;
import com.util.GetDate;
import com.util.MysqlUtils;
import com.util.MD5With32;
import com.util.MongoDBUtil;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.monitor.SpiderMonitor;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

/**
 * WebMagic {@link PageProcessor} that walks the Anjuke "tycoon" pages, extracts
 * the agent cards found on listing pages and hands them to
 * {@code MysqlUtils.InsertAnjukeAgent}.
 *
 * <p>{@link #main(String[])} runs the spider with 20 worker threads against a
 * single processor instance, so {@link #process(Page)} keeps all per-page state
 * in local variables instead of shared static fields.
 */
public class GetAnjukeAgentList implements PageProcessor {

    /** URL shape of a listing page ({@code .../tycoon/<a>-q-<b>/}); these pages carry the agent cards. */
    private static final String LISTING_URL_REGEX = "https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/";
    /** URL shape of a first-level page ({@code .../tycoon/<a>/}). */
    private static final String AREA_URL_REGEX = "https://[a-z]+.anjuke.com/tycoon/[a-z]+/";
    /** Shape of a paginated listing link ({@code .../tycoon/<a>-q-<b>/p<N>/}). */
    private static final String PAGED_LISTING_URL_REGEX = "https://[a-z]+.anjuke.com/tycoon/[a-z]+-q-[a-z]+/p[0-9]+/";

    // Legacy package-visible fields. process() no longer reads or writes them; they are
    // retained only so that any other class in the package that references them still compiles.
    static AgentListModel anjukeList;
    static List<String> list = new ArrayList<String>();
    static List<AgentListModel> list_insert = new ArrayList<AgentListModel>();
    static BasicDBObject  doc = null;

    private final Site site = Site.me().setSleepTime(1000).setRetryTimes(3).setCharset("UTF-8")
            .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");

    /**
     * @return the crawl settings (sleep time, retry count, charset, user agent) used by the spider
     */
    @Override
    public Site getSite() {
        return this.site;
    }

    /**
     * Handles one downloaded page.
     *
     * <p>Pages whose URL is not a listing URL only contribute further links to crawl.
     * Listing pages have their agent cards extracted and inserted together with the
     * three breadcrumb labels, after which pagination links are queued.
     *
     * @param page the downloaded page supplied by WebMagic
     */
    @Override
    public void process(Page page) {
        System.out.println("code:"+page.getStatusCode());
        System.out.println(page.getUrl());

        if (!page.getUrl().regex(LISTING_URL_REGEX).match()) {
            enqueueNavigationLinks(page);
            return;
        }

        // Divider line referenced by the article text that follows the code ("分割线").
        // The agent cards are extracted first and the breadcrumbs second, as in the original.
        List<AgentListModel> agents = extractAgents(page);

        String city = breadcrumbLabel(page, 2);
        String zone = breadcrumbLabel(page, 3);
        String street = breadcrumbLabel(page, 4);
        MysqlUtils.InsertAnjukeAgent(agents, city, zone, street);

        // Note: this tests the page HTML (not the URL) for any paginated listing link.
        if (page.getHtml().regex(PAGED_LISTING_URL_REGEX).match()) {
            // Queue the pagination links.
            page.addTargetRequests(page.getHtml().xpath("//div[@class='page-content']/div/a/@href").all());
        }
    }

    /** Queues the links found on a non-listing page so the crawl can reach listing pages. */
    private void enqueueNavigationLinks(Page page) {
        page.addTargetRequests(page.getHtml().xpath("//span[@class='elems-l']/a/@href").regex(AREA_URL_REGEX).all());
        if (page.getUrl().regex(AREA_URL_REGEX).match()) {
            page.addTargetRequests(page.getHtml().xpath("//div[@class='sub-items']/a/@href").all());
        }
    }

    /**
     * Builds one {@link AgentListModel} per {@code jjr-info} card on the page.
     *
     * @return a list owned by the caller; never shared between threads
     */
    private List<AgentListModel> extractAgents(Page page) {
        List<AgentListModel> agents = new ArrayList<AgentListModel>();
        String pageUrl = page.getUrl().toString();
        for (Selectable card : page.getHtml().xpath("//div[@class='jjr-info']").nodes()) {
            String name = card.xpath("//div/h3/a/text()").get();
            String staffNo = card.xpath("//div/h3/a/@href").get();
            String company = card.xpath("//p[@class='jjr-desc']/a[1]/text()").get();
            String companyUrl = card.xpath("//p[@class='jjr-desc']/a[1]/@href").get();
            String store = card.xpath("//p[@class='jjr-desc']/a[2]/text()").get();
            String storeUrl = card.xpath("//p[@class='jjr-desc']/a[2]/@href").get();
            agents.add(new AgentListModel("", "", "", "", "anjuke", GetDate.getDay0(), pageUrl,
                    name, staffNo, company, companyUrl, store, storeUrl));
        }
        return agents;
    }

    /**
     * Reads the text of the {@code index}-th breadcrumb anchor with the "经纪人" suffix removed.
     *
     * @param index 1-based position of the anchor inside the breadcrumb container
     * @return the label, or an empty string when the anchor is missing (the original code
     *         threw a {@code NullPointerException} here and skipped the insert for the page)
     */
    private String breadcrumbLabel(Page page, int index) {
        String text = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[" + index + "]/text()").get();
        // NOTE(review): an empty string is assumed to be acceptable to MysqlUtils.InsertAnjukeAgent — confirm.
        return text == null ? "" : text.replace("经纪人", "");
    }

    /**
     * Starts one spider per seed URL, each with a console pipeline and 20 threads,
     * and blocks until it finishes.
     */
    public static void main(String[] args) {
        List<String> seedUrls = new ArrayList<String>();
        seedUrls.add("https://chongqing.anjuke.com/tycoon/");
        for (String seedUrl : seedUrls) {
            Spider.create(new GetAnjukeAgentList())
                    .addUrl(seedUrl)
                    .addPipeline(new ConsolePipeline())
                    .thread(20)
                    .run();
        }
    }
}

这段代码有个很大的疑问,不知道有没有大神给解释一下。

String city = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[2]/text()").get().replace("经纪人", "");
String zone = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[3]/text()").get().replace("经纪人", "");
String street = page.getHtml().xpath("//div[@class='p_1180 p_crumbs']/a[4]/text()").get().replace("经纪人", "");

如果把上面这 3 行的位置移动一下,移到代码中"分割线"那行注释的上面去,

这 3 个 xpath 就会匹配不到内容。我研究了很长时间也没搞明白原因,就没有再研究下去了。

评论区留言告知下,谢谢!!

 

猜你喜欢

转载自www.cnblogs.com/tnsay/p/10895325.html