大三上寒假15天--第15天

今天学习 WebMagic 爬虫又学到了一个小技巧:想要自己设计爬取内容的保存形式,可以不用重写 Pipeline,直接在 process() 方法中写上你想要的保存操作,多数情况下可以达到相同的效果。我的爬虫程序想要将内容保存在一个 txt 文件中,就是这么实现的,个人感觉简单很多。这个技巧也是看了网上高手的文章才学到的,受益匪浅。

爬取北京政府信件的爬虫到此就完成了,全部代码如下。我的保存特点是以空格隔开不同的信息,方便导入数据库:

package my.webmagic2;



import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.utils.HttpConstant;

public class text implements PageProcessor{
    public static int h=1;
    private Site site=Site.me().setRetrySleepTime(3).setSleepTime(100);
    public int check=0;
    /**
     * @param args
     */
    public Site getSite() {
        // TODO Auto-generated method stub
        return site;
    }
    public void process(Page page) {
        // TODO Auto-generated method stub
        if(check==0){
            check++;
            String[] str1=page.getHtml().regex("\"letter_type\":\"[^,]+").all().toString().split(",");
            String[] str2=page.getHtml().regex("\"original_id\":\"[^,]+").all().toString().split(",");
            int len1,len2;
            for(int i=0;i<str1.length-1;i++){
                len1=str1[i].length()-1;
                str1[i]=str1[i].substring(16,len1);
                len2=str2[i].length()-1;
                str2[i]=str2[i].substring(16,len2);
            }
            str1[str1.length-1]= str1[str1.length-1].substring(16,str1[str1.length-1].length()-2);
            str2[str2.length-1]= str2[str2.length-1].substring(16,str2[str2.length-1].length()-2);
            for(int i=0;i<str2.length;i++){
                if(str1[i].equals("咨询")){
                    page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="+str2[i]);
                 }
                 else if(str1[i].equals("建议")){
                     page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="+str2[i]);
                 }
                 else if(str1[i].equals("投诉")){
                     page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId="+str2[i]);
                 }
                 else{
                     page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId="+str2[i]);
                 }
            }
        }
        else{
            File file=new File("/home/hadoop/xinjian");
            try {
                FileWriter w=new FileWriter(file,true);
            if(page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[1]/div[2]/strong").toString()!=null){
                String hf=page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[2]/div/div[1]/div[2]").toString();
                hf=hf.replace("<div class=\"col-xs-12 col-md-12 column p-4 text-muted my-3\">","" );
                hf=hf.replace("</div>", "");
                hf=hf.replaceAll("&nbsp;", "");
                hf=hf.replaceAll("<p>", "");
                hf=hf.replaceAll("</p>","");
                hf=hf.replaceAll(" ", "");
                hf=hf.replaceAll("\n", "");
                w.write(page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[1]/div[2]/strong/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[2]/div[1]/text()").toString().substring(4).replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[2]/div[2]/text()").toString().substring(3).replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[2]/div[3]/label/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[3]/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[2]/div/div[1]/div[1]/div[2]/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[2]/div/div[1]/div[1]/div[3]/text()").toString().substring(5).replaceAll(" ","")
                        +" "
                        +hf
                        +"\n"
                            );
                w.close();
                h++;
            }else if(page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[1]/div[2]/strong").toString()!=null){
                String hf=page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[2]/div/div[1]/div[2]").toString();
                hf=hf.replace("<div class=\"col-xs-12 col-md-12 column p-4 text-muted my-3\">","" );
                hf=hf.replace("</div>", "");
                hf=hf.replaceAll("&nbsp;", " ");
                hf=hf.replaceAll("<p>", "");
                hf=hf.replaceAll("</p>","");
                hf=hf.replaceAll(" ", "");
                hf=hf.replaceAll("\n", "");
                w.write(
                        page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[1]/div[2]/strong/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[2]/div[1]/text()").toString().substring(4).replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[2]/div[2]/text()").toString().substring(3).replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[2]/div[3]/label/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[3]/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[2]/div/div[1]/div[1]/div[2]/text()").toString().replaceAll(" ","")
                        +" "
                        +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[2]/div/div[1]/div[1]/div[3]/text()").toString().substring(5).replaceAll(" ","")
                        +" "
                        +hf
                        +"\n"
                            );
                w.close();
                h++;
            }else{
                page.putField("all", page.getHtml().toString());
                w.close();
            }
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
            
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        int j=0;
        for(int i=0;i<=5586;i++){
            j=i*6;
            // TODO Auto-generated method stub
            Request request = new Request("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.replyMailList.biz.ext");
            request.setMethod(HttpConstant.Method.POST);
            request.setRequestBody(HttpRequestBody.json("{'PageCond/begin':"+j+",'PageCond/length':6,'PageCond/isCount':'true','keywords':'','orgids':'','startDate':'','endDate':'','letterType':'2','letterStatue':''}","utf-8"));
            Spider.create(new text())
            .addRequest(request)
.addPipeline(new FilePipeline("./xinjian/")) .setScheduler(
new FileCacheQueueScheduler("./xinjian/")) .thread(5) .run(); System.out.println("完成"+i); } System.out.println("全部完成"); } }

猜你喜欢

转载自www.cnblogs.com/my---world/p/12313824.html