Java crawler: scraping pages with jsoup

Requirement:

Crawl every mobile number prefix (号段) and find the carrier and region each one belongs to.
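Each line that writeInfo (below) emits has the form prefix:carrier-province-city, for example (values illustrative only, not actual crawl output):

1380010:移动-北京-北京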

Add the dependencies:

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.9.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
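
The implementation below also uses StringUtils from org.apache.commons.lang, so Commons Lang must be on the classpath as well (2.6 is the final 2.x release; any version providing org.apache.commons.lang.StringUtils will do):

        <dependency>
            <groupId>commons-lang</groupId>
            <artifactId>commons-lang</artifactId>
            <version>2.6</version>
        </dependency>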

Implementation:

package com.deeplinkJavaSpider.MainPageSpider;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;

public class NumberRegex {
    public static String outputPath = "C:\\Users\\lenovo\\Desktop\\a1.txt";

    /**
     * Opens a connection to the given URL and reads the full response body.
     *
     * @param urlString the URL to fetch
     * @return the response body as a StringBuilder
     * @throws IOException if connecting or reading fails
     */
    public static StringBuilder openUrl(String urlString) throws IOException {
        URL url = new URL(urlString);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        BufferedReader in = null;
        String userAgent = getUserAgent(); // pick a random User-Agent
        StringBuilder result = new StringBuilder();

        try {
            // Set common request headers
            connection.setRequestProperty("accept", "*/*");
            connection.setRequestProperty("connection", "Keep-Alive");
            connection.setRequestProperty("user-agent", userAgent);
            // Open the actual connection
            connection.connect();

            // Read the URL's response through a BufferedReader
            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "gbk")); // the target site is GBK-encoded; without this the platform default charset would be used
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } finally { // close the input stream in a finally block
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result;
    }

    /**
     * @return a User-Agent string chosen at random from src/main/resources/user-Agent.txt (one UA per line)
     * @throws IOException if the UA file cannot be read
     */
    public static String getUserAgent() throws IOException {
        List<String> list = new ArrayList<>();
        InputStreamReader reader = new InputStreamReader(new FileInputStream("src/main/resources/user-Agent.txt"));
        BufferedReader bufferedReader = new BufferedReader(reader);
        String lineTxt = null;
        while ((lineTxt = bufferedReader.readLine()) != null) {
            list.add(lineTxt);
        }
        bufferedReader.close();
        Random random = new Random();
        String userAgent = list.get(random.nextInt(list.size()));
        return userAgent;
    }
    /**
     * Fetches the main page and hands it off to the parser.
     *
     * @param urlString the URL string
     * @throws Exception
     */
    public static void doFetchPage(String urlString) throws Exception {
        StringBuilder result=openUrl(urlString);
        spiderMainPage(result.toString());
    }

    /**
     * Parses the fetched main page and collects the per-city links.
     *
     * @param html the HTML fetched from the main page
     * @throws Exception
     */
    public static void spiderMainPage(String html) throws Exception {
        Map<String, String> map = new HashMap<>();
        Document doc = Jsoup.parse(html);
        Elements elements1 = doc.select("div.fkce");
        Elements elements2 = elements1.select("div.fkt");
        for (Element e : elements2) {
            String province = e.select("div.fkbj").text();
            Elements elements3 = e.select("div.fklk").select("a");
            for (Element c : elements3) {
                String city = c.text();
                String href = c.attr("href"); // collect every link under div.fklk
                map.put(province + "-" + city, href);
            }
        }
        doFetchNextPage(map);
    }

    /**
     * @param map keyed by "province-city"; the corresponding link is the value
     * @throws Exception
     */
    public static void doFetchNextPage(Map<String, String> map) throws Exception {
        for (Map.Entry<String, String> entry : map.entrySet()) {
            StringBuilder result=openUrl(entry.getValue());
            spiderDeepLinkPage(result.toString(), entry.getKey());
        }
    }

    /**
     * Parses one per-city page and pairs each prefix block with its carrier and region.
     *
     * @param html the HTML fetched from the per-city link
     * @param province the "province-city" string
     * @throws Exception
     */
    public static void spiderDeepLinkPage(String html, String province) throws Exception {
        Map<String, String> map = new HashMap<>();
        List<String> listKey = new ArrayList<>();
        List<String> listValue = new ArrayList<>();
        Document doc = Jsoup.parse(html);
        Elements elements1 = doc.select("body > div.all > ul");
        Elements elements2 = doc.select("body > div.all > div > div.num_bg");
        for (Element element : elements1) {
            listKey.add(element.text());
        }
        for (Element element : elements2) {
            // substring(3, 5) pulls the two-character carrier name (e.g. 移动) out of the block title
            listValue.add(element.text().substring(3, 5) + "-" + province);
        }
        for (int i = 0; i < listKey.size(); i++) { // merge the two lists into one map; a Java 8 stream would be cleaner (see the sketch after the listing)
            map.put(listKey.get(i), listValue.get(i));
        }
        writeInfo(map);
    }

    /**
     * @param map key: a space-separated string of number prefixes; value: carrier-province-city
     * @throws Exception
     */
    public static void writeInfo(Map<String, String> map) throws Exception {
        File outputFile = new File(outputPath);
        // Append mode; FileWriter creates the file if it does not already exist
        BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(outputFile, true));
        for (Map.Entry<String, String> entry : map.entrySet()) {
            String[] str = entry.getKey().split(" ");
            for (String s : str) {
                if (StringUtils.isNotBlank(s)) {
                    bufferedWriter.write(s + ":" + entry.getValue() + "\n");
                }
            }
        }
        bufferedWriter.close();
    }



    public static void main(String[] args) throws Exception {
        doFetchPage("http://www.51hao.cc/");
    }
}
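
As the comment in spiderDeepLinkPage notes, a Java 8 stream merges the two lists into a map more cleanly. A minimal sketch, assuming (like the loop above) that listKey and listValue have equal length:

import java.util.stream.Collectors;
import java.util.stream.IntStream;

Map<String, String> map = IntStream.range(0, listKey.size())
        .boxed()
        .collect(Collectors.toMap(
                listKey::get,     // key: the prefix string
                listValue::get,   // value: carrier-province-city
                (a, b) -> b));    // on duplicate keys keep the later value, matching repeated map.put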

Result screenshot: (image not included)

This code opens the connection with HttpURLConnection connection = (HttpURLConnection) url.openConnection();

Besides HttpURLConnection, you can also use HttpClient.

Opening a connection with HttpClient:

    // Requires (httpclient 4.x): org.apache.http.HttpResponse, org.apache.http.client.HttpClient,
    // org.apache.http.client.methods.HttpPost, org.apache.http.entity.StringEntity
    public static HttpResponse doPostConnect(HttpClient httpClient, String url, String params, String encoding) throws Exception {
        // Build the HTTP POST request
        HttpPost httpPost = new HttpPost(url);
        // Optional request headers would be set here, e.g.:
        // httpPost.setHeader("MaxRTB-version", "1.0");
        // Set the POST body
        StringEntity paramsEntity = new StringEntity(params, encoding);
        paramsEntity.setContentType("application/x-www-form-urlencoded");
        httpPost.setEntity(paramsEntity);
        HttpResponse res = httpClient.execute(httpPost);
        return res; // HttpResponse
    }
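
A minimal usage sketch for doPostConnect (the URL and form body are placeholders, not values from the original post; HttpClients lives in org.apache.http.impl.client and EntityUtils in org.apache.http.util, both shipped with httpclient 4.x):

        HttpClient httpClient = HttpClients.createDefault(); // default client, no custom configuration
        HttpResponse res = doPostConnect(httpClient, "http://example.com/query", "num=1380000", "utf-8");
        // Read the body as a string; the charset must match the server's response encoding
        String body = EntityUtils.toString(res.getEntity(), "utf-8");
        System.out.println(res.getStatusLine().getStatusCode() + " " + body);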

Source: blog.csdn.net/learner_up/article/details/86521322