httpClient及jsoup抓取解析网页数据

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;

import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * httpClient及jsoup抓取解析网页数据
 */
public class AlmanacUtils {

    /**
     * 单例工具类
     */
    private AlmanacUtils() {
    }

    /**
     * 爬取网页信息指定内容
     * @param urlOrStr
     * @param i //1=按网址读取 2=按字符串读取
     * @return
     */
    public static String getAlmanac(String urlOrStr,int i){
        String html="";
        if (i == 1) {
            String url=urlOrStr;
            //根据网站抓取网页信息
            html=pickData(url);
        }else{
            html = urlOrStr;
        }
        //根据网页信息获取指定内容
        String almanac=analyzeHTMLByString(html);
        return almanac;
    }

    /*
     * 爬取网页信息
     */
    private static String pickData(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        try {
            HttpGet httpget = new HttpGet(url);
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                // 获取响应实体
                HttpEntity entity = response.getEntity();
                // 打印响应状态
                if (entity != null) {
                    return EntityUtils.toString(entity);
                }
            } finally {
                response.close();
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 关闭连接,释放资源
            try {
                httpclient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /*
     * 使用jsoup解析网页信息获取指定内容
     */
    private static String analyzeHTMLByString(String html){
        String solarDate,lunarDate,chineseAra,should,avoid=" ";
        Document document = Jsoup.parse(html);
        //公历时间
        solarDate=getSolarDate();
        //通过id获取html节点对象
        Element eLunarDate=document.getElementById("info_nong");
        //获取子节点,并截取子节点的内容拼成新的内容
        lunarDate=eLunarDate.child(0).html().substring(1,3)+eLunarDate.html().substring(11);
        //通过id获取html节点对象
        Element eChineseAra=document.getElementById("info_chang");
        //获取该节点的文本内容
        chineseAra=eChineseAra.text().toString();
        //宜
        should=getSuggestion(document,"yi");
        //忌
        avoid=getSuggestion(document,"ji");
        return chineseAra+should + avoid;
    }
    /*
     * 根据id获取html节点对象里面的标签里的内容
     */
    private static String getSuggestion(Document doc,String id){
        //通过id获取html节点对象
        Element element=doc.getElementById(id);
        //获取节点对象里面的所有a标签对象
        Elements elements=element.getElementsByTag("a");
        StringBuffer sb=new StringBuffer();
        //循环获取a标签里面的文本内容
        for (Element e : elements) {
            sb.append(e.text()+" ");
        }
        return sb.toString();
    }

    /*
     * 获取公历时间,用yyyy年MM月dd日 EEEE格式表示。
     * @return yyyy年MM月dd日 EEEE
     */
    private static String getSolarDate() {
        Calendar calendar = Calendar.getInstance();
        Date solarDate = calendar.getTime();
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy年MM月dd日 EEEE");
        return formatter.format(solarDate);
    }

    public static void main(String[] args) {

        System.out.println(getAlmanac("http://tools.2345.com/rili.htm",1));
        //String data = "<div class=\"s_form_wrapper soutu-env-nomac soutu-env-index\"><style>.index-logo-srcnew {display: none;}@media (-webkit-min-device-pixel-ratio: 2),(min--moz-device-pixel-ratio: 2),(-o-min-device-pixel-ratio: 2),(min-device-pixel-ratio: 2){.index-logo-src {display: none;}.index-logo-srcnew {display: inline;}}</style><div id=\"lg\"><img hidefocus=\"true\" class=\"index-logo-src\" src=\"//www.baidu.com/img/bd_logo1.png\" width=\"270\" height=\"129\" usemap=\"#mp\"><img hidefocus=\"true\" class=\"index-logo-srcnew\" src=\"//www.baidu.com/img/bd_logo1.png?qua=high\" width=\"270\" height=\"129\" usemap=\"#mp\"><map name=\"mp\"><area style=\"outline:none;\" hidefocus=\"true\" shape=\"rect\" coords=\"0,0,270,129\" href=\"//www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B&amp;tn=SE_PclogoS_8whnvm25&amp;sa=ire_dl_gh_logo&amp;rsv_dl=igh_logo_pcs\" onmousedown=\"return ns_c({fm: 'tab', tab: 'felogo', rsv_platform: 'wwwhome' })\" target=\"_blank\" title=\"点击一下,了解更多\"></map></div><a href=\"/\" id=\"result_logo\" onmousedown=\"return c({'fm':'tab','tab':'logo'})\"><img class=\"index-logo-src\" src=\"//www.baidu.com/img/baidu_jgylogo3.gif\" alt=\"到百度首页\" title=\"到百度首页\"><img class=\"index-logo-srcnew\" src=\"//www.baidu.com/img/baidu_jgylogo3.gif\" alt=\"到百度首页\" title=\"到百度首页\"></a><form id=\"form\" name=\"f\" action=\"/s\" class=\"fm\"><input type=\"hidden\" name=\"ie\" value=\"utf-8\"><input type=\"hidden\" name=\"f\" value=\"8\"><input type=\"hidden\" name=\"rsv_bp\" value=\"0\"><input type=\"hidden\" name=\"rsv_idx\" value=\"1\"><input type=\"hidden\" name=\"ch\" value=\"\"><input type=\"hidden\" name=\"tn\" value=\"baidu\"><input type=\"hidden\" name=\"bar\" value=\"\"><span class=\"bg s_ipt_wr quickdelete-wrap\"><span class=\"soutu-btn\"></span><input id=\"kw\" name=\"wd\" class=\"s_ipt\" value=\"\" maxlength=\"255\" autocomplete=\"off\"><a href=\"javascript:;\" id=\"quickdelete\" title=\"清空\" class=\"quickdelete\" style=\"top: 0px; right: 0px; display: none;\"></a></span><span class=\"bg s_btn_wr\"><input type=\"submit\" id=\"su\" value=\"百度一下\" class=\"bg s_btn\"></span><span class=\"tools\"><span id=\"mHolder\"><div id=\"mCon\"><span>输入法</span></div><ul id=\"mMenu\"><li><a href=\"javascript:;\" name=\"ime_hw\">手写</a></li><li><a href=\"javascript:;\" name=\"ime_py\">拼音</a></li><li class=\"ln\"></li><li><a href=\"javascript:;\" name=\"ime_cl\">关闭</a></li></ul></span></span><input type=\"hidden\" name=\"rn\" value=\"\"><input type=\"hidden\" name=\"oq\" value=\"\"><input type=\"hidden\" name=\"rsv_pq\" value=\"af42cd9a00008911\"><input type=\"hidden\" name=\"rsv_t\" value=\"9052mHxcrAKpFIreYZ0KChSOrzM0qV57nwp93ZF8rBMhSzuChtUU6o78KQA\"><input type=\"hidden\" name=\"rqlang\" value=\"cn\"><input type=\"hidden\" name=\"rsv_enter\" value=\"1\"></form><div id=\"m\"></div></div>";
        //System.out.println(getAlmanac("http://tools.2345.com/rili.htm",2));
    }
}

猜你喜欢

转载自blog.csdn.net/qq_33391644/article/details/81417468