import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.http.HttpEntity;
import org.apache.http.ParseException;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* httpClient及jsoup抓取解析网页数据
*/
public class AlmanacUtils {
/**
* 单例工具类
*/
private AlmanacUtils() {
}
/**
* 爬取网页信息指定内容
* @param urlOrStr
* @param i //1=按网址读取 2=按字符串读取
* @return
*/
public static String getAlmanac(String urlOrStr,int i){
String html="";
if (i == 1) {
String url=urlOrStr;
//根据网站抓取网页信息
html=pickData(url);
}else{
html = urlOrStr;
}
//根据网页信息获取指定内容
String almanac=analyzeHTMLByString(html);
return almanac;
}
/*
* 爬取网页信息
*/
private static String pickData(String url) {
CloseableHttpClient httpclient = HttpClients.createDefault();
try {
HttpGet httpget = new HttpGet(url);
CloseableHttpResponse response = httpclient.execute(httpget);
try {
// 获取响应实体
HttpEntity entity = response.getEntity();
// 打印响应状态
if (entity != null) {
return EntityUtils.toString(entity);
}
} finally {
response.close();
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
// 关闭连接,释放资源
try {
httpclient.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
/*
* 使用jsoup解析网页信息获取指定内容
*/
private static String analyzeHTMLByString(String html){
String solarDate,lunarDate,chineseAra,should,avoid=" ";
Document document = Jsoup.parse(html);
//公历时间
solarDate=getSolarDate();
//通过id获取html节点对象
Element eLunarDate=document.getElementById("info_nong");
//获取子节点,并截取子节点的内容拼成新的内容
lunarDate=eLunarDate.child(0).html().substring(1,3)+eLunarDate.html().substring(11);
//通过id获取html节点对象
Element eChineseAra=document.getElementById("info_chang");
//获取该节点的文本内容
chineseAra=eChineseAra.text().toString();
//宜
should=getSuggestion(document,"yi");
//忌
avoid=getSuggestion(document,"ji");
return chineseAra+should + avoid;
}
/*
* 根据id获取html节点对象里面的标签里的内容
*/
private static String getSuggestion(Document doc,String id){
//通过id获取html节点对象
Element element=doc.getElementById(id);
//获取节点对象里面的所有a标签对象
Elements elements=element.getElementsByTag("a");
StringBuffer sb=new StringBuffer();
//循环获取a标签里面的文本内容
for (Element e : elements) {
sb.append(e.text()+" ");
}
return sb.toString();
}
/*
* 获取公历时间,用yyyy年MM月dd日 EEEE格式表示。
* @return yyyy年MM月dd日 EEEE
*/
private static String getSolarDate() {
Calendar calendar = Calendar.getInstance();
Date solarDate = calendar.getTime();
SimpleDateFormat formatter = new SimpleDateFormat("yyyy年MM月dd日 EEEE");
return formatter.format(solarDate);
}
public static void main(String[] args) {
System.out.println(getAlmanac("http://tools.2345.com/rili.htm",1));
//String data = "<div class=\"s_form_wrapper soutu-env-nomac soutu-env-index\"><style>.index-logo-srcnew {display: none;}@media (-webkit-min-device-pixel-ratio: 2),(min--moz-device-pixel-ratio: 2),(-o-min-device-pixel-ratio: 2),(min-device-pixel-ratio: 2){.index-logo-src {display: none;}.index-logo-srcnew {display: inline;}}</style><div id=\"lg\"><img hidefocus=\"true\" class=\"index-logo-src\" src=\"//www.baidu.com/img/bd_logo1.png\" width=\"270\" height=\"129\" usemap=\"#mp\"><img hidefocus=\"true\" class=\"index-logo-srcnew\" src=\"//www.baidu.com/img/bd_logo1.png?qua=high\" width=\"270\" height=\"129\" usemap=\"#mp\"><map name=\"mp\"><area style=\"outline:none;\" hidefocus=\"true\" shape=\"rect\" coords=\"0,0,270,129\" href=\"//www.baidu.com/s?wd=%E4%BB%8A%E6%97%A5%E6%96%B0%E9%B2%9C%E4%BA%8B&tn=SE_PclogoS_8whnvm25&sa=ire_dl_gh_logo&rsv_dl=igh_logo_pcs\" onmousedown=\"return ns_c({fm: 'tab', tab: 'felogo', rsv_platform: 'wwwhome' })\" target=\"_blank\" title=\"点击一下,了解更多\"></map></div><a href=\"/\" id=\"result_logo\" onmousedown=\"return c({'fm':'tab','tab':'logo'})\"><img class=\"index-logo-src\" src=\"//www.baidu.com/img/baidu_jgylogo3.gif\" alt=\"到百度首页\" title=\"到百度首页\"><img class=\"index-logo-srcnew\" src=\"//www.baidu.com/img/baidu_jgylogo3.gif\" alt=\"到百度首页\" title=\"到百度首页\"></a><form id=\"form\" name=\"f\" action=\"/s\" class=\"fm\"><input type=\"hidden\" name=\"ie\" value=\"utf-8\"><input type=\"hidden\" name=\"f\" value=\"8\"><input type=\"hidden\" name=\"rsv_bp\" value=\"0\"><input type=\"hidden\" name=\"rsv_idx\" value=\"1\"><input type=\"hidden\" name=\"ch\" value=\"\"><input type=\"hidden\" name=\"tn\" value=\"baidu\"><input type=\"hidden\" name=\"bar\" value=\"\"><span class=\"bg s_ipt_wr quickdelete-wrap\"><span class=\"soutu-btn\"></span><input id=\"kw\" name=\"wd\" class=\"s_ipt\" value=\"\" maxlength=\"255\" autocomplete=\"off\"><a href=\"javascript:;\" id=\"quickdelete\" title=\"清空\" class=\"quickdelete\" style=\"top: 0px; right: 0px; display: 
none;\"></a></span><span class=\"bg s_btn_wr\"><input type=\"submit\" id=\"su\" value=\"百度一下\" class=\"bg s_btn\"></span><span class=\"tools\"><span id=\"mHolder\"><div id=\"mCon\"><span>输入法</span></div><ul id=\"mMenu\"><li><a href=\"javascript:;\" name=\"ime_hw\">手写</a></li><li><a href=\"javascript:;\" name=\"ime_py\">拼音</a></li><li class=\"ln\"></li><li><a href=\"javascript:;\" name=\"ime_cl\">关闭</a></li></ul></span></span><input type=\"hidden\" name=\"rn\" value=\"\"><input type=\"hidden\" name=\"oq\" value=\"\"><input type=\"hidden\" name=\"rsv_pq\" value=\"af42cd9a00008911\"><input type=\"hidden\" name=\"rsv_t\" value=\"9052mHxcrAKpFIreYZ0KChSOrzM0qV57nwp93ZF8rBMhSzuChtUU6o78KQA\"><input type=\"hidden\" name=\"rqlang\" value=\"cn\"><input type=\"hidden\" name=\"rsv_enter\" value=\"1\"></form><div id=\"m\"></div></div>";
//System.out.println(getAlmanac("http://tools.2345.com/rili.htm",2));
}
}
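// Scraping and parsing web page data with HttpClient and jsoup
// (blog page residue) You may also like
// Reposted from blog.csdn.net/qq_33391644/article/details/81417468
// (blog page residue) Recommended today
// (blog page residue) Weekly ranking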