灵感来源(误打误撞)https://www.oschina.net/question/147939_140523
1、用浏览器(Google Chrome)打开要爬取的网页,我这里以
http://news.iciba.com/views/dailysentence/daily.html#!/detail/title/2018-12-11
为例子,爬取每日一图、一句、一翻译。
如果用普通方式爬取,得到的网页源代码是下面这样的,显然获取不了图片 URL、句子和译文,所以要直接找到
请求这些数据的 URL。因为页面获取更多内容是通过 JSONP 的方式,其本质就是动态地
加载 js 文件并执行其中的代码,以获得服务器端返回的内容。
as follows:
2、打开开发者工具,点击 Network,再点击 JS(按图操作),
再点击 Headers 获取请求的 URL,
然后用 Java 代码请求该 URL,返回的值就是我们要的东西,如下图。
//https://www.oschina.net/question/147939_140523
//callback
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the iciba "daily sentence" JSONP response for one day and prints
 * selected fields from it, each located with a regular expression.
 *
 * <p>The response is treated as plain text; no JSON parsing is performed.
 */
public class AnalyzeWeb {

    /**
     * JSONP endpoint. The day ({@code yyyy-MM-dd}) and the trailing
     * {@code &_=<timestamp>} query parameter are appended to this prefix.
     */
    private static final String BASE_URL =
            "http://sentence.iciba.com/index.php?"
                    + "callback=jQuery1900046932545628676436_1544530355780&c=dailysentence&m="
                    + "getdetail&title=";

    /** Connect and read timeout so a stalled server cannot hang the program forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches the response for a fixed day, prints it, then prints the
     * content, note, translation and picture fields found in it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Target URL to analyze.
        String day = "2018-12-09";
        String postfix = "&_=1544530355789";
        String url = BASE_URL + day + postfix;

        String webContent = getWebTxt(url);
        System.out.println(webContent);

        String pattern = "content\":\"[\\w*\\s+(,)(.)]*";
        takeWebDetail(webContent, pattern);
        pattern = "note\":\"[\\\\\\/\\w*(.)]*";
        takeWebDetail(webContent, pattern);
        pattern = "translation\":\"[\\\\\\/\\w*(.)]*";
        takeWebDetail(webContent, pattern);
        pattern = "picture\":\"http:[\\\\\\/\\w*(.)]*";
        takeWebDetail(webContent, pattern);
    }

    /**
     * Performs an HTTP GET on {@code url} and returns the body decoded as UTF-8.
     *
     * <p>Every line of the body is terminated with {@code "\r\n"} in the result.
     * This method does not throw: on a non-200 status an empty string is returned,
     * and on any failure an error message is printed and whatever was read so far
     * (possibly nothing) is returned.
     *
     * @param url absolute HTTP URL to fetch
     * @return the response body, or an empty/partial string on failure
     */
    public static String getWebTxt(String url) {
        StringBuilder body = new StringBuilder();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // try-with-resources closes the reader (and the underlying stream) on every path.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        body.append(line).append("\r\n");
                    }
                }
            }
        } catch (Exception e) {
            // Deliberately broad: this is the program's I/O boundary and callers rely on
            // getting a string back instead of an exception (bad URL, non-HTTP URL, I/O error).
            System.out.println("解析网页错误!");
            System.err.println("Failed to fetch " + url + ": " + e);
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return body.toString();
    }

    /**
     * Searches {@code str} for the first occurrence of the regular expression
     * {@code pattern}, prints the outcome, and returns the matched text.
     *
     * @param str text to search
     * @param pattern regular expression in {@link Pattern} syntax
     * @return the first match (group 0), or an empty string when nothing matches
     * @throws java.util.regex.PatternSyntaxException if {@code pattern} is not a valid regex
     */
    public static String takeWebDetail(String str, String pattern) {
        Matcher matcher = Pattern.compile(pattern).matcher(str);
        if (!matcher.find()) {
            System.out.println("NO MATCH");
            return "";
        }
        String found = matcher.group(0);
        System.out.println("Found value: " + found);
        return found;
    }
}