How to use JAVA to crawl the pages loaded by AJAX

灵感来源(误打误撞)https://www.oschina.net/question/147939_140523
1、用浏览器(Google Chrome)打开要爬取的网页,我这里以
http://news.iciba.com/views/dailysentence/daily.html#!/detail/title/2018-12-11
为例子,爬取每日一图、一句、一翻译

如果用普通方式爬取,得到的网页源代码如下图所示,显然获取不了图片url、句子和译文,所以要直接找到
请求这些数据的url。因为页面获取更多内容是通过JSONP的方式,其本质就是动态地
加载js文件并执行其中的代码,以获得服务器端返回的内容。

as follows:
Insert picture description here

2、打开开发者工具,点击Network,再点击JS,按图操作

Insert picture description here

再点击Headers获取url

Insert picture description here

然后用Java代码请求这个url,返回的值就是我们要的东西,如下图
//https://www.oschina.net/question/147939_140523
//callback

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the iciba "daily sentence" JSONP response for one day and pulls
 * individual fields (content, note, translation, picture) out of the raw text
 * with regular expressions.
 */
public class AnalyzeWeb {

    /**
     * Endpoint and fixed query parameters; the day (yyyy-MM-dd) is appended as
     * the value of {@code title}. Kept as one concatenated constant because a
     * Java string literal cannot span several source lines.
     */
    private static final String DETAIL_URL_PREFIX =
            "http://sentence.iciba.com/index.php?"
            + "callback=jQuery1900046932545628676436_1544530355780&c=dailysentence&m="
            + "getdetail&title=";

    /** Connect/read timeout so an unresponsive server cannot block forever. */
    private static final int TIMEOUT_MILLIS = 10_000;

    /**
     * Fetches the response for a hard-coded day, prints it, then prints the
     * first match of each field pattern.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Target URL to parse
        String day = "2018-12-09";
        String postfix = "&_=1544530355789";
        String url = DETAIL_URL_PREFIX + day + postfix;
        String webContent = getWebTxt(url);
        System.out.println(webContent);

        String pattern = "content\":\"[\\w*\\s+(,)(.)]*";
        takeWebDetail(webContent, pattern);

        pattern = "note\":\"[\\\\\\/\\w*(.)]*";
        takeWebDetail(webContent, pattern);

        pattern = "translation\":\"[\\\\\\/\\w*(.)]*";
        takeWebDetail(webContent, pattern);

        pattern = "picture\":\"http:[\\\\\\/\\w*(.)]*";
        takeWebDetail(webContent, pattern);
    }

    /**
     * Performs an HTTP GET on {@code url} and returns the body decoded as UTF-8.
     *
     * <p>Each line read is followed by {@code "\r\n"} in the result. The method
     * is best-effort: if the response code is not 200 the result is empty, and
     * if any exception occurs an error is printed and whatever was read so far
     * (possibly nothing) is returned.
     *
     * @param url address to fetch
     * @return the response text, or an empty string when nothing could be read
     */
    public static String getWebTxt(String url) {
        // StringBuilder avoids re-copying the whole body for every line read.
        StringBuilder ret = new StringBuilder();
        HttpURLConnection httpURLConnection = null;
        try {
            httpURLConnection = (HttpURLConnection) new URL(url).openConnection();
            httpURLConnection.setConnectTimeout(TIMEOUT_MILLIS);
            httpURLConnection.setReadTimeout(TIMEOUT_MILLIS);

            if (httpURLConnection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // try-with-resources closes the reader (and the underlying
                // stream) even when readLine() fails part-way through.
                try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(
                        httpURLConnection.getInputStream(), "utf-8"))) {
                    String read;
                    while ((read = bufferedReader.readLine()) != null) {
                        ret.append(read).append("\r\n");
                    }
                }
            }
        } catch (Exception e) {
            // Deliberately broad: this is the best-effort boundary of the tool,
            // so a malformed URL, a non-HTTP URL and an I/O failure are all
            // reported the same way instead of crashing the caller.
            System.out.println("解析网页错误!");
            // Report the cause as well; the original message alone hid it.
            System.err.println("Failed to fetch " + url + ": " + e);
        } finally {
            if (httpURLConnection != null) {
                httpURLConnection.disconnect();
            }
        }
        return ret.toString();
    }

    /**
     * Searches {@code str} for the first occurrence of {@code pattern}, prints
     * the outcome and returns the matched text.
     *
     * @param str     text to search; {@code null} is treated as "no match"
     * @param pattern regular expression to look for
     * @return the whole first match, or an empty string when there is none
     */
    public static String takeWebDetail(String str, String pattern) {
        if (str != null) {
            Matcher m = Pattern.compile(pattern).matcher(str);
            if (m.find()) {
                String found = m.group(0);
                System.out.println("Found value: " + found);
                return found;
            }
        }
        System.out.println("NO MATCH");
        return "";
    }
}

Insert picture description here

Guess you like

Origin blog.csdn.net/ZWHSOUL/article/details/84961736