phantomjs

最近在做爬虫,由于网页上有很多ajax请求,虽然可以用httpclient逐个找到相应的ajax接口再去请求,但这样做比较麻烦,所以改用phantomjs,它可以直接获取到页面渲染后的所有信息。
public static String dealWithAjax(String url) throws IOException {    
Runtime rt = Runtime.getRuntime();    
Process p = rt.exec("phantomjs.exe D:\\rj\\RedisImportMysql\\src\\js\\test.js "+url);  
InputStream is = p.getInputStream();    
BufferedReader br = new BufferedReader(new InputStreamReader(is));    
StringBuffer sbf = new StringBuffer();    
String tmp = "";    
while((tmp = br.readLine())!=null){    
sbf.append(tmp);  
sbf.append("\n");
}    
sbf.append("-----------------------------------------------------------------------");
return sbf.toString();    
}    

public static void main(String[] args) throws IOException {    
// System.out.println(getAjaxCotnent("http://shixin.court.gov.cn/personMore.do")); 
String s = dealWithAjax("http://list.tmall.com/search_product.htm?q=%CA%D6%BB%FA&type=p&vmarket=&spm=a222r.7716956.a2227oh.d100&from=..pc_1_searchbutton");
System.out.println(s);
FileUtil.writeFile("d://a.txt", "utf-8", s,true);

}



test.js源码:
// codes.js
// PhantomJS script setup: reads the target address from the command line and
// creates the page object used by the rest of the script.
// Every name is declared with `var` so nothing is created as an implicit global.
var system = require('system');
// Second command-line argument (index 0 is the script itself); used below as the URL to open.
var address = system.args[1];
var page = require('webpage').create();
var url = address;
// Most recent URL reported by page.onUrlChanged.
var newUrl;
// Load the target page, print its rendered markup to stdout, then terminate.
//
// The exit is issued from this callback instead of from a separate
// page.onLoadFinished handler: that way the markup is always printed before
// the process ends, independent of the order in which PhantomJS invokes the
// two callbacks, and a failed load is reported through a non-zero exit status.
// The two progress lines that the former handler printed are kept, in the
// same position (after the page output), so stdout stays the same.
page.open(url, function (status) {
    var exitCode = 0;

    if (status !== 'success') {
        console.log('Unable to post!');
        exitCode = 1;
    } else {
        // The function passed to evaluate() runs inside the loaded page.
        var content = page.evaluate(function () {
            // To extract a single element with the page's own DOM API instead, e.g.:
            //   return document.querySelector("#xy-impcon-B").innerHTML;
            return document.querySelector("*").innerHTML;
        });
        console.log(content);
    }

    // Earlier experiments with scrolling the window and dispatching synthetic
    // key events at this point were disabled and are intentionally not run.
    console.log("执行完毕。。。。。。。。");
    console.log("事件。。。。。。。。。。。。。。。。。。。。。。。。。。。");
    phantom.exit(exitCode);
});
// Navigation hook: remembers the URL the page moved to and logs it.
// A disabled experiment re-opened the page whenever the new URL contained
// "http://www.baidu.com/s?"; it is intentionally not active.
page.onUrlChanged = function (targetUrl) {
    newUrl = targetUrl;
    console.log('New URL: ' + targetUrl);
};
// Resource trace: for each response notification, logs the page's current URL
// followed by the URL of the resource the notification is about.
page.onResourceReceived = function (response) {
    var pageUrlLine = 'resource rec page.url---' + page.url;
    var responseUrlLine = 'reponse url---' + response.url;

    console.log(pageUrlLine);
    console.log(responseUrlLine);
};

// Resource failure hook: logs which resource failed (id and URL) and then the
// error code and description carried by the error object.
page.onResourceError = function (resourceError) {
    var summary = 'Unable to load resource (#' + resourceError.id + 'URL:' + resourceError.url + ')';
    var detail = 'Error code: ' + resourceError.errorCode + '. Description: ' + resourceError.errorString;

    console.log(summary);
    console.log(detail);
};



// Script-level error hook: writes the message and, when a trace is supplied,
// one line per stack frame to stderr, then exits with status 1.
phantom.onError = function (msg, trace) {
    var report = ['PHANTOM ERROR: ' + msg];

    if (trace && trace.length) {
        report.push('TRACE:');
        for (var i = 0; i < trace.length; i++) {
            var frame = trace[i];
            var location = frame.file || frame.sourceURL;
            // The function name is appended only when the frame carries one.
            var inFunction = frame.function ? ' (in function ' + frame.function + ')' : '';
            report.push(' -> ' + location + ': ' + frame.line + inFunction);
        }
    }

    console.error(report.join('\n'));
    phantom.exit(1);
};
//page.colse()



猜你喜欢

转载自hadasione.iteye.com/blog/2220624