java实现爬取指定网站的数据

1.这个类是用来解析网站的内容
重点是："div#page>div#content>div#local>div#recommend>ul>li>a";
这里用用firefox的firebug组件查看网页的代码结构，不同的网页路径也不一样。

package zy.crawl.hupu;

import java.io.IOException;

import zy.crawl.common.*;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class CrawlHupu
{
	private List<NewsInfo> newsList = new ArrayList<>();//用来存储爬取的信息对象
	
	public String GetHtml(String url) //还方法是设置网络链接，是固定的用法
	{
		String html = null;
		HttpClient httpClient = new DefaultHttpClient();
		//set proxy ,because of nsn
//		HttpHost proxy = new HttpHost("10.68.120.11", 3128);
//		httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
		
		//configuration timeout
		httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);
		
		HttpGet httpGet = new HttpGet(url);
		try
		{
			HttpResponse httpResponse = httpClient.execute(httpGet);
			int resStatu = httpResponse.getStatusLine().getStatusCode();
			if(resStatu == HttpStatus.SC_OK)
			{
				HttpEntity entity = httpResponse.getEntity();
				if(entity != null)
				{
					html = EntityUtils.toString(entity);
				}
			}
			
		}
		catch (Exception e)
		{
			System.out.println("Connect " + url+" error");
			e.printStackTrace();
		}
		finally
		{
			httpClient.getConnectionManager().shutdown();
		}
		
		return html;
	}
	
	public void ParseHtmlForNewsList()
	{
		String html = GetHtml("http://qczx.qc1818.com/");
		
		//hupu voice 的第一个可以暂时去掉一个css，这样就不用处理空格了
		//String cssQueryHupu = "div.content>div.row>div.column>div.row>div.column>div.uibox>div.uibox-con>ul.ui-list>li>a";
		String cssQueryHupu ="div#mainbody>div.cjkx_mtsd>div.cjkx>ul.list_left>li>a";//这行是用来获取每条对象的标题信息
//		String cssQueryHuxiu = "div.container-hx>div.row-fluid-wrap-hx>"
//				+ "div.center-container-hx>div.clearfix>div.center-ctr-wrap>div.center-ctr-box>div.article-list>div.article-box>div.article-box-ctt>h4>a";
//		
//		String cssQueryIteye = "div#page>div#content>div#local>div#recommend>ul>li>a";
		if(!html.isEmpty())
		{
			Document doc = Jsoup.parse(html,"http://qczx.qc1818.com/");
			Elements linkElements = doc.select(cssQueryHupu);
			/*
			 *  <a class="button read" href="http://book.zongheng.com/showchapter/48552.html">点击阅读</a>
			 * 最后经过测试发现带空格的class可以写成两个select 写成 Elements indexEs = doc.select(".button").select(".read");成功抓取该书所有目录和链接。
			 */
			
			//Elements linkElements = doc.select("div.hp-wrap").select("div.index-wrap>div.col-B>div.voice-main>div.public>div#J_public_item>ul>li>dl.item-bd>dt>span>a");
			for(Element ele:linkElements)
			{
		
				NewsInfo newsTemp = new NewsInfo(ele.text(), ele.absUrl("href"));

				PaserHtmlForNewsContent(newsTemp.getHtmlAddr(),newsTemp);
				newsList.add(newsTemp);
				//String href = ele.attr("abs:href"); 也可以获取绝对地址		
				
				//for test
				System.out.println(newsTemp.getTitle()+"    "+newsTemp.getHtmlAddr());
				if(newsTemp.getImageAddrList() != null)
				System.out.println(newsTemp.getImageAddrList().get(0));
				System.out.println(newsTemp.getContent());
				
			}//System.out.println(newsList.get(0).getContent());
			
		}
	}
	
	public void PaserHtmlForNewsContent(String contentHtmlAddr, NewsInfo newsTemp)//通过上面获得的标题信息的连接，抓取标题的正文部分。
	{
		String html = GetHtml(contentHtmlAddr);
		String cssQueryphoto="asdfas";
		String cssQueryContent = //"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailTitle"+
				//+"div#pageMain>div.pageMainLeft>div.detailWrap>div.detailIntr"
				"div#pageMain>div.pageMainLeft>div.detailWrap>div.detail";
		//String cssQueryContent = "div.content>div.row>div.column>div#articlewrap.area";
//		String cssQueryphoto = "div.hp-wrap>div.voice-main>div.voice-item>ul>li>div.voice-read-detailed>div.voice-photoVideo>"
//				+ "div.voice-photo>div.small-img>img";
		if(!html.isEmpty())
		{
			Document doc = Jsoup.parse(html);
			Elements contentElements = doc.select(cssQueryContent);
			Elements imgElements = doc.select(cssQueryphoto);
			for(Element ele:contentElements)
			{
				newsTemp.setContent(ele.html());
			}
			for(Element ele:imgElements)
			{
				List<String> tempImgList = new ArrayList<>();
				tempImgList.add(ele.attr("src"));
				newsTemp.setImageAddrList(tempImgList);
			}

		}
	}

	public static void main(String[] args)
	{
		CrawlHupu crawlHupu = new CrawlHupu();
		crawlHupu.ParseHtmlForNewsList();

	}

}

2.这个是要获取的信息的类。不多解释。

package zy.crawl.common;

import java.util.List;

public class NewsInfo
{
	private String title;
	private String htmlAddr;
	private String content;
	private List<String> imageAddrList;
	
	
	public NewsInfo(String title, String htmlAddr)
	{
		super();
		this.title = title;
		this.htmlAddr = htmlAddr;
	}
	
	
	public NewsInfo(String content, List<String> imageAddrList)
	{
		super();
		this.content = content;
		this.imageAddrList = imageAddrList;
	}


	public String getTitle()
	{
		return title;
	}
	public void setTitle(String title)
	{
		this.title = title;
	}
	public String getHtmlAddr()
	{
		return htmlAddr;
	}
	public void setHtmlAddr(String htmlAddr)
	{
		this.htmlAddr = htmlAddr;
	}
	public String getContent()
	{
		return content;
	}
	public void setContent(String content)
	{
		this.content = content;
	}
	public List<String> getImageAddrList()
	{
		return imageAddrList;
	}
	public void setImageAddrList(List<String> imageAddrList)
	{
		this.imageAddrList = imageAddrList;
	}
	
}

java实现爬取指定网站的数据

猜你喜欢