Jsoup爬虫 demo

pom.xml文件添加下面的内容
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>webContent</groupId>
  <artifactId>com.xly.webContent</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <!-- Extra repositories. The URLs must not contain whitespace after the scheme,
       otherwise Maven cannot resolve them. -->
  <repositories>
    <repository>
      <id>com.springsource.repository.bundles.release</id>
      <name>EBR Spring Release Repository</name>
      <url>http://repository.springsource.com/maven/bundles/release</url>
    </repository>
    <repository>
      <id>com.springsource.repository.bundles.external</id>
      <name>EBR External Release Repository</name>
      <url>http://repository.springsource.com/maven/bundles/external</url>
    </repository>
  </repositories>

  <properties>
    <!-- Single place to pin the Spring version used by the dependencies below. -->
    <org.springframework.version>3.0.5.RELEASE</org.springframework.version>
  </properties>

  <dependencies>
    <dependency>
      <!-- jsoup HTML parser library @ http://jsoup.org/ -->
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.springframework</groupId>
      <artifactId>spring-core</artifactId>
      <version>${org.springframework.version}</version>
    </dependency>
    <dependency>
      <!-- Logging facade used by the crawler class. -->
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.6.1</version>
    </dependency>
  </dependencies>
</project>


处理逻辑
package com.xly.jsoup;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.xly.jsoup.bean.WebInfoBean;
/**
 * Demo crawler: downloads a news-list page, follows every link found inside a
 * given {@code div}, extracts the article text of each linked page and prints
 * the collected {@link WebInfoBean}s.
 *
 * @author Kaikai
 * @version $Id: WebContentMain.java, v 0.1 2014-10-26 09:49:32 Kaikai Exp $
 */
public class WebContentMain {

    /** Unused placeholder kept for backward compatibility. */
    public static final String BASE_URL="";

    public static final Logger log = LoggerFactory.getLogger(WebContentMain.class);

    static String base_url = "http://finance.sina.com.cn/";
    /** Prefix of the per-stock news-list page; the stock page name (e.g. sh600158.phtml) is appended. */
    static String base_info_url="http://vip.stock.finance.sina.com.cn/corp/go.php/vCB_AllNewsStock/symbol/";//sh600158.phtml
    /** Value of the {@code id} attribute of the div that holds the article body. */
    static String sub_div_name="artibody";

    /**
     * Crawls the news list of one hard-coded stock page.
     *
     * @param args unused
     * @throws Exception if the list page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        getDatelistBean(base_info_url+"sh600158.phtml","datelist");
    }

    /**
     * Fetches {@code url}, collects every {@code <a href>} inside
     * {@code <div class="divname">}, builds one bean per link (url, base url,
     * title, time, article content) and passes the list to {@link #save(List)}.
     *
     * <p>A failure to download a single article is logged and leaves that bean's
     * content unset; it does not abort the remaining articles.
     *
     * @param url     address of the list page
     * @param divname value of the {@code class} attribute of the div holding the links
     * @throws IOException if the list page itself cannot be fetched
     */
    private static void getDatelistBean(String url,String divname) throws IOException
    {
        Document doc = Jsoup.connect(url).get();
        Elements links = doc.select("div[class="+divname+"]").select("a[href]");
        List<WebInfoBean> list = new ArrayList<WebInfoBean>();
        for (Element link : links) {
            // Resolve relative links against the page's base URI; fall back to the
            // raw attribute when jsoup cannot build an absolute URL.
            String articleUrl = link.absUrl("href");
            if (articleUrl.length() == 0) {
                articleUrl = link.attr("href");
            }

            WebInfoBean bean = new WebInfoBean();
            bean.setUrl(articleUrl);
            bean.setBaseUrl(link.baseUri());
            // text() is safe for anchors without child nodes and strips nested markup.
            bean.setTitle(link.text());

            String time = extractTime(articleUrl);
            if (time != null) {
                bean.setTime(time);
            }

            try {
                bean.setContent(extContent(articleUrl, sub_div_name));
            } catch (IOException ex) {
                log.warn("Failed to fetch article {}", articleUrl, ex);
            } catch (IllegalArgumentException ex) {
                // Thrown by Jsoup.connect for URLs it cannot parse.
                log.warn("Skipping malformed article url {}", articleUrl, ex);
            }
            list.add(bean);
        }
        save(list);
    }

    /**
     * Derives the time string from an article URL: the second-to-last path
     * segment followed by the first four characters of the last segment
     * (presumably date + hhmm for Sina article URLs; verify against real data).
     *
     * @param url article URL
     * @return the derived time, or {@code null} when the URL has four or fewer
     *         {@code /}-separated parts or its last segment is shorter than four characters
     */
    private static String extractTime(String url) {
        String[] parts = url.split("/");
        int length = parts.length;
        if (length <= 4) {
            return null;
        }
        String last = parts[length - 1];
        if (last.length() < 4) {
            log.warn("Cannot derive time from url {}", url);
            return null;
        }
        return parts[length - 2] + last.substring(0, 4);
    }

    /**
     * Downloads {@code url} and concatenates, without separators, the text of
     * every {@code <p>} inside {@code <div id="divname">}.
     *
     * @param url     address of the article page
     * @param divname value of the {@code id} attribute of the div holding the article body
     * @return the concatenated paragraph text; empty when nothing matches
     * @throws IOException if the page cannot be fetched
     */
    private static String extContent(String url,String divname) throws IOException{
        Document doc = Jsoup.connect(url).get();
        Elements paragraphs = doc.select("div[id="+divname+"]").select("p");
        StringBuilder content = new StringBuilder();
        for (Element paragraph : paragraphs) {
            content.append(paragraph.text());
        }
        return content.toString();
    }

    /**
     * Prints every bean to standard output.
     *
     * @param list beans to print
     */
    private static void save(List<WebInfoBean> list){
        for(WebInfoBean bean:list){
            System.out.println(bean.toString());
        }
    }

}


bean（WebInfoBean.java，位于 com.xly.jsoup.bean 包）
/**
 * Mutable value holder for one crawled page: identifier, title, index, time,
 * extracted content, the base URL of the page the link was found on, and the
 * link's own URL. All properties are plain strings and default to {@code null}.
 */
public class WebInfoBean {

    private String id;
    private String title;
    private String index;
    private String time;
    private String content;
    private String baseUrl;
    private String url;

    /** @return the identifier */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the index */
    public String getIndex() {
        return this.index;
    }

    /** @param index the index */
    public void setIndex(String index) {
        this.index = index;
    }

    /** @return the time string */
    public String getTime() {
        return this.time;
    }

    /** @param time the time string */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the content */
    public String getContent() {
        return this.content;
    }

    /** @param content the content */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the base URL */
    public String getBaseUrl() {
        return this.baseUrl;
    }

    /** @param baseUrl the base URL */
    public void setBaseUrl(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** @return the URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Renders all fields in the fixed order id, title, index, time, content,
     * baseUrl, url.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("WebInfoBean [");
        text.append("id=").append(id);
        text.append(", title=").append(title);
        text.append(", index=").append(index);
        text.append(", time=").append(time);
        text.append(", content=").append(content);
        text.append(", baseUrl=").append(baseUrl);
        text.append(", url=").append(url);
        text.append("]");
        return text.toString();
    }

}

猜你喜欢

转载自xly1981.iteye.com/blog/2147997