Simple web scraping in Java

Fetch and filter elements with jsoup's select

ideas

1. Fetch the page elements from a URL —— 2. Use select to extract the useful information —— 3. Assemble it into the desired format

Example
The goal is to collect the names of all occupations from the web (the job-search app lets users pick a specific position within a specific field of a specific industry) and insert them into the database level by level.
Selected URL: http://career.eol.cn/html/sy/zhiye/ .

Get page element by url

    /**
     * Downloads the page at {@code url} and returns the elements that match a CSS selector.
     *
     * @param url    address of the page to download
     * @param select jsoup CSS selector applied to the downloaded document
     * @return the matching elements; an empty {@code Elements} (never {@code null}) when the
     *         page could not be fetched
     * @author yangjing
     * @date 2017-2-25 4:26:16 PM
     */
    public static Elements getElements(String url, String select) {
        try {
            Document doc = Jsoup.connect(url).get();
            return doc.select(select);
        } catch (IOException e) {
            // Report which URL failed, then degrade to "no matches" so that callers
            // which immediately call size()/get() on the result do not hit a null.
            System.err.println("Failed to fetch " + url + ": " + e);
            return new Elements();
        }
    }
... prompt'''

Use select to extract the useful information and assemble it into the data format you need

    /**
     * Builds a three-level tree of {@link TravedParam} nodes from the anchors found in
     * {@code elements}.
     *
     * <p>An anchor whose class is {@code black_14b} starts a new first-level node; every other
     * anchor becomes a second-level child of the most recent first-level node. For each
     * second-level anchor the linked page is downloaded and its entries become third-level
     * children. Ids are assigned from one running counter in the order nodes are encountered.
     *
     * @param elements elements whose {@code a} descendants are scanned; may be {@code null}
     * @return the first-level nodes in document order, each carrying its descendants;
     *         empty when there is nothing to scan
     * @author yangjing
     * @date 2017-2-25 4:33:29 PM
     */
    public static List<TravedParam> selectDocment(Elements elements) {
        List<TravedParam> list = new ArrayList<>();
        if (elements == null) {
            return list;
        }

        TravedParam levelOne = null;            // first-level node currently being filled
        List<TravedParam> levelTwoList = null;  // children of levelOne
        int levelOneId = 0;
        int id = 0;

        for (int i = 0; i < elements.size(); i++) {
            Elements anchors = elements.get(i).select("a");
            for (int j = 0; j < anchors.size(); j++) {
                id++;
                Element a = anchors.get(j);
                if (a.className().equals("black_14b")) {
                    // First level: flush the previous first-level node before starting a new one.
                    if (levelOne != null) {
                        list.add(levelOne);
                    }
                    levelOne = new TravedParam();
                    levelTwoList = new ArrayList<>();
                    levelOne.setId(id);
                    String text = a.text();
                    String name = "";
                    if (text.contains(">")) {
                        // Keep only the part after the first '>' in the anchor text.
                        name = text.substring(text.indexOf(">") + 1);
                    }
                    levelOne.setName(name);
                    levelOne.setLevel(1);
                    levelOne.setParent_id(0);
                    levelOneId = id;
                } else {
                    if (levelOne == null) {
                        // A second-level anchor with no first-level parent seen yet:
                        // there is nothing to attach it to, so skip it.
                        continue;
                    }
                    // Second level
                    TravedParam levelTwo = new TravedParam();
                    levelTwo.setId(id);
                    levelTwo.setName(a.text());
                    levelTwo.setParent_id(levelOneId);
                    levelTwo.setLevel(2);
                    levelTwoList.add(levelTwo);
                    levelOne.setParamList(levelTwoList);

                    String url = a.attr("href");
                    if (!url.contains("http://career.eol.cn/html/sy/zhiye/")) {
                        // Relative link: prefix it with the listing base URL.
                        url = "http://career.eol.cn/html/sy/zhiye/" + url;
                    }

                    // Third level
                    List<TravedParam> levelThreeList = collectThirdLevel(url, i, id);
                    levelTwo.setParamList(levelThreeList);
                    // Each third-level node consumed one id from the running counter.
                    id += levelThreeList.size();
                }
            }
        }

        // The last first-level node has no successor to trigger its flush, so add it here.
        if (levelOne != null) {
            list.add(levelOne);
        }
        return list;
    }

    /**
     * Downloads a second-level page and turns its entries into third-level nodes.
     *
     * @param url        address of the second-level page
     * @param blockIndex index of the {@code .border_c} block to read on that page
     * @param parentId   id of the second-level node; children get {@code parentId + 1},
     *                   {@code parentId + 2}, ... in order
     * @return the third-level nodes; empty when the page or the requested block is unavailable
     */
    private static List<TravedParam> collectThirdLevel(String url, int blockIndex, int parentId) {
        List<TravedParam> result = new ArrayList<>();
        Elements blocks = getElements(url, ".border_c");
        // NOTE(review): blockIndex is the caller's position on the listing page, reused to pick
        // a block on the detail page - confirm that both pages really share that layout.
        if (blocks == null || blocks.size() <= 1 || blockIndex >= blocks.size()) {
            return result;
        }
        Elements names = blocks.get(blockIndex).select("a strong");
        // Descriptions are taken from every span under all blocks, matched to names by position.
        Elements spans = blocks.select("span");
        for (int k = 0; k < names.size(); k++) {
            TravedParam levelThree = new TravedParam();
            levelThree.setName(names.get(k).text());
            levelThree.setId(parentId + 1 + k);
            levelThree.setLevel(3);
            levelThree.setParent_id(parentId);
            if (k < spans.size()) {
                levelThree.setDescr(spans.get(k).text());
            }
            result.add(levelThree);
        }
        return result;
    }
... prompt'''

Combine the two methods above to implement the requirement

/**
 * @author yangjing
 * @date Created: 2017-2-17 10:48:49 AM
 */
public class Traved {
    /**
     * Entry point: fetches the occupation listing page and passes the selected
     * paragraphs to {@code selectDocment}. The resulting list is not used here.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String listingUrl = "http://career.eol.cn/html/sy/zhiye/";
        final String paragraphSelector = ".border_c .line_23 p";
        selectDocment(getElements(listingUrl, paragraphSelector));
    }
... prompt'''

The target data-format object

package httpreq.guide;

import java.util.Date;
import java.util.List;

/**
 * Plain data holder for one node of the scraped hierarchy; a node may carry a list of
 * lower-level nodes.
 *
 * @author yangjing
 * @date Created: 2017-2-25 4:28:21 PM
 */
public class TravedParam {

    /** Identifier of this node. */
    private int id;

    /** Name. */
    private String name;

    /** Description. */
    private String descr;

    /** Logo. */
    private String logo;

    /** Identifier of the parent node. */
    private int parent_id;

    /** Level. */
    private int level;

    /** Time. */
    private Date time;

    /** Holds the next-level (child) nodes. */
    private List<TravedParam> paramList;

    /** @return the identifier of this node */
    public int getId() {
        return this.id;
    }

    /** @param id the identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the name */
    public String getName() {
        return this.name;
    }

    /** @param name the name to store */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the description */
    public String getDescr() {
        return this.descr;
    }

    /** @param descr the description to store */
    public void setDescr(String descr) {
        this.descr = descr;
    }

    /** @return the logo */
    public String getLogo() {
        return this.logo;
    }

    /** @param logo the logo to store */
    public void setLogo(String logo) {
        this.logo = logo;
    }

    /** @return the identifier of the parent node */
    public int getParent_id() {
        return this.parent_id;
    }

    /** @param parent_id the parent identifier to store */
    public void setParent_id(int parent_id) {
        this.parent_id = parent_id;
    }

    /** @return the level */
    public int getLevel() {
        return this.level;
    }

    /** @param level the level to store */
    public void setLevel(int level) {
        this.level = level;
    }

    /** @return the time */
    public Date getTime() {
        return this.time;
    }

    /** @param time the time to store */
    public void setTime(Date time) {
        this.time = time;
    }

    /** @return the child nodes, or {@code null} if none were set */
    public List<TravedParam> getParamList() {
        return this.paramList;
    }

    /** @param paramList the child nodes to store */
    public void setParamList(List<TravedParam> paramList) {
        this.paramList = paramList;
    }

}
... prompt'''

Details

1. The Chinese-language documentation for jsoup is available at: http://www.open-open.com/
2. The API is: http://jsoup.org/apidocs/

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325835922&siteId=291194637