java html字符串转html文档树(Java HTML Parser)

如何将一个html页面转化为java文档树

步骤一:下载jsoup-1.14.3.jar包。

尽量不重复造轮子, 官网https://jsoup.org/;api说明文档https://jsoup.org/apidocs/;下载页https://jsoup.org/download

步骤二:引入到项目。

此处省略。

步骤三:编写一个DEMO测试。

DEMO为获取博主CSDN首页基本属性值,即页面中asideProfile块的信息。如下图

代码实现

/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: JSOUPTest
 * Author: wangyetao
 * Date: 21-11-14 18:09:52
 * Description: JSOUPTest
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 */
package simple.callback.htmlparsetest;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demo that downloads a CSDN blog home page with jsoup and prints the page title followed
 * by the {@code title} attribute of selected child nodes inside the {@code #asideProfile} block.
 *
 * <p>Created 21-11-14 18:09:52.
 *
 * @author wangyetao
 */
class JSOUPTest {

    /** Blog home page fetched by the demo. */
    private static final String BLOG_URL = "https://blog.csdn.net/u014132947";

    /** Child-node indexes at or above this value are never inspected. */
    private static final int CHILD_NODE_LIMIT = 10;

    /**
     * Fetches {@link #BLOG_URL}, prints the document title, then prints the {@code title}
     * attribute of the odd-indexed child nodes (1, 3, 5, 7, 9) of every element matched by
     * {@code [class=data-info d-flex item-tiling]} inside {@code #asideProfile}.
     *
     * <p>Any failure (network error, unexpected page structure) is reported on standard error
     * instead of being silently discarded.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect(BLOG_URL).get();
            System.out.println(doc.title());

            // Narrow to the profile block first, then to the statistic rows inside it.
            Elements statBlocks = doc.select("#asideProfile")
                    .select("[class=data-info d-flex item-tiling]");

            for (Element block : statBlocks) {
                // Never index past the real number of child nodes: a row with fewer than
                // CHILD_NODE_LIMIT nodes would otherwise abort the whole run with an exception.
                int end = Math.min(CHILD_NODE_LIMIT, block.childNodeSize());
                // Only odd indexes are read; presumably the even positions hold whitespace
                // text nodes between the entries - verify against the actual page markup.
                for (int j = 1; j < end; j += 2) {
                    System.out.println(block.childNode(j).attr("title"));
                }
            }
        } catch (Exception e) {
            // main is the program boundary: surface the problem rather than swallow it.
            System.err.println("Failed to load or parse " + BLOG_URL + ": " + e);
        }
    }
}

DEMO执行结果:

输出:

未来工厂的博客_dnbug Blog_CSDN博客-[Php基础],[Ubuntu 16.04.6 LTS],[Java基础]领域博主
318
3342
2661
485696
6级,点击查看等级说明
7767
107
108
106
366

Process finished with exit code 0

记录与总结,2021年 11月 14日 星期日 20:04:42 CST。

UPDATE2

UPDATE2,2021年 11月 16日 星期二 08:18:18 CST

Jsoup转Document常用API

Jsoup转Document常用API(13个)如下:

    /**
     * Parses an HTML string into a {@link Document} by delegating to
     * {@code Parser.parse(html, baseUri)}.
     *
     * @param html    HTML text to parse
     * @param baseUri value forwarded unchanged to the parser as the base URI
     * @return the document produced by {@code Parser.parse}
     */
    public static Document parse(String html, String baseUri) {
        return Parser.parse(html, baseUri);
    }

    /**
     * Parses an HTML string with a caller-supplied parser by delegating to
     * {@code parser.parseInput(html, baseUri)}.
     *
     * @param html    HTML text to parse
     * @param baseUri value forwarded unchanged to the parser as the base URI
     * @param parser  parser instance that performs the parse
     * @return the document produced by {@code parser.parseInput}
     */
    public static Document parse(String html, String baseUri, Parser parser) {
        return parser.parseInput(html, baseUri);
    }

    /**
     * Parses an HTML string with a caller-supplied parser, passing an empty string as the
     * base URI to {@code parser.parseInput}.
     *
     * @param html   HTML text to parse
     * @param parser parser instance that performs the parse
     * @return the document produced by {@code parser.parseInput}
     */
    public static Document parse(String html, Parser parser) {
        return parser.parseInput(html, "");
    }

    /**
     * Parses an HTML string into a {@link Document}, passing an empty string as the base URI
     * to {@code Parser.parse}.
     *
     * @param html HTML text to parse
     * @return the document produced by {@code Parser.parse}
     */
    public static Document parse(String html) {
        return Parser.parse(html, "");
    }

    /**
     * Returns the {@code Connection} created by {@code HttpConnection.connect(url)}.
     * This method only builds the connection object; the demos in this article then call
     * {@code get()} on it to obtain a {@link Document}.
     *
     * @param url address to connect to
     * @return the connection produced by {@code HttpConnection.connect}
     */
    public static Connection connect(String url) {
        return HttpConnection.connect(url);
    }

    /**
     * Loads a file into a {@link Document} by delegating to
     * {@code DataUtil.load(file, charsetName, baseUri)}.
     *
     * @param file        file to load
     * @param charsetName character set name; may be {@code null}, as the {@code @Nullable}
     *                    annotation indicates
     * @param baseUri     value forwarded unchanged to the loader as the base URI
     * @return the document produced by {@code DataUtil.load}
     * @throws IOException propagated from {@code DataUtil.load}
     */
    public static Document parse(File file, @Nullable String charsetName, String baseUri) throws IOException {
        return DataUtil.load(file, charsetName, baseUri);
    }

    /**
     * Loads a file into a {@link Document}, using the file's absolute path
     * ({@code file.getAbsolutePath()}) as the base URI argument to {@code DataUtil.load}.
     *
     * @param file        file to load
     * @param charsetName character set name; may be {@code null}, as the {@code @Nullable}
     *                    annotation indicates
     * @return the document produced by {@code DataUtil.load}
     * @throws IOException propagated from {@code DataUtil.load}
     */
    public static Document parse(File file, @Nullable String charsetName) throws IOException {
        return DataUtil.load(file, charsetName, file.getAbsolutePath());
    }

    /**
     * Loads a file into a {@link Document} with a caller-supplied parser by delegating to
     * {@code DataUtil.load(file, charsetName, baseUri, parser)}.
     *
     * @param file        file to load
     * @param charsetName character set name; may be {@code null}, as the {@code @Nullable}
     *                    annotation indicates
     * @param baseUri     value forwarded unchanged to the loader as the base URI
     * @param parser      parser instance forwarded to the loader
     * @return the document produced by {@code DataUtil.load}
     * @throws IOException propagated from {@code DataUtil.load}
     */
    public static Document parse(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return DataUtil.load(file, charsetName, baseUri, parser);
    }

    /**
     * Reads an input stream into a {@link Document} by delegating to
     * {@code DataUtil.load(in, charsetName, baseUri)}.
     *
     * @param in          stream to read the HTML from
     * @param charsetName character set name; may be {@code null}, as the {@code @Nullable}
     *                    annotation indicates
     * @param baseUri     value forwarded unchanged to the loader as the base URI
     * @return the document produced by {@code DataUtil.load}
     * @throws IOException propagated from {@code DataUtil.load}
     */
    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
        return DataUtil.load(in, charsetName, baseUri);
    }

    /**
     * Reads an input stream into a {@link Document} with a caller-supplied parser by
     * delegating to {@code DataUtil.load(in, charsetName, baseUri, parser)}.
     *
     * @param in          stream to read the HTML from
     * @param charsetName character set name; may be {@code null}, as the {@code @Nullable}
     *                    annotation indicates
     * @param baseUri     value forwarded unchanged to the loader as the base URI
     * @param parser      parser instance forwarded to the loader
     * @return the document produced by {@code DataUtil.load}
     * @throws IOException propagated from {@code DataUtil.load}
     */
    public static Document parse(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return DataUtil.load(in, charsetName, baseUri, parser);
    }

    /**
     * Parses a fragment of body HTML into a {@link Document} by delegating to
     * {@code Parser.parseBodyFragment(bodyHtml, baseUri)}.
     *
     * @param bodyHtml HTML fragment to parse
     * @param baseUri  value forwarded unchanged to the parser as the base URI
     * @return the document produced by {@code Parser.parseBodyFragment}
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        return Parser.parseBodyFragment(bodyHtml, baseUri);
    }

    /**
     * Parses a fragment of body HTML into a {@link Document}, passing an empty string as the
     * base URI to {@code Parser.parseBodyFragment}.
     *
     * @param bodyHtml HTML fragment to parse
     * @return the document produced by {@code Parser.parseBodyFragment}
     */
    public static Document parseBodyFragment(String bodyHtml) {
        return Parser.parseBodyFragment(bodyHtml, "");
    }

    /**
     * Fetches a URL and returns the result as a {@link Document}: a connection is created
     * with {@code HttpConnection.connect(url)}, its timeout is set, and {@code get()} is
     * called on it.
     *
     * @param url           address to fetch
     * @param timeoutMillis timeout value handed to {@code Connection.timeout}
     * @return the document returned by {@code Connection.get()}
     * @throws IOException declared by this method; presumably raised by the fetch
     */
    public static Document parse(URL url, int timeoutMillis) throws IOException {
        // Same three calls as before, expressed as one fluent chain.
        return HttpConnection.connect(url).timeout(timeoutMillis).get();
    }
/**
 * Copyright (C), 2000-2021, XXX有限公司
 * FileName: JSOUPTest3
 * Author: wangyetao
 * Date: 21-11-16 07:52:17
 * Description: JSOUPTest3
 * <p>
 * History:
 * <author> 作者姓名
 * <time> 修改时间
 * <version> 版本号
 * <desc> 版本描述
 */
package simple.callback.htmlparsetest;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Demo that downloads a CSDN blog home page with jsoup and prints ten labelled profile
 * statistics read from the {@code #asideProfile} block.
 *
 * <p>Created 21-11-16 07:52:17.
 *
 * @author wangyetao
 */
class JSOUPTest3 {

    /** Blog home page fetched by the demo (another sample: https://blog.csdn.net/hebtu666). */
    private static final String BLOG_URL = "https://blog.csdn.net/u014132947";

    /** Number of statistic rows the demo reads from the profile block. */
    private static final int EXPECTED_BLOCKS = 2;

    /**
     * Fetches {@link #BLOG_URL}, prints the document title and then one labelled line per
     * statistic. Each value is the {@code title} attribute of a fixed child node of the first
     * or second element matched by {@code [class=data-info d-flex item-tiling]}.
     *
     * <p>Failures are reported on standard error instead of being silently discarded.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect(BLOG_URL).get();
            System.out.println(doc.title());

            // Narrow to the profile block first, then to the statistic rows inside it.
            Elements statBlocks = doc.select("#asideProfile")
                    .select("[class=data-info d-flex item-tiling]");
            if (statBlocks.size() < EXPECTED_BLOCKS) {
                System.err.println("Expected " + EXPECTED_BLOCKS + " statistic blocks in "
                        + BLOG_URL + " but found " + statBlocks.size());
                return;
            }

            // First row: original posts, weekly rank, overall rank, visits, level.
            String yuanchuang = titleAt(statBlocks, 0, 1);
            String zhoupaiming = titleAt(statBlocks, 0, 3);
            String zongpaiming = titleAt(statBlocks, 0, 5);
            String fangwen = titleAt(statBlocks, 0, 7);
            // Only the part of the level title before the first comma is printed.
            String dengji = titleAt(statBlocks, 0, 9).split(",")[0];

            // Second row: points, fans, likes, comments, favourites.
            String jifen = titleAt(statBlocks, 1, 1);
            String fans = titleAt(statBlocks, 1, 3);
            String huozan = titleAt(statBlocks, 1, 5);
            String pinglun = titleAt(statBlocks, 1, 7);
            String shoucang = titleAt(statBlocks, 1, 9);

            System.out.println("原创数:" + yuanchuang);
            System.out.println("周排名:" + zhoupaiming);
            System.out.println("总排名:" + zongpaiming);
            System.out.println("访问:" + fangwen);
            System.out.println("等级:" + dengji);

            System.out.println("积分:" + jifen);
            System.out.println("粉丝:" + fans);
            System.out.println("获赞:" + huozan);
            System.out.println("评论:" + pinglun);
            System.out.println("收藏:" + shoucang);

        } catch (Exception e) {
            // main is the program boundary: surface the problem rather than swallow it.
            System.err.println("Failed to load or parse " + BLOG_URL + ": " + e);
        }
    }

    /**
     * Returns the {@code title} attribute of one child node of one matched element.
     *
     * @param blocks     matched statistic rows
     * @param blockIndex index of the row inside {@code blocks}
     * @param childIndex index of the child node inside that row
     * @return the value returned by {@code attr("title")} for that node
     */
    private static String titleAt(Elements blocks, int blockIndex, int childIndex) {
        return blocks.get(blockIndex).childNode(childIndex).attr("title");
    }
}

UPDATE3-Android中测试用例

UPDATE3,2021年 11月 18日 星期四 10:28:50 CST

Android中的使用:测试用例是在一个Activity的TextView控件中输出博客信息。
步骤1).因为涉及到了网络使用,所以要在AndroidManifest.xml清单文件中添加网络使用权限
<uses-permission android:name="android.permission.INTERNET" />
步骤2).此处测试用例通过Jsoup.connect(blogUrl).get()以网络请求的方式获取并解析网页数据。Android不允许在主线程(UI线程)中执行网络请求,而UI又只能在主线程中更新,所以需要新开一个线程获取数据,再通过Handler回到主线程更新UI。
DEMO_ANDROID(布局文件很简单这里不贴代码了,只贴这个Activity的代码):

package com.example.driverslanguage;

import android.app.Activity;
import android.os.Bundle;
import android.os.Handler;
import android.os.Message;
import android.widget.TextView;

import androidx.annotation.NonNull;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Activity that shows the blog URL in one TextView and, once a background fetch completes,
 * the blog's profile statistics in another.
 *
 * <p>The page is downloaded and parsed with jsoup on a worker thread; the finished text is
 * handed to the UI thread inside a {@link Message} so no mutable state is shared between
 * the two threads.
 */
public class MyblogActivity extends Activity {

    /** {@code Message.what} value for "profile text is ready"; matches the original 0. */
    private static final int MSG_PROFILE_LOADED = 0;

    private final String blogUrl = "https://blog.csdn.net/u014132947";
    private TextView tv_mybloglink, tv_myblog;

    /** Delivers the fetched text to the UI thread; created in {@link #onCreate}. */
    private Handler handler;

    /**
     * Sets up the layout, shows the blog URL, creates the main-thread handler and starts the
     * worker thread that downloads the profile statistics.
     *
     * @param savedInstanceState passed through to {@code super.onCreate}
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_myblog);

        tv_myblog = findViewById(R.id.tv_myblog);
        tv_mybloglink = findViewById(R.id.tv_mybloglink);
        tv_mybloglink.setText(blogUrl);

        // Bind the handler explicitly to the main looper instead of relying on the
        // deprecated no-argument constructor. getMainLooper() is only usable once the
        // Activity has a base context, hence the handler is created here and not in a
        // field initializer.
        handler = new Handler(getMainLooper()) {
            @Override
            public void handleMessage(@NonNull Message msg) {
                super.handleMessage(msg);
                if (msg.what == MSG_PROFILE_LOADED) {
                    tv_myblog.setText((String) msg.obj);
                }
            }
        };

        // Network access is done off the UI thread.
        new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    String summary = loadProfileSummary();
                    handler.obtainMessage(MSG_PROFILE_LOADED, summary).sendToTarget();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }).start();
    }

    /**
     * Drops any result message still queued for the handler so it is not delivered after
     * the Activity has been destroyed.
     */
    @Override
    protected void onDestroy() {
        if (handler != null) {
            handler.removeCallbacksAndMessages(null);
        }
        super.onDestroy();
    }

    /**
     * Downloads {@code blogUrl} and builds the multi-line text shown in the TextView: the
     * page title followed by ten labelled statistics, each terminated by a newline.
     *
     * @return the text to display
     * @throws java.io.IOException if the page cannot be fetched
     */
    private String loadProfileSummary() throws java.io.IOException {
        Document doc = Jsoup.connect(blogUrl).get();

        // Narrow to the profile block first, then to the statistic rows inside it.
        Elements statBlocks = doc.select("#asideProfile")
                .select("[class=data-info d-flex item-tiling]");

        // First row: original posts, weekly rank, overall rank, visits, level.
        String yuanchuang = titleAt(statBlocks, 0, 1);
        String zhoupaiming = titleAt(statBlocks, 0, 3);
        String zongpaiming = titleAt(statBlocks, 0, 5);
        String fangwen = titleAt(statBlocks, 0, 7);
        // Only the part of the level title before the first comma is shown.
        String dengji = titleAt(statBlocks, 0, 9).split(",")[0];

        // Second row: points, fans, likes, comments, favourites.
        String jifen = titleAt(statBlocks, 1, 1);
        String fans = titleAt(statBlocks, 1, 3);
        String huozan = titleAt(statBlocks, 1, 5);
        String pinglun = titleAt(statBlocks, 1, 7);
        String shoucang = titleAt(statBlocks, 1, 9);

        StringBuilder text = new StringBuilder();
        text.append(doc.title()).append("\n");
        text.append("原创数:").append(yuanchuang).append("\n");
        text.append("周排名:").append(zhoupaiming).append("\n");
        text.append("总排名:").append(zongpaiming).append("\n");
        text.append("访问:").append(fangwen).append("\n");
        text.append("等级:").append(dengji).append("\n");

        text.append("积分:").append(jifen).append("\n");
        text.append("粉丝:").append(fans).append("\n");
        text.append("获赞:").append(huozan).append("\n");
        text.append("评论:").append(pinglun).append("\n");
        text.append("收藏:").append(shoucang).append("\n");
        return text.toString();
    }

    /**
     * Returns the {@code title} attribute of one child node of one matched element.
     *
     * @param blocks     matched statistic rows
     * @param blockIndex index of the row inside {@code blocks}
     * @param childIndex index of the child node inside that row
     * @return the value returned by {@code attr("title")} for that node
     */
    private static String titleAt(Elements blocks, int blockIndex, int childIndex) {
        return blocks.get(blockIndex).childNode(childIndex).attr("title");
    }
}

下面走测试用例代码,看执行结果:

猜你喜欢

转载自blog.csdn.net/u014132947/article/details/121321320