Python example | Clean text containing HTML encoding and HTML tags

The following function cleans text that contains HTML entities (character references such as `&nbsp;`) and HTML tags:

import html
import re


def clean_html(s: str) -> str:
    """Clean a text fragment that contains HTML tags and HTML entities.

    Tags are stripped *before* entities are decoded.  Decoding first would
    turn escaped literal text such as ``&lt;`` / ``&gt;`` into real angle
    brackets, which the tag-removal regex would then delete together with
    everything in between.

    Parameters
    ----------
    s : str
        The raw text to clean.

    Returns
    -------
    str
        The text with tags removed, entities decoded, non-breaking spaces
        turned into regular spaces, spaces around line breaks removed,
        consecutive line breaks collapsed into one, and leading/trailing
        whitespace stripped.

    Examples
    --------
    >>> clean_html("<p>This is <b>bold</b> text. </p>")
    'This is bold text.'
    >>> clean_html("<p>    <span>&nbsp;测试&nbsp;文本&nbsp;</span></p>")
    '测试 文本'
    >>> clean_html("<p>1 &lt; 2 and 3 &gt; 2</p>")
    '1 < 2 and 3 > 2'
    """
    # Remove tags while entities are still encoded, so an escaped "<" or ">"
    # (in text or inside an attribute value) cannot be mistaken for markup.
    s = re.sub(r"<[^>]+>", "", s)
    s = html.unescape(s)  # decode HTML entities / character references
    s = s.replace("\u00A0", " ")  # non-breaking space -> regular space
    s = re.sub(r" *\n *", "\n", s)  # drop spaces at the start/end of each line
    s = re.sub(r"\n+", "\n", s)  # collapse runs of line breaks into one
    return s.strip()  # drop leading/trailing whitespace and blank lines

Guess you like

Source: blog.csdn.net/Changxing_J/article/details/133063344