Python methods for removing all tags from HTML

import re
from bs4 import BeautifulSoup
from lxml import etree
     html = '''
    <div id="info">
    <span ><span class='pl'>导演</span>: <span class='attrs'><a>郭帆</a></span></span><br/>
    <span ><span class='pl'>编剧</span>: <span class='attrs'><a >郭帆</a></span></span><br/>
    <span class="pl">制片国家/地区:</span> 中国大陆<br/>
    <span class="pl">语言:</span> 汉语普通话 / 俄语 / 英语 / 印地语 / 法语<br/>
    <span class="pl">上映日期:</span> <span >2023-01-22(中国大陆)</span><br/>
    <span class="pl">片长:</span> <span>173分钟</span><br/>
    <span class="pl">又名:</span> The Wandering Earth Ⅱ / The Wandering Earth 2 / 《流浪地球》前传<br/>
    <span class="pl">IMDb:</span> tt13539646<br>
    </div>
    '''

    # 方法一
    pattern = re.compile(r'<[^>]+>', re.S)
    result = pattern.sub('', html)
    print(f"正则去除:{
      
      result}")

    # 方法二
    soup = BeautifulSoup(html, 'html.parser')
    print(f"BeautifulSoup去除:{
      
      soup.get_text()}")

    # 方法三
    response = etree.HTML(text=html)
    # print(dir(response))
    print(f"etree去除:{
      
      response.xpath('string(.)')}")

Guess you like

Origin blog.csdn.net/weixin_43824520/article/details/129349325