# For details, see: https://blog.csdn.net/Likianta/article/details/101293915
import requests
def get_text(resp):
    """Decode the body of an HTTP response into text.

    The encoding predicted from the content (``resp.apparent_encoding``) is
    preferred; the encoding taken from the HTTP headers (``resp.encoding``)
    is only used as a fallback.

    Args:
        resp: A response object exposing ``content`` (bytes),
            ``apparent_encoding`` and ``encoding``.

    Returns:
        The decoded text. Bytes that cannot be decoded with the chosen
        encoding are dropped rather than raising.

    Raises:
        ValueError: If no encoding can be determined, which usually means
            the payload is binary (e.g. a PDF or JPG).
    """
    source_encoding = resp.apparent_encoding or resp.encoding
    if not source_encoding:
        # No usable encoding: most likely a binary file such as pdf or jpg.
        raise ValueError(
            'cannot determine the text encoding of the response '
            '(binary content?)'
        )
    # Encoding names are case-insensitive and the header value is often
    # lowercase, so normalise before comparing.
    if source_encoding.lower() == 'gb2312':
        # GBK is a superset of GB2312; decoding with it keeps characters
        # that a strict GB2312 decoder would silently drop.
        source_encoding = 'GBK'
    return resp.content.decode(source_encoding, errors="ignore")
# Test against a known "problem" page.
url = 'http://www.most.gov.cn/ztzl/gjkxjsjldh/jldh2002/zrj/zrjml.htm'
# A timeout keeps the script from hanging forever if the server stalls.
response = requests.get(url, timeout=30)
# response.text is deliberately not used; get_text() picks the encoding.
text = get_text(response)
# Save to a file. newline='' writes the page's own line endings unchanged
# (otherwise "\r\n" would become "\r\r\n" on Windows).
with open('result.html', 'w', encoding='utf-8', newline='') as f:
    f.write(text)