My teacher asked for the original text of each reference to be included in the experiment report, so I put this script together. (Homework is the best motivation.)
Python's PDF tooling is quite complete; this time I use pdfkit, the Python interface to wkhtmltopdf.
Because it basically runs without reporting errors, I wrote it as a command-line tool.
Only one URL can be processed per run.
I wrote it for my own needs and am posting it on the blog only as a reference for ideas; please modify it to suit your own needs~
#!/usr/bin/python
#@Author: zhongshsh
import requests
from bs4 import BeautifulSoup, NavigableString
import urllib
import pdfkit
import sys
# Fetch the web page content
def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never mistaken for the article.
        requests.exceptions.RequestException: On connection problems or
            when the timeout expires.
    """
    # Adjacent string literals are used instead of a backslash continuation
    # inside the literal, which would embed the next line's leading
    # whitespace in the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/52.0.2743.116 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.text
# Remove hyperlinks but keep the content inside the tags
def strip_tags(html, invalid_tags):
    """Remove the given tags from *html* while keeping what they contain.

    Args:
        html: HTML markup to clean, as a string.
        invalid_tags: Container of tag names (e.g. ``['a']``) to strip.
            Membership is tested with ``tag.name in invalid_tags``.

    Returns:
        The parsed ``BeautifulSoup`` document with every matching tag
        replaced by its own children.
    """
    soup = BeautifulSoup(html, 'lxml')
    # find_all() returns a materialised list, so modifying the tree while
    # walking it is safe.
    for tag in soup.find_all(True):
        if tag.name in invalid_tags:
            # unwrap() splices the children into the parent as real nodes.
            # Replacing the tag with a *string* instead would insert it as
            # text, so nested markup (e.g. an <img> inside an <a>) would be
            # entity-escaped in the output.
            tag.unwrap()
    return soup
# Remove some tags
def strip_ct(
    soup,
    classes=("article-bar-top", "href-article-edit slide-toggle", "person-messagebox"),
):
    """Detach every element carrying one of the given CSS classes.

    Args:
        soup: A BeautifulSoup document or tag; it is modified in place.
            It is called as ``soup(class_=...)`` to find the elements.
        classes: Values passed one at a time as ``class_``. The default
            lists the three page elements this script removes.

    Returns:
        The same *soup* object, after the matching elements were extracted.
    """
    for class_name in classes:
        # A plain loop: extract() is called purely for its side effect.
        for node in soup(class_=class_name):
            node.extract()
    return soup
# Filter the page content
def get_main(html):
    """Extract the article body from a page and return it as an HTML string.

    Finds the element whose class is ``blog-content-box``, removes the
    elements handled by ``strip_ct`` and strips ``<a>`` tags (keeping their
    content) with ``strip_tags``.

    Args:
        html: Full page markup as a string.

    Returns:
        The cleaned article markup as a string.

    Raises:
        ValueError: If the page has no ``blog-content-box`` element.
    """
    soup = BeautifulSoup(html, 'lxml')
    content = soup.find(attrs={'class': "blog-content-box"})
    if content is None:
        # Fail with a clear message rather than the opaque
        # "'NoneType' object is not callable" raised inside strip_ct.
        raise ValueError(
            'no element with class "blog-content-box" found in the page')
    cleaned = strip_tags(str(strip_ct(content)), ['a'])
    return str(cleaned)
# Generate the PDF
def html_pdf(html, output_path='data.pdf',
             wkhtmltopdf_path=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'):
    """Render an HTML string to a PDF file through pdfkit/wkhtmltopdf.

    Args:
        html: Markup to render.
        output_path: Destination PDF file.
        wkhtmltopdf_path: Location of the wkhtmltopdf executable. If it
            cannot be opened, pdfkit's own lookup is used instead.

    Raises:
        OSError: If no wkhtmltopdf executable can be located at all, or if
            wkhtmltopdf itself reports a failure.
    """
    options = {
        'page-size': 'A4',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [('Accept-Encoding', 'gzip')],
        # NOTE(review): these cookie names/values look like placeholders;
        # confirm whether they are needed at all.
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'no-outline': None,
    }
    try:
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    except OSError:
        # The explicit path is not usable (e.g. not on Windows or installed
        # elsewhere): let pdfkit search for the executable itself.
        config = pdfkit.configuration()
    pdfkit.from_string(html, output_path, options=options, configuration=config)


def main(argv=None):
    """Command-line entry point: convert the page at ``argv[1]`` to data.pdf.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 when no URL was supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s URL\n' % (argv[0] if argv else 'prog'))
        return 2
    url = argv[1]
    # Create/truncate the output file before downloading anything;
    # presumably so an unwritable or locked data.pdf fails early (confirm).
    with open('data.pdf', 'w') as f:
        f.write('')
    # Example URL: https://blog.csdn.net/u013803499/article/details/82877993
    html_pdf(get_main(get_html(url)))
    return 0


if __name__ == '__main__':
    sys.exit(main())