Python: Get the content of a CSDN web page and output it as a PDF

My teacher asked for the original text of each reference to be included in the experiment report, so I put this script together. (Homework is the best motivation.)

Python's PDF tooling is quite complete; this script uses the wkhtmltopdf interface (via pdfkit).

Because it runs with basically no errors, it is written as a command-line script.

Only one URL can be processed at a time

I wrote it for my own needs and am posting it on the blog only so that others can use it as a reference; modify it to suit your own needs~

#!/usr/bin/python
#@Author: zhongshsh

import requests
from bs4 import BeautifulSoup, NavigableString
import urllib
import pdfkit
import sys

# Fetch the page content
def get_html(url, timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The decoded body of the HTTP response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status (4xx/5xx).
    """
    # Adjacent string literals keep the header value on one clean line; a
    # backslash continuation inside the literal would embed the source
    # indentation in the header.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/52.0.2743.116 Safari/537.36'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here rather than handing an error page to the HTML filter.
    response.raise_for_status()
    return response.text

# Remove hyperlinks while keeping the content inside the tags
def strip_tags(html, invalid_tags):
    """Parse *html* and remove the listed tags but keep what they contain.

    Each tag whose name is in *invalid_tags* is replaced by its own children,
    so nested markup (for example an image inside a link) stays real markup
    instead of being flattened to a string and HTML-escaped on output.

    Args:
        html: HTML source text.
        invalid_tags: Container of tag names to remove, e.g. ``['a']``.

    Returns:
        The parsed ``BeautifulSoup`` document with those tags unwrapped.
    """
    soup = BeautifulSoup(html, 'lxml')
    # find_all() returns a list built up front, so editing the tree while
    # looping over it is safe.
    for tag in soup.find_all(True):
        if tag.name in invalid_tags:
            tag.unwrap()
    return soup

# Remove some tags
def strip_ct(soup):
    """Extract every element carrying one of a fixed set of CSS classes.

    Args:
        soup: Parsed document or element; it is called as
            ``soup(class_=...)`` to look up the matching elements.

    Returns:
        The same *soup* object, after ``extract()`` has been called on each
        matching element.
    """
    unwanted_classes = (
        "article-bar-top",
        "href-article-edit slide-toggle",
        "person-messagebox",
    )
    for css_class in unwanted_classes:
        for node in soup(class_=css_class):
            node.extract()
    return soup

# Filter the page down to the article content
def get_main(html):
    """Return the article portion of a page as an HTML string.

    Locates the element with class ``blog-content-box``, passes it through
    ``strip_ct`` and then removes ``<a>`` tags with ``strip_tags``.

    Args:
        html: Full HTML source of the page.

    Returns:
        The filtered article HTML as a string.

    Raises:
        ValueError: If the page has no ``blog-content-box`` element.
    """
    soup = BeautifulSoup(html, 'lxml')
    article = soup.find(attrs={'class': "blog-content-box"})
    if article is None:
        # find() returns None on a miss; report that directly instead of
        # failing later with "'NoneType' object is not callable".
        raise ValueError(
            'no element with class "blog-content-box" found in the page')
    cleaned = strip_tags(str(strip_ct(article)), ['a'])
    return str(cleaned)

# Generate the PDF
def html_pdf(html, output_path='data.pdf', wkhtmltopdf_path=None):
    """Render an HTML string to a PDF file with pdfkit/wkhtmltopdf.

    Args:
        html: HTML source to render.
        output_path: Destination file; defaults to ``data.pdf`` in the
            current working directory.
        wkhtmltopdf_path: Location of the wkhtmltopdf executable. When
            omitted, the default Windows install location is used if it
            exists; otherwise pdfkit is left to locate the executable itself.
    """
    import os

    default_windows_path = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    if wkhtmltopdf_path is None and os.path.isfile(default_windows_path):
        wkhtmltopdf_path = default_windows_path

    # The placeholder 'cookie' entries from the pdfkit README sample are
    # intentionally not passed: they carried no real values.
    options = {
        'page-size': 'A4',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip'),
        ],
        # A None value makes pdfkit emit the bare flag with no argument.
        'no-outline': None,
    }

    if wkhtmltopdf_path:
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
    else:
        # No explicit path: pdfkit searches for the executable on its own.
        config = pdfkit.configuration()
    pdfkit.from_string(html, output_path, options=options, configuration=config)


if __name__ == '__main__':
    # One article URL is expected as the first command-line argument.
    if len(sys.argv) < 2:
        sys.exit('usage: python {} <article-url>'.format(sys.argv[0]))
    # data.pdf is not pre-truncated here: pdfkit writes the output file
    # itself, and an early truncate would leave an empty, invalid PDF behind
    # if fetching or parsing failed.
    html_pdf(get_main(get_html(sys.argv[1])))

Partial screenshot of the result

Insert picture description here

Guess you like

Origin blog.csdn.net/MaoziYa/article/details/105896212