Python: Crawl Zhihu Column Articles and Generate PDFs

The crawled column articles are saved as PDFs that keep the original site's styling. The scripts rely on requests, beautifulsoup4 and pdfkit, and pdfkit in turn needs a local wkhtmltopdf installation (its path is configured in save_pdf below).

import io
import json
import os
import sys

import requests
from bs4 import BeautifulSoup

from Novel import headers  # request headers used for every request; defined in a separate module (not shown)
from articleUtils import *

# Re-wrap stdout so print() works on a GBK (Chinese Windows) console
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='GBK')

def __find_next_page(url):
    # Walk the column's article-list API page by page, render every article to HTML,
    # then merge everything into a single PDF
    htmls = []
    while True:
        article_result = json.loads(requests.get(url, headers=headers).content.decode())
        for d in article_result['data']:
            htmls.append(__open_article(d['id']))
        if article_result['paging']['is_end']:
            break
        # paging['next'] comes back without the "/api" path segment, so splice it in
        parts = article_result['paging']['next'].split("/")
        parts.insert(3, "api")
        url = '/'.join(parts)
    if htmls:
        save_pdf(htmls)
        print('Conversion finished!')
    else:
        print('Nothing to convert')
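
# For reference, the article-list API response is assumed (not captured here) to look roughly like:
#   {"data": [{"id": 26252318, ...}, ...],
#    "paging": {"is_end": false,
#               "next": "https://zhuanlan.zhihu.com/columns/<slug>/articles?...&offset=20"}}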


def __open_article(article_id):
    article_url = f'https://zhuanlan.zhihu.com/p/{article_id}'
    soup = BeautifulSoup(requests.get(article_url, headers=headers).content.decode(), 'html.parser')
    title = safe_file_name(soup.select_one(".Post-Title").text)
    author = safe_file_name(soup.select_one('.AuthorInfo>meta[itemprop="name"]').attrs['content'])
    content = soup.select_one('.Post-RichTextContainer').prettify()

    # Zhihu lazy-loads images, so the real URLs live in data-actualsrc / data-original rather than src
    return save_html(soup, content, title, author, './zhihu.css',
                     convert_img_attrs=['data-actualsrc', 'data-original'])


def down(url):
    # Download a single article, e.g. https://zhuanlan.zhihu.com/p/26252318
    article_id = url.split('/')[-1]
    html = __open_article(article_id)
    if html:
        # Write the PDF next to the generated HTML, with the same base name
        name = os.path.splitext(html)[0] + '.pdf'
        save_pdf(html, name)
        print('Conversion finished!')
    else:
        print('Nothing to convert')


def zhuanlan_down(s):
    # Build the column's article-list API URL from the column slug (the last path segment),
    # e.g. 'crossin' for https://zhuanlan.zhihu.com/crossin
    column = str(s).strip().split('/')[-1]
    url = ('https://zhuanlan.zhihu.com/api/columns/{}/articles'
           '?data%5B*%5D.upvoted_followees%2Cadmin_closed_comment&limit=10&offset=10').format(column)
    __find_next_page(url)


if __name__ == '__main__':
    zhuanlan_down('https://zhuanlan.zhihu.com/crossin')

    down('https://zhuanlan.zhihu.com/p/26252318')
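
The headers dict used for every request is imported from a separate Novel module that is not included in the post. A minimal stand-in, assuming a plain desktop browser User-Agent is enough (add a Cookie header if Zhihu asks for login), could look like this:

# Novel.py -- hypothetical stand-in; the original module is not included in the post,
# so the values below are assumptions rather than the author's actual headers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}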

articleUtils.py

import io
import os
import re
import base64
import sys
import requests

import pdfkit

# Re-wrap stdout so print() always emits UTF-8 regardless of the console code page
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='UTF-8')


def safe_file_name(file_name):
    # Strip characters that are not allowed in Windows file names
    return re.sub(r'[\\/:*?"<>|]', "", file_name)


def get_image_file_as_base64_data(img_src):
    # Download the image and inline it as a base64 data URI so the HTML/PDF is self-contained
    if img_src:
        data = base64.b64encode(requests.get(img_src).content).decode('utf-8')
        return f'data:image/jpeg;base64,{data}'


def regex_str(s):
    # Escape the regex metacharacters '.' and '+' so the string can be used inside a pattern
    for r in [r'\.', r'\+']:
        s = re.sub(r, r, s)
    return s


def convert_img_tag(content, soup, attrs):
    # Replace every <img> tag's src with an inlined base64 data URI.
    # 'attrs' lists the attributes to try, in order, for the real image URL
    # (e.g. data-actualsrc / data-original for lazy-loaded Zhihu images).
    img_tags = soup.select('img')
    for img_tag in img_tags:
        if not img_tag or not img_tag.attrs:
            continue
        img_src = None

        old_tag = img_tag.prettify()
        for attr in attrs:
            if attr in img_tag.attrs and img_tag.attrs[attr]:
                img_src = img_tag.attrs[attr]
                break
        if img_src:
            # Protocol-relative URLs ("//pic1.zhimg.com/...") need a scheme before downloading
            if img_src.startswith("//"):
                img_src = "http:" + img_src
            img_tag.attrs['src'] = get_image_file_as_base64_data(img_src)
            content = content.replace(old_tag, img_tag.prettify())
    return content
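
# Illustration of the pass above (the attribute values are made-up examples, not from a real page):
#   before: <img data-actualsrc="//pic1.zhimg.com/v2-abc.jpg" src="placeholder.svg">
#   after:  <img src="data:image/jpeg;base64,/9j/4AAQ..." data-actualsrc="//pic1.zhimg.com/v2-abc.jpg">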


def save_file(content, title, user_name):
    # Save the rendered HTML as ./<author>/<title>.html; existing files are left untouched
    path = './%s' % user_name
    if not os.path.exists(path):
        os.mkdir(path)
    file = '%s/%s.html' % (path, title)
    if not os.path.exists(file):
        with open(file, 'w', encoding='utf-8') as f:
            f.write(content)
    return file



def html_convert(soup, content, title, author, css_file_path=None, convert_img_attrs=None):
    # Fall back to the plain src attribute when no lazy-load attributes are supplied
    convert_img_attrs = convert_img_attrs or ['src']
    content = convert_img_tag(content, soup, attrs=convert_img_attrs)
    html = f"""
            <!DOCTYPE html>
            <html lang="en">
            <head>
                <meta charset="UTF-8">
                <title>{title} {author}</title>
            </head>
            <body style="max-width:700px;margin: 0 auto;background-color:#fff!important;">
            <article class="Post-Main Post-NormalMain">
                <header class="Post-Header">
                    <h1 class="Post-Title" style="font-size: 22px;line-height: 1.4;margin-bottom: 14px;">{title}</h1>
                </header>
                {content}
            </article>
            </body>
            <style type="text/css">
                body{{
                    background-color:#fff;
                }}
            </style>
            
            {__get_css(css_file_path)}
            
            
            <style type="text/css">
                .md-toc {{
                    position: fixed;
                    height: 100%;
                    left: 0;
                    top: 0;
                    bottom: 0;
                    margin-left: 10em;
                    overflow: scroll;
                    border-bottom: 1px solid rgb(221, 221, 216);
                    box-sizing: border-box;
                    padding: 0px 20px 20px 0px;
                    width: 25em;
                }}
            </style>
            </html>
        """
    return html



def save_html(soup, content, title, author, css_file_path=None, convert_img_attrs=None):
    html = html_convert(soup, content, title, author, css_file_path, convert_img_attrs)
    return save_file(html, title, author)



def __get_css(css_file_path):
    # Inline the stylesheet (e.g. ./zhihu.css) into the generated HTML
    if css_file_path:
        with open(css_file_path, 'r', encoding='utf-8') as f2:
            return f'<style type="text/css">{f2.read()}</style>'
    else:
        return ''



def save_pdf(htmls, output_path='./out.pdf'):
    path_wk = r'D:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltopdf.exe'  # wkhtmltopdf install location
    config = pdfkit.configuration(wkhtmltopdf=path_wk)
    options = {
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ]
    }
    pdfkit.from_file(htmls, output_path, configuration=config, options=options)
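
pdfkit.from_file accepts either a single input file or a list of files, which is why save_pdf works both for a single article (down) and for a whole column, where all generated HTML pages are merged into one PDF. A quick sketch of both call styles (the file names are made up for the example):

# Single article: one HTML file in, one PDF out
save_pdf('./crossin/some-post.html', './crossin/some-post.pdf')

# Whole column: a list of HTML files merged into the default ./out.pdf
save_pdf(['./crossin/post-1.html', './crossin/post-2.html'])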

./zhihu.css is the stylesheet copied from the site.
The file has been uploaded here: https://download.csdn.net/download/mbh12333/11999306
If you need it, contact me; the file is too large to upload here.

Reposted from blog.csdn.net/mbh12333/article/details/103264866