Using Python to fetch WeChat official account articles and save them as PDF files

As the title suggests, this post is divided into two parts:

  1. Fetch the articles of a WeChat official account
  2. Save each article as a PDF file

First, fetching the articles of an official account

This uses the third-party module `wechatsogou`, which is very simple to use: given the name of an official account, it returns the account's information and the list of its recent articles.

import wechatsogou
import os, pdfkit, time

# Create the API client that every call below goes through
ws_api = wechatsogou.WechatSogouAPI()

# Name of the official account
gzh_name = 'xxx'
# Information about the official account
gzh_info = ws_api.get_gzh_info(gzh_name)
# Search for articles matching the account name
searched_articles = ws_api.search_article(gzh_name)
# Recent articles of the official account
article_ls = ws_api.get_gzh_article_by_history(gzh_name)['article']
# Hot articles from the home page
# NOTE(review): wechatsogou appears to expect a hot-index constant here
# (e.g. WechatSogouConst.hot_index.food) rather than an account name -
# confirm against the library documentation before relying on this call.
hot_articles = ws_api.get_gzh_article_by_hot(gzh_name)

Second, converting a web page to a PDF file

The `pdfkit` module makes it easy to convert HTML to a PDF file:

import pdfkit

# NOTE: `config` is not defined in this snippet; the complete code below
# builds it with pdfkit.configuration(wkhtmltopdf=...). `html_path` stands
# for a URL, a local file path, or an HTML string depending on the call.

# Produce a PDF file from a page URL
pdfkit.from_url(html_path, save_path, configuration=config)
# Produce a PDF file from the path of a local HTML file
pdfkit.from_file(html_path, save_path, configuration=config)
# Produce a PDF file from a string of HTML content
pdfkit.from_string(html_path, save_path, configuration=config)

For detailed configuration and usage, please refer to my previous blog post.

Third, the complete code

import wechatsogou
import os, pdfkit, time

# Module-level WechatSogou API client; get_gzh_article() uses it to fetch
# the recent-article list of an official account.
ws_api =wechatsogou.WechatSogouAPI()

class Utils:
    '''
    Stateless helpers: directory creation, timestamp formatting,
    file-name sanitizing and HTML-to-PDF conversion through pdfkit.
    '''

    # Default install location of the wkhtmltopdf executable used by pdfkit.
    WKHTMLTOPDF_PATH = r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'

    # Values accepted by html_to_pdf() for its path_type argument.
    PATH_TYPES = ('url_path', 'file_path', 'string')

    # Characters replaced by '_' in file names: the ASCII / CJK punctuation
    # the original list covered, the remaining characters Windows forbids in
    # file names (* < > "), and the full-width variants of , ? | ; :
    _UNSAFE_CHARS = ',/?|\\。、;:' + '*<>"' + ',?|;:'
    _UNSAFE_TABLE = str.maketrans(_UNSAFE_CHARS, '_' * len(_UNSAFE_CHARS))

    @staticmethod
    def select_path(path):
        '''
        Make sure the directory `path` exists, creating parents as needed.

        Calling it for a directory that already exists is a no-op.
        '''
        # exist_ok avoids the race between a separate exists() check and
        # the makedirs() call.
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def timeStamp_to_timeStr(timeStamp, time_format='%Y%m%d'):
        '''
        Convert a Unix timestamp to a formatted local-time string.

        params:
            timeStamp      seconds since the epoch
            time_format    time.strftime format, default '%Y%m%d'
        returns:
            the formatted time string
        '''
        return time.strftime(time_format, time.localtime(timeStamp))

    @staticmethod
    def format_file_name(file_name):
        '''
        Replace every character that is unsafe in a file name with '_'.

        params:
            file_name    raw text, e.g. an article title
        returns:
            the text with each unsafe character replaced by one '_'
        '''
        # One pass over the string instead of one replace() per character.
        return file_name.translate(Utils._UNSAFE_TABLE)

    @staticmethod
    def html_to_pdf(html_path, save_path, path_type='file_path',
                    wkhtmltopdf_path=None):
        '''
        Convert HTML to a PDF file with pdfkit.

        params:
            html_path           HTML file path / URL / HTML string
            save_path           where the PDF file is written
            path_type           kind of html_path: one of
                                ['url_path', 'file_path', 'string']
            wkhtmltopdf_path    location of the wkhtmltopdf executable;
                                defaults to Utils.WKHTMLTOPDF_PATH
        raises:
            ValueError    if path_type is not one of the accepted values
        '''
        # Reject an unknown type up front instead of silently doing nothing.
        if path_type not in Utils.PATH_TYPES:
            raise ValueError(
                'path_type must be one of %r, got %r'
                % (Utils.PATH_TYPES, path_type)
            )

        if wkhtmltopdf_path is None:
            wkhtmltopdf_path = Utils.WKHTMLTOPDF_PATH
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)

        if path_type == 'url_path':
            pdfkit.from_url(html_path, save_path, configuration=config)
        elif path_type == 'file_path':
            pdfkit.from_file(html_path, save_path, configuration=config)
        else:  # 'string'
            pdfkit.from_string(html_path, save_path, configuration=config)

def get_gzh_article(gzh_name):
    '''
    Save the recent articles of one official account as PDF files.

    The article list comes from ws_api.get_gzh_article_by_history(); each
    article is written to '公众号文章/<gzh_name>/<date>-<title>[-<author>].pdf'.
    Articles whose PDF file already exists are skipped.

    params:
        gzh_name    name of the official account
    '''
    article_ls = ws_api.get_gzh_article_by_history(gzh_name)['article']

    # The target directory depends only on the account, so create it once
    # instead of once per article.
    article_root_path = os.path.join('公众号文章', gzh_name)
    Utils.select_path(article_root_path)

    for article in article_ls:
        # 'datetime' is a timestamp; turn it into a date string for the name.
        date_str = Utils.timeStamp_to_timeStr(article['datetime'])
        title = Utils.format_file_name(article['title'])
        # The author becomes part of the file name too, so it needs the same
        # sanitizing as the title; otherwise a '/' or ':' breaks the path.
        author = Utils.format_file_name(article['author'])

        name_parts = [date_str, title]
        if author != '':
            name_parts.append(author)
        file_name = '-'.join(name_parts) + '.pdf'

        save_path = os.path.join(article_root_path, file_name)
        # Checking the single target file is enough; no need to list the
        # whole directory for every article.
        if os.path.exists(save_path):
            continue

        print(file_name + '-- start')
        Utils.html_to_pdf(article['content_url'], save_path, 'url_path')
        print(file_name + '-- end')

if __name__ == '__main__':
    # Archive the recent articles of each listed official account in turn.
    for account_name in ('公众号1', '公众号2'):
        get_gzh_article(account_name)

Directory structure is as follows:

root --- main.py
     |-- 公众号文章  --- 公众号1
                       |-- 文章1
                       |-- 文章2
                       |-- ...
                   |-- 公众号2
                       |-- 文章1
                       |-- 文章2
                       |-- ...

Guess you like

Origin blog.csdn.net/weixin_42902669/article/details/92389873