As the title suggests, this post is divided into two parts:
- Retrieving the articles of a WeChat official account
- Saving each article as a PDF file
First, retrieving the articles of an official account
This step uses the third-party wechatsogou module.
The module is very simple to use: given the name of an official account, it returns the account's information and the list of its recent articles.
import wechatsogou
import os, pdfkit, time
# Client for the Sogou WeChat search API; every call below goes through it.
ws_api = wechatsogou.WechatSogouAPI()
# Name of the official account
gzh_name = 'xxx'
# Information about the official account
gzh_info = ws_api.get_gzh_info(gzh_name)
# Search for articles matching the account name
searched_articles = ws_api.search_article(gzh_name)
# Recent articles of the official account
article_ls = ws_api.get_gzh_article_by_history(gzh_name)['article']
# Hot articles from the home page
# NOTE(review): the wechatsogou README appears to show this call taking a
# hot-index category constant rather than an account name - confirm.
hot_articles = ws_api.get_gzh_article_by_hot(gzh_name)
Second, converting a web page to a PDF file
The pdfkit module
makes it easy to convert HTML to a PDF file.
import pdfkit
# NOTE: `html_path`, `save_path` and `config` are not defined in this snippet;
# they must be supplied by the surrounding code before any of these calls run.
# Produce a PDF file from a page URL
pdfkit.from_url(html_path, save_path, configuration=config)
# Produce a PDF file from the path of a local HTML file
pdfkit.from_file(html_path, save_path, configuration=config)
# Produce a PDF file from a string of HTML content
pdfkit.from_string(html_path, save_path, configuration=config)
For detailed configuration and usage, see my previous blog post.
Third, the complete code
import wechatsogou
import os, pdfkit, time
# Module-level WechatSogouAPI client; get_gzh_article() below queries through it.
ws_api =wechatsogou.WechatSogouAPI()
class Utils:
    """Stateless helpers for directories, timestamps, file names and PDF export."""

    # wkhtmltopdf install location used when html_to_pdf() is not given one.
    _DEFAULT_WKHTMLTOPDF = r'D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'

    # Characters replaced by '_' in file names: the punctuation the original
    # list contained, the remaining characters Windows forbids in file names
    # (* " < >), and the full-width forms of the punctuation (the original
    # list held duplicate ASCII entries, presumably flattened full-width
    # characters - TODO confirm).
    _UNSAFE_CHARS = ',/?|\\;:*"<>' + ',。、?|;:'
    _UNSAFE_TABLE = str.maketrans(dict.fromkeys(_UNSAFE_CHARS, '_'))

    # Maps each accepted ``path_type`` to the pdfkit function that handles it.
    _CONVERTERS = {
        'url_path': 'from_url',
        'file_path': 'from_file',
        'string': 'from_string',
    }

    @staticmethod
    def select_path(path):
        """Create directory ``path`` (with any missing parents) if absent.

        Uses ``exist_ok=True`` instead of a separate existence check, so there
        is no window between the check and the creation. Raises
        ``FileExistsError`` if ``path`` exists but is not a directory.
        """
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def timeStamp_to_timeStr(timeStamp, time_format='%Y%m%d'):
        """Convert a Unix timestamp to a formatted local-time string.

        params:
            timeStamp    seconds since the epoch
            time_format  ``time.strftime`` format string
        returns:
            the timestamp rendered in local time using ``time_format``
        """
        return time.strftime(time_format, time.localtime(timeStamp))

    @staticmethod
    def format_file_name(file_name):
        """Return ``file_name`` with every unsafe character replaced by ``'_'``.

        The replacement is one-for-one, so the length of the name is kept.
        """
        return file_name.translate(Utils._UNSAFE_TABLE)

    @staticmethod
    def html_to_pdf(html_path, save_path, path_type='file_path',
                    wkhtmltopdf_path=None):
        """Convert HTML to a PDF file with pdfkit.

        params:
            html_path         HTML file path / URL / HTML string
            save_path         where the PDF file is written
            path_type         kind of ``html_path``:
                              'url_path', 'file_path' or 'string'
            wkhtmltopdf_path  location of the wkhtmltopdf executable;
                              defaults to the built-in Windows install path
        raises:
            ValueError  if ``path_type`` is not one of the accepted values
        """
        # Validate first: an unknown type used to be a silent no-op, which
        # made a missing PDF indistinguishable from a successful run.
        if path_type not in Utils._CONVERTERS:
            raise ValueError(
                'unknown path_type %r; expected one of %s'
                % (path_type, sorted(Utils._CONVERTERS))
            )
        if wkhtmltopdf_path is None:
            wkhtmltopdf_path = Utils._DEFAULT_WKHTMLTOPDF
        config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
        convert = getattr(pdfkit, Utils._CONVERTERS[path_type])
        convert(html_path, save_path, configuration=config)
def get_gzh_article(gzh_name):
    """Save the recent articles of official account ``gzh_name`` as PDF files.

    Each article is written to ``公众号文章/<gzh_name>/`` as
    ``<date>-<title>[-<author>].pdf``. Articles whose file name is already
    present in that directory are skipped.

    params:
        gzh_name  name of the official account to fetch
    """
    article_ls = ws_api.get_gzh_article_by_history(gzh_name)['article']
    if not article_ls:
        # Nothing to save; as before, no directory is created in this case.
        return

    # Create and list the target directory once instead of once per article.
    article_root_path = '公众号文章/' + gzh_name
    Utils.select_path(article_root_path)
    saved_names = set(os.listdir(article_root_path))

    for article in article_ls:
        date_str = Utils.timeStamp_to_timeStr(article['datetime'])
        title = Utils.format_file_name(article['title'])
        content_url = article['content_url']
        # The author ends up in the file name too, so it is sanitised the
        # same way as the title (a '/' would otherwise break the path).
        author = Utils.format_file_name(article['author'])

        name_parts = [date_str, title]
        if author != '':
            name_parts.append(author)
        file_name = '-'.join(name_parts) + '.pdf'

        if file_name in saved_names:
            continue

        save_path = article_root_path + '/' + file_name
        print(file_name + '-- start')
        Utils.html_to_pdf(content_url, save_path, 'url_path')
        print(file_name + '-- end')
        # Record the new file so a later article with the same name in this
        # run is skipped, as the per-iteration listing used to ensure.
        saved_names.add(file_name)
if __name__ == '__main__':
    # Official accounts whose recent articles are archived, one after another.
    gzh_ls = ['公众号1', '公众号2']
    for account_name in gzh_ls:
        get_gzh_article(account_name)
The resulting directory structure is as follows:
root --- main.py
|-- 公众号文章 --- 公众号1
|-- 文章1
|-- 文章2
|-- ...
|-- 公众号2
|-- 文章1
|-- 文章2
|-- ...