BeautifulSoup抓取html内容并输出Markdown

目的

抓取网页（html）中的书籍内容，并按照Markdown语法生成Markdown文档。本文按照Typora（Markdown编辑器）支持的Markdown语法输出，生成的文档可以直接用Typora打开。

系统环境

ubuntu16.04， python3.9

方案

爬虫工具比较

BeautifulSoup4 + requests
优点：简单，api简单，适合入门
缺点：抓取速度比较慢，因为不是异步抓取；如果讲究效率，可以自己实现异步抓取。
Scrapy
优点：抓取速度快，有一整套框架
缺点：复杂，门槛较高，需要点时间入门
html2text
使用了一下，虽然用起来很简单，比较傻瓜式，但是很有局限性，不能按照自己的意愿输出成Markdown文档，感觉用起来不如BeautifulSoup4爽。

参考：
pip install BeautifulSoup4
pip install fake-useragent
pip install requests

遇到的困难

在抓取过程中，文本内容很容易抓取，但是图片不能直接抓取，会出现403错误。
不过，通过分析headers信息可以知道原因：抓取图片时需要的请求头与抓取页面时不一样，例如 'Accept': 'image/webp, */*'。

# -*- coding:utf-8 -*-

import html2text
import requests
from bs4 import BeautifulSoup
import uuid
from fake_useragent import UserAgent
import os

ua = UserAgent()

# Headers for fetching the HTML chapter pages: only a Chrome User-Agent
# string (supplied by fake_useragent) is sent.
headers = {
    "User-Agent": ua.chrome,
}

# Headers for fetching images. The accompanying notes report that image
# requests answer 403 unless image-style headers (e.g. the Accept value
# below) are sent, so this set mimics a mobile browser loading an <img>.
img_headers = {
    'Host': 'img.dushu.com',
    'User-Agent': 'Mozilla/5.0 (iPhone CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/12.0 Mobile/15A372 Safari/604.1',
    'Accept': 'image/webp, */*',
    # Quality values must be attached with ";q=" to be valid HTTP syntax.
    'Accept-Language': 'zh-CN, zh;q=0.8, zh-TW;q=0.7, zh-HK;q=0.5, en-US;q=0.3, en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Referer': 'https://m.dushu.com/showbook/113574/1412792.html',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
}

# Cookie values captured from a browser session. Not referenced by the code
# visible in this file; kept in case other code uses them.
cookies = {
    '__gads': "ID=f5d687364432864d-2285bab46db80007:T=1606648841:RT=1606648841:S=ALNI_MYS8QpBvJ7f8bM2PWydyfr1bEn7Ig",
    'Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d': "1606658490",
    'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d': "1606648421",
}


class m_doshu_com(object):
    """Scrape consecutive chapter pages from https://m.dushu.com into Markdown.

    Every page contributes a ``## <title>`` heading followed by its
    paragraphs; images found inside paragraphs are downloaded into the
    ``assets`` directory and linked with ``![](assets/<name>)``.
    """

    def __init__(self, url="https://m.dushu.com/showbook/113574/1412792.html", page_begin=1412792, page_end=1412816, filename='test.md', timeout=10):
        """Store the scraping range and open the output file for writing.

        Args:
            url: URL of any page of the book; its directory part is reused
                to build the URL of every page in the range.
            page_begin: numeric id of the first page to fetch (inclusive).
            page_end: numeric id of the last page to fetch (inclusive).
            filename: Markdown file to create (truncated if it exists).
            timeout: seconds to wait for each HTTP request.
        """
        # Honour the caller's url instead of silently replacing it.
        self.url = url
        self.page_begin = page_begin
        self.page_end = page_end
        self.filename = filename
        self.timeout = timeout
        # Explicit UTF-8 so Chinese text is written correctly whatever the
        # platform's default locale encoding is.
        self.fp_file = open(os.path.join("./", self.filename), 'w', encoding='utf-8')

    def _page_url(self, index):
        """Return the URL of page ``index`` in the same directory as ``self.url``."""
        base = self.url.rsplit('/', 1)[0]
        return '{}/{}.html'.format(base, index)

    def _save_image(self, img_src, page_url):
        """Download one image into ``assets`` and append its Markdown link.

        Failures are reported on stdout and skipped so that a single broken
        image does not abort the whole book.
        """
        # Local import: keeps this class self-contained without touching the
        # file's import block.
        from urllib.parse import urljoin

        if not img_src:
            print('!!! img({}) not ok'.format(img_src))
            return
        # Resolve relative and protocol-relative sources against the page.
        img_url = urljoin(page_url, img_src)
        try:
            img_response = requests.get(url=img_url, headers=img_headers, timeout=self.timeout)
        except requests.RequestException as err:
            print('!!! img({}) not ok: {}'.format(img_url, err))
            return
        if img_response.status_code != 200:
            print('!!! img({}) not ok'.format(img_url))
            return

        # Random name: the original file name is not preserved.
        file_name = str(uuid.uuid4()) + '.jpg'
        os.makedirs('assets', exist_ok=True)
        with open(os.path.join('assets', file_name), 'wb') as fp:
            fp.write(img_response.content)
        print('![](assets/{})'.format(file_name))
        self.fp_file.write('![](assets/{})\n\n'.format(file_name))

    def m_doshu_com_write_1_page(self, url="https://m.dushu.com/showbook/113574/1412792.html"):
        """Fetch one page with BeautifulSoup and append it to the output file.

        Args:
            url: address of the page to fetch.

        Returns:
            The HTTP status code of the page request. When the page lacks
            the expected title or text container nothing is written and a
            warning is printed instead.
        """
        response = requests.get(url, headers=headers, timeout=self.timeout)
        # Let requests guess the charset from the body before decoding.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        detail = soup.find('div', class_="article-detail")
        heading = detail.find('h1') if detail is not None else None
        text = soup.find('div', class_="text")
        if heading is None or text is None:
            # Error pages or a changed layout: skip instead of IndexError.
            print('!!! page({}) has no article content (status {})'.format(url, response.status_code))
            return response.status_code

        title = heading.get_text()
        print(title)
        self.fp_file.write('## {}\n\n'.format(title))
        print("--------")
        for p in text.find_all('p'):
            paragraph = p.get_text()
            print(paragraph)
            self.fp_file.write('{}\n\n'.format(paragraph))
            img = p.find('img')
            if img is not None:
                self._save_image(img.get('src'), url)
            print('--')
        return response.status_code

    def generate(self):
        """Fetch every page from ``page_begin`` to ``page_end`` inclusive.

        The output file is closed when this method returns or raises, so an
        instance can only be generated once.
        """
        try:
            for index in range(self.page_begin, self.page_end + 1):
                page_url = self._page_url(index)
                self.m_doshu_com_write_1_page(url=page_url)
                print(page_url)
        finally:
            # Always release the file handle, even if a page fails.
            self.fp_file.close()


if __name__ == '__main__':
    # Script entry point: scrape pages 1412787..1412816 into test.md.
    m_doshu_com(
        url="https://m.dushu.com/showbook/113574/1412792.html",
        page_begin=1412787,
        page_end=1412816,
        filename='test.md',
    ).generate()

BeautifulSoup抓取html内容并输出Markdown

目的

系统环境

方案

遇到的困难

猜你喜欢