# spider -- crawls a multi-level, multi-page article/book site
#
# Copyright notice: original work by @jerry
# https://blog.csdn.net/qq_42938842/article/details/83591144
#
# The code follows:

import os
import urllib.request
import time
from bs4 import BeautifulSoup


def get_request(url):
    """Build a urllib Request for *url* with a desktop-Chrome User-Agent.

    The spoofed UA header keeps the target site from rejecting the
    script as a bot.
    """
    ua = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    return urllib.request.Request(url=url, headers={'User-Agent': ua})


def soup_content(content):
    """Parse an HTML document string into a BeautifulSoup tree (lxml backend)."""
    return BeautifulSoup(content, 'lxml')


def url_content(request):
    """Fetch *request* and return the response body decoded as UTF-8 text."""
    response = urllib.request.urlopen(request)
    return response.read().decode('utf8')


def main():
    """Crawl the index page, walking category -> article -> pagination,
    and save each article's paragraphs as a text file in a user-named folder.

    Interactive: prompts for one destination folder per category.
    """
    url = 'https://m.feiyanqing.com/huahuo/'
    request = get_request(url)
    content = url_content(request)
    soup = soup_content(content)

    # Each <ul class="picli fix"> is one top-level category block.
    ret = soup.select('ul[class="picli fix"]')

    for i in ret:
        # Second level: the category's article-listing page.
        url = 'http://m.feiyanqing.com' + i.select('a')[0].attrs['href']
        request = get_request(url)
        content = url_content(request)
        soup = soup_content(content)
        result = soup.select('h3 > a')
        dirname = input('输入文件夹名称: ')
        for http in result:
            url = 'http://m.feiyanqing.com' + http.attrs['href']
            request = get_request(url)
            content = url_content(request)
            soup = soup_content(content)
            # '.zzt > li > b' elements carry the pagination info; absent
            # means the article fits on a single page.
            pagelink = soup.select('.zzt > li > b')
            if not pagelink:
                _save_article(soup, dirname)
            else:
                # BUG FIX: the original used `elif len(pagelink) >= 2`,
                # silently skipping any article where exactly one <b>
                # was found. The first <b> holds the total page count.
                num = int(pagelink[0].text)
                for page in range(1, num + 1):
                    if page == 1:
                        # Page 1 is the article URL itself; later pages
                        # insert '_<page>' before the extension.
                        page_url = url
                    else:
                        page_url = 'http://m.feiyanqing.com' + str(http.attrs['href']).replace('.', '_%s.' % page)
                    request = get_request(page_url)
                    content = url_content(request)
                    soup = soup_content(content)
                    _save_article(soup, dirname)


def _save_article(soup, dirname):
    """Write one parsed article page to <dirname>/<title>, one paragraph per line.

    Creates *dirname* on demand and sleeps briefly afterwards to be
    polite to the server.
    """
    filename = soup.select('.boxcon > .wzbt')[0].text
    # exist_ok avoids the original's check-then-mkdir race.
    os.makedirs(dirname, exist_ok=True)
    filepath = os.path.join(dirname, filename)
    print('正在下载%s......' % filename)
    # Context manager fixes the original's handle leak if a write raised.
    with open(filepath, 'w', encoding='utf8') as fp:
        for paragraph in soup.select('.zw > p'):
            fp.write(paragraph.text + '\n')
    time.sleep(2)
    print('下载完成...')


#

#

if __name__ == '__main__':
    main()

# 猜你喜欢 (blog footer: "you may also like")
# Reposted from blog.csdn.net/qq_42938842/article/details/83591144