# Novel scraper (小说爬取) — downloads chapters from quanshuwu.com into a text file.

import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent so the site serves normal pages to the scraper.
headers = {
   'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}

# Output file for the scraped chapters. Explicit UTF-8 is required: page
# content is decoded as UTF-8 in get_info(), so writing with the locale
# default encoding (e.g. cp936 on Windows) could raise UnicodeEncodeError.
f = open('E:/HFTX.txt', 'a+', encoding='utf-8')

# Chapter URLs: filled by get_urls(), consumed by the __main__ loop.
url_list = []

def get_urls(start_url):
    """Collect every chapter URL from the book's table-of-contents page.

    Appends absolute chapter URLs to the module-level ``url_list``.

    :param start_url: URL of the book's index page (the chapter list).
    """
    wb_data = requests.get(start_url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('#readlist > ul > li > a'):
        # urljoin handles relative, root-relative and absolute hrefs;
        # the old plain concatenation produced 'http://host//...' for
        # hrefs that start with '/'.
        url_list.append(urljoin('http://www.quanshuwu.com/', link.get('href')))

def get_info(url):
    """Download one chapter page and append its paragraph text to ``f``.

    :param url: absolute chapter URL.
    Side effect: writes to the module-level file handle ``f``.
    Non-200 responses are skipped silently (best-effort scraping).
    """
    res = requests.get(url, headers=headers)
    if res.status_code != 200:
        return  # skip chapters that failed to load
    # Pages are UTF-8; decode leniently, then pull <p>...</p> bodies with a
    # DOTALL regex so paragraphs spanning multiple lines are captured too.
    html = res.content.decode('utf-8', errors='ignore')
    for paragraph in re.findall(r'<p>(.*?)</p>', html, re.S):
        try:
            f.write(paragraph + '\n')
        except (OSError, UnicodeError):
            # Narrowed from a bare except (which also swallowed
            # KeyboardInterrupt/SystemExit): only I/O and encoding
            # failures are expected here; keep the best-effort behavior.
            print('error')

if __name__ == '__main__':
    # Entry point: gather every chapter URL, then fetch the chapters one by
    # one, sleeping between requests to throttle load on the server.
    start_url = 'http://www.quanshuwu.com/book/2039.aspx'
    get_urls(start_url)
    try:
        for url in url_list:
            get_info(url)
            time.sleep(1)  # polite 1-request-per-second crawl rate
    finally:
        # Previously close() was unconditional code after the loop and was
        # skipped if any fetch raised, risking lost buffered output.
        f.close()

# Adapted from: blog.csdn.net/qq_42052864/article/details/80737990