Scraping Biquge (笔趣阁): completed books (unfinished)

import os
import shutil  # only needed if the commented-out rmtree branch is re-enabled
import sqlite3
import time
from datetime import datetime
from threading import Lock, Thread

import requests
from bs4 import BeautifulSoup

def fun_makedir(file_path):
    """
    Create the directory (if needed) and change into it.
    Note: os.chdir() changes the working directory for the whole
    process, so call this before starting any worker threads.
    """
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    # else:
    #     shutil.rmtree(file_path)
    #     os.makedirs(file_path)
    os.chdir(file_path)

def get_book(url):
    """
    Scrape the completed-books listing page.
    :param url: listing page URL
    :return: list of [book_name, book_page_url] pairs
    """
    books = []
    response = requests.get(url, headers=headers)
    response.encoding = "gbk"  # the site serves GBK-encoded pages
    soup = BeautifulSoup(response.text, "html.parser")
    book_txts = soup.find('div', class_='r').findAll('li')
    for book in book_txts:
        book_url = book.find('a')['href']
        book_name = book.find('a').get_text()
        print("{:<40s}{:<60s}".format(book_name, book_url))
        books.append([book_name, book_url])
    return books
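
The listing URL below ends in 1_1, which looks like the first page of a paginated index. A minimal sketch for walking several pages, assuming the 1_<n> pattern holds (get_books_paged and the page count are illustrative, not from the original post):

def get_books_paged(max_pages=5):
    """Hypothetical helper: crawl the first few listing pages."""
    all_books = []
    for page in range(1, max_pages + 1):
        page_url = "http://www.biquge.tv/wanben/1_{}".format(page)
        all_books.extend(get_book(page_url))
    return all_books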

def get_chapter(book_url):
    """
    Scrape a book's page and return its chapter list.
    :param book_url: book page URL
    :return: list of [chapter_name, chapter_url] pairs
    """
    chapters = []
    chapter_res = requests.get(book_url, headers=headers)
    chapter_res.encoding = 'gbk'
    chapter_soup = BeautifulSoup(chapter_res.text, "html.parser")
    chs = chapter_soup.find('div', id="list").findAll('dd')
    # print(chs[9])
    # print(len(chs))  # 854
    # Skip the first 9 <dd> entries, which appear to be the "latest
    # chapters" links repeated above the full table of contents.
    for i in range(9, len(chs)):
        chapter = chs[i].find('a')
        chapter_url = "http://www.biquge.tv" + chapter['href']
        chapter_name = chapter.get_text()
        chapters.append([chapter_name, chapter_url])
        print(chapter_name, chapter_url)
    return chapters
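
get_chapter hard-codes the host when building chapter_url. The stdlib urljoin can derive the absolute URL from the page address instead; a small alternative, assuming the hrefs are site-relative as above:

from urllib.parse import urljoin

# Equivalent to "http://www.biquge.tv" + chapter['href'] for
# site-relative hrefs, but survives a change of host or scheme.
chapter_url = urljoin(book_url, chapter['href'])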

def create_table_book(table_id):
    """Create one chapter table per book (bqg_book_<id>)."""
    conn = sqlite3.connect(dbname)
    cursor = conn.cursor()
    # "if not exists" makes this safe to call once per book, even
    # when the database file already holds tables for other books.
    cursor.execute("create table if not exists bqg_book_" + str(table_id) +
                   "(chapter_id int, chapter_name varchar(20), " +
                   "chapter_text varchar(10000), chapter_url varchar(60))")
    conn.commit()
    conn.close()

def main():
    start = datetime.now()

    url = "http://www.biquge.tv/wanben/1_1"
    book_urls = get_book(url)

    # One worker thread per book
    threads = []
    i = 0
    for item in book_urls:
        i += 1
        t = Thread(target=down_book_todb, args=(i, item[0], item[1]))
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print("\nScraped {} books in total".format(count))
    run_time = (datetime.now() - start).total_seconds()
    print("Total time: {} seconds".format(run_time))

def save_chapter_todb(chapter_id, chapter_name, chapter_url):
    """Download one chapter page, extract its text, and store it."""
    down_chapter_res = requests.get(chapter_url, headers=headers)
    down_chapter_res.encoding = 'gbk'
    down_chapter_soup = BeautifulSoup(down_chapter_res.text, "html.parser")
    chapter_text = down_chapter_soup.find('div', id="content")
    # Extract the plain text from the HTML node
    chapter_text = chapter_text.text
    # Drop blank lines
    chapter_text = "".join([s for s in chapter_text.splitlines(True) if s.strip()])
    # print(chapter_text)
    save_db(chapter_id, chapter_name, chapter_text, chapter_url)

def save_db(chapter_id, chapter_name, chapter_text, chapter_url):
    # Left unimplemented in the original post; one possible
    # implementation is sketched below.
    pass
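
A minimal sketch of what save_db could look like, assuming the per-book tables from create_table_book; the table_id parameter is an addition here, since the original signature does not say which book a chapter belongs to. Opening a fresh connection per call keeps sqlite3 safe across threads:

def save_db_sketch(table_id, chapter_id, chapter_name, chapter_text, chapter_url):
    """Hypothetical implementation: insert one chapter row."""
    conn = sqlite3.connect(dbname)  # fresh connection per call: thread-safe
    try:
        conn.execute(
            "insert into bqg_book_{} values (?, ?, ?, ?)".format(table_id),
            (chapter_id, chapter_name, chapter_text, chapter_url))
        conn.commit()
    finally:
        conn.close()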

def down_book_todb(i, book_name, book_url):
    """Worker: fetch one book's chapter list and save every chapter."""
    global count
    with count_lock:  # a bare `count = count + 1` is not atomic across threads
        count += 1
    chapters = get_chapter(book_url)
    j = 0
    for item in chapters:
        j += 1
        save_chapter_todb(j, item[0], item[1])

Program entry point

if __name__ == '__main__':
    category = "笔趣阁"
    save_path = os.getcwd() + '/down/' + category
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}

    dbname = "笔趣阁小说_" + time.strftime("%Y-%m-%d", time.localtime()) + ".sqlite"
    count = 0
    count_lock = Lock()  # guards the shared book counter

    # Create the download directory and change into it
    fun_makedir(save_path)

    # Full crawl:
    # main()
    # book_url = "http://www.biquge.tv/7_7325/"
    # get_chapter(book_url)

    # Single-chapter test run ("第一章 穿越" = "Chapter 1: Transmigration")
    chapter_name = "第一章 穿越"
    chapter_url = "http://www.biquge.tv/7_7325/2823434.html"
    save_chapter_todb(1, chapter_name, chapter_url)


Reposted from www.cnblogs.com/yuexiao/p/12823899.html