Python crawls novel websites

It feels like a long time since I last updated my blog. I have been preparing for the adult college entrance examination — as a secondary-school graduate I think the college certificate is still somewhat useful — so I spent some time studying. Unfortunately, every time I sit down to read I end up chatting with Mr. Zhou instead. If anyone can teach me how to build more interest in reading, I would be very grateful...

Enough gossip. Today's target is Xinbiquge; its search page is: http://www.xbiquge.la/modules/article/waps.php

Today I'm going to crawl some novels. Speaking of reading — at least I don't fall asleep while reading novels, haha.

Simply crawling novels is actually quite easy, but today let's make it a little harder for ourselves by using requests' POST to grab the novel.
requests' GET is straightforward, and so is POST; the small difference is that POST needs to carry data. Where does that data come from? From the website itself — for example, search for a novel I like: Jian Ke

Insert picture description here

The Form Data shown in the lower right corner is exactly what we need to pass to requests when making the request:

import requests
import parsel
import re

def get_url(headers, keyword):
    """Search the site for *keyword* (book title or author) via POST and let
    the user choose one of up to four results.

    Args:
        headers: HTTP headers to send (must include a User-Agent).
        keyword: Book title or author name to search for.

    Returns:
        The URL (str) of the chosen book's index page, or the result of a
        restarted search when nothing was found.
    """
    url = 'http://www.xbiquge.la/modules/article/waps.php'
    # The search form submits its query in the POST body as 'searchkey'.
    data = {
        'searchkey': keyword,
    }
    res = requests.post(url, data=data, headers=headers)
    res.encoding = 'utf-8'
    search = parsel.Selector(res.text)

    href = []
    # Skip the table header row; show at most four hits.
    for n, row in enumerate(search.xpath('//div[@id="content"]/form/table/tr')[1:5], start=1):
        href.append(row.xpath('./td/a/@href').get())    # book index URL
        title = row.xpath('./td/a/text()').get()        # book title
        author = row.xpath('./td[3]/text()').get()      # author
        print(str(n) + ": " + title, author)

    # No results: restart from the top and return that search's choice.
    # (The original fell through here and crashed indexing the empty list.)
    if not href:
        print(f"未找到{keyword},请重新输入!!")
        return main()

    while True:
        # Guard against non-numeric input instead of crashing on int().
        try:
            choice = int(input("请按序号选择你要下载的书籍:"))
        except ValueError:
            print("输入错误!请重新输入!")
            continue
        # Accept any index that actually exists (fewer than 4 hits is fine).
        if 1 <= choice <= len(href):
            return href[choice - 1]
        print("输入错误!请重新输入!")

def main():
    """Prompt for a search term and run the book search."""
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/85.0.4183.83 Safari/537.36')
    query = input("请输入您要查找的书籍/作者名称(最多显示四本):")
    get_url({'User-Agent': user_agent}, query)

# Run the interactive search only when executed as a script.
if __name__ == "__main__":
    main()

The results are as follows:
Insert picture description here
Insert picture description here
Insert picture description here

A few more features were added: you can search by book title or author, and up to four books are displayed, so try to type the book's full name. Of course, you can change the 4 in `if n == 4` to however many books you want to display — just remember to extend the selection branches below to match.

Now that we get these things, the latter is actually simple, just go to the code:

import requests
import parsel
import re

def get_url(headers, keyword):
    """Search the site for *keyword* (book title or author) via POST and let
    the user choose one of up to four results.

    Args:
        headers: HTTP headers to send (must include a User-Agent).
        keyword: Book title or author name to search for.

    Returns:
        The URL (str) of the chosen book's index page, or the result of a
        restarted search when nothing was found.
    """
    url = 'http://www.xbiquge.la/modules/article/waps.php'
    # The search form submits its query in the POST body as 'searchkey'.
    data = {
        'searchkey': keyword,
    }
    res = requests.post(url, data=data, headers=headers)
    res.encoding = 'utf-8'
    search = parsel.Selector(res.text)

    href = []
    # Skip the table header row; show at most four hits.
    for n, row in enumerate(search.xpath('//div[@id="content"]/form/table/tr')[1:5], start=1):
        href.append(row.xpath('./td/a/@href').get())    # book index URL
        title = row.xpath('./td/a/text()').get()        # book title
        author = row.xpath('./td[3]/text()').get()      # author
        print(str(n) + ": " + title, author)

    # No results: restart from the top and return that search's choice.
    # (The original fell through here and crashed indexing the empty list.)
    if not href:
        print(f"未找到{keyword},请重新输入!!")
        return main()

    while True:
        # Guard against non-numeric input instead of crashing on int().
        try:
            choice = int(input("请按序号选择你要下载的书籍:"))
        except ValueError:
            print("输入错误!请重新输入!")
            continue
        # Accept any index that actually exists (fewer than 4 hits is fine).
        if 1 <= choice <= len(href):
            return href[choice - 1]
        print("输入错误!请重新输入!")

def get_list(url, headers):
    """Download every chapter of the book at *url* and append them to
    ./images/<book name>.txt, one chapter title followed by its text.

    Args:
        url: Index page URL of the book.
        headers: HTTP headers to send with each request.
    """
    import os

    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    index = parsel.Selector(res.text)
    book_name = index.xpath('//div[@id="info"]/h1/text()').get()   # book title
    print("开始下载:", book_name)

    # Make sure the output directory exists (the original crashed with
    # FileNotFoundError when ./images was missing).
    os.makedirs("./images", exist_ok=True)
    out_path = "./images/" + book_name + ".txt"

    for dd in index.xpath('//div[@id="list"]/dl/dd'):
        chap_url = 'http://www.xbiquge.la' + dd.xpath('./a/@href').get()   # chapter page URL
        chap = dd.xpath('./a/text()').get()                                # chapter title
        print(chap)

        page = requests.get(chap_url, headers=headers)
        page.encoding = "utf-8"
        chap_sel = parsel.Selector(page.text)

        # Join the text nodes while replacing non-breaking spaces with
        # normal ones. (The original discarded re.sub's return value, so
        # the substitution never took effect.)
        parts = chap_sel.xpath('//div[@id="content"]/text()').getall()
        con_text = "".join(re.sub("\xa0", " ", part) for part in parts)

        with open(out_path, "a+", encoding="utf-8") as f:
            f.write(chap)
            f.write("\n")
            f.write(con_text)
            f.write('\n')
        
def main():
    """Prompt for a search term, pick a book, and download it."""
    keyword = input("请输入您要查找的书籍/作者名称(最多显示四本):")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    }
    # The original indented the next two lines with tabs while the rest of
    # the body used spaces, which raises TabError in Python 3.
    url = get_url(headers, keyword)
    get_list(url, headers)


# Run the interactive downloader only when executed as a script.
if __name__ == "__main__":
    main()

Insert picture description here

Insert picture description here

Get it done! Welcome everyone to communicate together, I'm going to read a book!

Amended 2020/10/25!!
I suddenly had an idea and worked it into the script. The code changed only a little — a progress bar was added, which makes it much nicer. The new code:

import requests
import parsel
import re
from tqdm import tqdm

def get_url(headers, keyword):
    """Search the site for *keyword* (book title or author) via POST and let
    the user choose one of up to four results, or quit with 0.

    Args:
        headers: HTTP headers to send (must include a User-Agent).
        keyword: Book title or author name to search for.

    Returns:
        The URL (str) of the chosen book's index page, or the result of a
        restarted search when nothing was found.

    Raises:
        SystemExit: when the user enters 0.
    """
    url = 'http://www.xbiquge.la/modules/article/waps.php'
    # The search form submits its query in the POST body as 'searchkey'.
    data = {
        'searchkey': keyword,
    }
    res = requests.post(url, data=data, headers=headers)
    res.encoding = 'utf-8'
    search = parsel.Selector(res.text)

    href = []
    # Skip the table header row; show at most four hits.
    for n, row in enumerate(search.xpath('//div[@id="content"]/form/table/tr')[1:5], start=1):
        href.append(row.xpath('./td/a/@href').get())    # book index URL
        title = row.xpath('./td/a/text()').get()        # book title
        author = row.xpath('./td[3]/text()').get()      # author
        print(str(n) + ": " + title, author)

    # No results: restart from the top and return that search's choice.
    # (The original fell through here and crashed indexing the empty list.)
    if not href:
        print(f"未找到{keyword},请重新输入!!")
        return main()

    while True:
        # Guard against non-numeric input instead of crashing on int().
        try:
            choice = int(input("请按序号选择你要下载的书籍( 按 0 退出 ):"))
        except ValueError:
            print("输入错误!请重新输入!")
            continue
        if choice == 0:
            # raise SystemExit works even without the site module's exit().
            raise SystemExit
        # Accept any index that actually exists (fewer than 4 hits is fine).
        if 1 <= choice <= len(href):
            return href[choice - 1]
        print("输入错误!请重新输入!")

def get_list(url, headers):
    """Download every chapter of the book at *url* and append them to
    ./images/<book name>.txt, showing a tqdm progress bar over chapters.

    Args:
        url: Index page URL of the book.
        headers: HTTP headers to send with each request.
    """
    import os

    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    index = parsel.Selector(res.text)
    book_name = index.xpath('//div[@id="info"]/h1/text()').get()   # book title
    books = "开始下载:" + book_name

    # Make sure the output directory exists (the original crashed with
    # FileNotFoundError when ./images was missing).
    os.makedirs("./images", exist_ok=True)
    out_path = "./images/" + book_name + ".txt"

    # The bar advances once per chapter, so label the unit accordingly
    # (the original's unit="MB" was misleading).
    for dd in tqdm(index.xpath('//div[@id="list"]/dl/dd'), desc=books, ncols=100, unit="chap"):
        chap_url = 'http://www.xbiquge.la' + dd.xpath('./a/@href').get()   # chapter page URL
        chap = dd.xpath('./a/text()').get()                                # chapter title

        page = requests.get(chap_url, headers=headers)
        page.encoding = "utf-8"
        chap_sel = parsel.Selector(page.text)

        # Join the text nodes while replacing non-breaking spaces with
        # normal ones. (The original discarded re.sub's return value, so
        # the substitution never took effect.)
        parts = chap_sel.xpath('//div[@id="content"]/text()').getall()
        con_text = "".join(re.sub("\xa0", " ", part) for part in parts)

        with open(out_path, "a+", encoding="utf-8") as f:
            f.write(chap)
            f.write("\n")
            f.write(con_text)
            f.write('\n')
        
def main():
    """Entry point: search for a book, then download the chosen one."""
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/85.0.4183.83 Safari/537.36')
    search_term = input("请输入您要查找的书籍/作者名称(最多显示四本):")
    request_headers = {'User-Agent': user_agent}
    book_url = get_url(request_headers, search_term)
    get_list(book_url, request_headers)


# Run the interactive downloader only when executed as a script.
if __name__ == "__main__":
    main()

Look at the terminal display:

Insert picture description here
It shows the name of the book being downloaded, the download progress, the number of chapters, and the approximate time remaining — basically everything. This is the final version; no more changes! ^_^

Guess you like

Origin blog.csdn.net/weixin_51211600/article/details/109139257