Table of contents
Contents of the previous chapter
The inconvenience of crawling novels by the requests package
Use the aiohttp package to crawl novels asynchronously
General explanation of the code
Contents of the previous chapter
foreword
The content of this chapter is about how to crawl novels when links to novel articles are given
some problems
The inconvenience of crawling novels by the requests package
At the beginning, including the blog I wrote some time ago, I used the requests package to crawl
But this time there is a problem
Simply put, the requests library executes network requests sequentially (synchronously)
The next step must wait until the previous network request returns
Suppose the novel I want to crawl has 2000 chapters
It takes 1 second for each request to be returned and the information to be processed
Then it will take a total of 2000 seconds, which is more than half an hour.
If there is another return timeout error in the middle
which is extremely frustrating
For timeout errors we can set a maximum waiting time with the timeout parameter
But it is still the delay of network requests that takes up most of the time
So is there any way to solve it?
Use the aiohttp package to crawl novels asynchronously
introduce
Asynchrony is a concurrency model that is much more efficient than multithreading. It is out of order. In order to complete a certain task, in the process of execution, there is no need for communication and coordination between different program units, and the task can also be completed. That is to say, unrelated program units can be asynchronous.
To put it simply, it is like the classic elementary-school scheduling problem: you can wash the vegetables while the water is boiling, and chop the next dish while the current one is cooking.
In the python program, you can continue to send other data packets while waiting for the network reply packet
This maximizes the utilization of resources.
the code
The specific code is here
Here is only a preliminary introduction, and the use of specific packages will not be elaborated
main.py
import sys
# PyQt5中使用的基本控件都在PyQt5.QtWidgets模块中
from PyQt5.QtWidgets import QApplication, QMainWindow
# 导入designer工具生成的login模块
from win import Ui_MainWindow
from test_1 import *
from test_3 import *
import time
class MyMainForm(QMainWindow, Ui_MainWindow):
    """Main window: takes a novel's index URL from the UI and runs the crawler.

    Combines the designer-generated ``Ui_MainWindow`` layout with the
    QMainWindow behavior; button clicks drive the scraping pipeline.
    """

    def __init__(self, parent=None):
        super(MyMainForm, self).__init__(parent)
        self.setupUi(self)
        # Wire up the two buttons: close the window / start the crawl.
        self.Button_close.clicked.connect(self.close)
        self.Button_run.clicked.connect(self.F_run)

    def F_run(self):
        """Read the URL from the input line, resolve the chapter list, crawl."""
        link_1 = self.line_link.text()
        title_1 = F_gettitle(link_1)
        self.text_result.setText(f"标题获取成功——{title_1}")
        test_1 = F_getyuan(link_1)
        self.text_result.append("提取源代码成功")
        # Removed a pointless time.sleep(1) here: it blocked the Qt event
        # loop (freezing the UI) and added nothing to the scraping logic.
        search_1 = F_searchlink(test_1)
        self.text_result.append("提取文章链接成功")
        # NOTE(review): pachong() runs the whole crawl synchronously on the
        # GUI thread, so the window is unresponsive until it finishes —
        # consider moving it to a QThread.
        pachong(search_1, title_1)
if __name__ == "__main__":
    # Every PyQt5 program needs exactly one QApplication; passing sys.argv
    # keeps command-line launching and double-click launching both working.
    app = QApplication(sys.argv)
    window = MyMainForm()
    # Put the widget on screen, then start the event loop; sys.exit makes
    # sure the interpreter exits with the loop's return code.
    window.show()
    sys.exit(app.exec_())
test_1.py
import requests
import re
import numpy as np
from lxml import etree
def F_gettitle(link_0):
    """Fetch the novel's index page at *link_0* and return its title text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    response = requests.get(url=link_0, headers=headers)
    dom = etree.HTML(response.text)
    # The title sits in the article header <h1> at this fixed path on the
    # target site; xpath returns a list of text nodes, first one is the title.
    title_nodes = dom.xpath('/html/body/article[1]/div[2]/div[2]/h1/text()')
    return title_nodes[0]
def F_getyuan(link_1):
    """Return the raw HTML source of the page at *link_1* as a string.

    Fix: the original wrapped ``Response.text`` in ``str()`` — but
    ``requests.Response.text`` is already a decoded ``str``, so the
    conversion was a no-op and has been removed.
    """
    head_qb = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    return requests.get(url=link_1, headers=head_qb).text
def F_searchlink(link_2):
    """Extract every chapter URL from the index-page source *link_2*.

    Parameters
    ----------
    link_2 : str
        HTML source of the novel's index page.

    Returns
    -------
    numpy.ndarray
        Absolute chapter URLs in page order (empty array when none match).

    Fix: the original grew a numpy array with ``np.append`` inside the
    loop, which copies the whole array on every iteration (O(n^2)).
    Building a Python list and converting once is O(n).
    """
    # Chapter anchors on this site look like:
    #   <a id="haitung" href="/..." rel="chapter">
    re_1 = re.compile('<a id="haitung" href="(.*?)" rel="chapter">')
    links = [f'http://www.biquge66.net{href}' for href in re_1.findall(link_2)]
    return np.array(links)
# #输出文章内容
# def F_edittxt(link_3):
# head_qb={
# 'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
# }
# trytimes = 3
# for i in range(trytimes):
# try:
# proxies = None
# test_1=requests.get(url=link_3,headers=head_qb, verify=False, proxies=None, timeout=3)
# if test_1.status_code == 200:
# break
# except:
# print(f'requests failed {i} time')
# #提取文章链接
# re_2='<p>(.*?)</p>'
# re_2=re.compile(re_2)
# #提取文章标题
# re_3='<h1 class="bookname">(.*?)</h1>'
# re.compile(re_3)
# test_2=np.array([])
# test_3=np.array([])
# test_2=re.findall(re_2,test_1.text)
# test_3 = re.findall(re_3, test_1.text)
# #放在数组的最后一个
# test_2=np.append(test_3,test_2)
# return test_2
test_3.py
import asyncio
import aiohttp
import re
import numpy as np
title=''
async def F_2(session, url):
    """Fetch one chapter page and append its title + paragraphs to the txt file.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Shared HTTP session for all chapter requests.
    url : str
        URL of a single chapter page.

    Fixes vs. original: the ``re.compile(re_3)`` result was discarded (dead
    call — findall was given the raw pattern string), the output file was
    re-opened once per paragraph instead of once per chapter, and the
    ``resqonse`` typo is corrected. Output bytes are unchanged: title first,
    then paragraphs, appended with no separators, exactly as before.
    """
    head_qb = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    # NOTE(review): verify_ssl=False disables certificate checking — tolerable
    # for this scraping target, but do not copy into security-sensitive code.
    async with session.get(url, verify_ssl=False, headers=head_qb) as response:
        text = await response.text()
        # Chapter body paragraphs and chapter title, both compiled once.
        re_body = re.compile('<p>(.*?)</p>')
        re_title = re.compile('<h1 class="bookname">(.*?)</h1>')
        paragraphs = re_body.findall(text)
        titles = re_title.findall(text)
        # Title first, then paragraphs — mirrors the original np.append order.
        pieces = titles + paragraphs
        # `title` is the module-level novel name set by pachong().
        # Open the file once per chapter instead of once per paragraph.
        with open(f'{title}.txt', mode='a', encoding='utf-8') as file:
            for piece in pieces:
                file.writelines(piece)
async def F_1(urls):
    """Download every chapter concurrently over one shared HTTP session."""
    async with aiohttp.ClientSession() as session:
        # One task per chapter; asyncio.wait blocks until all have finished.
        tasks = []
        for chapter_url in urls:
            tasks.append(asyncio.create_task(F_2(session, chapter_url)))
        await asyncio.wait(tasks)
def pachong(urls_1, title_1):
    """Crawl all chapter URLs in *urls_1* into the file '<title_1>.txt'.

    Parameters
    ----------
    urls_1 : iterable of str
        Chapter URLs; chapters are fetched concurrently, so the output
        file's chapter order is not guaranteed.
    title_1 : str
        Novel name, used (via the module-level ``title``) as the txt
        file name inside F_2.

    Fix: the original repeated ``title = title_1`` after ``asyncio.run``
    returned — by then every F_2 task had already finished, so the
    assignment was dead code and has been removed.
    """
    global title
    title = title_1  # F_2 reads this when opening the output file
    asyncio.run(F_1(urls_1))
General explanation of the code
The two parameters of pachong() passed in the main function, one is the total list of article links, and the other is the name of the novel (the name used to create the txt file)
Continue sending requests while waiting for a reply from the network
Then use the re library to extract the article text in the source code and write it into txt
Notice
The novel chapters crawled by this method are out of order, but they can be sorted by chapter name (it’s so difficult to write, I don’t want to write it anymore)
Series Summary
This article has a preliminary understanding of the production of the GUI graphical interface, and learned about another method of crawler crawling - asynchronous crawler