Table of contents
Code implementation of the main function
Get the source code of the web page
Get links to individual articles
Get the source code of the article
Extract links to individual articles of an article directory
Previous Chapter Blog
foreword
In the previous chapter of the blog, we talked about how to make a graphical interface through PyQt5 and make some basic settings
In the next two chapters, we mainly talk about the implementation of the core crawler code
Code implementation of the main function
Code from the previous chapter
self.Button_run.clicked.connect(self.F_run)
This means that clicking the button executes the F_run function (note that F_run is written without parentheses here: the function itself is passed to connect, it is not called)
Then we need to define this function
The idea is probably like this
def F_run(self):
    """Run the crawl for the link typed into the input box.

    Reads the link from ``self.line_link``, fetches the title and the
    catalogue page source, extracts the chapter links and hands them to
    ``pachong``. Progress messages are written to ``self.text_result``.
    """
    link_1 = self.line_link.text()
    title_1 = F_gettitle(link_1)
    # setText replaces the whole box, so it is used only for the first message.
    self.text_result.setText(f"标题获取成功——{title_1}")
    # file_1=open(f'{title_1}.txt',mode='w',encoding='utf-8 ')
    test_1 = F_getyuan(link_1)
    # append (not setText) so the title message above stays visible.
    self.text_result.append("提取源代码成功")
    time.sleep(1)
    search_1 = F_searchlink(test_1)
    self.text_result.append("提取文章链接成功")
    pachong(search_1, title_1)
Line-by-line code analysis
get link
first pass
self.line_link.text()
command to get the link entered in the input box
And assign it to link_1
get title
At the same time, I crawl the source code of the page the link points to and extract the title of the article from it,
that is, the name of the novel
title_1=F_gettitle(link_1)
Get the source code of the web page
Crawl the source code of the novel article catalog page and assign it as test_1 (for subsequent extraction of links to each article)
test_1=F_getyuan(link_1)
Get links to individual articles
search_1=F_searchlink(test_1)
Extract and filter the obtained source code to obtain the links of each article
Among them, self.text_result.setText and self.text_result.append are displayed in the red circle below
(These messages are purely cosmetic; you can leave them out)
function code
In order not to make the code too long here, I have created two python files separately to store python functions
import library file
import requests
import re
import numpy as np
from lxml import etree
requests is used for network requests
re and lxml are used to filter source code information
while numpy is used to store the elements
Get the title of the article
def F_gettitle(link_0, timeout=10):
    """Fetch the page at ``link_0`` and return the novel's title.

    Args:
        link_0: URL of the novel's catalogue page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The first text node of the ``<h1>`` element selected by the XPath.

    Raises:
        IndexError: If the XPath matches nothing on the downloaded page.
    """
    head_qb = {
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    response = requests.get(url=link_0, headers=head_qb, timeout=timeout)
    dom = etree.HTML(response.text)
    titles = dom.xpath('/html/body/article[1]/div[2]/div[2]/h1/text()')
    if not titles:
        # Same exception type as indexing an empty list, but the message
        # now tells the caller which page failed and why.
        raise IndexError(f'no title found at {link_0!r}; the page layout may have changed')
    return titles[0]
This function has a very simple structure:
Get the source code by requests
Then use etree in lxml to filter the source code
(When using an XPath expression, add text() at the end to get the text itself; otherwise the query returns element objects instead of their text)
The XPath can be copied from the browser's developer tools (press F12)
Get the source code of the article
It should be easy to understand, just write the code directly
def F_getyuan(link_1, timeout=10):
    """Download the page at ``link_1`` and return its source as text.

    Args:
        link_1: URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The response body decoded to ``str`` by requests.
    """
    head_qb = {
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    response = requests.get(url=link_1, headers=head_qb, timeout=timeout)
    # Response.text is already a str, so no extra str() conversion is needed.
    return response.text
Extract links to individual articles of an article directory
def F_searchlink(link_2, base_url='http://www.biquge66.net'):
    """Extract the absolute chapter URLs from a catalogue page's source.

    Args:
        link_2: HTML source of the catalogue page (page text, not a URL,
            despite the parameter name).
        base_url: Prefix joined in front of every relative ``href`` found.

    Returns:
        A 1-D numpy array of URL strings in page order; an empty array
        when the source contains no chapter link.
    """
    chapter_re = re.compile(r'<a id="haitung" href="(.*?)" rel="chapter">')
    # Collect into a list and convert once: np.append copies the whole
    # array on every call, which is quadratic for long catalogues.
    links = [f'{base_url}{href}' for href in chapter_re.findall(link_2)]
    return np.array(links)
Here I directly use a regular expression from the re library to match the chapter links
Note that the matched link is not a full link (it is only a relative path),
so it needs to be joined onto the site's base address
After the joining is completed, the link can be opened directly
Here I store it in an array for convenience and then crawl the source code of each article
and then return
total code
main.py
import sys
# The basic widgets used in PyQt5 all live in the PyQt5.QtWidgets module
from PyQt5.QtWidgets import QApplication, QMainWindow
# Import the UI module generated by the Designer tool
from win import Ui_MainWindow
from test_1 import *
import time
class MyMainForm(QMainWindow, Ui_MainWindow):
    """Main window that connects the Designer-generated widgets to the crawler."""

    def __init__(self, parent=None):
        super().__init__(parent)
        self.setupUi(self)
        self.Button_close.clicked.connect(self.close)
        # Pass the bound method itself (no parentheses) so Qt calls it on click.
        self.Button_run.clicked.connect(self.F_run)

    def _show_status(self, message, *, replace=False):
        """Write a progress message to the result box and repaint it."""
        if replace:
            self.text_result.setText(message)
        else:
            self.text_result.append(message)
        # F_run blocks the event loop while it works; processing pending
        # events here lets the message appear before F_run returns.
        QApplication.processEvents()

    def F_run(self):
        """Crawl the novel whose catalogue link is typed into the input box.

        Fetches the title and catalogue page, extracts the chapter links and
        hands them to ``pachong``. Each step is reported in the result box.
        Any failure is shown there instead of escaping the Qt slot.
        """
        link_1 = self.line_link.text().strip()
        if not link_1:
            self._show_status("请先输入链接", replace=True)
            return
        try:
            title_1 = F_gettitle(link_1)
            self._show_status(f"标题获取成功——{title_1}", replace=True)
            test_1 = F_getyuan(link_1)
            self._show_status("提取源代码成功")
            # Pause kept from the original; presumably to be gentle on the
            # site before crawling chapters. TODO confirm it is still needed.
            time.sleep(1)
            search_1 = F_searchlink(test_1)
            self._show_status("提取文章链接成功")
            pachong(search_1, title_1)
        except Exception as exc:
            # Slot boundary: report the problem to the user rather than
            # letting the exception propagate out of the click handler.
            self._show_status(f"运行失败——{exc}")
if __name__ == "__main__":
    # Boilerplate: every PyQt5 program needs a QApplication object.
    # sys.argv is the command-line argument list; passing it lets the
    # program also be started by double-clicking.
    app = QApplication(sys.argv)
    # Create the main window.
    main_window = MyMainForm()
    # Show the window and its widgets on screen.
    main_window.show()
    # Run the event loop; sys.exit makes sure the program exits cleanly.
    exit_code = app.exec_()
    sys.exit(exit_code)
test_1.py
import requests
import re
import numpy as np
from lxml import etree
# Get the article title
def F_gettitle(link_0, timeout=10):
    """Fetch the page at ``link_0`` and return the novel's title.

    Args:
        link_0: URL of the novel's catalogue page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The first text node of the ``<h1>`` element selected by the XPath.

    Raises:
        IndexError: If the XPath matches nothing on the downloaded page.
    """
    head_qb = {
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    response = requests.get(url=link_0, headers=head_qb, timeout=timeout)
    dom = etree.HTML(response.text)
    titles = dom.xpath('/html/body/article[1]/div[2]/div[2]/h1/text()')
    if not titles:
        # Same exception type as indexing an empty list, but the message
        # now tells the caller which page failed and why.
        raise IndexError(f'no title found at {link_0!r}; the page layout may have changed')
    return titles[0]
# Fetch the page source
def F_getyuan(link_1, timeout=10):
    """Download the page at ``link_1`` and return its source as text.

    Args:
        link_1: URL of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The response body decoded to ``str`` by requests.
    """
    head_qb = {
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
    }
    response = requests.get(url=link_1, headers=head_qb, timeout=timeout)
    # Response.text is already a str, so no extra str() conversion is needed.
    return response.text
# Find the links of all chapters of the novel
def F_searchlink(link_2, base_url='http://www.biquge66.net'):
    """Extract the absolute chapter URLs from a catalogue page's source.

    Args:
        link_2: HTML source of the catalogue page (page text, not a URL,
            despite the parameter name).
        base_url: Prefix joined in front of every relative ``href`` found.

    Returns:
        A 1-D numpy array of URL strings in page order; an empty array
        when the source contains no chapter link.
    """
    chapter_re = re.compile(r'<a id="haitung" href="(.*?)" rel="chapter">')
    # Collect into a list and convert once: np.append copies the whole
    # array on every call, which is quadratic for long catalogues.
    links = [f'{base_url}{href}' for href in chapter_re.findall(link_2)]
    return np.array(links)
# # Output the article content
# def F_edittxt(link_3):
# head_qb={
# 'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36'
# }
# trytimes = 3
# for i in range(trytimes):
# try:
# proxies = None
# test_1=requests.get(url=link_3,headers=head_qb, verify=False, proxies=None, timeout=3)
# if test_1.status_code == 200:
# break
# except:
# print(f'requests failed {i} time')
# # Extract the article paragraphs
# re_2='<p>(.*?)</p>'
# re_2=re.compile(re_2)
# # Extract the article title
# re_3='<h1 class="bookname">(.*?)</h1>'
# re.compile(re_3)
# test_2=np.array([])
# test_3=np.array([])
# test_2=re.findall(re_2,test_1.text)
# test_3 = re.findall(re_3, test_1.text)
# # Combine into one array: the title first, then the paragraphs
# test_2=np.append(test_3,test_2)
# return test_2
next chapter content
Finally, all the chapter links are obtained, and the next step is to crawl the article
It could have been written together (you can see the commented out part in my test_1.py), but later found some problems
So that part is left to the next chapter,
where it will be explained in detail