Website: http://dict.cnki.net/dict_result.aspx
Need to use the library:
import requests
from lxml import etree
import time
import re
from requests.adapters import HTTPAdapter
- The first step is to fetch the page's source code. Inspecting the page shows that the data is embedded directly in the HTML rather than loaded asynchronously as JSON.
By viewing the Preview tab you can see that the page content is returned by dict_result.aspx, so we construct our request headers from that request's headers:
def crawl(url):
    """Fetch *url* and return its decoded HTML text.

    Uses a session that retries each connection up to 3 times (HTTP and
    HTTPS). On any request error the exception is printed and "" is
    returned, so callers can simply test for an empty string.
    """
    header = {
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        'accept-encoding': "gzip,deflate",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
        'cache-control': "no-cache",
        'connection': "keep-alive",
        'cookie': "Ecp_ClientId=1190617212402883927; cnkiUserKey=41b7ffaf-85e4-417d-376e-9d896e89b5fc; OUTFOX_SEARCH_USER_ID_NCOO=202854832.3388026; UM_distinctid=17096417d21285-090cc6aba1c552-6701b35-144000-17096417d223dc; ASP.NET_SessionId=etpdn15sj14efvr2ymak15hg; SID=203006; amid=fc0a8fe8-c8fe-42d5-a63f-9d0e7f494dc8; hw=wordList=%u52a8%u6001%u89c4%u5212%5e%u8ba1%u7b97%u673a%5e%u8ba1%u7b97%5e%u4e8c%u53c9%u6811%5e%u7a7f%u7ebf%u4e8c%u53c9%u6811%5e%u6811%5e%u4e8c%u5206%u7b97%u6cd5; CNZZDATA3209959=cnzz_eid%3D231822529-1583409734-null%26ntime%3D1586737892",
        'host': "dict.cnki.net",
        'upgrade-insecure-requests': "1",
        # 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        'user-agent': "Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
    }
    try:
        s = requests.Session()
        # Retry each connection up to 3 times before giving up.
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        response = s.get(headers=header, url=url, timeout=5)
        # Decode using the encoding detected from the body (the server's
        # declared charset is unreliable), and read the body BEFORE
        # closing the response.
        response.encoding = response.apparent_encoding
        text = response.text
        response.close()
        return text
    except Exception as e:
        print(e)
        return ""
The header fields above are filled in from what the browser actually sends for this page.
- The second step is to parse the page source. Taking the keyword
动态规划
(dynamic programming) as an example: to get all the bilingual example sentences, we first need to know how many pages of data there are.
Click to check the location of the page information, and you can see:
My approach here is to match this part of the data with the regular expression first, and then analyze it further. Look at the code first:
# Get the number of result pages; minimum is 1.
def page_num(html):
    """Extract the total page count from a search-result page.

    Returns 1 when *html* is None or contains no pager row; otherwise
    returns the largest page number linked in the "更多双语句对" pager.
    """
    if html is None:
        return 1
    # Grab the pager fragment (the "more bilingual sentence pairs" row).
    pattern1 = re.compile(r'更多双语句对:(.*?)<\/a>[ ]*<\/tr>',re.I)
    matches = re.findall(pattern1, str(html))
    if not matches:
        return 1
    page_info = matches[0]
    # \d+ (not \d): page counts can exceed one digit. The "next page" /
    # "last page" links have non-numeric text and are never matched, so
    # the last numeric anchor is the total page count.
    pattern2 = re.compile(r'<a.*?>(\d+)<\/a>',re.I)
    max_page = int(re.findall(pattern2, str(page_info))[-1])
    return max_page
The code above is written this way to handle two different situations. When there is only one page of data, the page has no pager row, so pattern1 finds no match and the function returns 1.
pattern2, <a.*?>(\d)<\/a>,
only matches anchors whose text is a number, so when there are multiple pages the last numeric match is the total page count.
The "next page" and "last page" links are not matched because their text is not a number.
- The next step is to analyze the structure of the bilingual example sentences, analyze and extract them, here the analysis method is used
xpath
:
First, let's look at the xpath expression where the example sentences are:
Click to check where the example sentence is, and then right-click to copy the xpath expression in elements:
//*[@id="showjd_0"]/tbody/tr[1]/td/text()
//*[@id="showjd_1"]/tbody/tr[1]/td/text()
Through comparison, it can be found that the example sentences corresponding to each phrase correspond to different ids, and they are increased one by one. In this case, when we parse the page, we must first know how many phrases there are on the page, so we parse the phrases that exist on the page first, and in the same way, click on the phrase to view the xpath expression:
//*[@id="lblresult"]/table[1]/tbody/tr[2]/td/table/tbody/tr[1]/td/table[1]/tbody/tr/td/font/b/a
//*[@id="lblresult"]/table[1]/tbody/tr[2]/td/table/tbody/tr[1]/td/table[3]/tbody/tr/td/font/b/a
You can see that the interval between each phrase is table+2.
In this way we can construct our xpath expression is: //*[@id="lblresult"]/table[1]/tr[2]/td/table/tr[1]/td/table['+str(i)+']/tr/td/font/b/a
.
# //*[@id="lblresult"]/table[1]/tr[2]/td/table/tbody/tr[1]/td/table[15]/tr/td/font/b/a
# Get the phrases present on the page; returns a list of phrases.
def parse_phrase(html):
    """Parse *html* and return the phrase link texts on the page.

    Phrase anchors live in every other sibling <table> (1-based odd
    indices 1, 3, 5, ...). Returns [] for None input; on a parse error
    the exception is printed and the phrases collected so far are
    returned.
    """
    if html is None:
        return []
    phrase_list = []
    try:
        tree = etree.HTML(html)
        # Count the sibling tables first so we know the loop bound.
        num = int(tree.xpath('count(//*[@id="lblresult"]/table[1]/tr[2]/td/table/tr[1]/td/table)'))
        # Phrases occupy the odd table indices. The bound must be
        # num + 1 (xpath indices are 1-based and inclusive): with
        # range(1, num, 2) the last phrase table is silently dropped
        # whenever the table count is odd (e.g. table[15] above).
        for i in range(1, num + 1, 2):
            phrase_xpath = '//*[@id="lblresult"]/table[1]/tr[2]/td/table/tr[1]/td/table[' + str(i) + ']/tr/td/font/b/a'
            phrase = tree.xpath(phrase_xpath + '//text()')[0].strip()
            phrase_list.append(phrase)
    except Exception as e:
        print(e)
    return phrase_list
With the phrase list, we can know the range of id in the example sentence.
# //*[@id="showjd_0"]/tbody/tr[1]/td
# start is the first "showjd" id, end is the last (inclusive).
def parse_original_sentence(html, start, end):
    """Collect bilingual example sentences from *html*.

    Reads the elements with ids "showjd_<start>" .. "showjd_<end>"
    (inclusive). Returns (english_text, chinese_text), each a
    newline-terminated block of sentences; ("", "") for None input.
    """
    if html is None:
        return "", ""
    tree = etree.HTML(html)
    en_parts = []
    ch_parts = []
    for i in range(start, end + 1):
        node_id = '//*[@id="showjd_' + str(i) + "\"]"
        num = int(tree.xpath('count(' + node_id + '/tr)'))
        # Rows repeat as triples: English row, Chinese row, separator.
        for index in range(1, num, 3):
            en_list = tree.xpath(node_id + "/tr[" + str(index) + "]/td//text()")
            en = " ".join(fragment.strip() for fragment in en_list)
            en_parts.append(en.lower().strip())
            en_parts.append('\n')
            ch_list = tree.xpath(node_id + "/tr[" + str(index + 1) + "]/td//text()")
            ch = "".join(fragment.strip() for fragment in ch_list)
            ch_parts.append(ch.strip())
            ch_parts.append('\n')
    # Join once instead of repeated string concatenation.
    return "".join(en_parts), "".join(ch_parts)
The same parsing approach is used to crawl the additional, partially-matching sentence pairs from the "more sentences" page:
# http://dict.cnki.net/dict_more_sen.aspx?searchword=%E7%A9%BA%E9%97%B4&unvsm=1&t=&s=0&c=506&z=I138&page=2
# //*[@id="lblresult"]/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[2]/td
def crawl_and_parse_more_html(data, style):
    """Crawl up to 5 "more sentence pairs" pages for keyword *data* in
    category *style* and return (english_text, chinese_text).

    Rows repeat as (English, Chinese, separator) triples starting at
    tr[2]. A page with exactly 4 rows carries no sentence pairs, which
    ends the crawl early. Errors on one page are printed and skipped.
    """
    prefix_url = "http://dict.cnki.net/dict_more_sen.aspx?searchword=" + data + "&unvsm=1&t=&s=0&c=100&z=" + style + "&page="
    en_parts = []
    ch_parts = []
    # Hoist the common xpath prefix out of the loop.
    row_xpath = '//*[@id="lblresult"]/table/tr/td/table/tr[3]/td/table/tr'
    for page in range(1, 6):
        try:
            html = crawl(prefix_url + str(page))
            time.sleep(0.1)  # throttle requests
            tree = etree.HTML(html)
            num = int(tree.xpath('count(' + row_xpath + ')'))
            if num == 4:
                # Only header/footer rows present: no more data.
                break
            for index in range(2, num, 3):
                en_list = tree.xpath(row_xpath + '[' + str(index) + ']/td//text()')
                en = " ".join(fragment.strip() for fragment in en_list)
                en_parts.append(en.lower().strip())
                en_parts.append('\n')
                ch_list = tree.xpath(row_xpath + '[' + str(index + 1) + ']/td//text()')
                ch = "".join(fragment.strip() for fragment in ch_list)
                ch_parts.append(ch.strip())
                ch_parts.append('\n')
        except Exception as e:
            print(e)
    # Join once instead of repeated string concatenation.
    return "".join(en_parts), "".join(ch_parts)
Now let's break down the URL:
the style parameter is the query category, page is the current page number, and searchword is the keyword to search for.
Here the category code I138
stands for computer software and computer applications, A002
for mathematics, and I139
for internet technology. Knowing these codes, we can construct the URLs to crawl.
Main function:
# http://dict.cnki.net/dict_result.aspx?searchword=%e7%ae%97%e6%b3%95&tjType=sentence&style=I138&page=2
if __name__ == "__main__":
    # Text file holding the search keywords, one per line.
    data_file = r"E:/大四下/数据/例句/知网例句/关键词2.txt"
    # Destination file for the English example sentences.
    en_save_file = r"E:/大四下/数据/例句/知网例句/cnki_liju.en"
    # Destination file for the Chinese example sentences.
    ch_save_file = r"E:/大四下/数据/例句/知网例句/cnki_liju.ch"
    # Query categories: A002 mathematics, I139 internet technology,
    # I138 computer software and computer applications.
    style = ['A002', 'I139', 'I138']
    # Open the outputs in append mode so repeated runs accumulate
    # results; context managers guarantee the files are closed.
    # (Fixes the original's crash at shutdown: it called
    # phrase_fw.close() although phrase_fw was commented out.)
    with open(en_save_file, 'a+', encoding='utf8') as en_fw, \
         open(ch_save_file, 'a+', encoding='utf8') as ch_fw, \
         open(data_file, 'r', encoding='utf8') as fr:
        for line in fr:
            # Skip blank lines between keywords.
            if line == '\n':
                continue
            data = line.strip()
            # Common URL prefix for this keyword.
            prefix_url = "http://dict.cnki.net/dict_result.aspx?searchword=" + data
            # Crawl each category in turn.
            for s in style:
                print("data:{},s:{}".format(data, s))
                prefix_url_tmp = prefix_url + "&tjType=sentence&style=" + s + "&page="
                # First request only determines the page count.
                url_for_get_page_size = "http://dict.cnki.net/dict_result.aspx?searchword=" + data + "&style=" + s + "&tjType=sentence"
                html_for_get_page_size = crawl(url_for_get_page_size)
                if html_for_get_page_size == "":
                    continue
                time.sleep(0.1)  # throttle requests
                page = page_num(html_for_get_page_size)
                # start/end track the running range of "showjd" sentence
                # ids across pages of this category.
                start = 0
                end = 0
                for i in range(1, page + 1):
                    original_html = crawl(prefix_url_tmp + str(i))
                    if original_html == "":
                        continue
                    time.sleep(0.1)
                    # The phrase list tells us how many sentence blocks
                    # ("showjd" ids) this page contributes.
                    phrase_list = parse_phrase(original_html)
                    end = end + len(phrase_list)
                    print("start:{} and end:{}".format(start, end))
                    # NOTE(review): parse_original_sentence reads ids
                    # start..end INCLUSIVE, i.e. len(phrase_list)+1 ids
                    # per page — confirm whether end-1 was intended.
                    e_string, c_string = parse_original_sentence(original_html, start, end)
                    start = end
                    en_fw.write(e_string)
                    ch_fw.write(c_string)
                # Also harvest the "more sentence pairs" pages.
                e_string, c_string = crawl_and_parse_more_html(data, s)
                en_fw.write(e_string)
                ch_fw.write(c_string)
                en_fw.flush()
                ch_fw.flush()
    print('ok')