声明:本文只作学习研究,禁止用于非法用途,否则后果自负,如有侵权,请告知删除,谢谢!
知网专利数据获取
项目场景:
网址:https://epub.cnki.net/kns/brief/result.aspx?dbprefix=SCPD
这次给大家带来的是知网专利信息的获取,没有复杂的反爬措施,只要先用session获取查询时的cookie,然后带着cookie访问列表页即可。
项目代码:
这里要讲解一下:列表页每页最多50条数据,最多只能翻到120页(即一次检索最多拿到6000条),所以要想获取全部的数据,就要对专利类型进行筛选(尽量细分),这样按日期和专利类型组合搜索,大部分数据都能获取到。
测试代码如下,可自行修改
import requests,re
from lxml import etree
def get_cookie(navi_code='A', date_from='2020-01-24', date_to='2020-01-24', timeout=30):  # obtain the cookies used for later requests
    """Submit a search to CNKI and return the session that carries its cookies.

    The search itself is only issued for its side effect: the returned
    ``requests`` session is what the list-page request has to reuse.

    Args:
        navi_code: Value sent as ``NaviCode`` (the patent category filter).
            Several codes may be given as one comma-separated string.
        date_from: Value sent as ``date_gkr_from`` (start of the date filter),
            formatted like ``'2020-01-24'``.
        date_to: Value sent as ``date_gkr_to`` (end of the date filter).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` session on which the search request was made.

    Raises:
        requests.HTTPError: If the search request returns an error status.
        requests.Timeout: If the server does not answer within ``timeout``.

    Note:
        Reads the module-level ``headers`` dict for the request headers.
    """
    params = (
        ('action', ''),
        ('NaviCode', navi_code),  # category filter
        ('ua', '1.21'),
        ('PageName', 'ASP.brief_result_aspx'),
        ('DbPrefix', 'SCPD'),
        ('DbCatalog', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('ConfigFile', 'SCPD.xml'),
        ('db_opt', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('db_value', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('date_gkr_from', date_from),  # date filter, lower bound
        ('date_gkr_to', date_to),  # date filter, upper bound
        ('his', '0'),
        # NOTE(review): looks like a client-side timestamp captured from a
        # browser request; it is sent unchanged on every call.
        ('__', 'Fri Oct 16 2020 14:37:38 GMT+0800 (\u4E2D\u56FD\u6807\u51C6\u65F6\u95F4)'),
    )
    search_session = requests.session()
    response = search_session.get(
        'https://epub.cnki.net/kns/request/SearchHandler.ashx',
        headers=headers,
        params=params,
        timeout=timeout,  # without a timeout a stalled server blocks forever
    )
    # Fail here, with a clear error, instead of later while parsing a list
    # page that was fetched without a valid search behind it.
    response.raise_for_status()
    return search_session
def get_list_info(page=1, timeout=30):  # fetch one list page
    """Fetch one page of search results and return the detail-link suffixes.

    Args:
        page: 1-based number of the result page to request (sent as
            ``curpage``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of strings: for every result link on the page, the part of the
        ``href`` that follows ``/kns/detail/detail.aspx``. The list is empty
        when the page contains no result links.

    Note:
        Reads the module-level ``session`` (which must already hold the
        cookies of a search) and ``headers``. Prints a one-line progress
        message.
    """
    params = (
        ('curpage', str(page)),  # current page number
        ('RecordsPerPage', '50'),
        ('QueryID', '20'),
        ('ID', ''),
        ('turnpage', '1'),
        ('tpagemode', 'L'),
        ('dbPrefix', 'SCPD'),
        ('Fields', ''),
        ('DisplayMode', 'listmode'),
        ('SortType', "(公开日, 'DATE')desc"),
        ('PageName', 'ASP.brief_result_aspx'),
    )
    response = session.get(
        'https://epub.cnki.net/kns/brief/brief.aspx',
        headers=headers,
        params=params,
        timeout=timeout,
    )
    html = response.text
    detail_links = re.findall(r"<a class='fz14' href='/kns/detail/detail.aspx(.*?)'", html)

    # The pager text is only used for the progress message. A search with no
    # hits has no pager, so fall back to the requested page number instead of
    # failing with IndexError.
    pager_nodes = etree.HTML(html).xpath(
        '//*[@id="J_ORDER"]/tr[2]/td/table/tr/td[2]/div/span[1]')
    pager_text = (pager_nodes[0].text or '') if pager_nodes else ''
    shown_pages = re.findall('浏览(.*?)/', pager_text)
    now_page = int(shown_pages[0]) if shown_pages else page

    print("当前获取第{}页数据".format(now_page), "数目", len(detail_links))
    return detail_links
def get_detil(urls=None, timeout=30):  # fetch the detail pages
    """Download each patent detail page and print the fields parsed from it.

    Args:
        urls: Iterable of query-string suffixes (as returned by
            ``get_list_info``) that are appended to the detail-page address.
            When omitted, the module-level ``urls_info`` is used, so the
            original zero-argument call keeps working.
        timeout: Seconds passed to ``requests.get`` for every page.

    Raises:
        IndexError: If a page lacks the title or any field other than the
            patent agency / agent pair (those two default to ``''``).
        requests.Timeout: If a page does not answer within ``timeout``.

    Note:
        Reads the module-level ``headers``. Output goes to stdout only;
        nothing is returned.
    """
    if urls is None:
        urls = urls_info

    # Compile every pattern once instead of once per page and per field.
    title_re = re.compile('font-weight:bold;text-align:center;">(.*?)</td>')
    int_cl_re = re.compile('【专利分类号】(.*?)推荐下载')
    labels = ('公开号', '公开日', '申请号', '申请日', '申请人', '地址',
              '发明人', '专利代理机构', '代理人', '摘要')
    field_res = {label: re.compile('【' + label + '】(.*?)【') for label in labels}

    def first(label, text):
        # First captured value for `label`; IndexError when it is absent.
        return field_res[label].findall(text)[0]

    for url in urls:
        # The old address is slow; the newer
        # https://kns.cnki.net/KCMS/detail/detail.aspx can be used instead,
        # but the regular expressions then need to be adapted.
        detail_url = 'https://dbpub.cnki.net/grid2008/dbpub/detail.aspx' + url  # detail page address
        print(detail_url)
        response = requests.get(url=detail_url, headers=headers, timeout=timeout)

        box_text = ''.join(etree.HTML(response.text).xpath('//*[@id="box"]//text()'))
        # NOTE(review): the original chained a second, visually identical
        # .replace(' ', ''); if that one targeted a non-breaking space
        # ('\xa0'), add it back here.
        main_info = box_text.replace('\r\n', '').replace(' ', '')

        title = title_re.findall(response.text)[0]
        gb_id = first('公开号', main_info)
        gb_time = first('公开日', main_info)
        sq_id = first('申请号', main_info)
        sq_time = first('申请日', main_info)
        sq_person = first('申请人', main_info)
        addr = first('地址', main_info)
        fmr = first('发明人', main_info).replace(';', '#')
        int_cl = int_cl_re.findall(main_info)[0]
        try:
            patent_agency = first('专利代理机构', main_info)
            agent = first('代理人', main_info).replace(';', '#')
        except IndexError:
            # Agency and agent are optional; if either is missing both are
            # reported as empty.
            patent_agency = agent = ''
        abstract = first('摘要', main_info).replace("'", '"')
        print(title, gb_id, gb_time, sq_id, sq_time, sq_person, addr, fmr, int_cl, patent_agency, agent, abstract)
if __name__ == '__main__':
    # `headers`, `session` and `urls_info` must stay module-level names
    # assigned in this order: the functions above read them as globals.
    headers = {
        # 'br' is intentionally not advertised: Requests only decodes Brotli
        # responses when an optional Brotli package is installed, and an
        # undecoded body would make every regex/XPath lookup fail.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
        'Referer': 'https://epub.cnki.net/kns/brief/result.aspx?dbprefix=SCPD',
    }
    session = get_cookie()        # run the search, keep its cookies
    urls_info = get_list_info()   # detail-link suffixes from the list page
    get_detil()                   # download and print every detail page
结语:
注:下面给出博主的细分类型
# Category ids: each string is one comma-separated group of category codes.
# NOTE(review): 'D044,D045,D046,D047' is fully contained in the entry just
# before it, and no entry starts with 'E' - confirm both are intended.
navicode_list = [
    # A
    "A001,A002,A003,A004,A005,A006,A007",
    "A008,A009,A010,A011,A012,A013",
    # F, G, H, J
    "F,G,H,J",
    # B
    "B014_1,B014_2",
    "B014_31,B014_32,B014_33,B014_34,B014_35,B014_36,B014_37",
    "B014_38,B014_39,B014_3A",
    "B014_4",
    "B014_5,B014_6,B014_7,B014_8",
    "B015_1,B015_3,B015_4,B015_5,B015_6,B015_7,B015_8",
    "B015_2",
    "B016_11",
    "B016_12",
    "B016_3,B016_5,B016_6,B016_7,B016_8",
    "B016_4",
    "B016_21,B016_22,B016_23,B016_24,B016_26,B016_27,B016_28,B016_29",
    "B016_25",
    "B017,B018,B019",
    "B020_1",
    "B020_2,B020_3,B020_4,B020_5,B020_6,B020_7,B020_8,B020_9,B020_A,B020_B,B020_C",
    "B021,B023,B025",
    "B022_1,B022_2,B022_3,B022_4,B022_5",
    "B022_6,B022_7",
    "B022_8,B022_B,B022_C",
    "B022_9",
    "B022_A",
    "B024_3",
    "B024_7",
    "B024_1,B024_2,B024_4,B024_5,B024_6",
    "B024_A",
    "B024_B,B024_E",
    "B024_C",
    "B024_D",
    "B024_8,B024_9",
    "B026",
    "B027_1,B027_2,B027_3,B027_4",
    "B027_5,B027_6",
    # C
    "C028_1,C028_2,C028_4,C028_5,C028_6,C028_7,C028_8",
    "C028_9",
    "C028_38",
    "C028_31,C028_32,C028_33,C028_34,C028_35",
    "C028_36,C028_37,C028_39,C028_3A,C028_3B,C028_3C",
    "C029_1,C029_2,C029_3,C029_4,C029_6,C029_7",
    "C029_8,C029_9,C029_B,C029_C,C029_D",
    "C029_51,C029_52,C029_53,C029_54,C029_55",
    "C029_56,C029_57,C029_58,C029_59",
    "C029_A1,C029_A3",
    "C029_A2",
    "C030_1,C030_2,C030_3,C030_4,C030_5",
    "C030_6,C030_7,C030_8,C030_9,C030_A",
    "C030_B,C030_C,C030_D,C030_E,C030_F,C030_G,C030_H,C030_I",
    "C031,C032,C033,C034",
    "C035_1,C035_2,C035_3,C035_4,C035_5,C035_6,C035_7,C035_8",
    "C035_9,C035_A,C035_B,C035_C,C035_D,C035_E",
    "C036,C037,C040,C041",
    "C038_1,C038_21,C038_22,C038_23,C038_24,C038_3",
    "C038_25,C038_26,C038_27,C038_28,C038_29",
    "C039",
    "C042",
    # D
    "D043,D044,D045,D046,D047",
    "D044,D045,D046,D047",
    # I
    "I135_1,I135_2,I135_3,I135_4,I135_7,I135_8",
    "I135_6",
    "I135_522,I135_523,I135_524",
    "I135_521",
    "I138_1,I138_2,I138_3,I138_4,I138_5,I138_6,I138_7,I138_8,I138_9,I138_A,I138_B",
    "I138_C12,I138_C13,I138_C14,I138_C2",
    "I138_C11",
    "I136_87,I136_88",
    "I136_84,I136_85,I136_86",
    "I136_81,I136_82,I136_83",
    "I137_3,I137_4,I137_5",
    "I137_1,I137_2",
    "I139,I140,I141,I142,I143,I144",
    "I136_1,I136_2,I136_3,I136_4,I136_5,I136_6,I136_7",
    "I136_9,I136_A,I136_B,I136_C,I136_D,I136_E,I136_F,I136_G",
]