【2020-10-16】知网专利信息爬虫

声明:本文只作学习研究,禁止用于非法用途,否则后果自负,如有侵权,请告知删除,谢谢!




项目场景:


网址:https://epub.cnki.net/kns/brief/result.aspx?dbprefix=SCPD

这次给大家带来的是知网专利信息的获取,没有复杂的反爬措施,只要先用session获取查询时的cookie,然后带着cookie访问列表页即可。

项目代码:


这里要讲解一下,一个页面最多50条数据,最多120页,所以我们要想获取全部的数据,就要对专利类型进行筛选(尽量细分),这样根据日期和专利类型搜索,大部分数据都能获取到。

在这里插入图片描述
在这里插入图片描述


测试代码如下,可自行修改
import requests,re
from lxml import etree


def get_cookie(navicode='A', date_from='2020-01-24', date_to='2020-01-24'):
    """Run the CNKI search request and return a session carrying its cookies.

    The list page (brief.aspx) only returns results after the search handler
    has been hit once with the query parameters; the cookies set during that
    request authorize the subsequent list-page requests.

    Args:
        navicode: category code to filter by (default 'A'); vary this to
            work around the 120-page / 50-rows-per-page result cap.
        date_from: publication-date lower bound, 'YYYY-MM-DD'.
        date_to: publication-date upper bound, 'YYYY-MM-DD'.

    Returns:
        A requests.Session with the search cookies attached.
    """
    params = (
        ('action', ''),
        ('NaviCode', navicode),  # category filter
        ('ua', '1.21'),
        ('PageName', 'ASP.brief_result_aspx'),
        ('DbPrefix', 'SCPD'),
        ('DbCatalog', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('ConfigFile', 'SCPD.xml'),
        ('db_opt', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('db_value', '\u4E2D\u56FD\u4E13\u5229\u6570\u636E\u5E93'),
        ('date_gkr_from', date_from),  # publication-date filter (from)
        ('date_gkr_to', date_to),      # publication-date filter (to)
        ('his', '0'),
        ('__', 'Fri Oct 16 2020 14:37:38 GMT+0800 (\u4E2D\u56FD\u6807\u51C6\u65F6\u95F4)'),
    )
    session = requests.Session()
    # NOTE(review): `headers` is a module-level dict defined in the
    # __main__ block — this function must run after it is created.
    session.get('https://epub.cnki.net/kns/request/SearchHandler.ashx',
                headers=headers, params=params)
    return session

def get_list_info(curpage=1):
    """Fetch one page of the result list and return detail-page URL suffixes.

    Uses the module-level ``session`` (created by get_cookie()) so the
    request carries the search cookies, and the module-level ``headers``.

    Args:
        curpage: 1-based page number to fetch (default 1). CNKI caps a
            search at 120 pages of 50 records each.

    Returns:
        List of query-string suffixes (starting with '?') for the
        detail.aspx page, one per patent on the page.
    """
    params = (
        ('curpage', str(curpage)),  # current page number
        ('RecordsPerPage', '50'),   # 50 is the maximum CNKI allows
        ('QueryID', '20'),
        ('ID', ''),
        ('turnpage', '1'),
        ('tpagemode', 'L'),
        ('dbPrefix', 'SCPD'),
        ('Fields', ''),
        ('DisplayMode', 'listmode'),
        ('SortType', "(公开日, 'DATE')desc"),
        ('PageName', 'ASP.brief_result_aspx'),
    )

    response = session.get('https://epub.cnki.net/kns/brief/brief.aspx',
                           headers=headers, params=params)

    selector = etree.HTML(response.text)
    # Each result row links to /kns/detail/detail.aspx?...; capture the suffix.
    urls_info = re.compile("<a class='fz14' href='/kns/detail/detail.aspx(.*?)'").findall(response.text)
    # Pager text, e.g. "浏览1/120" — used only for progress reporting.
    page_info = selector.xpath('//*[@id="J_ORDER"]/tr[2]/td/table/tr/td[2]/div/span[1]')[0].text
    nums = len(urls_info)
    now_page = int(re.compile('浏览(.*?)/').findall(page_info)[0])
    print("当前获取第{}页数据".format(now_page), "数目", nums)
    return urls_info

def get_detil(urls=None):  # (misspelled name kept for backward compatibility)
    """Fetch each patent detail page, parse its fields, and print them.

    Args:
        urls: iterable of URL suffixes ('?...') as returned by
            get_list_info(); defaults to the module-level ``urls_info``.

    Missing fields yield '' instead of crashing — the original
    ``findall(...)[0]`` raised IndexError whenever a page lacked a field.
    """
    if urls is None:
        urls = urls_info  # populated by get_list_info() in __main__

    def field(pattern, text, default=''):
        # First capture group of `pattern` in `text`, or `default` if absent.
        m = re.search(pattern, text)
        return m.group(1) if m else default

    for url in urls:
        # The old host is slow; https://kns.cnki.net/KCMS/detail/detail.aspx
        # can be used instead but requires different regexes.
        detail_url = 'https://dbpub.cnki.net/grid2008/dbpub/detail.aspx' + url
        print(detail_url)
        response = requests.get(url=detail_url, headers=headers)
        # Flatten the #box element's text and strip whitespace so the
        # 【label】value【label】... regexes below can match across tags.
        main_info = ''.join(
            etree.HTML(response.text).xpath('//*[@id="box"]//text()')
        ).replace('\r\n', '').replace(' ', '').replace(' ', '')
        # print(main_info)
        title = field('font-weight:bold;text-align:center;">(.*?)</td>', response.text)
        gb_id = field('【公开号】(.*?)【', main_info)
        gb_time = field('【公开日】(.*?)【', main_info)
        sq_id = field('【申请号】(.*?)【', main_info)
        sq_time = field('【申请日】(.*?)【', main_info)
        sq_person = field('【申请人】(.*?)【', main_info)
        addr = field('【地址】(.*?)【', main_info)
        # Inventors / agents are ';'-separated on the page; re-join with '#'.
        fmr = '#'.join(field('【发明人】(.*?)【', main_info).split(';'))
        int_cl = field('【专利分类号】(.*?)推荐下载', main_info)
        patent_agency = field('【专利代理机构】(.*?)【', main_info)
        agent = '#'.join(field('【代理人】(.*?)【', main_info).split(';'))
        abstract = field('【摘要】(.*?)【', main_info).replace("'", '"')
        print(title, gb_id, gb_time, sq_id, sq_time, sq_person, addr, fmr, int_cl, patent_agency, agent, abstract)
        # break


if __name__ == '__main__':
    # Browser-like headers shared by every request in this script; the
    # Referer matches the search page so CNKI accepts the requests.
    headers = {
    
    
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
        'Referer': 'https://epub.cnki.net/kns/brief/result.aspx?dbprefix=SCPD',
    }

    session = get_cookie()       # 1) run the search; cookies land on the session
    urls_info = get_list_info()  # 2) fetch one result page -> detail URL suffixes
    get_detil()                  # 3) scrape and print each detail page
 

在这里插入图片描述


结语:


注:下面给出博主的细分类型
    #分类id
    navicode_list = ['A001,A002,A003,A004,A005,A006,A007','A008,A009,A010,A011,A012,A013',
                     'F,G,H,J',
                     'B014_1,B014_2','B014_31,B014_32,B014_33,B014_34,B014_35,B014_36,B014_37','B014_38,B014_39,B014_3A','B014_4','B014_5,B014_6,B014_7,B014_8',
                     'B015_1,B015_3,B015_4,B015_5,B015_6,B015_7,B015_8','B015_2','B016_11','B016_12',
                     'B016_3,B016_5,B016_6,B016_7,B016_8','B016_4',
                     'B016_21,B016_22,B016_23,B016_24,B016_26,B016_27,B016_28,B016_29','B016_25','B017,B018,B019',
                     'B020_1','B020_2,B020_3,B020_4,B020_5,B020_6,B020_7,B020_8,B020_9,B020_A,B020_B,B020_C',
                     'B021,B023,B025','B022_1,B022_2,B022_3,B022_4,B022_5','B022_6,B022_7','B022_8,B022_B,B022_C','B022_9','B022_A',
                     'B024_3','B024_7','B024_1,B024_2,B024_4,B024_5,B024_6','B024_A','B024_B,B024_E','B024_C','B024_D','B024_8,B024_9',
                     'B026','B027_1,B027_2,B027_3,B027_4','B027_5,B027_6',
                     'C028_1,C028_2,C028_4,C028_5,C028_6,C028_7,C028_8','C028_9','C028_38','C028_31,C028_32,C028_33,C028_34,C028_35',
                     'C028_36,C028_37,C028_39,C028_3A,C028_3B,C028_3C','C029_1,C029_2,C029_3,C029_4,C029_6,C029_7',
                     'C029_8,C029_9,C029_B,C029_C,C029_D','C029_51,C029_52,C029_53,C029_54,C029_55',
                     'C029_56,C029_57,C029_58,C029_59','C029_A1,C029_A3','C029_A2','C030_1,C030_2,C030_3,C030_4,C030_5','C030_6,C030_7,C030_8,C030_9,C030_A',
                     'C030_B,C030_C,C030_D,C030_E,C030_F,C030_G,C030_H,C030_I','C031,C032,C033,C034','C035_1,C035_2,C035_3,C035_4,C035_5,C035_6,C035_7,C035_8',
                     'C035_9,C035_A,C035_B,C035_C,C035_D,C035_E','C036,C037,C040,C041','C038_1,C038_21,C038_22,C038_23,C038_24,C038_3',
                     'C038_25,C038_26,C038_27,C038_28,C038_29','C039','C042',
                     'D043,D044,D045,D046,D047','D044,D045,D046,D047',
                     'I135_1,I135_2,I135_3,I135_4,I135_7,I135_8','I135_6','I135_522,I135_523,I135_524','I135_521',
                     'I138_1,I138_2,I138_3,I138_4,I138_5,I138_6,I138_7,I138_8,I138_9,I138_A,I138_B','I138_C12,I138_C13,I138_C14,I138_C2','I138_C11',
                     'I136_87,I136_88','I136_84,I136_85,I136_86','I136_81,I136_82,I136_83',
                     'I137_3,I137_4,I137_5','I137_1,I137_2','I139,I140,I141,I142,I143,I144',
                     'I136_1,I136_2,I136_3,I136_4,I136_5,I136_6,I136_7','I136_9,I136_A,I136_B,I136_C,I136_D,I136_E,I136_F,I136_G']

猜你喜欢

转载自blog.csdn.net/qq_26079939/article/details/109117721
今日推荐