[Python crawler series tutorial 27/100] Using Selenium to crawl Python literature information from CNKI, until you suddenly feel like a crawling master

Crawling ideas

Crawl address: https://www.cnki.net/old, then click 高级检索 (Advanced Search)

Then switch to 专业检索 (Expert Search) and enter the subject, Python, as the search condition

Finally, parse the result pages with XPath, as sketched below.
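
Before the full script, here is a minimal sketch of the XPath idea on a simplified result row. The markup below is a stripped-down stand-in for CNKI's real result table, which the full code handles.

from lxml import etree

# Simplified stand-in for one row of CNKI's result table
sample = '''
<table>
  <tr bgcolor="#f6f7fb">
    <td><a class="fz14">Some Python paper title</a></td>
  </tr>
</table>
'''

html = etree.HTML(sample)
for tr in html.xpath('//tr[@bgcolor]'):  # every result row carries a bgcolor attribute
    title = tr.xpath('./td//a[@class="fz14"]/text()')[0]
    print(title)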

Instructions for use

Environment requirements:
Chrome browser
The chromedriver.exe driver matching your Chrome version (the bundled one is 88.0.4324.182; mirror download: http://npm.taobao.org/mirrors/chromedriver/, please download it yourself)

Usage:
1. Put the matching version of chromedriver.exe under ./tools
2. Open ./datas/since.xls and configure the search conditions (the first row holds the search field keys; * means "match all"); a sketch for generating a sample file follows this list
3. Run main.py
4. The results are written to result.csv
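
If you want a template for ./datas/since.xls, here is a minimal, hypothetical sketch that generates one with xlwt (not part of the original project). The column order SU, KY, AU, AF, FU matches what get_data() reads, and the row values are illustrative.

import xlwt

wb = xlwt.Workbook()
sheet = wb.add_sheet("conditions")

# Header row (row 0 is skipped by get_data())
for col, field in enumerate(["SU", "KY", "AU", "AF", "FU"]):
    sheet.write(0, col, field)

# One search condition per row; "*" means no restriction on this field
sheet.write(1, 0, "Python")      # subject = Python
for col in range(1, 5):
    sheet.write(1, col, "*")

wb.save("./datas/since.xls")     # assumes the ./datas directory exists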

Specific code

from selenium import webdriver
from lxml import etree
from time import sleep
from random import uniform
import csv
import re
import xlrd
from math import ceil
from sys import exit

def get_data():
    # Read the search conditions from ./datas/since.xls and build one
    # CNKI expert-search expression per row (row 0 holds the field keys).
    data_sheet = xlrd.open_workbook(r"./datas/since.xls").sheet_by_index(0)
    # Column order: SU=subject, KY=keyword, AU=author, AF=affiliation, FU=fund
    fields = ["SU", "KY", "AU", "AF", "FU"]
    results = []
    for row in range(1, data_sheet.nrows):
        terms = []
        for col, field in enumerate(fields):
            value = data_sheet.cell(row, col).value
            if value != "*":  # "*" means no restriction on this field
                terms.append(field + "=" + value)
        result = " AND ".join(terms)
        results.append(result)
        print(result)
    return results
'''
Field codes: SU=subject, TKA=title/keywords/abstract, KY=keywords, TI=title, FT=full text,
AU=author, FI=first author, RP=corresponding author, AF=author affiliation, FU=fund,
AB=abstract, CO=subheading, RF=references, CLC=CLC number, LY=source, DOI=DOI, CF=times cited
'''
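# Illustrative examples of the expressions get_data() builds
# (row values here are hypothetical):
#   ("Python", "*", "*", "*", "*")    -> "SU=Python"
#   ("Python", "爬虫", "*", "*", "*") -> "SU=Python AND KY=爬虫"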

def over_one_search():
    # Leave the result iframe and clear the search box before the next expression
    driver.switch_to.parent_frame()
    search_win = driver.find_element_by_id('expertvalue')
    search_win.clear()
    sleep(uniform(1,2))

def parse_page(driver):
    # Parse one result page and write each record to the csv.
    try:
        html = etree.HTML(driver.page_source)
        trs = html.xpath('//tr[@bgcolor]')
    except:
        print("Failed to parse the page source... aborting...")
        return -1
    for tr in trs:
        try:
            title = tr.xpath('./td//a[@class="fz14"]/text()')[0]
            print(title)
        except:
            title = "NaN"
        try:
            authors = tr.xpath('./td[@class="author_flag"]/a[@class="KnowledgeNetLink"]//text()')
            authors = "|".join(authors)
        except:
            authors = "NaN"
        try:
            source = tr.xpath('./td//a[@target="_blank"]/text()')[1]
        except:
            source = "NaN"
        try:
            times = tr.xpath('./td[@align="center"]/text()')[0].strip()
        except:
            times = "NaN"
        try:
            database = tr.xpath('./td[@align="center"]/text()')[1].strip()
        except:
            database = "NaN"
        try:
            counted = tr.xpath('./td//span[@class="KnowledgeNetcont"]/a/text()')
            if len(counted) == 0:
                counted = 0
            else:
                counted = counted[0]
            downloadCount = tr.xpath('./td//span[@class="downloadCount"]/a/text()')
            if len(downloadCount) == 0:
                downloadCount = 0
            else:
                downloadCount = downloadCount[0]
        except:
            counted = "NaN"
            downloadCount = "NaN"
        try:
            downloadURL = tr.xpath('./td[@align="center"]/a[@href and @class="briefDl_D"]')[0].attrib["href"]
        except:
            downloadURL = "NaN"
        csvwriter.writerow([title,authors,source,times,database,counted,downloadCount,downloadURL])

# main

# Use the chromedriver placed under ./tools (see the instructions above)
driver = webdriver.Chrome(executable_path=r"./tools/chromedriver.exe")

url = "https://www.cnki.net/old"
driver.get(url)

# Advanced search (高级检索)
home_page = driver.find_element_by_id('highSearch')
home_page.click()
# The advanced-search page opens in a new window; switch to it
driver.switch_to.window(driver.window_handles[1])
# Expert search (专业检索)
search_page = driver.find_element_by_id('1_3')
search_page.click()
# Read the search conditions
results = get_data()
# Create the csv and write the header row
f = open("result.csv", "w", encoding="utf-8", newline="")
csvwriter = csv.writer(f)
csvwriter.writerow([
        "title",
        "authors",
        "source",
        "times",
        "database",
        "counted",
        "downloadCount",
        "downloadURL"
])
for result in results:
    # Type the search expression
    search_win = driver.find_element_by_id('expertvalue')
    search_win.send_keys(result)
    # Click the search button
    search_btn = driver.find_element_by_id('btnSearch')
    search_btn.click()
    # Switch to the result iframe
    iframe = driver.find_element_by_id('iframeResult')
    driver.switch_to.frame(iframe)
    sleep(uniform(1,2))
    # Get the number of hits and pages (20 results per page)
    try:
        html = etree.HTML(driver.page_source)
        sum_count_path = html.xpath('//div[@class="pagerTitleCell"]/text()')[0]
        print(sum_count_path)
        sum_count = int(re.search(r'[0-9]+', sum_count_path).group())
        page_count = ceil(sum_count / 20)
    except:
        print("Failed to read the result count... skipping this expression...")
        over_one_search()
        continue
    for index in range(page_count):
        # Parse the page and write its records to the csv
        parse_page(driver)
        # Report progress
        print("Page " + str(index+1) + " of " + str(page_count) + ", " + str(sum_count) + " results in total")
        if(index != page_count - 1):
            # Go to the next page
            try:
                next_page = driver.find_elements_by_xpath('.//div[@class="TitleLeftCell"]/a')[-1]
                next_page.click()
                sleep(uniform(8,10))
            except:
                # CNKI has shown a captcha: save a screenshot for manual inspection, then stop
                check_img = driver.find_elements_by_id('CheckCodeImg')[0]
                print("Captcha detected, saving a screenshot...")
                locations = check_img.location
                print(locations)
                sizes = check_img.size
                # Bounding box of the captcha image on the page
                region = (int(locations['x']), int(locations['y']),
                          int(locations['x'] + sizes['width']),
                          int(locations['y'] + sizes['height']))
                print(region)
                # Scroll to the top so the captcha is visible in the screenshot
                js_top = "var q=document.documentElement.scrollTop=0"
                driver.execute_script(js_top)
                savepath = "checkcode.jpg"
                driver.save_screenshot(savepath)
                exit(1)

    # Prepare for the next expression
    try:
        over_one_search()
    except:
        print("Failed to reset the search page... exiting...")
        exit(1)
f.close()
print("Crawl finished...")
driver.quit()
exit(0)
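
As a quick sanity check, the sketch below (not part of the original script) reads result.csv back with the standard csv module and prints the first few rows:

import csv

with open("result.csv", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    print(next(reader))          # header row
    for i, row in enumerate(reader):
        if i >= 5:               # show only the first five records
            break
        print(row)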

Specific crawl results

./datas/since.xls
A total of 298 results were crawled from CNKI.

Crawled result.csv
