from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import time
import requests
import zipfile
import os
import pandas as pd
import re
import traceback
from bs4 import BeautifulSoup
def un_zip(file_name, to_dir='./'):
    """Extract every member of a zip archive into *to_dir*.

    Parameters:
        file_name: path of the .zip file to read.
        to_dir: target directory; created (parents included) if missing.
    """
    # makedirs(exist_ok=True) replaces the isdir/mkdir dance and also
    # works when intermediate directories are missing.
    os.makedirs(to_dir, exist_ok=True)
    # Context manager guarantees the archive is closed even on error;
    # extractall is equivalent to extracting each namelist() member.
    with zipfile.ZipFile(file_name) as zip_file:
        zip_file.extractall(to_dir)
def download_driver(to_dir='./', version=''):
    """Download chromedriver_win32 from the taobao mirror and unpack it.

    Parameters:
        to_dir: directory the driver is extracted into.
        version: optional major-version pin; empty means "latest release".
    """
    print('install chrome-driver first')
    mirror = 'http://npm.taobao.org/mirrors/chromedriver/'
    # Resolve the concrete release number to download.
    if len(version) > 0:
        release_url = mirror + 'LATEST_RELEASE_' + version
    else:
        release_url = mirror + 'LATEST_RELEASE'
    release = requests.get(release_url).content.decode('utf8')
    # Fetch the zip, unpack it next to to_dir, then drop the archive.
    archive_name = "chromedriver_win32.zip"
    payload = requests.get(mirror + release + '/' + archive_name)
    with open(archive_name, "wb") as fh:
        fh.write(payload.content)
    un_zip(archive_name, to_dir)
    os.remove(archive_name)
# Start Chrome; if the local chromedriver is missing or incompatible,
# download one (pinned to major version 76 -- must match the installed
# Chrome; TODO confirm) and retry once.
try:
    driver = webdriver.Chrome()
except Exception as e:
    download_driver(to_dir='./', version='76')
    driver = webdriver.Chrome()
# Open CNKI's advanced-search page; the user fills in the query by hand.
driver.get("https://kns.cnki.net/kns8/AdvSearch")
# Load the CLC (中图分类号) code -> category-name table, indexed by code
# so search_class_name() can do prefix lookups against kv.index.
kv = pd.read_csv('中图分类号.csv', names = ['k','v'], delimiter=',')
kv = kv.set_index('k')
def search_class_name(i, table=None):
    """Map a CLC classification code to its category name.

    Takes the first code before ';' and repeatedly strips the last
    character until some prefix is found in the lookup table.

    Parameters:
        i: raw classification-code string (non-str values, e.g. NaN
           from pandas, yield '').
        table: DataFrame indexed by code with a 'v' column; defaults to
               the module-level `kv` loaded from 中图分类号.csv.

    Returns:
        The matched category name, or '' when nothing matches.
    """
    if table is None:
        table = kv
    # isinstance instead of `type(i) is str`; rejects NaN/None cleanly.
    if not isinstance(i, str):
        return ''
    code = i.split(';')[0]
    # Longest-prefix match against the table's index.
    while len(code) > 0 and code not in table.index:
        code = code[:-1]
    if len(code) > 0:
        return table.loc[code]['v']
    return ''
# The user performs the search in the browser; scraping starts once
# they confirm here.
input('请在弹出的浏览器中输入检索词,待结果出现后,输入任意字符继续:')
result_file = 'result88.csv'
# Parse the total hit count and page count off the first result page.
html = driver.page_source
html = BeautifulSoup(html, 'lxml')
num_papers = int(html.find('span', {'class':'pagerTitleCell'}).em.text)
num_pages = int(html.find('span', {'class':'countPageMark'}).text.split('/')[1])
print('总共检索到 {} 篇文章, 共 {} 个页面'.format(num_papers, num_pages))
failed_list = []  # titles of rows whose detail fetch failed
encoding = 'utf_8_sig'  # UTF-8 with BOM so Excel opens the CSV correctly
end = False  # set True when no "下一页" (next page) link remains
# Main scraping loop: one iteration per result page. Results are written
# to result_file after every page, so a crash loses at most one page.
while(not end):
    # Resume by re-reading the existing CSV; otherwise start fresh.
    if os.path.exists(result_file):
        result = pd.read_csv(result_file, encoding=encoding)
    else:
        result = pd.DataFrame(columns=[
            '论文ID',
            '题名',
            '作者',
            '来源',
            '发表时间',
            '数据库',
            '被引',
            '专辑',
            '专题',
            '分类号',
            '中图分类',
            '摘要',
            '关键词'
        ])
    time.sleep(1)
    try:
        html = driver.page_source
        html = BeautifulSoup(html, 'lxml')
        table = html.find('table', {'class': 'result-table-list'}).findAll('tr')
    except Exception:
        # Result table missing -- usually a CAPTCHA page. Let the user
        # solve it in the browser, then re-parse.
        s = input('请在网页中输入验证码,或检查其他错误,待页面加载出来后,在此输入任意字符:')
        html = driver.page_source
        html = BeautifulSoup(html, 'lxml')
        table = html.find('table', {'class': 'result-table-list'}).findAll('tr')
    # First <tr> is the table header, so skip it.
    for tr in table[1:]:
        # Initialized before the try so the except handler can always
        # reference it even when parsing fails on the first field.
        title = ''
        try:
            time.sleep(0.5)  # throttle detail-page requests
            td_name = tr.find('td',{'class':'name'})
            title = td_name.a.get_text().strip()
            authors = tr.find('td',{'class':'author'}).text
            source = tr.find('td',{'class':'source'}).text.strip()
            date = tr.find('td',{'class':'date'}).text.strip()
            data = tr.find('td',{'class':'data'}).text.strip()
            quote = tr.find('td',{'class':'quote'}).text.strip()
            url = ''
            keywords = ''
            abstract = ''
            collection = ''
            topic = ''
            class_id = ''
            if 'DbCode' in str(td_name) and 'DbName' in str(td_name):
                # Regular CNKI record: fetch its detail page.
                dbcode = re.findall('DbCode=(.*?)[&|"]',str(td_name))[0]
                dbname = re.findall('DbName=(.*?)[&|"]',str(td_name))[0]
                filename = re.findall('FileName=(.*?)[&|"]',str(td_name))[0]
                url = 'https://kns.cnki.net/kcms/detail/detail.aspx?dbcode={}&dbname={}&filename={}'.format(dbcode,dbname,filename)
                detail_page = requests.get(url).text
                detail_page = BeautifulSoup(detail_page,'lxml')
                abstract_ = detail_page.find('span',{'class':'abstract-text'})
                if abstract_ is not None:
                    abstract = abstract_.text
                keywords_ = detail_page.find('p',{'class':'keywords'})
                if keywords_ is not None:
                    keywords = ' '.join([i.strip() for i in keywords_.text.split('\r\n')])
                div_doc = detail_page.find('div',{'class':'doc'})
                if div_doc is not None:
                    collection_ = div_doc.findAll(lambda tag: tag.get('class') is not None and 'top-space' in tag.get('class') and '专辑' in tag.text)
                    if len(collection_) > 0:
                        collection = collection_[0].p.text
                    topic_ = div_doc.findAll(lambda tag: tag.get('class') is not None and 'top-space' in tag.get('class') and '专题' in tag.text)
                    if len(topic_) > 0:
                        topic = topic_[0].p.text
                    class_id_ = div_doc.findAll(lambda tag: tag.get('class') is not None and 'top-space' in tag.get('class') and '分类号' in tag.text)
                    if len(class_id_) > 0:
                        class_id = class_id_[0].p.text
                    if len(keywords) == 0 and len(abstract) == 0:
                        # Achievement-style detail pages use a 'brief'
                        # layout with different field labels.
                        brief = div_doc.find('div',{'class':'brief'})
                        if brief is not None:
                            keywords_ = div_doc.findAll(lambda tag: tag.get('class') is not None and 'row' in tag.get('class') and '关键词' in tag.text)
                            if len(keywords_) > 0:
                                keywords = keywords_[0].p.text
                                keywords = ' '.join([i.strip() for i in keywords.split('\r\n')])
                            abstract_ = div_doc.findAll(lambda tag: tag.get('class') is not None and 'row' in tag.get('class') and '成果简介' in tag.text)
                            if len(abstract_) > 0:
                                abstract = abstract_[0].p.text
                            class_id_ = div_doc.findAll(lambda tag: tag.get('class') is not None and 'row' in tag.get('class') and '中图分类号' in tag.text)
                            if len(class_id_) > 0:
                                class_id = class_id_[0].p.text
            # BUG FIX: the original tested `elif 'RedirectScholar':`, a
            # non-empty literal that is always true, so every non-DbCode
            # row entered this branch and rows without a scholar link
            # crashed on the regex [0] lookup and landed in failed_list.
            elif 'RedirectScholar' in str(td_name):
                # CNKI Scholar (external) record.
                tablename = re.findall('tablename=(.*?)[&|"]',str(td_name))[0]
                filename = re.findall('filename=(.*?)[&|"]',str(td_name))[0]
                url = 'https://schlr.cnki.net/Detail/index/{}/{}'.format(tablename, filename)
                detail_page = requests.get(url).text
                detail_page = BeautifulSoup(detail_page,'lxml')
                div_doc = detail_page.find('div',{'class':'right-top'})
                if div_doc is not None:
                    keywords = div_doc.find('div',{'class':'doc-keyword doc-item'}) .find('span',{'class':'value'}).text
                    abstract = div_doc.find('div',{'class':'doc-summary doc-item show'}) .find('span',{'class':'value'}).text
            # Append one row; result.shape[0] is the next free index.
            result.loc[result.shape[0]] = {
                '论文ID' : url,
                '题名' : title,
                '作者' : authors,
                '来源' : source,
                '发表时间' : date,
                '数据库' : data,
                '被引' : quote,
                '专辑' : collection,
                '专题' : topic,
                '分类号' : class_id,
                '中图分类' : search_class_name(class_id),
                '摘要' : abstract,
                '关键词' : keywords
            }
            print(result.iloc[-1],'\n')
        except Exception as e:
            # Best-effort: record the failure and keep scraping.
            print('获取信息失败:'+title)
            print('错误信息 :')
            traceback.print_exc()
            print()
            failed_list.append(title)
    # Persist this page's rows before navigating away.
    result.to_csv(result_file, index=False, encoding=encoding)
    end = True
    # Click "下一页" (next page) when present; otherwise the loop ends.
    page_links = driver.find_elements_by_xpath('//div[@class="pages"]/a')
    for a in page_links:
        if '下一页' in a.text:
            end=False
            print('进入下一页\n')
            a.click()
# Final save plus a summary listing every row that could not be scraped.
result.to_csv(result_file, index=False, encoding=encoding)
print('{} 篇文章抓取失败'.format(len(failed_list)))
for i in failed_list:
    print(i)