# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys
import re
from selenium import webdriver
import time
import random
import os
from fake_useragent import UserAgent
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
# Randomised User-Agent so repeated requests to PMC look less like a bot.
ua = UserAgent()
header = {
'User-Agent': ua.random,
}
# Running counter used to name the per-article output files (1.txt, 2.txt, ...).
num = 1
"""
response = requests.get("https://www.ncbi.nlm.nih.gov/pmc/?term=carbon",headers=headers)
soup = BeautifulSoup(response.text,'lxml')
contents = soup.find_all('div', class_='title')
for content in contents:
print(content.text) #文章题目
print(content.find('a')['href']) #文章链接
"""
# Headless Chrome: PMC's result pager is driven client-side, so a plain
# requests fetch cannot follow the "next page" link.
option = webdriver.ChromeOptions()
option.add_argument('headless')
option.add_argument('--disable-gpu')
# Raw string for the Windows path: the original '\c' only survived because
# Python leaves unknown escapes alone, which is a DeprecationWarning on
# modern interpreters.
driver = webdriver.Chrome(executable_path=r'E:\chromedriver_win32\chromedriver.exe',
                          chrome_options=option)
wait = WebDriverWait(driver, timeout=10)
driver.get('https://www.ncbi.nlm.nih.gov/pmc/?term=carbon')
def next_page(retries=3):
    """Click the result pager's "next" link and scrape the new page.

    Retries up to ``retries`` times when the pager element does not become
    visible within the wait timeout, then gives up silently. The original
    recursed on every ``TimeoutException`` with no bound, which could recurse
    forever (and eventually overflow the stack) on a dead connection.

    :param retries: maximum number of attempts before giving up.
    """
    print('正在翻页...')
    for _ in range(retries):
        try:
            pager = wait.until(EC.visibility_of_element_located(
                (By.XPATH, '//*[@id="EntrezSystem2.PEntrez.PMC.Pmc_ResultsPanel.Entrez_Pager.Page"]')))
            pager.click()
            Get_list()
            return
        except TimeoutException:
            # Element not visible yet — retry the wait/click cycle.
            continue
def Get_list():
    """Scrape every article linked from the current search-result page.

    Reads the live page out of the Selenium driver, finds each result's
    title block, and hands the absolute article URL to ``Get_info``.
    """
    base_url = 'https://www.ncbi.nlm.nih.gov'
    page_soup = BeautifulSoup(driver.page_source, 'lxml')
    for title_div in page_soup.find_all('div', class_='title'):
        # href is site-relative, e.g. /pmc/articles/PMC4711879/
        relative = title_div.find('a')['href']
        Get_info(base_url + relative)
def Get_info(url):
    """Download one PMC article page and save its text to a numbered file.

    Writes title, optional keywords, and body paragraphs (separated by blank
    lines) to ``E:\\文献全文\\carbon\\<num>.txt``, then increments the global
    file counter ``num``.

    :param url: absolute URL of a PMC article page.
    :raises requests.RequestException: on network failure/timeout.
    :raises AttributeError: if the page has no ``h1.content-title`` element.
    """
    global num
    # os.path.join instead of manual "\\" concatenation.
    filename = os.path.join(r"E:\文献全文\carbon", str(num) + ".txt")
    response = requests.get(url, headers=header, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find('h1', class_="content-title").text.strip()
    # Keywords are optional on PMC pages. The original used a bare
    # `except: pass` and later a second bare except to swallow the resulting
    # NameError; an explicit None check is equivalent and doesn't hide bugs.
    kwd_span = soup.find('span', class_="kwd-text")
    keywords = kwd_span.text.strip() if kwd_span is not None else None
    # Body paragraphs use two id schemes depending on the article's vintage;
    # collect the "P..." ids first, then the "__p..." ids (original order).
    paragraphs = []
    for each in soup.find_all('p', attrs={"id": re.compile(r"P(\d\w+)?")}):
        paragraphs.append(each.text)
    for each in soup.find_all('p', attrs={"id": re.compile(r"__p(\d\w+)?")}):
        paragraphs.append(each.text)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(str(title) + '\n\n')
        f.write('\n\n')
        if keywords:
            f.write(str(keywords) + '\n\n')
        for paragraph in paragraphs:
            f.write(paragraph + '\n\n')
    num += 1
if __name__ =="__main__":
page = 50
Get_list()
for numer in range(page):
next_page()
driver.close()
# 转载自 blog.csdn.net/qq1195365047/article/details/88659365
# NOTE(review): the surrounding CSDN page boilerplate that was pasted in here
# has been commented out — as bare text it made the module unimportable.