Literature Retrieval
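The script below scrapes PubMed Central (PMC) search results for the query "carbon": it pages through the result list with a headless Chrome driven by Selenium, follows each article link, and writes every article's title, keywords, and body paragraphs to a numbered text file.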

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests
import re
import time
import random
import os
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
ua = UserAgent()
header = {
    'User-Agent': ua.random,   # random real-browser UA string from fake_useragent
}
num = 1   # running index used to number the saved article files
"""
response = requests.get("https://www.ncbi.nlm.nih.gov/pmc/?term=carbon",headers=headers)
soup = BeautifulSoup(response.text,'lxml')
contents = soup.find_all('div', class_='title')
for content in contents:
    print(content.text)  #文章题目
    print(content.find('a')['href'])  #文章链接
"""


option = webdriver.ChromeOptions()
option.add_argument('--headless')        # run Chrome without a visible window
option.add_argument('--disable-gpu')
# option.add_argument('user-agent=...')  # optionally spoof the browser UA as well
driver = webdriver.Chrome(executable_path=r'E:\chromedriver_win32\chromedriver.exe', chrome_options=option)
wait = WebDriverWait(driver, timeout=10)
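# Note: executable_path/chrome_options are the Selenium 3 calling convention this
# script was written against. On Selenium 4+ the equivalent setup is:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service(r'E:\chromedriver_win32\chromedriver.exe'), options=option)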
driver.get('https://www.ncbi.nlm.nih.gov/pmc/?term=carbon')
def next_page(retries=3):
    print('Loading next page...')
    try:
        # Click the "Next page" control in the results pager, then scrape the new page.
        wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="EntrezSystem2.PEntrez.PMC.Pmc_ResultsPanel.Entrez_Pager.Page"]'))).click()
        Get_list()
    except TimeoutException:
        if retries > 0:
            next_page(retries - 1)   # retry a bounded number of times if the pager never appeared
def Get_list():
    pre_url = 'https://www.ncbi.nlm.nih.gov'
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    contents = soup.find_all('div', class_='title')
    for content in contents:
        # print(content.text)                # article title
        end_url = content.find('a')['href']  # article link, e.g. /pmc/articles/PMC4711879/
        Article_url = pre_url + end_url
        Get_info(Article_url)
def Get_info(url):
    content = []
    global num
    filename = r"E:\文献全文\carbon" + "\\" + str(num) + ".txt"   # output path; change to suit your machine
    response = requests.get(url, headers=header, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.find('h1', class_="content-title").text.strip()
    keywords = None
    try:
        keywords = soup.find('span', class_="kwd-text").text.strip()
    except AttributeError:   # not every article lists keywords
        pass
    # Body paragraphs in PMC full text carry ids like "__p1" or "P1", depending on the article's markup.
    contents = soup.find_all('p', attrs={"id": re.compile(r"__p(\d\w+)?")})
    contents2 = soup.find_all('p', attrs={"id": re.compile(r"P(\d\w+)?")})
    if contents2:
        for each in contents2:
            content.append(each.text)
    if contents:
        for each in contents:
            content.append(each.text)
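    os.makedirs(os.path.dirname(filename), exist_ok=True)   # ensure the output folder exists before writing (uses the os import)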
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(str(title) + '\n\n')
        f.write('\n\n')
        if keywords:
            f.write(str(keywords) + '\n\n')
        for each in content:
            f.write(each + '\n\n')
        num += 1

if __name__ == "__main__":
    page = 50              # number of additional result pages to crawl
    Get_list()             # scrape the first results page
    for _ in range(page):
        next_page()
    driver.quit()          # quit() ends the browser session and the chromedriver process


Reposted from blog.csdn.net/qq1195365047/article/details/88659365