1. Homepage
2. Code
import re
import time
from bs4 import BeautifulSoup
import requests
from requests import RequestException
import conference_wf
from lxml import etree


def get_page(url):
    try:
        # Add a User-Agent header to masquerade as a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            return response.text
        return None
    except RequestException as e:
        print(e)
        return None


def get_url(html):
    url_list = []
    soup = BeautifulSoup(htm1.content, 'html.parser')
    ids = soup.find('div', {'class': "main wrapper clearfix"}).find_all("li")
    for id in ids:
        a = id.find('a')
        url_list.append(a.attrs.get('href'))
    return url_list


def get_info(url):
    conference_wf.main(url)


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    key_word = input('请输入搜索关键词:')  # can be entered interactively or hard-coded
    # Which page to start from and how many pages to crawl
    start_page = int(input('请输入爬取的起始页:'))
    base_url = 'https://papers.nips.cc/search/?q={}&page={}'
    first_url = base_url.format(key_word, start_page)
    htm1 = requests.get(first_url, headers=headers)
    soup = BeautifulSoup(htm1.text, 'html.parser')
    # Total number of pages
    # pagesum = soup.find('span', class_='searchPageWrap_all').get_text()
    pagesum = 5
    for page in range(int(start_page), int(pagesum)):
        new_url = base_url.format(key_word, page)
        # Crawl the current page: send the request, get the response
        html = get_page(new_url)
        # Parse the response and extract the URL of every paper on this page
        url_list = get_url(html)
        for url in url_list:
            # Fetch the details of each paper
            urll = "https://papers.nips.cc" + url
            print("url:", urll)
            get_info(urll)
            time.sleep(2)  # wait 2 seconds between requests
conference_wf.py
import os
import re
import requests
import xlrd
import xlutils.copy
import xlwt
from bs4 import BeautifulSoup


# Mainly parses the title, authors, abstract and URL; the other fields are placeholder test data
def parse_html(url):
    # Parse the page with BeautifulSoup
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Title
    title = soup.find('h2', {'class': "subtitle"})
    title = title.text
    # Authors
    author = soup.find_all('li', {'class': "author"})
    authors = ""
    for author_a in author:
        authors = authors + author_a.find('a').text + ';'
    # First author's affiliation
    unit = "qq"
    # Keywords
    keywords = 'qq'
    # Abstract
    abstract = soup.find('h3', {'class': "abstract"})
    if abstract:
        abstract = abstract.text.strip()
    else:
        abstract = ''
    # Conference name
    conference = "qq"
    # Conference date
    date = "qq"
    # Publication date
    online_date = "qq"
    paper = [title, authors, unit, keywords, abstract, conference, date, online_date]
    paper.append(url)
    return paper


def save_p(paper):
    if not os.path.exists('会议论文.xls'):
        wb = xlwt.Workbook()
        sheet = wb.add_sheet('sheet1')
        title = ['题目', '作者', '第一作者单位', '关键词', '摘要', '会议名称', '会议时间', '发表时间', '链接']
        for i in range(len(title)):
            sheet.write(0, i, title[i])  # write the header row in row 0
        wb.save('会议论文.xls')
    wb = xlrd.open_workbook('会议论文.xls')
    sheet = wb.sheet_by_index(0)
    rows = sheet.nrows  # current number of rows
    ws = xlutils.copy.copy(wb)
    sheet = ws.get_sheet(0)
    for i in range(len(paper)):
        sheet.write(rows, i, paper[i])
    ws.save('会议论文.xls')


def main(url):
    # Send the request, get the response, parse it
    paper = parse_html(url)
    # Store the data
    save_p(paper)
Problem summary:
1. BeautifulSoup(htm1.content, 'html.parser'): the argument passed to BeautifulSoup should be the page text, not the Response object returned by requests.get.
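A minimal sketch of how get_url could be corrected, assuming it is given the text returned by get_page above rather than reaching for the global htm1 Response:

from bs4 import BeautifulSoup

def get_url(html):
    # Parse the text passed in by the caller, not the Response object
    url_list = []
    soup = BeautifulSoup(html, 'html.parser')
    for li in soup.find('div', {'class': "main wrapper clearfix"}).find_all("li"):
        a = li.find('a')
        url_list.append(a.attrs.get('href'))
    return url_list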
2. soup.find('div',{'class':"main wrapper clearfix"}).find_all("li"): both the soup object and any Tag it returns can call find and find_all directly, so lookups can be chained.
3. Getting the href of the <a> tag inside every <li> of the <ul>:
ids = soup.find('div', {'class': "main wrapper clearfix"}).find_all("li")
for id in ids:
    a = id.find('a')
    url_list.append(a.attrs.get('href'))
return url_list
4. Getting the text of the <a> tag inside every <li> of the <ul>:
ids = soup.find('div', {'class': "main wrapper clearfix"}).find_all("li")
for id in ids:
    a = id.find('a')
    url_list.append(a.text)
return url_list
Result: the abstract column comes out blank.
Fix:
Looking at the page source, the abstract sits in a <p> tag with class="abstract", while the scraping code looked for an h3. Changing h3 to p fixes it.
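A small standalone sketch of that fix; the helper name parse_abstract is only for illustration, the original code does this inline in parse_html:

from bs4 import BeautifulSoup

def parse_abstract(page_text):
    # The abstract lives in <p class="abstract">, not <h3 class="abstract">
    abstract_tag = BeautifulSoup(page_text, 'html.parser').find('p', {'class': "abstract"})
    return abstract_tag.text.strip() if abstract_tag else ''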