# Scrape the full novel "Romance of the Three Kingdoms" (三国演义):
import urllib.request
from bs4 import BeautifulSoup
import time
# Scrape the novel chapter by chapter: first fetch the table-of-contents
# page, then fetch every chapter page it links to, writing each chapter
# (title followed by body text) into 三国演义.txt.

# Table-of-contents URL for the whole book.
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# Build the request with a browser User-Agent (the site may reject the
# default urllib client string).
request = urllib.request.Request(url=url, headers=headers)
# Send the request and get the response.
response = urllib.request.urlopen(request)
# Parse the TOC page with bs4.
soup = BeautifulSoup(response.read(), 'html.parser')
# Each chapter is an <a> inside the .book-mulu list:
# its text is the chapter title, its href the chapter page path.
oa_list = soup.select('.book-mulu > ul > li > a')

# `with` guarantees the output file is closed even if a request
# raises partway through the loop.
with open('三国演义.txt', 'w', encoding='utf8') as fp:
    for oa in oa_list:
        # Chapter title shown as progress feedback.
        title = oa.text
        print('正在爬取--%s--....' % title)
        # Chapter links are site-relative; prefix the host.
        href = 'http://www.shicimingju.com' + oa['href']
        # Fetch and parse the chapter page.
        title_request = urllib.request.Request(url=href, headers=headers)
        title_response = urllib.request.urlopen(title_request)
        title_soup = BeautifulSoup(title_response.read(), 'html.parser')
        # The chapter body lives in the first .chapter_content container.
        content = title_soup.select('.chapter_content')[0].text
        # Append this chapter (title + body) to the output file.
        fp.write(title + content)
        print('结束爬取--%s--' % title)
        # Be polite to the server: pause between chapter requests.
        time.sleep(2)