# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
'''
1.需求分析
获取:
title = Python 练习实例1
timu = 题目:有四个数字:1、2、3、4,能组成多少个互不相同且无重复数字的三位数?各是多少?
cxfx = 程序分析:可填在百位、十位、个位的数字都是1、2、3、4。组成所有的排列后再去 掉不满足条件的排列。
code = 源代码
2.源码分析
入口:http://www.runoob.com/python/python-100-examples.html
1. 获取所有的 a 标签
find(id = 'content').find_all('a')
2. 获取标题
find(id = 'content').h1
3. 获取题目
find(id = 'content').find_all('p')[1]
4. 获取程序分析
find(id = 'content').find_all('p')[2]
5. 获取源代码
find(class_ = 'hl-main').text
3.代码实现
'''
'''
一、发送请求获取py100首页源代码
'''
# Entry point: the index page linking to all 100 Python exercise pages.
start_url = 'http://www.runoob.com/python/python-100-examples.html'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Fetch the index page; timeout guards against a hung connection.
response = requests.get(start_url, headers=headers, timeout=10).content.decode('utf-8')
# Parse with lxml (already the parser this file uses).
soup = BeautifulSoup(response, 'lxml')
# Every exercise is linked from an <a> inside the #content container.
links = soup.select('#content a')
for num, link in enumerate(links, start=1):
    print('第{0}道题'.format(num))
    # Fetch the detail page for this exercise (hrefs are site-relative).
    detail_source = requests.get('http://www.runoob.com' + link.attrs['href'],
                                 headers=headers, timeout=10).content.decode('utf-8')
    html = BeautifulSoup(detail_source, 'lxml')
    # Title, problem statement, and analysis live in fixed positions in #content.
    title = html.select('#content h1')[0].text
    timu = html.select('#content p')[1].text
    cxfx = html.select('#content p')[2].text
    # Newer pages highlight code in .hl-main; older ones only have a bare <pre>.
    # Catch IndexError specifically (empty select result) rather than a bare
    # except, which would also swallow KeyboardInterrupt/SystemExit.
    try:
        code = html.select('.hl-main')[0].text
    except IndexError:
        code = html.select('pre')[0].text
    # Append this exercise to the output file, separated by a divider line.
    with open('py100.txt', 'a+', encoding='utf-8') as file:
        file.write(title + '\n' + timu + '\n' + cxfx + '\n' + code + '\n' + '=' * 50 + '\n')
# -*- coding: utf-8 -*-
from lxml import etree
import requests
import time
'''
1.需求分析
1.获取每一篇帖子的标题
2.获取每一篇帖子的内容
2.源码分析
入口:https://www.cnblogs.com/
1. 获取每一篇帖子的a链接
//div[@class='post_item_body']/h3/a[@href]
获取下一页
//div[@class='pager']/a[last()]/@href
//div[@class='pager']/a[last()]/text()
2.获取标题
//div[@class='post_item_body']/h3/a/text()
3.获取内容
string(//div[@id='cnblogs_post_body'])
3.代码实现
'''
'''
一、请求首页帖子链接
'''
# Entry point: the cnblogs front page; updated to the next page each iteration.
start_url = 'https://www.cnblogs.com/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
# Current page number, used only for progress output.
page = 1
while True:
    # Fetch the listing page; timeout guards against a hung connection.
    response = requests.get(start_url, headers=headers, timeout=10).text
    html = etree.HTML(response)
    # Absolute links to each post on this listing page.
    links = html.xpath("//div[@class='post_item_body']/h3/a/@href")
    # Last pager entry: its href and text tell us whether a next page exists.
    next_page = html.xpath("//div[@class='pager']/a[last()]/@href")
    next_page_text = html.xpath("//div[@class='pager']/a[last()]/text()")
    for num, post_url in enumerate(links, start=1):
        print('第{0}页第{1}篇帖子'.format(page, num))
        # Fetch and parse the post itself.
        response_info = requests.get(post_url, headers=headers, timeout=10).text
        html_info = etree.HTML(response_info)
        # Non-standard post templates may lack the title anchor; skip them
        # instead of crashing on an empty xpath result.
        title_nodes = html_info.xpath("//a[@id='cb_post_title_url']/text()")
        if not title_nodes:
            continue
        title = title_nodes[0]
        # string() flattens the post body to plain text.
        content = html_info.xpath("string(//div[@id='cnblogs_post_body'])")
        # Append this post to the output file, separated by a divider line.
        with open('cnblogs.txt', 'a+', encoding='utf-8') as file:
            file.write(title + '\n' + content + '=' * 50 + '\n')
        time.sleep(0.5)
    # Advance to the next listing page only while a 'Next >' link exists.
    # The original never broke out of the loop, so it re-scraped the last
    # page forever; it also indexed nextPage[0] without checking for an
    # empty pager, which raised IndexError on pages with no pagination.
    if next_page and next_page_text and next_page_text[0] == 'Next >':
        start_url = 'https://www.cnblogs.com' + next_page[0]
        page += 1
        time.sleep(1)
    else:
        break