爬虫实例 基础教程
其他
2020-10-16 23:47:05
阅读次数: 0
1、第一种方法
- # 第一种方式:requests 和 lxml结合使用
- import requests
- from lxml import etree
- #1、拿到所有的页面链接,并使用yield返回完整的超链接
# 1. Collect the index-page links and yield each one as a full URL.
def get_html(url):
    """Yield the absolute URL of every exercise page linked from *url*.

    Fetches the index page with requests, parses it with lxml and
    extracts each href under the element with id="content".
    """
    # Fetch the index page; fail fast on HTTP errors instead of
    # silently parsing an error page (the original did no check).
    html = requests.get(url)
    html.raise_for_status()
    # Parse the decoded HTML into an element tree.
    seq = etree.HTML(html.text)
    # hrefs such as /python/python-exercise-example1.html are
    # site-relative, so prepend the host before yielding.
    link_list = seq.xpath('//*[@id="content"]/ul/li/a/@href')
    for i in link_list:
        yield "http://www.runoob.com" + i
- # 2、获取详细的页面数据
# 2. Fetch each exercise page and yield its title and statement text.
def get_html_link(link):
    """For every URL in *link*, yield a (title, subject_text) tuple.

    *link* is an iterable of absolute page URLs (typically the
    generator returned by get_html). Pages without an <h1> title
    are skipped instead of raising IndexError as the original did.
    """
    for i in link:
        # Download the exercise page.
        link_html = requests.get(i)
        # lxml accepts raw bytes and sniffs the declared encoding.
        link_seq = etree.HTML(link_html.content)
        # Guard: xpath returns [] when the title is missing; indexing
        # [0] on that crashed the whole crawl.
        title_nodes = link_seq.xpath('//*[@id="content"]/h1/text()')
        if not title_nodes:
            continue
        title = title_nodes[0]
        # Paragraphs 2-3 of #content hold the problem statement.
        subject = link_seq.xpath(
            '//*[@id="content"]/p[position()>1 and position()<4]/text()')
        subject_list = '\n'.join(subject)
        yield (title, subject_list)
- # 3、保存数据
# 3. Append the scraped pairs to a text file.
def save_subject(title_subject):
    """Write each (title, subject) tuple from *title_subject* to
    G:/1.txt, each record followed by a line of 50 '#' characters.
    """
    with open("G:/1.txt", 'a+', encoding='utf-8') as f:
        for title, subject_list in title_subject:
            # One writelines call per record instead of three writes.
            f.writelines([title + '\n',
                          subject_list + '\n',
                          "#" * 50 + '\n'])
- # 4、函数回调
# 4. Wire the pipeline together: crawl, scrape, save.
def funcall(url):
    """Run the full crawl for *url* and persist the results."""
    save_subject(get_html_link(get_html(url)))
- # 5、主函数
# 5. Script entry point.
def main():
    """Crawl the runoob 'Python 100 examples' index page."""
    funcall('http://www.runoob.com/python/python-100-examples.html')

if __name__ == "__main__":
    main()
-
- # for i in get_html('http://www.runoob.com/python/python-100-examples.html'):
- # print(i)
- # for i in get_html_link(link):
- # print(i)
2、第二种方法
-
# 第二种方式:urllib.request 与 beautifulsoup结合使用
-
import urllib.request
-
from bs4 import BeautifulSoup
-
# 1、获取所有页面链接
-
# 1. Collect all exercise links from the index page.
def get_html(url):
    """Yield the absolute URL of every exercise linked from *url*.

    The index page keeps its links in <ul><li><a> lists under the
    element with id="content"; each href is site-relative, so the
    host is prepended before yielding.
    """
    # Download the raw HTML bytes of the index page.
    page = urllib.request.urlopen(url).read()
    # Parse with the lxml backend.
    soup = BeautifulSoup(page, 'lxml')
    content = soup.find(id='content')
    # Walk every <ul> -> <li> -> <a> chain and emit absolute URLs.
    for ul_tag in content.find_all('ul'):
        for li_tag in ul_tag.find_all('li'):
            for a_tag in li_tag.find_all('a'):
                yield 'http://www.runoob.com' + a_tag['href']
-
# 2、获取详细的页面数据
-
# 2. Fetch each exercise page and yield (title, text of first 3 <p>).
def get_html_link(link):
    """For every URL in *link*, yield a (title, subject) tuple.

    Skips pages whose #content block or <h1> title is missing
    instead of yielding a None title (the original could raise
    AttributeError or hand None to the file writer).
    """
    for i in link:
        # Request the page and parse it.
        page = urllib.request.urlopen(i).read()
        soup = BeautifulSoup(page, 'lxml')
        content = soup.find(id='content')
        if not content:
            continue
        # .string is None when the <h1> is absent or has nested
        # markup; guard so the caller never receives a None title.
        h1 = content.find('h1')
        if h1 is None or h1.string is None:
            continue
        title = h1.string
        # Concatenate the text of the first three paragraphs.
        paragraphs = content.find_all('p', limit=3)
        subject = ''.join(p.get_text() for p in paragraphs)
        yield (title, subject)
-
# 3、数据保存
-
# 3. Write every (title, content) pair to a text file.
def save_suject(title_content):
    """Write each (title, content) tuple from *title_content* to
    G:/2.txt, each record followed by a line of 80 '#' characters.

    Bug fix: the encoding was 'utf+8', which is not a registered
    codec and raises LookupError on the very first open(); it is
    now the intended 'utf-8'. The local 'tile' typo is also fixed.
    """
    with open('G:/2.txt', 'w+', encoding='utf-8') as f:
        for title, content in title_content:
            f.write(title + '\n')
            f.write(content + '\n')
            f.write('#' * 80 + '\n')
-
# 4、函数回调
-
# 4. Glue: crawl the index, scrape each page, save the results.
def fun_call(url):
    """Run the whole scrape-and-save pipeline for *url*."""
    pages = get_html(url)
    scraped = get_html_link(pages)
    save_suject(scraped)
-
# 5、主函数
-
# 5. Script entry point.
def main():
    """Crawl the runoob 'Python 100 examples' index page."""
    fun_call('http://www.runoob.com/python/python-100-examples.html')

if __name__ == '__main__':
    main()
3、第三种方法
-
# 第三种方式
-
import requests,re
-
from bs4 import BeautifulSoup
-
# 1、获取页面的超链接信息
-
# 1. Yield the absolute URL of every exercise page on the index.
def get_html(url):
    """Find all anchors whose href starts with
    /python/python-exercise and yield them as absolute
    runoob.com URLs.
    """
    response = requests.get(url)
    # Force UTF-8 so .text decodes the Chinese content correctly.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')
    exercise_href = re.compile('^/python/python-exercise')
    for anchor in soup.find_all('a', href=exercise_href):
        yield 'http://www.runoob.com' + anchor.get('href')
-
# 2、获取超链接页面的详细信息
-
# 2. Scrape each exercise page: title plus the statement paragraphs.
def get_html_link(link_list):
    """For each URL in *link_list*, yield (title, paragraph_list).

    Paragraphs are collected from the second <p> of the
    .article-intro block onward and stop at the first sentinel
    paragraph marking the end of the problem statement.

    Bug fix: the original indexed con[i] inside `while True` and
    raised IndexError whenever a page had no sentinel paragraph;
    the scan is now bounded by the paragraph list itself. It also
    reused the outer loop variable `i` as the index, which is
    fixed by using distinct names.
    """
    # Paragraphs whose start marks the end of the statement.
    sentinels = ('程序源代码', ' Python 100例', '以上实例输出结果为')
    for url in link_list:
        page = requests.get(url)
        # Force UTF-8 so .text decodes the Chinese content correctly.
        page.encoding = 'utf-8'
        soup = BeautifulSoup(page.text, 'lxml')
        intro = soup.find('div', class_="article-intro")
        title = intro.h1.string
        con = intro.find_all('p')
        paragraphs = []
        # Start at index 1: con[0] is the lead-in line, as in the
        # original which began its scan at i=1.
        for p in con[1:]:
            if any(re.match(s, p.text) for s in sentinels):
                break
            paragraphs.append(p.text)
        yield (title, paragraphs)
-
# 3、保存数据
-
# 3. Write the scraped data to a text file.
def save_data(content_list):
    """Write each (title, paragraph_list) pair from *content_list*
    to G:/3.txt, each record followed by a line of 80 '#' chars.

    Bug fix: encoding was 'utf+8', which is not a valid codec name
    and raises LookupError on open(); it is now 'utf-8'. The
    paragraph list is iterated directly instead of via
    range(len(...)), and the 'tile' typo is corrected.
    """
    with open('G:/3.txt', 'w+', encoding='utf-8') as f:
        for title, paragraphs in content_list:
            f.write(title + '\n')
            for line in paragraphs:
                f.write(line + '\n')
            f.write('#' * 80 + '\n')
-
# 4、函数回调
-
# 4. Glue: crawl the index, scrape each page, save the results.
def fun_call(url):
    """Run the whole scrape-and-save pipeline for *url*."""
    save_data(get_html_link(get_html(url)))
-
# 5、主函数
-
# 5. Script entry point.
def main():
    """Crawl the runoob 'Python 100 examples' index page."""
    fun_call('http://www.runoob.com/python/python-100-examples.html')

if __name__ == '__main__':
    main()
转载自blog.csdn.net/xixi20200/article/details/109066229