bs4爬虫入门

 1 # -*- coding: utf-8 -*-
 2 """
 3 Created on Fri Nov 16 13:35:33 2018
 4 
 5 @author: zhen
 6 """
 7 import urllib
 8 import urllib.request
 9 from bs4 import BeautifulSoup
10 
11 # 设置目标rootUrl,使用urllib.request.Request创建请求
12 rootUrl = "https://www.cnblogs.com/"
13 request = urllib.request.Request(rootUrl)
14 
15 header = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
16 # 使用add_header设置请求头,将代码伪装成浏览器
17 request.add_header("User-Agent", header)
18 
19 # 使用urllib.request.urlopen打开页面,使用read方法保存html代码
20 htmlUrl = urllib.request.urlopen(request).read()
21 
22 # 使用BeautifulSoup创建html代码的BeautifulSoup实例,存为beautifulSoup
23 beautifulSoup = BeautifulSoup(htmlUrl)
24 
25 # 获取尾页(对照前一小节获取尾页的内容看你就明白了)
26 total_page = int(beautifulSoup.find("div",class_= "pager").findAll("a")[-2].get_text())
27 
28 list_item = beautifulSoup.findAll("a",class_="titlelnk")
29 for i in list_item: # 遍历所有的内容
30     href = i["href"] # 获取对应的href
31     req = urllib.request.Request(href)
32     req.add_header("User-Agent", header)
33     html = urllib.request.urlopen(req).read()
34     soup = BeautifulSoup(html)
35     # 获取标题
36     titleContent = soup.find("a", id="cb_post_title_url")
37     if titleContent is not None: # 判读是否为空
38         title = titleContent.get_text()   
39         # 获取内容
40         content = soup.find("div").get_text().strip()
41         print(title, "\n=====================================\n", content[1:100])

爬虫结果:

 

猜你喜欢

转载自www.cnblogs.com/yszd/p/9974800.html