看了这个大神的博客—爬虫项目合集,自己也动手实践一下
请求:requests 解析:xpath
思路:找到起始网页(第一页),爬取初始网页的数据,获取下一页的链接,爬取下一页的数据,以此类推
非常简单,直接放代码:
import requests
from lxml import etree
# Starting page for the crawl: Beijing listings on xiaozhu.com.
source_url = "http://bj.xiaozhu.com/"  # Beijing area as the example
# Minimal request headers; if the site blocks these, add more fields
# (cookies, accept-language, ...) to look more like a real browser.
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36",
"referer": "http://bj.xiaozhu.com/"
}
# Scraped records, one dict per listing. A list is used here for
# simplicity; persisting to a database (e.g. MySQL) would be better.
data_lst = []
def request(url):
    """Fetch *url* and return the raw response body as bytes.

    Bug fix: the original ignored the ``url`` argument and always
    requested ``source_url``, so every "next page" fetch silently
    re-downloaded page one and pagination never advanced.

    :param url: absolute URL of the listing page to download.
    :return: response body (``bytes``), suitable for ``etree.HTML``.
    """
    response = requests.get(url, headers=headers).content
    return response
def get_data(text):
    """Scrape one listing page and return the next page's URL.

    Parses *text* (an HTML page of search results), visits each
    listing's detail page, appends a ``{"title", "address", "price"}``
    dict to the module-level ``data_lst``, and returns the URL of the
    next results page, or ``None`` when there is no next page.

    :param text: HTML of a listing page (bytes or str).
    :return: next page URL (str) or ``None``.
    """
    html = etree.HTML(text)
    # Links to the individual listing detail pages.
    url_lst = html.xpath(".//div[@id='page_list']//li/a[@class='resule_img_a']/@href")
    for url in url_lst:
        ht = etree.HTML(requests.get(url, headers=headers).content)
        title = ht.xpath(".//div[@class='pho_info']/h4/em/text()")[0]  # listing title
        address = ht.xpath(".//div[@class='pho_info']/p/@title")[0]  # listing address
        price = ht.xpath(".//div[@class='day_l']/span/text()")[0]  # nightly price
        data_lst.append({"title": title, "address": address, "price": price})
    # Bug fix: the original reused the *price* xpath here, so it returned
    # price strings instead of a URL (and returned a list, which the
    # caller then fed to requests.get). Select the "next page" anchor and
    # return a single URL, or None when on the last page.
    # TODO(review): confirm this selector against the live site's markup.
    next_lst = html.xpath(".//a[@class='nex']/@href")
    return next_lst[0] if next_lst else None
if __name__ == '__main__':
    # Number of listing pages to crawl; keep within the site's actual
    # page count (the loop also stops early if no next page is found).
    page = 2
    # Renamed from `next` — that shadowed the builtin `next()`.
    next_url = source_url  # start from page one
    for _ in range(page):
        if not next_url:  # ran out of pages before reaching `page`
            break
        text = request(next_url)
        next_url = get_data(text)
    print(data_lst)