import requests
import os
from bs4 import BeautifulSoup
import openpyxl
# Request the server and fetch one page of the discussion list.
def get_url(x):
    """Fetch page *x* (0-based) of the group's discussion list.

    Each page holds 25 posts, so the offset is ``x * 25``.
    Returns the parsed page as a ``bs4.BeautifulSoup`` tree.
    """
    headers = {
        'origin': 'https://www.douban.com',
        'referer': 'https://www.douban.com/group/yuexiuzufang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    }
    # BUG FIX: the original sent the literal text 'start=x*25' -- the page
    # offset was never interpolated, so all 10 requests fetched page 0.
    res = requests.get(
        f'https://www.douban.com/group/yuexiuzufang/discussion?start={x * 25}',
        headers=headers,
    )
    # Use the stdlib 'html.parser' backend so no extra parser is required.
    bs = BeautifulSoup(res.text, 'html.parser')
    return bs
# Extract the post release times.
def get_release_time(bs):
    """Return every ``<td class="time">`` tag (post release time) in *bs*."""
    return bs.find_all('td', class_='time')
# Extract the posting users.
def get_user(bs):
    """Return the publisher cells from *bs*.

    The discussion table interleaves two kinds of ``<td nowrap>`` cells;
    the publisher is every second one (odd indices).
    """
    user = bs.find_all('td', nowrap="nowrap", class_='')
    # BUG FIX: the original looped over range(len(user) - 1), which silently
    # dropped the final cell whenever its index was odd (i.e. whenever the
    # page had an even number of matching cells).  Slicing keeps every
    # odd-indexed element, including the last.
    return user[1::2]
# Extract discussion titles and their links.
def get_sub_url_and_title(bs):
    """Collect (title, url) pairs from every ``<td class="title">`` cell.

    Returns three parallel collections:
        list_all   -- ``[[title, url], ...]``
        URL_list   -- ``[url, ...]``
        Title_list -- ``[title, ...]``
    """
    list_all = []    # stores [title, url] rows
    URL_list = []
    Title_list = []
    # Each page holds up to 25 rows; each element is a bs4.element.Tag.
    for cell in bs.find_all('td', class_='title'):
        tag = cell.find('a')
        # BUG FIX: the original indexed tag['title'] / tag['href'] without
        # checking -- a cell lacking an <a> child made the whole scrape
        # crash with a TypeError.  Skip rows that have no link (which is
        # what the original comment "keep those with an <a> tag" intended).
        if tag is None:
            continue
        Title = tag['title']
        URL = tag['href']
        list_all.append([Title, URL])
        URL_list.append(URL)
        Title_list.append(Title)
    return list_all, URL_list, Title_list
# Save the collected rows to an Excel workbook.
def write_to_xlsx(all_list, path=r'c:\豆瓣租房信息.xlsx'):
    """Write *all_list* (an iterable of rows) to an .xlsx file.

    *path* keeps the original hard-coded location as its default for
    backward compatibility, but callers can now override it.
    """
    web = openpyxl.Workbook()
    sheet = web.active
    sheet.title = '租房信息'
    for row in all_list:
        sheet.append(row)
    # Workbook.save already overwrites an existing file, so the original
    # os.path.exists / os.remove branch (which saved in both arms anyway)
    # was redundant.
    web.save(path)
if __name__ == '__main__':
    all_list = []  # [title, url] rows accumulated across all pages
    for x in range(10):  # scrape the first 10 pages (25 posts each)
        bs = get_url(x)                      # fetch and parse one page
        time = get_release_time(bs)          # post times (collected, currently unused)
        user = get_user(bs)                  # post authors (collected, currently unused)
        list_, URL, Title = get_sub_url_and_title(bs)
        all_list += list_                    # merge this page's rows
    # Write once, after every page has been collected -- the original's
    # flattened indentation suggested re-saving the workbook on each
    # iteration, which would redo the full write 10 times for no benefit.
    write_to_xlsx(all_list)
# --- Source attribution (blog-page text accidentally pasted into the file) ---
# "Python crawler: fetch Douban housing listings and save them"
# Reposted from blog.csdn.net/qq_27149279/article/details/105387107