python爬虫之获取豆瓣房源信息并保存

[Image placeholder: screenshot from the original blog post was not captured in this extraction.]

import requests
import os
from bs4 import BeautifulSoup
import openpyxl

# Request the server and fetch one page of the discussion list.
def get_url(x):
    """Fetch page *x* of the Douban group's discussion list and parse it.

    Args:
        x: zero-based page index; each page holds 25 posts, so the
           request offset is x * 25.

    Returns:
        A BeautifulSoup object ("html.parser" backend) for the page.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    headers = {
        'origin': 'https://www.douban.com',
        'referer': 'https://www.douban.com/group/yuexiuzufang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    }
    # Bug fix: the offset must be interpolated. The original passed the
    # literal string "start=x*25", so every iteration re-fetched page 0.
    res = requests.get(
        f'https://www.douban.com/group/yuexiuzufang/discussion?start={x * 25}',
        headers=headers,
    )
    res.raise_for_status()  # fail loudly instead of parsing an error page
    return BeautifulSoup(res.text, 'html.parser')

# Extract the posting-time cells from a parsed discussion-list page.
def get_release_time(bs):
    """Return every <td class="time"> tag found in *bs* (post release times)."""
    time_cells = bs.find_all('td', class_='time')
    return time_cells
	
# Extract the publisher cells from a parsed discussion-list page.
def get_user(bs):
    """Return the publisher cells of a discussion-list page.

    The matched <td nowrap> cells appear to alternate in pairs with the
    publisher in the odd-indexed position — TODO confirm against the
    live page markup.

    Fix: the original iterated ``range(len(user) - 1)``, which silently
    dropped the final publisher whenever the total cell count was even
    (odd index len-1 was never reached).

    Args:
        bs: BeautifulSoup object for one listing page.

    Returns:
        list of the odd-indexed matched tags.
    """
    cells = bs.find_all('td', nowrap="nowrap", class_='')
    # Every second cell (index 1, 3, 5, ...) is a publisher.
    return cells[1::2]
	
# Extract discussion titles and their links.
def get_sub_url_and_title(bs):
    """Collect the title and URL of every post listed in *bs*.

    Each post row carries a <td class="title"> cell (25 per page) whose
    <a> tag holds the full title in its ``title`` attribute and the post
    link in ``href``.

    Returns:
        A 3-tuple ``(pairs, urls, titles)`` where ``pairs`` is a list of
        ``[title, url]`` lists and ``urls`` / ``titles`` repeat the same
        values as separate parallel lists.
    """
    pairs = []
    urls = []
    titles = []

    for cell in bs.find_all('td', class_='title'):
        anchor = cell.find('a')          # keep only the link tag inside the cell
        title = anchor['title']
        url = anchor['href']
        pairs.append([title, url])
        urls.append(url)
        titles.append(title)

    return pairs, urls, titles

# Save the scraped listings to an Excel workbook.
def write_to_xlsx(all_list):
    """Write the scraped rows to ``c:\\豆瓣租房信息.xlsx``.

    Args:
        all_list: iterable of row sequences (e.g. ``[title, url]``);
            each becomes one worksheet row.

    Note:
        ``Workbook.save`` already overwrites an existing file, so the
        original's remove-then-save / else-save branches were redundant
        and have been collapsed into a single save.
    """
    wb = openpyxl.Workbook()
    sheet = wb.active
    sheet.title = '租房信息'

    for row in all_list:
        sheet.append(row)

    wb.save(r'c:\豆瓣租房信息.xlsx')

   



if __name__ == '__main__':
    all_list = []  # accumulated [title, url] rows across every page

    user_list_all = []
    URL_list_all = []
    Title_list_all = []

    # Crawl the first 10 listing pages (25 posts each), starting at page 0.
    for page in range(10):
        soup = get_url(page)                   # fetch and parse one page
        time = get_release_time(soup)          # post timestamps (collected but not saved)
        user = get_user(soup)                  # post authors (collected but not saved)
        pairs, URL, Title = get_sub_url_and_title(soup)  # titles and links

        # Merge this page's [title, url] rows into the overall list.
        all_list.extend(pairs)

    write_to_xlsx(all_list)

   


发布了31 篇原创文章 · 获赞 2 · 访问量 2万+

猜你喜欢

转载自blog.csdn.net/qq_27149279/article/details/105387107