Vacation, Day 12

A crawler that scrapes Lianjia second-hand housing listings (Shijiazhuang) and writes them to an Excel file.

from bs4 import BeautifulSoup
import requests
import xlwt
 
 
# Scrape a listing page: title, link, community, layout, area, total price
def getHouseList(url):
    house = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
    # Fetch the page with a GET request
    res = requests.get(url, headers=headers)
    # Parse the response
    soup = BeautifulSoup(res.content, 'html.parser')
    # Listing titles
    housename_divs = soup.find_all('div', class_='title')
    for housename_div in housename_divs:
        housename_as = housename_div.find_all('a')
        for housename_a in housename_as:
            housename = []
            # Title
            housename.append(housename_a.get_text())
            # Link
            housename.append(housename_a.get('href'))
            house.append(housename)
    houseinfo_divs = soup.find_all('div', class_='houseInfo')
    for i in range(len(houseinfo_divs)):
        info = houseinfo_divs[i].get_text()
        infos = info.split('|')
        # Community name
        house[i].append(infos[0])
        # Layout
        house[i].append(infos[1])
        # Area in square meters
        house[i].append(infos[2])
    # Total price
    house_prices = soup.find_all('div', class_='totalPrice')
    for i in range(len(house_prices)):
        # Price
        price = house_prices[i].get_text()
        house[i].append(price)
    return house
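
The listing parser can be tried on its own before running the full crawl. This is a minimal sketch: it assumes the unfiltered Shijiazhuang listing URL below is reachable and that the page markup still matches the selectors used above.

houses = getHouseList('https://sjz.lianjia.com/ershoufang/')
for h in houses[:3]:
    print(h)  # [title, link, community, layout, area, total price]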
 
 
# Scrape a listing's detail page: district and usable (per-room) floor area
def houseinfo(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER'}
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'html.parser')
    msg = []
    # District the house is located in
    areainfos = soup.find_all('span', class_='info')
    for areainfo in areainfos:
        # Only the text of the first <a> tag is needed
        area = areainfo.find('a')
        if not area:
            continue
        hrefStr = area['href']
        if hrefStr.startswith('javascript'):
            continue
        msg.append(area.get_text())
        break
    # Compute the usable floor area from the per-room layout list
    infolist = soup.find_all('div', id='infoList')
    num = []
    for info in infolist:
        cols = info.find_all('div', class_='col')
        for col in cols:
            pingmi = col.get_text()
            try:
                area_value = float(pingmi[:-2])
                num.append(area_value)
            except ValueError:
                continue
    msg.append(sum(num))
    return msg
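
The try/except above relies on each column's text ending with the two-character unit "平米": stripping the last two characters leaves just the number, and columns without an area fail the float conversion and are skipped. A small illustration with a made-up column string:

pingmi = '25.41平米'  # hypothetical column text from the layout list
print(float(pingmi[:-2]))  # 25.41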
 
 
# Write the listings to an Excel file
def writeExcel(excelPath, houses):
    workbook = xlwt.Workbook()
    # Create the first sheet
    sheet = workbook.add_sheet('git')
    row0 = ['标题', '链接地址', '户型', '面积', '朝向', '总价', '所属区域', '套内面积']
    for i in range(0, len(row0)):
        sheet.write(0, i, row0[i])
    for i in range(0, len(houses)):
        house = houses[i]
        print(house)
        for j in range(0, len(house)):
            sheet.write(i + 1, j, house[j])
    workbook.save(excelPath)
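
The writer can be smoke-tested without crawling by passing a single record; every value and the output path below are placeholders, not real listing data.

writeExcel('d:/house_test.xls', [['示例标题', 'https://example.com/1', '某小区', '2室1厅', '89平米', '120万', '某区', 75.0]])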
 
 
# Main entry point: crawl the first few result pages and export to Excel
def main():
    data = []
    for i in range(1, 5):
        print('----- separator', i, '-------')
        if i == 1:
            url = 'https://sjz.lianjia.com/ershoufang/l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/'
        else:
            url = 'https://sjz.lianjia.com/ershoufang/pg' + str(i) + 'l2rs%E5%92%8C%E5%B9%B3%E4%B8%96%E5%AE%B6/'
        houses = getHouseList(url)
        for house in houses:
            link = house[1]
            if not link or not link.startswith('http'):
                continue
            mianji = houseinfo(link)
            # Append the district and usable area to the listing record
            house.extend(mianji)
        data.extend(houses)
    writeExcel('d:/house.xls', data)
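
The post defines main() but never invokes it; a standard entry-point guard (an addition, not part of the original script) makes the file runnable directly:

if __name__ == '__main__':
    main()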
 
 


Reposted from www.cnblogs.com/jbwen/p/12301762.html