python爬取广州安居客租房信息（代码中请求的站点为 gz.zu.anjuke.com，即广州站）

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_39071593/article/details/84996393

注意：脚本中有个死循环，而且抓到的数据有重复。觉得数据够了就手动中断程序，然后对生成的表格做一次去重。

import re
import csv
import requests
from bs4 import BeautifulSoup

# Request headers sent with every page fetch.  The User-Agent spoofs a
# desktop Chrome browser so the site serves the normal listing markup
# instead of blocking the script as a bot.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Connection':'keep-alive'
}

def gethtml(url):
    """Fetch *url* and return the decoded HTML as a string.

    Returns ``response.text`` rather than the Response object because the
    caller feeds the result straight into BeautifulSoup, which parses text.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status,
            so we fail loudly instead of parsing an error page.
        requests.Timeout: if the server does not respond within 10 s
            (the original had no timeout and could hang forever).
    """
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    # Sniff the real charset from the body; the declared encoding on
    # these pages is not always correct, which garbles the Chinese text.
    response.encoding = response.apparent_encoding
    return response.text

            
def soup(html):
    """Parse one listing page and append a CSV row per rental item.

    Relies on the module-level ``writer`` (a csv.DictWriter opened in
    ``__main__``) — the file must be open when this is called.

    Args:
        html: raw HTML text of an anjuke listing page.
    """
    page = BeautifulSoup(html, 'lxml')  # avoid shadowing this function's own name
    for itemmod in page.body.find_all(attrs={'class': "zu-itemmod"}):
        # Rooms / halls / area / floor are all digits inside the first <p>;
        # extract them in a single pass instead of re-parsing four times.
        numbers = re.findall(r"\d+\.?\d*", itemmod.div.p.get_text('', strip=True))
        # The share-type / orientation spans sit two element-siblings past
        # the first <p> (each element is separated by a whitespace text node,
        # hence the doubled next_sibling hops).  Fragile: depends on the
        # site's exact markup — TODO confirm against a live page.
        detail = itemmod.div.p.next_sibling.next_sibling.next_sibling.next_sibling
        record = {  # renamed from `dict`, which shadowed the builtin
            'title': itemmod.div.h3.a.string,
            'address': itemmod.div.address.a.string,
            'room': numbers[0],
            'hall': numbers[1],
            'area': numbers[2],
            'floor': numbers[3],
            'isshare': detail.span.string,
            'Oriented': detail.span.next_sibling.next_sibling.string,
            'monthly': itemmod.div.next_sibling.next_sibling.p.strong.string,
        }
        print(record['title'])  # progress indicator
        writer.writerow(record)
    
if __name__=="__main__":
    with open('zufang.csv', 'w', newline='') as csvfile:
        fieldnames = ['title','address','room','hall','area','floor','isshare','Oriented','monthly']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        page = 1 #页数
        while True:
            print('第',page,'页')
            soup(gethtml('https://gz.zu.anjuke.com/fangyuan/p+'+str(page)+'/'))
            page = page + 1
        print('完成')

猜你喜欢

转载自blog.csdn.net/qq_39071593/article/details/84996393