Python: Scraping Office Building Data from Haozu (好租网)

Preface:

The scraping approach is similar to my earlier Fang.com (房天下) scraper. The only snag was that next_sibling kept failing, so I worked around it by going up to the parent element first and then searching its children for the node I wanted.
Since I only need four fields (name, district, address and floor), the script extracts very little; you can add more fields on top of my code if you need them.
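
A minimal sketch of that workaround, assuming BeautifulSoup with the lxml parser; the HTML snippet is invented to mimic the label/value markup on the detail pages and is not copied from haozu.com:

from bs4 import BeautifulSoup

html = '''
<div class="li-box">
    <span>总楼层</span>
    <span class="s2" title="28层">28层</span>
</div>
'''
soup = BeautifulSoup(html, 'lxml')

# The label's next_sibling is the whitespace text node between the two <span> tags,
# not the value <span>; that is why chaining next_sibling kept failing.
label = soup.find('span', string='总楼层')
value = label.parent.find('span', attrs={'class': 's2'})
print(value.attrs['title'])  # prints: 28层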

Code:

import requests
from bs4 import BeautifulSoup

def get_building_url(url):
    '''Get the buildings listed on one results page (name and detail link).'''
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for temp in soup.find_all(name='h1', attrs={'class': 'h1-title'}):
        building_dict = {'name': '', 'district': '', 'address': '', 'floor': ''}
        if temp.find(name='a'):
            building_href = temp.find(name='a').attrs['href']  # detail page URL
            if building_href.startswith('/sh_'):
                building_href = 'https://www.haozu.com' + building_href
                building_dict['name'] = temp.find(name='a').attrs['alt']  # building name
                building_dict['district'], building_dict['address'], building_dict[
                    'floor'] = get_normal_building_information(building_href)
            else:
                building_href = 'https:' + building_href
                building_dict['name'] = temp.find(name='a').string
                building_dict['district'], building_dict['address'], building_dict[
                    'floor'] = get_share_building_information(building_href)
            print(building_dict)
            export_buildingInfo(building_dict)

def get_normal_building_information(url):
    '''Get the details of a regular building (district, address and floor).'''
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # print(soup)
    information = soup.find(name='div', attrs={'class': 'house-address'}).span.text.split('\xa0')
    district = information[0]  # district
    address = information[-1]  # address
    if soup.find(text='总楼层', name='span'):
        floor = soup.find(text='总楼层', name='span').parent.find(name='span', attrs={'class': 's2'}).attrs['title']  # total floors
    else:
        floor = ''
    return district, address, floor

def get_share_building_information(url):
    '''Get the details of a shared-office building (district, address and floor).'''
    r = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # print(soup)
    information = soup.find(name='p', attrs={'class':'p2'}).text
    # print(information)
    district = information[1:4].strip()  # district
    address = information.split(']')[1].split(' ')[0]  # address
    if soup.find(text='总楼层:', name='div'):
        floor = soup.find(text='总楼层:', name='div').parent.find(name='span', attrs={'class': 'li-con'}).attrs['title']  # total floors
    else:
        floor = ''
    return district, address, floor

def export_buildingInfo(building_dict):
    '''Append one building's info to the output text file.'''
    with open('上海楼宇信息.txt', 'a', encoding='utf-8') as file:
        file.write('||'.join(building_dict.values()))
        file.write('\n')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'cookie': 'PHPSESSID=v9qdqo7slhi6t9mh3bhc6b75u3; citydomain=sh; haozu_user=%7B%22index%22%3A5%2C%22name%22%3A%22sem%22%2C%22value%22%3A%22sem_baidu%22%2C%22opt_scope%22%3A2%2C%22ca_n%22%3A%2295622646681%22%7D; Hm_lvt_826deb6478895f40cc4a3e9b54b0ba24=1590042924; _pk_ref.1.0dea=%5B%22%22%2C%22%22%2C1590042925%2C%22https%3A%2F%2Fwww.baidu.com%2Fbaidu.php%3Fsc.0f00000uEDLSpLgiCo05K50ear24i6vpsLPthZeMvmWjEXpNfxibearrTnyevkqZQU1y47v5azx134kS7veG7JKYwFdLX8tWsGevlVId2yobizOK0rIJubsZk1HDL6UcA0piKTrV9oz0otLQLXr5WFOc0CupdgJ4lHnGiV1mZSR-b3YClAgwVJPItNhqOlLWIjmlkWNo_JeYMUKAxyUXP39TPTWw.7b_NR2Ar5Od663rj6tJhgTrKDDIgQYTPZ-oLU8hjEa1zk_wGYtVerQKd9u4mhPSeGzurM_lXMugd9u4g_NT521TGJIGHz3qis1f_uQrPhkf.U1Yk0ZDqzTSlCzRznQH11tn-nWjfkTSB0ZKGm1Ys0ZK1pyI85H63uycYnyNWnvndP1c4n179nWuhnARYuWw-n179Pjnd0ZfqzTSlCqj71l2tEnodV555ssKGUHYznWR0u1dsT1c0Iybqmh7GuZR0TA-b5Hc0mv-b5Hfsr0KVIjYknjD4g1DsnHIxnW0vn-t1PW0k0AVG5H00TMfqn1fL0ANGujYkPjmzg1nkPWTsg1cknHDkg1nknHDkg1nkPH64g1nknWTsg1nkPjm1g1nsrjcY0AFG5HDdPNtkPH9xnW0Yg1ckPsKVm1Yknj0kg1D3PH03Pjmzn17xnHDknWnYPj03nHKxn0KkTA-b5Hc0TyPGujYs0ZFMIA7M5H00mycqn7ts0ANzu1Yz0ZKs5HfYP1f4n1cd0A4vTjYsQW0snj0snj0s0AdYTjYs0AwbUL0qn0KzpWYs0Aw-IWdsmsKhIjYs0ZKC5H00ULnqn0KBI1Ykn0K8IjYs0ZPl5fK9TdqGuAnqTZnVuyPJ0A-bm1dribGH0ZwdT1Ykn1R3n10Lrjnvn1cdnjb4njnLnsKzug7Y5HDdrH0sPjc4nH6Yn1R0Tv-b5H-9nAcLnvcsnj0krAuhrj00mLPV5HRvrH6srHTzPbc1rjKanHn0mynqnfKs%22%5D; _pk_ses.1.0dea=*; lookBuilding=12184; 1buildView=%5B%7B%22viewId%22%3A%2212184%22%2C%22userId%22%3A0%2C%22circleId%22%3A0%2C%22streetId%22%3A%227267%22%2C%22districtId%22%3A%22203%22%2C%22cityId%22%3A%2213%22%2C%22user_uuid%22%3A%22%22%2C%22type%22%3A1%2C%22category%22%3A1%2C%22viewTime%22%3A1590042966%7D%5D; showheader=1; Hm_lpvt_826deb6478895f40cc4a3e9b54b0ba24=1590043591; _pk_id.1.0dea=7e465a97f3decd72.1590042925.1.1590043602.1590042925.'
}

for i in range(1, 150):
    url = r"https://www.haozu.com/sh/zuxiezilou/o" + str(i) + "/"
    try:
        get_building_url(url=url)
    except Exception:
        pass

Reposted from blog.csdn.net/weixin_42029733/article/details/106267273