Crawling Xiamen community information data from Maitian

When I first started crawling I did not disguise the requests with browser headers, so my IP got banned while scraping the Beijing and Fuzhou districts. Fortunately the Xiamen area was still accessible, and I later found the cause. The code is as follows:

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

page_url = "http://xm.maitian.cn/xqall"
# browser-like headers so the site does not block the crawler outright
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
           "Referer": "http://xm.maitian.cn/esfall",
           "Connection": "keep-alive",
           "Content-Type": "text/plain; charset=utf-8"}


def get_communities_url():
    """Collect each community's detail-page URL from the index page and
    gather the scraped info for all of them."""
    all_data = []
    try:
        response = requests.get(url=page_url, headers=headers)
    except Exception as e:
        print("Request connection error")
        raise e

    soup = BeautifulSoup(response.text, "lxml")
    # the community list lives inside <div class="list_wrap">
    soup = soup.find("div", "list_wrap")
    for tag_li in soup.find_all("li"):
        href = tag_li.h1.a['href']
        new_url = page_url.replace("/xqall", href)
        dict_data = get_target_info(new_url)
        if dict_data:
            all_data.append(dict_data)
    return all_data

def get_target_info(new_url):
    """Scrape one community's detail page and return its fields as a dict."""
    data = {}
    try:
        response = requests.get(url=new_url, headers=headers)
    except Exception as e:
        print("Request connection error")
        raise e

    soup = BeautifulSoup(response.text, 'lxml')
    soup1 = soup.find("section", "home_main")
    ps = soup1.find_all("p")
    # average price of the community
    community_avg = ps[0].b.string.strip()
    data["community_avg"] = community_avg
    # homes for sale ("套" = units)
    unsold_homes = ps[1].find_all("em")[0].a.string + "套"
    data["unsold_homes"] = unsold_homes
    # homes for rent
    rent_homes = ps[1].find_all("em")[1].a.string + "套"
    data["rent_homes"] = rent_homes
    # business district
    business_circle = ps[2].label.string
    data["business_circle"] = business_circle
    # developer
    developers = ps[2].em.string
    data["developers"] = developers

    soup2 = soup.find("ul", "home_details")
    for tag_li in soup2.find_all("li"):
        # BeautifulSoup returns the class attribute as a list
        if tag_li["class"] == ['li_left']:
            p = tag_li.find_all("p")
            # building area
            area = p[0].em.string
            data["area"] = area
            # property management company
            property_company = p[1].em.string
            data["property_company"] = property_company
            # property management fee
            industry_fee = p[2].em.string
            data["industry_fee"] = industry_fee

        elif tag_li["class"] == ['li_center']:
            p = tag_li.find_all("p")
            # year built
            built_year = p[0].em.string
            data["built_year"] = built_year
            # total number of houses
            total_houses = p[1].em.string
            data["total_houses"] = total_houses
            # greening rate
            green_rates = p[2].em.string
            data["green_rates"] = green_rates

        elif tag_li["class"] == ['li_right']:
            p = tag_li.find_all("p")
            # land area
            cover_area = p[0].em.string
            data["cover_area"] = cover_area
            # total number of buildings
            total_built = p[1].em.string
            data["total_built"] = total_built
            # plot ratio (floor area ratio)
            product_rates = p[2].em.string
            data["product_rates"] = product_rates
    return data



if __name__ == '__main__':
    data_all = get_communities_url()
    print(data_all)
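
Since the trouble in the first place was an IP ban, it also helps to pace the requests instead of firing them back to back. Below is a minimal sketch of my own (not part of the original script): a hypothetical fetch_page helper with retries and a growing delay; the retry count, timeout, and delay values are all assumptions.

import time

import requests


def fetch_page(url, headers, retries=3, delay=2.0):
    # Hypothetical helper, not in the original post: fetch a page with a
    # few retries, backing off a little longer after each failure so the
    # crawler looks less like a bot.
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))
    raise RuntimeError("all retries failed for " + url)

Both requests.get calls in the script could go through such a helper; even a fixed one- or two-second pause between detail pages noticeably lowers the chance of a ban.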
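
The script only prints the collected list. To actually keep the data, here is a small sketch (my own addition, assuming the field names produced by get_target_info above) that writes all_data to a CSV file with the standard library:

import csv


def save_to_csv(all_data, path="xiamen_communities.csv"):
    # Collect every key that appears in any record, since a detail page
    # may be missing some fields.
    fieldnames = []
    for record in all_data:
        for key in record:
            if key not in fieldnames:
                fieldnames.append(key)
    # utf-8-sig so the Chinese text opens correctly in Excel
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_data)

In __main__ this would replace the print call, e.g. save_to_csv(get_communities_url()).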

 


Source: www.cnblogs.com/venvive/p/11415472.html