我用 Python 爬取济南房价,告诉你哪里买房合适(一)

 
 
import requests
from bs4 import BeautifulSoup
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def GetUrlHtml(url):
    """Download ``url`` and return the page body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or the sentinel string "URL异常了" when
        the request fails (connection error, timeout, or an HTTP error
        status).
    """
    kv = {"User-Agent": "Mozilla/5.0"}
    try:
        # The request itself is inside the ``try`` so that connection errors
        # and timeouts produce the same sentinel as HTTP error statuses.
        response = requests.get(url, headers=kv, timeout=10)
        response.raise_for_status()
        # Decode the body with the encoding detected from its content. This
        # must be assigned to ``encoding``; writing it to ``status_code``
        # leaves the text decoded with the (possibly wrong) default.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return "URL异常了"
def HtmlParser(response):
    """Parse one listing page and append its fields to the module-level lists.

    The results are appended to the global lists ``loupan_name``,
    ``resblock_type``, ``sale_status``, ``resblock_location`` and
    ``main_price``, which must exist before this function is called.

    Args:
        response: HTML text of a listing page.

    Raises:
        IndexError: If the text of a matched element has fewer lines than
            the positions read below.
    """
    soup = BeautifulSoup(response, "lxml")

    # Listing name block: line 1 is the name, line 2 the property type and
    # line 3 the sale status.
    for block in soup.find_all("div", class_="resblock-name"):
        lines = block.text.split("\n")
        loupan_name.append(lines[1])
        resblock_type.append(lines[2])
        sale_status.append(lines[3])

    # Location block: the three location parts sit on lines 1, 3 and 5 and
    # are joined with "/".
    for block in soup.find_all("div", class_="resblock-location"):
        lines = block.text.split("\n")
        resblock_location.append("/".join((lines[1], lines[3], lines[5])))

    # Average price block: only line 1 is stored. The following line
    # (presumably the unit text -- TODO confirm) is intentionally not read,
    # since it was never used and only added a needless failure point.
    for block in soup.find_all("div", class_="main-price"):
        main_price.append(block.text.split("\n")[1])

def plot(house):
    """Show a line chart of listing price against listing name.

    Args:
        house: Table with ``resblock_name`` and ``main_price`` columns.
    """
    name = np.array(house["resblock_name"])
    # Prices may be text; convert them so the y axis is numeric rather than
    # categorical. Non-numeric entries become NaN and show up as gaps.
    price = np.array(pd.to_numeric(house["main_price"], errors="coerce"))
    # Use a font that can render the Chinese axis labels.
    plt.rc('font', family='STXihei', size=11)
    # Axis titles.
    plt.xlabel("楼盘名称")
    plt.ylabel("楼盘价格")
    plt.plot(name, price)
    # The legend has to be created after the line exists; calling it before
    # plotting yields an empty legend.
    plt.legend(["价格"], loc="upper right")
    plt.show()
if __name__ == '__main__':
    # Index of the listings; page N is served at <base_url>pgN/.
    base_url = "https://jn.fang.lianjia.com/loupan/"
    # Module-level accumulators that HtmlParser appends to.
    loupan_name = []
    resblock_type = []
    sale_status = []
    resblock_location = []
    main_price = []

    for page in range(1, 22):
        # Build every page URL from the fixed base. Appending to the same
        # variable on each iteration would yield ".../pg1/pg2/..." from the
        # second page on.
        page_url = base_url + "pg" + str(page) + "/"
        response = GetUrlHtml(page_url)
        HtmlParser(response)

    # Assemble the table; ``columns`` fixes the column order. The type column
    # is named "resblock_type" without the stray leading space.
    house = pd.DataFrame(
        {
            "resblock_name": loupan_name,
            "main_price": main_price,
            "resblock_location": resblock_location,
            "resblock_type": resblock_type,
            "sale_status": sale_status,
        },
        columns=[
            "resblock_name",
            "main_price",
            "resblock_location",
            "resblock_type",
            "sale_status",
        ],
    )
    print(house)

    # Create the output directory if needed, without an exists/mkdir race.
    os.makedirs("济南房价数据", exist_ok=True)
    house.to_csv("济南房价数据/房价.csv", encoding='gbk', index=False)
    # To chart the prices, call plot(house) here.

    # Drop listings whose price is still undetermined.
    house = house[~house['main_price'].str.contains("价格待定")]
    # Sort by numeric value: sorting the raw strings would be lexicographic
    # (for example "9000" would rank above "12000").
    house = house.assign(
        main_price=pd.to_numeric(house["main_price"], errors="coerce")
    )
    house = house.sort_values(by="main_price", ascending=False)
    # Show the sorted result, which was otherwise computed and discarded.
    print(house)

实验结果:这是将楼盘按价格从高到低排序、并去除价格待定的楼盘之后的结果。


 

猜你喜欢

转载自blog.csdn.net/qq_36114862/article/details/80554059