# 数据采集分析 (Data collection and analysis)

#!/bin/env python
# -*- coding: utf-8 -*-
# author: tyk
# date: 2019-2-3
#function:Exploratory Visualization
###################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import sys
import warnings
import re
from data_1 import ZcSummary
from scipy.stats import norm
# Module-level setup: configure plotting, load the listings CSV and keep a
# working copy in ``df`` for the analysis functions below.

# Suppress every warning so library notices do not clutter the console output.
warnings.filterwarnings("ignore")

# Font configuration for the Chinese plot labels used throughout this file.
# NOTE(review): presumably the SimHei font must be installed locally - confirm.
mpl.rcParams['axes.unicode_minus'] = False
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = ['SimHei']

if sys.version_info[0] < 3:
    # Python 2 only: ``reload`` is not a builtin and ``setdefaultencoding``
    # does not exist on Python 3, so running this unconditionally would raise.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('UTF-8')

hit_df = pd.read_csv('hit.csv', encoding='UTF-8')
print(">>==================data info=====================<<")
#hit_df.info()
#print (hit_df.shape)
#print (hit_df.dtypes)
#print ">>==================data describe=================<<"
try:
    print(hit_df.describe())
except Exception as e:
    # Best effort: the summary is informational, so report and carry on.
    print(e)

# Work on a copy so the frame read from disk stays untouched.
df = hit_df.copy()
#df['floor']=re.findall("\d+",df['floor'])
#df['PerPrice'] = hit_df['Price']/hit_df['Size']
#columns = ['fang_key', 'fang_desc', 'price', 'price_pre', 'community', 'housetype', 'area', 'region', 'plate', 'floor', 'direction', 'age','address','updated_date']
#df = pd.DataFrame(df, columns = columns)
def variables_analyse(data):
    """Plot the distribution of the ``price`` column and display it.

    Args:
        data: Frame providing a ``price`` column.
    """
    # Plot the frame that was passed in. The original read the module-level
    # ``df`` and silently ignored ``data``, unlike every other analysis here.
    sns.distplot(data['price'])
    plt.show()
def region_analyse(data):
    """Compare regions by mean unit price, listing count and total price.

    Draws three stacked panels (two bar charts and a box plot), saves the
    figure to ``region.png`` and then displays it.

    Args:
        data: Frame providing ``region``, ``price`` and ``price_pre`` columns.
    """
    houses = data.copy()

    # Listings per region, largest first.
    listing_counts = houses.groupby('region')['price'].count()
    listing_counts = listing_counts.sort_values(ascending=False)
    listing_counts = listing_counts.to_frame().reset_index()

    # Mean ``price_pre`` per region, highest first.
    mean_unit_price = houses.groupby('region')['price_pre'].mean()
    mean_unit_price = mean_unit_price.sort_values(ascending=False)
    mean_unit_price = mean_unit_price.to_frame().reset_index()

    figure, panels = plt.subplots(3, 1, figsize=(30, 45), dpi=100)
    unit_price_ax, count_ax, total_price_ax = panels

    sns.barplot(x='region', y='price_pre', palette="Blues_d",
                data=mean_unit_price, ax=unit_price_ax)
    unit_price_ax.set_title('深圳各大区二手房每平米单价对比', fontsize=5)
    unit_price_ax.set_xlabel('区域')
    unit_price_ax.set_ylabel('每平方米单价')

    sns.barplot(x='region', y='price', palette="Greens_d",
                data=listing_counts, ax=count_ax)
    count_ax.set_title('深圳各大区二手房数量对比', fontsize=5)
    count_ax.set_xlabel('区域')
    count_ax.set_ylabel('数量')

    sns.boxplot(x='region', y='price', data=houses, ax=total_price_ax)
    total_price_ax.set_title('深圳各大区二手房房屋总价', fontsize=5)
    total_price_ax.set_xlabel('区域')
    total_price_ax.set_ylabel('房屋总价')

    plt.savefig("region.png", dpi=100)
    plt.show()
def area_analyse(data):
    """Plot the floor-area distribution and its relation to price.

    The first figure (saved to ``area.png``) shows a histogram plus KDE of
    ``area`` next to a regression plot of ``price`` against ``area``. The
    second figure is a 50-bin histogram of ``area`` with a fitted normal
    curve. Both figures are displayed at the end.

    Args:
        data: Frame providing ``area``, ``price`` and ``housetype`` columns.
    """
    df = data.copy()

    # Drop outliers: stacked-villa listings and areas of 1000 or more.
    # The original filtered on ``Layout`` / ``Size``, columns this dataset does
    # not have (it uses ``housetype`` / ``area``), which raised ``KeyError``.
    df = df[(df['housetype'] != '叠拼别墅') & (df['area'] < 1000)]

    f, [ax1, ax2] = plt.subplots(1, 2, figsize=(15, 5), dpi=100)
    sns.distplot(df['area'], bins=20, ax=ax1, color='r')
    sns.kdeplot(df['area'], shade=True, ax=ax1)
    sns.regplot(x='area', y='price', data=df, ax=ax2)
    plt.savefig("area.png", dpi=100)

    # Alternative view: histogram with a fitted normal distribution.
    # NOTE(review): this plots the unfiltered ``data``, as the original did -
    # confirm whether the outlier filter should apply here as well.
    fig, axes = plt.subplots()
    sns.distplot(data['area'], bins=50, kde=False, fit=norm, ax=axes)
    axes.set(xlabel='面积/平米', ylabel='概率密度', title='面积频率分布直方图')
    plt.show()

def housetype_analyse(data):
    """Draw a horizontal count plot of the ``housetype`` column.

    The figure is saved to ``housetype.png`` and then displayed.

    Args:
        data: Frame providing a ``housetype`` column.
    """
    houses = data.copy()
    figure, axis = plt.subplots(figsize=(20, 20))

    # One horizontal bar per distinct ``housetype`` value.
    sns.countplot(y='housetype', data=houses, ax=axis)

    axis.set_title('房屋户型', fontsize=15)
    axis.set_xlabel('数量')
    axis.set_ylabel('户型')

    plt.savefig("housetype.png", dpi=100)
    plt.show()
def year_analyse(data):
    """Plot how listings are distributed over construction year.

    Draws a line chart of listings per construction year next to a pie chart
    of the share falling in five construction periods, then displays the
    figure.

    Args:
        data: Frame whose ``age`` column holds the construction year.
    """
    new_data = data.copy()

    # Number of listings per construction year.
    count_by_create_time = new_data['age'].groupby(new_data['age']).count()

    # Collapse the yearly counts into five periods: 2000 and earlier,
    # 2001-2005, 2006-2010, 2011-2015, and 2016 onwards. ``None`` leaves that
    # side of the (inclusive) label slice open.
    periods = [
        ('2000年及以前', None, 2000),
        ('2001-2005年', 2001, 2005),
        ('2006-2010年', 2006, 2010),
        ('2011-2015年', 2011, 2015),
        ('2016年及以后', 2016, None),
    ]
    new_count_by_create_time = pd.Series(
        [count_by_create_time.loc[start:end].sum() for _, start, end in periods],
        index=[label for label, _, _ in periods],
    )

    # Line chart of listings per construction year.
    fig, axes = plt.subplots(1, 2)
    count_by_create_time.plot(kind='line', ax=axes[0])
    # Title, axis labels and the fixed year ticks shown on the x axis.
    axes[0].set(xlabel='房屋建造时间/年', ylabel='房数量/套',
                title='房数量随房屋建造时间变化折线图',
                xticks=[1970, 1980, 1990, 2000, 2010, 2018])

    # Pie chart of the share per period.
    # autopct='%.1f%%' labels each wedge as a percentage with one decimal,
    # startangle=90 starts drawing at 90 degrees from the x axis
    # (counter-clockwise), and label='' keeps the Series name off the chart.
    new_count_by_create_time.plot(kind='pie', ax=axes[1], autopct='%.1f%%',
                                  startangle=90, label='')
    axes[1].set_title('不同建造时间范围内房屋占比饼形图')
    axes[1].set_aspect('equal')  # keep the pie circular
    plt.subplots_adjust(wspace=0.5)  # horizontal gap between the two panels
    plt.show()

def floor_analyse(data):
    """Draw a count plot of listings per ``floor`` value.

    The figure is saved to ``floor.png`` and then displayed.

    Args:
        data: Frame providing a ``floor`` column.
    """
    df = data.copy()
    f, ax1 = plt.subplots(figsize=(20, 5))
    sns.countplot(x='floor', data=df, ax=ax1)

    # Labels now describe what is plotted: floor on the x axis, number of
    # listings on the y axis. The original reused the house-type title and
    # labelled the axes "count" / "price", which did not match the chart.
    ax1.set_title('房屋楼层', fontsize=15)
    ax1.set_xlabel('楼层')
    ax1.set_ylabel('数量')

    plt.savefig("floor.png", dpi=100)
    plt.show()
def price_analyse(data):
    """Plot the ``price`` and ``price_pre`` distributions and print their spread.

    Draws two 50-bin histograms with fitted normal curves side by side, saves
    the figure to ``price.png`` and displays it. Afterwards it prints the
    (mean - std, mean + std) interval for ``price`` and then for
    ``price_pre``.

    Args:
        data: Frame providing ``price`` and ``price_pre`` columns.
    """
    houses = data.copy()

    figure, (total_ax, unit_ax) = plt.subplots(1, 2)
    sns.distplot(houses['price'], bins=50, kde=False, fit=norm, ax=total_ax)
    sns.distplot(houses['price_pre'], bins=50, kde=False, fit=norm, ax=unit_ax)
    total_ax.set(xlabel='售价/万', ylabel='概率密度')
    unit_ax.set(xlabel='单价/元每平米', ylabel='概率密度')
    figure.suptitle('二手房的售价和单价频率分布直方图')
    plt.subplots_adjust(wspace=0.4)  # horizontal gap between the two panels
    plt.savefig('price.png')

    plt.show()

    # One standard deviation either side of the mean, total price first.
    total_price = houses['price']
    lower_total = total_price.mean() - total_price.std()
    upper_total = total_price.mean() + total_price.std()
    print(lower_total, upper_total)

    unit_price = houses['price_pre']
    lower_unit = unit_price.mean() - unit_price.std()
    upper_unit = unit_price.mean() + unit_price.std()
    print(lower_unit, upper_unit)

def main():
    """Preprocess the module-level ``df`` and run the price analysis.

    The other analyses in this file (region, area, housetype, year,
    variables, floor) were left disabled by the author; call them here with
    the same ``data`` to enable them.
    """
    feature_names = ["price_pre", "area", 'age', 'floor']
    summary = ZcSummary()
    data = summary.preprocess_features(df, feature_names)

    # Disabled analyses, kept for reference:
    #region_analyse(data)
    #area_analyse(data)
    #housetype_analyse(data)
    #year_analyse(data)
    #variables_analyse(data)
    #floor_analyse(data)
    price_analyse(data)
# Script entry point. The dunder names had lost their underscores
# (``name == 'main'``), which raised NameError instead of running main().
if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(e)

# 猜你喜欢 (You may also like)

# 转载自 (reposted from) blog.51cto.com/12768454/2375509