导入相关包
# !pip install seaborn pandas matplotlib numpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Ask the notebook to render matplotlib figures as SVG.
from IPython import display
display.set_matplotlib_formats('svg')
# Alternative to set svg for newer versions
# NOTE(review): presumably the call above is deprecated in newer IPython
# releases, which is why this alternative is kept here; confirm against the
# installed IPython version before switching.
# import matplotlib_inline
# matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
numpy:Python 中做数据分析(数值计算)常用的包;
pandas:同样用于数据分析,擅长处理表格数据;当数据量不大、可以全部放入内存时,它是首选;
matplotlib.pyplot:源自 MATLAB 风格的画图工具;
seaborn:基于 matplotlib,提供更多的画法;
剩下两行用于将图片的显示格式设成 svg(矢量图,显示出来分辨率相对高一点)。
读取数据
# Load the house-sales table; the CSV is read straight from the zip archive.
data = pd.read_csv('house_sales.zip')
csv文件直接存下来相对比较大,可以先压缩成一个zip或一个tar,主流的读取工具都可以直接从压缩文件中读取。建议存成压缩文件,这样在传输和存储上都会比较好,读取效果有时甚至比直接读取未压缩的文件还要好(这个方法适用于文本数据)。 作者:爱喝水的崩奔 https://www.bilibili.com/read/cv13353927?spm_id_from=333.999.0.0 出处:bilibili
# Inspect the table's dimensions, then preview its first rows.
data.shape
data.head()  # print the first five rows
删缺失值
将缺失值比例超过30%的列删去,以此来简化数据
# Count the missing values in every column.
null_sum = data.isnull().sum()
# A column is dropped when more than 30% of its rows are missing.
too_sparse = null_sum > len(data) * 0.3
# Preview the columns that will be kept. Taking the exact complement of the
# drop mask keeps this preview in sync with the drop below: a column with
# exactly 30% missing values is kept, so it is listed here as well.
data.columns[~too_sparse]  # columns will keep
data.drop(columns=data.columns[too_sparse], inplace=True)
检查type
# Show the dtype pandas assigned to each remaining column.
data.dtypes
处理错误的数据类型
# Columns whose amounts need to be converted to floats.
currency = ['Sold Price', 'Listed Price', 'Tax assessed value', 'Annual tax amount']
for money_col in currency:
    # Remove every '$', ',' and '-' character from the cells.
    stripped = data[money_col].replace(r'[$,-]', '', regex=True)
    # Cells that are now empty or whitespace-only become NaN so the float
    # cast below does not fail on them.
    blanks_as_nan = stripped.replace(r'^\s*$', np.nan, regex=True)
    data[money_col] = blanks_as_nan.astype(float)
# Columns whose areas need to be converted to floats.
areas = ['Total interior livable area', 'Lot size']
for area_col in areas:
    # Mark the rows whose text mentions 'Acres'. na=False maps missing cells
    # to False directly, instead of comparing the result against True.
    acres = data[area_col].str.contains('Acres', na=False)
    # Strip the unit suffixes and thousands separators, then cast to float.
    col = data[area_col].replace(
        r'\b sqft\b|\b Acres\b|\b,\b', '', regex=True).astype(float)
    # Convert the acre rows to square feet (1 acre = 43,560 sq ft).
    col.loc[acres] *= 43560
    data[area_col] = col
data.describe()  # summary statistics of the numeric columns
# Drop the abnormal rows: those whose 'Lot size' (areas[1]) is below 10 or
# above 1e4.
abnormal = (data[areas[1]] < 10) | (data[areas[1]] > 1e4)
# Take an explicit copy so that columns added to `data` later are written to
# an independent frame rather than to a slice of the unfiltered one (which
# triggers pandas' SettingWithCopyWarning).
data = data[~abnormal].copy()
# Number of rows that were removed.
abnormal.sum()
售价的分布情况:对售价取log10,让分布更均匀、便于观察
# Histogram of the sold prices on a log10 scale.
log_price = np.log10(data['Sold Price'])
ax = sns.histplot(log_price)
# Restrict the x axis to exponents 3..8 and put one tick on each exponent.
ax.set_xlim([3, 8])
ax.set_xticks(range(3, 9))
# Label each tick with the un-logged price in scientific notation.
tick_labels = ['%.0e' % price for price in 10 ** ax.get_xticks()]
ax.set_xticklabels(tick_labels);
查看房子的种类
# Count the listings per house type and show the first 20 entries.
type_counts = data['Type'].value_counts()
type_counts[0:20]
查看不同类型的价格
# Boolean mask selecting the four house types that are compared.
types = data['Type'].isin(['SingleFamily', 'Condo', 'MultiFamily', 'Townhouse'])
selected = data[types]
# Log10 sold price next to the house type, one row per selected listing.
price_by_type = pd.DataFrame({
    'Sold Price': np.log10(selected['Sold Price']),
    'Type': selected['Type'],
})
# One KDE curve of the log price per house type.
sns.displot(price_by_type, x='Sold Price', hue='Type', kind='kde');
箱线图:每平方英尺居住面积的售价,展示不同类型房屋的价格分布情况
# Sold price divided by the interior livable area, stored as a new column.
sold_price = data['Sold Price']
living_area = data['Total interior livable area']
data['Price per living sqft'] = sold_price / living_area
# Box plot of that ratio per house type, limited to the rows selected by
# `types`; outlier markers are hidden (fliersize=0) and the y axis is capped.
ax = sns.boxplot(data=data[types], x='Type', y='Price per living sqft', fliersize=0)
ax.set_ylim([0, 2000]);
每个邮政编码的房价
# Keep only the listings whose zip code is among the first 20 entries of the
# zip-code frequency table.
top_zips = data['Zip'].value_counts()[:20].keys()
d = data[data['Zip'].isin(top_zips)]
# Box plot of price per living sqft for each of those zip codes.
ax = sns.boxplot(data=d, x='Zip', y='Price per living sqft', fliersize=0)
ax.set_ylim([0, 2000])
# Rotate the zip-code labels so they do not overlap.
zip_labels = ax.get_xticklabels()
ax.set_xticklabels(zip_labels, rotation=90);
每个特征之间的关系(相关系数矩阵)
# Columns whose pairwise correlations are plotted.
columns = ['Sold Price', 'Listed Price', 'Annual tax amount', 'Price per living sqft', 'Elementary School Score', 'High School Score']
correlations = data[columns].corr()
# Draw the correlation matrix as an annotated heatmap on a 6x6 inch figure.
_, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(correlations, annot=True, cmap='RdYlGn', ax=ax);