# Python机器学习实践指南-第二章

# # -*- coding:utf-8 -*-
#准备数据
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
# Use matplotlib's ggplot look for any figures drawn later in the script.
plt.style.use('ggplot')

# Console display settings for pandas: column limit, cell width, float precision.
for _option_name, _option_value in (
    ("display.max_columns", 30),
    ("display.max_colwidth", 100),
    ("display.precision", 3),
):
    pd.set_option(_option_name, _option_value)

# Location of the listings CSV on the author's machine.
CSV_PATH = r"D:/python机器学习+数据分析/Python机器学习实践指南（中文版带书签）、原书代码、数据集/PMLB Datasets/PMLB Datasets/PMLB Datasets copy/magic.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)
# print(df.columns)   #输出列标题
# print(df.head())
# print(df.describe())
# print(df.T)
# print(df.T.ix[:,1:2])
# nn = df[df['routable_link/_text'].str.contains('203 Rivington')|df['routable_link/_text'].str.contains('280 E 2nd')]
# print(nn[:2].T)
#由上面代码的输出结果可以看到，有不少缺失值
# Multiple-unit listings: the listing type contains "Apartments For".
mu = df.loc[df['listingtype_value'].str.contains('Apartments For')]

# Single-unit listings: the listing type contains "Apartment For".
su = df.loc[df['listingtype_value'].str.contains('Apartment For')]
# print(len(mu))  # 161 in the author's run
# print(len(su))  # 339 in the author's run

# Most listings are single units. Their data still has to be reshaped into a
# standard structure, e.g. one column each for bedrooms, bathrooms and so on.
# print(su['propertyinfo_value'])

# The property-info text holds bedroom and bathroom counts; some rows also
# carry a year.

# Count single-unit rows that mention neither 'Studio' nor 'bd'.
num = int(
    (
        ~(
            su['propertyinfo_value'].str.contains('Studio')
            | su['propertyinfo_value'].str.contains('bd')
        )
    ).sum()
)
# print(num)  # 0 in the author's run
# Count rows that do not mention 'ba':
# print(len(su[~(su['propertyinfo_value'].str.contains('ba'))]))   # 6 in the author's run
# Six rows lack a bathroom count, so those data points must be filled in,
# imputed, or excluded.

# Handling missing data is one of the key steps of the modelling process.

# Single-unit listings whose property info has no 'ba' token.
no_baths = su.loc[~su['propertyinfo_value'].str.contains('ba')]


# Everything else: single-unit listings that do carry bathroom information.
sucln = su.drop(index=no_baths.index)

#使用项目符号进行切分，解析卧室和浴室的信息
def parse_info(row):
    """Split a property-info string into bedroom, bathroom and area fields.

    The string is split on the bullet character. The first field is taken as
    the bedroom text and the second as the bathroom text. The third field is
    taken as the area only when the string mentions 'sqft' somewhere;
    otherwise the area is NaN (a third field may be something else, such as
    a year).

    Fields are returned as raw text: surrounding whitespace and unit labels
    are not removed here.

    Args:
        row: The property-info string of one listing.

    Returns:
        A pandas Series with the keys 'Beds', 'Baths' and 'Sqft'. Any field
        that is absent from the string is NaN.
    """
    # Only look at a third field when the row actually advertises an area.
    wanted = 3 if 'sqft' in row else 2
    fields = row.split('•')[:wanted]
    # Pad short rows with NaN instead of failing on tuple unpacking, so a
    # missing bath count is reported the same way as a missing area.
    fields += [np.nan] * (3 - len(fields))
    br, ba, sqft = fields
    return pd.Series({'Beds': br, 'Baths': ba, 'Sqft': sqft})
# Parse the property info of every retained single-unit listing; each row
# expands into the columns produced by parse_info.
attr = sucln['propertyinfo_value'].apply(parse_info)
# print(attr)


def _leading_token(value):
    """Return the first space-separated token of a string, or NaN for non-strings."""
    if isinstance(value, str):
        return value.strip().split(' ')[0]
    return np.nan


# Keep only the leading token of each cell, dropping the unit label after it.
# Series.map is applied per column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases; Series.map exists in every version.
attr_cln = attr.apply(lambda column: column.map(_leading_token))
# print(attr_cln)
sujnd = sucln.join(attr_cln)
# print(sujnd.T)

"""
数据基本出来了。可以基于浴室、卧室的数量和面积，测试公寓的价值。
"""
"""
如果可能，尝试提取楼层的信息。假设一个模式，其中一个数字后面跟随一个字母，
该字母就表示建筑的楼层。
"""
#parse out zip,floor
def parse_addy(r):
    """Extract the ZIP code and floor number from an address string.

    The ZIP code is the run of digits that directly follows 'NY'. The floor
    is the run of digits that follows 'APT' or '#' plus whitespace and is
    itself followed by one or more capital letters and a comma (the working
    assumption being that a number followed by a letter names the floor).

    Args:
        r: The address text of one listing.

    Returns:
        A pandas Series with the keys 'Zip' and 'Floor'. Each value is the
        matched digit string, or NaN when the pattern is not found.
    """
    # Raw strings keep \d and \s as regex escapes rather than (invalid)
    # Python string escapes.
    zip_match = re.search(r'NY(\d+)', r)
    floor_match = re.search(r'(?:APT|#)\s+(\d+)[A-Z]+,', r)
    zipc = zip_match.group(1) if zip_match else np.nan
    flr = floor_match.group(1) if floor_match else np.nan
    return pd.Series({'Zip': zipc, 'Floor': flr})
# Extract ZIP code and floor from every address; each row expands into the
# columns produced by parse_addy.
flrzip = sujnd['routable_link/_text'].apply(parse_addy)
# print(flrzip)
# print(len(flrzip[~flrzip['Floor'].isnull()]))
# print(len(flrzip[~flrzip['Zip'].isnull()]))
suf = sujnd.join(flrzip)
# print(suf.T)

# In the author's run, 320 of the 333 listings yielded a ZIP code and 164
# yielded a floor.

# Reduce the data to the columns of interest, give the rent column a readable
# name and renumber the rows from zero. Chaining builds a new frame instead of
# mutating a column slice of `suf` in place, which avoids pandas'
# SettingWithCopy warning.
sudf = (
    suf[['pricelarge_value_prices', 'Beds', 'Baths', 'Sqft', 'Floor', 'Zip']]
    .rename(columns={'pricelarge_value_prices': 'Rent'})
    .reset_index(drop=True)
)
# print(sudf)

"""
2.2.1分析数据
"""
# print(sudf.describe())
"""
可以看到租金的统计细分。
"""
# Studios carry no bedroom count; encode them as 0 bedrooms.
sudf = sudf.assign(
    Beds=sudf['Beds'].map(lambda beds: 0 if 'Studio' in beds else beds)
)
# print(sudf)

# Summary statistics need numeric columns, so the parsed fields must be
# converted.
# print(sudf.info())

# Remove the thousands separator so Sqft can be parsed as a number.
sudf = sudf.assign(Sqft=sudf['Sqft'].str.replace(',', '', regex=False))

# Convert by building a new frame rather than `sudf.loc[:, col] = ...`:
# since pandas 2.0 that form writes into the existing column in place, so an
# object column would keep its object dtype and stay non-numeric.
sudf = sudf.astype({
    'Rent': int,
    'Beds': int,
    'Baths': float,   # half bathrooms occur, so a float is needed
    'Sqft': float,    # contains NaN, so a float is needed
    'Floor': float,   # contains NaN, so a float is needed
})

# Check the result.
# print(sudf.info())
# print(sudf.describe())

# The author identified the row labelled 318 as a problematic listing and
# discards it. NOTE(review): this label is specific to the author's dataset;
# confirm it before reusing the script on other data.
sudf = sudf.drop([318])
# print(sudf.describe())

"""
使用数据透视查看数据。首先看邮政编码和卧室数量来检查价格
"""
# print(sudf.pivot_table('Rent','Zip','Beds',aggfunc='mean'))
"""
以上结果可以根据邮编查看结果。随着房间数量的增加，看到越来越少的房源。nan值就是最好的证明。
"""
#基于房源数量进行透视
# print(sudf.pivot_table('Rent','Zip','Beds',aggfunc='count'))
"""
由结果可知，数据是稀疏的。没关系，还可以继续分析。
"""

#2.2.2可视化数据

"""
由于目前的数据是基于邮政编码的，最好使用热图作为可视化数据的方法。使用folium的python库来实现。
"""

# Listings with two or three bedrooms are scarce, so narrow the data to
# studios and one-bedroom apartments.
su_lt_two = sudf.loc[sudf['Beds'].lt(2)]

# import folium
# map = folium.Map(location=[40.748817,-73.985428],zoom_start=13)
# map.geo_json(geo_path=r'D:/python机器学习+数据分析/Python机器学习实践指南（中文版带书签）、原书代码、数据集/PMLB Datasets/PMLB Datasets/PMLB Datasets copy/nyc_boroughs.geojson',
#              data=su_lt_two,
#              columns=['Zip','Rent'],
#              key_on ='feature.properties.postalCode',
#              threshold_scale=[1700.00,1900.00,2100,2300.00,2500.00,2750.00],
#              fill_color='YlOrRd',fill_opacity=0.7,line_opacity=0.2,
#              legend_name='Rent(%)',reset=True
#              )
# map.create_map(path='nyc.html')

"""
对数据建模
"""
import patsy
import statsmodels.api as sm
# Patsy formula: Rent is the response variable; Zip and Beds are the
# predictors, i.e. how ZIP code and bedroom count affect the price.
f = 'Rent~Zip + Beds'

# The formula and the frame holding the matching columns go to
# patsy.dmatrices(), which is asked to return DataFrames: X holds the
# predictors and y the response. Both are handed to sm.OLS(), and calling
# fit() runs the model.
y, X = patsy.dmatrices(f, data=su_lt_two, return_type='dataframe')
results = sm.OLS(endog=y, exog=X).fit()
"""
  打印结果分析：可以看到模型包含262个观测样本，调整后的R²为0.282，F统计量的概率为1.21e-10，具有统计显著性。
这说明所建模型仅仅使用卧室数量和邮政编码，就已经能够解释约三分之一的价格差异。
  再看中间的一大块数据。中间部分提供了模型中每个自变量的有关信息。从左至右依次是：变量、变量在模型中的系数、
标准误差、t统计值、t统计值的p值，以及置信区间。
  通过P值这一列，可以确定自变量从统计的角度来看是否具有意义。在回归模型中具有统计学意义，
意味着一个自变量和响应变量之间的关系不太可能是偶然发生的。通常，统计学家使用0.05的p值作为判定阈值。
一个0.05的p值意味着看到的结果只有5%的几率是偶然发生的。从结果看，卧室的数量显然有意义。
   邮编会有什么影响呢？截距代表了10001的邮编。建立线性回归模型的时候，是需要截距的。
截距就是回归线和y轴交叉的地方。Statsmodels会自动选择一个预测变量作为截距。
   卧室的数量和截距是显著的。大部分邮编并不显著，显著的几个是10001、10029和10035，具有很高的负置信区间。
"""
# print(results.summary())

"""
2.3.1预测
"""
# print(X.head())
"""
结果看出，输入是用所谓的虚拟变量进行编码的。由于邮编不是数值型的，为了表示这个特征，
系统使用了虚拟编码。如果某个公寓在10003中，那么该列将被编码为1，而其他邮编列都是0。
卧室是数值型的，系统根据实际的数字进行编码。
"""
# Template for one prediction input: a single 'value' column with one row per
# design-matrix column of X, initialised to zero.
to_pred_idx = X.columns
to_pred_zeros = np.zeros(len(to_pred_idx))
tpdf = pd.DataFrame(to_pred_zeros, index=to_pred_idx, columns=['value'])
# print(tpdf)


def _set_scenario(frame, beds, zip_column=None):
    """Reset ``frame['value']`` to zero and describe one apartment.

    Args:
        frame: Single-column ('value') frame indexed by design-matrix column
            name. It is modified in place.
        beds: Value written to the 'Beds' row.
        zip_column: Label of the ZIP dummy row to set to 1, e.g.
            'Zip[T.10009]'. None leaves every ZIP dummy at 0.

    Raises:
        KeyError: If a required label is not in ``frame.index``. Without this
            check, ``.loc`` assignment on a missing label would silently
            append a new row and the input would no longer line up with the
            model's parameters.
    """
    required = ['Intercept', 'Beds']
    if zip_column is not None:
        required.append(zip_column)
    missing = [label for label in required if label not in frame.index]
    if missing:
        raise KeyError('design matrix has no column(s): {}'.format(missing))
    frame['value'] = 0
    frame.loc['Intercept', 'value'] = 1
    frame.loc['Beds', 'value'] = beds
    if zip_column is not None:
        frame.loc[zip_column, 'value'] = 1


# Scenario 1: a one-bedroom apartment in ZIP code 10009.
_set_scenario(tpdf, beds=1, zip_column='Zip[T.10009]')
# print(tpdf)  # Intercept, Beds and the 10009 dummy are now 1.

# With the features set, the fitted model's predict() method returns the
# estimate for this input.
# print(results.predict(tpdf['value']))  # 2529.56 in the author's run

# Scenario 2: the same ZIP code with one more bedroom.
# NOTE(review): the model is fitted on su_lt_two (Beds < 2), so a two-bedroom
# query extrapolates beyond the fitted range.
_set_scenario(tpdf, beds=2, zip_column='Zip[T.10009]')
# print(tpdf)  # Beds is now 2.

# The author's notes record 2378.035 here and say the extra bedroom costs
# roughly 200 dollars more.
# NOTE(review): 2378.035 is lower than the one-bedroom estimate above, so it
# looks like a digit transposition of 2738.035; re-run to confirm.
# print(results.predict(tpdf['value']))

# Scenario 3: two bedrooms in ZIP code 10002. The author's run printed
# 2651.176, noted as slightly cheaper than 10009.
_set_scenario(tpdf, beds=2, zip_column='Zip[T.10002]')
print(results.predict(tpdf['value']))

"""
目前，只检视了邮政编码、卧室和出租价格之间的关系。虽然有一定的解释能力，但是数据量太小，特征太少，
无法充分的观测房地产估值。
也可以添加更多的特征和数据。
"""

"""
本章，学习了如何获取房地产列表的数据，利用pandas的功能操作和清洗数据，通过热图检视数据，最后，
构建并使用回归模型预估公寓价格。
目前只是接触了机器学习的表层，下面继续探索深层算法和应用。
"""
# Python机器学习实践指南-第二章

# 猜你喜欢