最近有一个新任务:我们作为基础数据组,需要收集很多基本信息。其中很重要的一个字段就是房源最近的地铁站以及房源与该地铁站之间的距离,这对于租户来说,是影响租房与否以及愿意支付价格的重要因素。
class MatchSubway:
    """Match every housing record to its nearest subway station.

    ``data1`` is the housing table and must contain the columns ``name``,
    ``小区经度`` (longitude, degrees) and ``小区纬度`` (latitude, degrees).
    ``data2`` is the station table and must contain ``block`` (station
    name), ``lng`` and ``lat`` (degrees).

    The housing table is modified in place: two result columns,
    ``匹配地铁站`` (matched station) and ``近铁距离`` (distance in metres),
    are added to it. The station table is never modified.
    """

    # Sphere radius in metres used for the great-circle distance; this is
    # the value the original implementation used (roughly the equatorial
    # radius of the Earth).
    EARTH_RADIUS_M = 6378000

    def __init__(self, data1, data2):
        """Validate both tables and prepare the result columns.

        Args:
            data1: housing DataFrame (``name``, ``小区经度``, ``小区纬度``).
            data2: station DataFrame (``block``, ``lng``, ``lat``).

        Raises:
            AssertionError: if any required column is missing.
        """
        house_columns = ('name', '小区经度', '小区纬度')
        station_columns = ('block', 'lng', 'lat')
        complete = (
            all(col in data1.columns for col in house_columns)
            and all(col in data2.columns for col in station_columns)
        )
        if not complete:
            # Raised explicitly (instead of a bare ``assert``) so the check
            # survives ``python -O`` while keeping the exception type that
            # existing callers may already catch.
            raise AssertionError('The information is not enough')

        self._df1 = data1
        self._df2 = data2
        # Both result columns belong to the housing table. Plain assignment
        # appends the column when it is missing and resets it when it is
        # already there, so building a matcher twice on the same frame
        # does not fail.
        self._df1['匹配地铁站'] = None
        self._df1['近铁距离'] = None

    def _getdisfromXtoY(self, lng_a, lat_a, lng_b, lat_b):
        """Return the great-circle distance in metres between A and B.

        Uses the spherical law of cosines. All arguments are in degrees;
        each may be a scalar or an array-like (pandas Series / numpy
        array), in which case normal numpy broadcasting applies.
        """
        import numpy as np

        lat_a_rad = np.radians(lat_a)
        lat_b_rad = np.radians(lat_b)
        delta_lng = np.radians(lng_a) - np.radians(lng_b)

        cos_angle = (
            np.cos(lat_a_rad) * np.cos(lat_b_rad) * np.cos(delta_lng)
            + np.sin(lat_a_rad) * np.sin(lat_b_rad)
        )
        # Rounding can push the value marginally outside [-1, 1] (typically
        # for coincident points), which would make arccos return NaN.
        cos_angle = np.clip(cos_angle, -1.0, 1.0)
        return self.EARTH_RADIUS_M * np.arccos(cos_angle)

    def matchsubway(self):
        """Fill in the nearest station and its distance for every house.

        Rows for which no distance can be computed (missing coordinates,
        or an empty station table) keep ``None`` in both result columns.

        Returns:
            The housing DataFrame (the same object passed as ``data1``)
            with ``匹配地铁站`` and ``近铁距离`` filled in.
        """
        # Station coordinates do not change between houses: fetch once.
        station_lng = self._df2['lng'].astype(float)
        station_lat = self._df2['lat'].astype(float)

        for i in self._df1.index:
            distances = self._getdisfromXtoY(
                self._df1.loc[i, '小区经度'],
                self._df1.loc[i, '小区纬度'],
                station_lng,
                station_lat,
            )
            usable = distances.dropna()
            if usable.empty:
                continue
            nearest = usable.idxmin()
            self._df1.loc[i, '匹配地铁站'] = self._df2.loc[nearest, 'block']
            self._df1.loc[i, '近铁距离'] = usable.min()
        return self._df1