"""Housing-price regression walkthrough (Hands-On ML, chapter-2 style).

Loads the California housing CSV, makes an income-stratified train/test
split, explores correlations, builds numeric + categorical preprocessing
pipelines, then fits and evaluates Linear Regression, a Decision Tree,
and a grid-searched Random Forest.
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ################## Get data #######################
house_data = pd.read_csv("./housing.csv")

# Divide by 1.5 to limit the number of income categories.
house_data["income_cat"] = np.ceil(house_data["median_income"] / 1.5)
# Label those above 5 as 5 so every stratum stays populated.
# (Plain assignment instead of `.where(..., inplace=True)` on a column
# selection, which is unreliable/deprecated in modern pandas.)
house_data["income_cat"] = house_data["income_cat"].where(
    house_data["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on the income category so train/test income distributions match.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(house_data, house_data["income_cat"]):
    strat_train_set = house_data.loc[train_index]
    strat_test_set = house_data.loc[test_index]

# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(house_data, test_size=0.2, random_state=16)

# income_cat was only needed for stratification; drop it again.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)
house_data = strat_train_set.copy()

# ############## Visualize ########################
house_data.hist(bins=50, figsize=(15, 10))
house_data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
house_data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.3,
                s=house_data["population"] / 100, label="population",
                c=house_data["median_house_value"], cmap=plt.get_cmap("jet"),
                colorbar=True,
                )
plt.legend()
# plt.show()

# numeric_only=True: ocean_proximity is a string column and cannot be
# correlated (pandas >= 2.0 raises without it).
corr_matrix = house_data.corr(numeric_only=True)
a = corr_matrix["median_house_value"].sort_values(ascending=False)

# Engineered ratio features usually correlate better than the raw counts.
house_data["rooms_per_households"] = house_data["total_rooms"] / house_data["households"]
house_data["bedrooms_per_room"] = house_data["total_bedrooms"] / house_data["total_rooms"]
house_data["population_per_households"] = house_data["population"] / house_data["households"]
# print(house_data.info())
corr_matrix = house_data.corr(numeric_only=True)
b = corr_matrix["median_house_value"].sort_values(ascending=False)
# print(b)

# ################## Prepare data ####################
house_data = strat_train_set.drop("median_house_value", axis=1)  # drop labels for training set
house_labels = strat_train_set["median_house_value"].copy()

# FIX: sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is the drop-in replacement.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")
house_num = house_data.drop("ocean_proximity", axis=1)
imputer.fit(house_num)
print(imputer.statistics_, imputer.strategy)
print(house_num.median().values)  # should match imputer.statistics_
X = imputer.transform(house_num)
house_tr = pd.DataFrame(X, columns=house_num.columns,
                        index=list(house_data.index.values))
print(house_tr.info())

house_cat = house_data['ocean_proximity']
a = house_cat.value_counts()
print(a)
house_cat_encoded, house_cat_categories = house_cat.factorize()
print(house_cat_encoded[:10])

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
house_cat_onehot = encoder.fit_transform(house_cat_encoded.reshape(-1, 1))
print(house_cat_onehot.toarray())

# from sklearn.preprocessing import LabelBinarizer
# encoder = LabelBinarizer()
# house_cat_onehot1 = encoder.fit_transform(house_cat)
# print(house_cat_onehot1.toarray())

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse


class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical integer/string features.

    encoding : 'onehot' (sparse), 'onehot-dense' (dense array) or
        'ordinal' (integer codes).
    categories : 'auto' to learn categories from the data, or an
        explicit list of per-column category arrays.
    dtype : output dtype.
    handle_unknown : 'error' to raise on unseen categories at transform
        time, 'ignore' to zero them out (one-hot encodings only).
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to determine the categories of each feature.

        Returns
        -------
        self
        """
        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # FIX: report the offending encoding value (the original code
            # interpolated self.handle_unknown here by mistake).
            raise ValueError(template % self.encoding)
        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)
        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # FIX: np.object was removed in NumPy 1.24; the builtin `object`
        # dtype is equivalent.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        # One LabelEncoder per column learns that column's categories.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]
        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.in1d(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                le.classes_ = np.array(np.sort(self.categories[i]))
        self.categories_ = [le.classes_ for le in self._label_encoders_]
        return self

    def transform(self, X):
        """Transform X using one-hot encoding.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.

        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        """
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        # FIX: np.int / np.bool were removed in NumPy 1.24; use builtins.
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.in1d(X[:, i], self.categories_[i])
            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        # Build the one-hot matrix column-block by column-block:
        # `indices` gives each feature's offset into the output columns.
        mask = X_mask.ravel()
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out


cat_encoder = CategoricalEncoder()
housing_cat_reshaped = house_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
print(housing_cat_1hot.toarray())

from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
std = std_scaler.fit_transform(house_tr)
print(std)

# Column indices of the raw numeric attributes used by the adder below.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append the engineered ratio features to a numeric ndarray.

    Adds rooms_per_household and population_per_household, plus
    bedrooms_per_room when `add_bedrooms_per_room` is True.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        return self  # nothing else to do

    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]


# Create a class to select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the named DataFrame columns and return them as an ndarray."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.attribute_names].values


num_attribs = list(house_num)
cat_attribs = ["ocean_proximity"]

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
house_prepare = full_pipeline.fit_transform(house_data)
print(house_prepare.shape)

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(house_prepare, house_labels)
some_data = house_data.iloc[:5]
some_labels = house_labels.iloc[:5]
some_prepare = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_prepare))
print("Labels:", list(some_labels))

from sklearn.metrics import mean_squared_error

house_predictions = lin_reg.predict(house_prepare)
lin_mse = mean_squared_error(house_labels, house_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(house_labels, house_predictions)
print(lin_mae)

from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(house_prepare, house_labels)
house_predictions_tree = tree_reg.predict(house_prepare)
tree_mse = mean_squared_error(house_predictions_tree, house_labels)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)  # near-zero: the tree badly overfits the training set

# FIX: sklearn.cross_validation was removed in scikit-learn 0.20; the
# file already uses sklearn.model_selection above.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, house_prepare, house_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)  # negate: scorer returns negated MSE
print(tree_rmse_scores)


def displayscores(scores):
    """Print cross-validation scores with their mean and std deviation."""
    print("scores:", scores)
    print("Mean scores:", np.mean(scores))
    print("Std scores:", np.std(scores))


displayscores(tree_rmse_scores)

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    {"n_estimators": [3, 30, 100], "max_features": [2, 4, 6, 8]},
    # FIX: "bootstap" typo — GridSearchCV raises on a parameter name that
    # RandomForestRegressor does not have.
    {"bootstrap": [False], "n_estimators": [50, 100], "max_features": [2, 3, 4]},
]
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring="neg_mean_squared_error")
grid_search.fit(house_prepare, house_labels)
print(grid_search.best_params_, grid_search.best_estimator_,
      grid_search.best_score_)
# Housing price prediction example.
# Adapted from: blog.csdn.net/u010016056/article/details/80601536