# Housing price

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

###################Get data#######################

# Load the raw housing table; the train/test split below is derived from it.
house_data = pd.read_csv("./housing.csv")

# Divide by 1.5 to limit the number of income categories
house_data["income_cat"] = np.ceil(house_data["median_income"] / 1.5)
# Label those above 5 as 5.  The result is assigned back explicitly: calling
# ``.where(..., inplace=True)`` on a selected column is chained in-place
# assignment, which is not propagated to the frame when pandas runs with
# Copy-on-Write, so the cap would silently be lost.
house_data["income_cat"] = house_data["income_cat"].where(
    house_data["income_cat"] < 5, 5.0
)

from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on the income category so both sets share its distribution.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(house_data, house_data["income_cat"]))
strat_train_set = house_data.loc[train_index]
strat_test_set = house_data.loc[test_index]

# from sklearn.model_selection import train_test_split
# train_set, test_set = train_test_split(house_data, test_size=0.2, random_state=16)

# The helper column was only needed for stratification; remove it again.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)
# Explore on a copy so the training split itself stays untouched.
house_data = strat_train_set.copy()


###############Visualize########################

# Histogram of every numeric attribute, then two geographic scatter plots:
# a plain density view and one sized by population / coloured by house value.
house_data.hist(bins=50, figsize=(15, 10))
house_data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
house_data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.3,
                s=house_data["population"] / 100, label="population",
                c=house_data["median_house_value"], cmap=plt.get_cmap("jet"), colorbar=True,
                )
plt.legend()
#plt.show()

# Correlate numeric columns only.  The frame also carries the text column
# "ocean_proximity"; older pandas skipped such columns silently, but
# pandas >= 2.0 raises when corr() meets a non-numeric column.
corr_matrix = house_data.select_dtypes(include=[np.number]).corr()
a = corr_matrix["median_house_value"].sort_values(ascending=False)

# Ratio features derived from the raw counts.
house_data["rooms_per_households"] = house_data["total_rooms"] / house_data["households"]
house_data["bedrooms_per_room"] = house_data["total_bedrooms"] / house_data["total_rooms"]
house_data["population_per_households"] = house_data["population"] / house_data["households"]

#print(house_data.info())

# Same correlation ranking again, now including the derived ratios.
corr_matrix = house_data.select_dtypes(include=[np.number]).corr()
b = corr_matrix["median_house_value"].sort_values(ascending=False)
#print(b)


###################Prepare data####################

# Rebuild the feature frame from the training split and keep the target apart.
house_data = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
house_labels = strat_train_set["median_house_value"].copy()


# ``sklearn.preprocessing.Imputer`` was deprecated in scikit-learn 0.20 and
# later removed; ``sklearn.impute.SimpleImputer`` is its replacement.  It is
# imported under the old name so every later use of ``Imputer`` keeps working.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")
# The median strategy only applies to numbers, so drop the text column first.
house_num = house_data.drop("ocean_proximity",axis=1)
imputer.fit(house_num)
print(imputer.statistics_,imputer.strategy)
print(house_num.median().values)
X = imputer.transform(house_num)
# transform() returns a plain array; restore column names and row index.
house_tr = pd.DataFrame(X,columns=house_num.columns,index=list(house_data.index.values))
print(house_tr.info())

house_cat = house_data['ocean_proximity']
a = house_cat.value_counts()
print(a)

# Map each category label to an integer code.
house_cat_encoded,house_cat_categories = house_cat.factorize()
print(house_cat_encoded[:10])

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# The encoder expects a 2-D column, hence the reshape.
house_cat_onehot = encoder.fit_transform(house_cat_encoded.reshape(-1,1))
print(house_cat_onehot.toarray())

# from sklearn.preprocessing import LabelBinarizer
# encoder = LabelBinarizer()
# house_cat_onehot1 = encoder.fit_transform(house_cat)
# print(house_cat_onehot1.toarray())

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.preprocessing import LabelEncoder
from scipy import sparse

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as one-hot or ordinal integer arrays.

    Every column of the input is treated as one categorical feature. A
    ``LabelEncoder`` per column maps its values to integer codes, which are
    either returned directly (``'ordinal'``) or expanded into one indicator
    column per category (``'onehot'`` / ``'onehot-dense'``).

    Parameters
    ----------
    encoding : {'onehot', 'onehot-dense', 'ordinal'}, default 'onehot'
        ``'onehot'`` returns a sparse CSR matrix, ``'onehot-dense'`` the same
        data as a dense array, ``'ordinal'`` the integer codes.
    categories : 'auto' or list of array-likes, default 'auto'
        ``'auto'`` learns the categories of each column during ``fit``;
        otherwise ``categories[i]`` lists the categories of column ``i``.
    dtype : number type, default np.float64
        dtype of the transformed output.
    handle_unknown : {'error', 'ignore'}, default 'error'
        Whether an unknown category raises ``ValueError`` or is encoded as an
        all-zero row segment. ``'ignore'`` is not supported together with
        ``encoding='ordinal'``.

    Attributes
    ----------
    categories_ : list of arrays
        The sorted categories of each column, set by ``fit``.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` has an unsupported value,
            if the two are incompatible, or if explicit ``categories`` were
            given, ``handle_unknown='error'`` and X contains other values.
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending ``encoding`` value (the original formatted
            # ``handle_unknown`` here, producing a misleading message).
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # Builtin ``object`` replaces the ``np.object`` alias, which recent
        # NumPy releases no longer provide.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i, le in enumerate(self._label_encoders_):
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.isin(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # Install the user-supplied categories directly, sorted.
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using the configured encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        Raises
        ------
        ValueError
            If X contains a category not seen during ``fit`` and
            ``handle_unknown='error'``.
        """
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        # Builtin ``int`` / ``bool`` replace the removed NumPy aliases.
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.isin(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        # Offsets of each feature's first indicator column in the output.
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        # Entries masked out above (unknown categories) contribute no 1.
        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out


# One-hot encode the categorical column with the custom encoder; it works on
# 2-D input, so the 1-D values are reshaped into a single column first.
housing_cat_reshaped = house_cat.values.reshape(-1, 1)
cat_encoder = CategoricalEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
print(housing_cat_1hot.toarray())


# Standardise the imputed numeric features.
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
std = std_scaler.fit_transform(house_tr)
print(std)



from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to X.

    The source columns are located through the module-level ``*_ix``
    positions, so X must be a 2-D array indexable as ``X[:, k]``.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Stateless transformer: there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return X with the derived ratio columns appended on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households
        if not self.add_bedrooms_per_room:
            return np.c_[X, rooms_per_household, population_per_household]
        bedrooms_per_room = X[:, bedrooms_ix] / rooms
        return np.c_[X, rooms_per_household, population_per_household,
                     bedrooms_per_room]


from sklearn.base import BaseEstimator, TransformerMixin


# Scikit-Learn doesn't handle DataFrames yet, so this transformer picks the
# numerical or categorical columns and passes their values on.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the configured columns of a DataFrame and return their values."""

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` as the underlying ``.values`` array."""
        selected = X[self.attribute_names]
        return selected.values

# Column groups routed to the numeric and the categorical branch.
cat_attribs = ["ocean_proximity"]
num_attribs = list(house_num.columns)


from sklearn.pipeline import FeatureUnion, Pipeline

# Numeric branch: select -> impute medians -> add ratio features -> scale.
numeric_steps = [
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(numeric_steps)

# Categorical branch: select -> dense one-hot encoding.
categorical_steps = [
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
]
cat_pipeline = Pipeline(categorical_steps)

# Run both branches on the same frame and concatenate their outputs.
branches = [
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
]
full_pipeline = FeatureUnion(transformer_list=branches)

house_prepare = full_pipeline.fit_transform(house_data)
print(house_prepare.shape)


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a plain linear model on the prepared training features.
lin_reg = LinearRegression()
lin_reg.fit(house_prepare, house_labels)

# Spot-check: predictions for the first five rows next to their true labels.
some_data, some_labels = house_data.iloc[:5], house_labels.iloc[:5]
some_prepare = full_pipeline.transform(some_data)
print("Predictions:", lin_reg.predict(some_prepare))
print("Labels:", list(some_labels))

# Training-set error: root mean squared error first, then mean absolute error.
house_predictions = lin_reg.predict(house_prepare)
lin_mse = mean_squared_error(house_labels, house_predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

lin_mae = mean_absolute_error(house_labels, house_predictions)
print(lin_mae)

from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the same prepared features (seeded for repeatability).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(house_prepare,house_labels)

# Training-set RMSE.  Arguments are passed as (y_true, y_pred), matching the
# documented signature and the linear-regression evaluation; the squared
# error is symmetric, so the value is the same as before.
house_predictions_tree = tree_reg.predict(house_prepare)
tree_mse = mean_squared_error(house_labels,house_predictions_tree)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

# ``sklearn.cross_validation`` no longer exists in current scikit-learn;
# ``cross_val_score`` lives in ``sklearn.model_selection``, the module this
# file already uses for its splitters and grid search.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg,house_prepare,house_labels,scoring="neg_mean_squared_error",cv=10)
# The scorer returns negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(-scores)
print(tree_rmse_scores)

def displayscores(scores):
    """Print the scores, then their mean and their standard deviation."""
    # Each statistic is evaluated right before it is printed, one per line.
    report = (
        ("scores:", lambda values: values),
        ("Mean scores:", np.mean),
        ("Std scores:", np.std),
    )
    for label, statistic in report:
        print(label, statistic(scores))

displayscores(tree_rmse_scores)


from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Two sub-grids: the first varies tree count and feature count; the second
# does the same with bootstrapping switched off.  The key must be spelled
# "bootstrap" -- the original "bootstap" is not a RandomForestRegressor
# parameter, so the search rejected the grid.
param_grid = [
    {"n_estimators":[3,30,100],"max_features":[2,4,6,8]},
    {"bootstrap":[False],"n_estimators":[50,100],"max_features":[2,3,4]},
]


forest_reg = RandomForestRegressor()
# 5-fold cross-validated search scored by negated MSE (higher is better).
grid_seach = GridSearchCV(forest_reg,param_grid,cv=5,scoring="neg_mean_squared_error")
grid_seach.fit(house_prepare,house_labels)
print(grid_seach.best_params_,grid_seach.best_estimator_,grid_seach.best_score_)

# You may also like

# Reposted from blog.csdn.net/u010016056/article/details/80601536