# ================ Titanic passenger survival rate ================
# Flat script: load the Kaggle Titanic train/test CSVs, build three numeric
# features (Pclass, IsFemale, Age), fit logistic-regression models and report
# the training accuracy and 4-fold cross-validation scores.

# Load packages
import pandas as pd
from sklearn.linear_model import LogisticRegression  # logistic regression model
from sklearn.linear_model import LogisticRegressionCV  # searches the regularisation parameter C by CV
from sklearn.model_selection import cross_val_score  # cross-validation

# File locations
fil_tr = r"E:\python Data\happytry\Kaggle_Titanic-master\train.csv"
fil_te = r"E:\python Data\happytry\Kaggle_Titanic-master\test.csv"

# Load data
train = pd.read_csv(fil_tr)
test = pd.read_csv(fil_te)

# ============================ pandas preprocessing ============================
# Inspect the data (printed explicitly: bare expressions show nothing when this
# file is run as a script instead of typed into an interactive session).
print(train[:4])

## statsmodels and scikit-learn generally cannot accept missing data,
## so check how many values are missing per column.
print(train.isnull().sum())
print(test.isnull().sum())

# Predict survival from age (plus class and sex)
## Fill missing ages. The training median is used for BOTH frames so the test
## features are transformed with statistics learned from training data only.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

## Encode sex as a 0/1 indicator
train['IsFemale'] = (train['Sex'] == 'female').astype(int)
test['IsFemale'] = (test['Sex'] == 'female').astype(int)

# ==================== convert to numpy and feed the models ====================
predictors = ['Pclass', 'IsFemale', 'Age']

## Convert to numpy arrays
x_train = train[predictors].values
x_test = test[predictors].values
y_train = train['Survived'].values

# ===== Build the logistic model =====
model = LogisticRegression()
model.fit(x_train, y_train)

# Predict
## Predictions on the training set
y_predict = model.predict(x_train)
## Predictions on the test set
y_predict1 = model.predict(x_test)

# Fraction of training rows predicted correctly. This is the training
# ACCURACY, not the error rate (the error rate would be 1 - this value).
train_accuracy = (train['Survived'] == y_predict).mean()
print(train_accuracy)

# LogisticRegressionCV searches over candidate values of the regularisation
# parameter C; Cs controls the granularity of that search. It is passed by
# keyword because scikit-learn estimator constructor arguments are
# keyword-only in current releases (a positional 10 raises TypeError).
model_cv = LogisticRegressionCV(Cs=10)
model_cv.fit(x_train, y_train)

# Cross-validate a model on four non-overlapping splits of the training data.
# cross_val_score fits its own copies, so `model` is rebound here to a fresh,
# unfitted estimator with C=10.
model = LogisticRegression(C=10)
scores = cross_val_score(model, x_train, y_train, cv=4)  # 4-fold cross-validation
print(scores)
# 简单Logistic回归_简单交叉验证
# 猜你喜欢
# 转载自blog.csdn.net/scc_hy/article/details/80145407
# 今日推荐
# 周排行