Machine Learning in Practice: Logistic Regression for User Churn Prediction
import numpy as np
# Load both CSV files as arrays of strings so the text columns survive parsing.
# The builtin `str` replaces the `np.str` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (it raises AttributeError there).
train_data = np.genfromtxt('Churn-Modelling.csv', delimiter=',', dtype=str)
test_data = np.genfromtxt('Churn-Modelling-Test-Data.csv', delimiter=',', dtype=str)

# Skip the first row (presumably the CSV header -- confirm against the files).
# Every column except the last is a feature; the last column is the integer label.
x_train = train_data[1:, :-1]
y_train = train_data[1:, -1].astype(int)
x_test = test_data[1:, :-1]
y_test = test_data[1:, -1].astype(int)

# Drop the first three feature columns (presumably identifier-like columns
# with no predictive value -- TODO confirm against the CSV header).
x_train = np.delete(x_train, [0, 1, 2], axis=1)
x_test = np.delete(x_test, [0, 1, 2], axis=1)

# Notebook-style previews of the first five rows; no effect when run as a script.
x_train[:5]
y_train[:5]
# Encode the text columns as integers. A manual alternative would be to
# overwrite the 'Female' / 'Male' strings with 0 / 1 directly; LabelEncoder
# is used instead.
from sklearn.preprocessing import LabelEncoder

# Each encoder learns its classes from the training column only and is then
# applied unchanged to the matching test column.
labelencoder1 = LabelEncoder().fit(x_train[:, 1])
x_train[:, 1] = labelencoder1.transform(x_train[:, 1])
x_test[:, 1] = labelencoder1.transform(x_test[:, 1])

labelencoder2 = LabelEncoder().fit(x_train[:, 2])
x_train[:, 2] = labelencoder2.transform(x_train[:, 2])
x_test[:, 2] = labelencoder2.transform(x_test[:, 2])

# Cast features and labels to float32 for the steps that follow.
x_train, x_test = x_train.astype(np.float32), x_test.astype(np.float32)
y_train, y_test = y_train.astype(np.float32), y_test.astype(np.float32)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both the training and the test features.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)
# LinearRegression is no longer used below; the import is retained only in
# case code outside this section still references the name.
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
# `classification_report` is the function called below; the original imported
# the `classification` submodule instead, leaving the function undefined.
from sklearn.metrics import classification_report

# The label is a class, so fit a logistic-regression classifier.
# LinearRegression.predict returns continuous values, which
# classification_report rejects; LogisticRegression.predict returns labels.
LR = LogisticRegression()
LR.fit(x_train, y_train)
predictions = LR.predict(x_test)
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test, predictions))
Machine Learning in Practice: Logistic Regression for a Diabetes Prediction Model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset and preview the first rows.
diabetes_data = pd.read_csv('diabetes.csv')
diabetes_data.head()

# Column dtypes and non-null counts.
diabetes_data.info(verbose=True)

# Summary statistics for each column.
diabetes_data.describe()

# (rows, columns) of the dataset.
diabetes_data.shape

# Label distribution: count how many samples fall in each Outcome class.
outcome_counts = diabetes_data['Outcome'].value_counts()
print(outcome_counts)

# Show the same per-class counts as a bar chart.
p = outcome_counts.plot(kind="bar")
plt.show()

# Visualise the feature distributions and pairwise relationships,
# coloured by Outcome.
p = sns.pairplot(diabetes_data, hue='Outcome')
plt.show()
The plots drawn here are of two types: histograms and scatter plots. A histogram shows the distribution of a single feature, while a scatter plot shows the relationship between two different features. Inspecting the distributions reveals some abnormal values: Glucose, BloodPressure, SkinThickness, Insulin, and BMI (body mass index) should never be zero, so zeros in these columns are treated as missing values below.
# Third-party dependency: pip install missingno
import missingno as msno

# Replace the zeros in glucose, blood pressure, skin thickness, insulin and
# body mass index with NaN so they are treated as missing values.
colume = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_data[colume] = diabetes_data[colume].replace(0, np.nan)

# Bar chart of the non-missing count per column.
p = msno.bar(diabetes_data)
plt.show()

# Threshold: a column must keep at least 80% of the rows as non-missing values.
row_count = diabetes_data.shape[0]
thresh_count = row_count * 0.8

# Any column in which more than 20% of the values are missing is dropped.
diabetes_data = diabetes_data.dropna(axis=1, thresh=thresh_count)

# Re-plot to see which columns remain.
p = msno.bar(diabetes_data)
plt.show()
# Import the imputation utility. `sklearn.preprocessing.Imputer` was
# deprecated in scikit-learn 0.20 and removed in 0.22; `SimpleImputer` from
# `sklearn.impute` is its replacement.
from sklearn.impute import SimpleImputer

# For numeric variables, fill the missing values with the column mean.
# SimpleImputer always imputes column-wise (the old `axis=0`) and expects the
# real `np.nan` marker rather than the string 'NaN'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
colume = ['Glucose', 'BloodPressure', 'BMI']

# Run the imputation and write the filled values back into the frame.
diabetes_data[colume] = imr.fit_transform(diabetes_data[colume])

# Re-plot the non-missing counts to confirm the gaps are filled.
p = msno.bar(diabetes_data)
plt.show()
# Pairwise correlation coefficients between the columns.
corr_matrix = diabetes_data.corr()

# Draw them as a heat map on a 12x10 figure; each cell is annotated with the
# correlation coefficient of the two variables.
plt.figure(figsize=(12, 10))
p = sns.heatmap(corr_matrix, annot=True)
plt.show()
from sklearn.model_selection import train_test_split

# Separate the data into features x (every column except Outcome) and label y.
y = diabetes_data["Outcome"]
x = diabetes_data.drop(columns="Outcome")

# Hold out 30% of the rows as the test set. stratify=y keeps the class
# proportions of y the same in the training and test sets: if 0 and 1 occur
# 1:2 in y, they also occur 1:2 in both y_train and y_test.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a logistic-regression classifier with default settings on the training split.
LR = LogisticRegression().fit(x_train, y_train)

# Predict the test split and print per-class precision, recall and F1.
predictions = LR.predict(x_test)
report = classification_report(y_test, predictions)
print(report)