Kaggle 等算法比赛的模型集成(Ensemble)方法(全)

第一种:加权平均(多输出回归问题)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import mode
import pandas as pd
# Submission CSVs are read from the local "input/" directory. Every file is
# expected to carry an "id" column plus the prediction columns to be blended.
import os

# (file name, weight) for each submission entering the blend. The weights start
# from a uniform 1/5 and are shifted by small hand-picked offsets; they must sum
# to 1 so the blend stays on the same scale as the individual predictions.
# To blend more files, append entries here and rebalance the weights.
WEIGHTED_SUBMISSIONS = [
    ("0.48503.csv", 1 / 5 + 0.02),
    ("0.48743.csv", 1 / 5 - 0.005),
    ("0.48748.csv", 1 / 5 - 0.005),
    ("0.48874.csv", 1 / 5 - 0.006),
    ("0.48879.csv", 1 / 5 - 0.004),
]


def blend_weighted(frames, weights):
    """Return the weighted sum of equally shaped prediction frames.

    Args:
        frames: Sequence of DataFrames holding only prediction columns
            (no "id" column). They are combined by index and column label.
        weights: One numeric weight per frame; the weights must sum to 1.

    Returns:
        A DataFrame equal to ``sum(weights[i] * frames[i])``.

    Raises:
        ValueError: If no frames are given, if the number of weights differs
            from the number of frames, or if the weights do not sum to 1.
    """
    if not frames:
        raise ValueError("at least one prediction frame is required")
    if len(frames) != len(weights):
        raise ValueError("need exactly one weight per prediction frame")
    if not np.isclose(sum(weights), 1.0):
        raise ValueError("ensemble weights must sum to 1")

    # Accumulate left to right so the floating-point result matches the
    # hand-written expression w1*df1 + w2*df2 + ...
    blended = weights[0] * frames[0]
    for weight, frame in zip(weights[1:], frames[1:]):
        blended = blended + weight * frame
    return blended


def _read_weighted_inputs(input_dir, names):
    """Load the named CSVs; return the shared id column and the prediction frames."""
    ids = None
    frames = []
    for name in names:
        frame = pd.read_csv(os.path.join(input_dir, name))
        if ids is None:
            ids = frame[["id"]]
        elif not frame["id"].equals(ids["id"]):
            # Rows are blended by position, so differing ids would silently
            # mix predictions that belong to different samples.
            raise ValueError(f"{name}: 'id' column does not match the first submission")
        frames.append(frame.drop(columns=["id"]))
    return ids, frames


def run_weighted_blend(input_dir="input", output_path="submission5_3.csv"):
    """List the input files, blend the configured submissions and write the CSV.

    Args:
        input_dir: Directory containing the submission files.
        output_path: Destination of the blended submission.

    Returns:
        The DataFrame that was written (ids followed by blended predictions).
    """
    np.set_printoptions(threshold=np.inf)
    for dirname, _, filenames in os.walk(input_dir):
        for filename in filenames:
            print(os.path.join(dirname, filename))

    names = [name for name, _ in WEIGHTED_SUBMISSIONS]
    weights = [weight for _, weight in WEIGHTED_SUBMISSIONS]
    id_col, frames = _read_weighted_inputs(input_dir, names)

    ensemble = blend_weighted(frames, weights)
    predictions = pd.concat([id_col, ensemble], axis=1)
    predictions.to_csv(output_path, index=False)
    print(predictions)
    return predictions


if __name__ == "__main__":
    run_weighted_blend()

第二种:多数投票(单输出分类问题)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.stats import mode
import pandas as pd
# Submission CSVs are read from the local "input/" directory. Every file is
# expected to carry an "id" column plus a single predicted-label column.
import os

# Submissions that take part in the vote, in the order they are stacked.
VOTE_SUBMISSIONS = [
    "0.96523.csv",
    "0.96334.csv",
    "0.96531.csv",
    "0.96487.csv",
    "0.95937.csv",
    "0.96620.csv",
    "0.94596.csv",
    "0.96552.csv",
    "0.95489.csv",
    "0.95532.csv",
    "0.95487.csv",
    "0.95914.csv",
    "0.95330.csv",
    "0.96703.csv",
    "0.96112.csv",
    "0.96739.csv",
    "0.96671.csv",
    "0.95637.csv",
    "0.96511.csv",
    "0.95959.csv",
    "0.95554.csv",
]


def majority_vote(frames):
    """Return the most frequent label per row across the given prediction frames.

    Args:
        frames: Sequence of DataFrames holding only the numeric label column
            (no "id" column), all with the same number of rows.

    Returns:
        A 1-D ``np.int64`` array with one winning label per row. Ties resolve
        to the smallest label, which is what ``scipy.stats.mode`` returns.
    """
    votes = pd.concat(frames, axis=1).to_numpy()
    # One vectorised call instead of a Python loop over rows. Depending on the
    # SciPy version the result is shaped (n,) or (n, 1); np.ravel accepts both,
    # unlike the old ``mode(row)[0][0]`` indexing, which fails once ``mode``
    # returns a scalar for 1-D input.
    winners = mode(votes, axis=1)[0]
    return np.ravel(winners).astype(np.int64)


def _read_vote_inputs(input_dir, names):
    """Load the named CSVs; return the shared id column and the label frames."""
    ids = None
    frames = []
    for name in names:
        frame = pd.read_csv(os.path.join(input_dir, name))
        if ids is None:
            ids = frame[["id"]]
        elif not frame["id"].equals(ids["id"]):
            # Votes are counted by row position, so differing ids would
            # silently combine labels that belong to different samples.
            raise ValueError(f"{name}: 'id' column does not match the first submission")
        frames.append(frame.drop(columns=["id"]))
    return ids, frames


def run_majority_vote(input_dir="input", output_path="submission.csv"):
    """List the input files, take the row-wise majority label and write the CSV.

    Args:
        input_dir: Directory containing the submission files.
        output_path: Destination of the voted submission.

    Returns:
        The DataFrame that was written, with columns "id" and "label".
    """
    np.set_printoptions(threshold=np.inf)
    for dirname, _, filenames in os.walk(input_dir):
        for filename in filenames:
            print(os.path.join(dirname, filename))

    id_col, frames = _read_vote_inputs(input_dir, VOTE_SUBMISSIONS)
    print(frames[0].head())  # example of one input's label column

    predictions = id_col.assign(label=majority_vote(frames))
    print(predictions)
    predictions.to_csv(output_path, index=False)
    return predictions


if __name__ == "__main__":
    run_majority_vote()

第三种:简单平均(单变量回归问题)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# from scipy import stats
import pandas as pd
# Submission CSVs are read from the local "input/" directory. Every file is
# expected to carry an "id" column plus a single numeric prediction column.
import os

# Submissions that are averaged with equal weight.
MEAN_SUBMISSIONS = [
    "0.9240_submission.csv",
    "0.9240.csv",
    "0.9247.csv",
    "0.9344.csv",
    "0.9383.csv",
    "0.9354.csv",
    "0.9404_submission.csv",
]


def mean_blend(frames):
    """Return the row-wise arithmetic mean across the given prediction frames.

    Args:
        frames: Sequence of DataFrames holding only the numeric prediction
            column (no "id" column), all with the same number of rows.

    Returns:
        A 1-D ``np.float64`` array with one averaged prediction per row.
    """
    stacked = pd.concat(frames, axis=1)
    # A single vectorised reduction replaces the per-row Python loop.
    return stacked.mean(axis=1).to_numpy(dtype=np.float64)


def _read_mean_inputs(input_dir, names):
    """Load the named CSVs; return the shared id column and the prediction frames."""
    ids = None
    frames = []
    for name in names:
        frame = pd.read_csv(os.path.join(input_dir, name))
        if ids is None:
            ids = frame[["id"]]
        elif not frame["id"].equals(ids["id"]):
            # Rows are averaged by position, so differing ids would silently
            # mix predictions that belong to different samples.
            raise ValueError(f"{name}: 'id' column does not match the first submission")
        frames.append(frame.drop(columns=["id"]))
    return ids, frames


def run_mean_blend(input_dir="input", output_path="submission.csv"):
    """List the input files, average the submissions and write the CSV.

    Args:
        input_dir: Directory containing the submission files.
        output_path: Destination of the averaged submission.

    Returns:
        The DataFrame that was written, with columns "id" and "toxic".
    """
    np.set_printoptions(threshold=np.inf)
    for dirname, _, filenames in os.walk(input_dir):
        for filename in filenames:
            print(os.path.join(dirname, filename))

    id_col, frames = _read_mean_inputs(input_dir, MEAN_SUBMISSIONS)
    print(frames[0].head())  # example of one input's prediction column

    predictions = id_col.assign(toxic=mean_blend(frames))
    print(predictions)
    predictions.to_csv(output_path, index=False)
    return predictions


if __name__ == "__main__":
    run_mean_blend()

猜你喜欢

转载自blog.csdn.net/qq_39867051/article/details/106458918