numpy
# Build a one-dimensional array from a Python list.
a = np.array([2, 0, 1, 5])
# Slice out the first three elements.
a[:3]
# Compare data against u'好' (element-wise when data is a NumPy array or pandas object).
data == u'好'
# Boolean-mask indexing: keep only the entries equal to u'是'.
data[data == u'是']
b= np.array([[1, 2, 3], [4, 5, 6]]) # create a two-dimensional array
x = np.linspace(0, 10, 1000) # independent variable for plotting: 1000 evenly spaced points from 0 to 10
# Values from 1 up to (but excluding) 9 in steps of 0.25.
np.arange(1, 9, 0.25)
from numpy.random import shuffle # import the random shuffle function
shuffle(data) # randomly reorder the data
data_train = data[:int(0.8*len(data)), :] # first 80% of the rows become the training data
data_test = data[int(0.8*len(data)):, :] # the remaining last 20% of the rows become the test data
pandas
# Series with explicit index labels.
s = pd.Series([1,2,3], index=['a', 'b', 'c'])
d = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns = ['a', 'b', 'c']) # create a table (DataFrame)
d2 = pd.DataFrame(s) # a table can also be created from an existing Series
# Reading data from CSV / Excel files.
data = pd.read_csv(datafile,encoding='utf-8')
pd.read_excel('data.xls')
pd.read_csv('data.csv', encoding = 'utf-8')
# Use the u'日期' column as the row index.
data = pd.read_excel(catering_sale, index_col = u'日期')
data['w']# column w, returned as a Series
data[['w']]# column w, returned as a DataFrame
data[1:3]# the 2nd and 3rd rows (start inclusive, end exclusive)
data.iat[1,1]# value at the second row, second column
data[i][j] # column i, row j (note the array-style notation); can be used to pick out certain data by a condition
data[int(len(data)*p):,:]# rows from position int(len(data)*p) onward, all columns
# NOTE(review): the 2-D slice above is NumPy-array-style indexing; on a DataFrame
# this presumably needs .iloc — confirm what type data has at this point.
data.loc['a',['w','x']]# row a, columns w and x
data.head()# the first few rows
data.tail()# the last few rows
data.iloc[-1]# the last row, returned as a Series
data.iloc[-1:]# the last row, returned as a DataFrame
pd.DataFrame(cm_train, index = range(1, 6), columns = range(1, 6))# set the row and column labels (1..5 on both axes)
# Write to Excel without the index column.
data_processed.to_excel(transformeddata, index = False)
data_zs = 1.0*(data - data.mean())/data.std() # standardise the data (subtract the mean, divide by the standard deviation)
# Pairwise correlation matrix of all columns.
a=data.corr()
# Correlations of every column with the u'百合酱蒸凤爪' column.
b=data.corr()[u'百合酱蒸凤爪']
# Correlation between two specific columns.
c=data[u'百合酱蒸凤爪'].corr(data[u'翡翠蒸香茜饺'])
(data - data.min())/(data.max() - data.min()) # min-max normalisation
(data - data.mean())/data.std() # zero-mean normalisation
data/10**np.ceil(np.log10(data.abs().max())) # decimal-scaling normalisation
- 计算基本统计量(count mean std min 25% 50% 75% max )
# Start from the summary table produced by describe() and append derived rows.
statistics = data.describe()
statistics.loc['range'] = statistics.loc['max']-statistics.loc['min'] # range: max - min
statistics.loc['var'] = statistics.loc['std']/statistics.loc['mean'] # std / mean, i.e. the coefficient of variation (not the variance, despite the 'var' label)
statistics.loc['dis'] = statistics.loc['75%']-statistics.loc['25%'] # interquartile range: 75% - 25%
# Flag rows whose difference from the previous row in u'发生时间' exceeds ts (ts is defined elsewhere).
d = data[u'发生时间'].diff() > ts
matplotlib
- 设置图像大小,x,y轴名称,标题,显示范围,显示图例
plt.figure(figsize = (8, 4)) # create a figure with the given size
plt.xlabel('Time(s) ') # x-axis label
plt.ylabel('Volt') # y-axis label
plt.title('A Simple Example') # figure title
plt.ylim(0, 2.2) # visible range of the y axis
plt.legend() # show the legend
plt.show() # display the figure
# Raw strings keep the LaTeX backslashes literal; '\s' and '\c' in ordinary
# string literals are invalid escape sequences that newer Python versions warn about.
plt.plot(x, y, label=r'$\sin x+1$', color='red', linewidth=2)  # plot with a label, line colour and line width
plt.plot(x, z, 'b--', label=r'$\cos x^2+1$')  # plot with a label and a line style
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False # keep the minus sign rendering correctly
# Draw a box plot and label every outlier with its value.
plt.figure()
p = data.boxplot(return_type='dict')
x = p['fliers'][0].get_xdata()  # 'fliers' is the key holding the outlier markers
y = p['fliers'][0].get_ydata()
y.sort()  # ascending order; this method modifies the object in place
for i in range(len(x)):
    if i > 0:
        # Shift the label sideways by an amount that grows as the gap to the
        # previous outlier shrinks, so labels of close values do not overlap.
        plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.05 -0.8/(y[i]-y[i-1]),y[i]))
    else:
        plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.08,y[i]))
# Label the point (6, p[6]) with p[6] formatted as a percentage with four decimals,
# placing the text at 90% of the point's coordinates and joining them with a curved arrow.
plt.annotate(format(p[6], '.4%'), xy = (6, p[6]), xytext=(6*0.9, p[6]*0.9), arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))
# Bar chart of data.
data.plot(kind='bar')
# Plot p in red with a line-and-circle style, using a secondary y axis.
p.plot(color = 'r', secondary_y = True, style = '-o',linewidth = 2)
def density_plot(data):
    """Draw kernel-density subplots for ``data`` and return pyplot.

    Args:
        data: Object whose ``plot`` method accepts ``kind='kde'`` and
            ``subplots=True`` (presumably a pandas DataFrame — confirm
            against the caller).

    Returns:
        The ``matplotlib.pyplot`` module, so the caller can go on to show
        or save the figure.
    """
    import matplotlib.pyplot as plt

    plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese axis label
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign rendering correctly
    axes = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)
    # The earlier version looped over range(k) with k undefined in this
    # function; labelling every returned subplot needs no outside name.
    for axis in axes:
        axis.set_ylabel(u'密度')
    plt.legend()
    return plt
sklearn
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
# NOTE(review): RandomizedLogisticRegression is not available in recent
# scikit-learn releases — confirm the installed version provides it.
rlr = RLR()  # randomized logistic regression, used here to select features
rlr.fit(x, y)
rlr.get_support()  # mask of the features that were kept
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x = data[data.columns[rlr.get_support()]].values
lr = LR()  # plain logistic regression trained on the selected features
lr.fit(x, y)
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))
from sklearn.decomposition import PCA
pca = PCA()
# Fit the PCA model to data.
b=pca.fit(data)
a=pca.components_ # the model's component (feature) vectors
pca.explained_variance_ratio_ # share of the variance explained by each component
svm
from sklearn import svm
model = svm.SVC()
model.fit(x_train, y_train)
import pickle
# Persist the trained model; the context manager closes the file even if
# pickle.dump raises, instead of leaving the handle open.
with open('../tmp/svm.model', 'wb') as model_file:
    pickle.dump(model, model_file)
scipy
from scipy.io import loadmat # .mat is MATLAB's own file format; loadmat is needed to read it
mat = loadmat(inputfile)
signal = mat['leleccum'][0] # element 0 of the 'leleccum' entry in the loaded file
keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
# Build, train and apply a small binary classifier.
model = Sequential()
# NOTE(review): input_shape is given a bare int here; Keras normally expects a
# tuple such as (17,) — confirm against the Keras version in use.
model.add(Dense(11, input_shape = 17))
model.add(Activation('relu'))
# NOTE(review): Dense(17, 10) and Dense(10, 1) look like the legacy positional
# (input_dim, output_dim) form; under that reading the first layer above does
# not line up with them — TODO confirm the intended layer sizes.
model.add(Dense(17, 10))
model.add(Activation('relu'))
model.add(Dense(10, 1))
model.add(Activation('sigmoid')) # sigmoid output to pair with the binary cross-entropy loss below
# NOTE(review): class_mode, nb_epoch and predict_classes appear to be
# legacy-Keras names — verify they exist in the installed version.
model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
model.fit(x_train, y_train, nb_epoch = 100, batch_size = 1)
model.save_weights('../tmp/net.model') # saves the weights only
r = pd.DataFrame(model.predict_classes(x_test), columns = [u'预测结果'])
# Put the first five columns of data_test next to the predictions and write them to Excel.
pd.concat([data_test.iloc[:,:5], r], axis = 1).to_excel(testoutputfile)