import numpy as np
import pandas as pd
import tushare as ts
from sklearn import svm
from sklearn.linear_model import LogisticRegression

try:
    # scikit-learn >= 0.18 ships train_test_split in sklearn.model_selection.
    from sklearn import model_selection
except ImportError:
    # Older releases only provide the legacy module.
    from sklearn import cross_validation
    model_selection = cross_validation
else:
    # sklearn.cross_validation was removed in 0.20; keep the old name usable.
    cross_validation = model_selection
# Download the price history of stock code '600000' for the given date range
# and preview the first and last rows of the returned table.
data = ts.get_k_data('600000', start='2007-01-01', end='2018-04-13')
for preview in (data.head(), data.tail()):
    print(preview)
date open close high low volume code
0 2007-01-04 3.702 3.670 4.009 3.413 508894.23 600000
1 2007-01-05 3.670 3.562 3.670 3.338 357055.86 600000
2 2007-01-08 3.557 3.634 3.708 3.525 254888.76 600000
3 2007-01-09 3.596 3.897 3.916 3.591 329619.46 600000
4 2007-01-10 3.904 4.041 4.152 3.904 352768.36 600000
date open close high low volume code
2686 2018-04-09 11.53 11.50 11.59 11.49 167224.0 600000
2687 2018-04-10 11.52 11.77 11.79 11.51 287482.0 600000
2688 2018-04-11 11.79 11.91 12.02 11.75 312985.0 600000
2689 2018-04-12 11.91 11.78 11.96 11.76 188242.0 600000
2690 2018-04-13 11.83 11.69 11.89 11.69 140948.0 600000
# Download the same date range for code '000001', requested as a market index
# (index=True), and preview the first and last rows.
data_SZ_index = ts.get_k_data(
    '000001', index=True, start='2007-01-01', end='2018-04-13')
for preview in (data_SZ_index.head(), data_SZ_index.tail()):
    print(preview)
date open close high low volume code
0 2007-01-04 2728.19 2715.72 2847.61 2684.82 120156000.0 sh000001
1 2007-01-05 2668.58 2641.33 2685.80 2617.02 106156000.0 sh000001
2 2007-01-08 2621.07 2707.20 2708.44 2620.62 106813000.0 sh000001
3 2007-01-09 2711.05 2807.80 2809.39 2691.36 110751000.0 sh000001
4 2007-01-10 2838.11 2825.58 2841.74 2770.99 111769000.0 sh000001
date open close high low volume code
2738 2018-04-09 3125.44 3138.29 3146.09 3110.30 139608621.0 sh000001
2739 2018-04-10 3144.26 3190.32 3190.65 3139.08 168201359.0 sh000001
2740 2018-04-11 3197.37 3208.08 3220.85 3191.59 175867197.0 sh000001
2741 2018-04-12 3203.28 3180.16 3205.25 3177.05 148231313.0 sh000001
2742 2018-04-13 3192.04 3159.05 3197.90 3155.51 127552310.0 sh000001
# Row counts of the two tables. Author's note: a stock can have
# trading-suspension days while the index does not, so the counts may differ.
num_data, num_SZ_index = len(data), len(data_SZ_index)
print(num_data, num_SZ_index)
(2691, 2743)
from datetime import datetime

# Parse the 'YYYY-MM-DD' date strings into datetime values with one
# vectorized call instead of calling strptime once per row.
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
# A bare expression is only displayed in an interactive session; print it so
# the preview also appears when the file runs as a script.
print(data['date'].head())
0 2007-01-04
1 2007-01-05
2 2007-01-08
3 2007-01-09
4 2007-01-10
Name: date, dtype: datetime64[ns]
# Parse the index date strings into datetime values (vectorized) so they can
# be compared against the stock's date column.
data_SZ_index['date'] = pd.to_datetime(
    data_SZ_index['date'], format='%Y-%m-%d')

# Align the index with the stock: keep only the index rows whose date also
# appears in the stock table.
subdata_SZ_index = data_SZ_index[data_SZ_index['date'].isin(data['date'])]

# The code that follows pairs the two tables row by row, so fail loudly here
# if the alignment did not produce exactly one index row per stock row.
if len(subdata_SZ_index) != len(data):
    raise ValueError(
        'index/stock alignment failed: %d index rows vs %d stock rows'
        % (len(subdata_SZ_index), len(data)))

# Filtering leaves gaps in the index table's row labels, so take plain
# positional arrays instead of label-indexed columns.
sub_index_open = subdata_SZ_index['open'].values
sub_index_close = subdata_SZ_index['close'].values
# Daily up/down flags: 1 when the close is at or above the open, otherwise 0.
#   col_index -- flag for the (date-aligned) market index, used as a feature
#   y         -- flag for the stock itself, used as the label
data_open = data['open']
data_close = data['close']

# Compare whole columns at once. The previous loop iterated a hard-coded
# xrange(2691), which only exists on Python 2 and silently goes wrong as soon
# as the downloaded date range (and therefore the row count) changes.
# Plain lists are kept so later slicing/stacking code sees the same types.
col_index = (
    np.asarray(sub_index_close) >= np.asarray(sub_index_open)
).astype(int).tolist()
y = (data_close.values >= data_open.values).astype(int).tolist()
# Feature matrix: the five price/volume columns of the stock.
# `.values` is used because DataFrame.as_matrix() was removed in pandas 1.0.
x_data = data[['open', 'close', 'high', 'low', 'volume']].values
# Append the market index's up/down flag as an extra feature column.
x = np.c_[x_data, col_index]

data_shape = x.shape
data_rows = data_shape[0]
data_cols = data_shape[1]

# Per-column maximum and minimum, needed for min-max scaling.
data_col_max = x.max(axis=0)
data_col_min = x.min(axis=0)
print(data_col_max,data_col_min)
(array([ 1.36700000e+01, 1.37600000e+01, 1.40200000e+01,
1.35300000e+01, 1.19802410e+07, 1.00000000e+00]), array([ 2.42500000e+00, 2.47000000e+00, 2.65400000e+00,
2.41600000e+00, 2.89912100e+04, 0.00000000e+00]))
# Min-max scale every column of x into [0, 1], in place.
# One broadcast expression replaces the element-by-element double loop, which
# relied on the Python 2-only xrange.
# NOTE(review): the column extremes come from the full data set, i.e. they are
# computed before any train/test split, so test rows influence the scaling.
col_range = data_col_max - data_col_min
# A constant column would otherwise divide 0 by 0; dividing by 1 instead maps
# such a column to 0.
col_range = np.where(col_range == 0, 1, col_range)
x[:] = (x - data_col_min) / col_range
print(x[0:2])
[[ 0.11356158 0.10628875 0.1192152 0.08970668 0.04015505 0. ]
[ 0.11071587 0.09672276 0.08938941 0.08295843 0.02745024 0. ]]
# Pair each day's features with the NEXT day's label: drop the first label and
# the last feature row. Open-ended slices replace the hard-coded row counts so
# the shift stays correct for any amount of downloaded data.
y = y[1:]
x = x[:-1]
# Two classifiers evaluated side by side on the same random splits.
clf1 = svm.SVC(kernel='rbf')
clf2 = LogisticRegression()
result1 = []  # accuracy of clf1 (SVM) per round
result2 = []  # accuracy of clf2 (logistic regression) per round

for i in range(5):
    # Split x and y into 80% training / 20% test data.
    # model_selection replaces sklearn.cross_validation, which was removed in
    # scikit-learn 0.20.
    # NOTE(review): train_test_split shuffles rows by default, so for this
    # time series the training set can contain days later than test days --
    # consider a chronological split.
    x_train, x_test, y_train, y_test = \
        model_selection.train_test_split(x, y, test_size=0.2)
    # Explicit array so the element-wise comparison below does not depend on
    # list-vs-ndarray coercion.
    y_test = np.asarray(y_test)

    for clf, scores in ((clf1, result1), (clf2, result2)):
        # Fit on the training split, then record the fraction of test labels
        # that the model predicts correctly.
        clf.fit(x_train, y_train)
        scores.append(np.mean(y_test == clf.predict(x_test)))

print("svm classifier accuacy:")
print(result1)
print("LogisticRegression classifier accuacy:")
print(result2)
svm classifier accuacy:
[0.53345724907063197, 0.53717472118959109, 0.55018587360594795, 0.51115241635687736, 0.54460966542750933]
LogisticRegression classifier accuacy:
[0.55390334572490707, 0.55576208178438657, 0.53717472118959109, 0.52416356877323422, 0.53903345724907059]
The main improvement in this code is adding the market index's daily rise or fall as a model feature, and predicting a stock's rise or fall on the next trading day from its data on the current day. SVM and logistic regression turn out to perform similarly, at roughly 51-56% accuracy in the runs above.