Improvement of stock forecasting model

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import tushare as ts
from sklearn import cross_validation
# Download the K-line history for code '600000' over the study period via
# tushare, then show the first and last rows as a quick sanity check.
data = ts.get_k_data(
    '600000',
    start='2007-01-01',
    end='2018-04-13'
)
print(data.head())
print(data.tail())
         date   open  close   high    low     volume    code
0  2007-01-04  3.702  3.670  4.009  3.413  508894.23  600000
1  2007-01-05  3.670  3.562  3.670  3.338  357055.86  600000
2  2007-01-08  3.557  3.634  3.708  3.525  254888.76  600000
3  2007-01-09  3.596  3.897  3.916  3.591  329619.46  600000
4  2007-01-10  3.904  4.041  4.152  3.904  352768.36  600000
            date   open  close   high    low    volume    code
2686  2018-04-09  11.53  11.50  11.59  11.49  167224.0  600000
2687  2018-04-10  11.52  11.77  11.79  11.51  287482.0  600000
2688  2018-04-11  11.79  11.91  12.02  11.75  312985.0  600000
2689  2018-04-12  11.91  11.78  11.96  11.76  188242.0  600000
2690  2018-04-13  11.83  11.69  11.89  11.69  140948.0  600000
# Download the market index with code '000001' (index=True selects the
# index rather than a stock) for the same date range, then show the first
# and last rows.
data_SZ_index = ts.get_k_data(
    '000001',
    index=True,
    start='2007-01-01',
    end='2018-04-13'
)
print(data_SZ_index.head())
print(data_SZ_index.tail())
         date     open    close     high      low       volume      code
0  2007-01-04  2728.19  2715.72  2847.61  2684.82  120156000.0  sh000001
1  2007-01-05  2668.58  2641.33  2685.80  2617.02  106156000.0  sh000001
2  2007-01-08  2621.07  2707.20  2708.44  2620.62  106813000.0  sh000001
3  2007-01-09  2711.05  2807.80  2809.39  2691.36  110751000.0  sh000001
4  2007-01-10  2838.11  2825.58  2841.74  2770.99  111769000.0  sh000001
            date     open    close     high      low       volume      code
2738  2018-04-09  3125.44  3138.29  3146.09  3110.30  139608621.0  sh000001
2739  2018-04-10  3144.26  3190.32  3190.65  3139.08  168201359.0  sh000001
2740  2018-04-11  3197.37  3208.08  3220.85  3191.59  175867197.0  sh000001
2741  2018-04-12  3203.28  3180.16  3205.25  3177.05  148231313.0  sh000001
2742  2018-04-13  3192.04  3159.05  3197.90  3155.51  127552310.0  sh000001
# Compare the row counts of the two frames: a stock can be suspended from
# trading on some days, but the index never is, so the index has more rows.
num_data, num_SZ_index = len(data), len(data_SZ_index)
print(num_data,num_SZ_index)
(2691, 2743)
from datetime import datetime
# Convert the 'YYYY-MM-DD' strings of the stock's date column to
# datetime64 values with one vectorized pandas call instead of calling
# strptime once per row in a Python-level loop.
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data['date'].head()
0   2007-01-04
1   2007-01-05
2   2007-01-08
3   2007-01-09
4   2007-01-10
Name: date, dtype: datetime64[ns]
# Convert the index's 'YYYY-MM-DD' date strings to datetime64 values with
# one vectorized pandas call instead of a per-row strptime loop.
data_SZ_index['date'] = pd.to_datetime(data_SZ_index['date'], format='%Y-%m-%d')
# Align the two data sets: keep only the index rows whose date also
# appears in the stock data (days on which the stock actually traded).
subdata_SZ_index=data_SZ_index[data_SZ_index['date'].isin(data['date'])]
# The alignment drops some index rows, so the frame's row labels are no
# longer contiguous; take the raw value arrays so that later code can
# address them by position.
sub_index_open=subdata_SZ_index['open'].values
sub_index_close=subdata_SZ_index['close'].values
data_open=data['open']
data_close=data['close']

# Daily up/down flags: 1 when the close is at or above the open, else 0.
#   col_index -- flag for the (date-aligned) market index
#   y         -- flag for the stock itself (the prediction target)
# The comparison is vectorized, so it no longer depends on a hard-coded
# row count or on the Python 2-only xrange. The stock columns are compared
# through .values (by position) rather than through row labels.
# Both results are kept as plain lists of ints, as before.
col_index=(sub_index_close>=sub_index_open).astype(int).tolist()
y=(data_close.values>=data_open.values).astype(int).tolist()
# Raw feature matrix built from the stock's price/volume columns.
# .values replaces DataFrame.as_matrix(), which is deprecated and has been
# removed from recent pandas releases; .values returns the same ndarray.
x_data=data[['open','close','high','low','volume']].values
# Append the market index's up/down flag as an extra feature column.
x=np.c_[x_data,col_index]
data_shape=x.shape
data_rows,data_cols=data_shape
# Per-column extremes, used afterwards for min-max normalization.
data_col_max=x.max(axis=0)
data_col_min=x.min(axis=0)
print(data_col_max,data_col_min)
(array([  1.36700000e+01,   1.37600000e+01,   1.40200000e+01,
         1.35300000e+01,   1.19802410e+07,   1.00000000e+00]), array([  2.42500000e+00,   2.47000000e+00,   2.65400000e+00,
         2.41600000e+00,   2.89912100e+04,   0.00000000e+00]))
# Min-max normalize every feature column of x into [0, 1]:
#     x[i][j] = (x[i][j] - col_min[j]) / (col_max[j] - col_min[j])
# The column vectors broadcast across all rows, which replaces the nested
# element-by-element loops (and the Python 2-only xrange).
# NOTE: the min/max were computed over the whole data set, i.e. before any
# train/test split is made.
col_range=data_col_max-data_col_min
# A constant column would give 0/0 = NaN; dividing by 1 instead maps such a
# column to all zeros.
col_range[col_range==0]=1
x=(x-data_col_min)/col_range
print(x[0:2])
[[ 0.11356158  0.10628875  0.1192152   0.08970668  0.04015505  0.        ]
 [ 0.11071587  0.09672276  0.08938941  0.08295843  0.02745024  0.        ]]
# Pair each day's features with the NEXT trading day's up/down label:
# drop the first label (no preceding feature row) and the last feature row
# (no following label). Slicing relative to the ends avoids hard-coding
# the row count of this particular download.
y=y[1:]
x=x[:-1]
# Two classifiers are compared: an RBF-kernel SVM and logistic regression.
clf1 = svm.SVC(kernel='rbf')
clf2 = LogisticRegression()
result1, result2 = [], []

# Repeat the experiment on five independent random splits.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions train_test_split lives in sklearn.model_selection.
for i in range(5):
    # Randomly hold out 20% of the samples as the test set.
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        x, y, test_size=0.2)
    # Fit each model on the training part, then record the fraction of
    # test labels it predicts correctly (SVM first, then logistic regression).
    for model, scores in ((clf1, result1), (clf2, result2)):
        model.fit(x_train, y_train)
        scores.append(np.mean(y_test == model.predict(x_test)))

# Report the per-split accuracies of both models.
for title, scores in (("svm classifier accuacy:", result1),
                      ("LogisticRegression classifier accuacy:", result2)):
    print(title)
    print(scores)
svm classifier accuacy:
[0.53345724907063197, 0.53717472118959109, 0.55018587360594795, 0.51115241635687736, 0.54460966542750933]
LogisticRegression classifier accuacy:
[0.55390334572490707, 0.55576208178438657, 0.53717472118959109, 0.52416356877323422, 0.53903345724907059]

The main improvement in this code is that the daily rise or fall of the market index is added to the model's input features, and each trading day's stock data is then used to predict whether the stock will rise or fall on the following trading day. The SVM and logistic regression classifiers turn out to perform similarly, with accuracies of roughly 51–56% over the five random splits.

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324603352&siteId=291194637