- 量化策略建模:
1.建立训练集:
X:t时刻之前的特征因子(价格、价格衍生特征、文本特征等)
Y:t时刻之前对应的标签(价格、买卖交易)
2.建立测试集:
t时刻之后的特征/因子
3.常用的特征(feature):
Time Lags(时间滞后):将当前时刻之前若干个滞后期(时间窗口)的样本指标作为当前样本的特征
- direction_pred_main.py
创建滞后序列
from __future__ import print_function
import datetime
import numpy as np
import pandas as pd
import tushare as ts
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
def create_lagged_series(symbol, start_date_str, end_date_str, lags=5):
    """Build daily percentage returns of a close series plus lagged copies.

    Price history is requested from ``ts.get_k_data`` starting 365 days
    before ``start_date_str`` so that the lagged columns are already
    populated at the requested start date; the extra leading rows are
    dropped again before returning.

    Args:
        symbol: Code handed to ``ts.get_k_data``.
        start_date_str: First date to keep, formatted ``YYYY-MM-DD``.
        end_date_str: Last date to request; passed to ``ts.get_k_data``
            unchanged.
        lags: Number of lagged return columns to create (default 5).

    Returns:
        A DataFrame indexed by date, restricted to rows on or after
        ``start_date_str``, with these columns in order: ``volume``,
        ``today`` (percentage change of the close), ``lag1`` ..
        ``lag<lags>`` (the same return series shifted back by 1..lags
        rows) and ``direction`` (``np.sign`` of ``today``).
    """
    fmt = '%Y-%m-%d'
    first_day = datetime.datetime.strptime(start_date_str, fmt)
    fetch_from = (first_day - datetime.timedelta(days=365)).strftime(fmt)

    prices = ts.get_k_data(symbol, fetch_from, end_date_str)
    prices['date'] = pd.to_datetime(prices['date'])
    prices.set_index('date', inplace=True)
    close = prices['close']

    frame = pd.DataFrame(index=prices.index)
    frame['volume'] = prices['volume']

    # Returns whose magnitude is below 0.0001 are replaced by +0.0001 so
    # that the sign taken below is never 0 for a non-NaN return.
    today_pct = close.pct_change() * 100.0
    frame['today'] = today_pct.mask(today_pct.abs() < 0.0001, 0.0001)

    # lagN is the percentage change of the close series shifted by N rows.
    for n in range(1, lags + 1):
        frame['lag{}'.format(n)] = close.shift(n).pct_change() * 100.0

    frame['direction'] = np.sign(frame['today'])

    # Drop the warm-up year that was only fetched to fill the lags.
    return frame[frame.index >= first_day]
if __name__ == "__main__":
    # Daily returns for the 'hs300' symbol with five lagged-return columns.
    lag_ret_df = create_lagged_series(
        'hs300', '2018-01-01', '2020-02-10', lags=5
    )

    # Features: the three most recent lagged returns. Label: the sign of
    # the current day's return.
    X = lag_ret_df[['lag1', 'lag2', 'lag3']]
    y = lag_ret_df['direction']

    # Chronological split: everything before start_test trains the models,
    # everything from start_test onwards is held out for evaluation.
    start_test = datetime.datetime(2019, 12, 1)
    X_train = X[X.index < start_test]
    X_test = X[X.index >= start_test]
    y_train = y[y.index < start_test]
    y_test = y[y.index >= start_test]

    print('准确率与混淆矩阵\n')
    models = [
        ('逻辑回归', LogisticRegression()),
        ('支持向量机', SVC(C=1000000.0, cache_size=200, class_weight=None,
                      coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                      max_iter=-1, probability=False, random_state=None,
                      shrinking=True, tol=0.001, verbose=False)),
        # max_features='auto' was removed from scikit-learn; 'sqrt' is the
        # value it stood for in classifiers and works on old versions too.
        ('随机森林', RandomForestClassifier(n_estimators=1000, criterion='gini',
                                        max_depth=None, min_samples_split=2,
                                        min_samples_leaf=1, max_features='sqrt',
                                        bootstrap=True, oob_score=False, n_jobs=1,
                                        random_state=None, verbose=0)),
    ]

    # Fit each model on the training window and report its accuracy and
    # confusion matrix on the held-out window.
    for name, clf in models:
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        print('{}:\n{:.2f}'.format(name, clf.score(X_test, y_test)))
        # confusion_matrix expects (y_true, y_pred): rows are the true
        # direction, columns the predicted one, both ordered [-1, 1].
        print('{}\n'.format(confusion_matrix(y_test, pred, labels=[-1, 1])))
# Hard-voting ensemble of the same three classifier types. It reuses
# X_train / y_train / X_test / y_test, which only exist when the file is
# executed as a script, so it is guarded the same way.
if __name__ == "__main__":
    try:
        from sklearn.ensemble import VotingClassifier
    except ImportError:
        # Remember that the ensemble is unavailable so the code below can
        # skip it instead of failing later with a NameError.
        VotingClassifier = None
        try:
            import sklearn
        except ImportError:
            print("WARNING: impossible to [import sklearn] at all\n",
                  60*"|")
        else:
            print("WARNING: [VotingClassifier] not available\n",
                  "WARNING: [import sklearn] reports version: ",
                  sklearn.__version__, "\n"+60*"|")

    if VotingClassifier is not None:
        voting_clf = VotingClassifier(
            estimators=[
                ('log_reg', LogisticRegression()),
                ('svc', SVC(C=1000000.0, cache_size=200, class_weight=None,
                            coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                            max_iter=-1, probability=True, random_state=None,
                            shrinking=True, tol=0.001, verbose=False)),
                # max_features='auto' was removed from scikit-learn; 'sqrt'
                # is the value it stood for in classifiers.
                ('rfc', RandomForestClassifier(n_estimators=1000, criterion='gini',
                                               max_depth=None, min_samples_split=2,
                                               min_samples_leaf=1, max_features='sqrt',
                                               bootstrap=True, oob_score=False, n_jobs=1,
                                               random_state=None, verbose=0)),
            ],
            voting='hard',
        )
        voting_clf.fit(X_train, y_train)
        pred = voting_clf.predict(X_test)
        # Mean accuracy of the ensemble on the held-out window.
        score_vot = voting_clf.score(X_test, y_test)