This is the 114th original article, focusing on "personal growth and financial freedom, the logic of how the world works, and AI quantitative investment".
The core work today is to integrate the reinforcement learning environment into our AI quantitative platform.
Much of the code available online bundles data acquisition and preprocessing into the reinforcement learning environment itself, which hinders code reuse across the quantitative platform. Since we have already implemented the dataloader, we can implement the reinforcement learning gym environment on its own.
01 Environment for Financial Reinforcement Learning
A reinforcement learning environment needs to define four things: state space, action space, reward function, and state observation.
State space and action space.
The state space is the part of the environment the agent can observe; for a financial reinforcement learning environment, its dimension is the number of factors (i.e., the number of features).
import random

# State space
class observation_space:
    def __init__(self, n):
        self.shape = (n,)

# Action space
class action_space:
    def __init__(self, n):
        self.n = n

    def seed(self, seed):
        pass

    def sample(self):
        return random.randint(0, self.n - 1)
The action space is the set of actions the agent can take based on the observed state; for example, "buy" and "close" are two actions.
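A quick sanity check of these two helper classes (illustrative only, not part of the platform code):

obs_space = observation_space(4)
act_space = action_space(2)
print(obs_space.shape)     # (4,)
print(act_space.sample())  # 0 or 1, drawn uniformly at random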
Environment initialization:
class FinanceEnv:
    def __init__(self, symbols, features, df_features):
        self.symbols = symbols
        self.features = features
        self.df_features = df_features
        self.observation_space = observation_space(4)
        # How many rows to fetch per observation
        self.rows = self.observation_space.shape[0]
        self.action_space = action_space(2)  # Action dimension
        self.min_accuracy = 0.475  # Minimum accuracy
reset restores the environment to its initial state:
def _get_state(self):
    state = self.df_features[self.features].iloc[
        self.index - self.rows:self.index]
    return state.values

def reset(self):
    self.treward = 0
    self.accuracy = 0
    self.index = self.rows
    # Return the initial state: self.rows rows of the feature columns
    # (_get_state already returns a NumPy array, so no further .values)
    return self._get_state()
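For a quick smoke test, the environment can be fed a toy feature frame (the column names, symbol, and single-feature setup below are my own assumptions; a single feature keeps the flattened window at the 4 values the DQLAgent in section 02 expects):

import numpy as np
import pandas as pd

# Hypothetical toy data: one feature column plus a 0/1 direction label
df = pd.DataFrame({'ret': np.random.randn(1000)})
df['label'] = (df['ret'].shift(-1) > 0).astype(int)

env = FinanceEnv(symbols=['000001.SH'], features=['ret'], df_features=df)
state = env.reset()
print(state.shape)  # (4, 1): a window of self.rows rows of the feature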
The environment has two main working methods: reset and step. Of the two, step is the most important: the agent observes the state of the environment, chooses an action accordingly, executes it, and receives feedback from the environment. step also tracks the cumulative reward and checks whether the episode has failed or the data has run out.
def step(self, action):
    # Compute the reward for the incoming action
    correct = action == self.df_features['label'].iloc[self.index]
    reward = 1 if correct else 0
    # Accumulate the reward and update the running accuracy
    self.treward += reward
    self.index += 1
    self.accuracy = self.treward / (self.index - self.rows)
    if self.index >= len(self.df_features):
        # index >= total length: out of data, exit
        done = True
    elif reward == 1:
        done = False
    elif (self.accuracy < self.min_accuracy and
          self.index > self.rows + 10):
        # Accuracy too low after a grace period: abort the episode
        done = True
    else:
        done = False
    state = self._get_state()
    info = {}
    return state, reward, done, info
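Before plugging in a learning agent, the step contract can be exercised with a purely random policy (a minimal sketch reusing the toy env built above):

state = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # random policy
    state, reward, done, info = env.step(action)
print('total reward:', env.treward, 'accuracy:', round(env.accuracy, 3))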
02 Deep Q-learning agent: DQLAgent
from collections import deque
import random

import numpy as np
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


class DQLAgent:
    def __init__(self, env, gamma=0.95, hu=24, opt=Adam,
                 lr=0.001, finish=False):
        self.env = env
        self.finish = finish
        self.epsilon = 1.0            # initial exploration rate
        self.epsilon_min = 0.01       # exploration floor
        self.epsilon_decay = 0.995    # decay applied after each replay
        self.gamma = gamma            # discount factor
        self.batch_size = 32
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)  # replay buffer
        self.osn = env.observation_space.shape[0]
        self.model = self._build_model(hu, opt, lr)

    def _build_model(self, hu, opt, lr):
        # A small two-hidden-layer MLP mapping states to Q-values
        model = Sequential()
        model.add(Dense(hu, input_dim=self.osn, activation='relu'))
        model.add(Dense(hu, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=opt(lr=lr))
        return model

    def act(self, state):
        # Epsilon-greedy: explore with probability epsilon
        if random.random() <= self.epsilon:
            return self.env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)

    def replay(self):
        # Sample a minibatch from the replay buffer and fit the Q-network
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1, verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = self.env.reset()
            state = np.reshape(state, [1, self.osn])
            for _ in range(5000):
                action = self.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.osn])
                self.memory.append([state, action, reward,
                                    next_state, done])
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    av = sum(trewards[-25:]) / 25
                    self.averages.append(av)
                    self.max_treward = max(self.max_treward, treward)
                    templ = 'episode: {:4d}/{} | treward: {:4d} | '
                    templ += 'av: {:6.1f} | max: {:4d}'
                    print(templ.format(e, episodes, treward, av,
                                       self.max_treward), end='\r')
                    break
            if av > 195 and self.finish:
                print()
                break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = self.env.reset()
            for _ in range(5001):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, info = self.env.step(action)
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    print('episode: {:4d}/{} | treward: {:4d}'
                          .format(e, episodes, treward), end='\r')
                    break
        return trewards
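Wiring the agent to the environment then looks roughly as follows (a sketch: env is the toy environment from section 01, and the episode counts are illustrative assumptions, not tuned values):

agent = DQLAgent(env)
agent.learn(episodes=50)           # train with epsilon-greedy exploration
trewards = agent.test(episodes=5)  # greedy policy, no exploration
print()
print('best test treward:', max(trewards))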
Summary:
Today we walked through applying deep reinforcement learning to finance and built a financial trading environment.
Further optimization is still needed.
The code has been uploaded to Knowledge Planet.
ETF rotation + RSRS timing, plus a Kalman filter: annualized 48.41%, Sharpe ratio 1.89