An introduction to applying deep reinforcement learning to financial investment

This is the 114th original article, focusing on "personal growth and financial freedom, the logic of how the world works, and AI quantitative investment".

 

The core work today is to integrate the reinforcement learning environment into our AI quantitative platform.

Much of the code available online bundles data acquisition and preprocessing into the reinforcement learning environment itself, which makes it hard to reuse across the rest of the quantitative platform. Since we have already implemented the dataloader, we can implement the reinforcement learning gym environment on its own.
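
For context, here is a minimal sketch of the kind of DataFrame the dataloader is assumed to hand to the environment: one or more factor columns plus a label column that the reward will later be checked against. The factor name and the labeling rule below are illustrative assumptions, not the platform's actual schema.

import numpy as np
import pandas as pd

# Toy stand-in for the dataloader output (column names and labeling rule are assumptions).
rng = np.random.default_rng(42)
close = pd.Series(100 + rng.normal(0, 1, 500).cumsum(), name='close')

df_features = pd.DataFrame({
    'mom_5': close.pct_change(5),                    # example factor: 5-bar momentum
    'label': (close.shift(-1) > close).astype(int),  # assumed label: 1 if the next bar closes higher
}).dropna().reset_index(drop=True)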

01 Environment for Financial Reinforcement Learning

A reinforcement learning environment needs to define four things: state space, action space, reward function, and state observation.

State space and action space.

The state space is the set of dimensions of the environment that the agent can observe; for a financial reinforcement learning environment, this is the factor dimension (the feature dimension).

import random


# State space: only exposes the dimensionality (number of features) of an observation
class observation_space: 
    def __init__(self, n): 
        self.shape = (n,) 


# Action space: a discrete set of n actions
class action_space: 
    def __init__(self, n): 
        self.n = n 

    def seed(self, seed): 
        pass 

    def sample(self): 
        # Draw a random action index in [0, n - 1]
        return random.randint(0, self.n - 1)

The action space is the set of actions the agent can take given the observed state. For example, "buy" and "close" are two such actions.
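
Because the environment only deals in integer action indices, the trading layer has to translate them back into order intentions. A possible mapping for the two-action case above (the names and the helper are illustrative, not part of the platform):

# Hypothetical decoding of the two discrete actions into trading intentions.
ACTION_MAP = {
    0: 'close',  # exit / stay flat
    1: 'buy',    # open or hold a long position
}

def decode_action(action_index):
    # Translate the agent's integer action into an order intention
    return ACTION_MAP[action_index]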

Environment initialization:

class FinanceEnv: 
    def __init__(self, symbols, features, df_features): 
        self.symbols = symbols 
        self.features = features 
        self.df_features = df_features 

        self.observation_space = observation_space(4) 
        self.rows = self.observation_space.shape[0]  # How many rows to fetch at a time 
        self.action_space = action_space(2)  # Action dimension 
        self.min_accuracy = 0.475  # Minimum accuracy before the episode is terminated
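
Putting the pieces together, the environment could be instantiated roughly like this, using the toy df_features sketched earlier; the symbol and feature names are placeholders:

# Assumed usage: one symbol and a single factor column from the toy DataFrame above.
env = FinanceEnv(symbols=['000300.SH'],
                 features=['mom_5'],
                 df_features=df_features)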

reset restores the environment to its initial state; _get_state is a helper that returns the most recent window of feature rows:

def _get_state(self): 
    # Return the most recent `rows` rows of the feature columns as a numpy array 
    state = self.df_features[self.features].iloc[ 
            self.index - self.rows:self.index] 
    return state.values 


def reset(self): 
    self.treward = 0   # Cumulative reward for the episode 
    self.accuracy = 0  # Running prediction accuracy 
    self.index = self.rows 

    # Initial state: the first `rows` rows of the feature columns 
    state = self._get_state() 
    return state

The environment has two main working methods: reset and step.

step is the most important function of the environment. The agent observes the state of the environment, chooses an action, and receives feedback from the environment after executing it. The environment also tracks the cumulative reward and checks whether the episode should terminate (for example, because accuracy fell too low or the data ran out).

def step(self, action): 
    # Reward is 1 if the action matches the label of the current row, otherwise 0 
    correct = action == self.df_features['label'].iloc[self.index] 
    reward = 1 if correct else 0 

    # Accumulate the reward and update the running accuracy 
    self.treward += reward 
    self.index += 1 
    self.accuracy = self.treward / (self.index - self.rows) 

    # Terminate when the data is exhausted, or when accuracy falls below the minimum 
    if self.index >= len(self.df_features): 
        done = True 
    elif reward == 1: 
        done = False 
    elif (self.accuracy < self.min_accuracy and 
          self.index > self.rows + 10): 
        done = True 
    else: 
        done = False 

    state = self._get_state() 
    info = {} 
    return state, reward, done, info
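
Before plugging in a learning agent, the environment loop can be sanity-checked with random actions. This is only a sketch and assumes an env instance constructed as above:

# Sanity-check the reset/step loop with a random policy (no learning involved).
state = env.reset()
total_reward = 0
done = False
while not done:
    action = env.action_space.sample()            # random action
    state, reward, done, info = env.step(action)
    total_reward += reward
print('random-policy total reward:', total_reward)
print('final state shape:', state.shape)          # (rows, number of feature columns)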

02 Deep Q-learning agent: DQLAgent

from collections import deque
import random

import numpy as np
from keras import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


class DQLAgent:
    def __init__(self, env, gamma=0.95, hu=24, opt=Adam,
                 lr=0.001, finish=False):
        self.env = env

        self.finish = finish
        self.epsilon = 1.0            # Initial exploration rate
        self.epsilon_min = 0.01       # Lower bound on exploration
        self.epsilon_decay = 0.995    # Exploration decay applied after each replay
        self.gamma = gamma            # Discount factor
        self.batch_size = 32
        self.max_treward = 0
        self.averages = list()
        self.memory = deque(maxlen=2000)            # Experience replay buffer
        self.osn = env.observation_space.shape[0]   # State (observation) dimension
        self.model = self._build_model(hu, opt, lr)

    def _build_model(self, hu, opt, lr):
        # Two-hidden-layer MLP mapping a state to one Q-value per action
        model = Sequential()
        model.add(Dense(hu, input_dim=self.osn,
                        activation='relu'))
        model.add(Dense(hu, activation='relu'))
        model.add(Dense(self.env.action_space.n, activation='linear'))
        model.compile(loss='mse', optimizer=opt(learning_rate=lr))
        return model

    def act(self, state):
        # Epsilon-greedy action selection
        if random.random() <= self.epsilon:
            return self.env.action_space.sample()
        action = self.model.predict(state)[0]
        return np.argmax(action)

    def replay(self):
        # Sample a minibatch of transitions and fit the network towards the Q-targets
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in batch:
            if not done:
                # Bootstrapped target: r + gamma * max_a' Q(s', a')
                reward += self.gamma * np.amax(
                    self.model.predict(next_state)[0])
            target = self.model.predict(state)
            target[0, action] = reward
            self.model.fit(state, target, epochs=1,
                           verbose=False)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def learn(self, episodes):
        trewards = []
        for e in range(1, episodes + 1):
            state = self.env.reset()
            state = np.reshape(state, [1, self.osn])
            for _ in range(5000):
                action = self.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = np.reshape(next_state,
                                        [1, self.osn])
                # Store the transition in the replay buffer
                self.memory.append([state, action, reward,
                                    next_state, done])
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    av = sum(trewards[-25:]) / 25  # Moving average over the last 25 episodes
                    self.averages.append(av)
                    self.max_treward = max(self.max_treward, treward)
                    templ = 'episode: {:4d}/{} | treward: {:4d} | '
                    templ += 'av: {:6.1f} | max: {:4d}'
                    print(templ.format(e, episodes, treward, av,
                                       self.max_treward), end='\r')
                    break
            # Optional early stop once the moving average is high enough
            if self.finish and self.averages and self.averages[-1] > 195:
                print()
                break
            if len(self.memory) > self.batch_size:
                self.replay()

    def test(self, episodes):
        # Run the greedy (no-exploration) policy for a number of episodes
        trewards = []
        for e in range(1, episodes + 1):
            state = self.env.reset()
            for _ in range(5001):
                state = np.reshape(state, [1, self.osn])
                action = np.argmax(self.model.predict(state)[0])
                next_state, reward, done, info = self.env.step(action)
                state = next_state
                if done:
                    treward = _ + 1
                    trewards.append(treward)
                    print('episode: {:4d}/{} | treward: {:4d}'
                          .format(e, episodes, treward), end='\r')
                    break
        return trewards
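
Typical usage ties the agent to the environment above; the episode counts are arbitrary. Note that with observation_space(4) and a 4-row window, the agent's reshape to [1, osn] assumes a single feature column, so the flattened state has exactly 4 elements.

# Train the agent on the financial environment, then evaluate the greedy policy.
agent = DQLAgent(env)
agent.learn(episodes=500)          # exploration + experience replay
test_rewards = agent.test(episodes=10)
print()
print('test rewards:', test_rewards)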


Summary:

Today we walked through the application of deep reinforcement learning in finance and built a financial trading environment.

Further optimization is still needed.

The code has been uploaded to Knowledge Planet.

ETF rotation + RSRS timing, plus Kalman filter: annualized 48.41%, Sharpe ratio 1.89

My Open Source Projects and Knowledge Planet


Origin blog.csdn.net/weixin_38175458/article/details/128023877