Added lstm stock prediction #1777
@@ -0,0 +1,53 @@
{
    "data": {
        "filename": "sp500.csv",
        "columns": [
            "Close",
            "Volume"
        ],
        "sequence_length": 50,
        "train_test_split": 0.85,
        "normalise": true
    },
    "training": {
        "epochs": 2,
        "batch_size": 32
    },
    "model": {
        "loss": "mse",
        "optimizer": "adam",
        "save_dir": "saved_models",
        "layers": [
            {
                "type": "lstm",
                "neurons": 100,
                "input_timesteps": 49,
                "input_dim": 2,
                "return_seq": true
            },
            {
                "type": "dropout",
                "rate": 0.2
            },
            {
                "type": "lstm",
                "neurons": 100,
                "return_seq": true
            },
            {
                "type": "lstm",
                "neurons": 100,
                "return_seq": false
            },
            {
                "type": "dropout",
                "rate": 0.2
            },
            {
                "type": "dense",
                "neurons": 1,
                "activation": "linear"
            }
        ]
    }
}
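For reference, a minimal sketch of how a config like this might be loaded (the config.json filename is an assumption; the diff does not show where this file lives):

import json

with open('config.json') as f:
    configs = json.load(f)

# "sequence_length": 50 windows are split into 49 input timesteps plus 1 target,
# which is why the first LSTM layer declares "input_timesteps": 49
assert configs['data']['sequence_length'] - 1 == \
    configs['model']['layers'][0]['input_timesteps']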
@@ -0,0 +1,2 @@
import warnings
warnings.filterwarnings("ignore")
@@ -0,0 +1,84 @@
import math
import numpy as np
import pandas as pd


class DataLoader():
    """A class for loading and transforming data for the lstm model"""

    def __init__(self, filename, split, cols):
        dataframe = pd.read_csv(filename)
        i_split = int(len(dataframe) * split)
        self.data_train = dataframe.get(cols).values[:i_split]
        self.data_test = dataframe.get(cols).values[i_split:]
        self.len_train = len(self.data_train)
        self.len_test = len(self.data_test)
        self.len_train_windows = None

    def get_test_data(self, seq_len, normalise):
        '''
        Create x, y test data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise reduce size of the training split.
        '''
        data_windows = []
        for i in range(self.len_test - seq_len):
            data_windows.append(self.data_test[i:i+seq_len])

        data_windows = np.array(data_windows).astype(float)
        data_windows = self.normalise_windows(data_windows, single_window=False) if normalise else data_windows
Review comment: This line is too long for a ternary if... Just do a normal if and wrap lines to 88 char max.
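One way to apply the reviewer's suggestion (a sketch, not part of the committed diff):

        if normalise:
            data_windows = self.normalise_windows(
                data_windows, single_window=False
            )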
        x = data_windows[:, :-1]
        y = data_windows[:, -1, [0]]
        return x, y

    def get_train_data(self, seq_len, normalise):
        '''
        Create x, y train data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise use the generate_train_batch() method.
        '''
        data_x = []
        data_y = []
        for i in range(self.len_train - seq_len):
            x, y = self._next_window(i, seq_len, normalise)
            data_x.append(x)
            data_y.append(y)
        return np.array(data_x), np.array(data_y)

    def generate_train_batch(self, seq_len, batch_size, normalise):
        '''Yield a generator of training data from filename on given list of cols split for train/test'''
        i = 0
        while i < (self.len_train - seq_len):
            x_batch = []
            y_batch = []
            for b in range(batch_size):
                if i >= (self.len_train - seq_len):
                    # stop-condition for a smaller final batch if data doesn't divide evenly
                    yield np.array(x_batch), np.array(y_batch)
                    i = 0
                    # reset the lists so already-yielded windows are not repeated
                    # in the next batch (the original diff omitted this reset)
                    x_batch = []
                    y_batch = []
                x, y = self._next_window(i, seq_len, normalise)
                x_batch.append(x)
                y_batch.append(y)
                i += 1
            yield np.array(x_batch), np.array(y_batch)

    def _next_window(self, i, seq_len, normalise):
        '''Generates the next data window from the given index location i'''
        window = self.data_train[i:i+seq_len]
        window = self.normalise_windows(window, single_window=True)[0] if normalise else window
        x = window[:-1]
        y = window[-1, [0]]
        return x, y

    def normalise_windows(self, window_data, single_window=False):
        '''Normalise window with a base value of zero'''
        normalised_data = []
        window_data = [window_data] if single_window else window_data
        for window in window_data:
            normalised_window = []
            for col_i in range(window.shape[1]):
                normalised_col = [((float(p) / float(window[0, col_i])) - 1) for p in window[:, col_i]]
                normalised_window.append(normalised_col)
            # reshape and transpose array back into original multidimensional format
            normalised_window = np.array(normalised_window).T
            normalised_data.append(normalised_window)
        return np.array(normalised_data)
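To make the zero-base normalisation concrete, a small illustrative example (the file name and values are made up for the demo):

import numpy as np
import pandas as pd

# throwaway CSV so a DataLoader can be constructed (demo only)
pd.DataFrame({'Close': [100.0, 110.0, 121.0, 133.1]}).to_csv('demo.csv', index=False)
loader = DataLoader('demo.csv', split=0.75, cols=['Close'])

window = np.array([[100.0], [110.0], [121.0]])
print(loader.normalise_windows(window, single_window=True))
# -> [[[0.], [0.1], [0.21]]] (approximately): each value becomes p / p0 - 1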
@@ -0,0 +1,119 @@
import os
import math
import numpy as np
import datetime as dt
from numpy import newaxis
from core.utils import Timer
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint


class Model():
    """A class for building and running inference with an LSTM model"""

    def __init__(self):
        self.model = Sequential()

    def load_model(self, filepath):
        print('[Model] Loading model from file %s' % filepath)
        self.model = load_model(filepath)

    def build_model(self, configs):
        timer = Timer()
        timer.start()

        for layer in configs['model']['layers']:
            neurons = layer['neurons'] if 'neurons' in layer else None
Review comment: Let's use dict.get()... https://docs.python.org/3/library/stdtypes.html#dict.get — see the sketch after these lookups.
            dropout_rate = layer['rate'] if 'rate' in layer else None
            activation = layer['activation'] if 'activation' in layer else None
            return_seq = layer['return_seq'] if 'return_seq' in layer else None
            input_timesteps = layer['input_timesteps'] if 'input_timesteps' in layer else None
            input_dim = layer['input_dim'] if 'input_dim' in layer else None
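As the reviewer suggests, these lookups could use dict.get(), which returns None for missing keys by default (a sketch, not part of the committed diff):

            neurons = layer.get('neurons')
            dropout_rate = layer.get('rate')
            activation = layer.get('activation')
            return_seq = layer.get('return_seq')
            input_timesteps = layer.get('input_timesteps')
            input_dim = layer.get('input_dim')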
|
||
if layer['type'] == 'dense': | ||
self.model.add(Dense(neurons, activation=activation)) | ||
if layer['type'] == 'lstm': | ||
self.model.add(LSTM(neurons, input_shape=(input_timesteps, input_dim), return_sequences=return_seq)) | ||
if layer['type'] == 'dropout': | ||
self.model.add(Dropout(dropout_rate)) | ||
|
||
self.model.compile(loss=configs['model']['loss'], optimizer=configs['model']['optimizer']) | ||
|
||
print('[Model] Model Compiled') | ||
timer.stop() | ||
|
||
def train(self, x, y, epochs, batch_size, save_dir): | ||
timer = Timer() | ||
timer.start() | ||
print('[Model] Training Started') | ||
print('[Model] %s epochs, %s batch size' % (epochs, batch_size)) | ||
|
||
save_fname = os.path.join(save_dir, '%s-e%s.h5' % (dt.datetime.now().strftime('%d%m%Y-%H%M%S'), str(epochs))) | ||
callbacks = [ | ||
EarlyStopping(monitor='val_loss', patience=2), | ||
ModelCheckpoint(filepath=save_fname, monitor='val_loss', save_best_only=True) | ||
] | ||
self.model.fit( | ||
x, | ||
y, | ||
epochs=epochs, | ||
batch_size=batch_size, | ||
callbacks=callbacks | ||
) | ||
self.model.save(save_fname) | ||
|
||
print('[Model] Training Completed. Model saved as %s' % save_fname) | ||
timer.stop() | ||
|
||
def train_generator(self, data_gen, epochs, batch_size, steps_per_epoch, save_dir): | ||
timer = Timer() | ||
timer.start() | ||
print('[Model] Training Started') | ||
print('[Model] %s epochs, %s batch size, %s batches per epoch' % (epochs, batch_size, steps_per_epoch)) | ||
|
||
save_fname = os.path.join(save_dir, '%s-e%s.h5' % (dt.datetime.now().strftime('%d%m%Y-%H%M%S'), str(epochs))) | ||
callbacks = [ | ||
ModelCheckpoint(filepath=save_fname, monitor='loss', save_best_only=True) | ||
] | ||
self.model.fit_generator( | ||
data_gen, | ||
steps_per_epoch=steps_per_epoch, | ||
epochs=epochs, | ||
callbacks=callbacks, | ||
workers=1 | ||
) | ||
|
||
print('[Model] Training Completed. Model saved as %s' % save_fname) | ||
timer.stop() | ||
|
||
def predict_point_by_point(self, data): | ||
#Predict each timestep given the last sequence of true data, in effect only predicting 1 step ahead each time | ||
print('[Model] Predicting Point-by-Point...') | ||
predicted = self.model.predict(data) | ||
predicted = np.reshape(predicted, (predicted.size,)) | ||
return predicted | ||
|
||
def predict_sequences_multiple(self, data, window_size, prediction_len): | ||
#Predict sequence of 50 steps before shifting prediction run forward by 50 steps | ||
print('[Model] Predicting Sequences Multiple...') | ||
prediction_seqs = [] | ||
for i in range(int(len(data)/prediction_len)): | ||
curr_frame = data[i*prediction_len] | ||
predicted = [] | ||
for j in range(prediction_len): | ||
predicted.append(self.model.predict(curr_frame[newaxis,:,:])[0,0]) | ||
curr_frame = curr_frame[1:] | ||
curr_frame = np.insert(curr_frame, [window_size-2], predicted[-1], axis=0) | ||
prediction_seqs.append(predicted) | ||
return prediction_seqs | ||
|
||
def predict_sequence_full(self, data, window_size): | ||
#Shift the window by 1 new prediction each time, re-run predictions on new window | ||
print('[Model] Predicting Sequences Full...') | ||
curr_frame = data[0] | ||
predicted = [] | ||
for i in range(len(data)): | ||
predicted.append(self.model.predict(curr_frame[newaxis,:,:])[0,0]) | ||
curr_frame = curr_frame[1:] | ||
curr_frame = np.insert(curr_frame, [window_size-2], predicted[-1], axis=0) | ||
return predicted |
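For context, a minimal run script under stated assumptions: the JSON config above is saved as config.json, and the two files are importable as core.data_processor and core.model (only core.utils is confirmed by the imports in this diff; the other module paths are guesses):

import json
import os
from core.data_processor import DataLoader
from core.model import Model

configs = json.load(open('config.json'))
os.makedirs(configs['model']['save_dir'], exist_ok=True)  # ModelCheckpoint needs the dir

data = DataLoader(
    configs['data']['filename'],
    configs['data']['train_test_split'],
    configs['data']['columns'],
)
model = Model()
model.build_model(configs)

x, y = data.get_train_data(
    configs['data']['sequence_length'],
    configs['data']['normalise'],
)
model.train(
    x, y,
    configs['training']['epochs'],
    configs['training']['batch_size'],
    configs['model']['save_dir'],
)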
@@ -0,0 +1,13 @@
import datetime as dt


class Timer():

    def __init__(self):
        self.start_dt = None

    def start(self):
        self.start_dt = dt.datetime.now()

    def stop(self):
        end_dt = dt.datetime.now()
        print('Time taken: %s' % (end_dt - self.start_dt))
Review comment: Type hints and doctests please.
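One way the requested type hints and doctests could look (a sketch, not part of the committed diff):

import datetime as dt
from typing import Optional


class Timer:
    """Measures wall-clock time between start() and stop().

    >>> t = Timer()
    >>> t.start()
    >>> t.stop()  # doctest: +ELLIPSIS
    Time taken: 0:00:00...
    """

    def __init__(self) -> None:
        self.start_dt: Optional[dt.datetime] = None

    def start(self) -> None:
        self.start_dt = dt.datetime.now()

    def stop(self) -> None:
        end_dt = dt.datetime.now()
        print('Time taken: %s' % (end_dt - self.start_dt))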