LSTM Model Training

Training an LSTM neural network to forecast net cash flow from the synthetic dataset.

Training Code

The following Python script trains the LSTM model on our synthetic cash flow dataset:

train_lstm_keras.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, LayerNormalization, Input, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import Huber
from sklearn.model_selection import KFold
import tensorflow as tf

# Custom Self-Attention Layer
class SelfAttention(Layer):
    def __init__(self):
        super(SelfAttention, self).__init__()

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight',
                                 shape=(input_shape[-1], input_shape[-1]),
                                 initializer='glorot_uniform',
                                 trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                 shape=(input_shape[-1],),
                                 initializer='zeros',
                                 trainable=True)
        self.u = self.add_weight(name='attention_vector',
                                 shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform',
                                 trainable=True)
        super(SelfAttention, self).build(input_shape)

    def call(self, inputs):
        # Compute attention scores
        v = tf.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)
        vu = tf.tensordot(v, self.u, axes=1)
        alphas = tf.nn.softmax(vu, axis=1)
        # Weighted sum of inputs
        output = tf.reduce_sum(inputs * alphas, axis=1)
        return output

# Cyclic Learning Rate Scheduler
class CyclicLR(tf.keras.callbacks.Callback):
    def __init__(self, base_lr=0.0001, max_lr=0.003, step_size=4000., mode='triangular'):
        super(CyclicLR, self).__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

    def clr(self):
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        if self.mode == 'triangular':
            return self.base_lr + (self.max_lr - self.base_lr) * max(0, (1 - x))
        return self.base_lr

    def on_train_begin(self, logs=None):
        logs = logs or {}
        self.clr_iterations = 0
        self.model.optimizer.learning_rate.assign(self.base_lr)

    def on_batch_end(self, batch, logs=None):
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        lr = self.clr()
        self.model.optimizer.learning_rate.assign(lr)
        self.history.setdefault('lr', []).append(lr)
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

# Load the synthetic data
data = pd.read_csv("synthetic_cashflow_data.csv")

# Handle outliers and log transform net_cash_flow
cashflow_lower, cashflow_upper = np.percentile(data["net_cash_flow"], [1, 99])
data["net_cash_flow"] = np.clip(data["net_cash_flow"], cashflow_lower, cashflow_upper)
data["log_net_cash_flow"] = np.log1p(data["net_cash_flow"].clip(lower=0))

# Add temporal features
data["date"] = pd.to_datetime(data["date"])
data["day_of_week"] = data["date"].dt.dayofweek
data["month_of_year"] = data["date"].dt.month

# Define features and target
features = ["income", "expenses", "seasonal_factor", "day_of_week", "month_of_year"]
target = "log_net_cash_flow"

# Add interaction term
data["income_expenses_interaction"] = data["income"] * data["expenses"]
features.append("income_expenses_interaction")

# Add high expense indicator
expense_threshold = np.percentile(data["expenses"], 75)
data["high_expense"] = (data["expenses"] > expense_threshold).astype(int)
features.append("high_expense")

# Add lagged and moving average features
data["lagged_net_cash_flow_1"] = data["net_cash_flow"].shift(1).fillna(data["net_cash_flow"].mean())
data["lagged_net_cash_flow_7"] = data["net_cash_flow"].shift(7).fillna(data["net_cash_flow"].mean())
data["lagged_net_cash_flow_14"] = data["net_cash_flow"].shift(14).fillna(data["net_cash_flow"].mean())
data["lagged_net_cash_flow_30"] = data["net_cash_flow"].shift(30).fillna(data["net_cash_flow"].mean())
data["ma_net_cash_flow_7"] = data["net_cash_flow"].rolling(window=7).mean().fillna(data["net_cash_flow"].mean())
features.extend(["lagged_net_cash_flow_1", "lagged_net_cash_flow_7", "lagged_net_cash_flow_14", "lagged_net_cash_flow_30", "ma_net_cash_flow_7"])

# Initialize the scaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data[features + [target]])

# Debug scaler shape
print(f"Scaled data shape: {scaled_data.shape}")
print(f"Scaler min_ shape: {scaler.min_.shape}")
print(f"Scaler scale_ shape: {scaler.scale_.shape}")

# Prepare training sequences with a 30-day lookback
lookback = 30
n_features = len(features)  # 12 feature columns; the target is the last column of scaled_data
X, y = [], []
for i in range(lookback, len(scaled_data)):
    X.append(scaled_data[i - lookback:i, :n_features])  # all feature columns
    y.append(scaled_data[i, n_features])                # log_net_cash_flow target
X = np.array(X)
y = np.array(y)

# Debug shapes
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Data augmentation helper: add small Gaussian noise to training sequences
# (applied to each training fold inside the cross-validation loop below)
def add_noise(data, noise_factor=0.01):
    noise = np.random.normal(0, noise_factor, data.shape)
    return data + noise

# Build the LSTM model with self-attention
def build_model(lookback, n_features):
    model = Sequential([
        Input(shape=(lookback, n_features)),
        Bidirectional(LSTM(80, activation='tanh', return_sequences=True)),
        LayerNormalization(),
        Dropout(0.15),
        LSTM(80, activation='tanh', return_sequences=True),
        LayerNormalization(),
        SelfAttention(),
        Dropout(0.15),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    optimizer = Adam(learning_rate=0.01, clipvalue=0.5)
    model.compile(optimizer=optimizer, loss=Huber(delta=0.5))
    return model

# K-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold = 1
best_model = None
best_val_loss = float('inf')

for train_index, val_index in kf.split(X):
    print(f"Training fold {fold}...")
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    X_train_augmented = add_noise(X_train)
    
    model = build_model(lookback, n_features)
    clr = CyclicLR(base_lr=0.0001, max_lr=0.003, step_size=4000., mode='triangular')
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    
    history = model.fit(
        X_train_augmented, y_train,
        epochs=150,
        batch_size=32,
        verbose=1,
        callbacks=[clr, early_stopping],
        validation_data=(X_val, y_val)
    )
    
    val_loss = min(history.history['val_loss'])
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model
    
    fold += 1

# Save the best model weights
best_model.save_weights("lstm_weights_keras.weights.h5")
print("Training completed and weights saved to lstm_weights_keras.weights.h5.")

Code Explanation

Data Preprocessing

Before training the LSTM model, we perform several preprocessing steps to prepare the data (a condensed excerpt follows the list below):

  • Feature Engineering: Creating temporal features (day of week, month of year), an income-expense interaction term, and a binary high-expense indicator.
  • Outlier Clipping: Limiting extreme net cash flow values to the 1st and 99th percentiles to reduce the impact of outliers.
  • Log Transformation: Applying a log1p transformation to the clipped net cash flow target to stabilize variance.
  • Lagged Features: Creating lagged versions of net cash flow (1, 7, 14, and 30 days) to capture temporal dependencies.
  • Rolling Window Features: Computing a 7-day rolling mean of net cash flow to capture short-term trends.
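For reference, the corresponding steps condensed from the training script above:

# Condensed preprocessing and feature engineering (from train_lstm_keras.py)
cashflow_lower, cashflow_upper = np.percentile(data["net_cash_flow"], [1, 99])
data["net_cash_flow"] = np.clip(data["net_cash_flow"], cashflow_lower, cashflow_upper)
data["log_net_cash_flow"] = np.log1p(data["net_cash_flow"].clip(lower=0))

data["date"] = pd.to_datetime(data["date"])
data["day_of_week"] = data["date"].dt.dayofweek
data["month_of_year"] = data["date"].dt.month
data["income_expenses_interaction"] = data["income"] * data["expenses"]
data["high_expense"] = (data["expenses"] > np.percentile(data["expenses"], 75)).astype(int)

for lag in (1, 7, 14, 30):
    data[f"lagged_net_cash_flow_{lag}"] = data["net_cash_flow"].shift(lag).fillna(data["net_cash_flow"].mean())
data["ma_net_cash_flow_7"] = data["net_cash_flow"].rolling(window=7).mean().fillna(data["net_cash_flow"].mean())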

Data Preparation for LSTM

LSTM models require data in a specific format with sequences of past observations:

  • Scaling: Normalizing all features and the target to the range [0, 1] using MinMaxScaler (the inverse mapping for predictions is sketched after the snippet below).
  • Sequence Creation: Transforming the data into overlapping sequences of 30 days (time steps) for input to the LSTM.
  • Cross-Validation Splits: Partitioning the sequences into five training/validation folds (see Cross-Validation and Training below).
# Prepare training sequences with a 30-day lookback
lookback = 30
X, y = [], []
for i in range(lookback, len(scaled_data)):
    X.append(scaled_data[i - lookback:i, :n_features])  # feature columns only
    y.append(scaled_data[i, n_features])                # last column is the target
X = np.array(X)
y = np.array(y)
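Because the target is both log-transformed and min-max scaled, model predictions must be mapped back to cash flow units before they are interpreted. The helper below is a minimal sketch of that inverse step; invert_target is our own illustrative name, and it assumes the scaler and features list defined in the training script:

# Illustrative sketch: map scaled log-space predictions back to cash flow units
def invert_target(y_scaled, scaler, target_col):
    # MinMaxScaler transforms each column as x_scaled = x * scale_ + min_
    y_log = (np.asarray(y_scaled) - scaler.min_[target_col]) / scaler.scale_[target_col]
    return np.expm1(y_log)  # undo the log1p transform

# Example: y_pred_cash = invert_target(model.predict(X_val).ravel(), scaler, target_col=len(features))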

Model Architecture

We implement a sophisticated LSTM architecture with several advanced components:

  • Custom Self-Attention Layer: A specialized layer that weights the most relevant time steps in the sequence and collapses them into a single summary vector (a quick shape check follows the excerpt below).
  • Bidirectional LSTM: Processing the sequence in both forward and backward directions to capture more context.
  • Layer Normalization: Stabilizing the learning process by normalizing the activations within each layer.
  • Multiple LSTM Layers: Using two LSTM layers with 80 units each for hierarchical feature extraction.
  • Dropout Regularization: Adding dropout (15%) to prevent overfitting.
  • Huber Loss: Using a loss function that's less sensitive to outliers than mean squared error.
# Build the LSTM model with self-attention
def build_model(lookback, n_features):
    model = Sequential([
        Input(shape=(lookback, n_features)),
        Bidirectional(LSTM(80, activation='tanh', return_sequences=True)),
        LayerNormalization(),
        Dropout(0.15),
        LSTM(80, activation='tanh', return_sequences=True),
        LayerNormalization(),
        SelfAttention(),
        Dropout(0.15),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    optimizer = Adam(learning_rate=0.01, clipvalue=0.5)
    model.compile(optimizer=optimizer, loss=Huber(delta=0.5))
    return model
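As a quick sanity check (illustrative only, not part of the training script), the snippet below passes a dummy batch through SelfAttention to show that it collapses the time dimension into one attention-weighted vector per sequence:

# Shape check for the custom SelfAttention layer
dummy = tf.random.normal((4, 30, 80))  # (batch, timesteps, units of the second LSTM layer)
attended = SelfAttention()(dummy)
print(attended.shape)                  # (4, 80): one weighted summary vector per sequence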

Training Techniques

We employ several advanced training techniques to improve model performance:

  • Custom Cyclic Learning Rate: A callback that cycles the learning rate between 0.0001 and 0.003 in a triangular pattern to escape shallow local minima and improve convergence (sampled values are shown after the excerpt below).
  • Gradient Clipping: Limiting gradient values to 0.5 to prevent exploding gradients during training.
  • Early Stopping: Halting training when validation loss stops improving for 20 epochs and restoring the best weights.
  • K-fold Cross-Validation: Training across 5 folds and keeping the model with the lowest validation loss.
  • Data Augmentation: Perturbing the training sequences with small Gaussian noise before each fold is fit.
# Cyclic Learning Rate Scheduler
class CyclicLR(tf.keras.callbacks.Callback):
    def __init__(self, base_lr=0.0001, max_lr=0.003, step_size=4000., mode='triangular'):
        super(CyclicLR, self).__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        # ... additional implementation details ...
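To make the triangular schedule concrete, the short illustration below (using the full CyclicLR class from the training script) samples clr() at a few iteration counts; the rate climbs linearly from 0.0001 to 0.003 over the first 4,000 iterations, falls back over the next 4,000, and then the cycle repeats:

# Illustrative: sample the triangular schedule at a few iteration counts
schedule = CyclicLR(base_lr=0.0001, max_lr=0.003, step_size=4000.)
for it in (0, 2000, 4000, 6000, 8000):
    schedule.clr_iterations = it
    print(it, round(schedule.clr(), 6))
# 0 -> 0.0001, 2000 -> 0.00155, 4000 -> 0.003, 6000 -> 0.00155, 8000 -> 0.0001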

Cross-Validation and Training

We use 5-fold cross-validation to check that the model generalizes beyond any single train/validation split, keeping the fold whose model achieves the lowest validation loss:

# Train the model with 5-fold cross-validation, keeping the best model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_model, best_val_loss = None, float('inf')

for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Data augmentation: perturb the training fold with small Gaussian noise
    X_train_augmented = add_noise(X_train)

    model = build_model(lookback, n_features)
    clr = CyclicLR(base_lr=0.0001, max_lr=0.003, step_size=4000., mode='triangular')
    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    history = model.fit(
        X_train_augmented, y_train,
        epochs=150, batch_size=32,
        callbacks=[clr, early_stopping],
        validation_data=(X_val, y_val)
    )

    # Keep the model from the fold with the lowest validation loss
    val_loss = min(history.history['val_loss'])
    if val_loss < best_val_loss:
        best_val_loss, best_model = val_loss, model

Key Training Features

Model Architecture

  • Bidirectional LSTM (80 units) followed by a second LSTM layer (80 units)
  • Layer normalization after each LSTM layer
  • Custom self-attention mechanism
  • Dropout regularization (15%)
  • Huber loss function for robustness

Training Approach

  • 5-fold cross-validation
  • Cyclic learning rate (0.0001-0.003)
  • Data augmentation with random noise
  • Early stopping with patience of 20 epochs
  • Gradient clipping (clipvalue of 0.5)

Feature Engineering

  • Temporal features (day of week, month of year)
  • Income-expense interaction and high-expense indicator
  • Lagged net cash flow (1, 7, 14, 30 days)
  • 7-day rolling mean of net cash flow
  • Log transformation of the target

Data Preparation

  • Outlier clipping (1st-99th percentiles)
  • Feature scaling to [0, 1] range
  • Sequence creation with 30-day windows
  • 5-fold cross-validation splits
  • Batch size of 32 samples