kujenga/goml
neural/mlp.go
// Package neural provides an implementation of an artificial neural network
// in the form of a multi-layer perceptron (MLP) trained via backpropagation.
package neural

import (
    "errors"
    "fmt"
    "math"
    "math/rand"

    "github.com/kujenga/goml/lin"
)

// This work was based on learnings from the following resources:
// - "Make Your Own Neural Network" by Tariq Rashid
// - deeplizard series on "Backpropagation explained":
//   https://www.youtube.com/playlist?list=PLZbbT5o_s2xq7LwI2y8_QtvuXZedL6tQU
// - "Build an Artificial Neural Network From Scratch" article:
//   https://www.kdnuggets.com/2019/11/build-artificial-neural-network-scratch-part-1.html

// MLP provides a Multi-Layer Perceptron which can be configured for
// any network architecture within that paradigm.
type MLP struct {
    // Layers is a list of layers in the network, where the first is the
    // input and last is the output, with inner layers acting as hidden
    // layers.
    //
    // These must not be modified after initialization/training.
    Layers []*Layer

    // LearningRate is the rate at which learning occurs in back
    // propagation, relative to the error calculations.
    LearningRate float32

    // Introspect provides a way for the caller of this network to
    // check the status of network learning over time and witness
    // convergence (or lack thereof).
    Introspect func(step Step)
}

// Step captures a status update that happens within a single epoch, for use
// in introspecting models.
type Step struct {
    // Epoch is a monotonically increasing counter of the training epoch
    // that this step represents.
    Epoch int
    // Loss is the sum of normalized error values for the epoch, computed
    // using the network's loss function.
    Loss float32
}
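
// A minimal configuration sketch (the layer names, widths, learning rate,
// and logging below are illustrative assumptions, not values prescribed by
// this package):
//
//    net := &MLP{
//        Layers: []*Layer{
//            {Name: "input", Width: 2},
//            {Name: "hidden", Width: 4},
//            {Name: "output", Width: 1},
//        },
//        LearningRate: 0.1,
//        Introspect: func(s Step) {
//            fmt.Printf("epoch %d: loss %.4f\n", s.Epoch, s.Loss)
//        },
//    }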

// Initialize sets up the network layers with the memory allocations and
// references needed for proper operation. It is called automatically during
// training; it is exposed separately only to allow more precise control of
// the network for performance analysis.
func (n *MLP) Initialize() {
    var prev *Layer
    for i, layer := range n.Layers {
        var next *Layer
        if i < len(n.Layers)-1 {
            next = n.Layers[i+1]
        }
        // Idempotent initialization of the layer, passing in the
        // previous and next layers for reference in training.
        layer.initialize(n, prev, next)
        prev = layer
    }
}

// Train takes in a set of inputs and a set of labels and trains the network
// using backpropagation to adjust internal weights to minimize loss, over the
// specified number of epochs. The final loss value is returned after training
// completes.
func (n *MLP) Train(epochs int, inputs, labels lin.Frame) (float32, error) {
    // Validate that the inputs match the network configuration.
    if err := n.check(inputs, labels); err != nil {
        return 0, err
    }

    // Initialize all layers within the network.
    n.Initialize()

    // Run the training process for the specified number of epochs.
    var loss float32
    for e := 0; e < epochs; e++ {
        predictions := make(lin.Frame, len(inputs))

        // Iterate over each input, training in SGD fashion.
        for i, input := range inputs {
            // Iterate FORWARDS through the network.
            activations := input
            for _, layer := range n.Layers {
                activations = layer.ForwardProp(activations)
            }
            predictions[i] = activations

            // Iterate BACKWARDS through the network.
            for step := range n.Layers {
                l := len(n.Layers) - (step + 1)
                layer := n.Layers[l]

                if l == 0 {
                    // If we are at the input layer, nothing to do.
                    continue
                }

                layer.BackProp(labels[i])
            }
        }

        // Calculate loss
        loss = Loss(predictions, labels)
        if n.Introspect != nil {
            n.Introspect(Step{
                Epoch: e,
                Loss:  loss,
            })
        }

    }

    return loss, nil
}
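
// A training sketch continuing the configuration example above, assuming
// lin.Frame literals of lin.Vector rows as used throughout this file. The
// XOR-style rows are illustrative; their widths are assumed to match the
// input and output layers respectively.
//
//    inputs := lin.Frame{{0, 0}, {0, 1}, {1, 0}, {1, 1}}
//    labels := lin.Frame{{0}, {1}, {1}, {0}}
//    loss, err := net.Train(1000, inputs, labels)
//    if err != nil {
//        // handle mismatched input/label counts or an empty network
//    }
//    _ = loss // final loss value after the last epoch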

// Predict takes in a set of input rows with the width of the input layer, and
// returns a frame of prediction rows with the width of the output layer,
// representing the predictions of the network.
func (n *MLP) Predict(inputs lin.Frame) lin.Frame {
    // Iterate FORWARDS through the network
    preds := make(lin.Frame, len(inputs))
    for i, input := range inputs {
        activations := input
        for _, layer := range n.Layers {
            activations = layer.ForwardProp(activations)
        }
        // Activations from the last layer are our predictions
        preds[i] = activations
    }
    return preds
}
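
// A prediction sketch, continuing the example above; each returned row has
// the width of the output layer:
//
//    preds := net.Predict(lin.Frame{{1, 0}})
//    // preds[0] is a lin.Vector with one value per output-layer node.
//    fmt.Println(preds[0])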

func (n *MLP) check(inputs lin.Frame, outputs lin.Frame) error {
    if len(n.Layers) == 0 {
        return errors.New("ann must have at least one layer")
    }

    if len(inputs) != len(outputs) {
        return fmt.Errorf(
            "inputs count %d mismatched with outputs count %d",
            len(inputs), len(outputs),
        )
    }
    return nil
}

// Layer defines a layer in the neural network. These are presently basic
// feed-forward layers that also provide capabilities to facilitate
// backpropagation within the MLP structure.
type Layer struct {
    // Name provides a human displayable name for the layer, for debugging
    // and documentation purposes.
    Name string
    // Width defines the number of neurons in this layer.
    Width int
    // ActivationFunction transforms values passed out of this layer.
    // Defaults to sigmoid when not specified.
    ActivationFunction func(float32) float32
    // ActivationFunctionDeriv is the derivative of the activation function,
    // needed for backpropagation. It must correspond to ActivationFunction
    // and defaults to the derivative of the default sigmoid function.
    ActivationFunctionDeriv func(float32) float32

    // Pointer to the neural network that this layer is being used within.
    nn *MLP
    // Pointer to previous layer in the network for use in initialization
    // steps and backprop.
    prev *Layer
    // Pointer to next layer in the network for use in backprop.
    next *Layer

    initialized bool
    // weights are indexed by row and column: each row is a node in the
    // current layer, and each column corresponds to a node in the previous
    // layer.
    weights lin.Frame
    // biases holds a bias value for each node, adjusted over the course of
    // training.
    biases lin.Vector

    // Every time input is fed through this layer, the last pre-activation
    // values "z" computed from the weights and the last activation values
    // are preserved for use in backpropagation.
    lastZ           lin.Vector
    lastActivations lin.Vector
    // As backpropagation progresses, we record the last error values seen
    // for this layer (lastE) and the per-edge loss contributions (lastL)
    // that are propagated to earlier layers as a proxy for the error
    // computed at the output layer.
    lastE lin.Vector
    lastL lin.Frame
}
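
// A sketch of a layer with a custom activation function. The tanh choice
// here is an illustrative assumption using the standard library; it is not
// an activation provided by the lin package.
//
//    tanhLayer := &Layer{
//        Name:  "hidden",
//        Width: 8,
//        ActivationFunction: func(z float32) float32 {
//            return float32(math.Tanh(float64(z)))
//        },
//        ActivationFunctionDeriv: func(z float32) float32 {
//            t := math.Tanh(float64(z))
//            return float32(1 - t*t) // d/dz tanh(z) = 1 - tanh²(z)
//        },
//    }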

// initialize sets up the needed data structures and random initial values for
// the layer. If key values are unspecified, defaults are configured.
func (l *Layer) initialize(nn *MLP, prev *Layer, next *Layer) {
    if l.initialized || prev == nil {
        // If already initialized or the input layer, nothing to do.
        return
    }

    // Pointers to other components in the network.
    l.nn = nn
    l.prev = prev
    l.next = next

    // Provide defaults for unspecified hyperparameters.
    if l.ActivationFunction == nil {
        l.ActivationFunction = lin.Sigmoid
    }
    if l.ActivationFunctionDeriv == nil {
        l.ActivationFunctionDeriv = lin.SigmoidDerivative
    }

    // Initialize data structures for use in the network training and
    // predictions, providing them with random initial values.
    l.weights = make(lin.Frame, l.Width)
    for i := range l.weights {
        l.weights[i] = make(lin.Vector, l.prev.Width)
        for j := range l.weights[i] {
            // We scale these based on the "connectedness" of the
            // node to avoid saturating the gradients in the
            // network, where really high values do not play nicely
            // with activation functions like sigmoid.
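            // As a worked example with an assumed width: if the previous
            // layer has 100 nodes, the scale factor is 100^(-0.5) = 0.1,
            // so initial weights are drawn from a normal distribution with
            // standard deviation 0.1 and mostly fall within about ±0.3,
            // keeping early pre-activations away from sigmoid saturation.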
            weight := rand.NormFloat64() *
                math.Pow(float64(l.prev.Width), -0.5)
            l.weights[i][j] = float32(weight)
        }
    }
    l.biases = make(lin.Vector, l.Width)
    for i := range l.biases {
        l.biases[i] = rand.Float32()
    }
    // Set up empty error and loss structures for use in backprop.
    l.lastE = make(lin.Vector, l.Width)
    l.lastL = make(lin.Frame, l.Width)
    for i := range l.lastL {
        l.lastL[i] = make(lin.Vector, l.prev.Width)
    }

    l.initialized = true
}

// ForwardProp takes in a set of inputs from the previous layer and performs
// forward propagation for the current layer, returning the resulting
// activations. As a special case, if this Layer has no previous layer and is
// thus the input layer for the network, the values are passed through
// unmodified. Internal state from the calculation is persisted for later use
// in back propagation.
func (l *Layer) ForwardProp(input lin.Vector) lin.Vector {
    // If this is the input layer, pass through values unmodified.
    if l.prev == nil {
        l.lastActivations = input
        return input
    }

    // Create vectors with state for each node in this layer.
    Z := make(lin.Vector, l.Width)
    activations := make(lin.Vector, l.Width)
    // For each node in the layer, perform feed-forward calculation.
    for i := range activations {
        // Vector of weights for each edge to this node, incoming from
        // the previous layer.
        nodeWeights := l.weights[i]
        // Scalar bias value for the current node index.
        nodeBias := l.biases[i]
        // Combine input with incoming edge weights, then apply bias.
        Z[i] = lin.DotProduct(input, nodeWeights) + nodeBias
        // Apply activation function for non-linearity.
        activations[i] = l.ActivationFunction(Z[i])
    }
    // Capture state for use in back-propagation.
    l.lastZ = Z
    l.lastActivations = activations
    return activations
}
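
// As a worked example of the per-node computation above, with assumed
// values: for incoming weights [0.5, -0.25], bias 0.1, and previous-layer
// activations [1, 2]:
//
//    z = (0.5 * 1) + (-0.25 * 2) + 0.1 = 0.1
//    a = sigmoid(0.1) ≈ 0.525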

// BackProp performs the training process of back propagation on the layer for
// the given set of labels. Weights and biases are updated for this layer
// according to the computed error. Internal state on the backpropagation
// process is captured for further backpropagation in earlier layers of the
// network as well.
func (l *Layer) BackProp(label lin.Vector) {
    // ∂L/∂a, derivative of the loss w.r.t. the activation:
    // 2 * (a_j^(L) - y_j)
    // First calculate the "lastE" vector of last observed error values.
    if l.next == nil { // Output layer
        // For the output layer, this is just the difference between
        // output value and label.
        l.lastE = l.lastActivations.Subtract(label)
    } else {
        // Formula for the propagated error at node j of a hidden layer,
        // summing over the nodes jn of the next layer:
        // ∑_jn ( 2(a_jn - y_jn) * g'(z_jn) * w_jn,j )

        // Compute an error for each node in this hidden layer by summing
        // the losses attributed to it from the next layer in the network.
        l.lastE = make(lin.Vector, len(l.lastE))
        for j := range l.weights {
            for jn := range l.next.lastL {
                // Add the loss from node jn in the next layer
                // that came from node j in this layer.
                l.lastE[j] += l.next.lastL[jn][j]
            }
        }
    }
    // Derivative of the squared error w.r.t. activation is applied to the
    // vector of computed errors for each node in this layer.
    dLdA := l.lastE.Scalar(2)

    // ∂a/∂z, derivative of the activation w.r.t. the weighted input:
    // g'( z_j^(L) )
    // We apply the derivative of the activation function, specified at
    // network creation time, to the vector of "Z" values for each node
    // captured during forward propagation.
    dAdZ := l.lastZ.Apply(l.ActivationFunctionDeriv)

    // Capture the per-edge loss contributions for use when backprop moves on
    // to the previous layer; they feed into that layer's lastE calculation
    // above, supplying its first derivative term.
    for j := range l.weights {
        l.lastL[j] = l.weights[j].Scalar(l.lastE[j])
    }

    // Iterate over each weight for node "j" in this layer and "k" in the
    // previous layer and update it according to the computed derivatives.
    for j := range l.weights {
        for k := range l.weights[j] {
            // ∂z/∂w, derivative of the weighted input w.r.t. the weight:
            // a_k^(L-1)
            // This is simply the previous layer's last activation for
            // node k.
            dZdW := l.prev.lastActivations[k]

            // Total derivative, via chain rule ∂L/∂w,
            // derivative of loss w.r.t. weight
            dLdW := dLdA[j] * dAdZ[j] * dZdW

            // Update the weight according to the learning rate.
            l.weights[j][k] -= dLdW * l.nn.LearningRate
        }
    }

    // Calculate bias updates along the gradient of the loss w.r.t. inputs.
    // ∂L/∂z = ∂L/∂a * ∂a/∂z
    biasUpdate := dLdA.ElementwiseProduct(dAdZ)
    // Update the bias according to the learning rate.
    l.biases = l.biases.Subtract(biasUpdate.Scalar(l.nn.LearningRate))
}
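
// Putting the derivative terms above together, the updates applied in this
// layer for node j and previous-layer node k are (with η the learning rate):
//
//    w_jk -= η * 2 * e_j * g'(z_j) * a_k^(L-1)
//    b_j  -= η * 2 * e_j * g'(z_j)
//
// where e_j is lastE[j]: the difference (a_j - y_j) at the output layer, or
// the summed per-edge losses propagated from the next layer for hidden
// layers.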