Neural Net Example

Partially based on the Stanford CS231n neural network case study.

Setup

In [7]:
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

Create data set

In [8]:
np.random.seed(0)

n = 2 # dimensionality
points_per_class = 100 # number of points per class
num_classes = 3 # number of classes

m = points_per_class*num_classes

X = np.zeros((m,n))
y = np.zeros(m, dtype='uint8')

for j in range(num_classes):
    
    inds = range(points_per_class*j, points_per_class*(j+1))

    # Generate radius and angle for each point
    r = np.linspace(0.0, 1, points_per_class) # radius
    t = np.linspace(j*4,(j+1)*4,points_per_class) + np.random.randn(points_per_class)*0.2 # theta

    X[inds] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[inds] = j  # class label
    
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])
Out[8]:
(-1, 1)

Plotting setup

In [9]:
h = 0.05
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

X_test = np.c_[xx.ravel(), yy.ravel()]

def plot_model(scores):
    # Put the result into a color plot
    Z = scores.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

Helper Functions for Non-Linearities

First, we set up the non-linear activation functions we may use in the neural network, together with their derivatives.

In [4]:
# ReLU: "rectified linear unit" nonlinearity
def relu(z):
    return np.maximum(0, z)

# Derivative of relu wrt its input (for backprop)
def relu_prime(z):
    d = np.zeros_like(z)
    d[z > 0] = 1.
    return d

# Logistic function (aka "sigmoid") nonlinearity
def logistic(z):
    return 1./(1+np.exp(-z))

# Derivative of logistic function wrt its input
def logistic_prime(z):
    p = logistic(z)
    return p*(1-p)
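
As a quick sanity check (added, not in the original notebook), we can compare these analytic derivatives against centered finite-difference approximations at a few arbitrary test points; we avoid z = 0, where the ReLU is not differentiable.

In [ ]:
# Added sanity check: compare analytic derivatives to finite differences
eps = 1e-6
z_test = np.array([-2.0, -0.5, 0.3, 1.7])   # arbitrary points, avoiding z = 0 for relu

relu_fd     = (relu(z_test + eps) - relu(z_test - eps)) / (2*eps)
logistic_fd = (logistic(z_test + eps) - logistic(z_test - eps)) / (2*eps)

print(np.max(np.abs(relu_fd - relu_prime(z_test))))          # should be tiny
print(np.max(np.abs(logistic_fd - logistic_prime(z_test))))  # should be tiny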

Train a 2-Layer Neural Network Using Autograd

Here we will show a complete example of training a feed-forward neural network with one hidden layer using autograd. First, we need to define a multiclass loss function for our model.

Cross-Entropy Loss Function

We would like a multiclass loss function analogous to log loss for logistic regression. First, suppose we are operating on a single input $x$ with class $y$, and we have computed class scores $s_1, \ldots, s_c$ for this example, one for each of the $c$ classes. We first transform these scores to probabilities by the following trick, known as the "softmax" transformation:

$$p_k = \frac{\exp(s_k)}{\sum_{k'} \exp(s_{k'})}$$

We have first exponentiated the scores to make them positive, and then normalized them to sum to one.
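
In code, this transformation might look like the sketch below (an added illustration, not part of the original notebook). Subtracting the maximum score in each row before exponentiating is a common trick to avoid numerical overflow; it does not change the resulting probabilities.

In [ ]:
def softmax(scores):
    # scores: array of shape (m, num_classes); returns row-wise probabilities
    shifted = scores - np.max(scores, axis=1, keepdims=True)   # numerical stability
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)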

Then, the appropriate generalization of log loss to multiple classes is:

$$L = -\sum_k \mathbb{I}\{y = k\} \log p_k$$

Here the indicator function $\mathbb{I}\{y = k\}$ is equal to one when the true class $y$ is equal to $k$ and zero otherwise. Note that, like log loss, this simply picks out the negative log predicted probability for the correct class. This loss function is usually called the "cross-entropy" loss function, for reasons we will not dive into.

This is the loss on a single training example. Our overall loss function is the average of the cross-entropy loss over all training examples:

$$L = \frac{1}{m} \sum_{i=1}^{m} \left( -\sum_k \mathbb{I}\{y^{(i)} = k\} \log p_k^{(i)} \right)$$
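
As a small made-up example (added for illustration), the snippet below computes this average loss for two training examples with three classes; note that each term only involves the predicted probability of the true class.

In [ ]:
# Hypothetical example: 2 training examples, 3 classes
probs_example = np.array([[0.7, 0.2, 0.1],     # predicted probabilities, example 1 (true class 0)
                          [0.1, 0.3, 0.6]])    # predicted probabilities, example 2 (true class 2)
y_example = np.array([0, 2])

correct_class_probs = probs_example[range(2), y_example]   # [0.7, 0.6]
loss_example = -np.mean(np.log(correct_class_probs))       # approximately 0.434
print(loss_example)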

To train a model with automatic differentiation, all we need to do is write a routine to compute the loss, and then we can automatically compute the derivatives of the loss function with respect to parameters for gradient descent. Here is a complete example.
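
Before the full example, here is a minimal illustration (added, not from the original notebook) of how `autograd.grad` works: it takes a Python function that returns a scalar and returns a new function that computes the gradient with respect to the first argument.

In [ ]:
import autograd.numpy as anp   # thinly wrapped numpy
from autograd import grad

def f(w):
    return anp.sum(w**2)       # simple scalar-valued function of an array

df = grad(f)                   # df(w) computes the gradient of f at w
print(df(anp.array([1.0, -2.0, 3.0])))   # expected: [ 2. -4.  6.]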

In [10]:
import autograd.numpy as np  # Thinly wrapped version of numpy
from autograd import grad

# Initialize parameters randomly
h  = 100 # size of hidden layer
W1 = 0.01 * np.random.randn(n,h)
b1 = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,num_classes)
b2 = np.zeros((1,num_classes))

# Select hyperparameters
iters      = 10000
step_size  = 1e-0
lambda_val = 1e-3 # regularization strength

'''
Do entire feed-forward computation and compute loss function
'''
def compute_loss(params):
    W1, b1, W2, b2 = params
    
    # Compute scores
    hidden = relu(np.dot(X, W1) + b1)
    scores = np.dot(hidden, W2) + b2
    
    # Compute probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # Compute cross-entropy loss    
    logprob_correct_class = -np.log(probs[range(m),y])
    data_loss = np.sum(logprob_correct_class)/m    # cross-entropy
    
    # Compute regularization loss
    reg_loss = 0.5 * lambda_val * (np.sum(W1*W1) + np.sum(W2*W2))
    
    return data_loss + reg_loss


# gradient is a function that computes the gradient of the loss with respect to the parameters
gradient = grad(compute_loss)

# Gradient descent loop
for i in range(iters):
  
    # Print diagnostic
    loss = compute_loss((W1, b1, W2, b2))    
  
    if i % 1000 == 0:
        print "iteration %d: loss %f" % (i, loss)
        
    dW1, db1, dW2, db2 = gradient((W1, b1, W2, b2))
    
    # perform a parameter update
    W1 += -step_size * dW1
    b1 += -step_size * db1
    W2 += -step_size * dW2
    b2 += -step_size * db2

def predict(X):
    hidden = relu(np.dot(X, W1) + b1)
    scores = np.dot(hidden, W2) + b2
    pred = np.argmax(scores, axis=1)
    return pred

plot_model(predict(X_test))
iteration 0: loss 1.098765
iteration 1000: loss 0.292278
iteration 2000: loss 0.256588
iteration 3000: loss 0.248217
iteration 4000: loss 0.246182
iteration 5000: loss 0.245667
iteration 6000: loss 0.245437
iteration 7000: loss 0.245281
iteration 8000: loss 0.245177
iteration 9000: loss 0.245094

Train a 2-Layer Neural Network Using Backprop

Now, we will do the same thing, but code the backpropagation ourselves. First, we need the derivative of the loss with respect to the class scores.

Derivative of Log-Loss with Respect to Scores

It is possible to show, using calculations similar to the derivation of the gradient of log loss, that:

$$\frac{\partial L}{\partial s_k} = \begin{cases} p_k & \text{if } y \neq k \\ p_k - 1 & \text{if } y = k \end{cases}$$

Computing these derivatives is the first step of backpropagation.
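
As a quick check (added, not in the original notebook), we can verify this formula numerically on a tiny example by comparing it with a centered finite-difference approximation of the loss with respect to each score.

In [ ]:
# Hypothetical check of dL/ds_k = p_k - 1{y = k} on a single example
s = np.array([[1.0, 2.0, 0.5]])   # scores for one example, 3 classes
y_true = 1                        # true class

def ce_loss(s):
    p = np.exp(s) / np.sum(np.exp(s))
    return -np.log(p[0, y_true])

p = np.exp(s) / np.sum(np.exp(s))
analytic = p.copy()
analytic[0, y_true] -= 1          # p_k - 1{y = k}

eps = 1e-6
numeric = np.zeros_like(s)
for k in range(3):
    s_plus, s_minus = s.copy(), s.copy()
    s_plus[0, k]  += eps
    s_minus[0, k] -= eps
    numeric[0, k] = (ce_loss(s_plus) - ce_loss(s_minus)) / (2*eps)

print(np.max(np.abs(analytic - numeric)))   # should be very small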

Backprop through One Layer Using Matrix Multiplication

Our feed-forward computation is structured very efficiently through matrix multiplication. For example, one layer without any non-linearity looks like:

S = np.dot(X, W) + b

Here

  • $X \in \mathbb{R}^{m \times n}$ is a data matrix (with feature vectors in rows),
  • $W \in \mathbb{R}^{n \times c}$ is a matrix of weight vectors (one in each column),
  • $b \in \mathbb{R}^{1 \times c}$ is a row vector containing the biases/intercepts for each class.

We have imagined that this is the first layer, so the inputs are the original inputs X. However, the same math holds true if we replace X by a matrix H where each row contains the outputs of the previous layer for a single input feature vector. Observe that we are simultaneously conducting feedforward for the entire batch of inputs represented by the rows of X. This makes it extremely efficient. Also, note that broadcasting is happening here with b, so the proper way to write this algebraically is:

$$S = XW + \mathbf{1}b$$

Here, $\mathbf{1} \in \mathbb{R}^{m}$ is a column vector of ones, so that the product $\mathbf{1}b$ is an $m \times c$ matrix (just like $XW$) that contains a copy of $b$ in each row. Now, suppose we have a loss function $L$ that depends on $S$, and we have already computed the partial derivative $\frac{\partial L}{\partial S}$, which has the same dimensions as $S$ ($m \times c$). Then, the backpropagated partial derivatives with respect to $X$, $W$, and $b$ are as follows:

$$\frac{\partial L}{\partial W} = X^T \frac{\partial L}{\partial S}, \qquad \frac{\partial L}{\partial X} = \frac{\partial L}{\partial S} W^T, \qquad \frac{\partial L}{\partial b} = \mathbf{1}^T \frac{\partial L}{\partial S}$$

These follow from standard identities for derivatives with respect to matrices and vectors. Note that the left-multiplication $\mathbf{1}^T \frac{\partial L}{\partial S}$ by the ones vector simply sums along the columns of $\frac{\partial L}{\partial S}$. Therefore, in code we would write:

# dS = array holding dL/dS
dW = np.dot(X.T,  dS)       # dL/dW
dX = np.dot( dS, W.T)       # dL/dX
db = np.sum( dS, axis=0 )   # dL/db
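
A quick way to convince yourself of these identities is to check that the shapes line up (a small added check with arbitrary dimensions):

In [ ]:
# Added shape check with arbitrary small dimensions
m_, n_, c_ = 5, 3, 4
X_  = np.random.randn(m_, n_)
W_  = np.random.randn(n_, c_)
dS_ = np.random.randn(m_, c_)        # stands in for dL/dS

print(np.dot(X_.T, dS_).shape)       # (3, 4) -- same shape as W
print(np.dot(dS_, W_.T).shape)       # (5, 3) -- same shape as X
print(np.sum(dS_, axis=0).shape)     # (4,)   -- same shape as b (up to broadcasting)
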
In [ ]:
# Initialize parameters randomly

h  = 100 # size of hidden layer
W1 = 0.01 * np.random.randn(n,h)
b1 = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,num_classes)
b2 = np.zeros((1,num_classes))

# Select hyperparameters
iters      = 10000
step_size  = 1e-0
lambda_val = 1e-3 # regularization strength

# Select nonlinearity
g       = relu 
g_prime = relu_prime

#g = logistic
#g_prime = logistic_prime

# Gradient descent loop
for i in range(iters):
  
    '''
    FORWARD PROPAGATION
    ''' 

    # Compute class scores
    z1           = np.dot(X, W1) + b1              # pre-activation, shape (m, h)
    hidden_layer = g(z1)                           # shape (m, h)
    scores       = np.dot(hidden_layer, W2) + b2   # shape (m, num_classes)

    # Compute class probabilities
    exp_scores = np.exp(scores)                    # shape (m, num_classes)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    
    # Compute the loss function
    logprob_correct_class = -np.log(probs[range(m),y])
    data_loss = np.sum(logprob_correct_class)/m                    # cross-entropy
    reg_loss = 0.5 * lambda_val * (np.sum(W1*W1) + np.sum(W2*W2))  # regularization
    loss = data_loss + reg_loss                                    # total          
    
    # Print diagnostic
    if i % 1000 == 0:
        print "iteration %d: loss %f" % (i, loss)

        
    ''' 
    BACKWARD PROPAGATION
    '''

    # Compute gradient of cross-entropy wrt class scores
    dscores = probs
    dscores[range(m),y] -= 1
    dscores /= m

    # Now backpropagate to get gradient of cross-entropy wrt parameters (W2,b2)
    # and hidden layer outputs 
    dW2     = np.dot(hidden_layer.T, dscores)
    db2     = np.sum(dscores, axis=0)
    dhidden = np.dot(dscores, W2.T)
    
    # Backprop through the nonlinearity (g_prime is applied to the pre-activation z1)
    dhidden = dhidden * g_prime(z1)
    
    # Backprop to (W1,b1)
    dW1 = np.dot(X.T, dhidden)
    db1 = np.sum(dhidden, axis=0)

    # Add regularization gradient contribution
    dW2 += lambda_val * W2
    dW1 += lambda_val * W1
    
    '''
    UPDATE PARAMETERS
    '''
    
    # perform a parameter update
    W1 += -step_size * dW1
    b1 += -step_size * db1
    W2 += -step_size * dW2
    b2 += -step_size * db2

def predict(X):
    hidden_layer = g(np.dot(X, W1) + b1)
    scores = np.dot(hidden_layer, W2) + b2
    pred = np.argmax(scores, axis=1)
    return pred

plot_model(predict(X_test))
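
As with the logistic regression cell at the end of this notebook, we could also report the training accuracy of this 2-layer network (a small added check):

In [ ]:
# Added: evaluate training set accuracy of the 2-layer network
predicted_class = predict(X)
print('training accuracy: %.2f' % (np.mean(predicted_class == y)))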

Train Multi-Class Logistic Regression Using Backprop

This is a simpler model than the one above. You may wish to inspect this first if working through backprop. It is a one-layer neural network, or multi-class logistic regression.

In [ ]:
# Initialize parameters randomly
W = 0.01 * np.random.randn(n, num_classes)
b = np.zeros((1,num_classes))

# Gradient descent loop
iters = 200
step_size  = 1e-0
lambda_val = 1e-3 # regularization strength

for i in range(iters):

    '''
    FORWARD PROPAGATION
    '''
    
    # Compute class probabilities
    scores     = np.dot(X, W) + b     # shape (m, num_classes)
    exp_scores = np.exp(scores)       # shape (m, num_classes)
    probs      = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # shape (m, num_classes)

    # Compute the loss function
    logprob_correct_class = -np.log(probs[range(m),y])
    data_loss = np.sum(logprob_correct_class)/m    # cross-entropy
    reg_loss = 0.5 * lambda_val * np.sum(W*W)      # regularization
    loss = data_loss + reg_loss                    # total          
    
    # Print diagnostic
    if i % 10 == 0:
        print "iteration %d: loss %f" % (i, loss)

        
    '''
    BACKWARD PROPAGATION
    '''

    # Compute gradient of cross-entropy wrt class scores
    dscores = probs
    dscores[range(m),y] -= 1
    dscores /= m                 # shape (m, num_classes)

    # Now backpropagate to get gradient of cross-entropy wrt parameters (W,b)
    dW = np.dot(X.T, dscores)
    db = np.sum(dscores, axis=0, keepdims=True)

    # Add gradient of regularization term
    dW += lambda_val * W 


    '''
    UPDATE PARAMETERS
    '''

    W += -step_size * dW
    b += -step_size * db

# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print('training accuracy: %.2f' % (np.mean(predicted_class == y)))

scores_test = np.dot(X_test, W) + b
pred = np.argmax(scores_test, axis=1)
plot_model(pred)