# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
np.random.seed(0)
n = 2 # dimensionality
points_per_class = 100 # number of points per class
num_classes = 3 # number of classes
m = points_per_class*num_classes
X = np.zeros((m,n))
y = np.zeros(m, dtype='uint8')
for j in range(num_classes):
inds = range(points_per_class*j, points_per_class*(j+1))
# Generate radius and angle for each point
r = np.linspace(0.0, 1, points_per_class) # radius
t = np.linspace(j*4,(j+1)*4,points_per_class) + np.random.randn(points_per_class)*0.2 # theta
X[inds] = np.c_[r*np.sin(t), r*np.cos(t)]
y[inds] = j # class label
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])
h = 0.05
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
X_test = np.c_[xx.ravel(), yy.ravel()]
def plot_model(scores):
# Put the result into a color plot
Z = scores.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('x1')
plt.ylabel('x2')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
First, we set up some nonlinear (activation) functions that we may use in the neural network, together with their derivatives.
# ReLU: "rectified linear unit" nonlinearity
def relu(z):
return np.maximum(0, z)
# Derivative of relu wrt its input (for backprop)
def relu_prime(z):
d = np.zeros_like(z)
d[z > 0] = 1.
return d
# Logistic function (aka "sigmoid") nonlinearity
def logistic(z):
return 1./(1+np.exp(-z))
# Derivative of logistic function wrt its input
def logistic_prime(z):
p = logistic(z)
return p*(1-p)
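As a quick sanity check (an illustrative addition, not part of the original notebook), we can compare these analytic derivatives against a centered finite-difference approximation at a few made-up test points:

```python
# Compare analytic derivatives to a centered finite-difference approximation
def numerical_derivative(f, z, eps=1e-6):
    return (f(z + eps) - f(z - eps)) / (2 * eps)

z_test = np.array([-2.0, -0.5, 0.3, 1.7])   # avoid z = 0, where relu is not differentiable
print(np.allclose(relu_prime(z_test), numerical_derivative(relu, z_test)))          # expect True
print(np.allclose(logistic_prime(z_test), numerical_derivative(logistic, z_test)))  # expect True
```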
Here we will show a complete example of training a feed-forward neural network with one hidden layer using autograd. First, we need to define a multiclass loss function for our model.
We would like a multiclass loss function analogous to the log loss for logistic regression. First, suppose we are operating on a single input $x$ with class $y$, and we have computed class scores $s_1, \ldots, s_c$ for this example, one for each of the $c$ classes. We first transform these scores to probabilities by the following trick, known as the "softmax" transformation:
$$p_k = \frac{\exp(s_k)}{\sum_{k'} \exp(s_{k'})}$$

We have first exponentiated the scores to make them positive, and then normalized them to sum to one.
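As a concrete (made-up) example of the softmax transformation, with the usual max-subtraction trick added for numerical stability:

```python
s = np.array([2.0, 1.0, -1.0])     # made-up scores for c = 3 classes
p = np.exp(s - np.max(s))          # subtracting the max does not change the result
p = p / p.sum()
print(p)          # approximately [0.705, 0.259, 0.035]
print(p.sum())    # 1.0
```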
Then, the appropriate generalization of log loss to multiple classes is:
$$L = -\sum_k \mathbb{I}\{y = k\} \log p_k$$

Here the indicator function $\mathbb{I}\{y = k\}$ is equal to one when the true class $y$ is equal to $k$ and zero otherwise. Note that, like log loss, this simply picks out the negative log predicted probability for the correct class. This loss function is usually called the "cross-entropy" loss, for reasons we will not dive into.
This is the loss on a single training example. Our overall loss function is the average of the cross-entropy loss over all training examples:

$$L = \frac{1}{m} \sum_{i=1}^{m} \left( -\sum_k \mathbb{I}\{y^{(i)} = k\} \log p_k^{(i)} \right)$$
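For a small made-up batch, the average cross-entropy simply picks out, for each example, the predicted probability of its true class:

```python
probs_example = np.array([[0.7, 0.2, 0.1],    # made-up predicted probabilities, one row per example
                          [0.1, 0.8, 0.1],
                          [0.3, 0.3, 0.4]])
y_example = np.array([0, 1, 2])               # true class labels
m_example = probs_example.shape[0]
loss_example = -np.mean(np.log(probs_example[range(m_example), y_example]))
print(loss_example)    # average of -log(0.7), -log(0.8), -log(0.4), approximately 0.499
```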
To train a model with automatic differentiation, all we need to do is write a routine to compute the loss, and then we can automatically compute the derivatives of the loss function with respect to parameters for gradient descent. Here is a complete example.
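To make the autograd API concrete first, here is a minimal sketch on a toy function (the function itself is made up purely for illustration): `grad(f)` returns a new function that computes the gradient of `f` with respect to its first argument.

```python
import autograd.numpy as anp   # autograd's thinly wrapped numpy (aliased to avoid shadowing np)
from autograd import grad

def f(w):
    # toy scalar-valued function: f(w) = sum(w**2), so the gradient is 2*w
    return anp.sum(w ** 2)

df = grad(f)
print(df(anp.array([1.0, -2.0, 3.0])))   # [ 2. -4.  6.]
```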
import autograd.numpy as np # Thinly wrapped version of numpy
from autograd import grad
# Initialize parameters randomly
h = 100 # size of hidden layer
W1 = 0.01 * np.random.randn(n,h)
b1 = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,num_classes)
b2 = np.zeros((1,num_classes))
# Select hyperparameters
iters = 10000
step_size = 1e-0
lambda_val = 1e-3 # regularization strength
'''
Do entire feed-forward computation and compute loss function
'''
def compute_loss(params):
W1, b1, W2, b2 = params
# Compute scores
hidden = relu(np.dot(X, W1) + b1)
scores = np.dot(hidden, W2) + b2
# Compute probabilities
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
# Compute cross-entropy loss
logprob_correct_class = -np.log(probs[range(m),y])
data_loss = np.sum(logprob_correct_class)/m # cross-entropy
# Compute regularization loss
reg_loss = 0.5 * lambda_val * (np.sum(W1*W1) + np.sum(W2*W2))
return data_loss + reg_loss
# grad(compute_loss) returns a function that computes the gradient of the loss wrt its argument (the tuple of parameters)
gradient = grad(compute_loss)
# Gradient descent loop
for i in range(iters):
# Print diagnostic
loss = compute_loss((W1, b1, W2, b2))
if i % 1000 == 0:
print "iteration %d: loss %f" % (i, loss)
dW1, db1, dW2, db2 = gradient((W1, b1, W2, b2))
# perform a parameter update
W1 += -step_size * dW1
b1 += -step_size * db1
W2 += -step_size * dW2
b2 += -step_size * db2
def predict(X):
hidden = relu(np.dot(X, W1) + b1)
scores = np.dot(hidden, W2) + b2
pred = np.argmax(scores, axis=1)
return pred
plot_model(predict(X_test))
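Before coding the gradients by hand, a quick spot check (an illustrative addition, not part of the original code) compares one entry of the autograd gradient against a centered finite-difference estimate of the loss:

```python
eps = 1e-5
dW1_auto, _, _, _ = gradient((W1, b1, W2, b2))

W1_plus, W1_minus = W1.copy(), W1.copy()
W1_plus[0, 0] += eps
W1_minus[0, 0] -= eps
numeric = (compute_loss((W1_plus, b1, W2, b2)) - compute_loss((W1_minus, b1, W2, b2))) / (2 * eps)
print(dW1_auto[0, 0], numeric)   # the two estimates should agree closely
```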
Now we will do the same thing, but code the backpropagation ourselves. First, we need the gradient of the cross-entropy loss with respect to the class scores.
It is possible to show, using calculations similar to the derivation of the gradient of log loss, that:
$$\frac{\partial L}{\partial s_k} = \begin{cases} p_k & \text{if } y \neq k \\ p_k - 1 & \text{if } y = k \end{cases}$$

Computing these derivatives is the first step of backpropagation.
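Here is an illustrative check of this formula on made-up scores (an addition, not part of the original code): we compute $p$, subtract one at the true class, and compare against a finite-difference derivative of the loss.

```python
import numpy as onp   # plain NumPy, aliased so we don't disturb the autograd-wrapped np above

def softmax_xent(s, y_true):
    # Cross-entropy loss of a single example with scores s and true class y_true
    p = onp.exp(s - onp.max(s))
    p = p / p.sum()
    return -onp.log(p[y_true])

s = onp.array([1.0, -0.5, 2.0])   # made-up class scores
y_true = 0
p = onp.exp(s - onp.max(s)); p = p / p.sum()
analytic = p.copy()
analytic[y_true] -= 1.0           # dL/ds_k = p_k - I{y = k}

eps = 1e-6
numeric = onp.array([(softmax_xent(s + eps * onp.eye(3)[k], y_true) -
                      softmax_xent(s - eps * onp.eye(3)[k], y_true)) / (2 * eps)
                     for k in range(3)])
print(onp.allclose(analytic, numeric))   # expect True
```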
Our feed-forward computation is structured very efficiently through matrix multiplication. For example, one layer without any non-linearity looks like:
```python
S = np.dot(X, W) + b
```
Here, each row of X is one input feature vector, W is a weight matrix, and b is a row vector of biases.
We have imagined that this is the first layer, so the inputs are the original inputs X. However, the same math holds if we replace X by a matrix H whose rows contain the outputs of the previous layer, one row per input feature vector. Observe that we are simultaneously performing the feed-forward computation for the entire batch of inputs represented by the rows of X, which makes it extremely efficient. Also, note that broadcasting is happening here with b, so the proper way to write this algebraically is:
$$S = XW + \mathbf{1}b$$

Here, $\mathbf{1} \in \mathbb{R}^m$ is a column vector of ones, so that the product $\mathbf{1}b$ is an $m \times c$ matrix (just like $XW$) that contains a copy of $b$ in each row. Now, suppose we have a loss function $L$ that depends on $S$, and we have already computed the partial derivative $\frac{\partial L}{\partial S}$, which has the same dimensions as $S$ ($m \times c$). Then, the backpropagated partial derivatives with respect to $X$, $W$, and $b$ are as follows:
$$\frac{\partial L}{\partial W} = X^T \frac{\partial L}{\partial S}, \qquad
\frac{\partial L}{\partial X} = \frac{\partial L}{\partial S} W^T, \qquad
\frac{\partial L}{\partial b} = \mathbf{1}^T \frac{\partial L}{\partial S}$$

These follow from standard identities for derivatives with respect to matrices and vectors. Note that the left-multiplication $\mathbf{1}^T \frac{\partial L}{\partial S}$ by the ones vector simply sums each column of $\frac{\partial L}{\partial S}$ (i.e., sums over the rows). Therefore, in code we would write:
```python
# dS = array holding dL/dS
dW = np.dot(X.T, dS)     # dL/dW
dX = np.dot(dS, W.T)     # dL/dX
db = np.sum(dS, axis=0)  # dL/db
```
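As a sanity check on these identities, here is an illustrative sketch on a tiny made-up layer; the sizes, random data, and the toy loss L = sum(C * S) are all arbitrary choices introduced only for this check.

```python
import numpy as onp   # plain NumPy, aliased so we don't disturb the autograd-wrapped np above

rng = onp.random.RandomState(1)
m_, n_, c_ = 4, 3, 2
X_ = rng.randn(m_, n_)
W_ = rng.randn(n_, c_)
b_ = rng.randn(1, c_)
C_ = rng.randn(m_, c_)             # fixed coefficients defining the toy loss

def toy_loss(W, b):
    S = onp.dot(X_, W) + b         # the layer: S = XW + 1b (via broadcasting)
    return onp.sum(C_ * S)         # toy loss, chosen so that dL/dS = C_

dS = C_
dW_analytic = onp.dot(X_.T, dS)    # X^T dL/dS
db_analytic = onp.sum(dS, axis=0)  # 1^T dL/dS

eps = 1e-6
W_pert = W_.copy(); W_pert[0, 0] += eps
dW_numeric_00 = (toy_loss(W_pert, b_) - toy_loss(W_, b_)) / eps
print(dW_analytic[0, 0], dW_numeric_00)   # the two estimates should agree closely

b_pert = b_.copy(); b_pert[0, 0] += eps
db_numeric_0 = (toy_loss(W_, b_pert) - toy_loss(W_, b_)) / eps
print(db_analytic[0], db_numeric_0)       # likewise for the bias
```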
# Initialize parameters randomly
h = 100 # size of hidden layer
W1 = 0.01 * np.random.randn(n,h)
b1 = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,num_classes)
b2 = np.zeros((1,num_classes))
# Select hyperparameters
iters = 10000
step_size = 1e-0
lambda_val = 1e-3 # regularization strength
# Select nonlinearity
g = relu
g_prime = relu_prime
#g = logistic
#g_prime = logistic_prime
# Gradient descent loop
for i in range(iters):
'''
FORWARD PROPAGATION
'''
# Compute class scores
hidden_layer = g(np.dot(X, W1) + b1) # shape (m, h)
scores = np.dot(hidden_layer, W2) + b2 # shape (m, num_classes)
# Compute class probabilities
exp_scores = np.exp(scores) # shape (m, num_classes)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
# Compute the loss function
logprob_correct_class = -np.log(probs[range(m),y])
data_loss = np.sum(logprob_correct_class)/m # cross-entropy
reg_loss = 0.5 * lambda_val * (np.sum(W1*W1) + np.sum(W2*W2)) # regularization
loss = data_loss + reg_loss # total
# Print diagnostic
if i % 1000 == 0:
print "iteration %d: loss %f" % (i, loss)
'''
BACKWARD PROPAGATION
'''
# Compute gradient of cross-entropy wrt class scores
dscores = probs
dscores[range(m),y] -= 1
dscores /= m
# Now backpropagate to get gradient of cross-entropy wrt parameters (W2,b2)
# and hidden layer outputs
dW2 = np.dot(hidden_layer.T, dscores)
db2 = np.sum(dscores, axis=0)
dhidden = np.dot(dscores, W2.T)
# Backprop through the nonlinearity
dhidden = dhidden * g_prime(hidden_layer)
# Backprop to (W1,b1)
dW1 = np.dot(X.T, dhidden)
db1 = np.sum(dhidden, axis=0)
# Add regularization gradient contribution
dW2 += lambda_val * W2
dW1 += lambda_val * W1
'''
UPDATE PARAMETERS
'''
# perform a parameter update
W1 += -step_size * dW1
b1 += -step_size * db1
W2 += -step_size * dW2
b2 += -step_size * db2
def predict(X):
hidden_layer = g(np.dot(X, W1) + b1)
scores = np.dot(hidden_layer, W2) + b2
pred = np.argmax(scores, axis=1)
return pred
plot_model(predict(X_test))
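Mirroring the accuracy diagnostic used for the simpler model below, we can also report the training accuracy of this two-layer network (an added line, not in the original code):

```python
print('training accuracy: %.2f' % np.mean(predict(X) == y))
```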
This is a simpler model than the one above: a one-layer neural network, which is just multiclass logistic regression. You may wish to work through this one first when studying backpropagation.
# Initialize parameters randomly
W = 0.01 * np.random.randn(n, num_classes)
b = np.zeros((1,num_classes))
# Gradient descent loop
iters = 200
step_size = 1e-0
lambda_val = 1e-3 # regularization strength
for i in range(iters):
'''
FORWARD PROPAGATION
'''
# Compute class probabilities
scores = np.dot(X, W) + b # shape (m, num_classes)
exp_scores = np.exp(scores) # shape (m, num_classes)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # shape (m, num_classes)
# Compute the loss function
logprob_correct_class = -np.log(probs[range(m),y])
data_loss = np.sum(logprob_correct_class)/m # cross-entropy
reg_loss = 0.5 * lambda_val * np.sum(W*W) # regularization
loss = data_loss + reg_loss # total
# Print diagnostic
if i % 10 == 0:
print "iteration %d: loss %f" % (i, loss)
'''
BACKWARD PROPAGATION
'''
# Compute gradient of cross-entropy wrt class scores
dscores = probs
dscores[range(m),y] -= 1
dscores /= m # shape (m, num_classes)
# Now backpropagate to get gradient of cross-entropy wrt parameters (W,b)
dW = np.dot(X.T, dscores)
db = np.sum(dscores, axis=0, keepdims=True)
# Add gradient of regularization term
dW += lambda_val * W
'''
UPDATE PARAMETERS
'''
W += -step_size * dW
b += -step_size * db
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print('training accuracy: %.2f' % (np.mean(predicted_class == y)))
scores_test = np.dot(X_test, W) + b
pred = np.argmax(scores_test, axis=1)
plot_model(pred)