In [7]:

```
# A bit of setup
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
```

In [8]:

```
np.random.seed(0)
n = 2 # dimensionality
points_per_class = 100 # number of points per class
num_classes = 3 # number of classes
m = points_per_class*num_classes
X = np.zeros((m,n))
y = np.zeros(m, dtype='uint8')
for j in range(num_classes):
inds = range(points_per_class*j, points_per_class*(j+1))
# Generate radius and angle for each point
r = np.linspace(0.0, 1, points_per_class) # radius
t = np.linspace(j*4,(j+1)*4,points_per_class) + np.random.randn(points_per_class)*0.2 # theta
X[inds] = np.c_[r*np.sin(t), r*np.cos(t)]
y[inds] = j # class label
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
plt.xlim([-1,1])
plt.ylim([-1,1])
```

Out[8]:

In [9]:

```
h = 0.05
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
X_test = np.c_[xx.ravel(), yy.ravel()]
def plot_model(scores):
# Put the result into a color plot
Z = scores.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('x1')
plt.ylabel('x2')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
```

First, setup some functions for non-linear functions we may use in the neural network, together with their derivatives.

In [4]:

```
# ReLU: "rectified linear unit" nonlinearity
def relu(z):
return np.maximum(0, z)
# Derivative of relu wrt its input (for backprop)
def relu_prime(z):
d = np.zeros_like(z)
d[z > 0] = 1.
return d
# Logistic function (aka "sigmoid") nonlinearity
def logistic(z):
return 1./(1+np.exp(-z))
# Derivative of logistic function wrt its input
def logistic_prime(z):
p = logistic(z)
return p*(1-p)
```

Here we will show a complete example of training a feed-forward neural network with one hidden layer using autograd. First, we need to define a multiclass loss function for our model.

We would like a multiclass loss function analogous to log loss for logistic regression. First, suppose we are operating on a single input $\mathbf{x}$ with class $y$, and we have computed class scores $s_1, \ldots, s_c$ for this example for each of $c$ classes. We first transform these scores to probabilities by the following trick, known as the "softmax" transformation:

$$ p_k = \frac{\exp(s_k)}{\sum_{k'} \exp(s_{k'})} $$We have first exponentiated the scores to make them positive, and then normalized them to sum to one.

Then, the appropriate generalization of log loss is for multiple classes is:

$$ L = - \sum_k \mathbb{I}\{y = k\} \log p_k $$Here the indicator function $\mathbb{I}\{y = k\}$ is equal to one when the true class $y$ is equal to $k$ and zero otherwise. Note that, like log loss, this simply picks out the negative log predicted probability for the correct class. This loss function is usually called the "cross-entropy" loss function, for reasons we will not dive into.

This is the loss on a single training example. Our overall loss function is an average of the cross-entropy loss over all training example: $$ L = \frac{1}{m} \sum_{i=1}^m \Bigg(\sum_k -\mathbb{I}\{y^{(i)} = k\} \log p^{(i)}_k\Bigg) $$

To train a model with automatic differentiation, all we need to do is write a routine to compute the loss, and then we can automatically compute the derivatives of the loss function with respect to parameters for gradient descent. Here is a complete example.

In [10]:

```
import autograd.numpy as np # Thinly wrapped version of numpy
from autograd import grad
# Initialize parameters randomly
h = 100 # size of hidden layer
W1 = 0.01 * np.random.randn(n,h)
b1 = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,num_classes)
b2 = np.zeros((1,num_classes))
# Select hyperparameters
iters = 10000
step_size = 1e-0
lambda_val = 1e-3 # regularization strength
'''
Do entire feed-forward computation and compute loss function
'''
def compute_loss(params):
W1, b1, W2, b2 = params
# Compute scores
hidden = relu(np.dot(X, W1) + b1)
scores = np.dot(hidden, W2) + b2
# Compute probabilities
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
# Compute cross-entropy loss
logprob_correct_class = -np.log(probs[range(m),y])
data_loss = np.sum(logprob_correct_class)/m # cross-entropy
# Compute regularization loss
reg_loss = 0.5 * lambda_val * (np.sum(W1*W1) + np.sum(W2*W2))
return data_loss + reg_loss
# This is the gradient of the entire feedforward training
gradient = grad(compute_loss)
# Gradient descent loop
for i in range(iters):
# Print diagnostic
loss = compute_loss((W1, b1, W2, b2))
if i % 1000 == 0:
print "iteration %d: loss %f" % (i, loss)
dW1, db1, dW2, db2 = gradient((W1, b1, W2, b2))
# perform a parameter update
W1 += -step_size * dW1
b1 += -step_size * db1
W2 += -step_size * dW2
b2 += -step_size * db2
def predict(X):
hidden = relu(np.dot(X, W1) + b1)
scores = np.dot(hidden, W2) + b2
pred = np.argmax(scores, axis=1)
return pred
plot_model(predict(X_test))
```

Now, we will do the same thing but code the backpropagation ourselves. First, we

It is possible to show using calculations similar to the derivations of the gradient of log-loss that:

$$ \frac{dL}{ds_k} = \begin{cases} p_k & \text{if } y \neq k \\ p_k - 1 & \text{if } y = k \end{cases} $$Computing these derivatives is the first step of backpropagation.

Our feed-forward computation is structured very efficiently through matrix multiplication. For example, one layer without any non-linearity looks like:

```
.python
S = np.dot(X, W) + b
```

Here

- $X \in \mathbb{R}^{m \times n}$ is a data matrix (with feature vectors in rows),
- $W \in \mathbb{R}^{n \times c}$ is a matrix of weight vectors (one in each column),
- $b \in \mathbb{R}^{1 \times c}$ is a row vector containing the biases/intercepts for each class.

We have imagined that this is the first layer, so the inputs are the original inputs $X$. However, the same math holds true if we replace $X$ by a matrix $H$ where each row contains the outputs of the previous layer for a single input feature vector. Observe that we are simultaneously conducting feedforward for the entire batch of inputs represented by the rows of $X$. This makes it extremely efficient. Also, note that broadcasting is happening here with $b$, so the proper way to write this algebraically is:

$$ \newcommand{\del}{\partial} S = X W + \mathbf{1}b $$Here, $\mathbf{1} \in \mathbb{R}^m$ is a vector of ones, so that the product $\mathbf{1}b$ is an $m \times c$ matrix (just like $XW$) that contains a copy of $b$ is each row. Now, suppose we have a loss function $L$ that depends on $S$, and we have already computed the partial derivative $\frac{\del L}{\del S}$, which has the same dimensions as $S$ ($m \times c$). Then, the backpropagated partial derivatives with respect to $X$, $W$, and $b$ are as follows:

$$ \begin{aligned} \frac{\del L}{\del W} &= X^T \frac{\del L}{\del S} \\ \frac{\del L}{\del X} &= \frac{\del L}{\del S} W^T \\ \frac{\del L}{\del b} &= \mathbf{1}^T \frac{\del L}{\del S} \end{aligned} $$These follow from standard identities for derivatives with respect to matrices and vectors. Note that the left-multiplication $\mathbf{1}^T \frac{\del L}{\del S}$ by the ones vector simply adds along columns of $\frac{\del L}{\del S}$. Therefore, in code we would write:

```
.python
# dS = array holding dL/dS
dW = np.dot(X.T, dS) # dL/dW
dX = np.dot( dS, W.T) # dL/dX
db = np.sum( dS, axis=0 ) # dL/db
```

In [ ]:

```
# Initialize parameters randomly
h = 100 # size of hidden layer
W1 = 0.01 * np.random.randn(n,h)
b1 = np.zeros((1,h))
W2 = 0.01 * np.random.randn(h,num_classes)
b2 = np.zeros((1,num_classes))
# Select hyperparameters
iters = 10000
step_size = 1e-0
lambda_val = 1e-3 # regularization strength
# Select nonlinearity
g = relu
g_prime = relu_prime
#g = logistic
#g_prime = logistic_prime
# Gradient descent loop
for i in range(iters):
'''
FORWARD PROPAGATION
'''
# Compute class scores
hidden_layer = g(np.dot(X, W1) + b1) # shape (m, h)
scores = np.dot(hidden_layer, W2) + b2 # shape (m, num_classes)
# Compute class probabilities
exp_scores = np.exp(scores) # shape (m, num_classes)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
# Compute the loss function
logprob_correct_class = -np.log(probs[range(m),y])
data_loss = np.sum(logprob_correct_class)/m # cross-entropy
reg_loss = 0.5 * lambda_val * (np.sum(W1*W1) + np.sum(W2*W2)) # regularization
loss = data_loss + reg_loss # total
# Print diagnostic
if i % 1000 == 0:
print "iteration %d: loss %f" % (i, loss)
'''
BACKWARD PROPAGATION
'''
# Compute gradient of cross-entropy wrt class scores
dscores = probs
dscores[range(m),y] -= 1
dscores /= m
# Now backpropagate to get gradient of cross-entropy wrt parameters (W2,b2)
# and hidden layer outputs
dW2 = np.dot(hidden_layer.T, dscores)
db2 = np.sum(dscores, axis=0)
dhidden = np.dot(dscores, W2.T)
# Backprop through the nonlinearity
dhidden = dhidden * g_prime(hidden_layer)
# Backprop to (W1,b1)
dW1 = np.dot(X.T, dhidden)
db1 = np.sum(dhidden, axis=0)
# Add regularization gradient contribution
dW2 += lambda_val * W2
dW1 += lambda_val * W1
'''
UPDATE PARAMETERS
'''
# perform a parameter update
W1 += -step_size * dW1
b1 += -step_size * db1
W2 += -step_size * dW2
b2 += -step_size * db2
def predict(X):
hidden_layer = g(np.dot(X, W1) + b1)
scores = np.dot(hidden_layer, W2) + b2
pred = np.argmax(scores, axis=1)
return pred
plot_model(predict(X_test))
```

This is a simpler model than the one above. You may wish to inspect this first if working through backprop. It is a one-layer neural network, or multi-class logistic regression.

In [ ]:

```
# Initialize parameters randomly
W = 0.01 * np.random.randn(n, num_classes)
b = np.zeros((1,num_classes))
# Gradient descent loop
iters = 200
step_size = 1e-0
lambda_val = 1e-3 # regularization strength
for i in range(iters):
'''
FORWARD PROPAGATION
'''
# Compute class probabilities
scores = np.dot(X, W) + b # shape (m, num_classes)
exp_scores = np.exp(scores) # shape (m, num_classes)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # shape (m, num_classes)
# Compute the loss function
logprob_correct_class = -np.log(probs[range(m),y])
data_loss = np.sum(logprob_correct_class)/m # cross-entropy
reg_loss = 0.5 * lambda_val * np.sum(W*W) # regularization
loss = data_loss + reg_loss # total
# Print diagnostic
if i % 10 == 0:
print "iteration %d: loss %f" % (i, loss)
'''
BACKWARD PROPAGATION
'''
# Compute gradient of cross-entropy wrt class scores
dscores = probs
dscores[range(m),y] -= 1
dscores /= m # shape (m, num_classes)
# Now backpropate get gradient of cross-entropy wrt parameters (W,b)
dW = np.dot(X.T, dscores)
db = np.sum(dscores, axis=0, keepdims=True)
# Add gradient of regularization term
dW += lambda_val * W
'''
UPDATE PARAMETERS
'''
W += -step_size * dW
b += -step_size * db
# evaluate training set accuracy
scores = np.dot(X, W) + b
predicted_class = np.argmax(scores, axis=1)
print 'training accuracy: %.2f' % (np.mean(predicted_class == y))
scores_test = np.dot(X_test, W) + b
pred = np.argmax(scores_test, axis=1)
plot_model(pred)
```