Universal approximation theorem: A continuous function can be approximated to arbitrary accuracy by a neural network with at least one hidden layer containing a finite number of neurons. The non-linear/activation function can be a sigmoid (fermi) function [Cybenko 1989], or, more generally, any non-polynomial bounded activation function [Leshno 1993, Pinkus 1999].
This multilayer architecture is what gives neural networks the potential of being universal approximators.
More precisely: given a continuous function $y=F(x)$ with $x\in [0,1]^d$, a non-linear bounded activation function $f(z)$, and a chosen accuracy $\epsilon>0$, there exists a one-hidden-layer NN with weights $w\in \mathbb{R}^{d\times n}$ and biases $b\in \mathbb{R}^n$, defining $z_j=\sum_i w_{ij} x_i + b_j$, together with output weights $w^{(2)}$ and biases $b^{(2)}$, such that $|\sum_i w^{(2)}_{ij} f(z_i)+b^{(2)}_j-F_j(x)|<\epsilon$ for all $x\in[0,1]^d$.
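As a minimal numerical sketch of this statement (a scalar-output special case; the weights below are random and untrained, so this only illustrates how such an approximator is evaluated, not that it already approximates anything):
from numpy import *
def fermi(z):
    return 1/(1 + exp(-z))   # bounded non-polynomial activation (sigmoid)
def one_layer_approximator(x, w1, b1, w2, b2, f=fermi):
    # F_approx(x) = sum_j w2_j f( sum_i w1_ij x_i + b1_j ) + b2
    z = x @ w1 + b1          # hidden pre-activations z_j
    return f(z) @ w2 + b2    # weighted sum of hidden activations
# illustrative example: d=2 inputs, n=10 hidden neurons
random.seed(0)
d, n = 2, 10
w1, b1 = random.randn(d, n), random.randn(n)
w2, b2 = random.randn(n), random.randn()
print(one_layer_approximator(array([0.3, 0.7]), w1, b1, w2, b2))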
Conceptually, it is helpful to divide neural networks into four categories:
general purpose neural networks for supervised learning,
neural networks designed specifically for image processing, the most prominent example of this class being Convolutional Neural Networks (CNNs),
neural networks for sequential data such as Recurrent Neural Networks (RNNs), and
neural networks for unsupervised learning such as Deep Boltzmann Machines.
In the natural sciences, DNNs and CNNs have already found numerous applications. In statistical physics, they have been applied to detect phase transitions in 2D Ising and Potts models, in lattice gauge theories, and in different phases of polymers, or to solve the Navier-Stokes equation in weather forecasting. Deep learning has also found interesting applications in quantum physics. Various quantum phase transitions can be detected and studied using DNNs and CNNs, including topological phases and even non-equilibrium many-body localization. Representing quantum states as DNNs and quantum state tomography are among the impressive achievements that reveal the potential of DNNs to facilitate the study of quantum systems.
Figure: Sketch of the neural network. The input layer is on the left ($x_i=a_i^{(0)}$) and the output layer ($a_i^{(L)}$) on the right; the latter is compared through the cost function with the target $t_i$. The layers between $0$ and $L$ are called hidden layers and increase the flexibility of the network. $f(z)$ is the nonlinear activation function.
An artificial neural network (ANN) is a computational model that consists of layers of connected neurons (sometimes called nodes or units).
The equations are sketched in the figure, and in matrix form read: \begin{eqnarray} \textbf{z}^l &=& \textbf{a}^{l-1} \textbf{w}^l + \textbf{b}^l\\ a_i^l &=& f(z_i^l) \end{eqnarray} Here $l$ in $a_i^l,z_i^l$ stands for the layer $l$. Using the input $a^{l-1}$ from layer $l-1$ we get the output $z^l$, which is then passed through a non-linear activation function $f$ to obtain $a^l$. This in turn allows one to calculate the next layer. Note that we used many yet to be determined weights $w$ and biases $b$, which are chosen so that they best fit the known data, i.e., on input $x_i$ they give as good an approximation to the target $t_i$ as possible.
We start with the input $x_i$, which defines the first layer $a^{0}$, and we end with the output layer $a^L$, which delivers the output needed to evaluate the cost function: \begin{eqnarray} a_i^{0}&\equiv& x_i\\ C(\{w,b\})&=&\frac{1}{2}\sum_i (a_i^L-t_i)^2 \end{eqnarray} The target $t$ is the known data we train on, which was called $y$ in linear regression. In the comparison with linear regression, $\widetilde{y}$ corresponds to the output layer $a_i^L$.
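A minimal sketch of these equations in numpy (a generic loop over layers with the quadratic cost; the function names and the choice of the fermi/sigmoid activation are illustrative, the digit-recognition code later in these notes hard-codes the same steps for two layers):
from numpy import *
def fermi(z):
    return 1/(1 + exp(-z))
def feed_forward_generic(x, weights, biases, f=fermi):
    # a^0 = x; then z^l = a^{l-1} w^l + b^l and a^l = f(z^l) for every layer l
    a = x
    for w, b in zip(weights, biases):
        z = a @ w + b
        a = f(z)
    return a                              # output layer a^L
def quadratic_cost(aL, t):
    return 0.5*sum((aL - t)**2)           # C = 1/2 sum_i (a_i^L - t_i)^2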
A NN is supposed to mimic a biological nervous system by letting each neuron interact with other neurons by sending signals in the form of mathematical functions between layers. A wide variety of different ANNs have been developed, but most of them consist of an input layer, an output layer and possibly layers in between, called hidden layers. All layers can contain an arbitrary number of nodes; each connection between two nodes is associated with a weight $w_{ij}$, and each node with a bias $b_i$.
Without the nonlinear activation function the NN would be equivalent to linear regression (convince yourself). The added nonlinearity through the activation function $f$ is thus crucial for the success of NNs. Many choices of activation functions are in use; in these notes we encounter the sigmoid (fermi) function, the ReLU $f(z)=\max(z,0)$, and the softmax used in the output layer below.
As we will show, the OR gate can easily be fit with linear regression; however, the XOR gate cannot, and requires at least one hidden layer.
Figure: OR and XOR gates with a line that can (OR) or cannot (XOR) separate the two output classes.
The OR gate \begin{equation} \begin{array}{c|c|c} x_1 & x_2 & t\\ \hline 0 & 0 & 0\\ 0 & 1 & 1\\ 1 & 0 & 1\\ 1 & 1 & 1 \end{array} \end{equation} and XOR gate \begin{equation} \begin{array}{c|c|c} x_1 & x_2 & t\\ \hline 0 & 0 & 0\\ 0 & 1 & 1\\ 1 & 0 & 1\\ 1 & 1 & 0 \end{array} \end{equation}
Let's try linear regression. The design matrix should contain a constant and the linear terms, i.e., each row is $[1,x_1,x_2]$, which gives
\begin{equation} X=\begin{bmatrix} 1& 0 & 0 \\ 1& 0 & 1 \\ 1& 1 & 0 \\ 1& 1 & 1 \end{bmatrix} \end{equation}and linear regression gives $\widetilde{y} = X \beta = X (X^T X)^{-1} X^T y$.
It is easy to check that for $y^T_{OR}=[0,1,1,1]$ we get $\widetilde{y}=X(X^T X)^{-1} X^T y_{OR}=[1/4,3/4,3/4,5/4]$, while for $y^T_{XOR}=[0,1,1,0]$ we get $\widetilde{y}=X(X^T X)^{-1} X^T y_{XOR}=[1/2,1/2,1/2,1/2]$. If we interpret $\widetilde{y}_i<1/2$ as 0 and $\widetilde{y}_i>1/2$ as 1, we reproduce the OR gate, but clearly fail at XOR.
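This is easy to verify numerically; a short check of the numbers quoted above:
from numpy import *
from numpy.linalg import inv
X = array([[1,0,0],[1,0,1],[1,1,0],[1,1,1]])
H = X @ inv(X.T @ X) @ X.T      # hat matrix X (X^T X)^{-1} X^T
y_or  = array([0,1,1,1])
y_xor = array([0,1,1,0])
print(H @ y_or)    # [0.25 0.75 0.75 1.25] -> thresholding at 1/2 reproduces OR
print(H @ y_xor)   # [0.5  0.5  0.5  0.5 ] -> all on the boundary, XOR fails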
As we will show below, one hidden layer easily yields the XOR gate. A small technicality first: in linear regression we wanted to allow a constant in the fit, hence each row of $X$ started with unity (to allow for the constant $\beta_0$). In ML the constant is always added explicitly as an additional degree of freedom (the bias $b$ in the equations above), hence $X$ does not need the unit column, and each row is just $[x_1,x_2]$. More precisely \begin{equation} X=\begin{bmatrix} 0 & 0 \\ 0 & 1 \\ 1 & 0 \\ 1 & 1 \end{bmatrix} \end{equation} For the activation function $f(z)$ we will choose the ReLU: $f(z)=\max(z,0)$.
We choose two neurons in the hidden layer, hence $\textbf{w}^h$ is a $2\times 2$ matrix and $\textbf{b}^h$ a two-component vector, in terms of which $\textbf{z}^h=\textbf{X}\textbf{w}^h+\textbf{b}^h$, $\textbf{a}^h=f(\textbf{z}^h)$, and the output is $\textbf{y}\equiv \textbf{a}^o=\textbf{a}^{h}\textbf{w}^o+\textbf{b}^o$.
The minimization would give the following weights \begin{eqnarray} && \textbf{w}^h=\begin{bmatrix} 1 & 1 \\ 1 & 1 \end{bmatrix}\\ && \textbf{b}^h=\begin{bmatrix} 0 & -1 \end{bmatrix}\\ && \textbf{w}^o=\begin{bmatrix} 1 \\ -2 \end{bmatrix}\\ && \textbf{b}^o=0 \end{eqnarray}
Which means that \begin{eqnarray} \textbf{z}^h= \begin{bmatrix} 0 & -1 \\ 1 & 0\\ 1 & 0\\ 2 & 1 \end{bmatrix}\\ \textbf{a}^h= \begin{bmatrix} 0 & 0 \\ 1 & 0\\ 1 & 0\\ 2 & 1 \end{bmatrix} \end{eqnarray} and finally \begin{eqnarray} \textbf{a}^h \textbf{w}^o= \begin{bmatrix} 0 \\ 1 \\ 1 \\ 0 \end{bmatrix} \end{eqnarray} which is identical to the target $t$ of the XOR gate, concluding our example.
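The same check in a few lines of numpy (using the weights quoted above):
from numpy import *
def relu(z):
    return maximum(z, 0)
X   = array([[0,0],[0,1],[1,0],[1,1]])
w_h = array([[1,1],[1,1]]); b_h = array([0,-1])
w_o = array([[1],[-2]]);    b_o = 0
a_h = relu(X @ w_h + b_h)            # hidden layer
y   = a_h @ w_o + b_o                # output layer
print(y.ravel())                     # [0 1 1 0] -> the XOR target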
To solve the NN problem we usually distinguish the following steps: 1) The feed-forward stage, in which randomly initialized weights and biases are used to produce an output $\textbf{a}^L$, which enters the cost function and is compared with the target $\textbf{t}$. 2) The back-propagation stage, in which one calculates the gradients with respect to the weights $\textbf{w}$ and biases $\textbf{b}$. Using a minimization algorithm we move towards the local minimum corresponding to the randomly chosen starting weights and biases. 3) We repeat steps (1) and (2) until the error of the cost function is acceptable.
It is convenient to differentiate from the end of the NN towards the start, hence the name back propagation. We start by differentiating the cost function $$C(\{w,b\})=\frac{1}{2}\sum_i (a_i^L-t_i)^2,$$ which gives \begin{eqnarray} \frac{\partial C}{\partial w_{kj}^L}&=&\sum_i (a_i^L-t_i) \frac{\partial a_i^L}{\partial w_{kj}^L}\\ \frac{\partial C}{\partial b_{j}^L}&=&\sum_i (a_i^L-t_i) \frac{\partial a_i^L}{\partial b_{j}^L} \end{eqnarray} Because $a_i^L=f(z_i^L)$, we have \begin{eqnarray} \frac{\partial C}{\partial w_{kj}^L}&=&\sum_i (a_i^L-t_i) f'(z_i^L) \frac{\partial z_i^L}{\partial w_{kj}^L} \\ \frac{\partial C}{\partial b_{j}^L}&=&\sum_i (a_i^L-t_i) f'(z_i^L) \frac{\partial z_i^L}{\partial b_{j}^L} \end{eqnarray} Finally, $z_i^L = \sum_j a_j^{L-1} w_{ji}^L+b_i^L$, hence \begin{eqnarray} &&\frac{\partial z_i^L}{\partial w_{kj}^L}=a_k^{L-1} \delta_{ij}\\ &&\frac{\partial z_i^L}{\partial b_{j}^L} = \delta_{ij} \end{eqnarray} which finally gives \begin{eqnarray} &&\frac{\partial C}{\partial w_{kj}^L}=(a_j^L-t_j) f'(z_j^L) a_k^{L-1} \\ &&\frac{\partial C}{\partial b_{j}^L}=(a_j^L-t_j) f'(z_j^L) \end{eqnarray}
Next we define the quantity $$\delta_j^L\equiv (a_j^L-t_j) f'(z_j^L)$$ in terms of which we can express \begin{eqnarray} &&\frac{\partial C}{\partial w_{kj}^L}=\delta_j^L a_k^{L-1} \\ &&\frac{\partial C}{\partial b_{j}^L}=\delta_j^L \end{eqnarray} Note that $\delta_j^L$ can also be viewed as $$\delta_j^L=\frac{\partial C}{\partial a_j^L}\frac{\partial a_j^L}{\partial z_j^L}=\frac{\partial C}{\partial z_j^L}$$
We then proceed to the previous layer, and obtain \begin{eqnarray} \frac{\partial C}{\partial w_{kj}^{L-1}}=\sum_{i,n} \frac{\partial C}{\partial a_n^{L}}\frac{\partial a_n^L}{\partial z_n^L}\frac{\partial z_n^L}{\partial a_i^{L-1}}\frac{\partial a_i^{L-1}}{\partial z_i^{L-1}} \frac{\partial z_i^{L-1}}{\partial w_{kj}^{L-1}} \end{eqnarray} We then note that $$\frac{\partial C}{\partial a_n^{L}}\frac{\partial a_n^L}{\partial z_n^L}=\delta^L_n$$ and because $z_n^L=\sum_i a_i^{L-1} w^L_{in} + b_n^L$ we have $$\frac{\partial z_n^L}{\partial a_i^{L-1}}=w_{in}^L$$ furthermore $$\frac{\partial a_i^{L-1}}{\partial z_i^{L-1}}=f'(z_i^{L-1})$$ and further $z_i^{L-1}=\sum_k a_k^{L-2} w^{L-1}_{ki} + b_i^{L-1}$ so that $$\frac{\partial z_i^{L-1}}{\partial w_{kj}^{L-1}}=a_k^{L-2}\delta_{ij}$$ Collecting all of this leads to $$ \frac{\partial C}{\partial w_{kj}^{L-1}}=\sum_{i,n}\delta_n^L w_{in}^L f'(z_i^{L-1})\delta_{ij}a_k^{L-2}= \sum_n\delta_n^L w_{jn}^L f'(z_j^{L-1})a_k^{L-2} $$ Now we require that \begin{eqnarray} \frac{\partial C}{\partial w_{kj}^{l}}=\delta_j^l a_k^{l-1} \end{eqnarray} hold for every layer, which gives us the following expression \begin{eqnarray} \delta_j^{L-1}=\sum_n\delta_n^L w_{jn}^L f'(z_j^{L-1}) \end{eqnarray} One can verify that this equation connects every layer with the previous one, i.e., it is valid for every $l$, not just $L-1$. Similarly we can show that the derivative with respect to $b$ has the same form, namely, \begin{eqnarray} \frac{\partial C}{\partial b_{j}^{l}}=\delta_j^l \end{eqnarray}
In conclusion, we just showed that the automatic differentiation in back propagation leads to the following set of equations
\begin{eqnarray} \frac{\partial C}{\partial w_{kj}^{l}}&=&\delta_j^l a_k^{l-1}\\ \frac{\partial C}{\partial b_{j}^{l}}&=&\delta_j^l \end{eqnarray}in which all $\delta_j^l$ can be obtained from the recursion relation \begin{eqnarray} \delta_j^{l}&=&\sum_n\delta_n^{l+1} w_{jn}^{l+1} f'(z_j^{l}) \end{eqnarray} and the starting condition \begin{eqnarray} \delta_j^{L}&=&(a_j^L-t_j) f'(z_j^L) \end{eqnarray}
1) Initialize all variables to be minimized, $\{w,b\}$, and perform the forward pass to compute all $a^l$. 2) With the current values of $\{w,b\}$ and $a^l$ we compute all gradients $\frac{\partial C}{\partial w_{kj}^{l}}$ and $\frac{\partial C}{\partial b_{j}^{l}}$, and using one of the available minimization routines we take a step towards more optimal variables $\{w,b\}$. Usually one uses some type of gradient descent method, as discussed previously, $$ w^{(j+1)} = w^{(j)} - \gamma_j \frac{\partial C}{\partial w^{(j)}} $$ where $j$ stands for the iteration. 3) We repeat (1) and (2) until we reach a local minimum. 4) We change the hyperparameters or the initial conditions to try to find different (better) local minima.
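The recursion for $\delta_j^l$ and the update of step (2) can be summarized in a compact sketch for a generic fully connected network with the quadratic cost and fermi (sigmoid) activations; the function names and the single-sample treatment are illustrative choices, not code used later in these notes:
from numpy import *
def fermi(z):       return 1/(1 + exp(-z))
def fermi_prime(z): s = fermi(z); return s*(1 - s)
def backprop_step(x, t, weights, biases, eta=0.1):
    # one forward + backward pass and one gradient-descent step for a single input x
    a, zs, activations = x, [], [x]
    for w, b in zip(weights, biases):            # forward pass, storing z^l and a^l
        z = a @ w + b
        a = fermi(z)
        zs.append(z); activations.append(a)
    delta = (activations[-1] - t) * fermi_prime(zs[-1])   # delta^L = (a^L - t) f'(z^L)
    for l in range(len(weights)-1, -1, -1):      # walk backwards through the layers
        dW = outer(activations[l], delta)        # dC/dw^l_{kj} = a^{l-1}_k delta^l_j
        dB = delta                               # dC/db^l_j    = delta^l_j
        if l > 0:                                # delta^{l-1}_j = sum_n delta^l_n w^l_{jn} f'(z^{l-1}_j)
            delta = (weights[l] @ delta) * fermi_prime(zs[l-1])
        weights[l] -= eta * dW                   # w <- w - eta dC/dw
        biases[l]  -= eta * dB                   # b <- b - eta dC/db
    return weights, biases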
We will develop NN code to recognize handwritten digits. We use the small handwritten-digits dataset included in sklearn (8x8-pixel images, a scaled-down cousin of MNIST).
from numpy import *
import matplotlib.pyplot as plt
from sklearn import datasets
# ensure the same random numbers appear every time
random.seed(0)
# display images in notebook
%matplotlib inline
plt.rcParams['figure.figsize'] = (12,12)
# load the handwritten-digits dataset bundled with sklearn (8x8-pixel images)
digits = datasets.load_digits()
# define inputs and labels
inputs = digits.images # x_i
labels = digits.target # t_i
print('inputs = (n_inputs, pixel_width, pixel_height) =',inputs.shape)
print('labels = (n_inputs) =',labels.shape)
inputs = (n_inputs, pixel_width, pixel_height) = (1797, 8, 8) labels = (n_inputs) = (1797,)
Here we reshape the images so that the design matrix has 64 features (pixels) per image. We also display a few randomly chosen examples.
# flatten the image
# the value -1 means dimension is inferred from the remaining dimensions: 8x8 = 64
n_inputs,nx,ny = inputs.shape
inputs = inputs.reshape(n_inputs, nx*ny)
print('X = (n_inputs, n_features) =', inputs.shape)
# choose some random images to display
random_indices = random.choice(range(n_inputs), size=5)
for i,image in enumerate(digits.images[random_indices]):
plt.subplot(1, 5, i+1)
plt.axis('off')
plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
plt.title("Label: %d" % digits.target[random_indices[i]])
plt.show()
X = (n_inputs, n_features) = (1797, 64)
First we split the data into 80% training and 20% testing data. Which data ends up in the training and which in the testing set should be chosen at random.
from sklearn.model_selection import train_test_split
# one-liner from scikit-learn library
train_size = 0.8
X_train, X_test, Y_train, Y_test = train_test_split(inputs, labels, train_size=train_size,test_size=1-train_size)
# equivalently in numpy
def train_test_split_numpy(inputs, labels, train_size):
n_inputs = len(inputs)
    # one common permutation keeps images and labels aligned
    shuffled_indices = random.permutation(n_inputs)
    inputs_shuffled = inputs[shuffled_indices]
    labels_shuffled = labels[shuffled_indices]
train_end = int(n_inputs*train_size)
X_train, X_test = inputs_shuffled[:train_end], inputs_shuffled[train_end:]
Y_train, Y_test = labels_shuffled[:train_end], labels_shuffled[train_end:]
return X_train, X_test, Y_train, Y_test
#X_train, X_test, Y_train, Y_test = train_test_split_numpy(inputs, labels, train_size)
print("Number of training images: " + str(len(X_train)))
print("Number of test images: " + str(len(X_test)))
Number of training images: 1437 Number of test images: 360
The input and output data have dimensions \begin{eqnarray} && X\in [n\times 64]\\ &&t \in [n]. \end{eqnarray}
It is easier to change the output vector to the so-called one-hot representation, in which $y=0$ translates into $y=[1,0,0,0,0,0,0,0,0,0]$ and $y=2$ into $y=[0,0,1,0,0,0,0,0,0,0]$, etc.
In this way we can use the equations for a binary choice for each of the 10 categories. The output array Y_onehot is going to be of dimension $n\times 10$, rather than $n$.
The function to_categorical_numpy implements the one-hot representation.
# to categorical turns our integer vector into a onehot representation
def to_categorical_numpy(integer_vector): # integer_vector[n_inputs] contains number between 0...9
n_inputs = len(integer_vector) # inputs
    n_categories = max(integer_vector) + 1 # number of categories (10 for the digits)
onehot_vector = zeros((n_inputs, n_categories),dtype=int)
onehot_vector[range(n_inputs), integer_vector] = 1
return onehot_vector
integer_vector=[3,5,4,8,0]
to_categorical_numpy(integer_vector)
array([[0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1], [1, 0, 0, 0, 0, 0, 0, 0, 0]])
Y_train_onehot, Y_test_onehot = to_categorical_numpy(Y_train), to_categorical_numpy(Y_test)
Figure: NN for recognizing digits.
As said before, the input and the one-hot output data have dimensions \begin{eqnarray} && X\in [n\times 64]\\ && Y \in [n\times 10]. \end{eqnarray}
We will use 50 neurons in the hidden layer, and we have 10 categories, hence our weights will have dimensions: \begin{eqnarray} &&w^{(1)}\in [64\times 50]\\ &&b^{(1)}\in [50]\\ &&w^{(2)}\in [50\times 10]\\ &&b^{(2)}\in [10]\\ &&a^{(2)} \in [n\times 10] \end{eqnarray}
The equations for our NN models are: \begin{eqnarray} && z^{(1)} = X w^{(1)} + b^{(1)} \in [n\times 50]\\ && a^{(1)} = f^{(1)}(z^{(1)}) \in [n\times 50]\\ && z^{(2)} = a^{(1)} w^{(2)} + b^{(2)} \in [n\times 10]\\ && a^{(2)} = f^{(2)}(z^{(2)}) \in [n\times 10] \end{eqnarray}
where $$f^{(1)}(z)=1/(\exp(-z)+1)$$ and $$f^{(2)}(z_c)=\frac{\exp{z_c}}{\sum_{c'=0}^9 \exp{z_{c'}}}$$
Note that the output layer uses the softmax activation function, because we have a multi-class output. The cost function in this case is the cross entropy; we want to maximize the likelihood, i.e., the probability that the model gets all the answers correct, which is given by
\begin{eqnarray}
P({\cal D}|\{w,b\}) = \prod_{i=1}^n \prod_{c=0}^9 P(y_{ic}=1)^{y_{ic}} \left(1-P(y_{ic}=1)\right)^{1-y_{ic}}
\end{eqnarray}
here $y_{ic}$ can only take the values 0 or 1, $c$ runs from 0 to 9, and $i$ runs over all $n$ input data. Here $\cal D$ is the collection of all input data. This is facilitated by the one-hot representation implemented above, in which $y\in[0,1,...,9]$ is replaced by $y_{ic}$.
To maximize $P({\cal D}|\{w,b\})$ we minimize $C(\{w,b\})=-\log(P({\cal D}|\{w,b\}))$. The cost function therefore is \begin{eqnarray} C(\{w,b\})=-\sum_{i,c} y_{ic} \log(P_{ic})+(1-y_{ic})\log(1-P_{ic}) \end{eqnarray}
Note that $a_{ic}^{(2)}\equiv P_{ic}$ is the result of our NN.
Later we will also regularize the cost function with the $L_2$ metric in the following way: \begin{eqnarray} C(\{w,b\})=-\sum_{i,c} y_{ic} \log(P_{ic})+(1-y_{ic})\log(1-P_{ic}) + \frac{\lambda}{2} \sum_{hc} (w_{hc}^{(2)})^2 +\frac{\lambda}{2}\sum_{ph}(w_{ph}^{(1)})^2 \end{eqnarray}
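The cost itself (as opposed to its gradients) is not evaluated anywhere in the code below, but a direct sketch of $C(\{w,b\})$ with the $L_2$ term, written in terms of the probabilities $P_{ic}=a^{(2)}_{ic}$ and the one-hot targets $y_{ic}$, could look as follows (the small constant eps is an illustrative safeguard against taking $\log 0$):
from numpy import *
def cross_entropy_cost(P, Y_onehot, W_1, W_2, lmbd=0.0, eps=1e-12):
    # C = -sum_{i,c} [ y_ic log P_ic + (1-y_ic) log(1-P_ic) ] + (lambda/2)(||W_1||^2 + ||W_2||^2)
    P = clip(P, eps, 1-eps)
    C = -sum(Y_onehot*log(P) + (1-Y_onehot)*log(1-P))
    return C + 0.5*lmbd*(sum(W_1**2) + sum(W_2**2))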
First we create a random configuration of weights.
# building our neural network
n_inputs, n_features = X_train.shape
n_hidden_neurons = 50
n_categories = 10
# we make the weights normally distributed using numpy.random.randn
def GiveStartingRandomWeights():
random.seed(0)
# weights and bias in the hidden layer
W_1 = random.randn(n_features, n_hidden_neurons)
b_1 = zeros(n_hidden_neurons) + 0.01
# weights and bias in the output layer
W_2 = random.randn(n_hidden_neurons, n_categories)
b_2 = zeros(n_categories) + 0.01
return (W_1, b_1, W_2, b_2)
Next we evaluate the NN by the feed-forward algorithm and check the accuracy of its predictions.
def mfermi(x):
return 1/(1 + exp(-x))
def feed_forward(X, all_weights):
"identical to feed_forward, except we also return a_1, i.e, hidden layer a"
W_1, b_1, W_2, b_2 = all_weights
# weighted sum of inputs to the hidden layer
z_1 = matmul(X, W_1) + b_1
# activation in the hidden layer
a_1 = mfermi(z_1)
# weighted sum of inputs to the output layer
z_2 = matmul(a_1, W_2) + b_2
# softmax output
# axis 0 holds each input and axis 1 the probabilities of each category
exp_term = exp(z_2)
probabilities = exp_term/sum(exp_term, axis=1, keepdims=True)
# for backpropagation need activations in hidden and output layers
return a_1, probabilities
# we obtain a prediction by taking the class with the highest likelihood
def predict(X, all_weights):
a_1, probabilities = feed_forward(X, all_weights)
return (probabilities,argmax(probabilities, axis=1))
We check the prediction of the NN for the first data point (image 0). The weights are not yet optimized.
all_weights = GiveStartingRandomWeights()
(probabilities,predictions) = predict(X_train, all_weights)
print("probabilities = (n_inputs, n_categories) = " + str(probabilities.shape))
print("probability that image 0 is in category 0,1,2,...,9 = \n" + str(probabilities[0]))
print("probabilities sum up to: " + str(probabilities[0].sum()))
print()
print("predictions = (n_inputs) = " + str(predictions.shape))
print("prediction for image 0: " + str(predictions[0]))
print("correct label for image 0: " + str(Y_train[0]))
probabilities = (n_inputs, n_categories) = (1437, 10) probability that image 0 is in category 0,1,2,...,9 = [2.23785373e-07 1.47533958e-01 7.28910767e-04 3.32202888e-05 4.42269923e-05 1.06343900e-04 7.66939998e-03 8.14604377e-01 4.64970935e-07 2.92788746e-02] probabilities sum up to: 1.0 predictions = (n_inputs) = (1437,) prediction for image 0: 7 correct label for image 0: 6
We use accuracy_score from sklearn to measure what percentage of the data is correctly predicted.
from sklearn.metrics import accuracy_score
(probabilities,predictions) = predict(X_train, all_weights)
print("Old accuracy on training data:", accuracy_score(predictions, Y_train))
Old accuracy on training data: 0.04314544189283229
Next we implement the gradients, which are used for back propagation in the function backpropagation. The gradients are somewhat different from those derived above because the cost function is now the cross entropy. Let's first use the cost function $C$ without the regularization $\lambda$.
The gradients are: \begin{eqnarray} \frac{\partial C}{\partial w_{jc}^{(2)}}=-\sum_i \left(\frac{y_{ic}}{P_{ic}}-\frac{1-y_{ic}}{1-P_{ic}}\right) \frac{\partial P_{ic}}{\partial w_{jc}^{(2)}}= -\sum_i \frac{y_{ic}-P_{ic}}{P_{ic}(1-P_{ic})} \frac{\partial P_{ic}}{\partial w_{jc}^{(2)}} \end{eqnarray} Next $$\frac{\partial P_{ic}}{\partial w_{jc}^{(2)}}=\frac{\partial P_{ic}}{\partial z_{ic}^{(2)}}\frac{\partial z_{ic}^{(2)}}{\partial w_{jc}^{(2)}}$$
Since $P_{ic}=f^{(2)}(z_{ic}^{(2)})$ and $z_{ic}^{(2)} = \sum_{j\in hidden} a_{i j}^{(1)} w_{j c}^{(2)} + b_{c}^{(2)}$ we have $$\frac{\partial P_{ic}}{\partial w_{jc}^{(2)}}=P_{ic}(1-P_{ic}) a_{ij}^{(1)}$$ which finally gives
\begin{eqnarray} \frac{\partial C}{\partial w_{jc}^{(2)}}= \sum_i (P_{ic}-y_{ic}) a_{ij}^{(1)} = {a^{(1)}}^T (a^{(2)}-Y) \end{eqnarray}where we took into account that $a_{ic}^{(2)}=P_{ic}$ and $Y_{ic}=y_{ic}$. Similarly we can see that \begin{eqnarray} \frac{\partial C}{\partial b_{c}^{(2)}}= \sum_i (P_{ic}-y_{ic}) \end{eqnarray} Next we evaluate the derivative in the hidden layer, i.e., \begin{eqnarray} \frac{\partial C}{\partial w_{ph}^{(1)}}=\sum_i \frac{\partial C}{\partial P_{ic}} \frac{\partial P_{ic}}{\partial z_{ic}^{(2)}}\frac{\partial z_{ic}^{(2)}}{\partial a_{ih}^{(1)}} \frac{\partial a_{ih}^{(1)}}{\partial z_{ih}^{(1)}} \frac{\partial z_{ih}^{(1)}}{\partial w_{ph}^{(1)}} \end{eqnarray} which comes from the fact that $P_{ic}=f^{(2)}(z_{ic}^{(2)})$, $z_{ic}^{(2)}=\sum_h a_{ih}^{(1)} w_{hc}^{(2)}+b_c^{(2)}$ and $a_{ih}^{(1)}=f^{(1)}(z_{ih}^{(1)})$ and $z_{ih}^{(1)}=\sum_p X_{ip} w_{ph}^{(1)} +b_h$. We see that $\frac{\partial C}{\partial P_{ic}}=(P_{ic}-y_{ic})/(P_{ic}(1-P_{ic}))$, further $\frac{\partial P_{ic}}{\partial z_{ic}^{(2)}}=P_{ic}(1-P_{ic})$, $\frac{\partial z_{ic}^{(2)}}{\partial a_{ih}^{(1)}}=w^{(2)}_{hc}$, $\frac{\partial a_{ih}^{(1)}}{\partial z_{ih}^{(1)}}=a_{ih}^{(1)}(1-a_{ih}^{(1)})$, $\frac{\partial z_{ih}^{(1)}}{\partial w_{ph}^{(1)}}=X_{ip}$.
Taking all this into account, we get \begin{eqnarray} \frac{\partial C}{\partial w_{ph}^{(1)}}= \sum_i X_{ip} a_{ih}^{(1)}(1-a_{ih}^{(1)})\sum_c (P_{ic}-y_{ic}) w_{hc}^{(2)} \end{eqnarray} which can also be written as \begin{equation} \frac{\partial C}{\partial w_{ph}^{(1)}}= (X^T ( a^{(1)}\circ (1-a^{(1)})\circ (a^{(2)}-Y) ({w^{(2)}})^T ))_{ph} \end{equation} where we introduced the elementwise product $\circ$, defined as $c=a\circ b$ with $c_{ih}=a_{ih} b_{ih}$. Similarly \begin{eqnarray} \frac{\partial C}{\partial b_{h}^{(1)}}= \sum_{i} a_{ih}^{(1)}(1-a_{ih}^{(1)})\sum_c (P_{ic}-y_{ic}) w_{hc}^{(2)} \end{eqnarray}
Finally, when $\lambda$ is nonzero, we will just add to derivatives \begin{eqnarray} \frac{\partial C}{\partial w_{ph}^{(1)} } += \lambda w_{ph}^{(1)}\\ \frac{\partial C}{\partial w_{jc}^{(2)}} += \lambda w_{jc}^{(2)} \end{eqnarray}
def backpropagation(X, Y, all_weights):
a_1, probabilities = feed_forward(X, all_weights)
W_1, b_1, W_2, b_2 = all_weights
# error in the output layer
error_output = probabilities - Y
# error in the hidden layer
error_hidden = matmul(error_output, W_2.T) * a_1 * (1 - a_1)
# gradients for the output layer
dW2 = matmul(a_1.T, error_output)
dB2 = sum(error_output, axis=0)
# gradient for the hidden layer
dW1 = matmul(X.T, error_hidden)
dB1 = sum(error_hidden, axis=0)
return dW2, dB2, dW1, dB1
dW2, dB2, dW1, dB1 = backpropagation(X_train, Y_train_onehot, all_weights)
print('shapes of gradients=', dW2.shape, dB2.shape, dW1.shape, dB1.shape)
shapes of gradients= (50, 10) (10,) (64, 50) (50,)
First we use the simple gradient descent method with a fixed learning rate $\gamma$ (eta). We evaluate the gradient num_iterations times and move towards a local minimum.
def SimpleGradientMethod(X_train, Y_train, all_weights, eta, lmbd, num_iterations):
(W_1,b_1,W_2,b_2) = all_weights
for i in range(num_iterations):
# calculate gradients
dW_2, dB_2, dW_1, dB_1 = backpropagation(X_train, Y_train, [W_1,b_1,W_2,b_2])
# regularization term gradients
dW_2 += lmbd * W_2
dW_1 += lmbd * W_1
# update weights and biases
W_1 -= eta * dW_1
b_1 -= eta * dB_1
W_2 -= eta * dW_2
b_2 -= eta * dB_2
return (W_1,b_1,W_2,b_2)
We also add the regularization $\frac{\lambda}{2}||w||_2^2$ to the cost function. The accuracy after 100 steps is barely improved.
eta = 0.01
lmbd = 0.01
num_iterations=100
all_weights = GiveStartingRandomWeights()
all_weights = SimpleGradientMethod(X_train, Y_train_onehot, all_weights, eta, lmbd, num_iterations)
error=accuracy_score(predict(X_train,all_weights)[1],Y_train)
print('Accuracy on training data: ', error)
/var/folders/j8/d9m3r0zx7j37l3ktfl_n1xw00000gn/T/ipykernel_20664/1438300027.py:2: RuntimeWarning: overflow encountered in exp return 1/(1 + exp(-x))
Accuracy on training data: 0.10438413361169102
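The RuntimeWarning above comes from exp(-x) overflowing for very negative arguments in mfermi; for the sigmoid this is harmless (1/(1+inf) evaluates to 0, the correct limit), but when the softmax overflows as well, the division produces invalid values. If one wants to avoid the warnings, numerically safer variants are a standard remedy; a sketch, not used in the runs below:
from numpy import *
def mfermi_stable(x):
    e = exp(-abs(x))                 # the exponent is never positive, so exp cannot overflow
    return where(x >= 0, 1/(1 + e), e/(1 + e))
def softmax_stable(z):
    e = exp(z - z.max(axis=1, keepdims=True))   # subtracting the row maximum leaves the result unchanged
    return e/e.sum(axis=1, keepdims=True)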
Next we implement stochastic gradient descent (SGD), which takes a random subset of the data (of size batch_size) and computes the gradient only for this subset of points. We then move in the steepest-descent direction of this subset only. The randomness introduced this way decreases the chance that our optimization scheme gets stuck in a local minimum. If the size $M$ of the minibatches is small relative to the number of datapoints ($M < n$), the computation of the gradient is much cheaper, since we sum over the datapoints in the $k$-th minibatch and not over all $n$ datapoints.
def StochasticGradientMethod(X_train, Y_train, all_weights, eta, lmbd, batch_size, epochs):
(W_1,b_1,W_2,b_2) = all_weights
data_indices = arange(len(X_train))
iterations = len(X_train) // batch_size
print('Number of iterations=', iterations)
for i in range(epochs):
for j in range(iterations):
chosen_datapoints = random.choice(data_indices, size=batch_size, replace=False)
# minibatch training data
X_batch = X_train[chosen_datapoints]
Y_batch = Y_train[chosen_datapoints]
dW_2, dB_2, dW_1, dB_1 = backpropagation(X_batch, Y_batch, [W_1,b_1,W_2,b_2])
# regularization term gradients
dW_2 += lmbd * W_2
dW_1 += lmbd * W_1
# update weights and biases
W_1 -= eta * dW_1
b_1 -= eta * dB_1
W_2 -= eta * dW_2
b_2 -= eta * dB_2
return (W_1,b_1,W_2,b_2)
Finally we use the SGD method with learning rate $\gamma$ (eta) $=0.01$, $\lambda=0.1$, and batch_size=100. The number of iterations over the minibatches (epochs) is also chosen to be 100. This gives an excellent prediction accuracy of over 98% on the test data; a human typically reads digits with an accuracy of about 98%.
eta = 0.01
lmbd = 0.1
epochs = 100
batch_size = 100
all_weights = GiveStartingRandomWeights()
all_weights = StochasticGradientMethod(X_train, Y_train_onehot, all_weights, eta, lmbd, batch_size, epochs)
error=accuracy_score(predict(X_train,all_weights)[1],Y_train)
error2=accuracy_score(predict(X_test,all_weights)[1],Y_test)
print('Accuracy on training data: ', error, error2)
Number of iterations= 14 Accuracy on training data: 0.9937369519832986 0.9805555555555555
We now perform a grid search to find the optimal hyperparameters for the network.
Note that we are using only one hidden layer with 50 neurons, while human performance is estimated to be around $98\%$ ($2\%$ error rate).
eta_vals = logspace(-5, 1, 7)
lmbd_vals = logspace(-5, 1, 7)
# store the training accuracy for each (eta, lambda) pair for later use
DNN_numpy = zeros((len(eta_vals), len(lmbd_vals)), dtype=object)
# grid search
for i, eta in enumerate(eta_vals):
for j, lmbd in enumerate(lmbd_vals):
all_weights = GiveStartingRandomWeights()
all_weights = StochasticGradientMethod(X_train, Y_train_onehot, all_weights, eta, lmbd, batch_size, epochs)
error=accuracy_score(predict(X_train,all_weights)[1],Y_train)
error2=accuracy_score(predict(X_test,all_weights)[1],Y_test)
DNN_numpy[i][j] = error
print('Learning rate=', eta, 'Lambda=', lmbd, 'Accuracy=', error, error2)
Number of iterations= 14 Learning rate= 1e-05 Lambda= 1e-05 Accuracy= 0.13569937369519833 0.18055555555555555 Number of iterations= 14 Learning rate= 1e-05 Lambda= 0.0001 Accuracy= 0.13569937369519833 0.18055555555555555 Number of iterations= 14 Learning rate= 1e-05 Lambda= 0.001 Accuracy= 0.13569937369519833 0.18055555555555555 Number of iterations= 14 Learning rate= 1e-05 Lambda= 0.01 Accuracy= 0.13569937369519833 0.18055555555555555 Number of iterations= 14 Learning rate= 1e-05 Lambda= 0.1 Accuracy= 0.13569937369519833 0.18055555555555555 Number of iterations= 14 Learning rate= 1e-05 Lambda= 1.0 Accuracy= 0.13569937369519833 0.18055555555555555 Number of iterations= 14 Learning rate= 1e-05 Lambda= 10.0 Accuracy= 0.13848295059151008 0.18055555555555555 Number of iterations= 14 Learning rate= 0.0001 Lambda= 1e-05 Accuracy= 0.6089074460681977 0.5833333333333334 Number of iterations= 14 Learning rate= 0.0001 Lambda= 0.0001 Accuracy= 0.6089074460681977 0.5833333333333334 Number of iterations= 14 Learning rate= 0.0001 Lambda= 0.001 Accuracy= 0.6089074460681977 0.5833333333333334 Number of iterations= 14 Learning rate= 0.0001 Lambda= 0.01 Accuracy= 0.6089074460681977 0.5805555555555556 Number of iterations= 14 Learning rate= 0.0001 Lambda= 0.1 Accuracy= 0.6116910229645094 0.5805555555555556 Number of iterations= 14 Learning rate= 0.0001 Lambda= 1.0 Accuracy= 0.6450939457202505 0.6083333333333333 Number of iterations= 14 Learning rate= 0.0001 Lambda= 10.0 Accuracy= 0.8545581071677105 0.8138888888888889 Number of iterations= 14 Learning rate= 0.001 Lambda= 1e-05 Accuracy= 0.9617258176757133 0.8916666666666667 Number of iterations= 14 Learning rate= 0.001 Lambda= 0.0001 Accuracy= 0.9617258176757133 0.8916666666666667 Number of iterations= 14 Learning rate= 0.001 Lambda= 0.001 Accuracy= 0.9617258176757133 0.8916666666666667 Number of iterations= 14 Learning rate= 0.001 Lambda= 0.01 Accuracy= 0.9617258176757133 0.8944444444444445 Number of iterations= 14 Learning rate= 0.001 Lambda= 0.1 Accuracy= 0.9624217118997912 0.9055555555555556 Number of iterations= 14 Learning rate= 0.001 Lambda= 1.0 Accuracy= 0.9826026443980515 0.95 Number of iterations= 14 Learning rate= 0.001 Lambda= 10.0 Accuracy= 0.942936673625609 0.9305555555555556 Number of iterations= 14 Learning rate= 0.01 Lambda= 1e-05 Accuracy= 0.9965205288796103 0.9361111111111111 Number of iterations= 14 Learning rate= 0.01 Lambda= 0.0001 Accuracy= 0.9972164231036882 0.9527777777777777 Number of iterations= 14 Learning rate= 0.01 Lambda= 0.001 Accuracy= 0.9979123173277662 0.9555555555555556 Number of iterations= 14 Learning rate= 0.01 Lambda= 0.01 Accuracy= 0.9979123173277662 0.9472222222222222 Number of iterations= 14 Learning rate= 0.01 Lambda= 0.1 Accuracy= 0.9937369519832986 0.9805555555555555 Number of iterations= 14 Learning rate= 0.01 Lambda= 1.0 Accuracy= 0.8176757132915797 0.7805555555555556 Number of iterations= 14 Learning rate= 0.01 Lambda= 10.0 Accuracy= 0.20668058455114824 0.18333333333333332 Number of iterations= 14
/var/folders/j8/d9m3r0zx7j37l3ktfl_n1xw00000gn/T/ipykernel_20664/1438300027.py:2: RuntimeWarning: overflow encountered in exp return 1/(1 + exp(-x))
(The same overflow warning repeats for every run with a large learning rate; for $\eta\geq 1$ the softmax exp_term = exp(z_2) also overflows and the division produces invalid values.)
Learning rate= 0.1 Lambda= 1e-05 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 0.1 Lambda= 0.0001 Accuracy= 0.10368823938761308 0.08888888888888889 Number of iterations= 14
Learning rate= 0.1 Lambda= 0.001 Accuracy= 0.0953375086986778 0.125 Number of iterations= 14
Learning rate= 0.1 Lambda= 0.01 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 0.1 Lambda= 0.1 Accuracy= 0.09394572025052192 0.11666666666666667 Number of iterations= 14 Learning rate= 0.1 Lambda= 1.0 Accuracy= 0.10160055671537926 0.09166666666666666 Number of iterations= 14
Learning rate= 0.1 Lambda= 10.0 Accuracy= 0.10160055671537926 0.09166666666666666 Number of iterations= 14
Learning rate= 1.0 Lambda= 1e-05 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 1.0 Lambda= 0.0001 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 1.0 Lambda= 0.001 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 1.0 Lambda= 0.01 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 1.0 Lambda= 0.1 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 1.0 Lambda= 1.0 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 1.0 Lambda= 10.0 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 1e-05 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 0.0001 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 0.001 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 0.01 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 0.1 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 1.0 Accuracy= 0.10438413361169102 0.07777777777777778 Number of iterations= 14
Learning rate= 10.0 Lambda= 10.0 Accuracy= 0.10438413361169102 0.07777777777777778
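Since DNN_numpy stores the training accuracy for each $(\eta,\lambda)$ pair, a convenient way to inspect the grid search is a heatmap; a minimal sketch with matplotlib, assuming the arrays eta_vals, lmbd_vals and DNN_numpy from the cells above:
# visualize the grid search: training accuracy as a function of eta and lambda
acc = DNN_numpy.astype(float)
fig, ax = plt.subplots(figsize=(8,6))
im = ax.imshow(acc, origin='lower', cmap='viridis')
ax.set_xticks(range(len(lmbd_vals))); ax.set_xticklabels(['%g' % l for l in lmbd_vals])
ax.set_yticks(range(len(eta_vals)));  ax.set_yticklabels(['%g' % e for e in eta_vals])
ax.set_xlabel(r'$\lambda$'); ax.set_ylabel(r'$\eta$ (learning rate)')
fig.colorbar(im, label='training accuracy')
plt.show()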