0

Here is my project. It consists of: m = 24 where m is the number of training examples; 3 hidden layers and the input layer; 3 sets of weights connecting each layer; the data is 1x38 with a response of y (1x1).

import numpy as np
x = np.array([
[1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0],
[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1],
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0],
[1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0],
[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]])

y = np.array([
    [1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,0]]).T

w = np.random.random((38, 39))
w2 = np.random.random((39, 39))
w3 = np.random.random((39, 1))

for j in xrange(100000):
    a2 = 1/(1 + np.exp(-(np.dot(x, w) + 1)))
    a3 = 1/(1 + np.exp(-(np.dot(a2, w2) + 1)))
    a4 = 1/(1 + np.exp(-(np.dot(a3, w3) + 1)))
    a4delta = (y - a4) * (1 * (1 - a4))
    a3delta = a4delta.dot(w3.T) * (1 * (1 - a3))
    a2delta = a3delta.dot(w2.T) * (1 * (1 - a2))
    w3 += a3.T.dot(a4delta)
    w2 += a2.T.dot(a3delta)
    w += x.T.dot(a2delta)
print(a4)

Here are the results:

[[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]]

Can anyone see if i have gone wrong? Does my network need to be changed? I have tried experimenting with the hyperparameters by adding more hidden layers and more memory

6
  • Use a library instead of coding every single thing
    – WiLL_K
    Commented Nov 22, 2017 at 14:57
  • 2
    What if I would LIKE to code every little thing? If I wanted to use a library I would have already @WiLL_K Commented Nov 22, 2017 at 14:58
  • Ok, then lets start with this, where are you calculating loss? . Where is your gradient-descent function? where is your function that predicts?
    – WiLL_K
    Commented Nov 22, 2017 at 15:00
  • Have a look at the deltas. I did error (y-a4) times the derivative of the respective hidden layer @WiLL_K Commented Nov 22, 2017 at 15:02
  • Could you expand on this please? I sort of understand, thank you very much @silgon Commented Nov 22, 2017 at 15:05

1 Answer 1

1

You have some mistakes and some things I think are mistakes, but maybe just a different implementation.

You are adding your gradients to your weights, when you should be subtracting your gradient multiplied by a step size. This is why your weights shoot up to 1.0 in only a single iteration.

These:

w3 += a3.T.dot(a4delta)

Should be something like this:

 w3 -= addBias(a3).T.dot(a4delta) * step

Also, I don't think you have the correct formulation for the partial derivative of the sigmoid function. I think these:

a3delta = a4delta.dot(w3.T) * (1 * (1 - a3)) 

Should be:

a3delta = a4delta.dot(w3.T) * (a3 * (1 - a3))

You should also initialize your weight around zero with something like:

ep = 0.12
w = np.random.random((39, 39)) * 2 * ep - ep

Most implementations add a bias node to each layer, you're not doing that. It complicates things a little, but I think it will make it converge faster.

For me, this converges on a confident answer in 200 iterations:

# Weights have different shapes to account for bias node
w = np.random.random((39, 39)) * 2 * ep - ep
w2 = np.random.random((40, 39))* 2 * ep - ep
w3 = np.random.random((40, 1)) * 2 * ep - ep

ep = 0.12
w = np.random.random((39, 39)) * 2 * ep - ep
w2 = np.random.random((40, 39))* 2 * ep - ep
w3 = np.random.random((40, 1)) * 2 * ep - ep

def addBias(mat):
    return np.hstack((np.ones((mat.shape[0], 1)), mat))

step = -.1
for j in range(200):
    # Forward prop
    a2 = 1/(1 + np.exp(- addBias(x).dot(w)))
    a3 = 1/(1 + np.exp(- addBias(a2).dot(w2)))
    a4 = 1/(1 + np.exp(- addBias(a3).dot(w3)))

    # Back prop
    a4delta = (y - a4) 
    # need to remove bias nodes here
    a3delta = a4delta.dot(w3[1:,:].T) * (a3 * (1 - a3))
    a2delta = a3delta.dot(w2[1:,:].T) * (a2 * (1 - a2))

    # Gradient Descent
    # Multiply gradient by step then subtract
    w3 -= addBias(a3).T.dot(a4delta) * step
    w2 -= addBias(a2).T.dot(a3delta) * step 
    w -= addBias(x).T.dot(a2delta) * step
print(np.rint(a4))

Not the answer you're looking for? Browse other questions tagged or ask your own question.