MNIST – Обычная нейронная сеть – Почему увеличивается функция стоимости?

Question 1

Я уже неделю изучаю этот код, пытаясь понять, почему моя функция стоимости увеличивается, как на следующем изображении. Уменьшение скорости обучения немного помогает, но очень незначительно. Может кто-нибудь увидеть, почему функция стоимости не работает, как ожидалось?

Я понимаю, что предпочтительнее была бы CNN, но я все равно хочу понять, почему эта простая сеть не работает. Пожалуйста, помогите:)

import numpy as np
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

import matplotlib.pyplot as plt

mnist = input_data.read_data_sets("MNIST_DATA/",one_hot=True)

def createPlaceholders():
    xph = tf.placeholder(tf.float32, (784, None))
    yph = tf.placeholder(tf.float32, (10, None))
    return xph, yph

def init_param(layers_dim):
    weights = {}
    L = len(layers_dim)

    for l in range(1,L):
        weights['W' + str(l)] = tf.get_variable('W' + str(l), shape=(layers_dim[l],layers_dim[l-1]), initializer= tf.contrib.layers.xavier_initializer())
        weights['b' + str(l)] = tf.get_variable('b' + str(l), shape=(layers_dim[l],1), initializer= tf.zeros_initializer())

    return weights

def forward_prop(X,L,weights):
    parameters = {}
    parameters['A0'] = tf.cast(X,tf.float32)

    for l in range(1,L-1):
        parameters['Z' + str(l)] = tf.add(tf.matmul(weights['W' + str(l)], parameters['A' + str(l-1)]), weights['b' + str(l)])
        parameters['A' + str(l)] = tf.nn.relu(parameters['Z' + str(l)])

    parameters['Z' + str(L-1)] = tf.add(tf.matmul(weights['W' + str(L-1)], parameters['A' + str(L-2)]), weights['b' + str(L-1)])
    return parameters['Z' + str(L-1)]

def compute_cost(ZL,Y):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf.cast(Y,tf.float32), logits = ZL))
    return cost

def randomMiniBatches(X,Y,minibatch_size):
    m = X.shape[1]
    shuffle = np.random.permutation(m)
    temp_X = X[:,shuffle]
    temp_Y = Y[:,shuffle]

    num_complete_minibatches = int(np.floor(m/minibatch_size))

    mini_batches = []

    for batch in range(num_complete_minibatches):
        mini_batches.append((temp_X[:,batch*minibatch_size: (batch+1)*minibatch_size], temp_Y[:,batch*minibatch_size: (batch+1)*minibatch_size]))

    mini_batches.append((temp_X[:,num_complete_minibatches*minibatch_size:], temp_Y[:,num_complete_minibatches*minibatch_size:]))

    return mini_batches

def model(X, Y, layers_dim, learning_rate = 0.001, num_epochs = 20, minibatch_size = 64):
    tf.reset_default_graph()
    costs = []

    xph, yph = createPlaceholders()
    weights = init_param(layers_dim)
    ZL = forward_prop(xph, len(layers_dim), weights)
    cost = compute_cost(ZL,yph)
    optimiser = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(num_epochs):
            minibatches = randomMiniBatches(X,Y,minibatch_size)
            epoch_cost = 0

            for b, mini in enumerate(minibatches,1):
                mini_x, mini_y = mini
                _,c = sess.run([optimiser,cost],feed_dict={xph:mini_x,yph:mini_y})
                epoch_cost += c
            print('epoch: ',epoch+1,"https://datascience.stackexchange.com/",num_epochs)

            epoch_cost /= len(minibatches)
            costs.append(epoch_cost)

    plt.plot(costs) 
    print(costs)

X_train = mnist.train.images.T
n_x = X_train.shape[0]
Y_train = mnist.train.labels.T
n_y = Y_train.shape[0]
layers_dim = [n_x,10,n_y]

model(X_train, Y_train, layers_dim)

Question 2

Функция softmax в Tensorflow работает только если количество пакетов находится в строках, а вывод в столбцах. Если они переставлены, то вам нужно транспонировать тензоры в функции стоимости.

Question 3

Можете попробовать то же самое после нормализации данных, иногда большинство из нас в начале не уделяет этому специфическому аспекту достаточно внимания. Одна из причин, по которой это может происходить, заключается в том, что на обучающем наборе вы могли нормализовать, но подпишите сырые данные в качестве меток при подгонке к сети.

MNIST – Обычная нейронная сеть – Почему увеличивается функция стоимости?

Вопрос или проблема

Ответ или решение