我不确定这是否是这里要问的那种问题,因为它不是关于人工智能的一般问题,而是关于它的编码的更多问题,但是我认为它不适合堆栈溢出。
我一直在用 C++ 编写多层感知器,它似乎正在使用 sigmoid 函数,但是当我将激活函数更改为 ReLU 时,它不会收敛并且每个训练示例的平均成本为 1。这是因为网络的所有输出神经元都输出 0。
使用 sigmoid 函数,它收敛得很好,我做了一些测试,在大约 1000 代之后,它在 MNIST 数据集中的前 1000 个项目上的平均成本为 0.1。
我将向您展示我首先为激活函数更改的代码,然后我会将整个代码块放入其中。
任何帮助将不胜感激!
乙状结肠:
inline float activation(float num)
{
return 1 / (1 + std::exp(-num));
}
inline float activation_derivative(float num)
{
return activation(num) * (1 - activation(num));
}
ReLU:
inline float activation(float num)
{
return std::max(num, 0.0f);
}
inline float activation_derivative(float num)
{
return num > 0 ? 1.0f : 0.0f;
}
这是整个代码块(我折叠了用于基准测试的代码区域和用于创建数据集的区域):
#include <iostream>
#include <fstream>
#include <vector>
#include <random>
#include <chrono>
#include <cmath>
#include <string>
#include <algorithm>
#pragma region benchmarking
#pragma endregion
class Network
{
public:
float cost = 0.0f;
std::vector<std::vector<std::vector<float>>> weights;
std::vector<std::vector<std::vector<float>>> deriv_weights;
std::vector<std::vector<float>> biases;
std::vector<std::vector<float>> deriv_biases;
std::vector<std::vector<float>> activations;
std::vector<std::vector<float>> deriv_activations;
void clear_deriv_activations()
{
for (unsigned int i = 0; i < deriv_activations.size(); ++i)
{
std::fill(deriv_activations[i].begin(), deriv_activations[i].end(), 0.0f);
}
}
int get_memory_usage()
{
int memory = 4;
memory += get_vector_memory_usage(weights);
memory += get_vector_memory_usage(deriv_weights);
memory += get_vector_memory_usage(biases);
memory += get_vector_memory_usage(deriv_biases);
memory += get_vector_memory_usage(activations);
memory += get_vector_memory_usage(deriv_activations);
return memory;
}
};
struct DataSet
{
std::vector<std::vector<float>> training_inputs;
std::vector<std::vector<float>> training_answers;
std::vector<std::vector<float>> testing_inputs;
std::vector<std::vector<float>> testing_answers;
};
Network create_network(std::vector<int> layers)
{
Network network;
int layer_count = layers.size() - 1;
network.weights.reserve(layer_count);
network.deriv_weights.reserve(layer_count);
network.biases.reserve(layer_count);
network.deriv_biases.reserve(layer_count);
network.activations.reserve(layer_count);
network.deriv_activations.reserve(layer_count);
int nodes_in_prev_layer = layers[0];
for (unsigned int i = 0; i < layers.size() - 1; ++i)
{
int nodes_in_layer = layers[i + 1];
network.weights.emplace_back();
network.weights[i].reserve(nodes_in_layer);
network.deriv_weights.emplace_back();
network.deriv_weights[i].reserve(nodes_in_layer);
network.biases.emplace_back();
network.biases[i].reserve(nodes_in_layer);
network.deriv_biases.emplace_back(nodes_in_layer, 0.0f);
network.activations.emplace_back(nodes_in_layer, 0.0f);
network.deriv_activations.emplace_back(nodes_in_layer, 0.0f);
for (int j = 0; j < nodes_in_layer; ++j)
{
network.weights[i].emplace_back();
network.weights[i][j].reserve(nodes_in_prev_layer);
network.deriv_weights[i].emplace_back(nodes_in_prev_layer, 0.0f);
for (int k = 0; k < nodes_in_prev_layer; ++k)
{
float input_weight = (2 * (float(std::rand()) / RAND_MAX)) - 1;
network.weights[i][j].push_back(input_weight);
}
float input_bias = (2 * (float(std::rand()) / RAND_MAX)) - 1;
network.biases[i].push_back(input_bias);
}
nodes_in_prev_layer = nodes_in_layer;
}
return network;
}
void judge_network(Network &network, const std::vector<float>& correct_answers)
{
int final_layer_index = network.activations.size() - 1;
for (unsigned int i = 0; i < network.activations[final_layer_index].size(); ++i)
{
float val_sq = (network.activations[final_layer_index][i] - correct_answers[i]);
network.cost += val_sq * val_sq;
}
}
inline float activation(float num)
{
return std::max(num, 0.0f);
}
void forward_propogate(Network& network, const std::vector<float>& input)
{
const std::vector<float>* last_layer_activations = &input;
int last_layer_node_count = input.size();
for (unsigned int i = 0; i < network.weights.size(); ++i)
{
for (unsigned int j = 0; j < network.weights[i].size(); ++j)
{
float total = network.biases[i][j];
for (int k = 0; k < last_layer_node_count; ++k)
{
total += (*last_layer_activations)[k] * network.weights[i][j][k];
}
network.activations[i][j] = activation(total);
}
last_layer_activations = &network.activations[i];
last_layer_node_count = network.weights[i].size();
}
}
void final_layer_deriv_activations(Network& network, const std::vector<float>& correct_answers)
{
int final_layer_index = network.activations.size() - 1;
int final_layer_node_count = network.activations[final_layer_index].size();
for (int i = 0; i < final_layer_node_count; ++i)
{
float deriv = network.activations[final_layer_index][i] - correct_answers[i];
network.deriv_activations[final_layer_index][i] = deriv * 2;
}
}
inline float activation_derivative(float num)
{
return num > 0 ? 1.0f : 0.0f;
}
void back_propogate_layer(Network& network, int layer)
{
int nodes_in_layer = network.activations[layer].size();
int nodes_in_prev_layer = network.activations[layer - 1].size();
for (int i = 0; i < nodes_in_layer; ++i)
{
float total = network.biases[layer][i];
for (int j = 0; j < nodes_in_prev_layer; ++j)
{
total += network.weights[layer][i][j] * network.activations[layer - 1][j];
}
float dzda = activation_derivative(total);
float dzdc = dzda * network.deriv_activations[layer][i];
for (int j = 0; j < nodes_in_prev_layer; ++j)
{
network.deriv_weights[layer][i][j] += network.activations[layer - 1][j] * dzdc;
network.deriv_activations[layer - 1][j] += network.weights[layer][i][j] * dzdc;
}
network.deriv_biases[layer][i] += dzdc;
}
}
void back_propogate_first_layer(Network& network, std::vector<float> inputs)
{
int nodes_in_layer = network.activations[0].size();
int input_count = inputs.size();
for (int i = 0; i < nodes_in_layer; ++i)
{
float total = network.biases[0][i];
for (int j = 0; j < input_count; ++j)
{
total += network.weights[0][i][j] * inputs[j];
}
float dzda = activation_derivative(total);
float dzdc = dzda * network.deriv_activations[0][i];
for (int j = 0; j < input_count; ++j)
{
network.deriv_weights[0][i][j] += inputs[j] * dzdc;
}
network.deriv_biases[0][i] += dzdc;
}
}
void back_propogate(Network& network, const std::vector<float>& inputs, const std::vector<float>& correct_answers)
{
network.clear_deriv_activations();
final_layer_deriv_activations(network, correct_answers);
for (int i = network.activations.size() - 1; i > 0; --i)
{
back_propogate_layer(network, i);
}
back_propogate_first_layer(network, inputs);
}
void apply_derivatives(Network& network, int training_example_count)
{
for (unsigned int i = 0; i < network.weights.size(); ++i)
{
for (unsigned int j = 0; j < network.weights[i].size(); ++j)
{
for (unsigned int k = 0; k < network.weights[i][j].size(); ++k)
{
network.weights[i][j][k] -= network.deriv_weights[i][j][k] / training_example_count;
network.deriv_weights[i][j][k] = 0;
}
network.biases[i][j] -= network.deriv_biases[i][j] / training_example_count;
network.deriv_biases[i][j] = 0;
network.deriv_activations[i][j] = 0;
}
}
}
void training_iteration(Network& network, const DataSet& data)
{
int training_example_count = data.training_inputs.size();
for (int i = 0; i < training_example_count; ++i)
{
forward_propogate(network, data.training_inputs[i]);
judge_network(network, data.training_answers[i]);
back_propogate(network, data.training_inputs[i], data.training_answers[i]);
}
apply_derivatives(network, training_example_count);
}
void train_network(Network& network, const DataSet& dataset, int training_iterations)
{
for (int i = 0; i < training_iterations; ++i)
{
training_iteration(network, dataset);
std::cout << "Generation " << i << ": " << network.cost << std::endl;
network.cost = 0.0f;
}
}
#pragma region dataset creation
#pragma endregion
int main()
{
Timer timer;
DataSet dataset = create_dataset_from_file("data.txt");
Network network = create_network({784, 128, 10});
train_network(network, dataset, 1000);
std::cout << timer.get_duration() << std::endl;
std::cin.get();
}
```