I have a vectorized implementation of a neural network in C++ (CUDA). It already solves the Fashion-MNIST and CIFAR classification problems.
Now I am adapting the code for linear regression and I am stuck on one point: here I have to use the MSE loss instead of plain squared error.
My questions are: 1) For a linear regression task, is there any difference between MSE and squared error other than the averaging, i.e. dividing by the mini-batch size (that is my understanding)? 2) My C++ implementation of this network is given below; to implement MSE, do I just modify my loss-function line and divide it by the batch size?
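To make question 1 concrete, here is my understanding on a tiny made-up example (plain C++ sketch, separate from the network code below; the numbers are invented):

```
#include <cstdio>

int main() {
    // Hypothetical mini-batch of 4 predictions and targets (made-up values).
    double pred[4]   = {1.0, 2.0, 3.0, 4.0};
    double target[4] = {1.5, 1.5, 3.5, 3.0};
    const int m = 4;  // mini-batch size

    double sum_sq = 0.0;
    for (int i = 0; i < m; ++i) {
        double e = pred[i] - target[i];
        sum_sq += e * e;          // squared error, summed over the batch
    }
    double mse = sum_sq / m;      // MSE = the same sum divided by the batch size

    printf("sum of squared errors = %f\n", sum_sq);  // 1.75
    printf("MSE                   = %f\n", mse);     // 0.4375
    return 0;
}
```

If that is right, the two losses differ only by the constant factor 1/m, which would also scale the gradient by 1/m.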
```
// Element-wise error: Z = X - Y (this is the "loss" line I am asking about).
__global__ void loss(double* X, double* Y, double* Z, size_t n) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        Z[index] = X[index] - Y[index];
    }
}
// Forward pass: three linear layers, tanh on the hidden layers, linear output.
void forward_prop() {
    L_F(b1, b_x, w1, a1, BATCH_SIZE, layer_0_nodes, layer_1_nodes);
    tan_h(a1, BATCH_SIZE, layer_1_nodes);
    L_F(b2, a1, w2, a2, BATCH_SIZE, layer_1_nodes, layer_2_nodes);
    tan_h(a2, BATCH_SIZE, layer_2_nodes);
    L_F(b3, a2, w3, a3, BATCH_SIZE, layer_2_nodes, layer_3_nodes);   // no activation on the output layer
}
// Backward pass: error at the output, then weight and activation gradients layer by layer.
void backward_prop() {
    cuda_loss(a3, b_y, loss_m, BATCH_SIZE, layer_3_nodes);              // loss_m = a3 - b_y
    L_B(a2, loss_m, dw3, layer_2_nodes, BATCH_SIZE, layer_3_nodes, true);
    tan_h_B(a2, BATCH_SIZE, layer_2_nodes);
    L_B(loss_m, w3, dz2, BATCH_SIZE, layer_3_nodes, layer_2_nodes, false);
    cuda_simple_dot_ab(dz2, a2, BATCH_SIZE, layer_2_nodes);             // element-wise multiply with tanh'
    L_B(a1, dz2, dw2, layer_1_nodes, BATCH_SIZE, layer_2_nodes, true);
    tan_h_B(a1, BATCH_SIZE, layer_1_nodes);
    L_B(dz2, w2, dz1, BATCH_SIZE, layer_2_nodes, layer_1_nodes, false);
    cuda_simple_dot_ab(dz1, a1, BATCH_SIZE, layer_1_nodes);
    L_B(b_x, dz1, dw1, layer_0_nodes, BATCH_SIZE, layer_1_nodes, true);
}
// Z = W * A + b, with W of size (W_y_dim x W_x_dim) and A of size (W_x_dim x A_x_dim).
__global__ void linearLayerForward(double* b, double* W, double* A, double* Z,
                                   size_t W_x_dim, size_t W_y_dim, size_t A_x_dim) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
    size_t Z_x_dim = A_x_dim;
    size_t Z_y_dim = W_y_dim;
    double Z_value = 0.0;
    if (row < Z_y_dim && col < Z_x_dim) {
        for (size_t i = 0; i < W_x_dim; i++) {
            Z_value += W[row * W_x_dim + i] * A[i * A_x_dim + col];
        }
        Z[row * Z_x_dim + col] = Z_value + b[col];
    }
}
// dA = W^T * dZ (W is read as if transposed).
__global__ void linearLayerBackprop(double* W, double* dZ, double* dA,
                                    size_t W_x_dim, size_t W_y_dim,
                                    size_t dZ_x_dim) {
    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
    size_t dA_x_dim = dZ_x_dim;
    size_t dA_y_dim = W_x_dim;
    double dA_value = 0.0;
    if (row < dA_y_dim && col < dA_x_dim) {
        for (size_t i = 0; i < W_y_dim; i++) {
            dA_value += W[i * W_x_dim + row] * dZ[i * dZ_x_dim + col];
        }
        dA[row * dA_x_dim + col] = dA_value;
    }
}
// In-place tanh activation.
__global__ void tanhActivationForward(double* Z, size_t n) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        Z[index] = tanh(Z[index]);
    }
}
// In-place derivative of tanh: 1 - tanh(z)^2.
__global__ void tanhActivationBackward(double* Z, size_t n) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        double t = tanh(Z[index]);
        Z[index] = 1.0 - t * t;
    }
}
```
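Concretely, the change I have in mind for question 2 is something like this (only a sketch of my proposed modification, not code I have tested; the kernel name `loss_mse` and the extra parameter `m` are hypothetical, with `m` meant to be BATCH_SIZE):

```
// Same element-wise difference as my loss kernel above, but scaled by 1/m
// so that the backpropagated error corresponds to MSE instead of squared error.
__global__ void loss_mse(double* X, double* Y, double* Z, size_t n, double m) {
    size_t index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        Z[index] = (X[index] - Y[index]) / m;
    }
}
```

Everything else in backward_prop() would stay exactly as above. Is that all that is needed?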