From b79ce5fffaafbb38cfca998db4a7b9204fdd05e2 Mon Sep 17 00:00:00 2001
From: Brendan Hansen
Date: Mon, 25 Jan 2021 20:23:50 -0600
Subject: [PATCH] generalized criteria

---
 docs/abstractions  |  23 +++++++
 project.4coder     |  32 ++++++++++
 src/mnist.onyx     |  13 ++--
 src/neuralnet.onyx | 151 +++++++++++++++++++++++++++++++++++----------
 4 files changed, 181 insertions(+), 38 deletions(-)
 create mode 100644 docs/abstractions
 create mode 100644 project.4coder

diff --git a/docs/abstractions b/docs/abstractions
new file mode 100644
index 0000000..626fd14
--- /dev/null
+++ b/docs/abstractions
@@ -0,0 +1,23 @@
+Abstractions still needed:
+    * Optimizer
+        Currently, only SGD is implemented. It should be easy to add different
+        optimizers for the networks. An open question: should the optimizer
+        store the gradients computed during back propagation, or should they be
+        stored on the layers? I'm leaning towards storing them on the layers.
+
+        Other optimizers:
+            - Adam
+            - AdaMax
+            - AdaGrad
+
+    * Criteria
+        - MSE (implemented)
+        - MAE
+        - BCE
+
+    * Data Loader
+        Each dataloader will be different, but a common API should be added so
+        there can be an automatic training system that just pulls data from
+        the dataloader as it is needed. The dataloader then has the freedom
+        to cache or preload the data.
+
diff --git a/project.4coder b/project.4coder
new file mode 100644
index 0000000..9306168
--- /dev/null
+++ b/project.4coder
@@ -0,0 +1,32 @@
+version(1);
+project_name = "Onyx";
+
+patterns = {
+"*.onyx",
+"*.bat",
+"*.sh",
+"*.4coder",
+};
+blacklist_patterns = {
+".*",
+};
+load_paths_custom = {
+ {"."},
+};
+load_paths = {
+ { load_paths_custom, .os = "win"  },
+ { load_paths_custom, .os = "linux"},
+ { load_paths_custom, .os = "mac"  },
+};
+
+build_win32 = "\\dev\\onyx\\onyx.exe -V src\\mnist.onyx -o mnist.wasm";
+build_linux = "/usr/bin/onyx -V src/mnist.onyx -o mnist.wasm";
+
+command_list = {
+ { .name = "Build",
+   .out = "*compilation*", .footer_panel = true, .save_dirty_files = true,
+   .cmd = { {build_win32, .os ="win"  },
+            {build_linux, .os ="linux"}, }, },
+};
+
+fkey_command[1] = "Build";
diff --git a/src/mnist.onyx b/src/mnist.onyx
index dbb7fe6..897347a 100644
--- a/src/mnist.onyx
+++ b/src/mnist.onyx
@@ -58,7 +58,7 @@ stocastic_gradient_descent :: (nn: ^NeuralNet, mnist_data: ^MNIST_Data, training
             for i: input.count do input[i] = (cast(f32) cast(u32) example[i]) / 255;
 
             neural_net_forward(nn, ~~ input);
-            neural_net_backward(nn, ~~ expected);
+            neural_net_backward(nn, ~~ expected, mean_squared_error);
 
             prediction := neural_net_get_prediction(nn);
             if prediction == label do past_100_correct += 1;
@@ -86,19 +86,18 @@ stocastic_gradient_descent :: (nn: ^NeuralNet, mnist_data: ^MNIST_Data, training
                 print_colored_array(cast([] f32) expected, label, color);
                 print_colored_array(output, prediction, color);
 
-                loss := neural_net_loss(nn, ~~ expected);
-                printf("MSE loss: %f Correct: %i / 100\n", cast(f32) loss, past_100_correct);
+                loss := neural_net_loss(nn, ~~ expected, mean_squared_error);
+                printf("Loss: %f Correct: %i / 100\n", cast(f32) loss, past_100_correct);
 
                 past_100_correct = 0;
 
                 if ex % 10000 == 0 {
                     println("Saving neural network...");
-                    neural_net_save(nn, "data/test_2.nn");
+                    neural_net_save(nn, "data/test_3.nn");
                 }
             }
         }
     }
-
 }
 
 main :: (args: [] cstr) {
@@ -106,8 +105,8 @@ main :: (args: [] cstr) {
     // main_allocator := context.allocator;
    // context.allocator = alloc.log.logging_allocator(^main_allocator);
 
-    nn := neural_net_load("data/test_2.nn");
-    // nn := make_neural_net(28 * 28, 512, 256, 100, 10);
+//    nn := neural_net_load("data/test_2.nn");
+    nn := make_neural_net(28 * 28, 512, 256, 100, 10);
     defer neural_net_free(^nn);
 
     random.set_seed(5234);
diff --git a/src/neuralnet.onyx b/src/neuralnet.onyx
index c427793..3c5ea46 100644
--- a/src/neuralnet.onyx
+++ b/src/neuralnet.onyx
@@ -42,7 +42,7 @@ neural_net_forward :: (use nn: ^NeuralNet, input: [] f32) {
     }
 }
 
-neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32) {
+neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Criterion) {
     assert(layers[layers.count - 1].neurons.count == expected_output.count,
             "Expected output does not have the same size as the last layer.");
 
@@ -59,27 +59,33 @@ neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32) {
         // kind of an ambiguous term here. It specifically means the partial derivative
         // of the the loss with respect to the weighted sum of the previous layers
         // neurons, plus a bias.
-        for j: layers[i].neurons.count {
-            sigmoid_value := layers[i].neurons[j];
-            d_sigmoid_value := layers[i].activation.backward(sigmoid_value, layers[i].pre_activation_neurons[j]);
-
-            // The last layer has its derivative computed special, since it needs to capture
-            // the derivative of the MSE function.
-            if i == layers.count - 1 {
-                layers[i].deltas[j] = 2 * (expected_output[j] - sigmoid_value) * d_sigmoid_value / ~~expected_output.count;
-
-            } else {
+
+        // The last layer has its derivative computed specially, since it needs to capture
+        // the derivative of the criterion function.
+        if i == layers.count - 1 {
+            criterion.compute_deltas(layers[i].deltas, layers[i].neurons, expected_output);
+
+        } else {
+            for j: layers[i].neurons.count {
                 d_neuron: f32 = 0;
                 for k: layers[i + 1].neurons.count {
                     d_neuron += layers[i + 1].deltas[k] * layers[i + 1].weights[k][j];
                 }
-                layers[i].deltas[j] = d_neuron * d_sigmoid_value;
+
+                layers[i].deltas[j] = d_neuron;
             }
         }
+
+        // Here we multiply by the derivative of the activation function for each neuron.
+        for j: layers[i].deltas.count {
+            d_sigmoid_value := layers[i].activation.backward(layers[i].neurons[j], layers[i].pre_activation_neurons[j]);
+            layers[i].deltas[j] *= d_sigmoid_value;
+        }
     }
 
     // Once all the deltas are computed, we can use them to compute the actual
     // derivatives and update the biases and weights.
+    // This part is responsible for optimization, and can easily be swapped out.
     for i: 1 .. layers.count {
         for j: layers[i].neurons.count {
             if layers[i].use_bias {
@@ -107,21 +113,8 @@ neural_net_get_prediction :: (use nn: ^NeuralNet) -> i32 {
     return greatest_idx;
 }
 
-neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32) -> f32 {
-    // MSE loss
-    assert(layers[layers.count - 1].neurons.count == expected_output.count,
-            "Expected output does not have the same size as the last layer.");
-
-    output := layers[layers.count - 1].neurons;
-
-    squared_sum: f32 = 0;
-    for i: expected_output.count {
-        diff := output[i] - expected_output[i];
-        squared_sum += diff * diff;
-    }
-
-    loss := squared_sum / ~~expected_output.count;
-    return loss;
+neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Criterion) -> f32 {
+    return criterion.compute_loss(layers[layers.count - 1].neurons, expected_output);
 }
 
 
@@ -136,7 +129,8 @@ Layer :: struct {
     neurons : [] f32;
     pre_activation_neurons : [] f32;
 
-    deltas : [] f32;
+    // The deltas could possibly be stored in the optimizer.
+    deltas : [] f32;
 }
 
 layer_init :: (use layer: ^Layer, layer_size: u32, prev_layer_size: u32, allocator := context.allocator, allocate_weights_and_biases := true) {
@@ -275,6 +269,13 @@ neural_net_load :: (filename: str) -> NeuralNet {
 
 
 
+//
+// Activation functions
+// The activation functions that are currently implemented are:
+//  - Sigmoid
+//  - Hyperbolic Tangent
+//  - ReLU
+//
 
 
 // Solely used for serializing. Need a way to store the activation
@@ -283,12 +284,16 @@ ActivationFunctionID :: enum (u8) {
     Invalid :: 0x00;
     Sigmoid :: 0x01;
     Hyperbolic_Tangent :: 0x02;
+    ReLU :: 0x03;
 }
 
 activation_function_from_id :: (id: ActivationFunctionID) -> ActivationFunction {
+    use ActivationFunctionID;
+
     switch id {
-        case ActivationFunctionID.Sigmoid do return sigmoid_activation;
-        case ActivationFunctionID.Hyperbolic_Tangent do return tanh_activation;
+        case Sigmoid do return sigmoid_activation;
+        case Hyperbolic_Tangent do return tanh_activation;
+        case ReLU do return relu_activation;
 
         case #default do return ActivationFunction.{
             ActivationFunctionID.Invalid,
@@ -299,7 +304,7 @@ activation_function_from_id :: (id: ActivationFunctionID) -> ActivationFunction
 ActivationFunction :: struct {
     id : ActivationFunctionID;
 
-    forward : (x : f32) -> f32;
+    forward  : (x : f32) -> f32;
     backward : (fx: f32, x: f32) -> f32;
 }
 
@@ -337,4 +342,88 @@ tanh_prime :: (_: f32, x: f32) -> f32 {
     emx := math.exp(-x);
     s := emx + ex;
     return 4 / (s * s);
 }
+
+
+relu_activation := ActivationFunction.{
+    ActivationFunctionID.ReLU,
+    relu, relu_prime
+}
+
+relu :: (x: f32) -> f32 {
+    if x < 0 do return 0;
+    return x;
+}
+
+relu_prime :: (rx: f32, _: f32) -> f32 {
+    if rx > 0 do return 1;
+    return 0;
+}
+
+
+//
+// Criteria
+// Currently, these are the implemented criteria:
+//  - MSE (Mean Squared Error)
+//  - MAE (Mean Absolute Error)
+//  - BCE (Binary Cross Entropy)
+//
+
+Criterion :: struct {
+    compute_loss   : (predictions: [] f32, expected: [] f32) -> f32;
+
+    // `deltas` is an out parameter that holds the derivatives.
+    compute_deltas : (deltas: [] f32, predictions: [] f32, expected: [] f32) -> void;
+}
+
+mean_squared_error := Criterion.{
+    compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+        assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+        squared_sum: f32 = 0;
+        for i: expected.count {
+            diff := prediction[i] - expected[i];
+            squared_sum += diff * diff;
+        }
+
+        loss := squared_sum / ~~expected.count;
+        return loss;
+    },
+
+    compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+        // Leaving the assert turned off for right now.
+        // assert(predictions.count == expected.count && expected.count == deltas.count, "Expected output does not have the same size as predictions.");
+
+        for j: deltas.count {
+            deltas[j] = 2 * (expected[j] - predictions[j]) / ~~expected.count;
+        }
+    },
+}
+
+mean_absolute_error := Criterion.{
+    compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+        assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+        abs_sum: f32 = 0;
+        for i: expected.count {
+            diff := prediction[i] - expected[i];
+            abs_sum += math.abs(diff);
+        }
+
+        loss := abs_sum / ~~expected.count;
+        return loss;
+    },
+
+    compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+        // Leaving the assert turned off for right now.
+        // assert(predictions.count == expected.count, "Expected output does not have the same size as predictions.");
+
+        for j: deltas.count {
+            deltas[j] = 1.0f;
+            if expected[j] < predictions[j] do deltas[j] = -1.0f;
+
+            // Technically, this division should be here, but it doesn't appear to be helping the gradient descent.
+            deltas[j] /= cast(f32) expected.count;
+        }
+    },
+}
\ No newline at end of file
-- 
2.25.1
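
A possible next step, sketched here rather than part of the patch above: docs/abstractions lists BCE (Binary Cross Entropy) as a criterion that is still needed, and it slots into the same Criterion pattern as mean_squared_error and mean_absolute_error. Everything below is an assumption made for illustration, including the name binary_cross_entropy, the clamping constants, and the natural-log call (written math.ln by analogy with the math.exp and math.abs calls above; the actual procedure name in the core math module may differ).

binary_cross_entropy := Criterion.{
    compute_loss = (predictions: [] f32, expected: [] f32) -> f32 {
        assert(predictions.count == expected.count, "Expected output does not have the same size as predictions.");

        sum: f32 = 0;
        for i: expected.count {
            // Clamp the prediction away from 0 and 1 so the logarithms stay finite.
            p := predictions[i];
            if p < 0.0001f do p = 0.0001f;
            if p > 0.9999f do p = 0.9999f;

            // math.ln is assumed here; substitute whatever natural-log procedure the math module provides.
            sum -= expected[i] * math.ln(p) + (1 - expected[i]) * math.ln(1 - p);
        }

        return sum / ~~expected.count;
    },

    compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
        for j: deltas.count {
            p := predictions[j];
            if p < 0.0001f do p = 0.0001f;
            if p > 0.9999f do p = 0.9999f;

            // Same sign convention as mean_squared_error above: the delta is the
            // derivative of the negated loss with respect to the prediction.
            deltas[j] = (expected[j] / p - (1 - expected[j]) / (1 - p)) / ~~expected.count;
        }
    },
}

Using it would just be a matter of passing it wherever mean_squared_error is passed today, for example neural_net_backward(nn, ~~ expected, binary_cross_entropy); in mnist.onyx.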
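
docs/abstractions also leaves the Optimizer abstraction open, and the new comment in neural_net_backward notes that the update loop "is responsible for optimization, and can easily be swapped out." One hedged sketch of what that interface could look like, mirroring how Criterion is a struct of procedure values; none of these names (Optimizer, learning_rate, step) exist in the patch and are made up for illustration:

Optimizer :: struct {
    learning_rate : f32;

    // Called once per example (or batch) after neural_net_backward has filled in
    // the deltas on every layer. An SGD implementation would hold the weight and
    // bias update loop that currently sits at the end of neural_net_backward;
    // Adam, AdaMax, and AdaGrad would additionally keep per-weight state here.
    step : (optimizer: ^Optimizer, nn: ^NeuralNet) -> void;
}

Whether the per-layer deltas stay on Layer (as the new comment on the deltas field suggests) or move into a struct like this is exactly the open question raised in docs/abstractions.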