From b79ce5fffaafbb38cfca998db4a7b9204fdd05e2 Mon Sep 17 00:00:00 2001
From: Brendan Hansen
Date: Mon, 25 Jan 2021 20:23:50 -0600
Subject: [PATCH] generalized criteria

---
 docs/abstractions  |  23 +++++++
 project.4coder     |  32 ++++++++++
 src/mnist.onyx     |  13 ++--
 src/neuralnet.onyx | 151 +++++++++++++++++++++++++++++++++++----------
 4 files changed, 181 insertions(+), 38 deletions(-)
 create mode 100644 docs/abstractions
 create mode 100644 project.4coder

diff --git a/docs/abstractions b/docs/abstractions
new file mode 100644
index 0000000..626fd14
--- /dev/null
+++ b/docs/abstractions
@@ -0,0 +1,23 @@
+Abstractions still needed:
+    * Optimizer
+        Currently, only SGD is implemented. It should be easy to add different
+        optimizers for the networks. An open question: should the optimizer
+        store the gradients computed during back propagation, or should they be
+        stored on the layers? I'm leaning towards storing them on the layers.
+
+        Other optimizers:
+            - Adam
+            - AdaMax
+            - AdaGrad
+
+    * Criteria
+        - MSE (implemented)
+        - MAE
+        - BCE
+
+    * Data Loader
+        Each dataloader will be different, but a common API should be added so
+        there can be an automatic training system that just pulls data from
+        the dataloader as it is needed. The dataloader then has the freedom
+        to cache or preload the data.
+
diff --git a/project.4coder b/project.4coder
new file mode 100644
index 0000000..9306168
--- /dev/null
+++ b/project.4coder
@@ -0,0 +1,32 @@
+version(1);
+project_name = "Onyx";
+
+patterns = {
+"*.onyx",
+"*.bat",
+"*.sh",
+"*.4coder",
+};
+blacklist_patterns = {
+".*",
+};
+load_paths_custom = {
+ {"."},
+};
+load_paths = {
+ { load_paths_custom, .os = "win"  },
+ { load_paths_custom, .os = "linux"},
+ { load_paths_custom, .os = "mac"  },
+};
+
+build_win32 = "\\dev\\onyx\\onyx.exe -V src\\mnist.onyx -o mnist.wasm";
+build_linux = "/usr/bin/onyx -V src/mnist.onyx -o mnist.wasm";
+
+command_list = {
+ { .name = "Build",
+   .out = "*compilation*", .footer_panel = true, .save_dirty_files = true,
+   .cmd = { {build_win32, .os ="win"  },
+            {build_linux, .os ="linux"}, }, },
+};
+
+fkey_command[1] = "Build";
diff --git a/src/mnist.onyx b/src/mnist.onyx
index dbb7fe6..897347a 100644
--- a/src/mnist.onyx
+++ b/src/mnist.onyx
@@ -58,7 +58,7 @@ stocastic_gradient_descent :: (nn: ^NeuralNet, mnist_data: ^MNIST_Data, training
             for i: input.count do input[i] = (cast(f32) cast(u32) example[i]) / 255;
 
             neural_net_forward(nn, ~~ input);
-            neural_net_backward(nn, ~~ expected);
+            neural_net_backward(nn, ~~ expected, mean_squared_error);
 
             prediction := neural_net_get_prediction(nn);
             if prediction == label do past_100_correct += 1;
@@ -86,19 +86,18 @@ stocastic_gradient_descent :: (nn: ^NeuralNet, mnist_data: ^MNIST_Data, training
                 print_colored_array(cast([] f32) expected, label, color);
                 print_colored_array(output, prediction, color);
 
-                loss := neural_net_loss(nn, ~~ expected);
-                printf("MSE loss: %f Correct: %i / 100\n", cast(f32) loss, past_100_correct);
+                loss := neural_net_loss(nn, ~~ expected, mean_squared_error);
+                printf("Loss: %f Correct: %i / 100\n", cast(f32) loss, past_100_correct);
 
                 past_100_correct = 0;
 
                 if ex % 10000 == 0 {
                     println("Saving neural network...");
-                    neural_net_save(nn, "data/test_2.nn");
+                    neural_net_save(nn, "data/test_3.nn");
                 }
             }
         }
     }
-
 }
 
 main :: (args: [] cstr) {
@@ -106,8 +105,8 @@ main :: (args: [] cstr) {
     // main_allocator := context.allocator;
    // context.allocator = alloc.log.logging_allocator(^main_allocator);
 
-    nn := neural_net_load("data/test_2.nn");
-    // nn := make_neural_net(28 * 28, 512, 256, 100, 10);
+//    nn := neural_net_load("data/test_2.nn");
+    nn := make_neural_net(28 * 28, 512, 256, 100, 10);
     defer neural_net_free(^nn);
 
     random.set_seed(5234);
diff --git a/src/neuralnet.onyx b/src/neuralnet.onyx
index c427793..3c5ea46 100644
--- a/src/neuralnet.onyx
+++ b/src/neuralnet.onyx
@@ -42,7 +42,7 @@ neural_net_forward :: (use nn: ^NeuralNet, input: [] f32) {
     }
 }
 
-neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32) {
+neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Criterion) {
     assert(layers[layers.count - 1].neurons.count == expected_output.count,
             "Expected output does not have the same size as the last layer.");
 
@@ -59,27 +59,33 @@ neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32) {
         // kind of an ambiguous term here. It specifically means the partial derivative
         // of the the loss with respect to the weighted sum of the previous layers
         // neurons, plus a bias.
-        for j: layers[i].neurons.count {
-            sigmoid_value := layers[i].neurons[j];
-            d_sigmoid_value := layers[i].activation.backward(sigmoid_value, layers[i].pre_activation_neurons[j]);
-
-            // The last layer has its derivative computed special, since it needs to capture
-            // the derivative of the MSE function.
-            if i == layers.count - 1 {
-                layers[i].deltas[j] = 2 * (expected_output[j] - sigmoid_value) * d_sigmoid_value / ~~expected_output.count;
-
-            } else {
+
+        // The last layer has its derivative computed specially, since it needs to capture
+        // the derivative of the criterion function.
+        if i == layers.count - 1 {
+            criterion.compute_deltas(layers[i].deltas, layers[i].neurons, expected_output);
+
+        } else {
+            for j: layers[i].neurons.count {
                 d_neuron: f32 = 0;
                 for k: layers[i + 1].neurons.count {
                     d_neuron += layers[i + 1].deltas[k] * layers[i + 1].weights[k][j];
                 }
-                layers[i].deltas[j] = d_neuron * d_sigmoid_value;
+
+                layers[i].deltas[j] = d_neuron;
             }
         }
+
+        // Here we multiply by the derivative of the activation function for each neuron.
+        for j: layers[i].deltas.count {
+            d_sigmoid_value := layers[i].activation.backward(layers[i].neurons[j], layers[i].pre_activation_neurons[j]);
+            layers[i].deltas[j] *= d_sigmoid_value;
+        }
     }
 
     // Once all the deltas are computed, we can use them to compute the actual
     // derivatives and update the biases and weights.
+    // This part is responsible for optimization, and can easily be swapped out.
     for i: 1 .. layers.count {
         for j: layers[i].neurons.count {
             if layers[i].use_bias {
@@ -107,21 +113,8 @@ neural_net_get_prediction :: (use nn: ^NeuralNet) -> i32 {
     return greatest_idx;
 }
 
-neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32) -> f32 {
-    // MSE loss
-    assert(layers[layers.count - 1].neurons.count == expected_output.count,
-            "Expected output does not have the same size as the last layer.");
-
-    output := layers[layers.count - 1].neurons;
-
-    squared_sum: f32 = 0;
-    for i: expected_output.count {
-        diff := output[i] - expected_output[i];
-        squared_sum += diff * diff;
-    }
-
-    loss := squared_sum / ~~expected_output.count;
-    return loss;
+neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Criterion) -> f32 {
+    return criterion.compute_loss(layers[layers.count - 1].neurons, expected_output);
 }
 
 
@@ -136,7 +129,8 @@ Layer :: struct {
     neurons : [] f32;
     pre_activation_neurons : [] f32;
 
-    deltas : [] f32;
+    // The deltas could possibly be stored in the optimizer.
+    deltas : [] f32;
 }
 
 layer_init :: (use layer: ^Layer, layer_size: u32, prev_layer_size: u32, allocator := context.allocator, allocate_weights_and_biases := true) {
@@ -275,6 +269,13 @@ neural_net_load :: (filename: str) -> NeuralNet {
 
 
 
+//
+// Activation functions
+// The activation functions that are currently implemented are:
+//  - Sigmoid
+//  - Hyperbolic Tangent
+//  - ReLU
+//
 
 
 // Solely used for serializing. Need a way to store the activation
@@ -283,12 +284,16 @@ ActivationFunctionID :: enum (u8) {
     Invalid :: 0x00;
     Sigmoid :: 0x01;
     Hyperbolic_Tangent :: 0x02;
+    ReLU :: 0x03;
 }
 
 activation_function_from_id :: (id: ActivationFunctionID) -> ActivationFunction {
+    use ActivationFunctionID;
+
     switch id {
-        case ActivationFunctionID.Sigmoid do return sigmoid_activation;
-        case ActivationFunctionID.Hyperbolic_Tangent do return tanh_activation;
+        case Sigmoid do return sigmoid_activation;
+        case Hyperbolic_Tangent do return tanh_activation;
+        case ReLU do return relu_activation;
 
         case #default do return ActivationFunction.{
             ActivationFunctionID.Invalid,
@@ -299,7 +304,7 @@ activation_function_from_id :: (id: ActivationFunctionID) -> ActivationFunction
 ActivationFunction :: struct {
     id : ActivationFunctionID;
 
-    forward : (x : f32) -> f32;
+    forward  : (x : f32) -> f32;
     backward : (fx: f32, x: f32) -> f32;
 }
 
@@ -337,4 +342,88 @@ tanh_prime :: (_: f32, x: f32) -> f32 {
     emx := math.exp(-x);
     s := emx + ex;
     return 4 / (s * s);
 }
+
+
+relu_activation := ActivationFunction.{
+    ActivationFunctionID.ReLU,
+    relu, relu_prime
+}
+
+relu :: (x: f32) -> f32 {
+    if x < 0 do return 0;
+    return x;
+}
+
+relu_prime :: (rx: f32, _: f32) -> f32 {
+    if rx > 0 do return 1;
+    return 0;
+}
+
+
+//
+// Criteria
+// Currently, these are the implemented criteria:
+//  - MSE (Mean Squared Error)
+//  - MAE (Mean Absolute Error)
+//  - BCE (Binary Cross Entropy)
+//
+
+Criterion :: struct {
+    compute_loss   : (predictions: [] f32, expected: [] f32) -> f32;
+
+    // `deltas` is an out parameter that holds the derivatives.
+    compute_deltas : (deltas: [] f32, predictions: [] f32, expected: [] f32) -> void;
+}
+
+mean_squared_error := Criterion.{
+    compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+        assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+        squared_sum: f32 = 0;
+        for i: expected.count {
+            diff := prediction[i] - expected[i];
+            squared_sum += diff * diff;
+        }
+
+        loss := squared_sum / ~~expected.count;
+        return loss;
+    },
+
+    compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+        // Leaving the assert turned off for right now.
+        // assert(predictions.count == expected.count && expected.count == deltas.count, "Expected output does not have the same size as predictions.");
+
+        for j: deltas.count {
+            deltas[j] = 2 * (expected[j] - predictions[j]) / ~~expected.count;
+        }
+    },
+}
+
+mean_absolute_error := Criterion.{
+    compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+        assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+        abs_sum: f32 = 0;
+        for i: expected.count {
+            diff := prediction[i] - expected[i];
+            abs_sum += math.abs(diff);
+        }
+
+        loss := abs_sum / ~~expected.count;
+        return loss;
+    },
+
+    compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+        // Leaving the assert turned off for right now.
+        // assert(predictions.count == expected.count, "Expected output does not have the same size as predictions.");
+
+        for j: deltas.count {
+            deltas[j] = 1.0f;
+            if expected[j] < predictions[j] do deltas[j] = -1.0f;
+
+            // Technically, this division should be here, but it doesn't appear to be helping the gradient descent.
+            deltas[j] /= cast(f32) expected.count;
+        }
+    },
+}
\ No newline at end of file
-- 
2.25.1
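
A possible next step, sketched here rather than part of the patch above: docs/abstractions lists BCE (Binary Cross Entropy) as a criterion that is still needed, and it slots into the same Criterion pattern as mean_squared_error and mean_absolute_error. Everything below is an assumption made for illustration, including the name binary_cross_entropy, the clamping constants, and the natural-log call (written math.ln by analogy with the math.exp and math.abs calls above; the actual procedure name in the core math module may differ).

binary_cross_entropy := Criterion.{
    compute_loss = (predictions: [] f32, expected: [] f32) -> f32 {
        assert(predictions.count == expected.count, "Expected output does not have the same size as predictions.");

        sum: f32 = 0;
        for i: expected.count {
            // Clamp the prediction away from 0 and 1 so the logarithms stay finite.
            p := predictions[i];
            if p < 0.0001f do p = 0.0001f;
            if p > 0.9999f do p = 0.9999f;

            // math.ln is assumed here; substitute whatever natural-log procedure the math module provides.
            sum -= expected[i] * math.ln(p) + (1 - expected[i]) * math.ln(1 - p);
        }

        return sum / ~~expected.count;
    },

    compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
        for j: deltas.count {
            p := predictions[j];
            if p < 0.0001f do p = 0.0001f;
            if p > 0.9999f do p = 0.9999f;

            // Same sign convention as mean_squared_error above: the delta is the
            // derivative of the negated loss with respect to the prediction.
            deltas[j] = (expected[j] / p - (1 - expected[j]) / (1 - p)) / ~~expected.count;
        }
    },
}

Using it would just be a matter of passing it wherever mean_squared_error is passed today, for example neural_net_backward(nn, ~~ expected, binary_cross_entropy); in mnist.onyx.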
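
docs/abstractions also leaves the Optimizer abstraction open, and the new comment in neural_net_backward notes that the update loop "is responsible for optimization, and can easily be swapped out." One hedged sketch of what that interface could look like, mirroring how Criterion is a struct of procedure values; none of these names (Optimizer, learning_rate, step) exist in the patch and are made up for illustration:

Optimizer :: struct {
    learning_rate : f32;

    // Called once per example (or batch) after neural_net_backward has filled in
    // the deltas on every layer. An SGD implementation would hold the weight and
    // bias update loop that currently sits at the end of neural_net_backward;
    // Adam, AdaMax, and AdaGrad would additionally keep per-weight state here.
    step : (optimizer: ^Optimizer, nn: ^NeuralNet) -> void;
}

Whether the per-layer deltas stay on Layer (as the new comment on the deltas field suggests) or move into a struct like this is exactly the open question raised in docs/abstractions.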