for i: input.count do input[i] = (cast(f32) cast(u32) example[i]) / 255;
neural_net_forward(nn, ~~ input);
- neural_net_backward(nn, ~~ expected);
+ neural_net_backward(nn, ~~ expected, mean_squared_error);
prediction := neural_net_get_prediction(nn);
if prediction == label do past_100_correct += 1;
print_colored_array(cast([] f32) expected, label, color);
print_colored_array(output, prediction, color);
- loss := neural_net_loss(nn, ~~ expected);
- printf("MSE loss: %f Correct: %i / 100\n", cast(f32) loss, past_100_correct);
+ loss := neural_net_loss(nn, ~~ expected, mean_squared_error);
+ printf("Loss: %f Correct: %i / 100\n", cast(f32) loss, past_100_correct);
past_100_correct = 0;
if ex % 10000 == 0 {
println("Saving neural network...");
- neural_net_save(nn, "data/test_2.nn");
+ neural_net_save(nn, "data/test_3.nn");
}
}
}
}
-
}
main :: (args: [] cstr) {
// main_allocator := context.allocator;
// context.allocator = alloc.log.logging_allocator(^main_allocator);
- nn := neural_net_load("data/test_2.nn");
- // nn := make_neural_net(28 * 28, 512, 256, 100, 10);
+// nn := neural_net_load("data/test_2.nn");
+ nn := make_neural_net(28 * 28, 512, 256, 100, 10);
defer neural_net_free(^nn);
random.set_seed(5234);
}
}
-neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32) {
+neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Criterion) {
assert(layers[layers.count - 1].neurons.count == expected_output.count,
"Expected output does not have the same size as the last layer.");
// kind of an ambiguous term here. It specifically means the partial derivative
// of the loss with respect to the weighted sum of the previous layer's
// neurons, plus a bias.
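+ // Concretely, deltas[j] holds d(loss)/d(z_j), where z_j is the pre-activation value
+ // of neuron j (that weighted sum plus bias). The output layer gets its deltas from
+ // the criterion; hidden layers accumulate them from the next layer's deltas via the
+ // chain rule.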
- for j: layers[i].neurons.count {
- sigmoid_value := layers[i].neurons[j];
- d_sigmoid_value := layers[i].activation.backward(sigmoid_value, layers[i].pre_activation_neurons[j]);
-
- // The last layer has its derivative computed special, since it needs to capture
- // the derivative of the MSE function.
- if i == layers.count - 1 {
- layers[i].deltas[j] = 2 * (expected_output[j] - sigmoid_value) * d_sigmoid_value / ~~expected_output.count;
-
- } else {
+
+ // The last layer has its derivative computed specially, since it needs to capture
+ // the derivative of the criterion function.
+ if i == layers.count - 1 {
+ criterion.compute_deltas(layers[i].deltas, layers[i].neurons, expected_output);
+
+ } else {
+ for j: layers[i].neurons.count {
d_neuron: f32 = 0;
for k: layers[i + 1].neurons.count {
d_neuron += layers[i + 1].deltas[k] * layers[i + 1].weights[k][j];
}
- layers[i].deltas[j] = d_neuron * d_sigmoid_value;
+
+ layers[i].deltas[j] = d_neuron;
}
}
+
+ // Here we multiply by the derivative of the activation function for each neuron.
+ for j: layers[i].deltas.count {
+ d_activation_value := layers[i].activation.backward(layers[i].neurons[j], layers[i].pre_activation_neurons[j]);
+ layers[i].deltas[j] *= d_activation_value;
+ }
}
// Once all the deltas are computed, we can use them to compute the actual
// derivatives and update the biases and weights.
+ // This part is responsible for optimization, and can easily be swapped out.
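+ // (For a plain gradient-descent step, each weight moves by
+ // learning_rate * delta * previous-layer neuron value, and each bias by
+ // learning_rate * delta.)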
for i: 1 .. layers.count {
for j: layers[i].neurons.count {
if layers[i].use_bias {
return greatest_idx;
}
-neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32) -> f32 {
- // MSE loss
- assert(layers[layers.count - 1].neurons.count == expected_output.count,
- "Expected output does not have the same size as the last layer.");
-
- output := layers[layers.count - 1].neurons;
-
- squared_sum: f32 = 0;
- for i: expected_output.count {
- diff := output[i] - expected_output[i];
- squared_sum += diff * diff;
- }
-
- loss := squared_sum / ~~expected_output.count;
- return loss;
+neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Criterion) -> f32 {
+ return criterion.compute_loss(layers[layers.count - 1].neurons, expected_output);
}
neurons : [] f32;
pre_activation_neurons : [] f32;
- deltas : [] f32;
+ // The deltas could possibly be stored in the optimizer.
+ deltas : [] f32;
}
layer_init :: (use layer: ^Layer, layer_size: u32, prev_layer_size: u32, allocator := context.allocator, allocate_weights_and_biases := true) {
+//
+// Activation functions
+// The activation functions that are currently implemented are:
+// - Sigmoid
+// - Hyperbolic Tangent
+// - ReLU
+//
// Solely used for serializing. Need a way to store the activation
Invalid :: 0x00;
Sigmoid :: 0x01;
Hyperbolic_Tangent :: 0x02;
+ ReLU :: 0x03;
}
activation_function_from_id :: (id: ActivationFunctionID) -> ActivationFunction {
+ use ActivationFunctionID;
+
switch id {
- case ActivationFunctionID.Sigmoid do return sigmoid_activation;
- case ActivationFunctionID.Hyperbolic_Tangent do return tanh_activation;
+ case Sigmoid do return sigmoid_activation;
+ case Hyperbolic_Tangent do return tanh_activation;
+ case ReLU do return relu_activation;
case #default do return ActivationFunction.{
ActivationFunctionID.Invalid,
ActivationFunction :: struct {
id : ActivationFunctionID;
- forward : (x : f32) -> f32;
+ forward : (x : f32) -> f32;
backward : (fx: f32, x: f32) -> f32;
}
emx := math.exp(-x);
s := emx + ex;
return 4 / (s * s);
+}
+
+
+relu_activation := ActivationFunction.{
+ ActivationFunctionID.ReLU,
+ relu, relu_prime
+}
+
+relu :: (x: f32) -> f32 {
+ if x < 0 do return 0;
+ return x;
+}
+
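+// relu_prime receives the forward value first: since relu(x) > 0 exactly when x > 0,
+// the derivative can be recovered from the output alone, so the pre-activation
+// argument is ignored.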
+relu_prime :: (rx: f32, _: f32) -> f32 {
+ if rx > 0 do return 1;
+ return 0;
+}
+
+
+//
+// Criteria
+// Currently, these are the implemented criteria:
+// - MSE (Mean Squared Error)
+// - MAE (Mean Absolute Error)
+// - BCE (Binary Cross Entropy)
+//
+
+Criterion :: struct {
+ compute_loss : (predictions: [] f32, expected: [] f32) -> f32;
+
+ // `deltas` is an out parameter that holds the derivatives.
+ compute_deltas : (deltas: [] f32, predictions: [] f32, expected: [] f32) -> void;
+}
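+
+// A criterion is chosen at the call site, for example:
+//     neural_net_backward(nn, ~~ expected, mean_squared_error);
+//     loss := neural_net_loss(nn, ~~ expected, mean_squared_error);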
+
+mean_squared_error := Criterion.{
+ compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+ assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+ squared_sum: f32 = 0;
+ for i: expected.count {
+ diff := prediction[i] - expected[i];
+ squared_sum += diff * diff;
+ }
+
+ loss := squared_sum / ~~expected.count;
+ return loss;
+ },
+
+ compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+ // Leaving the assert turned off for right now.
+ // assert(predictions.count == expected.count && expected.count == deltas.count, "Expected output does not have the same size as predictions.");
+
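+ // The activation derivative is not applied here; neural_net_backward multiplies
+ // every delta by it afterwards.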
+ for j: deltas.count {
+ deltas[j] = 2 * (expected[j] - predictions[j]) / ~~expected.count;
+ }
+ },
+}
+
+mean_absolute_error := Criterion.{
+ compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+ assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+ absolute_sum: f32 = 0;
+ for i: expected.count {
+ diff := prediction[i] - expected[i];
+ absolute_sum += math.abs(diff);
+ }
+
+ loss := absolute_sum / ~~expected.count;
+ return loss;
+ },
+
+ compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+ // Leaving the assert turned off for right now.
+ // assert(predictions.count == expected.count, "Expected output does not have the same size as predictions.");
+
+ for j: deltas.count {
+ deltas[j] = 1.0f;
+ if expected[j] < predictions[j] do deltas[j] = -1.0f;
+
+ // Technically, this division should be here, but it doesn't appear to be helping the gradient descent.
+ deltas[j] /= cast(f32) expected.count;
+ }
+ },
}
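+
+
+// The criteria comment above also lists binary cross entropy, which does not appear
+// in this part of the diff. A minimal sketch following the same Criterion interface
+// could look like the following; the clamping constants and the availability of a
+// natural-log function (math.ln) are assumptions, not taken from the original code.
+binary_cross_entropy := Criterion.{
+ compute_loss = (prediction: [] f32, expected: [] f32) -> f32 {
+ assert(prediction.count == expected.count, "Expected output does not have the same size as predictions.");
+
+ sum: f32 = 0;
+ for i: expected.count {
+ // Keep the prediction away from 0 and 1 so the logarithms stay finite.
+ p := prediction[i];
+ if p < 0.0001f do p = 0.0001f;
+ if p > 0.9999f do p = 0.9999f;
+
+ // math.ln (natural logarithm) is assumed to exist in the core math package.
+ sum += -(expected[i] * math.ln(p) + (1 - expected[i]) * math.ln(1 - p));
+ }
+
+ loss := sum / ~~expected.count;
+ return loss;
+ },
+
+ compute_deltas = (deltas: [] f32, predictions: [] f32, expected: [] f32) {
+ for j: deltas.count {
+ p := predictions[j];
+ if p < 0.0001f do p = 0.0001f;
+ if p > 0.9999f do p = 0.9999f;
+
+ // Same sign convention as the other criteria: the delta is positive when the
+ // prediction should increase.
+ deltas[j] = (expected[j] / p - (1 - expected[j]) / (1 - p)) / ~~expected.count;
+ }
+ },
+}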
\ No newline at end of file