}
}
-
-stocastic_gradient_descent :: (nn: ^NeuralNet, dataloader: ^DataLoader, criterion: Criterion = mean_squared_error) {
+train :: (nn: ^NeuralNet, dataloader: ^DataLoader, optimizer: ^Optimizer, criterion: Criterion = mean_squared_error) {
input := memory.make_slice(f32, 784);
defer cfree(input.data);
expected : [10] f32;
past_100_correct := 0;
for i: 10 {
+ printf("Staring epoch %i ===================================\n", i);
for ex: training_example_count {
dataloader_get_item(dataloader, ex, input, ~~ expected);
+ // NOTE(Brendan Hansen): Currently, zeroing the gradient is not
+ // necessary because neural_net_backward replaces the gradient,
+ // in other words it doesn't add to the existing gradient.
+ // optimizer_zero_gradient(optimizer);
+
neural_net_forward(nn, ~~ input);
neural_net_backward(nn, ~~ expected, criterion);
-
- // The optimizing step should be put here.
-
+ optimizer_step(optimizer);
+
+ // NOTE(Brendan Hansen): Prediction printing and tracking.
label, _ := array.greatest(expected);
prediction := neural_net_get_prediction(nn);
if prediction == label do past_100_correct += 1;
past_100_correct = 0;
- /*
if ex % 10000 == 0 {
println("Saving neural network...");
- neural_net_save(nn, "data/test_4.nn");
+ neural_net_save(nn, "data/still_working.nn");
}
- */
}
}
}
// main_allocator := context.allocator;
// context.allocator = alloc.log.logging_allocator(^main_allocator);
- //nn := neural_net_load("data/test_3.nn");
nn := make_neural_net(28 * 28, 512, 256, 100, 10);
defer neural_net_free(^nn);
mnist_data := mnist_data_make();
defer mnist_data_close(^mnist_data);
- stocastic_gradient_descent(^nn, ^mnist_data);
+ optimizer := sgd_optimizer_create(^nn, learning_rate = 0.005f);
+ neural_net_supply_parameters(^nn, ^optimizer);
+
+ println("Starting training");
+ train(^nn, ^mnist_data, ^optimizer);
}
\ No newline at end of file
use package core
+
+//
+// Variable
+//
+// TODO(Brendan Hansen): Document this better
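+//
+// A Variable pairs a learnable parameter (value) with the gradient term
+// computed for it by neural_net_backward (delta), which the optimizer
+// scales by the learning rate and applies in optimizer_step.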
+Variable :: struct {
+ value : f32;
+ delta : f32;
+}
+
//
// General purpose Multi-Layer Perceptron (MLP)
//
assert(layers[layers.count - 1].neurons.count == expected_output.count,
"Expected output does not have the same size as the last layer.");
- LEARNING_RATE :: cast(f32) 0.01;
-
// NOTE(Brendan Hansen):
// Iterating backwards through the layers (hence the name "back propagation")
// The reason this is necessary is because we need to know the derivatives of
for i: 1 .. layers.count {
for j: layers[i].neurons.count {
if layers[i].use_bias {
- layers[i].biases[j] += LEARNING_RATE * layers[i].deltas[j];
+ layers[i].biases[j].delta = layers[i].deltas[j];
}
- for k: layers[i].weights[j].count {
- layers[i].weights[j][k] += LEARNING_RATE * layers[i].deltas[j] * layers[i - 1].neurons[k];
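+ // The weights are now a single flat, row-major slice: the weights feeding
+ // neuron j of this layer start at index j * prev_layer_count.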
+ prev_layer_count := layers[i - 1].neurons.count;
+ for k: prev_layer_count {
+ layers[i].weights[j * prev_layer_count + k].delta = layers[i].deltas[j] * layers[i - 1].neurons[k];
}
}
}
return criterion.compute_loss(layers[layers.count - 1].neurons, expected_output);
}
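+// Hands each layer's bias and weight slices to the optimizer, so that
+// optimizer_step can update every parameter in place.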
+neural_net_supply_parameters :: (use nn: ^NeuralNet, optimizer: ^Optimizer) {
+ for ^layer: layers {
+ if layer.biases.data != null do array.push(^optimizer.variable_arrays, ^layer.biases);
+ if layer.weights.data != null do array.push(^optimizer.variable_arrays, ^layer.weights);
+ }
+}
+
Layer :: struct {
use_bias : bool;
is_input : bool;
activation : ActivationFunction;
- biases : [] f32;
- weights : [][] f32; // CLEANUP: Make this a rank 1 slice
+ biases : [] Variable;
+ weights : [] Variable;
neurons : [] f32;
pre_activation_neurons : [] f32;
- // The deltas could possibly be stored in the optimizer.
deltas : [] f32;
}
if !is_input && allocate_weights_and_biases {
if use_bias {
- biases = memory.make_slice(f32, layer_size, allocator);
+ biases = memory.make_slice(Variable, layer_size, allocator);
}
- weights = memory.make_slice(#type [] f32, layer_size, allocator);
- for ^weight: weights {
- *weight = memory.make_slice(f32, prev_layer_size, allocator);
- }
+ weights = memory.make_slice(Variable, layer_size * prev_layer_size, allocator);
randomize_weights_and_biases(layer);
}
randomize_weights_and_biases :: (use layer: ^Layer) {
for ^weight: weights {
- for ^w: *weight {
- *w = cast(f32) random.float(-0.5f, 0.5f);
- }
+ weight.value = cast(f32) random.float(-0.5f, 0.5f);
}
if use_bias {
- for ^bias: biases do *bias = cast(f32) random.float(-0.5f, 0.5f);
+ for ^bias: biases do bias.value = cast(f32) random.float(-0.5f, 0.5f);
}
}
layer_forward :: (use layer: ^Layer, prev_layer: ^Layer) {
for i: neurons.count {
- neurons[i] = 0;
- if use_bias do neurons[i] = biases[i];
+ neuron: f32 = 0;
+ if use_bias do neuron = biases[i].value;
- for j: weights[i].count {
- neurons[i] += prev_layer.neurons[j] * weights[i][j];
+ for j: prev_layer.neurons.count {
+ neuron += prev_layer.neurons[j] * weights[i * prev_layer.neurons.count + j].value;
}
- pre_activation_neurons[i] = neurons[i];
- neurons[i] = activation.forward(neurons[i]);
+ pre_activation_neurons[i] = neuron;
+ neurons[i] = activation.forward(neuron);
}
}
for j: neurons.count {
d_neuron: f32 = 0;
for k: next_layer.neurons.count {
- d_neuron += next_layer.deltas[k] * next_layer.weights[k][j];
+ d_neuron += next_layer.deltas[k] * next_layer.weights[k * neurons.count + j].value;
}
d_sigmoid_value := activation.backward(neurons[j], pre_activation_neurons[j]);
- // This could easily become '+=', which would allow for an accumulated gradient,
- // before taking a step.
deltas[j] = d_neuron * d_sigmoid_value;
}
}
-Onyx_NN_Magic_Number :: 0x4E4E584F
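+// Now a global variable rather than a compile-time constant, so that its
+// address can be passed to io.binary_write below.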
+Onyx_NN_Magic_Number := 0x4E4E584F
neural_net_save :: (use nn: ^NeuralNet, filename: str) {
err, output_file := io.open(filename, io.OpenMode.Write);
writer := io.binary_writer_make(^output_file);
// Magic string
- io.binary_write(^writer, i32, Onyx_NN_Magic_Number);
+ io.binary_write(^writer, i32, ^Onyx_NN_Magic_Number);
// Number of layers
- io.binary_write(^writer, i32, layers.count);
+ io.binary_write(^writer, i32, ^layers.count);
for ^layer: layers {
- io.binary_write(^writer, i32, layer.neurons.count);
+ io.binary_write(^writer, i32, ^layer.neurons.count);
io.binary_write_byte(^writer, cast(u8) layer.is_input);
if layer.is_input do continue;
io.binary_write_byte(^writer, cast(u8) layer.activation.id);
if layer.use_bias {
- io.binary_write_slice(^writer, layer.biases);
+// io.binary_write_slice(^writer, layer.biases);
+ for ^bias: layer.biases {
+ io.binary_write(^writer, f32, ^bias.value);
+ }
}
-
+
for ^weight: layer.weights {
- io.binary_write_slice(^writer, *weight);
+ io.binary_write(^writer, f32, ^weight.value);
}
}
}
layer_size := io.binary_read(^reader, i32);
is_input := cast(bool) io.binary_read_byte(^reader);
- layer_init(^nn.layers[l], layer_size, prev_layer_size, allocator = layer_allocator, allocate_weights_and_biases = false);
+ layer_init(^nn.layers[l], layer_size, prev_layer_size, allocator = layer_allocator);
if !is_input {
nn.layers[l].use_bias = cast(bool) io.binary_read_byte(^reader);
nn.layers[l].activation = activation_function_from_id(activation_id);
if nn.layers[l].use_bias {
- nn.layers[l].biases = io.binary_read_slice(^reader, f32, layer_size, allocator = layer_allocator);
+ for i: layer_size {
+ nn.layers[l].biases[i].value = io.binary_read(^reader, f32);
+ }
}
for w: layer_size {
- nn.layers[l].weights[w] = io.binary_read_slice(^reader, f32, prev_layer_size, allocator = layer_allocator);
+ for ww: prev_layer_size {
+ nn.layers[l].weights[w * prev_layer_size + ww].value = io.binary_read(^reader, f32);
+ }
}
}
return vtable.get_item(data, index, input, output);
}
+
+
+//
+// Optimizers
+//
+
+Optimizer :: struct {
+ vtable : ^Optimizer_Functions;
+ network : ^NeuralNet;
+
+ // TODO(Brendan Hansen): Make these fixed-size slices?
+ // This would require knowing the exact parameter count for the network.
+
+ // NOTE(Brendan Hansen): Used to store standalone variables that need to be updated.
+ variables : [..] ^Variable;
+
+ // NOTE(Brendan Hansen): Used to store contiguously allocated variables that need to be updated.
+ // This prevents having a LOT of variables in the variables array.
+ variable_arrays : [..] ^[] Variable;
+}
+
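+// Table of optimizer-specific procedures; each concrete optimizer
+// (such as SGD_Optimizer below) provides its own step implementation.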
+Optimizer_Functions :: struct {
+ step : (optimizer: ^Optimizer) -> void;
+}
+
+optimizer_init :: (use optim: ^Optimizer, nn: ^NeuralNet, allocator := context.allocator) {
+ network = nn;
+
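+ // Allocate the variable arrays with the caller-supplied allocator.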
+ #context_scope {
+ context.allocator = allocator;
+
+ variables = array.make(#type ^Variable);
+ variable_arrays = array.make(#type ^[] Variable);
+ }
+}
+
+optimizer_step :: (use optim: ^Optimizer) {
+ if vtable == null do return;
+ if vtable.step == null_proc do return;
+
+ vtable.step(optim);
+}
+
+optimizer_zero_gradient :: (use optim: ^Optimizer) {
+ for variable: variables {
+ variable.delta = 0;
+ }
+
+ for variable_array: variable_arrays {
+ for ^variable: *variable_array {
+ variable.delta = 0;
+ }
+ }
+}
+
+
+SGD_Optimizer :: struct {
+ use base : Optimizer;
+
+ learning_rate : f32;
+}
+
+sgd_optimizer_vtable := Optimizer_Functions.{
+ step = sgd_optimizer_step,
+};
+
+sgd_optimizer_create :: (nn: ^NeuralNet, learning_rate := 0.01f, allocator := context.allocator) -> SGD_Optimizer {
+ sgd : SGD_Optimizer;
+ sgd.vtable = ^sgd_optimizer_vtable;
+ optimizer_init(^sgd, nn, allocator);
+
+ sgd.learning_rate = learning_rate;
+
+ return sgd;
+}
+
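+// Vanilla stochastic gradient descent step. The deltas stored by
+// neural_net_backward already carry the sign that the old
+// '+= LEARNING_RATE * delta' update used, so the step adds them here as well.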
+sgd_optimizer_step :: (use optimizer: ^SGD_Optimizer) {
+ for variable: variables {
+ variable.value += variable.delta * learning_rate;
+ }
+
+ for variable_array: variable_arrays {
+ for ^variable: *variable_array {
+ variable.value += variable.delta * learning_rate;
+ }
+ }
+}
\ No newline at end of file