From 709da2a4475e8b07052dd741de18ecc82090c256 Mon Sep 17 00:00:00 2001
From: Brendan Hansen
Date: Thu, 28 Jan 2021 11:01:14 -0600
Subject: [PATCH] refactored to use a separate optimizer; loading/saving networks is much slower now

---
 project.4coder     |   2 +-
 src/mnist.onyx     |  27 ++++---
 src/neuralnet.onyx | 175 +++++++++++++++++++++++++++++++++++----------
 3 files changed, 157 insertions(+), 47 deletions(-)

diff --git a/project.4coder b/project.4coder
index 50d3127..f8f7ca0 100644
--- a/project.4coder
+++ b/project.4coder
@@ -19,7 +19,7 @@ load_paths = {
     { load_paths_custom, .os = "mac" },
 };
 
-build_win32 = "\\dev\\onyx\\onyx.exe -V src\\cifar10.onyx -o network.wasm";
+build_win32 = "\\dev\\onyx\\onyx.exe -V src\\mnist.onyx -o network.wasm";
 build_linux = "/usr/bin/onyx -V src/mnist.onyx -o mnist.wasm";
 
 command_list = {
diff --git a/src/mnist.onyx b/src/mnist.onyx
index 155542a..6163218 100644
--- a/src/mnist.onyx
+++ b/src/mnist.onyx
@@ -61,8 +61,7 @@ mnist_dataloader_functions := DataLoader_Functions.{
     }
 }
 
-
-stocastic_gradient_descent :: (nn: ^NeuralNet, dataloader: ^DataLoader, criterion: Criterion = mean_squared_error) {
+train :: (nn: ^NeuralNet, dataloader: ^DataLoader, optimizer: ^Optimizer, criterion: Criterion = mean_squared_error) {
     input := memory.make_slice(f32, 784);
     defer cfree(input.data);
     expected : [10] f32;
@@ -71,14 +70,21 @@ stocastic_gradient_descent :: (nn: ^NeuralNet, dataloader: ^DataLoader, criterio
 
     past_100_correct := 0;
     for i: 10 {
+        printf("Starting epoch %i ===================================\n", i);
         for ex: training_example_count {
             dataloader_get_item(dataloader, ex, input, ~~ expected);
 
+            // NOTE(Brendan Hansen): Currently, zeroing the gradient is not
+            // necessary because neural_net_backward replaces the gradient;
+            // in other words, it doesn't add to the existing gradient.
+            // optimizer_zero_gradient(optimizer);
+
             neural_net_forward(nn, ~~ input);
             neural_net_backward(nn, ~~ expected, criterion);
-
-            // The optimizing step should be put here.
-
+            optimizer_step(optimizer);
+
+
+            // NOTE(Brendan Hansen): Prediction printing and tracking.
             label, _ := array.greatest(expected);
             prediction := neural_net_get_prediction(nn);
             if prediction == label do past_100_correct += 1;
@@ -111,12 +117,10 @@ stocastic_gradient_descent :: (nn: ^NeuralNet, dataloader: ^DataLoader, criterio
 
                 past_100_correct = 0;
 
-                /*
                 if ex % 10000 == 0 {
                     println("Saving neural network...");
-                    neural_net_save(nn, "data/test_4.nn");
+                    neural_net_save(nn, "data/still_working.nn");
                 }
-                */
             }
         }
     }
@@ -127,7 +131,6 @@ main :: (args: [] cstr) {
     // main_allocator := context.allocator;
     // context.allocator = alloc.log.logging_allocator(^main_allocator);
 
-    //nn := neural_net_load("data/test_3.nn");
     nn := make_neural_net(28 * 28, 512, 256, 100, 10);
     defer neural_net_free(^nn);
 
@@ -136,5 +139,9 @@ main :: (args: [] cstr) {
     mnist_data := mnist_data_make();
     defer mnist_data_close(^mnist_data);
 
-    stocastic_gradient_descent(^nn, ^mnist_data);
+    optimizer := sgd_optimizer_create(^nn, learning_rate = 0.005f);
+    neural_net_supply_parameters(^nn, ^optimizer);
+
+    println("Starting training");
+    train(^nn, ^mnist_data, ^optimizer);
 }
\ No newline at end of file
diff --git a/src/neuralnet.onyx b/src/neuralnet.onyx
index 093b0f8..ad2d71e 100644
--- a/src/neuralnet.onyx
+++ b/src/neuralnet.onyx
@@ -1,5 +1,15 @@
 use package core
 
+
+//
+// Variable
+//
+// TODO(Brendan Hansen): Document this better
+Variable :: struct {
+    value : f32;
+    delta : f32;
+}
+
 //
 // General purpose Multi-Layer Perceptron (MLP)
 //
@@ -51,8 +61,6 @@ neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32, criterion:
     assert(layers[layers.count - 1].neurons.count == expected_output.count,
         "Expected output does not have the same size as the last layer.");
 
-    LEARNING_RATE :: cast(f32) 0.01;
-
     // NOTE(Brendan Hansen):
     // Iterating backwards through the layers (hence the name "back propagation")
     // The reason this is necessary is because we need to know the derivatives of
@@ -93,11 +101,12 @@ neural_net_backward :: (use nn: ^NeuralNet, expected_output: [] f32, criterion:
     for i: 1 .. layers.count {
         for j: layers[i].neurons.count {
             if layers[i].use_bias {
-                layers[i].biases[j] += LEARNING_RATE * layers[i].deltas[j];
+                layers[i].biases[j].delta = layers[i].deltas[j];
             }
 
-            for k: layers[i].weights[j].count {
-                layers[i].weights[j][k] += LEARNING_RATE * layers[i].deltas[j] * layers[i - 1].neurons[k];
+            prev_layer_count := layers[i - 1].neurons.count;
+            for k: prev_layer_count {
+                layers[i].weights[j * prev_layer_count + k].delta = layers[i].deltas[j] * layers[i - 1].neurons[k];
             }
         }
     }
@@ -121,19 +130,25 @@ neural_net_loss :: (use nn: ^NeuralNet, expected_output: [] f32, criterion: Crit
 
     return criterion.compute_loss(layers[layers.count - 1].neurons, expected_output);
 }
 
+neural_net_supply_parameters :: (use nn: ^NeuralNet, optimizer: ^Optimizer) {
+    for ^layer: layers {
+        if layer.biases.data != null do array.push(^optimizer.variable_arrays, ^layer.biases);
+        if layer.weights.data != null do array.push(^optimizer.variable_arrays, ^layer.weights);
+    }
+}
+
 Layer :: struct {
     use_bias : bool;
     is_input : bool;
     activation : ActivationFunction;
 
-    biases : [] f32;
-    weights : [][] f32; // CLEANUP: Make this a rank 1 slice
+    biases : [] Variable;
+    weights : [] Variable;
 
     neurons : [] f32;
     pre_activation_neurons : [] f32;
 
-    // The deltas could possibly be stored in the optimizer.
     deltas : [] f32;
 }
 
@@ -149,13 +164,10 @@ layer_init :: (use layer: ^Layer, layer_size: u32, prev_layer_size: u32, allocat
 
     if !is_input && allocate_weights_and_biases {
         if use_bias {
-            biases = memory.make_slice(f32, layer_size, allocator);
+            biases = memory.make_slice(Variable, layer_size, allocator);
         }
 
-        weights = memory.make_slice(#type [] f32, layer_size, allocator);
-        for ^weight: weights {
-            *weight = memory.make_slice(f32, prev_layer_size, allocator);
-        }
+        weights = memory.make_slice(Variable, layer_size * prev_layer_size, allocator);
 
         randomize_weights_and_biases(layer);
     }
@@ -163,27 +175,25 @@ layer_init :: (use layer: ^Layer, layer_size: u32, prev_layer_size: u32, allocat
 
 randomize_weights_and_biases :: (use layer: ^Layer) {
     for ^weight: weights {
-        for ^w: *weight {
-            *w = cast(f32) random.float(-0.5f, 0.5f);
-        }
+        weight.value = cast(f32) random.float(-0.5f, 0.5f);
     }
 
     if use_bias {
-        for ^bias: biases do *bias = cast(f32) random.float(-0.5f, 0.5f);
+        for ^bias: biases do bias.value = cast(f32) random.float(-0.5f, 0.5f);
     }
 }
 
 layer_forward :: (use layer: ^Layer, prev_layer: ^Layer) {
     for i: neurons.count {
-        neurons[i] = 0;
-        if use_bias do neurons[i] = biases[i];
+        neuron: f32 = 0;
+        if use_bias do neuron = biases[i].value;
 
-        for j: weights[i].count {
-            neurons[i] += prev_layer.neurons[j] * weights[i][j];
+        for j: prev_layer.neurons.count {
+            neuron += prev_layer.neurons[j] * weights[i * prev_layer.neurons.count + j].value;
         }
 
-        pre_activation_neurons[i] = neurons[i];
-        neurons[i] = activation.forward(neurons[i]);
+        pre_activation_neurons[i] = neuron;
+        neurons[i] = activation.forward(neuron);
     }
 }
 
@@ -191,19 +201,17 @@ layer_backward :: (use layer: ^Layer, next_layer: ^Layer) {
     for j: neurons.count {
         d_neuron: f32 = 0;
         for k: next_layer.neurons.count {
-            d_neuron += next_layer.deltas[k] * next_layer.weights[k][j];
+            d_neuron += next_layer.deltas[k] * next_layer.weights[k * neurons.count + j].value;
         }
 
         d_sigmoid_value := activation.backward(neurons[j], pre_activation_neurons[j]);
 
-        // This could easily become '+=', which would allow for an accumulated gradient,
-        // before taking a step.
         deltas[j] = d_neuron * d_sigmoid_value;
     }
 }
 
 
-Onyx_NN_Magic_Number :: 0x4E4E584F
+Onyx_NN_Magic_Number := 0x4E4E584F
 
 neural_net_save :: (use nn: ^NeuralNet, filename: str) {
     err, output_file := io.open(filename, io.OpenMode.Write);
@@ -213,13 +221,13 @@ neural_net_save :: (use nn: ^NeuralNet, filename: str) {
     writer := io.binary_writer_make(^output_file);
 
     // Magic string
-    io.binary_write(^writer, i32, Onyx_NN_Magic_Number);
+    io.binary_write(^writer, i32, ^Onyx_NN_Magic_Number);
 
     // Number of layers
-    io.binary_write(^writer, i32, layers.count);
+    io.binary_write(^writer, i32, ^layers.count);
 
     for ^layer: layers {
-        io.binary_write(^writer, i32, layer.neurons.count);
+        io.binary_write(^writer, i32, ^layer.neurons.count);
 
         io.binary_write_byte(^writer, cast(u8) layer.is_input);
         if layer.is_input do continue;
@@ -228,11 +236,14 @@ neural_net_save :: (use nn: ^NeuralNet, filename: str) {
         io.binary_write_byte(^writer, cast(u8) layer.activation.id);
 
         if layer.use_bias {
-            io.binary_write_slice(^writer, layer.biases);
+// io.binary_write_slice(^writer, layer.biases);
+            for ^bias: layer.biases {
+                io.binary_write(^writer, f32, ^bias.value);
+            }
         }
-        
+
         for ^weight: layer.weights {
-            io.binary_write_slice(^writer, *weight);
+            io.binary_write(^writer, f32, ^weight.value);
         }
     }
 }
@@ -259,7 +270,7 @@ neural_net_load :: (filename: str) -> NeuralNet {
         layer_size := io.binary_read(^reader, i32);
         is_input := cast(bool) io.binary_read_byte(^reader);
 
-        layer_init(^nn.layers[l], layer_size, prev_layer_size, allocator = layer_allocator, allocate_weights_and_biases = false);
+        layer_init(^nn.layers[l], layer_size, prev_layer_size, allocator = layer_allocator);
 
         if !is_input {
             nn.layers[l].use_bias = cast(bool) io.binary_read_byte(^reader);
@@ -267,11 +278,15 @@ neural_net_load :: (filename: str) -> NeuralNet {
             nn.layers[l].activation = activation_function_from_id(activation_id);
 
             if nn.layers[l].use_bias {
-                nn.layers[l].biases = io.binary_read_slice(^reader, f32, layer_size, allocator = layer_allocator);
+                for i: layer_size {
+                    nn.layers[l].biases[i].value = io.binary_read(^reader, f32);
+                }
             }
 
             for w: layer_size {
-                nn.layers[l].weights[w] = io.binary_read_slice(^reader, f32, prev_layer_size, allocator = layer_allocator);
+                for ww: prev_layer_size {
+                    nn.layers[l].weights[w * prev_layer_size + ww].value = io.binary_read(^reader, f32);
+                }
             }
         }
 
@@ -474,3 +489,91 @@ dataloader_get_item :: (use data: ^DataLoader, index: u32, input: [] f32, output
 
     return vtable.get_item(data, index, input, output);
 }
+
+
+//
+// Optimizers
+//
+
+Optimizer :: struct {
+    vtable : ^Optimizer_Functions;
+    network : ^NeuralNet;
+
+    // TODO(Brendan Hansen): Make these fixed size slices?
+    // This would require knowing the exact parameter count for the network.
+
+    // NOTE(Brendan Hansen): Used to store standalone variables that need to be updated.
+    variables : [..] ^Variable;
+
+    // NOTE(Brendan Hansen): Used to store contiguously allocated variables that need to be updated.
+    // This prevents having a LOT of variables in the variables array.
+    variable_arrays : [..] ^[] Variable;
+}
+
+Optimizer_Functions :: struct {
+    step : (optimizer: ^Optimizer) -> void;
+}
+
+optimizer_init :: (use optim: ^Optimizer, nn: ^NeuralNet, allocator := context.allocator) {
+    network = nn;
+
+    #context_scope {
+        context.allocator = allocator;
+
+        variables = array.make(#type ^Variable);
+        variable_arrays = array.make(#type ^[] Variable);
+    }
+}
+
+optimizer_step :: (use optim: ^Optimizer) {
+    if vtable == null do return;
+    if vtable.step == null_proc do return;
+
+    vtable.step(optim);
+}
+
+optimizer_zero_gradient :: (use optim: ^Optimizer) {
+    for variable: variables {
+        variable.delta = 0;
+    }
+
+    for variable_array: variable_arrays {
+        for ^variable: *variable_array {
+            variable.delta = 0;
+        }
+    }
+}
+
+
+
+SGD_Optimizer :: struct {
+    use base : Optimizer;
+
+    learning_rate : f32;
+}
+
+sgd_optimizer_vtable := Optimizer_Functions.{
+    step = sgd_optimizer_step,
+};
+
+sgd_optimizer_create :: (nn: ^NeuralNet, learning_rate := 0.01f, allocator := context.allocator) -> SGD_Optimizer {
+    sgd : SGD_Optimizer;
+    sgd.vtable = ^sgd_optimizer_vtable;
+    optimizer_init(^sgd, nn, allocator);
+
+    sgd.learning_rate = learning_rate;
+
+    return sgd;
+}
+
+sgd_optimizer_step :: (use optimizer: ^SGD_Optimizer) {
+    for variable: variables {
+        variable.value += variable.delta * learning_rate;
+    }
+
+    for variable_array: variable_arrays {
+        for ^variable: *variable_array {
+            variable.value += variable.delta * learning_rate;
+        }
+    }
+}
\ No newline at end of file
-- 
2.25.1
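
Usage note (not part of the patch): the sketch below shows how the optimizer API introduced here is intended to be driven for a single training example, mirroring the train loop in src/mnist.onyx. It is a minimal illustration assuming the existing helpers from this repository (a constructed NeuralNet, a DataLoader such as mnist_data, dataloader_get_item, and the default mean_squared_error criterion); the procedure name example_training_step is hypothetical.

    example_training_step :: (nn: ^NeuralNet, data: ^DataLoader, optimizer: ^Optimizer) {
        input := memory.make_slice(f32, 784);
        defer cfree(input.data);
        expected : [10] f32;

        dataloader_get_item(data, 0, input, ~~ expected);

        // Optional for now: neural_net_backward overwrites each Variable's delta,
        // so the gradient does not strictly need to be zeroed between steps.
        optimizer_zero_gradient(optimizer);

        neural_net_forward(nn, ~~ input);                          // populate neurons
        neural_net_backward(nn, ~~ expected, mean_squared_error);  // populate Variable.delta
        optimizer_step(optimizer);                                 // apply deltas to Variable.value
    }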