From 47b2d2e96732682cc9e1f26621f4693bcfc04c1b Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sat, 7 Mar 2026 14:47:59 +0100 Subject: [PATCH 01/24] Made addition broadcast on tensor + vector for adding bias --- readme.md | 8 ++-- src/backend/computational_graph/add_node.cpp | 12 ++++- src/backend/computational_graph/add_node.h | 13 ++++- .../computational_graph/topological_sort.cpp | 1 + src/backend/data_modeling/dim_type.cpp | 23 +++++++++ src/backend/data_modeling/dim_type.h | 2 + src/backend/data_modeling/tensor.cpp | 43 ++++++++++------- .../data_modeling/tensor_functions.cpp | 34 +++++++++++++ src/backend/data_modeling/tensor_functions.h | 3 ++ src/backend/utility/global_params.h | 6 ++- tests/backend/test_data_modeling.cpp | 48 +++++++++---------- 11 files changed, 143 insertions(+), 50 deletions(-) diff --git a/readme.md b/readme.md index 8c531f0..7c1ef22 100644 --- a/readme.md +++ b/readme.md @@ -34,10 +34,10 @@ For some examples on Python interface, see tests/python. 🚧 **Work in Progress** - Implementing additional layers and optimizations Roadmap: -- [ ] Python Binding Unit Tests -- [ ] Additional layer types (Conv2D, LSTM, etc.) +- [x] Python Binding Unit Tests +- [ ] Additional layer types (Conv2D, Dropout, etc.) 
- [ ] Optimizers and training framework -- [ ] CUDA kernels for performance-critical operations +- [ ] CUDA mode for operations - [ ] AlexNet reference implementation - [ ] Docker deployment example @@ -62,7 +62,7 @@ ctest ## Required -- Compiler capable of C++20 at least (we test with gcc 12.3.0) +- Compiler capable of C++23 at least (we test with gcc 13.3.0) - Boost Python - Cmake > 3.24 - Python 3 (we test with 3.10, but it should work with any version) diff --git a/src/backend/computational_graph/add_node.cpp b/src/backend/computational_graph/add_node.cpp index 43427f5..4f3e072 100644 --- a/src/backend/computational_graph/add_node.cpp +++ b/src/backend/computational_graph/add_node.cpp @@ -11,11 +11,19 @@ #include "add_node.h" +#include "data_modeling/tensor_functions.h" + using namespace std; using namespace graph; vector< shared_ptr > AddNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - auto res = make_shared(upstreamGrad.createDeepCopy()); - return {res, res}; + auto weightGrad = make_shared(upstreamGrad.createDeepCopy()); + + if(broadcasted){ + auto biasGrad = make_shared(TensorFunctions::SumOverDims(*weightGrad)); + return {weightGrad, biasGrad}; + } + + return {weightGrad, weightGrad}; } \ No newline at end of file diff --git a/src/backend/computational_graph/add_node.h b/src/backend/computational_graph/add_node.h index 99bc964..f20fb07 100644 --- a/src/backend/computational_graph/add_node.h +++ b/src/backend/computational_graph/add_node.h @@ -14,10 +14,19 @@ #include "graph_node.h" namespace graph { - class AddNode final : public GraphNode { + class AddNode final : public GraphNode { + private: + // if t2 has been a vector we broadcast t2 into t1, see Tensor::add() + bool broadcasted = false; + public: explicit AddNode(std::shared_ptr t1, std::shared_ptr t2) - : GraphNode({std::move(t1), std::move(t2)}) {} + : GraphNode({std::move(t1), std::move(t2)}) { + // t2 is either tensor of same size or 1D-vector as bias + 
assert(t1->getDims().nDims()>=t2->getDims().nDims()); + + broadcasted = parents[0]->getDims() != parents[1]->getDims(); + } AddNode(const AddNode& other) = delete; AddNode& operator=(const AddNode& other) = delete; diff --git a/src/backend/computational_graph/topological_sort.cpp b/src/backend/computational_graph/topological_sort.cpp index d6250ca..810214f 100644 --- a/src/backend/computational_graph/topological_sort.cpp +++ b/src/backend/computational_graph/topological_sort.cpp @@ -110,6 +110,7 @@ vector< Tensor* > TopologicalSort::reverseSort(Tensor* root) { nodeQueue.push(root); edgeCounts[root] = 0; + // TODO: this about your cgNode design and requiresGrad. We want to freeze layers, too auto updateQueueAndEdgeCounts = [&nodeQueue, &edgeCounts](Tensor* t){ if(!edgeCounts.contains(t)) { edgeCounts[t] = 1; diff --git a/src/backend/data_modeling/dim_type.cpp b/src/backend/data_modeling/dim_type.cpp index af77c24..64dd6bf 100644 --- a/src/backend/data_modeling/dim_type.cpp +++ b/src/backend/data_modeling/dim_type.cpp @@ -13,6 +13,7 @@ #include "utility/safe_arithmetics.h" #include +#include using namespace std; @@ -82,6 +83,28 @@ Dimension& Dimension::operator=(Dimension&& other) noexcept { return *this; } +/** + * @brief This method gets interesting when we want to get a copy of + * this dimension instance, but we collapsed one of the dimensions. + * E.g. when we have a tensor, and we sum over one of its dimensions + * to get a new tensor, then this will be the new dimensions of the result. + * + * Example: t=Tensor with dims (b-size, d). We sum over all batches and + * get a new tensor tSum=Tensor with dims (d). + * + * @param idx The dimension to collapse. 
+ */ +Dimension Dimension::collapseDimension(int idx) const { + auto mappedIdx = getItem(idx); + + std::vector newDims; + newDims.reserve(dims.size() - 1); + newDims.insert(newDims.end(), dims.begin(), dims.begin() + idx); + newDims.insert(newDims.end(), dims.begin() + idx + 1, dims.end()); + + return Dimension(newDims); +} + ostream& operator<<(ostream& os, const Dimension& d) noexcept { os << "("; for(int i=0; i& dims); tensorSize_t getSize() const noexcept { diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 04a94aa..3dcb8a1 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -348,6 +348,7 @@ void Tensor::matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, c * @brief Matrix multiplication. */ Tensor Tensor::matmul(const Tensor& other) const { + assert(values->getDevice()==other.values->getDevice()); if(values->getDevice()==Device::CUDA){ __throw_invalid_argument("Multiplication not implemented on CUDA"); } @@ -356,36 +357,45 @@ Tensor Tensor::matmul(const Tensor& other) const { __throw_runtime_error("Tensors on different devices."); } - // TODO: check what to do about these two gradients and if you want broadcasting here at all - if(other.dims.getSize()==1){ - return multiplyScalar(other, *this); - } - else if(dims.getSize()==1){ - return multiplyScalar(*this, other); - } - return matMulImpl(*this, other); } /** - * @brief Elementise addition. + * @brief Addition of two tensors. This works in two ways: + * 1. Shapes of the two tensors are identical. In this case it is simple + * elementwise addition. + * 2. The second tensor is a vector. In this case broadcast it. We assume + * other.dims == (dimN) && this->dims == (dim0, dim1,..., dimN). 
*/ Tensor Tensor::operator+(const Tensor& other) const { if(values->getDevice()==Device::CUDA){ - __throw_invalid_argument("Multiplication not implemented on CUDA"); + __throw_invalid_argument("Addition not implemented on CUDA"); } - if(this->dims != other.dims){ - __throw_invalid_argument("Tensors need same dimensions"); + if(this->dims != other.dims && + !(other.dims.nDims() == 1 && other.dims.getItem(0) == dims.getItem(-1))){ + __throw_invalid_argument("Tensors need matching dimensions"); } else if(values->getDevice()!=other.values->getDevice()){ __throw_runtime_error("Tensors on different devices."); } - assert(values->getSize()==other.values->getSize()); - Tensor res(dims, values->getDevice(), false); - for(tensorSize_t i=0; igetSize(); i++){ - (*res.values)[i] = (*values)[i] + (*other.values)[i]; + Tensor res(dims, values->getDevice()); + + if(dims==other.dims){ + // elementwise add + for(tensorSize_t i=0; igetSize(); i++){ + (*res.values)[i] = (*values)[i] + (*other.values)[i]; + } + } + else { [[likely]] + // broadcasted add + const auto stride = static_cast(other.dims.getItem(0)); + for(tensorSize_t offset=0; offsetgetSize(); offset+=stride){ + for(tensorSize_t i=0; igetDevice()==other.values->getDevice()); if(values->getDevice()==Device::CUDA){ __throw_invalid_argument("Multiplication not implemented on CUDA"); } diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp index 3ac032e..97928b9 100644 --- a/src/backend/data_modeling/tensor_functions.cpp +++ b/src/backend/data_modeling/tensor_functions.cpp @@ -75,4 +75,38 @@ shared_ptr TensorFunctions::makeSharedTensor(const vector& Device d, bool requiresGrad){ return make_shared(dims, initValues, d, requiresGrad); +} + +/************************************************************************************ + ************************************ Arithmetics *********************************** + 
***********************************************************************************/ + + /** + * @brief Sums over the dimensions. If input is (b-size, dim1, dim2), and + * input dim-parameter is 1, then output will be (b-size, dim2). If + * input dim-parameter is 0, then output will be (dim1, dim2). + * Input dim must be smaller then t.dims.nDims()-1 + */ +Tensor TensorFunctions::SumOverDims(const Tensor& t, tensorDim_t dim) { + if(dim>=t.getDims().nDims()-1){ + __throw_invalid_argument("Dim parameter must be smaller than number of dims, but was " + dim); + } + + auto resDims = t.getDims().collapseDimension(dim); + Tensor res = Zeros(resDims.toVector(), t.getDevice(), t.getRequiresGrad()); // inefficiency toVector + + tensorSize_t stride = 1; + for(tensorDim_t i=dim+1; i Date: Sat, 7 Mar 2026 17:06:23 +0100 Subject: [PATCH 02/24] Started with activations, optimizers, and losses --- src/backend/CMakeLists.txt | 15 ++---- .../activation_function_base.h | 33 ++++++++++++ .../activation_functions/leaky_relu.cpp | 27 ++++++++++ .../activation_functions/leaky_relu.h | 13 +++-- src/backend/activation_functions/relu.cpp | 27 ++++++++++ .../{layers => }/activation_functions/relu.h | 6 +-- .../activation_functions/sigmoid.h | 0 src/backend/activation_functions/softmax.cpp | 50 +++++++++++++++++++ src/backend/activation_functions/softmax.h | 21 ++++++++ .../activation_functions/graph_creation.cpp | 36 +++++++++++++ .../activation_functions/graph_creation.h | 25 ++++++++++ .../activation_functions/leaky_relu_node.cpp | 29 +++++++++++ .../activation_functions/leaky_relu_node.h | 29 +++++++++++ .../{ => activation_functions}/relu_node.cpp | 9 ++-- .../{ => activation_functions}/relu_node.h | 10 +--- src/backend/computational_graph/graph_node.h | 8 +++ .../{ => tensor_ops}/add_node.cpp | 0 .../{ => tensor_ops}/add_node.h | 10 +--- .../{ => tensor_ops}/elementwise_mul_node.cpp | 0 .../{ => tensor_ops}/elementwise_mul_node.h | 10 +--- .../{ => tensor_ops}/getter_node.cpp | 0 .../{ 
=> tensor_ops}/getter_node.h | 10 +--- .../{ => tensor_ops}/graph_creation.cpp | 0 .../{ => tensor_ops}/graph_creation.h | 0 .../{ => tensor_ops}/matmul_node.cpp | 0 .../{ => tensor_ops}/matmul_node.h | 10 +--- .../{ => tensor_ops}/scalar_op_nodes.cpp | 0 .../{ => tensor_ops}/scalar_op_nodes.h | 10 +--- .../computational_graph/topological_sort.cpp | 10 ++-- src/backend/data_modeling/tensor.cpp | 38 ++++++++------ src/backend/data_modeling/tensor.h | 3 ++ .../activation_function_base.cpp | 18 ------- .../activation_function_base.h | 26 ---------- .../layers/activation_functions/relu.cpp | 37 -------------- src/backend/layers/ff_layer.cpp | 50 ++++++++++++++++--- src/backend/layers/ff_layer.h | 12 ++--- src/backend/layers/layer_base.cpp | 19 +++---- src/backend/layers/layer_base.h | 30 +++++++---- .../training/loss_functions/bce_loss.cpp | 43 ++++++++++++++++ .../training/loss_functions/bce_loss.h | 21 ++++++++ .../loss_functions/crossentropy_loss.cpp | 46 +++++++++++++++++ .../loss_functions/crossentropy_loss.h | 21 ++++++++ .../training/loss_functions/loss_base.h | 22 ++++++-- .../training/loss_functions/mse_loss.cpp | 35 ------------- .../training/loss_functions/mse_loss.h | 22 -------- .../training/optimizers/optimizer_base.cpp | 20 -------- .../training/optimizers/optimizer_base.h | 20 ++++---- src/python/data_modeling/py_data_modeling.cpp | 2 +- .../data_modeling/py_data_modeling_util.h | 2 +- tests/backend/test_computational_graph.cpp | 3 +- 50 files changed, 580 insertions(+), 308 deletions(-) create mode 100644 src/backend/activation_functions/activation_function_base.h create mode 100644 src/backend/activation_functions/leaky_relu.cpp rename src/backend/{layers => }/activation_functions/leaky_relu.h (51%) create mode 100644 src/backend/activation_functions/relu.cpp rename src/backend/{layers => }/activation_functions/relu.h (58%) rename src/backend/{layers => }/activation_functions/sigmoid.h (100%) create mode 100644 
src/backend/activation_functions/softmax.cpp create mode 100644 src/backend/activation_functions/softmax.h create mode 100644 src/backend/computational_graph/activation_functions/graph_creation.cpp create mode 100644 src/backend/computational_graph/activation_functions/graph_creation.h create mode 100644 src/backend/computational_graph/activation_functions/leaky_relu_node.cpp create mode 100644 src/backend/computational_graph/activation_functions/leaky_relu_node.h rename src/backend/computational_graph/{ => activation_functions}/relu_node.cpp (69%) rename src/backend/computational_graph/{ => activation_functions}/relu_node.h (63%) rename src/backend/computational_graph/{ => tensor_ops}/add_node.cpp (100%) rename src/backend/computational_graph/{ => tensor_ops}/add_node.h (75%) rename src/backend/computational_graph/{ => tensor_ops}/elementwise_mul_node.cpp (100%) rename src/backend/computational_graph/{ => tensor_ops}/elementwise_mul_node.h (59%) rename src/backend/computational_graph/{ => tensor_ops}/getter_node.cpp (100%) rename src/backend/computational_graph/{ => tensor_ops}/getter_node.h (77%) rename src/backend/computational_graph/{ => tensor_ops}/graph_creation.cpp (100%) rename src/backend/computational_graph/{ => tensor_ops}/graph_creation.h (100%) rename src/backend/computational_graph/{ => tensor_ops}/matmul_node.cpp (100%) rename src/backend/computational_graph/{ => tensor_ops}/matmul_node.h (64%) rename src/backend/computational_graph/{ => tensor_ops}/scalar_op_nodes.cpp (100%) rename src/backend/computational_graph/{ => tensor_ops}/scalar_op_nodes.h (77%) delete mode 100644 src/backend/layers/activation_functions/activation_function_base.cpp delete mode 100644 src/backend/layers/activation_functions/activation_function_base.h delete mode 100644 src/backend/layers/activation_functions/relu.cpp create mode 100644 src/backend/training/loss_functions/bce_loss.cpp create mode 100644 src/backend/training/loss_functions/bce_loss.h create mode 100644 
src/backend/training/loss_functions/crossentropy_loss.cpp create mode 100644 src/backend/training/loss_functions/crossentropy_loss.h delete mode 100644 src/backend/training/loss_functions/mse_loss.cpp delete mode 100644 src/backend/training/loss_functions/mse_loss.h delete mode 100644 src/backend/training/optimizers/optimizer_base.cpp diff --git a/src/backend/CMakeLists.txt b/src/backend/CMakeLists.txt index 579fae3..016dd30 100644 --- a/src/backend/CMakeLists.txt +++ b/src/backend/CMakeLists.txt @@ -1,19 +1,10 @@ -#include_directories( -# "${CMAKE_CURRENT_SOURCE_DIR}/computational_graph" -# "${CMAKE_CURRENT_SOURCE_DIR}/data_modeling" -# "${CMAKE_CURRENT_SOURCE_DIR}/utility" -# "${CMAKE_CURRENT_SOURCE_DIR}/layers" -# "${CMAKE_CURRENT_SOURCE_DIR}/system" -# "${CMAKE_CURRENT_SOURCE_DIR}/training/loss_functions" -# "${CMAKE_CURRENT_SOURCE_DIR}/training/optimizers" -# ) - file(GLOB_RECURSE CORE_SOURCES + activation_functions/*.cpp computational_graph/*.cpp data_modeling/*.cpp - #layers/*.cpp + layers/*.cpp #networks/*.cpp - #training/*.cpp + training/*.cpp utility/*.cpp system/*.cpp ) diff --git a/src/backend/activation_functions/activation_function_base.h b/src/backend/activation_functions/activation_function_base.h new file mode 100644 index 0000000..9015d93 --- /dev/null +++ b/src/backend/activation_functions/activation_function_base.h @@ -0,0 +1,33 @@ +/** + * @file function_base.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-01 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/tensor.h" + +#include + +namespace activation { + class ActivationFunctionBase { + public: + ActivationFunctionBase() = default; + + ActivationFunctionBase(const ActivationFunctionBase& other) = delete; + ActivationFunctionBase& operator=(const ActivationFunctionBase& other) = delete; + + ActivationFunctionBase(ActivationFunctionBase&& other) noexcept = default; + ActivationFunctionBase& 
operator=(ActivationFunctionBase&& other) noexcept = default; + + ~ActivationFunctionBase() noexcept = default; + + virtual Tensor operator()(const Tensor& t) const noexcept = 0; + }; +} diff --git a/src/backend/activation_functions/leaky_relu.cpp b/src/backend/activation_functions/leaky_relu.cpp new file mode 100644 index 0000000..1559d83 --- /dev/null +++ b/src/backend/activation_functions/leaky_relu.cpp @@ -0,0 +1,27 @@ +/** + * @file leaky_relu.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "leaky_relu.h" + +using namespace activation; + +Tensor LeakyReLu::operator()(const Tensor& t) const noexcept { + auto res = t.createDeepCopy(); + + for(tensorSize_t i=0; i + +using namespace activation; + +/** + * @brief Softmax over last dimension. Expects shape + * (dim1, dim2, ..., n_classes) + * @return Tensor of shape (dim1, dim2, ..., n_classes) [== input.shape] + */ +Tensor Softmax::operator()(const Tensor& t) const noexcept { + Tensor res(t.getDims(), t.getDevice()); + + Tensor tmp(t.getDims(), t.getDevice()); + for(tensorSize_t i=0; i(exp(t[i])), i); + } + + const tensorSize_t stride = t.getDims().getItem(-1); + auto compute = [&t, &res, &tmp, stride](tensorSize_t start){ + ftype sum = 0; + for(tensorSize_t i=0; i(t[start+i]); + } + + for(tensorSize_t i=0; i doActivation(const ReLu& r, const shared_ptr& t) { + auto res = make_shared(r(*t)); + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t)); + assert(res->getRequiresGrad()); + } + return res; +} + +shared_ptr doActivation(const LeakyReLu& r, const shared_ptr& t) { + auto res = make_shared(r(*t)); + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t, r.getEps())); + assert(res->getRequiresGrad()); + } + return res; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/graph_creation.h 
b/src/backend/computational_graph/activation_functions/graph_creation.h new file mode 100644 index 0000000..704b142 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/graph_creation.h @@ -0,0 +1,25 @@ +/** + * @file graph_creation.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/tensor.h" + +#include "activation_functions/relu.h" +#include "activation_functions/leaky_relu.h" + +#include + +namespace graph { + std::shared_ptr doActivation(const activation::ReLu& r, const std::shared_ptr& t); + std::shared_ptr doActivation(const activation::LeakyReLu& r, const std::shared_ptr& t); +} + \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp new file mode 100644 index 0000000..e57c640 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp @@ -0,0 +1,29 @@ +/** + * @file leaky_relu_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "leaky_relu_node.h" + +#include + +using namespace std; +using namespace graph; + +vector> LeakyReLuNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype zero = 0.0; + + auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + for(tensorSize_t i=0; isetItem(upstreamGrad[i] > zero ? 
1 : eps, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.h b/src/backend/computational_graph/activation_functions/leaky_relu_node.h new file mode 100644 index 0000000..076b4d2 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.h @@ -0,0 +1,29 @@ +/** + * @file leaky_relu_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +#include + +namespace graph { + class LeakyReLuNode final : public GraphNode { + private: + const ftype eps; + + public: + explicit LeakyReLuNode(std::shared_ptr t, const ftype eps) + : GraphNode({std::move(t)}), eps{eps} {} + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} diff --git a/src/backend/computational_graph/relu_node.cpp b/src/backend/computational_graph/activation_functions/relu_node.cpp similarity index 69% rename from src/backend/computational_graph/relu_node.cpp rename to src/backend/computational_graph/activation_functions/relu_node.cpp index 2e0f647..90d2a9b 100644 --- a/src/backend/computational_graph/relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/relu_node.cpp @@ -18,13 +18,12 @@ using namespace graph; vector> ReLuNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - constexpr ftype zero = 0.0; - auto res = make_shared(upstreamGrad.getDims().toVector(), upstreamGrad.getDevice(), false); + auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); for(tensorSize_t i=0; isetItem(v > zero ? v : zero, i); + res->setItem(upstreamGrad[i] > zero ? 
1 : zero, i); } - return {std::move(res)}; + + return {res}; } \ No newline at end of file diff --git a/src/backend/computational_graph/relu_node.h b/src/backend/computational_graph/activation_functions/relu_node.h similarity index 63% rename from src/backend/computational_graph/relu_node.h rename to src/backend/computational_graph/activation_functions/relu_node.h index b0ce5b8..9c83143 100644 --- a/src/backend/computational_graph/relu_node.h +++ b/src/backend/computational_graph/activation_functions/relu_node.h @@ -11,7 +11,7 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" #include @@ -21,14 +21,6 @@ namespace graph { explicit ReLuNode(std::shared_ptr t) : GraphNode({std::move(t)}) {} - ReLuNode(const ReLuNode& other) = delete; - ReLuNode& operator=(const ReLuNode& other) = delete; - - ReLuNode(ReLuNode&& other) = default; - ReLuNode& operator=(ReLuNode&& other) = default; - - ~ReLuNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } diff --git a/src/backend/computational_graph/graph_node.h b/src/backend/computational_graph/graph_node.h index 67cb033..9302703 100644 --- a/src/backend/computational_graph/graph_node.h +++ b/src/backend/computational_graph/graph_node.h @@ -25,6 +25,14 @@ namespace graph { explicit GraphNode(std::vector< std::shared_ptr > parents) : parents{std::move(parents)}{} public: + GraphNode(const GraphNode& other) = delete; + GraphNode& operator=(const GraphNode& other) = delete; + + GraphNode(GraphNode&& other) = default; + GraphNode& operator=(GraphNode&& other) = default; + + virtual ~GraphNode() noexcept = default; + virtual std::vector> backward(const Tensor& upstreamGrad) = 0; const auto& getParents() const noexcept { diff --git a/src/backend/computational_graph/add_node.cpp b/src/backend/computational_graph/tensor_ops/add_node.cpp similarity index 100% rename from src/backend/computational_graph/add_node.cpp rename to 
src/backend/computational_graph/tensor_ops/add_node.cpp diff --git a/src/backend/computational_graph/add_node.h b/src/backend/computational_graph/tensor_ops/add_node.h similarity index 75% rename from src/backend/computational_graph/add_node.h rename to src/backend/computational_graph/tensor_ops/add_node.h index f20fb07..4adc443 100644 --- a/src/backend/computational_graph/add_node.h +++ b/src/backend/computational_graph/tensor_ops/add_node.h @@ -11,7 +11,7 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" namespace graph { class AddNode final : public GraphNode { @@ -28,14 +28,6 @@ namespace graph { broadcasted = parents[0]->getDims() != parents[1]->getDims(); } - AddNode(const AddNode& other) = delete; - AddNode& operator=(const AddNode& other) = delete; - - AddNode(AddNode&& other) = default; - AddNode& operator=(AddNode&& other) = default; - - ~AddNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } \ No newline at end of file diff --git a/src/backend/computational_graph/elementwise_mul_node.cpp b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp similarity index 100% rename from src/backend/computational_graph/elementwise_mul_node.cpp rename to src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp diff --git a/src/backend/computational_graph/elementwise_mul_node.h b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h similarity index 59% rename from src/backend/computational_graph/elementwise_mul_node.h rename to src/backend/computational_graph/tensor_ops/elementwise_mul_node.h index 81203fd..f2a5344 100644 --- a/src/backend/computational_graph/elementwise_mul_node.h +++ b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h @@ -11,7 +11,7 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" namespace graph { class ElementwiseMulNode final : public GraphNode { @@ -19,14 +19,6 @@ namespace graph 
{ explicit ElementwiseMulNode(std::shared_ptr t1, std::shared_ptr t2) : GraphNode({std::move(t1), std::move(t2)}) {} - ElementwiseMulNode(const ElementwiseMulNode& other) = delete; - ElementwiseMulNode& operator=(const ElementwiseMulNode& other) = delete; - - ElementwiseMulNode(ElementwiseMulNode&& other) = default; - ElementwiseMulNode& operator=(ElementwiseMulNode&& other) = default; - - ~ElementwiseMulNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } diff --git a/src/backend/computational_graph/getter_node.cpp b/src/backend/computational_graph/tensor_ops/getter_node.cpp similarity index 100% rename from src/backend/computational_graph/getter_node.cpp rename to src/backend/computational_graph/tensor_ops/getter_node.cpp diff --git a/src/backend/computational_graph/getter_node.h b/src/backend/computational_graph/tensor_ops/getter_node.h similarity index 77% rename from src/backend/computational_graph/getter_node.h rename to src/backend/computational_graph/tensor_ops/getter_node.h index e55b2d5..5d359d2 100644 --- a/src/backend/computational_graph/getter_node.h +++ b/src/backend/computational_graph/tensor_ops/getter_node.h @@ -11,7 +11,7 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" #include #include @@ -36,13 +36,5 @@ namespace graph{ explicit GetterNode(std::shared_ptr t, const multiDimIdx_t& idx) : GraphNode({std::move(t)}), idx{idx} {} - GetterNode(const GetterNode& other) = delete; - GetterNode& operator=(const GetterNode& other) = delete; - - GetterNode(GetterNode&& other) = default; - GetterNode& operator=(GetterNode&& other) = default; - - ~GetterNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; };} diff --git a/src/backend/computational_graph/graph_creation.cpp b/src/backend/computational_graph/tensor_ops/graph_creation.cpp similarity index 100% rename from src/backend/computational_graph/graph_creation.cpp rename to 
src/backend/computational_graph/tensor_ops/graph_creation.cpp diff --git a/src/backend/computational_graph/graph_creation.h b/src/backend/computational_graph/tensor_ops/graph_creation.h similarity index 100% rename from src/backend/computational_graph/graph_creation.h rename to src/backend/computational_graph/tensor_ops/graph_creation.h diff --git a/src/backend/computational_graph/matmul_node.cpp b/src/backend/computational_graph/tensor_ops/matmul_node.cpp similarity index 100% rename from src/backend/computational_graph/matmul_node.cpp rename to src/backend/computational_graph/tensor_ops/matmul_node.cpp diff --git a/src/backend/computational_graph/matmul_node.h b/src/backend/computational_graph/tensor_ops/matmul_node.h similarity index 64% rename from src/backend/computational_graph/matmul_node.h rename to src/backend/computational_graph/tensor_ops/matmul_node.h index 7fa94bf..c7b14e2 100644 --- a/src/backend/computational_graph/matmul_node.h +++ b/src/backend/computational_graph/tensor_ops/matmul_node.h @@ -11,7 +11,7 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" #include @@ -21,14 +21,6 @@ namespace graph { explicit MatMulNode(std::shared_ptr t1, std::shared_ptr t2) : GraphNode({std::move(t1), std::move(t2)}) {} - MatMulNode(const MatMulNode& other) = delete; - MatMulNode& operator=(const MatMulNode& other) = delete; - - MatMulNode(MatMulNode&& other) = default; - MatMulNode& operator=(MatMulNode&& other) = default; - - ~MatMulNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } diff --git a/src/backend/computational_graph/scalar_op_nodes.cpp b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp similarity index 100% rename from src/backend/computational_graph/scalar_op_nodes.cpp rename to src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp diff --git a/src/backend/computational_graph/scalar_op_nodes.h 
b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h similarity index 77% rename from src/backend/computational_graph/scalar_op_nodes.h rename to src/backend/computational_graph/tensor_ops/scalar_op_nodes.h index 5a6588e..a2d1db9 100644 --- a/src/backend/computational_graph/scalar_op_nodes.h +++ b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h @@ -11,7 +11,7 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" namespace graph { class ScalarAddNode final : public GraphNode { @@ -38,14 +38,6 @@ namespace graph { explicit ScalarMulNode(std::shared_ptr t, ftype factor) : GraphNode({std::move(t)}), factor{factor} {} - ScalarMulNode(const ScalarMulNode& other) = delete; - ScalarMulNode& operator=(const ScalarMulNode& other) = delete; - - ScalarMulNode(ScalarMulNode&& other) = default; - ScalarMulNode& operator=(ScalarMulNode&& other) = default; - - ~ScalarMulNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } \ No newline at end of file diff --git a/src/backend/computational_graph/topological_sort.cpp b/src/backend/computational_graph/topological_sort.cpp index 810214f..fb6e698 100644 --- a/src/backend/computational_graph/topological_sort.cpp +++ b/src/backend/computational_graph/topological_sort.cpp @@ -110,7 +110,6 @@ vector< Tensor* > TopologicalSort::reverseSort(Tensor* root) { nodeQueue.push(root); edgeCounts[root] = 0; - // TODO: this about your cgNode design and requiresGrad. We want to freeze layers, too auto updateQueueAndEdgeCounts = [&nodeQueue, &edgeCounts](Tensor* t){ if(!edgeCounts.contains(t)) { edgeCounts[t] = 1; @@ -134,7 +133,7 @@ vector< Tensor* > TopologicalSort::reverseSort(Tensor* root) { auto pushParentsWithGraphNode = [&nodeQueue, &edgeCounts](Tensor* t){ const auto& parents = t->cgNode->getParents(); - for(const auto& parent: parents){ // TODO: check for requiresGrad to save runtime? 
+ for(const auto& parent: parents){ if(!parent->cgNode) continue; @@ -147,12 +146,15 @@ vector< Tensor* > TopologicalSort::reverseSort(Tensor* root) { }; // pass 2: topological sort based on Kahn's algorithm - vector< Tensor* > res; // TODO: reserve capacity to save runtime? + vector< Tensor* > res; + res.reserve(nodeQueue.size()); + nodeQueue.push(root); while(!nodeQueue.empty()){ auto tensorPtr = nodeQueue.front(); nodeQueue.pop(); - + assert(tensorPtr->cgNode); + if(edgeCounts[tensorPtr]==0){ pushParentsWithGraphNode(tensorPtr); res.push_back(tensorPtr); diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 3dcb8a1..d85b3ed 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -418,12 +418,12 @@ Tensor Tensor::operator*(const Tensor& other) const { } // TODO: check what to do about these two gradients and if you want broadcasting here at all - if(other.dims.getSize()==1){ +/* if(other.dims.getSize()==1){ return multiplyScalar(other, *this); } else if(dims.getSize()==1){ return multiplyScalar(*this, other); - } + } */ if(this->dims != other.dims){ __throw_invalid_argument("Tensors need same dimensions"); @@ -517,21 +517,19 @@ void Tensor::backward() { auto& tensor = *tPtr; assert(tensor.grads && !tensor.grads->requiresGrad); // gradient should not require grad - if(tensor.cgNode){ - auto incomingGrads = tensor.cgNode->backward(*tensor.grads); - const auto& parents = tensor.cgNode->getParents(); + auto incomingGrads = tensor.cgNode->backward(*tensor.grads); + const auto& parents = tensor.cgNode->getParents(); - for(size_t i=0; irequiresGrad){ - continue; - } - else if(!parent->grads){ - parent->grads = incomingGrads[i]; - } - else{ - *parent->grads->values += *incomingGrads[i]->values; - } + for(size_t i=0; irequiresGrad){ + continue; + } + else if(!parent->grads){ + parent->grads = incomingGrads[i]; + } + else{ + *parent->grads->values += *incomingGrads[i]->values; } } } @@ -901,9 
+899,17 @@ ftype Tensor::getItem(const std::vector& idx) const { * Can lead to unexpected results in multidimensional tensors. */ ftype Tensor::getItem(tensorSize_t idx) const { + return (*this)[idx]; +} + +/** + * @brief For convenience. + */ +ftype Tensor::operator[](tensorSize_t idx) const { return (*values)[idx]; } + ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1) const { return getItem({idx0, idx1}); } diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 9c1384b..b1249fe 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -237,6 +237,9 @@ class Tensor final : public std::enable_shared_from_this { ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const; ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const; + // non-const version of operator[] does not exist because of CUDA + ftype operator[](tensorSize_t idx) const; + ftype getItem(const std::vector& idx) const; // for convenience we provide some simple setters diff --git a/src/backend/layers/activation_functions/activation_function_base.cpp b/src/backend/layers/activation_functions/activation_function_base.cpp deleted file mode 100644 index 8e482c3..0000000 --- a/src/backend/layers/activation_functions/activation_function_base.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/** - * @file activation_function_base.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-02 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "activation_function_base.h" - -using namespace activation; - -Tensor ActivationFunctionBase::forward(Tensor& t) const noexcept { - return (*this)(t); -} \ No newline at end of file diff --git a/src/backend/layers/activation_functions/activation_function_base.h b/src/backend/layers/activation_functions/activation_function_base.h deleted file mode 100644 index b0370c6..0000000 --- 
a/src/backend/layers/activation_functions/activation_function_base.h +++ /dev/null @@ -1,26 +0,0 @@ -/** - * @file function_base.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "tensor.h" - -#include - -namespace activation { - class ActivationFunctionBase { - public: - virtual Tensor operator()(Tensor& t) const noexcept = 0; - Tensor forward(Tensor& t) const noexcept; - - virtual Tensor gradient(const Tensor& t) noexcept = 0; - }; -} diff --git a/src/backend/layers/activation_functions/relu.cpp b/src/backend/layers/activation_functions/relu.cpp deleted file mode 100644 index fd2b42d..0000000 --- a/src/backend/layers/activation_functions/relu.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/** - * @file relu.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "relu.h" -#include "global_params.h" - -using namespace activation; - -Tensor ReLU::operator()(Tensor& t) const noexcept { - for(tensorSize_t i=0; i target){ - t[i] = 0; - } - } - return t; -} - -Tensor ReLU::gradient(const Tensor& t) noexcept { -/* for(tensorSize_t i=0; i target){ - t[i] = 0; - } - } - return t; */ -} \ No newline at end of file diff --git a/src/backend/layers/ff_layer.cpp b/src/backend/layers/ff_layer.cpp index 11cc82b..52e1485 100644 --- a/src/backend/layers/ff_layer.cpp +++ b/src/backend/layers/ff_layer.cpp @@ -10,6 +10,7 @@ */ #include "ff_layer.h" +#include "computational_graph/tensor_ops/graph_creation.h" #include #include @@ -17,15 +18,52 @@ using namespace std; using namespace layers; -FfLayer::FfLayer(const tensorDim_t in_size, const tensorDim_t out_size) { - //weights.emplace(Device::CPU, in_size, out_size); - //weights->reset(utility::InitClass::Gaussian); +FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGrad) + : 
FfLayer(dims, Tensor::getDefaultDevice(), requiresGrad) {} + +/** + * @brief Construct a new Ff Layer:: Ff Layer object + * Assumption for dims: (batch-size, ..., n_rows, n_cols). + * @param dims Dimensions, see above. + * @param d The device. + * @param useBias Use a bias if true. Bias will receiver shape (n_rows) + * @param requiresGrad If true train this layer. + */ +FfLayer::FfLayer(const vector& dims, Device d, bool useBias, bool requiresGrad) + : LayerBase(useBias, requiresGrad) { + weights = make_shared(dims, d, requiresGrad); + + if(useBias && dims.size()<2){ + bias = make_shared(vector{static_cast(1)}, d, requiresGrad); + } + else if(useBias){ + bias = make_shared(vector{dims[dims.size()-2]}, d, requiresGrad); + } } +/** + * @brief Normal forward function. Does not build computational graph. + */ Tensor FfLayer::forward(const Tensor& input) const { - return *weights * input; + auto res = *weights * input; + if(useBias){ + res = res + *bias; + } + return res; } -//ftype* FfLayer::backward(ftype* input) { +/** + * @brief Like overload, but creates computational graph. + */ +std::shared_ptr FfLayer::forward(const std::shared_ptr& input) const { + auto res = graph::matmul(input, weights); + if(useBias){ + res = graph::add(res, bias); // TODO: add needs to happen on each of those, how to broadcast? + } + return res; +} -//} \ No newline at end of file +void FfLayer::print(ostream& os) const noexcept { + LayerBase::print(os); + os << "\nuseBias: " << useBias ? "true" : "false"; +} \ No newline at end of file diff --git a/src/backend/layers/ff_layer.h b/src/backend/layers/ff_layer.h index 1c2ea06..1bd781a 100644 --- a/src/backend/layers/ff_layer.h +++ b/src/backend/layers/ff_layer.h @@ -18,15 +18,13 @@ namespace layers { class FfLayer : public LayerBase { - protected: - // memoization - // TODO: necessary? 
- //mutable std::optional v1; - public: - FfLayer(tensorDim_t in_size, tensorDim_t out_size); + FfLayer(const std::vector& dims, bool useBias=true, bool requiresGrad=false); + FfLayer(const std::vector& dims, Device d, bool useBias=true, bool requiresGrad=false); Tensor forward(const Tensor& input) const override; - //ftype* backward(ftype* input) override; + std::shared_ptr forward(const std::shared_ptr& input) const override; + + void print(std::ostream& os) const noexcept override; }; } diff --git a/src/backend/layers/layer_base.cpp b/src/backend/layers/layer_base.cpp index 7abcb0b..911eb48 100644 --- a/src/backend/layers/layer_base.cpp +++ b/src/backend/layers/layer_base.cpp @@ -16,19 +16,16 @@ using namespace std; using namespace layers; -ftype LayerBase::getItem(vector&&idx) const { - assert(weights); - return weights.value().getItem(std::move(idx)); -} - -void LayerBase::setItem(ftype item, vector&& idx) { - assert(weights); - weights.value().setItem(item, std::move(idx)); -} - void LayerBase::print(ostream& os) const noexcept { assert(weights); - os << weights.value(); + + os << "Weigths:\n"; + os << *weights; + + if(bias){ + os << "Bias:\n"; + os << *bias; + } } ostream& operator<<(ostream& os, const LayerBase& l) noexcept { diff --git a/src/backend/layers/layer_base.h b/src/backend/layers/layer_base.h index 2803540..b213a5f 100644 --- a/src/backend/layers/layer_base.h +++ b/src/backend/layers/layer_base.h @@ -12,10 +12,11 @@ #pragma once #include "data_modeling/tensor.h" - #include "utility/global_params.h" #include +#include + #include namespace layers { @@ -24,24 +25,33 @@ namespace layers { */ class LayerBase { protected: - std::optional weights = std::nullopt; - std::optional bias = std::nullopt; + bool requiresGrad = false; + bool useBias = false; + + std::shared_ptr weights = nullptr; + std::shared_ptr bias = nullptr; public: - LayerBase() = default; + LayerBase(bool useBias, bool requiresGrad) + : useBias{false}, requiresGrad{requiresGrad} + { } + 
virtual ~LayerBase() noexcept = default; + // for inference -> no graph creation virtual Tensor forward(const Tensor& input) const = 0; - //virtual ftype* backward(ftype* input) = 0; + + // for training -> creates graph + virtual std::shared_ptr forward(const std::shared_ptr& input) const = 0; // weights should always exist, never nullopt outside of c'tor - const Dimension& getDims() const noexcept { - return weights.value().getDims(); + const Dimension& getDims() const noexcept { + assert(weights); + return weights->getDims(); } - ftype getItem(std::vector&& idx) const; - void setItem(ftype item, std::vector&& idx); - + auto getWeights() const noexcept { return weights; } + auto getBias() const noexcept { return bias; } virtual void print(std::ostream& os) const noexcept; friend std::ostream& operator<<(std::ostream& os, const LayerBase& t) noexcept; diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp new file mode 100644 index 0000000..5d42f06 --- /dev/null +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -0,0 +1,43 @@ +/** + * @file bce_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "bce_loss.h" + +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batch_size) + * @return Tensor of shape (1) + */ +Tensor BceLoss::operator()(const Tensor& y, const Tensor& ypred) const { + if(y.getDevice() != ypred.getDevice()){ + __throw_invalid_argument("y and ypred must be on same device"); + } + else if(y.getDims()!=ypred.getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto bce = [](ftype y, ftype ypred){ + return y*log(ypred) + (1-y)*log(1-ypred); + }; + + const auto nBatches = y.getDims().getItem(0); + + ftype res = 0; + for(tensorSize_t i=0; i + +using namespace std; +using namespace train; + +/** + * 
@brief Expected shapes: (batch_size, n_classes) + * @return Tensor of shape (1) + */ +Tensor CrossEntropyLoss::operator()(const Tensor& y, const Tensor& ypred) const { + if(y.getDevice() != ypred.getDevice()){ + __throw_invalid_argument("y and ypred must be on same device"); + } + else if(y.getDims()!=ypred.getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto ce = [&y, &ypred](const tensorDim_t b){ + ftype res = 0; + for(tensorDim_t i=0; i - -/** - * @brief Expects shape (b-size, 1), or simply (batch-size) - * - * @param y Predicted output - * @param t_target Target - * @return Tensor of shape (b-size, 1) - */ -Tensor MseLoss::operator()(Tensor& y, const Tensor& y_target) const noexcept { - auto res = Tensor(y); - for(tensorSize_t i = 0; ilr = lr; -} \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index 732b219..e88cd51 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -3,7 +3,7 @@ * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-02-02 + * @date 2026-03-07 * * @copyright Copyright (c) 2026 * @@ -11,14 +11,16 @@ #pragma once -#include "data_modeling/tensor.h" - class OptimizerBase { - private: - float lr = 0.05; + OptimizerBase() = default; + + OptimizerBase(const OptimizerBase& other) = delete; + OptimizerBase& operator=(const OptimizerBase& other) = delete; + + OptimizerBase(OptimizerBase&& other) noexcept = default; + OptimizerBase& operator=(OptimizerBase&& other) noexcept = default; + + ~OptimizerBase() noexcept = default; - public: - virtual Tensor operator()(Tensor& t) const noexcept; - float getLr() const noexcept; - void setLr(const float lr) noexcept; + virtual void step() = 0; }; \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling.cpp b/src/python/data_modeling/py_data_modeling.cpp 
index 88b4bec..5c66672 100644 --- a/src/python/data_modeling/py_data_modeling.cpp +++ b/src/python/data_modeling/py_data_modeling.cpp @@ -17,7 +17,7 @@ #include "data_modeling/tensor.h" #include "data_modeling/tensor_functions.h" -#include "computational_graph/graph_creation.h" +#include "computational_graph/tensor_ops/graph_creation.h" #include #include diff --git a/src/python/data_modeling/py_data_modeling_util.h b/src/python/data_modeling/py_data_modeling_util.h index 82a8343..2b8ec65 100644 --- a/src/python/data_modeling/py_data_modeling_util.h +++ b/src/python/data_modeling/py_data_modeling_util.h @@ -15,7 +15,7 @@ #include "data_modeling/tensor.h" #include "data_modeling/tensor_functions.h" -#include "computational_graph/graph_creation.h" +#include "computational_graph/tensor_ops/graph_creation.h" #include #include diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index d2a686f..f7e877c 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -14,7 +14,8 @@ #include "data_modeling/tensor.h" #include "data_modeling/tensor_functions.h" -#include "computational_graph/graph_creation.h" +#include "computational_graph/tensor_ops/graph_creation.h" +#include "computational_graph/activation_functions/graph_creation.h" #include From 581ff5ba1de8be274f8b54e383bed396dbf3bb4a Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sun, 8 Mar 2026 16:04:15 +0100 Subject: [PATCH 03/24] Started implementing FF-layer, graph creation for activation functions --- python_lib/dl_lib/sys/__init__.py | 1 + src/CMakeLists.txt | 22 +- src/backend/CMakeLists.txt | 8 +- .../activation_function_base.cpp | 20 ++ .../activation_function_base.h | 12 +- .../activation_functions/leaky_relu.cpp | 15 +- src/backend/activation_functions/leaky_relu.h | 6 +- src/backend/activation_functions/relu.cpp | 17 +- src/backend/activation_functions/relu.h | 5 +- 
src/backend/activation_functions/softmax.cpp | 16 +- src/backend/activation_functions/softmax.h | 3 +- .../activation_functions/graph_creation.cpp | 36 --- .../activation_functions/graph_creation.h | 25 -- src/backend/data_modeling/device.cpp | 26 +++ src/backend/data_modeling/device.h | 19 ++ src/backend/data_modeling/tensor.h | 28 +-- src/backend/layers/ff_layer.cpp | 12 + src/backend/layers/layer_base.cpp | 6 +- src/backend/layers/layer_base.h | 7 +- src/backend/networks/sequential.cpp | 24 +- src/backend/networks/sequential.h | 41 ++-- src/backend/system/sys_functions.cpp | 3 +- src/backend/system/sys_functions.h | 8 +- .../training/optimizers/optimizer_base.h | 12 +- src/backend/training/optimizers/sgd.cpp | 18 ++ src/backend/training/optimizers/sgd.h | 29 +++ src/backend/utility/global_params.h | 2 +- src/python/CMakeLists.txt | 30 ++- src/python/data_modeling/py_data_modeling.cpp | 221 ------------------ .../data_modeling/py_data_modeling_util.h | 135 ----------- src/python/layers/py_layers.h | 61 ----- src/python/networks/py_sequential.cpp | 0 src/python/networks/py_sequential.h | 0 src/python/py_core/py_core.cpp | 221 ++++++++++++++++++ .../py_core_util.cpp} | 4 +- src/python/py_core/py_core_util.h | 135 +++++++++++ .../py_network.cpp} | 80 ++++++- src/python/py_network/py_network_util.h | 60 +++++ src/python/py_sys/py_sys.cpp | 23 ++ .../custom_converters.h | 0 .../python_templates.h | 11 - tests/backend/test_computational_graph.cpp | 1 - 42 files changed, 811 insertions(+), 592 deletions(-) create mode 100644 python_lib/dl_lib/sys/__init__.py create mode 100644 src/backend/activation_functions/activation_function_base.cpp delete mode 100644 src/backend/computational_graph/activation_functions/graph_creation.cpp delete mode 100644 src/backend/computational_graph/activation_functions/graph_creation.h create mode 100644 src/backend/data_modeling/device.cpp create mode 100644 src/backend/data_modeling/device.h create mode 100644 
src/backend/training/optimizers/sgd.cpp create mode 100644 src/backend/training/optimizers/sgd.h delete mode 100644 src/python/data_modeling/py_data_modeling.cpp delete mode 100644 src/python/data_modeling/py_data_modeling_util.h delete mode 100644 src/python/layers/py_layers.h delete mode 100644 src/python/networks/py_sequential.cpp delete mode 100644 src/python/networks/py_sequential.h create mode 100644 src/python/py_core/py_core.cpp rename src/python/{data_modeling/py_data_modeling_util.cpp => py_core/py_core_util.cpp} (98%) create mode 100644 src/python/py_core/py_core_util.h rename src/python/{layers/py_layers.cpp => py_network/py_network.cpp} (53%) create mode 100644 src/python/py_network/py_network_util.h create mode 100644 src/python/py_sys/py_sys.cpp rename src/python/{python_utility => py_utility}/custom_converters.h (100%) rename src/python/{python_utility => py_utility}/python_templates.h (52%) diff --git a/python_lib/dl_lib/sys/__init__.py b/python_lib/dl_lib/sys/__init__.py new file mode 100644 index 0000000..51cbded --- /dev/null +++ b/python_lib/dl_lib/sys/__init__.py @@ -0,0 +1 @@ +from .._compiled._sys import getGlobalDevice, setGlobalDevice diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0d564e2..f75c1ac 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,7 +2,7 @@ add_subdirectory(backend) add_subdirectory(python) -target_link_libraries(_core +target_link_libraries(_core PRIVATE ${Boost_LIBRARIES} ${PYTHON_LIBRARIES} BackendCore) @@ -11,8 +11,20 @@ target_include_directories(_core PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) -#target_link_libraries(py_layers PRIVATE ${Boost_LIBRARIES} ${PYTHON_LIBRARIES} BackendCore) -#target_include_directories(py_layers PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) +target_link_libraries(_network PRIVATE + ${Boost_LIBRARIES} + ${PYTHON_LIBRARIES} + BackendCore) + +target_include_directories(_network PRIVATE + ${PYTHON_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS}) -# for 
compiled boost lib -#target_link_libraries(hello PRIVATE Boost::filesystem) \ No newline at end of file +target_link_libraries(_sys PRIVATE + ${Boost_LIBRARIES} + ${PYTHON_LIBRARIES} + BackendCore) + +target_include_directories(_sys PRIVATE + ${PYTHON_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS}) \ No newline at end of file diff --git a/src/backend/CMakeLists.txt b/src/backend/CMakeLists.txt index 016dd30..3e40b12 100644 --- a/src/backend/CMakeLists.txt +++ b/src/backend/CMakeLists.txt @@ -4,13 +4,17 @@ file(GLOB_RECURSE CORE_SOURCES data_modeling/*.cpp layers/*.cpp #networks/*.cpp + system/*.cpp training/*.cpp utility/*.cpp - system/*.cpp ) -add_library(BackendCore STATIC ${CORE_SOURCES}) +add_library(BackendCore SHARED ${CORE_SOURCES}) target_include_directories(BackendCore PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) + +set_target_properties(BackendCore PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${PYTHON_MODULE_DIR}" # make sure Python-modules see backend ) \ No newline at end of file diff --git a/src/backend/activation_functions/activation_function_base.cpp b/src/backend/activation_functions/activation_function_base.cpp new file mode 100644 index 0000000..54e1d16 --- /dev/null +++ b/src/backend/activation_functions/activation_function_base.cpp @@ -0,0 +1,20 @@ +/** + * @file activation_function_base.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "activation_function_base.h" + +using namespace std; +using namespace activation; + +ostream& operator<<(ostream& os, const ActivationFunctionBase& l) noexcept { + static_cast(&l)->print(os); // calling vtable + return os; +} \ No newline at end of file diff --git a/src/backend/activation_functions/activation_function_base.h b/src/backend/activation_functions/activation_function_base.h index 9015d93..1a8ea80 100644 --- a/src/backend/activation_functions/activation_function_base.h +++ 
b/src/backend/activation_functions/activation_function_base.h @@ -13,7 +13,8 @@ #include "data_modeling/tensor.h" -#include +#include +#include namespace activation { class ActivationFunctionBase { @@ -28,6 +29,13 @@ namespace activation { ~ActivationFunctionBase() noexcept = default; - virtual Tensor operator()(const Tensor& t) const noexcept = 0; + // creates no graph + virtual Tensor operator()(const Tensor& t) const = 0; + + // greates a graph + virtual std::shared_ptr operator()(const std::shared_ptr& t) const = 0; + + virtual void print(std::ostream& os) const noexcept { }; + friend std::ostream& operator<<(std::ostream& os, const ActivationFunctionBase& t) noexcept; }; } diff --git a/src/backend/activation_functions/leaky_relu.cpp b/src/backend/activation_functions/leaky_relu.cpp index 1559d83..2bb2c1f 100644 --- a/src/backend/activation_functions/leaky_relu.cpp +++ b/src/backend/activation_functions/leaky_relu.cpp @@ -10,10 +10,12 @@ */ #include "leaky_relu.h" +#include "computational_graph/activation_functions/leaky_relu_node.h" +using namespace std; using namespace activation; -Tensor LeakyReLu::operator()(const Tensor& t) const noexcept { +Tensor LeakyReLu::operator()(const Tensor& t) const { auto res = t.createDeepCopy(); for(tensorSize_t i=0; i LeakyReLu::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t, eps)); + assert(res->getRequiresGrad()); + } + + return res; } \ No newline at end of file diff --git a/src/backend/activation_functions/leaky_relu.h b/src/backend/activation_functions/leaky_relu.h index 2bf4c57..229a902 100644 --- a/src/backend/activation_functions/leaky_relu.h +++ b/src/backend/activation_functions/leaky_relu.h @@ -22,7 +22,9 @@ namespace activation { LeakyReLu(ftype eps) : eps{eps} { } - Tensor operator()(const Tensor& t) const noexcept override; - ftype getEps() const noexcept { return eps; } + Tensor operator()(const Tensor& t) const 
override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; + + void print(std::ostream& os) const noexcept override { os << "\neps: " << eps; } }; } diff --git a/src/backend/activation_functions/relu.cpp b/src/backend/activation_functions/relu.cpp index 4ab4a71..01d7448 100644 --- a/src/backend/activation_functions/relu.cpp +++ b/src/backend/activation_functions/relu.cpp @@ -10,10 +10,12 @@ */ #include "relu.h" +#include "computational_graph/activation_functions/relu_node.h" +using namespace std; using namespace activation; -Tensor ReLu::operator()(const Tensor& t) const noexcept { +Tensor ReLu::operator()(const Tensor& t) const { auto res = t.createDeepCopy(); for(tensorSize_t i=0; i ReLu::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t)); + assert(res->getRequiresGrad()); + } + + return res; +} diff --git a/src/backend/activation_functions/relu.h b/src/backend/activation_functions/relu.h index 8f04e20..d9bc504 100644 --- a/src/backend/activation_functions/relu.h +++ b/src/backend/activation_functions/relu.h @@ -16,6 +16,9 @@ namespace activation { class ReLu final : public ActivationFunctionBase { public: - Tensor operator()(const Tensor& t) const noexcept override; + ReLu() = default; + + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; }; } diff --git a/src/backend/activation_functions/softmax.cpp b/src/backend/activation_functions/softmax.cpp index 2844186..3e79814 100644 --- a/src/backend/activation_functions/softmax.cpp +++ b/src/backend/activation_functions/softmax.cpp @@ -13,6 +13,7 @@ #include +using namespace std; using namespace activation; /** @@ -20,7 +21,7 @@ using namespace activation; * (dim1, dim2, ..., n_classes) * @return Tensor of shape (dim1, dim2, ..., n_classes) [== input.shape] */ -Tensor Softmax::operator()(const Tensor& t) const noexcept { +Tensor 
Softmax::operator()(const Tensor& t) const { Tensor res(t.getDims(), t.getDevice()); Tensor tmp(t.getDims(), t.getDevice()); @@ -47,4 +48,15 @@ Tensor Softmax::operator()(const Tensor& t) const noexcept { } return res; -} \ No newline at end of file +} + +shared_ptr Softmax::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + //res->setCgNode(make_shared(t, eps)); + assert(res->getRequiresGrad()); + } + + return res; +} diff --git a/src/backend/activation_functions/softmax.h b/src/backend/activation_functions/softmax.h index 26a272b..cf1ed10 100644 --- a/src/backend/activation_functions/softmax.h +++ b/src/backend/activation_functions/softmax.h @@ -16,6 +16,7 @@ namespace activation { class Softmax final : public ActivationFunctionBase { public: - Tensor operator()(const Tensor& t) const noexcept override; + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; }; } diff --git a/src/backend/computational_graph/activation_functions/graph_creation.cpp b/src/backend/computational_graph/activation_functions/graph_creation.cpp deleted file mode 100644 index 9f64251..0000000 --- a/src/backend/computational_graph/activation_functions/graph_creation.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/** - * @file graph_creation.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-03-07 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "graph_creation.h" - -#include "relu_node.h" -#include "leaky_relu_node.h" - -using namespace std; -using namespace activation; - -shared_ptr doActivation(const ReLu& r, const shared_ptr& t) { - auto res = make_shared(r(*t)); - if(t->getRequiresGrad()){ - res->setCgNode(make_shared(t)); - assert(res->getRequiresGrad()); - } - return res; -} - -shared_ptr doActivation(const LeakyReLu& r, const shared_ptr& t) { - auto res = make_shared(r(*t)); - if(t->getRequiresGrad()){ - 
res->setCgNode(make_shared(t, r.getEps())); - assert(res->getRequiresGrad()); - } - return res; -} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/graph_creation.h b/src/backend/computational_graph/activation_functions/graph_creation.h deleted file mode 100644 index 704b142..0000000 --- a/src/backend/computational_graph/activation_functions/graph_creation.h +++ /dev/null @@ -1,25 +0,0 @@ -/** - * @file graph_creation.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-03-07 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "data_modeling/tensor.h" - -#include "activation_functions/relu.h" -#include "activation_functions/leaky_relu.h" - -#include - -namespace graph { - std::shared_ptr doActivation(const activation::ReLu& r, const std::shared_ptr& t); - std::shared_ptr doActivation(const activation::LeakyReLu& r, const std::shared_ptr& t); -} - \ No newline at end of file diff --git a/src/backend/data_modeling/device.cpp b/src/backend/data_modeling/device.cpp new file mode 100644 index 0000000..a7726ae --- /dev/null +++ b/src/backend/data_modeling/device.cpp @@ -0,0 +1,26 @@ +/** + * @file device.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "device.h" + +#include + +const char* DeviceToString(Device d) { + switch(d){ + case Device::CPU: + return "CPU"; + case Device::CUDA: + return "CUDA"; + } + + std::__throw_invalid_argument("Unknown device encountered"); + return ""; // suppress +} \ No newline at end of file diff --git a/src/backend/data_modeling/device.h b/src/backend/data_modeling/device.h new file mode 100644 index 0000000..65f83a5 --- /dev/null +++ b/src/backend/data_modeling/device.h @@ -0,0 +1,19 @@ +/** + * @file device.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + 
* @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +enum class Device { + CPU, + CUDA +}; + +const char* DeviceToString(Device d); \ No newline at end of file diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index b1249fe..cb91839 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -12,6 +12,7 @@ #pragma once #include "dim_type.h" +#include "device.h" #include "computational_graph/topological_sort.h" #include "computational_graph/graph_node.h" @@ -32,23 +33,6 @@ namespace graph { class TopologicalSort; } -enum class Device { - CPU, - CUDA -}; - -constexpr const char* DeviceToString(Device d) { - switch(d){ - case Device::CPU: - return "CPU"; - case Device::CUDA: - return "CUDA"; - } - - std::__throw_invalid_argument("Unknown device encountered"); - return ""; // suppress -} - class Tensor final : public std::enable_shared_from_this { friend class graph::TopologicalSort; @@ -253,15 +237,7 @@ class Tensor final : public std::enable_shared_from_this { Device getDevice() const noexcept; bool getRequiresGrad() const noexcept { return requiresGrad; } - void setRequiresGrad(const bool requiresGrad) noexcept { - this->requiresGrad=requiresGrad; - if(!requiresGrad && cgNode){ - cgNode = nullptr; - } - if(!requiresGrad && grads){ - grads = nullptr; - } - } + void setRequiresGrad(const bool requiresGrad) noexcept { this->requiresGrad=requiresGrad; } void setCgNode(std::shared_ptr node) noexcept { cgNode = std::move(node); diff --git a/src/backend/layers/ff_layer.cpp b/src/backend/layers/ff_layer.cpp index 52e1485..5d485ec 100644 --- a/src/backend/layers/ff_layer.cpp +++ b/src/backend/layers/ff_layer.cpp @@ -10,6 +10,8 @@ */ #include "ff_layer.h" +#include "activation_functions/activation_function_base.h" + #include "computational_graph/tensor_ops/graph_creation.h" #include @@ -49,6 +51,11 @@ Tensor FfLayer::forward(const Tensor& input) const { if(useBias){ res = res + 
*bias; } + + for(auto& af: activations){ + res = (*af)(res); + } + return res; } @@ -60,6 +67,11 @@ std::shared_ptr FfLayer::forward(const std::shared_ptr& input) c if(useBias){ res = graph::add(res, bias); // TODO: add needs to happen on each of those, how to broadcast? } + + for(auto& af: activations){ + res = (*af)(res); + } + return res; } diff --git a/src/backend/layers/layer_base.cpp b/src/backend/layers/layer_base.cpp index 911eb48..15dfc03 100644 --- a/src/backend/layers/layer_base.cpp +++ b/src/backend/layers/layer_base.cpp @@ -16,6 +16,10 @@ using namespace std; using namespace layers; +void LayerBase::addActivation(shared_ptr f) { + activations.push_back(std::move(f)); +} + void LayerBase::print(ostream& os) const noexcept { assert(weights); @@ -29,6 +33,6 @@ void LayerBase::print(ostream& os) const noexcept { } ostream& operator<<(ostream& os, const LayerBase& l) noexcept { - l.print(os); + static_cast(&l)->print(os); // calling vtable return os; } \ No newline at end of file diff --git a/src/backend/layers/layer_base.h b/src/backend/layers/layer_base.h index b213a5f..65a5428 100644 --- a/src/backend/layers/layer_base.h +++ b/src/backend/layers/layer_base.h @@ -13,6 +13,7 @@ #include "data_modeling/tensor.h" #include "utility/global_params.h" +#include "activation_functions/activation_function_base.h" #include #include @@ -31,9 +32,11 @@ namespace layers { std::shared_ptr weights = nullptr; std::shared_ptr bias = nullptr; + std::vector< std::shared_ptr > activations; + public: LayerBase(bool useBias, bool requiresGrad) - : useBias{false}, requiresGrad{requiresGrad} + : useBias{useBias}, requiresGrad{requiresGrad} { } virtual ~LayerBase() noexcept = default; @@ -50,6 +53,8 @@ namespace layers { return weights->getDims(); } + void addActivation(std::shared_ptr f); + auto getWeights() const noexcept { return weights; } auto getBias() const noexcept { return bias; } diff --git a/src/backend/networks/sequential.cpp b/src/backend/networks/sequential.cpp index 
08617e5..b89d1fa 100644 --- a/src/backend/networks/sequential.cpp +++ b/src/backend/networks/sequential.cpp @@ -22,24 +22,34 @@ bool SequentialNetwork::assertDims(const LayerBase& layer) const noexcept { if(layers.size() == 0) return true; - return layers.at(layers.size()-1).getDims() == layer.getDims(); + return layers[layers.size()-1]->getDims() == layer.getDims(); } Tensor SequentialNetwork::forward(const Tensor& input) const { - if(input.getDims().getItem(1) != layers.at(0).getDims().getItem(0)){ - // TODO: show meaningful message rather than exception - __throw_invalid_argument("Not implemented yet. Dimensions don't match"); + if(input.getDims().getItem(-1) != layers[0]->getDims().getItem(-2)){ + __throw_invalid_argument("Input tensor has invalid dimension."); } if(layers.size()==0){ - // TODO: show meaningful message rather than exception __throw_invalid_argument("Network empy, cannot be called."); } - Tensor x = layers.at(0).forward(input); + Tensor x = layers[0]->forward(input); for(int i=1; iforward(x); } return x; +} + +void SequentialNetwork::append(shared_ptr l) { + if(!assertDims(*l)){ + __throw_invalid_argument("Dimensions of tensors don't fit."); + } + layers.push_back(std::move(l)); +} + +void SequentialNetwork::append(shared_ptr f) { + assert(layers.size()>0); + layers[layers.size()-1]->addActivation(std::move(f)); } \ No newline at end of file diff --git a/src/backend/networks/sequential.h b/src/backend/networks/sequential.h index f05fafd..646e5f5 100644 --- a/src/backend/networks/sequential.h +++ b/src/backend/networks/sequential.h @@ -12,33 +12,22 @@ #pragma once #include "layers/layer_base.h" +#include "activation_functions/activation_function_base.h" #include -#include -#include +#include class SequentialNetwork { - protected: - std::vector layers; - bool assertDims(const layers::LayerBase& layer) const noexcept; - - template - requires (std::derived_from< std::remove_cvref_t, layers::LayerBase >) - void addLayer(T&& layer) { - 
if(!assertDims(layer)){ - // TODO: show warning that the dims don't match - return; - } - layers.push_back(std::forward(layer)); - } - - public: - SequentialNetwork() = default; - - Tensor forward(const Tensor& input) const; -}; - -/*template -void SequentialNetwork::addLayer(LayerBase&& layer) noexcept { - layers.push_back(std::forward(layer)); -}*/ \ No newline at end of file + protected: + std::vector< std::shared_ptr > layers; + + bool assertDims(const layers::LayerBase& layer) const noexcept; + + void append(std::shared_ptr l); + void append(std::shared_ptr f); + + public: + SequentialNetwork() = default; + + Tensor forward(const Tensor& input) const; +}; \ No newline at end of file diff --git a/src/backend/system/sys_functions.cpp b/src/backend/system/sys_functions.cpp index 02a501f..4e5e56b 100644 --- a/src/backend/system/sys_functions.cpp +++ b/src/backend/system/sys_functions.cpp @@ -10,8 +10,9 @@ */ #include "sys_functions.h" +#include "data_modeling/tensor.h" -using namespace global; +using namespace sys; void setDevice(Device d) noexcept { Tensor::setDefaultDevice(d); diff --git a/src/backend/system/sys_functions.h b/src/backend/system/sys_functions.h index ad58358..5919029 100644 --- a/src/backend/system/sys_functions.h +++ b/src/backend/system/sys_functions.h @@ -12,9 +12,9 @@ #pragma once -#include "data_modeling/tensor.h" +#include "data_modeling/device.h" -namespace global { - void setDevice(Device d) noexcept; - Device getDevice() noexcept; +namespace sys { + void setGlobalDevice(Device d) noexcept; + Device getGlobalDevice() noexcept; } \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index e88cd51..f755730 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -11,16 +11,18 @@ #pragma once -class OptimizerBase { +namespace train { + class OptimizerBase { + public: OptimizerBase() = default; 
+ ~OptimizerBase() noexcept = default; OptimizerBase(const OptimizerBase& other) = delete; OptimizerBase& operator=(const OptimizerBase& other) = delete; OptimizerBase(OptimizerBase&& other) noexcept = default; OptimizerBase& operator=(OptimizerBase&& other) noexcept = default; - - ~OptimizerBase() noexcept = default; - + virtual void step() = 0; -}; \ No newline at end of file +}; +} \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.cpp b/src/backend/training/optimizers/sgd.cpp new file mode 100644 index 0000000..46ba31d --- /dev/null +++ b/src/backend/training/optimizers/sgd.cpp @@ -0,0 +1,18 @@ +/** + * @file sgd.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "sgd.h" + +using namespace train; + +void SgdOptimizer::step() { + // TODO: implement +} \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.h b/src/backend/training/optimizers/sgd.h new file mode 100644 index 0000000..e7a615c --- /dev/null +++ b/src/backend/training/optimizers/sgd.h @@ -0,0 +1,29 @@ +/** + * @file sgd.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "optimizer_base.h" + +#include "utility/global_params.h" + +namespace train { + class SgdOptimizer final : public OptimizerBase { + private: + const ftype lr; + + public: + SgdOptimizer(ftype lr) : lr{lr} + { } + + void step() override; + + // TODO: print + }; +} \ No newline at end of file diff --git a/src/backend/utility/global_params.h b/src/backend/utility/global_params.h index 160d940..9af9694 100644 --- a/src/backend/utility/global_params.h +++ b/src/backend/utility/global_params.h @@ -13,7 +13,7 @@ #include -using ftype = float; // TODO: make compiler flag +using ftype = float; // TODO: make compiler flag? 
/** * IMPORTANT: For the following block we assume that diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 6c5370d..c0fb746 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -1,16 +1,32 @@ include_directories( - "${CMAKE_CURRENT_SOURCE_DIR}/python_utility" + "${CMAKE_CURRENT_SOURCE_DIR}/py_utility" ) # remove the lib... prefix set(CMAKE_SHARED_MODULE_PREFIX "") add_library(_core MODULE - data_modeling/py_data_modeling.cpp - data_modeling/py_data_modeling_util.cpp + py_core/py_core.cpp + py_core/py_core_util.cpp ) - -set_target_properties(_core PROPERTIES + +add_library(_network MODULE + py_network/py_network.cpp + #py_network/py_network_util.cpp + ) + +add_library(_sys MODULE + py_sys/py_sys.cpp + ) + +set_target_properties(_core _sys _network PROPERTIES PREFIX "" - OUTPUT_NAME "_core" - LIBRARY_OUTPUT_DIRECTORY ${PYTHON_MODULE_DIR}) \ No newline at end of file + INSTALL_RPATH "$ORIGIN" # to find shared backend-core lib + BUILD_WITH_INSTALL_RPATH TRUE # use install RPATH even during build + LIBRARY_OUTPUT_DIRECTORY "${PYTHON_MODULE_DIR}") + +set_target_properties(_core PROPERTIES + OUTPUT_NAME "_core") + +set_target_properties(_sys PROPERTIES + OUTPUT_NAME "_sys") \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling.cpp b/src/python/data_modeling/py_data_modeling.cpp deleted file mode 100644 index 5c66672..0000000 --- a/src/python/data_modeling/py_data_modeling.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/** - * @file py_data_modeling.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-21 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "data_modeling/tensor.h" - -#include "py_data_modeling_util.h" -#include "python_templates.h" -#include "custom_converters.h" - -#include "data_modeling/tensor.h" -#include "data_modeling/tensor_functions.h" -#include "computational_graph/tensor_ops/graph_creation.h" - -#include -#include -#include 
- -BOOST_PYTHON_MODULE(_core) -{ - using namespace boost::python; - - // some macros to make code below easier to read - #define WRAP_TENSOR_METHOD_1(method) \ - +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ - return std::make_shared(self.method(other)); \ - } - - #define WRAP_SCALAR(method, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return std::make_shared(self.method(val)); \ - } - - #define WRAP_SCALAR_REVERSE(op, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return std::make_shared(val op self); \ - } - - // different, since those are not methods anymore - #define WRAP_FREE_MEMBER_FUNC_1(fPtr, T1, T2) \ - +[](const Tensor& self, int v1, int v2) -> std::shared_ptr { \ - return std::make_shared((self.*fPtr)(v1, v2)); \ - } - - #define WRAP_FREE_MEMBER_FUNC_2(fPtr, T1, T2, T3) \ - +[](const Tensor& self, T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ - return std::make_shared((self.*fPtr)(v1, v2, v3)); \ - } - - #define WRAP_FREE_FUNC_1(fPtr, T1) \ - +[](T1 v1) -> std::shared_ptr { \ - return std::make_shared((*fPtr)(v1)); \ - } - - #define WRAP_FREE_FUNC_2(fPtr, T1, T2) \ - +[](T1 v1, T2 v2) -> std::shared_ptr { \ - return std::make_shared((*fPtr)(v1, v2)); \ - } - - #define WRAP_FREE_FUNC_3(fPtr, T1, T2, T3) \ - +[](T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ - return std::make_shared((*fPtr)(v1, v2, v3)); \ - } - - #define WRAP_FREE_FUNC_4(fPtr, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return (*fPtr)(self.getSharedPtr(), val); \ - } - - #define WRAP_FREE_FUNC_5(fPtr) \ - +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ - return (*fPtr)(self.getSharedPtr(), other.getSharedPtr()); \ - } - - #define WRAP_FREE_FUNC_6(fPtr, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return (*fPtr)(val, self.getSharedPtr()); \ - } - - #define WRAP_FREE_FUNC_7(fPtr) \ - +[](const Tensor& self) -> std::shared_ptr { \ - return (*fPtr)(self.getSharedPtr()); \ - } - - 
#define WRAP_FUNC_AND_CONVERT_DTYPE_1(method) \ - +[](const Tensor& self, int v1) -> ftype { \ - return self.method(static_cast(v1)); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_2(method) \ - +[](const Tensor& self, int v1, int v2) -> ftype { \ - return self.method(static_cast(v1), static_cast(v2)); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_3(method) \ - +[](const Tensor& self, int v1, int v2, int v3) -> ftype { \ - return self.method(static_cast(v1), static_cast(v2), \ - static_cast(v3)); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_4(method) \ - +[](const Tensor& self, int v1, int v2, int v3, int v4) -> ftype { \ - return self.method(static_cast(v1), static_cast(v2), \ - static_cast(v3), static_cast(v4)); \ - } - - // classes - class_("Dimension", no_init) - .add_property("list", &Dimension::getItem) - .def("__str__", &Py_Util::toString) - .def("__eq__", Py_DataModeling::dimEquals1) - .def("__eq__", Py_DataModeling::dimEquals2) - .def("__ne__", Py_DataModeling::nDimEquals1) - .def("__ne__", Py_DataModeling::nDimEquals2) - ; - - enum_("Device") - .value("CPU", Device::CPU) - .value("CUDA", Device::CUDA) - ; - - // register implicit dtype conversion - custom_converters::PyListToVectorConverter(); - custom_converters::PyListToVectorConverter(); - - // to convert std::shared_ptr to std::shared_ptr> in Python - boost::python::register_ptr_to_python< std::shared_ptr >(); - - // we manage via shared_ptr, since we deleted copy-ctor - class_, boost::noncopyable>("Tensor", no_init) - .def(init&, optional >()) - .def(init&, Device, optional >()) - .def(init&, const std::vector&, optional >()) - .def(init&, const std::vector&, Device, optional >()) - - // static creation methods - .def("ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)) - .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)) - .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)) - .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, 
std::vector, Device, const bool)).staticmethod("ones") - - .def("zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)) - .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)) - .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)) - .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)).staticmethod("zeros") - - .def("gauss", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)) - .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)) - .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)) - .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)).staticmethod("gauss") - - // properties - .add_property("device", &Tensor::getDevice, &Tensor::setDevice) - .add_property("dims", make_function(&Tensor::getDims, return_internal_reference<>())) - .add_property("grads", make_function(&Tensor::getGrads)) - .add_property("requiresGrad", &Tensor::getRequiresGrad, &Tensor::setRequiresGrad) - - // operators - .def("__str__", &Py_Util::toString) - .def("__repr__", &Py_Util::toString) - .def("__len__", &Tensor::getSize) - .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor1, tensorSize_t)) - .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor2, std::vector)) - .def("__setitem__", &Py_DataModeling::tensorSetItem) - - // arithmetics - .def("__matmul__", WRAP_FREE_FUNC_5(Py_DataModeling::matmul)) - .def("__add__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwiseadd)) // elementwise add - .def("__add__", WRAP_FREE_FUNC_4(Py_DataModeling::scalaradd, ftype)) - .def("__radd__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalaradd, ftype)) - - .def("__mul__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwisemul)) // elementwise mult - .def("__mul__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarmul, ftype)) - .def("__rmul__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalarmul, ftype)) 
- - .def("__sub__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarsub, ftype)) - .def("__truediv__", WRAP_FREE_FUNC_4(Py_DataModeling::scalardiv, ftype)) - - // member functions - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::getItem)) - .def("getitem", Py_DataModeling::getItemVector) // the vector arg - - .def("sum", WRAP_FREE_FUNC_7(&(graph::sumTensor))) - - .def("reset", Py_DataModeling::reset1) - .def("reset", Py_DataModeling::reset2) - - .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int)) - .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool)) - .def("transposeThis", Py_DataModeling::transposeThis1) - .def("transposeThis", Py_DataModeling::transposeThis2) - - .def("backward", &Tensor::backward) - ; - - // functions - def("Ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)); - def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)); - def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)); - def("Ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)); - - def("Zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)); - def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)); - def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)); - def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)); - - def("Gaussian", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)); - def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)); - def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)); - def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)); 
-} \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling_util.h b/src/python/data_modeling/py_data_modeling_util.h deleted file mode 100644 index 2b8ec65..0000000 --- a/src/python/data_modeling/py_data_modeling_util.h +++ /dev/null @@ -1,135 +0,0 @@ -/** - * @file util.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief Helper and wrapper functions - * @version 0.1 - * @date 2026-02-21 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "data_modeling/dim_type.h" - -#include "data_modeling/tensor.h" -#include "data_modeling/tensor_functions.h" -#include "computational_graph/tensor_ops/graph_creation.h" - -#include -#include - -#include - -namespace Py_DataModeling { - - /********************************************************************************************************* - ********************************************** Dimension ************************************************* - *********************************************************************************************************/ - - inline bool (Dimension::*dimEquals1)(const Dimension&) const = &Dimension::operator==; - inline bool (Dimension::*dimEquals2)(const std::vector&) const = &Dimension::operator==; - - inline bool (Dimension::*nDimEquals1)(const Dimension&) const = &Dimension::operator!=; - inline bool (Dimension::*nDimEquals2)(const std::vector&) const = &Dimension::operator!=; - /********************************************************************************************************* - *********************************************** Tensor *************************************************** - *********************************************************************************************************/ - - ftype tensorGetItem(const Tensor& self, boost::python::object index); - void tensorSetItem(Tensor& self, boost::python::object index, ftype value); - - // need wrappers for default arguments, see - // 
https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/functions.html - inline auto OnesWrapper0(std::vector dims) { - return TensorFunctions::Ones(std::move(dims)); - } - - inline auto OnesWrapper1(std::vector dims, Device d) { - return TensorFunctions::Ones(std::move(dims), d); - } - - inline auto ZerosWrapper0(std::vector dims) { - return TensorFunctions::Zeros(std::move(dims)); - } - - inline auto ZerosWrapper1(std::vector dims, Device d) { - return TensorFunctions::Zeros(std::move(dims), d); - } - - inline auto GaussianWrapper0(std::vector dims) { - return TensorFunctions::Gaussian(std::move(dims)); - } - - inline auto GaussianWrapper1(std::vector dims, Device d) { - return TensorFunctions::Gaussian(std::move(dims), d); - } - - inline Tensor (*Ones0)(std::vector) = &OnesWrapper0; - inline Tensor (*Ones1)(std::vector, Device) = &OnesWrapper1; - inline Tensor (*Ones2)(std::vector, const bool) = &(TensorFunctions::Ones); - inline Tensor (*Ones3)(std::vector, Device, const bool) = &(TensorFunctions::Ones); - - inline Tensor (*Zeros0)(std::vector) = &ZerosWrapper0; - inline Tensor (*Zeros1)(std::vector, Device) = &ZerosWrapper1; - inline Tensor (*Zeros2)(std::vector, const bool) = &(TensorFunctions::Zeros); - inline Tensor (*Zeros3)(std::vector, Device, const bool) = &(TensorFunctions::Zeros); - - inline Tensor (*Gaussian0)(std::vector) = &GaussianWrapper0; - inline Tensor (*Gaussian1)(std::vector, Device) = &GaussianWrapper1; - inline Tensor (*Gaussian2)(std::vector, const bool) = &(TensorFunctions::Gaussian); - inline Tensor (*Gaussian3)(std::vector, Device, const bool) = &(TensorFunctions::Gaussian); - - inline void (Tensor::*reset1)(const ftype) = &Tensor::reset; - inline void (Tensor::*reset2)(const utility::InitClass) = &Tensor::reset; - - inline void (Tensor::*transposeThis1)() = &Tensor::transposeThis; - inline void (Tensor::*transposeThis2)(int, int) = &Tensor::transposeThis; - inline Tensor (Tensor::*transpose1)(int, int) const = 
&Tensor::transpose; - inline Tensor (Tensor::*transpose2)(int, int, bool) const = &Tensor::transpose; - - inline ftype (Tensor::*getItemVector)(const std::vector&) const = &Tensor::getItem; - - /********************************************************************************************************* - ***************************************** Graph creation ************************************************* - *********************************************************************************************************/ - - // multiplications - inline std::shared_ptr (*elementwisemul) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::mul); - - inline std::shared_ptr (*scalarmul) - (const std::shared_ptr, ftype) = &(graph::mul); - - inline std::shared_ptr (*rscalarmul) - (ftype, const std::shared_ptr) = &(graph::mul); - - // additions - inline std::shared_ptr (*elementwiseadd) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::add); - - inline std::shared_ptr (*scalaradd) - (const std::shared_ptr, ftype) = &(graph::add); - - inline std::shared_ptr (*rscalaradd) - (ftype, const std::shared_ptr) = &(graph::add); - - // matmul - inline std::shared_ptr (*matmul) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::matmul); - - // sub, div - inline std::shared_ptr (*scalarsub) - (const std::shared_ptr, ftype) = &(graph::sub); - - inline std::shared_ptr (*scalardiv) - (const std::shared_ptr, ftype) = &(graph::div); - - // get - inline std::shared_ptr (*getItemAsTensor1) - (const std::shared_ptr& t, tensorSize_t idx) = &(graph::get); - - inline std::shared_ptr (*getItemAsTensor2) - (const std::shared_ptr& t, const std::vector& idx) = &(graph::get); -} \ No newline at end of file diff --git a/src/python/layers/py_layers.h b/src/python/layers/py_layers.h deleted file mode 100644 index c90929a..0000000 --- a/src/python/layers/py_layers.h +++ /dev/null @@ -1,61 +0,0 @@ -/** - * @file layers.h - * @author Robert Baumgartner 
(r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-11-17 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "ff_layer.h" -#include "python_templates.h" - -#include -#include -#include -#include - -namespace Py_Layers { - ftype layerGetItem(const layers::LayerBase& self, boost::python::object index); - void layerSetItem(layers::LayerBase& self, boost::python::object index, ftype value); -} - -BOOST_PYTHON_MODULE(py_layers) -{ - using namespace boost::python; - - /** - * @brief Wrapper class needed for Boost Python to get the virtual function working - * the way it is intended. See documentation here: - * https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html - * - */ - struct LayerBaseWrap : layers::LayerBase, wrapper { - Tensor forward(const Tensor& input) const { - return this->get_override("forward")(input); - } - - void print(std::ostream& os) const noexcept { - this->get_override("print")(os); - } - }; - - class_("LayerBase", no_init) - .def("forward", pure_virtual(&layers::LayerBase::forward)) - //.def("backward", &FfLayer::backward) - .def("getDims", &layers::LayerBase::getDims, return_internal_reference<>()) - .def("getTensor", &layers::LayerBase::getDims, return_internal_reference<>()) - .def("__getitem__", &Py_Layers::layerGetItem) - .def("__setitem__", &Py_Layers::layerSetItem) - .def("__str__", &toString) - ; - - class_ >("FfLayer", init()) - .def("forward", &layers::FfLayer::forward) - //.def("backward", &FfLayer::backward) - ; -} \ No newline at end of file diff --git a/src/python/networks/py_sequential.cpp b/src/python/networks/py_sequential.cpp deleted file mode 100644 index e69de29..0000000 diff --git a/src/python/networks/py_sequential.h b/src/python/networks/py_sequential.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp new file mode 100644 index 0000000..2776077 --- /dev/null +++ 
b/src/python/py_core/py_core.cpp @@ -0,0 +1,221 @@ +/** + * @file py_data_modeling.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-21 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "data_modeling/tensor.h" + +#include "py_core_util.h" +#include "python_templates.h" +#include "custom_converters.h" + +#include "data_modeling/tensor.h" +#include "data_modeling/tensor_functions.h" +#include "computational_graph/tensor_ops/graph_creation.h" + +#include +#include +#include + +BOOST_PYTHON_MODULE(_core) +{ + using namespace boost::python; + + // some macros to make code below easier to read + #define WRAP_TENSOR_METHOD_1(method) \ + +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ + return std::make_shared(self.method(other)); \ + } + + #define WRAP_SCALAR(method, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return std::make_shared(self.method(val)); \ + } + + #define WRAP_SCALAR_REVERSE(op, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return std::make_shared(val op self); \ + } + + // different, since those are not methods anymore + #define WRAP_FREE_MEMBER_FUNC_1(fPtr, T1, T2) \ + +[](const Tensor& self, int v1, int v2) -> std::shared_ptr { \ + return std::make_shared((self.*fPtr)(v1, v2)); \ + } + + #define WRAP_FREE_MEMBER_FUNC_2(fPtr, T1, T2, T3) \ + +[](const Tensor& self, T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ + return std::make_shared((self.*fPtr)(v1, v2, v3)); \ + } + + #define WRAP_FREE_FUNC_1(fPtr, T1) \ + +[](T1 v1) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1)); \ + } + + #define WRAP_FREE_FUNC_2(fPtr, T1, T2) \ + +[](T1 v1, T2 v2) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2)); \ + } + + #define WRAP_FREE_FUNC_3(fPtr, T1, T2, T3) \ + +[](T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2, v3)); \ + } + + #define WRAP_FREE_FUNC_4(fPtr, T) \ + +[](const Tensor& 
self, T val) -> std::shared_ptr { \ + return (*fPtr)(self.getSharedPtr(), val); \ + } + + #define WRAP_FREE_FUNC_5(fPtr) \ + +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ + return (*fPtr)(self.getSharedPtr(), other.getSharedPtr()); \ + } + + #define WRAP_FREE_FUNC_6(fPtr, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return (*fPtr)(val, self.getSharedPtr()); \ + } + + #define WRAP_FREE_FUNC_7(fPtr) \ + +[](const Tensor& self) -> std::shared_ptr { \ + return (*fPtr)(self.getSharedPtr()); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_1(method) \ + +[](const Tensor& self, int v1) -> ftype { \ + return self.method(static_cast(v1)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_2(method) \ + +[](const Tensor& self, int v1, int v2) -> ftype { \ + return self.method(static_cast(v1), static_cast(v2)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_3(method) \ + +[](const Tensor& self, int v1, int v2, int v3) -> ftype { \ + return self.method(static_cast(v1), static_cast(v2), \ + static_cast(v3)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_4(method) \ + +[](const Tensor& self, int v1, int v2, int v3, int v4) -> ftype { \ + return self.method(static_cast(v1), static_cast(v2), \ + static_cast(v3), static_cast(v4)); \ + } + + // classes + class_("Dimension", no_init) + .add_property("list", &Dimension::getItem) + .def("__str__", &Py_Util::toString) + .def("__eq__", Py_DataModeling::dimEquals1) + .def("__eq__", Py_DataModeling::dimEquals2) + .def("__ne__", Py_DataModeling::nDimEquals1) + .def("__ne__", Py_DataModeling::nDimEquals2) + ; + + enum_("Device") + .value("CPU", Device::CPU) + .value("CUDA", Device::CUDA) + ; + + // register implicit dtype conversion + custom_converters::PyListToVectorConverter(); + custom_converters::PyListToVectorConverter(); + + // to convert std::shared_ptr to std::shared_ptr> in Python + boost::python::register_ptr_to_python< std::shared_ptr >(); + + // we manage via shared_ptr, since we deleted copy-ctor + 
class_, boost::noncopyable>("Tensor", no_init) + .def(init&, optional >()) + .def(init&, Device, optional >()) + .def(init&, const std::vector&, optional >()) + .def(init&, const std::vector&, Device, optional >()) + + // static creation methods + .def("ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)) + .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)) + .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)) + .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)).staticmethod("ones") + + .def("zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)) + .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)) + .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)) + .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)).staticmethod("zeros") + + .def("gauss", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)) + .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)) + .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)) + .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)).staticmethod("gauss") + + // properties + .add_property("device", &Tensor::getDevice, &Tensor::setDevice) + .add_property("dims", make_function(&Tensor::getDims, return_internal_reference<>())) + .add_property("grads", make_function(&Tensor::getGrads)) + .add_property("requiresGrad", &Tensor::getRequiresGrad, &Tensor::setRequiresGrad) + + // operators + .def("__str__", &Py_Util::toString) + .def("__repr__", &Py_Util::toString) + .def("__len__", &Tensor::getSize) + .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor1, tensorSize_t)) + .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor2, std::vector)) + .def("__setitem__", &Py_DataModeling::tensorSetItem) + + // 
arithmetics + .def("__matmul__", WRAP_FREE_FUNC_5(Py_DataModeling::matmul)) + .def("__add__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwiseadd)) // elementwise add + .def("__add__", WRAP_FREE_FUNC_4(Py_DataModeling::scalaradd, ftype)) + .def("__radd__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalaradd, ftype)) + + .def("__mul__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwisemul)) // elementwise mult + .def("__mul__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarmul, ftype)) + .def("__rmul__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalarmul, ftype)) + + .def("__sub__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarsub, ftype)) + .def("__truediv__", WRAP_FREE_FUNC_4(Py_DataModeling::scalardiv, ftype)) + + // member functions + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::getItem)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::getItem)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::getItem)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::getItem)) + .def("getitem", Py_DataModeling::getItemVector) // the vector arg + + .def("sum", WRAP_FREE_FUNC_7(&(graph::sumTensor))) + + .def("reset", Py_DataModeling::reset1) + .def("reset", Py_DataModeling::reset2) + + .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int)) + .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool)) + .def("transposeThis", Py_DataModeling::transposeThis1) + .def("transposeThis", Py_DataModeling::transposeThis2) + + .def("backward", &Tensor::backward) + ; + + // free functions + def("Ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)); + def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)); + def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)); + def("Ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)); + + def("Zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)); + def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, 
std::vector, Device)); + def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)); + def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)); + + def("Gaussian", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)); + def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)); + def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)); + def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)); +} \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling_util.cpp b/src/python/py_core/py_core_util.cpp similarity index 98% rename from src/python/data_modeling/py_data_modeling_util.cpp rename to src/python/py_core/py_core_util.cpp index d495300..9864a2b 100644 --- a/src/python/data_modeling/py_data_modeling_util.cpp +++ b/src/python/py_core/py_core_util.cpp @@ -1,5 +1,5 @@ /** - * @file py_data_modeling_util.cpp + * @file py_core_util.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -9,7 +9,7 @@ * */ -#include "py_data_modeling_util.h" +#include "py_core_util.h" #include #include diff --git a/src/python/py_core/py_core_util.h b/src/python/py_core/py_core_util.h new file mode 100644 index 0000000..ba6ecc9 --- /dev/null +++ b/src/python/py_core/py_core_util.h @@ -0,0 +1,135 @@ +/** + * @file util.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief Helper and wrapper functions + * @version 0.1 + * @date 2026-02-21 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/dim_type.h" + +#include "data_modeling/tensor.h" +#include "data_modeling/tensor_functions.h" +#include "computational_graph/tensor_ops/graph_creation.h" + +#include +#include + +#include + +namespace Py_DataModeling { + + /********************************************************************************************************* + 
********************************************** Dimension ************************************************* + *********************************************************************************************************/ + + inline bool (Dimension::*dimEquals1)(const Dimension&) const = &Dimension::operator==; + inline bool (Dimension::*dimEquals2)(const std::vector&) const = &Dimension::operator==; + + inline bool (Dimension::*nDimEquals1)(const Dimension&) const = &Dimension::operator!=; + inline bool (Dimension::*nDimEquals2)(const std::vector&) const = &Dimension::operator!=; + /********************************************************************************************************* + *********************************************** Tensor *************************************************** + *********************************************************************************************************/ + + ftype tensorGetItem(const Tensor& self, boost::python::object index); + void tensorSetItem(Tensor& self, boost::python::object index, ftype value); + + // need wrappers for default arguments, see + // https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/functions.html + inline auto OnesWrapper0(std::vector dims) { + return TensorFunctions::Ones(std::move(dims)); + } + + inline auto OnesWrapper1(std::vector dims, Device d) { + return TensorFunctions::Ones(std::move(dims), d); + } + + inline auto ZerosWrapper0(std::vector dims) { + return TensorFunctions::Zeros(std::move(dims)); + } + + inline auto ZerosWrapper1(std::vector dims, Device d) { + return TensorFunctions::Zeros(std::move(dims), d); + } + + inline auto GaussianWrapper0(std::vector dims) { + return TensorFunctions::Gaussian(std::move(dims)); + } + + inline auto GaussianWrapper1(std::vector dims, Device d) { + return TensorFunctions::Gaussian(std::move(dims), d); + } + + inline Tensor (*Ones0)(std::vector) = &OnesWrapper0; + inline Tensor (*Ones1)(std::vector, Device) = &OnesWrapper1; + 
inline Tensor (*Ones2)(std::vector, const bool) = &(TensorFunctions::Ones); + inline Tensor (*Ones3)(std::vector, Device, const bool) = &(TensorFunctions::Ones); + + inline Tensor (*Zeros0)(std::vector) = &ZerosWrapper0; + inline Tensor (*Zeros1)(std::vector, Device) = &ZerosWrapper1; + inline Tensor (*Zeros2)(std::vector, const bool) = &(TensorFunctions::Zeros); + inline Tensor (*Zeros3)(std::vector, Device, const bool) = &(TensorFunctions::Zeros); + + inline Tensor (*Gaussian0)(std::vector) = &GaussianWrapper0; + inline Tensor (*Gaussian1)(std::vector, Device) = &GaussianWrapper1; + inline Tensor (*Gaussian2)(std::vector, const bool) = &(TensorFunctions::Gaussian); + inline Tensor (*Gaussian3)(std::vector, Device, const bool) = &(TensorFunctions::Gaussian); + + inline void (Tensor::*reset1)(const ftype) = &Tensor::reset; + inline void (Tensor::*reset2)(const utility::InitClass) = &Tensor::reset; + + inline void (Tensor::*transposeThis1)() = &Tensor::transposeThis; + inline void (Tensor::*transposeThis2)(int, int) = &Tensor::transposeThis; + inline Tensor (Tensor::*transpose1)(int, int) const = &Tensor::transpose; + inline Tensor (Tensor::*transpose2)(int, int, bool) const = &Tensor::transpose; + + inline ftype (Tensor::*getItemVector)(const std::vector&) const = &Tensor::getItem; + + /********************************************************************************************************* + ***************************************** Graph creation ************************************************* + *********************************************************************************************************/ + + // multiplications + inline std::shared_ptr (*elementwisemul) + (const std::shared_ptr left, const std::shared_ptr right) = &(graph::mul); + + inline std::shared_ptr (*scalarmul) + (const std::shared_ptr, ftype) = &(graph::mul); + + inline std::shared_ptr (*rscalarmul) + (ftype, const std::shared_ptr) = &(graph::mul); + + // additions + inline std::shared_ptr 
(*elementwiseadd) + (const std::shared_ptr left, const std::shared_ptr right) = &(graph::add); + + inline std::shared_ptr (*scalaradd) + (const std::shared_ptr, ftype) = &(graph::add); + + inline std::shared_ptr (*rscalaradd) + (ftype, const std::shared_ptr) = &(graph::add); + + // matmul + inline std::shared_ptr (*matmul) + (const std::shared_ptr left, const std::shared_ptr right) = &(graph::matmul); + + // sub, div + inline std::shared_ptr (*scalarsub) + (const std::shared_ptr, ftype) = &(graph::sub); + + inline std::shared_ptr (*scalardiv) + (const std::shared_ptr, ftype) = &(graph::div); + + // get + inline std::shared_ptr (*getItemAsTensor1) + (const std::shared_ptr& t, tensorSize_t idx) = &(graph::get); + + inline std::shared_ptr (*getItemAsTensor2) + (const std::shared_ptr& t, const std::vector& idx) = &(graph::get); +} \ No newline at end of file diff --git a/src/python/layers/py_layers.cpp b/src/python/py_network/py_network.cpp similarity index 53% rename from src/python/layers/py_layers.cpp rename to src/python/py_network/py_network.cpp index 5fc3613..c1f9ba2 100644 --- a/src/python/layers/py_layers.cpp +++ b/src/python/py_network/py_network.cpp @@ -9,12 +9,86 @@ * */ -#include "py_layers.h" +#include "py_network_util.h" +#include "python_templates.h" +#include "utility/global_params.h" + +#include "layers/ff_layer.h" + +#include "activation_functions/relu.h" +#include "activation_functions/leaky_relu.h" +#include "activation_functions/softmax.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" + +#include "training/optimizers/sgd.h" #include -using namespace boost::python; +BOOST_PYTHON_MODULE(py_layers) +{ + using namespace std; + + using namespace Py_Util; + using namespace Py_Network; + + using namespace boost::python; + + // Layers + class_("LayerBase", no_init) + // attributes + .add_property("dims", make_function(&layers::LayerBase::getDims, return_internal_reference<>())) + 
.add_property("weights", make_function(&layers::LayerBase::getWeights)) + .add_property("bias", make_function(&layers::LayerBase::getBias)) + // methods + .def("forward", pure_virtual(Py_Network::layerforward)) + .def("addActivation", make_function(&layers::LayerBase::addActivation)) + // operators + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("FfLayer", no_init) + .def(init&, optional, optional >()) + .def(init&, Device, optional, optional >()) + .def("forward", &layers::FfLayer::forward) + ; + + // Activation functions + class_("ActivationFunctionBase", no_init) + .def("call", pure_virtual(&ActivationFunctionWrap::operator())) + .def("__str__", &toString) + ; + + class_, bases >("ReLU", init) + .def("call", &activation::ReLu::operator()) + ; + + class_, bases >("LeakyReLU", init) + .def("call", &activation::LeakyReLu::operator()) + ; + + class_, bases >("Softmax", init) + .def("call", &activation::Softmax::operator()) + ; + + // Loss functions + class_("LossBase", no_init) + .def("call", pure_virtual(&LossWrap::operator())) + ; + + class_("BCE", no_init) + .def("call", pure_virtual(&train::BceLoss::operator())) + ; + + class_("CrossEntropy", no_init) + .def("call", pure_virtual(&train::CrossEntropyLoss::operator())) + ; + + // Optimizers +} +/* ftype Py_Layers::layerGetItem(const layers::LayerBase& self, boost::python::object index) { extract int_extractor(index); @@ -102,4 +176,4 @@ void Py_Layers::layerSetItem(layers::LayerBase& self, boost::python::object inde PyErr_SetString(PyExc_TypeError, "Index must be an integer or tuple"); throw_error_already_set(); -} \ No newline at end of file +}*/ \ No newline at end of file diff --git a/src/python/py_network/py_network_util.h b/src/python/py_network/py_network_util.h new file mode 100644 index 0000000..058e91e --- /dev/null +++ b/src/python/py_network/py_network_util.h @@ -0,0 +1,60 @@ +/** + * @file layers.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 
+ * @date 2025-11-17 + * + * @copyright Copyright (c) 2025 + * + */ + +#pragma once + +#include "layers/layer_base.h" +#include "activation_functions/activation_function_base.h" +#include "training/loss_functions/loss_base.h" +#include "training/optimizers/optimizer_base.h" + +#include +#include +#include +#include + +namespace Py_Network { + using namespace boost::python; + + ftype layerGetItem(const layers::LayerBase& self, boost::python::object index); + void layerSetItem(layers::LayerBase& self, boost::python::object index, ftype value); + + /** + * @brief Wrapper class needed for Boost Python to get the virtual function working + * the way it is intended. See documentation here: + * https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html + * + */ + struct LayerBaseWrap : layers::LayerBase, wrapper { + std::shared_ptr forward(const std::shared_ptr& input) const override { + return this->get_override("forward")(input); + } + + Tensor forward(const Tensor& input) const override { + std::__throw_runtime_error("This function should never be called from within Python"); + } + }; + + struct ActivationFunctionWrap : activation::ActivationFunctionBase, wrapper { + std::shared_ptr operator()(const std::shared_ptr& input) const override { + return this->get_override("call")(input); + } + }; + + struct LossWrap : train::LossBase, wrapper { + Tensor operator()(const Tensor& y, const Tensor& ypred) const override { + return this->get_override("call")(y, ypred); + } + }; + + inline std::shared_ptr (LayerBaseWrap::*layerforward)(const std::shared_ptr&) const = &LayerBaseWrap::forward; +} + diff --git a/src/python/py_sys/py_sys.cpp b/src/python/py_sys/py_sys.cpp new file mode 100644 index 0000000..ebfe674 --- /dev/null +++ b/src/python/py_sys/py_sys.cpp @@ -0,0 +1,23 @@ +/** + * @file py_sys.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 
+ * + */ + + +#include "system/sys_functions.h" + +#include + +BOOST_PYTHON_MODULE(_sys) +{ + using namespace boost::python; + + def("setGlobalDevice", &sys::setGlobalDevice); + def("getGlobalDevice", &sys::getGlobalDevice); +} \ No newline at end of file diff --git a/src/python/python_utility/custom_converters.h b/src/python/py_utility/custom_converters.h similarity index 100% rename from src/python/python_utility/custom_converters.h rename to src/python/py_utility/custom_converters.h diff --git a/src/python/python_utility/python_templates.h b/src/python/py_utility/python_templates.h similarity index 52% rename from src/python/python_utility/python_templates.h rename to src/python/py_utility/python_templates.h index 54217d2..e0e625c 100644 --- a/src/python/python_utility/python_templates.h +++ b/src/python/py_utility/python_templates.h @@ -24,15 +24,4 @@ namespace Py_Util { oss << obj; return oss.str(); } - - /** - * @brief Because we manage tensors via shared_ptr, we need this to wrap - * return values when a function/method demands it. - */ - /* template - auto WrapReturnedTensor(Func f) { - return [f](const Tensor& self, auto&&... 
args) -> std::shared_ptr { - return std::make_shared(f(self, std::forward(args)...)); - }; - } */ } diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index f7e877c..3c22843 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -15,7 +15,6 @@ #include "data_modeling/tensor_functions.h" #include "computational_graph/tensor_ops/graph_creation.h" -#include "computational_graph/activation_functions/graph_creation.h" #include From e18f74cbf11d7ebd26206285c7225b3de16b1640 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Mon, 9 Mar 2026 14:43:47 +0100 Subject: [PATCH 04/24] Upgraded to CMake 3.28 and updated python includes --- CMakeLists.txt | 13 ++++++------- readme.md | 2 +- src/CMakeLists.txt | 4 ++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b09189f..cdd5dc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,7 @@ if(APPLE) message("Python_INCLUDE_DIRS:${${Python_INCLUDE_DIRS}}") set(PYTHON_LIBRARIES ${Python_LIBRARIES}) - string(COMPARE EQUAL "${PYTHON_LIBRARIES}" "" PYTHONLIBS_EMPTY) + string(COMPARE EQUAL "${Python_LIBRARIES}" "" PYTHONLIBS_EMPTY) if(PYTHONLIBS_EMPTY) message(FATAL_ERROR "Problem: PYTHON_LIBRARIES not found. Do you have Python installed on your machine?") endif() @@ -50,19 +50,18 @@ if(APPLE) message("Failed to automatically find Python_INCLUDE_DIRS. Setting the PYTHON_INCLUDE_DIRS variable manually. If this crashes please adjust the following path to the path where Python.h resides (the one matching the found Python instance). 
Paths must be consistent iff multiple Python versions on machine.") set(PYTHON_H_PATH "/usr/local/opt/python@3.13/Frameworks/Python.framework/Versions/3.13/include/python3.13") - set(PYTHON_INCLUDE_DIRS "${PYTHON_H_PATH}") + set(PYTHON_INCLUDE_DIRS "${Python_H_PATH}") else() set(PYTHON_INCLUDE_DIRS ${Python_INCLUDE_DIRS}) endif() #FindPython3() - message("Apple - Using Python:${Python_VERSION_MAJOR} - Libraries:${PYTHON_LIBRARIES} - IncludeDirs: ${PYTHON_INCLUDE_DIRS}") + message("Apple - Using Python:${Python_VERSION_MAJOR} - Libraries:${Python_LIBRARIES} - IncludeDirs: ${Python_INCLUDE_DIRS}") else() message("Getting PythonLibs on Linux or Windows path") - find_package(PythonLibs REQUIRED) + find_package(Python 3 REQUIRED COMPONENTS Interpreter Development) endif() -include_directories(${PYTHON_INCLUDE_DIRS}) -message("Using Python:${Python_VERSION_MAJOR} - Libraries:${PYTHON_LIBRARIES} - IncludeDirs: ${PYTHON_INCLUDE_DIRS}") - +include_directories(${Python_INCLUDE_DIRS}) +message("Using Python:${Python_VERSION_MAJOR} - Libraries:${Python_LIBRARIES} - IncludeDirs: ${Python_INCLUDE_DIRS}") #set(CMAKE_MESSAGE_LOG_LEVEL WARNING) diff --git a/readme.md b/readme.md index 8c531f0..e587227 100644 --- a/readme.md +++ b/readme.md @@ -64,7 +64,7 @@ ctest - Compiler capable of C++20 at least (we test with gcc 12.3.0) - Boost Python -- Cmake > 3.24 +- Cmake > 3.28 - Python 3 (we test with 3.10, but it should work with any version) - pytest for unit tests (we use 9.0.2) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0d564e2..604b02b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,11 +4,11 @@ add_subdirectory(python) target_link_libraries(_core ${Boost_LIBRARIES} - ${PYTHON_LIBRARIES} + ${Python_LIBRARIES} BackendCore) target_include_directories(_core PRIVATE - ${PYTHON_INCLUDE_DIRS} + ${Python_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) #target_link_libraries(py_layers PRIVATE ${Boost_LIBRARIES} ${PYTHON_LIBRARIES} BackendCore) From 
82edf60eeb47c33474713a25a0e72ecf645f2200 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Mon, 9 Mar 2026 19:18:46 +0100 Subject: [PATCH 05/24] Fixed issues, added unit tests --- .../activation_functions/leaky_relu_node.cpp | 4 +- .../activation_functions/relu_node.cpp | 4 +- src/backend/layers/ff_layer.cpp | 27 ++++--- src/backend/layers/layer_base.cpp | 12 +-- src/python/py_network/py_network.cpp | 75 +++++++++--------- src/python/py_network/py_network_util.h | 16 +++- tests/CMakeLists.txt | 4 +- tests/backend/test_computational_graph.cpp | 39 ++++++--- tests/backend/test_networks.cpp | 79 +++++++++++++++++++ tests/backend/test_training.cpp | 16 ++++ 10 files changed, 204 insertions(+), 72 deletions(-) create mode 100644 tests/backend/test_networks.cpp create mode 100644 tests/backend/test_training.cpp diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp index e57c640..c39297c 100644 --- a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp @@ -21,8 +21,10 @@ vector> LeakyReLuNode::backward(const Tensor& upstreamGrad) { constexpr ftype zero = 0.0; auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + + const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem(upstreamGrad[i] > zero ? 1 : eps, i); + res->setItem((*parent)[i] > zero ? 
1 : eps, i); } return {res}; diff --git a/src/backend/computational_graph/activation_functions/relu_node.cpp b/src/backend/computational_graph/activation_functions/relu_node.cpp index 90d2a9b..43577f1 100644 --- a/src/backend/computational_graph/activation_functions/relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/relu_node.cpp @@ -21,8 +21,10 @@ vector> ReLuNode::backward(const Tensor& upstreamGrad) { constexpr ftype zero = 0.0; auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + + const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem(upstreamGrad[i] > zero ? 1 : zero, i); + res->setItem((*parent)[i] > zero ? 1 : zero, i); } return {res}; diff --git a/src/backend/layers/ff_layer.cpp b/src/backend/layers/ff_layer.cpp index 5d485ec..8fd1639 100644 --- a/src/backend/layers/ff_layer.cpp +++ b/src/backend/layers/ff_layer.cpp @@ -11,6 +11,7 @@ #include "ff_layer.h" #include "activation_functions/activation_function_base.h" +#include "data_modeling/tensor_functions.h" #include "computational_graph/tensor_ops/graph_creation.h" @@ -21,33 +22,37 @@ using namespace std; using namespace layers; FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGrad) - : FfLayer(dims, Tensor::getDefaultDevice(), requiresGrad) {} + : FfLayer(dims, Tensor::getDefaultDevice(), useBias, requiresGrad) {} /** * @brief Construct a new Ff Layer:: Ff Layer object - * Assumption for dims: (batch-size, ..., n_rows, n_cols). + * Assumption for dims: (in-size, out-size) * @param dims Dimensions, see above. * @param d The device. * @param useBias Use a bias if true. Bias will receiver shape (n_rows) * @param requiresGrad If true train this layer. 
*/ FfLayer::FfLayer(const vector& dims, Device d, bool useBias, bool requiresGrad) - : LayerBase(useBias, requiresGrad) { - weights = make_shared(dims, d, requiresGrad); + : LayerBase(useBias, requiresGrad) { + assert(dims.size()==2); + + weights = make_shared(Dimension({dims[0], dims[1]}), d, requiresGrad); + TensorFunctions::ToGaussian(*weights); - if(useBias && dims.size()<2){ - bias = make_shared(vector{static_cast(1)}, d, requiresGrad); - } - else if(useBias){ - bias = make_shared(vector{dims[dims.size()-2]}, d, requiresGrad); - } + if(useBias){ + bias = make_shared(vector{dims[1]}, d, requiresGrad); + TensorFunctions::ToGaussian(*bias); + } } /** * @brief Normal forward function. Does not build computational graph. + * + * Assumption for input: (b-size, ..., dim1, in-size) */ Tensor FfLayer::forward(const Tensor& input) const { - auto res = *weights * input; + auto res = input.matmul(*weights); + if(useBias){ res = res + *bias; } diff --git a/src/backend/layers/layer_base.cpp b/src/backend/layers/layer_base.cpp index 15dfc03..7e0a59f 100644 --- a/src/backend/layers/layer_base.cpp +++ b/src/backend/layers/layer_base.cpp @@ -22,17 +22,13 @@ void LayerBase::addActivation(shared_ptr f) void LayerBase::print(ostream& os) const noexcept { assert(weights); - - os << "Weigths:\n"; - os << *weights; - + os << "Weigths:\n" << *weights; if(bias){ - os << "Bias:\n"; - os << *bias; + os << "\nBias:\n" << *bias; } } -ostream& operator<<(ostream& os, const LayerBase& l) noexcept { - static_cast(&l)->print(os); // calling vtable +ostream& layers::operator<<(ostream& os, const LayerBase& l) noexcept { + l.print(os); // calling vtable return os; } \ No newline at end of file diff --git a/src/python/py_network/py_network.cpp b/src/python/py_network/py_network.cpp index c1f9ba2..d8c1d06 100644 --- a/src/python/py_network/py_network.cpp +++ b/src/python/py_network/py_network.cpp @@ -13,12 +13,6 @@ #include "python_templates.h" #include "utility/global_params.h" -#include 
"layers/ff_layer.h" - -#include "activation_functions/relu.h" -#include "activation_functions/leaky_relu.h" -#include "activation_functions/softmax.h" - #include "training/loss_functions/bce_loss.h" #include "training/loss_functions/crossentropy_loss.h" @@ -28,61 +22,66 @@ BOOST_PYTHON_MODULE(py_layers) { - using namespace std; - using namespace Py_Util; - using namespace Py_Network; using namespace boost::python; + #define WRAP_METHOD_ONE_TENSORARG(T, method) \ + +[](const T& self, Tensor& t) -> std::shared_ptr { \ + return (self.*method)(t.getSharedPtr()); \ + } + + #define WRAP_METHOD_TWO_TENSORARGS(T, method) \ + +[](const T& self, Tensor& t1, Tensor& t2) -> std::shared_ptr { \ + return (self.*method)(t1.getSharedPtr(), t2.getSharedPtr()); \ + } + // Layers - class_("LayerBase", no_init) + class_("LayerBase", no_init) // attributes .add_property("dims", make_function(&layers::LayerBase::getDims, return_internal_reference<>())) .add_property("weights", make_function(&layers::LayerBase::getWeights)) .add_property("bias", make_function(&layers::LayerBase::getBias)) // methods - .def("forward", pure_virtual(Py_Network::layerforward)) - .def("addActivation", make_function(&layers::LayerBase::addActivation)) - // operators - .def("__str__", &toString) - ; - - class_, boost::noncopyable>("FfLayer", no_init) - .def(init&, optional, optional >()) - .def(init&, Device, optional, optional >()) - .def("forward", &layers::FfLayer::forward) + .def("addActivation", make_function(&layers::LayerBase::addActivation)) ; - // Activation functions - class_("ActivationFunctionBase", no_init) - .def("call", pure_virtual(&ActivationFunctionWrap::operator())) - .def("__str__", &toString) + class_, bases, boost::noncopyable>("FfLayer", no_init) + // init + .def(init&>()) + .def(init&, bool>()) + .def(init&, bool, bool>()) + .def(init&, Device>()) + .def(init&, Device, bool>()) + .def(init&, Device, bool, bool>()) + // methods + .def("forward", WRAP_METHOD_ONE_TENSORARG(layers::FfLayer, 
Py_Network::ffForward)) + // operators + .def("__str__", &toString) ; - class_, bases >("ReLU", init) - .def("call", &activation::ReLu::operator()) + class_, boost::noncopyable>("ReLU") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(activation::ReLu, Py_Network::reluF)) + .def("__str__", &toString) ; - class_, bases >("LeakyReLU", init) - .def("call", &activation::LeakyReLu::operator()) + class_, boost::noncopyable>("LeakyReLU", init()) + .def("__call__", WRAP_METHOD_ONE_TENSORARG(activation::LeakyReLu, Py_Network::leakyReluF)) + .def("__str__", &toString) ; - class_, bases >("Softmax", init) - .def("call", &activation::Softmax::operator()) + class_, boost::noncopyable>("Softmax") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(activation::Softmax, Py_Network::softmaxF)) + .def("__str__", &toString) ; // Loss functions - class_("LossBase", no_init) - .def("call", pure_virtual(&LossWrap::operator())) - ; - - class_("BCE", no_init) - .def("call", pure_virtual(&train::BceLoss::operator())) + class_("BCE") + .def("__call__", &train::BceLoss::operator()) ; - class_("CrossEntropy", no_init) - .def("call", pure_virtual(&train::CrossEntropyLoss::operator())) + class_("CrossEntropy") + .def("__call__", &train::CrossEntropyLoss::operator()) ; // Optimizers diff --git a/src/python/py_network/py_network_util.h b/src/python/py_network/py_network_util.h index 058e91e..faef35f 100644 --- a/src/python/py_network/py_network_util.h +++ b/src/python/py_network/py_network_util.h @@ -16,6 +16,12 @@ #include "training/loss_functions/loss_base.h" #include "training/optimizers/optimizer_base.h" +#include "layers/ff_layer.h" + +#include "activation_functions/relu.h" +#include "activation_functions/leaky_relu.h" +#include "activation_functions/softmax.h" + #include #include #include @@ -33,7 +39,7 @@ namespace Py_Network { * https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html * */ - struct LayerBaseWrap : layers::LayerBase, wrapper { + /* struct 
LayerBaseWrap : layers::LayerBase, wrapper { std::shared_ptr forward(const std::shared_ptr& input) const override { return this->get_override("forward")(input); } @@ -53,8 +59,12 @@ namespace Py_Network { Tensor operator()(const Tensor& y, const Tensor& ypred) const override { return this->get_override("call")(y, ypred); } - }; + }; */ + + inline std::shared_ptr (layers::FfLayer::*ffForward)(const std::shared_ptr&) const = &layers::FfLayer::forward; - inline std::shared_ptr (LayerBaseWrap::*layerforward)(const std::shared_ptr&) const = &LayerBaseWrap::forward; + inline std::shared_ptr (activation::ReLu::*reluF)(const std::shared_ptr&) const = &activation::ReLu::operator(); + inline std::shared_ptr (activation::LeakyReLu::*leakyReluF)(const std::shared_ptr&) const = &activation::LeakyReLu::operator(); + inline std::shared_ptr (activation::Softmax::*softmaxF)(const std::shared_ptr&) const = &activation::Softmax::operator(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3b258b9..d0a409f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,7 +9,9 @@ FetchContent_MakeAvailable(googletest) add_executable(unit_tests_backend backend/test_data_modeling.cpp backend/test_computational_graph.cpp -) + backend/test_networks.cpp + backend/test_training.cpp + ) target_link_libraries(unit_tests_backend PRIVATE gtest_main # pre-built main, avoids boilerplate if no custom initialization needed diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index 3c22843..42adf07 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -16,6 +16,9 @@ #include "computational_graph/tensor_ops/graph_creation.h" +#include "activation_functions/relu.h" +#include "activation_functions/leaky_relu.h" + #include TEST(AutogradTest, ThrowsIfNoGradientSet) { @@ -119,16 +122,34 @@ TEST(AutogradTest, MultiVariateChainRule) { ASSERT_DOUBLE_EQ(y->getGrads()->getItem(1), 1.0); } -/* 
TEST(AutogradTest, ReLU) { - Tensor x({3}, {-1.0, 0.0, 2.0}, true); +TEST(AutogradTest, ReLU) { + auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); + auto relu = activation::ReLu(); + + auto y = relu(x); // [0, 0, 2] + auto loss = graph::sumTensor(y); // loss = 2 - Tensor y = relu(x); // [0, 0, 2] - Tensor loss = sum(y); // loss = 2 + loss->backward(); + + // Gradient: [0, 0, 1] (only where input > 0) + ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->getItem(2), 1.0); +} + +TEST(AutogradTest, LeakyReLU) { + auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); + + constexpr ftype eps = 0.3; + auto relu = activation::LeakyReLu(eps); + + auto y = relu(x); // [0, 0, 2] + auto loss = graph::sumTensor(y); // loss = 2 - loss.backward(); + loss->backward(); // Gradient: [0, 0, 1] (only where input > 0) - EXPECT_NEAR(t.getGrads()->getItem(0), 0.0, 1e-5); - EXPECT_NEAR(t.getGrads()->getItem(1), 0.0, 1e-5); - EXPECT_NEAR(t.getGrads()->getItem(2), 1.0, 1e-5); -} */ \ No newline at end of file + ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), eps); + ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), eps); // by convention + ASSERT_DOUBLE_EQ(x->getGrads()->getItem(2), 1.0); +} \ No newline at end of file diff --git a/tests/backend/test_networks.cpp b/tests/backend/test_networks.cpp new file mode 100644 index 0000000..e266894 --- /dev/null +++ b/tests/backend/test_networks.cpp @@ -0,0 +1,79 @@ +/** + * @file test_layers.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-09 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "layers/ff_layer.h" + +#include "activation_functions/relu.h" +#include "activation_functions/leaky_relu.h" +#include "activation_functions/softmax.h" + +#include "data_modeling/tensor_functions.h" + +using namespace layers; +using namespace activation; + 
+TEST(ActivationTest, TestRelu1) { + auto t1 = TensorFunctions::Ones({3, 2}, false); + auto f = ReLu(); + + auto res = f(t1); + + for(size_t i=0; i + +#include "activation_functions/relu.h" +#include "data_modeling/tensor_functions.h" + From 260889419dbfbc6f69419afe5f192be8ed4a0b09 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Tue, 10 Mar 2026 19:44:12 +0100 Subject: [PATCH 06/24] Naive tensor slicing to enable batch-optimization; ground work for optimizers layed down --- src/backend/data_modeling/tensor.cpp | 100 ++++++++++++++++-- src/backend/data_modeling/tensor.h | 10 +- .../training/loss_functions/bce_loss.cpp | 14 +-- .../training/loss_functions/bce_loss.h | 2 +- .../loss_functions/crossentropy_loss.cpp | 16 +-- .../loss_functions/crossentropy_loss.h | 2 +- .../training/loss_functions/loss_base.h | 4 +- .../training/optimizers/optimizer_base.cpp | 39 +++++++ .../training/optimizers/optimizer_base.h | 28 ++++- src/backend/training/optimizers/rmsprop.cpp | 19 ++++ src/backend/training/optimizers/rmsprop.h | 28 +++++ src/backend/training/optimizers/sgd.cpp | 5 +- src/backend/training/optimizers/sgd.h | 11 +- 13 files changed, 241 insertions(+), 37 deletions(-) create mode 100644 src/backend/training/optimizers/optimizer_base.cpp create mode 100644 src/backend/training/optimizers/rmsprop.cpp create mode 100644 src/backend/training/optimizers/rmsprop.h diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index d85b3ed..ab4bec1 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -60,18 +60,63 @@ Tensor::tensorValues_t::~tensorValues_t() noexcept { * do not create a deepcopy, but construct another pointer pointing to the same piece * of memory. 
*/ -void Tensor::tensorValues_t::copyValues(Tensor::tensorValues_t& target, - const Tensor::tensorValues_t& origin) { - assert(origin.device==target.device && origin.size==target.size); +void Tensor::tensorValues_t::copyValues(Tensor::tensorValues_t& target) const { + assert(device==target.device && size==target.size); - switch(origin.device){ + switch(device){ case Device::CPU: - for(tensorSize_t i=0; i= high - low); + + switch(device){ + case Device::CPU: + for(tensorSize_t i=0; i indices, + const tensorSize_t sizeOfDim) const { + assert(target.size >= sizeOfDim * indices.size()); + + switch(device){ + case Device::CPU: { + tensorSize_t targetOffset = 0; + for(tensorDim_t idx: indices){ + tensorSize_t thisOffset = idx * sizeOfDim; + copyValues(target, thisOffset, thisOffset+sizeOfDim, targetOffset); + targetOffset += sizeOfDim; + } + break; + } + case Device::CUDA: + __throw_runtime_error("CUDA not implemented for deep copy"); + break; } } @@ -116,6 +161,7 @@ Tensor::tensorValues_t::operator+=(const Tensor::tensorValues_t& other) { break; case Device::CUDA: __throw_invalid_argument("CUDA not supported yet for += operation"); + break; } return *this; @@ -212,7 +258,7 @@ Tensor Tensor::createDeepCopy() const { assert(!grads || (grads && !grads->requiresGrad)); // gradient should not require gradient auto res = Tensor(dims, values->getDevice(), requiresGrad); - tensorValues_t::copyValues(*res.values, *this->values); + values->copyValues(*res.values); /* if(grads){ res.grads = make_shared( grads->createDeepCopy() ); // TODO: do we want this? @@ -785,6 +831,46 @@ Device Tensor::getDevice() const noexcept { return values->getDevice(); } +/** + * @brief Gets a slice of this tensor. + * + * Quick and dirty implementation for now: Copies and + * returns. + * + * @param low Lower idx, inclusive bound. + * @param high Upper idx, non-inclusive bound. + * @return Tensor The slices tensor. 
+ */ +Tensor Tensor::getSlice(tensorSize_t low, tensorSize_t high) const { + if(high<=low){ + __throw_invalid_argument("Upper bound most be larger than lower bound."); + } + + auto resDims = dims.toVector(); + resDims[0] = high-low; + Tensor res(std::move(resDims), values->getDevice(), false); + values->copyValues(*res.values, low, high, 0); + return res; +} + +/** + * @brief Like overload, but gets the slicing according to the + * indices given by the argument. Used e.g. in batch-size. + * + * @param indices A list of indices + * @return Tensor The result. + */ +Tensor Tensor::getSlice(span indices) const { + assert(indices.size()>0); + + auto resDims = dims.toVector(); + resDims[0] = indices.size(); + + Tensor res(std::move(resDims), values->getDevice(), false); + values->copyValues(*res.values, indices, getDimOffset(0, resDims)); + return res; +} + /** * @brief Prints only sample of up to 2D tensors. */ diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index cb91839..a47778f 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -20,11 +20,12 @@ #include "utility/initializers.h" #include -#include +#include #include #include +#include #include // break circular dependency @@ -94,7 +95,9 @@ class Tensor final : public std::enable_shared_from_this { void setDevice(const Device d) noexcept; Device getDevice() const noexcept; - static void copyValues(tensorValues_t& target, const tensorValues_t& origin); + void copyValues(tensorValues_t& target) const; + void copyValues(tensorValues_t& target, tensorSize_t low, tensorSize_t high, tensorSize_t targetOffset) const; + void copyValues(tensorValues_t& target, std::span indices, const tensorSize_t sizeOfDim) const; static void setDefaultDevice(const Device d) noexcept; static Device getDefaultDevice() noexcept; @@ -255,6 +258,9 @@ class Tensor final : public std::enable_shared_from_this { } } + Tensor getSlice(tensorSize_t low, tensorSize_t high) 
const; + Tensor getSlice(std::span indices) const; + // these two should not be exposed to the python interface static void setDefaultDevice(const Device d) noexcept; static Device getDefaultDevice() noexcept; diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp index 5d42f06..388ed97 100644 --- a/src/backend/training/loss_functions/bce_loss.cpp +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -20,11 +20,13 @@ using namespace train; * @brief Expected shapes: (batch_size) * @return Tensor of shape (1) */ -Tensor BceLoss::operator()(const Tensor& y, const Tensor& ypred) const { - if(y.getDevice() != ypred.getDevice()){ +shared_ptr BceLoss::operator()(const shared_ptr& y, const shared_ptr& ypred) const { + assert(ypred->getRequiresGrad()); + + if(y->getDevice() != ypred->getDevice()){ __throw_invalid_argument("y and ypred must be on same device"); } - else if(y.getDims()!=ypred.getDims()){ + else if(y->getDims()!=ypred->getDims()){ __throw_invalid_argument("Tensors must be of same shape"); } @@ -32,12 +34,12 @@ Tensor BceLoss::operator()(const Tensor& y, const Tensor& ypred) const { return y*log(ypred) + (1-y)*log(1-ypred); }; - const auto nBatches = y.getDims().getItem(0); + const auto nBatches = y->getDims().getItem(0); ftype res = 0; for(tensorSize_t i=0; i(std::vector{1}, std::vector{-res / nBatches}, y->getDevice(), true);; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_loss.h b/src/backend/training/loss_functions/bce_loss.h index cbeeab8..677316a 100644 --- a/src/backend/training/loss_functions/bce_loss.h +++ b/src/backend/training/loss_functions/bce_loss.h @@ -16,6 +16,6 @@ namespace train { class BceLoss final : public LossBase { public: - Tensor operator()(const Tensor& y, const Tensor& ypred) const override; + std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) const override; }; } diff --git 
a/src/backend/training/loss_functions/crossentropy_loss.cpp b/src/backend/training/loss_functions/crossentropy_loss.cpp index e2a5c5a..a27737e 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.cpp +++ b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -20,27 +20,29 @@ using namespace train; * @brief Expected shapes: (batch_size, n_classes) * @return Tensor of shape (1) */ -Tensor CrossEntropyLoss::operator()(const Tensor& y, const Tensor& ypred) const { - if(y.getDevice() != ypred.getDevice()){ +shared_ptr CrossEntropyLoss::operator()(const shared_ptr & y, const shared_ptr & ypred) const { + assert(ypred->getRequiresGrad()); + + if(y->getDevice() != ypred->getDevice()){ __throw_invalid_argument("y and ypred must be on same device"); } - else if(y.getDims()!=ypred.getDims()){ + else if(y->getDims()!=ypred->getDims()){ __throw_invalid_argument("Tensors must be of same shape"); } auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; - for(tensorDim_t i=0; igetDims().getItem(-1); i++){ + res += y->getItem(b, i) * log(ypred->getItem(b, i)); } return res; }; - const auto nBatches = y.getDims().getItem(0); + const auto nBatches = y->getDims().getItem(0); ftype res = 0; for(tensorSize_t b=0; b(std::vector{1}, std::vector{-res / nBatches}, y->getDevice(), true);; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/crossentropy_loss.h b/src/backend/training/loss_functions/crossentropy_loss.h index 3655637..f838a53 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.h +++ b/src/backend/training/loss_functions/crossentropy_loss.h @@ -16,6 +16,6 @@ namespace train { class CrossEntropyLoss final : public LossBase { public: - Tensor operator()(const Tensor& y, const Tensor& ypred) const override; + std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) const override; }; } diff --git a/src/backend/training/loss_functions/loss_base.h 
b/src/backend/training/loss_functions/loss_base.h index f196ca8..6bdf1a9 100644 --- a/src/backend/training/loss_functions/loss_base.h +++ b/src/backend/training/loss_functions/loss_base.h @@ -13,6 +13,8 @@ #include "data_modeling/tensor.h" +#include + namespace train { class LossBase { public: @@ -26,6 +28,6 @@ namespace train { ~LossBase() noexcept = default; - virtual Tensor operator()(const Tensor& y, const Tensor& ypred) const = 0; + virtual std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) const = 0; }; } diff --git a/src/backend/training/optimizers/optimizer_base.cpp b/src/backend/training/optimizers/optimizer_base.cpp new file mode 100644 index 0000000..cfd9844 --- /dev/null +++ b/src/backend/training/optimizers/optimizer_base.cpp @@ -0,0 +1,39 @@ +/** + * @file optimizer_base.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-10 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "optimizer_base.h" + +#include + +#include +#include + +using namespace std; +using namespace train; + +void OptimizerBase::run(shared_ptr& x, shared_ptr& y, const bool shuffle) { + for(size_t e=0; e indices(bsize); + std::iota(indices.begin(), indices.end(), 0); + + if(shuffle){ + std::random_shuffle(indices.begin(), indices.end()); + } + + const auto nSamples = x->getDims().getItem(0); + tensorDim_t low = 0; + while(low < nSamples){ + std::span batchSpan(indices.data() + low, low+bsize < nSamples ? 
bsize : nSamples-low); + step(make_shared(x->getSlice(batchSpan)), make_shared(y->getSlice(batchSpan))); + low += bsize; + } + } +} \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index f755730..5edca83 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -11,10 +11,30 @@ #pragma once +#include "data_modeling/tensor.h" +#include "training/loss_functions/loss_base.h" + +#include +#include + namespace train { class OptimizerBase { + protected: + ftype lr; + + const size_t epochs; + const tensorDim_t bsize; + + std::shared_ptr loss; + std::vector< std::shared_ptr > params; + + virtual void step(std::shared_ptr x, std::shared_ptr y) = 0; + public: - OptimizerBase() = default; + OptimizerBase(std::vector< std::shared_ptr >& params, std::shared_ptr loss, + ftype lr, size_t epochs, tensorDim_t bsize) + : params{std::move(params)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} {}; + ~OptimizerBase() noexcept = default; OptimizerBase(const OptimizerBase& other) = delete; @@ -22,7 +42,7 @@ namespace train { OptimizerBase(OptimizerBase&& other) noexcept = default; OptimizerBase& operator=(OptimizerBase&& other) noexcept = default; - - virtual void step() = 0; -}; + + void run(std::shared_ptr& x, std::shared_ptr& y, const bool shuffle); + }; } \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp new file mode 100644 index 0000000..feff48e --- /dev/null +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -0,0 +1,19 @@ +/** + * @file rmsprop.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-10 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "rmsprop.h" + +using namespace std; +using namespace train; + +void RmsPropOptimizer::step(shared_ptr x, shared_ptr y) { + // TODO: 
implement +} \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.h b/src/backend/training/optimizers/rmsprop.h new file mode 100644 index 0000000..8877174 --- /dev/null +++ b/src/backend/training/optimizers/rmsprop.h @@ -0,0 +1,28 @@ +/** + * @file rmsprop.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-10 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "optimizer_base.h" + +#include "utility/global_params.h" + +namespace train { + class RmsPropOptimizer final : public OptimizerBase { + private: + void step(std::shared_ptr x, std::shared_ptr y) override; + + public: + RmsPropOptimizer(std::vector< std::shared_ptr >& params, + std::shared_ptr loss, ftype lr, size_t epochs, tensorDim_t bsize) + : OptimizerBase(params, loss, lr, epochs, bsize) { } + + // TODO: print + }; +} \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.cpp b/src/backend/training/optimizers/sgd.cpp index 46ba31d..752b60a 100644 --- a/src/backend/training/optimizers/sgd.cpp +++ b/src/backend/training/optimizers/sgd.cpp @@ -11,8 +11,9 @@ #include "sgd.h" +using namespace std; using namespace train; -void SgdOptimizer::step() { - // TODO: implement +void SgdOptimizer::step(shared_ptr x, shared_ptr y) { + } \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.h b/src/backend/training/optimizers/sgd.h index e7a615c..a6bd463 100644 --- a/src/backend/training/optimizers/sgd.h +++ b/src/backend/training/optimizers/sgd.h @@ -16,13 +16,12 @@ namespace train { class SgdOptimizer final : public OptimizerBase { private: - const ftype lr; - - public: - SgdOptimizer(ftype lr) : lr{lr} - { } + void step(std::shared_ptr x, std::shared_ptr y) override; - void step() override; + public: + SgdOptimizer(std::vector< std::shared_ptr >& params, + std::shared_ptr loss, ftype lr, size_t epochs, tensorDim_t bsize) + : OptimizerBase(params, loss, lr, epochs, bsize) { } // 
TODO: print }; From ba0ad22919ef1e35bf442d2c3675f39806ca3e98 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Wed, 11 Mar 2026 21:24:48 +0100 Subject: [PATCH 07/24] Implemented SGD, trainer --- src/backend/CMakeLists.txt | 2 +- src/backend/data_modeling/tensor.cpp | 2 +- src/backend/data_modeling/tensor.h | 2 +- src/backend/networks/sequential.cpp | 20 +++++++- src/backend/networks/sequential.h | 1 + .../training/loss_functions/bce_loss.cpp | 12 ++--- .../training/loss_functions/bce_loss.h | 2 +- .../loss_functions/crossentropy_loss.cpp | 14 ++--- .../loss_functions/crossentropy_loss.h | 2 +- .../training/loss_functions/loss_base.h | 8 ++- .../training/optimizers/optimizer_base.h | 14 ++--- src/backend/training/optimizers/rmsprop.cpp | 4 +- src/backend/training/optimizers/rmsprop.h | 10 ++-- src/backend/training/optimizers/sgd.cpp | 10 +++- src/backend/training/optimizers/sgd.h | 8 ++- .../base_trainer.cpp} | 19 +++++-- src/backend/training/trainers/base_trainer.h | 51 +++++++++++++++++++ src/python/py_network/py_network.cpp | 14 ++++- 18 files changed, 140 insertions(+), 55 deletions(-) rename src/backend/training/{optimizers/optimizer_base.cpp => trainers/base_trainer.cpp} (63%) create mode 100644 src/backend/training/trainers/base_trainer.h diff --git a/src/backend/CMakeLists.txt b/src/backend/CMakeLists.txt index 3e40b12..a00f2be 100644 --- a/src/backend/CMakeLists.txt +++ b/src/backend/CMakeLists.txt @@ -3,7 +3,7 @@ file(GLOB_RECURSE CORE_SOURCES computational_graph/*.cpp data_modeling/*.cpp layers/*.cpp - #networks/*.cpp + networks/*.cpp system/*.cpp training/*.cpp utility/*.cpp diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index ab4bec1..633ebee 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -584,7 +584,7 @@ void Tensor::backward() { /** * @brief Get gradients */ -shared_ptr Tensor::getGrads() const { +shared_ptr Tensor::getGrads() const { if(!grads){ 
__throw_runtime_error("Tensor has no gradients."); } diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index a47778f..1a61aed 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -206,7 +206,7 @@ class Tensor final : public std::enable_shared_from_this { void backward(); bool hasGrads() const noexcept { return grads!=nullptr; } - std::shared_ptr getGrads() const; + std::shared_ptr getGrads() const; void transposeThis() noexcept; void transposeThis(int dim1, int dim2) noexcept; diff --git a/src/backend/networks/sequential.cpp b/src/backend/networks/sequential.cpp index b89d1fa..7dbfa21 100644 --- a/src/backend/networks/sequential.cpp +++ b/src/backend/networks/sequential.cpp @@ -34,7 +34,7 @@ Tensor SequentialNetwork::forward(const Tensor& input) const { __throw_invalid_argument("Network empy, cannot be called."); } - Tensor x = layers[0]->forward(input); + auto x = layers[0]->forward(input); for(int i=1; iforward(x); } @@ -42,6 +42,24 @@ Tensor SequentialNetwork::forward(const Tensor& input) const { return x; } +std::shared_ptr SequentialNetwork::forward(const std::shared_ptr& input) const { + if(input->getDims().getItem(-1) != layers[0]->getDims().getItem(-2)){ + __throw_invalid_argument("Input tensor has invalid dimension."); + } + + if(layers.size()==0){ + __throw_invalid_argument("Network empy, cannot be called."); + } + + auto x = layers[0]->forward(input); + for(int i=1; iforward(x); + } + + return x; +} + + void SequentialNetwork::append(shared_ptr l) { if(!assertDims(*l)){ __throw_invalid_argument("Dimensions of tensors don't fit."); diff --git a/src/backend/networks/sequential.h b/src/backend/networks/sequential.h index 646e5f5..db53f65 100644 --- a/src/backend/networks/sequential.h +++ b/src/backend/networks/sequential.h @@ -30,4 +30,5 @@ class SequentialNetwork { SequentialNetwork() = default; Tensor forward(const Tensor& input) const; + std::shared_ptr forward(const 
std::shared_ptr& input) const; }; \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp index 388ed97..bdce4e5 100644 --- a/src/backend/training/loss_functions/bce_loss.cpp +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -20,13 +20,13 @@ using namespace train; * @brief Expected shapes: (batch_size) * @return Tensor of shape (1) */ -shared_ptr BceLoss::operator()(const shared_ptr& y, const shared_ptr& ypred) const { +shared_ptr BceLoss::operator()(const Tensor& y, const shared_ptr& ypred) const { assert(ypred->getRequiresGrad()); - if(y->getDevice() != ypred->getDevice()){ + if(y.getDevice() != ypred->getDevice()){ __throw_invalid_argument("y and ypred must be on same device"); } - else if(y->getDims()!=ypred->getDims()){ + else if(y.getDims()!=ypred->getDims()){ __throw_invalid_argument("Tensors must be of same shape"); } @@ -34,12 +34,12 @@ shared_ptr BceLoss::operator()(const shared_ptr& y, const shared return y*log(ypred) + (1-y)*log(1-ypred); }; - const auto nBatches = y->getDims().getItem(0); + const auto nBatches = y.getDims().getItem(0); ftype res = 0; for(tensorSize_t i=0; i(std::vector{1}, std::vector{-res / nBatches}, y->getDevice(), true);; + return make_shared(std::vector{1}, std::vector{-res / nBatches}, y.getDevice(), true);; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_loss.h b/src/backend/training/loss_functions/bce_loss.h index 677316a..7ee06dd 100644 --- a/src/backend/training/loss_functions/bce_loss.h +++ b/src/backend/training/loss_functions/bce_loss.h @@ -16,6 +16,6 @@ namespace train { class BceLoss final : public LossBase { public: - std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) const override; + std::shared_ptr operator()(const Tensor& y, const std::shared_ptr& ypred) const override; }; } diff --git a/src/backend/training/loss_functions/crossentropy_loss.cpp 
b/src/backend/training/loss_functions/crossentropy_loss.cpp index a27737e..1ef3a27 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.cpp +++ b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -20,29 +20,29 @@ using namespace train; * @brief Expected shapes: (batch_size, n_classes) * @return Tensor of shape (1) */ -shared_ptr CrossEntropyLoss::operator()(const shared_ptr & y, const shared_ptr & ypred) const { +shared_ptr CrossEntropyLoss::operator()(const Tensor& y, const shared_ptr & ypred) const { assert(ypred->getRequiresGrad()); - if(y->getDevice() != ypred->getDevice()){ + if(y.getDevice() != ypred->getDevice()){ __throw_invalid_argument("y and ypred must be on same device"); } - else if(y->getDims()!=ypred->getDims()){ + else if(y.getDims()!=ypred->getDims()){ __throw_invalid_argument("Tensors must be of same shape"); } auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; - for(tensorDim_t i=0; igetDims().getItem(-1); i++){ - res += y->getItem(b, i) * log(ypred->getItem(b, i)); + for(tensorDim_t i=0; igetItem(b, i)); } return res; }; - const auto nBatches = y->getDims().getItem(0); + const auto nBatches = y.getDims().getItem(0); ftype res = 0; for(tensorSize_t b=0; b(std::vector{1}, std::vector{-res / nBatches}, y->getDevice(), true);; + return make_shared(std::vector{1}, std::vector{-res / nBatches}, y.getDevice(), true);; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/crossentropy_loss.h b/src/backend/training/loss_functions/crossentropy_loss.h index f838a53..b91e037 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.h +++ b/src/backend/training/loss_functions/crossentropy_loss.h @@ -16,6 +16,6 @@ namespace train { class CrossEntropyLoss final : public LossBase { public: - std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) const override; + std::shared_ptr operator()(const Tensor& y, const std::shared_ptr& ypred) const override; }; } diff --git 
a/src/backend/training/loss_functions/loss_base.h b/src/backend/training/loss_functions/loss_base.h index 6bdf1a9..ef9ce5e 100644 --- a/src/backend/training/loss_functions/loss_base.h +++ b/src/backend/training/loss_functions/loss_base.h @@ -27,7 +27,11 @@ namespace train { LossBase& operator=(LossBase&& other) noexcept = default; ~LossBase() noexcept = default; - - virtual std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) const = 0; + + virtual std::shared_ptr operator()(const Tensor& y, const std::shared_ptr& ypred) const = 0; + + std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) { + return operator()(*y, ypred); + } }; } diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index 5edca83..16b0581 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -21,19 +21,11 @@ namespace train { class OptimizerBase { protected: ftype lr; - - const size_t epochs; - const tensorDim_t bsize; - - std::shared_ptr loss; std::vector< std::shared_ptr > params; - virtual void step(std::shared_ptr x, std::shared_ptr y) = 0; - public: - OptimizerBase(std::vector< std::shared_ptr >& params, std::shared_ptr loss, - ftype lr, size_t epochs, tensorDim_t bsize) - : params{std::move(params)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} {}; + OptimizerBase(std::vector< std::shared_ptr > params, ftype lr) + : params{std::move(params)}, lr{lr} {}; ~OptimizerBase() noexcept = default; @@ -43,6 +35,6 @@ namespace train { OptimizerBase(OptimizerBase&& other) noexcept = default; OptimizerBase& operator=(OptimizerBase&& other) noexcept = default; - void run(std::shared_ptr& x, std::shared_ptr& y, const bool shuffle); + virtual void step() = 0; }; } \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp index feff48e..096415a 100644 
--- a/src/backend/training/optimizers/rmsprop.cpp +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -14,6 +14,6 @@ using namespace std; using namespace train; -void RmsPropOptimizer::step(shared_ptr x, shared_ptr y) { - // TODO: implement +void RmsPropOptimizer::step() { + __throw_runtime_error("Not implemented yet"); } \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.h b/src/backend/training/optimizers/rmsprop.h index 8877174..7d58f4f 100644 --- a/src/backend/training/optimizers/rmsprop.h +++ b/src/backend/training/optimizers/rmsprop.h @@ -15,14 +15,10 @@ namespace train { class RmsPropOptimizer final : public OptimizerBase { - private: - void step(std::shared_ptr x, std::shared_ptr y) override; - public: - RmsPropOptimizer(std::vector< std::shared_ptr >& params, - std::shared_ptr loss, ftype lr, size_t epochs, tensorDim_t bsize) - : OptimizerBase(params, loss, lr, epochs, bsize) { } + RmsPropOptimizer(std::vector< std::shared_ptr > params, ftype lr) + : OptimizerBase(std::move(params), lr) { } - // TODO: print + void step() override; }; } \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.cpp b/src/backend/training/optimizers/sgd.cpp index 752b60a..03cdac0 100644 --- a/src/backend/training/optimizers/sgd.cpp +++ b/src/backend/training/optimizers/sgd.cpp @@ -14,6 +14,12 @@ using namespace std; using namespace train; -void SgdOptimizer::step(shared_ptr x, shared_ptr y) { - +void SgdOptimizer::step() { + for(auto& t: params){ + auto grads = t->getGrads(); + for(auto idx=0; idxgetSize(); idx++){ + auto updatedWeight = (*t)[idx] - lr*(*grads)[idx]; + t->setItem(updatedWeight, idx); + } + } } \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.h b/src/backend/training/optimizers/sgd.h index a6bd463..7eb6f9b 100644 --- a/src/backend/training/optimizers/sgd.h +++ b/src/backend/training/optimizers/sgd.h @@ -16,13 +16,11 @@ namespace train { class SgdOptimizer final : public OptimizerBase 
{ private: - void step(std::shared_ptr x, std::shared_ptr y) override; public: - SgdOptimizer(std::vector< std::shared_ptr >& params, - std::shared_ptr loss, ftype lr, size_t epochs, tensorDim_t bsize) - : OptimizerBase(params, loss, lr, epochs, bsize) { } + SgdOptimizer(std::vector< std::shared_ptr > params, ftype lr) + : OptimizerBase(std::move(params), lr) { } - // TODO: print + void step() override; }; } \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.cpp b/src/backend/training/trainers/base_trainer.cpp similarity index 63% rename from src/backend/training/optimizers/optimizer_base.cpp rename to src/backend/training/trainers/base_trainer.cpp index cfd9844..f9e8edb 100644 --- a/src/backend/training/optimizers/optimizer_base.cpp +++ b/src/backend/training/trainers/base_trainer.cpp @@ -1,15 +1,15 @@ /** - * @file optimizer_base.cpp + * @file base_trainer.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-03-10 + * @date 2026-03-11 * * @copyright Copyright (c) 2026 * */ -#include "optimizer_base.h" +#include "base_trainer.h" #include @@ -19,7 +19,7 @@ using namespace std; using namespace train; -void OptimizerBase::run(shared_ptr& x, shared_ptr& y, const bool shuffle) { +void BaseTrainer::run(shared_ptr& x, shared_ptr& y, const bool shuffle) { for(size_t e=0; e indices(bsize); std::iota(indices.begin(), indices.end(), 0); @@ -32,7 +32,16 @@ void OptimizerBase::run(shared_ptr& x, shared_ptr& y, const bool tensorDim_t low = 0; while(low < nSamples){ std::span batchSpan(indices.data() + low, low+bsize < nSamples ? 
bsize : nSamples-low); - step(make_shared(x->getSlice(batchSpan)), make_shared(y->getSlice(batchSpan))); + + auto xBatch = make_shared(x->getSlice(batchSpan)); + auto yBatch = y->getSlice(batchSpan); + + auto yPred = network->forward(xBatch); + auto l = (*loss)(yBatch, yPred); + + l->backward(); + optim->step(); + low += bsize; } } diff --git a/src/backend/training/trainers/base_trainer.h b/src/backend/training/trainers/base_trainer.h new file mode 100644 index 0000000..a23da08 --- /dev/null +++ b/src/backend/training/trainers/base_trainer.h @@ -0,0 +1,51 @@ +/** + * @file train_mode.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-11 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "training/optimizers/optimizer_base.h" +#include "training/loss_functions/loss_base.h" + +#include "data_modeling/tensor.h" +#include "training/loss_functions/loss_base.h" +#include "networks/sequential.h" + +#include +#include + +namespace train { + class BaseTrainer { + protected: + ftype lr; + + const size_t epochs; + const tensorDim_t bsize; + + std::shared_ptr loss; + std::shared_ptr optim; + std::shared_ptr network; + + public: + BaseTrainer(std::shared_ptr& network, std::shared_ptr loss, + std::shared_ptr optim, ftype lr, size_t epochs, tensorDim_t bsize) + : network{std::move(network)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} {}; + + ~BaseTrainer() noexcept = default; + + BaseTrainer(const BaseTrainer& other) = delete; + BaseTrainer& operator=(const BaseTrainer& other) = delete; + + BaseTrainer(BaseTrainer&& other) noexcept = default; + BaseTrainer& operator=(BaseTrainer&& other) noexcept = default; + + void run(std::shared_ptr& x, std::shared_ptr& y, const bool shuffle); + }; +} \ No newline at end of file diff --git a/src/python/py_network/py_network.cpp b/src/python/py_network/py_network.cpp index d8c1d06..57bd91f 100644 --- 
a/src/python/py_network/py_network.cpp +++ b/src/python/py_network/py_network.cpp @@ -36,6 +36,9 @@ BOOST_PYTHON_MODULE(py_layers) return (self.*method)(t1.getSharedPtr(), t2.getSharedPtr()); \ } + // Networks + // TODO + // Layers class_("LayerBase", no_init) // attributes @@ -76,15 +79,22 @@ BOOST_PYTHON_MODULE(py_layers) ; // Loss functions - class_("BCE") + class_, boost::noncopyable>("BCE") .def("__call__", &train::BceLoss::operator()) ; - class_("CrossEntropy") + class_, boost::noncopyable>("CrossEntropy") .def("__call__", &train::CrossEntropyLoss::operator()) ; // Optimizers + class_, boost::noncopyable>("SGD", no_init) + .def(init >, ftype>()) + .def("step", &train::SgdOptimizer::step) + ; + + // Trainers + // TODO } /* From b54eab4887af0de7871508023bd361cc6300ec26 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Fri, 13 Mar 2026 12:11:55 +0100 Subject: [PATCH 08/24] RmsProp --- src/backend/layers/layer_base.h | 5 ++++ src/backend/networks/sequential.cpp | 12 ++++++++ src/backend/networks/sequential.h | 2 ++ .../training/optimizers/optimizer_base.h | 4 +-- src/backend/training/optimizers/rmsprop.cpp | 30 ++++++++++++++++++- src/backend/training/optimizers/rmsprop.h | 16 ++++++++-- .../{base_trainer.cpp => base_train_loop.cpp} | 6 ++-- .../{base_trainer.h => base_train_loop.h} | 18 +++++------ src/python/py_utility/custom_converters.h | 24 +++++++++++++++ 9 files changed, 99 insertions(+), 18 deletions(-) rename src/backend/training/trainers/{base_trainer.cpp => base_train_loop.cpp} (86%) rename src/backend/training/trainers/{base_trainer.h => base_train_loop.h} (67%) diff --git a/src/backend/layers/layer_base.h b/src/backend/layers/layer_base.h index 65a5428..06c51d9 100644 --- a/src/backend/layers/layer_base.h +++ b/src/backend/layers/layer_base.h @@ -17,6 +17,7 @@ #include #include +#include #include @@ -58,6 +59,10 @@ namespace layers { auto getWeights() const noexcept { return weights; } auto getBias() const noexcept { return bias; } + std::pair, 
std::shared_ptr>getParams() const { + return std::make_pair(weights, bias); + } + virtual void print(std::ostream& os) const noexcept; friend std::ostream& operator<<(std::ostream& os, const LayerBase& t) noexcept; }; diff --git a/src/backend/networks/sequential.cpp b/src/backend/networks/sequential.cpp index 7dbfa21..7f1bb5b 100644 --- a/src/backend/networks/sequential.cpp +++ b/src/backend/networks/sequential.cpp @@ -59,6 +59,18 @@ std::shared_ptr SequentialNetwork::forward(const std::shared_ptr return x; } +std::vector> SequentialNetwork::getParams() const { + std::vector> res; + res.reserve(layers.size()*2); + + for(const auto& layer: layers){ + auto [weigths, bias] = layer->getParams(); + res.push_back(std::move(weights)); + res.push_back(std::move(bias)); + } + + return res; +} void SequentialNetwork::append(shared_ptr l) { if(!assertDims(*l)){ diff --git a/src/backend/networks/sequential.h b/src/backend/networks/sequential.h index db53f65..bf1c09a 100644 --- a/src/backend/networks/sequential.h +++ b/src/backend/networks/sequential.h @@ -29,6 +29,8 @@ class SequentialNetwork { public: SequentialNetwork() = default; + std::vector> getParams() const; + Tensor forward(const Tensor& input) const; std::shared_ptr forward(const std::shared_ptr& input) const; }; \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index 16b0581..a9545c5 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -20,8 +20,8 @@ namespace train { class OptimizerBase { protected: - ftype lr; - std::vector< std::shared_ptr > params; + const ftype lr; + const std::vector< std::shared_ptr > params; public: OptimizerBase(std::vector< std::shared_ptr > params, ftype lr) diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp index 096415a..8e346ec 100644 --- 
a/src/backend/training/optimizers/rmsprop.cpp +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -15,5 +15,33 @@ using namespace std; using namespace train; void RmsPropOptimizer::step() { - __throw_runtime_error("Not implemented yet"); + constexpr ftype eps = 1e-6; + for(const auto& param: params){ + auto tPtr = param.get(); + const auto gPtr = tPtr->getGrads().get(); + auto vPtr = movingAvg[tPtr].get(); + + // update moving avg + if(vPtr!=nullptr) { // hot path + for(tensorSize_t i=0; igetSize(); i++){ + auto g = (*gPtr)[i]; + auto update = decay * (*vPtr)[i] + (1-decay)*g*g; + vPtr->setItem(update, i); + } + } + else { // init loop + movingAvg[tPtr] = make_unique(tPtr->getDims(), tPtr->getDevice(), false); // create empty tensor + vPtr = movingAvg[tPtr].get(); + for(tensorSize_t i=0; igetSize(); i++) { + auto g = (*tPtr)[i]; + vPtr->setItem((1-decay)*g*g, i); + } + } + + // update gradients + for(tensorSize_t i=0; igetSize(); i++) { + auto update = (*tPtr)[i] - lr * (*gPtr)[i] / ((*vPtr)[i] + eps); + tPtr->setItem(update, i); + } + } } \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.h b/src/backend/training/optimizers/rmsprop.h index 7d58f4f..967c18e 100644 --- a/src/backend/training/optimizers/rmsprop.h +++ b/src/backend/training/optimizers/rmsprop.h @@ -10,14 +10,24 @@ */ #include "optimizer_base.h" - #include "utility/global_params.h" +#include + namespace train { class RmsPropOptimizer final : public OptimizerBase { + private: + const ftype decay; + std::unordered_map> movingAvg; + public: - RmsPropOptimizer(std::vector< std::shared_ptr > params, ftype lr) - : OptimizerBase(std::move(params), lr) { } + RmsPropOptimizer(std::vector< std::shared_ptr > params, ftype lr, ftype decay) + : OptimizerBase(std::move(params), lr), decay{decay} + { + for(const auto& param: params) { + movingAvg[param.get()] = nullptr; // lazy initialization + } + } void step() override; }; diff --git a/src/backend/training/trainers/base_trainer.cpp 
b/src/backend/training/trainers/base_train_loop.cpp similarity index 86% rename from src/backend/training/trainers/base_trainer.cpp rename to src/backend/training/trainers/base_train_loop.cpp index f9e8edb..66a5ca2 100644 --- a/src/backend/training/trainers/base_trainer.cpp +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -1,5 +1,5 @@ /** - * @file base_trainer.cpp + * @file base_train_loop.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -9,7 +9,7 @@ * */ -#include "base_trainer.h" +#include "base_train_loop.h" #include @@ -19,7 +19,7 @@ using namespace std; using namespace train; -void BaseTrainer::run(shared_ptr& x, shared_ptr& y, const bool shuffle) { +void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool shuffle) { for(size_t e=0; e indices(bsize); std::iota(indices.begin(), indices.end(), 0); diff --git a/src/backend/training/trainers/base_trainer.h b/src/backend/training/trainers/base_train_loop.h similarity index 67% rename from src/backend/training/trainers/base_trainer.h rename to src/backend/training/trainers/base_train_loop.h index a23da08..363395e 100644 --- a/src/backend/training/trainers/base_trainer.h +++ b/src/backend/training/trainers/base_train_loop.h @@ -1,9 +1,9 @@ /** - * @file train_mode.h + * @file base_train_loop.h * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-03-11 + * @date 2026-03-13 * * @copyright Copyright (c) 2026 * @@ -22,7 +22,7 @@ #include namespace train { - class BaseTrainer { + class BaseTrainLoop { protected: ftype lr; @@ -34,17 +34,17 @@ namespace train { std::shared_ptr network; public: - BaseTrainer(std::shared_ptr& network, std::shared_ptr loss, + BaseTrainLoop(std::shared_ptr& network, std::shared_ptr loss, std::shared_ptr optim, ftype lr, size_t epochs, tensorDim_t bsize) : network{std::move(network)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} {}; - ~BaseTrainer() noexcept = 
default; + ~BaseTrainLoop() noexcept = default; - BaseTrainer(const BaseTrainer& other) = delete; - BaseTrainer& operator=(const BaseTrainer& other) = delete; + BaseTrainLoop(const BaseTrainLoop& other) = delete; + BaseTrainLoop& operator=(const BaseTrainLoop& other) = delete; - BaseTrainer(BaseTrainer&& other) noexcept = default; - BaseTrainer& operator=(BaseTrainer&& other) noexcept = default; + BaseTrainLoop(BaseTrainLoop&& other) noexcept = default; + BaseTrainLoop& operator=(BaseTrainLoop&& other) noexcept = default; void run(std::shared_ptr& x, std::shared_ptr& y, const bool shuffle); }; diff --git a/src/python/py_utility/custom_converters.h b/src/python/py_utility/custom_converters.h index 5114d7f..4c2d49d 100644 --- a/src/python/py_utility/custom_converters.h +++ b/src/python/py_utility/custom_converters.h @@ -52,6 +52,30 @@ namespace custom_converters { }; } +/* struct DimsFromPython { + static void* convertible(PyObject* obj) { + if (!PyTuple_Check(obj) && !PyList_Check(obj)) return nullptr; + return obj; + } + + static void construct(PyObject* obj, + bp::converter::rvalue_from_python_stage1_data* data) { + void* storage = ((bp::converter::rvalue_from_python_object_data*)data)->storage.bytes; + Dims* dims = new (storage) Dims(); + int len = PySequence_Length(obj); + dims->ndim = len; + for (int i = 0; i < len; i++) + dims->data[i] = bp::extract(PySequence_GetItem(obj, i)); + data->convertible = storage; + } +}; + +// register it in your module init: +bp::converter::registry::push_back( + &DimsFromPython::convertible, + &DimsFromPython::construct, + bp::type_id()); */ + /******************************************************************************************/ /******************************************************************************************/ /******************************************************************************************/ From 476bd5700fe6249c61ea20476a6899543c9bb5bb Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Fri, 13 Mar 
2026 15:40:27 +0100 Subject: [PATCH 09/24] Defining network, layers, and activation functions as module, so that we can cleanly define a network in Python --- python_lib/dl_lib/nn/__init__.py | 1 + python_lib/dl_lib/nn/module.py | 27 ++++++ .../activation_function_base.cpp | 20 ----- .../activation_function_base.h | 41 --------- .../activation_functions/leaky_relu_node.cpp | 2 +- .../activation_functions/leaky_relu_node.h | 2 +- .../activation_functions/relu_node.cpp | 2 +- .../activation_functions/relu_node.h | 2 +- src/backend/computational_graph/graph_node.h | 2 +- .../tensor_ops/add_node.cpp | 2 +- .../computational_graph/tensor_ops/add_node.h | 2 +- .../tensor_ops/elementwise_mul_node.cpp | 2 +- .../tensor_ops/elementwise_mul_node.h | 2 +- .../tensor_ops/getter_node.cpp | 2 +- .../tensor_ops/getter_node.h | 2 +- .../tensor_ops/graph_creation.cpp | 48 +++++------ .../tensor_ops/graph_creation.h | 2 +- .../tensor_ops/matmul_node.cpp | 2 +- .../tensor_ops/matmul_node.h | 2 +- .../tensor_ops/scalar_op_nodes.cpp | 6 +- .../tensor_ops/scalar_op_nodes.h | 2 +- .../computational_graph/topological_sort.cpp | 2 +- .../computational_graph/topological_sort.h | 2 +- src/backend/data_modeling/dim_type.cpp | 30 +++---- src/backend/data_modeling/dim_type.h | 28 +++--- src/backend/data_modeling/tensor.cpp | 2 +- src/backend/data_modeling/tensor.h | 8 +- src/backend/layers/ff_layer.h | 30 ------- src/backend/layers/layer_base.cpp | 34 -------- src/backend/layers/layer_base.h | 69 --------------- .../activation_functions/leaky_relu.cpp | 4 +- .../activation_functions/leaky_relu.h | 6 +- .../activation_functions/relu.cpp | 4 +- .../{ => module}/activation_functions/relu.h | 6 +- .../activation_functions/sigmoid.h | 0 .../activation_functions/softmax.cpp | 4 +- .../activation_functions/softmax.h | 6 +- src/backend/{ => module}/layers/ff_layer.cpp | 41 ++++----- src/backend/module/layers/ff_layer.h | 54 ++++++++++++ src/backend/module/module_base.cpp | 31 +++++++ 
src/backend/module/module_base.h | 49 +++++++++++ src/backend/module/networks/sequential.cpp | 60 +++++++++++++ src/backend/module/networks/sequential.h | 42 +++++++++ src/backend/networks/sequential.cpp | 85 ------------------- src/backend/networks/sequential.h | 36 -------- .../training/optimizers/optimizer_base.h | 9 +- .../training/trainers/base_train_loop.cpp | 7 +- .../training/trainers/base_train_loop.h | 15 ++-- src/python/py_core/py_core.cpp | 2 +- src/python/py_core/py_core_util.h | 22 ++--- src/python/py_network/py_network.cpp | 62 ++++++++------ src/python/py_network/py_network_util.h | 43 ++++------ tests/backend/test_computational_graph.cpp | 36 ++++---- tests/backend/test_networks.cpp | 14 +-- tests/backend/test_training.cpp | 2 +- 55 files changed, 484 insertions(+), 534 deletions(-) create mode 100644 python_lib/dl_lib/nn/module.py delete mode 100644 src/backend/activation_functions/activation_function_base.cpp delete mode 100644 src/backend/activation_functions/activation_function_base.h delete mode 100644 src/backend/layers/ff_layer.h delete mode 100644 src/backend/layers/layer_base.cpp delete mode 100644 src/backend/layers/layer_base.h rename src/backend/{ => module}/activation_functions/leaky_relu.cpp (89%) rename src/backend/{ => module}/activation_functions/leaky_relu.h (81%) rename src/backend/{ => module}/activation_functions/relu.cpp (89%) rename src/backend/{ => module}/activation_functions/relu.h (77%) rename src/backend/{ => module}/activation_functions/sigmoid.h (100%) rename src/backend/{ => module}/activation_functions/softmax.cpp (93%) rename src/backend/{ => module}/activation_functions/softmax.h (75%) rename src/backend/{ => module}/layers/ff_layer.cpp (65%) create mode 100644 src/backend/module/layers/ff_layer.h create mode 100644 src/backend/module/module_base.cpp create mode 100644 src/backend/module/module_base.h create mode 100644 src/backend/module/networks/sequential.cpp create mode 100644 
src/backend/module/networks/sequential.h delete mode 100644 src/backend/networks/sequential.cpp delete mode 100644 src/backend/networks/sequential.h diff --git a/python_lib/dl_lib/nn/__init__.py b/python_lib/dl_lib/nn/__init__.py index 75fefbc..11a7537 100644 --- a/python_lib/dl_lib/nn/__init__.py +++ b/python_lib/dl_lib/nn/__init__.py @@ -1,3 +1,4 @@ +#from module import Module #from .._compiled._layers import FfLayer, ReLU #from .._compiled._core import Tensor # re-export if needed diff --git a/python_lib/dl_lib/nn/module.py b/python_lib/dl_lib/nn/module.py new file mode 100644 index 0000000..2c00b32 --- /dev/null +++ b/python_lib/dl_lib/nn/module.py @@ -0,0 +1,27 @@ +""" +Module base class. We use it to automatically register network +modules when defining graphs via Module. +""" + +class Module: + def __init__(self): + self._modules = {} + + """ + Stores attributes defined in __init__ in private + _modules dictionary + """ + def __setattr__(self, name, value): + if isinstance(value, Module): + self._modules[name] = value + object.__setattr__(self, name, value) + + """ + Returns a list of leaf parameters. Used to identify trainable + nodes of a graph. 
+ """ + def parameters(self): + params = self._own_parameters() # calls C++ side for leaf modules + for module in self._modules.values(): + params.extend(module.parameters()) + return params \ No newline at end of file diff --git a/src/backend/activation_functions/activation_function_base.cpp b/src/backend/activation_functions/activation_function_base.cpp deleted file mode 100644 index 54e1d16..0000000 --- a/src/backend/activation_functions/activation_function_base.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/** - * @file activation_function_base.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-03-08 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "activation_function_base.h" - -using namespace std; -using namespace activation; - -ostream& operator<<(ostream& os, const ActivationFunctionBase& l) noexcept { - static_cast(&l)->print(os); // calling vtable - return os; -} \ No newline at end of file diff --git a/src/backend/activation_functions/activation_function_base.h b/src/backend/activation_functions/activation_function_base.h deleted file mode 100644 index 1a8ea80..0000000 --- a/src/backend/activation_functions/activation_function_base.h +++ /dev/null @@ -1,41 +0,0 @@ -/** - * @file function_base.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "data_modeling/tensor.h" - -#include -#include - -namespace activation { - class ActivationFunctionBase { - public: - ActivationFunctionBase() = default; - - ActivationFunctionBase(const ActivationFunctionBase& other) = delete; - ActivationFunctionBase& operator=(const ActivationFunctionBase& other) = delete; - - ActivationFunctionBase(ActivationFunctionBase&& other) noexcept = default; - ActivationFunctionBase& operator=(ActivationFunctionBase&& other) noexcept = default; - - ~ActivationFunctionBase() noexcept = default; - - 
// creates no graph - virtual Tensor operator()(const Tensor& t) const = 0; - - // greates a graph - virtual std::shared_ptr operator()(const std::shared_ptr& t) const = 0; - - virtual void print(std::ostream& os) const noexcept { }; - friend std::ostream& operator<<(std::ostream& os, const ActivationFunctionBase& t) noexcept; - }; -} diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp index c39297c..2334992 100644 --- a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp @@ -14,7 +14,7 @@ #include using namespace std; -using namespace graph; +using namespace cgraph; vector> LeakyReLuNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.h b/src/backend/computational_graph/activation_functions/leaky_relu_node.h index 076b4d2..1d9304e 100644 --- a/src/backend/computational_graph/activation_functions/leaky_relu_node.h +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.h @@ -15,7 +15,7 @@ #include -namespace graph { +namespace cgraph { class LeakyReLuNode final : public GraphNode { private: const ftype eps; diff --git a/src/backend/computational_graph/activation_functions/relu_node.cpp b/src/backend/computational_graph/activation_functions/relu_node.cpp index 43577f1..015d7ff 100644 --- a/src/backend/computational_graph/activation_functions/relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/relu_node.cpp @@ -14,7 +14,7 @@ #include using namespace std; -using namespace graph; +using namespace cgraph; vector> ReLuNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/activation_functions/relu_node.h 
b/src/backend/computational_graph/activation_functions/relu_node.h index 9c83143..ef17749 100644 --- a/src/backend/computational_graph/activation_functions/relu_node.h +++ b/src/backend/computational_graph/activation_functions/relu_node.h @@ -15,7 +15,7 @@ #include -namespace graph { +namespace cgraph { class ReLuNode final : public GraphNode { public: explicit ReLuNode(std::shared_ptr t) diff --git a/src/backend/computational_graph/graph_node.h b/src/backend/computational_graph/graph_node.h index 9302703..e18518b 100644 --- a/src/backend/computational_graph/graph_node.h +++ b/src/backend/computational_graph/graph_node.h @@ -18,7 +18,7 @@ #include -namespace graph { +namespace cgraph { class GraphNode { protected: std::vector< std::shared_ptr > parents; diff --git a/src/backend/computational_graph/tensor_ops/add_node.cpp b/src/backend/computational_graph/tensor_ops/add_node.cpp index 4f3e072..77cfd0c 100644 --- a/src/backend/computational_graph/tensor_ops/add_node.cpp +++ b/src/backend/computational_graph/tensor_ops/add_node.cpp @@ -14,7 +14,7 @@ #include "data_modeling/tensor_functions.h" using namespace std; -using namespace graph; +using namespace cgraph; vector< shared_ptr > AddNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/tensor_ops/add_node.h b/src/backend/computational_graph/tensor_ops/add_node.h index 4adc443..2402394 100644 --- a/src/backend/computational_graph/tensor_ops/add_node.h +++ b/src/backend/computational_graph/tensor_ops/add_node.h @@ -13,7 +13,7 @@ #include "computational_graph/graph_node.h" -namespace graph { +namespace cgraph { class AddNode final : public GraphNode { private: // if t2 has been a vector we broadcast t2 into t1, see Tensor::add() diff --git a/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp index fcdb130..22d39e7 100644 --- 
a/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp +++ b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp @@ -12,7 +12,7 @@ #include "elementwise_mul_node.h" using namespace std; -using namespace graph; +using namespace cgraph; vector> ElementwiseMulNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h index f2a5344..ffa8038 100644 --- a/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h +++ b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h @@ -13,7 +13,7 @@ #include "computational_graph/graph_node.h" -namespace graph { +namespace cgraph { class ElementwiseMulNode final : public GraphNode { public: explicit ElementwiseMulNode(std::shared_ptr t1, std::shared_ptr t2) diff --git a/src/backend/computational_graph/tensor_ops/getter_node.cpp b/src/backend/computational_graph/tensor_ops/getter_node.cpp index e1a3ac0..f937d91 100644 --- a/src/backend/computational_graph/tensor_ops/getter_node.cpp +++ b/src/backend/computational_graph/tensor_ops/getter_node.cpp @@ -12,7 +12,7 @@ #include "getter_node.h" using namespace std; -using namespace graph; +using namespace cgraph; vector< shared_ptr > GetterNode::backward(const Tensor& upstreamGrad) { // upstreamGrad is scalar by definition diff --git a/src/backend/computational_graph/tensor_ops/getter_node.h b/src/backend/computational_graph/tensor_ops/getter_node.h index 5d359d2..c0bdb45 100644 --- a/src/backend/computational_graph/tensor_ops/getter_node.h +++ b/src/backend/computational_graph/tensor_ops/getter_node.h @@ -16,7 +16,7 @@ #include #include -namespace graph{ +namespace cgraph{ /** * @brief When calling a get function, say as in * loss += myTensor[i], then we need to build a graph in between diff --git a/src/backend/computational_graph/tensor_ops/graph_creation.cpp 
b/src/backend/computational_graph/tensor_ops/graph_creation.cpp index 1955493..130270e 100644 --- a/src/backend/computational_graph/tensor_ops/graph_creation.cpp +++ b/src/backend/computational_graph/tensor_ops/graph_creation.cpp @@ -19,72 +19,72 @@ using namespace std; -shared_ptr graph::mul(const shared_ptr left, const shared_ptr right) { +shared_ptr cgraph::mul(const shared_ptr left, const shared_ptr right) { auto res = make_shared((*left) * (*right)); if(left->getRequiresGrad() || right->getRequiresGrad()){ - res->setCgNode(make_shared(left, right)); + res->setCgNode(make_shared(left, right)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::add(const shared_ptr left, const shared_ptr right) { +shared_ptr cgraph::add(const shared_ptr left, const shared_ptr right) { auto res = make_shared(*left + *right); if(left->getRequiresGrad() || right->getRequiresGrad()){ - res->setCgNode(make_shared(left, right)); + res->setCgNode(make_shared(left, right)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::matmul(const shared_ptr left, const shared_ptr right) { +shared_ptr cgraph::matmul(const shared_ptr left, const shared_ptr right) { auto res = make_shared(left->matmul(*right)); if(left->getRequiresGrad() || right->getRequiresGrad()){ - res->setCgNode(make_shared(left, right)); + res->setCgNode(make_shared(left, right)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::mul(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::mul(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) * scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, scalar)); + res->setCgNode(std::make_shared(t, scalar)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::mul(ftype scalar, const shared_ptr t) { - return graph::mul(t, scalar); +shared_ptr cgraph::mul(ftype scalar, const shared_ptr t) { + return cgraph::mul(t, scalar); } -shared_ptr graph::add(const shared_ptr t, ftype scalar) { 
+shared_ptr cgraph::add(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) + scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t)); + res->setCgNode(std::make_shared(t)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::add(ftype scalar, const shared_ptr t) { - return graph::add(t, scalar); +shared_ptr cgraph::add(ftype scalar, const shared_ptr t) { + return cgraph::add(t, scalar); } -shared_ptr graph::sub(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::sub(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) - scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t)); + res->setCgNode(std::make_shared(t)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::div(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::div(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) / scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, 1 / scalar)); + res->setCgNode(std::make_shared(t, 1 / scalar)); assert(res->getRequiresGrad()); } return res; @@ -97,13 +97,13 @@ shared_ptr graph::div(const shared_ptr t, ftype scalar) { * * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph. */ -shared_ptr graph::get(const shared_ptr& t, tensorSize_t idx) { +shared_ptr cgraph::get(const shared_ptr& t, tensorSize_t idx) { ftype val = t->getItem(idx); auto res = make_shared(std::vector{1}, std::vector{val}, t->getDevice()); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, idx)); + res->setCgNode(std::make_shared(t, idx)); assert(res->getRequiresGrad()); } return res; @@ -115,12 +115,12 @@ shared_ptr graph::get(const shared_ptr& t, tensorSize_t idx) { * * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph. 
*/ -shared_ptr graph::get(const shared_ptr& t, const vector& idx) { +shared_ptr cgraph::get(const shared_ptr& t, const vector& idx) { ftype val = t->getItem(std::move(idx)); auto res = make_shared(std::vector{1}, std::vector{val}, t->getDevice()); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, idx)); + res->setCgNode(std::make_shared(t, idx)); assert(res->getRequiresGrad()); } return res; @@ -129,11 +129,11 @@ shared_ptr graph::get(const shared_ptr& t, const vector graph::sumTensor(const shared_ptr t) { +shared_ptr cgraph::sumTensor(const shared_ptr t) { auto res = make_shared(std::vector{1}, std::vector{0.0}, t->getDevice(), t->getRequiresGrad()); for(tensorSize_t i=0; igetSize(); i++){ - res = graph::add(res, graph::get(t, i)); + res = cgraph::add(res, cgraph::get(t, i)); } return res; } \ No newline at end of file diff --git a/src/backend/computational_graph/tensor_ops/graph_creation.h b/src/backend/computational_graph/tensor_ops/graph_creation.h index f68cb4c..9decd8b 100644 --- a/src/backend/computational_graph/tensor_ops/graph_creation.h +++ b/src/backend/computational_graph/tensor_ops/graph_creation.h @@ -15,7 +15,7 @@ #include -namespace graph { +namespace cgraph { // Artithmetic operations std::shared_ptr mul(const std::shared_ptr left, const std::shared_ptr right); std::shared_ptr mul(const std::shared_ptr left, ftype scalar); diff --git a/src/backend/computational_graph/tensor_ops/matmul_node.cpp b/src/backend/computational_graph/tensor_ops/matmul_node.cpp index fc24fd4..2237026 100644 --- a/src/backend/computational_graph/tensor_ops/matmul_node.cpp +++ b/src/backend/computational_graph/tensor_ops/matmul_node.cpp @@ -12,7 +12,7 @@ #include "matmul_node.h" using namespace std; -using namespace graph; +using namespace cgraph; vector> MatMulNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/tensor_ops/matmul_node.h 
b/src/backend/computational_graph/tensor_ops/matmul_node.h index c7b14e2..6758602 100644 --- a/src/backend/computational_graph/tensor_ops/matmul_node.h +++ b/src/backend/computational_graph/tensor_ops/matmul_node.h @@ -15,7 +15,7 @@ #include -namespace graph { +namespace cgraph { class MatMulNode final : public GraphNode { public: explicit MatMulNode(std::shared_ptr t1, std::shared_ptr t2) diff --git a/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp index 05a3643..ae8d352 100644 --- a/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp +++ b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp @@ -14,14 +14,14 @@ #include using namespace std; -using namespace graph; +using namespace cgraph; -vector> graph::ScalarAddNode::backward(const Tensor& upstreamGrad) { +vector> cgraph::ScalarAddNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); return {make_shared(upstreamGrad.createDeepCopy())}; } -vector> graph::ScalarMulNode::backward(const Tensor& upstreamGrad) { +vector> cgraph::ScalarMulNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); auto res = make_shared(upstreamGrad.createDeepCopy()); diff --git a/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h index a2d1db9..13cb067 100644 --- a/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h +++ b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h @@ -13,7 +13,7 @@ #include "computational_graph/graph_node.h" -namespace graph { +namespace cgraph { class ScalarAddNode final : public GraphNode { public: explicit ScalarAddNode(std::shared_ptr t) diff --git a/src/backend/computational_graph/topological_sort.cpp b/src/backend/computational_graph/topological_sort.cpp index fb6e698..6a4266d 100644 --- a/src/backend/computational_graph/topological_sort.cpp +++ 
b/src/backend/computational_graph/topological_sort.cpp @@ -20,7 +20,7 @@ #include using namespace std; -using namespace graph; +using namespace cgraph; #ifndef NDEBUG /** diff --git a/src/backend/computational_graph/topological_sort.h b/src/backend/computational_graph/topological_sort.h index 96bba77..5c60ed5 100644 --- a/src/backend/computational_graph/topological_sort.h +++ b/src/backend/computational_graph/topological_sort.h @@ -16,7 +16,7 @@ class Tensor; // to break circular dependency -namespace graph { +namespace cgraph { /** * @brief Topological sort class. * diff --git a/src/backend/data_modeling/dim_type.cpp b/src/backend/data_modeling/dim_type.cpp index 64dd6bf..1f2e40c 100644 --- a/src/backend/data_modeling/dim_type.cpp +++ b/src/backend/data_modeling/dim_type.cpp @@ -18,6 +18,9 @@ using namespace std; tensorDim_t Dimension::multVector(const std::vector& dims) const noexcept { + if(dims.size()==0) + return 0; + tensorDim_t res = 1; #ifndef NDEBUG @@ -39,10 +42,6 @@ tensorDim_t Dimension::multVector(const std::vector& dims) const no void Dimension::resize(const std::vector& dims) { this->dims = dims; size = multVector(dims); - - if(size==0){ - __throw_invalid_argument("Tensor-Dims must all be greater than 0."); - } } /** @@ -56,10 +55,6 @@ void Dimension::swap(const tensorDim_t dim1, const tensorDim_t dim2) { Dimension::Dimension(const vector& dims) : dims{dims} { size = multVector(dims); - - if(size==0){ - __throw_invalid_argument("Tensor-Dims must all be greater than 0."); - } } Dimension::Dimension(const Dimension& other) : dims{other.dims}, size{other.size} { } @@ -106,14 +101,19 @@ Dimension Dimension::collapseDimension(int idx) const { } ostream& operator<<(ostream& os, const Dimension& d) noexcept { - os << "("; - for(int i=0; i0){ + os << "\n("; + for(int i=0; i #include -template -concept is_valid_dim = requires(T x) { - requires std::is_integral_v>; - requires std::convertible_to, tensorDim_t>; - x >= 0; -}; - class Dimension final { private: 
std::vector dims; @@ -50,14 +43,12 @@ class Dimension final { Dimension collapseDimension(int idx) const; void resize(const std::vector& dims); - + tensorSize_t getSize() const noexcept { - assert(size!=0); return size; } tensorDim_t getItem(int idx) const { - assert(size!=0); if(idx<0){ idx = dims.size() + idx; // -1 is last idx, -2 second last and so forth } @@ -72,17 +63,28 @@ class Dimension final { void swap(const tensorDim_t dim1, const tensorDim_t dim2); size_t nDims() const noexcept { - assert(size!=0); return dims.size(); } + /** + * @brief Returns empty dims. Used e.g. to identify dimensions + * of activation functions. + */ + static const Dimension& getEmpty() { + static const auto emptyDims = Dimension(std::vector()); + return emptyDims; + } + + bool empty() const noexcept { + return size == 0; + } + bool operator==(const Dimension& other) const { assert(size!=0); return this->dims == other.dims; } bool operator==(const std::vector& other) const { - assert(size!=0); return this->dims == other; } @@ -94,5 +96,5 @@ class Dimension final { return !(*this == other); } - friend std::ostream& operator<<(std::ostream& os, const Dimension& d) noexcept; + friend std::ostream& operator<<(std::ostream& os, const Dimension& d) noexcept; }; \ No newline at end of file diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 633ebee..a6bf6ed 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -558,7 +558,7 @@ void Tensor::backward() { } } - vector sortedTensors = graph::TopologicalSort::reverseSort(this); + vector sortedTensors = cgraph::TopologicalSort::reverseSort(this); for(auto tPtr: sortedTensors){ auto& tensor = *tPtr; assert(tensor.grads && !tensor.grads->requiresGrad); // gradient should not require grad diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 1a61aed..5c6610c 100644 --- a/src/backend/data_modeling/tensor.h +++
b/src/backend/data_modeling/tensor.h @@ -29,13 +29,13 @@ #include // break circular dependency -namespace graph { +namespace cgraph { class GraphNode; class TopologicalSort; } class Tensor final : public std::enable_shared_from_this { - friend class graph::TopologicalSort; + friend class cgraph::TopologicalSort; private: /** @@ -108,7 +108,7 @@ class Tensor final : public std::enable_shared_from_this { bool requiresGrad = false; std::shared_ptr grads = nullptr; // gradients - std::shared_ptr cgNode = nullptr; + std::shared_ptr cgNode = nullptr; static Tensor multiplyScalar(const Tensor& scalar, const Tensor& other) noexcept; static void matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, const tensorSize_t resOffset, @@ -242,7 +242,7 @@ class Tensor final : public std::enable_shared_from_this { bool getRequiresGrad() const noexcept { return requiresGrad; } void setRequiresGrad(const bool requiresGrad) noexcept { this->requiresGrad=requiresGrad; } - void setCgNode(std::shared_ptr node) noexcept { + void setCgNode(std::shared_ptr node) noexcept { cgNode = std::move(node); requiresGrad = true; } diff --git a/src/backend/layers/ff_layer.h b/src/backend/layers/ff_layer.h deleted file mode 100644 index 1bd781a..0000000 --- a/src/backend/layers/ff_layer.h +++ /dev/null @@ -1,30 +0,0 @@ -/** - * @file ff_layer.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "layer_base.h" -#include "utility/initializers.h" - -#include - -namespace layers { - class FfLayer : public LayerBase { - public: - FfLayer(const std::vector& dims, bool useBias=true, bool requiresGrad=false); - FfLayer(const std::vector& dims, Device d, bool useBias=true, bool requiresGrad=false); - - Tensor forward(const Tensor& input) const override; - std::shared_ptr forward(const std::shared_ptr& input) const override; - - void print(std::ostream& os) const 
noexcept override; - }; -} diff --git a/src/backend/layers/layer_base.cpp b/src/backend/layers/layer_base.cpp deleted file mode 100644 index 7e0a59f..0000000 --- a/src/backend/layers/layer_base.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/** - * @file layer_base.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-01-25 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "layer_base.h" - -#include - -using namespace std; -using namespace layers; - -void LayerBase::addActivation(shared_ptr f) { - activations.push_back(std::move(f)); -} - -void LayerBase::print(ostream& os) const noexcept { - assert(weights); - os << "Weigths:\n" << *weights; - if(bias){ - os << "\nBias:\n" << *bias; - } -} - -ostream& layers::operator<<(ostream& os, const LayerBase& l) noexcept { - l.print(os); // calling vtable - return os; -} \ No newline at end of file diff --git a/src/backend/layers/layer_base.h b/src/backend/layers/layer_base.h deleted file mode 100644 index 06c51d9..0000000 --- a/src/backend/layers/layer_base.h +++ /dev/null @@ -1,69 +0,0 @@ -/** - * @file layer_base.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "data_modeling/tensor.h" -#include "utility/global_params.h" -#include "activation_functions/activation_function_base.h" - -#include -#include -#include - -#include - -namespace layers { - /** - * The base class for all the layers that we have. Not instantiable. 
- */ - class LayerBase { - protected: - bool requiresGrad = false; - bool useBias = false; - - std::shared_ptr weights = nullptr; - std::shared_ptr bias = nullptr; - - std::vector< std::shared_ptr > activations; - - public: - LayerBase(bool useBias, bool requiresGrad) - : useBias{useBias}, requiresGrad{requiresGrad} - { } - - virtual ~LayerBase() noexcept = default; - - // for inference -> no graph creation - virtual Tensor forward(const Tensor& input) const = 0; - - // for training -> creates graph - virtual std::shared_ptr forward(const std::shared_ptr& input) const = 0; - - // weights should always exist, never nullopt outside of c'tor - const Dimension& getDims() const noexcept { - assert(weights); - return weights->getDims(); - } - - void addActivation(std::shared_ptr f); - - auto getWeights() const noexcept { return weights; } - auto getBias() const noexcept { return bias; } - - std::pair, std::shared_ptr>getParams() const { - return std::make_pair(weights, bias); - } - - virtual void print(std::ostream& os) const noexcept; - friend std::ostream& operator<<(std::ostream& os, const LayerBase& t) noexcept; - }; -} \ No newline at end of file diff --git a/src/backend/activation_functions/leaky_relu.cpp b/src/backend/module/activation_functions/leaky_relu.cpp similarity index 89% rename from src/backend/activation_functions/leaky_relu.cpp rename to src/backend/module/activation_functions/leaky_relu.cpp index 2bb2c1f..add5a0b 100644 --- a/src/backend/activation_functions/leaky_relu.cpp +++ b/src/backend/module/activation_functions/leaky_relu.cpp @@ -13,7 +13,7 @@ #include "computational_graph/activation_functions/leaky_relu_node.h" using namespace std; -using namespace activation; +using namespace module; Tensor LeakyReLu::operator()(const Tensor& t) const { auto res = t.createDeepCopy(); @@ -32,7 +32,7 @@ shared_ptr LeakyReLu::operator()(const shared_ptr& t) const { auto res = make_shared((*this)(*t)); if(t->getRequiresGrad()){ - res->setCgNode(make_shared(t, 
eps)); + res->setCgNode(make_shared(t, eps)); assert(res->getRequiresGrad()); } diff --git a/src/backend/activation_functions/leaky_relu.h b/src/backend/module/activation_functions/leaky_relu.h similarity index 81% rename from src/backend/activation_functions/leaky_relu.h rename to src/backend/module/activation_functions/leaky_relu.h index 229a902..daa8d92 100644 --- a/src/backend/activation_functions/leaky_relu.h +++ b/src/backend/module/activation_functions/leaky_relu.h @@ -11,10 +11,10 @@ #pragma once -#include "activation_function_base.h" +#include "module/module_base.h" -namespace activation { - class LeakyReLu final : public ActivationFunctionBase { +namespace module { + class LeakyReLu final : public ModuleBase { private: const ftype eps; diff --git a/src/backend/activation_functions/relu.cpp b/src/backend/module/activation_functions/relu.cpp similarity index 89% rename from src/backend/activation_functions/relu.cpp rename to src/backend/module/activation_functions/relu.cpp index 01d7448..29c4668 100644 --- a/src/backend/activation_functions/relu.cpp +++ b/src/backend/module/activation_functions/relu.cpp @@ -13,7 +13,7 @@ #include "computational_graph/activation_functions/relu_node.h" using namespace std; -using namespace activation; +using namespace module; Tensor ReLu::operator()(const Tensor& t) const { auto res = t.createDeepCopy(); @@ -32,7 +32,7 @@ shared_ptr ReLu::operator()(const shared_ptr& t) const { auto res = make_shared((*this)(*t)); if(t->getRequiresGrad()){ - res->setCgNode(make_shared(t)); + res->setCgNode(make_shared(t)); assert(res->getRequiresGrad()); } diff --git a/src/backend/activation_functions/relu.h b/src/backend/module/activation_functions/relu.h similarity index 77% rename from src/backend/activation_functions/relu.h rename to src/backend/module/activation_functions/relu.h index d9bc504..05268e0 100644 --- a/src/backend/activation_functions/relu.h +++ b/src/backend/module/activation_functions/relu.h @@ -11,10 +11,10 @@ #pragma once 
-#include "activation_function_base.h" +#include "module/module_base.h" -namespace activation { - class ReLu final : public ActivationFunctionBase { +namespace module { + class ReLu final : public ModuleBase { public: ReLu() = default; diff --git a/src/backend/activation_functions/sigmoid.h b/src/backend/module/activation_functions/sigmoid.h similarity index 100% rename from src/backend/activation_functions/sigmoid.h rename to src/backend/module/activation_functions/sigmoid.h diff --git a/src/backend/activation_functions/softmax.cpp b/src/backend/module/activation_functions/softmax.cpp similarity index 93% rename from src/backend/activation_functions/softmax.cpp rename to src/backend/module/activation_functions/softmax.cpp index 3e79814..3cb1215 100644 --- a/src/backend/activation_functions/softmax.cpp +++ b/src/backend/module/activation_functions/softmax.cpp @@ -14,7 +14,7 @@ #include using namespace std; -using namespace activation; +using namespace module; /** * @brief Softmax over last dimension. 
Expects shape @@ -54,7 +54,7 @@ shared_ptr Softmax::operator()(const shared_ptr& t) const { auto res = make_shared((*this)(*t)); if(t->getRequiresGrad()){ - //res->setCgNode(make_shared(t, eps)); + //res->setCgNode(make_shared(t, eps)); assert(res->getRequiresGrad()); } diff --git a/src/backend/activation_functions/softmax.h b/src/backend/module/activation_functions/softmax.h similarity index 75% rename from src/backend/activation_functions/softmax.h rename to src/backend/module/activation_functions/softmax.h index cf1ed10..d3c4ade 100644 --- a/src/backend/activation_functions/softmax.h +++ b/src/backend/module/activation_functions/softmax.h @@ -11,10 +11,10 @@ #pragma once -#include "activation_function_base.h" +#include "module/module_base.h" -namespace activation { - class Softmax final : public ActivationFunctionBase { +namespace module { + class Softmax final : public ModuleBase { public: Tensor operator()(const Tensor& t) const override; std::shared_ptr operator()(const std::shared_ptr& t) const override; diff --git a/src/backend/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp similarity index 65% rename from src/backend/layers/ff_layer.cpp rename to src/backend/module/layers/ff_layer.cpp index 8fd1639..8612bf2 100644 --- a/src/backend/layers/ff_layer.cpp +++ b/src/backend/module/layers/ff_layer.cpp @@ -10,7 +10,6 @@ */ #include "ff_layer.h" -#include "activation_functions/activation_function_base.h" #include "data_modeling/tensor_functions.h" #include "computational_graph/tensor_ops/graph_creation.h" @@ -19,7 +18,7 @@ #include using namespace std; -using namespace layers; +using namespace module; FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGrad) : FfLayer(dims, Tensor::getDefaultDevice(), useBias, requiresGrad) {} @@ -33,7 +32,7 @@ FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGra * @param requiresGrad If true train this layer. 
*/ FfLayer::FfLayer(const vector& dims, Device d, bool useBias, bool requiresGrad) - : LayerBase(useBias, requiresGrad) { + : useBias{useBias}, requiresGrad{requiresGrad} { assert(dims.size()==2); weights = make_shared(Dimension({dims[0], dims[1]}), d, requiresGrad); @@ -50,37 +49,29 @@ FfLayer::FfLayer(const vector& dims, Device d, bool useBias, bool r * * Assumption for input: (b-size, ..., dim1, in-size) */ -Tensor FfLayer::forward(const Tensor& input) const { - auto res = input.matmul(*weights); +Tensor FfLayer::operator()(const Tensor& input) const { + auto res = input.matmul(*weights); - if(useBias){ - res = res + *bias; - } - - for(auto& af: activations){ - res = (*af)(res); - } + if(useBias){ + res = res + *bias; + } - return res; + return res; } /** * @brief Like overload, but creates computational graph. */ -std::shared_ptr FfLayer::forward(const std::shared_ptr& input) const { - auto res = graph::matmul(input, weights); - if(useBias){ - res = graph::add(res, bias); // TODO: add needs to happen on each of those, how to broadcast? - } - - for(auto& af: activations){ - res = (*af)(res); - } +std::shared_ptr FfLayer::operator()(const std::shared_ptr& input) const { + auto res = cgraph::matmul(input, weights); + if(useBias){ + res = cgraph::add(res, bias); // TODO: add needs to happen on each of those, how to broadcast? + } - return res; + return res; } void FfLayer::print(ostream& os) const noexcept { - LayerBase::print(os); - os << "\nuseBias: " << useBias ? "true" : "false"; + ModuleBase::print(os); + os << "\nuseBias: " << useBias ? 
"true" : "false"; } \ No newline at end of file diff --git a/src/backend/module/layers/ff_layer.h b/src/backend/module/layers/ff_layer.h new file mode 100644 index 0000000..bf7a174 --- /dev/null +++ b/src/backend/module/layers/ff_layer.h @@ -0,0 +1,54 @@ +/** + * @file ff_layer.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#pragma once + +#include "module/module_base.h" +#include "utility/initializers.h" + +#include + +namespace module { + class FfLayer : public ModuleBase { + bool requiresGrad = false; + bool useBias = false; + + std::shared_ptr weights = nullptr; + std::shared_ptr bias = nullptr; + + public: + FfLayer(const std::vector& dims, bool useBias=true, bool requiresGrad=false); + FfLayer(const std::vector& dims, Device d, bool useBias=true, bool requiresGrad=false); + + Tensor operator()(const Tensor& input) const override; + std::shared_ptr operator()(const std::shared_ptr& input) const override; + + const Dimension& getDims() const { + if(weights){ + return weights->getDims(); + } + return Dimension::getEmpty(); + } + + auto getWeights() const noexcept { return weights; } + auto getBias() const noexcept { return bias; } + + bool hasWeights() const { + return weights != nullptr; + } + + std::vector< std::shared_ptr > parameters() const override { + return {weights, bias}; + } + + void print(std::ostream& os) const noexcept override; + }; +} diff --git a/src/backend/module/module_base.cpp b/src/backend/module/module_base.cpp new file mode 100644 index 0000000..7617da5 --- /dev/null +++ b/src/backend/module/module_base.cpp @@ -0,0 +1,31 @@ +/** + * @file module.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-13 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "module/module_base.h" + +#include + +using namespace std; +using namespace module; + +void 
ModuleBase::print(ostream& os) const noexcept { + if(weights){ + os << "Weigths:\n" << *weights; + } + if(bias){ + os << "\nBias:\n" << *bias; + } +} + +ostream& module::operator<<(ostream& os, const ModuleBase& l) noexcept { + l.print(os); // calling vtable + return os; +} \ No newline at end of file diff --git a/src/backend/module/module_base.h b/src/backend/module/module_base.h new file mode 100644 index 0000000..d6d4cc9 --- /dev/null +++ b/src/backend/module/module_base.h @@ -0,0 +1,49 @@ +/** + * @file module_base.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-13 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/tensor.h" +#include "utility/global_params.h" + +#include +#include +#include + +#include + +namespace module { + /** + * The base class for all the layers that we have. Not instantiable. + */ + class ModuleBase { + public: + ModuleBase() = default; + + ModuleBase(const ModuleBase& other) = delete; + ModuleBase& operator=(const ModuleBase& other) = delete; + + ModuleBase(ModuleBase&& other) noexcept = default; + ModuleBase& operator=(ModuleBase&& other) noexcept = default; + + ~ModuleBase() noexcept = default; + + // for inference -> no graph creation + virtual Tensor operator()(const Tensor& input) const = 0; + + // for training -> creates graph + virtual std::shared_ptr operator()(const std::shared_ptr& input) const = 0; + virtual std::vector> parameters() const { return {}; } + + virtual void print(std::ostream& os) const noexcept; + friend std::ostream& operator<<(std::ostream& os, const ModuleBase& t) noexcept; + }; +} \ No newline at end of file diff --git a/src/backend/module/networks/sequential.cpp b/src/backend/module/networks/sequential.cpp new file mode 100644 index 0000000..fa1190c --- /dev/null +++ b/src/backend/module/networks/sequential.cpp @@ -0,0 +1,60 @@ +/** + * @file sequential.cpp + * @author Robert Baumgartner 
 (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#include "sequential.h" + +using namespace std; +using namespace module; + +Tensor Sequential::operator()(const Tensor& input) const { + if(layers.size()==0){ + __throw_invalid_argument("Network empty, cannot be called."); + } + + auto x = layers[0]->operator()(input); + for(int i=1; ioperator()(x); + } + + return x; +} + +shared_ptr Sequential::operator()(const shared_ptr& input) const { + if(layers.size()==0){ + __throw_invalid_argument("Network empty, cannot be called."); + } + + auto x = layers[0]->operator()(input); + for(int i=1; ioperator()(x); + } + + return x; +} + +vector> Sequential::parameters() const { + vector> res; + + for(const auto& layer: layers) { + auto p = layer->parameters(); + for(auto& pp: p){ + if(pp){ + res.push_back(std::move(pp)); + } + } + } + + return res; +} + +void Sequential::append(shared_ptr l) { + layers.push_back(move(l)); +} \ No newline at end of file diff --git a/src/backend/module/networks/sequential.h b/src/backend/module/networks/sequential.h new file mode 100644 index 0000000..ea55f02 --- /dev/null +++ b/src/backend/module/networks/sequential.h @@ -0,0 +1,42 @@ +/** + * @file sequential.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#pragma once + +#include "module/module_base.h" + +#include + +namespace module { + class Sequential : public ModuleBase { + protected: + std::vector< std::shared_ptr > layers; + + public: + Sequential() = default; + + Sequential(const Sequential& other) = delete; + Sequential& operator=(const Sequential& other) = delete; + + Sequential(Sequential&& other) noexcept = default; + Sequential& operator=(Sequential&& other) noexcept = default; + + ~Sequential() noexcept = default; + + Tensor operator()(const Tensor& input) const override; + 
std::shared_ptr operator()(const std::shared_ptr& input) const override; + + std::vector> parameters() const override; + + void append(std::shared_ptr l); + }; +} diff --git a/src/backend/networks/sequential.cpp b/src/backend/networks/sequential.cpp deleted file mode 100644 index 7f1bb5b..0000000 --- a/src/backend/networks/sequential.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/** - * @file sequential.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#include "sequential.h" - -using namespace std; -using namespace layers; - -/** - * @brief Returns true if dimensions valid, else false. - * Ensures consistency along network. - */ -bool SequentialNetwork::assertDims(const LayerBase& layer) const noexcept { - if(layers.size() == 0) - return true; - - return layers[layers.size()-1]->getDims() == layer.getDims(); -} - -Tensor SequentialNetwork::forward(const Tensor& input) const { - if(input.getDims().getItem(-1) != layers[0]->getDims().getItem(-2)){ - __throw_invalid_argument("Input tensor has invalid dimension."); - } - - if(layers.size()==0){ - __throw_invalid_argument("Network empy, cannot be called."); - } - - auto x = layers[0]->forward(input); - for(int i=1; iforward(x); - } - - return x; -} - -std::shared_ptr SequentialNetwork::forward(const std::shared_ptr& input) const { - if(input->getDims().getItem(-1) != layers[0]->getDims().getItem(-2)){ - __throw_invalid_argument("Input tensor has invalid dimension."); - } - - if(layers.size()==0){ - __throw_invalid_argument("Network empy, cannot be called."); - } - - auto x = layers[0]->forward(input); - for(int i=1; iforward(x); - } - - return x; -} - -std::vector> SequentialNetwork::getParams() const { - std::vector> res; - res.reserve(layers.size()*2); - - for(const auto& layer: layers){ - auto [weigths, bias] = layer->getParams(); - res.push_back(std::move(weights)); - res.push_back(std::move(bias)); - } - - return 
res; -} - -void SequentialNetwork::append(shared_ptr l) { - if(!assertDims(*l)){ - __throw_invalid_argument("Dimensions of tensors don't fit."); - } - layers.push_back(std::move(l)); -} - -void SequentialNetwork::append(shared_ptr f) { - assert(layers.size()>0); - layers[layers.size()-1]->addActivation(std::move(f)); -} \ No newline at end of file diff --git a/src/backend/networks/sequential.h b/src/backend/networks/sequential.h deleted file mode 100644 index bf1c09a..0000000 --- a/src/backend/networks/sequential.h +++ /dev/null @@ -1,36 +0,0 @@ -/** - * @file sequential.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "layers/layer_base.h" -#include "activation_functions/activation_function_base.h" - -#include -#include - -class SequentialNetwork { - protected: - std::vector< std::shared_ptr > layers; - - bool assertDims(const layers::LayerBase& layer) const noexcept; - - void append(std::shared_ptr l); - void append(std::shared_ptr f); - - public: - SequentialNetwork() = default; - - std::vector> getParams() const; - - Tensor forward(const Tensor& input) const; - std::shared_ptr forward(const std::shared_ptr& input) const; -}; \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index a9545c5..f74718b 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -25,7 +25,14 @@ namespace train { public: OptimizerBase(std::vector< std::shared_ptr > params, ftype lr) - : params{std::move(params)}, lr{lr} {}; + : params{std::move(params)}, lr{lr} + { +#ifndef NDEBUG + for(const auto& param: params){ + assert(param); // we don't want nullptrs here + } +#endif // NDEBUG + }; ~OptimizerBase() noexcept = default; diff --git a/src/backend/training/trainers/base_train_loop.cpp 
b/src/backend/training/trainers/base_train_loop.cpp index 66a5ca2..f9cc903 100644 --- a/src/backend/training/trainers/base_train_loop.cpp +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -15,6 +15,7 @@ #include #include +#include using namespace std; using namespace train; @@ -25,7 +26,9 @@ void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool std::iota(indices.begin(), indices.end(), 0); if(shuffle){ - std::random_shuffle(indices.begin(), indices.end()); + std::random_device rd; + std::mt19937 rng(rd()); + std::shuffle(indices.begin(), indices.end(), rng); } const auto nSamples = x->getDims().getItem(0); @@ -36,7 +39,7 @@ void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool auto xBatch = make_shared(x->getSlice(batchSpan)); auto yBatch = y->getSlice(batchSpan); - auto yPred = network->forward(xBatch); + auto yPred = graph->operator()(xBatch); auto l = (*loss)(yBatch, yPred); l->backward(); diff --git a/src/backend/training/trainers/base_train_loop.h b/src/backend/training/trainers/base_train_loop.h index 363395e..a3ed108 100644 --- a/src/backend/training/trainers/base_train_loop.h +++ b/src/backend/training/trainers/base_train_loop.h @@ -11,12 +11,11 @@ #pragma once -#include "training/optimizers/optimizer_base.h" -#include "training/loss_functions/loss_base.h" - #include "data_modeling/tensor.h" +#include "module/module_base.h" + +#include "training/optimizers/optimizer_base.h" #include "training/loss_functions/loss_base.h" -#include "networks/sequential.h" #include #include @@ -31,12 +30,14 @@ namespace train { std::shared_ptr loss; std::shared_ptr optim; - std::shared_ptr network; + std::shared_ptr graph; public: - BaseTrainLoop(std::shared_ptr& network, std::shared_ptr loss, + BaseTrainLoop(std::shared_ptr& graph, std::shared_ptr loss, std::shared_ptr optim, ftype lr, size_t epochs, tensorDim_t bsize) - : network{std::move(network)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} {}; + : 
graph{std::move(graph)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} { + + }; ~BaseTrainLoop() noexcept = default; diff --git a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp index 2776077..e74e114 100644 --- a/src/python/py_core/py_core.cpp +++ b/src/python/py_core/py_core.cpp @@ -190,7 +190,7 @@ BOOST_PYTHON_MODULE(_core) .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::getItem)) .def("getitem", Py_DataModeling::getItemVector) // the vector arg - .def("sum", WRAP_FREE_FUNC_7(&(graph::sumTensor))) + .def("sum", WRAP_FREE_FUNC_7(&(cgraph::sumTensor))) .def("reset", Py_DataModeling::reset1) .def("reset", Py_DataModeling::reset2) diff --git a/src/python/py_core/py_core_util.h b/src/python/py_core/py_core_util.h index ba6ecc9..4ddf62c 100644 --- a/src/python/py_core/py_core_util.h +++ b/src/python/py_core/py_core_util.h @@ -97,39 +97,39 @@ namespace Py_DataModeling { // multiplications inline std::shared_ptr (*elementwisemul) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::mul); + (const std::shared_ptr left, const std::shared_ptr right) = &(cgraph::mul); inline std::shared_ptr (*scalarmul) - (const std::shared_ptr, ftype) = &(graph::mul); + (const std::shared_ptr, ftype) = &(cgraph::mul); inline std::shared_ptr (*rscalarmul) - (ftype, const std::shared_ptr) = &(graph::mul); + (ftype, const std::shared_ptr) = &(cgraph::mul); // additions inline std::shared_ptr (*elementwiseadd) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::add); + (const std::shared_ptr left, const std::shared_ptr right) = &(cgraph::add); inline std::shared_ptr (*scalaradd) - (const std::shared_ptr, ftype) = &(graph::add); + (const std::shared_ptr, ftype) = &(cgraph::add); inline std::shared_ptr (*rscalaradd) - (ftype, const std::shared_ptr) = &(graph::add); + (ftype, const std::shared_ptr) = &(cgraph::add); // matmul inline std::shared_ptr (*matmul) - (const std::shared_ptr left, const std::shared_ptr 
right) = &(graph::matmul); + (const std::shared_ptr left, const std::shared_ptr right) = &(cgraph::matmul); // sub, div inline std::shared_ptr (*scalarsub) - (const std::shared_ptr, ftype) = &(graph::sub); + (const std::shared_ptr, ftype) = &(cgraph::sub); inline std::shared_ptr (*scalardiv) - (const std::shared_ptr, ftype) = &(graph::div); + (const std::shared_ptr, ftype) = &(cgraph::div); // get inline std::shared_ptr (*getItemAsTensor1) - (const std::shared_ptr& t, tensorSize_t idx) = &(graph::get); + (const std::shared_ptr& t, tensorSize_t idx) = &(cgraph::get); inline std::shared_ptr (*getItemAsTensor2) - (const std::shared_ptr& t, const std::vector& idx) = &(graph::get); + (const std::shared_ptr& t, const std::vector& idx) = &(cgraph::get); } \ No newline at end of file diff --git a/src/python/py_network/py_network.cpp b/src/python/py_network/py_network.cpp index 57bd91f..bea321e 100644 --- a/src/python/py_network/py_network.cpp +++ b/src/python/py_network/py_network.cpp @@ -17,6 +17,9 @@ #include "training/loss_functions/crossentropy_loss.h" #include "training/optimizers/sgd.h" +#include "training/optimizers/rmsprop.h" + +#include "training/trainers/base_train_loop.h" #include @@ -37,19 +40,13 @@ BOOST_PYTHON_MODULE(py_layers) } // Networks - // TODO - - // Layers - class_("LayerBase", no_init) - // attributes - .add_property("dims", make_function(&layers::LayerBase::getDims, return_internal_reference<>())) - .add_property("weights", make_function(&layers::LayerBase::getWeights)) - .add_property("bias", make_function(&layers::LayerBase::getBias)) - // methods - .def("addActivation", make_function(&layers::LayerBase::addActivation)) + class_, boost::noncopyable>("Module", no_init) + // operators + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::ModuleBaseWrapper, Py_Network::moduleForward)) + .def("__str__", &toString) ; - class_, bases, boost::noncopyable>("FfLayer", no_init) + class_, boost::noncopyable>("FfLayer", no_init) // init .def(init&>()) 
.def(init&, bool>()) @@ -58,24 +55,28 @@ BOOST_PYTHON_MODULE(py_layers) .def(init&, Device, bool>()) .def(init&, Device, bool, bool>()) // methods - .def("forward", WRAP_METHOD_ONE_TENSORARG(layers::FfLayer, Py_Network::ffForward)) - // operators - .def("__str__", &toString) + .add_property("dims", make_function(&module::FfLayer::getDims, return_internal_reference<>())) + .add_property("weights", &module::FfLayer::getWeights) + .add_property("bias", &module::FfLayer::getBias) + .add_property("params", &module::ModuleBase::getParams) + // operators + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::FfLayer, Py_Network::ffForward)) + .def("__str__", &toString) ; - class_, boost::noncopyable>("ReLU") - .def("__call__", WRAP_METHOD_ONE_TENSORARG(activation::ReLu, Py_Network::reluF)) - .def("__str__", &toString) + class_, boost::noncopyable>("ReLU") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::ReLu, Py_Network::reluF)) + .def("__str__", &toString) ; - class_, boost::noncopyable>("LeakyReLU", init()) - .def("__call__", WRAP_METHOD_ONE_TENSORARG(activation::LeakyReLu, Py_Network::leakyReluF)) - .def("__str__", &toString) + class_, boost::noncopyable>("LeakyReLU", init()) + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::LeakyReLu, Py_Network::leakyReluF)) + .def("__str__", &toString) ; - class_, boost::noncopyable>("Softmax") - .def("__call__", WRAP_METHOD_ONE_TENSORARG(activation::Softmax, Py_Network::softmaxF)) - .def("__str__", &toString) + class_, boost::noncopyable>("Softmax") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::Softmax, Py_Network::softmaxF)) + .def("__str__", &toString) ; // Loss functions @@ -93,12 +94,21 @@ BOOST_PYTHON_MODULE(py_layers) .def("step", &train::SgdOptimizer::step) ; + class_, boost::noncopyable>("RmsProp", no_init) + .def(init >, ftype, ftype>()) + .def("step", &train::RmsPropOptimizer::step) + ; + // Trainers - // TODO + class_, boost::noncopyable>("TrainLoop", no_init) + .def(init&, std::shared_ptr, std::shared_ptr, + 
ftype, size_t, tensorDim_t>()) + .def("step", &train::RmsPropOptimizer::step) + ; } /* -ftype Py_Layers::layerGetItem(const layers::LayerBase& self, boost::python::object index) { +ftype Py_module::layerGetItem(const module::ModuleBase& self, boost::python::object index) { extract int_extractor(index); // Single integer index (1D) @@ -142,7 +152,7 @@ ftype Py_Layers::layerGetItem(const layers::LayerBase& self, boost::python::obje return 0.0; // Never reached } -void Py_Layers::layerSetItem(layers::LayerBase& self, boost::python::object index, ftype value) { +void Py_module::layerSetItem(module::ModuleBase& self, boost::python::object index, ftype value) { extract int_extractor(index); // Single integer index (1D) diff --git a/src/python/py_network/py_network_util.h b/src/python/py_network/py_network_util.h index faef35f..746923c 100644 --- a/src/python/py_network/py_network_util.h +++ b/src/python/py_network/py_network_util.h @@ -11,16 +11,13 @@ #pragma once -#include "layers/layer_base.h" -#include "activation_functions/activation_function_base.h" -#include "training/loss_functions/loss_base.h" -#include "training/optimizers/optimizer_base.h" +#include "module/module_base.h" -#include "layers/ff_layer.h" +#include "module/layers/ff_layer.h" -#include "activation_functions/relu.h" -#include "activation_functions/leaky_relu.h" -#include "activation_functions/softmax.h" +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" #include #include @@ -30,8 +27,8 @@ namespace Py_Network { using namespace boost::python; - ftype layerGetItem(const layers::LayerBase& self, boost::python::object index); - void layerSetItem(layers::LayerBase& self, boost::python::object index, ftype value); + ftype layerGetItem(const module::ModuleBase& self, boost::python::object index); + void layerSetItem(module::ModuleBase& self, boost::python::object index, ftype value); /** * @brief Wrapper class 
needed for Boost Python to get the virtual function working @@ -39,32 +36,22 @@ namespace Py_Network { * https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html * */ - /* struct LayerBaseWrap : layers::LayerBase, wrapper { - std::shared_ptr forward(const std::shared_ptr& input) const override { + struct ModuleBaseWrapper : module::ModuleBase, wrapper { + std::shared_ptr operator()(const std::shared_ptr& input) const override { return this->get_override("forward")(input); } - Tensor forward(const Tensor& input) const override { + Tensor operator()(const Tensor& input) const override { std::__throw_runtime_error("This function should never be called from within Python"); } }; - struct ActivationFunctionWrap : activation::ActivationFunctionBase, wrapper { - std::shared_ptr operator()(const std::shared_ptr& input) const override { - return this->get_override("call")(input); - } - }; - - struct LossWrap : train::LossBase, wrapper { - Tensor operator()(const Tensor& y, const Tensor& ypred) const override { - return this->get_override("call")(y, ypred); - } - }; */ + inline std::shared_ptr (ModuleBaseWrapper::*moduleForward)(const std::shared_ptr&) const = &ModuleBaseWrapper::operator(); - inline std::shared_ptr (layers::FfLayer::*ffForward)(const std::shared_ptr&) const = &layers::FfLayer::forward; + inline std::shared_ptr (module::FfLayer::*ffForward)(const std::shared_ptr&) const = &module::FfLayer::operator(); - inline std::shared_ptr (activation::ReLu::*reluF)(const std::shared_ptr&) const = &activation::ReLu::operator(); - inline std::shared_ptr (activation::LeakyReLu::*leakyReluF)(const std::shared_ptr&) const = &activation::LeakyReLu::operator(); - inline std::shared_ptr (activation::Softmax::*softmaxF)(const std::shared_ptr&) const = &activation::Softmax::operator(); + inline std::shared_ptr (module::ReLu::*reluF)(const std::shared_ptr&) const = &module::ReLu::operator(); + inline std::shared_ptr 
(module::LeakyReLu::*leakyReluF)(const std::shared_ptr&) const = &module::LeakyReLu::operator(); + inline std::shared_ptr (module::Softmax::*softmaxF)(const std::shared_ptr&) const = &module::Softmax::operator(); } diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index 42adf07..b0406b7 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -16,8 +16,8 @@ #include "computational_graph/tensor_ops/graph_creation.h" -#include "activation_functions/relu.h" -#include "activation_functions/leaky_relu.h" +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" #include @@ -25,7 +25,7 @@ TEST(AutogradTest, ThrowsIfNoGradientSet) { auto t1 = TensorFunctions::makeSharedTensor({1}, {3.0}, false); auto t2 = TensorFunctions::makeSharedTensor({1}, {2.0}, false); - auto loss = graph::add(t1, t2); + auto loss = cgraph::add(t1, t2); EXPECT_THROW(loss->backward(), std::runtime_error); } @@ -34,8 +34,8 @@ TEST(AutogradTest, SimpleAddition) { auto t1 = TensorFunctions::makeSharedTensor({1}, {3.0}, true); auto t2 = TensorFunctions::makeSharedTensor({1}, {2.0}, true); - auto t3 = graph::add(t1, t2); - auto loss = graph::mul(t3, t3); + auto t3 = cgraph::add(t1, t2); + auto loss = cgraph::mul(t3, t3); loss->backward(); @@ -47,8 +47,8 @@ TEST(AutogradTest, ScalarMultiplication) { auto t1 = TensorFunctions::makeSharedTensor({1}, {2.0}, true); auto t2 = TensorFunctions::makeSharedTensor({1}, {3.0}, true); - auto t3 = graph::mul(t1, t2); - auto loss = graph::mul(t3, t3); + auto t3 = cgraph::mul(t1, t2); + auto loss = cgraph::mul(t3, t3); loss->backward(); @@ -60,11 +60,11 @@ TEST(AutogradTest, MatMul) { auto t1 = TensorFunctions::makeSharedTensor({2, 3}, {1, 2, 3, 4, 5, 6}, true); auto t2 = TensorFunctions::makeSharedTensor({3, 2}, {1, 2, 3, 4, 5, 6}, true); - auto t3 = graph::matmul(t1, t2); + auto t3 = cgraph::matmul(t1, t2); auto loss = 
TensorFunctions::makeSharedTensor({1}, {0.0}, true); for (size_t i = 0; i < t3->getSize(); ++i) { - loss = graph::add(loss, graph::get(t3, i)); + loss = cgraph::add(loss, cgraph::get(t3, i)); } loss->backward(); @@ -92,9 +92,9 @@ TEST(AutogradTest, MatMul) { TEST(AutogradTest, ChainRule) { auto x = TensorFunctions::makeSharedTensor({1}, {2.0}, true); - auto y = graph::mul(x, x); // y = x^2 - auto z = graph::add(x, y); // z = x^2 + x - auto loss = graph::mul(z, z); // loss = (x^2 + x)^2 + auto y = cgraph::mul(x, x); // y = x^2 + auto z = cgraph::add(x, y); // z = x^2 + x + auto loss = cgraph::mul(z, z); // loss = (x^2 + x)^2 loss->backward(); @@ -106,10 +106,10 @@ TEST(AutogradTest, ChainRule) { TEST(AutogradTest, MultiVariateChainRule) { auto x = TensorFunctions::makeSharedTensor({2}, {1.0, 2.0}, true); - auto y = graph::mul(x, 3.0); // y = [3, 6] + auto y = cgraph::mul(x, 3.0); // y = [3, 6] auto loss = TensorFunctions::makeSharedTensor({1}, {0.0}, true); for(int i=0; igetSize(); i++){ - loss = graph::add(loss, graph::get(y, i)); + loss = cgraph::add(loss, cgraph::get(y, i)); } // loss = 9 loss->backward(); @@ -124,10 +124,10 @@ TEST(AutogradTest, MultiVariateChainRule) { TEST(AutogradTest, ReLU) { auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); - auto relu = activation::ReLu(); + auto relu = module::ReLu(); auto y = relu(x); // [0, 0, 2] - auto loss = graph::sumTensor(y); // loss = 2 + auto loss = cgraph::sumTensor(y); // loss = 2 loss->backward(); @@ -141,10 +141,10 @@ TEST(AutogradTest, LeakyReLU) { auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); constexpr ftype eps = 0.3; - auto relu = activation::LeakyReLu(eps); + auto relu = module::LeakyReLu(eps); auto y = relu(x); // [0, 0, 2] - auto loss = graph::sumTensor(y); // loss = 2 + auto loss = cgraph::sumTensor(y); // loss = 2 loss->backward(); diff --git a/tests/backend/test_networks.cpp b/tests/backend/test_networks.cpp index e266894..dc78ba0 100644 --- 
a/tests/backend/test_networks.cpp +++ b/tests/backend/test_networks.cpp @@ -11,16 +11,16 @@ #include -#include "layers/ff_layer.h" +#include "module/layers/ff_layer.h" -#include "activation_functions/relu.h" -#include "activation_functions/leaky_relu.h" -#include "activation_functions/softmax.h" +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" #include "data_modeling/tensor_functions.h" -using namespace layers; -using namespace activation; +using namespace module; +using namespace module; TEST(ActivationTest, TestRelu1) { auto t1 = TensorFunctions::Ones({3, 2}, false); @@ -73,7 +73,7 @@ TEST(LayerTest, TestFfLayer) { auto t1 = TensorFunctions::Ones({3, 2}, false); auto layer = FfLayer({2, 1}, true, false); - auto res = layer.forward(t1); + auto res = layer(t1); ASSERT_EQ(res.getDims(), Dimension({3, 1})); } \ No newline at end of file diff --git a/tests/backend/test_training.cpp b/tests/backend/test_training.cpp index fa4b054..57f0570 100644 --- a/tests/backend/test_training.cpp +++ b/tests/backend/test_training.cpp @@ -11,6 +11,6 @@ #include -#include "activation_functions/relu.h" +#include "module/activation_functions/relu.h" #include "data_modeling/tensor_functions.h" From db2896511792a041019d5ae12d2ebfba596213e3 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Fri, 13 Mar 2026 15:57:14 +0100 Subject: [PATCH 10/24] Updating module and dim type --- src/backend/CMakeLists.txt | 4 +--- src/backend/data_modeling/dim_type.cpp | 5 ++--- src/backend/data_modeling/dim_type.h | 14 +------------- src/backend/module/module_base.cpp | 9 --------- src/backend/module/module_base.h | 6 +++--- tests/CMakeLists.txt | 3 +-- .../{test_networks.cpp => test_module.cpp} | 0 tests/backend/test_training.cpp | 16 ---------------- 8 files changed, 8 insertions(+), 49 deletions(-) rename tests/backend/{test_networks.cpp => test_module.cpp} (100%) delete mode 100644 
tests/backend/test_training.cpp diff --git a/src/backend/CMakeLists.txt b/src/backend/CMakeLists.txt index a00f2be..ed6bade 100644 --- a/src/backend/CMakeLists.txt +++ b/src/backend/CMakeLists.txt @@ -1,9 +1,7 @@ file(GLOB_RECURSE CORE_SOURCES - activation_functions/*.cpp computational_graph/*.cpp data_modeling/*.cpp - layers/*.cpp - networks/*.cpp + module/*.cpp system/*.cpp training/*.cpp utility/*.cpp diff --git a/src/backend/data_modeling/dim_type.cpp b/src/backend/data_modeling/dim_type.cpp index 1f2e40c..b7aee8c 100644 --- a/src/backend/data_modeling/dim_type.cpp +++ b/src/backend/data_modeling/dim_type.cpp @@ -18,9 +18,6 @@ using namespace std; tensorDim_t Dimension::multVector(const std::vector& dims) const noexcept { - if(dims.size()==0) - return 0; - tensorDim_t res = 1; #ifndef NDEBUG @@ -42,6 +39,7 @@ tensorDim_t Dimension::multVector(const std::vector& dims) const no void Dimension::resize(const std::vector& dims) { this->dims = dims; size = multVector(dims); + assert(size>0); } /** @@ -55,6 +53,7 @@ void Dimension::swap(const tensorDim_t dim1, const tensorDim_t dim2) { Dimension::Dimension(const vector& dims) : dims{dims} { size = multVector(dims); + assert(size>0); } Dimension::Dimension(const Dimension& other) : dims{other.dims}, size{other.size} { } diff --git a/src/backend/data_modeling/dim_type.h b/src/backend/data_modeling/dim_type.h index 336d0b5..16b33ad 100644 --- a/src/backend/data_modeling/dim_type.h +++ b/src/backend/data_modeling/dim_type.h @@ -49,6 +49,7 @@ class Dimension final { } tensorDim_t getItem(int idx) const { + assert(size>0); if(idx<0){ idx = dims.size() + idx; // -1 is last idx, -2 second last and so forth } @@ -66,19 +67,6 @@ class Dimension final { return dims.size(); } - /** - * @brief Returns empty dims. Used e.g. to identify dimensions - * of activation functions. 
- */ - static const Dimension& getEmpty() { - static const auto emptyDims = Dimension(std::vector()); - return emptyDims; - } - - bool empty() const noexcept { - return size > 0; - } - bool operator==(const Dimension& other) const { assert(size!=0); return this->dims == other.dims; diff --git a/src/backend/module/module_base.cpp b/src/backend/module/module_base.cpp index 7617da5..33f6d71 100644 --- a/src/backend/module/module_base.cpp +++ b/src/backend/module/module_base.cpp @@ -16,15 +16,6 @@ using namespace std; using namespace module; -void ModuleBase::print(ostream& os) const noexcept { - if(weights){ - os << "Weigths:\n" << *weights; - } - if(bias){ - os << "\nBias:\n" << *bias; - } -} - ostream& module::operator<<(ostream& os, const ModuleBase& l) noexcept { l.print(os); // calling vtable return os; diff --git a/src/backend/module/module_base.h b/src/backend/module/module_base.h index d6d4cc9..e08bd9c 100644 --- a/src/backend/module/module_base.h +++ b/src/backend/module/module_base.h @@ -38,12 +38,12 @@ namespace module { // for inference -> no graph creation virtual Tensor operator()(const Tensor& input) const = 0; - // for training -> creates graph virtual std::shared_ptr operator()(const std::shared_ptr& input) const = 0; - virtual std::vector> parameters() const { return {}; } - virtual void print(std::ostream& os) const noexcept; + virtual std::vector< std::shared_ptr > parameters() const { return {}; } + + virtual void print(std::ostream& os) const noexcept {}; friend std::ostream& operator<<(std::ostream& os, const ModuleBase& t) noexcept; }; } \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d0a409f..d6f3ba4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,8 +9,7 @@ FetchContent_MakeAvailable(googletest) add_executable(unit_tests_backend backend/test_data_modeling.cpp backend/test_computational_graph.cpp - backend/test_networks.cpp - backend/test_training.cpp + backend/test_module.cpp ) 
target_link_libraries(unit_tests_backend PRIVATE diff --git a/tests/backend/test_networks.cpp b/tests/backend/test_module.cpp similarity index 100% rename from tests/backend/test_networks.cpp rename to tests/backend/test_module.cpp diff --git a/tests/backend/test_training.cpp b/tests/backend/test_training.cpp deleted file mode 100644 index 57f0570..0000000 --- a/tests/backend/test_training.cpp +++ /dev/null @@ -1,16 +0,0 @@ -/** - * @file test_layers.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-03-09 - * - * @copyright Copyright (c) 2026 - * - */ - -#include - -#include "module/activation_functions/relu.h" -#include "data_modeling/tensor_functions.h" - From 26630ca334beef7bcc6f6397276676253004838a Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sat, 14 Mar 2026 12:09:04 +0100 Subject: [PATCH 11/24] Restructuring to have cleaner and more modular Python interface. Bringing networks and training utils into Python binding --- python_lib/dl_lib/__init__.py | 6 +- python_lib/dl_lib/nn/__init__.py | 6 +- python_lib/dl_lib/nn/activation/__init__.py | 4 + python_lib/dl_lib/nn/module.py | 7 +- python_lib/dl_lib/sys/__init__.py | 2 +- python_lib/dl_lib/train/__init__.py | 4 + python_lib/dl_lib/train/loss/__init__.py | 4 + python_lib/dl_lib/train/optim/__init__.py | 4 + src/CMakeLists.txt | 13 +- src/backend/module/layers/ff_layer.cpp | 10 +- src/backend/module/layers/ff_layer.h | 6 +- src/backend/module/module_base.cpp | 3 +- .../training/trainers/base_train_loop.cpp | 2 +- .../training/trainers/base_train_loop.h | 7 +- src/python/CMakeLists.txt | 19 +- src/python/py_network/py_network.cpp | 198 ------------------ src/python/py_nn/py_nn.cpp | 75 +++++++ .../py_network_util.h => py_nn/py_nn_util.h} | 5 +- src/python/py_train/py_train.cpp | 54 +++++ tests/CMakeLists.txt | 1 + tests/backend/test_training.cpp | 131 ++++++++++++ 21 files changed, 330 insertions(+), 231 deletions(-) create mode 100644 
python_lib/dl_lib/nn/activation/__init__.py create mode 100644 python_lib/dl_lib/train/__init__.py create mode 100644 python_lib/dl_lib/train/loss/__init__.py create mode 100644 python_lib/dl_lib/train/optim/__init__.py delete mode 100644 src/python/py_network/py_network.cpp create mode 100644 src/python/py_nn/py_nn.cpp rename src/python/{py_network/py_network_util.h => py_nn/py_nn_util.h} (90%) create mode 100644 src/python/py_train/py_train.cpp create mode 100644 tests/backend/test_training.cpp diff --git a/python_lib/dl_lib/__init__.py b/python_lib/dl_lib/__init__.py index 9817df9..e7f6844 100644 --- a/python_lib/dl_lib/__init__.py +++ b/python_lib/dl_lib/__init__.py @@ -1,3 +1,5 @@ -from ._compiled._core import Tensor, Dimension, Device, Ones, Zeros, Gaussian +from ._compiled._core import Tensor, Dimension, Device -__all__ = ['Tensor', 'Device', 'Dimension'] \ No newline at end of file +__all__ = ['Tensor', 'Device', 'Dimension'] + +__version__ = "0.2.0" \ No newline at end of file diff --git a/python_lib/dl_lib/nn/__init__.py b/python_lib/dl_lib/nn/__init__.py index 11a7537..5ce8bb2 100644 --- a/python_lib/dl_lib/nn/__init__.py +++ b/python_lib/dl_lib/nn/__init__.py @@ -1,5 +1,5 @@ -#from module import Module -#from .._compiled._layers import FfLayer, ReLU +from .module import Module +from dl_lib._compiled._nn import FfLayer #from .._compiled._core import Tensor # re-export if needed -#__all__ = ['FfLayer', 'ReLU'] \ No newline at end of file +__all__ = ['Module', 'FfLayer'] \ No newline at end of file diff --git a/python_lib/dl_lib/nn/activation/__init__.py b/python_lib/dl_lib/nn/activation/__init__.py new file mode 100644 index 0000000..0ab1bab --- /dev/null +++ b/python_lib/dl_lib/nn/activation/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._nn import ReLU, LeakyReLU, Softmax +#from .._compiled._core import Tensor # re-export if needed + +__all__ = ['ReLU', 'LeakyReLU', 'Softmax'] \ No newline at end of file diff --git a/python_lib/dl_lib/nn/module.py 
b/python_lib/dl_lib/nn/module.py index 2c00b32..755ed13 100644 --- a/python_lib/dl_lib/nn/module.py +++ b/python_lib/dl_lib/nn/module.py @@ -3,9 +3,12 @@ modules when defining graphs via Module. """ -class Module: +from .._compiled._nn import _Module + +class Module(_Module): def __init__(self): - self._modules = {} + object.__setattr__(self, "_modules", {}) # not necessary, but more explicit + self._modules = {} """ Stores attributes defined in __init__ in private diff --git a/python_lib/dl_lib/sys/__init__.py b/python_lib/dl_lib/sys/__init__.py index 51cbded..0401df8 100644 --- a/python_lib/dl_lib/sys/__init__.py +++ b/python_lib/dl_lib/sys/__init__.py @@ -1 +1 @@ -from .._compiled._sys import getGlobalDevice, setGlobalDevice +from dl_lib._compiled._sys import getGlobalDevice, setGlobalDevice \ No newline at end of file diff --git a/python_lib/dl_lib/train/__init__.py b/python_lib/dl_lib/train/__init__.py new file mode 100644 index 0000000..9614d89 --- /dev/null +++ b/python_lib/dl_lib/train/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._train import TrainLoop +#from dl_lib._compiled._core import Tensor # re-export if needed + +__all__ = ['TrainLoop'] \ No newline at end of file diff --git a/python_lib/dl_lib/train/loss/__init__.py b/python_lib/dl_lib/train/loss/__init__.py new file mode 100644 index 0000000..cba1a96 --- /dev/null +++ b/python_lib/dl_lib/train/loss/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._train import BCE, CrossEntropy +#from dl_lib._compiled._core import Tensor # re-export if needed + +__all__ = ['BCE', 'CrossEntropy'] \ No newline at end of file diff --git a/python_lib/dl_lib/train/optim/__init__.py b/python_lib/dl_lib/train/optim/__init__.py new file mode 100644 index 0000000..a6669c6 --- /dev/null +++ b/python_lib/dl_lib/train/optim/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._train import SGD, RmsProp +#from dl_lib._compiled._core import Tensor # re-export if needed + +__all__ = ['SGD', 'RmsProp'] \ No newline at end 
of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 15b045c..d47e4d1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,12 +11,12 @@ target_include_directories(_core PRIVATE ${Python_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) -target_link_libraries(_network PRIVATE +target_link_libraries(_nn PRIVATE ${Boost_LIBRARIES} ${PYTHON_LIBRARIES} BackendCore) -target_include_directories(_network PRIVATE +target_include_directories(_nn PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) @@ -26,5 +26,14 @@ target_link_libraries(_sys PRIVATE BackendCore) target_include_directories(_sys PRIVATE + ${PYTHON_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS}) + +target_link_libraries(_train PRIVATE + ${Boost_LIBRARIES} + ${PYTHON_LIBRARIES} + BackendCore) + +target_include_directories(_train PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) \ No newline at end of file diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp index 8612bf2..5c192d5 100644 --- a/src/backend/module/layers/ff_layer.cpp +++ b/src/backend/module/layers/ff_layer.cpp @@ -33,7 +33,9 @@ FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGra */ FfLayer::FfLayer(const vector& dims, Device d, bool useBias, bool requiresGrad) : useBias{useBias}, requiresGrad{requiresGrad} { - assert(dims.size()==2); + if(dims.size()!=2){ + __throw_runtime_error("FfLayer needs only two dims, that's it."); + } weights = make_shared(Dimension({dims[0], dims[1]}), d, requiresGrad); TensorFunctions::ToGaussian(*weights); @@ -72,6 +74,8 @@ std::shared_ptr FfLayer::operator()(const std::shared_ptr& input } void FfLayer::print(ostream& os) const noexcept { - ModuleBase::print(os); - os << "\nuseBias: " << useBias ? 
"true" : "false"; + os << "Weigths:\n" << *weights; + if(bias){ + os << "\nBias:\n" << *bias; + } } \ No newline at end of file diff --git a/src/backend/module/layers/ff_layer.h b/src/backend/module/layers/ff_layer.h index bf7a174..478b457 100644 --- a/src/backend/module/layers/ff_layer.h +++ b/src/backend/module/layers/ff_layer.h @@ -32,10 +32,8 @@ namespace module { std::shared_ptr operator()(const std::shared_ptr& input) const override; const Dimension& getDims() const { - if(weights){ - return weights->getDims(); - } - return Dimension::getEmpty(); + assert(weights); + return weights->getDims(); } auto getWeights() const noexcept { return weights; } diff --git a/src/backend/module/module_base.cpp b/src/backend/module/module_base.cpp index 33f6d71..ef4e426 100644 --- a/src/backend/module/module_base.cpp +++ b/src/backend/module/module_base.cpp @@ -14,9 +14,8 @@ #include using namespace std; -using namespace module; -ostream& module::operator<<(ostream& os, const ModuleBase& l) noexcept { +ostream& operator<<(ostream& os, const module::ModuleBase& l) noexcept { l.print(os); // calling vtable return os; } \ No newline at end of file diff --git a/src/backend/training/trainers/base_train_loop.cpp b/src/backend/training/trainers/base_train_loop.cpp index f9cc903..6cdfce5 100644 --- a/src/backend/training/trainers/base_train_loop.cpp +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -39,7 +39,7 @@ void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool auto xBatch = make_shared(x->getSlice(batchSpan)); auto yBatch = y->getSlice(batchSpan); - auto yPred = graph->operator()(xBatch); + auto yPred = (*graph)(xBatch); auto l = (*loss)(yBatch, yPred); l->backward(); diff --git a/src/backend/training/trainers/base_train_loop.h b/src/backend/training/trainers/base_train_loop.h index a3ed108..0348b5f 100644 --- a/src/backend/training/trainers/base_train_loop.h +++ b/src/backend/training/trainers/base_train_loop.h @@ -33,11 +33,10 @@ namespace train { 
std::shared_ptr graph; public: - BaseTrainLoop(std::shared_ptr& graph, std::shared_ptr loss, + BaseTrainLoop(std::shared_ptr graph, std::shared_ptr loss, std::shared_ptr optim, ftype lr, size_t epochs, tensorDim_t bsize) - : graph{std::move(graph)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} { - - }; + : graph{std::move(graph)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} + { } ~BaseTrainLoop() noexcept = default; diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index c0fb746..405d1f3 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -10,16 +10,19 @@ add_library(_core MODULE py_core/py_core_util.cpp ) -add_library(_network MODULE - py_network/py_network.cpp - #py_network/py_network_util.cpp +add_library(_nn MODULE + py_nn/py_nn.cpp ) add_library(_sys MODULE py_sys/py_sys.cpp ) -set_target_properties(_core _sys _network PROPERTIES +add_library(_train MODULE + py_train/py_train.cpp + ) + +set_target_properties(_core _nn _sys _train PROPERTIES PREFIX "" INSTALL_RPATH "$ORIGIN" # to find shared backend-core lib BUILD_WITH_INSTALL_RPATH TRUE # use install RPATH even during build @@ -28,5 +31,11 @@ set_target_properties(_core _sys _network PROPERTIES set_target_properties(_core PROPERTIES OUTPUT_NAME "_core") +set_target_properties(_nn PROPERTIES + OUTPUT_NAME "_nn") + set_target_properties(_sys PROPERTIES - OUTPUT_NAME "_sys") \ No newline at end of file + OUTPUT_NAME "_sys") + +set_target_properties(_train PROPERTIES + OUTPUT_NAME "_train") \ No newline at end of file diff --git a/src/python/py_network/py_network.cpp b/src/python/py_network/py_network.cpp deleted file mode 100644 index bea321e..0000000 --- a/src/python/py_network/py_network.cpp +++ /dev/null @@ -1,198 +0,0 @@ -/** - * @file layers.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-11-17 - * - * @copyright Copyright (c) 2025 - * - */ - -#include 
"py_network_util.h" -#include "python_templates.h" -#include "utility/global_params.h" - -#include "training/loss_functions/bce_loss.h" -#include "training/loss_functions/crossentropy_loss.h" - -#include "training/optimizers/sgd.h" -#include "training/optimizers/rmsprop.h" - -#include "training/trainers/base_train_loop.h" - -#include - -BOOST_PYTHON_MODULE(py_layers) -{ - using namespace Py_Util; - - using namespace boost::python; - - #define WRAP_METHOD_ONE_TENSORARG(T, method) \ - +[](const T& self, Tensor& t) -> std::shared_ptr { \ - return (self.*method)(t.getSharedPtr()); \ - } - - #define WRAP_METHOD_TWO_TENSORARGS(T, method) \ - +[](const T& self, Tensor& t1, Tensor& t2) -> std::shared_ptr { \ - return (self.*method)(t1.getSharedPtr(), t2.getSharedPtr()); \ - } - - // Networks - class_, boost::noncopyable>("Module", no_init) - // operators - .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::ModuleBaseWrapper, Py_Network::moduleForward)) - .def("__str__", &toString) - ; - - class_, boost::noncopyable>("FfLayer", no_init) - // init - .def(init&>()) - .def(init&, bool>()) - .def(init&, bool, bool>()) - .def(init&, Device>()) - .def(init&, Device, bool>()) - .def(init&, Device, bool, bool>()) - // methods - .add_property("dims", make_function(&module::FfLayer::getDims, return_internal_reference<>())) - .add_property("weights", &module::FfLayer::getWeights) - .add_property("bias", &module::FfLayer::getBias) - .add_property("params", &module::ModuleBase::getParams) - // operators - .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::FfLayer, Py_Network::ffForward)) - .def("__str__", &toString) - ; - - class_, boost::noncopyable>("ReLU") - .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::ReLu, Py_Network::reluF)) - .def("__str__", &toString) - ; - - class_, boost::noncopyable>("LeakyReLU", init()) - .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::LeakyReLu, Py_Network::leakyReluF)) - .def("__str__", &toString) - ; - - class_, boost::noncopyable>("Softmax") - 
.def("__call__", WRAP_METHOD_ONE_TENSORARG(module::Softmax, Py_Network::softmaxF)) - .def("__str__", &toString) - ; - - // Loss functions - class_, boost::noncopyable>("BCE") - .def("__call__", &train::BceLoss::operator()) - ; - - class_, boost::noncopyable>("CrossEntropy") - .def("__call__", &train::CrossEntropyLoss::operator()) - ; - - // Optimizers - class_, boost::noncopyable>("SGD", no_init) - .def(init >, ftype>()) - .def("step", &train::SgdOptimizer::step) - ; - - class_, boost::noncopyable>("RmsProp", no_init) - .def(init >, ftype, ftype>()) - .def("step", &train::RmsPropOptimizer::step) - ; - - // Trainers - class_, boost::noncopyable>("TrainLoop", no_init) - .def(init&, std::shared_ptr, std::shared_ptr, - ftype, size_t, tensorDim_t>()) - .def("step", &train::RmsPropOptimizer::step) - ; -} - -/* -ftype Py_module::layerGetItem(const module::ModuleBase& self, boost::python::object index) { - extract int_extractor(index); - - // Single integer index (1D) - if(int_extractor.check()) { - int i0 = int_extractor(); - return self.getItem(i0); - } - - // Tuple index (2D, 3D, or 4D) - extract tuple_extractor(index); - if(tuple_extractor.check()) { - tuple idx_tuple = tuple_extractor(); - int ndim = boost::python::len(idx_tuple); - - if (ndim == 2) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - return self.getItem(i0, i1); - } - else if (ndim == 3) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - return self.getItem(i0, i1, i2); - } - else if (ndim == 4) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - int i3 = extract(idx_tuple[3]); - return self.getItem(i0, i1, i2, i3); - } - else { - PyErr_SetString(PyExc_IndexError, "Unsupported number of dimensions"); - throw_error_already_set(); - } - } - - PyErr_SetString(PyExc_TypeError, "Index must be an integer or tuple"); - throw_error_already_set(); - return 0.0; // Never reached -} - 
-void Py_module::layerSetItem(module::ModuleBase& self, boost::python::object index, ftype value) { - extract int_extractor(index); - - // Single integer index (1D) - if(int_extractor.check()) { - int i0 = int_extractor(); - self.setItem(value, i0);\ - return; - } - - // Tuple index (2D, 3D, or 4D) - extract tuple_extractor(index); - if(tuple_extractor.check()) { - tuple idx_tuple = tuple_extractor(); - int ndim = boost::python::len(idx_tuple); - - if (ndim == 2) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - self.setItem(value, i0, i1); - } - else if (ndim == 3) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - self.setItem(value, i0, i1, i2); - } - else if (ndim == 4) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - int i3 = extract(idx_tuple[3]); - self.setItem(value, i0, i1, i2, i3); - } - else { - PyErr_SetString(PyExc_IndexError, "Unsupported number of dimensions"); - throw_error_already_set(); - } - return; - } - - PyErr_SetString(PyExc_TypeError, "Index must be an integer or tuple"); - throw_error_already_set(); -}*/ \ No newline at end of file diff --git a/src/python/py_nn/py_nn.cpp b/src/python/py_nn/py_nn.cpp new file mode 100644 index 0000000..a8470ca --- /dev/null +++ b/src/python/py_nn/py_nn.cpp @@ -0,0 +1,75 @@ +/** + * @file layers.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-11-17 + * + * @copyright Copyright (c) 2025 + * + */ + +#include "py_nn_util.h" +#include "python_templates.h" +#include "utility/global_params.h" + +#include + +BOOST_PYTHON_MODULE(_nn) +{ + using namespace Py_Util; + + using namespace boost::python; + + #define WRAP_METHOD_ONE_TENSORARG(T, method) \ + +[](const T& self, Tensor& t) -> std::shared_ptr { \ + return (self.*method)(t.getSharedPtr()); \ + } + + #define WRAP_METHOD_TWO_TENSORARGS(T, method) \ + +[](const T& self, 
Tensor& t1, Tensor& t2) -> std::shared_ptr { \ + return (self.*method)(t1.getSharedPtr(), t2.getSharedPtr()); \ + } + + // Networks + class_, boost::noncopyable>("_Module", no_init) + // methods + .def("_own_parameters", &module::ModuleBase::parameters) + // operators + .def("forward", pure_virtual(WRAP_METHOD_ONE_TENSORARG(Py_nn::ModuleBaseWrapper, Py_nn::moduleForward))) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("FfLayer", no_init) + // init + .def(init&>()) + .def(init&, bool>()) + .def(init&, bool, bool>()) + .def(init&, Device>()) + .def(init&, Device, bool>()) + .def(init&, Device, bool, bool>()) + // methods + .add_property("dims", make_function(&module::FfLayer::getDims, return_internal_reference<>())) + .add_property("weights", &module::FfLayer::getWeights) + .add_property("bias", &module::FfLayer::getBias) + .add_property("params", &module::ModuleBase::parameters) + // operators + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::FfLayer, Py_nn::ffForward)) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("ReLU") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::ReLu, Py_nn::reluF)) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("LeakyReLU", init()) + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::LeakyReLu, Py_nn::leakyReluF)) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("Softmax") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::Softmax, Py_nn::softmaxF)) + .def("__str__", &toString) + ; +} \ No newline at end of file diff --git a/src/python/py_network/py_network_util.h b/src/python/py_nn/py_nn_util.h similarity index 90% rename from src/python/py_network/py_network_util.h rename to src/python/py_nn/py_nn_util.h index 746923c..766589a 100644 --- a/src/python/py_network/py_network_util.h +++ b/src/python/py_nn/py_nn_util.h @@ -24,12 +24,9 @@ #include #include -namespace Py_Network { +namespace Py_nn { using namespace boost::python; - ftype layerGetItem(const 
module::ModuleBase& self, boost::python::object index); - void layerSetItem(module::ModuleBase& self, boost::python::object index, ftype value); - /** * @brief Wrapper class needed for Boost Python to get the virtual function working * the way it is intended. See documentation here: diff --git a/src/python/py_train/py_train.cpp b/src/python/py_train/py_train.cpp new file mode 100644 index 0000000..aa07ef2 --- /dev/null +++ b/src/python/py_train/py_train.cpp @@ -0,0 +1,54 @@ +/** + * @file py_train.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "utility/global_params.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" + +#include "training/optimizers/sgd.h" +#include "training/optimizers/rmsprop.h" + +#include "training/trainers/base_train_loop.h" + +BOOST_PYTHON_MODULE(_train) +{ + using namespace boost::python; + + // Loss functions + class_, boost::noncopyable>("BCE") + .def("__call__", &train::BceLoss::operator()) + ; + + class_, boost::noncopyable>("CrossEntropy") + .def("__call__", &train::CrossEntropyLoss::operator()) + ; + + // Optimizers + class_, boost::noncopyable>("SGD", no_init) + .def(init >, ftype>()) + .def("step", &train::SgdOptimizer::step) + ; + + class_, boost::noncopyable>("RmsProp", no_init) + .def(init >, ftype, ftype>()) + .def("step", &train::RmsPropOptimizer::step) + ; + + // Trainers + class_, boost::noncopyable>("TrainLoop", no_init) + .def(init&, std::shared_ptr, std::shared_ptr, + ftype, size_t, tensorDim_t>()) + .def("step", &train::RmsPropOptimizer::step) + ; +} \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d6f3ba4..a94e089 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,6 +10,7 @@ add_executable(unit_tests_backend backend/test_data_modeling.cpp backend/test_computational_graph.cpp 
backend/test_module.cpp + backend/test_training.cpp ) target_link_libraries(unit_tests_backend PRIVATE diff --git a/tests/backend/test_training.cpp b/tests/backend/test_training.cpp new file mode 100644 index 0000000..6c39812 --- /dev/null +++ b/tests/backend/test_training.cpp @@ -0,0 +1,131 @@ +/** + * @file test_training.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "module/networks/sequential.h" +#include "module/layers/ff_layer.h" + +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" + +#include "training/optimizers/sgd.h" +#include "training/optimizers/rmsprop.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" + +#include "training/trainers/base_train_loop.h" + +#include "data_modeling/tensor_functions.h" + +static std::shared_ptr makeBinaryNet() { + auto net = std::make_shared(); + + net->append(std::make_shared( + std::vector{2, 4}, true, true)); + + net->append(std::make_shared()); + + net->append(std::make_shared( + std::vector{4, 1}, true, true)); + // BCE expects raw logits or sigmoid output — adjust if needed + return net; +} + +static std::shared_ptr makeMulticlassNet() { + auto net = std::make_shared(); + + net->append(std::make_shared( + std::vector{2, 8}, true, true)); + + net->append(std::make_shared()); + + net->append(std::make_shared( + std::vector{8, 3}, true, true)); + + net->append(std::make_shared()); + return net; +} + +// ─── binary overfit ───────────────────────────────────────────────────────── + +TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { + // XOR-like: 4 samples, 2 features, binary labels + auto x = TensorFunctions::makeSharedTensor( + {4, 2}, {0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0}, false); + + auto y = 
TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, + 1.0, + 1.0, + 0.0}, false); + + auto net = makeBinaryNet(); + auto loss = std::make_shared(); + auto optim = std::make_shared( + net->parameters(), /*lr=*/0.01); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*lr=*/static_cast(0.01), /*epochs=*/2000, /*bsize=*/static_cast(4)); + + trainLoop.run(x, y, /*shuffle=*/false); + + // forward one more time to get final loss + auto pred = (*net)(x); + auto finalLoss = (*loss)(*pred, y); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit binary dataset"; +} + + +// ─── multiclass overfit ────────────────────────────────────────────────────── + +TEST(OverfitTest, CrossEntropy_RMSProp_OverfitsSmallDataset) { + // 6 samples, 2 features, 3 classes + auto x = TensorFunctions::makeSharedTensor( + {6, 2}, {1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6}, false); + + // one-hot encoded labels + auto y = TensorFunctions::makeSharedTensor( + {6, 3}, {1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0}, false); + + auto net = makeMulticlassNet(); + auto loss = std::make_shared(); + auto optim = std::make_shared( + net->parameters(), /*lr=*/0.001, /*decay=*/0.9); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*lr=*/0.001, /*epochs=*/2000, /*bsize=*/6); + + trainLoop.run(x, y, /*shuffle=*/false); + + auto pred = (*net)(x); + auto finalLoss = (*loss)(*pred, y); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit multiclass dataset"; +} \ No newline at end of file From c35e52672982076c37368f81801d45e2bbf8e7ac Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sat, 14 Mar 2026 18:17:17 +0100 Subject: [PATCH 12/24] Finished implementing training loop. 
Needs debugging --- .gitignore | 7 +- CMakeLists.txt | 7 +- readme.md | 3 +- .../activation_functions/leaky_relu_node.cpp | 2 +- .../activation_functions/relu_node.cpp | 2 +- .../activation_functions/sigmoid_node.cpp | 35 ++++ .../activation_functions/sigmoid_node.h | 32 ++++ .../loss_functions/bce_node.cpp | 37 +++++ .../loss_functions/bce_node.h | 36 +++++ .../loss_functions/crossentropy_node.cpp | 38 +++++ .../loss_functions/crossentropy_node.h | 45 ++++++ .../loss_functions/rsme_node.cpp | 38 +++++ .../loss_functions/rsme_node.h | 39 +++++ .../tensor_ops/graph_creation.cpp | 3 +- src/backend/data_modeling/dim_type.h | 4 + src/backend/data_modeling/tensor.cpp | 7 +- src/backend/data_modeling/tensor.h | 2 +- .../data_modeling/tensor_functions.cpp | 4 +- src/backend/data_modeling/tensor_functions.h | 4 +- .../module/activation_functions/leaky_relu.h | 4 +- .../module/activation_functions/sigmoid.cpp | 51 ++++++ .../module/activation_functions/sigmoid.h | 22 +++ src/backend/module/layers/ff_layer.cpp | 2 +- src/backend/module/module_base.cpp | 2 +- src/backend/module/module_base.h | 19 ++- src/backend/module/networks/sequential.cpp | 7 + src/backend/module/networks/sequential.h | 2 + .../training/loss_functions/bce_loss.cpp | 32 ++-- .../training/loss_functions/bce_loss.h | 3 +- .../loss_functions/crossentropy_loss.cpp | 30 ++-- .../loss_functions/crossentropy_loss.h | 3 +- .../training/loss_functions/loss_base.h | 7 +- .../training/loss_functions/rsme_loss.cpp | 55 +++++++ .../training/loss_functions/rsme_loss.h | 22 +++ .../training/optimizers/optimizer_base.cpp | 25 +++ .../training/optimizers/optimizer_base.h | 2 + src/backend/training/optimizers/rmsprop.cpp | 2 +- .../training/trainers/base_train_loop.cpp | 27 +++- .../training/trainers/base_train_loop.h | 8 +- src/python/py_train/py_train.cpp | 4 +- src/python/py_utility/custom_converters.h | 1 + tests/CMakeLists.txt | 3 +- tests/backend/test_losses.cpp | 151 ++++++++++++++++++ 
tests/backend/test_train_loop.cpp | 139 ++++++++++++++++ tests/backend/test_training.cpp | 131 --------------- 45 files changed, 906 insertions(+), 193 deletions(-) create mode 100644 src/backend/computational_graph/activation_functions/sigmoid_node.cpp create mode 100644 src/backend/computational_graph/activation_functions/sigmoid_node.h create mode 100644 src/backend/computational_graph/loss_functions/bce_node.cpp create mode 100644 src/backend/computational_graph/loss_functions/bce_node.h create mode 100644 src/backend/computational_graph/loss_functions/crossentropy_node.cpp create mode 100644 src/backend/computational_graph/loss_functions/crossentropy_node.h create mode 100644 src/backend/computational_graph/loss_functions/rsme_node.cpp create mode 100644 src/backend/computational_graph/loss_functions/rsme_node.h create mode 100644 src/backend/module/activation_functions/sigmoid.cpp create mode 100644 src/backend/training/loss_functions/rsme_loss.cpp create mode 100644 src/backend/training/loss_functions/rsme_loss.h create mode 100644 src/backend/training/optimizers/optimizer_base.cpp create mode 100644 tests/backend/test_losses.cpp create mode 100644 tests/backend/test_train_loop.cpp delete mode 100644 tests/backend/test_training.cpp diff --git a/.gitignore b/.gitignore index ef9f13c..2744889 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ build .vscode -unit_tests_backend *.txt python_lib/dl_lib/_compiled -*__pycache__* \ No newline at end of file +*__pycache__* +*_cache + +# TODO: remove later +benchmarks \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index cdd5dc6..e317ea5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ endif() add_compile_options("$<$:/utf-8>") add_compile_options("$<$:/utf-8>") -option(DOUBLE_PRECISION "Compile with double precision at cost of speed?" OFF) # TODO: not implemented yet +# TODO: add flag for double precision? 
# include python libs if(APPLE) @@ -107,4 +107,9 @@ option(BUILD_TESTS "Build tests" OFF) if(BUILD_TESTS) enable_testing() add_subdirectory(tests) +endif() + +option(BUILD_BENCHMARKS "Build benchmarks" OFF) +if(BUILD_BENCHMARKS) + add_subdirectory(benchmarks) endif() \ No newline at end of file diff --git a/readme.md b/readme.md index 30cef3e..2eeaf95 100644 --- a/readme.md +++ b/readme.md @@ -66,7 +66,8 @@ ctest - Boost Python - Cmake > 3.28 - Python 3 (we test with 3.10, but it should work with any version) -- pytest for unit tests (we use 9.0.2) +- pytest and GTest for unit tests (we use pytest=9.0.2) +- Google Benchmark for benchmarking ## Troubleshooting diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp index 2334992..0557f56 100644 --- a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp @@ -24,7 +24,7 @@ vector> LeakyReLuNode::backward(const Tensor& upstreamGrad) { const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem((*parent)[i] > zero ? 1 : eps, i); + res->setItem((*parent)[i] > zero ? upstreamGrad[i] : upstreamGrad[i] * eps, i); } return {res}; diff --git a/src/backend/computational_graph/activation_functions/relu_node.cpp b/src/backend/computational_graph/activation_functions/relu_node.cpp index 015d7ff..4db7938 100644 --- a/src/backend/computational_graph/activation_functions/relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/relu_node.cpp @@ -24,7 +24,7 @@ vector> ReLuNode::backward(const Tensor& upstreamGrad) { const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem((*parent)[i] > zero ? 1 : zero, i); + res->setItem((*parent)[i] > zero ? 
upstreamGrad[i] : zero, i); } return {res}; diff --git a/src/backend/computational_graph/activation_functions/sigmoid_node.cpp b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp new file mode 100644 index 0000000..a0b0d9c --- /dev/null +++ b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp @@ -0,0 +1,35 @@ +/** + * @file sigmoid_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "sigmoid_node.h" + +#include + +using namespace std; +using namespace cgraph; + +vector> SigmoidNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype zero = 0.0; + + auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + + // s is result from forward pass sigmoid + auto derivative = [](ftype s){ + return s * (1-s); + }; + + for(tensorSize_t i=0; isetItem(derivative((*sigmoid)[i] * upstreamGrad[i]), i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/sigmoid_node.h b/src/backend/computational_graph/activation_functions/sigmoid_node.h new file mode 100644 index 0000000..82e824d --- /dev/null +++ b/src/backend/computational_graph/activation_functions/sigmoid_node.h @@ -0,0 +1,32 @@ +/** + * @file relu_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-15 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "data_modeling/tensor.h" + +#include +#include + +namespace cgraph { + class SigmoidNode final : public GraphNode { + private: + // cache the result of the forward function + std::shared_ptr sigmoid; + + public: + explicit SigmoidNode(std::shared_ptr t, std::shared_ptr sigmoid) + : GraphNode({std::move(t)}), sigmoid{std::move(sigmoid)} {} + + 
std::vector> backward(const Tensor& upstreamGrad) override; + }; +} diff --git a/src/backend/computational_graph/loss_functions/bce_node.cpp b/src/backend/computational_graph/loss_functions/bce_node.cpp new file mode 100644 index 0000000..f1d4507 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_node.cpp @@ -0,0 +1,37 @@ +/** + * @file bce_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "bce_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + for(tensorSize_t i=0; i(bSize); i++){ + auto yi = (*yTrue)[i]; + auto yiHat = (*yPred)[i]; + + constexpr ftype eps = 1e-6; + auto g = -yi/std::max(yiHat, eps) + (1-yi)/std::max(1-yiHat, eps); + res->setItem(g/bSize, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/bce_node.h b/src/backend/computational_graph/loss_functions/bce_node.h new file mode 100644 index 0000000..b3ec915 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_node.h @@ -0,0 +1,36 @@ +/** + * @file bce_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class BceNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + const ftype bSize; + + public: + explicit BceNode(std::shared_ptr y, std::shared_ptr yPred) + : GraphNode({yPred}), yTrue{std::move(y)}, bSize{static_cast(yPred->getDims()[0])} + { + 
assert(yPred->getDims()==yTrue->getDims()); + + if(!yPred->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp new file mode 100644 index 0000000..b28a5e0 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp @@ -0,0 +1,38 @@ +/** + * @file add_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-03 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_node.h" + +#include "data_modeling/tensor_functions.h" + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > CrossEntropyNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + for(tensorDim_t i=0; i(bSize); i++){ + auto yi = (*yTrue)[i]; + + for(tensorDim_t j=0; igetItem(i, j), eps); + + auto g = -yi/yijHat; + res->setItem(g/bSize, i, j); + } + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.h b/src/backend/computational_graph/loss_functions/crossentropy_node.h new file mode 100644 index 0000000..28fc581 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.h @@ -0,0 +1,45 @@ +/** + * @file add_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-03 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class CrossEntropyNode final : public GraphNode { + private: + const std::shared_ptr yTrue; 
+ + const ftype bSize; + const tensorDim_t nClasses; + + public: + + /** + * @brief Expected shapes are same as for CrossEntropyLoss. + * + * @param y shape (batchsize) + * @param yPred shape (batchsize, nclasses) + */ + explicit CrossEntropyNode(std::shared_ptr y, std::shared_ptr yPred) + : GraphNode({yPred}), yTrue{std::move(y)}, bSize{static_cast(yPred->getDims()[0])}, + nClasses{yPred->getDims()[1]} + { + assert(yPred->getDims()[0]==yTrue->getDims()[0]); + if(!yPred->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/rsme_node.cpp b/src/backend/computational_graph/loss_functions/rsme_node.cpp new file mode 100644 index 0000000..6f73b14 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/rsme_node.cpp @@ -0,0 +1,38 @@ +/** + * @file rsme_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "rsme_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > RsmeNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + for(tensorSize_t i=0; i(bSize); i++){ + auto yi = (*yTrue)[i]; + auto yiHat = (*yPred)[i]; + + constexpr ftype eps = 1e-6; + auto denom = rsme * bSize + eps; + auto g = (yi-yiHat) / denom; + res->setItem(g, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/rsme_node.h b/src/backend/computational_graph/loss_functions/rsme_node.h new file mode 100644 index 0000000..74526fc --- /dev/null +++ b/src/backend/computational_graph/loss_functions/rsme_node.h @@
-0,0 +1,39 @@ +/** + * @file rsme_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class RsmeNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + const ftype bSize; + ftype rsme; + + public: + explicit RsmeNode(std::shared_ptr y, std::shared_ptr yPred, ftype rsme) + : GraphNode({yPred}), yTrue{std::move(y)}, bSize{static_cast(yPred->getDims()[0])}, + rsme{rsme} + { + assert(yPred->getDims()==yTrue->getDims()); + + if(!yPred->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/tensor_ops/graph_creation.cpp b/src/backend/computational_graph/tensor_ops/graph_creation.cpp index 130270e..25b2c08 100644 --- a/src/backend/computational_graph/tensor_ops/graph_creation.cpp +++ b/src/backend/computational_graph/tensor_ops/graph_creation.cpp @@ -84,7 +84,8 @@ shared_ptr cgraph::sub(const shared_ptr t, ftype scalar) { shared_ptr cgraph::div(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) / scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, 1 / scalar)); + constexpr ftype eps = 1e-6; + res->setCgNode(std::make_shared(t, 1/std::max(scalar, eps))); assert(res->getRequiresGrad()); } return res; diff --git a/src/backend/data_modeling/dim_type.h b/src/backend/data_modeling/dim_type.h index 16b33ad..b9a1ee4 100644 --- a/src/backend/data_modeling/dim_type.h +++ b/src/backend/data_modeling/dim_type.h @@ -49,6 +49,10 @@ class Dimension final { } tensorDim_t getItem(int idx) const { + return (*this)[idx]; + } + + tensorDim_t operator[](int idx) const { assert(size>0); if(idx<0){ idx = dims.size() + idx; // -1 is 
last idx, -2 second last and so forth diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index a6bf6ed..ff66b6e 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -252,7 +252,7 @@ Tensor Tensor::createEmptyCopy() const { return res; } /** - * @brief Does a deep copy. + * @brief Does a deep copy, but omits gradient and computational graph information. */ Tensor Tensor::createDeepCopy() const { assert(!grads || (grads && !grads->requiresGrad)); // gradient should not require gradient @@ -563,7 +563,10 @@ void Tensor::backward() { auto& tensor = *tPtr; assert(tensor.grads && !tensor.grads->requiresGrad); // gradient should not require grad + cout << "backward of " << tPtr << endl; + cout << "grads " << *tensor.grads << endl; auto incomingGrads = tensor.cgNode->backward(*tensor.grads); + const auto& parents = tensor.cgNode->getParents(); for(size_t i=0; i&& newOrder) noexcept { /** * @brief Populates the tensor with value. 
*/ -void Tensor::reset(const ftype x) { +void Tensor::reset(const ftype x) noexcept { for(tensorSize_t i=0; igetSize(); i++){ (*values)[i] = x; } diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 5c6610c..61adc65 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -174,7 +174,7 @@ class Tensor final : public std::enable_shared_from_this { Tensor(Tensor&& other) noexcept; Tensor& operator=(Tensor&& other) noexcept; - void reset(const ftype x); + void reset(const ftype x) noexcept; void reset(const utility::InitClass ic); const Dimension& getDims() const noexcept; diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp index 97928b9..a878970 100644 --- a/src/backend/data_modeling/tensor_functions.cpp +++ b/src/backend/data_modeling/tensor_functions.cpp @@ -44,11 +44,11 @@ Tensor TensorFunctions::Gaussian(vector dims, const bool requiresGr } // Tensor manipulation -void TensorFunctions::ToZeros(Tensor& t) { +void TensorFunctions::ToZeros(Tensor& t) noexcept { t.reset(0); } -void TensorFunctions::ToOnes(Tensor& t) { +void TensorFunctions::ToOnes(Tensor& t) noexcept { t.reset(1); } diff --git a/src/backend/data_modeling/tensor_functions.h b/src/backend/data_modeling/tensor_functions.h index a410ac6..a79955e 100644 --- a/src/backend/data_modeling/tensor_functions.h +++ b/src/backend/data_modeling/tensor_functions.h @@ -48,8 +48,8 @@ namespace TensorFunctions { // class name acts as namespace for us Device d, bool requiresGrad=false); // Tensor manipulation - void ToZeros(Tensor& t); - void ToOnes(Tensor& t); + void ToZeros(Tensor& t) noexcept; + void ToOnes(Tensor& t) noexcept; void ToGaussian(Tensor& t); // Arithmetics diff --git a/src/backend/module/activation_functions/leaky_relu.h b/src/backend/module/activation_functions/leaky_relu.h index daa8d92..794e41c 100644 --- a/src/backend/module/activation_functions/leaky_relu.h +++ 
b/src/backend/module/activation_functions/leaky_relu.h @@ -25,6 +25,8 @@ namespace module { Tensor operator()(const Tensor& t) const override; std::shared_ptr operator()(const std::shared_ptr& t) const override; - void print(std::ostream& os) const noexcept override { os << "\neps: " << eps; } + void print(std::ostream& os) const noexcept override { + os << "\nLeakyReLU\neps: " << eps; + } }; } diff --git a/src/backend/module/activation_functions/sigmoid.cpp b/src/backend/module/activation_functions/sigmoid.cpp new file mode 100644 index 0000000..1c14c61 --- /dev/null +++ b/src/backend/module/activation_functions/sigmoid.cpp @@ -0,0 +1,51 @@ +/** + * @file sigmoid.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "sigmoid.h" + +#include "computational_graph/activation_functions/sigmoid_node.h" + +#include + +using namespace std; +using namespace module; + +/** + * @brief Sigmoid activation function. 
+ */ +Tensor Sigmoid::operator()(const Tensor& t) const { + auto res = t.createEmptyCopy(); + + auto compute = [](ftype x){ + if(x>=0){ + return static_cast(1.0f) / (static_cast(1.0f) + exp(x)); + } + auto e = exp(x); + return e / (static_cast(1.0f) + e); + }; + + for(tensorSize_t i=0; i Sigmoid::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t, res)); + assert(res->getRequiresGrad()); + } + + return res; +} diff --git a/src/backend/module/activation_functions/sigmoid.h b/src/backend/module/activation_functions/sigmoid.h index e69de29..1cdf8d7 100644 --- a/src/backend/module/activation_functions/sigmoid.h +++ b/src/backend/module/activation_functions/sigmoid.h @@ -0,0 +1,22 @@ +/** + * @file sigmoid.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "module/module_base.h" + +namespace module { + class Sigmoid final : public ModuleBase { + public: + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; + }; +} diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp index 5c192d5..412f6f7 100644 --- a/src/backend/module/layers/ff_layer.cpp +++ b/src/backend/module/layers/ff_layer.cpp @@ -74,7 +74,7 @@ std::shared_ptr FfLayer::operator()(const std::shared_ptr& input } void FfLayer::print(ostream& os) const noexcept { - os << "Weigths:\n" << *weights; + os << "\nFfLayer\nWeigths:\n" << *weights; if(bias){ os << "\nBias:\n" << *bias; } diff --git a/src/backend/module/module_base.cpp b/src/backend/module/module_base.cpp index ef4e426..951f96c 100644 --- a/src/backend/module/module_base.cpp +++ b/src/backend/module/module_base.cpp @@ -15,7 +15,7 @@ using namespace std; -ostream& operator<<(ostream& os, const module::ModuleBase& l) noexcept { 
+ostream& module::operator<<(ostream& os, const module::ModuleBase& l) noexcept { l.print(os); // calling vtable return os; } \ No newline at end of file diff --git a/src/backend/module/module_base.h b/src/backend/module/module_base.h index e08bd9c..28247ab 100644 --- a/src/backend/module/module_base.h +++ b/src/backend/module/module_base.h @@ -20,6 +20,11 @@ #include +// if GCC or Clang +#ifdef __GNUC__ +#include +#endif // __GNUC__ + namespace module { /** * The base class for all the layers that we have. Not instantiable. @@ -43,7 +48,19 @@ namespace module { virtual std::vector< std::shared_ptr > parameters() const { return {}; } - virtual void print(std::ostream& os) const noexcept {}; + virtual void print(std::ostream& os) const noexcept { + os << "\n"; + #ifdef __GNUC__ + // demangle name on gcc and clang + int status; + char* demangled = abi::__cxa_demangle(typeid(*this).name(), nullptr, nullptr, &status); + os << (status == 0 ? demangled : typeid(*this).name()); + std::free(demangled); + #else + os << typeid(*this).name(); + #endif + }; + friend std::ostream& operator<<(std::ostream& os, const ModuleBase& t) noexcept; }; } \ No newline at end of file diff --git a/src/backend/module/networks/sequential.cpp b/src/backend/module/networks/sequential.cpp index fa1190c..e807e51 100644 --- a/src/backend/module/networks/sequential.cpp +++ b/src/backend/module/networks/sequential.cpp @@ -57,4 +57,11 @@ vector> Sequential::parameters() const { void Sequential::append(shared_ptr l) { layers.push_back(move(l)); +} + +void Sequential::print(std::ostream& os) const noexcept { + os << "\nSequential"; + for(const auto& l: layers){ + os << *l; + } } \ No newline at end of file diff --git a/src/backend/module/networks/sequential.h b/src/backend/module/networks/sequential.h index ea55f02..cf13417 100644 --- a/src/backend/module/networks/sequential.h +++ b/src/backend/module/networks/sequential.h @@ -38,5 +38,7 @@ namespace module { std::vector> parameters() const override; 
void append(std::shared_ptr l); + + void print(std::ostream& os) const noexcept override; }; } diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp index bdce4e5..9fb71fc 100644 --- a/src/backend/training/loss_functions/bce_loss.cpp +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -11,35 +11,45 @@ #include "bce_loss.h" +#include "computational_graph/loss_functions/bce_node.h" + #include +#include + using namespace std; using namespace train; /** - * @brief Expected shapes: (batch_size) + * @brief Expected shapes: (batchsize) or (batchsize, 1) * @return Tensor of shape (1) */ -shared_ptr BceLoss::operator()(const Tensor& y, const shared_ptr& ypred) const { - assert(ypred->getRequiresGrad()); - - if(y.getDevice() != ypred->getDevice()){ +shared_ptr BceLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { + if(!ypred->getRequiresGrad()) { + __throw_invalid_argument("ypred must have gradient enabled"); + } + else if(y->getDevice() != ypred->getDevice()){ __throw_invalid_argument("y and ypred must be on same device"); } - else if(y.getDims()!=ypred->getDims()){ + else if(y->getDims()!=ypred->getDims()){ __throw_invalid_argument("Tensors must be of same shape"); } auto bce = [](ftype y, ftype ypred){ - return y*log(ypred) + (1-y)*log(1-ypred); + constexpr ftype eps = 1e-6; + return y*log(std::max(ypred, eps)) + (1-y)*log(std::max(1-ypred, eps)); }; - const auto nBatches = y.getDims().getItem(0); + const auto nBatches = y->getDims()[0]; - ftype res = 0; + ftype loss = 0; for(tensorSize_t i=0; i(std::vector{1}, std::vector{-res / nBatches}, y.getDevice(), true);; + auto res = make_shared(std::vector{1}, std::vector{-loss / nBatches}, y->getDevice(), true); + res->setCgNode(make_shared(y, ypred)); + assert(res->getRequiresGrad()); + + return res; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_loss.h b/src/backend/training/loss_functions/bce_loss.h index 
7ee06dd..9ddc938 100644 --- a/src/backend/training/loss_functions/bce_loss.h +++ b/src/backend/training/loss_functions/bce_loss.h @@ -16,6 +16,7 @@ namespace train { class BceLoss final : public LossBase { public: - std::shared_ptr operator()(const Tensor& y, const std::shared_ptr& ypred) const override; + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const override; }; } diff --git a/src/backend/training/loss_functions/crossentropy_loss.cpp b/src/backend/training/loss_functions/crossentropy_loss.cpp index 1ef3a27..430116f 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.cpp +++ b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -11,6 +11,8 @@ #include "crossentropy_loss.h" +#include "computational_graph/loss_functions/crossentropy_node.h" + #include using namespace std; @@ -20,29 +22,35 @@ using namespace train; * @brief Expected shapes: (batch_size, n_classes) * @return Tensor of shape (1) */ -shared_ptr CrossEntropyLoss::operator()(const Tensor& y, const shared_ptr & ypred) const { - assert(ypred->getRequiresGrad()); - - if(y.getDevice() != ypred->getDevice()){ +shared_ptr CrossEntropyLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { + if(!ypred->getRequiresGrad()) { + __throw_invalid_argument("ypred must have gradient enabled"); + } + else if(y->getDevice() != ypred->getDevice()){ __throw_invalid_argument("y and ypred must be on same device"); } - else if(y.getDims()!=ypred->getDims()){ + else if(y->getDims()!=ypred->getDims()){ __throw_invalid_argument("Tensors must be of same shape"); } auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; - for(tensorDim_t i=0; igetItem(b, i)); + for(tensorDim_t i=0; igetDims().getItem(-1); i++){ + constexpr ftype eps = 1e-6; + res += y->getItem(b, i) * log(std::max(ypred->getItem(b, i), eps)); } return res; }; - const auto nBatches = y.getDims().getItem(0); - ftype res = 0; + const auto nBatches = y->getDims()[0]; + ftype loss = 0; 
for(tensorSize_t b=0; b(std::vector{1}, std::vector{-res / nBatches}, y.getDevice(), true);; + auto res = make_shared(std::vector{1}, std::vector{-loss / nBatches}, y->getDevice(), true); + res->setCgNode(std::make_shared(y, ypred)); + assert(res->getRequiresGrad()); + + return res; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/crossentropy_loss.h b/src/backend/training/loss_functions/crossentropy_loss.h index b91e037..dfd71b3 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.h +++ b/src/backend/training/loss_functions/crossentropy_loss.h @@ -16,6 +16,7 @@ namespace train { class CrossEntropyLoss final : public LossBase { public: - std::shared_ptr operator()(const Tensor& y, const std::shared_ptr& ypred) const override; + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const override; }; } diff --git a/src/backend/training/loss_functions/loss_base.h b/src/backend/training/loss_functions/loss_base.h index ef9ce5e..9da95ea 100644 --- a/src/backend/training/loss_functions/loss_base.h +++ b/src/backend/training/loss_functions/loss_base.h @@ -28,10 +28,7 @@ namespace train { ~LossBase() noexcept = default; - virtual std::shared_ptr operator()(const Tensor& y, const std::shared_ptr& ypred) const = 0; - - std::shared_ptr operator()(const std::shared_ptr& y, const std::shared_ptr& ypred) { - return operator()(*y, ypred); - } + virtual std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const = 0; }; } diff --git a/src/backend/training/loss_functions/rsme_loss.cpp b/src/backend/training/loss_functions/rsme_loss.cpp new file mode 100644 index 0000000..4dde243 --- /dev/null +++ b/src/backend/training/loss_functions/rsme_loss.cpp @@ -0,0 +1,55 @@ +/** + * @file rsme_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "rsme_loss.h" + 
+#include "computational_graph/loss_functions/rsme_node.h" + +#include +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batchsize) or (batchsize, 1) + * @return Tensor of shape (1) + */ +shared_ptr RsmeLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { + if(!ypred->getRequiresGrad()) { + __throw_invalid_argument("ypred must have gradient enabled"); + } + else if(y->getDevice() != ypred->getDevice()){ + __throw_invalid_argument("y and ypred must be on same device"); + } + else if(y->getDims()!=ypred->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto diffPow = [](ftype y, ftype ypred){ + auto diff = y - ypred; + return diff * diff; + }; + + const auto nBatches = y->getDims()[0]; + + ftype loss = 0; + for(tensorSize_t i=0; i(std::vector{1}, std::vector{loss}, y->getDevice(), true); + res->setCgNode(make_shared(y, ypred, loss)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/rsme_loss.h b/src/backend/training/loss_functions/rsme_loss.h new file mode 100644 index 0000000..5028012 --- /dev/null +++ b/src/backend/training/loss_functions/rsme_loss.h @@ -0,0 +1,22 @@ +/** + * @file rsme_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class RsmeLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const override; + }; +} diff --git a/src/backend/training/optimizers/optimizer_base.cpp b/src/backend/training/optimizers/optimizer_base.cpp new file mode 100644 index 0000000..e2a6d8d --- /dev/null +++ b/src/backend/training/optimizers/optimizer_base.cpp @@ -0,0 +1,25 @@ +/** + * @file optimizer_base.cpp + * @author Robert Baumgartner 
(r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "optimizer_base.h" + +#include "data_modeling/tensor_functions.h" + +using namespace train; + +void OptimizerBase::zeroGrad() noexcept{ + for(auto& p: params){ + auto grads = p->getGrads(); + + if(grads) + TensorFunctions::ToZeros(*grads); + } +} \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index f74718b..af4f6ef 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ b/src/backend/training/optimizers/optimizer_base.h @@ -43,5 +43,7 @@ namespace train { OptimizerBase& operator=(OptimizerBase&& other) noexcept = default; virtual void step() = 0; + + void zeroGrad() noexcept; }; } \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp index 8e346ec..10699ba 100644 --- a/src/backend/training/optimizers/rmsprop.cpp +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -33,7 +33,7 @@ void RmsPropOptimizer::step() { movingAvg[tPtr] = make_unique(tPtr->getDims(), tPtr->getDevice(), false); // create empty tensor vPtr = movingAvg[tPtr].get(); for(tensorSize_t i=0; igetSize(); i++) { - auto g = (*tPtr)[i]; + auto g = (*gPtr)[i]; vPtr->setItem((1-decay)*g*g, i); } } diff --git a/src/backend/training/trainers/base_train_loop.cpp b/src/backend/training/trainers/base_train_loop.cpp index 6cdfce5..2a4e9d3 100644 --- a/src/backend/training/trainers/base_train_loop.cpp +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -17,13 +17,20 @@ #include #include +#include + using namespace std; using namespace train; -void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool shuffle) { +void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool shuffle, const bool verbose) { + const auto nSamples = x->getDims().getItem(0); + for(size_t e=0; e 
indices(bsize); - std::iota(indices.begin(), indices.end(), 0); + std::vector indices(nSamples); + std::iota(indices.begin(), indices.end(), 0); + + if(verbose) + cout << "\nEpoch " << e; if(shuffle){ std::random_device rd; @@ -31,19 +38,27 @@ void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool std::shuffle(indices.begin(), indices.end(), rng); } - const auto nSamples = x->getDims().getItem(0); tensorDim_t low = 0; + + int batch = 0; while(low < nSamples){ + if(verbose) + cout << "\nBatch " << batch << endl; + std::span batchSpan(indices.data() + low, low+bsize < nSamples ? bsize : nSamples-low); auto xBatch = make_shared(x->getSlice(batchSpan)); - auto yBatch = y->getSlice(batchSpan); + auto yBatch = make_shared(y->getSlice(batchSpan)); auto yPred = (*graph)(xBatch); + cout << "\nypred: " << *yPred << endl; + auto l = (*loss)(yBatch, yPred); - + cout << "\nloss: " << (*l)[0] << endl; + l->backward(); optim->step(); + optim->zeroGrad(); low += bsize; } diff --git a/src/backend/training/trainers/base_train_loop.h b/src/backend/training/trainers/base_train_loop.h index 0348b5f..3beeb46 100644 --- a/src/backend/training/trainers/base_train_loop.h +++ b/src/backend/training/trainers/base_train_loop.h @@ -23,8 +23,6 @@ namespace train { class BaseTrainLoop { protected: - ftype lr; - const size_t epochs; const tensorDim_t bsize; @@ -34,8 +32,8 @@ namespace train { public: BaseTrainLoop(std::shared_ptr graph, std::shared_ptr loss, - std::shared_ptr optim, ftype lr, size_t epochs, tensorDim_t bsize) - : graph{std::move(graph)}, optim{std::move(optim)}, loss{loss}, lr{lr}, epochs{epochs}, bsize{bsize} + std::shared_ptr optim, size_t epochs, tensorDim_t bsize) + : graph{std::move(graph)}, optim{std::move(optim)}, loss{loss}, epochs{epochs}, bsize{bsize} { } ~BaseTrainLoop() noexcept = default; @@ -46,6 +44,6 @@ namespace train { BaseTrainLoop(BaseTrainLoop&& other) noexcept = default; BaseTrainLoop& operator=(BaseTrainLoop&& other) noexcept = default; - void 
run(std::shared_ptr& x, std::shared_ptr& y, const bool shuffle); + void run(std::shared_ptr& x, std::shared_ptr& y, bool shuffle, bool verbose=true); }; } \ No newline at end of file diff --git a/src/python/py_train/py_train.cpp b/src/python/py_train/py_train.cpp index aa07ef2..35c3854 100644 --- a/src/python/py_train/py_train.cpp +++ b/src/python/py_train/py_train.cpp @@ -47,8 +47,8 @@ BOOST_PYTHON_MODULE(_train) // Trainers class_, boost::noncopyable>("TrainLoop", no_init) - .def(init&, std::shared_ptr, std::shared_ptr, - ftype, size_t, tensorDim_t>()) + .def(init&, std::shared_ptr, + std::shared_ptr, size_t, tensorDim_t>()) .def("step", &train::RmsPropOptimizer::step) ; } \ No newline at end of file diff --git a/src/python/py_utility/custom_converters.h b/src/python/py_utility/custom_converters.h index 4c2d49d..f712da9 100644 --- a/src/python/py_utility/custom_converters.h +++ b/src/python/py_utility/custom_converters.h @@ -52,6 +52,7 @@ namespace custom_converters { }; } +// TODO: do array instead of tensor /* struct DimsFromPython { static void* convertible(PyObject* obj) { if (!PyTuple_Check(obj) && !PyList_Check(obj)) return nullptr; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a94e089..c8ca76b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,7 +10,8 @@ add_executable(unit_tests_backend backend/test_data_modeling.cpp backend/test_computational_graph.cpp backend/test_module.cpp - backend/test_training.cpp + backend/test_losses.cpp + backend/test_train_loop.cpp ) target_link_libraries(unit_tests_backend PRIVATE diff --git a/tests/backend/test_losses.cpp b/tests/backend/test_losses.cpp new file mode 100644 index 0000000..05da8e6 --- /dev/null +++ b/tests/backend/test_losses.cpp @@ -0,0 +1,151 @@ +/** + * @file test_losses.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include 
"data_modeling/tensor_functions.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" + +#include + +using namespace train; + +static constexpr ftype kTol = 1e-4f; + +// ─── CrossEntropy ──────────────────────────────────────────────────────────── + +TEST(LossTest, CrossEntropy_CorrectValue) { + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, + 0.0, 1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {2, 3}, {0.7, 0.2, 0.1, + 0.1, 0.8, 0.1}, true); + + auto loss = CrossEntropyLoss{}; + auto result = loss(y, ypred); + + // expected: -( log(0.7) + log(0.8) ) / 2 = 0.2899 + const ftype expected = -(std::log(0.7f) + std::log(0.8f)) / 2.0f; + EXPECT_NEAR((*result)[0], expected, kTol); +} + +TEST(LossTest, CrossEntropy_PerfectPrediction) { + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, + 0.0, 1.0, 0.0}, false); + + // near-perfect predictions — can't use exactly 1.0 due to log(0) + auto ypred = TensorFunctions::makeSharedTensor( + {2, 3}, {0.999, 0.0005, 0.0005, + 0.0005, 0.999, 0.0005}, true); + + auto loss = CrossEntropyLoss{}; + auto result = loss(y, ypred); + + // loss should be very small + EXPECT_LT((*result)[0], 0.01f); +} + +TEST(LossTest, CrossEntropy_UniformPrediction) { + // uniform prediction should give log(3) ~ 1.0986 + auto y = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0, 0.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0f/3, 1.0f/3, 1.0f/3}, true); + + auto loss = CrossEntropyLoss{}; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], std::log(3.0f), kTol); +} + +TEST(LossTest, CrossEntropy_DimMismatch_Throws) { + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2, 2}, {0.5, 0.5, 0.5, 0.5}, true); + + auto loss = CrossEntropyLoss{}; + EXPECT_THROW(loss(y, ypred), std::invalid_argument); +} 
+ +// ─── BCE ───────────────────────────────────────────────────────────────────── + +TEST(LossTest, BCE_CorrectValue) { + auto y = TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, 1.0, 1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {4, 1}, {0.1, 0.9, 0.8, 0.2}, true); + + auto loss = BceLoss{}; + auto result = loss(y, ypred); + + // expected: -( log(0.9) + log(0.9) + log(0.8) + log(0.8) ) / 4 = 0.1643 + const ftype expected = -(std::log(0.9f) + std::log(0.9f) + + std::log(0.8f) + std::log(0.8f)) / 4.0f; + EXPECT_NEAR((*result)[0], expected, kTol); +} + +TEST(LossTest, BCE_PerfectPrediction) { + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {2, 1}, {0.999, 0.001}, true); + + auto loss = BceLoss{}; + auto result = loss(y, ypred); + + EXPECT_LT((*result)[0], 0.01f); +} + +TEST(LossTest, BCE_RandomPrediction) { + // ypred = 0.5 for all -> loss = log(2) ~ 0.6931 + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {2, 1}, {0.5, 0.5}, true); + + auto loss = BceLoss{}; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], std::log(2.0f), kTol); +} + +TEST(LossTest, BCE_DimMismatch_Throws) { + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {3, 1}, {0.5, 0.5, 0.5}, true); + + auto loss = BceLoss{}; + EXPECT_THROW(loss(y, ypred), std::invalid_argument); +} + +TEST(LossTest, BCE_NearZeroPred_NoInfOrNan) { + auto y = TensorFunctions::makeSharedTensor( + {1, 1}, {1.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {1, 1}, {0.0}, true); + + auto loss = BceLoss{}; + auto result = loss(y, ypred); + + // clipping prevents log(0) + EXPECT_FALSE(std::isinf((*result)[0])); +} \ No newline at end of file diff --git a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp new file mode 
100644 index 0000000..0c967a9 --- /dev/null +++ b/tests/backend/test_train_loop.cpp @@ -0,0 +1,139 @@ +/** + * @file test_training.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "module/networks/sequential.h" +#include "module/layers/ff_layer.h" + +#include "module/activation_functions/sigmoid.h" +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" + +#include "training/optimizers/sgd.h" +#include "training/optimizers/rmsprop.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" + +#include "training/trainers/base_train_loop.h" + +#include "data_modeling/tensor_functions.h" + +using namespace std; + +static shared_ptr makeBinaryNet() { + auto net = make_shared(); + + net->append(make_shared( + vector{2, 4}, true, true)); + + net->append(make_shared(1e-5)); + + net->append(make_shared( + vector{4, 1}, true, true)); + + net->append(make_shared()); + return net; +} + +static shared_ptr makeMulticlassNet() { + auto net = make_shared(); + + net->append(make_shared( + vector{2, 8}, true, true)); + + net->append(make_shared()); + + net->append(make_shared( + vector{8, 3}, true, true)); + + net->append(make_shared()); + return net; +} + +// ─── binary overfit ───────────────────────────────────────────────────────── + +TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { + // XOR-like: 4 samples, 2 features, binary labels + auto x = TensorFunctions::makeSharedTensor( + {4, 2}, {0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0}, false); + + auto y = TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, + 1.0, + 1.0, + 0.0}, false); + + auto net = makeBinaryNet(); + cout << "Network: " << *net << endl; + + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.01); + + 
auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/1, /*bsize=*/static_cast(4)); + + trainLoop.run(x, y, /*shuffle=*/false); + + // forward one more time to get final loss + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + cout << "pred: " << *pred << "\nloss: " << *finalLoss << endl; + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit binary dataset"; +} + + +// ─── multiclass overfit ────────────────────────────────────────────────────── + +// TEST(OverfitTest, CrossEntropy_RMSProp_OverfitsSmallDataset) { +// // 6 samples, 2 features, 3 classes +// auto x = TensorFunctions::makeSharedTensor( +// {6, 2}, {1.0, 0.0, +// 1.0, 0.1, +// 0.0, 1.0, +// 0.1, 1.0, +// 0.5, 0.5, +// 0.4, 0.6}, false); +// +// // one-hot encoded labels +// auto y = TensorFunctions::makeSharedTensor( +// {6, 3}, {1.0, 0.0, 0.0, +// 1.0, 0.0, 0.0, +// 0.0, 1.0, 0.0, +// 0.0, 1.0, 0.0, +// 0.0, 0.0, 1.0, +// 0.0, 0.0, 1.0}, false); +// +// auto net = makeMulticlassNet(); +// auto loss = make_shared(); +// auto optim = make_shared( +// net->parameters(), /*lr=*/0.001, /*decay=*/0.9); +// +// auto trainLoop = train::BaseTrainLoop( +// net, loss, optim, /*epochs=*/2000, /*bsize=*/6); +// +// trainLoop.run(x, y, /*shuffle=*/false); +// +// auto pred = (*net)(x); +// auto finalLoss = (*loss)(y, pred); +// +// EXPECT_LT((*finalLoss)[0], 0.05f) +// << "Network failed to overfit multiclass dataset"; +// } \ No newline at end of file diff --git a/tests/backend/test_training.cpp b/tests/backend/test_training.cpp deleted file mode 100644 index 6c39812..0000000 --- a/tests/backend/test_training.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/** - * @file test_training.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-03-14 - * - * @copyright Copyright (c) 2026 - * - */ - -#include - -#include "module/networks/sequential.h" -#include "module/layers/ff_layer.h" - -#include "module/activation_functions/relu.h" 
-#include "module/activation_functions/leaky_relu.h" -#include "module/activation_functions/softmax.h" - -#include "training/optimizers/sgd.h" -#include "training/optimizers/rmsprop.h" - -#include "training/loss_functions/bce_loss.h" -#include "training/loss_functions/crossentropy_loss.h" - -#include "training/trainers/base_train_loop.h" - -#include "data_modeling/tensor_functions.h" - -static std::shared_ptr makeBinaryNet() { - auto net = std::make_shared(); - - net->append(std::make_shared( - std::vector{2, 4}, true, true)); - - net->append(std::make_shared()); - - net->append(std::make_shared( - std::vector{4, 1}, true, true)); - // BCE expects raw logits or sigmoid output — adjust if needed - return net; -} - -static std::shared_ptr makeMulticlassNet() { - auto net = std::make_shared(); - - net->append(std::make_shared( - std::vector{2, 8}, true, true)); - - net->append(std::make_shared()); - - net->append(std::make_shared( - std::vector{8, 3}, true, true)); - - net->append(std::make_shared()); - return net; -} - -// ─── binary overfit ───────────────────────────────────────────────────────── - -TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { - // XOR-like: 4 samples, 2 features, binary labels - auto x = TensorFunctions::makeSharedTensor( - {4, 2}, {0.0, 0.0, - 0.0, 1.0, - 1.0, 0.0, - 1.0, 1.0}, false); - - auto y = TensorFunctions::makeSharedTensor( - {4, 1}, {0.0, - 1.0, - 1.0, - 0.0}, false); - - auto net = makeBinaryNet(); - auto loss = std::make_shared(); - auto optim = std::make_shared( - net->parameters(), /*lr=*/0.01); - - auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*lr=*/static_cast(0.01), /*epochs=*/2000, /*bsize=*/static_cast(4)); - - trainLoop.run(x, y, /*shuffle=*/false); - - // forward one more time to get final loss - auto pred = (*net)(x); - auto finalLoss = (*loss)(*pred, y); - - EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit binary dataset"; -} - - -// ─── multiclass overfit 
────────────────────────────────────────────────────── - -TEST(OverfitTest, CrossEntropy_RMSProp_OverfitsSmallDataset) { - // 6 samples, 2 features, 3 classes - auto x = TensorFunctions::makeSharedTensor( - {6, 2}, {1.0, 0.0, - 1.0, 0.1, - 0.0, 1.0, - 0.1, 1.0, - 0.5, 0.5, - 0.4, 0.6}, false); - - // one-hot encoded labels - auto y = TensorFunctions::makeSharedTensor( - {6, 3}, {1.0, 0.0, 0.0, - 1.0, 0.0, 0.0, - 0.0, 1.0, 0.0, - 0.0, 1.0, 0.0, - 0.0, 0.0, 1.0, - 0.0, 0.0, 1.0}, false); - - auto net = makeMulticlassNet(); - auto loss = std::make_shared(); - auto optim = std::make_shared( - net->parameters(), /*lr=*/0.001, /*decay=*/0.9); - - auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*lr=*/0.001, /*epochs=*/2000, /*bsize=*/6); - - trainLoop.run(x, y, /*shuffle=*/false); - - auto pred = (*net)(x); - auto finalLoss = (*loss)(*pred, y); - - EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit multiclass dataset"; -} \ No newline at end of file From a31a3b904f5a78a94fa8b9b1cc164c8f229e98da Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sat, 14 Mar 2026 18:24:21 +0100 Subject: [PATCH 13/24] Renamed getitem and setitem --- .../activation_functions/leaky_relu_node.cpp | 2 +- .../activation_functions/relu_node.cpp | 2 +- .../activation_functions/sigmoid_node.cpp | 2 +- .../loss_functions/bce_node.cpp | 2 +- .../loss_functions/crossentropy_node.cpp | 4 +- .../loss_functions/rsme_node.cpp | 2 +- .../tensor_ops/getter_node.cpp | 6 +- .../tensor_ops/graph_creation.cpp | 6 +- .../tensor_ops/scalar_op_nodes.cpp | 2 +- src/backend/data_modeling/dim_type.cpp | 4 +- src/backend/data_modeling/dim_type.h | 2 +- src/backend/data_modeling/tensor.cpp | 82 +++++------ src/backend/data_modeling/tensor.h | 26 ++-- .../data_modeling/tensor_functions.cpp | 6 +- .../activation_functions/leaky_relu.cpp | 2 +- .../module/activation_functions/relu.cpp | 2 +- .../module/activation_functions/sigmoid.cpp | 2 +- .../module/activation_functions/softmax.cpp | 6 +- 
.../loss_functions/crossentropy_loss.cpp | 4 +- src/backend/training/optimizers/rmsprop.cpp | 6 +- src/backend/training/optimizers/sgd.cpp | 2 +- .../training/trainers/base_train_loop.cpp | 2 +- src/python/py_core/py_core.cpp | 10 +- src/python/py_core/py_core_util.cpp | 24 ++-- src/python/py_core/py_core_util.h | 2 +- tests/backend/test_computational_graph.cpp | 54 ++++---- tests/backend/test_data_modeling.cpp | 128 +++++++++--------- 27 files changed, 196 insertions(+), 196 deletions(-) diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp index 0557f56..83de4ca 100644 --- a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp @@ -24,7 +24,7 @@ vector> LeakyReLuNode::backward(const Tensor& upstreamGrad) { const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem((*parent)[i] > zero ? upstreamGrad[i] : upstreamGrad[i] * eps, i); + res->set((*parent)[i] > zero ? upstreamGrad[i] : upstreamGrad[i] * eps, i); } return {res}; diff --git a/src/backend/computational_graph/activation_functions/relu_node.cpp b/src/backend/computational_graph/activation_functions/relu_node.cpp index 4db7938..3fcc958 100644 --- a/src/backend/computational_graph/activation_functions/relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/relu_node.cpp @@ -24,7 +24,7 @@ vector> ReLuNode::backward(const Tensor& upstreamGrad) { const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem((*parent)[i] > zero ? upstreamGrad[i] : zero, i); + res->set((*parent)[i] > zero ? 
upstreamGrad[i] : zero, i); } return {res}; diff --git a/src/backend/computational_graph/activation_functions/sigmoid_node.cpp b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp index a0b0d9c..79bd756 100644 --- a/src/backend/computational_graph/activation_functions/sigmoid_node.cpp +++ b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp @@ -28,7 +28,7 @@ vector> SigmoidNode::backward(const Tensor& upstreamGrad) { }; for(tensorSize_t i=0; isetItem(derivative((*sigmoid)[i] * upstreamGrad[i]), i); + res->set(derivative((*sigmoid)[i] * upstreamGrad[i]), i); } return {res}; diff --git a/src/backend/computational_graph/loss_functions/bce_node.cpp b/src/backend/computational_graph/loss_functions/bce_node.cpp index f1d4507..bf59006 100644 --- a/src/backend/computational_graph/loss_functions/bce_node.cpp +++ b/src/backend/computational_graph/loss_functions/bce_node.cpp @@ -30,7 +30,7 @@ vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { constexpr ftype eps = 1e-6; auto g = -yi/std::max(yiHat, eps) + (1-yi)/std::max(1-yiHat, eps); - res->setItem(g/bSize, i); + res->set(g/bSize, i); } return {res}; diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp index b28a5e0..8023d4a 100644 --- a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp @@ -27,10 +27,10 @@ vector< shared_ptr > CrossEntropyNode::backward(const Tensor& upstreamGr for(tensorDim_t j=0; igetItem(i, j), eps); + auto yijHat = std::max(yPred->get(i, j), eps); auto g = -yi/yijHat; - res->setItem(g/bSize, i, j); + res->set(g/bSize, i, j); } } diff --git a/src/backend/computational_graph/loss_functions/rsme_node.cpp b/src/backend/computational_graph/loss_functions/rsme_node.cpp index 6f73b14..2603a5b 100644 --- a/src/backend/computational_graph/loss_functions/rsme_node.cpp 
+++ b/src/backend/computational_graph/loss_functions/rsme_node.cpp @@ -31,7 +31,7 @@ vector< shared_ptr > RsmeNode::backward(const Tensor& upstreamGrad) { constexpr ftype eps = 1e-6; auto denom = rsme * bSize + eps; auto g = (yi-yiHat) / denom; - res->setItem(g, i); + res->set(g, i); } return {res}; diff --git a/src/backend/computational_graph/tensor_ops/getter_node.cpp b/src/backend/computational_graph/tensor_ops/getter_node.cpp index f937d91..ded3640 100644 --- a/src/backend/computational_graph/tensor_ops/getter_node.cpp +++ b/src/backend/computational_graph/tensor_ops/getter_node.cpp @@ -20,14 +20,14 @@ vector< shared_ptr > GetterNode::backward(const Tensor& upstreamGrad) { auto res = make_shared(parents[0]->getDims(), parents[0]->getDevice(), false); for(tensorSize_t i=0; igetSize(); i++){ - res->setItem(0, i); + res->set(0, i); } if(std::holds_alternative(idx)){ - res->setItem(upstreamGrad.getItem(0), std::get(idx)); + res->set(upstreamGrad.get(0), std::get(idx)); } else if(std::holds_alternative(idx)){ - res->setItem(upstreamGrad.getItem(0), std::get(idx)); + res->set(upstreamGrad.get(0), std::get(idx)); } else{ __throw_runtime_error("Idx variant in unexpected state"); diff --git a/src/backend/computational_graph/tensor_ops/graph_creation.cpp b/src/backend/computational_graph/tensor_ops/graph_creation.cpp index 25b2c08..0004c94 100644 --- a/src/backend/computational_graph/tensor_ops/graph_creation.cpp +++ b/src/backend/computational_graph/tensor_ops/graph_creation.cpp @@ -92,14 +92,14 @@ shared_ptr cgraph::div(const shared_ptr t, ftype scalar) { } /** - * @brief Special linear indexing, see getItem() overloads in tensor. + * @brief Special linear indexing, see get() overloads in tensor. * Used to keep the computational graph intact. * E.g. if we have something like * * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph. 
*/ shared_ptr cgraph::get(const shared_ptr& t, tensorSize_t idx) { - ftype val = t->getItem(idx); + ftype val = t->get(idx); auto res = make_shared(std::vector{1}, std::vector{val}, t->getDevice()); @@ -117,7 +117,7 @@ shared_ptr cgraph::get(const shared_ptr& t, tensorSize_t idx) { * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph. */ shared_ptr cgraph::get(const shared_ptr& t, const vector& idx) { - ftype val = t->getItem(std::move(idx)); + ftype val = t->get(std::move(idx)); auto res = make_shared(std::vector{1}, std::vector{val}, t->getDevice()); if(t->getRequiresGrad()){ diff --git a/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp index ae8d352..e0f52f0 100644 --- a/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp +++ b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp @@ -26,7 +26,7 @@ vector> cgraph::ScalarMulNode::backward(const Tensor& upstrea auto res = make_shared(upstreamGrad.createDeepCopy()); for(tensorSize_t i=0; igetSize(); i++){ - res->setItem(res->getItem(i) * factor, i); + res->set(res->get(i) * factor, i); } return {std::move(res)}; } \ No newline at end of file diff --git a/src/backend/data_modeling/dim_type.cpp b/src/backend/data_modeling/dim_type.cpp index b7aee8c..f7d1804 100644 --- a/src/backend/data_modeling/dim_type.cpp +++ b/src/backend/data_modeling/dim_type.cpp @@ -89,7 +89,7 @@ Dimension& Dimension::operator=(Dimension&& other) noexcept { * @param idx The dimension to collapse. 
*/ Dimension Dimension::collapseDimension(int idx) const { - auto mappedIdx = getItem(idx); + auto mappedIdx = get(idx); std::vector newDims; newDims.reserve(dims.size() - 1); @@ -103,7 +103,7 @@ ostream& operator<<(ostream& os, const Dimension& d) noexcept { if(d.size>0){ os << "\n("; for(int i=0; i= size) throw std::out_of_range("Out of range for tensor"); @@ -202,7 +202,7 @@ void Tensor::tensorValues_t::setItem(ftype v, tensorSize_t idx) { __throw_runtime_error("Should never reach here."); } -ftype Tensor::tensorValues_t::getItem(tensorSize_t idx) { +ftype Tensor::tensorValues_t::get(tensorSize_t idx) { if(idx >= size) throw std::out_of_range("Out of range for tensor"); @@ -294,7 +294,7 @@ Tensor Tensor::multiplyScalar(const Tensor& scalar, const Tensor& right) noexcep * network class object instance upon construction. */ Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { - if(left.dims.getItem(-1) != right.dims.getItem(-2)){ + if(left.dims.get(-1) != right.dims.get(-2)){ __throw_runtime_error("Tensor dimensions do not match"); } @@ -304,15 +304,15 @@ Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { } auto resDims = left.dims.nDims() > right.dims.nDims() ? 
left.dims.toVector() : right.dims.toVector(); - resDims[resDims.size()-2] = left.dims.getItem(-2); // rows - resDims[resDims.size()-1] = right.dims.getItem(-1); // cols + resDims[resDims.size()-2] = left.dims.get(-2); // rows + resDims[resDims.size()-1] = right.dims.get(-1); // cols Tensor res(resDims, values->getDevice(), false); // sizes of the 2D matrices respectively - const tensorSize_t leftSize = left.dims.getItem(-1) * left.dims.getItem(-2); - const tensorSize_t rightSize = right.dims.getItem(-1) * right.dims.getItem(-2); - const tensorSize_t resSize = left.dims.getItem(-2) * right.dims.getItem(-1); + const tensorSize_t leftSize = left.dims.get(-1) * left.dims.get(-2); + const tensorSize_t rightSize = right.dims.get(-1) * right.dims.get(-2); + const tensorSize_t resSize = left.dims.get(-2) * right.dims.get(-1); tensorSize_t leftOffset = 0; tensorSize_t rightOffset = 0; @@ -334,7 +334,7 @@ Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { multiplyNTimes(nMultiplications); } else if(left.dims.nDims() > right.dims.nDims()) { - const auto nBatches = left.dims.getItem(0); + const auto nBatches = left.dims.get(0); for(tensorDim_t batch = 0; batch < nBatches; batch++){ const auto nMultsPerBatch = res.values->getSize() / (nBatches * resSize); @@ -343,7 +343,7 @@ Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { } } else { - const auto nBatches = right.dims.getItem(0); + const auto nBatches = right.dims.get(0); for(tensorDim_t batch = 0; batch < nBatches; batch++){ const auto nMultsPerBatch = res.values->getSize() / (nBatches * resSize); @@ -361,10 +361,10 @@ Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { void Tensor::matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, const tensorSize_t resOffset, const tensorSize_t leftOffset, const tensorSize_t rightOffset) { - const auto nRowsLeft = static_cast(left.dims.getItem(-2)); - const auto nColsLeft = 
static_cast(left.dims.getItem(-1)); - const auto nRowsRight = static_cast(right.dims.getItem(-2)); - const auto nColsRight = static_cast(right.dims.getItem(-1)); + const auto nRowsLeft = static_cast(left.dims.get(-2)); + const auto nColsLeft = static_cast(left.dims.get(-1)); + const auto nRowsRight = static_cast(right.dims.get(-2)); + const auto nColsRight = static_cast(right.dims.get(-1)); for(tensorSize_t row=0; rowdims != other.dims && - !(other.dims.nDims() == 1 && other.dims.getItem(0) == dims.getItem(-1))){ + !(other.dims.nDims() == 1 && other.dims.get(0) == dims.get(-1))){ __throw_invalid_argument("Tensors need matching dimensions"); } else if(values->getDevice()!=other.values->getDevice()){ @@ -436,7 +436,7 @@ Tensor Tensor::operator+(const Tensor& other) const { } else { [[likely]] // broadcasted add - const auto stride = static_cast(other.dims.getItem(0)); + const auto stride = static_cast(other.dims.get(0)); for(tensorSize_t offset=0; offsetgetSize(); offset+=stride){ for(tensorSize_t i=0; i= 0; d--) { - dimSizes[d] = source.dims.getItem(d); + dimSizes[d] = source.dims.get(d); sourceStrides[d] = stride; stride *= dimSizes[d]; } @@ -705,8 +705,8 @@ void Tensor::transposeImpl2D(Tensor& target, const int dim1, const int dim2) con transposedValues->resize(source.values->getSize()); tensorSize_t resIdx = 0; - for(tensorSize_t smallDimCount=0; smallDimCount(10); if(t.dims.nDims()==2){ - for(tensorDim_t i=0; i& idx, const } const auto lastIdx = idx.size()-1; - tensorSize_t offsetFactor = dims.getItem(lastIdx); + tensorSize_t offsetFactor = dims.get(lastIdx); tensorSize_t res = idx[lastIdx]; for(int i=lastIdx-1; i>=0; i--){ res += idx[i] * offsetFactor; - offsetFactor *= dims.getItem(i); + offsetFactor *= dims.get(i); } return res; @@ -962,7 +962,7 @@ tensorSize_t Tensor::getDimOffset(const tensorDim_t dim, const Dimension& dims) tensorSize_t res = 1; // minimum possible dimsize for(size_t idx = dims.nDims()-1; idx>dim; idx--){ - res *= dims.getItem(idx); + res 
*= dims.get(idx); } assert(res!=0); @@ -979,7 +979,7 @@ tensorSize_t Tensor::getDimOffset(const int dim, const Dimension& dims) { /** * @brief No explanation needed. */ -ftype Tensor::getItem(const std::vector& idx) const { +ftype Tensor::get(const std::vector& idx) const { return (*values)[computeLinearIdx(idx, dims)]; } @@ -987,7 +987,7 @@ ftype Tensor::getItem(const std::vector& idx) const { * @brief Special getter, indexes the contained underlying array linearly. * Can lead to unexpected results in multidimensional tensors. */ -ftype Tensor::getItem(tensorSize_t idx) const { +ftype Tensor::get(tensorSize_t idx) const { return (*this)[idx]; } @@ -999,22 +999,22 @@ ftype Tensor::operator[](tensorSize_t idx) const { } -ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1) const { - return getItem({idx0, idx1}); +ftype Tensor::get(tensorDim_t idx0, tensorDim_t idx1) const { + return get({idx0, idx1}); } -ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const { - return getItem({idx0, idx1, idx2}); +ftype Tensor::get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const { + return get({idx0, idx1, idx2}); } -ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const { - return getItem({idx0, idx1, idx2, idx3}); +ftype Tensor::get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const { + return get({idx0, idx1, idx2, idx3}); } /** * @brief No explanation needed. */ -void Tensor::setItem(ftype item, const std::vector& idx) { +void Tensor::set(ftype item, const std::vector& idx) { (*values)[computeLinearIdx(idx, dims)] = item; } @@ -1022,18 +1022,18 @@ void Tensor::setItem(ftype item, const std::vector& idx) { * @brief Special setter, indexes the contained underlying array linearly. * Can lead to unexpected results in multidimensional tensors. 
*/ -void Tensor::setItem(ftype item, tensorDim_t idx) { +void Tensor::set(ftype item, tensorDim_t idx) { (*values)[idx] = item; } -void Tensor::setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1) { - setItem(item, {idx0, idx1}); +void Tensor::set(ftype item, tensorDim_t idx0, tensorDim_t idx1) { + set(item, {idx0, idx1}); } -void Tensor::setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) { - setItem(item, {idx0, idx1, idx2}); +void Tensor::set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) { + set(item, {idx0, idx1, idx2}); } -void Tensor::setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) { - setItem(item, {idx0, idx1, idx2, idx3}); +void Tensor::set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) { + set(item, {idx0, idx1, idx2, idx3}); } \ No newline at end of file diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 61adc65..5d4c7cd 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -70,8 +70,8 @@ class Tensor final : public std::enable_shared_from_this { ftype& operator[](const tensorSize_t idx); ftype operator[](const tensorSize_t idx) const; - void setItem(ftype v, tensorSize_t idx); - ftype getItem(tensorSize_t idx); + void set(ftype v, tensorSize_t idx); + ftype get(tensorSize_t idx); tensorSize_t getSize() const noexcept; @@ -152,7 +152,7 @@ class Tensor final : public std::enable_shared_from_this { explicit Tensor(const std::vector& dims, const std::vector& initValues, Device d, bool requiresGrad=false) : Tensor{dims, d, requiresGrad} { for(tensorSize_t i=0; isetItem(initValues[i], i); + values->set(initValues[i], i); } } @@ -219,22 +219,22 @@ class Tensor final : public std::enable_shared_from_this { friend std::ostream& operator<<(std::ostream& os, const Tensor& t) noexcept; // for convenience we provide some simple getters - ftype getItem(tensorSize_t 
idx) const; - ftype getItem(tensorDim_t idx0, tensorDim_t idx1) const; - ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const; - ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const; + ftype get(tensorSize_t idx) const; + ftype get(tensorDim_t idx0, tensorDim_t idx1) const; + ftype get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const; + ftype get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const; // non-const version of operator[] does not exist because of CUDA ftype operator[](tensorSize_t idx) const; - ftype getItem(const std::vector& idx) const; + ftype get(const std::vector& idx) const; // for convenience we provide some simple setters - void setItem(ftype item, tensorDim_t idx); - void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1); - void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2); - void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3); - void setItem(ftype item, const std::vector& idx); + void set(ftype item, tensorDim_t idx); + void set(ftype item, tensorDim_t idx0, tensorDim_t idx1); + void set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2); + void set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3); + void set(ftype item, const std::vector& idx); void setDevice(const Device d) noexcept; Device getDevice() const noexcept; diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp index a878970..c7986a0 100644 --- a/src/backend/data_modeling/tensor_functions.cpp +++ b/src/backend/data_modeling/tensor_functions.cpp @@ -97,13 +97,13 @@ Tensor TensorFunctions::SumOverDims(const Tensor& t, tensorDim_t dim) { tensorSize_t stride = 1; for(tensorDim_t i=dim+1; i(exp(t[i])), i); + tmp.set(static_cast(exp(t[i])), i); } - const tensorSize_t stride = t.getDims().getItem(-1); + const 
tensorSize_t stride = t.getDims()[-1]; auto compute = [&t, &res, &tmp, stride](tensorSize_t start){ ftype sum = 0; for(tensorSize_t i=0; i CrossEntropyLoss::operator()(const shared_ptr y, cons auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; - for(tensorDim_t i=0; igetDims().getItem(-1); i++){ + for(tensorDim_t i=0; igetDims()[-1]; i++){ constexpr ftype eps = 1e-6; - res += y->getItem(b, i) * log(std::max(ypred->getItem(b, i), eps)); + res += y->get(b, i) * log(std::max(ypred->get(b, i), eps)); } return res; }; diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp index 10699ba..def896d 100644 --- a/src/backend/training/optimizers/rmsprop.cpp +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -26,7 +26,7 @@ void RmsPropOptimizer::step() { for(tensorSize_t i=0; igetSize(); i++){ auto g = (*gPtr)[i]; auto update = decay * (*vPtr)[i] + (1-decay)*g*g; - vPtr->setItem(update, i); + vPtr->set(update, i); } } else { // init loop @@ -34,14 +34,14 @@ void RmsPropOptimizer::step() { vPtr = movingAvg[tPtr].get(); for(tensorSize_t i=0; igetSize(); i++) { auto g = (*gPtr)[i]; - vPtr->setItem((1-decay)*g*g, i); + vPtr->set((1-decay)*g*g, i); } } // update gradients for(tensorSize_t i=0; igetSize(); i++) { auto update = (*tPtr)[i] - lr * (*gPtr)[i] / ((*vPtr)[i] + eps); - tPtr->setItem(update, i); + tPtr->set(update, i); } } } \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.cpp b/src/backend/training/optimizers/sgd.cpp index 03cdac0..83f122d 100644 --- a/src/backend/training/optimizers/sgd.cpp +++ b/src/backend/training/optimizers/sgd.cpp @@ -19,7 +19,7 @@ void SgdOptimizer::step() { auto grads = t->getGrads(); for(auto idx=0; idxgetSize(); idx++){ auto updatedWeight = (*t)[idx] - lr*(*grads)[idx]; - t->setItem(updatedWeight, idx); + t->set(updatedWeight, idx); } } } \ No newline at end of file diff --git a/src/backend/training/trainers/base_train_loop.cpp 
b/src/backend/training/trainers/base_train_loop.cpp index 2a4e9d3..b5961e5 100644 --- a/src/backend/training/trainers/base_train_loop.cpp +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -23,7 +23,7 @@ using namespace std; using namespace train; void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool shuffle, const bool verbose) { - const auto nSamples = x->getDims().getItem(0); + const auto nSamples = x->getDims()[0]; for(size_t e=0; e indices(nSamples); diff --git a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp index e74e114..476511f 100644 --- a/src/python/py_core/py_core.cpp +++ b/src/python/py_core/py_core.cpp @@ -113,7 +113,7 @@ BOOST_PYTHON_MODULE(_core) // classes class_("Dimension", no_init) - .add_property("list", &Dimension::getItem) + .add_property("list", &Dimension::get) .def("__str__", &Py_Util::toString) .def("__eq__", Py_DataModeling::dimEquals1) .def("__eq__", Py_DataModeling::dimEquals2) @@ -184,10 +184,10 @@ BOOST_PYTHON_MODULE(_core) .def("__truediv__", WRAP_FREE_FUNC_4(Py_DataModeling::scalardiv, ftype)) // member functions - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::getItem)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::get)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::get)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::get)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::get)) .def("getitem", Py_DataModeling::getItemVector) // the vector arg .def("sum", WRAP_FREE_FUNC_7(&(cgraph::sumTensor))) diff --git a/src/python/py_core/py_core_util.cpp b/src/python/py_core/py_core_util.cpp index 9864a2b..581c99f 100644 --- a/src/python/py_core/py_core_util.cpp +++ b/src/python/py_core/py_core_util.cpp @@ -22,7 +22,7 @@ ftype Py_DataModeling::tensorGetItem(const Tensor& 
self, boost::python::object i // Single integer index (1D) if(int_extractor.check()) { auto i0 = static_cast(int_extractor()); - return self.getItem(i0); + return self.get(i0); } // Tuple index (2D, 3D, or 4D, or list) @@ -32,25 +32,25 @@ ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object i // Dispatch to convenience functions for 1-4 args if (len == 1) { auto i0 = static_cast(extract(index[0])); - return self.getItem(i0); + return self.get(i0); } else if (len == 2) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); - return self.getItem(i0, i1); + return self.get(i0, i1); } else if (len == 3) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); - return self.getItem(i0, i1, i2); + return self.get(i0, i1, i2); } else if (len == 4) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); auto i3 = static_cast(extract(index[3])); - return self.getItem(i0, i1, i2, i3); + return self.get(i0, i1, i2, i3); } else { // Arbitrary length - use vector version @@ -58,7 +58,7 @@ ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object i for (int i = 0; i < len; ++i) { indices.push_back(static_cast(extract(index[i]))); } - return self.getItem(std::move(indices)); + return self.get(std::move(indices)); } } @@ -71,7 +71,7 @@ void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, f extract int_extractor(index); if(int_extractor.check()) { auto i0 = static_cast(int_extractor()); - self.setItem(value, i0); + self.set(value, i0); return; } @@ -83,25 +83,25 @@ void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, f // Dispatch to convenience functions for 1-4 args if (len == 1) { auto i0 = static_cast(extract(index[0])); - self.setItem(value, i0); + self.set(value, i0); } else if (len == 2) { auto i0 = 
static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); - self.setItem(value, i0, i1); + self.set(value, i0, i1); } else if (len == 3) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); - self.setItem(value, i0, i1, i2); + self.set(value, i0, i1, i2); } else if (len == 4) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); auto i3 = static_cast(extract(index[3])); - self.setItem(value, i0, i1, i2, i3); + self.set(value, i0, i1, i2, i3); } else { // Arbitrary length - use vector version @@ -109,7 +109,7 @@ void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, f for (int i = 0; i < len; ++i) { indices.push_back(static_cast(extract(index[i]))); } - self.setItem(value, std::move(indices)); + self.set(value, std::move(indices)); } return; } diff --git a/src/python/py_core/py_core_util.h b/src/python/py_core/py_core_util.h index 4ddf62c..1f97ce4 100644 --- a/src/python/py_core/py_core_util.h +++ b/src/python/py_core/py_core_util.h @@ -89,7 +89,7 @@ namespace Py_DataModeling { inline Tensor (Tensor::*transpose1)(int, int) const = &Tensor::transpose; inline Tensor (Tensor::*transpose2)(int, int, bool) const = &Tensor::transpose; - inline ftype (Tensor::*getItemVector)(const std::vector&) const = &Tensor::getItem; + inline ftype (Tensor::*getItemVector)(const std::vector&) const = &Tensor::get; /********************************************************************************************************* ***************************************** Graph creation ************************************************* diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index b0406b7..2379cad 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -39,8 +39,8 @@ TEST(AutogradTest, SimpleAddition) 
{ loss->backward(); - EXPECT_NEAR(t1->getGrads()->getItem(0), 10.0, 1e-5); - EXPECT_NEAR(t2->getGrads()->getItem(0), 10.0, 1e-5); + EXPECT_NEAR(t1->getGrads()->get(0), 10.0, 1e-5); + EXPECT_NEAR(t2->getGrads()->get(0), 10.0, 1e-5); } TEST(AutogradTest, ScalarMultiplication) { @@ -52,8 +52,8 @@ TEST(AutogradTest, ScalarMultiplication) { loss->backward(); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem(0), 36.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem(0), 24.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get(0), 36.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get(0), 24.0); } TEST(AutogradTest, MatMul) { @@ -73,20 +73,20 @@ TEST(AutogradTest, MatMul) { EXPECT_TRUE(t2->hasGrads()); // dL/dt1 = dloss/dt3 @ t2^t = Ones({2, 2}) @ t2^t - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 0}), 3.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 1}), 7.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 2}), 11.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 0}), 3.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 1}), 7.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 2}), 11.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({0, 0}), 3.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({0, 1}), 7.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({0, 2}), 11.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({1, 0}), 3.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({1, 1}), 7.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({1, 2}), 11.0); // dL/dt2 = t1^t @ dloss/dt3 = t1^t @ Ones({2, 2}) - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({0, 0}), 5.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({0, 1}), 5.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({1, 0}), 7.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({1, 1}), 7.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({2, 0}), 9.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({2, 1}), 9.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({0, 0}), 5.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({0, 1}), 5.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({1, 0}), 7.0); + 
ASSERT_DOUBLE_EQ(t2->getGrads()->get({1, 1}), 7.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({2, 0}), 9.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({2, 1}), 9.0); } TEST(AutogradTest, ChainRule) { @@ -100,7 +100,7 @@ TEST(AutogradTest, ChainRule) { // dloss/dx = 2(x^2 + x) * (2x + 1) // At x=2: 2(4 + 2) * (4 + 1) = 2 * 6 * 5 = 60 - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 60.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 60.0); } TEST(AutogradTest, MultiVariateChainRule) { @@ -115,11 +115,11 @@ TEST(AutogradTest, MultiVariateChainRule) { loss->backward(); // dloss/dx = scalar = 3 - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 3.0); - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), 3.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 3.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), 3.0); - ASSERT_DOUBLE_EQ(y->getGrads()->getItem(0), 1.0); - ASSERT_DOUBLE_EQ(y->getGrads()->getItem(1), 1.0); + ASSERT_DOUBLE_EQ(y->getGrads()->get(0), 1.0); + ASSERT_DOUBLE_EQ(y->getGrads()->get(1), 1.0); } TEST(AutogradTest, ReLU) { @@ -132,9 +132,9 @@ TEST(AutogradTest, ReLU) { loss->backward(); // Gradient: [0, 0, 1] (only where input > 0) - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 0.0); - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), 0.0); - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(2), 1.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); } TEST(AutogradTest, LeakyReLU) { @@ -149,7 +149,7 @@ TEST(AutogradTest, LeakyReLU) { loss->backward(); // Gradient: [0, 0, 1] (only where input > 0) - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), eps); - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), eps); // by convention - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(2), 1.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), eps); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), eps); // by convention + ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); } \ No newline at end of file diff --git a/tests/backend/test_data_modeling.cpp 
b/tests/backend/test_data_modeling.cpp index 0d41664..1fa4c06 100644 --- a/tests/backend/test_data_modeling.cpp +++ b/tests/backend/test_data_modeling.cpp @@ -23,10 +23,10 @@ TEST(TensorOpsTest, TestCtor) { ASSERT_EQ(t.getDevice(), Device::CPU); ASSERT_TRUE(!t.getRequiresGrad()); - ASSERT_DOUBLE_EQ(t.getItem(0, 0), 2.0); - ASSERT_DOUBLE_EQ(t.getItem(0, 1), 3.0); - ASSERT_DOUBLE_EQ(t.getItem(1, 0), 4.0); - ASSERT_DOUBLE_EQ(t.getItem(1, 1), 5.0); + ASSERT_DOUBLE_EQ(t.get(0, 0), 2.0); + ASSERT_DOUBLE_EQ(t.get(0, 1), 3.0); + ASSERT_DOUBLE_EQ(t.get(1, 0), 4.0); + ASSERT_DOUBLE_EQ(t.get(1, 1), 5.0); } TEST(TensorOpsTest, ScalarAddWorks) { @@ -35,9 +35,9 @@ TEST(TensorOpsTest, ScalarAddWorks) { auto res = t1 + 1.5; constexpr ftype sum = 2.5; - for(auto i=0; i Date: Sun, 15 Mar 2026 10:56:00 +0100 Subject: [PATCH 14/24] Fixed bug, simplified FfLayer ctor and Matmul implementation --- src/backend/data_modeling/tensor.cpp | 82 ++++++++------------------ src/backend/data_modeling/tensor.h | 8 ++- src/backend/module/layers/ff_layer.cpp | 19 +++--- src/backend/module/layers/ff_layer.h | 4 +- src/python/py_nn/py_nn.cpp | 12 ++-- tests/backend/test_module.cpp | 2 +- tests/backend/test_train_loop.cpp | 23 ++------ 7 files changed, 52 insertions(+), 98 deletions(-) diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 2ca3f7f..47478ff 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -85,11 +85,11 @@ void Tensor::tensorValues_t::copyValues(tensorValues_t& target, tensorSize_t low switch(device){ case Device::CPU: for(tensorSize_t i=0; i(right.dims.nDims()) - static_cast(left.dims.nDims())) > 1){ - auto str = "Tensor dimension assumptions violated. See file 'assumption_matrices.md'."; - __throw_invalid_argument(str); - } - + // broadcasting auto resDims = left.dims.nDims() > right.dims.nDims() ? 
left.dims.toVector() : right.dims.toVector(); resDims[resDims.size()-2] = left.dims.get(-2); // rows resDims[resDims.size()-1] = right.dims.get(-1); // cols - Tensor res(resDims, values->getDevice(), false); + Tensor res(resDims, left.values->getDevice(), false); // sizes of the 2D matrices respectively const tensorSize_t leftSize = left.dims.get(-1) * left.dims.get(-2); @@ -318,73 +314,43 @@ Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { tensorSize_t rightOffset = 0; tensorSize_t resOffset = 0; - // lambda expected to get inlined by compiler - auto multiplyNTimes = [&](const tensorDim_t n){ - for(tensorDim_t i=0; igetSize() / resSize; // total size / size of 2D matrix - multiplyNTimes(nMultiplications); - } - else if(left.dims.nDims() > right.dims.nDims()) { - const auto nBatches = left.dims.get(0); - - for(tensorDim_t batch = 0; batch < nBatches; batch++){ - const auto nMultsPerBatch = res.values->getSize() / (nBatches * resSize); - multiplyNTimes(nMultsPerBatch); - rightOffset = 0; - } - } - else { - const auto nBatches = right.dims.get(0); - - for(tensorDim_t batch = 0; batch < nBatches; batch++){ - const auto nMultsPerBatch = res.values->getSize() / (nBatches * resSize); - multiplyNTimes(nMultsPerBatch); - leftOffset = 0; - } + leftOffset += leftSize; + rightOffset += rightSize; + resOffset += resSize; } return res; } /** - * @brief Name says it all. Inplace operation on res + * @brief Name says it all. Inplace operation on res. 
*/ void Tensor::matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, const tensorSize_t resOffset, const tensorSize_t leftOffset, const tensorSize_t rightOffset) { - + const auto nRowsLeft = static_cast(left.dims.get(-2)); const auto nColsLeft = static_cast(left.dims.get(-1)); const auto nRowsRight = static_cast(right.dims.get(-2)); const auto nColsRight = static_cast(right.dims.get(-1)); - for(tensorSize_t row=0; row { std::shared_ptr cgNode = nullptr; static Tensor multiplyScalar(const Tensor& scalar, const Tensor& other) noexcept; - static void matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, const tensorSize_t resOffset, - const tensorSize_t leftOffset, const tensorSize_t rightOffset); - Tensor matMulImpl(const Tensor& left, const Tensor& right) const; + static Tensor matMulImpl(const Tensor& left, const Tensor& right); + static void matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, + const tensorSize_t resOffset, const tensorSize_t leftOffset, + const tensorSize_t rightOffset); + void transposeImpl2D(Tensor& target, const int dim1, const int dim2) const noexcept; void transposeImpl(Tensor& target, const int dim1, const int dim2) const noexcept; diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp index 412f6f7..ce9946e 100644 --- a/src/backend/module/layers/ff_layer.cpp +++ b/src/backend/module/layers/ff_layer.cpp @@ -20,8 +20,8 @@ using namespace std; using namespace module; -FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGrad) - : FfLayer(dims, Tensor::getDefaultDevice(), useBias, requiresGrad) {} +FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias, bool requiresGrad) + : FfLayer(inSize, outSize, Tensor::getDefaultDevice(), useBias, requiresGrad) {} /** * @brief Construct a new Ff Layer:: Ff Layer object @@ -31,17 +31,14 @@ FfLayer::FfLayer(const vector& dims, bool useBias, bool requiresGra * @param useBias Use a bias if true. 
Bias will receiver shape (n_rows) * @param requiresGrad If true train this layer. */ -FfLayer::FfLayer(const vector& dims, Device d, bool useBias, bool requiresGrad) - : useBias{useBias}, requiresGrad{requiresGrad} { - if(dims.size()!=2){ - __throw_runtime_error("FfLayer needs only two dims, that's it."); - } - - weights = make_shared(Dimension({dims[0], dims[1]}), d, requiresGrad); +FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, bool useBias, bool requiresGrad) + : useBias{useBias}, requiresGrad{requiresGrad} +{ + weights = make_shared(Dimension({inSize, outSize}), d, requiresGrad); TensorFunctions::ToGaussian(*weights); if(useBias){ - bias = make_shared(vector{dims[1]}, d, requiresGrad); + bias = make_shared(vector{outSize}, d, requiresGrad); TensorFunctions::ToGaussian(*bias); } } @@ -67,7 +64,7 @@ Tensor FfLayer::operator()(const Tensor& input) const { std::shared_ptr FfLayer::operator()(const std::shared_ptr& input) const { auto res = cgraph::matmul(input, weights); if(useBias){ - res = cgraph::add(res, bias); // TODO: add needs to happen on each of those, how to broadcast? 
+ res = cgraph::add(res, bias); } return res; diff --git a/src/backend/module/layers/ff_layer.h b/src/backend/module/layers/ff_layer.h index 478b457..59ba0f5 100644 --- a/src/backend/module/layers/ff_layer.h +++ b/src/backend/module/layers/ff_layer.h @@ -25,8 +25,8 @@ namespace module { std::shared_ptr bias = nullptr; public: - FfLayer(const std::vector& dims, bool useBias=true, bool requiresGrad=false); - FfLayer(const std::vector& dims, Device d, bool useBias=true, bool requiresGrad=false); + FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias=true, bool requiresGrad=false); + FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, bool useBias=true, bool requiresGrad=false); Tensor operator()(const Tensor& input) const override; std::shared_ptr operator()(const std::shared_ptr& input) const override; diff --git a/src/python/py_nn/py_nn.cpp b/src/python/py_nn/py_nn.cpp index a8470ca..75f6d11 100644 --- a/src/python/py_nn/py_nn.cpp +++ b/src/python/py_nn/py_nn.cpp @@ -42,12 +42,12 @@ BOOST_PYTHON_MODULE(_nn) class_, boost::noncopyable>("FfLayer", no_init) // init - .def(init&>()) - .def(init&, bool>()) - .def(init&, bool, bool>()) - .def(init&, Device>()) - .def(init&, Device, bool>()) - .def(init&, Device, bool, bool>()) + .def(init()) + .def(init()) + .def(init()) + .def(init()) + .def(init()) + .def(init()) // methods .add_property("dims", make_function(&module::FfLayer::getDims, return_internal_reference<>())) .add_property("weights", &module::FfLayer::getWeights) diff --git a/tests/backend/test_module.cpp b/tests/backend/test_module.cpp index dc78ba0..459d577 100644 --- a/tests/backend/test_module.cpp +++ b/tests/backend/test_module.cpp @@ -71,7 +71,7 @@ TEST(TensorOpsTest, TestLeakyRelu2) { TEST(LayerTest, TestFfLayer) { auto t1 = TensorFunctions::Ones({3, 2}, false); - auto layer = FfLayer({2, 1}, true, false); + auto layer = FfLayer(2, 1, true, false); auto res = layer(t1); diff --git a/tests/backend/test_train_loop.cpp 
b/tests/backend/test_train_loop.cpp index 0c967a9..21e255f 100644 --- a/tests/backend/test_train_loop.cpp +++ b/tests/backend/test_train_loop.cpp @@ -34,13 +34,11 @@ using namespace std; static shared_ptr makeBinaryNet() { auto net = make_shared(); - net->append(make_shared( - vector{2, 4}, true, true)); + net->append(make_shared(2, 4, true, true)); net->append(make_shared(1e-5)); - net->append(make_shared( - vector{4, 1}, true, true)); + net->append(make_shared(4, 1, true, true)); net->append(make_shared()); return net; @@ -49,20 +47,16 @@ static shared_ptr makeBinaryNet() { static shared_ptr makeMulticlassNet() { auto net = make_shared(); - net->append(make_shared( - vector{2, 8}, true, true)); + net->append(make_shared(2, 8, true, true)); net->append(make_shared()); - net->append(make_shared( - vector{8, 3}, true, true)); + net->append(make_shared(8, 3, true, true)); net->append(make_shared()); return net; } -// ─── binary overfit ───────────────────────────────────────────────────────── - TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { // XOR-like: 4 samples, 2 features, binary labels auto x = TensorFunctions::makeSharedTensor( @@ -77,15 +71,13 @@ TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { 1.0, 0.0}, false); - auto net = makeBinaryNet(); - cout << "Network: " << *net << endl; - + auto net = makeBinaryNet(); auto loss = make_shared(); auto optim = make_shared( net->parameters(), /*lr=*/0.01); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/1, /*bsize=*/static_cast(4)); + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); trainLoop.run(x, y, /*shuffle=*/false); @@ -99,9 +91,6 @@ TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { << "Network failed to overfit binary dataset"; } - -// ─── multiclass overfit ────────────────────────────────────────────────────── - // TEST(OverfitTest, CrossEntropy_RMSProp_OverfitsSmallDataset) { // // 6 samples, 2 features, 3 classes // auto x = TensorFunctions::makeSharedTensor( From 
9075c67f0fe02d667bc8a832c009ba92666df01e Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sun, 15 Mar 2026 13:51:46 +0100 Subject: [PATCH 15/24] Added unit tests, fixed bugs and numerical instabilities --- .../activation_functions/softmax_node.cpp | 45 +++++ .../activation_functions/softmax_node.h | 31 +++ .../loss_functions/bce_node.cpp | 5 +- .../loss_functions/bce_node.h | 8 +- .../loss_functions/crossentropy_node.cpp | 12 +- .../loss_functions/crossentropy_node.h | 10 +- .../{rsme_node.cpp => rmse_node.cpp} | 15 +- .../{rsme_node.h => rmse_node.h} | 18 +- .../tensor_ops/graph_creation.cpp | 2 +- src/backend/data_modeling/tensor.h | 4 + .../module/activation_functions/sigmoid.cpp | 7 +- .../module/activation_functions/softmax.cpp | 36 +++- .../training/loss_functions/bce_loss.cpp | 2 +- .../loss_functions/crossentropy_loss.cpp | 2 +- .../{rsme_loss.cpp => rmse_loss.cpp} | 10 +- .../{rsme_loss.h => rmse_loss.h} | 4 +- src/backend/training/optimizers/rmsprop.cpp | 2 +- tests/backend/test_computational_graph.cpp | 65 +++--- tests/backend/test_data_modeling.cpp | 18 +- tests/backend/test_losses.cpp | 135 +++++++++++-- tests/backend/test_module.cpp | 188 ++++++++++++++++-- 21 files changed, 490 insertions(+), 129 deletions(-) create mode 100644 src/backend/computational_graph/activation_functions/softmax_node.cpp create mode 100644 src/backend/computational_graph/activation_functions/softmax_node.h rename src/backend/computational_graph/loss_functions/{rsme_node.cpp => rmse_node.cpp} (63%) rename src/backend/computational_graph/loss_functions/{rsme_node.h => rmse_node.h} (56%) rename src/backend/training/loss_functions/{rsme_loss.cpp => rmse_loss.cpp} (83%) rename src/backend/training/loss_functions/{rsme_loss.h => rmse_loss.h} (86%) diff --git a/src/backend/computational_graph/activation_functions/softmax_node.cpp b/src/backend/computational_graph/activation_functions/softmax_node.cpp new file mode 100644 index 0000000..a616064 --- /dev/null +++ 
b/src/backend/computational_graph/activation_functions/softmax_node.cpp @@ -0,0 +1,45 @@ +/** + * @file softmax_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-15 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "softmax_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > SoftmaxNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype eps = 1e-9; + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + ftype bSize = yPred->getDims()[0]; + for(tensorDim_t i=0; igetDims()[0]; i++){ + for(tensorDim_t j=0; jgetDims()[1]; j++){ + ftype g = 0; + + if(i!=j){ + g = -softmax->get(i) * softmax->get(j); + } + else{ + g = softmax->get(i) * (1-softmax->get(j)); + } + + res->set(upstreamGrad[i] * g / bSize, i, j); + } + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/softmax_node.h b/src/backend/computational_graph/activation_functions/softmax_node.h new file mode 100644 index 0000000..6c3c8d1 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/softmax_node.h @@ -0,0 +1,31 @@ +/** + * @file softmax_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-15 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class SoftmaxNode final : public GraphNode { + private: + const std::shared_ptr softmax; + + public: + explicit SoftmaxNode(std::shared_ptr t, std::shared_ptr softmax) + : GraphNode({std::move(t)}), softmax{std::move(softmax)} + { + assert(softmax->getSize()==parents[0]->getDims()[0]); + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No 
newline at end of file diff --git a/src/backend/computational_graph/loss_functions/bce_node.cpp b/src/backend/computational_graph/loss_functions/bce_node.cpp index bf59006..8dfc1d7 100644 --- a/src/backend/computational_graph/loss_functions/bce_node.cpp +++ b/src/backend/computational_graph/loss_functions/bce_node.cpp @@ -20,15 +20,16 @@ using namespace cgraph; vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype eps = 1e-9; const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); - for(tensorSize_t i=0; i(bSize); i++){ + ftype bSize = yPred->getDims()[0]; + for(tensorSize_t i=0; igetDims()[0]; i++){ auto yi = (*yTrue)[i]; auto yiHat = (*yPred)[i]; - constexpr ftype eps = 1e-6; auto g = -yi/std::max(yiHat, eps) + (1-yi)/std::max(1-yiHat, eps); res->set(g/bSize, i); } diff --git a/src/backend/computational_graph/loss_functions/bce_node.h b/src/backend/computational_graph/loss_functions/bce_node.h index b3ec915..25b5f62 100644 --- a/src/backend/computational_graph/loss_functions/bce_node.h +++ b/src/backend/computational_graph/loss_functions/bce_node.h @@ -18,15 +18,13 @@ namespace cgraph { class BceNode final : public GraphNode { private: const std::shared_ptr yTrue; - const ftype bSize; public: explicit BceNode(std::shared_ptr y, std::shared_ptr yPred) - : GraphNode({yPred}), yTrue{std::move(y)}, bSize{static_cast(yPred->getDims()[0])} + : GraphNode({std::move(yPred)}), yTrue{std::move(y)} { - assert(yPred->getDims()==yTrue->getDims()); - - if(!yPred->getRequiresGrad()){ + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ std::__throw_invalid_argument("yPred must be a graph node"); } } diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp index 8023d4a..e9ea4bc 100644 --- 
a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp @@ -18,18 +18,18 @@ using namespace cgraph; vector< shared_ptr > CrossEntropyNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype eps = 1e-9; const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); - for(tensorDim_t i=0; i(bSize); i++){ - auto yi = (*yTrue)[i]; - - for(tensorDim_t j=0; igetDims()[0]; + for(tensorDim_t i=0; igetDims()[0]; i++){ + for(tensorDim_t j=0; jgetDims()[1]; j++){ + auto yij = yTrue->get(i, j); auto yijHat = std::max(yPred->get(i, j), eps); - auto g = -yi/yijHat; + auto g = -yij/yijHat; res->set(g/bSize, i, j); } } diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.h b/src/backend/computational_graph/loss_functions/crossentropy_node.h index 28fc581..2644a8d 100644 --- a/src/backend/computational_graph/loss_functions/crossentropy_node.h +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.h @@ -19,9 +19,6 @@ namespace cgraph { private: const std::shared_ptr yTrue; - const ftype bSize; - const tensorDim_t nClasses; - public: /** @@ -31,11 +28,10 @@ namespace cgraph { * @param yPred shape (batchsize, nclasses) */ explicit CrossEntropyNode(std::shared_ptr y, std::shared_ptr yPred) - : GraphNode({yPred}), yTrue{std::move(y)}, bSize{static_cast(yPred->getDims()[0])}, - nClasses{yPred->getDims()[1]} + : GraphNode({std::move(yPred)}), yTrue{std::move(y)} { - assert(yPred->getDims()[0]==yTrue->getDims()[0]); - if(!yPred->getRequiresGrad()){ + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ std::__throw_invalid_argument("yPred must be a graph node"); } } diff --git a/src/backend/computational_graph/loss_functions/rsme_node.cpp b/src/backend/computational_graph/loss_functions/rmse_node.cpp similarity index 63% rename from 
src/backend/computational_graph/loss_functions/rsme_node.cpp rename to src/backend/computational_graph/loss_functions/rmse_node.cpp index 2603a5b..e3eb11e 100644 --- a/src/backend/computational_graph/loss_functions/rsme_node.cpp +++ b/src/backend/computational_graph/loss_functions/rmse_node.cpp @@ -1,5 +1,5 @@ /** - * @file rsme_node.cpp + * @file rmse_node.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -9,7 +9,7 @@ * */ -#include "rsme_node.h" +#include "rmse_node.h" #include "data_modeling/tensor_functions.h" @@ -18,19 +18,20 @@ using namespace std; using namespace cgraph; -vector< shared_ptr > RsmeNode::backward(const Tensor& upstreamGrad) { +vector< shared_ptr > RmseNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype eps = 1e-9; const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); - for(tensorSize_t i=0; i(bSize); i++){ + ftype bSize = yPred->getDims()[0]; + for(tensorSize_t i=0; igetDims()[0]; i++){ auto yi = (*yTrue)[i]; auto yiHat = (*yPred)[i]; - constexpr ftype eps = 1e-6; - auto denom = rsme * bSize + eps; - auto g = (yi-yiHat) / denom; + auto denom = rmse * bSize + eps; + auto g = (yiHat-yi) / denom; res->set(g, i); } diff --git a/src/backend/computational_graph/loss_functions/rsme_node.h b/src/backend/computational_graph/loss_functions/rmse_node.h similarity index 56% rename from src/backend/computational_graph/loss_functions/rsme_node.h rename to src/backend/computational_graph/loss_functions/rmse_node.h index 74526fc..62e5cc5 100644 --- a/src/backend/computational_graph/loss_functions/rsme_node.h +++ b/src/backend/computational_graph/loss_functions/rmse_node.h @@ -1,5 +1,5 @@ /** - * @file rsme_node.h + * @file rmse_node.h * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -15,21 +15,17 @@ #include "utility/global_params.h" namespace cgraph { - class RsmeNode final : public GraphNode { + class 
RmseNode final : public GraphNode { private: const std::shared_ptr yTrue; - - const ftype bSize; - ftype rsme; + ftype rmse; public: - explicit RsmeNode(std::shared_ptr y, std::shared_ptr yPred, ftype rsme) - : GraphNode({yPred}), yTrue{std::move(y)}, bSize{static_cast(yPred->getDims()[0])}, - rsme{rsme} + explicit RmseNode(std::shared_ptr y, std::shared_ptr yPred, ftype rmse) + : GraphNode({std::move(yPred)}), yTrue{std::move(y)}, rmse{rmse} { - assert(yPred->getDims()==yTrue->getDims()); - - if(!yPred->getRequiresGrad()){ + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ std::__throw_invalid_argument("yPred must be a graph node"); } } diff --git a/src/backend/computational_graph/tensor_ops/graph_creation.cpp b/src/backend/computational_graph/tensor_ops/graph_creation.cpp index 0004c94..1a28a08 100644 --- a/src/backend/computational_graph/tensor_ops/graph_creation.cpp +++ b/src/backend/computational_graph/tensor_ops/graph_creation.cpp @@ -84,7 +84,7 @@ shared_ptr cgraph::sub(const shared_ptr t, ftype scalar) { shared_ptr cgraph::div(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) / scalar); if(t->getRequiresGrad()){ - constexpr ftype eps = 1e-6; + constexpr ftype eps = 1e-9; res->setCgNode(std::make_shared(t, 1/std::max(scalar, eps))); assert(res->getRequiresGrad()); } diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 7b754fd..8915c99 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -209,6 +209,10 @@ class Tensor final : public std::enable_shared_from_this { bool hasGrads() const noexcept { return grads!=nullptr; } std::shared_ptr getGrads() const; + + void setGrads(std::shared_ptr grads) noexcept{ + this->grads = std::move(grads); + } void transposeThis() noexcept; void transposeThis(int dim1, int dim2) noexcept; diff --git a/src/backend/module/activation_functions/sigmoid.cpp 
b/src/backend/module/activation_functions/sigmoid.cpp index a0614d0..765b44a 100644 --- a/src/backend/module/activation_functions/sigmoid.cpp +++ b/src/backend/module/activation_functions/sigmoid.cpp @@ -24,12 +24,13 @@ using namespace module; Tensor Sigmoid::operator()(const Tensor& t) const { auto res = t.createEmptyCopy(); + constexpr ftype one = 1.0; auto compute = [](ftype x){ if(x>=0){ - return static_cast(1.0f) / (static_cast(1.0f) + exp(x)); + return one / (one + exp(-x)); } auto e = exp(x); - return e / (static_cast(1.0f) + e); + return e / (one + e); }; for(tensorSize_t i=0; i Sigmoid::operator()(const shared_ptr& t) const { auto res = make_shared((*this)(*t)); - + if(t->getRequiresGrad()){ res->setCgNode(make_shared(t, res)); assert(res->getRequiresGrad()); diff --git a/src/backend/module/activation_functions/softmax.cpp b/src/backend/module/activation_functions/softmax.cpp index 0e3e957..7a54965 100644 --- a/src/backend/module/activation_functions/softmax.cpp +++ b/src/backend/module/activation_functions/softmax.cpp @@ -11,6 +11,8 @@ #include "softmax.h" +#include "computational_graph/activation_functions/softmax_node.cpp" + #include using namespace std; @@ -22,21 +24,37 @@ using namespace module; * @return Tensor of shape (dim1, dim2, ..., n_classes) [== input.shape] */ Tensor Softmax::operator()(const Tensor& t) const { - Tensor res(t.getDims(), t.getDevice()); + if(t.getDims().nDims()<2){ + __throw_invalid_argument("Softmax expects input shape of minimum two dimensions"); + } + + const auto nRows = t.getDims()[-2]; + const auto nCols = t.getDims()[-1]; - Tensor tmp(t.getDims(), t.getDevice()); - for(tensorSize_t i=0; i(exp(t[i])), i); + // pre-compute exponents + Tensor tmp(t.getDims(), t.getDevice(), false); + for(tensorDim_t i=0; i::infinity(); + for(tensorDim_t j=0; j(t[start+i]); + for(tensorSize_t i=start; i(tmp[i]); } - for(tensorSize_t i=0; i Softmax::operator()(const shared_ptr& t) const { auto res = make_shared((*this)(*t)); 
if(t->getRequiresGrad()){ - //res->setCgNode(make_shared(t, eps)); + res->setCgNode(make_shared(t, res)); assert(res->getRequiresGrad()); } diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp index 9fb71fc..ecd0f10 100644 --- a/src/backend/training/loss_functions/bce_loss.cpp +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -36,7 +36,7 @@ shared_ptr BceLoss::operator()(const shared_ptr y, const shared_ } auto bce = [](ftype y, ftype ypred){ - constexpr ftype eps = 1e-6; + constexpr ftype eps = 1e-9; return y*log(std::max(ypred, eps)) + (1-y)*log(std::max(1-ypred, eps)); }; diff --git a/src/backend/training/loss_functions/crossentropy_loss.cpp b/src/backend/training/loss_functions/crossentropy_loss.cpp index df7ba0a..10b6cb7 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.cpp +++ b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -36,7 +36,7 @@ shared_ptr CrossEntropyLoss::operator()(const shared_ptr y, cons auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; for(tensorDim_t i=0; igetDims()[-1]; i++){ - constexpr ftype eps = 1e-6; + constexpr ftype eps = 1e-9; res += y->get(b, i) * log(std::max(ypred->get(b, i), eps)); } return res; diff --git a/src/backend/training/loss_functions/rsme_loss.cpp b/src/backend/training/loss_functions/rmse_loss.cpp similarity index 83% rename from src/backend/training/loss_functions/rsme_loss.cpp rename to src/backend/training/loss_functions/rmse_loss.cpp index 4dde243..30c750a 100644 --- a/src/backend/training/loss_functions/rsme_loss.cpp +++ b/src/backend/training/loss_functions/rmse_loss.cpp @@ -1,5 +1,5 @@ /** - * @file rsme_loss.cpp + * @file rmse_loss.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -9,9 +9,9 @@ * */ -#include "rsme_loss.h" +#include "rmse_loss.h" -#include "computational_graph/loss_functions/rsme_node.h" +#include "computational_graph/loss_functions/rmse_node.h" 
#include #include @@ -23,7 +23,7 @@ using namespace train; * @brief Expected shapes: (batchsize) or (batchsize, 1) * @return Tensor of shape (1) */ -shared_ptr RsmeLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { +shared_ptr RmseLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { if(!ypred->getRequiresGrad()) { __throw_invalid_argument("ypred must have gradient enabled"); } @@ -48,7 +48,7 @@ shared_ptr RsmeLoss::operator()(const shared_ptr y, const shared loss = sqrt(loss/nBatches); auto res = make_shared(std::vector{1}, std::vector{loss}, y->getDevice(), true); - res->setCgNode(make_shared(y, ypred, loss)); + res->setCgNode(make_shared(y, ypred, loss)); assert(res->getRequiresGrad()); return res; diff --git a/src/backend/training/loss_functions/rsme_loss.h b/src/backend/training/loss_functions/rmse_loss.h similarity index 86% rename from src/backend/training/loss_functions/rsme_loss.h rename to src/backend/training/loss_functions/rmse_loss.h index 5028012..804f88d 100644 --- a/src/backend/training/loss_functions/rsme_loss.h +++ b/src/backend/training/loss_functions/rmse_loss.h @@ -1,5 +1,5 @@ /** - * @file rsme_loss.h + * @file rmse_loss.h * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -14,7 +14,7 @@ #include "loss_base.h" namespace train { - class RsmeLoss final : public LossBase { + class RmseLoss final : public LossBase { public: std::shared_ptr operator()(const std::shared_ptr y, const std::shared_ptr ypred) const override; diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp index def896d..c5a93a8 100644 --- a/src/backend/training/optimizers/rmsprop.cpp +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -15,7 +15,7 @@ using namespace std; using namespace train; void RmsPropOptimizer::step() { - constexpr ftype eps = 1e-6; + constexpr ftype eps = 1e-9; for(const auto& param: params){ auto tPtr = param.get(); const auto gPtr = 
tPtr->getGrads().get(); diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index 2379cad..ef07e65 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -16,9 +16,6 @@ #include "computational_graph/tensor_ops/graph_creation.h" -#include "module/activation_functions/relu.h" -#include "module/activation_functions/leaky_relu.h" - #include TEST(AutogradTest, ThrowsIfNoGradientSet) { @@ -43,6 +40,36 @@ TEST(AutogradTest, SimpleAddition) { EXPECT_NEAR(t2->getGrads()->get(0), 10.0, 1e-5); } +TEST(AutogradTest, BroadcastAdd) { + // gradient of broadcast add w.r.t. bias should be sum over batch dimension + // upstream grad: (2,3) of ones → bias grad should be (3) of twos + auto t1 = TensorFunctions::makeSharedTensor({2, 3}, + {1.0, 2.0, 3.0, + 4.0, 5.0, 6.0}, true); + auto bias = TensorFunctions::makeSharedTensor({3}, + {0.0, 0.0, 0.0}, true); + + auto res = cgraph::add(t1, bias); + + // set upstream grad to ones and backprop + auto upstreamGrad = TensorFunctions::makeSharedTensor({2, 3}, + {1.0, 1.0, 1.0, + 1.0, 1.0, 1.0}, false); + res->backward(); + + // bias grad should be sum over batch: [2, 2, 2] + auto biasGrad = bias->getGrads(); + ASSERT_DOUBLE_EQ((*biasGrad)[0], 2.0); + ASSERT_DOUBLE_EQ((*biasGrad)[1], 2.0); + ASSERT_DOUBLE_EQ((*biasGrad)[2], 2.0); + + // t1 grad should be ones (add is identity for non-broadcast operand) + auto t1Grad = t1->getGrads(); + for(int i = 0; i < 6; i++) { + ASSERT_DOUBLE_EQ((*t1Grad)[i], 1.0); + } +} + TEST(AutogradTest, ScalarMultiplication) { auto t1 = TensorFunctions::makeSharedTensor({1}, {2.0}, true); auto t2 = TensorFunctions::makeSharedTensor({1}, {3.0}, true); @@ -120,36 +147,4 @@ TEST(AutogradTest, MultiVariateChainRule) { ASSERT_DOUBLE_EQ(y->getGrads()->get(0), 1.0); ASSERT_DOUBLE_EQ(y->getGrads()->get(1), 1.0); -} - -TEST(AutogradTest, ReLU) { - auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); - auto 
relu = module::ReLu(); - - auto y = relu(x); // [0, 0, 2] - auto loss = cgraph::sumTensor(y); // loss = 2 - - loss->backward(); - - // Gradient: [0, 0, 1] (only where input > 0) - ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 0.0); - ASSERT_DOUBLE_EQ(x->getGrads()->get(1), 0.0); - ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); -} - -TEST(AutogradTest, LeakyReLU) { - auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); - - constexpr ftype eps = 0.3; - auto relu = module::LeakyReLu(eps); - - auto y = relu(x); // [0, 0, 2] - auto loss = cgraph::sumTensor(y); // loss = 2 - - loss->backward(); - - // Gradient: [0, 0, 1] (only where input > 0) - ASSERT_DOUBLE_EQ(x->getGrads()->get(0), eps); - ASSERT_DOUBLE_EQ(x->getGrads()->get(1), eps); // by convention - ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); } \ No newline at end of file diff --git a/tests/backend/test_data_modeling.cpp b/tests/backend/test_data_modeling.cpp index 1fa4c06..69f585b 100644 --- a/tests/backend/test_data_modeling.cpp +++ b/tests/backend/test_data_modeling.cpp @@ -72,7 +72,23 @@ TEST(TensorOpsTest, TensorAddCanBroadCast) { } } -TEST(TensorOpsTest, TensorAddBroadcastNotComutative) { +TEST(TensorOpsTest, BroadcastAdd_2D) { + // (2,3) + (3) + auto t1 = Tensor({2, 3}, {1.0, 2.0, 3.0, + 4.0, 5.0, 6.0}, false); + auto t2 = Tensor({3}, {10.0, 20.0, 30.0}, false); + auto res = t1 + t2; + + // expected: each row of t1 gets t2 added elementwise + ASSERT_DOUBLE_EQ(res.get(0, 0), 11.0); + ASSERT_DOUBLE_EQ(res.get(0, 1), 22.0); + ASSERT_DOUBLE_EQ(res.get(0, 2), 33.0); + ASSERT_DOUBLE_EQ(res.get(1, 0), 14.0); + ASSERT_DOUBLE_EQ(res.get(1, 1), 25.0); + ASSERT_DOUBLE_EQ(res.get(1, 2), 36.0); +} + +TEST(TensorOpsTest, TensorAddBroadcastNotCommutative) { auto t1 = TensorFunctions::Ones({3, 2, 2}, false); auto t2 = Tensor({2}, {2, 3}, false); diff --git a/tests/backend/test_losses.cpp b/tests/backend/test_losses.cpp index 05da8e6..8844a82 100644 --- a/tests/backend/test_losses.cpp +++ 
b/tests/backend/test_losses.cpp @@ -13,6 +13,7 @@ #include "data_modeling/tensor_functions.h" +#include "training/loss_functions/rmse_loss.h" #include "training/loss_functions/bce_loss.h" #include "training/loss_functions/crossentropy_loss.h" @@ -24,7 +25,7 @@ static constexpr ftype kTol = 1e-4f; // ─── CrossEntropy ──────────────────────────────────────────────────────────── -TEST(LossTest, CrossEntropy_CorrectValue) { +TEST(LossTest, CrossEntropyFoward) { auto y = TensorFunctions::makeSharedTensor( {2, 3}, {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}, false); @@ -33,7 +34,7 @@ TEST(LossTest, CrossEntropy_CorrectValue) { {2, 3}, {0.7, 0.2, 0.1, 0.1, 0.8, 0.1}, true); - auto loss = CrossEntropyLoss{}; + CrossEntropyLoss loss; auto result = loss(y, ypred); // expected: -( log(0.7) + log(0.8) ) / 2 = 0.2899 @@ -41,7 +42,7 @@ TEST(LossTest, CrossEntropy_CorrectValue) { EXPECT_NEAR((*result)[0], expected, kTol); } -TEST(LossTest, CrossEntropy_PerfectPrediction) { +TEST(LossTest, CrossEntropyPerfectPrediction) { auto y = TensorFunctions::makeSharedTensor( {2, 3}, {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}, false); @@ -51,14 +52,14 @@ TEST(LossTest, CrossEntropy_PerfectPrediction) { {2, 3}, {0.999, 0.0005, 0.0005, 0.0005, 0.999, 0.0005}, true); - auto loss = CrossEntropyLoss{}; + CrossEntropyLoss loss; auto result = loss(y, ypred); // loss should be very small EXPECT_LT((*result)[0], 0.01f); } -TEST(LossTest, CrossEntropy_UniformPrediction) { +TEST(LossTest, CrossEntropyUniformPrediction) { // uniform prediction should give log(3) ~ 1.0986 auto y = TensorFunctions::makeSharedTensor( {1, 3}, {1.0, 0.0, 0.0}, false); @@ -66,32 +67,63 @@ TEST(LossTest, CrossEntropy_UniformPrediction) { auto ypred = TensorFunctions::makeSharedTensor( {1, 3}, {1.0f/3, 1.0f/3, 1.0f/3}, true); - auto loss = CrossEntropyLoss{}; + CrossEntropyLoss loss; auto result = loss(y, ypred); EXPECT_NEAR((*result)[0], std::log(3.0f), kTol); } -TEST(LossTest, CrossEntropy_DimMismatch_Throws) { +TEST(LossTest, 
CrossEntropyThrowsOnDimMismatch) { auto y = TensorFunctions::makeSharedTensor( {2, 3}, {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}, false); auto ypred = TensorFunctions::makeSharedTensor( {2, 2}, {0.5, 0.5, 0.5, 0.5}, true); - auto loss = CrossEntropyLoss{}; + CrossEntropyLoss loss; EXPECT_THROW(loss(y, ypred), std::invalid_argument); } +TEST(LossTest, CrossEntropyBackward) { + // y = [[1,0,0],[0,1,0]], ypred = [[0.7,0.2,0.1],[0.1,0.8,0.1]] + // grad CE w.r.t. ypred[b,i] = -y[b,i] / (ypred[b,i] * n) + // grad[0,0] = -1/(0.7*2) = -0.7143 + // grad[0,1] = 0 + // grad[0,2] = 0 + // grad[1,0] = 0 + // grad[1,1] = -1/(0.8*2) = -0.625 + // grad[1,2] = 0 + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, + 0.0, 1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2, 3}, {0.7, 0.2, 0.1, + 0.1, 0.8, 0.1}, true); + + CrossEntropyLoss loss; + auto result = loss(y, ypred); + std::cout << "before bw" << std::endl; + result->backward(); + std::cout << "past bw" << std::endl; + + auto grads = ypred->getGrads(); + EXPECT_NEAR((*grads)[0], -0.7143f, kTol); + EXPECT_NEAR((*grads)[1], 0.0f, kTol); + EXPECT_NEAR((*grads)[2], 0.0f, kTol); + EXPECT_NEAR((*grads)[3], 0.0f, kTol); + EXPECT_NEAR((*grads)[4], -0.625f, kTol); + EXPECT_NEAR((*grads)[5], 0.0f, kTol); +} + // ─── BCE ───────────────────────────────────────────────────────────────────── -TEST(LossTest, BCE_CorrectValue) { +TEST(LossTest, BceForward) { auto y = TensorFunctions::makeSharedTensor( {4, 1}, {0.0, 1.0, 1.0, 0.0}, false); auto ypred = TensorFunctions::makeSharedTensor( {4, 1}, {0.1, 0.9, 0.8, 0.2}, true); - auto loss = BceLoss{}; + BceLoss loss; auto result = loss(y, ypred); // expected: -( log(0.9) + log(0.9) + log(0.8) + log(0.8) ) / 4 = 0.1643 @@ -100,20 +132,20 @@ TEST(LossTest, BCE_CorrectValue) { EXPECT_NEAR((*result)[0], expected, kTol); } -TEST(LossTest, BCE_PerfectPrediction) { +TEST(LossTest, BcePerfectPrediction) { auto y = TensorFunctions::makeSharedTensor( {2, 1}, {1.0, 0.0}, 
false); auto ypred = TensorFunctions::makeSharedTensor( {2, 1}, {0.999, 0.001}, true); - auto loss = BceLoss{}; + BceLoss loss; auto result = loss(y, ypred); EXPECT_LT((*result)[0], 0.01f); } -TEST(LossTest, BCE_RandomPrediction) { +TEST(LossTest, BceRandomPrediction) { // ypred = 0.5 for all -> loss = log(2) ~ 0.6931 auto y = TensorFunctions::makeSharedTensor( {2, 1}, {1.0, 0.0}, false); @@ -121,31 +153,98 @@ TEST(LossTest, BCE_RandomPrediction) { auto ypred = TensorFunctions::makeSharedTensor( {2, 1}, {0.5, 0.5}, true); - auto loss = BceLoss{}; + BceLoss loss; auto result = loss(y, ypred); EXPECT_NEAR((*result)[0], std::log(2.0f), kTol); } -TEST(LossTest, BCE_DimMismatch_Throws) { +TEST(LossTest, BceThrowsOnDimMismatch) { auto y = TensorFunctions::makeSharedTensor( {2, 1}, {1.0, 0.0}, false); auto ypred = TensorFunctions::makeSharedTensor( {3, 1}, {0.5, 0.5, 0.5}, true); - auto loss = BceLoss{}; + BceLoss loss; EXPECT_THROW(loss(y, ypred), std::invalid_argument); } -TEST(LossTest, BCE_NearZeroPred_NoInfOrNan) { +TEST(LossTest, BceNoInfOrNanOnNearZeroPred) { auto y = TensorFunctions::makeSharedTensor( {1, 1}, {1.0}, false); auto ypred = TensorFunctions::makeSharedTensor( {1, 1}, {0.0}, true); - auto loss = BceLoss{}; + BceLoss loss; auto result = loss(y, ypred); // clipping prevents log(0) EXPECT_FALSE(std::isinf((*result)[0])); +} + +TEST(LossTest, BceBackward) { + // y = [1, 0], ypred = [0.8, 0.3] + // grad BCE w.r.t. 
ypred_i = (-y/ypred + (1-y)/(1-ypred)) / n + // grad[0] = (-1/0.8 + 0) / 2 = -0.625 + // grad[1] = (0 + 1/0.7) / 2 = 0.7143 + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2, 1}, {0.8, 0.3}, true); + + BceLoss loss; + auto result = loss(y, ypred); + result->backward(); + + auto grads = ypred->getGrads(); + EXPECT_NEAR((*grads)[0], -0.625f, kTol); + EXPECT_NEAR((*grads)[1], 0.7143f, kTol); +} + +TEST(LossTest, RmseForward) { + // y = [1, 2, 3], ypred = [1.5, 2.5, 2.5] + // diffs = [-0.5, -0.5, 0.5] + // MSE = (0.25 + 0.25 + 0.25) / 3 = 0.25 + // RMSE = 0.5 + auto y = TensorFunctions::makeSharedTensor( + {3}, {1.0, 2.0, 3.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {3}, {1.5, 2.5, 2.5}, true); + + auto loss = RmseLoss{}; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], 0.5f, kTol); +} + +TEST(LossTest, RmsePerfectPrediction) { + auto y = TensorFunctions::makeSharedTensor( + {3}, {1.0, 2.0, 3.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {3}, {1.0, 2.0, 3.0}, true); + + RmseLoss loss; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], 0.0f, kTol); +} + +TEST(LossTest, RmseBackward) { + // y = [1, 0], ypred = [0.5, 0.5] + // diffs = [0.5, -0.5], MSE = 0.25, RMSE = 0.5 + // grad_i = -(y_i - ypred_i) / (n * RMSE) + // grad[0] = -(1 - 0.5) / (2 * 0.5) = -0.5 + // grad[1] = -(0 - 0.5) / (2 * 0.5) = 0.5 + auto y = TensorFunctions::makeSharedTensor( + {2}, {1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2}, {0.5, 0.5}, true); + + RmseLoss loss; + auto result = loss(y, ypred); + result->backward(); + + auto grads = ypred->getGrads(); + EXPECT_NEAR((*grads)[0], -0.5f, kTol); + EXPECT_NEAR((*grads)[1], 0.5f, kTol); } \ No newline at end of file diff --git a/tests/backend/test_module.cpp b/tests/backend/test_module.cpp index 459d577..a74c88c 100644 --- a/tests/backend/test_module.cpp +++ 
b/tests/backend/test_module.cpp @@ -16,15 +16,18 @@ #include "module/activation_functions/relu.h" #include "module/activation_functions/leaky_relu.h" #include "module/activation_functions/softmax.h" +#include "module/activation_functions/sigmoid.h" #include "data_modeling/tensor_functions.h" +#include "computational_graph/tensor_ops/graph_creation.h" -using namespace module; -using namespace module; +#include -TEST(ActivationTest, TestRelu1) { +constexpr ftype delta = 1e-3; + +TEST(ActivationTest, ReluForward) { auto t1 = TensorFunctions::Ones({3, 2}, false); - auto f = ReLu(); + auto f = module::ReLu(); auto res = f(t1); @@ -33,9 +36,9 @@ TEST(ActivationTest, TestRelu1) { } } -TEST(ActivationTest, TestRelu2) { +TEST(ActivationTest, ReluInputNegative) { auto t1 = TensorFunctions::Ones({3, 2}, false) * -1; - auto f = ReLu(); + auto f = module::ReLu(); auto res = f(t1); @@ -45,10 +48,25 @@ TEST(ActivationTest, TestRelu2) { } } -TEST(ActivationTest, TestLeakyRelu1) { +TEST(AutogradTest, ReLUBackward) { + auto x = TensorFunctions::makeSharedTensor({3}, {-1.0, 0.0, 2.0}, true); + auto relu = module::ReLu(); + + auto y = relu(x); // [0, 0, 2] + auto loss = cgraph::sumTensor(y); // loss = 2 + + loss->backward(); + + // Gradient: [0, 0, 1] (only where input > 0) + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); +} + +TEST(ActivationTest, LeakyReluForward) { auto t1 = TensorFunctions::Ones({3, 2}, false); - auto f = LeakyReLu(0.3); + auto f = module::LeakyReLu(0.3); auto res = f(t1); for(size_t i=0; ibackward(); + + // Gradient: [0, 0, 1] (only where input > 0) + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), eps); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), eps); // by convention + ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); +} + +TEST(ActivationTest, SigmoidForward) { + // sigmoid(0) = 0.5, sigmoid(1) = 0.7311, sigmoid(-1) = 0.2689 + auto t = Tensor({3}, {0.0, 1.0, -1.0}, true); + + 
module::Sigmoid sig; + auto res = sig(t); + + EXPECT_NEAR(res[0], 0.5, delta); + EXPECT_NEAR(res[1], 0.7311, delta); + EXPECT_NEAR(res[2], 0.2689, delta); +} + +TEST(ActivationTest, SigmoidLargePositive) { + // sigmoid(100) should be ~1, not inf or nan + auto t = Tensor({1}, {100.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + + EXPECT_NEAR(res[0], 1.0, delta); + EXPECT_FALSE(std::isnan(res[0])); + EXPECT_FALSE(std::isinf(res[0])); +} + +TEST(ActivationTest, SigmoidLargeNegative) { + // sigmoid(-100) should be ~0, not nan + auto t = Tensor({1}, {-100.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + + EXPECT_NEAR(res[0], 0.0, delta); + EXPECT_FALSE(std::isnan(res[0])); + EXPECT_FALSE(std::isinf(res[0])); +} + +TEST(AutogradTest, SigmoidBackward) { + // grad of sigmoid = sigmoid(x) * (1 - sigmoid(x)) + // for x=0: grad = 0.5 * 0.5 = 0.25 + // for x=1: grad = 0.7311 * 0.2689 = 0.1966 + auto t = TensorFunctions::makeSharedTensor( + {2}, {0.0, 1.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + res->backward(); + + auto grads = t->getGrads(); + EXPECT_NEAR((*grads)[0], 0.25, delta); + EXPECT_NEAR((*grads)[1], 0.1966, delta); +} + +TEST(ActivationTest, SoftmaxForward) { + // softmax([1, 2, 3]) + // exp([1,2,3]) = [2.7183, 7.3891, 20.0855] + // sum = 30.1929 + // softmax = [0.0900, 0.2447, 0.6652] + auto t = Tensor({1, 3}, {1.0, 2.0, 3.0}, true); + + module::Softmax sm; + auto res = sm(t); + + EXPECT_NEAR(res[0], 0.0900, delta); + EXPECT_NEAR(res[1], 0.2447, delta); + EXPECT_NEAR(res[2], 0.6652, delta); +} + +TEST(ActivationTest, SoftmaxSumsToOne) { + auto t = Tensor({2, 4}, + {1.0, 2.0, 3.0, 4.0, + 2.0, 1.0, 4.0, 3.0}, + true); + + module::Softmax sm; + auto res = sm(t); + + // each row must sum to 1 + ftype row0sum = res[0] + res[1] + res[2] + res[3]; + ftype row1sum = res[4] + res[5] + res[6] + res[7]; + EXPECT_NEAR(row0sum, 1.0, delta); + EXPECT_NEAR(row1sum, 1.0, delta); +} + +TEST(ActivationTest, SoftmaxForwardNumericalStability) { + // 
large values should not produce nan or inf + auto t = Tensor({1, 3}, {100.0, 101.0, 102.0}, true); + + module::Softmax sm; + auto res = sm(t); + + for(int i = 0; i < 3; i++) { + EXPECT_FALSE(std::isnan(res[i])); + EXPECT_FALSE(std::isinf(res[i])); + } + ftype rowsum = res[0] + res[1] + res[2]; + EXPECT_NEAR(rowsum, 1.0, delta); +} + +TEST(AutogradTest, SoftmaxBackward) { + // for softmax with upstream grad of ones, the gradient is zero + // because d/dx_i sum(softmax(x)) = 0 (softmax sums to 1 always) + // more useful: upstream = [1, 0, 0] + // grad[i] = softmax[i] * (upstream[i] - dot(upstream, softmax)) + // for x=[1,2,3], softmax=[0.09, 0.2447, 0.6652] + // dot([1,0,0], softmax) = 0.09 + // grad[0] = 0.09 * (1 - 0.09) = 0.0819 + // grad[1] = 0.2447 * (0 - 0.09) = -0.0220 + // grad[2] = 0.6652 * (0 - 0.09) = -0.0599 + auto t = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0, 2.0, 3.0}, true); + + module::Softmax sm; + auto resPtr = sm(t); + + // set upstream gradient to [1, 0, 0] + auto upstream = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0, 0.0, 0.0}, false); + resPtr->setGrads(upstream); + resPtr->backward(); + + auto grads = t->getGrads(); + EXPECT_NEAR((*grads)[0], 0.0819, delta); + EXPECT_NEAR((*grads)[1], -0.0220, delta); + EXPECT_NEAR((*grads)[2], -0.0599, delta); +} + TEST(LayerTest, TestFfLayer) { auto t1 = TensorFunctions::Ones({3, 2}, false); - auto layer = FfLayer(2, 1, true, false); + auto layer = module::FfLayer(2, 1, true, false); auto res = layer(t1); From f3cb32b33683b6bdab4d502150c4fce10024aa66 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sun, 15 Mar 2026 14:39:42 +0100 Subject: [PATCH 16/24] Give gaussian init variables to fix train loop --- .../activation_functions/softmax_node.cpp | 1 + src/backend/data_modeling/tensor.cpp | 12 ++++--- src/backend/data_modeling/tensor.h | 2 +- .../data_modeling/tensor_functions.cpp | 14 ++++---- src/backend/data_modeling/tensor_functions.h | 6 ++-- src/backend/module/layers/ff_layer.cpp | 6 
++-- src/backend/utility/initializers.cpp | 18 +++++----- src/backend/utility/initializers.h | 2 +- src/python/py_core/py_core.cpp | 33 +++++++++++++------ src/python/py_core/py_core_util.h | 18 +++++----- 10 files changed, 68 insertions(+), 44 deletions(-) diff --git a/src/backend/computational_graph/activation_functions/softmax_node.cpp b/src/backend/computational_graph/activation_functions/softmax_node.cpp index a616064..2b7dc41 100644 --- a/src/backend/computational_graph/activation_functions/softmax_node.cpp +++ b/src/backend/computational_graph/activation_functions/softmax_node.cpp @@ -26,6 +26,7 @@ vector< shared_ptr > SoftmaxNode::backward(const Tensor& upstreamGrad) { auto res = make_shared(yPred->createEmptyCopy()); ftype bSize = yPred->getDims()[0]; + assert(bSize>0); for(tensorDim_t i=0; igetDims()[0]; i++){ for(tensorDim_t j=0; jgetDims()[1]; j++){ ftype g = 0; diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 47478ff..ba0010d 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -529,8 +529,12 @@ void Tensor::backward() { auto& tensor = *tPtr; assert(tensor.grads && !tensor.grads->requiresGrad); // gradient should not require grad - cout << "backward of " << tPtr << endl; - cout << "grads " << *tensor.grads << endl; + static int count = 0; + if(tPtr->requiresGrad && count % 50 == 0){ + cout << "\nbackward of " << tPtr << endl; // TODO: remove + cout << "\ngrads " << *tensor.grads << endl; // TODO: remove + } + auto incomingGrads = tensor.cgNode->backward(*tensor.grads); const auto& parents = tensor.cgNode->getParents(); @@ -769,8 +773,8 @@ void Tensor::reset(const ftype x) noexcept { /** * @brief Populates the tensor with values drawn according to initializer. 
*/ -void Tensor::reset(const utility::InitClass ic) { - const auto init = utility::InitializerFactory::getInitializer(ic); +void Tensor::reset(const utility::InitClass ic, const ftype mean, const ftype stddev) { + const auto init = utility::InitializerFactory::getInitializer(ic, mean, stddev); for(tensorSize_t i=0; igetSize(); i++){ (*values)[i] = init->drawNumber(); } diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 8915c99..00df2ed 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -177,7 +177,7 @@ class Tensor final : public std::enable_shared_from_this { Tensor& operator=(Tensor&& other) noexcept; void reset(const ftype x) noexcept; - void reset(const utility::InitClass ic); + void reset(const utility::InitClass ic, ftype mean, ftype stddev); const Dimension& getDims() const noexcept; tensorSize_t getSize() const noexcept; diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp index c7986a0..ccab416 100644 --- a/src/backend/data_modeling/tensor_functions.cpp +++ b/src/backend/data_modeling/tensor_functions.cpp @@ -33,14 +33,16 @@ Tensor TensorFunctions::Ones(vector dims, const bool requiresGrad) return Ones(std::move(dims), Tensor::getDefaultDevice(), requiresGrad); } -Tensor TensorFunctions::Gaussian(vector dims, Device d, const bool requiresGrad) { +Tensor TensorFunctions::Gaussian(vector dims, const ftype mean, const ftype stddev, + const Device d, const bool requiresGrad) { auto res = Tensor(std::move(dims), d, requiresGrad); - res.reset(utility::InitClass::Gaussian); + res.reset(utility::InitClass::Gaussian, mean, stddev); return res; } -Tensor TensorFunctions::Gaussian(vector dims, const bool requiresGrad) { - return Gaussian(std::move(dims), Tensor::getDefaultDevice(), requiresGrad); +Tensor TensorFunctions::Gaussian(vector dims, const ftype mean, + const ftype stddev, const bool requiresGrad) { + return 
Gaussian(std::move(dims), mean, stddev, Tensor::getDefaultDevice(), requiresGrad); } // Tensor manipulation @@ -52,8 +54,8 @@ void TensorFunctions::ToOnes(Tensor& t) noexcept { t.reset(1); } -void TensorFunctions::ToGaussian(Tensor& t) { - t.reset(utility::InitClass::Gaussian); +void TensorFunctions::ToGaussian(Tensor& t, const ftype mean, const ftype stddev) { + t.reset(utility::InitClass::Gaussian, mean, stddev); } shared_ptr TensorFunctions::makeSharedTensor(const vector& dims, bool requiresGrad){ diff --git a/src/backend/data_modeling/tensor_functions.h b/src/backend/data_modeling/tensor_functions.h index a79955e..b05f8da 100644 --- a/src/backend/data_modeling/tensor_functions.h +++ b/src/backend/data_modeling/tensor_functions.h @@ -32,8 +32,8 @@ namespace TensorFunctions { // class name acts as namespace for us Tensor Ones(std::vector dims, Device d, const bool requiresGrad=false); Tensor Ones(std::vector dims, const bool requiresGrad=false); - Tensor Gaussian(std::vector dims, Device d, const bool requiresGrad=false); - Tensor Gaussian(std::vector dims, const bool requiresGrad=false); + Tensor Gaussian(std::vector dims, ftype mean, ftype stddev, Device d, const bool requiresGrad=false); + Tensor Gaussian(std::vector dims, ftype mean=0, ftype stddev=1, const bool requiresGrad=false); std::shared_ptr makeSharedTensor(const std::vector& dims, bool requiresGrad=false); @@ -50,7 +50,7 @@ namespace TensorFunctions { // class name acts as namespace for us // Tensor manipulation void ToZeros(Tensor& t) noexcept; void ToOnes(Tensor& t) noexcept; - void ToGaussian(Tensor& t); + void ToGaussian(Tensor& t, ftype mean, ftype stddev); // Arithmetics Tensor SumOverDims(const Tensor& t, tensorDim_t dim=0); // default 0 for batch-size diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp index ce9946e..77566f5 100644 --- a/src/backend/module/layers/ff_layer.cpp +++ b/src/backend/module/layers/ff_layer.cpp @@ -35,11 +35,13 @@ 
FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, bool useBias : useBias{useBias}, requiresGrad{requiresGrad} { weights = make_shared(Dimension({inSize, outSize}), d, requiresGrad); - TensorFunctions::ToGaussian(*weights); + TensorFunctions::ToGaussian(*weights, 0, 0.1); + weights = weights; if(useBias){ bias = make_shared(vector{outSize}, d, requiresGrad); - TensorFunctions::ToGaussian(*bias); + TensorFunctions::ToGaussian(*bias, 0, 0.001); + bias = bias; } } diff --git a/src/backend/utility/initializers.cpp b/src/backend/utility/initializers.cpp index 4240345..ad8dbfa 100644 --- a/src/backend/utility/initializers.cpp +++ b/src/backend/utility/initializers.cpp @@ -19,26 +19,28 @@ using namespace utility; namespace { class GaussianInitializer final : public InitializerBase { + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::normal_distribution dist; + public: - GaussianInitializer(); + GaussianInitializer(ftype mean, ftype stddev); ftype drawNumber() const override; }; - GaussianInitializer::GaussianInitializer() : InitializerBase() {} + GaussianInitializer::GaussianInitializer(ftype mean, ftype stddev) + : InitializerBase(), gen{rd()}, dist{mean, stddev} {} ftype GaussianInitializer::drawNumber() const { - static std::random_device rd; - static std::mt19937 gen{rd()}; - static std::normal_distribution dist; - return dist(gen); } } -unique_ptr InitializerFactory::getInitializer(InitClass ic) { +unique_ptr InitializerFactory::getInitializer(InitClass ic, ftype mean, ftype stddev) { switch(ic){ case InitClass::Gaussian: - return make_unique(); + return make_unique(mean, stddev); default: __throw_invalid_argument("Init class not implemented yet"); } diff --git a/src/backend/utility/initializers.h b/src/backend/utility/initializers.h index ba76707..05991e3 100644 --- a/src/backend/utility/initializers.h +++ b/src/backend/utility/initializers.h @@ -31,6 +31,6 @@ namespace utility{ class InitializerFactory final { public: 
InitializerFactory() = delete; - static std::unique_ptr getInitializer(InitClass ic); + static std::unique_ptr getInitializer(InitClass ic, ftype mean=0, ftype stddev=0.5); }; } diff --git a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp index 476511f..8dfad1f 100644 --- a/src/python/py_core/py_core.cpp +++ b/src/python/py_core/py_core.cpp @@ -89,6 +89,16 @@ BOOST_PYTHON_MODULE(_core) return (*fPtr)(self.getSharedPtr()); \ } + #define WRAP_FREE_FUNC_8(fPtr, T1, T2, T3, T4) \ + +[](T1 v1, T2 v2, T3 v3, T4 v4) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2, v3, v4)); \ + } + + #define WRAP_FREE_FUNC_9(fPtr, T1, T2, T3, T4, T5) \ + +[](T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2, v3, v4, v5)); \ + } + #define WRAP_FUNC_AND_CONVERT_DTYPE_1(method) \ +[](const Tensor& self, int v1) -> ftype { \ return self.method(static_cast(v1)); \ @@ -144,17 +154,20 @@ BOOST_PYTHON_MODULE(_core) .def("ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)) .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)) .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)) - .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)).staticmethod("ones") + .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)) + .staticmethod("ones") .def("zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)) .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)) .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)) - .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)).staticmethod("zeros") + .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)) + .staticmethod("zeros") - .def("gauss", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)) - .def("gauss", 
WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)) - .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)) - .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)).staticmethod("gauss") + .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian0, std::vector, ftype, ftype)) + .def("gauss", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian1, std::vector, ftype, ftype, Device)) + .def("gauss", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian2, std::vector, ftype, ftype, const bool)) + .def("gauss", WRAP_FREE_FUNC_9(Py_DataModeling::Gaussian3, std::vector, ftype, ftype, Device, const bool)) + .staticmethod("gauss") // properties .add_property("device", &Tensor::getDevice, &Tensor::setDevice) @@ -214,8 +227,8 @@ BOOST_PYTHON_MODULE(_core) def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)); def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)); - def("Gaussian", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)); - def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)); - def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)); - def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)); + def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian0, std::vector, ftype, ftype)); + def("Gaussian", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian1, std::vector, ftype, ftype, Device)); + def("Gaussian", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian2, std::vector, ftype, ftype, const bool)); + def("Gaussian", WRAP_FREE_FUNC_9(Py_DataModeling::Gaussian3, std::vector, ftype, ftype, Device, const bool)); } \ No newline at end of file diff --git a/src/python/py_core/py_core_util.h b/src/python/py_core/py_core_util.h index 1f97ce4..6951184 100644 --- a/src/python/py_core/py_core_util.h +++ b/src/python/py_core/py_core_util.h @@ -58,12 +58,12 @@ 
namespace Py_DataModeling { return TensorFunctions::Zeros(std::move(dims), d); } - inline auto GaussianWrapper0(std::vector dims) { - return TensorFunctions::Gaussian(std::move(dims)); + inline auto GaussianWrapper0(std::vector dims, ftype mean, ftype stddev) { + return TensorFunctions::Gaussian(std::move(dims), mean, stddev); } - inline auto GaussianWrapper1(std::vector dims, Device d) { - return TensorFunctions::Gaussian(std::move(dims), d); + inline auto GaussianWrapper1(std::vector dims, ftype mean, ftype stddev, Device d) { + return TensorFunctions::Gaussian(std::move(dims), mean, stddev, d); } inline Tensor (*Ones0)(std::vector) = &OnesWrapper0; @@ -76,13 +76,13 @@ namespace Py_DataModeling { inline Tensor (*Zeros2)(std::vector, const bool) = &(TensorFunctions::Zeros); inline Tensor (*Zeros3)(std::vector, Device, const bool) = &(TensorFunctions::Zeros); - inline Tensor (*Gaussian0)(std::vector) = &GaussianWrapper0; - inline Tensor (*Gaussian1)(std::vector, Device) = &GaussianWrapper1; - inline Tensor (*Gaussian2)(std::vector, const bool) = &(TensorFunctions::Gaussian); - inline Tensor (*Gaussian3)(std::vector, Device, const bool) = &(TensorFunctions::Gaussian); + inline Tensor (*Gaussian0)(std::vector, ftype, ftype) = &GaussianWrapper0; + inline Tensor (*Gaussian1)(std::vector, ftype, ftype, Device) = &GaussianWrapper1; + inline Tensor (*Gaussian2)(std::vector, ftype, ftype, const bool) = &(TensorFunctions::Gaussian); + inline Tensor (*Gaussian3)(std::vector, ftype, ftype, Device, const bool) = &(TensorFunctions::Gaussian); inline void (Tensor::*reset1)(const ftype) = &Tensor::reset; - inline void (Tensor::*reset2)(const utility::InitClass) = &Tensor::reset; + inline void (Tensor::*reset2)(const utility::InitClass, ftype, ftype) = &Tensor::reset; inline void (Tensor::*transposeThis1)() = &Tensor::transposeThis; inline void (Tensor::*transposeThis2)(int, int) = &Tensor::transposeThis; From dfd98b9a80abfe64bd05da1e1a1dca5f9a7c8de4 Mon Sep 17 00:00:00 2001 From: 
Robert Baumgartner Date: Sun, 15 Mar 2026 15:01:54 +0100 Subject: [PATCH 17/24] Yet to fix training loop --- src/backend/module/layers/ff_layer.cpp | 4 +- tests/backend/test_train_loop.cpp | 78 +++++++++++++------------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp index 77566f5..e0500a7 100644 --- a/src/backend/module/layers/ff_layer.cpp +++ b/src/backend/module/layers/ff_layer.cpp @@ -35,12 +35,12 @@ FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, bool useBias : useBias{useBias}, requiresGrad{requiresGrad} { weights = make_shared(Dimension({inSize, outSize}), d, requiresGrad); - TensorFunctions::ToGaussian(*weights, 0, 0.1); + TensorFunctions::ToGaussian(*weights, 0, 0.2); weights = weights; if(useBias){ bias = make_shared(vector{outSize}, d, requiresGrad); - TensorFunctions::ToGaussian(*bias, 0, 0.001); + TensorFunctions::ToZeros(*bias); bias = bias; } } diff --git a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp index 21e255f..c829775 100644 --- a/tests/backend/test_train_loop.cpp +++ b/tests/backend/test_train_loop.cpp @@ -49,7 +49,7 @@ static shared_ptr makeMulticlassNet() { net->append(make_shared(2, 8, true, true)); - net->append(make_shared()); + net->append(make_shared(1e-5)); net->append(make_shared(8, 3, true, true)); @@ -57,7 +57,7 @@ static shared_ptr makeMulticlassNet() { return net; } -TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { +TEST(OverfitTest, BceSgdOverfitsSmallDataset) { // XOR-like: 4 samples, 2 features, binary labels auto x = TensorFunctions::makeSharedTensor( {4, 2}, {0.0, 0.0, @@ -74,12 +74,12 @@ TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { auto net = makeBinaryNet(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.01); + net->parameters(), /*lr=*/0.001); auto trainLoop = train::BaseTrainLoop( net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); 
- trainLoop.run(x, y, /*shuffle=*/false); + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); // forward one more time to get final loss auto pred = (*net)(x); @@ -91,38 +91,38 @@ TEST(OverfitTest, BCE_SGD_OverfitsSmallDataset) { << "Network failed to overfit binary dataset"; } -// TEST(OverfitTest, CrossEntropy_RMSProp_OverfitsSmallDataset) { -// // 6 samples, 2 features, 3 classes -// auto x = TensorFunctions::makeSharedTensor( -// {6, 2}, {1.0, 0.0, -// 1.0, 0.1, -// 0.0, 1.0, -// 0.1, 1.0, -// 0.5, 0.5, -// 0.4, 0.6}, false); -// -// // one-hot encoded labels -// auto y = TensorFunctions::makeSharedTensor( -// {6, 3}, {1.0, 0.0, 0.0, -// 1.0, 0.0, 0.0, -// 0.0, 1.0, 0.0, -// 0.0, 1.0, 0.0, -// 0.0, 0.0, 1.0, -// 0.0, 0.0, 1.0}, false); -// -// auto net = makeMulticlassNet(); -// auto loss = make_shared(); -// auto optim = make_shared( -// net->parameters(), /*lr=*/0.001, /*decay=*/0.9); -// -// auto trainLoop = train::BaseTrainLoop( -// net, loss, optim, /*epochs=*/2000, /*bsize=*/6); -// -// trainLoop.run(x, y, /*shuffle=*/false); -// -// auto pred = (*net)(x); -// auto finalLoss = (*loss)(y, pred); -// -// EXPECT_LT((*finalLoss)[0], 0.05f) -// << "Network failed to overfit multiclass dataset"; -// } \ No newline at end of file +TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { + // 6 samples, 2 features, 3 classes + auto x = TensorFunctions::makeSharedTensor( + {6, 2}, {1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6}, false); + + // one-hot encoded labels + auto y = TensorFunctions::makeSharedTensor( + {6, 3}, {1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0}, false); + + auto net = makeMulticlassNet(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.0001, /*decay=*/0.95); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/2000, /*bsize=*/6); + + trainLoop.run(x, y, /*shuffle=*/false); + + auto pred = 
(*net)(x); + auto finalLoss = (*loss)(y, pred); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit multiclass dataset"; +} \ No newline at end of file From 4b0fe885a1acebec4ce80cde2678d8d6fd5e52e6 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Mon, 16 Mar 2026 16:52:18 +0100 Subject: [PATCH 18/24] Fixing first train loop --- .../activation_functions/sigmoid_node.cpp | 2 +- src/backend/data_modeling/tensor.cpp | 2 +- src/backend/training/optimizers/sgd.h | 2 - tests/backend/test_train_loop.cpp | 74 +++++++++---------- 4 files changed, 39 insertions(+), 41 deletions(-) diff --git a/src/backend/computational_graph/activation_functions/sigmoid_node.cpp b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp index 79bd756..5873724 100644 --- a/src/backend/computational_graph/activation_functions/sigmoid_node.cpp +++ b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp @@ -28,7 +28,7 @@ vector> SigmoidNode::backward(const Tensor& upstreamGrad) { }; for(tensorSize_t i=0; iset(derivative((*sigmoid)[i] * upstreamGrad[i]), i); + res->set(derivative((*sigmoid)[i]) * upstreamGrad[i], i); } return {res}; diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index ba0010d..685368b 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -868,7 +868,7 @@ void printValuesCpu(std::ostream& os, const Tensor& t) { printVals(t); if(t.grads){ - os << "Grads:\n"; + os << "\n\nGrads:\n"; printVals(*t.grads); } } diff --git a/src/backend/training/optimizers/sgd.h b/src/backend/training/optimizers/sgd.h index 7eb6f9b..0d8a891 100644 --- a/src/backend/training/optimizers/sgd.h +++ b/src/backend/training/optimizers/sgd.h @@ -15,8 +15,6 @@ namespace train { class SgdOptimizer final : public OptimizerBase { - private: - public: SgdOptimizer(std::vector< std::shared_ptr > params, ftype lr) : OptimizerBase(std::move(params), lr) { } diff --git 
a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp index c829775..3f3dc4b 100644 --- a/tests/backend/test_train_loop.cpp +++ b/tests/backend/test_train_loop.cpp @@ -74,10 +74,10 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { auto net = makeBinaryNet(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.001); + net->parameters(), /*lr=*/0.05); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(1)); trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); @@ -91,38 +91,38 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { << "Network failed to overfit binary dataset"; } -TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { - // 6 samples, 2 features, 3 classes - auto x = TensorFunctions::makeSharedTensor( - {6, 2}, {1.0, 0.0, - 1.0, 0.1, - 0.0, 1.0, - 0.1, 1.0, - 0.5, 0.5, - 0.4, 0.6}, false); - - // one-hot encoded labels - auto y = TensorFunctions::makeSharedTensor( - {6, 3}, {1.0, 0.0, 0.0, - 1.0, 0.0, 0.0, - 0.0, 1.0, 0.0, - 0.0, 1.0, 0.0, - 0.0, 0.0, 1.0, - 0.0, 0.0, 1.0}, false); - - auto net = makeMulticlassNet(); - auto loss = make_shared(); - auto optim = make_shared( - net->parameters(), /*lr=*/0.0001, /*decay=*/0.95); - - auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/2000, /*bsize=*/6); - - trainLoop.run(x, y, /*shuffle=*/false); - - auto pred = (*net)(x); - auto finalLoss = (*loss)(y, pred); - - EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit multiclass dataset"; -} \ No newline at end of file +//TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { +// // 6 samples, 2 features, 3 classes +// auto x = TensorFunctions::makeSharedTensor( +// {6, 2}, {1.0, 0.0, +// 1.0, 0.1, +// 0.0, 1.0, +// 0.1, 1.0, +// 0.5, 0.5, +// 0.4, 0.6}, false); +// +// // one-hot encoded labels +// auto y = TensorFunctions::makeSharedTensor( +// {6, 3}, 
{1.0, 0.0, 0.0, +// 1.0, 0.0, 0.0, +// 0.0, 1.0, 0.0, +// 0.0, 1.0, 0.0, +// 0.0, 0.0, 1.0, +// 0.0, 0.0, 1.0}, false); +// +// auto net = makeMulticlassNet(); +// auto loss = make_shared(); +// auto optim = make_shared( +// net->parameters(), /*lr=*/0.0001, /*decay=*/0.95); +// +// auto trainLoop = train::BaseTrainLoop( +// net, loss, optim, /*epochs=*/2000, /*bsize=*/6); +// +// trainLoop.run(x, y, /*shuffle=*/false); +// +// auto pred = (*net)(x); +// auto finalLoss = (*loss)(y, pred); +// +// EXPECT_LT((*finalLoss)[0], 0.05f) +// << "Network failed to overfit multiclass dataset"; +//} \ No newline at end of file From 344f1e1a5f7816106480fda59f8368ebb381baab Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Tue, 17 Mar 2026 10:12:41 +0100 Subject: [PATCH 19/24] Fixed the two train loops --- .../activation_functions/softmax_node.cpp | 24 +++--- src/backend/computational_graph/graph_node.h | 23 ++++++ .../loss_functions/bce_node.cpp | 2 +- .../training/loss_functions/bce_loss.cpp | 2 +- .../loss_functions/crossentropy_loss.cpp | 2 +- tests/backend/test_train_loop.cpp | 78 +++++++++---------- 6 files changed, 76 insertions(+), 55 deletions(-) diff --git a/src/backend/computational_graph/activation_functions/softmax_node.cpp b/src/backend/computational_graph/activation_functions/softmax_node.cpp index 2b7dc41..c155c63 100644 --- a/src/backend/computational_graph/activation_functions/softmax_node.cpp +++ b/src/backend/computational_graph/activation_functions/softmax_node.cpp @@ -25,22 +25,22 @@ vector< shared_ptr > SoftmaxNode::backward(const Tensor& upstreamGrad) { const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); - ftype bSize = yPred->getDims()[0]; + const auto bSize = yPred->getDims()[0]; assert(bSize>0); - for(tensorDim_t i=0; igetDims()[0]; i++){ - for(tensorDim_t j=0; jgetDims()[1]; j++){ - ftype g = 0; - if(i!=j){ - g = -softmax->get(i) * softmax->get(j); + for(tensorDim_t b=0; bgetDims()[1]; i++){ + ftype grad = 0; + 
const ftype yi = softmax->get(b, i); + + for(tensorDim_t j=0; jgetDims()[1]; j++){ + ftype yj = softmax->get(b, j); + ftype jacobian = (i==j) ? yi*(1-yj) : -yi*yj; + grad += upstreamGrad.get(b, j) * jacobian; } - else{ - g = softmax->get(i) * (1-softmax->get(j)); - } - - res->set(upstreamGrad[i] * g / bSize, i, j); + res->set(grad, b, i); } } - + return {res}; } \ No newline at end of file diff --git a/src/backend/computational_graph/graph_node.h b/src/backend/computational_graph/graph_node.h index e18518b..c8d3cec 100644 --- a/src/backend/computational_graph/graph_node.h +++ b/src/backend/computational_graph/graph_node.h @@ -18,6 +18,11 @@ #include +// if GCC or Clang +#ifdef __GNUC__ +#include +#endif // __GNUC__ + namespace cgraph { class GraphNode { protected: @@ -38,5 +43,23 @@ namespace cgraph { const auto& getParents() const noexcept { return parents; } + + virtual void print(std::ostream& os) const noexcept { + os << "\n"; + #ifdef __GNUC__ + // demangle name on gcc and clang + int status; + char* demangled = abi::__cxa_demangle(typeid(*this).name(), nullptr, nullptr, &status); + os << (status == 0 ? 
demangled : typeid(*this).name()); + std::free(demangled); + #else + os << typeid(*this).name(); + #endif + }; + + friend std::ostream& operator<<(std::ostream& os, const GraphNode& n) noexcept { + n.print(os); // calling vtable + return os; + } }; } diff --git a/src/backend/computational_graph/loss_functions/bce_node.cpp b/src/backend/computational_graph/loss_functions/bce_node.cpp index 8dfc1d7..729eff7 100644 --- a/src/backend/computational_graph/loss_functions/bce_node.cpp +++ b/src/backend/computational_graph/loss_functions/bce_node.cpp @@ -20,7 +20,7 @@ using namespace cgraph; vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - constexpr ftype eps = 1e-9; + constexpr ftype eps = 1e-5; const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp index ecd0f10..03ef1c2 100644 --- a/src/backend/training/loss_functions/bce_loss.cpp +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -36,7 +36,7 @@ shared_ptr BceLoss::operator()(const shared_ptr y, const shared_ } auto bce = [](ftype y, ftype ypred){ - constexpr ftype eps = 1e-9; + constexpr ftype eps = 1e-5; return y*log(std::max(ypred, eps)) + (1-y)*log(std::max(1-ypred, eps)); }; diff --git a/src/backend/training/loss_functions/crossentropy_loss.cpp b/src/backend/training/loss_functions/crossentropy_loss.cpp index 10b6cb7..37c87dd 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.cpp +++ b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -36,7 +36,7 @@ shared_ptr CrossEntropyLoss::operator()(const shared_ptr y, cons auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; for(tensorDim_t i=0; igetDims()[-1]; i++){ - constexpr ftype eps = 1e-9; + constexpr ftype eps = 1e-5; res += y->get(b, i) * log(std::max(ypred->get(b, i), eps)); } return res; diff --git 
a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp index 3f3dc4b..dccce3a 100644 --- a/tests/backend/test_train_loop.cpp +++ b/tests/backend/test_train_loop.cpp @@ -36,7 +36,7 @@ static shared_ptr makeBinaryNet() { net->append(make_shared(2, 4, true, true)); - net->append(make_shared(1e-5)); + net->append(make_shared(0.01)); net->append(make_shared(4, 1, true, true)); @@ -49,7 +49,7 @@ static shared_ptr makeMulticlassNet() { net->append(make_shared(2, 8, true, true)); - net->append(make_shared(1e-5)); + net->append(make_shared(0.01)); net->append(make_shared(8, 3, true, true)); @@ -77,7 +77,7 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { net->parameters(), /*lr=*/0.05); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(1)); + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); @@ -85,44 +85,42 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { auto pred = (*net)(x); auto finalLoss = (*loss)(y, pred); - cout << "pred: " << *pred << "\nloss: " << *finalLoss << endl; - EXPECT_LT((*finalLoss)[0], 0.05f) << "Network failed to overfit binary dataset"; } -//TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { -// // 6 samples, 2 features, 3 classes -// auto x = TensorFunctions::makeSharedTensor( -// {6, 2}, {1.0, 0.0, -// 1.0, 0.1, -// 0.0, 1.0, -// 0.1, 1.0, -// 0.5, 0.5, -// 0.4, 0.6}, false); -// -// // one-hot encoded labels -// auto y = TensorFunctions::makeSharedTensor( -// {6, 3}, {1.0, 0.0, 0.0, -// 1.0, 0.0, 0.0, -// 0.0, 1.0, 0.0, -// 0.0, 1.0, 0.0, -// 0.0, 0.0, 1.0, -// 0.0, 0.0, 1.0}, false); -// -// auto net = makeMulticlassNet(); -// auto loss = make_shared(); -// auto optim = make_shared( -// net->parameters(), /*lr=*/0.0001, /*decay=*/0.95); -// -// auto trainLoop = train::BaseTrainLoop( -// net, loss, optim, /*epochs=*/2000, /*bsize=*/6); -// -// trainLoop.run(x, y, /*shuffle=*/false); -// -// auto pred = 
(*net)(x); -// auto finalLoss = (*loss)(y, pred); -// -// EXPECT_LT((*finalLoss)[0], 0.05f) -// << "Network failed to overfit multiclass dataset"; -//} \ No newline at end of file +TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { + // 6 samples, 2 features, 3 classes + auto x = TensorFunctions::makeSharedTensor( + {6, 2}, {1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6}, false); + + // one-hot encoded labels + auto y = TensorFunctions::makeSharedTensor( + {6, 3}, {1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0}, false); + + auto net = makeMulticlassNet(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.00001, /*decay=*/0.95); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/3000, /*bsize=*/6); + + trainLoop.run(x, y, /*shuffle=*/false); + + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit multiclass dataset"; +} \ No newline at end of file From 982f8f5e517dcace488c204ebd82b55a13ac911a Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Tue, 17 Mar 2026 12:25:26 +0100 Subject: [PATCH 20/24] Minor refactors --- .../activation_functions/softmax_node.cpp | 3 +-- .../computational_graph/loss_functions/bce_node.cpp | 5 +---- .../loss_functions/crossentropy_node.cpp | 5 ++--- src/backend/module/activation_functions/leaky_relu.h | 2 +- src/backend/module/activation_functions/softmax.cpp | 5 +++-- src/backend/training/loss_functions/bce_loss.cpp | 5 +---- src/backend/training/loss_functions/crossentropy_loss.cpp | 7 +++---- src/backend/utility/global_params.h | 7 ++++++- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/backend/computational_graph/activation_functions/softmax_node.cpp b/src/backend/computational_graph/activation_functions/softmax_node.cpp index c155c63..8603355 100644 --- 
a/src/backend/computational_graph/activation_functions/softmax_node.cpp +++ b/src/backend/computational_graph/activation_functions/softmax_node.cpp @@ -20,8 +20,7 @@ using namespace cgraph; vector< shared_ptr > SoftmaxNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - constexpr ftype eps = 1e-9; - + const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); diff --git a/src/backend/computational_graph/loss_functions/bce_node.cpp b/src/backend/computational_graph/loss_functions/bce_node.cpp index 729eff7..add016f 100644 --- a/src/backend/computational_graph/loss_functions/bce_node.cpp +++ b/src/backend/computational_graph/loss_functions/bce_node.cpp @@ -13,14 +13,11 @@ #include "data_modeling/tensor_functions.h" -#include - using namespace std; using namespace cgraph; vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - constexpr ftype eps = 1e-5; const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); @@ -30,7 +27,7 @@ vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { auto yi = (*yTrue)[i]; auto yiHat = (*yPred)[i]; - auto g = -yi/std::max(yiHat, eps) + (1-yi)/std::max(1-yiHat, eps); + auto g = -yi/std::max(yiHat, epsBce) + (1-yi)/std::max(1-yiHat, epsBce); res->set(g/bSize, i); } diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp index e9ea4bc..249de43 100644 --- a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp @@ -18,7 +18,6 @@ using namespace cgraph; vector< shared_ptr > CrossEntropyNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - constexpr ftype eps = 1e-9; const auto& yPred = parents[0]; auto res = make_shared(yPred->createEmptyCopy()); @@ -27,9 +26,9 @@ vector< 
shared_ptr > CrossEntropyNode::backward(const Tensor& upstreamGr for(tensorDim_t i=0; igetDims()[0]; i++){ for(tensorDim_t j=0; jgetDims()[1]; j++){ auto yij = yTrue->get(i, j); - auto yijHat = std::max(yPred->get(i, j), eps); + auto yijHat = yPred->get(i, j); - auto g = -yij/yijHat; + auto g = -yij/std::max(yijHat, epsCrossentropy); res->set(g/bSize, i, j); } } diff --git a/src/backend/module/activation_functions/leaky_relu.h b/src/backend/module/activation_functions/leaky_relu.h index 794e41c..94f0c3d 100644 --- a/src/backend/module/activation_functions/leaky_relu.h +++ b/src/backend/module/activation_functions/leaky_relu.h @@ -19,7 +19,7 @@ namespace module { const ftype eps; public: - LeakyReLu(ftype eps) : eps{eps} + LeakyReLu(ftype eps=0.01) : eps{eps} { } Tensor operator()(const Tensor& t) const override; diff --git a/src/backend/module/activation_functions/softmax.cpp b/src/backend/module/activation_functions/softmax.cpp index 7a54965..8a45318 100644 --- a/src/backend/module/activation_functions/softmax.cpp +++ b/src/backend/module/activation_functions/softmax.cpp @@ -34,7 +34,8 @@ Tensor Softmax::operator()(const Tensor& t) const { // pre-compute exponents Tensor tmp(t.getDims(), t.getDevice(), false); for(tensorDim_t i=0; i::infinity(); for(tensorDim_t j=0; j(tmp[i]); + sum += tmp[i]; } for(tensorSize_t i=start; i -#include - using namespace std; using namespace train; @@ -36,8 +34,7 @@ shared_ptr BceLoss::operator()(const shared_ptr y, const shared_ } auto bce = [](ftype y, ftype ypred){ - constexpr ftype eps = 1e-5; - return y*log(std::max(ypred, eps)) + (1-y)*log(std::max(1-ypred, eps)); + return y*log(std::max(ypred, epsBce)) + (1-y)*log(std::max(1-ypred, epsBce)); }; const auto nBatches = y->getDims()[0]; diff --git a/src/backend/training/loss_functions/crossentropy_loss.cpp b/src/backend/training/loss_functions/crossentropy_loss.cpp index 37c87dd..d1a5291 100644 --- a/src/backend/training/loss_functions/crossentropy_loss.cpp +++ 
b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -1,9 +1,9 @@ /** - * @file bce_loss.cpp + * @file crossentropy_loss.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-03-07 + * @date 2026-03-17 * * @copyright Copyright (c) 2026 * @@ -36,8 +36,7 @@ shared_ptr CrossEntropyLoss::operator()(const shared_ptr y, cons auto ce = [&y, &ypred](const tensorDim_t b){ ftype res = 0; for(tensorDim_t i=0; igetDims()[-1]; i++){ - constexpr ftype eps = 1e-5; - res += y->get(b, i) * log(std::max(ypred->get(b, i), eps)); + res += y->get(b, i) * log(std::max(ypred->get(b, i), epsCrossentropy)); } return res; }; diff --git a/src/backend/utility/global_params.h b/src/backend/utility/global_params.h index 9af9694..3d6edcb 100644 --- a/src/backend/utility/global_params.h +++ b/src/backend/utility/global_params.h @@ -31,4 +31,9 @@ using tensorSize_t = std::uint32_t; // we assert this here so during conversions of tensorDim_t to // tensorSize_t we do not need to cast explicitely -static_assert(sizeof(tensorDim_t)<=sizeof(tensorSize_t)); \ No newline at end of file +static_assert(sizeof(tensorDim_t)<=sizeof(tensorSize_t)); + +// ----------------- Numerical stability ------------------- + +constexpr ftype epsCrossentropy = 1e-5; +constexpr ftype epsBce = 1e-5; \ No newline at end of file From 28cc849beb42c57e1b3d44d4fd6eb1a208c519ac Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Tue, 17 Mar 2026 12:47:50 +0100 Subject: [PATCH 21/24] Added numerically stable losses --- .../loss_functions/bce_sigmoid_node.cpp | 46 ++++++++++ .../loss_functions/bce_sigmoid_node.h | 33 +++++++ .../crossentropy_softmax_node.cpp | 37 ++++++++ .../crossentropy_softmax_node.h | 33 +++++++ .../module/activation_functions/softmax.cpp | 2 +- .../loss_functions/bce_sigmoid_loss.cpp | 52 +++++++++++ .../loss_functions/bce_sigmoid_loss.h | 22 +++++ .../crossentropy_softmax_loss.cpp | 92 +++++++++++++++++++ .../crossentropy_softmax_loss.h | 22 +++++ 9 
files changed, 338 insertions(+), 1 deletion(-) create mode 100644 src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp create mode 100644 src/backend/computational_graph/loss_functions/bce_sigmoid_node.h create mode 100644 src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp create mode 100644 src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h create mode 100644 src/backend/training/loss_functions/bce_sigmoid_loss.cpp create mode 100644 src/backend/training/loss_functions/bce_sigmoid_loss.h create mode 100644 src/backend/training/loss_functions/crossentropy_softmax_loss.cpp create mode 100644 src/backend/training/loss_functions/crossentropy_softmax_loss.h diff --git a/src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp new file mode 100644 index 0000000..998e110 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp @@ -0,0 +1,46 @@ +/** + * @file bce_sigmoid_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "bce_sigmoid_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > BceSigmoidNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + auto sigmoid = [](ftype x){ + constexpr ftype one = 1.0; + if(x>=0){ + return one / (one + exp(-x)); + } + auto e = exp(x); + return e / (one + e); + }; + + const auto& logits = parents[0]; + auto res = make_shared(logits->createEmptyCopy()); + + ftype bSize = logits->getDims()[0]; + for(tensorSize_t i=0; igetDims()[0]; i++){ + auto y = (*yTrue)[i]; + auto s = sigmoid((*logits)[i]); + + auto g = s - y; + res->set(g/bSize, i); + } + + return {res}; +} \ No newline at end of file diff --git 
a/src/backend/computational_graph/loss_functions/bce_sigmoid_node.h b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.h new file mode 100644 index 0000000..f90059c --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.h @@ -0,0 +1,33 @@ +/** + * @file bce_sigmoid_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +namespace cgraph { + class BceSigmoidNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + public: + explicit BceSigmoidNode(std::shared_ptr y, std::shared_ptr logits) + : GraphNode({std::move(logits)}), yTrue{std::move(y)} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp new file mode 100644 index 0000000..8a4aae4 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp @@ -0,0 +1,37 @@ +/** + * @file crossentropy_softmax_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_softmax_node.h" + +#include "module/activation_functions/softmax.h" + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > CrossEntropySoftmaxNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& logits = parents[0]; + auto res = make_shared(logits->createEmptyCopy()); + + const auto softmax = module::Softmax(); + const auto 
s = softmax(*logits); + + ftype bSize = logits->getDims()[0]; + for(tensorSize_t b=0; bgetDims()[0]; b++){ + for(tensorSize_t i=0; igetDims()[1]; i++){ + auto g = s.get(b, i) - yTrue->get(b, i); + res->set(g, b, i); + } + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h new file mode 100644 index 0000000..17f0d15 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h @@ -0,0 +1,33 @@ +/** + * @file crossentropy_softmax_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +namespace cgraph { + class CrossEntropySoftmaxNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + public: + explicit CrossEntropySoftmaxNode(std::shared_ptr y, std::shared_ptr logits) + : GraphNode({std::move(logits)}), yTrue{std::move(y)} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/module/activation_functions/softmax.cpp b/src/backend/module/activation_functions/softmax.cpp index 8a45318..a001a69 100644 --- a/src/backend/module/activation_functions/softmax.cpp +++ b/src/backend/module/activation_functions/softmax.cpp @@ -11,7 +11,7 @@ #include "softmax.h" -#include "computational_graph/activation_functions/softmax_node.cpp" +#include "computational_graph/activation_functions/softmax_node.h" #include diff --git a/src/backend/training/loss_functions/bce_sigmoid_loss.cpp b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp new file mode 100644 index 
0000000..798af96 --- /dev/null +++ b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp @@ -0,0 +1,52 @@ +/** + * @file bce_logits_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + + #include "bce_sigmoid_loss.h" + + #include "computational_graph/loss_functions/bce_sigmoid_node.h" + + #include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batchsize) or (batchsize, 1) + * @return Tensor of shape (1) + */ +shared_ptr BceSigmoidLoss::operator()(const shared_ptr y, const shared_ptr logits) const { + if(!logits->getRequiresGrad()) { + __throw_invalid_argument("logits must have gradient enabled"); + } + else if(y->getDevice() != logits->getDevice()){ + __throw_invalid_argument("y and logits must be on same device"); + } + else if(y->getDims()!=logits->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto bceSimplified = [](ftype y, ftype logit){ + return std::max(y, (ftype)0) - logit*y + log(1+exp(logit < 0 ? 
logit : -logit)); + }; + + const auto nBatches = y->getDims()[0]; + + ftype loss = 0; + for(tensorSize_t i=0; i(std::vector{1}, std::vector{-loss / nBatches}, y->getDevice(), true); + res->setCgNode(make_shared(y, logits)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_sigmoid_loss.h b/src/backend/training/loss_functions/bce_sigmoid_loss.h new file mode 100644 index 0000000..7aae6bc --- /dev/null +++ b/src/backend/training/loss_functions/bce_sigmoid_loss.h @@ -0,0 +1,22 @@ +/** + * @file bce_sigmoid_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class BceSigmoidLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr logits) const override; + }; +} diff --git a/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp b/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp new file mode 100644 index 0000000..a472d3a --- /dev/null +++ b/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp @@ -0,0 +1,92 @@ +/** + * @file crossentropy_softmax_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_softmax_loss.h" + +#include "computational_graph/loss_functions/crossentropy_softmax_node.h" + +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batch_size, n_classes) + * @return Tensor of shape (1) + */ +shared_ptr CrossEntropySoftmaxLoss::operator()(const shared_ptr y, const shared_ptr logits) const { + if(!logits->getRequiresGrad()) { + __throw_invalid_argument("logits must have gradient enabled"); + } + else if(y->getDevice() != 
logits->getDevice()){ + __throw_invalid_argument("y and logits must be on same device"); + } + else if(y->getDims()!=logits->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + //////////////////////////////////////////////// + + const auto nRows = logits->getDims()[-2]; + const auto nCols = logits->getDims()[-1]; + + // pre-compute exponents and max-values + vector maxValues(nRows); + Tensor tmp(logits->getDims(), logits->getDevice(), false); + for(tensorDim_t i=0; i::infinity(); + for(tensorDim_t j=0; jget(i, j)); + } + + maxValues[i] = maxV; + + for(tensorDim_t j=0; jget(i, j)-maxV; + tmp.set(exp(e), i, j); + } + } + + const tensorSize_t stride = logits->getDims()[-1]; + ftype loss = 0; + + /** + * CE = -sum_i(y_i * z_i) + log(sum_j(exp(z_j))) with + * log(sum_j(exp(z_j))) = max(z) + log(sum_j(exp(z_j - max(z)))). + * for numerical stability + */ + auto compute = [&loss, &y, &logits, &tmp, stride](tensorSize_t start){ + ftype lsum = 0; + for(tensorSize_t i=start; i0){ // y either zero or one + loss += (*logits)[i] - lsum; + } + } + }; + + tensorSize_t offset=0; + while(offsetgetSize()) { + compute(offset); + offset += stride; + } + + auto res = make_shared(std::vector{1}, std::vector{-loss / logits->getDims()[0]}, y->getDevice(), true); + res->setCgNode(std::make_shared(y, logits)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/crossentropy_softmax_loss.h b/src/backend/training/loss_functions/crossentropy_softmax_loss.h new file mode 100644 index 0000000..6feb16a --- /dev/null +++ b/src/backend/training/loss_functions/crossentropy_softmax_loss.h @@ -0,0 +1,22 @@ +/** + * @file crossentropy_softmax_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class CrossEntropySoftmaxLoss final : 
public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr logits) const override; + }; +} From e6d9ba6d9d1cfbbbe0a91ba539d5ff55bb0815b7 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sat, 21 Mar 2026 12:28:56 +0100 Subject: [PATCH 22/24] Added modules, fixed bugs --- .../crossentropy_softmax_node.cpp | 2 +- src/backend/data_modeling/tensor.cpp | 6 - .../loss_functions/bce_sigmoid_loss.cpp | 5 +- .../crossentropy_softmax_loss.cpp | 7 +- .../training/trainers/base_train_loop.cpp | 3 - tests/backend/test_train_loop.cpp | 110 +++++++++++++++++- 6 files changed, 114 insertions(+), 19 deletions(-) diff --git a/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp index 8a4aae4..ca0d1c7 100644 --- a/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp +++ b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp @@ -29,7 +29,7 @@ vector< shared_ptr > CrossEntropySoftmaxNode::backward(const Tensor& ups for(tensorSize_t b=0; bgetDims()[0]; b++){ for(tensorSize_t i=0; igetDims()[1]; i++){ auto g = s.get(b, i) - yTrue->get(b, i); - res->set(g, b, i); + res->set(g / bSize, b, i); } } diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 685368b..c4dce79 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -529,12 +529,6 @@ void Tensor::backward() { auto& tensor = *tPtr; assert(tensor.grads && !tensor.grads->requiresGrad); // gradient should not require grad - static int count = 0; - if(tPtr->requiresGrad && count % 50 == 0){ - cout << "\nbackward of " << tPtr << endl; // TODO: remove - cout << "\ngrads " << *tensor.grads << endl; // TODO: remove - } - auto incomingGrads = tensor.cgNode->backward(*tensor.grads); const auto& parents = tensor.cgNode->getParents(); diff --git 
a/src/backend/training/loss_functions/bce_sigmoid_loss.cpp b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp index 798af96..8eb81bd 100644 --- a/src/backend/training/loss_functions/bce_sigmoid_loss.cpp +++ b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp @@ -34,7 +34,8 @@ shared_ptr BceSigmoidLoss::operator()(const shared_ptr y, const } auto bceSimplified = [](ftype y, ftype logit){ - return std::max(y, (ftype)0) - logit*y + log(1+exp(logit < 0 ? logit : -logit)); + constexpr ftype zero = 0; + return std::max(y, zero) - logit*y + log(1+exp(logit < 0 ? logit : -logit)); }; const auto nBatches = y->getDims()[0]; @@ -44,7 +45,7 @@ shared_ptr BceSigmoidLoss::operator()(const shared_ptr y, const loss += bceSimplified((*y)[i], (*logits)[i]); } - auto res = make_shared(std::vector{1}, std::vector{-loss / nBatches}, y->getDevice(), true); + auto res = make_shared(std::vector{1}, std::vector{loss / nBatches}, y->getDevice(), true); res->setCgNode(make_shared(y, logits)); assert(res->getRequiresGrad()); diff --git a/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp b/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp index a472d3a..a2b7866 100644 --- a/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp +++ b/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp @@ -64,16 +64,17 @@ shared_ptr CrossEntropySoftmaxLoss::operator()(const shared_ptr * log(sum_j(exp(z_j))) = max(z) + log(sum_j(exp(z_j - max(z)))). 
* for numerical stability */ - auto compute = [&loss, &y, &logits, &tmp, stride](tensorSize_t start){ + auto compute = [&loss, &y, &logits, &tmp, &maxValues, stride](tensorSize_t start){ ftype lsum = 0; for(tensorSize_t i=start; i0){ // y either zero or one - loss += (*logits)[i] - lsum; + loss += -(*logits)[i] + maxValues[j] + lsum; } } }; @@ -84,7 +85,7 @@ shared_ptr CrossEntropySoftmaxLoss::operator()(const shared_ptr offset += stride; } - auto res = make_shared(std::vector{1}, std::vector{-loss / logits->getDims()[0]}, y->getDevice(), true); + auto res = make_shared(std::vector{1}, std::vector{loss / logits->getDims()[0]}, y->getDevice(), true); res->setCgNode(std::make_shared(y, logits)); assert(res->getRequiresGrad()); diff --git a/src/backend/training/trainers/base_train_loop.cpp b/src/backend/training/trainers/base_train_loop.cpp index b5961e5..ff95688 100644 --- a/src/backend/training/trainers/base_train_loop.cpp +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -51,10 +51,7 @@ void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool auto yBatch = make_shared(y->getSlice(batchSpan)); auto yPred = (*graph)(xBatch); - cout << "\nypred: " << *yPred << endl; - auto l = (*loss)(yBatch, yPred); - cout << "\nloss: " << (*l)[0] << endl; l->backward(); optim->step(); diff --git a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp index dccce3a..fc55d4a 100644 --- a/tests/backend/test_train_loop.cpp +++ b/tests/backend/test_train_loop.cpp @@ -24,6 +24,8 @@ #include "training/loss_functions/bce_loss.h" #include "training/loss_functions/crossentropy_loss.h" +#include "training/loss_functions/bce_sigmoid_loss.h" +#include "training/loss_functions/crossentropy_softmax_loss.h" #include "training/trainers/base_train_loop.h" @@ -44,6 +46,18 @@ static shared_ptr makeBinaryNet() { return net; } +static shared_ptr makeBinaryNet2() { + auto net = make_shared(); + + net->append(make_shared(2, 4, true, true)); + + 
net->append(make_shared(0.01)); + + net->append(make_shared(4, 1, true, true)); + + return net; +} + static shared_ptr makeMulticlassNet() { auto net = make_shared(); @@ -57,6 +71,18 @@ static shared_ptr makeMulticlassNet() { return net; } +static shared_ptr makeMulticlassNet2() { + auto net = make_shared(); + + net->append(make_shared(2, 8, true, true)); + + net->append(make_shared(0.01)); + + net->append(make_shared(8, 3, true, true)); + + return net; +} + TEST(OverfitTest, BceSgdOverfitsSmallDataset) { // XOR-like: 4 samples, 2 features, binary labels auto x = TensorFunctions::makeSharedTensor( @@ -74,10 +100,42 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { auto net = makeBinaryNet(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.05); + net->parameters(), /*lr=*/0.01); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/10000, /*bsize=*/static_cast(4)); + + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); + + // forward one more time to get final loss + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit binary dataset"; +} + +TEST(OverfitTest, BceSgdOverfitsSmallDataset_OptimizedLoss) { + // XOR-like: 4 samples, 2 features, binary labels + auto x = TensorFunctions::makeSharedTensor( + {4, 2}, {0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0}, false); + + auto y = TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, + 1.0, + 1.0, + 0.0}, false); + + auto net = makeBinaryNet2(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.02); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); + net, loss, optim, /*epochs=*/10000, /*bsize=*/static_cast(4)); trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); @@ -85,6 +143,9 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { auto pred = (*net)(x); auto finalLoss = (*loss)(y, pred); + 
auto sigmoid = module::Sigmoid(); + cout << "Final prediction: " << sigmoid(*pred) << "\nFinal loss: " << *finalLoss << endl; + EXPECT_LT((*finalLoss)[0], 0.05f) << "Network failed to overfit binary dataset"; } @@ -111,15 +172,56 @@ TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { auto net = makeMulticlassNet(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.00001, /*decay=*/0.95); + net->parameters(), /*lr=*/0.00002, /*decay=*/0.95); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/3000, /*bsize=*/6); + net, loss, optim, /*epochs=*/10000, /*bsize=*/6); trainLoop.run(x, y, /*shuffle=*/false); auto pred = (*net)(x); auto finalLoss = (*loss)(y, pred); + cout << "Final prediction: " << *pred << "\nFinal loss: " << *finalLoss << endl; + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit multiclass dataset"; +} + +TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset_OptimizedLoss) { + // 6 samples, 2 features, 3 classes + auto x = TensorFunctions::makeSharedTensor( + {6, 2}, {1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6}, false); + + // one-hot encoded labels + auto y = TensorFunctions::makeSharedTensor( + {6, 3}, {1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0}, false); + + auto net = makeMulticlassNet2(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.00002, /*decay=*/0.95); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/10000, /*bsize=*/6); + + trainLoop.run(x, y, /*shuffle=*/false); + + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + auto softmax = module::Softmax(); + pred = softmax(pred); + cout << "Final prediction: " << softmax(*pred) << "\nFinal loss: " << *finalLoss << endl; EXPECT_LT((*finalLoss)[0], 0.05f) << "Network failed to overfit multiclass dataset"; From 33b22f8974d704008fb62cf7952c2a514ead41da Mon Sep 17 
00:00:00 2001 From: Robert Baumgartner Date: Sat, 21 Mar 2026 14:22:03 +0100 Subject: [PATCH 23/24] Added random seed and Xavier initializer; cleanup of initializer interface --- python_lib/dl_lib/sys/__init__.py | 2 +- src/backend/data_modeling/tensor.cpp | 3 +- src/backend/data_modeling/tensor.h | 2 +- .../data_modeling/tensor_functions.cpp | 16 +-- src/backend/data_modeling/tensor_functions.h | 6 +- src/backend/module/layers/ff_layer.cpp | 16 ++- src/backend/module/layers/ff_layer.h | 7 +- src/backend/system/sys_functions.cpp | 10 +- src/backend/system/sys_functions.h | 6 +- src/backend/utility/initializers.cpp | 39 +++--- src/backend/utility/initializers.h | 112 +++++++++++++++--- src/python/py_core/py_core.cpp | 16 +-- src/python/py_core/py_core_util.h | 19 +-- src/python/py_sys/py_sys.cpp | 5 +- tests/backend/test_losses.cpp | 2 - tests/backend/test_train_loop.cpp | 46 +++---- 16 files changed, 195 insertions(+), 112 deletions(-) diff --git a/python_lib/dl_lib/sys/__init__.py b/python_lib/dl_lib/sys/__init__.py index 0401df8..b21f145 100644 --- a/python_lib/dl_lib/sys/__init__.py +++ b/python_lib/dl_lib/sys/__init__.py @@ -1 +1 @@ -from dl_lib._compiled._sys import getGlobalDevice, setGlobalDevice \ No newline at end of file +from dl_lib._compiled._sys import getDevice, setDevice, setSeed \ No newline at end of file diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index c4dce79..34ec548 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -767,8 +767,7 @@ void Tensor::reset(const ftype x) noexcept { /** * @brief Populates the tensor with values drawn according to initializer. 
*/ -void Tensor::reset(const utility::InitClass ic, const ftype mean, const ftype stddev) { - const auto init = utility::InitializerFactory::getInitializer(ic, mean, stddev); +void Tensor::reset(const shared_ptr init) noexcept { for(tensorSize_t i=0; igetSize(); i++){ (*values)[i] = init->drawNumber(); } diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 00df2ed..7cf178c 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -177,7 +177,7 @@ class Tensor final : public std::enable_shared_from_this { Tensor& operator=(Tensor&& other) noexcept; void reset(const ftype x) noexcept; - void reset(const utility::InitClass ic, ftype mean, ftype stddev); + void reset(const std::shared_ptr init) noexcept; const Dimension& getDims() const noexcept; tensorSize_t getSize() const noexcept; diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp index ccab416..5f06d2d 100644 --- a/src/backend/data_modeling/tensor_functions.cpp +++ b/src/backend/data_modeling/tensor_functions.cpp @@ -33,16 +33,16 @@ Tensor TensorFunctions::Ones(vector dims, const bool requiresGrad) return Ones(std::move(dims), Tensor::getDefaultDevice(), requiresGrad); } -Tensor TensorFunctions::Gaussian(vector dims, const ftype mean, const ftype stddev, - const Device d, const bool requiresGrad) { +Tensor TensorFunctions::Gaussian(vector dims, const Device d, + const ftype stddev, const bool requiresGrad) { auto res = Tensor(std::move(dims), d, requiresGrad); - res.reset(utility::InitClass::Gaussian, mean, stddev); + res.reset(std::make_shared(stddev)); return res; } -Tensor TensorFunctions::Gaussian(vector dims, const ftype mean, - const ftype stddev, const bool requiresGrad) { - return Gaussian(std::move(dims), mean, stddev, Tensor::getDefaultDevice(), requiresGrad); +Tensor TensorFunctions::Gaussian(vector dims, const ftype stddev, + const bool requiresGrad) { + return 
Gaussian(std::move(dims), Tensor::getDefaultDevice(), stddev, requiresGrad); } // Tensor manipulation @@ -54,8 +54,8 @@ void TensorFunctions::ToOnes(Tensor& t) noexcept { t.reset(1); } -void TensorFunctions::ToGaussian(Tensor& t, const ftype mean, const ftype stddev) { - t.reset(utility::InitClass::Gaussian, mean, stddev); +void TensorFunctions::ToGaussian(Tensor& t, const ftype stddev) { + t.reset(std::make_shared(stddev)); } shared_ptr TensorFunctions::makeSharedTensor(const vector& dims, bool requiresGrad){ diff --git a/src/backend/data_modeling/tensor_functions.h b/src/backend/data_modeling/tensor_functions.h index b05f8da..2d93811 100644 --- a/src/backend/data_modeling/tensor_functions.h +++ b/src/backend/data_modeling/tensor_functions.h @@ -32,8 +32,8 @@ namespace TensorFunctions { // class name acts as namespace for us Tensor Ones(std::vector dims, Device d, const bool requiresGrad=false); Tensor Ones(std::vector dims, const bool requiresGrad=false); - Tensor Gaussian(std::vector dims, ftype mean, ftype stddev, Device d, const bool requiresGrad=false); - Tensor Gaussian(std::vector dims, ftype mean=0, ftype stddev=1, const bool requiresGrad=false); + Tensor Gaussian(std::vector dims, Device d, ftype stddev, const bool requiresGrad=false); + Tensor Gaussian(std::vector dims, ftype stddev=1, const bool requiresGrad=false); std::shared_ptr makeSharedTensor(const std::vector& dims, bool requiresGrad=false); @@ -50,7 +50,7 @@ namespace TensorFunctions { // class name acts as namespace for us // Tensor manipulation void ToZeros(Tensor& t) noexcept; void ToOnes(Tensor& t) noexcept; - void ToGaussian(Tensor& t, ftype mean, ftype stddev); + void ToGaussian(Tensor& t, ftype stddev); // Arithmetics Tensor SumOverDims(const Tensor& t, tensorDim_t dim=0); // default 0 for batch-size diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp index e0500a7..82fed69 100644 --- a/src/backend/module/layers/ff_layer.cpp +++ 
b/src/backend/module/layers/ff_layer.cpp @@ -19,9 +19,10 @@ using namespace std; using namespace module; +using namespace utility; -FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias, bool requiresGrad) - : FfLayer(inSize, outSize, Tensor::getDefaultDevice(), useBias, requiresGrad) {} +FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias, bool requiresGrad, shared_ptr init) + : FfLayer(inSize, outSize, Tensor::getDefaultDevice(), useBias, requiresGrad, init) {} /** * @brief Construct a new Ff Layer:: Ff Layer object @@ -31,17 +32,20 @@ FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias, bool req * @param useBias Use a bias if true. Bias will receiver shape (n_rows) * @param requiresGrad If true train this layer. */ -FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, bool useBias, bool requiresGrad) +FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, + bool useBias, bool requiresGrad, shared_ptr init) : useBias{useBias}, requiresGrad{requiresGrad} { + if(!init){ + init = make_shared(inSize, outSize); + } + weights = make_shared(Dimension({inSize, outSize}), d, requiresGrad); - TensorFunctions::ToGaussian(*weights, 0, 0.2); - weights = weights; + weights->reset(init); if(useBias){ bias = make_shared(vector{outSize}, d, requiresGrad); TensorFunctions::ToZeros(*bias); - bias = bias; } } diff --git a/src/backend/module/layers/ff_layer.h b/src/backend/module/layers/ff_layer.h index 59ba0f5..8c58dc2 100644 --- a/src/backend/module/layers/ff_layer.h +++ b/src/backend/module/layers/ff_layer.h @@ -25,8 +25,11 @@ namespace module { std::shared_ptr bias = nullptr; public: - FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias=true, bool requiresGrad=false); - FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, bool useBias=true, bool requiresGrad=false); + FfLayer(tensorDim_t inSize, tensorDim_t outSize, + bool useBias=true, bool requiresGrad=false, std::shared_ptr 
init=nullptr); + + FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, + bool useBias=true, bool requiresGrad=false, std::shared_ptr init=nullptr); Tensor operator()(const Tensor& input) const override; std::shared_ptr operator()(const std::shared_ptr& input) const override; diff --git a/src/backend/system/sys_functions.cpp b/src/backend/system/sys_functions.cpp index 4e5e56b..d653663 100644 --- a/src/backend/system/sys_functions.cpp +++ b/src/backend/system/sys_functions.cpp @@ -12,12 +12,18 @@ #include "sys_functions.h" #include "data_modeling/tensor.h" +#include "utility/initializers.h" + using namespace sys; -void setDevice(Device d) noexcept { +void sys::setDevice(Device d) noexcept { Tensor::setDefaultDevice(d); } -Device getDevice() noexcept { +Device sys::getDevice() noexcept { return Tensor::getDefaultDevice(); +} + +void sys::setRandomSeed(const unsigned int s) noexcept { + utility::InitializerBase::setSeed(s); } \ No newline at end of file diff --git a/src/backend/system/sys_functions.h b/src/backend/system/sys_functions.h index 5919029..b3a0f63 100644 --- a/src/backend/system/sys_functions.h +++ b/src/backend/system/sys_functions.h @@ -15,6 +15,8 @@ #include "data_modeling/device.h" namespace sys { - void setGlobalDevice(Device d) noexcept; - Device getGlobalDevice() noexcept; + void setDevice(Device d) noexcept; + Device getDevice() noexcept; + + void setRandomSeed(unsigned int s) noexcept; } \ No newline at end of file diff --git a/src/backend/utility/initializers.cpp b/src/backend/utility/initializers.cpp index ad8dbfa..0fad81c 100644 --- a/src/backend/utility/initializers.cpp +++ b/src/backend/utility/initializers.cpp @@ -11,38 +11,27 @@ #include "initializers.h" -#include -#include +#include using namespace std; using namespace utility; -namespace { - class GaussianInitializer final : public InitializerBase { - private: - std::random_device rd{}; - mutable std::mt19937 gen; - mutable std::normal_distribution dist; +ftype 
GaussianInitializer::drawNumber() const { + return dist(gen); +} - public: - GaussianInitializer(ftype mean, ftype stddev); - ftype drawNumber() const override; - }; +ftype UniformXavierInitializer::computeRange(ftype nInputs, ftype nOutputs) { + return sqrt(6/nInputs + nOutputs); +} - GaussianInitializer::GaussianInitializer(ftype mean, ftype stddev) - : InitializerBase(), gen{rd()}, dist{mean, stddev} {} +ftype UniformXavierInitializer::drawNumber() const { + return dist(gen); +} - ftype GaussianInitializer::drawNumber() const { - return dist(gen); - } +ftype NormalXavierInitializer::computeSigma(ftype nInputs, ftype nOutputs) { + return sqrt(6/nInputs + nOutputs); } -unique_ptr InitializerFactory::getInitializer(InitClass ic, ftype mean, ftype stddev) { - switch(ic){ - case InitClass::Gaussian: - return make_unique(mean, stddev); - default: - __throw_invalid_argument("Init class not implemented yet"); - } - return nullptr; // never reached, suppress warning +ftype NormalXavierInitializer::drawNumber() const { + return dist(gen); } \ No newline at end of file diff --git a/src/backend/utility/initializers.h b/src/backend/utility/initializers.h index 05991e3..85eb9c5 100644 --- a/src/backend/utility/initializers.h +++ b/src/backend/utility/initializers.h @@ -13,24 +13,98 @@ #include "global_params.h" +#include +#include + #include -#include - -namespace utility{ - enum class InitClass { - Gaussian - }; - - class InitializerBase { - public: - InitializerBase() = default; - virtual ~InitializerBase() = default; - virtual ftype drawNumber() const = 0; - }; - - class InitializerFactory final { - public: - InitializerFactory() = delete; - static std::unique_ptr getInitializer(InitClass ic, ftype mean=0, ftype stddev=0.5); - }; +#include + +namespace utility { + class InitializerBase { + protected: + static inline std::optional randomSeed_opt = std::nullopt; + + public: + InitializerBase() = default; + + virtual ~InitializerBase() = default; + virtual ftype drawNumber() 
const = 0; + + static void setSeed(unsigned int s) noexcept { randomSeed_opt = s; } + }; + + class GaussianInitializer final : public InitializerBase { + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::normal_distribution dist; + + public: + GaussianInitializer(ftype stddev) : gen{rd()}, dist{0, stddev} + { + if(randomSeed_opt){ + gen = std::mt19937{randomSeed_opt.value()}; + } + } + + GaussianInitializer(ftype stddev, unsigned int seed) + : dist{0, stddev} + { + gen = std::mt19937{seed}; + } + + ftype drawNumber() const override; + }; + + class UniformXavierInitializer final : public InitializerBase { + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::uniform_real_distribution dist; + + ftype computeRange(ftype nInputs, ftype nOutputs); + + public: + UniformXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs) + : gen{rd()}, dist{-computeRange(nInputs, nOutputs), computeRange(nInputs, nOutputs)} + { + if(randomSeed_opt){ + gen = std::mt19937{randomSeed_opt.value()}; + } + } + + UniformXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs, unsigned int seed) + : dist{-computeRange(nInputs, nOutputs), computeRange(nInputs, nOutputs)} + { + gen = std::mt19937{seed}; + } + + ftype drawNumber() const override; + }; + + class NormalXavierInitializer final : public InitializerBase { + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::normal_distribution dist; + + ftype computeSigma(ftype nInputs, ftype nOutputs); + + public: + NormalXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs) + : gen{rd()}, dist{0, computeSigma(nInputs, nOutputs)} + { + if(randomSeed_opt){ + gen = std::mt19937{randomSeed_opt.value()}; + } + } + + NormalXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs, unsigned int seed) + : dist{0, computeSigma(nInputs, nOutputs)} + { + gen = std::mt19937{seed}; + } + + ftype drawNumber() const override; + }; } diff --git 
a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp index 8dfad1f..891b152 100644 --- a/src/python/py_core/py_core.cpp +++ b/src/python/py_core/py_core.cpp @@ -163,10 +163,10 @@ BOOST_PYTHON_MODULE(_core) .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)) .staticmethod("zeros") - .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian0, std::vector, ftype, ftype)) - .def("gauss", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian1, std::vector, ftype, ftype, Device)) - .def("gauss", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian2, std::vector, ftype, ftype, const bool)) - .def("gauss", WRAP_FREE_FUNC_9(Py_DataModeling::Gaussian3, std::vector, ftype, ftype, Device, const bool)) + .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian0, std::vector, ftype)) + .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian1, std::vector, Device, ftype)) + .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian2, std::vector, ftype, const bool)) + .def("gauss", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian3, std::vector, Device, ftype, const bool)) .staticmethod("gauss") // properties @@ -227,8 +227,8 @@ BOOST_PYTHON_MODULE(_core) def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)); def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)); - def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian0, std::vector, ftype, ftype)); - def("Gaussian", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian1, std::vector, ftype, ftype, Device)); - def("Gaussian", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian2, std::vector, ftype, ftype, const bool)); - def("Gaussian", WRAP_FREE_FUNC_9(Py_DataModeling::Gaussian3, std::vector, ftype, ftype, Device, const bool)); + def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian0, std::vector, ftype)); + def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian1, std::vector, Device, ftype)); + def("Gaussian", 
WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian2, std::vector, ftype, const bool)); + def("Gaussian", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian3, std::vector, Device, ftype, const bool)); } \ No newline at end of file diff --git a/src/python/py_core/py_core_util.h b/src/python/py_core/py_core_util.h index 6951184..7aa01d0 100644 --- a/src/python/py_core/py_core_util.h +++ b/src/python/py_core/py_core_util.h @@ -12,6 +12,7 @@ #pragma once #include "data_modeling/dim_type.h" +#include "utility/initializers.h" #include "data_modeling/tensor.h" #include "data_modeling/tensor_functions.h" @@ -58,12 +59,12 @@ namespace Py_DataModeling { return TensorFunctions::Zeros(std::move(dims), d); } - inline auto GaussianWrapper0(std::vector dims, ftype mean, ftype stddev) { - return TensorFunctions::Gaussian(std::move(dims), mean, stddev); + inline auto GaussianWrapper0(std::vector dims, ftype stddev) { + return TensorFunctions::Gaussian(std::move(dims), stddev); } - inline auto GaussianWrapper1(std::vector dims, ftype mean, ftype stddev, Device d) { - return TensorFunctions::Gaussian(std::move(dims), mean, stddev, d); + inline auto GaussianWrapper1(std::vector dims, Device d, ftype stddev) { + return TensorFunctions::Gaussian(std::move(dims), d, stddev); } inline Tensor (*Ones0)(std::vector) = &OnesWrapper0; @@ -76,13 +77,13 @@ namespace Py_DataModeling { inline Tensor (*Zeros2)(std::vector, const bool) = &(TensorFunctions::Zeros); inline Tensor (*Zeros3)(std::vector, Device, const bool) = &(TensorFunctions::Zeros); - inline Tensor (*Gaussian0)(std::vector, ftype, ftype) = &GaussianWrapper0; - inline Tensor (*Gaussian1)(std::vector, ftype, ftype, Device) = &GaussianWrapper1; - inline Tensor (*Gaussian2)(std::vector, ftype, ftype, const bool) = &(TensorFunctions::Gaussian); - inline Tensor (*Gaussian3)(std::vector, ftype, ftype, Device, const bool) = &(TensorFunctions::Gaussian); + inline Tensor (*Gaussian0)(std::vector, ftype) = &GaussianWrapper0; + inline Tensor 
(*Gaussian1)(std::vector, Device, ftype) = &GaussianWrapper1; + inline Tensor (*Gaussian2)(std::vector, ftype, const bool) = &(TensorFunctions::Gaussian); + inline Tensor (*Gaussian3)(std::vector, Device, ftype, const bool) = &(TensorFunctions::Gaussian); inline void (Tensor::*reset1)(const ftype) = &Tensor::reset; - inline void (Tensor::*reset2)(const utility::InitClass, ftype, ftype) = &Tensor::reset; + inline void (Tensor::*reset2)(const std::shared_ptr) = &Tensor::reset; inline void (Tensor::*transposeThis1)() = &Tensor::transposeThis; inline void (Tensor::*transposeThis2)(int, int) = &Tensor::transposeThis; diff --git a/src/python/py_sys/py_sys.cpp b/src/python/py_sys/py_sys.cpp index ebfe674..af7d905 100644 --- a/src/python/py_sys/py_sys.cpp +++ b/src/python/py_sys/py_sys.cpp @@ -18,6 +18,7 @@ BOOST_PYTHON_MODULE(_sys) { using namespace boost::python; - def("setGlobalDevice", &sys::setGlobalDevice); - def("getGlobalDevice", &sys::getGlobalDevice); + def("setDevice", &sys::setDevice); + def("getDevice", &sys::getDevice); + def("setSeed", &sys::setRandomSeed); } \ No newline at end of file diff --git a/tests/backend/test_losses.cpp b/tests/backend/test_losses.cpp index 8844a82..180d6ae 100644 --- a/tests/backend/test_losses.cpp +++ b/tests/backend/test_losses.cpp @@ -101,9 +101,7 @@ TEST(LossTest, CrossEntropyBackward) { CrossEntropyLoss loss; auto result = loss(y, ypred); - std::cout << "before bw" << std::endl; result->backward(); - std::cout << "past bw" << std::endl; auto grads = ypred->getGrads(); EXPECT_NEAR((*grads)[0], -0.7143f, kTol); diff --git a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp index fc55d4a..247b9b0 100644 --- a/tests/backend/test_train_loop.cpp +++ b/tests/backend/test_train_loop.cpp @@ -31,6 +31,8 @@ #include "data_modeling/tensor_functions.h" +#include "system/sys_functions.h" + using namespace std; static shared_ptr makeBinaryNet() { @@ -83,6 +85,12 @@ static shared_ptr makeMulticlassNet2() { return net; } 
+int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + sys::setRandomSeed(42); + return RUN_ALL_TESTS(); +} + TEST(OverfitTest, BceSgdOverfitsSmallDataset) { // XOR-like: 4 samples, 2 features, binary labels auto x = TensorFunctions::makeSharedTensor( @@ -100,10 +108,10 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { auto net = makeBinaryNet(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.01); + net->parameters(), /*lr=*/0.05); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/10000, /*bsize=*/static_cast(4)); + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); @@ -112,10 +120,11 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset) { auto finalLoss = (*loss)(y, pred); EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit binary dataset"; + << "Network failed to overfit binary dataset\n" + << "Final prediction: " << *pred << "\nFinal loss: " << *finalLoss; } -TEST(OverfitTest, BceSgdOverfitsSmallDataset_OptimizedLoss) { +TEST(OverfitTest, BceSgdOverfitsSmallDataset_OptimizedLoss) { // XOR-like: 4 samples, 2 features, binary labels auto x = TensorFunctions::makeSharedTensor( {4, 2}, {0.0, 0.0, @@ -132,10 +141,10 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset_OptimizedLoss) { auto net = makeBinaryNet2(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.02); + net->parameters(), /*lr=*/0.05); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/10000, /*bsize=*/static_cast(4)); + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); @@ -144,10 +153,9 @@ TEST(OverfitTest, BceSgdOverfitsSmallDataset_OptimizedLoss) { auto finalLoss = (*loss)(y, pred); auto sigmoid = module::Sigmoid(); - cout << "Final prediction: " << sigmoid(*pred) << "\nFinal loss: " << *finalLoss << endl; - 
EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit binary dataset"; + << "Network failed to overfit binary dataset\n" + << "Final prediction: " << sigmoid(*pred) << "\nFinal loss: " << *finalLoss; } TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { @@ -172,19 +180,19 @@ TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { auto net = makeMulticlassNet(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.00002, /*decay=*/0.95); + net->parameters(), /*lr=*/0.0001, /*decay=*/0.95); auto trainLoop = train::BaseTrainLoop( - net, loss, optim, /*epochs=*/10000, /*bsize=*/6); + net, loss, optim, /*epochs=*/2000, /*bsize=*/6); - trainLoop.run(x, y, /*shuffle=*/false); + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); auto pred = (*net)(x); auto finalLoss = (*loss)(y, pred); - cout << "Final prediction: " << *pred << "\nFinal loss: " << *finalLoss << endl; EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit multiclass dataset"; + << "Network failed to overfit multiclass dataset" + << "Final prediction: " << *pred << "\nFinal loss: " << *finalLoss; } TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset_OptimizedLoss) { @@ -209,20 +217,18 @@ TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset_OptimizedLoss) { auto net = makeMulticlassNet2(); auto loss = make_shared(); auto optim = make_shared( - net->parameters(), /*lr=*/0.00002, /*decay=*/0.95); + net->parameters(), /*lr=*/0.0003, /*decay=*/0.95); auto trainLoop = train::BaseTrainLoop( net, loss, optim, /*epochs=*/10000, /*bsize=*/6); - trainLoop.run(x, y, /*shuffle=*/false); + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); auto pred = (*net)(x); auto finalLoss = (*loss)(y, pred); auto softmax = module::Softmax(); - pred = softmax(pred); - cout << "Final prediction: " << softmax(*pred) << "\nFinal loss: " << *finalLoss << endl; - EXPECT_LT((*finalLoss)[0], 0.05f) - << "Network failed to overfit multiclass 
dataset"; + << "Network failed to overfit multiclass dataset" + << "Final prediction: " << softmax(*pred) << "\nFinal loss: " << *finalLoss; } \ No newline at end of file From 9a998f1d598ec9fdfb2aa1fe05ab444e1b34f8e4 Mon Sep 17 00:00:00 2001 From: Robert Baumgartner Date: Sat, 21 Mar 2026 16:08:07 +0100 Subject: [PATCH 24/24] Testing Python interface of training loops --- python_lib/dl_lib/nn/__init__.py | 4 +- python_lib/dl_lib/nn/module.py | 29 +++++ python_lib/dl_lib/train/loss/__init__.py | 4 +- src/backend/data_modeling/tensor.h | 7 +- .../loss_functions/bce_sigmoid_loss.cpp | 2 +- src/python/py_core/py_core.cpp | 5 + src/python/py_nn/py_nn.cpp | 14 ++ src/python/py_train/py_train.cpp | 14 +- src/python/py_utility/custom_converters.h | 33 +++++ tests/python/test_training.py | 123 ++++++++++++++++++ 10 files changed, 225 insertions(+), 10 deletions(-) create mode 100644 tests/python/test_training.py diff --git a/python_lib/dl_lib/nn/__init__.py b/python_lib/dl_lib/nn/__init__.py index 5ce8bb2..774cbe0 100644 --- a/python_lib/dl_lib/nn/__init__.py +++ b/python_lib/dl_lib/nn/__init__.py @@ -1,5 +1,5 @@ -from .module import Module +from .module import Module, Sequential from dl_lib._compiled._nn import FfLayer #from .._compiled._core import Tensor # re-export if needed -__all__ = ['Module', 'FfLayer'] \ No newline at end of file +__all__ = ['Module', 'Sequential', 'FfLayer'] \ No newline at end of file diff --git a/python_lib/dl_lib/nn/module.py b/python_lib/dl_lib/nn/module.py index 755ed13..d4a232a 100644 --- a/python_lib/dl_lib/nn/module.py +++ b/python_lib/dl_lib/nn/module.py @@ -27,4 +27,33 @@ def parameters(self): params = self._own_parameters() # calls C++ side for leaf modules for module in self._modules.values(): params.extend(module.parameters()) + return params + +""" +For convenience. 
+""" +class Sequential(Module): + def __init__(self): + super().__init__() + object.__setattr__(self, "_layers", []) + + def append(self, module): + self._layers.append(module) + + def forward(self, x): + for layer in self._layers: + x = layer(x) + return x + + def parameters(self): + params = [] + for layer in self._layers: + if hasattr(layer, 'parameters'): + result = layer.parameters() + if isinstance(result, list): + params.extend(result) + else: + params.extend(list(result)) # force conversion from BP proxy + elif hasattr(layer, 'params'): + params.extend(list(layer.params)) return params \ No newline at end of file diff --git a/python_lib/dl_lib/train/loss/__init__.py b/python_lib/dl_lib/train/loss/__init__.py index cba1a96..c9d5b4f 100644 --- a/python_lib/dl_lib/train/loss/__init__.py +++ b/python_lib/dl_lib/train/loss/__init__.py @@ -1,4 +1,4 @@ -from dl_lib._compiled._train import BCE, CrossEntropy +from dl_lib._compiled._train import BCE, BceWithSigmoid, CrossEntropy, CrossEntropyWithSoftmax #from dl_lib._compiled._core import Tensor # re-export if needed -__all__ = ['BCE', 'CrossEntropy'] \ No newline at end of file +__all__ = ['BCE', 'BceWithSigmoid', 'CrossEntropy', 'CrossEntropyWithSoftmax'] \ No newline at end of file diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 7cf178c..d8a59fc 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -206,13 +206,12 @@ class Tensor final : public std::enable_shared_from_this { friend Tensor operator+(ftype scalar, const Tensor& tensor); void backward(); - - bool hasGrads() const noexcept { return grads!=nullptr; } - std::shared_ptr getGrads() const; - void setGrads(std::shared_ptr grads) noexcept{ + std::shared_ptr getGrads() const; + void setGrads(std::shared_ptr grads) noexcept { this->grads = std::move(grads); } + bool hasGrads() const noexcept { return grads!=nullptr; } void transposeThis() noexcept; void transposeThis(int dim1, int 
dim2) noexcept; diff --git a/src/backend/training/loss_functions/bce_sigmoid_loss.cpp b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp index 8eb81bd..2634bf4 100644 --- a/src/backend/training/loss_functions/bce_sigmoid_loss.cpp +++ b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp @@ -35,7 +35,7 @@ shared_ptr BceSigmoidLoss::operator()(const shared_ptr y, const auto bceSimplified = [](ftype y, ftype logit){ constexpr ftype zero = 0; - return std::max(y, zero) - logit*y + log(1+exp(logit < 0 ? logit : -logit)); + return std::max(logit, zero) - logit*y + log(1+exp(-std::abs(logit))); }; const auto nBatches = y->getDims()[0]; diff --git a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp index 891b152..846d6e9 100644 --- a/src/python/py_core/py_core.cpp +++ b/src/python/py_core/py_core.cpp @@ -207,6 +207,11 @@ BOOST_PYTHON_MODULE(_core) .def("reset", Py_DataModeling::reset1) .def("reset", Py_DataModeling::reset2) + + .def("hasGrads", &Tensor::hasGrads) + .def("hasGrads", +[](const std::shared_ptr& t) -> bool { + return t->hasGrads(); + }) .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int)) .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool)) diff --git a/src/python/py_nn/py_nn.cpp b/src/python/py_nn/py_nn.cpp index 75f6d11..5eb175a 100644 --- a/src/python/py_nn/py_nn.cpp +++ b/src/python/py_nn/py_nn.cpp @@ -11,8 +11,11 @@ #include "py_nn_util.h" #include "python_templates.h" +#include "custom_converters.h" #include "utility/global_params.h" +#include + #include BOOST_PYTHON_MODULE(_nn) @@ -31,6 +34,17 @@ BOOST_PYTHON_MODULE(_nn) return (self.*method)(t1.getSharedPtr(), t2.getSharedPtr()); \ } + // register vector of shared_ptr converter; needed for ModuleBase::parameters() + class_>>("TensorList") + .def(vector_indexing_suite>>()) + ; + + // convert python list of tensors back to c++ + converter::registry::push_back( + 
&custom_converters::TensorListFromPython::convertible, + &custom_converters::TensorListFromPython::construct, + type_id>>()); + // Networks class_, boost::noncopyable>("_Module", no_init) // methods diff --git a/src/python/py_train/py_train.cpp b/src/python/py_train/py_train.cpp index 35c3854..9cbb23f 100644 --- a/src/python/py_train/py_train.cpp +++ b/src/python/py_train/py_train.cpp @@ -14,7 +14,9 @@ #include "utility/global_params.h" #include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/bce_sigmoid_loss.h" #include "training/loss_functions/crossentropy_loss.h" +#include "training/loss_functions/crossentropy_softmax_loss.h" #include "training/optimizers/sgd.h" #include "training/optimizers/rmsprop.h" @@ -30,25 +32,35 @@ BOOST_PYTHON_MODULE(_train) .def("__call__", &train::BceLoss::operator()) ; + class_, boost::noncopyable>("BceWithSigmoid") + .def("__call__", &train::BceSigmoidLoss::operator()) + ; + class_, boost::noncopyable>("CrossEntropy") .def("__call__", &train::CrossEntropyLoss::operator()) ; + class_, boost::noncopyable>("CrossEntropyWithSoftmax") + .def("__call__", &train::CrossEntropySoftmaxLoss::operator()) + ; + // Optimizers class_, boost::noncopyable>("SGD", no_init) .def(init >, ftype>()) .def("step", &train::SgdOptimizer::step) + .def("zeroGrad", &train::SgdOptimizer::zeroGrad) ; class_, boost::noncopyable>("RmsProp", no_init) .def(init >, ftype, ftype>()) .def("step", &train::RmsPropOptimizer::step) + .def("zeroGrad", &train::RmsPropOptimizer::zeroGrad) ; // Trainers class_, boost::noncopyable>("TrainLoop", no_init) .def(init&, std::shared_ptr, std::shared_ptr, size_t, tensorDim_t>()) - .def("step", &train::RmsPropOptimizer::step) + .def("run", &train::BaseTrainLoop::run) ; } \ No newline at end of file diff --git a/src/python/py_utility/custom_converters.h b/src/python/py_utility/custom_converters.h index f712da9..68c1ea7 100644 --- a/src/python/py_utility/custom_converters.h +++ 
b/src/python/py_utility/custom_converters.h @@ -50,6 +50,16 @@ namespace custom_converters { static void* convertible(PyObject* obj_ptr); static void construct(PyObject* obj_ptr,rvalueFromPythonData* data); }; + + /** + * @brief Convert from Python list to std::vector> + */ + struct TensorListFromPython { + using rvalueFromPythonData = boost::python::converter::rvalue_from_python_stage1_data; + + static void* convertible(PyObject* obj); + static void construct(PyObject* obj, rvalueFromPythonData* data); + }; } // TODO: do array instead of tensor @@ -81,6 +91,29 @@ bp::converter::registry::push_back( /******************************************************************************************/ /******************************************************************************************/ +void* custom_converters::TensorListFromPython::convertible(PyObject* obj) { + using namespace boost::python; + if (!PyList_Check(obj)) return nullptr; + return obj; +} + +void custom_converters::TensorListFromPython::construct(PyObject* obj, rvalueFromPythonData* data) { + using namespace boost::python; + void* storage = ((converter::rvalue_from_python_storage< std::vector> >*)data)->storage.bytes; + //void* storage = ((converter::rvalue_from_python_storage< std::vector >*)data)->storage.bytes; + + new (storage) std::vector>(); + auto* vec = reinterpret_cast>*>(storage); + + int len = PyList_Size(obj); + vec->reserve(len); + for (int i = 0; i < len; i++) { + vec->push_back(extract>( + PyList_GetItem(obj, i))); + } + data->convertible = storage; +} + template requires ( std::is_integral_v< T > || std::is_floating_point_v< T >) diff --git a/tests/python/test_training.py b/tests/python/test_training.py new file mode 100644 index 0000000..d974a0f --- /dev/null +++ b/tests/python/test_training.py @@ -0,0 +1,123 @@ +""" +Robert Baumgartner, r.baumgartner-1@tudelft.nl +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "python_lib")) 
+print(sys.path) + +from dl_lib import Tensor +from dl_lib.nn import FfLayer, Sequential +from dl_lib.nn.activation import LeakyReLU +from dl_lib.train.loss import BCE, BceWithSigmoid, CrossEntropyWithSoftmax +from dl_lib.train.optim import SGD, RmsProp + +from dl_lib.sys import setSeed +import pytest + +setSeed(42) + +def train(net, loss_fn, optim, x, y, epochs): + for epoch in range(epochs): + ypred = net.forward(x) + loss = loss_fn(y, ypred) + + loss.backward() + optim.step() + optim.zeroGrad() + + return loss + +def make_binary_net(): + net = Sequential() + net.append(FfLayer(2, 4, True, True)) + net.append(LeakyReLU(0.01)) + net.append(FfLayer(4, 1, True, True)) + return net + +def make_multiclass_net(): + net = Sequential() + net.append(FfLayer(2, 8, True, True)) + net.append(LeakyReLU(0.01)) + net.append(FfLayer(8, 3, True, True)) + return net + +def make_xor_data(): + x = Tensor([4, 2], [0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0], False) + y = Tensor([4, 1], [0.0, + 1.0, + 1.0, + 0.0], False) + return x, y + +def make_multiclass_data(): + x = Tensor([6, 2], [1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6], False) + y = Tensor([6, 3], [1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0], False) + return x, y + +class TestOverfitBinary: + def test_binary_sgd_overfits(self): + x, y = make_xor_data() + net = make_binary_net() + loss_fn = BceWithSigmoid() + optim = SGD(net.parameters(), 0.05) + + final_loss = train(net, loss_fn, optim, x, y, epochs=2000) + + assert final_loss.getitem(0) < 0.05, \ + f"SGD failed to overfit XOR, loss={final_loss.getitem(0)}" + + def test_binary_rmsprop_overfits(self): + x, y = make_xor_data() + net = make_binary_net() + loss_fn = BceWithSigmoid() + optim = RmsProp(net.parameters(), 0.0001, 0.95) + + final_loss = train(net, loss_fn, optim, x, y, epochs=5000) + + assert final_loss.getitem(0) < 0.05, \ + f"RmsProp failed to overfit XOR, 
loss={final_loss.getitem(0)}" + + def test_multiclass_rmsprop_overfits(self): + x, y = make_multiclass_data() + net = make_multiclass_net() + loss_fn = CrossEntropyWithSoftmax() + optim = RmsProp(net.parameters(), 0.0003, 0.95) + + final_loss = train(net, loss_fn, optim, x, y, epochs=10000) + + assert final_loss.getitem(0) < 0.05, \ + f"RmsProp failed to overfit multiclass, loss={final_loss.getitem(0)}" + + def test_loss_decreases(self): + """Loss should be strictly lower after training than before""" + x, y = make_xor_data() + net = make_binary_net() + loss_fn = BceWithSigmoid() + + optim = SGD(net.parameters(), 0.001) + initial_pred = net.forward(x) + initial_loss = loss_fn(y, initial_pred).getitem(0) + train(net, loss_fn, optim, x, y, epochs=2000) + + final_pred = net.forward(x) + final_loss = loss_fn(y, final_pred).getitem(0) + + assert final_loss < initial_loss, \ + f"Loss did not decrease: {initial_loss} -> {final_loss}" + \ No newline at end of file