diff --git a/.gitignore b/.gitignore index ef9f13c..2744889 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ build .vscode -unit_tests_backend *.txt python_lib/dl_lib/_compiled -*__pycache__* \ No newline at end of file +*__pycache__* +*_cache + +# TODO: remove later +benchmarks \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index b09189f..e317ea5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,7 @@ endif() add_compile_options("$<$:/utf-8>") add_compile_options("$<$:/utf-8>") -option(DOUBLE_PRECISION "Compile with double precision at cost of speed?" OFF) # TODO: not implemented yet +# TODO: add flag for double precision? # include python libs if(APPLE) @@ -40,7 +40,7 @@ if(APPLE) message("Python_INCLUDE_DIRS:${${Python_INCLUDE_DIRS}}") set(PYTHON_LIBRARIES ${Python_LIBRARIES}) - string(COMPARE EQUAL "${PYTHON_LIBRARIES}" "" PYTHONLIBS_EMPTY) + string(COMPARE EQUAL "${Python_LIBRARIES}" "" PYTHONLIBS_EMPTY) if(PYTHONLIBS_EMPTY) message(FATAL_ERROR "Problem: PYTHON_LIBRARIES not found. Do you have Python installed on your machine?") endif() @@ -50,19 +50,18 @@ if(APPLE) message("Failed to automatically find Python_INCLUDE_DIRS. Setting the PYTHON_INCLUDE_DIRS variable manually. If this crashes please adjust the following path to the path where Python.h resides (the one matching the found Python instance). 
Paths must be consistent iff multiple Python versions on machine.") set(PYTHON_H_PATH "/usr/local/opt/python@3.13/Frameworks/Python.framework/Versions/3.13/include/python3.13") - set(PYTHON_INCLUDE_DIRS "${PYTHON_H_PATH}") + set(PYTHON_INCLUDE_DIRS "${Python_H_PATH}") else() set(PYTHON_INCLUDE_DIRS ${Python_INCLUDE_DIRS}) endif() #FindPython3() - message("Apple - Using Python:${Python_VERSION_MAJOR} - Libraries:${PYTHON_LIBRARIES} - IncludeDirs: ${PYTHON_INCLUDE_DIRS}") + message("Apple - Using Python:${Python_VERSION_MAJOR} - Libraries:${Python_LIBRARIES} - IncludeDirs: ${Python_INCLUDE_DIRS}") else() message("Getting PythonLibs on Linux or Windows path") - find_package(PythonLibs REQUIRED) + find_package(Python 3 REQUIRED COMPONENTS Interpreter Development) endif() -include_directories(${PYTHON_INCLUDE_DIRS}) -message("Using Python:${Python_VERSION_MAJOR} - Libraries:${PYTHON_LIBRARIES} - IncludeDirs: ${PYTHON_INCLUDE_DIRS}") - +include_directories(${Python_INCLUDE_DIRS}) +message("Using Python:${Python_VERSION_MAJOR} - Libraries:${Python_LIBRARIES} - IncludeDirs: ${Python_INCLUDE_DIRS}") #set(CMAKE_MESSAGE_LOG_LEVEL WARNING) @@ -108,4 +107,9 @@ option(BUILD_TESTS "Build tests" OFF) if(BUILD_TESTS) enable_testing() add_subdirectory(tests) +endif() + +option(BUILD_BENCHMARKS "Build benchmarks" OFF) +if(BUILD_BENCHMARKS) + add_subdirectory(benchmarks) endif() \ No newline at end of file diff --git a/python_lib/dl_lib/__init__.py b/python_lib/dl_lib/__init__.py index 9817df9..e7f6844 100644 --- a/python_lib/dl_lib/__init__.py +++ b/python_lib/dl_lib/__init__.py @@ -1,3 +1,5 @@ -from ._compiled._core import Tensor, Dimension, Device, Ones, Zeros, Gaussian +from ._compiled._core import Tensor, Dimension, Device -__all__ = ['Tensor', 'Device', 'Dimension'] \ No newline at end of file +__all__ = ['Tensor', 'Device', 'Dimension'] + +__version__ = "0.2.0" \ No newline at end of file diff --git a/python_lib/dl_lib/nn/__init__.py b/python_lib/dl_lib/nn/__init__.py index 
75fefbc..774cbe0 100644 --- a/python_lib/dl_lib/nn/__init__.py +++ b/python_lib/dl_lib/nn/__init__.py @@ -1,4 +1,5 @@ -#from .._compiled._layers import FfLayer, ReLU +from .module import Module, Sequential +from dl_lib._compiled._nn import FfLayer #from .._compiled._core import Tensor # re-export if needed -#__all__ = ['FfLayer', 'ReLU'] \ No newline at end of file +__all__ = ['Module', 'Sequential', 'FfLayer'] \ No newline at end of file diff --git a/python_lib/dl_lib/nn/activation/__init__.py b/python_lib/dl_lib/nn/activation/__init__.py new file mode 100644 index 0000000..0ab1bab --- /dev/null +++ b/python_lib/dl_lib/nn/activation/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._nn import ReLU, LeakyReLU, Softmax +#from .._compiled._core import Tensor # re-export if needed + +__all__ = ['ReLU', 'LeakyReLU', 'Softmax'] \ No newline at end of file diff --git a/python_lib/dl_lib/nn/module.py b/python_lib/dl_lib/nn/module.py new file mode 100644 index 0000000..d4a232a --- /dev/null +++ b/python_lib/dl_lib/nn/module.py @@ -0,0 +1,59 @@ +""" +Module base class. We use it to automatically register network +modules when defining graphs via Module. +""" + +from .._compiled._nn import _Module + +class Module(_Module): + def __init__(self): + object.__setattr__(self, "_modules", {}) # not necessary, but more explicit + self._modules = {} + + """ + Stores attributes defined in __init__ in private + _modules dictionary + """ + def __setattr__(self, name, value): + if isinstance(value, Module): + self._modules[name] = value + object.__setattr__(self, name, value) + + """ + Returns a list of leaf parameters. Used to identify trainable + nodes of a graph. + """ + def parameters(self): + params = self._own_parameters() # calls C++ side for leaf modules + for module in self._modules.values(): + params.extend(module.parameters()) + return params + +""" +For convenience. 
+""" +class Sequential(Module): + def __init__(self): + super().__init__() + object.__setattr__(self, "_layers", []) + + def append(self, module): + self._layers.append(module) + + def forward(self, x): + for layer in self._layers: + x = layer(x) + return x + + def parameters(self): + params = [] + for layer in self._layers: + if hasattr(layer, 'parameters'): + result = layer.parameters() + if isinstance(result, list): + params.extend(result) + else: + params.extend(list(result)) # force conversion from BP proxy + elif hasattr(layer, 'params'): + params.extend(list(layer.params)) + return params \ No newline at end of file diff --git a/python_lib/dl_lib/sys/__init__.py b/python_lib/dl_lib/sys/__init__.py new file mode 100644 index 0000000..b21f145 --- /dev/null +++ b/python_lib/dl_lib/sys/__init__.py @@ -0,0 +1 @@ +from dl_lib._compiled._sys import getDevice, setDevice, setSeed \ No newline at end of file diff --git a/python_lib/dl_lib/train/__init__.py b/python_lib/dl_lib/train/__init__.py new file mode 100644 index 0000000..9614d89 --- /dev/null +++ b/python_lib/dl_lib/train/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._train import TrainLoop +#from dl_lib._compiled._core import Tensor # re-export if needed + +__all__ = ['TrainLoop'] \ No newline at end of file diff --git a/python_lib/dl_lib/train/loss/__init__.py b/python_lib/dl_lib/train/loss/__init__.py new file mode 100644 index 0000000..c9d5b4f --- /dev/null +++ b/python_lib/dl_lib/train/loss/__init__.py @@ -0,0 +1,4 @@ +from dl_lib._compiled._train import BCE, BceWithSigmoid, CrossEntropy, CrossEntropyWithSoftmax +#from dl_lib._compiled._core import Tensor # re-export if needed + +__all__ = ['BCE', 'BceWithSigmoid', 'CrossEntropy', 'CrossEntropyWithSoftmax'] \ No newline at end of file diff --git a/python_lib/dl_lib/train/optim/__init__.py b/python_lib/dl_lib/train/optim/__init__.py new file mode 100644 index 0000000..a6669c6 --- /dev/null +++ b/python_lib/dl_lib/train/optim/__init__.py @@ -0,0 +1,4 
@@ +from dl_lib._compiled._train import SGD, RmsProp +#from dl_lib._compiled._core import Tensor # re-export if needed + +__all__ = ['SGD', 'RmsProp'] \ No newline at end of file diff --git a/readme.md b/readme.md index 8c531f0..2eeaf95 100644 --- a/readme.md +++ b/readme.md @@ -34,10 +34,10 @@ For some examples on Python interface, see tests/python. 🚧 **Work in Progress** - Implementing additional layers and optimizations Roadmap: -- [ ] Python Binding Unit Tests -- [ ] Additional layer types (Conv2D, LSTM, etc.) +- [x] Python Binding Unit Tests +- [ ] Additional layer types (Conv2D, Dropout, etc.) - [ ] Optimizers and training framework -- [ ] CUDA kernels for performance-critical operations +- [ ] CUDA mode for operations - [ ] AlexNet reference implementation - [ ] Docker deployment example @@ -62,11 +62,12 @@ ctest ## Required -- Compiler capable of C++20 at least (we test with gcc 12.3.0) +- Compiler capable of C++23 at least (we test with gcc 13.3.0) - Boost Python -- Cmake > 3.24 +- Cmake > 3.28 - Python 3 (we test with 3.10, but it should work with any version) -- pytest for unit tests (we use 9.0.2) +- pytest and GTest for unit tests (we use pytest=9.0.2) +- Google Benchmark for benchmarking ## Troubleshooting diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0d564e2..d47e4d1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,17 +2,38 @@ add_subdirectory(backend) add_subdirectory(python) -target_link_libraries(_core +target_link_libraries(_core PRIVATE ${Boost_LIBRARIES} - ${PYTHON_LIBRARIES} + ${Python_LIBRARIES} BackendCore) target_include_directories(_core PRIVATE + ${Python_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS}) + +target_link_libraries(_nn PRIVATE + ${Boost_LIBRARIES} + ${PYTHON_LIBRARIES} + BackendCore) + +target_include_directories(_nn PRIVATE + ${PYTHON_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS}) + +target_link_libraries(_sys PRIVATE + ${Boost_LIBRARIES} + ${PYTHON_LIBRARIES} + BackendCore) + +target_include_directories(_sys 
PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) -#target_link_libraries(py_layers PRIVATE ${Boost_LIBRARIES} ${PYTHON_LIBRARIES} BackendCore) -#target_include_directories(py_layers PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS}) +target_link_libraries(_train PRIVATE + ${Boost_LIBRARIES} + ${PYTHON_LIBRARIES} + BackendCore) -# for compiled boost lib -#target_link_libraries(hello PRIVATE Boost::filesystem) \ No newline at end of file +target_include_directories(_train PRIVATE + ${PYTHON_INCLUDE_DIRS} + ${Boost_INCLUDE_DIRS}) \ No newline at end of file diff --git a/src/backend/CMakeLists.txt b/src/backend/CMakeLists.txt index 579fae3..ed6bade 100644 --- a/src/backend/CMakeLists.txt +++ b/src/backend/CMakeLists.txt @@ -1,25 +1,18 @@ -#include_directories( -# "${CMAKE_CURRENT_SOURCE_DIR}/computational_graph" -# "${CMAKE_CURRENT_SOURCE_DIR}/data_modeling" -# "${CMAKE_CURRENT_SOURCE_DIR}/utility" -# "${CMAKE_CURRENT_SOURCE_DIR}/layers" -# "${CMAKE_CURRENT_SOURCE_DIR}/system" -# "${CMAKE_CURRENT_SOURCE_DIR}/training/loss_functions" -# "${CMAKE_CURRENT_SOURCE_DIR}/training/optimizers" -# ) - file(GLOB_RECURSE CORE_SOURCES computational_graph/*.cpp data_modeling/*.cpp - #layers/*.cpp - #networks/*.cpp - #training/*.cpp - utility/*.cpp + module/*.cpp system/*.cpp + training/*.cpp + utility/*.cpp ) -add_library(BackendCore STATIC ${CORE_SOURCES}) +add_library(BackendCore SHARED ${CORE_SOURCES}) target_include_directories(BackendCore PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} +) + +set_target_properties(BackendCore PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${PYTHON_MODULE_DIR}" # make sure Python-modules see backend ) \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp new file mode 100644 index 0000000..83de4ca --- /dev/null +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.cpp @@ -0,0 +1,31 @@ +/** + * @file 
leaky_relu_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "leaky_relu_node.h" + +#include + +using namespace std; +using namespace cgraph; + +vector> LeakyReLuNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype zero = 0.0; + + auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + + const auto& parent = parents[0]; + for(tensorSize_t i=0; iset((*parent)[i] > zero ? upstreamGrad[i] : upstreamGrad[i] * eps, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/leaky_relu_node.h b/src/backend/computational_graph/activation_functions/leaky_relu_node.h new file mode 100644 index 0000000..1d9304e --- /dev/null +++ b/src/backend/computational_graph/activation_functions/leaky_relu_node.h @@ -0,0 +1,29 @@ +/** + * @file leaky_relu_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +#include + +namespace cgraph { + class LeakyReLuNode final : public GraphNode { + private: + const ftype eps; + + public: + explicit LeakyReLuNode(std::shared_ptr t, const ftype eps) + : GraphNode({std::move(t)}), eps{eps} {} + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} diff --git a/src/backend/computational_graph/relu_node.cpp b/src/backend/computational_graph/activation_functions/relu_node.cpp similarity index 66% rename from src/backend/computational_graph/relu_node.cpp rename to src/backend/computational_graph/activation_functions/relu_node.cpp index 2e0f647..3fcc958 100644 --- a/src/backend/computational_graph/relu_node.cpp +++ b/src/backend/computational_graph/activation_functions/relu_node.cpp @@ -14,17 +14,18 @@ 
#include using namespace std; -using namespace graph; +using namespace cgraph; vector> ReLuNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - constexpr ftype zero = 0.0; - auto res = make_shared(upstreamGrad.getDims().toVector(), upstreamGrad.getDevice(), false); + auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + + const auto& parent = parents[0]; for(tensorSize_t i=0; isetItem(v > zero ? v : zero, i); + res->set((*parent)[i] > zero ? upstreamGrad[i] : zero, i); } - return {std::move(res)}; + + return {res}; } \ No newline at end of file diff --git a/src/backend/computational_graph/relu_node.h b/src/backend/computational_graph/activation_functions/relu_node.h similarity index 61% rename from src/backend/computational_graph/relu_node.h rename to src/backend/computational_graph/activation_functions/relu_node.h index b0ce5b8..ef17749 100644 --- a/src/backend/computational_graph/relu_node.h +++ b/src/backend/computational_graph/activation_functions/relu_node.h @@ -11,24 +11,16 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" #include -namespace graph { +namespace cgraph { class ReLuNode final : public GraphNode { public: explicit ReLuNode(std::shared_ptr t) : GraphNode({std::move(t)}) {} - ReLuNode(const ReLuNode& other) = delete; - ReLuNode& operator=(const ReLuNode& other) = delete; - - ReLuNode(ReLuNode&& other) = default; - ReLuNode& operator=(ReLuNode&& other) = default; - - ~ReLuNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } diff --git a/src/backend/computational_graph/activation_functions/sigmoid_node.cpp b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp new file mode 100644 index 0000000..5873724 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/sigmoid_node.cpp @@ -0,0 +1,35 @@ +/** + * @file sigmoid_node.cpp + * @author Robert Baumgartner 
(r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "sigmoid_node.h" + +#include + +using namespace std; +using namespace cgraph; + +vector> SigmoidNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype zero = 0.0; + + auto res = make_shared(upstreamGrad.getDims(), upstreamGrad.getDevice(), false); + + // s is result from forward pass sigmoid + auto derivative = [](ftype s){ + return s * (1-s); + }; + + for(tensorSize_t i=0; iset(derivative((*sigmoid)[i]) * upstreamGrad[i], i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/sigmoid_node.h b/src/backend/computational_graph/activation_functions/sigmoid_node.h new file mode 100644 index 0000000..82e824d --- /dev/null +++ b/src/backend/computational_graph/activation_functions/sigmoid_node.h @@ -0,0 +1,32 @@ +/** + * @file relu_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-15 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "data_modeling/tensor.h" + +#include +#include + +namespace cgraph { + class SigmoidNode final : public GraphNode { + private: + // cache the result of the forward function + std::shared_ptr sigmoid; + + public: + explicit SigmoidNode(std::shared_ptr t, std::shared_ptr sigmoid) + : GraphNode({std::move(t)}), sigmoid{std::move(sigmoid)} {} + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} diff --git a/src/backend/computational_graph/activation_functions/softmax_node.cpp b/src/backend/computational_graph/activation_functions/softmax_node.cpp new file mode 100644 index 0000000..8603355 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/softmax_node.cpp @@ -0,0 +1,45 @@ +/** + * @file softmax_node.cpp + * @author Robert 
Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-15 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "softmax_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > SoftmaxNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + const auto bSize = yPred->getDims()[0]; + assert(bSize>0); + + for(tensorDim_t b=0; bgetDims()[1]; i++){ + ftype grad = 0; + const ftype yi = softmax->get(b, i); + + for(tensorDim_t j=0; jgetDims()[1]; j++){ + ftype yj = softmax->get(b, j); + ftype jacobian = (i==j) ? yi*(1-yj) : -yi*yj; + grad += upstreamGrad.get(b, j) * jacobian; + } + res->set(grad, b, i); + } + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/activation_functions/softmax_node.h b/src/backend/computational_graph/activation_functions/softmax_node.h new file mode 100644 index 0000000..6c3c8d1 --- /dev/null +++ b/src/backend/computational_graph/activation_functions/softmax_node.h @@ -0,0 +1,31 @@ +/** + * @file softmax_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-15 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class SoftmaxNode final : public GraphNode { + private: + const std::shared_ptr softmax; + + public: + explicit SoftmaxNode(std::shared_ptr t, std::shared_ptr softmax) + : GraphNode({std::move(t)}), softmax{std::move(softmax)} + { + assert(softmax->getSize()==parents[0]->getDims()[0]); + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/add_node.h 
b/src/backend/computational_graph/add_node.h deleted file mode 100644 index 99bc964..0000000 --- a/src/backend/computational_graph/add_node.h +++ /dev/null @@ -1,32 +0,0 @@ -/** - * @file add_node.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-03 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "graph_node.h" - -namespace graph { - class AddNode final : public GraphNode { - public: - explicit AddNode(std::shared_ptr t1, std::shared_ptr t2) - : GraphNode({std::move(t1), std::move(t2)}) {} - - AddNode(const AddNode& other) = delete; - AddNode& operator=(const AddNode& other) = delete; - - AddNode(AddNode&& other) = default; - AddNode& operator=(AddNode&& other) = default; - - ~AddNode() noexcept = default; - - std::vector> backward(const Tensor& upstreamGrad) override; - }; -} \ No newline at end of file diff --git a/src/backend/computational_graph/graph_node.h b/src/backend/computational_graph/graph_node.h index 67cb033..c8d3cec 100644 --- a/src/backend/computational_graph/graph_node.h +++ b/src/backend/computational_graph/graph_node.h @@ -18,17 +18,48 @@ #include -namespace graph { +// if GCC or Clang +#ifdef __GNUC__ +#include +#endif // __GNUC__ + +namespace cgraph { class GraphNode { protected: std::vector< std::shared_ptr > parents; explicit GraphNode(std::vector< std::shared_ptr > parents) : parents{std::move(parents)}{} public: + GraphNode(const GraphNode& other) = delete; + GraphNode& operator=(const GraphNode& other) = delete; + + GraphNode(GraphNode&& other) = default; + GraphNode& operator=(GraphNode&& other) = default; + + virtual ~GraphNode() noexcept = default; + virtual std::vector> backward(const Tensor& upstreamGrad) = 0; const auto& getParents() const noexcept { return parents; } + + virtual void print(std::ostream& os) const noexcept { + os << "\n"; + #ifdef __GNUC__ + // demangle name on gcc and clang + int status; + char* demangled = 
abi::__cxa_demangle(typeid(*this).name(), nullptr, nullptr, &status); + os << (status == 0 ? demangled : typeid(*this).name()); + std::free(demangled); + #else + os << typeid(*this).name(); + #endif + }; + + friend std::ostream& operator<<(std::ostream& os, const GraphNode& n) noexcept { + n.print(os); // calling vtable + return os; + } }; } diff --git a/src/backend/computational_graph/loss_functions/bce_node.cpp b/src/backend/computational_graph/loss_functions/bce_node.cpp new file mode 100644 index 0000000..add016f --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_node.cpp @@ -0,0 +1,35 @@ +/** + * @file bce_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "bce_node.h" + +#include "data_modeling/tensor_functions.h" + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > BceNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + ftype bSize = yPred->getDims()[0]; + for(tensorSize_t i=0; igetDims()[0]; i++){ + auto yi = (*yTrue)[i]; + auto yiHat = (*yPred)[i]; + + auto g = -yi/std::max(yiHat, epsBce) + (1-yi)/std::max(1-yiHat, epsBce); + res->set(g/bSize, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/bce_node.h b/src/backend/computational_graph/loss_functions/bce_node.h new file mode 100644 index 0000000..25b5f62 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_node.h @@ -0,0 +1,34 @@ +/** + * @file bce_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class 
BceNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + public: + explicit BceNode(std::shared_ptr y, std::shared_ptr yPred) + : GraphNode({std::move(yPred)}), yTrue{std::move(y)} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp new file mode 100644 index 0000000..998e110 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.cpp @@ -0,0 +1,46 @@ +/** + * @file bce_sigmoid_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "bce_sigmoid_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > BceSigmoidNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + auto sigmoid = [](ftype x){ + constexpr ftype one = 1.0; + if(x>=0){ + return one / (one + exp(-x)); + } + auto e = exp(x); + return e / (one + e); + }; + + const auto& logits = parents[0]; + auto res = make_shared(logits->createEmptyCopy()); + + ftype bSize = logits->getDims()[0]; + for(tensorSize_t i=0; igetDims()[0]; i++){ + auto y = (*yTrue)[i]; + auto s = sigmoid((*logits)[i]); + + auto g = s - y; + res->set(g/bSize, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/bce_sigmoid_node.h b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.h new file mode 100644 index 0000000..f90059c --- /dev/null +++ b/src/backend/computational_graph/loss_functions/bce_sigmoid_node.h @@ -0,0 
+1,33 @@ +/** + * @file bce_sigmoid_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +namespace cgraph { + class BceSigmoidNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + public: + explicit BceSigmoidNode(std::shared_ptr y, std::shared_ptr logits) + : GraphNode({std::move(logits)}), yTrue{std::move(y)} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp new file mode 100644 index 0000000..249de43 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.cpp @@ -0,0 +1,37 @@ +/** + * @file add_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-03 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_node.h" + +#include "data_modeling/tensor_functions.h" + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > CrossEntropyNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + ftype bSize = yPred->getDims()[0]; + for(tensorDim_t i=0; igetDims()[0]; i++){ + for(tensorDim_t j=0; jgetDims()[1]; j++){ + auto yij = yTrue->get(i, j); + auto yijHat = yPred->get(i, j); + + auto g = -yij/std::max(yijHat, epsCrossentropy); + res->set(g/bSize, i, j); + } + } + + return {res}; +} \ No newline at end of file diff --git 
a/src/backend/computational_graph/loss_functions/crossentropy_node.h b/src/backend/computational_graph/loss_functions/crossentropy_node.h new file mode 100644 index 0000000..2644a8d --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_node.h @@ -0,0 +1,41 @@ +/** + * @file add_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-03 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class CrossEntropyNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + public: + + /** + * @brief Expexted shapes are same as for CrossEntropyLoss. + * + * @param y shape (batchsize) + * @param yPred shape (batchsize, nclasses) + */ + explicit CrossEntropyNode(std::shared_ptr y, std::shared_ptr yPred) + : GraphNode({std::move(yPred)}), yTrue{std::move(y)} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp new file mode 100644 index 0000000..ca0d1c7 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.cpp @@ -0,0 +1,37 @@ +/** + * @file crossentropy_softmax_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_softmax_node.h" + +#include "module/activation_functions/softmax.h" + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > CrossEntropySoftmaxNode::backward(const Tensor& upstreamGrad) { + 
assert(!upstreamGrad.getRequiresGrad()); + + const auto& logits = parents[0]; + auto res = make_shared(logits->createEmptyCopy()); + + const auto softmax = module::Softmax(); + const auto s = softmax(*logits); + + ftype bSize = logits->getDims()[0]; + for(tensorSize_t b=0; bgetDims()[0]; b++){ + for(tensorSize_t i=0; igetDims()[1]; i++){ + auto g = s.get(b, i) - yTrue->get(b, i); + res->set(g / bSize, b, i); + } + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h new file mode 100644 index 0000000..17f0d15 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/crossentropy_softmax_node.h @@ -0,0 +1,33 @@ +/** + * @file crossentropy_softmax_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +namespace cgraph { + class CrossEntropySoftmaxNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + + public: + explicit CrossEntropySoftmaxNode(std::shared_ptr y, std::shared_ptr logits) + : GraphNode({std::move(logits)}), yTrue{std::move(y)} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/rmse_node.cpp b/src/backend/computational_graph/loss_functions/rmse_node.cpp new file mode 100644 index 0000000..e3eb11e --- /dev/null +++ b/src/backend/computational_graph/loss_functions/rmse_node.cpp @@ -0,0 +1,39 @@ +/** + * @file rmse_node.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 
2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "rmse_node.h" + +#include "data_modeling/tensor_functions.h" + +#include + +using namespace std; +using namespace cgraph; + +vector< shared_ptr > RmseNode::backward(const Tensor& upstreamGrad) { + assert(!upstreamGrad.getRequiresGrad()); + constexpr ftype eps = 1e-9; + + const auto& yPred = parents[0]; + auto res = make_shared(yPred->createEmptyCopy()); + + ftype bSize = yPred->getDims()[0]; + for(tensorSize_t i=0; igetDims()[0]; i++){ + auto yi = (*yTrue)[i]; + auto yiHat = (*yPred)[i]; + + auto denom = rmse * bSize + eps; + auto g = (yiHat-yi) / denom; + res->set(g, i); + } + + return {res}; +} \ No newline at end of file diff --git a/src/backend/computational_graph/loss_functions/rmse_node.h b/src/backend/computational_graph/loss_functions/rmse_node.h new file mode 100644 index 0000000..62e5cc5 --- /dev/null +++ b/src/backend/computational_graph/loss_functions/rmse_node.h @@ -0,0 +1,35 @@ +/** + * @file rmse_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" +#include "utility/global_params.h" + +namespace cgraph { + class RmseNode final : public GraphNode { + private: + const std::shared_ptr yTrue; + ftype rmse; + + public: + explicit RmseNode(std::shared_ptr y, std::shared_ptr yPred, ftype rmse) + : GraphNode({std::move(yPred)}), yTrue{std::move(y)}, rmse{rmse} + { + assert(parents[0]->getDims()==yTrue->getDims()); + if(!parents[0]->getRequiresGrad()){ + std::__throw_invalid_argument("yPred must be a graph node"); + } + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/add_node.cpp b/src/backend/computational_graph/tensor_ops/add_node.cpp similarity index 51% rename from src/backend/computational_graph/add_node.cpp rename to 
src/backend/computational_graph/tensor_ops/add_node.cpp index 43427f5..77cfd0c 100644 --- a/src/backend/computational_graph/add_node.cpp +++ b/src/backend/computational_graph/tensor_ops/add_node.cpp @@ -11,11 +11,19 @@ #include "add_node.h" +#include "data_modeling/tensor_functions.h" + using namespace std; -using namespace graph; +using namespace cgraph; vector< shared_ptr > AddNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); - auto res = make_shared(upstreamGrad.createDeepCopy()); - return {res, res}; + auto weightGrad = make_shared(upstreamGrad.createDeepCopy()); + + if(broadcasted){ + auto biasGrad = make_shared(TensorFunctions::SumOverDims(*weightGrad)); + return {weightGrad, biasGrad}; + } + + return {weightGrad, weightGrad}; } \ No newline at end of file diff --git a/src/backend/computational_graph/tensor_ops/add_node.h b/src/backend/computational_graph/tensor_ops/add_node.h new file mode 100644 index 0000000..2402394 --- /dev/null +++ b/src/backend/computational_graph/tensor_ops/add_node.h @@ -0,0 +1,33 @@ +/** + * @file add_node.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-03 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "computational_graph/graph_node.h" + +namespace cgraph { + class AddNode final : public GraphNode { + private: + // if t2 has been a vector we broadcast t2 into t1, see Tensor::add() + bool broadcasted = false; + + public: + explicit AddNode(std::shared_ptr t1, std::shared_ptr t2) + : GraphNode({std::move(t1), std::move(t2)}) { + // t2 is either tensor of same size or 1D-vector as bias + assert(t1->getDims().nDims()>=t2->getDims().nDims()); + + broadcasted = parents[0]->getDims() != parents[1]->getDims(); + } + + std::vector> backward(const Tensor& upstreamGrad) override; + }; +} \ No newline at end of file diff --git a/src/backend/computational_graph/elementwise_mul_node.cpp 
b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp similarity index 95% rename from src/backend/computational_graph/elementwise_mul_node.cpp rename to src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp index fcdb130..22d39e7 100644 --- a/src/backend/computational_graph/elementwise_mul_node.cpp +++ b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.cpp @@ -12,7 +12,7 @@ #include "elementwise_mul_node.h" using namespace std; -using namespace graph; +using namespace cgraph; vector> ElementwiseMulNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/elementwise_mul_node.h b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h similarity index 57% rename from src/backend/computational_graph/elementwise_mul_node.h rename to src/backend/computational_graph/tensor_ops/elementwise_mul_node.h index 81203fd..ffa8038 100644 --- a/src/backend/computational_graph/elementwise_mul_node.h +++ b/src/backend/computational_graph/tensor_ops/elementwise_mul_node.h @@ -11,22 +11,14 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" -namespace graph { +namespace cgraph { class ElementwiseMulNode final : public GraphNode { public: explicit ElementwiseMulNode(std::shared_ptr t1, std::shared_ptr t2) : GraphNode({std::move(t1), std::move(t2)}) {} - ElementwiseMulNode(const ElementwiseMulNode& other) = delete; - ElementwiseMulNode& operator=(const ElementwiseMulNode& other) = delete; - - ElementwiseMulNode(ElementwiseMulNode&& other) = default; - ElementwiseMulNode& operator=(ElementwiseMulNode&& other) = default; - - ~ElementwiseMulNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } diff --git a/src/backend/computational_graph/getter_node.cpp b/src/backend/computational_graph/tensor_ops/getter_node.cpp similarity index 80% rename from 
src/backend/computational_graph/getter_node.cpp rename to src/backend/computational_graph/tensor_ops/getter_node.cpp index e1a3ac0..ded3640 100644 --- a/src/backend/computational_graph/getter_node.cpp +++ b/src/backend/computational_graph/tensor_ops/getter_node.cpp @@ -12,7 +12,7 @@ #include "getter_node.h" using namespace std; -using namespace graph; +using namespace cgraph; vector< shared_ptr > GetterNode::backward(const Tensor& upstreamGrad) { // upstreamGrad is scalar by definition @@ -20,14 +20,14 @@ vector< shared_ptr > GetterNode::backward(const Tensor& upstreamGrad) { auto res = make_shared(parents[0]->getDims(), parents[0]->getDevice(), false); for(tensorSize_t i=0; igetSize(); i++){ - res->setItem(0, i); + res->set(0, i); } if(std::holds_alternative(idx)){ - res->setItem(upstreamGrad.getItem(0), std::get(idx)); + res->set(upstreamGrad.get(0), std::get(idx)); } else if(std::holds_alternative(idx)){ - res->setItem(upstreamGrad.getItem(0), std::get(idx)); + res->set(upstreamGrad.get(0), std::get(idx)); } else{ __throw_runtime_error("Idx variant in unexpected state"); diff --git a/src/backend/computational_graph/getter_node.h b/src/backend/computational_graph/tensor_ops/getter_node.h similarity index 75% rename from src/backend/computational_graph/getter_node.h rename to src/backend/computational_graph/tensor_ops/getter_node.h index e55b2d5..c0bdb45 100644 --- a/src/backend/computational_graph/getter_node.h +++ b/src/backend/computational_graph/tensor_ops/getter_node.h @@ -11,12 +11,12 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" #include #include -namespace graph{ +namespace cgraph{ /** * @brief When calling a get function, say as in * loss += myTensor[i], then we need to build a graph in between @@ -36,13 +36,5 @@ namespace graph{ explicit GetterNode(std::shared_ptr t, const multiDimIdx_t& idx) : GraphNode({std::move(t)}), idx{idx} {} - GetterNode(const GetterNode& other) = delete; - GetterNode& operator=(const 
GetterNode& other) = delete; - - GetterNode(GetterNode&& other) = default; - GetterNode& operator=(GetterNode&& other) = default; - - ~GetterNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; };} diff --git a/src/backend/computational_graph/graph_creation.cpp b/src/backend/computational_graph/tensor_ops/graph_creation.cpp similarity index 57% rename from src/backend/computational_graph/graph_creation.cpp rename to src/backend/computational_graph/tensor_ops/graph_creation.cpp index 1955493..1a28a08 100644 --- a/src/backend/computational_graph/graph_creation.cpp +++ b/src/backend/computational_graph/tensor_ops/graph_creation.cpp @@ -19,91 +19,92 @@ using namespace std; -shared_ptr graph::mul(const shared_ptr left, const shared_ptr right) { +shared_ptr cgraph::mul(const shared_ptr left, const shared_ptr right) { auto res = make_shared((*left) * (*right)); if(left->getRequiresGrad() || right->getRequiresGrad()){ - res->setCgNode(make_shared(left, right)); + res->setCgNode(make_shared(left, right)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::add(const shared_ptr left, const shared_ptr right) { +shared_ptr cgraph::add(const shared_ptr left, const shared_ptr right) { auto res = make_shared(*left + *right); if(left->getRequiresGrad() || right->getRequiresGrad()){ - res->setCgNode(make_shared(left, right)); + res->setCgNode(make_shared(left, right)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::matmul(const shared_ptr left, const shared_ptr right) { +shared_ptr cgraph::matmul(const shared_ptr left, const shared_ptr right) { auto res = make_shared(left->matmul(*right)); if(left->getRequiresGrad() || right->getRequiresGrad()){ - res->setCgNode(make_shared(left, right)); + res->setCgNode(make_shared(left, right)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::mul(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::mul(const shared_ptr t, ftype scalar) { auto res = 
make_shared((*t) * scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, scalar)); + res->setCgNode(std::make_shared(t, scalar)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::mul(ftype scalar, const shared_ptr t) { - return graph::mul(t, scalar); +shared_ptr cgraph::mul(ftype scalar, const shared_ptr t) { + return cgraph::mul(t, scalar); } -shared_ptr graph::add(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::add(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) + scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t)); + res->setCgNode(std::make_shared(t)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::add(ftype scalar, const shared_ptr t) { - return graph::add(t, scalar); +shared_ptr cgraph::add(ftype scalar, const shared_ptr t) { + return cgraph::add(t, scalar); } -shared_ptr graph::sub(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::sub(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) - scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t)); + res->setCgNode(std::make_shared(t)); assert(res->getRequiresGrad()); } return res; } -shared_ptr graph::div(const shared_ptr t, ftype scalar) { +shared_ptr cgraph::div(const shared_ptr t, ftype scalar) { auto res = make_shared((*t) / scalar); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, 1 / scalar)); + constexpr ftype eps = 1e-9; + res->setCgNode(std::make_shared(t, 1/std::max(scalar, eps))); assert(res->getRequiresGrad()); } return res; } /** - * @brief Special linear indexing, see getItem() overloads in tensor. + * @brief Special linear indexing, see get() overloads in tensor. * Used to keep the computational graph intact. * E.g. if we have something like * * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph. 
*/ -shared_ptr graph::get(const shared_ptr& t, tensorSize_t idx) { - ftype val = t->getItem(idx); +shared_ptr cgraph::get(const shared_ptr& t, tensorSize_t idx) { + ftype val = t->get(idx); auto res = make_shared(std::vector{1}, std::vector{val}, t->getDevice()); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, idx)); + res->setCgNode(std::make_shared(t, idx)); assert(res->getRequiresGrad()); } return res; @@ -115,12 +116,12 @@ shared_ptr graph::get(const shared_ptr& t, tensorSize_t idx) { * * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph. */ -shared_ptr graph::get(const shared_ptr& t, const vector& idx) { - ftype val = t->getItem(std::move(idx)); +shared_ptr cgraph::get(const shared_ptr& t, const vector& idx) { + ftype val = t->get(std::move(idx)); auto res = make_shared(std::vector{1}, std::vector{val}, t->getDevice()); if(t->getRequiresGrad()){ - res->setCgNode(std::make_shared(t, idx)); + res->setCgNode(std::make_shared(t, idx)); assert(res->getRequiresGrad()); } return res; @@ -129,11 +130,11 @@ shared_ptr graph::get(const shared_ptr& t, const vector graph::sumTensor(const shared_ptr t) { +shared_ptr cgraph::sumTensor(const shared_ptr t) { auto res = make_shared(std::vector{1}, std::vector{0.0}, t->getDevice(), t->getRequiresGrad()); for(tensorSize_t i=0; igetSize(); i++){ - res = graph::add(res, graph::get(t, i)); + res = cgraph::add(res, cgraph::get(t, i)); } return res; } \ No newline at end of file diff --git a/src/backend/computational_graph/graph_creation.h b/src/backend/computational_graph/tensor_ops/graph_creation.h similarity index 98% rename from src/backend/computational_graph/graph_creation.h rename to src/backend/computational_graph/tensor_ops/graph_creation.h index f68cb4c..9decd8b 100644 --- a/src/backend/computational_graph/graph_creation.h +++ b/src/backend/computational_graph/tensor_ops/graph_creation.h @@ -15,7 +15,7 @@ #include -namespace graph { +namespace cgraph { // Artithmetic 
operations std::shared_ptr mul(const std::shared_ptr left, const std::shared_ptr right); std::shared_ptr mul(const std::shared_ptr left, ftype scalar); diff --git a/src/backend/computational_graph/matmul_node.cpp b/src/backend/computational_graph/tensor_ops/matmul_node.cpp similarity index 95% rename from src/backend/computational_graph/matmul_node.cpp rename to src/backend/computational_graph/tensor_ops/matmul_node.cpp index fc24fd4..2237026 100644 --- a/src/backend/computational_graph/matmul_node.cpp +++ b/src/backend/computational_graph/tensor_ops/matmul_node.cpp @@ -12,7 +12,7 @@ #include "matmul_node.h" using namespace std; -using namespace graph; +using namespace cgraph; vector> MatMulNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); diff --git a/src/backend/computational_graph/matmul_node.h b/src/backend/computational_graph/tensor_ops/matmul_node.h similarity index 62% rename from src/backend/computational_graph/matmul_node.h rename to src/backend/computational_graph/tensor_ops/matmul_node.h index 7fa94bf..6758602 100644 --- a/src/backend/computational_graph/matmul_node.h +++ b/src/backend/computational_graph/tensor_ops/matmul_node.h @@ -11,24 +11,16 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" #include -namespace graph { +namespace cgraph { class MatMulNode final : public GraphNode { public: explicit MatMulNode(std::shared_ptr t1, std::shared_ptr t2) : GraphNode({std::move(t1), std::move(t2)}) {} - MatMulNode(const MatMulNode& other) = delete; - MatMulNode& operator=(const MatMulNode& other) = delete; - - MatMulNode(MatMulNode&& other) = default; - MatMulNode& operator=(MatMulNode&& other) = default; - - ~MatMulNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } diff --git a/src/backend/computational_graph/scalar_op_nodes.cpp b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp similarity index 69% rename from 
src/backend/computational_graph/scalar_op_nodes.cpp rename to src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp index 05a3643..e0f52f0 100644 --- a/src/backend/computational_graph/scalar_op_nodes.cpp +++ b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.cpp @@ -14,19 +14,19 @@ #include using namespace std; -using namespace graph; +using namespace cgraph; -vector> graph::ScalarAddNode::backward(const Tensor& upstreamGrad) { +vector> cgraph::ScalarAddNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); return {make_shared(upstreamGrad.createDeepCopy())}; } -vector> graph::ScalarMulNode::backward(const Tensor& upstreamGrad) { +vector> cgraph::ScalarMulNode::backward(const Tensor& upstreamGrad) { assert(!upstreamGrad.getRequiresGrad()); auto res = make_shared(upstreamGrad.createDeepCopy()); for(tensorSize_t i=0; igetSize(); i++){ - res->setItem(res->getItem(i) * factor, i); + res->set(res->get(i) * factor, i); } return {std::move(res)}; } \ No newline at end of file diff --git a/src/backend/computational_graph/scalar_op_nodes.h b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h similarity index 76% rename from src/backend/computational_graph/scalar_op_nodes.h rename to src/backend/computational_graph/tensor_ops/scalar_op_nodes.h index 5a6588e..13cb067 100644 --- a/src/backend/computational_graph/scalar_op_nodes.h +++ b/src/backend/computational_graph/tensor_ops/scalar_op_nodes.h @@ -11,9 +11,9 @@ #pragma once -#include "graph_node.h" +#include "computational_graph/graph_node.h" -namespace graph { +namespace cgraph { class ScalarAddNode final : public GraphNode { public: explicit ScalarAddNode(std::shared_ptr t) @@ -38,14 +38,6 @@ namespace graph { explicit ScalarMulNode(std::shared_ptr t, ftype factor) : GraphNode({std::move(t)}), factor{factor} {} - ScalarMulNode(const ScalarMulNode& other) = delete; - ScalarMulNode& operator=(const ScalarMulNode& other) = delete; - - ScalarMulNode(ScalarMulNode&& 
other) = default; - ScalarMulNode& operator=(ScalarMulNode&& other) = default; - - ~ScalarMulNode() noexcept = default; - std::vector> backward(const Tensor& upstreamGrad) override; }; } \ No newline at end of file diff --git a/src/backend/computational_graph/topological_sort.cpp b/src/backend/computational_graph/topological_sort.cpp index d6250ca..6a4266d 100644 --- a/src/backend/computational_graph/topological_sort.cpp +++ b/src/backend/computational_graph/topological_sort.cpp @@ -20,7 +20,7 @@ #include using namespace std; -using namespace graph; +using namespace cgraph; #ifndef NDEBUG /** @@ -133,7 +133,7 @@ vector< Tensor* > TopologicalSort::reverseSort(Tensor* root) { auto pushParentsWithGraphNode = [&nodeQueue, &edgeCounts](Tensor* t){ const auto& parents = t->cgNode->getParents(); - for(const auto& parent: parents){ // TODO: check for requiresGrad to save runtime? + for(const auto& parent: parents){ if(!parent->cgNode) continue; @@ -146,12 +146,15 @@ vector< Tensor* > TopologicalSort::reverseSort(Tensor* root) { }; // pass 2: topological sort based on Kahn's algorithm - vector< Tensor* > res; // TODO: reserve capacity to save runtime? + vector< Tensor* > res; + res.reserve(nodeQueue.size()); + nodeQueue.push(root); while(!nodeQueue.empty()){ auto tensorPtr = nodeQueue.front(); nodeQueue.pop(); - + assert(tensorPtr->cgNode); + if(edgeCounts[tensorPtr]==0){ pushParentsWithGraphNode(tensorPtr); res.push_back(tensorPtr); diff --git a/src/backend/computational_graph/topological_sort.h b/src/backend/computational_graph/topological_sort.h index 96bba77..5c60ed5 100644 --- a/src/backend/computational_graph/topological_sort.h +++ b/src/backend/computational_graph/topological_sort.h @@ -16,7 +16,7 @@ class Tensor; // to break circular dependency -namespace graph { +namespace cgraph { /** * @brief Topological sort class. 
* diff --git a/src/backend/data_modeling/device.cpp b/src/backend/data_modeling/device.cpp new file mode 100644 index 0000000..a7726ae --- /dev/null +++ b/src/backend/data_modeling/device.cpp @@ -0,0 +1,26 @@ +/** + * @file device.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "device.h" + +#include + +const char* DeviceToString(Device d) { + switch(d){ + case Device::CPU: + return "CPU"; + case Device::CUDA: + return "CUDA"; + } + + std::__throw_invalid_argument("Unknown device encountered"); + return ""; // suppress +} \ No newline at end of file diff --git a/src/backend/data_modeling/device.h b/src/backend/data_modeling/device.h new file mode 100644 index 0000000..65f83a5 --- /dev/null +++ b/src/backend/data_modeling/device.h @@ -0,0 +1,19 @@ +/** + * @file device.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +enum class Device { + CPU, + CUDA +}; + +const char* DeviceToString(Device d); \ No newline at end of file diff --git a/src/backend/data_modeling/dim_type.cpp b/src/backend/data_modeling/dim_type.cpp index af77c24..f7d1804 100644 --- a/src/backend/data_modeling/dim_type.cpp +++ b/src/backend/data_modeling/dim_type.cpp @@ -13,6 +13,7 @@ #include "utility/safe_arithmetics.h" #include +#include using namespace std; @@ -38,10 +39,7 @@ tensorDim_t Dimension::multVector(const std::vector& dims) const no void Dimension::resize(const std::vector& dims) { this->dims = dims; size = multVector(dims); - - if(size==0){ - __throw_invalid_argument("Tensor-Dims must all be greater than 0."); - } + assert(size>0); } /** @@ -55,10 +53,7 @@ void Dimension::swap(const tensorDim_t dim1, const tensorDim_t dim2) { Dimension::Dimension(const vector& dims) : dims{dims} { size = multVector(dims); - - if(size==0){ - 
__throw_invalid_argument("Tensor-Dims must all be greater than 0."); - } + assert(size>0); } Dimension::Dimension(const Dimension& other) : dims{other.dims}, size{other.size} { } @@ -82,15 +77,42 @@ Dimension& Dimension::operator=(Dimension&& other) noexcept { return *this; } -ostream& operator<<(ostream& os, const Dimension& d) noexcept { - os << "("; - for(int i=0; i newDims; + newDims.reserve(dims.size() - 1); + newDims.insert(newDims.end(), dims.begin(), dims.begin() + idx); + newDims.insert(newDims.end(), dims.begin() + idx + 1, dims.end()); + + return Dimension(newDims); +} + +ostream& operator<<(ostream& os, const Dimension& d) noexcept { + if(d.size>0){ + os << "\n("; + for(int i=0; i #include -template -concept is_valid_dim = requires(T x) { - requires std::is_integral_v>; - requires std::convertible_to, tensorDim_t>; - x >= 0; -}; - class Dimension final { private: std::vector dims; @@ -47,15 +40,20 @@ class Dimension final { ~Dimension() noexcept = default; + Dimension collapseDimension(int idx) const; + void resize(const std::vector& dims); - + tensorSize_t getSize() const noexcept { - assert(size!=0); return size; } - tensorDim_t getItem(int idx) const { - assert(size!=0); + tensorDim_t get(int idx) const { + return (*this)[idx]; + } + + tensorDim_t operator[](int idx) const { + assert(size>0); if(idx<0){ idx = dims.size() + idx; // -1 is last idx, -2 second last and so forth } @@ -70,7 +68,6 @@ class Dimension final { void swap(const tensorDim_t dim1, const tensorDim_t dim2); size_t nDims() const noexcept { - assert(size!=0); return dims.size(); } @@ -80,7 +77,6 @@ class Dimension final { } bool operator==(const std::vector& other) const { - assert(size!=0); return this->dims == other; } @@ -92,5 +88,5 @@ class Dimension final { return !(*this == other); } - friend std::ostream& operator<<(std::ostream& os, const Dimension& d) noexcept; + friend std::ostream& operator<<(std::ostream& os, const Dimension& d) noexcept; }; \ No newline at end of file 
diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp index 04a94aa..34ec548 100644 --- a/src/backend/data_modeling/tensor.cpp +++ b/src/backend/data_modeling/tensor.cpp @@ -60,18 +60,63 @@ Tensor::tensorValues_t::~tensorValues_t() noexcept { * do not create a deepcopy, but construct another pointer pointing to the same piece * of memory. */ -void Tensor::tensorValues_t::copyValues(Tensor::tensorValues_t& target, - const Tensor::tensorValues_t& origin) { - assert(origin.device==target.device && origin.size==target.size); +void Tensor::tensorValues_t::copyValues(Tensor::tensorValues_t& target) const { + assert(device==target.device && size==target.size); - switch(origin.device){ + switch(device){ case Device::CPU: - for(tensorSize_t i=0; i= high - low); + + switch(device){ + case Device::CPU: + for(tensorSize_t i=0; i indices, + const tensorSize_t sizeOfDim) const { + assert(target.size >= sizeOfDim * indices.size()); + + switch(device){ + case Device::CPU: { + tensorSize_t targetOffset = 0; + for(tensorDim_t idx: indices){ + tensorSize_t thisOffset = idx * sizeOfDim; + copyValues(target, thisOffset, thisOffset+sizeOfDim, targetOffset); + targetOffset += sizeOfDim; + } + break; + } + case Device::CUDA: + __throw_runtime_error("CUDA not implemented for slicing"); + break; } } @@ -116,6 +161,7 @@ Tensor::tensorValues_t::operator+=(const Tensor::tensorValues_t& other) { break; case Device::CUDA: __throw_invalid_argument("CUDA not supported yet for += operation"); + break; } return *this; @@ -141,7 +187,7 @@ ftype Tensor::tensorValues_t::operator[](const tensorSize_t idx) const { return values[idx]; } -void Tensor::tensorValues_t::setItem(ftype v, tensorSize_t idx) { +void Tensor::tensorValues_t::set(ftype v, tensorSize_t idx) { if(idx >= size) throw std::out_of_range("Out of range for tensor"); @@ -156,7 +202,7 @@ void Tensor::tensorValues_t::setItem(ftype v, tensorSize_t idx) { __throw_runtime_error("Should never reach here."); } 
-ftype Tensor::tensorValues_t::getItem(tensorSize_t idx) { +ftype Tensor::tensorValues_t::get(tensorSize_t idx) { if(idx >= size) throw std::out_of_range("Out of range for tensor"); @@ -206,13 +252,13 @@ Tensor Tensor::createEmptyCopy() const { return res; } /** - * @brief Does a deep copy. + * @brief Does a deep copy, but omits gradient and computational graph information. */ Tensor Tensor::createDeepCopy() const { assert(!grads || (grads && !grads->requiresGrad)); // gradient should not require gradient auto res = Tensor(dims, values->getDevice(), requiresGrad); - tensorValues_t::copyValues(*res.values, *this->values); + values->copyValues(*res.values); /* if(grads){ res.grads = make_shared( grads->createDeepCopy() ); // TODO: do we want this? @@ -247,98 +293,64 @@ Tensor Tensor::multiplyScalar(const Tensor& scalar, const Tensor& right) noexcep * The check of whether they do or not is to be performed by the surrounding * network class object instance upon construction. */ -Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) const { - if(left.dims.getItem(-1) != right.dims.getItem(-2)){ +Tensor Tensor::matMulImpl(const Tensor& left, const Tensor& right) { + if(left.dims.get(-1) != right.dims.get(-2)){ __throw_runtime_error("Tensor dimensions do not match"); } - if(abs(static_cast(right.dims.nDims()) - static_cast(left.dims.nDims())) > 1){ - auto str = "Tensor dimension assumptions violated. See file 'assumption_matrices.md'."; - __throw_invalid_argument(str); - } - + // broadcasting auto resDims = left.dims.nDims() > right.dims.nDims() ? 
left.dims.toVector() : right.dims.toVector(); - resDims[resDims.size()-2] = left.dims.getItem(-2); // rows - resDims[resDims.size()-1] = right.dims.getItem(-1); // cols + resDims[resDims.size()-2] = left.dims.get(-2); // rows + resDims[resDims.size()-1] = right.dims.get(-1); // cols - Tensor res(resDims, values->getDevice(), false); + Tensor res(resDims, left.values->getDevice(), false); // sizes of the 2D matrices respectively - const tensorSize_t leftSize = left.dims.getItem(-1) * left.dims.getItem(-2); - const tensorSize_t rightSize = right.dims.getItem(-1) * right.dims.getItem(-2); - const tensorSize_t resSize = left.dims.getItem(-2) * right.dims.getItem(-1); + const tensorSize_t leftSize = left.dims.get(-1) * left.dims.get(-2); + const tensorSize_t rightSize = right.dims.get(-1) * right.dims.get(-2); + const tensorSize_t resSize = left.dims.get(-2) * right.dims.get(-1); tensorSize_t leftOffset = 0; tensorSize_t rightOffset = 0; tensorSize_t resOffset = 0; - // lambda expected to get inlined by compiler - auto multiplyNTimes = [&](const tensorDim_t n){ - for(tensorDim_t i=0; igetSize() / resSize; // total size / size of 2D matrix - multiplyNTimes(nMultiplications); - } - else if(left.dims.nDims() > right.dims.nDims()) { - const auto nBatches = left.dims.getItem(0); - - for(tensorDim_t batch = 0; batch < nBatches; batch++){ - const auto nMultsPerBatch = res.values->getSize() / (nBatches * resSize); - multiplyNTimes(nMultsPerBatch); - rightOffset = 0; - } - } - else { - const auto nBatches = right.dims.getItem(0); + while(leftOffset < left.getSize()){ + matMul2DCpu(res, left, right, resOffset, leftOffset, rightOffset); - for(tensorDim_t batch = 0; batch < nBatches; batch++){ - const auto nMultsPerBatch = res.values->getSize() / (nBatches * resSize); - multiplyNTimes(nMultsPerBatch); - leftOffset = 0; - } + leftOffset += leftSize; + rightOffset += rightSize; + resOffset += resSize; } return res; } /** - * @brief Name says it all. 
Inplace operation on res + * @brief Name says it all. Inplace operation on res. */ void Tensor::matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, const tensorSize_t resOffset, const tensorSize_t leftOffset, const tensorSize_t rightOffset) { + + const auto nRowsLeft = static_cast(left.dims.get(-2)); + const auto nColsLeft = static_cast(left.dims.get(-1)); + const auto nRowsRight = static_cast(right.dims.get(-2)); + const auto nColsRight = static_cast(right.dims.get(-1)); - const auto nRowsLeft = static_cast(left.dims.getItem(-2)); - const auto nColsLeft = static_cast(left.dims.getItem(-1)); - const auto nRowsRight = static_cast(right.dims.getItem(-2)); - const auto nColsRight = static_cast(right.dims.getItem(-1)); + tensorSize_t resIdx = resOffset; + for(tensorSize_t lrow=0; lrowgetDevice()==other.values->getDevice()); if(values->getDevice()==Device::CUDA){ __throw_invalid_argument("Multiplication not implemented on CUDA"); } @@ -356,36 +369,45 @@ Tensor Tensor::matmul(const Tensor& other) const { __throw_runtime_error("Tensors on different devices."); } - // TODO: check what to do about these two gradients and if you want broadcasting here at all - if(other.dims.getSize()==1){ - return multiplyScalar(other, *this); - } - else if(dims.getSize()==1){ - return multiplyScalar(*this, other); - } - return matMulImpl(*this, other); } /** - * @brief Elementise addition. + * @brief Addition of two tensors. This works in two ways: + * 1. Shapes of the two tensors are identical. In this case it is simple + * elementwise addition. + * 2. The second tensor is a vector. In this case broadcast it. We assume + * other.dims == (dimN) && this->dims == (dim0, dim1,..., dimN). 
*/ Tensor Tensor::operator+(const Tensor& other) const { if(values->getDevice()==Device::CUDA){ - __throw_invalid_argument("Multiplication not implemented on CUDA"); + __throw_invalid_argument("Addition not implemented on CUDA"); } - if(this->dims != other.dims){ - __throw_invalid_argument("Tensors need same dimensions"); + if(this->dims != other.dims && + !(other.dims.nDims() == 1 && other.dims.get(0) == dims.get(-1))){ + __throw_invalid_argument("Tensors need matching dimensions"); } else if(values->getDevice()!=other.values->getDevice()){ __throw_runtime_error("Tensors on different devices."); } - assert(values->getSize()==other.values->getSize()); - Tensor res(dims, values->getDevice(), false); - for(tensorSize_t i=0; igetSize(); i++){ - (*res.values)[i] = (*values)[i] + (*other.values)[i]; + Tensor res(dims, values->getDevice()); + + if(dims==other.dims){ + // elementwise add + for(tensorSize_t i=0; igetSize(); i++){ + (*res.values)[i] = (*values)[i] + (*other.values)[i]; + } + } + else { [[likely]] + // broadcasted add + const auto stride = static_cast(other.dims.get(0)); + for(tensorSize_t offset=0; offsetgetSize(); offset+=stride){ + for(tensorSize_t i=0; igetDevice()==other.values->getDevice()); if(values->getDevice()==Device::CUDA){ __throw_invalid_argument("Multiplication not implemented on CUDA"); } // TODO: check what to do about these two gradients and if you want broadcasting here at all - if(other.dims.getSize()==1){ +/* if(other.dims.getSize()==1){ return multiplyScalar(other, *this); } else if(dims.getSize()==1){ return multiplyScalar(*this, other); - } + } */ if(this->dims != other.dims){ __throw_invalid_argument("Tensors need same dimensions"); @@ -501,26 +524,25 @@ void Tensor::backward() { } } - vector sortedTensors = graph::TopologicalSort::reverseSort(this); + vector sortedTensors = cgraph::TopologicalSort::reverseSort(this); for(auto tPtr: sortedTensors){ auto& tensor = *tPtr; assert(tensor.grads && !tensor.grads->requiresGrad); // gradient 
should not require grad - if(tensor.cgNode){ - auto incomingGrads = tensor.cgNode->backward(*tensor.grads); - const auto& parents = tensor.cgNode->getParents(); + auto incomingGrads = tensor.cgNode->backward(*tensor.grads); - for(size_t i=0; irequiresGrad){ - continue; - } - else if(!parent->grads){ - parent->grads = incomingGrads[i]; - } - else{ - *parent->grads->values += *incomingGrads[i]->values; - } + const auto& parents = tensor.cgNode->getParents(); + + for(size_t i=0; irequiresGrad){ + continue; + } + else if(!parent->grads){ + parent->grads = incomingGrads[i]; + } + else{ + *parent->grads->values += *incomingGrads[i]->values; } } } @@ -529,7 +551,7 @@ void Tensor::backward() { /** * @brief Get gradients */ -shared_ptr Tensor::getGrads() const { +shared_ptr Tensor::getGrads() const { if(!grads){ __throw_runtime_error("Tensor has no gradients."); } @@ -574,7 +596,7 @@ void Tensor::transposeImpl(Tensor& target, const int dim1, const int dim2) const // strides for source tensorSize_t stride = 1; for(int d = numDims - 1; d >= 0; d--) { - dimSizes[d] = source.dims.getItem(d); + dimSizes[d] = source.dims.get(d); sourceStrides[d] = stride; stride *= dimSizes[d]; } @@ -647,8 +669,8 @@ void Tensor::transposeImpl2D(Tensor& target, const int dim1, const int dim2) con transposedValues->resize(source.values->getSize()); tensorSize_t resIdx = 0; - for(tensorSize_t smallDimCount=0; smallDimCount&& newOrder) noexcept { /** * @brief Populates the tensor with value. */ -void Tensor::reset(const ftype x) { +void Tensor::reset(const ftype x) noexcept { for(tensorSize_t i=0; igetSize(); i++){ (*values)[i] = x; } @@ -745,8 +767,7 @@ void Tensor::reset(const ftype x) { /** * @brief Populates the tensor with values drawn according to initializer. 
*/ -void Tensor::reset(const utility::InitClass ic) { - const auto init = utility::InitializerFactory::getInitializer(ic); +void Tensor::reset(const shared_ptr init) noexcept { for(tensorSize_t i=0; igetSize(); i++){ (*values)[i] = init->drawNumber(); } @@ -776,6 +797,46 @@ Device Tensor::getDevice() const noexcept { return values->getDevice(); } +/** + * @brief Gets a slice of this tensor. + * + * Quick and dirty implementation for now: Copies and + * returns. + * + * @param low Lower idx, inclusive bound. + * @param high Upper idx, non-inclusive bound. + * @return Tensor The slices tensor. + */ +Tensor Tensor::getSlice(tensorSize_t low, tensorSize_t high) const { + if(high<=low){ + __throw_invalid_argument("Upper bound most be larger than lower bound."); + } + + auto resDims = dims.toVector(); + resDims[0] = high-low; + Tensor res(std::move(resDims), values->getDevice(), false); + values->copyValues(*res.values, low, high, 0); + return res; +} + +/** + * @brief Like overload, but gets the slicing according to the + * indices given by the argument. Used e.g. in batch-size. + * + * @param indices A list of indices + * @return Tensor The result. + */ +Tensor Tensor::getSlice(span indices) const { + assert(indices.size()>0); + + auto resDims = dims.toVector(); + resDims[0] = indices.size(); + + Tensor res(std::move(resDims), values->getDevice(), false); + values->copyValues(*res.values, indices, getDimOffset(0, resDims)); + return res; +} + /** * @brief Prints only sample of up to 2D tensors. 
*/ @@ -784,9 +845,9 @@ void printValuesCpu(std::ostream& os, const Tensor& t) { constexpr auto MAX_IDX = static_cast(10); if(t.dims.nDims()==2){ - for(tensorDim_t i=0; i& idx, const } const auto lastIdx = idx.size()-1; - tensorSize_t offsetFactor = dims.getItem(lastIdx); + tensorSize_t offsetFactor = dims.get(lastIdx); tensorSize_t res = idx[lastIdx]; for(int i=lastIdx-1; i>=0; i--){ res += idx[i] * offsetFactor; - offsetFactor *= dims.getItem(i); + offsetFactor *= dims.get(i); } return res; @@ -864,7 +925,7 @@ tensorSize_t Tensor::getDimOffset(const tensorDim_t dim, const Dimension& dims) tensorSize_t res = 1; // minimum possible dimsize for(size_t idx = dims.nDims()-1; idx>dim; idx--){ - res *= dims.getItem(idx); + res *= dims.get(idx); } assert(res!=0); @@ -881,7 +942,7 @@ tensorSize_t Tensor::getDimOffset(const int dim, const Dimension& dims) { /** * @brief No explanation needed. */ -ftype Tensor::getItem(const std::vector& idx) const { +ftype Tensor::get(const std::vector& idx) const { return (*values)[computeLinearIdx(idx, dims)]; } @@ -889,26 +950,34 @@ ftype Tensor::getItem(const std::vector& idx) const { * @brief Special getter, indexes the contained underlying array linearly. * Can lead to unexpected results in multidimensional tensors. */ -ftype Tensor::getItem(tensorSize_t idx) const { +ftype Tensor::get(tensorSize_t idx) const { + return (*this)[idx]; +} + +/** + * @brief For convenience. 
+ */ +ftype Tensor::operator[](tensorSize_t idx) const { return (*values)[idx]; } -ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1) const { - return getItem({idx0, idx1}); + +ftype Tensor::get(tensorDim_t idx0, tensorDim_t idx1) const { + return get({idx0, idx1}); } -ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const { - return getItem({idx0, idx1, idx2}); +ftype Tensor::get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const { + return get({idx0, idx1, idx2}); } -ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const { - return getItem({idx0, idx1, idx2, idx3}); +ftype Tensor::get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const { + return get({idx0, idx1, idx2, idx3}); } /** * @brief No explanation needed. */ -void Tensor::setItem(ftype item, const std::vector& idx) { +void Tensor::set(ftype item, const std::vector& idx) { (*values)[computeLinearIdx(idx, dims)] = item; } @@ -916,18 +985,18 @@ void Tensor::setItem(ftype item, const std::vector& idx) { * @brief Special setter, indexes the contained underlying array linearly. * Can lead to unexpected results in multidimensional tensors. 
*/ -void Tensor::setItem(ftype item, tensorDim_t idx) { +void Tensor::set(ftype item, tensorDim_t idx) { (*values)[idx] = item; } -void Tensor::setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1) { - setItem(item, {idx0, idx1}); +void Tensor::set(ftype item, tensorDim_t idx0, tensorDim_t idx1) { + set(item, {idx0, idx1}); } -void Tensor::setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) { - setItem(item, {idx0, idx1, idx2}); +void Tensor::set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) { + set(item, {idx0, idx1, idx2}); } -void Tensor::setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) { - setItem(item, {idx0, idx1, idx2, idx3}); +void Tensor::set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) { + set(item, {idx0, idx1, idx2, idx3}); } \ No newline at end of file diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h index 9c1384b..d8a59fc 100644 --- a/src/backend/data_modeling/tensor.h +++ b/src/backend/data_modeling/tensor.h @@ -12,6 +12,7 @@ #pragma once #include "dim_type.h" +#include "device.h" #include "computational_graph/topological_sort.h" #include "computational_graph/graph_node.h" @@ -19,38 +20,22 @@ #include "utility/initializers.h" #include -#include +#include #include #include +#include #include // break circular dependency -namespace graph { +namespace cgraph { class GraphNode; class TopologicalSort; } -enum class Device { - CPU, - CUDA -}; - -constexpr const char* DeviceToString(Device d) { - switch(d){ - case Device::CPU: - return "CPU"; - case Device::CUDA: - return "CUDA"; - } - - std::__throw_invalid_argument("Unknown device encountered"); - return ""; // suppress -} - class Tensor final : public std::enable_shared_from_this { - friend class graph::TopologicalSort; + friend class cgraph::TopologicalSort; private: /** @@ -85,8 +70,8 @@ class Tensor final : public std::enable_shared_from_this 
{ ftype& operator[](const tensorSize_t idx); ftype operator[](const tensorSize_t idx) const; - void setItem(ftype v, tensorSize_t idx); - ftype getItem(tensorSize_t idx); + void set(ftype v, tensorSize_t idx); + ftype get(tensorSize_t idx); tensorSize_t getSize() const noexcept; @@ -110,7 +95,9 @@ class Tensor final : public std::enable_shared_from_this { void setDevice(const Device d) noexcept; Device getDevice() const noexcept; - static void copyValues(tensorValues_t& target, const tensorValues_t& origin); + void copyValues(tensorValues_t& target) const; + void copyValues(tensorValues_t& target, tensorSize_t low, tensorSize_t high, tensorSize_t targetOffset) const; + void copyValues(tensorValues_t& target, std::span indices, const tensorSize_t sizeOfDim) const; static void setDefaultDevice(const Device d) noexcept; static Device getDefaultDevice() noexcept; @@ -121,13 +108,15 @@ class Tensor final : public std::enable_shared_from_this { bool requiresGrad = false; std::shared_ptr grads = nullptr; // gradients - std::shared_ptr cgNode = nullptr; + std::shared_ptr cgNode = nullptr; static Tensor multiplyScalar(const Tensor& scalar, const Tensor& other) noexcept; - static void matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, const tensorSize_t resOffset, - const tensorSize_t leftOffset, const tensorSize_t rightOffset); - Tensor matMulImpl(const Tensor& left, const Tensor& right) const; + static Tensor matMulImpl(const Tensor& left, const Tensor& right); + static void matMul2DCpu(Tensor& res, const Tensor& left, const Tensor& right, + const tensorSize_t resOffset, const tensorSize_t leftOffset, + const tensorSize_t rightOffset); + void transposeImpl2D(Tensor& target, const int dim1, const int dim2) const noexcept; void transposeImpl(Tensor& target, const int dim1, const int dim2) const noexcept; @@ -165,7 +154,7 @@ class Tensor final : public std::enable_shared_from_this { explicit Tensor(const std::vector& dims, const std::vector& initValues, Device 
d, bool requiresGrad=false) : Tensor{dims, d, requiresGrad} { for(tensorSize_t i=0; isetItem(initValues[i], i); + values->set(initValues[i], i); } } @@ -187,8 +176,8 @@ class Tensor final : public std::enable_shared_from_this { Tensor(Tensor&& other) noexcept; Tensor& operator=(Tensor&& other) noexcept; - void reset(const ftype x); - void reset(const utility::InitClass ic); + void reset(const ftype x) noexcept; + void reset(const std::shared_ptr init) noexcept; const Dimension& getDims() const noexcept; tensorSize_t getSize() const noexcept; @@ -217,9 +206,12 @@ class Tensor final : public std::enable_shared_from_this { friend Tensor operator+(ftype scalar, const Tensor& tensor); void backward(); - + + std::shared_ptr getGrads() const; + void setGrads(std::shared_ptr grads) noexcept { + this->grads = std::move(grads); + } bool hasGrads() const noexcept { return grads!=nullptr; } - std::shared_ptr getGrads() const; void transposeThis() noexcept; void transposeThis(int dim1, int dim2) noexcept; @@ -232,35 +224,30 @@ class Tensor final : public std::enable_shared_from_this { friend std::ostream& operator<<(std::ostream& os, const Tensor& t) noexcept; // for convenience we provide some simple getters - ftype getItem(tensorSize_t idx) const; - ftype getItem(tensorDim_t idx0, tensorDim_t idx1) const; - ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const; - ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const; + ftype get(tensorSize_t idx) const; + ftype get(tensorDim_t idx0, tensorDim_t idx1) const; + ftype get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const; + ftype get(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const; - ftype getItem(const std::vector& idx) const; + // non-const version of operator[] does not exist because of CUDA + ftype operator[](tensorSize_t idx) const; + + ftype get(const std::vector& idx) const; // for convenience we provide some simple setters - 
void setItem(ftype item, tensorDim_t idx); - void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1); - void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2); - void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3); - void setItem(ftype item, const std::vector& idx); + void set(ftype item, tensorDim_t idx); + void set(ftype item, tensorDim_t idx0, tensorDim_t idx1); + void set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2); + void set(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3); + void set(ftype item, const std::vector& idx); void setDevice(const Device d) noexcept; Device getDevice() const noexcept; bool getRequiresGrad() const noexcept { return requiresGrad; } - void setRequiresGrad(const bool requiresGrad) noexcept { - this->requiresGrad=requiresGrad; - if(!requiresGrad && cgNode){ - cgNode = nullptr; - } - if(!requiresGrad && grads){ - grads = nullptr; - } - } + void setRequiresGrad(const bool requiresGrad) noexcept { this->requiresGrad=requiresGrad; } - void setCgNode(std::shared_ptr node) noexcept { + void setCgNode(std::shared_ptr node) noexcept { cgNode = std::move(node); requiresGrad = true; } @@ -276,6 +263,9 @@ class Tensor final : public std::enable_shared_from_this { } } + Tensor getSlice(tensorSize_t low, tensorSize_t high) const; + Tensor getSlice(std::span indices) const; + // these two should not be exposed to the python interface static void setDefaultDevice(const Device d) noexcept; static Device getDefaultDevice() noexcept; diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp index 3ac032e..5f06d2d 100644 --- a/src/backend/data_modeling/tensor_functions.cpp +++ b/src/backend/data_modeling/tensor_functions.cpp @@ -33,27 +33,29 @@ Tensor TensorFunctions::Ones(vector dims, const bool requiresGrad) return Ones(std::move(dims), Tensor::getDefaultDevice(), 
requiresGrad); } -Tensor TensorFunctions::Gaussian(vector dims, Device d, const bool requiresGrad) { +Tensor TensorFunctions::Gaussian(vector dims, const Device d, + const ftype stddev, const bool requiresGrad) { auto res = Tensor(std::move(dims), d, requiresGrad); - res.reset(utility::InitClass::Gaussian); + res.reset(std::make_shared(stddev)); return res; } -Tensor TensorFunctions::Gaussian(vector dims, const bool requiresGrad) { - return Gaussian(std::move(dims), Tensor::getDefaultDevice(), requiresGrad); +Tensor TensorFunctions::Gaussian(vector dims, const ftype stddev, + const bool requiresGrad) { + return Gaussian(std::move(dims), Tensor::getDefaultDevice(), stddev, requiresGrad); } // Tensor manipulation -void TensorFunctions::ToZeros(Tensor& t) { +void TensorFunctions::ToZeros(Tensor& t) noexcept { t.reset(0); } -void TensorFunctions::ToOnes(Tensor& t) { +void TensorFunctions::ToOnes(Tensor& t) noexcept { t.reset(1); } -void TensorFunctions::ToGaussian(Tensor& t) { - t.reset(utility::InitClass::Gaussian); +void TensorFunctions::ToGaussian(Tensor& t, const ftype stddev) { + t.reset(std::make_shared(stddev)); } shared_ptr TensorFunctions::makeSharedTensor(const vector& dims, bool requiresGrad){ @@ -75,4 +77,38 @@ shared_ptr TensorFunctions::makeSharedTensor(const vector& Device d, bool requiresGrad){ return make_shared(dims, initValues, d, requiresGrad); +} + +/************************************************************************************ + ************************************ Arithmetics *********************************** + ***********************************************************************************/ + + /** + * @brief Sums over the dimensions. If input is (b-size, dim1, dim2), and + * input dim-parameter is 1, then output will be (b-size, dim2). If + * input dim-parameter is 0, then output will be (dim1, dim2). 
+ * Input dim must be smaller then t.dims.nDims()-1 + */ +Tensor TensorFunctions::SumOverDims(const Tensor& t, tensorDim_t dim) { + if(dim>=t.getDims().nDims()-1){ + __throw_invalid_argument("Dim parameter must be smaller than number of dims, but was " + dim); + } + + auto resDims = t.getDims().collapseDimension(dim); + Tensor res = Zeros(resDims.toVector(), t.getDevice(), t.getRequiresGrad()); // inefficiency toVector + + tensorSize_t stride = 1; + for(tensorDim_t i=dim+1; i dims, Device d, const bool requiresGrad=false); Tensor Ones(std::vector dims, const bool requiresGrad=false); - Tensor Gaussian(std::vector dims, Device d, const bool requiresGrad=false); - Tensor Gaussian(std::vector dims, const bool requiresGrad=false); + Tensor Gaussian(std::vector dims, Device d, ftype stddev, const bool requiresGrad=false); + Tensor Gaussian(std::vector dims, ftype stddev=1, const bool requiresGrad=false); std::shared_ptr makeSharedTensor(const std::vector& dims, bool requiresGrad=false); @@ -48,7 +48,10 @@ namespace TensorFunctions { // class name acts as namespace for us Device d, bool requiresGrad=false); // Tensor manipulation - void ToZeros(Tensor& t); - void ToOnes(Tensor& t); - void ToGaussian(Tensor& t); + void ToZeros(Tensor& t) noexcept; + void ToOnes(Tensor& t) noexcept; + void ToGaussian(Tensor& t, ftype stddev); + + // Arithmetics + Tensor SumOverDims(const Tensor& t, tensorDim_t dim=0); // default 0 for batch-size } \ No newline at end of file diff --git a/src/backend/layers/activation_functions/activation_function_base.cpp b/src/backend/layers/activation_functions/activation_function_base.cpp deleted file mode 100644 index 8e482c3..0000000 --- a/src/backend/layers/activation_functions/activation_function_base.cpp +++ /dev/null @@ -1,18 +0,0 @@ -/** - * @file activation_function_base.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-02 - * - * @copyright Copyright (c) 2026 - * - */ - -#include 
"activation_function_base.h" - -using namespace activation; - -Tensor ActivationFunctionBase::forward(Tensor& t) const noexcept { - return (*this)(t); -} \ No newline at end of file diff --git a/src/backend/layers/activation_functions/activation_function_base.h b/src/backend/layers/activation_functions/activation_function_base.h deleted file mode 100644 index b0370c6..0000000 --- a/src/backend/layers/activation_functions/activation_function_base.h +++ /dev/null @@ -1,26 +0,0 @@ -/** - * @file function_base.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "tensor.h" - -#include - -namespace activation { - class ActivationFunctionBase { - public: - virtual Tensor operator()(Tensor& t) const noexcept = 0; - Tensor forward(Tensor& t) const noexcept; - - virtual Tensor gradient(const Tensor& t) noexcept = 0; - }; -} diff --git a/src/backend/layers/activation_functions/leaky_relu.h b/src/backend/layers/activation_functions/leaky_relu.h deleted file mode 100644 index fdcacc4..0000000 --- a/src/backend/layers/activation_functions/leaky_relu.h +++ /dev/null @@ -1,23 +0,0 @@ -/** - * @file leaky_relu.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "activation_function_base.h" -#include "tensor.h" - -namespace activation { - class LeakyReLU final : public ActivationFunctionBase { - public: - Tensor operator()(Tensor& t) const noexcept override; - Tensor gradient(const Tensor& t) noexcept override; - }; -} diff --git a/src/backend/layers/activation_functions/relu.cpp b/src/backend/layers/activation_functions/relu.cpp deleted file mode 100644 index fd2b42d..0000000 --- a/src/backend/layers/activation_functions/relu.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/** - * @file relu.cpp - * @author Robert Baumgartner 
(r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "relu.h" -#include "global_params.h" - -using namespace activation; - -Tensor ReLU::operator()(Tensor& t) const noexcept { - for(tensorSize_t i=0; i target){ - t[i] = 0; - } - } - return t; -} - -Tensor ReLU::gradient(const Tensor& t) noexcept { -/* for(tensorSize_t i=0; i target){ - t[i] = 0; - } - } - return t; */ -} \ No newline at end of file diff --git a/src/backend/layers/activation_functions/relu.h b/src/backend/layers/activation_functions/relu.h deleted file mode 100644 index 4c223f0..0000000 --- a/src/backend/layers/activation_functions/relu.h +++ /dev/null @@ -1,23 +0,0 @@ -/** - * @file relu.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-01 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "activation_function_base.h" -#include "tensor.h" - -namespace activation { - class ReLU final : public ActivationFunctionBase { - public: - Tensor operator()(Tensor& t) const noexcept override; - Tensor gradient(const Tensor& t) noexcept override; - }; -} diff --git a/src/backend/layers/activation_functions/sigmoid.h b/src/backend/layers/activation_functions/sigmoid.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/backend/layers/ff_layer.cpp b/src/backend/layers/ff_layer.cpp deleted file mode 100644 index 11cc82b..0000000 --- a/src/backend/layers/ff_layer.cpp +++ /dev/null @@ -1,31 +0,0 @@ -/** - * @file ff_layer.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#include "ff_layer.h" - -#include -#include - -using namespace std; -using namespace layers; - -FfLayer::FfLayer(const tensorDim_t in_size, const tensorDim_t out_size) { - //weights.emplace(Device::CPU, in_size, out_size); - 
//weights->reset(utility::InitClass::Gaussian); -} - -Tensor FfLayer::forward(const Tensor& input) const { - return *weights * input; -} - -//ftype* FfLayer::backward(ftype* input) { - -//} \ No newline at end of file diff --git a/src/backend/layers/ff_layer.h b/src/backend/layers/ff_layer.h deleted file mode 100644 index 1c2ea06..0000000 --- a/src/backend/layers/ff_layer.h +++ /dev/null @@ -1,32 +0,0 @@ -/** - * @file ff_layer.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "layer_base.h" -#include "utility/initializers.h" - -#include - -namespace layers { - class FfLayer : public LayerBase { - protected: - // memoization - // TODO: necessary? - //mutable std::optional v1; - - public: - FfLayer(tensorDim_t in_size, tensorDim_t out_size); - - Tensor forward(const Tensor& input) const override; - //ftype* backward(ftype* input) override; - }; -} diff --git a/src/backend/layers/layer_base.cpp b/src/backend/layers/layer_base.cpp deleted file mode 100644 index 7abcb0b..0000000 --- a/src/backend/layers/layer_base.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/** - * @file layer_base.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-01-25 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "layer_base.h" - -#include - -using namespace std; -using namespace layers; - -ftype LayerBase::getItem(vector&&idx) const { - assert(weights); - return weights.value().getItem(std::move(idx)); -} - -void LayerBase::setItem(ftype item, vector&& idx) { - assert(weights); - weights.value().setItem(item, std::move(idx)); -} - -void LayerBase::print(ostream& os) const noexcept { - assert(weights); - os << weights.value(); -} - -ostream& operator<<(ostream& os, const LayerBase& l) noexcept { - l.print(os); - return os; -} \ No newline at end of file diff --git a/src/backend/layers/layer_base.h 
b/src/backend/layers/layer_base.h deleted file mode 100644 index 2803540..0000000 --- a/src/backend/layers/layer_base.h +++ /dev/null @@ -1,49 +0,0 @@ -/** - * @file layer_base.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "data_modeling/tensor.h" - -#include "utility/global_params.h" - -#include -#include - -namespace layers { - /** - * The base class for all the layers that we have. Not instantiable. - */ - class LayerBase { - protected: - std::optional weights = std::nullopt; - std::optional bias = std::nullopt; - - public: - LayerBase() = default; - virtual ~LayerBase() noexcept = default; - - virtual Tensor forward(const Tensor& input) const = 0; - //virtual ftype* backward(ftype* input) = 0; - - // weights should always exist, never nullopt outside of c'tor - const Dimension& getDims() const noexcept { - return weights.value().getDims(); - } - - ftype getItem(std::vector&& idx) const; - void setItem(ftype item, std::vector&& idx); - - - virtual void print(std::ostream& os) const noexcept; - friend std::ostream& operator<<(std::ostream& os, const LayerBase& t) noexcept; - }; -} \ No newline at end of file diff --git a/src/backend/module/activation_functions/leaky_relu.cpp b/src/backend/module/activation_functions/leaky_relu.cpp new file mode 100644 index 0000000..687e2bf --- /dev/null +++ b/src/backend/module/activation_functions/leaky_relu.cpp @@ -0,0 +1,40 @@ +/** + * @file leaky_relu.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "leaky_relu.h" +#include "computational_graph/activation_functions/leaky_relu_node.h" + +using namespace std; +using namespace module; + +Tensor LeakyReLu::operator()(const Tensor& t) const { + auto res = t.createDeepCopy(); + + for(tensorSize_t i=0; i 
LeakyReLu::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t, eps)); + assert(res->getRequiresGrad()); + } + + return res; +} \ No newline at end of file diff --git a/src/backend/module/activation_functions/leaky_relu.h b/src/backend/module/activation_functions/leaky_relu.h new file mode 100644 index 0000000..94f0c3d --- /dev/null +++ b/src/backend/module/activation_functions/leaky_relu.h @@ -0,0 +1,32 @@ +/** + * @file leaky_relu.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-01 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "module/module_base.h" + +namespace module { + class LeakyReLu final : public ModuleBase { + private: + const ftype eps; + + public: + LeakyReLu(ftype eps=0.01) : eps{eps} + { } + + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; + + void print(std::ostream& os) const noexcept override { + os << "\nLeakyReLU\neps: " << eps; + } + }; +} diff --git a/src/backend/module/activation_functions/relu.cpp b/src/backend/module/activation_functions/relu.cpp new file mode 100644 index 0000000..7935304 --- /dev/null +++ b/src/backend/module/activation_functions/relu.cpp @@ -0,0 +1,40 @@ +/** + * @file relu.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-01 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "relu.h" +#include "computational_graph/activation_functions/relu_node.h" + +using namespace std; +using namespace module; + +Tensor ReLu::operator()(const Tensor& t) const { + auto res = t.createDeepCopy(); + + for(tensorSize_t i=0; i ReLu::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t)); + assert(res->getRequiresGrad()); + } + + return res; +} 
diff --git a/src/backend/module/activation_functions/relu.h b/src/backend/module/activation_functions/relu.h new file mode 100644 index 0000000..05268e0 --- /dev/null +++ b/src/backend/module/activation_functions/relu.h @@ -0,0 +1,24 @@ +/** + * @file relu.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-01 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "module/module_base.h" + +namespace module { + class ReLu final : public ModuleBase { + public: + ReLu() = default; + + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; + }; +} diff --git a/src/backend/module/activation_functions/sigmoid.cpp b/src/backend/module/activation_functions/sigmoid.cpp new file mode 100644 index 0000000..765b44a --- /dev/null +++ b/src/backend/module/activation_functions/sigmoid.cpp @@ -0,0 +1,52 @@ +/** + * @file sigmoid.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "sigmoid.h" + +#include "computational_graph/activation_functions/sigmoid_node.h" + +#include + +using namespace std; +using namespace module; + +/** + * @brief Sigmoid activation function. 
+ */ +Tensor Sigmoid::operator()(const Tensor& t) const { + auto res = t.createEmptyCopy(); + + constexpr ftype one = 1.0; + auto compute = [](ftype x){ + if(x>=0){ + return one / (one + exp(-x)); + } + auto e = exp(x); + return e / (one + e); + }; + + for(tensorSize_t i=0; i Sigmoid::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t, res)); + assert(res->getRequiresGrad()); + } + + return res; +} diff --git a/src/backend/module/activation_functions/sigmoid.h b/src/backend/module/activation_functions/sigmoid.h new file mode 100644 index 0000000..1cdf8d7 --- /dev/null +++ b/src/backend/module/activation_functions/sigmoid.h @@ -0,0 +1,22 @@ +/** + * @file sigmoid.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "module/module_base.h" + +namespace module { + class Sigmoid final : public ModuleBase { + public: + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; + }; +} diff --git a/src/backend/module/activation_functions/softmax.cpp b/src/backend/module/activation_functions/softmax.cpp new file mode 100644 index 0000000..a001a69 --- /dev/null +++ b/src/backend/module/activation_functions/softmax.cpp @@ -0,0 +1,81 @@ +/** + * @file softmax.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "softmax.h" + +#include "computational_graph/activation_functions/softmax_node.h" + +#include + +using namespace std; +using namespace module; + +/** + * @brief Softmax over last dimension. 
Expects shape + * (dim1, dim2, ..., n_classes) + * @return Tensor of shape (dim1, dim2, ..., n_classes) [== input.shape] + */ +Tensor Softmax::operator()(const Tensor& t) const { + if(t.getDims().nDims()<2){ + __throw_invalid_argument("Softmax expects input shape of minimum two dimensions"); + } + + const auto nRows = t.getDims()[-2]; + const auto nCols = t.getDims()[-1]; + + // pre-compute exponents + Tensor tmp(t.getDims(), t.getDevice(), false); + for(tensorDim_t i=0; i::infinity(); + for(tensorDim_t j=0; j Softmax::operator()(const shared_ptr& t) const { + auto res = make_shared((*this)(*t)); + + if(t->getRequiresGrad()){ + res->setCgNode(make_shared(t, res)); + assert(res->getRequiresGrad()); + } + + return res; +} diff --git a/src/backend/module/activation_functions/softmax.h b/src/backend/module/activation_functions/softmax.h new file mode 100644 index 0000000..d3c4ade --- /dev/null +++ b/src/backend/module/activation_functions/softmax.h @@ -0,0 +1,22 @@ +/** + * @file softmax.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "module/module_base.h" + +namespace module { + class Softmax final : public ModuleBase { + public: + Tensor operator()(const Tensor& t) const override; + std::shared_ptr operator()(const std::shared_ptr& t) const override; + }; +} diff --git a/src/backend/module/layers/ff_layer.cpp b/src/backend/module/layers/ff_layer.cpp new file mode 100644 index 0000000..82fed69 --- /dev/null +++ b/src/backend/module/layers/ff_layer.cpp @@ -0,0 +1,84 @@ +/** + * @file ff_layer.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#include "ff_layer.h" +#include "data_modeling/tensor_functions.h" + +#include "computational_graph/tensor_ops/graph_creation.h" + +#include +#include + +using namespace std; +using 
namespace module; +using namespace utility; + +FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, bool useBias, bool requiresGrad, shared_ptr init) + : FfLayer(inSize, outSize, Tensor::getDefaultDevice(), useBias, requiresGrad, init) {} + +/** + * @brief Construct a new Ff Layer:: Ff Layer object + * Assumption for dims: (in-size, out-size) + * @param dims Dimensions, see above. + * @param d The device. + * @param useBias Use a bias if true. Bias will receiver shape (n_rows) + * @param requiresGrad If true train this layer. + */ +FfLayer::FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, + bool useBias, bool requiresGrad, shared_ptr init) + : useBias{useBias}, requiresGrad{requiresGrad} +{ + if(!init){ + init = make_shared(inSize, outSize); + } + + weights = make_shared(Dimension({inSize, outSize}), d, requiresGrad); + weights->reset(init); + + if(useBias){ + bias = make_shared(vector{outSize}, d, requiresGrad); + TensorFunctions::ToZeros(*bias); + } +} + +/** + * @brief Normal forward function. Does not build computational graph. + * + * Assumption for input: (b-size, ..., dim1, in-size) + */ +Tensor FfLayer::operator()(const Tensor& input) const { + auto res = input.matmul(*weights); + + if(useBias){ + res = res + *bias; + } + + return res; +} + +/** + * @brief Like overload, but creates computational graph. 
+ */ +std::shared_ptr FfLayer::operator()(const std::shared_ptr& input) const { + auto res = cgraph::matmul(input, weights); + if(useBias){ + res = cgraph::add(res, bias); + } + + return res; +} + +void FfLayer::print(ostream& os) const noexcept { + os << "\nFfLayer\nWeights:\n" << *weights; + if(bias){ + os << "\nBias:\n" << *bias; + } +} \ No newline at end of file diff --git a/src/backend/module/layers/ff_layer.h b/src/backend/module/layers/ff_layer.h new file mode 100644 index 0000000..8c58dc2 --- /dev/null +++ b/src/backend/module/layers/ff_layer.h @@ -0,0 +1,55 @@ +/** + * @file ff_layer.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#pragma once + +#include "module/module_base.h" +#include "utility/initializers.h" + +#include + +namespace module { + class FfLayer : public ModuleBase { + bool requiresGrad = false; + bool useBias = false; + + std::shared_ptr weights = nullptr; + std::shared_ptr bias = nullptr; + + public: + FfLayer(tensorDim_t inSize, tensorDim_t outSize, + bool useBias=true, bool requiresGrad=false, std::shared_ptr init=nullptr); + + FfLayer(tensorDim_t inSize, tensorDim_t outSize, Device d, + bool useBias=true, bool requiresGrad=false, std::shared_ptr init=nullptr); + + Tensor operator()(const Tensor& input) const override; + std::shared_ptr operator()(const std::shared_ptr& input) const override; + + const Dimension& getDims() const { + assert(weights); + return weights->getDims(); + } + + auto getWeights() const noexcept { return weights; } + auto getBias() const noexcept { return bias; } + + bool hasWeights() const { + return weights != nullptr; + } + + std::vector< std::shared_ptr > parameters() const override { + return {weights, bias}; + } + + void print(std::ostream& os) const noexcept override; + }; +} diff --git a/src/backend/module/module_base.cpp b/src/backend/module/module_base.cpp new file mode 100644 index 
0000000..951f96c --- /dev/null +++ b/src/backend/module/module_base.cpp @@ -0,0 +1,21 @@ +/** + * @file module.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-13 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "module/module_base.h" + +#include + +using namespace std; + +ostream& module::operator<<(ostream& os, const module::ModuleBase& l) noexcept { + l.print(os); // calling vtable + return os; +} \ No newline at end of file diff --git a/src/backend/module/module_base.h b/src/backend/module/module_base.h new file mode 100644 index 0000000..28247ab --- /dev/null +++ b/src/backend/module/module_base.h @@ -0,0 +1,66 @@ +/** + * @file module_base.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-13 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/tensor.h" +#include "utility/global_params.h" + +#include +#include +#include + +#include + +// if GCC or Clang +#ifdef __GNUC__ +#include +#endif // __GNUC__ + +namespace module { + /** + * The base class for all the layers that we have. Not instantiable. 
+ */ + class ModuleBase { + public: + ModuleBase() = default; + + ModuleBase(const ModuleBase& other) = delete; + ModuleBase& operator=(const ModuleBase& other) = delete; + + ModuleBase(ModuleBase&& other) noexcept = default; + ModuleBase& operator=(ModuleBase&& other) noexcept = default; + + ~ModuleBase() noexcept = default; + + // for inference -> no graph creation + virtual Tensor operator()(const Tensor& input) const = 0; + // for training -> creates graph + virtual std::shared_ptr operator()(const std::shared_ptr& input) const = 0; + + virtual std::vector< std::shared_ptr > parameters() const { return {}; } + + virtual void print(std::ostream& os) const noexcept { + os << "\n"; + #ifdef __GNUC__ + // demangle name on gcc and clang + int status; + char* demangled = abi::__cxa_demangle(typeid(*this).name(), nullptr, nullptr, &status); + os << (status == 0 ? demangled : typeid(*this).name()); + std::free(demangled); + #else + os << typeid(*this).name(); + #endif + }; + + friend std::ostream& operator<<(std::ostream& os, const ModuleBase& t) noexcept; + }; +} \ No newline at end of file diff --git a/src/backend/module/networks/sequential.cpp b/src/backend/module/networks/sequential.cpp new file mode 100644 index 0000000..e807e51 --- /dev/null +++ b/src/backend/module/networks/sequential.cpp @@ -0,0 +1,67 @@ +/** + * @file sequential.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#include "sequential.h" + +using namespace std; +using namespace module; + +Tensor Sequential::operator()(const Tensor& input) const { + if(layers.size()==0){ + __throw_invalid_argument("Network empy, cannot be called."); + } + + auto x = layers[0]->operator()(input); + for(int i=1; ioperator()(x); + } + + return x; +} + +shared_ptr Sequential::operator()(const shared_ptr& input) const { + if(layers.size()==0){ + __throw_invalid_argument("Network empy, cannot be called."); + 
} + + auto x = layers[0]->operator()(input); + for(int i=1; ioperator()(x); + } + + return x; +} + +vector> Sequential::parameters() const { + vector> res; + + for(const auto& layer: layers) { + auto p = layer->parameters(); + for(auto& pp: p){ + if(pp){ + res.push_back(std::move(pp)); + } + } + } + + return res; +} + +void Sequential::append(shared_ptr l) { + layers.push_back(move(l)); +} + +void Sequential::print(std::ostream& os) const noexcept { + os << "\nSequential"; + for(const auto& l: layers){ + os << *l; + } +} \ No newline at end of file diff --git a/src/backend/module/networks/sequential.h b/src/backend/module/networks/sequential.h new file mode 100644 index 0000000..cf13417 --- /dev/null +++ b/src/backend/module/networks/sequential.h @@ -0,0 +1,44 @@ +/** + * @file sequential.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-12-07 + * + * @copyright Copyright (c) 2025 + * + */ + +#pragma once + +#include "module/module_base.h" + +#include +#include + +namespace module { + class Sequential : public ModuleBase { + protected: + std::vector< std::shared_ptr > layers; + + public: + Sequential() = default; + + Sequential(const Sequential& other) = delete; + Sequential& operator=(const Sequential& other) = delete; + + Sequential(Sequential&& other) noexcept = default; + Sequential& operator=(Sequential&& other) noexcept = default; + + ~Sequential() noexcept = default; + + Tensor operator()(const Tensor& input) const override; + std::shared_ptr operator()(const std::shared_ptr& input) const override; + + std::vector> parameters() const override; + + void append(std::shared_ptr l); + + void print(std::ostream& os) const noexcept override; + }; +} diff --git a/src/backend/networks/sequential.cpp b/src/backend/networks/sequential.cpp deleted file mode 100644 index 08617e5..0000000 --- a/src/backend/networks/sequential.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/** - * @file sequential.cpp - * @author Robert 
Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-12-07 - * - * @copyright Copyright (c) 2025 - * - */ - -#include "sequential.h" - -using namespace std; -using namespace layers; - -/** - * @brief Returns true if dimensions valid, else false. - * Ensures consistency along network. - */ -bool SequentialNetwork::assertDims(const LayerBase& layer) const noexcept { - if(layers.size() == 0) - return true; - - return layers.at(layers.size()-1).getDims() == layer.getDims(); -} - -Tensor SequentialNetwork::forward(const Tensor& input) const { - if(input.getDims().getItem(1) != layers.at(0).getDims().getItem(0)){ - // TODO: show meaningful message rather than exception - __throw_invalid_argument("Not implemented yet. Dimensions don't match"); - } - - if(layers.size()==0){ - // TODO: show meaningful message rather than exception - __throw_invalid_argument("Network empy, cannot be called."); - } - - Tensor x = layers.at(0).forward(input); - for(int i=1; i -#include -#include - -class SequentialNetwork { - protected: - std::vector layers; - bool assertDims(const layers::LayerBase& layer) const noexcept; - - template - requires (std::derived_from< std::remove_cvref_t, layers::LayerBase >) - void addLayer(T&& layer) { - if(!assertDims(layer)){ - // TODO: show warning that the dims don't match - return; - } - layers.push_back(std::forward(layer)); - } - - public: - SequentialNetwork() = default; - - Tensor forward(const Tensor& input) const; -}; - -/*template -void SequentialNetwork::addLayer(LayerBase&& layer) noexcept { - layers.push_back(std::forward(layer)); -}*/ \ No newline at end of file diff --git a/src/backend/system/sys_functions.cpp b/src/backend/system/sys_functions.cpp index 02a501f..d653663 100644 --- a/src/backend/system/sys_functions.cpp +++ b/src/backend/system/sys_functions.cpp @@ -10,13 +10,20 @@ */ #include "sys_functions.h" +#include "data_modeling/tensor.h" -using namespace global; +#include "utility/initializers.h" -void 
setDevice(Device d) noexcept { +using namespace sys; + +void sys::setDevice(Device d) noexcept { Tensor::setDefaultDevice(d); } -Device getDevice() noexcept { +Device sys::getDevice() noexcept { return Tensor::getDefaultDevice(); +} + +void sys::setRandomSeed(const unsigned int s) noexcept { + utility::InitializerBase::setSeed(s); } \ No newline at end of file diff --git a/src/backend/system/sys_functions.h b/src/backend/system/sys_functions.h index ad58358..b3a0f63 100644 --- a/src/backend/system/sys_functions.h +++ b/src/backend/system/sys_functions.h @@ -12,9 +12,11 @@ #pragma once -#include "data_modeling/tensor.h" +#include "data_modeling/device.h" -namespace global { +namespace sys { void setDevice(Device d) noexcept; Device getDevice() noexcept; + + void setRandomSeed(unsigned int s) noexcept; } \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_loss.cpp b/src/backend/training/loss_functions/bce_loss.cpp new file mode 100644 index 0000000..a6df909 --- /dev/null +++ b/src/backend/training/loss_functions/bce_loss.cpp @@ -0,0 +1,52 @@ +/** + * @file bce_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "bce_loss.h" + +#include "computational_graph/loss_functions/bce_node.h" + +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batchsize) or (batchsize, 1) + * @return Tensor of shape (1) + */ +shared_ptr BceLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { + if(!ypred->getRequiresGrad()) { + __throw_invalid_argument("ypred must have gradient enabled"); + } + else if(y->getDevice() != ypred->getDevice()){ + __throw_invalid_argument("y and ypred must be on same device"); + } + else if(y->getDims()!=ypred->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto bce = [](ftype y, ftype ypred){ + return 
y*log(std::max(ypred, epsBce)) + (1-y)*log(std::max(1-ypred, epsBce)); + }; + + const auto nBatches = y->getDims()[0]; + + ftype loss = 0; + for(tensorSize_t i=0; i(std::vector{1}, std::vector{-loss / nBatches}, y->getDevice(), true); + res->setCgNode(make_shared(y, ypred)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_loss.h b/src/backend/training/loss_functions/bce_loss.h new file mode 100644 index 0000000..9ddc938 --- /dev/null +++ b/src/backend/training/loss_functions/bce_loss.h @@ -0,0 +1,22 @@ +/** + * @file bce_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class BceLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const override; + }; +} diff --git a/src/backend/training/loss_functions/bce_sigmoid_loss.cpp b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp new file mode 100644 index 0000000..2634bf4 --- /dev/null +++ b/src/backend/training/loss_functions/bce_sigmoid_loss.cpp @@ -0,0 +1,53 @@ +/** + * @file bce_logits_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + + #include "bce_sigmoid_loss.h" + + #include "computational_graph/loss_functions/bce_sigmoid_node.h" + + #include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batchsize) or (batchsize, 1) + * @return Tensor of shape (1) + */ +shared_ptr BceSigmoidLoss::operator()(const shared_ptr y, const shared_ptr logits) const { + if(!logits->getRequiresGrad()) { + __throw_invalid_argument("logits must have gradient enabled"); + } + else if(y->getDevice() != logits->getDevice()){ + __throw_invalid_argument("y 
and logits must be on same device"); + } + else if(y->getDims()!=logits->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto bceSimplified = [](ftype y, ftype logit){ + constexpr ftype zero = 0; + return std::max(logit, zero) - logit*y + log(1+exp(-std::abs(logit))); + }; + + const auto nBatches = y->getDims()[0]; + + ftype loss = 0; + for(tensorSize_t i=0; i(std::vector{1}, std::vector{loss / nBatches}, y->getDevice(), true); + res->setCgNode(make_shared(y, logits)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/bce_sigmoid_loss.h b/src/backend/training/loss_functions/bce_sigmoid_loss.h new file mode 100644 index 0000000..7aae6bc --- /dev/null +++ b/src/backend/training/loss_functions/bce_sigmoid_loss.h @@ -0,0 +1,22 @@ +/** + * @file bce_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class BceSigmoidLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr logits) const override; + }; +} diff --git a/src/backend/training/loss_functions/crossentropy_loss.cpp b/src/backend/training/loss_functions/crossentropy_loss.cpp new file mode 100644 index 0000000..d1a5291 --- /dev/null +++ b/src/backend/training/loss_functions/crossentropy_loss.cpp @@ -0,0 +1,55 @@ +/** + * @file crossentropy_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_loss.h" + +#include "computational_graph/loss_functions/crossentropy_node.h" + +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batch_size, n_classes) + * @return Tensor of shape (1) + */ +shared_ptr 
CrossEntropyLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { + if(!ypred->getRequiresGrad()) { + __throw_invalid_argument("ypred must have gradient enabled"); + } + else if(y->getDevice() != ypred->getDevice()){ + __throw_invalid_argument("y and ypred must be on same device"); + } + else if(y->getDims()!=ypred->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto ce = [&y, &ypred](const tensorDim_t b){ + ftype res = 0; + for(tensorDim_t i=0; igetDims()[-1]; i++){ + res += y->get(b, i) * log(std::max(ypred->get(b, i), epsCrossentropy)); + } + return res; + }; + + const auto nBatches = y->getDims()[0]; + ftype loss = 0; + for(tensorSize_t b=0; b(std::vector{1}, std::vector{-loss / nBatches}, y->getDevice(), true); + res->setCgNode(std::make_shared(y, ypred)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/crossentropy_loss.h b/src/backend/training/loss_functions/crossentropy_loss.h new file mode 100644 index 0000000..dfd71b3 --- /dev/null +++ b/src/backend/training/loss_functions/crossentropy_loss.h @@ -0,0 +1,22 @@ +/** + * @file crossentropy_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class CrossEntropyLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const override; + }; +} diff --git a/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp b/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp new file mode 100644 index 0000000..a2b7866 --- /dev/null +++ b/src/backend/training/loss_functions/crossentropy_softmax_loss.cpp @@ -0,0 +1,93 @@ +/** + * @file crossentropy_logits_loss.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelflogits->nl) + * 
@brief + * @version 0.1 + * @date 2026-03-17 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "crossentropy_softmax_loss.h" + +#include "computational_graph/loss_functions/crossentropy_softmax_node.h" + +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batch_size, n_classes) + * @return Tensor of shape (1) + */ +shared_ptr CrossEntropySoftmaxLoss::operator()(const shared_ptr y, const shared_ptr logits) const { + if(!logits->getRequiresGrad()) { + __throw_invalid_argument("logits must have gradient enabled"); + } + else if(y->getDevice() != logits->getDevice()){ + __throw_invalid_argument("y and logits must be on same device"); + } + else if(y->getDims()!=logits->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + //////////////////////////////////////////////// + + const auto nRows = logits->getDims()[-2]; + const auto nCols = logits->getDims()[-1]; + + // pre-compute exponents and max-values + vector maxValues(nRows); + Tensor tmp(logits->getDims(), logits->getDevice(), false); + for(tensorDim_t i=0; i::infinity(); + for(tensorDim_t j=0; jget(i, j)); + } + + maxValues[i] = maxV; + + for(tensorDim_t j=0; jget(i, j)-maxV; + tmp.set(exp(e), i, j); + } + } + + const tensorSize_t stride = logits->getDims()[-1]; + ftype loss = 0; + + /** + * CE = -sum_i(y_i * z_i) + log(sum_j(exp(z_j))) with + * log(sum_j(exp(z_j))) = max(z) + log(sum_j(exp(z_j - max(z)))). 
+ * for numerical stability + */ + auto compute = [&loss, &y, &logits, &tmp, &maxValues, stride](tensorSize_t start){ + ftype lsum = 0; + for(tensorSize_t i=start; i0){ // y either zero or one + loss += -(*logits)[i] + maxValues[j] + lsum; + } + } + }; + + tensorSize_t offset=0; + while(offsetgetSize()) { + compute(offset); + offset += stride; + } + + auto res = make_shared(std::vector{1}, std::vector{loss / logits->getDims()[0]}, y->getDevice(), true); + res->setCgNode(std::make_shared(y, logits)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/crossentropy_softmax_loss.h b/src/backend/training/loss_functions/crossentropy_softmax_loss.h new file mode 100644 index 0000000..6feb16a --- /dev/null +++ b/src/backend/training/loss_functions/crossentropy_softmax_loss.h @@ -0,0 +1,22 @@ +/** + * @file crossentropy_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-07 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class CrossEntropySoftmaxLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr logits) const override; + }; +} diff --git a/src/backend/training/loss_functions/loss_base.h b/src/backend/training/loss_functions/loss_base.h index e6c7922..9da95ea 100644 --- a/src/backend/training/loss_functions/loss_base.h +++ b/src/backend/training/loss_functions/loss_base.h @@ -3,7 +3,7 @@ * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-02-02 + * @date 2026-03-07 * * @copyright Copyright (c) 2026 * @@ -13,7 +13,22 @@ #include "data_modeling/tensor.h" -class LossBase { - public: - virtual Tensor operator()(Tensor& y, const Tensor& y_target) const noexcept = 0; -}; \ No newline at end of file +#include + +namespace train { + class LossBase { + public: + LossBase() 
= default; + + LossBase(const LossBase& other) = delete; + LossBase& operator=(const LossBase& other) = delete; + + LossBase(LossBase&& other) noexcept = default; + LossBase& operator=(LossBase&& other) noexcept = default; + + ~LossBase() noexcept = default; + + virtual std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const = 0; + }; +} diff --git a/src/backend/training/loss_functions/mse_loss.cpp b/src/backend/training/loss_functions/mse_loss.cpp deleted file mode 100644 index 7c7c478..0000000 --- a/src/backend/training/loss_functions/mse_loss.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/** - * @file mse_loss.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-03 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "mse_loss.h" -#include "utility/global_params.h" - -#include - -/** - * @brief Expects shape (b-size, 1), or simply (batch-size) - * - * @param y Predicted output - * @param t_target Target - * @return Tensor of shape (b-size, 1) - */ -Tensor MseLoss::operator()(Tensor& y, const Tensor& y_target) const noexcept { - auto res = Tensor(y); - for(tensorSize_t i = 0; i +#include + +using namespace std; +using namespace train; + +/** + * @brief Expected shapes: (batchsize) or (batchsize, 1) + * @return Tensor of shape (1) + */ +shared_ptr RmseLoss::operator()(const shared_ptr y, const shared_ptr ypred) const { + if(!ypred->getRequiresGrad()) { + __throw_invalid_argument("ypred must have gradient enabled"); + } + else if(y->getDevice() != ypred->getDevice()){ + __throw_invalid_argument("y and ypred must be on same device"); + } + else if(y->getDims()!=ypred->getDims()){ + __throw_invalid_argument("Tensors must be of same shape"); + } + + auto diffPow = [](ftype y, ftype ypred){ + auto diff = y - ypred; + return diff * diff; + }; + + const auto nBatches = y->getDims()[0]; + + ftype loss = 0; + for(tensorSize_t i=0; i(std::vector{1}, std::vector{loss}, y->getDevice(), 
true); + res->setCgNode(make_shared(y, ypred, loss)); + assert(res->getRequiresGrad()); + + return res; +} \ No newline at end of file diff --git a/src/backend/training/loss_functions/rmse_loss.h b/src/backend/training/loss_functions/rmse_loss.h new file mode 100644 index 0000000..804f88d --- /dev/null +++ b/src/backend/training/loss_functions/rmse_loss.h @@ -0,0 +1,22 @@ +/** + * @file rmse_loss.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "loss_base.h" + +namespace train { + class RmseLoss final : public LossBase { + public: + std::shared_ptr operator()(const std::shared_ptr y, + const std::shared_ptr ypred) const override; + }; +} diff --git a/src/backend/training/optimizers/optimizer_base.cpp b/src/backend/training/optimizers/optimizer_base.cpp index 58e35c3..e2a6d8d 100644 --- a/src/backend/training/optimizers/optimizer_base.cpp +++ b/src/backend/training/optimizers/optimizer_base.cpp @@ -1,9 +1,9 @@ /** - * @file optimizer_base.h - * @author your name (you@domain.com) + * @file optimizer_base.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-02-02 + * @date 2026-03-14 * * @copyright Copyright (c) 2026 * @@ -11,10 +11,15 @@ #include "optimizer_base.h" -float OptimizerBase::getLr() const noexcept { - return lr; -} +#include "data_modeling/tensor_functions.h" -void OptimizerBase::setLr(const float lr) noexcept { - this->lr = lr; +using namespace train; + +void OptimizerBase::zeroGrad() noexcept{ + for(auto& p: params){ + auto grads = p->getGrads(); + + if(grads) + TensorFunctions::ToZeros(*grads); + } } \ No newline at end of file diff --git a/src/backend/training/optimizers/optimizer_base.h b/src/backend/training/optimizers/optimizer_base.h index 732b219..af4f6ef 100644 --- a/src/backend/training/optimizers/optimizer_base.h +++ 
b/src/backend/training/optimizers/optimizer_base.h @@ -3,7 +3,7 @@ * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 - * @date 2026-02-02 + * @date 2026-03-07 * * @copyright Copyright (c) 2026 * @@ -12,13 +12,38 @@ #pragma once #include "data_modeling/tensor.h" +#include "training/loss_functions/loss_base.h" -class OptimizerBase { - private: - float lr = 0.05; +#include +#include + +namespace train { + class OptimizerBase { + protected: + const ftype lr; + const std::vector< std::shared_ptr > params; public: - virtual Tensor operator()(Tensor& t) const noexcept; - float getLr() const noexcept; - void setLr(const float lr) noexcept; -}; \ No newline at end of file + OptimizerBase(std::vector< std::shared_ptr > params, ftype lr) + : params{std::move(params)}, lr{lr} + { +#ifndef NDEBUG + for(const auto& param: params){ + assert(param); // we don't want nullptrs here + } +#endif // NDEBUG + }; + + ~OptimizerBase() noexcept = default; + + OptimizerBase(const OptimizerBase& other) = delete; + OptimizerBase& operator=(const OptimizerBase& other) = delete; + + OptimizerBase(OptimizerBase&& other) noexcept = default; + OptimizerBase& operator=(OptimizerBase&& other) noexcept = default; + + virtual void step() = 0; + + void zeroGrad() noexcept; + }; +} \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.cpp b/src/backend/training/optimizers/rmsprop.cpp new file mode 100644 index 0000000..c5a93a8 --- /dev/null +++ b/src/backend/training/optimizers/rmsprop.cpp @@ -0,0 +1,47 @@ +/** + * @file rmsprop.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-10 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "rmsprop.h" + +using namespace std; +using namespace train; + +void RmsPropOptimizer::step() { + constexpr ftype eps = 1e-9; + for(const auto& param: params){ + auto tPtr = param.get(); + const auto gPtr = tPtr->getGrads().get(); + auto vPtr = 
movingAvg[tPtr].get(); + + // update moving avg + if(vPtr!=nullptr) { // hot path + for(tensorSize_t i=0; igetSize(); i++){ + auto g = (*gPtr)[i]; + auto update = decay * (*vPtr)[i] + (1-decay)*g*g; + vPtr->set(update, i); + } + } + else { // init loop + movingAvg[tPtr] = make_unique(tPtr->getDims(), tPtr->getDevice(), false); // create empty tensor + vPtr = movingAvg[tPtr].get(); + for(tensorSize_t i=0; igetSize(); i++) { + auto g = (*gPtr)[i]; + vPtr->set((1-decay)*g*g, i); + } + } + + // update weights: RMSProp divides by the root of the moving average + for(tensorSize_t i=0; igetSize(); i++) { + auto update = (*tPtr)[i] - lr * (*gPtr)[i] / (std::sqrt((*vPtr)[i]) + eps); + tPtr->set(update, i); + } + } +} \ No newline at end of file diff --git a/src/backend/training/optimizers/rmsprop.h b/src/backend/training/optimizers/rmsprop.h new file mode 100644 index 0000000..967c18e --- /dev/null +++ b/src/backend/training/optimizers/rmsprop.h @@ -0,0 +1,34 @@ +/** + * @file rmsprop.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-10 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "optimizer_base.h" +#include "utility/global_params.h" + +#include + +namespace train { + class RmsPropOptimizer final : public OptimizerBase { + private: + const ftype decay; + std::unordered_map> movingAvg; + + public: + RmsPropOptimizer(std::vector< std::shared_ptr > params, ftype lr, ftype decay) + : OptimizerBase(std::move(params), lr), decay{decay} + { + for(const auto& param: this->params) { // NB: ctor argument was moved into the base; iterate the member + movingAvg[param.get()] = nullptr; // lazy initialization + } + } + + void step() override; + }; +} \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.cpp b/src/backend/training/optimizers/sgd.cpp new file mode 100644 index 0000000..83f122d --- /dev/null +++ b/src/backend/training/optimizers/sgd.cpp @@ -0,0 +1,25 @@ +/** + * @file sgd.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright 
Copyright (c) 2026 + * + */ + +#include "sgd.h" + +using namespace std; +using namespace train; + +void SgdOptimizer::step() { + for(auto& t: params){ + auto grads = t->getGrads(); + for(auto idx=0; idxgetSize(); idx++){ + auto updatedWeight = (*t)[idx] - lr*(*grads)[idx]; + t->set(updatedWeight, idx); + } + } +} \ No newline at end of file diff --git a/src/backend/training/optimizers/sgd.h b/src/backend/training/optimizers/sgd.h new file mode 100644 index 0000000..0d8a891 --- /dev/null +++ b/src/backend/training/optimizers/sgd.h @@ -0,0 +1,24 @@ +/** + * @file sgd.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "optimizer_base.h" + +#include "utility/global_params.h" + +namespace train { + class SgdOptimizer final : public OptimizerBase { + public: + SgdOptimizer(std::vector< std::shared_ptr > params, ftype lr) + : OptimizerBase(std::move(params), lr) { } + + void step() override; + }; +} \ No newline at end of file diff --git a/src/backend/training/trainers/base_train_loop.cpp b/src/backend/training/trainers/base_train_loop.cpp new file mode 100644 index 0000000..ff95688 --- /dev/null +++ b/src/backend/training/trainers/base_train_loop.cpp @@ -0,0 +1,63 @@ +/** + * @file base_train_loop.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-11 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "base_train_loop.h" + +#include + +#include +#include +#include + +#include + +using namespace std; +using namespace train; + +void BaseTrainLoop::run(shared_ptr& x, shared_ptr& y, const bool shuffle, const bool verbose) { + const auto nSamples = x->getDims()[0]; + + for(size_t e=0; e indices(nSamples); + std::iota(indices.begin(), indices.end(), 0); + + if(verbose) + cout << "\nEpoch " << e; + + if(shuffle){ + std::random_device rd; + std::mt19937 rng(rd()); + std::shuffle(indices.begin(), 
indices.end(), rng); + } + + tensorDim_t low = 0; + + int batch = 0; + while(low < nSamples){ + if(verbose) + cout << "\nBatch " << batch << endl; + + std::span batchSpan(indices.data() + low, low+bsize < nSamples ? bsize : nSamples-low); + + auto xBatch = make_shared(x->getSlice(batchSpan)); + auto yBatch = make_shared(y->getSlice(batchSpan)); + + auto yPred = (*graph)(xBatch); + auto l = (*loss)(yBatch, yPred); + + l->backward(); + optim->step(); + optim->zeroGrad(); + + low += bsize; + } + } +} \ No newline at end of file diff --git a/src/backend/training/trainers/base_train_loop.h b/src/backend/training/trainers/base_train_loop.h new file mode 100644 index 0000000..3beeb46 --- /dev/null +++ b/src/backend/training/trainers/base_train_loop.h @@ -0,0 +1,49 @@ +/** + * @file base_train_loop.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-13 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/tensor.h" +#include "module/module_base.h" + +#include "training/optimizers/optimizer_base.h" +#include "training/loss_functions/loss_base.h" + +#include +#include + +namespace train { + class BaseTrainLoop { + protected: + const size_t epochs; + const tensorDim_t bsize; + + std::shared_ptr loss; + std::shared_ptr optim; + std::shared_ptr graph; + + public: + BaseTrainLoop(std::shared_ptr graph, std::shared_ptr loss, + std::shared_ptr optim, size_t epochs, tensorDim_t bsize) + : graph{std::move(graph)}, optim{std::move(optim)}, loss{loss}, epochs{epochs}, bsize{bsize} + { } + + ~BaseTrainLoop() noexcept = default; + + BaseTrainLoop(const BaseTrainLoop& other) = delete; + BaseTrainLoop& operator=(const BaseTrainLoop& other) = delete; + + BaseTrainLoop(BaseTrainLoop&& other) noexcept = default; + BaseTrainLoop& operator=(BaseTrainLoop&& other) noexcept = default; + + void run(std::shared_ptr& x, std::shared_ptr& y, bool shuffle, bool verbose=true); + }; +} \ No newline at end of file 
diff --git a/src/backend/utility/global_params.h b/src/backend/utility/global_params.h index 4e91fb2..3d6edcb 100644 --- a/src/backend/utility/global_params.h +++ b/src/backend/utility/global_params.h @@ -13,7 +13,7 @@ #include -using ftype = float; // TODO: make compiler flag +using ftype = float; // TODO: make compiler flag? /** * IMPORTANT: For the following block we assume that @@ -27,4 +27,13 @@ using ftype = float; // TODO: make compiler flag * request fit into datatype tensorDim_t. */ using tensorDim_t = std::uint16_t; -using tensorSize_t = std::uint32_t; \ No newline at end of file +using tensorSize_t = std::uint32_t; + +// we assert this here so during conversions of tensorDim_t to +// tensorSize_t we do not need to cast explicitly +static_assert(sizeof(tensorDim_t)<=sizeof(tensorSize_t)); + +// ----------------- Numerical stability ------------------- + +constexpr ftype epsCrossentropy = 1e-5; +constexpr ftype epsBce = 1e-5; \ No newline at end of file diff --git a/src/backend/utility/initializers.cpp b/src/backend/utility/initializers.cpp index 4240345..0fad81c 100644 --- a/src/backend/utility/initializers.cpp +++ b/src/backend/utility/initializers.cpp @@ -11,36 +11,27 @@ #include "initializers.h" -#include -#include +#include using namespace std; using namespace utility; -namespace { - class GaussianInitializer final : public InitializerBase { - public: - GaussianInitializer(); - ftype drawNumber() const override; - }; +ftype GaussianInitializer::drawNumber() const { + return dist(gen); +} - GaussianInitializer::GaussianInitializer() : InitializerBase() {} +ftype UniformXavierInitializer::computeRange(ftype nInputs, ftype nOutputs) { + return sqrt(6/(nInputs + nOutputs)); +} - ftype GaussianInitializer::drawNumber() const { - static std::random_device rd; - static std::mt19937 gen{rd()}; - static std::normal_distribution dist; +ftype UniformXavierInitializer::drawNumber() const { + return dist(gen); +} - return dist(gen); - } +ftype 
NormalXavierInitializer::computeSigma(ftype nInputs, ftype nOutputs) { + return sqrt(2/(nInputs + nOutputs)); } -unique_ptr InitializerFactory::getInitializer(InitClass ic) { - switch(ic){ - case InitClass::Gaussian: - return make_unique(); - default: - __throw_invalid_argument("Init class not implemented yet"); - } - return nullptr; // never reached, suppress warning +ftype NormalXavierInitializer::drawNumber() const { + return dist(gen); } \ No newline at end of file diff --git a/src/backend/utility/initializers.h b/src/backend/utility/initializers.h index ba76707..85eb9c5 100644 --- a/src/backend/utility/initializers.h +++ b/src/backend/utility/initializers.h @@ -13,24 +13,98 @@ #include "global_params.h" +#include +#include + #include -#include - -namespace utility{ - enum class InitClass { - Gaussian - }; - - class InitializerBase { - public: - InitializerBase() = default; - virtual ~InitializerBase() = default; - virtual ftype drawNumber() const = 0; - }; - - class InitializerFactory final { - public: - InitializerFactory() = delete; - static std::unique_ptr getInitializer(InitClass ic); - }; +#include + +namespace utility { + class InitializerBase { + protected: + static inline std::optional randomSeed_opt = std::nullopt; + + public: + InitializerBase() = default; + + virtual ~InitializerBase() = default; + virtual ftype drawNumber() const = 0; + + static void setSeed(unsigned int s) noexcept { randomSeed_opt = s; } + }; + + class GaussianInitializer final : public InitializerBase { + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::normal_distribution dist; + + public: + GaussianInitializer(ftype stddev) : gen{rd()}, dist{0, stddev} + { + if(randomSeed_opt){ + gen = std::mt19937{randomSeed_opt.value()}; + } + } + + GaussianInitializer(ftype stddev, unsigned int seed) + : dist{0, stddev} + { + gen = std::mt19937{seed}; + } + + ftype drawNumber() const override; + }; + + class UniformXavierInitializer final : public InitializerBase 
{ + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::uniform_real_distribution dist; + + ftype computeRange(ftype nInputs, ftype nOutputs); + + public: + UniformXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs) + : gen{rd()}, dist{-computeRange(nInputs, nOutputs), computeRange(nInputs, nOutputs)} + { + if(randomSeed_opt){ + gen = std::mt19937{randomSeed_opt.value()}; + } + } + + UniformXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs, unsigned int seed) + : dist{-computeRange(nInputs, nOutputs), computeRange(nInputs, nOutputs)} + { + gen = std::mt19937{seed}; + } + + ftype drawNumber() const override; + }; + + class NormalXavierInitializer final : public InitializerBase { + private: + std::random_device rd{}; + mutable std::mt19937 gen; + mutable std::normal_distribution dist; + + ftype computeSigma(ftype nInputs, ftype nOutputs); + + public: + NormalXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs) + : gen{rd()}, dist{0, computeSigma(nInputs, nOutputs)} + { + if(randomSeed_opt){ + gen = std::mt19937{randomSeed_opt.value()}; + } + } + + NormalXavierInitializer(tensorDim_t nInputs, tensorDim_t nOutputs, unsigned int seed) + : dist{0, computeSigma(nInputs, nOutputs)} + { + gen = std::mt19937{seed}; + } + + ftype drawNumber() const override; + }; } diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt index 6c5370d..405d1f3 100644 --- a/src/python/CMakeLists.txt +++ b/src/python/CMakeLists.txt @@ -1,16 +1,41 @@ include_directories( - "${CMAKE_CURRENT_SOURCE_DIR}/python_utility" + "${CMAKE_CURRENT_SOURCE_DIR}/py_utility" ) # remove the lib... 
prefix set(CMAKE_SHARED_MODULE_PREFIX "") add_library(_core MODULE - data_modeling/py_data_modeling.cpp - data_modeling/py_data_modeling_util.cpp + py_core/py_core.cpp + py_core/py_core_util.cpp ) - -set_target_properties(_core PROPERTIES + +add_library(_nn MODULE + py_nn/py_nn.cpp + ) + +add_library(_sys MODULE + py_sys/py_sys.cpp + ) + +add_library(_train MODULE + py_train/py_train.cpp + ) + +set_target_properties(_core _nn _sys _train PROPERTIES PREFIX "" - OUTPUT_NAME "_core" - LIBRARY_OUTPUT_DIRECTORY ${PYTHON_MODULE_DIR}) \ No newline at end of file + INSTALL_RPATH "$ORIGIN" # to find shared backend-core lib + BUILD_WITH_INSTALL_RPATH TRUE # use install RPATH even during build + LIBRARY_OUTPUT_DIRECTORY "${PYTHON_MODULE_DIR}") + +set_target_properties(_core PROPERTIES + OUTPUT_NAME "_core") + +set_target_properties(_nn PROPERTIES + OUTPUT_NAME "_nn") + +set_target_properties(_sys PROPERTIES + OUTPUT_NAME "_sys") + +set_target_properties(_train PROPERTIES + OUTPUT_NAME "_train") \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling.cpp b/src/python/data_modeling/py_data_modeling.cpp deleted file mode 100644 index 88b4bec..0000000 --- a/src/python/data_modeling/py_data_modeling.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/** - * @file py_data_modeling.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2026-02-21 - * - * @copyright Copyright (c) 2026 - * - */ - -#include "data_modeling/tensor.h" - -#include "py_data_modeling_util.h" -#include "python_templates.h" -#include "custom_converters.h" - -#include "data_modeling/tensor.h" -#include "data_modeling/tensor_functions.h" -#include "computational_graph/graph_creation.h" - -#include -#include -#include - -BOOST_PYTHON_MODULE(_core) -{ - using namespace boost::python; - - // some macros to make code below easier to read - #define WRAP_TENSOR_METHOD_1(method) \ - +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ - return 
std::make_shared(self.method(other)); \ - } - - #define WRAP_SCALAR(method, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return std::make_shared(self.method(val)); \ - } - - #define WRAP_SCALAR_REVERSE(op, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return std::make_shared(val op self); \ - } - - // different, since those are not methods anymore - #define WRAP_FREE_MEMBER_FUNC_1(fPtr, T1, T2) \ - +[](const Tensor& self, int v1, int v2) -> std::shared_ptr { \ - return std::make_shared((self.*fPtr)(v1, v2)); \ - } - - #define WRAP_FREE_MEMBER_FUNC_2(fPtr, T1, T2, T3) \ - +[](const Tensor& self, T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ - return std::make_shared((self.*fPtr)(v1, v2, v3)); \ - } - - #define WRAP_FREE_FUNC_1(fPtr, T1) \ - +[](T1 v1) -> std::shared_ptr { \ - return std::make_shared((*fPtr)(v1)); \ - } - - #define WRAP_FREE_FUNC_2(fPtr, T1, T2) \ - +[](T1 v1, T2 v2) -> std::shared_ptr { \ - return std::make_shared((*fPtr)(v1, v2)); \ - } - - #define WRAP_FREE_FUNC_3(fPtr, T1, T2, T3) \ - +[](T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ - return std::make_shared((*fPtr)(v1, v2, v3)); \ - } - - #define WRAP_FREE_FUNC_4(fPtr, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return (*fPtr)(self.getSharedPtr(), val); \ - } - - #define WRAP_FREE_FUNC_5(fPtr) \ - +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ - return (*fPtr)(self.getSharedPtr(), other.getSharedPtr()); \ - } - - #define WRAP_FREE_FUNC_6(fPtr, T) \ - +[](const Tensor& self, T val) -> std::shared_ptr { \ - return (*fPtr)(val, self.getSharedPtr()); \ - } - - #define WRAP_FREE_FUNC_7(fPtr) \ - +[](const Tensor& self) -> std::shared_ptr { \ - return (*fPtr)(self.getSharedPtr()); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_1(method) \ - +[](const Tensor& self, int v1) -> ftype { \ - return self.method(static_cast(v1)); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_2(method) \ - +[](const Tensor& self, int v1, int v2) -> ftype { 
\ - return self.method(static_cast(v1), static_cast(v2)); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_3(method) \ - +[](const Tensor& self, int v1, int v2, int v3) -> ftype { \ - return self.method(static_cast(v1), static_cast(v2), \ - static_cast(v3)); \ - } - - #define WRAP_FUNC_AND_CONVERT_DTYPE_4(method) \ - +[](const Tensor& self, int v1, int v2, int v3, int v4) -> ftype { \ - return self.method(static_cast(v1), static_cast(v2), \ - static_cast(v3), static_cast(v4)); \ - } - - // classes - class_("Dimension", no_init) - .add_property("list", &Dimension::getItem) - .def("__str__", &Py_Util::toString) - .def("__eq__", Py_DataModeling::dimEquals1) - .def("__eq__", Py_DataModeling::dimEquals2) - .def("__ne__", Py_DataModeling::nDimEquals1) - .def("__ne__", Py_DataModeling::nDimEquals2) - ; - - enum_("Device") - .value("CPU", Device::CPU) - .value("CUDA", Device::CUDA) - ; - - // register implicit dtype conversion - custom_converters::PyListToVectorConverter(); - custom_converters::PyListToVectorConverter(); - - // to convert std::shared_ptr to std::shared_ptr> in Python - boost::python::register_ptr_to_python< std::shared_ptr >(); - - // we manage via shared_ptr, since we deleted copy-ctor - class_, boost::noncopyable>("Tensor", no_init) - .def(init&, optional >()) - .def(init&, Device, optional >()) - .def(init&, const std::vector&, optional >()) - .def(init&, const std::vector&, Device, optional >()) - - // static creation methods - .def("ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)) - .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)) - .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)) - .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)).staticmethod("ones") - - .def("zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)) - .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)) - .def("zeros", 
WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)) - .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)).staticmethod("zeros") - - .def("gauss", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)) - .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)) - .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)) - .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)).staticmethod("gauss") - - // properties - .add_property("device", &Tensor::getDevice, &Tensor::setDevice) - .add_property("dims", make_function(&Tensor::getDims, return_internal_reference<>())) - .add_property("grads", make_function(&Tensor::getGrads)) - .add_property("requiresGrad", &Tensor::getRequiresGrad, &Tensor::setRequiresGrad) - - // operators - .def("__str__", &Py_Util::toString) - .def("__repr__", &Py_Util::toString) - .def("__len__", &Tensor::getSize) - .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor1, tensorSize_t)) - .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor2, std::vector)) - .def("__setitem__", &Py_DataModeling::tensorSetItem) - - // arithmetics - .def("__matmul__", WRAP_FREE_FUNC_5(Py_DataModeling::matmul)) - .def("__add__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwiseadd)) // elementwise add - .def("__add__", WRAP_FREE_FUNC_4(Py_DataModeling::scalaradd, ftype)) - .def("__radd__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalaradd, ftype)) - - .def("__mul__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwisemul)) // elementwise mult - .def("__mul__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarmul, ftype)) - .def("__rmul__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalarmul, ftype)) - - .def("__sub__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarsub, ftype)) - .def("__truediv__", WRAP_FREE_FUNC_4(Py_DataModeling::scalardiv, ftype)) - - // member functions - .def("getitem", 
WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::getItem)) - .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::getItem)) - .def("getitem", Py_DataModeling::getItemVector) // the vector arg - - .def("sum", WRAP_FREE_FUNC_7(&(graph::sumTensor))) - - .def("reset", Py_DataModeling::reset1) - .def("reset", Py_DataModeling::reset2) - - .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int)) - .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool)) - .def("transposeThis", Py_DataModeling::transposeThis1) - .def("transposeThis", Py_DataModeling::transposeThis2) - - .def("backward", &Tensor::backward) - ; - - // functions - def("Ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)); - def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)); - def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)); - def("Ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)); - - def("Zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)); - def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)); - def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)); - def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)); - - def("Gaussian", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector)); - def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector, Device)); - def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector, const bool)); - def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector, Device, const bool)); -} \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling_util.h b/src/python/data_modeling/py_data_modeling_util.h deleted file mode 100644 index 82a8343..0000000 
--- a/src/python/data_modeling/py_data_modeling_util.h +++ /dev/null @@ -1,135 +0,0 @@ -/** - * @file util.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief Helper and wrapper functions - * @version 0.1 - * @date 2026-02-21 - * - * @copyright Copyright (c) 2026 - * - */ - -#pragma once - -#include "data_modeling/dim_type.h" - -#include "data_modeling/tensor.h" -#include "data_modeling/tensor_functions.h" -#include "computational_graph/graph_creation.h" - -#include -#include - -#include - -namespace Py_DataModeling { - - /********************************************************************************************************* - ********************************************** Dimension ************************************************* - *********************************************************************************************************/ - - inline bool (Dimension::*dimEquals1)(const Dimension&) const = &Dimension::operator==; - inline bool (Dimension::*dimEquals2)(const std::vector&) const = &Dimension::operator==; - - inline bool (Dimension::*nDimEquals1)(const Dimension&) const = &Dimension::operator!=; - inline bool (Dimension::*nDimEquals2)(const std::vector&) const = &Dimension::operator!=; - /********************************************************************************************************* - *********************************************** Tensor *************************************************** - *********************************************************************************************************/ - - ftype tensorGetItem(const Tensor& self, boost::python::object index); - void tensorSetItem(Tensor& self, boost::python::object index, ftype value); - - // need wrappers for default arguments, see - // https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/functions.html - inline auto OnesWrapper0(std::vector dims) { - return TensorFunctions::Ones(std::move(dims)); - } - - inline auto 
OnesWrapper1(std::vector dims, Device d) { - return TensorFunctions::Ones(std::move(dims), d); - } - - inline auto ZerosWrapper0(std::vector dims) { - return TensorFunctions::Zeros(std::move(dims)); - } - - inline auto ZerosWrapper1(std::vector dims, Device d) { - return TensorFunctions::Zeros(std::move(dims), d); - } - - inline auto GaussianWrapper0(std::vector dims) { - return TensorFunctions::Gaussian(std::move(dims)); - } - - inline auto GaussianWrapper1(std::vector dims, Device d) { - return TensorFunctions::Gaussian(std::move(dims), d); - } - - inline Tensor (*Ones0)(std::vector) = &OnesWrapper0; - inline Tensor (*Ones1)(std::vector, Device) = &OnesWrapper1; - inline Tensor (*Ones2)(std::vector, const bool) = &(TensorFunctions::Ones); - inline Tensor (*Ones3)(std::vector, Device, const bool) = &(TensorFunctions::Ones); - - inline Tensor (*Zeros0)(std::vector) = &ZerosWrapper0; - inline Tensor (*Zeros1)(std::vector, Device) = &ZerosWrapper1; - inline Tensor (*Zeros2)(std::vector, const bool) = &(TensorFunctions::Zeros); - inline Tensor (*Zeros3)(std::vector, Device, const bool) = &(TensorFunctions::Zeros); - - inline Tensor (*Gaussian0)(std::vector) = &GaussianWrapper0; - inline Tensor (*Gaussian1)(std::vector, Device) = &GaussianWrapper1; - inline Tensor (*Gaussian2)(std::vector, const bool) = &(TensorFunctions::Gaussian); - inline Tensor (*Gaussian3)(std::vector, Device, const bool) = &(TensorFunctions::Gaussian); - - inline void (Tensor::*reset1)(const ftype) = &Tensor::reset; - inline void (Tensor::*reset2)(const utility::InitClass) = &Tensor::reset; - - inline void (Tensor::*transposeThis1)() = &Tensor::transposeThis; - inline void (Tensor::*transposeThis2)(int, int) = &Tensor::transposeThis; - inline Tensor (Tensor::*transpose1)(int, int) const = &Tensor::transpose; - inline Tensor (Tensor::*transpose2)(int, int, bool) const = &Tensor::transpose; - - inline ftype (Tensor::*getItemVector)(const std::vector&) const = &Tensor::getItem; - - 
/********************************************************************************************************* - ***************************************** Graph creation ************************************************* - *********************************************************************************************************/ - - // multiplications - inline std::shared_ptr (*elementwisemul) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::mul); - - inline std::shared_ptr (*scalarmul) - (const std::shared_ptr, ftype) = &(graph::mul); - - inline std::shared_ptr (*rscalarmul) - (ftype, const std::shared_ptr) = &(graph::mul); - - // additions - inline std::shared_ptr (*elementwiseadd) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::add); - - inline std::shared_ptr (*scalaradd) - (const std::shared_ptr, ftype) = &(graph::add); - - inline std::shared_ptr (*rscalaradd) - (ftype, const std::shared_ptr) = &(graph::add); - - // matmul - inline std::shared_ptr (*matmul) - (const std::shared_ptr left, const std::shared_ptr right) = &(graph::matmul); - - // sub, div - inline std::shared_ptr (*scalarsub) - (const std::shared_ptr, ftype) = &(graph::sub); - - inline std::shared_ptr (*scalardiv) - (const std::shared_ptr, ftype) = &(graph::div); - - // get - inline std::shared_ptr (*getItemAsTensor1) - (const std::shared_ptr& t, tensorSize_t idx) = &(graph::get); - - inline std::shared_ptr (*getItemAsTensor2) - (const std::shared_ptr& t, const std::vector& idx) = &(graph::get); -} \ No newline at end of file diff --git a/src/python/layers/py_layers.cpp b/src/python/layers/py_layers.cpp deleted file mode 100644 index 5fc3613..0000000 --- a/src/python/layers/py_layers.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/** - * @file layers.cpp - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-11-17 - * - * @copyright Copyright (c) 2025 - * - */ - -#include "py_layers.h" - -#include - -using namespace 
boost::python; - -ftype Py_Layers::layerGetItem(const layers::LayerBase& self, boost::python::object index) { - extract int_extractor(index); - - // Single integer index (1D) - if(int_extractor.check()) { - int i0 = int_extractor(); - return self.getItem(i0); - } - - // Tuple index (2D, 3D, or 4D) - extract tuple_extractor(index); - if(tuple_extractor.check()) { - tuple idx_tuple = tuple_extractor(); - int ndim = boost::python::len(idx_tuple); - - if (ndim == 2) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - return self.getItem(i0, i1); - } - else if (ndim == 3) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - return self.getItem(i0, i1, i2); - } - else if (ndim == 4) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - int i3 = extract(idx_tuple[3]); - return self.getItem(i0, i1, i2, i3); - } - else { - PyErr_SetString(PyExc_IndexError, "Unsupported number of dimensions"); - throw_error_already_set(); - } - } - - PyErr_SetString(PyExc_TypeError, "Index must be an integer or tuple"); - throw_error_already_set(); - return 0.0; // Never reached -} - -void Py_Layers::layerSetItem(layers::LayerBase& self, boost::python::object index, ftype value) { - extract int_extractor(index); - - // Single integer index (1D) - if(int_extractor.check()) { - int i0 = int_extractor(); - self.setItem(value, i0);\ - return; - } - - // Tuple index (2D, 3D, or 4D) - extract tuple_extractor(index); - if(tuple_extractor.check()) { - tuple idx_tuple = tuple_extractor(); - int ndim = boost::python::len(idx_tuple); - - if (ndim == 2) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - self.setItem(value, i0, i1); - } - else if (ndim == 3) { - int i0 = extract(idx_tuple[0]); - int i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - self.setItem(value, i0, i1, i2); - } - else if (ndim == 4) { - int i0 = extract(idx_tuple[0]); - int 
i1 = extract(idx_tuple[1]); - int i2 = extract(idx_tuple[2]); - int i3 = extract(idx_tuple[3]); - self.setItem(value, i0, i1, i2, i3); - } - else { - PyErr_SetString(PyExc_IndexError, "Unsupported number of dimensions"); - throw_error_already_set(); - } - return; - } - - PyErr_SetString(PyExc_TypeError, "Index must be an integer or tuple"); - throw_error_already_set(); -} \ No newline at end of file diff --git a/src/python/layers/py_layers.h b/src/python/layers/py_layers.h deleted file mode 100644 index c90929a..0000000 --- a/src/python/layers/py_layers.h +++ /dev/null @@ -1,61 +0,0 @@ -/** - * @file layers.h - * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) - * @brief - * @version 0.1 - * @date 2025-11-17 - * - * @copyright Copyright (c) 2025 - * - */ - -#pragma once - -#include "ff_layer.h" -#include "python_templates.h" - -#include -#include -#include -#include - -namespace Py_Layers { - ftype layerGetItem(const layers::LayerBase& self, boost::python::object index); - void layerSetItem(layers::LayerBase& self, boost::python::object index, ftype value); -} - -BOOST_PYTHON_MODULE(py_layers) -{ - using namespace boost::python; - - /** - * @brief Wrapper class needed for Boost Python to get the virtual function working - * the way it is intended. 
See documentation here: - * https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html - * - */ - struct LayerBaseWrap : layers::LayerBase, wrapper { - Tensor forward(const Tensor& input) const { - return this->get_override("forward")(input); - } - - void print(std::ostream& os) const noexcept { - this->get_override("print")(os); - } - }; - - class_("LayerBase", no_init) - .def("forward", pure_virtual(&layers::LayerBase::forward)) - //.def("backward", &FfLayer::backward) - .def("getDims", &layers::LayerBase::getDims, return_internal_reference<>()) - .def("getTensor", &layers::LayerBase::getDims, return_internal_reference<>()) - .def("__getitem__", &Py_Layers::layerGetItem) - .def("__setitem__", &Py_Layers::layerSetItem) - .def("__str__", &toString) - ; - - class_ >("FfLayer", init()) - .def("forward", &layers::FfLayer::forward) - //.def("backward", &FfLayer::backward) - ; -} \ No newline at end of file diff --git a/src/python/networks/py_sequential.cpp b/src/python/networks/py_sequential.cpp deleted file mode 100644 index e69de29..0000000 diff --git a/src/python/networks/py_sequential.h b/src/python/networks/py_sequential.h deleted file mode 100644 index e69de29..0000000 diff --git a/src/python/py_core/py_core.cpp b/src/python/py_core/py_core.cpp new file mode 100644 index 0000000..846d6e9 --- /dev/null +++ b/src/python/py_core/py_core.cpp @@ -0,0 +1,239 @@ +/** + * @file py_data_modeling.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-02-21 + * + * @copyright Copyright (c) 2026 + * + */ + +#include "data_modeling/tensor.h" + +#include "py_core_util.h" +#include "python_templates.h" +#include "custom_converters.h" + +#include "data_modeling/tensor.h" +#include "data_modeling/tensor_functions.h" +#include "computational_graph/tensor_ops/graph_creation.h" + +#include +#include +#include + +BOOST_PYTHON_MODULE(_core) +{ + using namespace boost::python; + + // some macros to 
make code below easier to read + #define WRAP_TENSOR_METHOD_1(method) \ + +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ + return std::make_shared(self.method(other)); \ + } + + #define WRAP_SCALAR(method, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return std::make_shared(self.method(val)); \ + } + + #define WRAP_SCALAR_REVERSE(op, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return std::make_shared(val op self); \ + } + + // different, since those are not methods anymore + #define WRAP_FREE_MEMBER_FUNC_1(fPtr, T1, T2) \ + +[](const Tensor& self, int v1, int v2) -> std::shared_ptr { \ + return std::make_shared((self.*fPtr)(v1, v2)); \ + } + + #define WRAP_FREE_MEMBER_FUNC_2(fPtr, T1, T2, T3) \ + +[](const Tensor& self, T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ + return std::make_shared((self.*fPtr)(v1, v2, v3)); \ + } + + #define WRAP_FREE_FUNC_1(fPtr, T1) \ + +[](T1 v1) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1)); \ + } + + #define WRAP_FREE_FUNC_2(fPtr, T1, T2) \ + +[](T1 v1, T2 v2) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2)); \ + } + + #define WRAP_FREE_FUNC_3(fPtr, T1, T2, T3) \ + +[](T1 v1, T2 v2, T3 v3) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2, v3)); \ + } + + #define WRAP_FREE_FUNC_4(fPtr, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return (*fPtr)(self.getSharedPtr(), val); \ + } + + #define WRAP_FREE_FUNC_5(fPtr) \ + +[](const Tensor& self, const Tensor& other) -> std::shared_ptr { \ + return (*fPtr)(self.getSharedPtr(), other.getSharedPtr()); \ + } + + #define WRAP_FREE_FUNC_6(fPtr, T) \ + +[](const Tensor& self, T val) -> std::shared_ptr { \ + return (*fPtr)(val, self.getSharedPtr()); \ + } + + #define WRAP_FREE_FUNC_7(fPtr) \ + +[](const Tensor& self) -> std::shared_ptr { \ + return (*fPtr)(self.getSharedPtr()); \ + } + + #define WRAP_FREE_FUNC_8(fPtr, T1, T2, T3, T4) \ + +[](T1 v1, T2 v2, T3 v3, T4 v4) -> 
std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2, v3, v4)); \ + } + + #define WRAP_FREE_FUNC_9(fPtr, T1, T2, T3, T4, T5) \ + +[](T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) -> std::shared_ptr { \ + return std::make_shared((*fPtr)(v1, v2, v3, v4, v5)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_1(method) \ + +[](const Tensor& self, int v1) -> ftype { \ + return self.method(static_cast(v1)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_2(method) \ + +[](const Tensor& self, int v1, int v2) -> ftype { \ + return self.method(static_cast(v1), static_cast(v2)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_3(method) \ + +[](const Tensor& self, int v1, int v2, int v3) -> ftype { \ + return self.method(static_cast(v1), static_cast(v2), \ + static_cast(v3)); \ + } + + #define WRAP_FUNC_AND_CONVERT_DTYPE_4(method) \ + +[](const Tensor& self, int v1, int v2, int v3, int v4) -> ftype { \ + return self.method(static_cast(v1), static_cast(v2), \ + static_cast(v3), static_cast(v4)); \ + } + + // classes + class_("Dimension", no_init) + .add_property("list", &Dimension::get) + .def("__str__", &Py_Util::toString) + .def("__eq__", Py_DataModeling::dimEquals1) + .def("__eq__", Py_DataModeling::dimEquals2) + .def("__ne__", Py_DataModeling::nDimEquals1) + .def("__ne__", Py_DataModeling::nDimEquals2) + ; + + enum_("Device") + .value("CPU", Device::CPU) + .value("CUDA", Device::CUDA) + ; + + // register implicit dtype conversion + custom_converters::PyListToVectorConverter(); + custom_converters::PyListToVectorConverter(); + + // to convert std::shared_ptr to std::shared_ptr> in Python + boost::python::register_ptr_to_python< std::shared_ptr >(); + + // we manage via shared_ptr, since we deleted copy-ctor + class_, boost::noncopyable>("Tensor", no_init) + .def(init&, optional >()) + .def(init&, Device, optional >()) + .def(init&, const std::vector&, optional >()) + .def(init&, const std::vector&, Device, optional >()) + + // static creation methods + .def("ones", 
WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)) + .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)) + .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)) + .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)) + .staticmethod("ones") + + .def("zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)) + .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)) + .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)) + .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)) + .staticmethod("zeros") + + .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian0, std::vector, ftype)) + .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian1, std::vector, Device, ftype)) + .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian2, std::vector, ftype, const bool)) + .def("gauss", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian3, std::vector, Device, ftype, const bool)) + .staticmethod("gauss") + + // properties + .add_property("device", &Tensor::getDevice, &Tensor::setDevice) + .add_property("dims", make_function(&Tensor::getDims, return_internal_reference<>())) + .add_property("grads", make_function(&Tensor::getGrads)) + .add_property("requiresGrad", &Tensor::getRequiresGrad, &Tensor::setRequiresGrad) + + // operators + .def("__str__", &Py_Util::toString) + .def("__repr__", &Py_Util::toString) + .def("__len__", &Tensor::getSize) + .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor1, tensorSize_t)) + .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor2, std::vector)) + .def("__setitem__", &Py_DataModeling::tensorSetItem) + + // arithmetics + .def("__matmul__", WRAP_FREE_FUNC_5(Py_DataModeling::matmul)) + .def("__add__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwiseadd)) // elementwise add + .def("__add__", WRAP_FREE_FUNC_4(Py_DataModeling::scalaradd, ftype)) 
+ .def("__radd__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalaradd, ftype)) + + .def("__mul__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwisemul)) // elementwise mult + .def("__mul__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarmul, ftype)) + .def("__rmul__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalarmul, ftype)) + + .def("__sub__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarsub, ftype)) + .def("__truediv__", WRAP_FREE_FUNC_4(Py_DataModeling::scalardiv, ftype)) + + // member functions + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::get)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::get)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::get)) + .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::get)) + .def("getitem", Py_DataModeling::getItemVector) // the vector arg + + .def("sum", WRAP_FREE_FUNC_7(&(cgraph::sumTensor))) + + .def("reset", Py_DataModeling::reset1) + .def("reset", Py_DataModeling::reset2) + + .def("hasGrads", &Tensor::hasGrads) + .def("hasGrads", +[](const std::shared_ptr& t) -> bool { + return t->hasGrads(); + }) + + .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int)) + .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool)) + .def("transposeThis", Py_DataModeling::transposeThis1) + .def("transposeThis", Py_DataModeling::transposeThis2) + + .def("backward", &Tensor::backward) + ; + + // free functions + def("Ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector)); + def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector, Device)); + def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector, const bool)); + def("Ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector, Device, const bool)); + + def("Zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector)); + def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector, Device)); + def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector, const bool)); + def("Zeros", 
WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector, Device, const bool)); + + def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian0, std::vector, ftype)); + def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian1, std::vector, Device, ftype)); + def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian2, std::vector, ftype, const bool)); + def("Gaussian", WRAP_FREE_FUNC_8(Py_DataModeling::Gaussian3, std::vector, Device, ftype, const bool)); +} \ No newline at end of file diff --git a/src/python/data_modeling/py_data_modeling_util.cpp b/src/python/py_core/py_core_util.cpp similarity index 87% rename from src/python/data_modeling/py_data_modeling_util.cpp rename to src/python/py_core/py_core_util.cpp index d495300..581c99f 100644 --- a/src/python/data_modeling/py_data_modeling_util.cpp +++ b/src/python/py_core/py_core_util.cpp @@ -1,5 +1,5 @@ /** - * @file py_data_modeling_util.cpp + * @file py_core_util.cpp * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) * @brief * @version 0.1 @@ -9,7 +9,7 @@ * */ -#include "py_data_modeling_util.h" +#include "py_core_util.h" #include #include @@ -22,7 +22,7 @@ ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object i // Single integer index (1D) if(int_extractor.check()) { auto i0 = static_cast(int_extractor()); - return self.getItem(i0); + return self.get(i0); } // Tuple index (2D, 3D, or 4D, or list) @@ -32,25 +32,25 @@ ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object i // Dispatch to convenience functions for 1-4 args if (len == 1) { auto i0 = static_cast(extract(index[0])); - return self.getItem(i0); + return self.get(i0); } else if (len == 2) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); - return self.getItem(i0, i1); + return self.get(i0, i1); } else if (len == 3) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); - return 
self.getItem(i0, i1, i2); + return self.get(i0, i1, i2); } else if (len == 4) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); auto i3 = static_cast(extract(index[3])); - return self.getItem(i0, i1, i2, i3); + return self.get(i0, i1, i2, i3); } else { // Arbitrary length - use vector version @@ -58,7 +58,7 @@ ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object i for (int i = 0; i < len; ++i) { indices.push_back(static_cast(extract(index[i]))); } - return self.getItem(std::move(indices)); + return self.get(std::move(indices)); } } @@ -71,7 +71,7 @@ void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, f extract int_extractor(index); if(int_extractor.check()) { auto i0 = static_cast(int_extractor()); - self.setItem(value, i0); + self.set(value, i0); return; } @@ -83,25 +83,25 @@ void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, f // Dispatch to convenience functions for 1-4 args if (len == 1) { auto i0 = static_cast(extract(index[0])); - self.setItem(value, i0); + self.set(value, i0); } else if (len == 2) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); - self.setItem(value, i0, i1); + self.set(value, i0, i1); } else if (len == 3) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); - self.setItem(value, i0, i1, i2); + self.set(value, i0, i1, i2); } else if (len == 4) { auto i0 = static_cast(extract(index[0])); auto i1 = static_cast(extract(index[1])); auto i2 = static_cast(extract(index[2])); auto i3 = static_cast(extract(index[3])); - self.setItem(value, i0, i1, i2, i3); + self.set(value, i0, i1, i2, i3); } else { // Arbitrary length - use vector version @@ -109,7 +109,7 @@ void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, f for (int i = 0; i < len; ++i) { 
indices.push_back(static_cast(extract(index[i]))); } - self.setItem(value, std::move(indices)); + self.set(value, std::move(indices)); } return; } diff --git a/src/python/py_core/py_core_util.h b/src/python/py_core/py_core_util.h new file mode 100644 index 0000000..7aa01d0 --- /dev/null +++ b/src/python/py_core/py_core_util.h @@ -0,0 +1,136 @@ +/** + * @file util.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief Helper and wrapper functions + * @version 0.1 + * @date 2026-02-21 + * + * @copyright Copyright (c) 2026 + * + */ + +#pragma once + +#include "data_modeling/dim_type.h" +#include "utility/initializers.h" + +#include "data_modeling/tensor.h" +#include "data_modeling/tensor_functions.h" +#include "computational_graph/tensor_ops/graph_creation.h" + +#include +#include + +#include + +namespace Py_DataModeling { + + /********************************************************************************************************* + ********************************************** Dimension ************************************************* + *********************************************************************************************************/ + + inline bool (Dimension::*dimEquals1)(const Dimension&) const = &Dimension::operator==; + inline bool (Dimension::*dimEquals2)(const std::vector&) const = &Dimension::operator==; + + inline bool (Dimension::*nDimEquals1)(const Dimension&) const = &Dimension::operator!=; + inline bool (Dimension::*nDimEquals2)(const std::vector&) const = &Dimension::operator!=; + /********************************************************************************************************* + *********************************************** Tensor *************************************************** + *********************************************************************************************************/ + + ftype tensorGetItem(const Tensor& self, boost::python::object index); + void tensorSetItem(Tensor& self, 
boost::python::object index, ftype value); + + // need wrappers for default arguments, see + // https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/functions.html + inline auto OnesWrapper0(std::vector dims) { + return TensorFunctions::Ones(std::move(dims)); + } + + inline auto OnesWrapper1(std::vector dims, Device d) { + return TensorFunctions::Ones(std::move(dims), d); + } + + inline auto ZerosWrapper0(std::vector dims) { + return TensorFunctions::Zeros(std::move(dims)); + } + + inline auto ZerosWrapper1(std::vector dims, Device d) { + return TensorFunctions::Zeros(std::move(dims), d); + } + + inline auto GaussianWrapper0(std::vector dims, ftype stddev) { + return TensorFunctions::Gaussian(std::move(dims), stddev); + } + + inline auto GaussianWrapper1(std::vector dims, Device d, ftype stddev) { + return TensorFunctions::Gaussian(std::move(dims), d, stddev); + } + + inline Tensor (*Ones0)(std::vector) = &OnesWrapper0; + inline Tensor (*Ones1)(std::vector, Device) = &OnesWrapper1; + inline Tensor (*Ones2)(std::vector, const bool) = &(TensorFunctions::Ones); + inline Tensor (*Ones3)(std::vector, Device, const bool) = &(TensorFunctions::Ones); + + inline Tensor (*Zeros0)(std::vector) = &ZerosWrapper0; + inline Tensor (*Zeros1)(std::vector, Device) = &ZerosWrapper1; + inline Tensor (*Zeros2)(std::vector, const bool) = &(TensorFunctions::Zeros); + inline Tensor (*Zeros3)(std::vector, Device, const bool) = &(TensorFunctions::Zeros); + + inline Tensor (*Gaussian0)(std::vector, ftype) = &GaussianWrapper0; + inline Tensor (*Gaussian1)(std::vector, Device, ftype) = &GaussianWrapper1; + inline Tensor (*Gaussian2)(std::vector, ftype, const bool) = &(TensorFunctions::Gaussian); + inline Tensor (*Gaussian3)(std::vector, Device, ftype, const bool) = &(TensorFunctions::Gaussian); + + inline void (Tensor::*reset1)(const ftype) = &Tensor::reset; + inline void (Tensor::*reset2)(const std::shared_ptr) = &Tensor::reset; + + inline void 
(Tensor::*transposeThis1)() = &Tensor::transposeThis; + inline void (Tensor::*transposeThis2)(int, int) = &Tensor::transposeThis; + inline Tensor (Tensor::*transpose1)(int, int) const = &Tensor::transpose; + inline Tensor (Tensor::*transpose2)(int, int, bool) const = &Tensor::transpose; + + inline ftype (Tensor::*getItemVector)(const std::vector&) const = &Tensor::get; + + /********************************************************************************************************* + ***************************************** Graph creation ************************************************* + *********************************************************************************************************/ + + // multiplications + inline std::shared_ptr (*elementwisemul) + (const std::shared_ptr left, const std::shared_ptr right) = &(cgraph::mul); + + inline std::shared_ptr (*scalarmul) + (const std::shared_ptr, ftype) = &(cgraph::mul); + + inline std::shared_ptr (*rscalarmul) + (ftype, const std::shared_ptr) = &(cgraph::mul); + + // additions + inline std::shared_ptr (*elementwiseadd) + (const std::shared_ptr left, const std::shared_ptr right) = &(cgraph::add); + + inline std::shared_ptr (*scalaradd) + (const std::shared_ptr, ftype) = &(cgraph::add); + + inline std::shared_ptr (*rscalaradd) + (ftype, const std::shared_ptr) = &(cgraph::add); + + // matmul + inline std::shared_ptr (*matmul) + (const std::shared_ptr left, const std::shared_ptr right) = &(cgraph::matmul); + + // sub, div + inline std::shared_ptr (*scalarsub) + (const std::shared_ptr, ftype) = &(cgraph::sub); + + inline std::shared_ptr (*scalardiv) + (const std::shared_ptr, ftype) = &(cgraph::div); + + // get + inline std::shared_ptr (*getItemAsTensor1) + (const std::shared_ptr& t, tensorSize_t idx) = &(cgraph::get); + + inline std::shared_ptr (*getItemAsTensor2) + (const std::shared_ptr& t, const std::vector& idx) = &(cgraph::get); +} \ No newline at end of file diff --git a/src/python/py_nn/py_nn.cpp 
b/src/python/py_nn/py_nn.cpp new file mode 100644 index 0000000..5eb175a --- /dev/null +++ b/src/python/py_nn/py_nn.cpp @@ -0,0 +1,89 @@ +/** + * @file layers.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-11-17 + * + * @copyright Copyright (c) 2025 + * + */ + +#include "py_nn_util.h" +#include "python_templates.h" +#include "custom_converters.h" +#include "utility/global_params.h" + +#include + +#include + +BOOST_PYTHON_MODULE(_nn) +{ + using namespace Py_Util; + + using namespace boost::python; + + #define WRAP_METHOD_ONE_TENSORARG(T, method) \ + +[](const T& self, Tensor& t) -> std::shared_ptr { \ + return (self.*method)(t.getSharedPtr()); \ + } + + #define WRAP_METHOD_TWO_TENSORARGS(T, method) \ + +[](const T& self, Tensor& t1, Tensor& t2) -> std::shared_ptr { \ + return (self.*method)(t1.getSharedPtr(), t2.getSharedPtr()); \ + } + + // register vector of shared_ptr converter; needed for ModuleBase::parameters() + class_>>("TensorList") + .def(vector_indexing_suite>>()) + ; + + // convert python list of tensors back to c++ + converter::registry::push_back( + &custom_converters::TensorListFromPython::convertible, + &custom_converters::TensorListFromPython::construct, + type_id>>()); + + // Networks + class_, boost::noncopyable>("_Module", no_init) + // methods + .def("_own_parameters", &module::ModuleBase::parameters) + // operators + .def("forward", pure_virtual(WRAP_METHOD_ONE_TENSORARG(Py_nn::ModuleBaseWrapper, Py_nn::moduleForward))) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("FfLayer", no_init) + // init + .def(init()) + .def(init()) + .def(init()) + .def(init()) + .def(init()) + .def(init()) + // methods + .add_property("dims", make_function(&module::FfLayer::getDims, return_internal_reference<>())) + .add_property("weights", &module::FfLayer::getWeights) + .add_property("bias", &module::FfLayer::getBias) + .add_property("params", &module::ModuleBase::parameters) + // 
operators + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::FfLayer, Py_nn::ffForward)) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("ReLU") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::ReLu, Py_nn::reluF)) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("LeakyReLU", init()) + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::LeakyReLu, Py_nn::leakyReluF)) + .def("__str__", &toString) + ; + + class_, boost::noncopyable>("Softmax") + .def("__call__", WRAP_METHOD_ONE_TENSORARG(module::Softmax, Py_nn::softmaxF)) + .def("__str__", &toString) + ; +} \ No newline at end of file diff --git a/src/python/py_nn/py_nn_util.h b/src/python/py_nn/py_nn_util.h new file mode 100644 index 0000000..766589a --- /dev/null +++ b/src/python/py_nn/py_nn_util.h @@ -0,0 +1,54 @@ +/** + * @file layers.h + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2025-11-17 + * + * @copyright Copyright (c) 2025 + * + */ + +#pragma once + +#include "module/module_base.h" + +#include "module/layers/ff_layer.h" + +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" + +#include +#include +#include +#include + +namespace Py_nn { + using namespace boost::python; + + /** + * @brief Wrapper class needed for Boost Python to get the virtual function working + * the way it is intended. 
See documentation here: + * https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html + * + */ + struct ModuleBaseWrapper : module::ModuleBase, wrapper { + std::shared_ptr operator()(const std::shared_ptr& input) const override { + return this->get_override("forward")(input); + } + + Tensor operator()(const Tensor& input) const override { + std::__throw_runtime_error("This function should never be called from within Python"); + } + }; + + inline std::shared_ptr (ModuleBaseWrapper::*moduleForward)(const std::shared_ptr&) const = &ModuleBaseWrapper::operator(); + + inline std::shared_ptr (module::FfLayer::*ffForward)(const std::shared_ptr&) const = &module::FfLayer::operator(); + + inline std::shared_ptr (module::ReLu::*reluF)(const std::shared_ptr&) const = &module::ReLu::operator(); + inline std::shared_ptr (module::LeakyReLu::*leakyReluF)(const std::shared_ptr&) const = &module::LeakyReLu::operator(); + inline std::shared_ptr (module::Softmax::*softmaxF)(const std::shared_ptr&) const = &module::Softmax::operator(); +} + diff --git a/src/python/py_sys/py_sys.cpp b/src/python/py_sys/py_sys.cpp new file mode 100644 index 0000000..af7d905 --- /dev/null +++ b/src/python/py_sys/py_sys.cpp @@ -0,0 +1,24 @@ +/** + * @file py_sys.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-08 + * + * @copyright Copyright (c) 2026 + * + */ + + +#include "system/sys_functions.h" + +#include + +BOOST_PYTHON_MODULE(_sys) +{ + using namespace boost::python; + + def("setDevice", &sys::setDevice); + def("getDevice", &sys::getDevice); + def("setSeed", &sys::setRandomSeed); +} \ No newline at end of file diff --git a/src/python/py_train/py_train.cpp b/src/python/py_train/py_train.cpp new file mode 100644 index 0000000..9cbb23f --- /dev/null +++ b/src/python/py_train/py_train.cpp @@ -0,0 +1,66 @@ +/** + * @file py_train.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * 
@version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "utility/global_params.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/bce_sigmoid_loss.h" +#include "training/loss_functions/crossentropy_loss.h" +#include "training/loss_functions/crossentropy_softmax_loss.h" + +#include "training/optimizers/sgd.h" +#include "training/optimizers/rmsprop.h" + +#include "training/trainers/base_train_loop.h" + +BOOST_PYTHON_MODULE(_train) +{ + using namespace boost::python; + + // Loss functions + class_, boost::noncopyable>("BCE") + .def("__call__", &train::BceLoss::operator()) + ; + + class_, boost::noncopyable>("BceWithSigmoid") + .def("__call__", &train::BceSigmoidLoss::operator()) + ; + + class_, boost::noncopyable>("CrossEntropy") + .def("__call__", &train::CrossEntropyLoss::operator()) + ; + + class_, boost::noncopyable>("CrossEntropyWithSoftmax") + .def("__call__", &train::CrossEntropySoftmaxLoss::operator()) + ; + + // Optimizers + class_, boost::noncopyable>("SGD", no_init) + .def(init >, ftype>()) + .def("step", &train::SgdOptimizer::step) + .def("zeroGrad", &train::SgdOptimizer::zeroGrad) + ; + + class_, boost::noncopyable>("RmsProp", no_init) + .def(init >, ftype, ftype>()) + .def("step", &train::RmsPropOptimizer::step) + .def("zeroGrad", &train::RmsPropOptimizer::zeroGrad) + ; + + // Trainers + class_, boost::noncopyable>("TrainLoop", no_init) + .def(init&, std::shared_ptr, + std::shared_ptr, size_t, tensorDim_t>()) + .def("run", &train::BaseTrainLoop::run) + ; +} \ No newline at end of file diff --git a/src/python/python_utility/custom_converters.h b/src/python/py_utility/custom_converters.h similarity index 69% rename from src/python/python_utility/custom_converters.h rename to src/python/py_utility/custom_converters.h index 5114d7f..68c1ea7 100644 --- a/src/python/python_utility/custom_converters.h +++ b/src/python/py_utility/custom_converters.h @@ -50,12 +50,70 @@ namespace 
custom_converters { static void* convertible(PyObject* obj_ptr); static void construct(PyObject* obj_ptr,rvalueFromPythonData* data); }; + + /** + * @brief Convert from Python list to std::vector> + */ + struct TensorListFromPython { + using rvalueFromPythonData = boost::python::converter::rvalue_from_python_stage1_data; + + static void* convertible(PyObject* obj); + static void construct(PyObject* obj, rvalueFromPythonData* data); + }; } +// TODO: do array instead of tensor +/* struct DimsFromPython { + static void* convertible(PyObject* obj) { + if (!PyTuple_Check(obj) && !PyList_Check(obj)) return nullptr; + return obj; + } + + static void construct(PyObject* obj, + bp::converter::rvalue_from_python_stage1_data* data) { + void* storage = ((bp::converter::rvalue_from_python_object_data*)data)->storage.bytes; + Dims* dims = new (storage) Dims(); + int len = PySequence_Length(obj); + dims->ndim = len; + for (int i = 0; i < len; i++) + dims->data[i] = bp::extract(PySequence_GetItem(obj, i)); + data->convertible = storage; + } +}; + +// register it in your module init: +bp::converter::registry::push_back( + &DimsFromPython::convertible, + &DimsFromPython::construct, + bp::type_id()); */ + /******************************************************************************************/ /******************************************************************************************/ /******************************************************************************************/ +void* custom_converters::TensorListFromPython::convertible(PyObject* obj) { + using namespace boost::python; + if (!PyList_Check(obj)) return nullptr; + return obj; +} + +void custom_converters::TensorListFromPython::construct(PyObject* obj, rvalueFromPythonData* data) { + using namespace boost::python; + void* storage = ((converter::rvalue_from_python_storage< std::vector> >*)data)->storage.bytes; + //void* storage = ((converter::rvalue_from_python_storage< std::vector >*)data)->storage.bytes; + + new 
(storage) std::vector>(); + auto* vec = reinterpret_cast>*>(storage); + + int len = PyList_Size(obj); + vec->reserve(len); + for (int i = 0; i < len; i++) { + vec->push_back(extract>( + PyList_GetItem(obj, i))); + } + data->convertible = storage; +} + template requires ( std::is_integral_v< T > || std::is_floating_point_v< T >) diff --git a/src/python/python_utility/python_templates.h b/src/python/py_utility/python_templates.h similarity index 52% rename from src/python/python_utility/python_templates.h rename to src/python/py_utility/python_templates.h index 54217d2..e0e625c 100644 --- a/src/python/python_utility/python_templates.h +++ b/src/python/py_utility/python_templates.h @@ -24,15 +24,4 @@ namespace Py_Util { oss << obj; return oss.str(); } - - /** - * @brief Because we manage tensors via shared_ptr, we need this to wrap - * return values when a function/method demands it. - */ - /* template - auto WrapReturnedTensor(Func f) { - return [f](const Tensor& self, auto&&... args) -> std::shared_ptr { - return std::make_shared(f(self, std::forward(args)...)); - }; - } */ } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3b258b9..c8ca76b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -9,7 +9,10 @@ FetchContent_MakeAvailable(googletest) add_executable(unit_tests_backend backend/test_data_modeling.cpp backend/test_computational_graph.cpp -) + backend/test_module.cpp + backend/test_losses.cpp + backend/test_train_loop.cpp + ) target_link_libraries(unit_tests_backend PRIVATE gtest_main # pre-built main, avoids boilerplate if no custom initialization needed diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp index d2a686f..ef07e65 100644 --- a/tests/backend/test_computational_graph.cpp +++ b/tests/backend/test_computational_graph.cpp @@ -14,7 +14,7 @@ #include "data_modeling/tensor.h" #include "data_modeling/tensor_functions.h" -#include "computational_graph/graph_creation.h" +#include 
"computational_graph/tensor_ops/graph_creation.h" #include @@ -22,7 +22,7 @@ TEST(AutogradTest, ThrowsIfNoGradientSet) { auto t1 = TensorFunctions::makeSharedTensor({1}, {3.0}, false); auto t2 = TensorFunctions::makeSharedTensor({1}, {2.0}, false); - auto loss = graph::add(t1, t2); + auto loss = cgraph::add(t1, t2); EXPECT_THROW(loss->backward(), std::runtime_error); } @@ -31,37 +31,67 @@ TEST(AutogradTest, SimpleAddition) { auto t1 = TensorFunctions::makeSharedTensor({1}, {3.0}, true); auto t2 = TensorFunctions::makeSharedTensor({1}, {2.0}, true); - auto t3 = graph::add(t1, t2); - auto loss = graph::mul(t3, t3); + auto t3 = cgraph::add(t1, t2); + auto loss = cgraph::mul(t3, t3); loss->backward(); - EXPECT_NEAR(t1->getGrads()->getItem(0), 10.0, 1e-5); - EXPECT_NEAR(t2->getGrads()->getItem(0), 10.0, 1e-5); + EXPECT_NEAR(t1->getGrads()->get(0), 10.0, 1e-5); + EXPECT_NEAR(t2->getGrads()->get(0), 10.0, 1e-5); +} + +TEST(AutogradTest, BroadcastAdd) { + // gradient of broadcast add w.r.t. bias should be sum over batch dimension + // upstream grad: (2,3) of ones → bias grad should be (3) of twos + auto t1 = TensorFunctions::makeSharedTensor({2, 3}, + {1.0, 2.0, 3.0, + 4.0, 5.0, 6.0}, true); + auto bias = TensorFunctions::makeSharedTensor({3}, + {0.0, 0.0, 0.0}, true); + + auto res = cgraph::add(t1, bias); + + // set upstream grad to ones and backprop + auto upstreamGrad = TensorFunctions::makeSharedTensor({2, 3}, + {1.0, 1.0, 1.0, + 1.0, 1.0, 1.0}, false); + res->backward(); + + // bias grad should be sum over batch: [2, 2, 2] + auto biasGrad = bias->getGrads(); + ASSERT_DOUBLE_EQ((*biasGrad)[0], 2.0); + ASSERT_DOUBLE_EQ((*biasGrad)[1], 2.0); + ASSERT_DOUBLE_EQ((*biasGrad)[2], 2.0); + + // t1 grad should be ones (add is identity for non-broadcast operand) + auto t1Grad = t1->getGrads(); + for(int i = 0; i < 6; i++) { + ASSERT_DOUBLE_EQ((*t1Grad)[i], 1.0); + } } TEST(AutogradTest, ScalarMultiplication) { auto t1 = TensorFunctions::makeSharedTensor({1}, {2.0}, true); auto 
t2 = TensorFunctions::makeSharedTensor({1}, {3.0}, true); - auto t3 = graph::mul(t1, t2); - auto loss = graph::mul(t3, t3); + auto t3 = cgraph::mul(t1, t2); + auto loss = cgraph::mul(t3, t3); loss->backward(); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem(0), 36.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem(0), 24.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get(0), 36.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get(0), 24.0); } TEST(AutogradTest, MatMul) { auto t1 = TensorFunctions::makeSharedTensor({2, 3}, {1, 2, 3, 4, 5, 6}, true); auto t2 = TensorFunctions::makeSharedTensor({3, 2}, {1, 2, 3, 4, 5, 6}, true); - auto t3 = graph::matmul(t1, t2); + auto t3 = cgraph::matmul(t1, t2); auto loss = TensorFunctions::makeSharedTensor({1}, {0.0}, true); for (size_t i = 0; i < t3->getSize(); ++i) { - loss = graph::add(loss, graph::get(t3, i)); + loss = cgraph::add(loss, cgraph::get(t3, i)); } loss->backward(); @@ -70,65 +100,51 @@ TEST(AutogradTest, MatMul) { EXPECT_TRUE(t2->hasGrads()); // dL/dt1 = dloss/dt3 @ t2^t = Ones({2, 2}) @ t2^t - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 0}), 3.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 1}), 7.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 2}), 11.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 0}), 3.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 1}), 7.0); - ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 2}), 11.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({0, 0}), 3.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({0, 1}), 7.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({0, 2}), 11.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({1, 0}), 3.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({1, 1}), 7.0); + ASSERT_DOUBLE_EQ(t1->getGrads()->get({1, 2}), 11.0); // dL/dt2 = t1^t @ dloss/dt3 = t1^t @ Ones({2, 2}) - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({0, 0}), 5.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({0, 1}), 5.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({1, 0}), 7.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({1, 1}), 7.0); - 
ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({2, 0}), 9.0); - ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({2, 1}), 9.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({0, 0}), 5.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({0, 1}), 5.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({1, 0}), 7.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({1, 1}), 7.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({2, 0}), 9.0); + ASSERT_DOUBLE_EQ(t2->getGrads()->get({2, 1}), 9.0); } TEST(AutogradTest, ChainRule) { auto x = TensorFunctions::makeSharedTensor({1}, {2.0}, true); - auto y = graph::mul(x, x); // y = x^2 - auto z = graph::add(x, y); // z = x^2 + x - auto loss = graph::mul(z, z); // loss = (x^2 + x)^2 + auto y = cgraph::mul(x, x); // y = x^2 + auto z = cgraph::add(x, y); // z = x^2 + x + auto loss = cgraph::mul(z, z); // loss = (x^2 + x)^2 loss->backward(); // dloss/dx = 2(x^2 + x) * (2x + 1) // At x=2: 2(4 + 2) * (4 + 1) = 2 * 6 * 5 = 60 - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 60.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 60.0); } TEST(AutogradTest, MultiVariateChainRule) { auto x = TensorFunctions::makeSharedTensor({2}, {1.0, 2.0}, true); - auto y = graph::mul(x, 3.0); // y = [3, 6] + auto y = cgraph::mul(x, 3.0); // y = [3, 6] auto loss = TensorFunctions::makeSharedTensor({1}, {0.0}, true); for(int i=0; igetSize(); i++){ - loss = graph::add(loss, graph::get(y, i)); + loss = cgraph::add(loss, cgraph::get(y, i)); } // loss = 9 loss->backward(); // dloss/dx = scalar = 3 - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 3.0); - ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), 3.0); - - ASSERT_DOUBLE_EQ(y->getGrads()->getItem(0), 1.0); - ASSERT_DOUBLE_EQ(y->getGrads()->getItem(1), 1.0); -} + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 3.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), 3.0); -/* TEST(AutogradTest, ReLU) { - Tensor x({3}, {-1.0, 0.0, 2.0}, true); - - Tensor y = relu(x); // [0, 0, 2] - Tensor loss = sum(y); // loss = 2 - - loss.backward(); - - // Gradient: [0, 0, 1] (only where input > 0) - 
EXPECT_NEAR(t.getGrads()->getItem(0), 0.0, 1e-5); - EXPECT_NEAR(t.getGrads()->getItem(1), 0.0, 1e-5); - EXPECT_NEAR(t.getGrads()->getItem(2), 1.0, 1e-5); -} */ \ No newline at end of file + ASSERT_DOUBLE_EQ(y->getGrads()->get(0), 1.0); + ASSERT_DOUBLE_EQ(y->getGrads()->get(1), 1.0); +} \ No newline at end of file diff --git a/tests/backend/test_data_modeling.cpp b/tests/backend/test_data_modeling.cpp index 3bff63f..69f585b 100644 --- a/tests/backend/test_data_modeling.cpp +++ b/tests/backend/test_data_modeling.cpp @@ -23,10 +23,10 @@ TEST(TensorOpsTest, TestCtor) { ASSERT_EQ(t.getDevice(), Device::CPU); ASSERT_TRUE(!t.getRequiresGrad()); - ASSERT_DOUBLE_EQ(t.getItem(0, 0), 2.0); - ASSERT_DOUBLE_EQ(t.getItem(0, 1), 3.0); - ASSERT_DOUBLE_EQ(t.getItem(1, 0), 4.0); - ASSERT_DOUBLE_EQ(t.getItem(1, 1), 5.0); + ASSERT_DOUBLE_EQ(t.get(0, 0), 2.0); + ASSERT_DOUBLE_EQ(t.get(0, 1), 3.0); + ASSERT_DOUBLE_EQ(t.get(1, 0), 4.0); + ASSERT_DOUBLE_EQ(t.get(1, 1), 5.0); } TEST(TensorOpsTest, ScalarAddWorks) { @@ -35,9 +35,9 @@ TEST(TensorOpsTest, ScalarAddWorks) { auto res = t1 + 1.5; constexpr ftype sum = 2.5; - for(auto i=0; i + +#include "data_modeling/tensor_functions.h" + +#include "training/loss_functions/rmse_loss.h" +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" + +#include + +using namespace train; + +static constexpr ftype kTol = 1e-4f; + +// ─── CrossEntropy ──────────────────────────────────────────────────────────── + +TEST(LossTest, CrossEntropyFoward) { + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, + 0.0, 1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {2, 3}, {0.7, 0.2, 0.1, + 0.1, 0.8, 0.1}, true); + + CrossEntropyLoss loss; + auto result = loss(y, ypred); + + // expected: -( log(0.7) + log(0.8) ) / 2 = 0.2899 + const ftype expected = -(std::log(0.7f) + std::log(0.8f)) / 2.0f; + EXPECT_NEAR((*result)[0], expected, kTol); +} + +TEST(LossTest, 
CrossEntropyPerfectPrediction) { + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, + 0.0, 1.0, 0.0}, false); + + // near-perfect predictions — can't use exactly 1.0 due to log(0) + auto ypred = TensorFunctions::makeSharedTensor( + {2, 3}, {0.999, 0.0005, 0.0005, + 0.0005, 0.999, 0.0005}, true); + + CrossEntropyLoss loss; + auto result = loss(y, ypred); + + // loss should be very small + EXPECT_LT((*result)[0], 0.01f); +} + +TEST(LossTest, CrossEntropyUniformPrediction) { + // uniform prediction should give log(3) ~ 1.0986 + auto y = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0, 0.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0f/3, 1.0f/3, 1.0f/3}, true); + + CrossEntropyLoss loss; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], std::log(3.0f), kTol); +} + +TEST(LossTest, CrossEntropyThrowsOnDimMismatch) { + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, 0.0, 1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2, 2}, {0.5, 0.5, 0.5, 0.5}, true); + + CrossEntropyLoss loss; + EXPECT_THROW(loss(y, ypred), std::invalid_argument); +} + +TEST(LossTest, CrossEntropyBackward) { + // y = [[1,0,0],[0,1,0]], ypred = [[0.7,0.2,0.1],[0.1,0.8,0.1]] + // grad CE w.r.t. 
ypred[b,i] = -y[b,i] / (ypred[b,i] * n) + // grad[0,0] = -1/(0.7*2) = -0.7143 + // grad[0,1] = 0 + // grad[0,2] = 0 + // grad[1,0] = 0 + // grad[1,1] = -1/(0.8*2) = -0.625 + // grad[1,2] = 0 + auto y = TensorFunctions::makeSharedTensor( + {2, 3}, {1.0, 0.0, 0.0, + 0.0, 1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2, 3}, {0.7, 0.2, 0.1, + 0.1, 0.8, 0.1}, true); + + CrossEntropyLoss loss; + auto result = loss(y, ypred); + result->backward(); + + auto grads = ypred->getGrads(); + EXPECT_NEAR((*grads)[0], -0.7143f, kTol); + EXPECT_NEAR((*grads)[1], 0.0f, kTol); + EXPECT_NEAR((*grads)[2], 0.0f, kTol); + EXPECT_NEAR((*grads)[3], 0.0f, kTol); + EXPECT_NEAR((*grads)[4], -0.625f, kTol); + EXPECT_NEAR((*grads)[5], 0.0f, kTol); +} + +// ─── BCE ───────────────────────────────────────────────────────────────────── + +TEST(LossTest, BceForward) { + auto y = TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, 1.0, 1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {4, 1}, {0.1, 0.9, 0.8, 0.2}, true); + + BceLoss loss; + auto result = loss(y, ypred); + + // expected: -( log(0.9) + log(0.9) + log(0.8) + log(0.8) ) / 4 = 0.1643 + const ftype expected = -(std::log(0.9f) + std::log(0.9f) + + std::log(0.8f) + std::log(0.8f)) / 4.0f; + EXPECT_NEAR((*result)[0], expected, kTol); +} + +TEST(LossTest, BcePerfectPrediction) { + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {2, 1}, {0.999, 0.001}, true); + + BceLoss loss; + auto result = loss(y, ypred); + + EXPECT_LT((*result)[0], 0.01f); +} + +TEST(LossTest, BceRandomPrediction) { + // ypred = 0.5 for all -> loss = log(2) ~ 0.6931 + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + + auto ypred = TensorFunctions::makeSharedTensor( + {2, 1}, {0.5, 0.5}, true); + + BceLoss loss; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], std::log(2.0f), kTol); +} + +TEST(LossTest, 
BceThrowsOnDimMismatch) { + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {3, 1}, {0.5, 0.5, 0.5}, true); + + BceLoss loss; + EXPECT_THROW(loss(y, ypred), std::invalid_argument); +} + +TEST(LossTest, BceNoInfOrNanOnNearZeroPred) { + auto y = TensorFunctions::makeSharedTensor( + {1, 1}, {1.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {1, 1}, {0.0}, true); + + BceLoss loss; + auto result = loss(y, ypred); + + // clipping prevents log(0) + EXPECT_FALSE(std::isinf((*result)[0])); +} + +TEST(LossTest, BceBackward) { + // y = [1, 0], ypred = [0.8, 0.3] + // grad BCE w.r.t. ypred_i = (-y/ypred + (1-y)/(1-ypred)) / n + // grad[0] = (-1/0.8 + 0) / 2 = -0.625 + // grad[1] = (0 + 1/0.7) / 2 = 0.7143 + auto y = TensorFunctions::makeSharedTensor( + {2, 1}, {1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2, 1}, {0.8, 0.3}, true); + + BceLoss loss; + auto result = loss(y, ypred); + result->backward(); + + auto grads = ypred->getGrads(); + EXPECT_NEAR((*grads)[0], -0.625f, kTol); + EXPECT_NEAR((*grads)[1], 0.7143f, kTol); +} + +TEST(LossTest, RmseForward) { + // y = [1, 2, 3], ypred = [1.5, 2.5, 2.5] + // diffs = [-0.5, -0.5, 0.5] + // MSE = (0.25 + 0.25 + 0.25) / 3 = 0.25 + // RMSE = 0.5 + auto y = TensorFunctions::makeSharedTensor( + {3}, {1.0, 2.0, 3.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {3}, {1.5, 2.5, 2.5}, true); + + auto loss = RmseLoss{}; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], 0.5f, kTol); +} + +TEST(LossTest, RmsePerfectPrediction) { + auto y = TensorFunctions::makeSharedTensor( + {3}, {1.0, 2.0, 3.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {3}, {1.0, 2.0, 3.0}, true); + + RmseLoss loss; + auto result = loss(y, ypred); + + EXPECT_NEAR((*result)[0], 0.0f, kTol); +} + +TEST(LossTest, RmseBackward) { + // y = [1, 0], ypred = [0.5, 0.5] + // diffs = [0.5, -0.5], MSE = 0.25, RMSE = 0.5 
+ // grad_i = -(y_i - ypred_i) / (n * RMSE) + // grad[0] = -(1 - 0.5) / (2 * 0.5) = -0.5 + // grad[1] = -(0 - 0.5) / (2 * 0.5) = 0.5 + auto y = TensorFunctions::makeSharedTensor( + {2}, {1.0, 0.0}, false); + auto ypred = TensorFunctions::makeSharedTensor( + {2}, {0.5, 0.5}, true); + + RmseLoss loss; + auto result = loss(y, ypred); + result->backward(); + + auto grads = ypred->getGrads(); + EXPECT_NEAR((*grads)[0], -0.5f, kTol); + EXPECT_NEAR((*grads)[1], 0.5f, kTol); +} \ No newline at end of file diff --git a/tests/backend/test_module.cpp b/tests/backend/test_module.cpp new file mode 100644 index 0000000..a74c88c --- /dev/null +++ b/tests/backend/test_module.cpp @@ -0,0 +1,239 @@ +/** + * @file test_module.cpp + * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-09 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "module/layers/ff_layer.h" + +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" +#include "module/activation_functions/sigmoid.h" + +#include "data_modeling/tensor_functions.h" +#include "computational_graph/tensor_ops/graph_creation.h" + +#include + +constexpr ftype delta = 1e-3; + +TEST(ActivationTest, ReluForward) { + auto t1 = TensorFunctions::Ones({3, 2}, false); + auto f = module::ReLu(); + + auto res = f(t1); + + for(size_t i=0; ibackward(); + + // Gradient: [0, 0, 1] (only where input > 0) + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), 0.0); + ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); +} + +TEST(ActivationTest, LeakyReluForward) { + auto t1 = TensorFunctions::Ones({3, 2}, false); + + auto f = module::LeakyReLu(0.3); + auto res = f(t1); + + for(size_t i=0; ibackward(); + + // Gradient: [eps, eps, 1] (leaky slope eps where input <= 0) + ASSERT_DOUBLE_EQ(x->getGrads()->get(0), eps); + ASSERT_DOUBLE_EQ(x->getGrads()->get(1), eps); // by convention + 
ASSERT_DOUBLE_EQ(x->getGrads()->get(2), 1.0); +} + +TEST(ActivationTest, SigmoidForward) { + // sigmoid(0) = 0.5, sigmoid(1) = 0.7311, sigmoid(-1) = 0.2689 + auto t = Tensor({3}, {0.0, 1.0, -1.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + + EXPECT_NEAR(res[0], 0.5, delta); + EXPECT_NEAR(res[1], 0.7311, delta); + EXPECT_NEAR(res[2], 0.2689, delta); +} + +TEST(ActivationTest, SigmoidLargePositive) { + // sigmoid(100) should be ~1, not inf or nan + auto t = Tensor({1}, {100.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + + EXPECT_NEAR(res[0], 1.0, delta); + EXPECT_FALSE(std::isnan(res[0])); + EXPECT_FALSE(std::isinf(res[0])); +} + +TEST(ActivationTest, SigmoidLargeNegative) { + // sigmoid(-100) should be ~0, not nan + auto t = Tensor({1}, {-100.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + + EXPECT_NEAR(res[0], 0.0, delta); + EXPECT_FALSE(std::isnan(res[0])); + EXPECT_FALSE(std::isinf(res[0])); +} + +TEST(AutogradTest, SigmoidBackward) { + // grad of sigmoid = sigmoid(x) * (1 - sigmoid(x)) + // for x=0: grad = 0.5 * 0.5 = 0.25 + // for x=1: grad = 0.7311 * 0.2689 = 0.1966 + auto t = TensorFunctions::makeSharedTensor( + {2}, {0.0, 1.0}, true); + + module::Sigmoid sig; + auto res = sig(t); + res->backward(); + + auto grads = t->getGrads(); + EXPECT_NEAR((*grads)[0], 0.25, delta); + EXPECT_NEAR((*grads)[1], 0.1966, delta); +} + +TEST(ActivationTest, SoftmaxForward) { + // softmax([1, 2, 3]) + // exp([1,2,3]) = [2.7183, 7.3891, 20.0855] + // sum = 30.1929 + // softmax = [0.0900, 0.2447, 0.6652] + auto t = Tensor({1, 3}, {1.0, 2.0, 3.0}, true); + + module::Softmax sm; + auto res = sm(t); + + EXPECT_NEAR(res[0], 0.0900, delta); + EXPECT_NEAR(res[1], 0.2447, delta); + EXPECT_NEAR(res[2], 0.6652, delta); +} + +TEST(ActivationTest, SoftmaxSumsToOne) { + auto t = Tensor({2, 4}, + {1.0, 2.0, 3.0, 4.0, + 2.0, 1.0, 4.0, 3.0}, + true); + + module::Softmax sm; + auto res = sm(t); + + // each row must sum to 1 + ftype row0sum = res[0] + res[1] + 
res[2] + res[3]; + ftype row1sum = res[4] + res[5] + res[6] + res[7]; + EXPECT_NEAR(row0sum, 1.0, delta); + EXPECT_NEAR(row1sum, 1.0, delta); +} + +TEST(ActivationTest, SoftmaxForwardNumericalStability) { + // large values should not produce nan or inf + auto t = Tensor({1, 3}, {100.0, 101.0, 102.0}, true); + + module::Softmax sm; + auto res = sm(t); + + for(int i = 0; i < 3; i++) { + EXPECT_FALSE(std::isnan(res[i])); + EXPECT_FALSE(std::isinf(res[i])); + } + ftype rowsum = res[0] + res[1] + res[2]; + EXPECT_NEAR(rowsum, 1.0, delta); +} + +TEST(AutogradTest, SoftmaxBackward) { + // for softmax with upstream grad of ones, the gradient is zero + // because d/dx_i sum(softmax(x)) = 0 (softmax sums to 1 always) + // more useful: upstream = [1, 0, 0] + // grad[i] = softmax[i] * (upstream[i] - dot(upstream, softmax)) + // for x=[1,2,3], softmax=[0.09, 0.2447, 0.6652] + // dot([1,0,0], softmax) = 0.09 + // grad[0] = 0.09 * (1 - 0.09) = 0.0819 + // grad[1] = 0.2447 * (0 - 0.09) = -0.0220 + // grad[2] = 0.6652 * (0 - 0.09) = -0.0599 + auto t = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0, 2.0, 3.0}, true); + + module::Softmax sm; + auto resPtr = sm(t); + + // set upstream gradient to [1, 0, 0] + auto upstream = TensorFunctions::makeSharedTensor( + {1, 3}, {1.0, 0.0, 0.0}, false); + resPtr->setGrads(upstream); + resPtr->backward(); + + auto grads = t->getGrads(); + EXPECT_NEAR((*grads)[0], 0.0819, delta); + EXPECT_NEAR((*grads)[1], -0.0220, delta); + EXPECT_NEAR((*grads)[2], -0.0599, delta); +} + +TEST(LayerTest, TestFfLayer) { + auto t1 = TensorFunctions::Ones({3, 2}, false); + auto layer = module::FfLayer(2, 1, true, false); + + auto res = layer(t1); + + ASSERT_EQ(res.getDims(), Dimension({3, 1})); +} \ No newline at end of file diff --git a/tests/backend/test_train_loop.cpp b/tests/backend/test_train_loop.cpp new file mode 100644 index 0000000..247b9b0 --- /dev/null +++ b/tests/backend/test_train_loop.cpp @@ -0,0 +1,234 @@ +/** + * @file test_train_loop.cpp + * @author 
Robert Baumgartner (r.baumgartner-1@tudelft.nl) + * @brief + * @version 0.1 + * @date 2026-03-14 + * + * @copyright Copyright (c) 2026 + * + */ + +#include + +#include "module/networks/sequential.h" +#include "module/layers/ff_layer.h" + +#include "module/activation_functions/sigmoid.h" +#include "module/activation_functions/relu.h" +#include "module/activation_functions/leaky_relu.h" +#include "module/activation_functions/softmax.h" + +#include "training/optimizers/sgd.h" +#include "training/optimizers/rmsprop.h" + +#include "training/loss_functions/bce_loss.h" +#include "training/loss_functions/crossentropy_loss.h" +#include "training/loss_functions/bce_sigmoid_loss.h" +#include "training/loss_functions/crossentropy_softmax_loss.h" + +#include "training/trainers/base_train_loop.h" + +#include "data_modeling/tensor_functions.h" + +#include "system/sys_functions.h" + +using namespace std; + +static shared_ptr makeBinaryNet() { + auto net = make_shared(); + + net->append(make_shared(2, 4, true, true)); + + net->append(make_shared(0.01)); + + net->append(make_shared(4, 1, true, true)); + + net->append(make_shared()); + return net; +} + +static shared_ptr makeBinaryNet2() { + auto net = make_shared(); + + net->append(make_shared(2, 4, true, true)); + + net->append(make_shared(0.01)); + + net->append(make_shared(4, 1, true, true)); + + return net; +} + +static shared_ptr makeMulticlassNet() { + auto net = make_shared(); + + net->append(make_shared(2, 8, true, true)); + + net->append(make_shared(0.01)); + + net->append(make_shared(8, 3, true, true)); + + net->append(make_shared()); + return net; +} + +static shared_ptr makeMulticlassNet2() { + auto net = make_shared(); + + net->append(make_shared(2, 8, true, true)); + + net->append(make_shared(0.01)); + + net->append(make_shared(8, 3, true, true)); + + return net; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + sys::setRandomSeed(42); + return RUN_ALL_TESTS(); +} + +TEST(OverfitTest, 
BceSgdOverfitsSmallDataset) { + // XOR-like: 4 samples, 2 features, binary labels + auto x = TensorFunctions::makeSharedTensor( + {4, 2}, {0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0}, false); + + auto y = TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, + 1.0, + 1.0, + 0.0}, false); + + auto net = makeBinaryNet(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.05); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); + + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); + + // forward one more time to get final loss + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit binary dataset\n" + << "Final prediction: " << *pred << "\nFinal loss: " << *finalLoss; +} + +TEST(OverfitTest, BceSgdOverfitsSmallDataset_OptimizedLoss) { + // XOR-like: 4 samples, 2 features, binary labels + auto x = TensorFunctions::makeSharedTensor( + {4, 2}, {0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0}, false); + + auto y = TensorFunctions::makeSharedTensor( + {4, 1}, {0.0, + 1.0, + 1.0, + 0.0}, false); + + auto net = makeBinaryNet2(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.05); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/2000, /*bsize=*/static_cast(4)); + + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); + + // forward one more time to get final loss + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + auto sigmoid = module::Sigmoid(); + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit binary dataset\n" + << "Final prediction: " << sigmoid(*pred) << "\nFinal loss: " << *finalLoss; +} + +TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset) { + // 6 samples, 2 features, 3 classes + auto x = TensorFunctions::makeSharedTensor( + {6, 2}, {1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 
0.5, + 0.4, 0.6}, false); + + // one-hot encoded labels + auto y = TensorFunctions::makeSharedTensor( + {6, 3}, {1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0}, false); + + auto net = makeMulticlassNet(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.0001, /*decay=*/0.95); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/2000, /*bsize=*/6); + + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); + + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit multiclass dataset" + << "Final prediction: " << *pred << "\nFinal loss: " << *finalLoss; +} + +TEST(OverfitTest, CrossEntropyRMSPropOverfitsSmallDataset_OptimizedLoss) { + // 6 samples, 2 features, 3 classes + auto x = TensorFunctions::makeSharedTensor( + {6, 2}, {1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6}, false); + + // one-hot encoded labels + auto y = TensorFunctions::makeSharedTensor( + {6, 3}, {1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0}, false); + + auto net = makeMulticlassNet2(); + auto loss = make_shared(); + auto optim = make_shared( + net->parameters(), /*lr=*/0.0003, /*decay=*/0.95); + + auto trainLoop = train::BaseTrainLoop( + net, loss, optim, /*epochs=*/10000, /*bsize=*/6); + + trainLoop.run(x, y, /*shuffle=*/false, /*verbose=*/false); + + auto pred = (*net)(x); + auto finalLoss = (*loss)(y, pred); + + auto softmax = module::Softmax(); + EXPECT_LT((*finalLoss)[0], 0.05f) + << "Network failed to overfit multiclass dataset" + << "Final prediction: " << softmax(*pred) << "\nFinal loss: " << *finalLoss; +} \ No newline at end of file diff --git a/tests/python/test_training.py b/tests/python/test_training.py new file mode 100644 index 0000000..d974a0f --- /dev/null +++ b/tests/python/test_training.py @@ -0,0 +1,123 @@ +""" 
+Robert Baumgartner, r.baumgartner-1@tudelft.nl +""" + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "python_lib")) +print(sys.path) + +from dl_lib import Tensor +from dl_lib.nn import FfLayer, Sequential +from dl_lib.nn.activation import LeakyReLU +from dl_lib.train.loss import BCE, BceWithSigmoid, CrossEntropyWithSoftmax +from dl_lib.train.optim import SGD, RmsProp + +from dl_lib.sys import setSeed +import pytest + +setSeed(42) + +def train(net, loss_fn, optim, x, y, epochs): + for epoch in range(epochs): + ypred = net.forward(x) + loss = loss_fn(y, ypred) + + loss.backward() + optim.step() + optim.zeroGrad() + + return loss + +def make_binary_net(): + net = Sequential() + net.append(FfLayer(2, 4, True, True)) + net.append(LeakyReLU(0.01)) + net.append(FfLayer(4, 1, True, True)) + return net + +def make_multiclass_net(): + net = Sequential() + net.append(FfLayer(2, 8, True, True)) + net.append(LeakyReLU(0.01)) + net.append(FfLayer(8, 3, True, True)) + return net + +def make_xor_data(): + x = Tensor([4, 2], [0.0, 0.0, + 0.0, 1.0, + 1.0, 0.0, + 1.0, 1.0], False) + y = Tensor([4, 1], [0.0, + 1.0, + 1.0, + 0.0], False) + return x, y + +def make_multiclass_data(): + x = Tensor([6, 2], [1.0, 0.0, + 1.0, 0.1, + 0.0, 1.0, + 0.1, 1.0, + 0.5, 0.5, + 0.4, 0.6], False) + y = Tensor([6, 3], [1.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0], False) + return x, y + +class TestOverfitBinary: + def test_binary_sgd_overfits(self): + x, y = make_xor_data() + net = make_binary_net() + loss_fn = BceWithSigmoid() + optim = SGD(net.parameters(), 0.05) + + final_loss = train(net, loss_fn, optim, x, y, epochs=2000) + + assert final_loss.getitem(0) < 0.05, \ + f"SGD failed to overfit XOR, loss={final_loss.getitem(0)}" + + def test_binary_rmsprop_overfits(self): + x, y = make_xor_data() + net = make_binary_net() + loss_fn = BceWithSigmoid() + optim = RmsProp(net.parameters(), 0.0001, 
0.95) + + final_loss = train(net, loss_fn, optim, x, y, epochs=5000) + + assert final_loss.getitem(0) < 0.05, \ + f"RmsProp failed to overfit XOR, loss={final_loss.getitem(0)}" + + def test_multiclass_rmsprop_overfits(self): + x, y = make_multiclass_data() + net = make_multiclass_net() + loss_fn = CrossEntropyWithSoftmax() + optim = RmsProp(net.parameters(), 0.0003, 0.95) + + final_loss = train(net, loss_fn, optim, x, y, epochs=10000) + + assert final_loss.getitem(0) < 0.05, \ + f"RmsProp failed to overfit multiclass, loss={final_loss.getitem(0)}" + + def test_loss_decreases(self): + """Loss should be strictly lower after training than before""" + x, y = make_xor_data() + net = make_binary_net() + loss_fn = BceWithSigmoid() + + optim = SGD(net.parameters(), 0.001) + initial_pred = net.forward(x) + initial_loss = loss_fn(y, initial_pred).getitem(0) + train(net, loss_fn, optim, x, y, epochs=2000) + + final_pred = net.forward(x) + final_loss = loss_fn(y, final_pred).getitem(0) + + assert final_loss < initial_loss, \ + f"Loss did not decrease: {initial_loss} -> {final_loss}" + \ No newline at end of file