diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 32e18f5..add2b6f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,8 +6,8 @@ on:
     branches: [main]
 
 jobs:
-  build:
-    name: Build project
+  test-debug:
+    name: Build project and test in debug mode
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -20,13 +20,16 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Setup Python 3.10
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: '3.10'
 
+      - name: Install pytest
+        run: pip install pytest
+
       - name: Install Boost Python and Python dev headers
         run: |
-          sudo apt-get install -y \
+          sudo apt-get update && sudo apt-get install -y \
             libboost-python-dev \
             python3-dev
 
@@ -49,37 +52,78 @@ jobs:
 
       - name: Build in Debug Mode
         run: |
-          mkdir build && cd build && cmake -DCMake_Build_Type=Debug ..
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTS=ON ..  # variable names are case-sensitive
           make
 
+      - name: Run Unit tests
+        run: cd build && ctest --output-on-failure --verbose
+
+      - name: Upload artifacts
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: debug-build-output
+          path: build/
+          retention-days: 2
+
+  test-release:
+    name: Build project and test in release mode
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        gcc-version: [13]
+        cmake-version: ['3.31.3']
+      fail-fast: true
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python 3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install pytest
+        run: pip install pytest
+
+      - name: Install Boost Python and Python dev headers
+        run: |
+          sudo apt-get update && sudo apt-get install -y \
+            libboost-python-dev \
+            python3-dev
+
+      - name: Setup CMake
+        uses: jwlawson/actions-setup-cmake@v2
+        with:
+          cmake-version: ${{ matrix.cmake-version }}
+
+      - name: Setup GCC
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gcc-${{ matrix.gcc-version }} g++-${{ matrix.gcc-version }}
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-${{ matrix.gcc-version }} 100
+          sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.gcc-version }} 100
+
+      - name: Verify versions
+        run: |
+          gcc --version
+          cmake --version
+
       - name: Build in Release Mode
         run: |
-          cd build
-          echo "Deleting the debug build files in $(pwd)/build" 
-          rm -rf *
-          cmake -DCMake_Build_Type=Release ..
+          mkdir build && cd build
+          cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=ON ..  # variable names are case-sensitive
           make
 
+      - name: Run Unit tests
+        run: cd build && ctest --output-on-failure --verbose
+
       - name: Upload artifacts
+        if: failure()
         uses: actions/upload-artifact@v4
         with:
-          name: build-output
+          name: release-build-output
           path: build/
-          retention-days: 1
-
-  # example of artifacts
-  #    - name: Upload artifacts
-  #      uses: actions/upload-artifact@v4
-  #      with:
-  #        name: build-${{ matrix.gcc-version }}-${{ github.run_id }}
-  #        path: output/
-  #
-  #test:
-  #  needs: build  # Runs after build job
-  #  runs-on: ubuntu-latest
-  #  steps:
-  #    - name: Download artifact
-  #      uses: actions/download-artifact@v4
-  #      with:
-  #        name: build-output
-  #        path: output/
\ No newline at end of file
+          retention-days: 2
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index a0e8554..ef9f13c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 build
 .vscode
 unit_tests_backend
-*.txt
\ No newline at end of file
+*.txt
+python_lib/dl_lib/_compiled
+*__pycache__*
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 620f07b..b09189f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,11 +5,10 @@ project(dllib VERSION 1.0.0 LANGUAGES CXX)
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR})
-set(LIBRARY_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR})
 
 # to link shared libs. TODO: can we get rid of this, as it may cost runtime?
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(PYTHON_MODULE_DIR "${CMAKE_SOURCE_DIR}/python_lib/dl_lib/_compiled")
 
 # to enable find boost, see https://stackoverflow.com/a/79147222
 if(POLICY CMP0167)
@@ -104,10 +103,9 @@ include_directories("${PROJECT_SOURCE_DIR}/src"
                     "${PROJECT_SOURCE_DIR}/examples")
 
 add_subdirectory(src)
-add_subdirectory(examples)
 
-option(BUILD_TESTING "Build tests" ON)
-if(BUILD_TESTING)
+option(BUILD_TESTS "Build tests" OFF)
+if(BUILD_TESTS)
     enable_testing()
     add_subdirectory(tests)
 endif()
\ No newline at end of file
diff --git a/python_lib/dl_lib/__init__.py b/python_lib/dl_lib/__init__.py
new file mode 100644
index 0000000..9817df9
--- /dev/null
+++ b/python_lib/dl_lib/__init__.py
@@ -0,0 +1,3 @@
+from ._compiled._core import Tensor, Dimension, Device, Ones, Zeros, Gaussian
+
+__all__ = ['Tensor', 'Device', 'Dimension']
\ No newline at end of file
diff --git a/python_lib/dl_lib/nn/__init__.py b/python_lib/dl_lib/nn/__init__.py
new file mode 100644
index 0000000..75fefbc
--- /dev/null
+++ b/python_lib/dl_lib/nn/__init__.py
@@ -0,0 +1,4 @@
+#from .._compiled._layers import FfLayer, ReLU
+#from .._compiled._core import Tensor  # re-export if needed
+
+#__all__ = ['FfLayer', 'ReLU']
\ No newline at end of file
diff --git a/readme.md b/readme.md
index f2d3688..bba8bee 100644
--- a/readme.md
+++ b/readme.md
@@ -2,6 +2,10 @@
 
 A from-scratch deep learning framework in modern C++ with Python bindings.
 
+## Motivation
+
+Built to understand deep learning frameworks from first principles - from computational graphs to gradient computation to optimization algorithms.
+
 ## Features
 
 - **Computational Graph**: Dynamic graph construction with automatic differentiation
@@ -38,15 +42,16 @@ Roadmap:
 mkdir build && cd build
 cmake ..
 make
-./run_tests
+ctest
 ```
 
 ## Required
 
-- Python 3 (we test with 3.10, but it should work with any version)
+- Compiler supporting at least C++20 (we test with gcc 12.3.0)
 - Boost Python
 - Cmake > 3.24
-- Compiler capable of C++20 at least (we test with gcc 12.3.0)
+- Python 3 (we test with 3.10, but it should work with any version)
+- pytest for unit tests (we use 9.0.2)
 
 ## Troubleshooting
 
@@ -55,10 +60,6 @@ make
 
 The implementation of the Python wrapper does not work on MSVC6/7 in its current form. This is due to an issue that arises from Boost Python in combination with these compilers. Workarounds are proposed, but not implemented. More information here [here](https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/exposing.html).
 
-## Motivation
-
-Built to understand deep learning frameworks from first principles - from computational graphs to gradient computation to optimization algorithms.
-
 ## License
 
 MIT
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 15b916c..0d564e2 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,12 +2,12 @@
 add_subdirectory(backend)
 add_subdirectory(python)
 
-target_link_libraries(py_data_modeling 
+target_link_libraries(_core 
                       ${Boost_LIBRARIES} 
                       ${PYTHON_LIBRARIES} 
                       BackendCore)
 
-target_include_directories(py_data_modeling PRIVATE 
+target_include_directories(_core PRIVATE 
                            ${PYTHON_INCLUDE_DIRS} 
                            ${Boost_INCLUDE_DIRS})
 
diff --git a/src/backend/computational_graph/getter_node.cpp b/src/backend/computational_graph/getter_node.cpp
index 7cc86a4..e1a3ac0 100644
--- a/src/backend/computational_graph/getter_node.cpp
+++ b/src/backend/computational_graph/getter_node.cpp
@@ -15,6 +15,23 @@ using namespace std;
 using namespace graph;
 
 vector< shared_ptr<Tensor> > GetterNode::backward(const Tensor& upstreamGrad) {
-  assert(!upstreamGrad.getRequiresGrad());
-  return { make_shared<Tensor>(upstreamGrad.createDeepCopy()) };
+  // upstreamGrad is scalar by definition
+  assert(!upstreamGrad.getRequiresGrad() && upstreamGrad.getDims().nDims()==1);
+
+  auto res = make_shared<Tensor>(parents[0]->getDims(), parents[0]->getDevice(), false);
+  for(tensorSize_t i=0; i<res->getSize(); i++){
+    res->setItem(0, i);
+  }
+
+  if(std::holds_alternative<tensorSize_t>(idx)){
+    res->setItem(upstreamGrad.getItem(0), std::get<tensorSize_t>(idx));
+  }
+  else if(std::holds_alternative<multiDimIdx_t>(idx)){
+    res->setItem(upstreamGrad.getItem(0), std::get<multiDimIdx_t>(idx));
+  }
+  else{
+    __throw_runtime_error("Idx variant in unexpected state");
+  }
+
+  return { std::move(res) };
 }
\ No newline at end of file
diff --git a/src/backend/computational_graph/getter_node.h b/src/backend/computational_graph/getter_node.h
index 78777e0..e55b2d5 100644
--- a/src/backend/computational_graph/getter_node.h
+++ b/src/backend/computational_graph/getter_node.h
@@ -13,6 +13,9 @@
 
 #include "graph_node.h"
 
+#include <vector>
+#include <variant>
+
 namespace graph{
   /**
    * @brief When calling a get function, say as in 
@@ -21,9 +24,17 @@ namespace graph{
    * 
    */
   class GetterNode final : public GraphNode {
+    using multiDimIdx_t = std::vector<tensorDim_t>;
+
+    private:
+      const std::variant<tensorSize_t, multiDimIdx_t> idx;
+
     public:
-      explicit GetterNode(std::shared_ptr<Tensor> t) 
-        : GraphNode({std::move(t)}) {}
+      explicit GetterNode(std::shared_ptr<Tensor> t, const tensorSize_t idx) 
+        : GraphNode({std::move(t)}), idx{idx} {}
+
+      explicit GetterNode(std::shared_ptr<Tensor> t, const multiDimIdx_t& idx) 
+        : GraphNode({std::move(t)}), idx{idx} {}
 
       GetterNode(const GetterNode& other) = delete;
       GetterNode& operator=(const GetterNode& other) = delete;
diff --git a/src/backend/computational_graph/graph_creation.cpp b/src/backend/computational_graph/graph_creation.cpp
index fd701d2..1955493 100644
--- a/src/backend/computational_graph/graph_creation.cpp
+++ b/src/backend/computational_graph/graph_creation.cpp
@@ -103,7 +103,7 @@ shared_ptr<Tensor> graph::get(const shared_ptr<Tensor>& t, tensorSize_t idx) {
                              t->getDevice());
                              
   if(t->getRequiresGrad()){
-    res->setCgNode(std::make_shared<graph::GetterNode>(t));
+    res->setCgNode(std::make_shared<graph::GetterNode>(t, idx));
     assert(res->getRequiresGrad());
   }
   return res;
@@ -115,13 +115,25 @@ shared_ptr<Tensor> graph::get(const shared_ptr<Tensor>& t, tensorSize_t idx) {
  * 
  * loss = loss + other.get(i), we need to make sure get(i) can map to computational graph.
  */
-shared_ptr<Tensor> graph::get(const shared_ptr<Tensor>& t, vector<tensorDim_t>&& idx) {
+shared_ptr<Tensor> graph::get(const shared_ptr<Tensor>& t, const vector<tensorDim_t>& idx) {
   ftype val = t->getItem(std::move(idx));
   auto res = make_shared<Tensor>(std::vector<tensorDim_t>{1}, std::vector<ftype>{val}, 
                              t->getDevice());
   if(t->getRequiresGrad()){
-    res->setCgNode(std::make_shared<graph::GetterNode>(t));
+    res->setCgNode(std::make_shared<graph::GetterNode>(t, idx));
     assert(res->getRequiresGrad());
   }
   return res;
+}
+
+/**
+ * @brief Sums all elements of the tensor and returns the result as a tensor with dims {1}.
+ */
+shared_ptr<Tensor> graph::sumTensor(const shared_ptr<Tensor> t) {
+  auto res = make_shared<Tensor>(std::vector<tensorDim_t>{1}, std::vector<ftype>{0.0}, 
+                                 t->getDevice(), t->getRequiresGrad());
+  for(tensorSize_t i=0; i<t->getSize(); i++){
+    res = graph::add(res, graph::get(t, i));
+  }
+  return res;
 }
\ No newline at end of file
diff --git a/src/backend/computational_graph/graph_creation.h b/src/backend/computational_graph/graph_creation.h
index f1e972c..f68cb4c 100644
--- a/src/backend/computational_graph/graph_creation.h
+++ b/src/backend/computational_graph/graph_creation.h
@@ -16,22 +16,25 @@
 #include <memory>
 
 namespace graph {
+  // Arithmetic operations
   std::shared_ptr<Tensor> mul(const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right);
-
-  std::shared_ptr<Tensor> add(const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right);
-
-  std::shared_ptr<Tensor> matmul(const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right);
-
   std::shared_ptr<Tensor> mul(const std::shared_ptr<Tensor> left, ftype scalar); 
   std::shared_ptr<Tensor> mul(ftype scalar, const std::shared_ptr<Tensor> left); 
 
+  std::shared_ptr<Tensor> add(const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right);
   std::shared_ptr<Tensor> add(const std::shared_ptr<Tensor> left, ftype scalar);    
   std::shared_ptr<Tensor> add(ftype scalar, const std::shared_ptr<Tensor> left);
 
+  std::shared_ptr<Tensor> matmul(const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right);
+
   std::shared_ptr<Tensor> sub(const std::shared_ptr<Tensor> left, ftype scalar);
   std::shared_ptr<Tensor> div(const std::shared_ptr<Tensor> left, ftype scalar);
 
+  // Getter methods
   std::shared_ptr<Tensor> get(const std::shared_ptr<Tensor>& t, tensorSize_t idx);
-  std::shared_ptr<Tensor> get(const std::shared_ptr<Tensor>& t, std::vector<tensorDim_t>&& idx);
+  std::shared_ptr<Tensor> get(const std::shared_ptr<Tensor>& t, const std::vector<tensorDim_t>& idx);
+
+  // Composite operations
+  std::shared_ptr<Tensor> sumTensor(const std::shared_ptr<Tensor> t);
 }
  
\ No newline at end of file
diff --git a/src/backend/computational_graph/topological_sort.cpp b/src/backend/computational_graph/topological_sort.cpp
index 60fcf3d..d6250ca 100644
--- a/src/backend/computational_graph/topological_sort.cpp
+++ b/src/backend/computational_graph/topological_sort.cpp
@@ -36,10 +36,11 @@ bool TopologicalSort::hasCycles(const Tensor* root) {
     assert(start->cgNode);
 
     stack<const Tensor*> tStack;
+    unordered_set<const Tensor*> visited;
 
-    auto pushParentsOnStack = [&tStack](const Tensor* t){
+    auto pushParentsOnStack = [&tStack, &visited](const Tensor* t){
       for(auto parent: t->cgNode->getParents()){
-        if(parent->cgNode){
+        if(parent->cgNode && !visited.contains(parent.get())){
           tStack.push(parent.get());
         }
       }
@@ -57,6 +58,7 @@ bool TopologicalSort::hasCycles(const Tensor* root) {
       tStack.pop();
       
       pushParentsOnStack(t);
+      visited.insert(t);
     }
 
     return false;
diff --git a/src/backend/data_modeling/dim_type.h b/src/backend/data_modeling/dim_type.h
index 1d0aa77..babdc96 100644
--- a/src/backend/data_modeling/dim_type.h
+++ b/src/backend/data_modeling/dim_type.h
@@ -79,9 +79,18 @@ class Dimension final {
       return this->dims == other.dims;
     }
 
+    bool operator==(const std::vector<tensorDim_t>& other) const {
+      assert(size!=0);
+      return this->dims == other;
+    }
+
     bool operator!=(const Dimension& other) const {
       return !(*this == other);
     }
 
+    bool operator!=(const std::vector<tensorDim_t>& other) const {
+      return !(*this == other);
+    }
+
     friend std::ostream& operator<<(std::ostream& os, const Dimension& d) noexcept;
 };
\ No newline at end of file
diff --git a/src/backend/data_modeling/tensor.cpp b/src/backend/data_modeling/tensor.cpp
index 10b6813..04a94aa 100644
--- a/src/backend/data_modeling/tensor.cpp
+++ b/src/backend/data_modeling/tensor.cpp
@@ -529,7 +529,7 @@ void Tensor::backward() {
 /**
  * @brief Get gradients
  */
-const shared_ptr<Tensor>& Tensor::getGrads() const {
+shared_ptr<const Tensor> Tensor::getGrads() const {
   if(!grads){
     __throw_runtime_error("Tensor has no gradients.");
   }
@@ -540,9 +540,7 @@ const shared_ptr<Tensor>& Tensor::getGrads() const {
  * @brief Sometimes we do accept negative dim-values. In accordance with e.g. 
  * NumPy we map from the end to the beginning in that case. 
  */
-tensorDim_t Tensor::mapDim(const int dim, optional<const Dimension> dimOpt) const {
-  const auto& dims = dimOpt ? dimOpt.value() : this->dims;
-
+tensorDim_t Tensor::mapDim(const int dim, const Dimension& dims) {
   if(dim>=0){
     return dim;
   }
@@ -642,8 +640,8 @@ void Tensor::transposeImpl2D(Tensor& target, const int dim1, const int dim2) con
   const auto smallDim = dim1Mapped < dim2Mapped ? dim2Mapped : dim1Mapped;
 
   // largeDimSize >= smallDimSize
-  const auto largeDimSize = getTotalDimSize(largeDim);
-  const auto smallDimSize = getTotalDimSize(smallDim);
+  const auto largeDimOffset = getDimOffset(largeDim, dims);
+  const auto smallDimOffset = getDimOffset(smallDim, dims);
 
   auto transposedValues = make_unique<tensorValues_t>(source.values->getDevice());
   transposedValues->resize(source.values->getSize());
@@ -651,9 +649,9 @@ void Tensor::transposeImpl2D(Tensor& target, const int dim1, const int dim2) con
   tensorSize_t resIdx = 0;
   for(tensorSize_t smallDimCount=0; smallDimCount<source.dims.getItem(smallDim); smallDimCount++){
     for(tensorSize_t largeDimCount=0; largeDimCount<source.dims.getItem(largeDim); largeDimCount++){
-      tensorSize_t offset = largeDimCount * largeDimSize + smallDimCount * smallDimSize;
+      tensorSize_t offset = largeDimCount * largeDimOffset + smallDimCount * smallDimOffset;
 
-      for(tensorSize_t smallDimIdx=0; smallDimIdx<smallDimSize; smallDimIdx++){
+      for(tensorSize_t smallDimIdx=0; smallDimIdx<smallDimOffset; smallDimIdx++){
         (*transposedValues)[resIdx] = (*source.values)[offset];
         resIdx++;
         offset++;
@@ -829,8 +827,8 @@ ostream& operator<<(ostream& os, const Tensor& t) noexcept {
  * 
  * WARNING: Does not check for overflow.
  */
-tensorSize_t Tensor::computeIdx(const std::vector<tensorDim_t>&& idx) const {
-  return computeIdx(idx);
+tensorSize_t Tensor::computeLinearIdx(const std::vector<tensorDim_t>&& idx, const Dimension& dims) {
+  return computeLinearIdx(idx, dims);
 }
 
 /**
@@ -838,7 +836,7 @@ tensorSize_t Tensor::computeIdx(const std::vector<tensorDim_t>&& idx) const {
  * 
  * WARNING: Does not check for overflow.
  */
-tensorSize_t Tensor::computeIdx(const std::vector<tensorDim_t>& idx) const {
+tensorSize_t Tensor::computeLinearIdx(const std::vector<tensorDim_t>& idx, const Dimension& dims) {
   if(idx.size()!=dims.nDims()) {
     __throw_invalid_argument("Number of idxs must match number of dimensions.");
   }
@@ -862,7 +860,7 @@ tensorSize_t Tensor::computeIdx(const std::vector<tensorDim_t>& idx) const {
  * @brief Gets the total size of a dimension. E.g. if dims=(2, 3, 4),
  * the offset of dim1 is 3*4==12, and that of dim0 is 2*3*4==24.
  */
-tensorSize_t Tensor::getTotalDimSize(const tensorDim_t dim) const {
+tensorSize_t Tensor::getDimOffset(const tensorDim_t dim, const Dimension& dims) {
   tensorSize_t res = 1; // minimum possible dimsize
 
   for(size_t idx = dims.nDims()-1; idx>dim; idx--){
@@ -876,26 +874,22 @@ tensorSize_t Tensor::getTotalDimSize(const tensorDim_t dim) const {
 /**
  * @brief Like overload, but accepts negative dims.
  */
-tensorSize_t Tensor::getTotalDimSize(const int dim) const {
-  return getTotalDimSize(mapDim(dim));
+tensorSize_t Tensor::getDimOffset(const int dim, const Dimension& dims) {
+  return getDimOffset(mapDim(dim, dims), dims);
 }
 
 /**
  * @brief No explanation needed.
  */
-ftype Tensor::getItem(const std::vector<tensorDim_t>&& idx) const {
-  return (*values)[computeIdx(idx)]; 
-}
-
-Tensor Tensor::getAsTensor(const std::vector<tensorDim_t>&& idx) const {
-  return Tensor({1}, {(*values)[computeIdx(idx)]}, values->getDevice(), requiresGrad); 
+ftype Tensor::getItem(const std::vector<tensorDim_t>& idx) const {
+  return (*values)[computeLinearIdx(idx, dims)]; 
 }
 
 /**
  * @brief Special getter, indexes the contained underlying array linearly.
  * Can lead to unexpected results in multidimensional tensors.
  */
-ftype Tensor::getItem(tensorDim_t idx) const {
+ftype Tensor::getItem(tensorSize_t idx) const {
   return (*values)[idx];
 }
 
@@ -914,8 +908,8 @@ ftype Tensor::getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tens
 /**
  * @brief No explanation needed.
  */
-void Tensor::setItem(ftype item, const std::vector<tensorDim_t>&& idx) {
-  (*values)[computeIdx(idx)] = item;
+void Tensor::setItem(ftype item, const std::vector<tensorDim_t>& idx) {
+  (*values)[computeLinearIdx(idx, dims)] = item;
 }
 
 /**
diff --git a/src/backend/data_modeling/tensor.h b/src/backend/data_modeling/tensor.h
index 4f02101..9c1384b 100644
--- a/src/backend/data_modeling/tensor.h
+++ b/src/backend/data_modeling/tensor.h
@@ -49,7 +49,7 @@ constexpr const char* DeviceToString(Device d) {
     return ""; // suppress
 }
 
-class Tensor final {
+class Tensor final : public std::enable_shared_from_this<Tensor> {
     friend class graph::TopologicalSort;
 
     private:
@@ -132,11 +132,12 @@ class Tensor final {
         void transposeImpl(Tensor& target, const int dim1, const int dim2) const noexcept;
 
         // convenience functions that appear in multiple places
-        tensorSize_t computeIdx(const std::vector<tensorDim_t>&& idx) const;
-        tensorSize_t computeIdx(const std::vector<tensorDim_t>& idx) const;
-        tensorSize_t getTotalDimSize(const tensorDim_t dim) const;
-        tensorSize_t getTotalDimSize(const int dim) const;
-        tensorDim_t mapDim(const int dim, std::optional<const Dimension> dimsOpt=std::nullopt) const;
+        static tensorSize_t computeLinearIdx(const std::vector<tensorDim_t>&& idx, const Dimension& dims);
+        static tensorSize_t computeLinearIdx(const std::vector<tensorDim_t>& idx, const Dimension& dims);
+
+        static tensorSize_t getDimOffset(const tensorDim_t dim, const Dimension& dims);
+        static tensorSize_t getDimOffset(const int dim, const Dimension& dims);
+        static tensorDim_t mapDim(const int dim, const Dimension& dims);
 
         friend void printValuesCpu(std::ostream& os, const Tensor& t);
 
@@ -157,11 +158,11 @@ class Tensor final {
             values->resize(this->dims.getSize());
         }
 
-        explicit Tensor(const std::vector<tensorDim_t>& dims, std::vector<ftype>&& initValues, bool requiresGrad=false) :
+        explicit Tensor(const std::vector<tensorDim_t>& dims, const std::vector<ftype>& initValues, bool requiresGrad=false) :
             Tensor{dims, std::move(initValues), Tensor::getDefaultDevice(), requiresGrad} {
             }
 
-        explicit Tensor(const std::vector<tensorDim_t>& dims, std::vector<ftype>&& initValues, Device d, bool requiresGrad=false) :
+        explicit Tensor(const std::vector<tensorDim_t>& dims, const std::vector<ftype>& initValues, Device d, bool requiresGrad=false) :
             Tensor{dims, d, requiresGrad} {   
             for(tensorSize_t i=0; i<initValues.size(); i++){
                 values->setItem(initValues[i], i);
@@ -218,7 +219,7 @@ class Tensor final {
         void backward();
 
         bool hasGrads() const noexcept { return grads!=nullptr; }
-        const std::shared_ptr<Tensor>& getGrads() const;
+        std::shared_ptr<const Tensor> getGrads() const;
 
         void transposeThis() noexcept;
         void transposeThis(int dim1, int dim2) noexcept;
@@ -231,21 +232,19 @@ class Tensor final {
         friend std::ostream& operator<<(std::ostream& os, const Tensor& t) noexcept;
 
         // for convenience we provide some simple getters
-        ftype getItem(tensorDim_t idx) const;
+        ftype getItem(tensorSize_t idx) const;
         ftype getItem(tensorDim_t idx0, tensorDim_t idx1) const;
         ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2) const;
         ftype getItem(tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3) const;
 
-        ftype getItem(const std::vector<tensorDim_t>&& idx) const;
-
-        Tensor getAsTensor(const std::vector<tensorDim_t>&& idx) const;
+        ftype getItem(const std::vector<tensorDim_t>& idx) const;
 
         // for convenience we provide some simple setters
         void setItem(ftype item, tensorDim_t idx);
         void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1);
         void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2);
         void setItem(ftype item, tensorDim_t idx0, tensorDim_t idx1, tensorDim_t idx2, tensorDim_t idx3);
-        void setItem(ftype item, const std::vector<tensorDim_t>&& idx);
+        void setItem(ftype item, const std::vector<tensorDim_t>& idx);
 
         void setDevice(const Device d) noexcept;
         Device getDevice() const noexcept;
@@ -266,6 +265,17 @@ class Tensor final {
             requiresGrad = true; 
         }
 
+        std::shared_ptr<Tensor> getSharedPtr() const {
+            try {
+                return std::const_pointer_cast<Tensor>(shared_from_this());
+            } 
+            catch (const std::bad_weak_ptr&) {
+                throw std::runtime_error(
+                    "Tensor must be managed by shared_ptr for autograd operations"
+                );
+            }        
+        }
+
         // these two should not be exposed to the python interface
         static void setDefaultDevice(const Device d) noexcept;
         static Device getDefaultDevice() noexcept;
diff --git a/src/backend/data_modeling/tensor_functions.cpp b/src/backend/data_modeling/tensor_functions.cpp
index 6bfae2d..3ac032e 100644
--- a/src/backend/data_modeling/tensor_functions.cpp
+++ b/src/backend/data_modeling/tensor_functions.cpp
@@ -65,14 +65,14 @@ shared_ptr<Tensor> TensorFunctions::makeSharedTensor(const vector<tensorDim_t>&
 }
 
 shared_ptr<Tensor> TensorFunctions::makeSharedTensor(const vector<tensorDim_t>& dims, 
-                                         vector<ftype>&& initValues, 
+                                         const vector<ftype>& initValues, 
                                          bool requiresGrad) {
-  return make_shared<Tensor>(dims, std::move(initValues), requiresGrad);   
+  return make_shared<Tensor>(dims, initValues, requiresGrad);
 }
 
 shared_ptr<Tensor> TensorFunctions::makeSharedTensor(const vector<tensorDim_t>& dims, 
-                                           vector<ftype>&& initValues, 
+                                           const vector<ftype>& initValues, 
                                            Device d, 
                                            bool requiresGrad){
-  return make_shared<Tensor>(dims, std::move(initValues), d, requiresGrad);   
+  return make_shared<Tensor>(dims, initValues, d, requiresGrad);   
 }
\ No newline at end of file
diff --git a/src/backend/data_modeling/tensor_functions.h b/src/backend/data_modeling/tensor_functions.h
index e3818bf..78f68fe 100644
--- a/src/backend/data_modeling/tensor_functions.h
+++ b/src/backend/data_modeling/tensor_functions.h
@@ -40,11 +40,11 @@ namespace TensorFunctions { // class name acts as namespace for us
   std::shared_ptr<Tensor> makeSharedTensor(const std::vector<tensorDim_t>& dims, Device d, bool requiresGrad=false);
 
   std::shared_ptr<Tensor> makeSharedTensor(const std::vector<tensorDim_t>& dims, 
-                                           std::vector<ftype>&& initValues, 
+                                           const std::vector<ftype>& initValues, 
                                            bool requiresGrad=false);
 
   std::shared_ptr<Tensor> makeSharedTensor(const std::vector<tensorDim_t>& dims, 
-                                           std::vector<ftype>&& initValues, 
+                                           const std::vector<ftype>& initValues, 
                                            Device d, bool requiresGrad=false);
 
   // Tensor manipulation
diff --git a/src/backend/networks/sequential.h b/src/backend/networks/sequential.h
index a66e251..f05fafd 100644
--- a/src/backend/networks/sequential.h
+++ b/src/backend/networks/sequential.h
@@ -23,7 +23,7 @@ class SequentialNetwork {
         bool assertDims(const layers::LayerBase& layer) const noexcept;
 
         template <typename T>
-        requires (std::derived_from< std::remove_const_t<T>, layers::LayerBase >)
+        requires (std::derived_from< std::remove_cvref_t<T>, layers::LayerBase >)
         void addLayer(T&& layer) {
             if(!assertDims(layer)){
                 // TODO: show warning that the dims don't match
diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index a13adbc..6c5370d 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -5,17 +5,12 @@ include_directories(
 # remove the lib... prefix 
 set(CMAKE_SHARED_MODULE_PREFIX "")
 
-add_library(py_data_modeling MODULE
+add_library(_core MODULE
             data_modeling/py_data_modeling.cpp
+            data_modeling/py_data_modeling_util.cpp
             )
-set_target_properties(py_data_modeling PROPERTIES PREFIX "") # don't add "lib" in front
-
-#target_link_libraries(py_data_modeling ${Boost_LIBRARIES} ${PYTHON_LIBRARIES} BackendCore)
-#target_include_directories(py_data_modeling PRIVATE ${PYTHON_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS})
-
-#add_library(py_layers MODULE
-#            layers/py_layers.cpp
-#            )
-#set_target_properties(py_layers PROPERTIES PREFIX "") # don't add "lib" in front
-
-#target_link_libraries(layers INTERFACE data_modeling)
\ No newline at end of file
+            
+set_target_properties(_core PROPERTIES 
+                      PREFIX ""
+                      OUTPUT_NAME "_core"
+                      LIBRARY_OUTPUT_DIRECTORY ${PYTHON_MODULE_DIR})
\ No newline at end of file
diff --git a/src/python/data_modeling/py_data_modeling.cpp b/src/python/data_modeling/py_data_modeling.cpp
index cbd6b27..88b4bec 100644
--- a/src/python/data_modeling/py_data_modeling.cpp
+++ b/src/python/data_modeling/py_data_modeling.cpp
@@ -1,119 +1,221 @@
 /**
- * @file tensor.cpp
+ * @file py_data_modeling.cpp
  * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl)
  * @brief 
  * @version 0.1
- * @date 2026-01-11
+ * @date 2026-02-21
  * 
  * @copyright Copyright (c) 2026
  * 
  */
 
-#include "py_data_modeling.h"
+#include "data_modeling/tensor.h"
 
-#include <stdexcept>
-#include <utility>
+#include "py_data_modeling_util.h"
+#include "python_templates.h"
+#include "custom_converters.h"
 
-using namespace boost::python;
+#include "data_modeling/tensor.h"
+#include "data_modeling/tensor_functions.h"
+#include "computational_graph/graph_creation.h"
 
-ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object index) {
-  extract<int> int_extractor(index);
-        
-  // Single integer index (1D)
-  if(int_extractor.check()) {
-    auto i0 = static_cast<tensorDim_t>(int_extractor());
-    return self.getItem(i0);
-  }
-        
-  // Tuple index (2D, 3D, or 4D, or list)
-  if (PySequence_Check(index.ptr())) {
-    int len = PySequence_Length(index.ptr());
-        
-    // Dispatch to convenience functions for 1-4 args
-    if (len == 1) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      return self.getItem(i0);
-    }
-    else if (len == 2) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      auto i1 = static_cast<tensorDim_t>(extract<int>(index[1]));
-      return self.getItem(i0, i1);
-    }
-    else if (len == 3) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      auto i1 = static_cast<tensorDim_t>(extract<int>(index[1]));
-      auto i2 = static_cast<tensorDim_t>(extract<int>(index[2]));
-      return self.getItem(i0, i1, i2);
-    }
-    else if (len == 4) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      auto i1 = static_cast<tensorDim_t>(extract<int>(index[1]));
-      auto i2 = static_cast<tensorDim_t>(extract<int>(index[2]));
-      auto i3 = static_cast<tensorDim_t>(extract<int>(index[3]));
-      return self.getItem(i0, i1, i2, i3);
-    }
-    else {
-      // Arbitrary length - use vector version
-      std::vector<tensorDim_t> indices;
-      for (int i = 0; i < len; ++i) {
-        indices.push_back(static_cast<tensorDim_t>(extract<int>(index[i])));
-      }
-      return self.getItem(std::move(indices));
-    }
-  }
+#include <boost/python.hpp>
+#include <boost/python/enum.hpp>
+#include <boost/python/return_internal_reference.hpp>
+
+BOOST_PYTHON_MODULE(_core)
+{
+    using namespace boost::python;
+
+    // some macros to make code below easier to read
+    #define WRAP_TENSOR_METHOD_1(method) \
+    +[](const Tensor& self, const Tensor& other) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>(self.method(other)); \
+    }
+
+    #define WRAP_SCALAR(method, T) \
+    +[](const Tensor& self, T val) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>(self.method(val)); \
+    }
+
+    #define WRAP_SCALAR_REVERSE(op, T) \
+    +[](const Tensor& self, T val) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>(val op self); \
+    }
+
+    // unlike the macros above, these take a pointer-to-member (fPtr) plus its argument types
+    #define WRAP_FREE_MEMBER_FUNC_1(fPtr, T1, T2) \
+    +[](const Tensor& self, T1 v1, T2 v2) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>((self.*fPtr)(v1, v2)); \
+    }
+
+    #define WRAP_FREE_MEMBER_FUNC_2(fPtr, T1, T2, T3) \
+    +[](const Tensor& self, T1 v1, T2 v2, T3 v3) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>((self.*fPtr)(v1, v2, v3)); \
+    }
+
+    #define WRAP_FREE_FUNC_1(fPtr, T1) \
+    +[](T1 v1) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>((*fPtr)(v1)); \
+    }
+
+    #define WRAP_FREE_FUNC_2(fPtr, T1, T2) \
+    +[](T1 v1, T2 v2) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>((*fPtr)(v1, v2)); \
+    }
+
+    #define WRAP_FREE_FUNC_3(fPtr, T1, T2, T3) \
+    +[](T1 v1, T2 v2, T3 v3) -> std::shared_ptr<Tensor> { \
+        return std::make_shared<Tensor>((*fPtr)(v1, v2, v3)); \
+    }
+
+    #define WRAP_FREE_FUNC_4(fPtr, T) \
+    +[](const Tensor& self, T val) -> std::shared_ptr<Tensor> { \
+        return (*fPtr)(self.getSharedPtr(), val); \
+    }
+
+    #define WRAP_FREE_FUNC_5(fPtr) \
+    +[](const Tensor& self, const Tensor& other) -> std::shared_ptr<Tensor> { \
+        return (*fPtr)(self.getSharedPtr(), other.getSharedPtr()); \
+    }
+
+    #define WRAP_FREE_FUNC_6(fPtr, T) \
+    +[](const Tensor& self, T val) -> std::shared_ptr<Tensor> { \
+        return (*fPtr)(val, self.getSharedPtr()); \
+    }
+
+    #define WRAP_FREE_FUNC_7(fPtr) \
+    +[](const Tensor& self) -> std::shared_ptr<Tensor> { \
+        return (*fPtr)(self.getSharedPtr()); \
+    }
+
+    #define WRAP_FUNC_AND_CONVERT_DTYPE_1(method) \
+    +[](const Tensor& self, int v1) -> ftype { \
+        return self.method(static_cast<tensorSize_t>(v1)); \
+    }
+
+    #define WRAP_FUNC_AND_CONVERT_DTYPE_2(method) \
+    +[](const Tensor& self, int v1, int v2) -> ftype { \
+        return self.method(static_cast<tensorDim_t>(v1), static_cast<tensorDim_t>(v2)); \
+    }
+
+    #define WRAP_FUNC_AND_CONVERT_DTYPE_3(method) \
+    +[](const Tensor& self, int v1, int v2, int v3) -> ftype { \
+        return self.method(static_cast<tensorDim_t>(v1), static_cast<tensorDim_t>(v2), \
+                           static_cast<tensorDim_t>(v3)); \
+    }
+
+    #define WRAP_FUNC_AND_CONVERT_DTYPE_4(method) \
+    +[](const Tensor& self, int v1, int v2, int v3, int v4) -> ftype { \
+        return self.method(static_cast<tensorDim_t>(v1), static_cast<tensorDim_t>(v2), \
+                           static_cast<tensorDim_t>(v3), static_cast<tensorDim_t>(v4)); \
+    }
+
+    // classes
+    class_<Dimension>("Dimension", no_init)
+        .add_property("list", &Dimension::getItem)
+        .def("__str__", &Py_Util::toString<Dimension>)
+        .def("__eq__", Py_DataModeling::dimEquals1)
+        .def("__eq__", Py_DataModeling::dimEquals2)
+        .def("__ne__", Py_DataModeling::nDimEquals1)
+        .def("__ne__", Py_DataModeling::nDimEquals2)
+    ;
+
+    enum_<Device>("Device")
+        .value("CPU", Device::CPU)
+        .value("CUDA", Device::CUDA)
+    ;
+
+    // register implicit dtype conversion
+    custom_converters::PyListToVectorConverter<tensorDim_t>();
+    custom_converters::PyListToVectorConverter<ftype>();
+
+    // to convert std::shared_ptr<const Tensor> to std::shared_ptr<Tensor> in Python
+    boost::python::register_ptr_to_python< std::shared_ptr<const Tensor> >();
+
+    // we manage via shared_ptr, since we deleted copy-ctor
+    class_<Tensor, std::shared_ptr<Tensor>, boost::noncopyable>("Tensor", no_init)
+        .def(init<const std::vector<tensorDim_t>&, optional<bool> >())
+        .def(init<const std::vector<tensorDim_t>&, Device, optional<bool> >())
+        .def(init<const std::vector<tensorDim_t>&, const std::vector<ftype>&, optional<bool> >())
+        .def(init<const std::vector<tensorDim_t>&, const std::vector<ftype>&, Device, optional<bool> >())
         
-  PyErr_SetString(PyExc_TypeError, "Index must be a number of up to 4integers or a list");
-  throw_error_already_set();
-  return 0.0; // Never reached
-}
-
-void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, ftype value) {
-  extract<int> int_extractor(index);
-  if(int_extractor.check()) {
-      auto i0 = static_cast<tensorDim_t>(int_extractor());
-      self.setItem(value, i0);
-      return;
-  }
+        // static creation methods
+        .def("ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector<tensorDim_t>))
+        .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector<tensorDim_t>, Device))
+        .def("ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector<tensorDim_t>, const bool))
+        .def("ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector<tensorDim_t>, Device, const bool)).staticmethod("ones")
+
+        .def("zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector<tensorDim_t>))
+        .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector<tensorDim_t>, Device))
+        .def("zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector<tensorDim_t>, const bool))
+        .def("zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector<tensorDim_t>, Device, const bool)).staticmethod("zeros")
+
+        .def("gauss", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector<tensorDim_t>))
+        .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector<tensorDim_t>, Device))
+        .def("gauss", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector<tensorDim_t>, const bool))
+        .def("gauss", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector<tensorDim_t>, Device, const bool)).staticmethod("gauss")
+
+        // properties
+        .add_property("device", &Tensor::getDevice, &Tensor::setDevice)
+        .add_property("dims", make_function(&Tensor::getDims, return_internal_reference<>()))
+        .add_property("grads", make_function(&Tensor::getGrads))
+        .add_property("requiresGrad", &Tensor::getRequiresGrad, &Tensor::setRequiresGrad)
+
+        // operators
+        .def("__str__", &Py_Util::toString<Tensor>)
+        .def("__repr__", &Py_Util::toString<Tensor>)
+        .def("__len__", &Tensor::getSize)
+        .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor1, tensorSize_t))
+        .def("__getitem__", WRAP_FREE_FUNC_4(&Py_DataModeling::getItemAsTensor2, std::vector<tensorDim_t>))
+        .def("__setitem__", &Py_DataModeling::tensorSetItem)
+
+        // arithmetics
+        .def("__matmul__", WRAP_FREE_FUNC_5(Py_DataModeling::matmul))
+        .def("__add__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwiseadd)) // elementwise add
+        .def("__add__", WRAP_FREE_FUNC_4(Py_DataModeling::scalaradd, ftype))
+        .def("__radd__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalaradd, ftype))
+
+        .def("__mul__", WRAP_FREE_FUNC_5(Py_DataModeling::elementwisemul)) // elementwise mult
+        .def("__mul__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarmul, ftype))
+        .def("__rmul__", WRAP_FREE_FUNC_6(Py_DataModeling::rscalarmul, ftype))
         
-  // Tuple index (2D, 3D, or 4D, or list)
-  extract<tuple> tuple_extractor(index);
-  if (PySequence_Check(index.ptr())) {
-    int len = PySequence_Length(index.ptr());
+        .def("__sub__", WRAP_FREE_FUNC_4(Py_DataModeling::scalarsub, ftype))
+        .def("__truediv__", WRAP_FREE_FUNC_4(Py_DataModeling::scalardiv, ftype))
+
+        // member functions
+        .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_1(Tensor::getItem))
+        .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_2(Tensor::getItem))
+        .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_3(Tensor::getItem))
+        .def("getitem", WRAP_FUNC_AND_CONVERT_DTYPE_4(Tensor::getItem))
+        .def("getitem", Py_DataModeling::getItemVector) // the vector arg
+
+        .def("sum", WRAP_FREE_FUNC_7(&(graph::sumTensor)))
         
-    // Dispatch to convenience functions for 1-4 args
-    if (len == 1) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      self.setItem(value, i0);
-    }
-    else if (len == 2) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      auto i1 = static_cast<tensorDim_t>(extract<int>(index[1]));
-      self.setItem(value, i0, i1);
-    }
-    else if (len == 3) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      auto i1 = static_cast<tensorDim_t>(extract<int>(index[1]));
-      auto i2 = static_cast<tensorDim_t>(extract<int>(index[2]));
-      self.setItem(value, i0, i1, i2);
-    }
-    else if (len == 4) {
-      auto i0 = static_cast<tensorDim_t>(extract<int>(index[0]));
-      auto i1 = static_cast<tensorDim_t>(extract<int>(index[1]));
-      auto i2 = static_cast<tensorDim_t>(extract<int>(index[2]));
-      auto i3 = static_cast<tensorDim_t>(extract<int>(index[3]));
-      self.setItem(value, i0, i1, i2, i3);
-    }
-    else {
-      // Arbitrary length - use vector version
-      std::vector<tensorDim_t> indices;
-      for (int i = 0; i < len; ++i) {
-        indices.push_back(static_cast<tensorDim_t>(extract<int>(index[i])));
-      }
-      self.setItem(value, std::move(indices));
-    }
-    return;
-  }
+        .def("reset", Py_DataModeling::reset1)
+        .def("reset", Py_DataModeling::reset2)
+
+        .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int))
+        .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool))
+        .def("transposeThis", Py_DataModeling::transposeThis1)
+        .def("transposeThis", Py_DataModeling::transposeThis2)
         
-  PyErr_SetString(PyExc_TypeError, "Index must be a number of up to 4integers or a list");
-  throw_error_already_set();
+        .def("backward", &Tensor::backward)
+    ;
+
+    // functions
+    def("Ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector<tensorDim_t>));
+    def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector<tensorDim_t>, Device));
+    def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector<tensorDim_t>, const bool));
+    def("Ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector<tensorDim_t>, Device, const bool));
+
+    def("Zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector<tensorDim_t>));
+    def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector<tensorDim_t>, Device));
+    def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector<tensorDim_t>, const bool));
+    def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector<tensorDim_t>, Device, const bool));
+
+    def("Gaussian", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector<tensorDim_t>));
+    def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector<tensorDim_t>, Device));
+    def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector<tensorDim_t>, const bool));
+    def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector<tensorDim_t>, Device, const bool));
 }
\ No newline at end of file
diff --git a/src/python/data_modeling/py_data_modeling.h b/src/python/data_modeling/py_data_modeling.h
deleted file mode 100644
index 981dddd..0000000
--- a/src/python/data_modeling/py_data_modeling.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/**
- * @file tensor.h
- * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl)
- * @brief 
- * @version 0.1
- * @date 2026-01-11
- * 
- * @copyright Copyright (c) 2026
- * 
- */
-
-#pragma once
-
-#include "data_modeling/tensor.h"
-#include "data_modeling/tensor_functions.h"
-
-#include "python_templates.h"
-#include "custom_converters.h"
-
-#include <boost/python.hpp>
-#include <boost/python/enum.hpp>
-#include <boost/python/return_internal_reference.hpp>
-#include <boost/python/object.hpp>
-
-namespace Py_DataModeling {
-    ftype tensorGetItem(const Tensor& self, boost::python::object index);
-    void tensorSetItem(Tensor& self, boost::python::object index, ftype value);
-
-    // need wrappers for default arguments, see
-    // https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/functions.html
-    auto OnesWrapper0(std::vector<tensorDim_t> dims) { 
-        return TensorFunctions::Ones(std::move(dims)); 
-    }
-
-    auto OnesWrapper1(std::vector<tensorDim_t> dims, Device d) { 
-        return TensorFunctions::Ones(std::move(dims), d); 
-    }
-
-    auto ZerosWrapper0(std::vector<tensorDim_t> dims) { 
-        return TensorFunctions::Zeros(std::move(dims)); 
-    }
-
-    auto ZerosWrapper1(std::vector<tensorDim_t> dims, Device d) { 
-        return TensorFunctions::Zeros(std::move(dims), d); 
-    }
-
-    auto GaussianWrapper0(std::vector<tensorDim_t> dims) { 
-        return TensorFunctions::Gaussian(std::move(dims)); 
-    }
-
-    auto GaussianWrapper1(std::vector<tensorDim_t> dims, Device d) { 
-        return TensorFunctions::Gaussian(std::move(dims), d); 
-    }
-
-    Tensor    (*Ones0)(std::vector<tensorDim_t>)                         = &OnesWrapper0;
-    Tensor    (*Ones1)(std::vector<tensorDim_t>, Device)                 = &OnesWrapper1;
-    Tensor    (*Ones2)(std::vector<tensorDim_t>, const bool)             = &(TensorFunctions::Ones);
-    Tensor    (*Ones3)(std::vector<tensorDim_t>, Device, const bool)     = &(TensorFunctions::Ones);
-
-    Tensor    (*Zeros0)(std::vector<tensorDim_t>)                        = &ZerosWrapper0;
-    Tensor    (*Zeros1)(std::vector<tensorDim_t>, Device)                = &ZerosWrapper1;
-    Tensor    (*Zeros2)(std::vector<tensorDim_t>, const bool)            = &(TensorFunctions::Zeros);
-    Tensor    (*Zeros3)(std::vector<tensorDim_t>, Device, const bool)    = &(TensorFunctions::Zeros);
-
-    Tensor    (*Gaussian0)(std::vector<tensorDim_t>)                     = &GaussianWrapper0;
-    Tensor    (*Gaussian1)(std::vector<tensorDim_t>, Device)             = &GaussianWrapper1;
-    Tensor    (*Gaussian2)(std::vector<tensorDim_t>, const bool)         = &(TensorFunctions::Gaussian);
-    Tensor    (*Gaussian3)(std::vector<tensorDim_t>, Device, const bool) = &(TensorFunctions::Gaussian);
-
-    void    (Tensor::*reset1)(const ftype)                                = &Tensor::reset;
-    void    (Tensor::*reset2)(const utility::InitClass)                   = &Tensor::reset;
-
-    void    (Tensor::*transposeThis1)()                                   = &Tensor::transposeThis;
-    void    (Tensor::*transposeThis2)(int, int)                           = &Tensor::transposeThis;
-    Tensor  (Tensor::*transpose1)(int, int) const                         = &Tensor::transpose;
-    Tensor  (Tensor::*transpose2)(int, int, bool) const                   = &Tensor::transpose;
-}
-
-BOOST_PYTHON_MODULE(py_data_modeling)
-{
-    using namespace boost::python;
-
-    // some macros to make code below easier to read
-    #define WRAP_TENSOR_METHOD_1(method) \
-    +[](const Tensor& self, const Tensor& other) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>(self.method(other)); \
-    }
-
-    #define WRAP_SCALAR(method, T) \
-    +[](const Tensor& self, T val) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>(self.method(val)); \
-    }
-
-    #define WRAP_SCALAR_REVERSE(op, T) \
-    +[](const Tensor& self, T val) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>(val op self); \
-    }
-
-    // different, since those are not methods anymore
-    #define WRAP_FREE_MEMBER_FUNC_1(fPtr, T1, T2) \
-    +[](const Tensor& self, int v1, int v2) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>((self.*fPtr)(v1, v2)); \
-    }
-
-    #define WRAP_FREE_MEMBER_FUNC_2(fPtr, T1, T2, T3) \
-    +[](const Tensor& self, T1 v1, T2 v2, T3 v3) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>((self.*fPtr)(v1, v2, v3)); \
-    }
-
-    #define WRAP_FREE_FUNC_1(fPtr, T1) \
-    +[](T1 v1) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>((*fPtr)(v1)); \
-    }
-
-    #define WRAP_FREE_FUNC_2(fPtr, T1, T2) \
-    +[](T1 v1, T2 v2) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>((*fPtr)(v1, v2)); \
-    }
-
-    #define WRAP_FREE_FUNC_3(fPtr, T1, T2, T3) \
-    +[](T1 v1, T2 v2, T3 v3) -> std::shared_ptr<Tensor> { \
-        return std::make_shared<Tensor>((*fPtr)(v1, v2, v3)); \
-    }
-
-    // register implicit dtype conversion
-    converters::PyListToVectorConverter<tensorDim_t>();
-
-    // classes
-    class_<Dimension>("Dimension", no_init)
-        .add_property("list", &Dimension::getItem)
-        .def("__str__", &Py_Util::toString<Dimension>)
-    ;
-
-    enum_<Device>("Device")
-        .value("CPU", Device::CPU)
-        .value("CUDA", Device::CUDA)
-    ;
-
-    // we manage via shared_ptr, since we deleted copy-ctor
-    class_<Tensor, std::shared_ptr<Tensor>, boost::noncopyable>("Tensor", no_init)
-        .def(init<const std::vector<tensorDim_t>&, optional<bool> >())
-        .def(init<const std::vector<tensorDim_t>&, optional<Device, bool> >())
-        .add_property("device", &Tensor::getDevice, &Tensor::setDevice)
-        .add_property("dims", make_function(&Tensor::getDims, return_internal_reference<>()))
-        .add_property("grads", make_function(&Tensor::getGrads, return_internal_reference<>()))
-        .def("__str__", &Py_Util::toString<Tensor>)
-        .def("__repr__", &Py_Util::toString<Tensor>)
-        .def("__getitem__", &Py_DataModeling::tensorGetItem)
-        .def("__setitem__", &Py_DataModeling::tensorSetItem)
-        .def("__matmul__", WRAP_TENSOR_METHOD_1(matmul))
-        .def("__add__", WRAP_TENSOR_METHOD_1(operator+)) // elementwise add
-        .def("__mul__", WRAP_TENSOR_METHOD_1(operator*)) // elementwise mult
-        .def("__mul__", WRAP_SCALAR(operator*, float))
-        .def("__rmul__", WRAP_SCALAR_REVERSE(*, float))
-        .def("__add__", WRAP_SCALAR(operator+, float))
-        .def("__radd__", WRAP_SCALAR_REVERSE(+, float))
-        .def("__sub__", WRAP_SCALAR(operator-, float))
-        .def("__truediv__", WRAP_SCALAR(operator/, float))
-        .def("reset", Py_DataModeling::reset1)
-        .def("reset", Py_DataModeling::reset2)
-        .def("transpose", WRAP_FREE_MEMBER_FUNC_1(Py_DataModeling::transpose1, int, int))
-        .def("transpose", WRAP_FREE_MEMBER_FUNC_2(Py_DataModeling::transpose2, int, int, bool))
-        .def("transposeThis", Py_DataModeling::transposeThis1)
-        .def("transposeThis", Py_DataModeling::transposeThis2)
-        .def("backward", &Tensor::backward)
-    ;
-
-    // functions
-    def("Ones", WRAP_FREE_FUNC_1(Py_DataModeling::Ones0, std::vector<tensorDim_t>));
-    def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones1, std::vector<tensorDim_t>, Device));
-    def("Ones", WRAP_FREE_FUNC_2(Py_DataModeling::Ones2, std::vector<tensorDim_t>, const bool));
-    def("Ones", WRAP_FREE_FUNC_3(Py_DataModeling::Ones3, std::vector<tensorDim_t>, Device, const bool));
-
-    def("Zeros", WRAP_FREE_FUNC_1(Py_DataModeling::Zeros0, std::vector<tensorDim_t>));
-    def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros1, std::vector<tensorDim_t>, Device));
-    def("Zeros", WRAP_FREE_FUNC_2(Py_DataModeling::Zeros2, std::vector<tensorDim_t>, const bool));
-    def("Zeros", WRAP_FREE_FUNC_3(Py_DataModeling::Zeros3, std::vector<tensorDim_t>, Device, const bool));
-
-    def("Gaussian", WRAP_FREE_FUNC_1(Py_DataModeling::Gaussian0, std::vector<tensorDim_t>));
-    def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian1, std::vector<tensorDim_t>, Device));
-    def("Gaussian", WRAP_FREE_FUNC_2(Py_DataModeling::Gaussian2, std::vector<tensorDim_t>, const bool));
-    def("Gaussian", WRAP_FREE_FUNC_3(Py_DataModeling::Gaussian3, std::vector<tensorDim_t>, Device, const bool));
-}
\ No newline at end of file
diff --git a/src/python/data_modeling/py_data_modeling_util.cpp b/src/python/data_modeling/py_data_modeling_util.cpp
new file mode 100644
index 0000000..d495300
--- /dev/null
+++ b/src/python/data_modeling/py_data_modeling_util.cpp
@@ -0,0 +1,119 @@
+/**
+ * @file py_data_modeling_util.cpp
+ * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl)
+ * @brief Definitions of the tensorGetItem/tensorSetItem helpers declared in py_data_modeling_util.h
+ * @version 0.1
+ * @date 2026-02-21
+ * 
+ * @copyright Copyright (c) 2026
+ * 
+ */
+
+#include "py_data_modeling_util.h"
+
+#include <stdexcept>
+#include <utility>
+
+using namespace boost::python;
+
+/**
+ * @brief Read one tensor element addressed by a Python index object.
+ *
+ * A sequence of 1-4 entries is forwarded to the Tensor::getItem overload with
+ * that many index arguments; any other length (including an empty sequence)
+ * is forwarded to the vector overload.
+ *
+ * @param self Tensor to read from.
+ * @param index A Python int, or a sequence (e.g. tuple or list) of ints.
+ * @return ftype Whatever the selected Tensor::getItem overload returns.
+ * @throws boost::python::error_already_set if index is neither an int nor a
+ *         sequence (TypeError), or if an entry cannot be extracted as int.
+ */
+ftype Py_DataModeling::tensorGetItem(const Tensor& self, boost::python::object index) {
+  // Single integer index
+  extract<int> int_extractor(index);
+  if (int_extractor.check()) {
+    auto i0 = static_cast<tensorDim_t>(int_extractor());
+    return self.getItem(i0);
+  }
+
+  // Tuple or list index
+  if (PySequence_Check(index.ptr())) {
+    const Py_ssize_t len = PySequence_Length(index.ptr());
+    if (len < 0) {
+      throw_error_already_set(); // PySequence_Length failed and set the Python error
+    }
+
+    // Extract the entries front to back, before any overload is chosen
+    std::vector<tensorDim_t> indices;
+    indices.reserve(static_cast<std::vector<tensorDim_t>::size_type>(len));
+    for (Py_ssize_t i = 0; i < len; ++i) {
+      indices.push_back(static_cast<tensorDim_t>(extract<int>(index[i])));
+    }
+
+    // Dispatch to convenience overloads for 1-4 indices, vector overload otherwise
+    switch (indices.size()) {
+      case 1: return self.getItem(indices[0]);
+      case 2: return self.getItem(indices[0], indices[1]);
+      case 3: return self.getItem(indices[0], indices[1], indices[2]);
+      case 4: return self.getItem(indices[0], indices[1], indices[2], indices[3]);
+      default: return self.getItem(std::move(indices));
+    }
+  }
+
+  // Neither an int nor a sequence
+  PyErr_SetString(PyExc_TypeError, "Index must be a number of up to 4 integers or a list");
+  throw_error_already_set();
+  return 0.0; // Never reached
+}
+
+/**
+ * @brief Write one tensor element addressed by a Python index object.
+ *
+ * A sequence of 1-4 entries is forwarded to the Tensor::setItem overload with
+ * that many index arguments; any other length (including an empty sequence)
+ * is forwarded to the vector overload.
+ *
+ * @param self Tensor to modify.
+ * @param index A Python int, or a sequence (e.g. tuple or list) of ints.
+ * @param value Value passed as first argument to Tensor::setItem.
+ * @throws boost::python::error_already_set if index is neither an int nor a
+ *         sequence (TypeError), or if an entry cannot be extracted as int.
+ */
+void Py_DataModeling::tensorSetItem(Tensor& self, boost::python::object index, ftype value) {
+  // Single integer index
+  extract<int> int_extractor(index);
+  if (int_extractor.check()) {
+    auto i0 = static_cast<tensorDim_t>(int_extractor());
+    self.setItem(value, i0);
+    return;
+  }
+
+  // Tuple or list index
+  if (PySequence_Check(index.ptr())) {
+    const Py_ssize_t len = PySequence_Length(index.ptr());
+    if (len < 0) {
+      throw_error_already_set(); // PySequence_Length failed and set the Python error
+    }
+
+    // Extract the entries front to back, before any overload is chosen
+    std::vector<tensorDim_t> indices;
+    indices.reserve(static_cast<std::vector<tensorDim_t>::size_type>(len));
+    for (Py_ssize_t i = 0; i < len; ++i) {
+      indices.push_back(static_cast<tensorDim_t>(extract<int>(index[i])));
+    }
+
+    // Dispatch to convenience overloads for 1-4 indices, vector overload otherwise
+    switch (indices.size()) {
+      case 1: self.setItem(value, indices[0]); break;
+      case 2: self.setItem(value, indices[0], indices[1]); break;
+      case 3: self.setItem(value, indices[0], indices[1], indices[2]); break;
+      case 4: self.setItem(value, indices[0], indices[1], indices[2], indices[3]); break;
+      default: self.setItem(value, std::move(indices)); break;
+    }
+    return;
+  }
+
+  PyErr_SetString(PyExc_TypeError, "Index must be a number of up to 4 integers or a list");
+  throw_error_already_set();
+}
\ No newline at end of file
diff --git a/src/python/data_modeling/py_data_modeling_util.h b/src/python/data_modeling/py_data_modeling_util.h
new file mode 100644
index 0000000..82a8343
--- /dev/null
+++ b/src/python/data_modeling/py_data_modeling_util.h
@@ -0,0 +1,135 @@
+/**
+ * @file py_data_modeling_util.h
+ * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl)
+ * @brief Helper and wrapper functions
+ * @version 0.1
+ * @date 2026-02-21
+ * 
+ * @copyright Copyright (c) 2026
+ * 
+ */
+
+#pragma once
+
+#include "data_modeling/dim_type.h"
+
+#include "data_modeling/tensor.h"
+#include "data_modeling/tensor_functions.h"
+#include "computational_graph/graph_creation.h"
+
+#include <boost/python.hpp>
+#include <boost/python/object.hpp>
+
+#include <memory>
+
+namespace Py_DataModeling {
+
+    /*********************************************************************************************************
+    ********************************************** Dimension *************************************************
+    *********************************************************************************************************/
+    // member-function pointers that each pin one overload of the Dimension comparison operators
+    inline bool (Dimension::*dimEquals1)(const Dimension&) const                   = &Dimension::operator==;
+    inline bool (Dimension::*dimEquals2)(const std::vector<tensorDim_t>&) const    = &Dimension::operator==;
+
+    inline bool (Dimension::*nDimEquals1)(const Dimension&) const                  = &Dimension::operator!=;
+    inline bool (Dimension::*nDimEquals2)(const std::vector<tensorDim_t>&) const   = &Dimension::operator!=;
+    /*********************************************************************************************************
+    *********************************************** Tensor ***************************************************
+    *********************************************************************************************************/
+    // element access through a Python index object; only declared here, defined out of line
+    ftype tensorGetItem(const Tensor& self, boost::python::object index);
+    void tensorSetItem(Tensor& self, boost::python::object index, ftype value);
+
+    // C++ default arguments need explicit wrappers for each shorter call form, see
+    // https://beta.boost.org/doc/libs/develop/libs/python/doc/html/tutorial/tutorial/functions.html
+    inline auto OnesWrapper0(std::vector<tensorDim_t> dims) { 
+        return TensorFunctions::Ones(std::move(dims)); 
+    }
+
+    inline auto OnesWrapper1(std::vector<tensorDim_t> dims, Device d) { 
+        return TensorFunctions::Ones(std::move(dims), d); 
+    }
+
+    inline auto ZerosWrapper0(std::vector<tensorDim_t> dims) { 
+        return TensorFunctions::Zeros(std::move(dims)); 
+    }
+
+    inline auto ZerosWrapper1(std::vector<tensorDim_t> dims, Device d) { 
+        return TensorFunctions::Zeros(std::move(dims), d); 
+    }
+
+    inline auto GaussianWrapper0(std::vector<tensorDim_t> dims) { 
+        return TensorFunctions::Gaussian(std::move(dims)); 
+    }
+
+    inline auto GaussianWrapper1(std::vector<tensorDim_t> dims, Device d) { 
+        return TensorFunctions::Gaussian(std::move(dims), d); 
+    }
+    // function pointers that each pin one wrapper or one overload, so a binding can name it unambiguously
+    inline Tensor    (*Ones0)(std::vector<tensorDim_t>)                                             = &OnesWrapper0;
+    inline Tensor    (*Ones1)(std::vector<tensorDim_t>, Device)                                     = &OnesWrapper1;
+    inline Tensor    (*Ones2)(std::vector<tensorDim_t>, const bool)                                 = &(TensorFunctions::Ones);
+    inline Tensor    (*Ones3)(std::vector<tensorDim_t>, Device, const bool)                         = &(TensorFunctions::Ones);
+
+    inline Tensor    (*Zeros0)(std::vector<tensorDim_t>)                                            = &ZerosWrapper0;
+    inline Tensor    (*Zeros1)(std::vector<tensorDim_t>, Device)                                    = &ZerosWrapper1;
+    inline Tensor    (*Zeros2)(std::vector<tensorDim_t>, const bool)                                = &(TensorFunctions::Zeros);
+    inline Tensor    (*Zeros3)(std::vector<tensorDim_t>, Device, const bool)                        = &(TensorFunctions::Zeros);
+
+    inline Tensor    (*Gaussian0)(std::vector<tensorDim_t>)                                         = &GaussianWrapper0;
+    inline Tensor    (*Gaussian1)(std::vector<tensorDim_t>, Device)                                 = &GaussianWrapper1;
+    inline Tensor    (*Gaussian2)(std::vector<tensorDim_t>, const bool)                             = &(TensorFunctions::Gaussian);
+    inline Tensor    (*Gaussian3)(std::vector<tensorDim_t>, Device, const bool)                     = &(TensorFunctions::Gaussian);
+
+    inline void    (Tensor::*reset1)(const ftype)                                                   = &Tensor::reset;
+    inline void    (Tensor::*reset2)(const utility::InitClass)                                      = &Tensor::reset;
+
+    inline void    (Tensor::*transposeThis1)()                                                      = &Tensor::transposeThis;
+    inline void    (Tensor::*transposeThis2)(int, int)                                              = &Tensor::transposeThis;
+    inline Tensor  (Tensor::*transpose1)(int, int) const                                            = &Tensor::transpose;
+    inline Tensor  (Tensor::*transpose2)(int, int, bool) const                                      = &Tensor::transpose;
+
+    inline ftype   (Tensor::*getItemVector)(const std::vector<tensorDim_t>&) const                       = &Tensor::getItem;
+
+    /*********************************************************************************************************
+    ***************************************** Graph creation *************************************************
+    *********************************************************************************************************/
+
+    // multiplications via graph::mul (tensor*tensor, tensor*scalar, scalar*tensor)
+    inline std::shared_ptr<Tensor> (*elementwisemul) 
+    (const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right)           = &(graph::mul);
+
+    inline std::shared_ptr<Tensor> (*scalarmul) 
+    (const std::shared_ptr<Tensor>, ftype)                                              = &(graph::mul);
+
+    inline std::shared_ptr<Tensor> (*rscalarmul) 
+    (ftype, const std::shared_ptr<Tensor>)                                              = &(graph::mul);
+
+    // additions via graph::add (tensor+tensor, tensor+scalar, scalar+tensor)
+    inline std::shared_ptr<Tensor> (*elementwiseadd) 
+    (const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right)           = &(graph::add);
+
+    inline std::shared_ptr<Tensor> (*scalaradd) 
+    (const std::shared_ptr<Tensor>, ftype)                                              = &(graph::add);
+
+    inline std::shared_ptr<Tensor> (*rscalaradd) 
+    (ftype, const std::shared_ptr<Tensor>)                                              = &(graph::add);
+
+    // matmul
+    inline std::shared_ptr<Tensor> (*matmul) 
+    (const std::shared_ptr<Tensor> left, const std::shared_ptr<Tensor> right)           = &(graph::matmul);
+    
+    // sub, div (tensor with scalar on the right-hand side only)
+    inline std::shared_ptr<Tensor> (*scalarsub) 
+    (const std::shared_ptr<Tensor>, ftype)                                              = &(graph::sub);
+
+    inline std::shared_ptr<Tensor> (*scalardiv) 
+    (const std::shared_ptr<Tensor>, ftype)                                              = &(graph::div);
+
+    // element access via graph::get (single tensorSize_t index, or vector of tensorDim_t indices)
+    inline std::shared_ptr<Tensor> (*getItemAsTensor1) 
+    (const std::shared_ptr<Tensor>& t, tensorSize_t idx)                                = &(graph::get);
+
+    inline std::shared_ptr<Tensor> (*getItemAsTensor2) 
+    (const std::shared_ptr<Tensor>& t, const std::vector<tensorDim_t>& idx)             = &(graph::get);
+}
\ No newline at end of file
diff --git a/src/python/python_utility/custom_converters.h b/src/python/python_utility/custom_converters.h
index 8bc4f10..5114d7f 100644
--- a/src/python/python_utility/custom_converters.h
+++ b/src/python/python_utility/custom_converters.h
@@ -19,13 +19,14 @@
 #include <vector>
 #include <limits>
 
-namespace converters {
+namespace custom_converters {
   /**
    * @brief We use this class to convert Python lists of int into vectors of
    * internal types, such as tensorDim_t.
    */
   template<typename T>
-  requires ( std::is_integral_v< std::remove_const_t<T> >)
+  requires ( std::is_integral_v< T > || 
+             std::is_floating_point_v< T >)
   struct PyListToVectorConverter {
     using rvalueFromPythonData = boost::python::converter::rvalue_from_python_stage1_data;
 
@@ -40,7 +41,7 @@ namespace converters {
    * internal types, such as tensorDim_t.
    */
   template<typename T>
-  requires ( std::is_integral_v< std::remove_const_t<T> >)
+  requires ( std::is_integral_v< T >)
   struct PyIntToIntegralValueConverter {
     using rvalueFromPythonData = boost::python::converter::rvalue_from_python_stage1_data;
 
@@ -56,8 +57,9 @@ namespace converters {
 /******************************************************************************************/
 
 template<typename T>
-requires ( std::is_integral_v< std::remove_const_t<T> >)
-converters::PyListToVectorConverter<T>::PyListToVectorConverter() {
+requires ( std::is_integral_v< T > || 
+           std::is_floating_point_v< T >)
+custom_converters::PyListToVectorConverter<T>::PyListToVectorConverter() {
   using namespace boost::python;
 
   // register converter with Boost.Python's conversion system
@@ -69,8 +71,9 @@ converters::PyListToVectorConverter<T>::PyListToVectorConverter() {
 }
 
 template<typename T>
-requires ( std::is_integral_v< std::remove_const_t<T> >)
-void* converters::PyListToVectorConverter<T>::convertible(PyObject* obj_ptr) {
+requires ( std::is_integral_v< T > || 
+           std::is_floating_point_v< T >)
+void* custom_converters::PyListToVectorConverter<T>::convertible(PyObject* obj_ptr) {
   using namespace boost::python;
   
   if (!PySequence_Check(obj_ptr)) 
@@ -80,8 +83,9 @@ void* converters::PyListToVectorConverter<T>::convertible(PyObject* obj_ptr) {
 }
 
 template<typename T>
-requires ( std::is_integral_v< std::remove_const_t<T> >)
-void converters::PyListToVectorConverter<T>::construct(PyObject* obj_ptr, rvalueFromPythonData* data) {
+requires ( std::is_integral_v< T > || 
+           std::is_floating_point_v< T >)
+void custom_converters::PyListToVectorConverter<T>::construct(PyObject* obj_ptr, rvalueFromPythonData* data) {
 
   using namespace boost::python;
 
@@ -96,8 +100,15 @@ void converters::PyListToVectorConverter<T>::construct(PyObject* obj_ptr, rvalue
         
   // Fill it with converted values
   for (int i = 0; i < len(py_list); ++i) {
-    int val = extract<int>(py_list[i]);
-    vec->push_back(static_cast<T>(val));
+
+    if constexpr(std::is_integral_v< T >){
+      auto val = extract<int>(py_list[i]);
+      vec->push_back(static_cast<T>(val));
+    }
+    else if constexpr(std::is_floating_point_v< T >) {
+      auto val = extract<ftype>(py_list[i]);
+      vec->push_back(static_cast<T>(val));
+    }
   }
         
   // Tell Boost.Python where the constructed object is
@@ -105,8 +116,8 @@ void converters::PyListToVectorConverter<T>::construct(PyObject* obj_ptr, rvalue
 }
 
 template<typename T>
-requires ( std::is_integral_v< std::remove_const_t<T> >)
-converters::PyIntToIntegralValueConverter<T>::PyIntToIntegralValueConverter() {
+requires ( std::is_integral_v< T >)
+custom_converters::PyIntToIntegralValueConverter<T>::PyIntToIntegralValueConverter() {
   using namespace boost::python;
 
   // register converter with Boost.Python's conversion system
@@ -118,8 +129,8 @@ converters::PyIntToIntegralValueConverter<T>::PyIntToIntegralValueConverter() {
 }
 
 template<typename T>
-requires ( std::is_integral_v< std::remove_const_t<T> >)
-void* converters::PyIntToIntegralValueConverter<T>::convertible(PyObject* obj_ptr) {
+requires ( std::is_integral_v< T >)
+void* custom_converters::PyIntToIntegralValueConverter<T>::convertible(PyObject* obj_ptr) {
   using namespace boost::python;
 
   if (!PyLong_Check(obj_ptr)) 
@@ -129,8 +140,8 @@ void* converters::PyIntToIntegralValueConverter<T>::convertible(PyObject* obj_pt
 }
 
 template<typename T>
-requires ( std::is_integral_v< std::remove_const_t<T> >)
-void converters::PyIntToIntegralValueConverter<T>::construct(PyObject* obj_ptr, rvalueFromPythonData* data) {
+requires ( std::is_integral_v< T >)
+void custom_converters::PyIntToIntegralValueConverter<T>::construct(PyObject* obj_ptr, rvalueFromPythonData* data) {
   using namespace boost::python;
 
   // Extract Python int
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d4fdd04..3b258b9 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -19,10 +19,17 @@ target_link_libraries(unit_tests_backend PRIVATE
 include(GoogleTest)
 gtest_discover_tests(unit_tests_backend)
 
-#find_package(Python3 COMPONENTS Interpreter)
-#if(Python3_FOUND)
-#    add_test(NAME python_tests
-#        COMMAND ${Python3_EXECUTABLE} -m pytest ${CMAKE_CURRENT_SOURCE_DIR}/python
-#        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
-#    )
-#endif()
\ No newline at end of file
+find_package(Python3 COMPONENTS Interpreter)
+if(Python3_FOUND)
+    # register the pytest suite under tests/python as a single CTest test
+    add_test(
+        NAME python_tests
+        COMMAND ${Python3_EXECUTABLE} -m pytest 
+                ${CMAKE_CURRENT_SOURCE_DIR}/python
+    )
+
+    # Set environment for Python to find the module
+    set_tests_properties(python_tests PROPERTIES
+        ENVIRONMENT "PYTHONPATH=${PYTHON_MODULE_DIR}:$ENV{PYTHONPATH}"
+    )
+endif()
\ No newline at end of file
diff --git a/tests/backend/test_computational_graph.cpp b/tests/backend/test_computational_graph.cpp
index 62062a1..d2a686f 100644
--- a/tests/backend/test_computational_graph.cpp
+++ b/tests/backend/test_computational_graph.cpp
@@ -16,86 +16,119 @@
 
 #include "computational_graph/graph_creation.h"
 
+#include <stdexcept>
+
+TEST(AutogradTest, ThrowsIfNoGradientSet) {
+    auto t1 = TensorFunctions::makeSharedTensor({1}, {3.0}, false);
+    auto t2 = TensorFunctions::makeSharedTensor({1}, {2.0}, false);
+
+    auto loss = graph::add(t1, t2);
+    
+    EXPECT_THROW(loss->backward(), std::runtime_error);
+}
+
 TEST(AutogradTest, SimpleAddition) {
     auto t1 = TensorFunctions::makeSharedTensor({1}, {3.0}, true);
     auto t2 = TensorFunctions::makeSharedTensor({1}, {2.0}, true);
 
-    auto res = graph::add(t1, t2);
-    auto loss = graph::mul(res, res);
+    auto t3 = graph::add(t1, t2);
+    auto loss = graph::mul(t3, t3);
     
     loss->backward();
     
-    EXPECT_NEAR(t1->getGrads()->getItem(0), 10.0f, 1e-5);
-    EXPECT_NEAR(t2->getGrads()->getItem(0), 10.0f, 1e-5);
+    EXPECT_NEAR(t1->getGrads()->getItem(0), 10.0, 1e-5);
+    EXPECT_NEAR(t2->getGrads()->getItem(0), 10.0, 1e-5);
 }
 
 TEST(AutogradTest, ScalarMultiplication) {
     auto t1 = TensorFunctions::makeSharedTensor({1}, {2.0}, true);
     auto t2 = TensorFunctions::makeSharedTensor({1}, {3.0}, true);
 
-    auto res = graph::mul(t1, t2);
-    auto loss = graph::mul(res, res);
+    auto t3 = graph::mul(t1, t2);
+    auto loss = graph::mul(t3, t3);
     
     loss->backward();
     
-    EXPECT_NEAR(t1->getGrads()->getItem(0), 36.0f, 1e-5);
-    EXPECT_NEAR(t2->getGrads()->getItem(0), 24.0f, 1e-5);
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem(0), 36.0);
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem(0), 24.0);
 }
 
 TEST(AutogradTest, MatMul) {
     auto t1 = TensorFunctions::makeSharedTensor({2, 3}, {1, 2, 3, 4, 5, 6}, true);
     auto t2 = TensorFunctions::makeSharedTensor({3, 2}, {1, 2, 3, 4, 5, 6}, true);
     
-    auto res = graph::matmul(t1, t2);
+    auto t3 = graph::matmul(t1, t2);
 
-    auto loss = TensorFunctions::makeSharedTensor({1}, {0.0f}, true);
-    for (size_t i = 0; i < res->getSize(); ++i) {
-        loss = graph::add(loss, graph::get(res, i));
+    auto loss = TensorFunctions::makeSharedTensor({1}, {0.0}, true);
+    for (size_t i = 0; i < t3->getSize(); ++i) {
+        loss = graph::add(loss, graph::get(t3, i));
     }
     
     loss->backward();
     
     EXPECT_TRUE(t1->hasGrads());
     EXPECT_TRUE(t2->hasGrads());
+
+    // dL/dt1 = dloss/dt3 @ t2^t = Ones({2, 2}) @ t2^t
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 0}), 3.0);
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 1}), 7.0);
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({0, 2}), 11.0);
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 0}), 3.0);
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 1}), 7.0);
+    ASSERT_DOUBLE_EQ(t1->getGrads()->getItem({1, 2}), 11.0);
+
+    // dL/dt2 = t1^t @ dloss/dt3 = t1^t @ Ones({2, 2})
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({0, 0}), 5.0);
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({0, 1}), 5.0);
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({1, 0}), 7.0);
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({1, 1}), 7.0);
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({2, 0}), 9.0);
+    ASSERT_DOUBLE_EQ(t2->getGrads()->getItem({2, 1}), 9.0);
 }
 
-/* TEST(AutogradTest, ChainRule) {
-    Tensor x({1}, {2.0f}, true);
+TEST(AutogradTest, ChainRule) {
+    auto x = TensorFunctions::makeSharedTensor({1}, {2.0}, true);
     
-    Tensor y = x * x;      // y = x^2
-    Tensor z = y + x;      // z = x^2 + x
-    Tensor loss = z * z;   // loss = (x^2 + x)^2
+    auto y = graph::mul(x, x); // y = x^2
+    auto z = graph::add(x, y); // z = x^2 + x
+    auto loss = graph::mul(z, z);   // loss = (x^2 + x)^2
     
-    loss.backward();
+    loss->backward();
     
     // dloss/dx = 2(x^2 + x) * (2x + 1)
     // At x=2: 2(4 + 2) * (4 + 1) = 2 * 6 * 5 = 60
-    EXPECT_NEAR(loss.getGrads()->getItem(0), 60.0f, 1e-4);
-} */
+    ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 60.0);
+}
 
-/* TEST(AutogradTest, ReLU) {
-    Tensor x({3}, {-1.0f, 0.0f, 2.0f}, true);
+TEST(AutogradTest, MultiVariateChainRule) {
+    auto x = TensorFunctions::makeSharedTensor({2}, {1.0, 2.0}, true);
     
-    Tensor y = relu(x);    // [0, 0, 2]
-    Tensor loss = sum(y);  // loss = 2
+    auto y = graph::mul(x, 3.0); // y = [3, 6]
+    auto loss = TensorFunctions::makeSharedTensor({1}, {0.0}, true);
+    for(size_t i=0; i<y->getSize(); i++){ // size_t index, same as the MatMul test
+        loss = graph::add(loss, graph::get(y, i));
+    }    // loss = 9
     
-    loss.backward();
+    loss->backward();
     
-    // Gradient: [0, 0, 1] (only where input > 0)
-    EXPECT_NEAR(t.getGrads()->getItem(0), 0.0f, 1e-5);
-    EXPECT_NEAR(t.getGrads()->getItem(1), 0.0f, 1e-5);
-    EXPECT_NEAR(t.getGrads()->getItem(2), 1.0f, 1e-5);
+    // dloss/dx_i = d(sum(3 * x))/dx_i = 3, i.e. the scalar factor
+    ASSERT_DOUBLE_EQ(x->getGrads()->getItem(0), 3.0);
+    ASSERT_DOUBLE_EQ(x->getGrads()->getItem(1), 3.0);
+
+    ASSERT_DOUBLE_EQ(y->getGrads()->getItem(0), 1.0); // dloss/dy_i = 1 (loss is the sum of y)
+    ASSERT_DOUBLE_EQ(y->getGrads()->getItem(1), 1.0);
 }
 
-TEST(AutogradTest, ScalarMultiplication) {
-    Tensor x({2}, {1.0f, 2.0f}, true);
+/* TEST(AutogradTest, ReLU) {
+    Tensor x({3}, {-1.0, 0.0, 2.0}, true);
     
-    Tensor y = x * 3.0f;     // y = [3, 6]
-    Tensor loss = sum(y);    // loss = 9
+    Tensor y = relu(x);    // [0, 0, 2]
+    Tensor loss = sum(y);  // loss = 2
     
     loss.backward();
     
-    // dloss/dx = scalar = 3
-    EXPECT_NEAR(t.getGrads()->getItem(0), 3.0f, 1e-5);
-    EXPECT_NEAR(t.getGrads()->getItem(1), 3.0f, 1e-5);
+    // Gradient: [0, 0, 1] (only where input > 0)
+    EXPECT_NEAR(t.getGrads()->getItem(0), 0.0, 1e-5);
+    EXPECT_NEAR(t.getGrads()->getItem(1), 0.0, 1e-5);
+    EXPECT_NEAR(t.getGrads()->getItem(2), 1.0, 1e-5);
 } */
\ No newline at end of file
diff --git a/tests/backend/test_data_modeling.cpp b/tests/backend/test_data_modeling.cpp
index 4afc140..3bff63f 100644
--- a/tests/backend/test_data_modeling.cpp
+++ b/tests/backend/test_data_modeling.cpp
@@ -16,6 +16,19 @@
 
 #include <stdexcept>
 
+TEST(TensorOpsTest, TestCtor) {
+  auto t = Tensor({2, 2}, {2.0, 3.0, 4.0, 5.0}, Device::CPU, false);
+
+  ASSERT_EQ(t.getDims(), Dimension({2, 2}));
+  ASSERT_EQ(t.getDevice(), Device::CPU);
+  ASSERT_FALSE(t.getRequiresGrad());
+
+  ASSERT_DOUBLE_EQ(t.getItem(0, 0), 2.0); // flat value list is expected back in row-major order
+  ASSERT_DOUBLE_EQ(t.getItem(0, 1), 3.0);
+  ASSERT_DOUBLE_EQ(t.getItem(1, 0), 4.0);
+  ASSERT_DOUBLE_EQ(t.getItem(1, 1), 5.0);
+}
+
 TEST(TensorOpsTest, ScalarAddWorks) {
   auto t1 = TensorFunctions::Ones({2, 2}, false);
 
@@ -29,6 +42,27 @@ TEST(TensorOpsTest, ScalarAddWorks) {
   }
 }
 
+TEST(TensorOpsTest, TensorAddWorks) {
+  auto t1 = TensorFunctions::Ones({2, 2}, false);
+  auto t2 = TensorFunctions::Ones({2, 2}, false) * 4;
+
+  auto res = t1 + t2;
+
+  constexpr ftype sum = 5.0;
+  for(auto i=0; i<t1.getDims().getItem(0); i++) {
+    for(auto j=0; j<t1.getDims().getItem(1); j++) {
+      ASSERT_DOUBLE_EQ(res.getItem(i, j), sum);
+    }
+  }
+}
+
+TEST(TensorOpsTest, TensorAddThrowsOnDimMismatch) {
+  auto t1 = TensorFunctions::Ones({2, 2}, false);
+  auto t2 = TensorFunctions::Ones({2, 3}, false) * 4;
+
+  EXPECT_THROW(t1 + t2, std::invalid_argument);
+}
+
 TEST(TensorOpsTest, ScalarMulWorks) {
   auto t1 = TensorFunctions::Ones({2, 2}, false);
 
@@ -57,14 +91,6 @@ TEST(TensorOpsTest, MatrixAddGivesCorrectResults) {
   }
 }
 
-TEST(TensorOpsTest, MatrixAddThrowsOnDimensionMismatch) {
-  constexpr ftype factor = 0.5;
-  auto t1 = TensorFunctions::Ones({2, 2}, false);
-  auto t2 = TensorFunctions::Ones({2, 3}, false) * 0.5;
-    
-  EXPECT_THROW(t1 + t2, std::invalid_argument);
-}
-
 TEST(TensorOpsTest, ElementwiseMulGivesCorrectResults) {
   constexpr ftype factor = 0.5;
   auto t1 = TensorFunctions::Ones({2, 2}, false);
@@ -159,33 +185,10 @@ TEST(TensorOpsTest, MatMulBroadcastsOn1DTensor) {
 }
 
 TEST(TensorOpsTest, MatMulThrowsWhenDimensionsNotMatched) {
-  auto t1 = Tensor({2, 2}, false);
-  auto t2 = Tensor({2, 2}, false);
-
-  auto cmpRes = Tensor({2, 2}, false);
-
-  auto populateTensor = [](Tensor& t, ftype v1, ftype v2, ftype v3, ftype v4) {
-    t.setItem(v1, {0, 0});
-    t.setItem(v2, {0, 1});
-    t.setItem(v3, {1, 0});
-    t.setItem(v4, {1, 1});
-  };
-
-  populateTensor(t1, 1, 2, 3, 4);
-  populateTensor(t2, 5, 6, 7, 8);
-  populateTensor(cmpRes, 19, 22, 43, 50);
-
-  auto res = t1.matmul(t2);
-  
-  auto expectedDims = std::vector<tensorDim_t>{2, 2};
-  ASSERT_EQ(res.getDims().toVector(), expectedDims);
+  auto t1 = TensorFunctions::Ones({2, 2}, false);
+  auto t2 = TensorFunctions::Ones({3, 2}, false);
 
-  constexpr ftype resSum = 3.0;
-  for(auto i=0; i<t1.getDims().getItem(0); i++) {
-    for(auto j=0; j<t1.getDims().getItem(1); j++) {
-      ASSERT_DOUBLE_EQ(res.getItem(i, j), cmpRes.getItem(i, j));
-    }
-  }
+  EXPECT_THROW(t1.matmul(t2), std::runtime_error);
 }
 
 TEST(TensorOpsTest, TransposeWorksAsIntended1) {
diff --git a/examples/CMakeLists.txt b/tests/python/__init__.py
similarity index 100%
rename from examples/CMakeLists.txt
rename to tests/python/__init__.py
diff --git a/tests/python/test_autograd.py b/tests/python/test_autograd.py
new file mode 100644
index 0000000..aa4b975
--- /dev/null
+++ b/tests/python/test_autograd.py
@@ -0,0 +1,111 @@
+"""
+Robert Baumgartner, r.baumgartner-1@tudelft.nl
+"""
+
+import sys
+from pathlib import Path
+
+# put <repo root>/python_lib first on sys.path (presumably where dl_lib lives) before importing it
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / "python_lib"))
+
+from dl_lib import Tensor
+import pytest
+
+class TestAutograd:
+    def test_backward(self):
+        t = Tensor([2, 2], True)
+
+        loss = (t * 2).sum()
+        loss.backward()
+        
+        assert t.grads is not None
+
+    def test_nograd_throws(self):
+        t1 = Tensor([1], [3.0], False)
+        t2 = Tensor([1], [3.0], False)
+
+        t3 = t1*t2
+
+        assert not t3.requiresGrad
+        with pytest.raises(RuntimeError):
+            t3.backward()
+
+    def test_add(self):
+        t1 = Tensor([1], [3.0], True)
+        t2 = Tensor([1], [2.0], True)
+        
+        t3 = t1+t2
+        loss = t3*t3
+
+        loss.backward()
+
+        assert t1.grads.getitem(0) == pytest.approx(10.0)
+        assert t2.grads.getitem(0) == pytest.approx(10.0)
+
+    def test_scalar_mul(self):
+        t1 = Tensor([1], [2.0], True)
+        t2 = Tensor([1], [3.0], True)
+
+        t3 = t1*t2
+        loss = t3*t3
+
+        loss.backward()
+
+        assert t1.grads.getitem(0) == pytest.approx(36.0)
+        assert t2.grads.getitem(0) == pytest.approx(24.0)
+
+    def test_matmul(self):
+        t1 = Tensor([2, 3], [1, 2, 3, 4, 5, 6], True)
+        t2 = Tensor([3, 2], [1, 2, 3, 4, 5, 6], True)
+
+        t3 = t1@t2
+        loss = t3.sum()
+
+        loss.backward()
+
+        # dL/dt1 = dloss/dt3 @ t2^t = Ones({2, 2}) @ t2^t
+        assert t1.grads.getitem([0, 0]) == pytest.approx(3.0)
+        assert t1.grads.getitem([0, 1]) == pytest.approx(7.0)
+        assert t1.grads.getitem([0, 2]) == pytest.approx(11.0)
+        assert t1.grads.getitem([1, 0]) == pytest.approx(3.0)
+        assert t1.grads.getitem([1, 1]) == pytest.approx(7.0)
+        assert t1.grads.getitem([1, 2]) == pytest.approx(11.0)
+
+        # dL/dt2 = t1^t @ dloss/dt3 = t1^t @ Ones({2, 2})
+        assert t2.grads.getitem([0, 0]) == pytest.approx(5.0)
+        assert t2.grads.getitem([0, 1]) == pytest.approx(5.0)
+        assert t2.grads.getitem([1, 0]) == pytest.approx(7.0)
+        assert t2.grads.getitem([1, 1]) == pytest.approx(7.0)
+        assert t2.grads.getitem([2, 0]) == pytest.approx(9.0)
+        assert t2.grads.getitem([2, 1]) == pytest.approx(9.0)
+
+    def test_chainrule(self):
+        x = Tensor([1], [2.0], True)
+
+        y = x * x
+        z = x + y
+        loss = z * z
+
+        loss.backward()
+
+        # dloss/dx = 2(x^2 + x) * (2x + 1)
+        # At x=2: 2(4 + 2) * (4 + 1) = 2 * 6 * 5 = 60
+        assert x.grads.getitem(0) == pytest.approx(60.0)
+
+    def test_multivariate_chainrule(self):
+        x = Tensor([2], [1.0, 2.0], True)
+        y = x * 3
+        
+        loss = Tensor([1], [0.0], True)
+        for i in range(len(y)):
+            loss = loss + y[i]
+        loss.backward()
+
+        assert x.grads.getitem(0) == pytest.approx(3.0)
+        assert x.grads.getitem(1) == pytest.approx(3.0)
+
+        assert y.grads.getitem(0) == pytest.approx(1.0)
+        assert y.grads.getitem(1) == pytest.approx(1.0)
+
+if __name__ == '__main__':
+    raise RuntimeError("Not a standalone script")
\ No newline at end of file
diff --git a/tests/python/test_tensorops.py b/tests/python/test_tensorops.py
new file mode 100644
index 0000000..3bcce8b
--- /dev/null
+++ b/tests/python/test_tensorops.py
@@ -0,0 +1,38 @@
+"""
+Robert Baumgartner, r.baumgartner-1@tudelft.nl
+"""
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent / "python_lib"))
+
+from dl_lib import Tensor, Device
+
+class TestTensorOps():
+    def test_ones(self):
+        t = Tensor.ones([2, 2])
+        assert t.dims == [2, 2]
+
+    def test_ctor(self):
+        t = Tensor([2], [1.0, 2.0], Device.CPU, False)
+
+        assert t.getitem(0) == 1.0
+        assert t.getitem(1) == 2.0
+
+        assert t.device == Device.CPU
+        assert not t.requiresGrad  # last ctor argument above was False
+
+    def test_multiplication(self):
+        a = Tensor.ones([2, 2]) * 3
+        b = Tensor.ones([2, 2]) * 0.5
+        c = a * b
+
+        assert c.dims == [2, 2]
+        assert c.getitem([0, 0]) == 1.5
+        assert c.getitem([0, 1]) == 1.5
+        assert c.getitem([1, 0]) == 1.5
+        assert c.getitem([1, 1]) == 1.5
+
+if __name__ == '__main__':
+    raise RuntimeError("Not a standalone script")
\ No newline at end of file