From 056b7e0226adbc098714686f511f570705c74da2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 22:17:39 +0100
Subject: [PATCH 1/8] Fix expected outputs in docstring examples

---
 src/torchjd/autojac/_jac.py         | 4 ++--
 src/torchjd/autojac/_jac_to_grad.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/torchjd/autojac/_jac.py b/src/torchjd/autojac/_jac.py
index 911d5b9a..51f25474 100644
--- a/src/torchjd/autojac/_jac.py
+++ b/src/torchjd/autojac/_jac.py
@@ -68,8 +68,8 @@ def jac(
             >>> jacobians = jac([y1, y2], param)
             >>>
             >>> jacobians
-            (tensor([[-1., 1.],
-                    [ 2., 4.]]),)
+            (tensor([[-1.,  1.],
+                    [ 2.,  4.]]),)
 
     .. admonition::
         Example
diff --git a/src/torchjd/autojac/_jac_to_grad.py b/src/torchjd/autojac/_jac_to_grad.py
index 5947a998..03ab3b60 100644
--- a/src/torchjd/autojac/_jac_to_grad.py
+++ b/src/torchjd/autojac/_jac_to_grad.py
@@ -105,7 +105,7 @@ def jac_to_grad(
             >>> param.grad
             tensor([0.5000, 2.5000])
             >>> weights
-            tensor([0.5,  0.5])
+            tensor([0.5000, 0.5000])
 
         The ``.grad`` field of ``param`` now contains the aggregation (by UPGrad) of the Jacobian of
         :math:`\begin{bmatrix}y_1 \\ y_2\end{bmatrix}` with respect to ``param``. In this case, the

From 9bee3300ba57981a0bd87f5cdc588fd179b1df0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 22:57:01 +0100
Subject: [PATCH 2/8] Add sphinx.ext.doctest extension

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index ecad14af..474d9e99 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -40,6 +40,7 @@
     "sphinx.ext.intersphinx",
     "myst_parser",  # Enables markdown support
     "sphinx_design",  # Enables side to side cards
+    "sphinx.ext.doctest",
 ]
 
 # -- Options for HTML output -------------------------------------------------

From f95b9b8bb236f94c6e7cd1312975d50e1fc0c633 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 22:57:32 +0100
Subject: [PATCH 3/8] Add emphasize-lines to testcode

---
 docs/source/conf.py | 36 ++++++++++++++++++++++++++++++++++++
 pyproject.toml      |  1 +
 2 files changed, 37 insertions(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 474d9e99..5fd11c00 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -6,8 +6,14 @@
 import inspect
 import os
 import sys
+from typing import ClassVar
 
 import tomli
+from docutils.parsers.rst import directives
+from sphinx.application import Sphinx
+from sphinx.directives.code import parse_line_num_spec
+from sphinx.ext.doctest import TestcodeDirective
+from sphinx.util.typing import OptionSpec
 
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
@@ -135,3 +141,33 @@ def _get_version_str() -> str:
     except KeyError:
         version_str = "main"
     return version_str
+
+
+class _TestcodeWithEmphasisDirective(TestcodeDirective):
+    """
+    Extension of ``.. testcode::`` that additionally supports ``:emphasize-lines:``.
+
+    Sphinx's built-in ``.. testcode::`` directive does not support ``:emphasize-lines:``. This
+    subclass adds that option and forwards it as ``highlight_args['hl_lines']`` on the resulting
+    node, which is the same mechanism used by ``.. code-block::``.
+    """
+
+    option_spec: ClassVar[OptionSpec] = {
+        **TestcodeDirective.option_spec,  # keep every option the stock directive accepts
+        "emphasize-lines": directives.unchanged_required,  # raw spec text; a value is mandatory
+    }
+
+    def run(self) -> list:  # returns the parent's node list, annotated in place
+        result = super().run()
+        linespec = self.options.get("emphasize-lines")
+        if linespec and result:  # option was given and the parent produced a node
+            node = result[0]
+            nlines = len(self.content)  # number of lines in the directive body
+            hl_lines = parse_line_num_spec(linespec, nlines)  # NOTE: a bad spec raises ValueError
+            hl_lines = [x + 1 for x in hl_lines if x < nlines]  # to 1-based; skip out-of-range
+            node["highlight_args"] = {"hl_lines": hl_lines}
+        return result
+
+
+def setup(app: Sphinx) -> None:  # conf.py hook: swaps the stock testcode for the subclass
+    app.add_directive("testcode", _TestcodeWithEmphasisDirective, override=True)
diff --git a/pyproject.toml b/pyproject.toml
index 4332dc06..1172decd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,6 +78,7 @@ doc = [
     "sphinx-autodoc-typehints>=3.5.0",  # Bugged Union on Python 3.14 before 3.5.0
     "myst-parser>=3.0.1",  # Never tested lower versions
     "sphinx-design>=0.6.0",  # Never tested lower versions
+    "docutils>=0.22.4",  # Never tested lower versions
 ]
 
 test = [

From 6113f071fdeb499026e9e51b31c225c2d72b800e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 23:15:57 +0100
Subject: [PATCH 4/8] Remove pytest doc tests

---
 tests/doc/__init__.py         |   0
 tests/doc/test_aggregation.py |  35 ---
 tests/doc/test_autogram.py    |  33 ---
 tests/doc/test_backward.py    |  39 ----
 tests/doc/test_jac.py         |  62 -----
 tests/doc/test_jac_to_grad.py |  24 --
 tests/doc/test_rst.py         | 427 ----------------------------------
 7 files changed, 620 deletions(-)
 delete mode 100644 tests/doc/__init__.py
 delete mode 100644 tests/doc/test_aggregation.py
 delete mode 100644 tests/doc/test_autogram.py
 delete mode 100644 tests/doc/test_backward.py
 delete mode 100644 tests/doc/test_jac.py
 delete mode 100644 tests/doc/test_jac_to_grad.py
 delete mode 100644 tests/doc/test_rst.py

diff --git a/tests/doc/__init__.py b/tests/doc/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/doc/test_aggregation.py b/tests/doc/test_aggregation.py
deleted file mode 100644
index a4219e4e..00000000
--- a/tests/doc/test_aggregation.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""This file contains the test corresponding to the usage example of Aggregator and Weighting."""
-
-import torch
-from torch.testing import assert_close
-
-
-def test_aggregation_and_weighting() -> None:
-    from torch import tensor
-
-    from torchjd.aggregation import UPGrad, UPGradWeighting
-
-    aggregator = UPGrad()
-    jacobian = tensor([[-4.0, 1.0, 1.0], [6.0, 1.0, 1.0]])
-    aggregation = aggregator(jacobian)
-
-    assert_close(aggregation, tensor([0.2929, 1.9004, 1.9004]), rtol=0, atol=1e-4)
-
-    weighting = UPGradWeighting()
-    gramian = jacobian @ jacobian.T
-    weights = weighting(gramian)
-
-    assert_close(weights, tensor([1.1109, 0.7894]), rtol=0, atol=1e-4)
-
-
-def test_generalized_weighting() -> None:
-    from torch import ones
-
-    from torchjd.aggregation import Flattening, UPGradWeighting
-
-    weighting = Flattening(UPGradWeighting())
-    # Generate a generalized Gramian filled with ones, for the sake of the example
-    generalized_gramian = ones((2, 3, 3, 2))
-    weights = weighting(generalized_gramian)
-
-    assert_close(weights, torch.full((2, 3), 0.1667), rtol=0, atol=1e-4)
diff --git a/tests/doc/test_autogram.py b/tests/doc/test_autogram.py
deleted file mode 100644
index a0861e24..00000000
--- a/tests/doc/test_autogram.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""This file contains tests for the usage examples related to autogram."""
-
-
-def test_engine() -> None:
-    import torch
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.optim import SGD
-
-    from torchjd.aggregation import UPGradWeighting
-    from torchjd.autogram import Engine
-
-    # Generate data (8 batches of 16 examples of dim 5) for the sake of the example
-    inputs = torch.randn(8, 16, 5)
-    targets = torch.randn(8, 16)
-
-    model = Sequential(Linear(5, 4), ReLU(), Linear(4, 1))
-    optimizer = SGD(model.parameters())
-
-    criterion = MSELoss(reduction="none")  # Important to use reduction="none"
-    weighting = UPGradWeighting()
-
-    # Create the engine before the backward pass, and only once.
-    engine = Engine(model, batch_dim=0)
-
-    for input, target in zip(inputs, targets, strict=True):
-        output = model(input).squeeze(dim=1)  # shape: [16]
-        losses = criterion(output, target)  # shape: [16]
-
-        gramian = engine.compute_gramian(losses)  # shape: [16, 16]
-        weights = weighting(gramian)  # shape: [16]
-        losses.backward(weights)
-        optimizer.step()
-        optimizer.zero_grad()
diff --git a/tests/doc/test_backward.py b/tests/doc/test_backward.py
deleted file mode 100644
index d08d2c2f..00000000
--- a/tests/doc/test_backward.py
+++ /dev/null
@@ -1,39 +0,0 @@
-"""
-This file contains the test of the backward usage example, with a verification of the value of the
-obtained `.jac` field.
-"""
-
-from utils.asserts import assert_jac_close
-
-
-def test_backward() -> None:
-    import torch
-
-    from torchjd.autojac import backward
-
-    param = torch.tensor([1.0, 2.0], requires_grad=True)
-    # Compute arbitrary quantities that are function of param
-    y1 = torch.tensor([-1.0, 1.0]) @ param
-    y2 = (param**2).sum()
-
-    backward([y1, y2])
-
-    assert_jac_close(param, torch.tensor([[-1.0, 1.0], [2.0, 4.0]]), rtol=0.0, atol=1e-04)
-
-
-def test_backward2() -> None:
-    import torch
-
-    from torchjd.autojac import backward
-
-    param = torch.tensor([1.0, 2.0], requires_grad=True)
-    # Compute arbitrary quantities that are function of param
-    y1 = torch.tensor([-1.0, 1.0]) @ param
-    y2 = (param**2).sum()
-
-    J1 = torch.tensor([1.0, 0.0])
-    J2 = torch.tensor([0.0, 1.0])
-
-    backward([y1, y2], jac_tensors=[J1, J2])
-
-    assert_jac_close(param, torch.tensor([[-1.0, 1.0], [2.0, 4.0]]), rtol=0.0, atol=1e-04)
diff --git a/tests/doc/test_jac.py b/tests/doc/test_jac.py
deleted file mode 100644
index 1a0b79a2..00000000
--- a/tests/doc/test_jac.py
+++ /dev/null
@@ -1,62 +0,0 @@
-"""
-This file contains the test of the jac usage example, with a verification of the value of the obtained jacobians tuple.
-"""
-
-from torch.testing import assert_close
-
-
-def test_jac() -> None:
-    import torch
-
-    from torchjd.autojac import jac
-
-    param = torch.tensor([1.0, 2.0], requires_grad=True)
-    # Compute arbitrary quantities that are function of param
-    y1 = torch.tensor([-1.0, 1.0]) @ param
-    y2 = (param**2).sum()
-    jacobians = jac([y1, y2], param)
-
-    assert len(jacobians) == 1
-    assert_close(jacobians[0], torch.tensor([[-1.0, 1.0], [2.0, 4.0]]), rtol=0.0, atol=1e-04)
-
-
-def test_jac_2() -> None:
-    import torch
-
-    from torchjd.autojac import jac
-
-    weight = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad=True)  # shape: [2, 2]
-    bias = torch.tensor([0.5, -0.5], requires_grad=True)  # shape: [2]
-    # Compute arbitrary quantities that are function of weight and bias
-    input_vec = torch.tensor([1.0, -1.0])
-    y1 = weight @ input_vec + bias  # shape: [2]
-    y2 = (weight**2).sum() + (bias**2).sum()  # shape: [] (scalar)
-    jacobians = jac([y1, y2], [weight, bias])  # shapes: [3, 2, 2], [3, 2]
-    jacobian_matrices = tuple(J.flatten(1) for J in jacobians)  # shapes: [3, 4], [3, 2]
-    combined_jacobian_matrix = torch.concat(jacobian_matrices, dim=1)  # shape: [3, 6]
-    gramian = combined_jacobian_matrix @ combined_jacobian_matrix.T  # shape: [3, 3]
-
-    assert_close(
-        gramian,
-        torch.tensor([[3.0, 0.0, -1.0], [0.0, 3.0, -3.0], [-1.0, -3.0, 122.0]]),
-        rtol=0.0,
-        atol=1e-04,
-    )
-
-
-def test_jac_3() -> None:
-    import torch
-
-    from torchjd.autojac import jac
-
-    x = torch.tensor([1.0, 2.0], requires_grad=True)
-    # Compose functions: x -> h -> y
-    h = x**2
-    y1 = h.sum()
-    y2 = torch.tensor([1.0, -1.0]) @ h
-    # Step 1: Compute d[y1,y2]/dh
-    jac_h = jac([y1, y2], [h])[0]  # Shape: [2, 2]
-    # Step 2: Use jac_outputs to compute d[y1,y2]/dx = (d[y1,y2]/dh) @ (dh/dx)
-    jac_x = jac(h, x, jac_outputs=jac_h)[0]
-
-    assert_close(jac_x, torch.tensor([[2.0, 4.0], [2.0, -4.0]]), rtol=0.0, atol=1e-04)
diff --git a/tests/doc/test_jac_to_grad.py b/tests/doc/test_jac_to_grad.py
deleted file mode 100644
index 21f06830..00000000
--- a/tests/doc/test_jac_to_grad.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""
-This file contains the test of the jac_to_grad usage example, with a verification of the value of
-the obtained `.grad` field.
-"""
-
-from torch.testing import assert_close
-from utils.asserts import assert_grad_close
-
-
-def test_jac_to_grad() -> None:
-    import torch
-
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import backward, jac_to_grad
-
-    param = torch.tensor([1.0, 2.0], requires_grad=True)
-    # Compute arbitrary quantities that are function of param
-    y1 = torch.tensor([-1.0, 1.0]) @ param
-    y2 = (param**2).sum()
-    backward([y1, y2])  # param now has a .jac field
-    weights = jac_to_grad([param], UPGrad())  # param now has a .grad field
-
-    assert_grad_close(param, torch.tensor([0.5000, 2.5000]), rtol=0.0, atol=1e-04)
-    assert_close(weights, torch.tensor([0.5, 0.5]), rtol=0.0, atol=0.0)
diff --git a/tests/doc/test_rst.py b/tests/doc/test_rst.py
deleted file mode 100644
index 90dc099a..00000000
--- a/tests/doc/test_rst.py
+++ /dev/null
@@ -1,427 +0,0 @@
-"""
-This file contains the tests corresponding to the extra usage examples contained in the `.rst` files
-of the documentation. When there are multiple examples within a single `.rst` file, we use nested
-functions here to test them.
-"""
-
-from typing import no_type_check
-
-from pytest import mark
-
-
-def test_amp() -> None:
-    import torch
-    from torch.amp import GradScaler
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.optim import SGD
-
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import jac_to_grad, mtl_backward
-
-    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
-    task1_module = Linear(3, 1)
-    task2_module = Linear(3, 1)
-    params = [
-        *shared_module.parameters(),
-        *task1_module.parameters(),
-        *task2_module.parameters(),
-    ]
-    scaler = GradScaler(device="cpu")
-    loss_fn = MSELoss()
-    optimizer = SGD(params, lr=0.1)
-    aggregator = UPGrad()
-
-    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
-    task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
-    task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task
-
-    for input, target1, target2 in zip(inputs, task1_targets, task2_targets, strict=False):
-        with torch.autocast(device_type="cpu", dtype=torch.float16):
-            features = shared_module(input)
-            output1 = task1_module(features)
-            output2 = task2_module(features)
-            loss1 = loss_fn(output1, target1)
-            loss2 = loss_fn(output2, target2)
-
-        scaled_losses = scaler.scale([loss1, loss2])
-        mtl_backward(scaled_losses, features=features)
-        jac_to_grad(shared_module.parameters(), aggregator)
-        scaler.step(optimizer)
-        scaler.update()
-        optimizer.zero_grad()
-
-
-def test_basic_usage() -> None:
-    import torch
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.optim import SGD
-
-    from torchjd import autojac
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import jac_to_grad
-
-    model = Sequential(Linear(10, 5), ReLU(), Linear(5, 2))
-    optimizer = SGD(model.parameters(), lr=0.1)
-
-    aggregator = UPGrad()
-    input = torch.randn(16, 10)  # Batch of 16 random input vectors of length 10
-    target1 = torch.randn(16)  # First batch of 16 targets
-    target2 = torch.randn(16)  # Second batch of 16 targets
-
-    loss_fn = MSELoss()
-    output = model(input)
-    loss1 = loss_fn(output[:, 0], target1)
-    loss2 = loss_fn(output[:, 1], target2)
-
-    autojac.backward([loss1, loss2])
-    jac_to_grad(model.parameters(), aggregator)
-    optimizer.step()
-    optimizer.zero_grad()
-
-
-def test_iwmtl() -> None:
-    import torch
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.optim import SGD
-
-    from torchjd.aggregation import Flattening, UPGradWeighting
-    from torchjd.autogram import Engine
-
-    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
-    task1_module = Linear(3, 1)
-    task2_module = Linear(3, 1)
-    params = [
-        *shared_module.parameters(),
-        *task1_module.parameters(),
-        *task2_module.parameters(),
-    ]
-
-    optimizer = SGD(params, lr=0.1)
-    mse = MSELoss(reduction="none")
-    weighting = Flattening(UPGradWeighting())
-    engine = Engine(shared_module, batch_dim=0)
-
-    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
-    task1_targets = torch.randn(8, 16)  # 8 batches of 16 targets for the first task
-    task2_targets = torch.randn(8, 16)  # 8 batches of 16 targets for the second task
-
-    for input, target1, target2 in zip(inputs, task1_targets, task2_targets, strict=False):
-        features = shared_module(input)  # shape: [16, 3]
-        out1 = task1_module(features).squeeze(1)  # shape: [16]
-        out2 = task2_module(features).squeeze(1)  # shape: [16]
-
-        # Compute the matrix of losses: one loss per element of the batch and per task
-        losses = torch.stack([mse(out1, target1), mse(out2, target2)], dim=1)  # shape: [16, 2]
-
-        # Compute the gramian (inner products between pairs of gradients of the losses)
-        gramian = engine.compute_gramian(losses)  # shape: [16, 2, 2, 16]
-
-        # Obtain the weights that lead to no conflict between reweighted gradients
-        weights = weighting(gramian)  # shape: [16, 2]
-
-        # Do the standard backward pass, but weighted using the obtained weights
-        losses.backward(weights)
-        optimizer.step()
-        optimizer.zero_grad()
-
-
-def test_iwrm() -> None:
-    def test_autograd() -> None:
-        import torch
-        from torch.nn import Linear, MSELoss, ReLU, Sequential
-        from torch.optim import SGD
-
-        X = torch.randn(8, 16, 10)
-        Y = torch.randn(8, 16)
-
-        model = Sequential(Linear(10, 5), ReLU(), Linear(5, 1))
-        loss_fn = MSELoss()
-
-        params = model.parameters()
-        optimizer = SGD(params, lr=0.1)
-
-        for x, y in zip(X, Y, strict=False):
-            y_hat = model(x).squeeze(dim=1)  # shape: [16]
-            loss = loss_fn(y_hat, y)  # shape: [] (scalar)
-            loss.backward()
-            optimizer.step()
-            optimizer.zero_grad()
-
-    def test_autojac() -> None:
-        import torch
-        from torch.nn import Linear, MSELoss, ReLU, Sequential
-        from torch.optim import SGD
-
-        from torchjd.aggregation import UPGrad
-        from torchjd.autojac import backward, jac_to_grad
-
-        X = torch.randn(8, 16, 10)
-        Y = torch.randn(8, 16)
-
-        model = Sequential(Linear(10, 5), ReLU(), Linear(5, 1))
-        loss_fn = MSELoss(reduction="none")
-
-        params = model.parameters()
-        optimizer = SGD(params, lr=0.1)
-        aggregator = UPGrad()
-
-        for x, y in zip(X, Y, strict=False):
-            y_hat = model(x).squeeze(dim=1)  # shape: [16]
-            losses = loss_fn(y_hat, y)  # shape: [16]
-            backward(losses)
-            jac_to_grad(model.parameters(), aggregator)
-            optimizer.step()
-            optimizer.zero_grad()
-
-    def test_autogram() -> None:
-        import torch
-        from torch.nn import Linear, MSELoss, ReLU, Sequential
-        from torch.optim import SGD
-
-        from torchjd.aggregation import UPGradWeighting
-        from torchjd.autogram import Engine
-
-        X = torch.randn(8, 16, 10)
-        Y = torch.randn(8, 16)
-
-        model = Sequential(Linear(10, 5), ReLU(), Linear(5, 1))
-        loss_fn = MSELoss(reduction="none")
-
-        params = model.parameters()
-        optimizer = SGD(params, lr=0.1)
-        weighting = UPGradWeighting()
-        engine = Engine(model, batch_dim=0)
-
-        for x, y in zip(X, Y, strict=False):
-            y_hat = model(x).squeeze(dim=1)  # shape: [16]
-            losses = loss_fn(y_hat, y)  # shape: [16]
-            gramian = engine.compute_gramian(losses)  # shape: [16, 16]
-            weights = weighting(gramian)  # shape: [16]
-            losses.backward(weights)
-            optimizer.step()
-            optimizer.zero_grad()
-
-    test_autograd()
-    test_autojac()
-    test_autogram()
-
-
-@mark.filterwarnings(
-    "ignore::DeprecationWarning",
-    "ignore::FutureWarning",
-    "ignore::lightning.fabric.utilities.warnings.PossibleUserWarning",
-)
-@no_type_check  # Typing is annoying with Lightning, which would make the example too hard to read.
-def test_lightning_integration() -> None:
-    # Extra ----------------------------------------------------------------------------------------
-    import logging
-
-    logging.disable(logging.INFO)
-    # ----------------------------------------------------------------------------------------------
-
-    import torch
-    from lightning import LightningModule, Trainer
-    from lightning.pytorch.utilities.types import OptimizerLRScheduler
-    from torch.nn import Linear, ReLU, Sequential
-    from torch.nn.functional import mse_loss
-    from torch.optim import Adam
-    from torch.utils.data import DataLoader, TensorDataset
-
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import jac_to_grad, mtl_backward
-
-    class Model(LightningModule):
-        def __init__(self) -> None:
-            super().__init__()
-            self.feature_extractor = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
-            self.task1_head = Linear(3, 1)
-            self.task2_head = Linear(3, 1)
-            self.automatic_optimization = False
-
-        def training_step(self, batch, batch_idx) -> None:  # noqa: ANN001
-            input, target1, target2 = batch
-
-            features = self.feature_extractor(input)
-            output1 = self.task1_head(features)
-            output2 = self.task2_head(features)
-
-            loss1 = mse_loss(output1, target1)
-            loss2 = mse_loss(output2, target2)
-
-            opt = self.optimizers()
-
-            mtl_backward([loss1, loss2], features=features)
-            jac_to_grad(self.feature_extractor.parameters(), UPGrad())
-            opt.step()
-            opt.zero_grad()
-
-        def configure_optimizers(self) -> OptimizerLRScheduler:
-            optimizer = Adam(self.parameters(), lr=1e-3)
-            return optimizer
-
-    model = Model()
-
-    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
-    task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
-    task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task
-
-    dataset = TensorDataset(inputs, task1_targets, task2_targets)
-    train_loader = DataLoader(dataset)
-    trainer = Trainer(
-        accelerator="cpu",
-        max_epochs=1,
-        enable_checkpointing=False,
-        logger=False,
-        enable_progress_bar=False,
-    )
-
-    trainer.fit(model=model, train_dataloaders=train_loader)
-
-
-def test_monitoring() -> None:
-    import torch
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.nn.functional import cosine_similarity
-    from torch.optim import SGD
-
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import jac_to_grad, mtl_backward
-
-    def print_weights(_, __, weights: torch.Tensor) -> None:
-        """Prints the extracted weights."""
-        print(f"Weights: {weights}")
-
-    def print_gd_similarity(_, inputs: tuple[torch.Tensor, ...], aggregation: torch.Tensor) -> None:
-        """Prints the cosine similarity between the aggregation and the average gradient."""
-        matrix = inputs[0]
-        gd_output = matrix.mean(dim=0)
-        similarity = cosine_similarity(aggregation, gd_output, dim=0)
-        print(f"Cosine similarity: {similarity.item():.4f}")
-
-    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
-    task1_module = Linear(3, 1)
-    task2_module = Linear(3, 1)
-    params = [
-        *shared_module.parameters(),
-        *task1_module.parameters(),
-        *task2_module.parameters(),
-    ]
-
-    loss_fn = MSELoss()
-    optimizer = SGD(params, lr=0.1)
-    aggregator = UPGrad()
-
-    aggregator.gramian_weighting.register_forward_hook(print_weights)
-    aggregator.register_forward_hook(print_gd_similarity)
-
-    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
-    task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
-    task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task
-
-    for input, target1, target2 in zip(inputs, task1_targets, task2_targets, strict=False):
-        features = shared_module(input)
-        output1 = task1_module(features)
-        output2 = task2_module(features)
-        loss1 = loss_fn(output1, target1)
-        loss2 = loss_fn(output2, target2)
-
-        mtl_backward([loss1, loss2], features=features)
-        jac_to_grad(shared_module.parameters(), aggregator)
-        optimizer.step()
-        optimizer.zero_grad()
-
-
-def test_mtl() -> None:
-    import torch
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.optim import SGD
-
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import jac_to_grad, mtl_backward
-
-    shared_module = Sequential(Linear(10, 5), ReLU(), Linear(5, 3), ReLU())
-    task1_module = Linear(3, 1)
-    task2_module = Linear(3, 1)
-    params = [
-        *shared_module.parameters(),
-        *task1_module.parameters(),
-        *task2_module.parameters(),
-    ]
-
-    loss_fn = MSELoss()
-    optimizer = SGD(params, lr=0.1)
-    aggregator = UPGrad()
-
-    inputs = torch.randn(8, 16, 10)  # 8 batches of 16 random input vectors of length 10
-    task1_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the first task
-    task2_targets = torch.randn(8, 16, 1)  # 8 batches of 16 targets for the second task
-
-    for input, target1, target2 in zip(inputs, task1_targets, task2_targets, strict=False):
-        features = shared_module(input)
-        output1 = task1_module(features)
-        output2 = task2_module(features)
-        loss1 = loss_fn(output1, target1)
-        loss2 = loss_fn(output2, target2)
-
-        mtl_backward([loss1, loss2], features=features)
-        jac_to_grad(shared_module.parameters(), aggregator)
-        optimizer.step()
-        optimizer.zero_grad()
-
-
-def test_partial_jd() -> None:
-    import torch
-    from torch.nn import Linear, MSELoss, ReLU, Sequential
-    from torch.optim import SGD
-
-    from torchjd.aggregation import UPGradWeighting
-    from torchjd.autogram import Engine
-
-    X = torch.randn(8, 16, 10)
-    Y = torch.randn(8, 16)
-
-    model = Sequential(Linear(10, 8), ReLU(), Linear(8, 5), ReLU(), Linear(5, 1))
-    loss_fn = MSELoss(reduction="none")
-
-    weighting = UPGradWeighting()
-
-    # Create the autogram engine that will compute the Gramian of the
-    # Jacobian with respect to the two last Linear layers' parameters.
-    engine = Engine(model[2:], batch_dim=0)
-
-    params = model.parameters()
-    optimizer = SGD(params, lr=0.1)
-
-    for x, y in zip(X, Y, strict=False):
-        y_hat = model(x).squeeze(dim=1)  # shape: [16]
-        losses = loss_fn(y_hat, y)  # shape: [16]
-        gramian = engine.compute_gramian(losses)
-        weights = weighting(gramian)
-        losses.backward(weights)
-        optimizer.step()
-        optimizer.zero_grad()
-
-
-def test_rnn() -> None:
-    import torch
-    from torch.nn import RNN
-    from torch.optim import SGD
-
-    from torchjd.aggregation import UPGrad
-    from torchjd.autojac import backward, jac_to_grad
-
-    rnn = RNN(input_size=10, hidden_size=20, num_layers=2)
-    optimizer = SGD(rnn.parameters(), lr=0.1)
-    aggregator = UPGrad()
-
-    inputs = torch.randn(8, 5, 3, 10)  # 8 batches of 3 sequences of length 5 and of dim 10.
-    targets = torch.randn(8, 5, 3, 20)  # 8 batches of 3 sequences of length 5 and of dim 20.
-
-    for input, target in zip(inputs, targets, strict=False):
-        output, _ = rnn(input)  # output is of shape [5, 3, 20].
-        losses = ((output - target) ** 2).mean(dim=[1, 2])  # 1 loss per sequence element.
-
-        backward(losses, parallel_chunk_size=1)
-        jac_to_grad(rnn.parameters(), aggregator)
-        optimizer.step()
-        optimizer.zero_grad()

From 9ef13016f29e8c74a69a1bfe7af911aa8e635b18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 23:17:03 +0100
Subject: [PATCH 5/8] Add testcode directives for doctests

---
 docs/source/examples/amp.rst                  |  2 +-
 docs/source/examples/basic_usage.rst          | 16 ++++++------
 docs/source/examples/iwmtl.rst                |  2 +-
 docs/source/examples/iwrm.rst                 |  6 ++---
 .../source/examples/lightning_integration.rst | 13 +++++++++-
 docs/source/examples/monitoring.rst           | 26 ++++++++++++++++++-
 docs/source/examples/mtl.rst                  |  2 +-
 docs/source/examples/partial_jd.rst           |  2 +-
 docs/source/examples/rnn.rst                  |  2 +-
 src/torchjd/autogram/_engine.py               | 26 ++++++++++++-------
 10 files changed, 69 insertions(+), 28 deletions(-)

diff --git a/docs/source/examples/amp.rst b/docs/source/examples/amp.rst
index 2de486c1..a5b0fc38 100644
--- a/docs/source/examples/amp.rst
+++ b/docs/source/examples/amp.rst
@@ -11,7 +11,7 @@ case, the losses) should preferably be scaled with a `GradScaler
 <https://pytorch.org/docs/stable/amp.html#gradient-scaling>`_ to avoid gradient underflow. The
 following example shows the resulting code for a multi-task learning use-case.
 
-.. code-block:: python
+.. testcode::
     :emphasize-lines: 2, 17, 27, 34-35, 37-38
 
     import torch
diff --git a/docs/source/examples/basic_usage.rst b/docs/source/examples/basic_usage.rst
index 64a1dbcd..58b435e4 100644
--- a/docs/source/examples/basic_usage.rst
+++ b/docs/source/examples/basic_usage.rst
@@ -12,7 +12,7 @@ the parameters are updated using the resulting aggregation.
 
 Import several classes from ``torch`` and ``torchjd``:
 
-.. code-block:: python
+.. testcode::
 
     import torch
     from torch.nn import Linear, MSELoss, ReLU, Sequential
@@ -24,14 +24,14 @@ Import several classes from ``torch`` and ``torchjd``:
 
 Define the model and the optimizer, as usual:
 
-.. code-block:: python
+.. testcode::
 
     model = Sequential(Linear(10, 5), ReLU(), Linear(5, 2))
     optimizer = SGD(model.parameters(), lr=0.1)
 
 Define the aggregator that will be used to combine the Jacobian matrix:
 
-.. code-block:: python
+.. testcode::
 
     aggregator = UPGrad()
 
@@ -41,7 +41,7 @@ negatively affected by the update.
 
 Now that everything is defined, we can train the model. Define the input and the associated target:
 
-.. code-block:: python
+.. testcode::
 
     input = torch.randn(16, 10)  # Batch of 16 random input vectors of length 10
     target1 = torch.randn(16)  # First batch of 16 targets
@@ -51,7 +51,7 @@ Here, we generate fake inputs and labels for the sake of the example.
 
 We can now compute the losses associated to each element of the batch.
 
-.. code-block:: python
+.. testcode::
 
     loss_fn = MSELoss()
     output = model(input)
@@ -62,7 +62,7 @@ The last steps are similar to gradient descent-based optimization, but using the
 
 Perform the Jacobian descent backward pass:
 
-.. code-block:: python
+.. testcode::
 
     autojac.backward([loss1, loss2])
     jac_to_grad(model.parameters(), aggregator)
@@ -73,7 +73,7 @@ field of the parameters. It also deletes the ``.jac`` fields save some memory.
 
 Update each parameter based on its ``.grad`` field, using the ``optimizer``:
 
-.. code-block:: python
+.. testcode::
 
     optimizer.step()
 
@@ -81,6 +81,6 @@ The model's parameters have been updated!
 
 As usual, you should now reset the ``.grad`` field of each model parameter:
 
-.. code-block:: python
+.. testcode::
 
     optimizer.zero_grad()
diff --git a/docs/source/examples/iwmtl.rst b/docs/source/examples/iwmtl.rst
index 8b2410f7..3235e0ee 100644
--- a/docs/source/examples/iwmtl.rst
+++ b/docs/source/examples/iwmtl.rst
@@ -9,7 +9,7 @@ this Gramian to reweight the gradients and resolve conflict entirely.
 
 The following example shows how to do that.
 
-.. code-block:: python
+.. testcode::
     :emphasize-lines: 5-6, 18-20, 31-32, 34-35, 37-38, 40-41
 
     import torch
diff --git a/docs/source/examples/iwrm.rst b/docs/source/examples/iwrm.rst
index ebc2bde5..4d553f43 100644
--- a/docs/source/examples/iwrm.rst
+++ b/docs/source/examples/iwrm.rst
@@ -41,7 +41,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
 .. tab-set::
     .. tab-item:: autograd (baseline)
 
-        .. code-block:: python
+        .. testcode::
 
             import torch
             from torch.nn import Linear, MSELoss, ReLU, Sequential
@@ -75,7 +75,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
 
     .. tab-item:: autojac
 
-        .. code-block:: python
+        .. testcode::
             :emphasize-lines: 5-6, 12, 16, 21-23
 
             import torch
@@ -110,7 +110,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
 
     .. tab-item:: autogram (recommended)
 
-        .. code-block:: python
+        .. testcode::
             :emphasize-lines: 5-6, 12, 16-17, 21-24
 
             import torch
diff --git a/docs/source/examples/lightning_integration.rst b/docs/source/examples/lightning_integration.rst
index c8416083..115f4226 100644
--- a/docs/source/examples/lightning_integration.rst
+++ b/docs/source/examples/lightning_integration.rst
@@ -10,7 +10,18 @@ The following code example demonstrates a basic multi-task learning setup using
 :class:`~lightning.pytorch.core.LightningModule` that will call :doc:`mtl_backward
 <../docs/autojac/mtl_backward>` at each training iteration.
 
-.. code-block:: python
+.. testsetup::
+
+    import warnings
+    import logging
+    from lightning.fabric.utilities.warnings import PossibleUserWarning
+
+    logging.disable(logging.INFO)
+    warnings.filterwarnings("ignore", category=DeprecationWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    warnings.filterwarnings("ignore", category=PossibleUserWarning)
+
+.. testcode::
     :emphasize-lines: 9-10, 18, 31-32
 
     import torch
diff --git a/docs/source/examples/monitoring.rst b/docs/source/examples/monitoring.rst
index 784eea4d..0570c9f1 100644
--- a/docs/source/examples/monitoring.rst
+++ b/docs/source/examples/monitoring.rst
@@ -14,7 +14,12 @@ Jacobian descent is doing something different than gradient descent. With
 :doc:`UPGrad <../docs/aggregation/upgrad>`, this happens when the original gradients conflict (i.e.
 they have a negative inner product).
 
-.. code-block:: python
+.. testsetup::
+
+    import torch
+    torch.manual_seed(0)
+
+.. testcode::
     :emphasize-lines: 9-11, 13-18, 33-34
 
     import torch
@@ -67,3 +72,22 @@ they have a negative inner product).
         jac_to_grad(shared_module.parameters(), aggregator)
         optimizer.step()
         optimizer.zero_grad()
+
+.. testoutput::
+
+    Weights: tensor([0.5000, 0.5000])
+    Cosine similarity: 1.0000
+    Weights: tensor([0.5000, 0.5000])
+    Cosine similarity: 1.0000
+    Weights: tensor([0.5000, 0.5000])
+    Cosine similarity: 1.0000
+    Weights: tensor([0.6618, 1.0554])
+    Cosine similarity: 0.9249
+    Weights: tensor([0.6569, 1.2146])
+    Cosine similarity: 0.8661
+    Weights: tensor([0.5004, 0.5060])
+    Cosine similarity: 1.0000
+    Weights: tensor([0.5000, 0.5000])
+    Cosine similarity: 1.0000
+    Weights: tensor([0.5746, 1.1607])
+    Cosine similarity: 0.9301
diff --git a/docs/source/examples/mtl.rst b/docs/source/examples/mtl.rst
index 147a999b..e0654529 100644
--- a/docs/source/examples/mtl.rst
+++ b/docs/source/examples/mtl.rst
@@ -18,7 +18,7 @@ For the sake of the example, we generate a fake dataset consisting of 8 batches
 vectors of dimension 10, and their corresponding scalar labels for both tasks.
 
 
-.. code-block:: python
+.. testcode::
     :emphasize-lines: 5-6, 19, 32-33
 
     import torch
diff --git a/docs/source/examples/partial_jd.rst b/docs/source/examples/partial_jd.rst
index ad82205a..add73f4f 100644
--- a/docs/source/examples/partial_jd.rst
+++ b/docs/source/examples/partial_jd.rst
@@ -13,7 +13,7 @@ perform the partial descent by considering only the parameters of the last two `
 doing this, we avoid computing the Jacobian and its Gramian with respect to the parameters of the
 first ``Linear`` layer, thereby reducing memory usage and computation time.
 
-.. code-block:: python
+.. testcode::
     :emphasize-lines: 16-18
 
     import torch
diff --git a/docs/source/examples/rnn.rst b/docs/source/examples/rnn.rst
index 43cf24c7..cdfbae07 100644
--- a/docs/source/examples/rnn.rst
+++ b/docs/source/examples/rnn.rst
@@ -5,7 +5,7 @@ When training recurrent neural networks for sequence modelling, we can easily ob
 element of the output sequences. If the gradients of these losses are likely to conflict, Jacobian
 descent can be leveraged to enhance optimization.
 
-.. code-block:: python
+.. testcode::
     :emphasize-lines: 5-6, 10, 17, 19-20
 
     import torch
diff --git a/src/torchjd/autogram/_engine.py b/src/torchjd/autogram/_engine.py
index 7b7eae96..bed1c998 100644
--- a/src/torchjd/autogram/_engine.py
+++ b/src/torchjd/autogram/_engine.py
@@ -78,7 +78,7 @@ class Engine:
 
         Train a model using Gramian-based Jacobian descent.
 
-        .. code-block:: python
+        .. testcode::
             :emphasize-lines: 5-6, 15-16, 18-19, 26-29
 
             import torch
@@ -162,15 +162,21 @@ class Engine:
         Parent modules should call their child modules directly rather than using their child
         modules' parameters themselves. For instance, the following model is not supported:
 
-        >>> class Model(nn.Module):
-        >>>     def __init__(self):
-        >>>         super().__init__()
-        >>>         self.linear = nn.Linear(2, 3)  # Child module
-        >>>
-        >>>     def forward(self, input: Tensor) -> Tensor:
-        >>>         # Incorrect: Use the child module's parameters directly without calling it.
-        >>>         return input @ self.linear.weight.T + self.linear.bias
-        >>>         # Correct alternative: return self.linear(input)
+        .. testsetup::
+
+            from torch import Tensor, nn
+
+        .. testcode::
+
+            class Model(nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.linear = nn.Linear(2, 3)  # Child module
+
+                def forward(self, input: Tensor) -> Tensor:
+                    # Incorrect: Use the child module's parameters directly without calling it.
+                    return input @ self.linear.weight.T + self.linear.bias
+                    # Correct alternative: return self.linear(input)
 
     .. note::
           For maximum efficiency, modules should ideally not contain both direct trainable

From 017b7783e4cce8fe20889ff6f6992fe097f0acfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 23:32:05 +0100
Subject: [PATCH 6/8] Update instructions on doc testing

---
 AGENTS.md       | 3 ---
 CONTRIBUTING.md | 5 ++---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 78177b50..d08db835 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -6,9 +6,6 @@
 - We use uv for everything (e.g. we do `uv run python ...` to run some python code, and
   `uv run pytest tests/unit` to run unit tests). Please prefer `uv run python -c ...` over
   `python3 -c ...`
-- When you create or modify a code example in a public docstring, always update the corresponding
-  doc test in the appropriate file of `tests/doc`. This also applies to any change in an example of
-  a `.rst` file, that must be updated in the corresponding test in `tests/doc/test_rst.py`.
 - After generating code, please run `uv run ty check`, `uv run ruff check` and `uv run ruff format`.
   Fix any error.
 - After changing anything in `src` or in `tests/unit` or `tests/doc`, please identify the affected
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 81017471..95077588 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -99,10 +99,9 @@ uv run pre-commit install
      CUBLAS_WORKSPACE_CONFIG=:4096:8 PYTEST_TORCH_DEVICE=cuda:0 uv run pytest tests/unit
      ```
 
-   - To check that the usage examples from docstrings and `.rst` files are correct, we test their
-   behavior in `tests/doc`. To run these tests, do:
+   - To check that the usage examples from docstrings and `.rst` files are correct, run:
      ```bash
-     uv run pytest tests/doc
+     uv run make doctest -C docs
      ```
 
   - To compute the code coverage locally, you should run the unit tests and the doc tests together,

From 209d12581e04b6283f8362e05520887d4c984e76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 23:34:49 +0100
Subject: [PATCH 7/8] Move lightning from test to doc deps

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1172decd..84700e94 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,12 +79,12 @@ doc = [
     "myst-parser>=3.0.1",  # Never tested lower versions
     "sphinx-design>=0.6.0",  # Never tested lower versions
     "docutils>=0.22.4",  # Never tested lower versions
+    "lightning>=2.0.9",  # No OptimizerLRScheduler public type before 2.0.9
 ]
 
 test = [
     "pytest>=7.3",  # Before version 7.3, not all tests are run
     "pytest-cov>=6.0.0",  # Recent version to avoid problems, could be relaxed
-    "lightning>=2.0.9",  # No OptimizerLRScheduler public type before 2.0.9
     "torchvision>=0.18.0"
 ]
 

From bf59f4a37350a73161b9d27f254f225db572b884 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Val=C3=A9rian=20Rey?= <valerian.rey@gmail.com>
Date: Fri, 6 Mar 2026 23:35:40 +0100
Subject: [PATCH 8/8] Update checks so that the build-doc job runs doctest

---
 .github/workflows/checks.yml | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 1f68ee15..bb5b941b 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -54,8 +54,8 @@ jobs:
           options: ${{ matrix.options || 'full' }}
           groups: test ${{ matrix.extra_groups }}
 
-      - name: Run tests
-        run: uv run pytest -W error tests/unit tests/doc --cov=src --cov-report=xml
+      - name: Run unit tests
+        run: uv run pytest -W error tests/unit --cov=src --cov-report=xml
         env:
           PYTEST_TORCH_DTYPE: ${{ matrix.dtype || 'float32' }}
 
@@ -65,7 +65,7 @@ jobs:
           token: ${{ secrets.CODECOV_TOKEN }}
 
   build-doc:
-    name: Build documentation
+    name: Build and test documentation
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
@@ -84,6 +84,10 @@ jobs:
         working-directory: docs
         run: uv run make dirhtml
 
+      - name: Test Documentation
+        working-directory: docs
+        run: uv run make doctest
+
   check-links:
     name: Link correctness
     runs-on: ubuntu-latest