10 changes: 7 additions & 3 deletions .github/workflows/checks.yml
Original file line number Diff line number Diff line change
@@ -54,8 +54,8 @@ jobs:
options: ${{ matrix.options || 'full' }}
groups: test ${{ matrix.extra_groups }}

- name: Run tests
run: uv run pytest -W error tests/unit tests/doc --cov=src --cov-report=xml
- name: Run unit tests
run: uv run pytest -W error tests/unit --cov=src --cov-report=xml
env:
PYTEST_TORCH_DTYPE: ${{ matrix.dtype || 'float32' }}

@@ -65,7 +65,7 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}

build-doc:
name: Build documentation
name: Build and test documentation
runs-on: ubuntu-latest
steps:
- name: Checkout repository
@@ -84,6 +84,10 @@ jobs:
working-directory: docs
run: uv run make dirhtml

- name: Test documentation
working-directory: docs
run: uv run make doctest

check-links:
name: Link correctness
runs-on: ubuntu-latest
3 changes: 0 additions & 3 deletions AGENTS.md
@@ -6,9 +6,6 @@
- We use uv for everything (e.g. we do `uv run python ...` to run some python code, and
`uv run pytest tests/unit` to run unit tests). Please prefer `uv run python -c ...` over
`python3 -c ...`
- When you create or modify a code example in a public docstring, always update the corresponding
doc test in the appropriate file of `tests/doc`. This also applies to any change in an example of
a `.rst` file, that must be updated in the corresponding test in `tests/doc/test_rst.py`.
- After generating code, please run `uv run ty check`, `uv run ruff check` and `uv run ruff format`.
Fix any error.
- After changing anything in `src` or in `tests/unit` or `tests/doc`, please identify the affected
5 changes: 2 additions & 3 deletions CONTRIBUTING.md
@@ -99,10 +99,9 @@ uv run pre-commit install
CUBLAS_WORKSPACE_CONFIG=:4096:8 PYTEST_TORCH_DEVICE=cuda:0 uv run pytest tests/unit
```

- To check that the usage examples from docstrings and `.rst` files are correct, we test their
behavior in `tests/doc`. To run these tests, do:
- To check that the usage examples from docstrings and `.rst` files are correct, run:
```bash
uv run pytest tests/doc
uv run make doctest -C docs
```

- To compute the code coverage locally, you should run the unit tests and the doc tests together,
37 changes: 37 additions & 0 deletions docs/source/conf.py
@@ -6,8 +6,14 @@
import inspect
import os
import sys
from typing import ClassVar

import tomli
from docutils.parsers.rst import directives
from sphinx.application import Sphinx
from sphinx.directives.code import parse_line_num_spec
from sphinx.ext.doctest import TestcodeDirective
from sphinx.util.typing import OptionSpec

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
@@ -40,6 +46,7 @@
"sphinx.ext.intersphinx",
"myst_parser", # Enables markdown support
"sphinx_design", # Enables side to side cards
"sphinx.ext.doctest",
]

# -- Options for HTML output -------------------------------------------------
@@ -134,3 +141,33 @@ def _get_version_str() -> str:
except KeyError:
version_str = "main"
return version_str


class _TestcodeWithEmphasisDirective(TestcodeDirective):
"""
Extension of ``.. testcode::`` that additionally supports ``:emphasize-lines:``.

Sphinx's built-in ``.. testcode::`` directive does not support ``:emphasize-lines:``. This
subclass adds that option and forwards it as ``highlight_args['hl_lines']`` on the resulting
node, which is the same mechanism used by ``.. code-block::``.
"""

option_spec: ClassVar[OptionSpec] = {
**TestcodeDirective.option_spec,
"emphasize-lines": directives.unchanged_required,
}

def run(self) -> list:
result = super().run()
linespec = self.options.get("emphasize-lines")
if linespec and result:
node = result[0]
nlines = len(self.content)
hl_lines = parse_line_num_spec(linespec, nlines)
hl_lines = [x + 1 for x in hl_lines if x < nlines]
node["highlight_args"] = {"hl_lines": hl_lines}
return result


def setup(app: Sphinx) -> None:
app.add_directive("testcode", _TestcodeWithEmphasisDirective, override=True)
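The added directive converts a one-based ``:emphasize-lines:`` spec into zero-based indices (via ``parse_line_num_spec``), filters out-of-range entries, and shifts back to one-based numbers for ``highlight_args``. As a rough, self-contained sketch of that round trip — a simplified stand-in for Sphinx's ``parse_line_num_spec``, not its exact implementation:

```python
def parse_emphasize_lines(spec: str, nlines: int) -> list[int]:
    """Parse a spec like "2, 4-5" into zero-based line indices.

    Simplified stand-in for sphinx.directives.code.parse_line_num_spec:
    comma-separated single numbers and inclusive ranges, one-based input.
    """
    indices: list[int] = []
    for part in spec.split(","):
        part = part.strip()
        if "-" in part:
            start, end = part.split("-")
            # "4-5" covers one-based lines 4..5, i.e. zero-based 3..4.
            indices.extend(range(int(start) - 1, int(end)))
        else:
            indices.append(int(part) - 1)
    # Drop out-of-range entries, as the directive does with `x < nlines`.
    return [i for i in indices if 0 <= i < nlines]


# The directive then shifts back to one-based numbers for highlight_args.
hl_lines = [i + 1 for i in parse_emphasize_lines("2, 4-5", nlines=10)]
```

With a 10-line block, the spec ``"2, 4-5"`` would yield ``hl_lines == [2, 4, 5]``, matching what ``.. code-block::`` stores in ``highlight_args['hl_lines']``.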
2 changes: 1 addition & 1 deletion docs/source/examples/amp.rst
@@ -11,7 +11,7 @@ case, the losses) should preferably be scaled with a `GradScaler
<https://pytorch.org/docs/stable/amp.html#gradient-scaling>`_ to avoid gradient underflow. The
following example shows the resulting code for a multi-task learning use-case.

.. code-block:: python
.. testcode::
:emphasize-lines: 2, 17, 27, 34-35, 37-38

import torch
16 changes: 8 additions & 8 deletions docs/source/examples/basic_usage.rst
@@ -12,7 +12,7 @@ the parameters are updated using the resulting aggregation.

Import several classes from ``torch`` and ``torchjd``:

.. code-block:: python
.. testcode::
import torch
from torch.nn import Linear, MSELoss, ReLU, Sequential
@@ -24,14 +24,14 @@ Import several classes from ``torch`` and ``torchjd``:

Define the model and the optimizer, as usual:

.. code-block:: python
.. testcode::

model = Sequential(Linear(10, 5), ReLU(), Linear(5, 2))
optimizer = SGD(model.parameters(), lr=0.1)

Define the aggregator that will be used to combine the Jacobian matrix:

.. code-block:: python
.. testcode::

aggregator = UPGrad()

@@ -41,7 +41,7 @@ negatively affected by the update.

Now that everything is defined, we can train the model. Define the input and the associated target:

.. code-block:: python
.. testcode::

input = torch.randn(16, 10) # Batch of 16 random input vectors of length 10
target1 = torch.randn(16) # First batch of 16 targets
@@ -51,7 +51,7 @@ Here, we generate fake inputs and labels for the sake of the example.

We can now compute the losses associated with each element of the batch.

.. code-block:: python
.. testcode::

loss_fn = MSELoss()
output = model(input)
@@ -62,7 +62,7 @@ The last steps are similar to gradient descent-based optimization, but using the

Perform the Jacobian descent backward pass:

.. code-block:: python
.. testcode::

autojac.backward([loss1, loss2])
jac_to_grad(model.parameters(), aggregator)
@@ -73,14 +73,14 @@ field of the parameters. It also deletes the ``.jac`` fields to save some memory.

Update each parameter based on its ``.grad`` field, using the ``optimizer``:

.. code-block:: python
.. testcode::

optimizer.step()

The model's parameters have been updated!

As usual, you should now reset the ``.grad`` field of each model parameter:

.. code-block:: python
.. testcode::

optimizer.zero_grad()
2 changes: 1 addition & 1 deletion docs/source/examples/iwmtl.rst
@@ -9,7 +9,7 @@ this Gramian to reweight the gradients and resolve conflict entirely.

The following example shows how to do that.

.. code-block:: python
.. testcode::
:emphasize-lines: 5-6, 18-20, 31-32, 34-35, 37-38, 40-41

import torch
6 changes: 3 additions & 3 deletions docs/source/examples/iwrm.rst
@@ -41,7 +41,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac
.. tab-set::
.. tab-item:: autograd (baseline)

.. code-block:: python
.. testcode::

import torch
from torch.nn import Linear, MSELoss, ReLU, Sequential
@@ -75,7 +75,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac

.. tab-item:: autojac

.. code-block:: python
.. testcode::
:emphasize-lines: 5-6, 12, 16, 21-23

import torch
@@ -110,7 +110,7 @@ batch of data. When minimizing per-instance losses (IWRM), we use either autojac

.. tab-item:: autogram (recommended)

.. code-block:: python
.. testcode::
:emphasize-lines: 5-6, 12, 16-17, 21-24

import torch
13 changes: 12 additions & 1 deletion docs/source/examples/lightning_integration.rst
@@ -10,7 +10,18 @@ The following code example demonstrates a basic multi-task learning setup using
:class:`~lightning.pytorch.core.LightningModule` that will call :doc:`mtl_backward
<../docs/autojac/mtl_backward>` at each training iteration.

.. code-block:: python
.. testsetup::

import warnings
import logging
from lightning.fabric.utilities.warnings import PossibleUserWarning

logging.disable(logging.INFO)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=PossibleUserWarning)

.. testcode::
:emphasize-lines: 9-10, 18, 31-32

import torch
26 changes: 25 additions & 1 deletion docs/source/examples/monitoring.rst
@@ -14,7 +14,12 @@ Jacobian descent is doing something different than gradient descent. With
:doc:`UPGrad <../docs/aggregation/upgrad>`, this happens when the original gradients conflict (i.e.
they have a negative inner product).

.. code-block:: python
.. testsetup::

import torch
torch.manual_seed(0)

.. testcode::
:emphasize-lines: 9-11, 13-18, 33-34

import torch
@@ -67,3 +72,22 @@ they have a negative inner product.
jac_to_grad(shared_module.parameters(), aggregator)
optimizer.step()
optimizer.zero_grad()

.. testoutput::

Weights: tensor([0.5000, 0.5000])
Cosine similarity: 1.0000
Weights: tensor([0.5000, 0.5000])
Cosine similarity: 1.0000
Weights: tensor([0.5000, 0.5000])
Cosine similarity: 1.0000
Weights: tensor([0.6618, 1.0554])
Cosine similarity: 0.9249
Weights: tensor([0.6569, 1.2146])
Cosine similarity: 0.8661
Weights: tensor([0.5004, 0.5060])
Cosine similarity: 1.0000
Weights: tensor([0.5000, 0.5000])
Cosine similarity: 1.0000
Weights: tensor([0.5746, 1.1607])
Cosine similarity: 0.9301
2 changes: 1 addition & 1 deletion docs/source/examples/mtl.rst
@@ -18,7 +18,7 @@ For the sake of the example, we generate a fake dataset consisting of 8 batches
vectors of dimension 10, and their corresponding scalar labels for both tasks.


.. code-block:: python
.. testcode::
:emphasize-lines: 5-6, 19, 32-33

import torch
2 changes: 1 addition & 1 deletion docs/source/examples/partial_jd.rst
@@ -13,7 +13,7 @@ perform the partial descent by considering only the parameters of the last two `
doing this, we avoid computing the Jacobian and its Gramian with respect to the parameters of the
first ``Linear`` layer, thereby reducing memory usage and computation time.

.. code-block:: python
.. testcode::
:emphasize-lines: 16-18

import torch
2 changes: 1 addition & 1 deletion docs/source/examples/rnn.rst
@@ -5,7 +5,7 @@ When training recurrent neural networks for sequence modelling, we can easily ob
element of the output sequences. If the gradients of these losses are likely to conflict, Jacobian
descent can be leveraged to enhance optimization.

.. code-block:: python
.. testcode::
:emphasize-lines: 5-6, 10, 17, 19-20

import torch
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -78,12 +78,13 @@ doc = [
"sphinx-autodoc-typehints>=3.5.0", # Bugged Union on Python 3.14 before 3.5.0
"myst-parser>=3.0.1", # Never tested lower versions
"sphinx-design>=0.6.0", # Never tested lower versions
"docutils>=0.22.4", # Never tested lower versions
"lightning>=2.0.9", # No OptimizerLRScheduler public type before 2.0.9
]

test = [
"pytest>=7.3", # Before version 7.3, not all tests are run
"pytest-cov>=6.0.0", # Recent version to avoid problems, could be relaxed
"lightning>=2.0.9", # No OptimizerLRScheduler public type before 2.0.9
"torchvision>=0.18.0"
]

26 changes: 16 additions & 10 deletions src/torchjd/autogram/_engine.py
@@ -78,7 +78,7 @@ class Engine:

Train a model using Gramian-based Jacobian descent.

.. code-block:: python
.. testcode::
:emphasize-lines: 5-6, 15-16, 18-19, 26-29

import torch
@@ -162,15 +162,21 @@ class Engine:
Parent modules should call their child modules directly rather than using their child
modules' parameters themselves. For instance, the following model is not supported:

>>> class Model(nn.Module):
>>> def __init__(self):
>>> super().__init__()
>>> self.linear = nn.Linear(2, 3) # Child module
>>>
>>> def forward(self, input: Tensor) -> Tensor:
>>> # Incorrect: Use the child module's parameters directly without calling it.
>>> return input @ self.linear.weight.T + self.linear.bias
>>> # Correct alternative: return self.linear(input)
.. testsetup::

from torch import nn

.. testcode::

class Model(nn.Module):
def __init__(self):
super().__init__()
self.linear = nn.Linear(2, 3) # Child module

def forward(self, input: Tensor) -> Tensor:
# Incorrect: Use the child module's parameters directly without calling it.
return input @ self.linear.weight.T + self.linear.bias
# Correct alternative: return self.linear(input)

.. note::
For maximum efficiency, modules should ideally not contain both direct trainable
4 changes: 2 additions & 2 deletions src/torchjd/autojac/_jac.py
@@ -68,8 +68,8 @@ def jac(
>>> jacobians = jac([y1, y2], param)
>>>
>>> jacobians
(tensor([[-1., 1.],
[ 2., 4.]]),)
(tensor([[-1., 1.],
[ 2., 4.]]),)

.. admonition::
Example
2 changes: 1 addition & 1 deletion src/torchjd/autojac/_jac_to_grad.py
@@ -105,7 +105,7 @@ def jac_to_grad(
>>> param.grad
tensor([0.5000, 2.5000])
>>> weights
tensor([0.5, 0.5])
tensor([0.5000, 0.5000])

The ``.grad`` field of ``param`` now contains the aggregation (by UPGrad) of the Jacobian of
:math:`\begin{bmatrix}y_1 \\ y_2\end{bmatrix}` with respect to ``param``. In this case, the
Expand Down
Empty file removed tests/doc/__init__.py
Empty file.