From d72bce166c2a7cc161012f30b95343976c16ef65 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Feb 2026 02:52:19 +0800 Subject: [PATCH 1/6] style: use mdformat to format Markdown Signed-off-by: Jinzhe Zeng --- .pre-commit-config.yaml | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 59a973571f..d9b0402ac1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -53,23 +53,30 @@ repos: - id: velin args: ["--write"] exclude: ^source/3rdparty - # Python inside docs - - repo: https://github.com/asottile/blacken-docs - rev: 1.20.0 + # markdown + - repo: https://github.com/hukkin/mdformat + rev: 1.0.0 hooks: - - id: blacken-docs + - id: mdformat + additional_dependencies: + - mdformat-myst==0.3.0 + - mdformat-ruff==0.1.3 + - mdformat-web==0.2.0 + - mdformat-config==0.2.1 + - mdformat-beautysh==1.0.0 + - mdformat-gfm-alerts==2.0.0 # C++ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v21.1.8 hooks: - id: clang-format exclude: ^(source/3rdparty|source/lib/src/gpu/cudart/.+\.inc|.+\.ipynb$|.+\.json$) - # markdown, yaml, CSS, javascript + # yaml, CSS, javascript - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 hooks: - id: prettier - types_or: [markdown, yaml, css] + types_or: [yaml, css] # workflow files cannot be modified by pre-commit.ci exclude: ^(source/3rdparty|\.github/workflows|\.clang-format) # Shell From a3243a14ad7ae6751ef45d6530689b1c9762ffd2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Feb 2026 18:54:40 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .pre-commit-config.yaml | 16 +-- AGENTS.md | 18 +-- CONTRIBUTING.md | 23 ++-- README.md | 2 +- doc/backend.md | 2 +- doc/data/data-conv.md | 2 +- doc/data/system.md | 52 ++++----- doc/development/cmake.md | 
2 +- doc/development/create-a-model-pt.md | 8 +- doc/development/create-a-model-tf.md | 6 +- doc/development/type-embedding.md | 6 +- doc/env.md | 22 ++-- doc/freeze/compress.md | 12 +- doc/freeze/freeze.md | 8 +- doc/inference/cxx.md | 2 +- doc/inference/nodejs.md | 10 +- doc/inference/python.md | 4 +- doc/install/build-conda.md | 4 +- doc/install/easy-install-dev.md | 2 +- doc/install/easy-install.md | 44 +++---- doc/install/install-from-c-library.md | 4 +- doc/install/install-from-source.md | 113 +++++++++--------- doc/install/install-gromacs.md | 9 +- doc/install/install-lammps.md | 2 +- doc/install/install-nodejs.md | 4 +- doc/model/change-bias.md | 8 +- doc/model/dpa2.md | 2 +- doc/model/dpa3.md | 8 +- doc/model/dplr.md | 10 +- doc/model/dprc.md | 134 +++++++++++++++++----- doc/model/linear.md | 2 +- doc/model/overall.md | 18 +-- doc/model/pairtab.md | 10 +- doc/model/sel.md | 10 +- doc/model/show-model-info.md | 7 ++ doc/model/train-energy-hessian.md | 26 ++--- doc/model/train-energy-spin.md | 36 +++--- doc/model/train-energy.md | 6 +- doc/model/train-fitting-dos.md | 14 +-- doc/model/train-fitting-property.md | 16 +-- doc/model/train-fitting-tensor.md | 24 ++-- doc/model/train-hybrid.md | 6 +- doc/model/train-se-a-mask.md | 4 +- doc/model/train-se-atten.md | 8 +- doc/model/train-se-e2-a-tebd.md | 6 +- doc/model/train-se-e2-a.md | 6 +- doc/model/train-se-e2-r.md | 6 +- doc/model/train-se-e3-tebd.md | 2 +- doc/model/train-se-e3.md | 6 +- doc/nvnmd/nvnmd.md | 50 +++++--- doc/test/model-deviation.md | 4 +- doc/third-party/ase.md | 10 +- doc/third-party/dpdata.md | 2 +- doc/third-party/gromacs.md | 2 +- doc/third-party/ipi.md | 2 +- doc/third-party/lammps-command.md | 25 ++-- doc/train/finetuning.md | 12 +- doc/train/gpu-limitations.md | 8 +- doc/train/multi-task-training.md | 19 +-- doc/train/parallel-training.md | 15 ++- doc/train/tensorboard.md | 2 +- doc/train/training-advanced.md | 6 +- doc/train/training.md | 22 ++-- 
doc/troubleshooting/howtoset_num_nodes.md | 18 +-- examples/property/train/README.md | 4 +- source/3rdparty/README.md | 10 +- source/3rdparty/implib/arch/e2k/README.md | 7 +- source/nodejs/README.md | 4 +- 68 files changed, 544 insertions(+), 430 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fc33c8b98..efa5840469 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,14 +57,14 @@ repos: - repo: https://github.com/hukkin/mdformat rev: 1.0.0 hooks: - - id: mdformat - additional_dependencies: - - mdformat-myst==0.3.0 - - mdformat-ruff==0.1.3 - - mdformat-web==0.2.0 - - mdformat-config==0.2.1 - - mdformat-beautysh==1.0.0 - - mdformat-gfm-alerts==2.0.0 + - id: mdformat + additional_dependencies: + - mdformat-myst==0.3.0 + - mdformat-ruff==0.1.3 + - mdformat-web==0.2.0 + - mdformat-config==0.2.1 + - mdformat-beautysh==1.0.0 + - mdformat-gfm-alerts==2.0.0 # C++ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v21.1.8 diff --git a/AGENTS.md b/AGENTS.md index c629a08def..bcac9f1514 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -23,8 +23,8 @@ DeePMD-kit is a deep learning package for many-body potential energy representat ### Lint and Format Code - Install linter: `uv pip install ruff` -- Run linting: `ruff check .` -- takes <1 second -- Format code: `ruff format .` -- takes <1 second +- Run linting: `ruff check .` -- takes \<1 second +- Format code: `ruff format .` -- takes \<1 second - **Always run `ruff check .` and `ruff format .` before committing changes or the CI will fail.** ### Training and Validation @@ -40,19 +40,19 @@ DeePMD-kit is a deep learning package for many-body potential energy representat ### Basic Functionality Validation 1. **CLI Interface**: Run `dp --version` and `dp -h` to verify installation -2. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` -3. 
**Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` +1. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` +1. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` ### Training Workflow Validation 1. **TensorFlow Training**: `cd examples/water/se_e2_a && timeout 60 dp train input.json --skip-neighbor-stat` -- should start training and show decreasing loss -2. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss -3. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values +1. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss +1. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values ### Test-Based Validation 1. **Core Tests**: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- should pass in ~10 seconds -2. **Multi-backend**: Test both TensorFlow and PyTorch components work +1. **Multi-backend**: Test both TensorFlow and PyTorch components work ## Common Commands and Timing @@ -127,8 +127,8 @@ source/ # C++ source code and tests ### Linting and Formatting -- **Ruff check**: <1 second -- **Ruff format**: <1 second +- **Ruff check**: \<1 second +- **Ruff format**: \<1 second - **Pre-commit hooks**: May have network issues, use individual tools ### Commit Messages and PR Titles diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 67491672e8..a8378350e4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,7 +47,7 @@ Please perform the following steps to create your Pull Request to this repositor ### Step 1: Fork the repository 1. Visit the project: -2. Click the **Fork** button on the top right and wait it to finish. 
+1. Click the **Fork** button on the top right and wait it to finish. ### Step 2: Clone the forked repository to local storage and set configurations @@ -58,7 +58,7 @@ Please perform the following steps to create your Pull Request to this repositor # Replace `$username` with your GitHub ID ``` -2. Add deepmodeling's repo as your remote repo, we can name it "upstream". And fetch upstream's latest codes to your workstation. +1. Add deepmodeling's repo as your remote repo, we can name it "upstream". And fetch upstream's latest codes to your workstation. ```bash git remote add upstream https://github.com/deepmodeling/deepmd-kit.git @@ -70,9 +70,9 @@ Please perform the following steps to create your Pull Request to this repositor git merge upstream/master ``` -3. Modify your codes and design unit tests. +1. Modify your codes and design unit tests. -4. Commit your changes to a new branch +1. Commit your changes to a new branch ```bash git checkout -b branch1 @@ -81,7 +81,8 @@ Please perform the following steps to create your Pull Request to this repositor git commit -m "commit-message: update the xx" ``` -5. Push the changed codes to your original repo on github. +1. Push the changed codes to your original repo on github. + ```bash git push origin branch1 ``` @@ -97,15 +98,15 @@ Please perform the following steps to create your Pull Request to this repositor git rebase upstream/master ``` -2. Create a new branch based on the master branch. +1. Create a new branch based on the master branch. ```bash git checkout -b new-branch-name ``` -3. Modify your codes and design unit tests. +1. Modify your codes and design unit tests. -4. Commit your changes +1. Commit your changes ```bash git status # Checks the local status @@ -113,7 +114,7 @@ Please perform the following steps to create your Pull Request to this repositor git commit -m "commit-message: update the xx" ``` -5. Keep your branch in sync with upstream/master +1. 
Keep your branch in sync with upstream/master ```bash # While on your new branch @@ -121,7 +122,7 @@ Please perform the following steps to create your Pull Request to this repositor git rebase upstream/master ``` -6. Push your changes to the remote +1. Push your changes to the remote ```bash git push -u origin new-branch-name # "-u" is used to track the remote branch from origin @@ -130,7 +131,7 @@ Please perform the following steps to create your Pull Request to this repositor ### Step 3: Create a pull request 1. Visit your fork at (replace `$username` with your GitHub ID) -2. Click `pull requests`, followed by `New pull request` and `Compare & pull request` to create your PR. +1. Click `pull requests`, followed by `New pull request` and `Compare & pull request` to create your PR. Now, your PR is successfully submitted! After this PR is merged, you will automatically become a contributor to DeePMD-kit. diff --git a/README.md b/README.md index 143ed1b0ab..58ec1fec7f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [DeePMD-kit logo](./doc/logo.md) ---- +______________________________________________________________________ # DeePMD-kit diff --git a/doc/backend.md b/doc/backend.md index b2f7dc4826..3e2b7e4acb 100644 --- a/doc/backend.md +++ b/doc/backend.md @@ -46,7 +46,7 @@ The `.pd` extension is used for model checkpoint storage, which is commonly util ### DP {{ dpmodel_icon }} -:::{note} +:::\{note} This backend is only for development and should not take into production. 
::: diff --git a/doc/data/data-conv.md b/doc/data/data-conv.md index 56ce526480..30be98bcfe 100644 --- a/doc/data/data-conv.md +++ b/doc/data/data-conv.md @@ -57,7 +57,7 @@ In the raw format, the property of one frame is provided per line, ending with ` ```bash $ cat force.raw -0.724 2.039 -0.951 0.841 -0.464 0.363 - 6.737 1.554 -5.587 -2.803 0.062 2.222 +6.737 1.554 -5.587 -2.803 0.062 2.222 -1.968 -0.163 1.020 -0.225 -0.789 0.343 ``` diff --git a/doc/data/system.md b/doc/data/system.md index f6da7b534b..5f834c4345 100644 --- a/doc/data/system.md +++ b/doc/data/system.md @@ -12,29 +12,29 @@ A system should contain system properties, input frame properties, and labeled f The input frame properties contain the following property, the first axis of which is the number of frames: -| ID | Property | Raw file | Unit | Required/Optional | Shape | Description | -| --------- | --------------------------------------------------- | ---------- | ---- | -------------------- | ------------------------ | ----------------------------------------- | -| coord | Atomic coordinates | coord.raw | Å | Required | Nframes \* Natoms \* 3 | -| box | Boxes | box.raw | Å | Required if periodic | Nframes \* 3 \* 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| fparam | Extra frame parameters | fparam.raw | Any | Optional | Nframes \* Any | -| aparam | Extra atomic parameters | aparam.raw | Any | Optional | Nframes \* aparam \* Any | -| numb_copy | Each frame is copied by the `numb_copy` (int) times | prob.raw | 1 | Optional | Nframes | Integer; Default is 1 for all frames | +| ID | Property | Raw file | Unit | Required/Optional | Shape | Description | +| --------- | --------------------------------------------------- | ---------- | ---- | -------------------- | ---------------------- | ----------------------------------------- | +| coord | Atomic coordinates | coord.raw | Å | Required | Nframes * Natoms * 3 | | +| box | Boxes | box.raw | Å | Required if periodic | Nframes * 3 * 3 | in the 
order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| fparam | Extra frame parameters | fparam.raw | Any | Optional | Nframes * Any | | +| aparam | Extra atomic parameters | aparam.raw | Any | Optional | Nframes * aparam * Any | | +| numb_copy | Each frame is copied by the `numb_copy` (int) times | prob.raw | 1 | Optional | Nframes | Integer; Default is 1 for all frames | The labeled frame properties are listed as follows, all of which will be used for training if and only if the loss function contains such property: -| ID | Property | Raw file | Unit | Shape | Description | -| --------------------- | -------------------------------------------------------------------------------- | ------------------------- | ------ | ------------------------------------- | ----------------------------------------- | -| energy | Frame energies | energy.raw | eV | Nframes | -| force | Atomic forces | force.raw | eV/Å | Nframes \* Natoms \* 3 | -| virial | Frame virial | virial.raw | eV | Nframes \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| hessian | Frame energy Hessian matrices | hessian.raw | eV/Å^2 | Nframes \* Natoms \* 3 \* Natoms \* 3 | full Hessian matrices | -| atom_ener | Atomic energies | atom_ener.raw | eV | Nframes \* Natoms | -| atom_pref | Weights of atomic forces | atom_pref.raw | 1 | Nframes \* Natoms | -| dipole | Frame dipole | dipole.raw | Any | Nframes \* 3 | -| atomic_dipole | Atomic dipole | atomic_dipole.raw | Any | Nframes \* Natoms \* 3 | -| polarizability | Frame polarizability | polarizability.raw | Any | Nframes \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| atomic_polarizability | Atomic polarizability | atomic_polarizability.raw | Any | Nframes \* Natoms \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| drdq | Partial derivative of atomic coordinates with respect to generalized coordinates | drdq.raw | 1 | Nframes \* Natoms \* 3 \* Ngen_coords | +| ID | Property | Raw file | Unit | Shape | Description | +| --------------------- | 
-------------------------------------------------------------------------------- | ------------------------- | ------ | ---------------------------------- | ----------------------------------------- | +| energy | Frame energies | energy.raw | eV | Nframes | | +| force | Atomic forces | force.raw | eV/Å | Nframes * Natoms * 3 | | +| virial | Frame virial | virial.raw | eV | Nframes * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| hessian | Frame energy Hessian matrices | hessian.raw | eV/Å^2 | Nframes * Natoms * 3 * Natoms * 3 | full Hessian matrices | +| atom_ener | Atomic energies | atom_ener.raw | eV | Nframes * Natoms | | +| atom_pref | Weights of atomic forces | atom_pref.raw | 1 | Nframes * Natoms | | +| dipole | Frame dipole | dipole.raw | Any | Nframes * 3 | | +| atomic_dipole | Atomic dipole | atomic_dipole.raw | Any | Nframes * Natoms * 3 | | +| polarizability | Frame polarizability | polarizability.raw | Any | Nframes * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| atomic_polarizability | Atomic polarizability | atomic_polarizability.raw | Any | Nframes * Natoms * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| drdq | Partial derivative of atomic coordinates with respect to generalized coordinates | drdq.raw | 1 | Nframes * Natoms * 3 * Ngen_coords | | In general, we always use the following convention of units: @@ -50,7 +50,7 @@ In general, we always use the following convention of units: ## Mixed type -:::{note} +:::\{note} Only the [DPA-1](../model/train-se-atten.md) and [DPA-2](../model/dpa2.md) descriptors support this format. ::: @@ -73,11 +73,11 @@ set.*/real_atom_types.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of element types contained in all frames is `Ntypes`. 
Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | File | Required/Optional | Shape | Description | -| -------- | -------------------------------- | ------------------- | ----------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------ | -| / | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input | -| type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to atom type contained in all the frames, which is unnecessart to be contained in the periodic table | -| type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes \* Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map. `-1` means virtual atoms. | +| ID | Property | File | Required/Optional | Shape | Description | +| -------- | -------------------------------- | ------------------- | ----------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | +| / | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input | +| type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to atom type contained in all the frames, which is unnecessart to be contained in the periodic table | +| type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes * Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map. `-1` means virtual atoms. | With these edited files, one can put together frames with the same `Natoms`, instead of the same formula (like `H2O`). 
diff --git a/doc/development/cmake.md b/doc/development/cmake.md index f8508d8992..5654d0cea2 100644 --- a/doc/development/cmake.md +++ b/doc/development/cmake.md @@ -6,7 +6,7 @@ After DeePMD-kit C/C++ library is installed, one can find DeePMD-kit from CMake: find_package(DeePMD REQUIRED) ``` -Note that you may need to add ${deepmd_root} to the cached CMake variable `CMAKE_PREFIX_PATH`. +Note that you may need to add \$\{deepmd_root} to the cached CMake variable `CMAKE_PREFIX_PATH`. To link against the C interface library, using diff --git a/doc/development/create-a-model-pt.md b/doc/development/create-a-model-pt.md index 7eb75b7026..b321b26cf7 100644 --- a/doc/development/create-a-model-pt.md +++ b/doc/development/create-a-model-pt.md @@ -1,6 +1,6 @@ # Create a model in other backends {{ pytorch_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} In the following context, we use the PyTorch backend as the example, while it also applies to other backends listed above. @@ -11,9 +11,9 @@ If you'd like to create a new model that isn't covered by the existing DeePMD-ki To incorporate your custom model you'll need to: 1. Register and implement new components (e.g. descriptor) in a Python file. -2. Register new arguments for user inputs. -3. Package new codes into a Python package. -4. Test new models. +1. Register new arguments for user inputs. +1. Package new codes into a Python package. +1. Test new models. ## Design a new component diff --git a/doc/development/create-a-model-tf.md b/doc/development/create-a-model-tf.md index cc7ad1999d..7720ced0ca 100644 --- a/doc/development/create-a-model-tf.md +++ b/doc/development/create-a-model-tf.md @@ -5,9 +5,9 @@ If you'd like to create a new model that isn't covered by the existing DeePMD-ki To incorporate your custom model you'll need to: 1. Register and implement new components (e.g. descriptor) in a Python file. 
You may also want to register new TensorFlow OPs if necessary. -2. Register new arguments for user inputs. -3. Package new codes into a Python package. -4. Test new models. +1. Register new arguments for user inputs. +1. Package new codes into a Python package. +1. Test new models. ## Design a new component diff --git a/doc/development/type-embedding.md b/doc/development/type-embedding.md index 10eeed6ee9..b3de75aaf3 100644 --- a/doc/development/type-embedding.md +++ b/doc/development/type-embedding.md @@ -66,7 +66,7 @@ In trainer.py, it will parse the parameter from the input JSON file. If a `type_ ### model (model/ener.py) -When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of [$\text{ntypes} \times \text{nchanl}$] dimensions). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. +When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of \[$\text{ntypes} \times \text{nchanl}$\] dimensions). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. ### embedding net (descriptor/se\*.py) @@ -84,8 +84,8 @@ build -> _pass_filter -> _filter -> _filter_lower ### fitting net (fit/ener.py) -In `fitting net`, it takes the descriptor vector as input, whose dimension is [natoms, $M_1\times M_2$]. 
Because we need to involve information on the centric atom in this step, we need to generate a matrix named `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch them. After that, call `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input goes through `fitting` net` to get predicted energy. +In `fitting net`, it takes the descriptor vector as input, whose dimension is \[natoms, $M_1\times M_2$\]. Because we need to involve information on the centric atom in this step, we need to generate a matrix named `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch them. After that, call `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input goes through `fitting` net\` to get predicted energy. -:::{note} +:::\{note} You can't apply the compression method while using atom-type embedding. 
::: diff --git a/doc/env.md b/doc/env.md index 1688e0af9c..d2b25c2ddd 100644 --- a/doc/env.md +++ b/doc/env.md @@ -1,12 +1,12 @@ # Runtime environment variables -:::{note} +:::\{note} For build-time environment variables, see [Install from source code](./install/install-from-source.md). ::: ## All interfaces -:::{envvar} DP_INTER_OP_PARALLELISM_THREADS +:::\{envvar} DP_INTER_OP_PARALLELISM_THREADS **Alias**: `TF_INTER_OP_PARALLELISM_THREADS` **Default**: `0` @@ -15,7 +15,7 @@ Control parallelism within TensorFlow (when TensorFlow is built against Eigen) a See [How to control the parallelism of a job](./troubleshooting/howtoset_num_nodes.md) for details. ::: -:::{envvar} DP_INTRA_OP_PARALLELISM_THREADS +:::\{envvar} DP_INTRA_OP_PARALLELISM_THREADS **Alias**: `TF_INTRA_OP_PARALLELISM_THREADS`\*\* **Default**: `0` @@ -35,42 +35,42 @@ See [How to control the parallelism of a job](./troubleshooting/howtoset_num_nod ## Python interface only -:::{envvar} DP_INTERFACE_PREC +:::\{envvar} DP_INTERFACE_PREC **Choices**: `high`, `low`; **Default**: `high` Control high (double) or low (float) precision of training. ::: -:::{envvar} DP_AUTO_PARALLELIZATION +:::\{envvar} DP_AUTO_PARALLELIZATION **Choices**: `0`, `1`; **Default**: `0` {{ tensorflow_icon }} Enable auto parallelization for CPU operators. ::: -:::{envvar} DP_JIT +:::\{envvar} DP_JIT **Choices**: `0`, `1`; **Default**: `0` {{ tensorflow_icon }} Enable JIT. Note that this option may either improve or decrease the performance. Requires TensorFlow to support JIT. ::: -:::{envvar} DP_INFER_BATCH_SIZE +:::\{envvar} DP_INFER_BATCH_SIZE **Default**: `1024` on CPUs and as maximum as possible until out-of-memory on GPUs Inference batch size, calculated by multiplying the number of frames with the number of atoms. ::: -:::{envvar} DP_BACKEND +:::\{envvar} DP_BACKEND **Default**: `tensorflow` Default backend. 
::: -:::{envvar} NUM_WORKERS +:::\{envvar} NUM_WORKERS **Default**: 4 or the number of cores (whichever is smaller) @@ -83,14 +83,14 @@ See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for detai These environment variables also apply to third-party programs using the C++ interface, such as [LAMMPS](./third-party/lammps-command.md). -:::{envvar} DP_PLUGIN_PATH +:::\{envvar} DP_PLUGIN_PATH **Type**: List of paths, split by `:` on Unix and `;` on Windows List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows. ::: -:::{envvar} DP_PROFILER +:::\{envvar} DP_PROFILER {{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend. diff --git a/doc/freeze/compress.md b/doc/freeze/compress.md index d827c71525..e83f14e5fd 100644 --- a/doc/freeze/compress.md +++ b/doc/freeze/compress.md @@ -1,6 +1,6 @@ # Compress a model {{ tensorflow_icon }} {{ pytorch_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: @@ -58,15 +58,13 @@ If the number of neighbors of an atom is smaller than $N_c$, the corresponding p In practice, if the real number of neighbors is significantly smaller than $N_c$, a notable operation is spent on the multiplication of padding zeros. In the compressed DP model, the number of neighbors is precisely indexed at the tabulated inference stage, further saving computational costs.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. 
Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Instructions Once the frozen model is obtained from DeePMD-kit, we can get the neural network structure and its parameters (weights, biases, etc.) from the trained model, and compress it in the following way: -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash dp compress -i graph.pb -o graph-compress.pb @@ -74,7 +72,7 @@ dp compress -i graph.pb -o graph-compress.pb ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt compress -i model.pth -o model-compress.pth @@ -129,3 +127,5 @@ See the documentation of a specific descriptor to see whether it supports model When compressing models in the PyTorch backend, the customized OP library for the Python interface must be installed when [freezing the model](../freeze/freeze.md). The customized OP library for the Python interface can be installed by setting environment variable {envvar}`DP_ENABLE_PYTORCH` to `1` during [installation](../install/install-from-source.md). + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. 
Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/freeze/freeze.md b/doc/freeze/freeze.md index 20f02177c6..70f71c810b 100644 --- a/doc/freeze/freeze.md +++ b/doc/freeze/freeze.md @@ -3,9 +3,9 @@ The trained neural network is extracted from a checkpoint and dumped into a model file. This process is called "freezing" a model. To freeze a model, typically one does -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash $ dp freeze -o model.pb @@ -16,7 +16,7 @@ The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorf ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash $ dp --pt freeze -o model.pth @@ -35,7 +35,7 @@ The output model is called `model_branch1.pth`, which is the specifically frozen ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} ```bash $ dp --pd freeze -o model diff --git a/doc/inference/cxx.md b/doc/inference/cxx.md index ec8a3248a1..2367261c09 100644 --- a/doc/inference/cxx.md +++ b/doc/inference/cxx.md @@ -1,6 +1,6 @@ # C/C++ interface -:::{note} +:::\{note} See [Environment variables](../env.md) for the runtime environment variables. ::: diff --git a/doc/inference/nodejs.md b/doc/inference/nodejs.md index abe9dc36ab..f7059dd874 100644 --- a/doc/inference/nodejs.md +++ b/doc/inference/nodejs.md @@ -1,6 +1,6 @@ # Node.js interface -:::{note} +:::\{note} See [Environment variables](../env.md) for the runtime environment variables. 
::: @@ -32,12 +32,12 @@ energy = dp.compute(energy, v_forces, v_virials, v_coord, v_atype, v_cell); console.log("energy:", energy); console.log( - "forces:", - [...Array(v_forces.size()).keys()].map((i) => v_forces.get(i)), + "forces:", + [...Array(v_forces.size()).keys()].map((i) => v_forces.get(i)), ); console.log( - "virials:", - [...Array(v_virials.size()).keys()].map((i) => v_virials.get(i)), + "virials:", + [...Array(v_virials.size()).keys()].map((i) => v_virials.get(i)), ); ``` diff --git a/doc/inference/python.md b/doc/inference/python.md index 361db7b64f..ff8d5f2cc0 100644 --- a/doc/inference/python.md +++ b/doc/inference/python.md @@ -1,6 +1,6 @@ # Python interface -:::{note} +:::\{note} See [Environment variables](../env.md) for the runtime environment variables. ::: @@ -53,7 +53,7 @@ Otherwise, TensorFlow or PyTorch will never release the memory, and this may lea ## External neighbor list algorithm {{ tensorflow_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: diff --git a/doc/install/build-conda.md b/doc/install/build-conda.md index e54849f75d..cdd70c5cf8 100644 --- a/doc/install/build-conda.md +++ b/doc/install/build-conda.md @@ -1,7 +1,7 @@ # Building conda packages -::::{danger} -:::{deprecated} 3.0.0 +::::\{danger} +:::\{deprecated} 3.0.0 The official channel has been deprecated since 3.0.0. Refer to [conda-forge documentation](https://conda-forge.org/docs/maintainer/adding_pkgs/) for how to contribute and build packages locally. 
::: diff --git a/doc/install/easy-install-dev.md b/doc/install/easy-install-dev.md index 9a0154320e..428abb2745 100644 --- a/doc/install/easy-install-dev.md +++ b/doc/install/easy-install-dev.md @@ -22,7 +22,7 @@ pip install -U --pre deepmd-kit[gpu,cu12,lmp,torch] --extra-index-url https://de ## Download pre-compiled C Library {{ tensorflow_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md index 32650b4a80..e7358ed373 100644 --- a/doc/install/easy-install.md +++ b/doc/install/easy-install.md @@ -4,12 +4,12 @@ There are various easy methods to install DeePMD-kit. Choose one that you prefer After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be available to execute. You can try `dp -h` and `lmp -h` to see the help. `mpirun` is also available considering you may want to train models or run LAMMPS in parallel. -:::{note} +:::\{note} Note: The off-line packages and conda packages require the [GNU C Library](https://www.gnu.org/software/libc/) 2.17 or above. The GPU version requires [compatible NVIDIA driver](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#minor-version-compatibility) to be installed in advance. It is possible to force conda to [override detection](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html#overriding-detected-packages) when installation, but these requirements are still necessary during runtime. You can refer to [DeepModeling conda FAQ](https://docs.deepmodeling.com/faq/conda.html) for more information. ::: -:::{note} +:::\{note} Python 3.10 or above is required for Python interface. 
::: @@ -57,8 +57,8 @@ Read [conda-forge FAQ](https://conda-forge.org/docs/user/tipsandtricks.html#inst ### Official channel (deprecated) -::::{danger} -:::{deprecated} 3.0.0 +::::\{danger} +:::\{deprecated} 3.0.0 The official channel has been deprecated since 3.0.0, due to the challenging work of building dependencies for [multiple backends](../backend.md). Old packages will still be available at https://conda.deepmodeling.com. Maintainers will build packages in the conda-forge organization together with other conda-forge members. @@ -85,13 +85,13 @@ docker pull ghcr.io/deepmodeling/deepmd-kit:2.2.8_cuda12.0_gpu [Create a new environment](https://docs.deepmodeling.com/faq/conda.html#how-to-create-a-new-conda-pip-environment), and then execute the following command: -:::::::{tab-set} +:::::::\{tab-set} -::::::{tab-item} TensorFlow {{ tensorflow_icon }} +::::::\{tab-item} TensorFlow {{ tensorflow_icon }} -:::::{tab-set} +:::::\{tab-set} -::::{tab-item} CUDA 12 +::::\{tab-item} CUDA 12 ```bash pip install deepmd-kit[gpu,cu12] @@ -101,7 +101,7 @@ pip install deepmd-kit[gpu,cu12] :::: -::::{tab-item} CPU +::::\{tab-item} CPU ```bash pip install deepmd-kit[cpu] @@ -113,11 +113,11 @@ pip install deepmd-kit[cpu] :::::: -::::::{tab-item} PyTorch {{ pytorch_icon }} +::::::\{tab-item} PyTorch {{ pytorch_icon }} -:::::{tab-set} +:::::\{tab-set} -::::{tab-item} CUDA 12 +::::\{tab-item} CUDA 12 ```bash pip install deepmd-kit[torch] @@ -125,7 +125,7 @@ pip install deepmd-kit[torch] :::: -::::{tab-item} CPU +::::\{tab-item} CPU ```bash pip install torch --index-url https://download.pytorch.org/whl/cpu @@ -138,11 +138,11 @@ pip install deepmd-kit :::::: -::::::{tab-item} JAX {{ jax_icon }} +::::::\{tab-item} JAX {{ jax_icon }} -:::::{tab-set} +:::::\{tab-set} -::::{tab-item} CUDA 12 +::::\{tab-item} CUDA 12 ```bash pip install deepmd-kit[jax] jax[cuda12] @@ -150,7 +150,7 @@ pip install deepmd-kit[jax] jax[cuda12] :::: -::::{tab-item} CPU +::::\{tab-item} CPU ```bash pip install 
deepmd-kit[jax] @@ -166,11 +166,11 @@ Switch to the TensorFlow {{ tensorflow_icon }} tab for more information. :::::: -::::::{tab-item} Paddle {{ paddle_icon }} +::::::\{tab-item} Paddle {{ paddle_icon }} -:::::{tab-set} +:::::\{tab-set} -::::{tab-item} CUDA 12.6 +::::\{tab-item} CUDA 12.6 ```bash # release version @@ -182,7 +182,7 @@ pip install deepmd-kit :::: -::::{tab-item} CPU +::::\{tab-item} CPU ```bash # release version @@ -202,7 +202,7 @@ pip install deepmd-kit The supported platform includes Linux x86-64 and aarch64 with GNU C Library 2.28 or above, macOS x86-64 and arm64, and Windows x86-64. -:::{Warning} +:::\{Warning} If your platform is not supported, or you want to build against the installed backends, or you want to enable ROCM support, please [build from source](install-from-source.md). ::: diff --git a/doc/install/install-from-c-library.md b/doc/install/install-from-c-library.md index 4568cdb6c9..e6f0de3eb6 100644 --- a/doc/install/install-from-c-library.md +++ b/doc/install/install-from-c-library.md @@ -1,6 +1,6 @@ # Install from pre-compiled C library {{ tensorflow_icon }} {{ jax_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, JAX {{ jax_icon }} ::: @@ -30,7 +30,7 @@ make install Then the i-PI driver `dp_ipi` will be built and installed. One can also follow the manual [Install LAMMPS](./install-lammps.md) and/or [Install GROMACS](./install-gromacs.md). 
-:::{cmake:variable} DEEPMD_C_ROOT +:::\{cmake:variable} DEEPMD_C_ROOT **Type**: `Path` diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index 1e03563c66..2a26aa87ed 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -36,9 +36,9 @@ source $deepmd_venv/bin/activate pip install --upgrade pip ``` -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} The full instruction to install TensorFlow can be found on the official [TensorFlow website](https://www.tensorflow.org/install/pip). TensorFlow 2.7 or later is supported. @@ -64,7 +64,7 @@ One can also [build the TensorFlow Python interface from source](https://www.ten ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} To install PyTorch, run @@ -78,7 +78,7 @@ One can also [use conda](https://docs.deepmodeling.com/faq/conda.html) to instal ::: -:::{tab-item} JAX {{ jax_icon }} +:::\{tab-item} JAX {{ jax_icon }} To install [JAX AI Stack](https://github.com/jax-ml/jax-ai-stack), run @@ -93,7 +93,7 @@ One can also [use conda](https://docs.deepmodeling.com/faq/conda.html) to instal ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} To install Paddle, run @@ -146,15 +146,15 @@ gcc --version By default, DeePMD-kit uses C++ 14, so the compiler needs to support C++ 14 (GCC 5 or later). The backend package may use a higher C++ standard version, and thus require a higher compiler version (for example, GCC 7 for C++ 17). -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} Note that TensorFlow may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by TensorFlow. 
It is recommended to use [the same compiler version as TensorFlow](https://www.tensorflow.org/install/source#tested_build_configurations), which can be printed by `python -c "import tensorflow;print(tensorflow.version.COMPILER_VERSION)"`. ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} You can set the environment variable `export DP_ENABLE_PYTORCH=1` to enable customized C++ OPs in the PyTorch backend. Note that PyTorch may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by PyTorch. @@ -172,21 +172,21 @@ pip install . One may set the following environment variables before executing `pip`: -:::{envvar} DP_VARIANT +:::\{envvar} DP_VARIANT **Choices**: `cpu`, `cuda`, `rocm`; **Default**: `cpu` Build CPU variant or GPU variant with CUDA or ROCM support. ::: -:::{envvar} CUDAToolkit_ROOT +:::\{envvar} CUDAToolkit_ROOT **Type**: Path; **Default**: Detected automatically The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. ::: -:::{envvar} ROCM_ROOT +:::\{envvar} ROCM_ROOT **Type**: Path; **Default**: Detected automatically @@ -194,49 +194,49 @@ The path to the ROCM toolkit directory. If `ROCM_ROOT` is not set, it will look ::: -:::{envvar} DP_ENABLE_TENSORFLOW +:::\{envvar} DP_ENABLE_TENSORFLOW **Choices**: `0`, `1`; **Default**: `1` {{ tensorflow_icon }} Enable the TensorFlow backend. ::: -:::{envvar} DP_ENABLE_PYTORCH +:::\{envvar} DP_ENABLE_PYTORCH **Choices**: `0`, `1`; **Default**: `0` {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. ::: -:::{envvar} TENSORFLOW_ROOT +:::\{envvar} TENSORFLOW_ROOT **Type**: Path; **Default**: Detected automatically {{ tensorflow_icon }} The path to TensorFlow Python library. 
If not given, by default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against. ::: -:::{envvar} PYTORCH_ROOT +:::\{envvar} PYTORCH_ROOT **Type**: Path; **Default**: Detected automatically {{ pytorch_icon }} The path to PyTorch Python library. If not given, by default, the installer only finds PyTorch under the user site-package directory (`site.getusersitepackages()`) or the system site-package directory (`sysconfig.get_path("purelib")`) due to the limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest PyTorch (or the environment variable `PYTORCH_VERSION` if given) from PyPI will be built against. ::: -:::{envvar} DP_ENABLE_NATIVE_OPTIMIZATION +:::\{envvar} DP_ENABLE_NATIVE_OPTIMIZATION **Choices**: `0`, `1`; **Default**: `0` Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. ::: -:::{envvar} CMAKE_ARGS +:::\{envvar} CMAKE_ARGS **Type**: string Additional CMake arguments. ::: -:::{envvar} FLAGS +:::\{envvar} FLAGS ``=`CXX`, `CUDA` or `HIP` @@ -267,7 +267,7 @@ It will print the help information like ### Install horovod and mpi4py {{ tensorflow_icon }} -:::{warning} +:::\{warning} Horovod has not released a new version for a long time. As of December 2025, the latest Horovod release does not support the latest TensorFlow versions. You can check the patches required to support the latest TensorFlow at [conda-forge/horovod-feedstock](https://github.com/conda-forge/horovod-feedstock/blob/main/recipe/meta.yaml). 
@@ -295,20 +295,20 @@ $ horovodrun --check-build Horovod v0.22.1: Available Frameworks: - [X] TensorFlow - [X] PyTorch - [ ] MXNet +[X] TensorFlow +[X] PyTorch +[ ] MXNet Available Controllers: - [X] MPI - [X] Gloo +[X] MPI +[X] Gloo Available Tensor Operations: - [X] NCCL - [ ] DDL - [ ] CCL - [X] MPI - [X] Gloo +[X] NCCL +[ ] DDL +[ ] CCL +[X] MPI +[X] Gloo ``` Since version 2.0.1, Horovod and mpi4py with MPICH support are shipped with the installer. @@ -321,9 +321,9 @@ If one does not need to use DeePMD-kit with LAMMPS or i-PI, then the python inte ### Install Backends' C++ interface (optional) -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} The C++ interfaces of both TensorFlow and JAX backends are based on the TensorFlow C++ library. @@ -335,14 +335,14 @@ First, the C++ interface of TensorFlow should be installed. It is noted that the ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} If you have installed PyTorch using pip, you can use libtorch inside the PyTorch Python package. You can also download libtorch prebuilt library from the [PyTorch website](https://pytorch.org/get-started/locally/). ::: -:::{tab-item} JAX {{ jax_icon }} +:::\{tab-item} JAX {{ jax_icon }} The JAX backend only depends on the TensorFlow C API, which is included in both TensorFlow C++ library and [TensorFlow C library](https://www.tensorflow.org/install/lang_c). If you want to use the TensorFlow C++ library, just enable the TensorFlow backend (which depends on the TensorFlow C++ library) and nothing else needs to do. 
@@ -351,7 +351,7 @@ download the TensorFlow C library from [this page](https://www.tensorflow.org/in ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} If you want to use C++ interface of Paddle, you need to compile the Paddle inference library(C++ interface) manually from the [linux-compile-by-make](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/compile/linux-compile-by-make.html), then use the `.so` and `.a` files in `Paddle/build/paddle_inference_install_dir/`. @@ -385,9 +385,9 @@ You must enable at least one backend. If you enable two or more backends, these backend libraries must be built in a compatible way, e.g. using the same `_GLIBCXX_USE_CXX11_ABI` flag. We recommend using [conda packages](https://docs.deepmodeling.com/faq/conda.html) from [conda-forge](https://conda-forge.org), which are usually compatible to each other. -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} I assume you have activated the TensorFlow Python environment and want to install DeePMD-kit into path `$deepmd_root`, then execute CMake @@ -399,7 +399,7 @@ If you specify `-DUSE_TF_PYTHON_LIBS=FALSE`, you need to give the location where ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} I assume you have installed the PyTorch (either Python or C++ interface) to `$torch_root`, then execute CMake @@ -415,7 +415,7 @@ cmake -DENABLE_PYTORCH=TRUE -DUSE_PT_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=$de ::: -:::{tab-item} JAX {{ jax_icon }} +:::\{tab-item} JAX {{ jax_icon }} If you want to use the TensorFlow C++ library, just enable the TensorFlow backend and nothing else needs to do. 
If you want to use the TensorFlow C library and disable the TensorFlow backend, set {cmake:variable}`ENABLE_JAX` to `ON` and `CMAKE_PREFIX_PATH` to the root directory of the [TensorFlow C library](https://www.tensorflow.org/install/lang_c). @@ -426,7 +426,7 @@ cmake -DENABLE_JAX=ON -D CMAKE_PREFIX_PATH=${tensorflow_c_root} .. ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} I assume you have get the Paddle inference library(C++ interface) to `$PADDLE_INFERENCE_DIR`, then execute CMake @@ -440,7 +440,7 @@ cmake -DENABLE_PADDLE=ON -DPADDLE_INFERENCE_DIR=$PADDLE_INFERENCE_DIR -DCMAKE_IN One may add the following CMake variables to `cmake` using the [`-D =` option](https://cmake.org/cmake/help/latest/manual/cmake.1.html#cmdoption-cmake-D): -:::{cmake:variable} ENABLE_TENSORFLOW +:::\{cmake:variable} ENABLE_TENSORFLOW **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -449,7 +449,7 @@ Setting this option to `ON` will also set {cmake:variable}`ENABLE_JAX` to `ON`. ::: -:::{cmake:variable} ENABLE_PYTORCH +:::\{cmake:variable} ENABLE_PYTORCH **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -457,7 +457,7 @@ Setting this option to `ON` will also set {cmake:variable}`ENABLE_JAX` to `ON`. 
::: -:::{cmake:variable} ENABLE_JAX +:::\{cmake:variable} ENABLE_JAX **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -467,7 +467,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::{cmake:variable} ENABLE_PADDLE +:::\{cmake:variable} ENABLE_PADDLE **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -475,7 +475,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::{cmake:variable} TENSORFLOW_ROOT +:::\{cmake:variable} TENSORFLOW_ROOT **Type**: `PATH` @@ -483,7 +483,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::{cmake:variable} PADDLE_INFERENCE_DIR +:::\{cmake:variable} PADDLE_INFERENCE_DIR **Type**: `PATH` @@ -491,7 +491,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::{cmake:variable} CMAKE_INSTALL_PREFIX +:::\{cmake:variable} CMAKE_INSTALL_PREFIX **Type**: `PATH` @@ -500,7 +500,7 @@ See also [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAK ::: -:::{cmake:variable} USE_CUDA_TOOLKIT +:::\{cmake:variable} USE_CUDA_TOOLKIT **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -508,7 +508,7 @@ If `TRUE`, Build GPU support with CUDA toolkit. ::: -:::{cmake:variable} CUDAToolkit_ROOT +:::\{cmake:variable} CUDAToolkit_ROOT **Type**: `PATH`, **Default**: [Search automatically](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) @@ -517,7 +517,7 @@ See also [CMake documentation](https://cmake.org/cmake/help/latest/module/FindCU ::: -:::{cmake:variable} USE_ROCM_TOOLKIT +:::\{cmake:variable} USE_ROCM_TOOLKIT **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -525,7 +525,7 @@ If `TRUE`, Build GPU support with ROCM toolkit. 
::: -:::{cmake:variable} CMAKE_HIP_COMPILER_ROCM_ROOT +:::\{cmake:variable} CMAKE_HIP_COMPILER_ROCM_ROOT **Type**: `PATH`, **Default**: [Search automatically](https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html) @@ -534,7 +534,7 @@ See also [ROCm documentation](https://rocm.docs.amd.com/en/latest/conceptual/cma ::: -:::{cmake:variable} LAMMPS_SOURCE_ROOT +:::\{cmake:variable} LAMMPS_SOURCE_ROOT **Type**: `PATH` @@ -545,7 +545,7 @@ If not assigned, the plugin mode will not be enabled. ::: -:::{cmake:variable} USE_TF_PYTHON_LIBS +:::\{cmake:variable} USE_TF_PYTHON_LIBS **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -554,7 +554,7 @@ There's no need for building TensorFlow's C++ interface. ::: -:::{cmake:variable} USE_PT_PYTHON_LIBS +:::\{cmake:variable} USE_PT_PYTHON_LIBS **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -563,7 +563,7 @@ There's no need for downloading PyTorch's C++ libraries. ::: -:::{cmake:variable} ENABLE_NATIVE_OPTIMIZATION +:::\{cmake:variable} ENABLE_NATIVE_OPTIMIZATION **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -573,7 +573,8 @@ Do not enable it if generated code will run on different CPUs. ::: -:::{cmake:variable} CMAKE__FLAGS + +:::\{cmake:variable} CMAKE\_\_FLAGS (``=`CXX`, `CUDA` or `HIP`) @@ -584,7 +585,7 @@ See also [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAK ::: ---- +______________________________________________________________________ If the CMake has been executed successfully, then run the following make commands to build the package: diff --git a/doc/install/install-gromacs.md b/doc/install/install-gromacs.md index 90ed73841c..02b0b4b194 100644 --- a/doc/install/install-gromacs.md +++ b/doc/install/install-gromacs.md @@ -15,6 +15,7 @@ where `deepmd_kit_root` is the directory where the latest version of DeePMD-kit + ## Compile GROMACS with deepmd-kit @@ -30,10 +31,10 @@ mkdir build cd build cmake3 .. 
-DCMAKE_CXX_STANDARD=14 \ # not required, but c++14 seems to be more compatible with higher version of tensorflow - -DGMX_MPI=ON \ - -DGMX_GPU=CUDA \ # Gromacs on ROCm has not been fully developed yet - -DCUDAToolkit_ROOT=/path/to/cuda \ - -DCMAKE_INSTALL_PREFIX=/path/to/gromacs-2020.2-deepmd +-DGMX_MPI=ON \ + -DGMX_GPU=CUDA \ # Gromacs on ROCm has not been fully developed yet +-DCUDAToolkit_ROOT=/path/to/cuda \ + -DCMAKE_INSTALL_PREFIX=/path/to/gromacs-2020.2-deepmd make -j make install ``` diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md index cb65188002..30f3aad7de 100644 --- a/doc/install/install-lammps.md +++ b/doc/install/install-lammps.md @@ -119,7 +119,7 @@ If everything works fine, you will end up with an executable `${deepmd_root}/bin ${deepmd_root}/bin/lmp -h ``` -:::{note} +:::\{note} If `${tensorflow_root}`, `${deepmd_root}`, or the path to TensorFlow Python package if applicable is different from the prefix of LAMMPS, you need to append the library path to [`RUNPATH`](https://man7.org/linux/man-pages/man8/ld.so.8.html) of `liblammps.so`. For example, use patchelf >= 0.13 ```sh diff --git a/doc/install/install-nodejs.md b/doc/install/install-nodejs.md index 7137723c31..c6ec70d5cb 100644 --- a/doc/install/install-nodejs.md +++ b/doc/install/install-nodejs.md @@ -18,8 +18,8 @@ When using CMake to [build DeePMD-kit from source](./install-from-source.md), se ```sh cmake -D BUILD_NODEJS_IF=ON \ - -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ - .. # and other arguments + -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ + .. 
# and other arguments make make install ``` diff --git a/doc/model/change-bias.md b/doc/model/change-bias.md index 2a9b098606..2f39ff0823 100644 --- a/doc/model/change-bias.md +++ b/doc/model/change-bias.md @@ -1,6 +1,6 @@ # Change the model output bias for trained model {{ tensorflow_icon }} {{ pytorch_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: @@ -12,9 +12,9 @@ or manually setting the output bias. The `dp change-bias` command supports the following methods for adjusting the bias: -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow Backend {{ tensorflow_icon }} +:::\{tab-item} TensorFlow Backend {{ tensorflow_icon }} **Changing bias using provided systems for trained checkpoint:** @@ -30,7 +30,7 @@ dp --tf change-bias model.ckpt -b -92.523 -187.66 -o model_updated.pb ::: -:::{tab-item} PyTorch Backend {{ pytorch_icon }} +:::\{tab-item} PyTorch Backend {{ pytorch_icon }} **Changing bias using provided systems for trained `.pt`/`.pth` models:** diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md index 466a4de4f2..6c7632497f 100644 --- a/doc/model/dpa2.md +++ b/doc/model/dpa2.md @@ -1,6 +1,6 @@ # Descriptor DPA-2 {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, Paddle {{ paddle_icon }}, DP {{ dpmodel_icon }} ::: diff --git a/doc/model/dpa3.md b/doc/model/dpa3.md index 0ff46c438f..81b1d3ad99 100644 --- a/doc/model/dpa3.md +++ b/doc/model/dpa3.md @@ -1,6 +1,6 @@ # Descriptor DPA3 {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -42,9 +42,9 @@ Note that we set `float32` in all DPA3 models, while `float64` in other models b ## Requirements of installation from source code {{ pytorch_icon }} {{ paddle_icon }} -::::{tab-set} 
+::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} To run the DPA3 model on LAMMPS via source code installation (users can skip this step if using [easy installation](../install/easy-install.md)), @@ -59,7 +59,7 @@ otherwise the communication between GPU cards falls back to the slower CPU imple ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} The customized OP library for the Python interface can be installed by diff --git a/doc/model/dplr.md b/doc/model/dplr.md index 61327bb55e..60a6a49f19 100644 --- a/doc/model/dplr.md +++ b/doc/model/dplr.md @@ -1,6 +1,6 @@ # Deep potential long-range (DPLR) {{ tensorflow_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: @@ -36,8 +36,6 @@ $L$ is the cutoff in Fourier space and $S(m)$, the structure factor, is given by where $\imath = \sqrt{-1}$ denotes the imaginary unit, $\boldsymbol r_i$ indicates ion coordinates, $q_i$ is the charge of the ion $i$, and $W_n$ is the $n$-th Wannier centroid (WC) which can be obtained from a separated [dipole model](./train-fitting-tensor.md). It can be proved that the error in the electrostatic energy introduced by the Gaussian approximations is dominated by a summation of dipole-quadrupole interactions that decay as $r^{-4}$, where $r$ is the distance between the dipole and quadrupole.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. 
York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Train a deep Wannier model for Wannier centroids We use the deep Wannier model (DW) to represent the relative position of the Wannier centroid (WC) with the atom with which it is associated. One may consult the introduction of the [dipole model](train-fitting-tensor.md) for a detailed introduction. An example input `wc.json` and a small dataset `data` for tutorial purposes can be found in @@ -192,9 +190,11 @@ fix ID group-ID style_name keyword value ... ``` + + - ID, group-ID are documented in :doc:`fix ` command -- style\_name = _dplr_ +- style_name = _dplr_ - three or more keyword/value pairs may be appended ``` @@ -266,4 +266,6 @@ The MD simulation lasts for only 20 steps. If one runs a longer simulation, it w Another restriction that should be noted is that the energies printed at the zero steps are not correct. This is because at the zero steps the position of the WC has not been updated with the DW model. The energies printed in later steps are correct. +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + [1]: https://arxiv.org/abs/2112.13327 diff --git a/doc/model/dprc.md b/doc/model/dprc.md index 9f3eee244d..d6e25faae2 100644 --- a/doc/model/dprc.md +++ b/doc/model/dprc.md @@ -1,6 +1,6 @@ # Deep Potential - Range Correction (DPRc) {{ tensorflow_icon }} {{ pytorch_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} ::: @@ -44,8 +44,6 @@ The fitting network is revised to remove energy bias from MM atoms: where $\mathbf{0}$ is a zero matrix. It is worth mentioning that usage of DPRc is not limited to its initial design for QM/MM correction and can be expanded to any similar interaction.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - See the [JCTC paper](https://doi.org/10.1021/acs.jctc.1c00201) for details. ## Training data @@ -68,9 +66,9 @@ In a DPRc model, QM atoms and MM atoms have different atom types. 
Assuming we ha As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\text{QM/MM}$ within the cutoff, so we use a hybrid descriptor to describe them separately: -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```json "descriptor" :{ @@ -103,7 +101,7 @@ As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\te ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```json "descriptor" :{ @@ -181,7 +179,7 @@ The DPRc model has the best practices with the [AMBER](../third-party/out-of-dee ## Pairwise DPRc -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: @@ -200,9 +198,18 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used { "model": { "type": "pairwise_dprc", - "type_map": ["C", "P", "O", "H", "OW", "HW"], + "type_map": [ + "C", + "P", + "O", + "H", + "OW", + "HW" + ], "type_embedding": { - "neuron": [8], + "neuron": [ + 8 + ], "precision": "float32" }, "qm_model": { @@ -212,7 +219,11 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used "rcut_smth": 0.5, "rcut": 9.0, "attn_layer": 0, - "neuron": [25, 50, 100], + "neuron": [ + 25, + 50, + 100 + ], "resnet_dt": false, "axis_neuron": 12, "precision": "float32", @@ -220,10 +231,21 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used }, "fitting_net": { "type": "ener", - "neuron": [240, 240, 240], + "neuron": [ + 240, + 240, + 240 + ], "resnet_dt": true, "precision": "float32", - "atom_ener": [null, null, null, null, 0.0, 0.0], + "atom_ener": [ + null, + null, + null, + null, + 0.0, + 0.0 + ], "seed": 1 } }, @@ -234,35 +256,89 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used "rcut_smth": 0.5, "rcut": 6.0, "attn_layer": 0, - "neuron": [25, 50, 100], + "neuron": [ + 25, + 50, + 100 + ], "resnet_dt": false, 
"axis_neuron": 12, "set_davg_zero": true, "exclude_types": [ - [0, 0], - [0, 1], - [0, 2], - [0, 3], - [1, 1], - [1, 2], - [1, 3], - [2, 2], - [2, 3], - [3, 3], - [4, 4], - [4, 5], - [5, 5] + [ + 0, + 0 + ], + [ + 0, + 1 + ], + [ + 0, + 2 + ], + [ + 0, + 3 + ], + [ + 1, + 1 + ], + [ + 1, + 2 + ], + [ + 1, + 3 + ], + [ + 2, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 3 + ], + [ + 4, + 4 + ], + [ + 4, + 5 + ], + [ + 5, + 5 + ] ], "precision": "float32", "seed": 1 }, "fitting_net": { "type": "ener", - "neuron": [240, 240, 240], + "neuron": [ + 240, + 240, + 240 + ], "resnet_dt": true, "seed": 1, "precision": "float32", - "atom_ener": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + "atom_ener": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] } } } @@ -273,3 +349,5 @@ The pairwise model needs information for MM residues. The model uses [`aparam`](../data/system.md) with the shape of `nframes x natoms` to get the residue index. The QM residue should always use `0` as the index. For example, `0 0 0 1 1 1 2 2 2` means these 9 atoms are grouped into one QM residue and two MM residues. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/linear.md b/doc/model/linear.md index 47fdd1750b..a6693e668e 100644 --- a/doc/model/linear.md +++ b/doc/model/linear.md @@ -1,6 +1,6 @@ ## Linear model {{ tensorflow_icon }} {{ pytorch_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: diff --git a/doc/model/overall.md b/doc/model/overall.md index cc72aa3887..85d4eaa42b 100644 --- a/doc/model/overall.md +++ b/doc/model/overall.md @@ -24,8 +24,6 @@ From the above equation, one may compute the global property of the system by where $N$ is the number of atoms in a frame. For example, if $y_i$ represents the potential energy contribution of atom $i$, then $y$ gives the total potential energy of the frame.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Instructions A model has two parts, a descriptor that maps atomic configuration to a set of symmetry invariant features, and a fitting net that takes descriptor as input and predicts the atomic contribution to the target physical property. 
It's defined in the {ref}`model ` section of the `input.json`, for example, @@ -49,19 +47,21 @@ The {ref}`type_map ` is optional, which provides the element nam DeePMD-kit implements the following descriptors: 1. [`se_e2_a`](train-se-e2-a.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes the distance between atoms as input. -2. [`se_e2_r`](train-se-e2-r.md): DeepPot-SE constructed from radial information of atomic configurations. The embedding takes the distance between atoms as input. -3. [`se_e3`](train-se-e3.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes angles between two neighboring atoms as input. -4. [`se_a_mask`](train-se-a-mask.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The input frames in one system can have a varied number of atoms. Input particles are padded with virtual particles of the same length. -5. `loc_frame`: Defines a local frame at each atom and compute the descriptor as local coordinates under this frame. -6. [`hybrid`](train-hybrid.md): Concate a list of descriptors to form a new descriptor. +1. [`se_e2_r`](train-se-e2-r.md): DeepPot-SE constructed from radial information of atomic configurations. The embedding takes the distance between atoms as input. +1. [`se_e3`](train-se-e3.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes angles between two neighboring atoms as input. +1. [`se_a_mask`](train-se-a-mask.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The input frames in one system can have a varied number of atoms. Input particles are padded with virtual particles of the same length. +1. `loc_frame`: Defines a local frame at each atom and compute the descriptor as local coordinates under this frame. +1. 
[`hybrid`](train-hybrid.md): Concate a list of descriptors to form a new descriptor. The fitting of the following physical properties is supported 1. [`ener`](train-energy.md): Fit the energy of the system. The force (derivative with atom positions), the virial (derivative with the box tensor) and the hessian (second-order derivative with atom positions) can also be trained. -:::{warning} +:::\{warning} Due to the restrictions of torch jit script, the models trained with hessian are not jitable so that the frozen models cannot output hessians. ::: 2. [`dipole`](train-fitting-tensor.md): The dipole moment. -3. [`polar`](train-fitting-tensor.md): The polarizability. +1. [`polar`](train-fitting-tensor.md): The polarizability. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/pairtab.md b/doc/model/pairtab.md index 57fe23f5e9..3c062efedd 100644 --- a/doc/model/pairtab.md +++ b/doc/model/pairtab.md @@ -1,6 +1,6 @@ # Interpolation or combination with a pairwise potential {{ tensorflow_icon }} {{ pytorch_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} ::: @@ -45,8 +45,6 @@ In the range $[r_a, r_b]$, the DP model smoothly switched off and the pairwise p where the scale $\alpha_s$ is a tunable scale of the interatomic distance $r_{ij}$. The pairwise potential $u^{\textrm{pair}}(r)$ is defined by a user-defined table that provides the value of $u^{\textrm{pair}}$ on an evenly discretized grid from 0 to the cutoff distance.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - DeePMD-kit also supports combination with a pairwise potential {{ tensorflow_icon }}: ```math @@ -62,7 +60,7 @@ in the order of Type_0-Type_0, Type_0-Type_1, ..., Type_0-Type_N, Type_1-Type_1, The interaction should be smooth at the cut-off distance. 
-:::{note} +:::\{note} In instances where the interaction at the cut-off distance is not delineated within the table file, extrapolation will be conducted utilizing the available interaction data. This extrapolative procedure guarantees a smooth transition from the table-provided value to `0` whenever feasible. ::: @@ -104,9 +102,11 @@ To combine with a pairwise potential, use the [linear model](./linear.md): The {ref}`rcut ` can be larger than that of the DP model. -:::{note} +:::\{note} The above example shows a example of combining D3 dispersion. However, it is more efficient to train a model using plain DFT calculations without the dispersion correction, and add the dispersion correction during the simulation via the LAMMPS [`pair_style dispersion/d3` command](https://docs.lammps.org/pair_dispersion_d3.html#pair-style-dispersion-d3-command). Training against data with dispersion directly is discouraged. See the [D3 dispersion section](../third-party/lammps-command.md#d3-dispersion) for details. ::: + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/sel.md b/doc/model/sel.md index 5b85318dd9..942cdccc22 100644 --- a/doc/model/sel.md +++ b/doc/model/sel.md @@ -6,9 +6,9 @@ All descriptors require to set `sel`, which means the expected maximum number of To determine a proper `sel`, one can calculate the neighbor stat of the training data before training: -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```sh dp --tf neighbor-stat -s data -r 6.0 -t O H @@ -16,7 +16,7 @@ dp --tf neighbor-stat -s data -r 6.0 -t O H ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```sh dp --pt neighbor-stat -s data -r 6.0 -t O H @@ -24,7 +24,7 @@ dp --pt neighbor-stat -s data -r 6.0 -t O H ::: -:::{tab-item} JAX {{ jax_icon }} +:::\{tab-item} JAX {{ jax_icon }} ```sh dp --jax neighbor-stat -s data -r 6.0 -t O H @@ -32,7 +32,7 @@ dp --jax neighbor-stat -s data -r 6.0 -t O H ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} ```sh dp --pd neighbor-stat -s data -r 6.0 -t O H diff --git a/doc/model/show-model-info.md b/doc/model/show-model-info.md index 67d82610de..5bffacb871 100644 --- a/doc/model/show-model-info.md +++ b/doc/model/show-model-info.md @@ -33,28 +33,35 @@ dp show frozen_model.pth type-map descriptor fitting-net size Depending on the provided attributes and the model type, the output includes: - **Model Type** + - Logs whether the loaded model is a _singletask_ or _multitask_ model. - **model-branch** + - _Only available for multitask models._ - Lists all available model branches and the special `"RANDOM"` branch, which refers to a randomly initialized fitting net. - **type-map** + - For multitask models: Shows the type map for each branch. - For singletask models: Shows the model's type map. - **descriptor** + - For multitask models: Displays the descriptor parameter for each branch. - For singletask models: Displays the descriptor parameter. 
- **fitting-net** + - For multitask models: Shows the fitting network parameters for each branch. - For singletask models: Shows the fitting network parameters. - **size** + - Prints the number of parameters for each component (`descriptor`, `fitting-net`, etc.), as well as the total parameter count. - **observed-type** + - Displays the count and list of observed element types of the model during data statistics. - For multitask models, it shows the observed types for each branch. - Note: This info shows the types observed during training data statistics, which may differ from the type map. diff --git a/doc/model/train-energy-hessian.md b/doc/model/train-energy-hessian.md index d77e7f3e88..61ebd0530c 100644 --- a/doc/model/train-energy-hessian.md +++ b/doc/model/train-energy-hessian.md @@ -1,6 +1,6 @@ # Fit energy Hessian {{ pytorch_icon }} -:::{note} +:::\{note} **Supported backends**: PyTorch {{ pytorch_icon }} ::: @@ -43,9 +43,9 @@ set.*/hessian.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of elements contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | Raw file | Unit | Shape | Description | -| ------- | ---------------- | ----------- | ------ | --------------------------------------- | ------------------------------------------------------- | -| hessian | Hessian matrices | hessian.npy | eV/Å^2 | Nframes \* (Natoms \* 3 \* Natoms \* 3) | Second-order derivatives of energies w.r.t coordinates. | +| ID | Property | Raw file | Unit | Shape | Description | +| ------- | ---------------- | ----------- | ------ | ----------------------------------- | ------------------------------------------------------- | +| hessian | Hessian matrices | hessian.npy | eV/Å^2 | Nframes * (Natoms * 3 * Natoms * 3) | Second-order derivatives of energies w.r.t coordinates. 
| Note that the `hessian.npy` should contain the **full** Hessian matrices with shape of `(3Natoms * 3Natoms)` for each frame, rather than the upper or lower triangular matrices with shape of `(3Natoms * (3Natoms + 1) / 2)` for each frame. @@ -53,9 +53,9 @@ Note that the `hessian.npy` should contain the **full** Hessian matrices with sh There are two approaches to training a Hessian model. The first method involves training the model from scratch using the same command as in the `ener` mode within the PyTorch backend: -::::{tab-set} +::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -67,9 +67,9 @@ dp --pt train input.json The second approach is to train a Hessian model from a pretrained energy model, following the same command as the `finetune` strategy within the PyTorch backend: -::::{tab-set} +::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json --finetune pretrained_energy.pt @@ -93,15 +93,15 @@ The detailed loss can be found in `lcurve.out`: ## Test the Model -:::{warning} +:::\{warning} A model trained with Hessian cannot be frozen. If freezing is enforced, the model will be treated as a standard energy model, and the frozen one will no longer be able to output Hessian predictions. 
::: If one do freeze and test a Hessian model using the commands: -::::{tab-set} +::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash @@ -123,9 +123,9 @@ ${output_prefix}.v.out ${output_prefix}.v_peratom.out If one intends to use the trained model for Hessian predictions, then he/she is supposed to test the model directly without performing a freezing operation: -::::{tab-set} +::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash diff --git a/doc/model/train-energy-spin.md b/doc/model/train-energy-spin.md index 52a470f2a6..eee7d49991 100644 --- a/doc/model/train-energy-spin.md +++ b/doc/model/train-energy-spin.md @@ -1,13 +1,13 @@ # Fit spin energy {{ tensorflow_icon }} {{ pytorch_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} ::: To train a model that takes additional spin information as input, you only need to modify the following sections to define the spin-specific settings, keeping other sections the same as the normal energy model's input script. -:::{warning} +:::\{warning} Note that when adding spin into the model, there will be some implicit modifications automatically done by the program: - In the TensorFlow backend, the `se_e2_a` descriptor will treat those atom types with spin as new (virtual) types, @@ -23,7 +23,7 @@ Note that when adding spin into the model, there will be some implicit modificat The spin settings are given by the {ref}`spin ` section, which sets the magnetism for each type of atoms as described in the following sections. -:::{note} +:::\{note} Note that the construction of spin settings is different between TensorFlow and PyTorch/DP. ::: @@ -68,11 +68,11 @@ See `se_e2_a` examples in `$deepmd_source_dir/examples/spin/se_e2_a/input_torch. 
between a virtual atom representing spin and its corresponding real atom for each atom type with spin. This factor is defined as the virtual distance divided by the magnitude of atomic spin for each atom type with spin. - The virtual coordinate is defined as the real coordinate plus spin \* virtual_scale. + The virtual coordinate is defined as the real coordinate plus spin * virtual_scale. List of float values with shape of `ntypes` or `ntypes_spin` or one single float value for all types, only used when {ref}`use_spin ` is True for each atom type. -:::{note} +:::\{note} It should be noted that the spin models in PyTorch/DP are capable of addressing scenarios where the spin approaches zero (indicating the virtual atom is in close proximity to the real atom) by adjusting the non-zero {ref}`env_protection ` parameter within the descriptor. @@ -88,7 +88,7 @@ $$L = p_e L_e + p_{fr} L_{fr} + p_{fm} L_{fm} + p_v L_v$$ where $L_e$, $L_{fr}$, $L_{fm}$ and $L_v$ denote the loss in energy, atomic force, magnatic force and virial, respectively. $p_e$, $p_{fr}$, $p_{fm}$ and $p_v$ give the prefactors of the energy, atomic force, magnatic force and virial losses. -:::{note} +:::\{note} Please note that the virial and atomic virial are not currently supported in spin models. ::: @@ -124,7 +124,7 @@ If one does not want to train with virial, then he/she may set the virial prefac ## Data format -:::{note} +:::\{note} Note that the spin data format is different between TensorFlow and PyTorch/DP. 
::: @@ -148,13 +148,13 @@ where $\bm{R}_{i^p}$, $\bm{R}_i$, and $\bm{S}_i$ denote the virtual atomic coord We list the details about spin system data format in TensorFlow backend: -| ID | Property | Raw file | Unit | Shape | Description | -| ------ | -------------------------- | ---------- | ---- | --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| type | Atom type indexes | type.raw | \ | Natoms + Nspins | Integers that start with 0. The first `Natoms` entries represent real atom types, followed by `Nspins` entries representing virtual atom types. | -| coord | Coordinates | coord.raw | Å | Nframes \* (Natoms + Nspins) \* 3 | The first `3 \* Natoms` columns represent the coordinates of real atoms, followed by `3 \* Nspins` columns representing the coordinates of virtual atoms. | -| box | Boxes | box.raw | Å | Nframes \* 3 \* 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| energy | Frame energies | energy.raw | eV | Nframes | -| force | Atomic and magnetic forces | force.raw | eV/Å | Nframes \* (Natoms + Nspins) \* 3 | The first `3 \* Natoms` columns represent atomic forces, followed by `3 \* Nspins` columns representing magnetic forces. | +| ID | Property | Raw file | Unit | Shape | Description | +| ------ | -------------------------- | ---------- | ---- | ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| type | Atom type indexes | type.raw | \\ | Natoms + Nspins | Integers that start with 0. The first `Natoms` entries represent real atom types, followed by `Nspins` entries representing virtual atom types. 
| +| coord | Coordinates | coord.raw | Å | Nframes * (Natoms + Nspins) * 3 | The first `3 \* Natoms` columns represent the coordinates of real atoms, followed by `3 \* Nspins` columns representing the coordinates of virtual atoms. | +| box | Boxes | box.raw | Å | Nframes * 3 * 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| energy | Frame energies | energy.raw | eV | Nframes | | +| force | Atomic and magnetic forces | force.raw | eV/Å | Nframes * (Natoms + Nspins) * 3 | The first `3 \* Natoms` columns represent atomic forces, followed by `3 \* Nspins` columns representing magnetic forces. | ### Spin data format in PyTorch/DP @@ -172,7 +172,7 @@ set.*/force_mag.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of element contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | Raw file | Unit | Shape | Description | -| -------------- | ---------------- | ------------- | ------- | ---------------------- | ------------------------------------------------------------------- | -| spin | Magnetic moments | spin.raw | $\mu_B$ | Nframes \* Natoms \* 3 | Spin for magnetic atoms and zero for non-magnetic atoms. | -| magnetic force | Magnetic forces | force_mag.raw | eV/Å | Nframes \* Natoms \* 3 | Magnetic forces for magnetic atoms and zero for non-magnetic atoms. | +| ID | Property | Raw file | Unit | Shape | Description | +| -------------- | ---------------- | ------------- | ------- | -------------------- | ------------------------------------------------------------------- | +| spin | Magnetic moments | spin.raw | $\mu_B$ | Nframes * Natoms * 3 | Spin for magnetic atoms and zero for non-magnetic atoms. | +| magnetic force | Magnetic forces | force_mag.raw | eV/Å | Nframes * Natoms * 3 | Magnetic forces for magnetic atoms and zero for non-magnetic atoms. 
| diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md index 128779ee16..4d0a3d5ce8 100644 --- a/doc/model/train-energy.md +++ b/doc/model/train-energy.md @@ -1,6 +1,6 @@ # Fit energy {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, Paddle {{ paddle_icon }}, DP {{ dpmodel_icon }} ::: @@ -75,8 +75,6 @@ where $\nu$ is a small constant used to protect an atom where the magnitude of $\boldsymbol{F}^\ast_k$ is small from having a large $L^r_F$. Benefiting from the relative force loss, small forces can be fitted more accurately.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## The fitting network The construction of the fitting net is given by section {ref}`fitting_net ` @@ -125,3 +123,5 @@ The {ref}`loss ` section in the `input.json` is The options {ref}`start_pref_e `, {ref}`limit_pref_e `, {ref}`start_pref_f `, {ref}`limit_pref_f `, {ref}`start_pref_v ` and {ref}`limit_pref_v ` determine the start and limit prefactors of energy, force and virial, respectively. 
If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-fitting-dos.md b/doc/model/train-fitting-dos.md index fb4a3677e5..600a619b8a 100644 --- a/doc/model/train-fitting-dos.md +++ b/doc/model/train-fitting-dos.md @@ -1,6 +1,6 @@ # Fit electronic density of states (DOS) {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -82,9 +82,9 @@ To prepare the data, we recommend shifting the DOS data by the Fermi level. The training command is the same as `ener` mode, i.e. 
-::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash dp --tf train input.json @@ -92,7 +92,7 @@ dp --tf train input.json ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -133,9 +133,9 @@ The detailed loss can be found in `lcurve.out`: In this earlier version, we can use `dp test` to infer the electronic density of state for given frames. -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash @@ -146,7 +146,7 @@ dp --tf test -m frozen_model.pb -s ../data/111/$k -d ${output_prefix} -a -n 100 ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash diff --git a/doc/model/train-fitting-property.md b/doc/model/train-fitting-property.md index be1b63bf6f..a2d4c7c9e8 100644 --- a/doc/model/train-fitting-property.md +++ b/doc/model/train-fitting-property.md @@ -1,6 +1,6 @@ # Fit other properties {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -118,9 +118,9 @@ ls.to_deepmd_npy_mixed("deepmd") The training command is the same as `ener` mode, i.e. -::::{tab-set} +::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -152,9 +152,9 @@ The detailed loss can be found in `lcurve.out`: We can use `dp test` to infer the properties for given frames. 
-::::{tab-set} +::::\{tab-set} -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash @@ -185,10 +185,10 @@ for `*.property.out.*`, it contains matrix with shape of `(2, task_dim)`, ## Data Normalization When `fitting_net/type` is `ener`, the energy bias layer “$e_{bias}$” adds a constant bias to the atomic energy contribution according to the atomic number.i.e., -$$e_{bias} (Z_i) (MLP(D_i))= MLP(D_i) + e_{bias} (Z_i)$$ +\$$e_{bias} (Z_i) (MLP(D_i))= MLP(D_i) + e_{bias} (Z_i)$\$ But when `fitting_net/type` is `property`. The property bias layer is used to normalize the property output of the model.i.e., -$$p_{bias} (MLP(D_i))= MLP(D_i) * std+ mean$$ +\$$p_{bias} (MLP(D_i))= MLP(D_i) * std+ mean$\$ 1. `std`: The standard deviation of the property label -2. `mean`: The average value of the property label +1. `mean`: The average value of the property label diff --git a/doc/model/train-fitting-tensor.md b/doc/model/train-fitting-tensor.md index 29c95b2d68..0eef9d9432 100644 --- a/doc/model/train-fitting-tensor.md +++ b/doc/model/train-fitting-tensor.md @@ -1,14 +1,14 @@ # Fit `tensor` like `Dipole` and `Polarizability` {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: Unlike `energy`, which is a scalar, one may want to fit some high dimensional physical quantity, like `dipole` (vector) and `polarizability` (matrix, shorted as `polar`). Deep Potential has provided different APIs to do this. In this example, we will show you how to train a model to fit a water system. 
A complete training input script of the examples can be found in -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash $deepmd_source_dir/examples/water_tensor/dipole/dipole_input.json @@ -17,7 +17,7 @@ $deepmd_source_dir/examples/water_tensor/polar/polar_input.json ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash $deepmd_source_dir/examples/water_tensor/dipole/dipole_input_torch.json @@ -69,15 +69,13 @@ The total tensor $\boldsymbol{T}$ (total dipole $\boldsymbol{T}^{(1)}$ or total The tensorial models can be used to calculate IR spectrum and Raman spectrum.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## The fitting Network The {ref}`fitting_net ` section tells DP which fitting net to use. 
-::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} The JSON of `dipole` type should be provided like @@ -109,7 +107,7 @@ The JSON of `polar` type should be provided like ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} The JSON of `dipole` type should be provided like @@ -183,9 +181,9 @@ In this case, please check the file name of the label. The training command is the same as `ener` mode, i.e. -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash dp train input.json @@ -193,7 +191,7 @@ dp train input.json ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -248,3 +246,5 @@ During training, at each step when the `lcurve.out` is printed, the system used To only fit against a subset of atomic types, in the TensorFlow backend, {ref}`fitting_net/sel_type ` should be set to selected types; in other backends, {ref}`atom_exclude_types ` should be set to excluded types. The TensorFlow backend does not support {ref}`numb_fparam ` and {ref}`numb_aparam `. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-hybrid.md b/doc/model/train-hybrid.md index d565af5c9a..843e6eac3d 100644 --- a/doc/model/train-hybrid.md +++ b/doc/model/train-hybrid.md @@ -1,6 +1,6 @@ # Descriptor `"hybrid"` {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -21,8 +21,6 @@ A hybrid descriptor $\mathcal{D}^i_\text{hyb}$ concatenates multiple kinds of de The list of descriptors can be different types or the same descriptors with different parameters. This way, one can set the different cutoff radii for different descriptors.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
- ## Instructions To use the descriptor in DeePMD-kit, one firstly set the {ref}`type ` to {ref}`hybrid `, then provide the definitions of the descriptors by the items in the `list`, @@ -58,3 +56,5 @@ In other backends, each descriptor has its own type embedding and their paramete ## Model compression Model compression is supported if all sub-descriptors support model compression. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-se-a-mask.md b/doc/model/train-se-a-mask.md index ff1ee76c12..a453e1d3dc 100644 --- a/doc/model/train-se-a-mask.md +++ b/doc/model/train-se-a-mask.md @@ -1,6 +1,6 @@ # Descriptor `"se_a_mask"` {{ tensorflow_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: @@ -68,7 +68,7 @@ To make the `aparam.npy` used for descriptor `se_a_mask`, two variables in `fitt - {ref}`use_aparam_as_mask ` is set to `true` to use the `aparam.npy` as the mask of the atoms in the descriptor `se_a_mask`. Finally, to make a reasonable fitting task with `se_a_mask` descriptor for DP/MM simulations, the loss function with `se_a_mask` is designed to include the atomic forces difference in specific atoms of the input particles only. 
-More details about the selection of the specific atoms can be found in paper [DP/MM](left to be filled). +More details about the selection of the specific atoms can be found in paper \[DP/MM\](left to be filled). Thus, `atom_pref.npy` ( [ nframes * natoms ] ) is required as the indicator of the specific atoms in the input particles. And the `loss` section in the training input script should be set as follows. diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md index 2e0c236cf6..52e9b114b9 100644 --- a/doc/model/train-se-atten.md +++ b/doc/model/train-se-atten.md @@ -1,6 +1,6 @@ # Descriptor `"se_atten"` {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, Paddle {{ paddle_icon }}, DP {{ dpmodel_icon }} ::: @@ -64,8 +64,6 @@ Then layer normalization is added in a residual way to finally obtain the self-a \mathcal{G}^{i,l} = \mathcal{G}^{i,l-1} + \mathrm{LayerNorm}(A(\mathcal{Q}^{i,l}, \mathcal{K}^{i,l}, \mathcal{V}^{i,l}, \mathcal{R}^{i,l})). ``` -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
- ## Descriptor `"se_atten"` Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found in `examples/water/se_atten/input.json`. @@ -134,7 +132,7 @@ You can use descriptor `"se_atten_v2"` and is not allowed to set `tebd_input_mod Practical evidence demonstrates that `"se_atten_v2"` offers better and more stable performance compared to `"se_atten"`. -:::{note} +:::\{note} Model compression support differs across backends. See [Model compression](#model-compression) for backend-specific requirements. ::: @@ -197,3 +195,5 @@ Model compression is supported for any {ref}`attn_layer out_file value = filename filename = The file name for the model deviation output. Default is model_devi.out @@ -91,7 +92,7 @@ compute 1 all ke/atom ### Description -Evaluate the interaction of the system by using [Deep Potential][DP] or [Deep Potential Smooth Edition][DP-SE]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. +Evaluate the interaction of the system by using [Deep Potential][dp] or [Deep Potential Smooth Edition][dp-se]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep potential defined in a model file that usually has .pb/.pth/.savedmodel extensions. The model can be trained and frozen from multiple backends by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit), which can have either double or single float precision interface. @@ -137,7 +138,7 @@ pair_style deepspin models ... keyword value ... and the model deviation will be computed among all models every `out_freq` timesteps. 
- keyword = _out_file_ or _out_freq_ or _fparam_ or _fparam_from_compute_ or _aparam_from_compute_ or _atomic_ or _relative_ or _aparam_ or _ttm_ -:::{note} +:::\{note} Please note that the virial and atomic virial are not currently supported in spin models. ::: @@ -180,7 +181,7 @@ compute 1 all ke/atom ### Description -Evaluate the interaction of the system with spin by using [DeepSPIN][DPSPIN] models. It is noticed that deep spin model is not a "pairwise" interaction, but a multi-body interaction. +Evaluate the interaction of the system with spin by using [DeepSPIN][dpspin] models. It is noticed that deep spin model is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep spin model defined in a model file that usually has .pb/.pth/.savedmodel extensions. The model can be trained and frozen from multiple backends by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit), which can have either double or single float precision interface. @@ -189,7 +190,7 @@ The unit follows [LAMMPS units](#units) and the [scale factor](https://docs.lamm Other settings and output for this pair style is the same as `deepmd` pair style, please see the detailed description [above](#pair_style-deepmd). -:::{note} +:::\{note} Please note that the virial and atomic virial are not currently supported in spin models. ::: @@ -260,7 +261,7 @@ compute ID group-ID centroid/stress/atom NULL virial see [LAMMPS doc page](https://docs.lammps.org/compute_stress_atom.html#thompson2) for more details on the meaning of the keywords. -:::{versionchanged} v2.2.3 +:::\{versionchanged} v2.2.3 v2.2.2 or previous versions passed per-atom stress (`cvatom`) with the per-atom pressure tensor, which is inconsistent with [LAMMPS's definition](https://docs.lammps.org/compute_stress_atom.html). LAMMPS defines per-atom stress as the negative of the per-atom pressure tensor. Such behavior is corrected in v2.2.3. 
::: @@ -304,13 +305,9 @@ compute flux all heat/flux ke pe stress If you use these features please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, R. Car, S. Baroni - arXiv preprint arXiv:2108.10850, 2021](https://arxiv.org/abs/2108.10850) -[DP]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 -[DP-SE]: https://dl.acm.org/doi/10.5555/3327345.3327356 -[DPSPIN]: https://doi.org/10.1103/PhysRevB.110.064427 - ### D3 dispersion -:::{note} +:::\{note} Requires LAMMPS version 4Feb2025 or newer. ::: @@ -322,3 +319,7 @@ pair_style hybrid/overlay deepmd water.pb dispersion/d3 original pbe0 30.0 20.0 pair_coeff * * deepmd O H pair_coeff * * dispersion/d3 O H ``` + +[dp]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 +[dp-se]: https://dl.acm.org/doi/10.5555/3327345.3327356 +[dpspin]: https://doi.org/10.1103/PhysRevB.110.064427 diff --git a/doc/train/finetuning.md b/doc/train/finetuning.md index 78a7b8a71e..142a12cc2e 100644 --- a/doc/train/finetuning.md +++ b/doc/train/finetuning.md @@ -1,6 +1,6 @@ # Finetune the pre-trained model {{ tensorflow_icon }} {{ pytorch_icon }} {{ paddle_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, Paddle {{ paddle_icon }} ::: @@ -28,7 +28,7 @@ $ dp train input.json --finetune pretrained.pb The command above will change the energy bias in the last layer of the fitting net in `pretrained.pb`, according to the training dataset in input.json. -:::{warning} +:::\{warning} Note that in TensorFlow, model parameters including the `type_map` will be overwritten based on those in the pre-trained model. Please ensure you are familiar with the configurations in the pre-trained model, especially `type_map`, before starting the fine-tuning process. The elements in the training dataset must be contained in the pre-trained dataset. 
@@ -105,7 +105,7 @@ one can select a specific branch (e.g., `CHOOSEN_BRANCH`) included in `multitask $ dp --pt train input.json --finetune multitask_pretrained.pt --model-branch CHOOSEN_BRANCH ``` -:::{note} +:::\{note} One can check the available model branches in multi-task pre-trained model by referring to the documentation of the pre-trained model or by using the following command: ```bash @@ -131,9 +131,9 @@ Then, prepare a suitable input script for multitask fine-tuning `multi_input.jso 1. Refer to the [`multi-task-training`](./multi-task-training) document to prepare a multitask training script for two systems, ideally extracting parts (i.e. {ref}`model_dict `, {ref}`loss_dict `, {ref}`data_dict ` and {ref}`model_prob ` parts) corresponding to `PRE_DATA1` and `PRE_DATA2` directly from the training script of the pre-trained model. -2. For `DOWNSTREAM_DATA`, select a desired branch to fine-tune from (e.g., `PRE_DATA1`), copy the configurations of `PRE_DATA1` as the configuration for `DOWNSTREAM_DATA` and insert the corresponding data path into the {ref}`data_dict `, +1. For `DOWNSTREAM_DATA`, select a desired branch to fine-tune from (e.g., `PRE_DATA1`), copy the configurations of `PRE_DATA1` as the configuration for `DOWNSTREAM_DATA` and insert the corresponding data path into the {ref}`data_dict `, thereby generating a three-system multitask training script. -3. In the {ref}`model_dict ` for `DOWNSTREAM_DATA`, specify the branch from which `DOWNSTREAM_DATA` is to fine-tune using: +1. In the {ref}`model_dict ` for `DOWNSTREAM_DATA`, specify the branch from which `DOWNSTREAM_DATA` is to fine-tune using: `"finetune_head": "PRE_DATA1"`. The complete `multi_input.json` should appear as follows ("..." 
means copied from input script of pre-trained model): @@ -251,7 +251,7 @@ one can select a specific branch (e.g., `CHOOSEN_BRANCH`) included in `multitask $ dp --pd train input.json --finetune multitask_pretrained.pd --model-branch CHOOSEN_BRANCH ``` -:::{note} +:::\{note} One can check the available model branches in multi-task pre-trained model by refering to the documentation of the pre-trained model or by using the following command: ```bash diff --git a/doc/train/gpu-limitations.md b/doc/train/gpu-limitations.md index 44c9697dd4..eb4f2bc373 100644 --- a/doc/train/gpu-limitations.md +++ b/doc/train/gpu-limitations.md @@ -3,7 +3,7 @@ If you use DeePMD-kit in a GPU environment, the acceptable value range of some variables is additionally restricted compared to the CPU environment due to the software's GPU implementations: 1. The number of atom types of a given system must be less than 128. -2. The maximum distance between an atom and its neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. -3. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is limited by the GPU memory size currently, usually within 1000,000 atoms even in the model compression mode. -4. The total sel value of training parameters(in `model[standard]/descriptor` section) must be less than 4096. -5. The size of the last layer of the embedding net must be less than 1024 during the model compression process. +1. The maximum distance between an atom and its neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. +1. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is limited by the GPU memory size currently, usually within 1000,000 atoms even in the model compression mode. +1. 
The total sel value of training parameters(in `model[standard]/descriptor` section) must be less than 4096. +1. The size of the last layer of the embedding net must be less than 1024 during the model compression process. diff --git a/doc/train/multi-task-training.md b/doc/train/multi-task-training.md index 115c463cc2..f7cc2e8a69 100644 --- a/doc/train/multi-task-training.md +++ b/doc/train/multi-task-training.md @@ -1,10 +1,10 @@ # Multi-task training {{ pytorch_icon }} -:::{note} +:::\{note} **Supported backends**: PyTorch {{ pytorch_icon }} ::: -:::{warning} +:::\{warning} We have deprecated TensorFlow backend multi-task training, please use the PyTorch one. ::: @@ -26,8 +26,6 @@ and the Adam optimizer is executed to minimize $L^{(t)}$ for one step to update In the case of multi-GPU parallel training, different GPUs will independently select their tasks. In the DPA-2 model, this multi-task training framework is adopted.[^1] -[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - Compared with the previous TensorFlow implementation, the new support in PyTorch is more flexible and efficient. 
In particular, it makes multi-GPU parallel training and even tasks beyond DFT possible, enabling larger-scale and more general multi-task training to obtain more general pre-trained models. @@ -48,6 +46,7 @@ Specifically, there are several parts that need to be modified: - {ref}`model/model_dict `: The core definition of the model part and the explanation of sharing rules, starting with user-defined model name keys `model_key`, such as `my_model_1`. Each model part needs to align with the components of the single-task training {ref}`model `, but with the following sharing rules: + - If you want to share the current model component with other tasks, which should be part of the {ref}`model/shared_dict `, you can directly fill in the corresponding `part_key`, such as `"descriptor": "my_descriptor", ` @@ -63,7 +62,7 @@ Specifically, there are several parts that need to be modified: - For fitting nets, we only support the default `shared_level`=0, where all parameters will be shared except for `bias_atom_e` and `case_embd`. - To conduct multitask training, there are two typical approaches: 1. **Descriptor sharing only**: Share the descriptor with `shared_level`=0. See [here](../../examples/water_multi_task/pytorch_example/input_torch.json) for an example. - 2. **Descriptor and fitting network sharing with data identification**: + 1. **Descriptor and fitting network sharing with data identification**: - Share the descriptor and the fitting network with `shared_level`=0. - {ref}`dim_case_embd ` must be set to the number of model branches, which will distinguish different data tasks using a one-hot embedding. - See [here](../../examples/water_multi_task/pytorch_example/input_torch_sharefit.json) for an example. 
@@ -84,8 +83,10 @@ Specifically, there are several parts that need to be modified: An example input for multi-task training two models in water system is shown as following: ```{literalinclude} ../../examples/water_multi_task/pytorch_example/input_torch.json -:language: json -:linenos: +--- +language: json +linenos: +--- ``` ## Finetune from the pre-trained multi-task model @@ -95,7 +96,7 @@ users can refer to [this section](./finetuning.md#fine-tuning-from-a-multi-task- ## Multi-task specific parameters -:::{note} +:::\{note} Details of some parameters that are the same as [the regular parameters](./train-input.rst) are not shown below. ::: @@ -104,3 +105,5 @@ Details of some parameters that are the same as [the regular parameters](./train :module: deepmd.utils.argcheck :func: gen_args_multi_task ``` + +[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md index 998f1c3bec..2867e825bc 100644 --- a/doc/train/parallel-training.md +++ b/doc/train/parallel-training.md @@ -1,6 +1,6 @@ # Parallel training {{ tensorflow_icon }} {{ pytorch_icon }} {{ paddle_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, Paddle {{ paddle_icon }} ::: @@ -105,9 +105,10 @@ We utilize the PyTorch framework and have designed and implemented a multiproces First, we establish a DeepmdData class for each system, which is consistent with the TensorFlow version in this level. Then, we create a dataloader for each system, resulting in the same number of dataloaders as the number of systems. Next, we create a dataset for the dataloaders obtained in the previous step. This allows us to query the data for each system through this dataset, while the iteration pointers for each system are maintained by their respective dataloaders. Finally, a dataloader is created for the outermost dataset. -We achieve custom sampling methods using a weighted sampler. The length of the sampler is set to total_batch_num \* num_workers.The parameter "num_workers" defines the number of threads involved in multi-threaded loading, which can be modified by setting the environment variable NUM_WORKERS (default: min(8, ncpus)). +We achieve custom sampling methods using a weighted sampler. The length of the sampler is set to total_batch_num * num_workers.The parameter "num_workers" defines the number of threads involved in multi-threaded loading, which can be modified by setting the environment variable NUM_WORKERS (default: min(8, ncpus)). -> **Note** The underlying dataloader will use a distributed sampler to ensure that each GPU receives batches with different content in parallel mode, which will use sequential sampler in serial mode. 
In the TensorFlow version, Horovod shuffles the dataset using different random seeds for the same purpose.. +> [!NOTE] +> The underlying dataloader will use a distributed sampler to ensure that each GPU receives batches with different content in parallel mode, which will use sequential sampler in serial mode. In the TensorFlow version, Horovod shuffles the dataset using different random seeds for the same purpose.. ```mermaid flowchart LR @@ -183,9 +184,11 @@ torchrun --rdzv_endpoint=node0:12321 --nnodes=2 --nproc_per_node=4 --node_rank=0 torchrun --rdzv_endpoint=node0:12321 --nnodes=2 --nproc_per_node=4 --node_rank=1 --no_python dp --pt train tests/water/se_e2_a.json ``` -> **Note** Set environment variables to tune [CPU specific optimizations](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) in advance. +> [!NOTE] +> Set environment variables to tune [CPU specific optimizations](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) in advance. -> **Note** for developers: `torchrun` by default passes settings as environment variables [(list here)](https://pytorch.org/docs/stable/elastic/run.html#environment-variables). +> [!NOTE] +> for developers: `torchrun` by default passes settings as environment variables [(list here)](https://pytorch.org/docs/stable/elastic/run.html#environment-variables). > To check forward, backward, and communication time, please set env var `TORCH_CPP_LOG_LEVEL=INFO TORCH_DISTRIBUTED_DEBUG=DETAIL`. More details can be found [here](https://pytorch.org/docs/stable/distributed.html#logging). @@ -233,7 +236,7 @@ Then, run the script on the first node with: mpirun run_pp.sh ``` -:::{note} +:::\{note} If `NUM_WORKERS` is too large, it may cause the program to be terminated by the system; if it is too small, it may slow down data reading. You can try adjusting it to an appropriate size. 
diff --git a/doc/train/tensorboard.md b/doc/train/tensorboard.md index 3c45ebba34..b271a80093 100644 --- a/doc/train/tensorboard.md +++ b/doc/train/tensorboard.md @@ -1,6 +1,6 @@ # TensorBoard Usage {{ tensorflow_icon }} {{ pytorch_icon }} {{ paddle_icon }} -:::{note} +:::\{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, Paddle {{ paddle_icon }} ::: diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index af4b4b31d9..e60312a8b0 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -21,8 +21,6 @@ where $\tau \in \mathbb{N}$ is the index of the training step, $\gamma^0 \in \m where $\tau^{\text{stop}} \in \mathbb{N}$, $\gamma^{\text{stop}} \in \mathbb{R}$, and $s \in \mathbb{N}$ are the stopping step, the stopping learning rate, and the decay steps, respectively, all of which are hyperparameters provided in advance. [^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
- ### Instructions The {ref}`learning_rate ` section in `input.json` is given as follows @@ -38,7 +36,9 @@ The {ref}`learning_rate ` section in `input.json` is given as fol ``` - {ref}`start_lr ` gives the learning rate at the beginning of the training. + - {ref}`stop_lr ` gives the learning rate at the end of the training. It should be small enough to ensure that the network parameters satisfactorily converge. + - During the training, the learning rate decays exponentially from {ref}`start_lr ` to {ref}`stop_lr ` following the formula: ``` @@ -182,3 +182,5 @@ dp freeze -o frozen_model_adjusted_sel.pb Two models should give the same result when the input satisfies both constraints. Note: At this time, this feature is only supported by [`se_e2_a`](../model/train-se-e2-a.md) descriptor with [`set_davg_true`](./train-input.rst) enabled, or `hybrid` composed of the above descriptors. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/train/training.md b/doc/train/training.md index 6ccb43bbd7..04ed29af03 100644 --- a/doc/train/training.md +++ b/doc/train/training.md @@ -8,9 +8,9 @@ $ cd $deepmd_source_dir/examples/water/se_e2_a/ After switching to that directory, the training can be invoked by -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash $ dp --tf train input.json @@ -18,7 +18,7 @@ $ dp --tf train input.json ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash $ dp --pt train input.json @@ -26,7 +26,7 @@ $ dp --pt train input.json ::: -:::{tab-item} Paddle {{ paddle_icon }} +:::\{tab-item} Paddle {{ paddle_icon }} ```bash # training model @@ -69,12 +69,12 @@ During the training, the error of the model is tested every {ref}`disp_freq ` every {ref}`save_freq ` training steps. -:::{warning} +:::\{warning} It is warned that the example water data (in folder `examples/water/data`) is of very limited amount, is provided only for testing purposes, and should not be used to train a production model. ::: diff --git a/doc/troubleshooting/howtoset_num_nodes.md b/doc/troubleshooting/howtoset_num_nodes.md index b09fb80cb6..1748bd1598 100644 --- a/doc/troubleshooting/howtoset_num_nodes.md +++ b/doc/troubleshooting/howtoset_num_nodes.md @@ -8,14 +8,14 @@ One should make sure the product of the parallel numbers is less than or equal t Parallelism for MPI is optional and used for multiple nodes, multiple GPU cards, or sometimes multiple CPU cores. -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} To enable MPI support for training in the TensorFlow interface, one should [install horovod](../install/install-from-source.md#install-horovod-and-mpi4py) in advance. 
::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} Multiprocessing support for training in the PyTorch backend is implemented with [torchrun](https://pytorch.org/docs/stable/elastic/run.html). @@ -68,14 +68,14 @@ export OMP_NUM_THREADS=2 There are several other environment variables for OpenMP, such as `KMP_BLOCKTIME`. -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} See [Intel documentation](https://www.intel.com/content/www/us/en/developer/articles/technical/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference.html) for detailed information. ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} See [PyTorch documentation](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html) for detailed information. @@ -89,9 +89,9 @@ There is no one general parallel configuration that works for all situations, so Here are some empirical examples. If you wish to use 3 cores of 2 CPUs on one node, you may set the environment variables and run DeePMD-kit as follows: -::::{tab-set} +::::\{tab-set} -:::{tab-item} TensorFlow {{ tensorflow_icon }} +:::\{tab-item} TensorFlow {{ tensorflow_icon }} ```bash export OMP_NUM_THREADS=3 @@ -102,7 +102,7 @@ dp --tf train input.json ::: -:::{tab-item} PyTorch {{ pytorch_icon }} +:::\{tab-item} PyTorch {{ pytorch_icon }} ```bash export OMP_NUM_THREADS=3 diff --git a/examples/property/train/README.md b/examples/property/train/README.md index e4dc9ed704..6e9345395c 100644 --- a/examples/property/train/README.md +++ b/examples/property/train/README.md @@ -1,5 +1,5 @@ Some explanations of the parameters in `input.json`: 1. `fitting_net/property_name` is the name of the property to be predicted. It should be consistent with the property name in the dataset. 
In each system, code will read `set.*/{property_name}.npy` file as prediction label if you use NumPy format data. -2. `fitting_net/task_dim` is the dimension of model output. It should be consistent with the property dimension in the dataset, which means if the shape of data stored in `set.*/{property_name}.npy` is `batch size * 3`, `fitting_net/task_dim` should be set to 3. -3. `fitting/intensive` indicates whether the fitting property is intensive. If `intensive` is `true`, the model output is the average of the property contribution of each atom. If `intensive` is `false`, the model output is the sum of the property contribution of each atom. +1. `fitting_net/task_dim` is the dimension of model output. It should be consistent with the property dimension in the dataset, which means if the shape of data stored in `set.*/{property_name}.npy` is `batch size * 3`, `fitting_net/task_dim` should be set to 3. +1. `fitting/intensive` indicates whether the fitting property is intensive. If `intensive` is `true`, the model output is the average of the property contribution of each atom. If `intensive` is `false`, the model output is the sum of the property contribution of each atom. 
diff --git a/source/3rdparty/README.md b/source/3rdparty/README.md index ac9cfd4edc..9de615d505 100644 --- a/source/3rdparty/README.md +++ b/source/3rdparty/README.md @@ -1,7 +1,7 @@ # 3rd-party source codes -| Name | Repository | Version | License | -| ------------------------- | ---------------------------------- | ------- | ------- | -| json | https://github.com/nlohmann/json | 3.9.1 | MIT | -| Implib.so | https://github.com/yugr/Implib.so | 0ddaa71 | MIT | -| coverage_plugins | https://github.com/pytorch/pytorch | 2.2.0 | BSD-3 | +| Name | Repository | Version | License | +| ---------------- | ---------------------------------- | ------- | ------- | +| json | https://github.com/nlohmann/json | 3.9.1 | MIT | +| Implib.so | https://github.com/yugr/Implib.so | 0ddaa71 | MIT | +| coverage_plugins | https://github.com/pytorch/pytorch | 2.2.0 | BSD-3 | diff --git a/source/3rdparty/implib/arch/e2k/README.md b/source/3rdparty/implib/arch/e2k/README.md index eb87f54f85..9c9fc02487 100644 --- a/source/3rdparty/implib/arch/e2k/README.md +++ b/source/3rdparty/implib/arch/e2k/README.md @@ -1,4 +1,5 @@ Reference materials: - * Руководство по эффективному программированию на платформе «Эльбрус» (http://www.mcst.ru/files/5ed39a/dd0cd8/50506b/000000/elbrus_prog_2020-05-30.pdf) - * Микропроцессоры и вычислительные комплексы семейства Эльбрус (http://www.mcst.ru/doc/book_121130.pdf) - * https://github.com/OpenE2K + +- Руководство по эффективному программированию на платформе «Эльбрус» (http://www.mcst.ru/files/5ed39a/dd0cd8/50506b/000000/elbrus_prog_2020-05-30.pdf) +- Микропроцессоры и вычислительные комплексы семейства Эльбрус (http://www.mcst.ru/doc/book_121130.pdf) +- https://github.com/OpenE2K diff --git a/source/nodejs/README.md b/source/nodejs/README.md index ad78359761..e6b2c8d476 100644 --- a/source/nodejs/README.md +++ b/source/nodejs/README.md @@ -23,8 +23,8 @@ When using CMake to build DeePMD-kit, set argument `BUILD_NODEJS_IF=ON` and `NOD ```sh cmake -D 
BUILD_NODEJS_IF=ON \ - -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ - .. # and other arguments + -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ + .. # and other arguments make make install ``` From c8f811423bbf40f0fbfc02f5ac9308cacab9486f Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Feb 2026 03:27:09 +0800 Subject: [PATCH 3/6] Revert "[pre-commit.ci] auto fixes from pre-commit.com hooks" This reverts commit a3243a14ad7ae6751ef45d6530689b1c9762ffd2. --- .pre-commit-config.yaml | 16 +-- AGENTS.md | 18 +-- CONTRIBUTING.md | 23 ++-- README.md | 2 +- doc/backend.md | 2 +- doc/data/data-conv.md | 2 +- doc/data/system.md | 52 ++++----- doc/development/cmake.md | 2 +- doc/development/create-a-model-pt.md | 8 +- doc/development/create-a-model-tf.md | 6 +- doc/development/type-embedding.md | 6 +- doc/env.md | 22 ++-- doc/freeze/compress.md | 12 +- doc/freeze/freeze.md | 8 +- doc/inference/cxx.md | 2 +- doc/inference/nodejs.md | 10 +- doc/inference/python.md | 4 +- doc/install/build-conda.md | 4 +- doc/install/easy-install-dev.md | 2 +- doc/install/easy-install.md | 44 +++---- doc/install/install-from-c-library.md | 4 +- doc/install/install-from-source.md | 113 +++++++++--------- doc/install/install-gromacs.md | 9 +- doc/install/install-lammps.md | 2 +- doc/install/install-nodejs.md | 4 +- doc/model/change-bias.md | 8 +- doc/model/dpa2.md | 2 +- doc/model/dpa3.md | 8 +- doc/model/dplr.md | 10 +- doc/model/dprc.md | 134 +++++----------------- doc/model/linear.md | 2 +- doc/model/overall.md | 18 +-- doc/model/pairtab.md | 10 +- doc/model/sel.md | 10 +- doc/model/show-model-info.md | 7 -- doc/model/train-energy-hessian.md | 26 ++--- doc/model/train-energy-spin.md | 36 +++--- doc/model/train-energy.md | 6 +- doc/model/train-fitting-dos.md | 14 +-- doc/model/train-fitting-property.md | 16 +-- doc/model/train-fitting-tensor.md | 24 ++-- doc/model/train-hybrid.md | 6 +- doc/model/train-se-a-mask.md | 4 +- doc/model/train-se-atten.md | 8 +- 
doc/model/train-se-e2-a-tebd.md | 6 +- doc/model/train-se-e2-a.md | 6 +- doc/model/train-se-e2-r.md | 6 +- doc/model/train-se-e3-tebd.md | 2 +- doc/model/train-se-e3.md | 6 +- doc/nvnmd/nvnmd.md | 50 +++----- doc/test/model-deviation.md | 4 +- doc/third-party/ase.md | 10 +- doc/third-party/dpdata.md | 2 +- doc/third-party/gromacs.md | 2 +- doc/third-party/ipi.md | 2 +- doc/third-party/lammps-command.md | 25 ++-- doc/train/finetuning.md | 12 +- doc/train/gpu-limitations.md | 8 +- doc/train/multi-task-training.md | 19 ++- doc/train/parallel-training.md | 15 +-- doc/train/tensorboard.md | 2 +- doc/train/training-advanced.md | 6 +- doc/train/training.md | 22 ++-- doc/troubleshooting/howtoset_num_nodes.md | 18 +-- examples/property/train/README.md | 4 +- source/3rdparty/README.md | 10 +- source/3rdparty/implib/arch/e2k/README.md | 7 +- source/nodejs/README.md | 4 +- 68 files changed, 430 insertions(+), 544 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index efa5840469..3fc33c8b98 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -57,14 +57,14 @@ repos: - repo: https://github.com/hukkin/mdformat rev: 1.0.0 hooks: - - id: mdformat - additional_dependencies: - - mdformat-myst==0.3.0 - - mdformat-ruff==0.1.3 - - mdformat-web==0.2.0 - - mdformat-config==0.2.1 - - mdformat-beautysh==1.0.0 - - mdformat-gfm-alerts==2.0.0 + - id: mdformat + additional_dependencies: + - mdformat-myst==0.3.0 + - mdformat-ruff==0.1.3 + - mdformat-web==0.2.0 + - mdformat-config==0.2.1 + - mdformat-beautysh==1.0.0 + - mdformat-gfm-alerts==2.0.0 # C++ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v21.1.8 diff --git a/AGENTS.md b/AGENTS.md index bcac9f1514..c629a08def 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -23,8 +23,8 @@ DeePMD-kit is a deep learning package for many-body potential energy representat ### Lint and Format Code - Install linter: `uv pip install ruff` -- Run linting: `ruff check .` -- takes \<1 second -- Format code: 
`ruff format .` -- takes \<1 second +- Run linting: `ruff check .` -- takes <1 second +- Format code: `ruff format .` -- takes <1 second - **Always run `ruff check .` and `ruff format .` before committing changes or the CI will fail.** ### Training and Validation @@ -40,19 +40,19 @@ DeePMD-kit is a deep learning package for many-body potential energy representat ### Basic Functionality Validation 1. **CLI Interface**: Run `dp --version` and `dp -h` to verify installation -1. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` -1. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` +2. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` +3. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` ### Training Workflow Validation 1. **TensorFlow Training**: `cd examples/water/se_e2_a && timeout 60 dp train input.json --skip-neighbor-stat` -- should start training and show decreasing loss -1. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss -1. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values +2. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss +3. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values ### Test-Based Validation 1. **Core Tests**: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- should pass in ~10 seconds -1. **Multi-backend**: Test both TensorFlow and PyTorch components work +2. 
**Multi-backend**: Test both TensorFlow and PyTorch components work ## Common Commands and Timing @@ -127,8 +127,8 @@ source/ # C++ source code and tests ### Linting and Formatting -- **Ruff check**: \<1 second -- **Ruff format**: \<1 second +- **Ruff check**: <1 second +- **Ruff format**: <1 second - **Pre-commit hooks**: May have network issues, use individual tools ### Commit Messages and PR Titles diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8378350e4..67491672e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,7 +47,7 @@ Please perform the following steps to create your Pull Request to this repositor ### Step 1: Fork the repository 1. Visit the project: -1. Click the **Fork** button on the top right and wait it to finish. +2. Click the **Fork** button on the top right and wait it to finish. ### Step 2: Clone the forked repository to local storage and set configurations @@ -58,7 +58,7 @@ Please perform the following steps to create your Pull Request to this repositor # Replace `$username` with your GitHub ID ``` -1. Add deepmodeling's repo as your remote repo, we can name it "upstream". And fetch upstream's latest codes to your workstation. +2. Add deepmodeling's repo as your remote repo, we can name it "upstream". And fetch upstream's latest codes to your workstation. ```bash git remote add upstream https://github.com/deepmodeling/deepmd-kit.git @@ -70,9 +70,9 @@ Please perform the following steps to create your Pull Request to this repositor git merge upstream/master ``` -1. Modify your codes and design unit tests. +3. Modify your codes and design unit tests. -1. Commit your changes to a new branch +4. Commit your changes to a new branch ```bash git checkout -b branch1 @@ -81,8 +81,7 @@ Please perform the following steps to create your Pull Request to this repositor git commit -m "commit-message: update the xx" ``` -1. Push the changed codes to your original repo on github. - +5. Push the changed codes to your original repo on github. 
```bash git push origin branch1 ``` @@ -98,15 +97,15 @@ Please perform the following steps to create your Pull Request to this repositor git rebase upstream/master ``` -1. Create a new branch based on the master branch. +2. Create a new branch based on the master branch. ```bash git checkout -b new-branch-name ``` -1. Modify your codes and design unit tests. +3. Modify your codes and design unit tests. -1. Commit your changes +4. Commit your changes ```bash git status # Checks the local status @@ -114,7 +113,7 @@ Please perform the following steps to create your Pull Request to this repositor git commit -m "commit-message: update the xx" ``` -1. Keep your branch in sync with upstream/master +5. Keep your branch in sync with upstream/master ```bash # While on your new branch @@ -122,7 +121,7 @@ Please perform the following steps to create your Pull Request to this repositor git rebase upstream/master ``` -1. Push your changes to the remote +6. Push your changes to the remote ```bash git push -u origin new-branch-name # "-u" is used to track the remote branch from origin @@ -131,7 +130,7 @@ Please perform the following steps to create your Pull Request to this repositor ### Step 3: Create a pull request 1. Visit your fork at (replace `$username` with your GitHub ID) -1. Click `pull requests`, followed by `New pull request` and `Compare & pull request` to create your PR. +2. Click `pull requests`, followed by `New pull request` and `Compare & pull request` to create your PR. Now, your PR is successfully submitted! After this PR is merged, you will automatically become a contributor to DeePMD-kit. 
diff --git a/README.md b/README.md index 58ec1fec7f..143ed1b0ab 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [DeePMD-kit logo](./doc/logo.md) -______________________________________________________________________ +--- # DeePMD-kit diff --git a/doc/backend.md b/doc/backend.md index 3e2b7e4acb..b2f7dc4826 100644 --- a/doc/backend.md +++ b/doc/backend.md @@ -46,7 +46,7 @@ The `.pd` extension is used for model checkpoint storage, which is commonly util ### DP {{ dpmodel_icon }} -:::\{note} +:::{note} This backend is only for development and should not take into production. ::: diff --git a/doc/data/data-conv.md b/doc/data/data-conv.md index 30be98bcfe..56ce526480 100644 --- a/doc/data/data-conv.md +++ b/doc/data/data-conv.md @@ -57,7 +57,7 @@ In the raw format, the property of one frame is provided per line, ending with ` ```bash $ cat force.raw -0.724 2.039 -0.951 0.841 -0.464 0.363 -6.737 1.554 -5.587 -2.803 0.062 2.222 + 6.737 1.554 -5.587 -2.803 0.062 2.222 -1.968 -0.163 1.020 -0.225 -0.789 0.343 ``` diff --git a/doc/data/system.md b/doc/data/system.md index 5f834c4345..f6da7b534b 100644 --- a/doc/data/system.md +++ b/doc/data/system.md @@ -12,29 +12,29 @@ A system should contain system properties, input frame properties, and labeled f The input frame properties contain the following property, the first axis of which is the number of frames: -| ID | Property | Raw file | Unit | Required/Optional | Shape | Description | -| --------- | --------------------------------------------------- | ---------- | ---- | -------------------- | ---------------------- | ----------------------------------------- | -| coord | Atomic coordinates | coord.raw | Å | Required | Nframes * Natoms * 3 | | -| box | Boxes | box.raw | Å | Required if periodic | Nframes * 3 * 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| fparam | Extra frame parameters | fparam.raw | Any | Optional | Nframes * Any | | -| aparam | Extra atomic parameters | aparam.raw | Any | Optional | Nframes * 
aparam * Any | | -| numb_copy | Each frame is copied by the `numb_copy` (int) times | prob.raw | 1 | Optional | Nframes | Integer; Default is 1 for all frames | +| ID | Property | Raw file | Unit | Required/Optional | Shape | Description | +| --------- | --------------------------------------------------- | ---------- | ---- | -------------------- | ------------------------ | ----------------------------------------- | +| coord | Atomic coordinates | coord.raw | Å | Required | Nframes \* Natoms \* 3 | +| box | Boxes | box.raw | Å | Required if periodic | Nframes \* 3 \* 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| fparam | Extra frame parameters | fparam.raw | Any | Optional | Nframes \* Any | +| aparam | Extra atomic parameters | aparam.raw | Any | Optional | Nframes \* aparam \* Any | +| numb_copy | Each frame is copied by the `numb_copy` (int) times | prob.raw | 1 | Optional | Nframes | Integer; Default is 1 for all frames | The labeled frame properties are listed as follows, all of which will be used for training if and only if the loss function contains such property: -| ID | Property | Raw file | Unit | Shape | Description | -| --------------------- | -------------------------------------------------------------------------------- | ------------------------- | ------ | ---------------------------------- | ----------------------------------------- | -| energy | Frame energies | energy.raw | eV | Nframes | | -| force | Atomic forces | force.raw | eV/Å | Nframes * Natoms * 3 | | -| virial | Frame virial | virial.raw | eV | Nframes * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| hessian | Frame energy Hessian matrices | hessian.raw | eV/Å^2 | Nframes * Natoms * 3 * Natoms * 3 | full Hessian matrices | -| atom_ener | Atomic energies | atom_ener.raw | eV | Nframes * Natoms | | -| atom_pref | Weights of atomic forces | atom_pref.raw | 1 | Nframes * Natoms | | -| dipole | Frame dipole | dipole.raw | Any | Nframes * 3 | | -| atomic_dipole | Atomic dipole | 
atomic_dipole.raw | Any | Nframes * Natoms * 3 | | -| polarizability | Frame polarizability | polarizability.raw | Any | Nframes * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| atomic_polarizability | Atomic polarizability | atomic_polarizability.raw | Any | Nframes * Natoms * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| drdq | Partial derivative of atomic coordinates with respect to generalized coordinates | drdq.raw | 1 | Nframes * Natoms * 3 * Ngen_coords | | +| ID | Property | Raw file | Unit | Shape | Description | +| --------------------- | -------------------------------------------------------------------------------- | ------------------------- | ------ | ------------------------------------- | ----------------------------------------- | +| energy | Frame energies | energy.raw | eV | Nframes | +| force | Atomic forces | force.raw | eV/Å | Nframes \* Natoms \* 3 | +| virial | Frame virial | virial.raw | eV | Nframes \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| hessian | Frame energy Hessian matrices | hessian.raw | eV/Å^2 | Nframes \* Natoms \* 3 \* Natoms \* 3 | full Hessian matrices | +| atom_ener | Atomic energies | atom_ener.raw | eV | Nframes \* Natoms | +| atom_pref | Weights of atomic forces | atom_pref.raw | 1 | Nframes \* Natoms | +| dipole | Frame dipole | dipole.raw | Any | Nframes \* 3 | +| atomic_dipole | Atomic dipole | atomic_dipole.raw | Any | Nframes \* Natoms \* 3 | +| polarizability | Frame polarizability | polarizability.raw | Any | Nframes \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| atomic_polarizability | Atomic polarizability | atomic_polarizability.raw | Any | Nframes \* Natoms \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| drdq | Partial derivative of atomic coordinates with respect to generalized coordinates | drdq.raw | 1 | Nframes \* Natoms \* 3 \* Ngen_coords | In general, we always use the following convention of units: @@ -50,7 +50,7 @@ In general, we always use the following convention of 
units: ## Mixed type -:::\{note} +:::{note} Only the [DPA-1](../model/train-se-atten.md) and [DPA-2](../model/dpa2.md) descriptors support this format. ::: @@ -73,11 +73,11 @@ set.*/real_atom_types.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of element types contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | File | Required/Optional | Shape | Description | -| -------- | -------------------------------- | ------------------- | ----------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | -| / | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input | -| type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to atom type contained in all the frames, which is unnecessart to be contained in the periodic table | -| type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes * Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map. `-1` means virtual atoms. 
| +| ID | Property | File | Required/Optional | Shape | Description | +| -------- | -------------------------------- | ------------------- | ----------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------ | +| / | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input | +| type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to atom type contained in all the frames, which is unnecessart to be contained in the periodic table | +| type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes \* Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map. `-1` means virtual atoms. | With these edited files, one can put together frames with the same `Natoms`, instead of the same formula (like `H2O`). diff --git a/doc/development/cmake.md b/doc/development/cmake.md index 5654d0cea2..f8508d8992 100644 --- a/doc/development/cmake.md +++ b/doc/development/cmake.md @@ -6,7 +6,7 @@ After DeePMD-kit C/C++ library is installed, one can find DeePMD-kit from CMake: find_package(DeePMD REQUIRED) ``` -Note that you may need to add \$\{deepmd_root} to the cached CMake variable `CMAKE_PREFIX_PATH`. +Note that you may need to add ${deepmd_root} to the cached CMake variable `CMAKE_PREFIX_PATH`. To link against the C interface library, using diff --git a/doc/development/create-a-model-pt.md b/doc/development/create-a-model-pt.md index b321b26cf7..7eb75b7026 100644 --- a/doc/development/create-a-model-pt.md +++ b/doc/development/create-a-model-pt.md @@ -1,6 +1,6 @@ # Create a model in other backends {{ pytorch_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} In the following context, we use the PyTorch backend as the example, while it also applies to other backends listed above. 
@@ -11,9 +11,9 @@ If you'd like to create a new model that isn't covered by the existing DeePMD-ki To incorporate your custom model you'll need to: 1. Register and implement new components (e.g. descriptor) in a Python file. -1. Register new arguments for user inputs. -1. Package new codes into a Python package. -1. Test new models. +2. Register new arguments for user inputs. +3. Package new codes into a Python package. +4. Test new models. ## Design a new component diff --git a/doc/development/create-a-model-tf.md b/doc/development/create-a-model-tf.md index 7720ced0ca..cc7ad1999d 100644 --- a/doc/development/create-a-model-tf.md +++ b/doc/development/create-a-model-tf.md @@ -5,9 +5,9 @@ If you'd like to create a new model that isn't covered by the existing DeePMD-ki To incorporate your custom model you'll need to: 1. Register and implement new components (e.g. descriptor) in a Python file. You may also want to register new TensorFlow OPs if necessary. -1. Register new arguments for user inputs. -1. Package new codes into a Python package. -1. Test new models. +2. Register new arguments for user inputs. +3. Package new codes into a Python package. +4. Test new models. ## Design a new component diff --git a/doc/development/type-embedding.md b/doc/development/type-embedding.md index b3de75aaf3..10eeed6ee9 100644 --- a/doc/development/type-embedding.md +++ b/doc/development/type-embedding.md @@ -66,7 +66,7 @@ In trainer.py, it will parse the parameter from the input JSON file. If a `type_ ### model (model/ener.py) -When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of \[$\text{ntypes} \times \text{nchanl}$\] dimensions). 
We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. +When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of [$\text{ntypes} \times \text{nchanl}$] dimensions). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. ### embedding net (descriptor/se\*.py) @@ -84,8 +84,8 @@ build -> _pass_filter -> _filter -> _filter_lower ### fitting net (fit/ener.py) -In `fitting net`, it takes the descriptor vector as input, whose dimension is \[natoms, $M_1\times M_2$\]. Because we need to involve information on the centric atom in this step, we need to generate a matrix named `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch them. After that, call `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input goes through `fitting` net\` to get predicted energy. +In `fitting net`, it takes the descriptor vector as input, whose dimension is [natoms, $M_1\times M_2$]. 
Because we need to involve information on the centric atom in this step, we need to generate a matrix named `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch them. After that, call `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input goes through `fitting` net` to get predicted energy. -:::\{note} +:::{note} You can't apply the compression method while using atom-type embedding. ::: diff --git a/doc/env.md b/doc/env.md index d2b25c2ddd..1688e0af9c 100644 --- a/doc/env.md +++ b/doc/env.md @@ -1,12 +1,12 @@ # Runtime environment variables -:::\{note} +:::{note} For build-time environment variables, see [Install from source code](./install/install-from-source.md). ::: ## All interfaces -:::\{envvar} DP_INTER_OP_PARALLELISM_THREADS +:::{envvar} DP_INTER_OP_PARALLELISM_THREADS **Alias**: `TF_INTER_OP_PARALLELISM_THREADS` **Default**: `0` @@ -15,7 +15,7 @@ Control parallelism within TensorFlow (when TensorFlow is built against Eigen) a See [How to control the parallelism of a job](./troubleshooting/howtoset_num_nodes.md) for details. ::: -:::\{envvar} DP_INTRA_OP_PARALLELISM_THREADS +:::{envvar} DP_INTRA_OP_PARALLELISM_THREADS **Alias**: `TF_INTRA_OP_PARALLELISM_THREADS`\*\* **Default**: `0` @@ -35,42 +35,42 @@ See [How to control the parallelism of a job](./troubleshooting/howtoset_num_nod ## Python interface only -:::\{envvar} DP_INTERFACE_PREC +:::{envvar} DP_INTERFACE_PREC **Choices**: `high`, `low`; **Default**: `high` Control high (double) or low (float) precision of training. 
::: -:::\{envvar} DP_AUTO_PARALLELIZATION +:::{envvar} DP_AUTO_PARALLELIZATION **Choices**: `0`, `1`; **Default**: `0` {{ tensorflow_icon }} Enable auto parallelization for CPU operators. ::: -:::\{envvar} DP_JIT +:::{envvar} DP_JIT **Choices**: `0`, `1`; **Default**: `0` {{ tensorflow_icon }} Enable JIT. Note that this option may either improve or decrease the performance. Requires TensorFlow to support JIT. ::: -:::\{envvar} DP_INFER_BATCH_SIZE +:::{envvar} DP_INFER_BATCH_SIZE **Default**: `1024` on CPUs and as maximum as possible until out-of-memory on GPUs Inference batch size, calculated by multiplying the number of frames with the number of atoms. ::: -:::\{envvar} DP_BACKEND +:::{envvar} DP_BACKEND **Default**: `tensorflow` Default backend. ::: -:::\{envvar} NUM_WORKERS +:::{envvar} NUM_WORKERS **Default**: 4 or the number of cores (whichever is smaller) @@ -83,14 +83,14 @@ See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for detai These environment variables also apply to third-party programs using the C++ interface, such as [LAMMPS](./third-party/lammps-command.md). -:::\{envvar} DP_PLUGIN_PATH +:::{envvar} DP_PLUGIN_PATH **Type**: List of paths, split by `:` on Unix and `;` on Windows List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows. ::: -:::\{envvar} DP_PROFILER +:::{envvar} DP_PROFILER {{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend. 
diff --git a/doc/freeze/compress.md b/doc/freeze/compress.md index e83f14e5fd..d827c71525 100644 --- a/doc/freeze/compress.md +++ b/doc/freeze/compress.md @@ -1,6 +1,6 @@ # Compress a model {{ tensorflow_icon }} {{ pytorch_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: @@ -58,13 +58,15 @@ If the number of neighbors of an atom is smaller than $N_c$, the corresponding p In practice, if the real number of neighbors is significantly smaller than $N_c$, a notable operation is spent on the multiplication of padding zeros. In the compressed DP model, the number of neighbors is precisely indexed at the tabulated inference stage, further saving computational costs.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + ## Instructions Once the frozen model is obtained from DeePMD-kit, we can get the neural network structure and its parameters (weights, biases, etc.) 
from the trained model, and compress it in the following way: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash dp compress -i graph.pb -o graph-compress.pb @@ -72,7 +74,7 @@ dp compress -i graph.pb -o graph-compress.pb ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt compress -i model.pth -o model-compress.pth @@ -127,5 +129,3 @@ See the documentation of a specific descriptor to see whether it supports model When compressing models in the PyTorch backend, the customized OP library for the Python interface must be installed when [freezing the model](../freeze/freeze.md). The customized OP library for the Python interface can be installed by setting environment variable {envvar}`DP_ENABLE_PYTORCH` to `1` during [installation](../install/install-from-source.md). - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/freeze/freeze.md b/doc/freeze/freeze.md index 70f71c810b..20f02177c6 100644 --- a/doc/freeze/freeze.md +++ b/doc/freeze/freeze.md @@ -3,9 +3,9 @@ The trained neural network is extracted from a checkpoint and dumped into a model file. 
This process is called "freezing" a model. To freeze a model, typically one does -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash $ dp freeze -o model.pb @@ -16,7 +16,7 @@ The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorf ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash $ dp --pt freeze -o model.pth @@ -35,7 +35,7 @@ The output model is called `model_branch1.pth`, which is the specifically frozen ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} ```bash $ dp --pd freeze -o model diff --git a/doc/inference/cxx.md b/doc/inference/cxx.md index 2367261c09..ec8a3248a1 100644 --- a/doc/inference/cxx.md +++ b/doc/inference/cxx.md @@ -1,6 +1,6 @@ # C/C++ interface -:::\{note} +:::{note} See [Environment variables](../env.md) for the runtime environment variables. ::: diff --git a/doc/inference/nodejs.md b/doc/inference/nodejs.md index f7059dd874..abe9dc36ab 100644 --- a/doc/inference/nodejs.md +++ b/doc/inference/nodejs.md @@ -1,6 +1,6 @@ # Node.js interface -:::\{note} +:::{note} See [Environment variables](../env.md) for the runtime environment variables. 
::: @@ -32,12 +32,12 @@ energy = dp.compute(energy, v_forces, v_virials, v_coord, v_atype, v_cell); console.log("energy:", energy); console.log( - "forces:", - [...Array(v_forces.size()).keys()].map((i) => v_forces.get(i)), + "forces:", + [...Array(v_forces.size()).keys()].map((i) => v_forces.get(i)), ); console.log( - "virials:", - [...Array(v_virials.size()).keys()].map((i) => v_virials.get(i)), + "virials:", + [...Array(v_virials.size()).keys()].map((i) => v_virials.get(i)), ); ``` diff --git a/doc/inference/python.md b/doc/inference/python.md index ff8d5f2cc0..361db7b64f 100644 --- a/doc/inference/python.md +++ b/doc/inference/python.md @@ -1,6 +1,6 @@ # Python interface -:::\{note} +:::{note} See [Environment variables](../env.md) for the runtime environment variables. ::: @@ -53,7 +53,7 @@ Otherwise, TensorFlow or PyTorch will never release the memory, and this may lea ## External neighbor list algorithm {{ tensorflow_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: diff --git a/doc/install/build-conda.md b/doc/install/build-conda.md index cdd70c5cf8..e54849f75d 100644 --- a/doc/install/build-conda.md +++ b/doc/install/build-conda.md @@ -1,7 +1,7 @@ # Building conda packages -::::\{danger} -:::\{deprecated} 3.0.0 +::::{danger} +:::{deprecated} 3.0.0 The official channel has been deprecated since 3.0.0. Refer to [conda-forge documentation](https://conda-forge.org/docs/maintainer/adding_pkgs/) for how to contribute and build packages locally. 
::: diff --git a/doc/install/easy-install-dev.md b/doc/install/easy-install-dev.md index 428abb2745..9a0154320e 100644 --- a/doc/install/easy-install-dev.md +++ b/doc/install/easy-install-dev.md @@ -22,7 +22,7 @@ pip install -U --pre deepmd-kit[gpu,cu12,lmp,torch] --extra-index-url https://de ## Download pre-compiled C Library {{ tensorflow_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md index e7358ed373..32650b4a80 100644 --- a/doc/install/easy-install.md +++ b/doc/install/easy-install.md @@ -4,12 +4,12 @@ There are various easy methods to install DeePMD-kit. Choose one that you prefer After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be available to execute. You can try `dp -h` and `lmp -h` to see the help. `mpirun` is also available considering you may want to train models or run LAMMPS in parallel. -:::\{note} +:::{note} Note: The off-line packages and conda packages require the [GNU C Library](https://www.gnu.org/software/libc/) 2.17 or above. The GPU version requires [compatible NVIDIA driver](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#minor-version-compatibility) to be installed in advance. It is possible to force conda to [override detection](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-virtual.html#overriding-detected-packages) when installation, but these requirements are still necessary during runtime. You can refer to [DeepModeling conda FAQ](https://docs.deepmodeling.com/faq/conda.html) for more information. ::: -:::\{note} +:::{note} Python 3.10 or above is required for Python interface. 
::: @@ -57,8 +57,8 @@ Read [conda-forge FAQ](https://conda-forge.org/docs/user/tipsandtricks.html#inst ### Official channel (deprecated) -::::\{danger} -:::\{deprecated} 3.0.0 +::::{danger} +:::{deprecated} 3.0.0 The official channel has been deprecated since 3.0.0, due to the challenging work of building dependencies for [multiple backends](../backend.md). Old packages will still be available at https://conda.deepmodeling.com. Maintainers will build packages in the conda-forge organization together with other conda-forge members. @@ -85,13 +85,13 @@ docker pull ghcr.io/deepmodeling/deepmd-kit:2.2.8_cuda12.0_gpu [Create a new environment](https://docs.deepmodeling.com/faq/conda.html#how-to-create-a-new-conda-pip-environment), and then execute the following command: -:::::::\{tab-set} +:::::::{tab-set} -::::::\{tab-item} TensorFlow {{ tensorflow_icon }} +::::::{tab-item} TensorFlow {{ tensorflow_icon }} -:::::\{tab-set} +:::::{tab-set} -::::\{tab-item} CUDA 12 +::::{tab-item} CUDA 12 ```bash pip install deepmd-kit[gpu,cu12] @@ -101,7 +101,7 @@ pip install deepmd-kit[gpu,cu12] :::: -::::\{tab-item} CPU +::::{tab-item} CPU ```bash pip install deepmd-kit[cpu] @@ -113,11 +113,11 @@ pip install deepmd-kit[cpu] :::::: -::::::\{tab-item} PyTorch {{ pytorch_icon }} +::::::{tab-item} PyTorch {{ pytorch_icon }} -:::::\{tab-set} +:::::{tab-set} -::::\{tab-item} CUDA 12 +::::{tab-item} CUDA 12 ```bash pip install deepmd-kit[torch] @@ -125,7 +125,7 @@ pip install deepmd-kit[torch] :::: -::::\{tab-item} CPU +::::{tab-item} CPU ```bash pip install torch --index-url https://download.pytorch.org/whl/cpu @@ -138,11 +138,11 @@ pip install deepmd-kit :::::: -::::::\{tab-item} JAX {{ jax_icon }} +::::::{tab-item} JAX {{ jax_icon }} -:::::\{tab-set} +:::::{tab-set} -::::\{tab-item} CUDA 12 +::::{tab-item} CUDA 12 ```bash pip install deepmd-kit[jax] jax[cuda12] @@ -150,7 +150,7 @@ pip install deepmd-kit[jax] jax[cuda12] :::: -::::\{tab-item} CPU +::::{tab-item} CPU ```bash pip install 
deepmd-kit[jax] @@ -166,11 +166,11 @@ Switch to the TensorFlow {{ tensorflow_icon }} tab for more information. :::::: -::::::\{tab-item} Paddle {{ paddle_icon }} +::::::{tab-item} Paddle {{ paddle_icon }} -:::::\{tab-set} +:::::{tab-set} -::::\{tab-item} CUDA 12.6 +::::{tab-item} CUDA 12.6 ```bash # release version @@ -182,7 +182,7 @@ pip install deepmd-kit :::: -::::\{tab-item} CPU +::::{tab-item} CPU ```bash # release version @@ -202,7 +202,7 @@ pip install deepmd-kit The supported platform includes Linux x86-64 and aarch64 with GNU C Library 2.28 or above, macOS x86-64 and arm64, and Windows x86-64. -:::\{Warning} +:::{Warning} If your platform is not supported, or you want to build against the installed backends, or you want to enable ROCM support, please [build from source](install-from-source.md). ::: diff --git a/doc/install/install-from-c-library.md b/doc/install/install-from-c-library.md index e6f0de3eb6..4568cdb6c9 100644 --- a/doc/install/install-from-c-library.md +++ b/doc/install/install-from-c-library.md @@ -1,6 +1,6 @@ # Install from pre-compiled C library {{ tensorflow_icon }} {{ jax_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, JAX {{ jax_icon }} ::: @@ -30,7 +30,7 @@ make install Then the i-PI driver `dp_ipi` will be built and installed. One can also follow the manual [Install LAMMPS](./install-lammps.md) and/or [Install GROMACS](./install-gromacs.md). 
-:::\{cmake:variable} DEEPMD_C_ROOT +:::{cmake:variable} DEEPMD_C_ROOT **Type**: `Path` diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index 2a26aa87ed..1e03563c66 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -36,9 +36,9 @@ source $deepmd_venv/bin/activate pip install --upgrade pip ``` -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} The full instruction to install TensorFlow can be found on the official [TensorFlow website](https://www.tensorflow.org/install/pip). TensorFlow 2.7 or later is supported. @@ -64,7 +64,7 @@ One can also [build the TensorFlow Python interface from source](https://www.ten ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} To install PyTorch, run @@ -78,7 +78,7 @@ One can also [use conda](https://docs.deepmodeling.com/faq/conda.html) to instal ::: -:::\{tab-item} JAX {{ jax_icon }} +:::{tab-item} JAX {{ jax_icon }} To install [JAX AI Stack](https://github.com/jax-ml/jax-ai-stack), run @@ -93,7 +93,7 @@ One can also [use conda](https://docs.deepmodeling.com/faq/conda.html) to instal ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} To install Paddle, run @@ -146,15 +146,15 @@ gcc --version By default, DeePMD-kit uses C++ 14, so the compiler needs to support C++ 14 (GCC 5 or later). The backend package may use a higher C++ standard version, and thus require a higher compiler version (for example, GCC 7 for C++ 17). -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} Note that TensorFlow may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by TensorFlow. 
It is recommended to use [the same compiler version as TensorFlow](https://www.tensorflow.org/install/source#tested_build_configurations), which can be printed by `python -c "import tensorflow;print(tensorflow.version.COMPILER_VERSION)"`. ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} You can set the environment variable `export DP_ENABLE_PYTORCH=1` to enable customized C++ OPs in the PyTorch backend. Note that PyTorch may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by PyTorch. @@ -172,21 +172,21 @@ pip install . One may set the following environment variables before executing `pip`: -:::\{envvar} DP_VARIANT +:::{envvar} DP_VARIANT **Choices**: `cpu`, `cuda`, `rocm`; **Default**: `cpu` Build CPU variant or GPU variant with CUDA or ROCM support. ::: -:::\{envvar} CUDAToolkit_ROOT +:::{envvar} CUDAToolkit_ROOT **Type**: Path; **Default**: Detected automatically The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. ::: -:::\{envvar} ROCM_ROOT +:::{envvar} ROCM_ROOT **Type**: Path; **Default**: Detected automatically @@ -194,49 +194,49 @@ The path to the ROCM toolkit directory. If `ROCM_ROOT` is not set, it will look ::: -:::\{envvar} DP_ENABLE_TENSORFLOW +:::{envvar} DP_ENABLE_TENSORFLOW **Choices**: `0`, `1`; **Default**: `1` {{ tensorflow_icon }} Enable the TensorFlow backend. ::: -:::\{envvar} DP_ENABLE_PYTORCH +:::{envvar} DP_ENABLE_PYTORCH **Choices**: `0`, `1`; **Default**: `0` {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. ::: -:::\{envvar} TENSORFLOW_ROOT +:::{envvar} TENSORFLOW_ROOT **Type**: Path; **Default**: Detected automatically {{ tensorflow_icon }} The path to TensorFlow Python library. 
If not given, by default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against. ::: -:::\{envvar} PYTORCH_ROOT +:::{envvar} PYTORCH_ROOT **Type**: Path; **Default**: Detected automatically {{ pytorch_icon }} The path to PyTorch Python library. If not given, by default, the installer only finds PyTorch under the user site-package directory (`site.getusersitepackages()`) or the system site-package directory (`sysconfig.get_path("purelib")`) due to the limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest PyTorch (or the environment variable `PYTORCH_VERSION` if given) from PyPI will be built against. ::: -:::\{envvar} DP_ENABLE_NATIVE_OPTIMIZATION +:::{envvar} DP_ENABLE_NATIVE_OPTIMIZATION **Choices**: `0`, `1`; **Default**: `0` Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. ::: -:::\{envvar} CMAKE_ARGS +:::{envvar} CMAKE_ARGS **Type**: string Additional CMake arguments. ::: -:::\{envvar} FLAGS +:::{envvar} FLAGS ``=`CXX`, `CUDA` or `HIP` @@ -267,7 +267,7 @@ It will print the help information like ### Install horovod and mpi4py {{ tensorflow_icon }} -:::\{warning} +:::{warning} Horovod has not released a new version for a long time. As of December 2025, the latest Horovod release does not support the latest TensorFlow versions. You can check the patches required to support the latest TensorFlow at [conda-forge/horovod-feedstock](https://github.com/conda-forge/horovod-feedstock/blob/main/recipe/meta.yaml). 
@@ -295,20 +295,20 @@ $ horovodrun --check-build Horovod v0.22.1: Available Frameworks: -[X] TensorFlow -[X] PyTorch -[ ] MXNet + [X] TensorFlow + [X] PyTorch + [ ] MXNet Available Controllers: -[X] MPI -[X] Gloo + [X] MPI + [X] Gloo Available Tensor Operations: -[X] NCCL -[ ] DDL -[ ] CCL -[X] MPI -[X] Gloo + [X] NCCL + [ ] DDL + [ ] CCL + [X] MPI + [X] Gloo ``` Since version 2.0.1, Horovod and mpi4py with MPICH support are shipped with the installer. @@ -321,9 +321,9 @@ If one does not need to use DeePMD-kit with LAMMPS or i-PI, then the python inte ### Install Backends' C++ interface (optional) -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} The C++ interfaces of both TensorFlow and JAX backends are based on the TensorFlow C++ library. @@ -335,14 +335,14 @@ First, the C++ interface of TensorFlow should be installed. It is noted that the ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} If you have installed PyTorch using pip, you can use libtorch inside the PyTorch Python package. You can also download libtorch prebuilt library from the [PyTorch website](https://pytorch.org/get-started/locally/). ::: -:::\{tab-item} JAX {{ jax_icon }} +:::{tab-item} JAX {{ jax_icon }} The JAX backend only depends on the TensorFlow C API, which is included in both TensorFlow C++ library and [TensorFlow C library](https://www.tensorflow.org/install/lang_c). If you want to use the TensorFlow C++ library, just enable the TensorFlow backend (which depends on the TensorFlow C++ library) and nothing else needs to do. 
@@ -351,7 +351,7 @@ download the TensorFlow C library from [this page](https://www.tensorflow.org/in ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} If you want to use C++ interface of Paddle, you need to compile the Paddle inference library(C++ interface) manually from the [linux-compile-by-make](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/compile/linux-compile-by-make.html), then use the `.so` and `.a` files in `Paddle/build/paddle_inference_install_dir/`. @@ -385,9 +385,9 @@ You must enable at least one backend. If you enable two or more backends, these backend libraries must be built in a compatible way, e.g. using the same `_GLIBCXX_USE_CXX11_ABI` flag. We recommend using [conda packages](https://docs.deepmodeling.com/faq/conda.html) from [conda-forge](https://conda-forge.org), which are usually compatible to each other. -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} I assume you have activated the TensorFlow Python environment and want to install DeePMD-kit into path `$deepmd_root`, then execute CMake @@ -399,7 +399,7 @@ If you specify `-DUSE_TF_PYTHON_LIBS=FALSE`, you need to give the location where ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} I assume you have installed the PyTorch (either Python or C++ interface) to `$torch_root`, then execute CMake @@ -415,7 +415,7 @@ cmake -DENABLE_PYTORCH=TRUE -DUSE_PT_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=$de ::: -:::\{tab-item} JAX {{ jax_icon }} +:::{tab-item} JAX {{ jax_icon }} If you want to use the TensorFlow C++ library, just enable the TensorFlow backend and nothing else needs to do. 
If you want to use the TensorFlow C library and disable the TensorFlow backend, set {cmake:variable}`ENABLE_JAX` to `ON` and `CMAKE_PREFIX_PATH` to the root directory of the [TensorFlow C library](https://www.tensorflow.org/install/lang_c). @@ -426,7 +426,7 @@ cmake -DENABLE_JAX=ON -D CMAKE_PREFIX_PATH=${tensorflow_c_root} .. ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} I assume you have get the Paddle inference library(C++ interface) to `$PADDLE_INFERENCE_DIR`, then execute CMake @@ -440,7 +440,7 @@ cmake -DENABLE_PADDLE=ON -DPADDLE_INFERENCE_DIR=$PADDLE_INFERENCE_DIR -DCMAKE_IN One may add the following CMake variables to `cmake` using the [`-D =` option](https://cmake.org/cmake/help/latest/manual/cmake.1.html#cmdoption-cmake-D): -:::\{cmake:variable} ENABLE_TENSORFLOW +:::{cmake:variable} ENABLE_TENSORFLOW **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -449,7 +449,7 @@ Setting this option to `ON` will also set {cmake:variable}`ENABLE_JAX` to `ON`. ::: -:::\{cmake:variable} ENABLE_PYTORCH +:::{cmake:variable} ENABLE_PYTORCH **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -457,7 +457,7 @@ Setting this option to `ON` will also set {cmake:variable}`ENABLE_JAX` to `ON`. 
::: -:::\{cmake:variable} ENABLE_JAX +:::{cmake:variable} ENABLE_JAX **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -467,7 +467,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::\{cmake:variable} ENABLE_PADDLE +:::{cmake:variable} ENABLE_PADDLE **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -475,7 +475,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::\{cmake:variable} TENSORFLOW_ROOT +:::{cmake:variable} TENSORFLOW_ROOT **Type**: `PATH` @@ -483,7 +483,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::\{cmake:variable} PADDLE_INFERENCE_DIR +:::{cmake:variable} PADDLE_INFERENCE_DIR **Type**: `PATH` @@ -491,7 +491,7 @@ If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is use ::: -:::\{cmake:variable} CMAKE_INSTALL_PREFIX +:::{cmake:variable} CMAKE_INSTALL_PREFIX **Type**: `PATH` @@ -500,7 +500,7 @@ See also [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAK ::: -:::\{cmake:variable} USE_CUDA_TOOLKIT +:::{cmake:variable} USE_CUDA_TOOLKIT **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -508,7 +508,7 @@ If `TRUE`, Build GPU support with CUDA toolkit. ::: -:::\{cmake:variable} CUDAToolkit_ROOT +:::{cmake:variable} CUDAToolkit_ROOT **Type**: `PATH`, **Default**: [Search automatically](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) @@ -517,7 +517,7 @@ See also [CMake documentation](https://cmake.org/cmake/help/latest/module/FindCU ::: -:::\{cmake:variable} USE_ROCM_TOOLKIT +:::{cmake:variable} USE_ROCM_TOOLKIT **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -525,7 +525,7 @@ If `TRUE`, Build GPU support with ROCM toolkit. 
::: -:::\{cmake:variable} CMAKE_HIP_COMPILER_ROCM_ROOT +:::{cmake:variable} CMAKE_HIP_COMPILER_ROCM_ROOT **Type**: `PATH`, **Default**: [Search automatically](https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html) @@ -534,7 +534,7 @@ See also [ROCm documentation](https://rocm.docs.amd.com/en/latest/conceptual/cma ::: -:::\{cmake:variable} LAMMPS_SOURCE_ROOT +:::{cmake:variable} LAMMPS_SOURCE_ROOT **Type**: `PATH` @@ -545,7 +545,7 @@ If not assigned, the plugin mode will not be enabled. ::: -:::\{cmake:variable} USE_TF_PYTHON_LIBS +:::{cmake:variable} USE_TF_PYTHON_LIBS **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -554,7 +554,7 @@ There's no need for building TensorFlow's C++ interface. ::: -:::\{cmake:variable} USE_PT_PYTHON_LIBS +:::{cmake:variable} USE_PT_PYTHON_LIBS **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -563,7 +563,7 @@ There's no need for downloading PyTorch's C++ libraries. ::: -:::\{cmake:variable} ENABLE_NATIVE_OPTIMIZATION +:::{cmake:variable} ENABLE_NATIVE_OPTIMIZATION **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` @@ -573,8 +573,7 @@ Do not enable it if generated code will run on different CPUs. ::: - -:::\{cmake:variable} CMAKE\_\_FLAGS +:::{cmake:variable} CMAKE__FLAGS (``=`CXX`, `CUDA` or `HIP`) @@ -585,7 +584,7 @@ See also [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAK ::: -______________________________________________________________________ +--- If the CMake has been executed successfully, then run the following make commands to build the package: diff --git a/doc/install/install-gromacs.md b/doc/install/install-gromacs.md index 02b0b4b194..90ed73841c 100644 --- a/doc/install/install-gromacs.md +++ b/doc/install/install-gromacs.md @@ -15,7 +15,6 @@ where `deepmd_kit_root` is the directory where the latest version of DeePMD-kit - ## Compile GROMACS with deepmd-kit @@ -31,10 +30,10 @@ mkdir build cd build cmake3 .. 
-DCMAKE_CXX_STANDARD=14 \ # not required, but c++14 seems to be more compatible with higher version of tensorflow --DGMX_MPI=ON \ - -DGMX_GPU=CUDA \ # Gromacs on ROCm has not been fully developed yet --DCUDAToolkit_ROOT=/path/to/cuda \ - -DCMAKE_INSTALL_PREFIX=/path/to/gromacs-2020.2-deepmd + -DGMX_MPI=ON \ + -DGMX_GPU=CUDA \ # Gromacs on ROCm has not been fully developed yet + -DCUDAToolkit_ROOT=/path/to/cuda \ + -DCMAKE_INSTALL_PREFIX=/path/to/gromacs-2020.2-deepmd make -j make install ``` diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md index 30f3aad7de..cb65188002 100644 --- a/doc/install/install-lammps.md +++ b/doc/install/install-lammps.md @@ -119,7 +119,7 @@ If everything works fine, you will end up with an executable `${deepmd_root}/bin ${deepmd_root}/bin/lmp -h ``` -:::\{note} +:::{note} If `${tensorflow_root}`, `${deepmd_root}`, or the path to TensorFlow Python package if applicable is different from the prefix of LAMMPS, you need to append the library path to [`RUNPATH`](https://man7.org/linux/man-pages/man8/ld.so.8.html) of `liblammps.so`. For example, use patchelf >= 0.13 ```sh diff --git a/doc/install/install-nodejs.md b/doc/install/install-nodejs.md index c6ec70d5cb..7137723c31 100644 --- a/doc/install/install-nodejs.md +++ b/doc/install/install-nodejs.md @@ -18,8 +18,8 @@ When using CMake to [build DeePMD-kit from source](./install-from-source.md), se ```sh cmake -D BUILD_NODEJS_IF=ON \ - -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ - .. # and other arguments + -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ + .. 
# and other arguments make make install ``` diff --git a/doc/model/change-bias.md b/doc/model/change-bias.md index 2f39ff0823..2a9b098606 100644 --- a/doc/model/change-bias.md +++ b/doc/model/change-bias.md @@ -1,6 +1,6 @@ # Change the model output bias for trained model {{ tensorflow_icon }} {{ pytorch_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: @@ -12,9 +12,9 @@ or manually setting the output bias. The `dp change-bias` command supports the following methods for adjusting the bias: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow Backend {{ tensorflow_icon }} +:::{tab-item} TensorFlow Backend {{ tensorflow_icon }} **Changing bias using provided systems for trained checkpoint:** @@ -30,7 +30,7 @@ dp --tf change-bias model.ckpt -b -92.523 -187.66 -o model_updated.pb ::: -:::\{tab-item} PyTorch Backend {{ pytorch_icon }} +:::{tab-item} PyTorch Backend {{ pytorch_icon }} **Changing bias using provided systems for trained `.pt`/`.pth` models:** diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md index 6c7632497f..466a4de4f2 100644 --- a/doc/model/dpa2.md +++ b/doc/model/dpa2.md @@ -1,6 +1,6 @@ # Descriptor DPA-2 {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, Paddle {{ paddle_icon }}, DP {{ dpmodel_icon }} ::: diff --git a/doc/model/dpa3.md b/doc/model/dpa3.md index 81b1d3ad99..0ff46c438f 100644 --- a/doc/model/dpa3.md +++ b/doc/model/dpa3.md @@ -1,6 +1,6 @@ # Descriptor DPA3 {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -42,9 +42,9 @@ Note that we set `float32` in all DPA3 models, while `float64` in other models b ## Requirements of installation from source code {{ pytorch_icon }} {{ paddle_icon }} -::::\{tab-set} 
+::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} To run the DPA3 model on LAMMPS via source code installation (users can skip this step if using [easy installation](../install/easy-install.md)), @@ -59,7 +59,7 @@ otherwise the communication between GPU cards falls back to the slower CPU imple ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} The customized OP library for the Python interface can be installed by diff --git a/doc/model/dplr.md b/doc/model/dplr.md index 60a6a49f19..61327bb55e 100644 --- a/doc/model/dplr.md +++ b/doc/model/dplr.md @@ -1,6 +1,6 @@ # Deep potential long-range (DPLR) {{ tensorflow_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: @@ -36,6 +36,8 @@ $L$ is the cutoff in Fourier space and $S(m)$, the structure factor, is given by where $\imath = \sqrt{-1}$ denotes the imaginary unit, $\boldsymbol r_i$ indicates ion coordinates, $q_i$ is the charge of the ion $i$, and $W_n$ is the $n$-th Wannier centroid (WC) which can be obtained from a separated [dipole model](./train-fitting-tensor.md). It can be proved that the error in the electrostatic energy introduced by the Gaussian approximations is dominated by a summation of dipole-quadrupole interactions that decay as $r^{-4}$, where $r$ is the distance between the dipole and quadrupole.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. 
York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + ## Train a deep Wannier model for Wannier centroids We use the deep Wannier model (DW) to represent the relative position of the Wannier centroid (WC) with the atom with which it is associated. One may consult the introduction of the [dipole model](train-fitting-tensor.md) for a detailed introduction. An example input `wc.json` and a small dataset `data` for tutorial purposes can be found in @@ -190,11 +192,9 @@ fix ID group-ID style_name keyword value ... ``` - - - ID, group-ID are documented in :doc:`fix ` command -- style_name = _dplr_ +- style\_name = _dplr_ - three or more keyword/value pairs may be appended ``` @@ -266,6 +266,4 @@ The MD simulation lasts for only 20 steps. If one runs a longer simulation, it w Another restriction that should be noted is that the energies printed at the zero steps are not correct. This is because at the zero steps the position of the WC has not been updated with the DW model. The energies printed in later steps are correct. -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - [1]: https://arxiv.org/abs/2112.13327 diff --git a/doc/model/dprc.md b/doc/model/dprc.md index d6e25faae2..9f3eee244d 100644 --- a/doc/model/dprc.md +++ b/doc/model/dprc.md @@ -1,6 +1,6 @@ # Deep Potential - Range Correction (DPRc) {{ tensorflow_icon }} {{ pytorch_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} ::: @@ -44,6 +44,8 @@ The fitting network is revised to remove energy bias from MM atoms: where $\mathbf{0}$ is a zero matrix. It is worth mentioning that usage of DPRc is not limited to its initial design for QM/MM correction and can be expanded to any similar interaction.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + See the [JCTC paper](https://doi.org/10.1021/acs.jctc.1c00201) for details. ## Training data @@ -66,9 +68,9 @@ In a DPRc model, QM atoms and MM atoms have different atom types. 
Assuming we ha As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\text{QM/MM}$ within the cutoff, so we use a hybrid descriptor to describe them separately: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```json "descriptor" :{ @@ -101,7 +103,7 @@ As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\te ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```json "descriptor" :{ @@ -179,7 +181,7 @@ The DPRc model has the best practices with the [AMBER](../third-party/out-of-dee ## Pairwise DPRc -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: @@ -198,18 +200,9 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used { "model": { "type": "pairwise_dprc", - "type_map": [ - "C", - "P", - "O", - "H", - "OW", - "HW" - ], + "type_map": ["C", "P", "O", "H", "OW", "HW"], "type_embedding": { - "neuron": [ - 8 - ], + "neuron": [8], "precision": "float32" }, "qm_model": { @@ -219,11 +212,7 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used "rcut_smth": 0.5, "rcut": 9.0, "attn_layer": 0, - "neuron": [ - 25, - 50, - 100 - ], + "neuron": [25, 50, 100], "resnet_dt": false, "axis_neuron": 12, "precision": "float32", @@ -231,21 +220,10 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used }, "fitting_net": { "type": "ener", - "neuron": [ - 240, - 240, - 240 - ], + "neuron": [240, 240, 240], "resnet_dt": true, "precision": "float32", - "atom_ener": [ - null, - null, - null, - null, - 0.0, - 0.0 - ], + "atom_ener": [null, null, null, null, 0.0, 0.0], "seed": 1 } }, @@ -256,89 +234,35 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used "rcut_smth": 0.5, "rcut": 6.0, "attn_layer": 0, - "neuron": [ - 25, - 50, - 100 - ], + "neuron": [25, 50, 100], "resnet_dt": false, 
"axis_neuron": 12, "set_davg_zero": true, "exclude_types": [ - [ - 0, - 0 - ], - [ - 0, - 1 - ], - [ - 0, - 2 - ], - [ - 0, - 3 - ], - [ - 1, - 1 - ], - [ - 1, - 2 - ], - [ - 1, - 3 - ], - [ - 2, - 2 - ], - [ - 2, - 3 - ], - [ - 3, - 3 - ], - [ - 4, - 4 - ], - [ - 4, - 5 - ], - [ - 5, - 5 - ] + [0, 0], + [0, 1], + [0, 2], + [0, 3], + [1, 1], + [1, 2], + [1, 3], + [2, 2], + [2, 3], + [3, 3], + [4, 4], + [4, 5], + [5, 5] ], "precision": "float32", "seed": 1 }, "fitting_net": { "type": "ener", - "neuron": [ - 240, - 240, - 240 - ], + "neuron": [240, 240, 240], "resnet_dt": true, "seed": 1, "precision": "float32", - "atom_ener": [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 - ] + "atom_ener": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] } } } @@ -349,5 +273,3 @@ The pairwise model needs information for MM residues. The model uses [`aparam`](../data/system.md) with the shape of `nframes x natoms` to get the residue index. The QM residue should always use `0` as the index. For example, `0 0 0 1 1 1 2 2 2` means these 9 atoms are grouped into one QM residue and two MM residues. - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/linear.md b/doc/model/linear.md index a6693e668e..47fdd1750b 100644 --- a/doc/model/linear.md +++ b/doc/model/linear.md @@ -1,6 +1,6 @@ ## Linear model {{ tensorflow_icon }} {{ pytorch_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: diff --git a/doc/model/overall.md b/doc/model/overall.md index 85d4eaa42b..cc72aa3887 100644 --- a/doc/model/overall.md +++ b/doc/model/overall.md @@ -24,6 +24,8 @@ From the above equation, one may compute the global property of the system by where $N$ is the number of atoms in a frame. For example, if $y_i$ represents the potential energy contribution of atom $i$, then $y$ gives the total potential energy of the frame.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + ## Instructions A model has two parts, a descriptor that maps atomic configuration to a set of symmetry invariant features, and a fitting net that takes descriptor as input and predicts the atomic contribution to the target physical property. 
It's defined in the {ref}`model ` section of the `input.json`, for example, @@ -47,21 +49,19 @@ The {ref}`type_map ` is optional, which provides the element nam DeePMD-kit implements the following descriptors: 1. [`se_e2_a`](train-se-e2-a.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes the distance between atoms as input. -1. [`se_e2_r`](train-se-e2-r.md): DeepPot-SE constructed from radial information of atomic configurations. The embedding takes the distance between atoms as input. -1. [`se_e3`](train-se-e3.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes angles between two neighboring atoms as input. -1. [`se_a_mask`](train-se-a-mask.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The input frames in one system can have a varied number of atoms. Input particles are padded with virtual particles of the same length. -1. `loc_frame`: Defines a local frame at each atom and compute the descriptor as local coordinates under this frame. -1. [`hybrid`](train-hybrid.md): Concate a list of descriptors to form a new descriptor. +2. [`se_e2_r`](train-se-e2-r.md): DeepPot-SE constructed from radial information of atomic configurations. The embedding takes the distance between atoms as input. +3. [`se_e3`](train-se-e3.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes angles between two neighboring atoms as input. +4. [`se_a_mask`](train-se-a-mask.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The input frames in one system can have a varied number of atoms. Input particles are padded with virtual particles of the same length. +5. `loc_frame`: Defines a local frame at each atom and compute the descriptor as local coordinates under this frame. +6. 
[`hybrid`](train-hybrid.md): Concate a list of descriptors to form a new descriptor. The fitting of the following physical properties is supported 1. [`ener`](train-energy.md): Fit the energy of the system. The force (derivative with atom positions), the virial (derivative with the box tensor) and the hessian (second-order derivative with atom positions) can also be trained. -:::\{warning} +:::{warning} Due to the restrictions of torch jit script, the models trained with hessian are not jitable so that the frozen models cannot output hessians. ::: 2. [`dipole`](train-fitting-tensor.md): The dipole moment. -1. [`polar`](train-fitting-tensor.md): The polarizability. - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). +3. [`polar`](train-fitting-tensor.md): The polarizability. 
diff --git a/doc/model/pairtab.md b/doc/model/pairtab.md index 3c062efedd..57fe23f5e9 100644 --- a/doc/model/pairtab.md +++ b/doc/model/pairtab.md @@ -1,6 +1,6 @@ # Interpolation or combination with a pairwise potential {{ tensorflow_icon }} {{ pytorch_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} ::: @@ -45,6 +45,8 @@ In the range $[r_a, r_b]$, the DP model smoothly switched off and the pairwise p where the scale $\alpha_s$ is a tunable scale of the interatomic distance $r_{ij}$. The pairwise potential $u^{\textrm{pair}}(r)$ is defined by a user-defined table that provides the value of $u^{\textrm{pair}}$ on an evenly discretized grid from 0 to the cutoff distance.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + DeePMD-kit also supports combination with a pairwise potential {{ tensorflow_icon }}: ```math @@ -60,7 +62,7 @@ in the order of Type_0-Type_0, Type_0-Type_1, ..., Type_0-Type_N, Type_1-Type_1, The interaction should be smooth at the cut-off distance. 
-:::\{note} +:::{note} In instances where the interaction at the cut-off distance is not delineated within the table file, extrapolation will be conducted utilizing the available interaction data. This extrapolative procedure guarantees a smooth transition from the table-provided value to `0` whenever feasible. ::: @@ -102,11 +104,9 @@ To combine with a pairwise potential, use the [linear model](./linear.md): The {ref}`rcut ` can be larger than that of the DP model. -:::\{note} +:::{note} The above example shows a example of combining D3 dispersion. However, it is more efficient to train a model using plain DFT calculations without the dispersion correction, and add the dispersion correction during the simulation via the LAMMPS [`pair_style dispersion/d3` command](https://docs.lammps.org/pair_dispersion_d3.html#pair-style-dispersion-d3-command). Training against data with dispersion directly is discouraged. See the [D3 dispersion section](../third-party/lammps-command.md#d3-dispersion) for details. ::: - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/sel.md b/doc/model/sel.md index 942cdccc22..5b85318dd9 100644 --- a/doc/model/sel.md +++ b/doc/model/sel.md @@ -6,9 +6,9 @@ All descriptors require to set `sel`, which means the expected maximum number of To determine a proper `sel`, one can calculate the neighbor stat of the training data before training: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```sh dp --tf neighbor-stat -s data -r 6.0 -t O H @@ -16,7 +16,7 @@ dp --tf neighbor-stat -s data -r 6.0 -t O H ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```sh dp --pt neighbor-stat -s data -r 6.0 -t O H @@ -24,7 +24,7 @@ dp --pt neighbor-stat -s data -r 6.0 -t O H ::: -:::\{tab-item} JAX {{ jax_icon }} +:::{tab-item} JAX {{ jax_icon }} ```sh dp --jax neighbor-stat -s data -r 6.0 -t O H @@ -32,7 +32,7 @@ dp --jax neighbor-stat -s data -r 6.0 -t O H ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} ```sh dp --pd neighbor-stat -s data -r 6.0 -t O H diff --git a/doc/model/show-model-info.md b/doc/model/show-model-info.md index 5bffacb871..67d82610de 100644 --- a/doc/model/show-model-info.md +++ b/doc/model/show-model-info.md @@ -33,35 +33,28 @@ dp show frozen_model.pth type-map descriptor fitting-net size Depending on the provided attributes and the model type, the output includes: - **Model Type** - - Logs whether the loaded model is a _singletask_ or _multitask_ model. - **model-branch** - - _Only available for multitask models._ - Lists all available model branches and the special `"RANDOM"` branch, which refers to a randomly initialized fitting net. - **type-map** - - For multitask models: Shows the type map for each branch. - For singletask models: Shows the model's type map. - **descriptor** - - For multitask models: Displays the descriptor parameter for each branch. - For singletask models: Displays the descriptor parameter. 
- **fitting-net** - - For multitask models: Shows the fitting network parameters for each branch. - For singletask models: Shows the fitting network parameters. - **size** - - Prints the number of parameters for each component (`descriptor`, `fitting-net`, etc.), as well as the total parameter count. - **observed-type** - - Displays the count and list of observed element types of the model during data statistics. - For multitask models, it shows the observed types for each branch. - Note: This info shows the types observed during training data statistics, which may differ from the type map. diff --git a/doc/model/train-energy-hessian.md b/doc/model/train-energy-hessian.md index 61ebd0530c..d77e7f3e88 100644 --- a/doc/model/train-energy-hessian.md +++ b/doc/model/train-energy-hessian.md @@ -1,6 +1,6 @@ # Fit energy Hessian {{ pytorch_icon }} -:::\{note} +:::{note} **Supported backends**: PyTorch {{ pytorch_icon }} ::: @@ -43,9 +43,9 @@ set.*/hessian.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of elements contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | Raw file | Unit | Shape | Description | -| ------- | ---------------- | ----------- | ------ | ----------------------------------- | ------------------------------------------------------- | -| hessian | Hessian matrices | hessian.npy | eV/Å^2 | Nframes * (Natoms * 3 * Natoms * 3) | Second-order derivatives of energies w.r.t coordinates. | +| ID | Property | Raw file | Unit | Shape | Description | +| ------- | ---------------- | ----------- | ------ | --------------------------------------- | ------------------------------------------------------- | +| hessian | Hessian matrices | hessian.npy | eV/Å^2 | Nframes \* (Natoms \* 3 \* Natoms \* 3) | Second-order derivatives of energies w.r.t coordinates. 
| Note that the `hessian.npy` should contain the **full** Hessian matrices with shape of `(3Natoms * 3Natoms)` for each frame, rather than the upper or lower triangular matrices with shape of `(3Natoms * (3Natoms + 1) / 2)` for each frame. @@ -53,9 +53,9 @@ Note that the `hessian.npy` should contain the **full** Hessian matrices with sh There are two approaches to training a Hessian model. The first method involves training the model from scratch using the same command as in the `ener` mode within the PyTorch backend: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -67,9 +67,9 @@ dp --pt train input.json The second approach is to train a Hessian model from a pretrained energy model, following the same command as the `finetune` strategy within the PyTorch backend: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json --finetune pretrained_energy.pt @@ -93,15 +93,15 @@ The detailed loss can be found in `lcurve.out`: ## Test the Model -:::\{warning} +:::{warning} A model trained with Hessian cannot be frozen. If freezing is enforced, the model will be treated as a standard energy model, and the frozen one will no longer be able to output Hessian predictions. 
::: If one do freeze and test a Hessian model using the commands: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash @@ -123,9 +123,9 @@ ${output_prefix}.v.out ${output_prefix}.v_peratom.out If one intends to use the trained model for Hessian predictions, then he/she is supposed to test the model directly without performing a freezing operation: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash diff --git a/doc/model/train-energy-spin.md b/doc/model/train-energy-spin.md index eee7d49991..52a470f2a6 100644 --- a/doc/model/train-energy-spin.md +++ b/doc/model/train-energy-spin.md @@ -1,13 +1,13 @@ # Fit spin energy {{ tensorflow_icon }} {{ pytorch_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, DP {{ dpmodel_icon }} ::: To train a model that takes additional spin information as input, you only need to modify the following sections to define the spin-specific settings, keeping other sections the same as the normal energy model's input script. -:::\{warning} +:::{warning} Note that when adding spin into the model, there will be some implicit modifications automatically done by the program: - In the TensorFlow backend, the `se_e2_a` descriptor will treat those atom types with spin as new (virtual) types, @@ -23,7 +23,7 @@ Note that when adding spin into the model, there will be some implicit modificat The spin settings are given by the {ref}`spin ` section, which sets the magnetism for each type of atoms as described in the following sections. -:::\{note} +:::{note} Note that the construction of spin settings is different between TensorFlow and PyTorch/DP. ::: @@ -68,11 +68,11 @@ See `se_e2_a` examples in `$deepmd_source_dir/examples/spin/se_e2_a/input_torch. 
between a virtual atom representing spin and its corresponding real atom for each atom type with spin. This factor is defined as the virtual distance divided by the magnitude of atomic spin for each atom type with spin. - The virtual coordinate is defined as the real coordinate plus spin * virtual_scale. + The virtual coordinate is defined as the real coordinate plus spin \* virtual_scale. List of float values with shape of `ntypes` or `ntypes_spin` or one single float value for all types, only used when {ref}`use_spin ` is True for each atom type. -:::\{note} +:::{note} It should be noted that the spin models in PyTorch/DP are capable of addressing scenarios where the spin approaches zero (indicating the virtual atom is in close proximity to the real atom) by adjusting the non-zero {ref}`env_protection ` parameter within the descriptor. @@ -88,7 +88,7 @@ $$L = p_e L_e + p_{fr} L_{fr} + p_{fm} L_{fm} + p_v L_v$$ where $L_e$, $L_{fr}$, $L_{fm}$ and $L_v$ denote the loss in energy, atomic force, magnatic force and virial, respectively. $p_e$, $p_{fr}$, $p_{fm}$ and $p_v$ give the prefactors of the energy, atomic force, magnatic force and virial losses. -:::\{note} +:::{note} Please note that the virial and atomic virial are not currently supported in spin models. ::: @@ -124,7 +124,7 @@ If one does not want to train with virial, then he/she may set the virial prefac ## Data format -:::\{note} +:::{note} Note that the spin data format is different between TensorFlow and PyTorch/DP. 
::: @@ -148,13 +148,13 @@ where $\bm{R}_{i^p}$, $\bm{R}_i$, and $\bm{S}_i$ denote the virtual atomic coord We list the details about spin system data format in TensorFlow backend: -| ID | Property | Raw file | Unit | Shape | Description | -| ------ | -------------------------- | ---------- | ---- | ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| type | Atom type indexes | type.raw | \\ | Natoms + Nspins | Integers that start with 0. The first `Natoms` entries represent real atom types, followed by `Nspins` entries representing virtual atom types. | -| coord | Coordinates | coord.raw | Å | Nframes * (Natoms + Nspins) * 3 | The first `3 \* Natoms` columns represent the coordinates of real atoms, followed by `3 \* Nspins` columns representing the coordinates of virtual atoms. | -| box | Boxes | box.raw | Å | Nframes * 3 * 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| energy | Frame energies | energy.raw | eV | Nframes | | -| force | Atomic and magnetic forces | force.raw | eV/Å | Nframes * (Natoms + Nspins) * 3 | The first `3 \* Natoms` columns represent atomic forces, followed by `3 \* Nspins` columns representing magnetic forces. | +| ID | Property | Raw file | Unit | Shape | Description | +| ------ | -------------------------- | ---------- | ---- | --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| type | Atom type indexes | type.raw | \ | Natoms + Nspins | Integers that start with 0. The first `Natoms` entries represent real atom types, followed by `Nspins` entries representing virtual atom types. 
| +| coord | Coordinates | coord.raw | Å | Nframes \* (Natoms + Nspins) \* 3 | The first `3 \* Natoms` columns represent the coordinates of real atoms, followed by `3 \* Nspins` columns representing the coordinates of virtual atoms. | +| box | Boxes | box.raw | Å | Nframes \* 3 \* 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| energy | Frame energies | energy.raw | eV | Nframes | +| force | Atomic and magnetic forces | force.raw | eV/Å | Nframes \* (Natoms + Nspins) \* 3 | The first `3 \* Natoms` columns represent atomic forces, followed by `3 \* Nspins` columns representing magnetic forces. | ### Spin data format in PyTorch/DP @@ -172,7 +172,7 @@ set.*/force_mag.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of element contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | Raw file | Unit | Shape | Description | -| -------------- | ---------------- | ------------- | ------- | -------------------- | ------------------------------------------------------------------- | -| spin | Magnetic moments | spin.raw | $\mu_B$ | Nframes * Natoms * 3 | Spin for magnetic atoms and zero for non-magnetic atoms. | -| magnetic force | Magnetic forces | force_mag.raw | eV/Å | Nframes * Natoms * 3 | Magnetic forces for magnetic atoms and zero for non-magnetic atoms. | +| ID | Property | Raw file | Unit | Shape | Description | +| -------------- | ---------------- | ------------- | ------- | ---------------------- | ------------------------------------------------------------------- | +| spin | Magnetic moments | spin.raw | $\mu_B$ | Nframes \* Natoms \* 3 | Spin for magnetic atoms and zero for non-magnetic atoms. | +| magnetic force | Magnetic forces | force_mag.raw | eV/Å | Nframes \* Natoms \* 3 | Magnetic forces for magnetic atoms and zero for non-magnetic atoms. 
| diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md index 4d0a3d5ce8..128779ee16 100644 --- a/doc/model/train-energy.md +++ b/doc/model/train-energy.md @@ -1,6 +1,6 @@ # Fit energy {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, Paddle {{ paddle_icon }}, DP {{ dpmodel_icon }} ::: @@ -75,6 +75,8 @@ where $\nu$ is a small constant used to protect an atom where the magnitude of $\boldsymbol{F}^\ast_k$ is small from having a large $L^r_F$. Benefiting from the relative force loss, small forces can be fitted more accurately.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + ## The fitting network The construction of the fitting net is given by section {ref}`fitting_net ` @@ -123,5 +125,3 @@ The {ref}`loss ` section in the `input.json` is The options {ref}`start_pref_e `, {ref}`limit_pref_e `, {ref}`start_pref_f `, {ref}`limit_pref_f `, {ref}`start_pref_v ` and {ref}`limit_pref_v ` determine the start and limit prefactors of energy, force and virial, respectively. 
If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-fitting-dos.md b/doc/model/train-fitting-dos.md index 600a619b8a..fb4a3677e5 100644 --- a/doc/model/train-fitting-dos.md +++ b/doc/model/train-fitting-dos.md @@ -1,6 +1,6 @@ # Fit electronic density of states (DOS) {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -82,9 +82,9 @@ To prepare the data, we recommend shifting the DOS data by the Fermi level. The training command is the same as `ener` mode, i.e. 
-::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash dp --tf train input.json @@ -92,7 +92,7 @@ dp --tf train input.json ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -133,9 +133,9 @@ The detailed loss can be found in `lcurve.out`: In this earlier version, we can use `dp test` to infer the electronic density of state for given frames. -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash @@ -146,7 +146,7 @@ dp --tf test -m frozen_model.pb -s ../data/111/$k -d ${output_prefix} -a -n 100 ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash diff --git a/doc/model/train-fitting-property.md b/doc/model/train-fitting-property.md index a2d4c7c9e8..be1b63bf6f 100644 --- a/doc/model/train-fitting-property.md +++ b/doc/model/train-fitting-property.md @@ -1,6 +1,6 @@ # Fit other properties {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -118,9 +118,9 @@ ls.to_deepmd_npy_mixed("deepmd") The training command is the same as `ener` mode, i.e. -::::\{tab-set} +::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -152,9 +152,9 @@ The detailed loss can be found in `lcurve.out`: We can use `dp test` to infer the properties for given frames. 
-::::\{tab-set} +::::{tab-set} -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash @@ -185,10 +185,10 @@ for `*.property.out.*`, it contains matrix with shape of `(2, task_dim)`, ## Data Normalization When `fitting_net/type` is `ener`, the energy bias layer “$e_{bias}$” adds a constant bias to the atomic energy contribution according to the atomic number.i.e., -\$$e_{bias} (Z_i) (MLP(D_i))= MLP(D_i) + e_{bias} (Z_i)$\$ +$$e_{bias} (Z_i) (MLP(D_i))= MLP(D_i) + e_{bias} (Z_i)$$ But when `fitting_net/type` is `property`. The property bias layer is used to normalize the property output of the model.i.e., -\$$p_{bias} (MLP(D_i))= MLP(D_i) * std+ mean$\$ +$$p_{bias} (MLP(D_i))= MLP(D_i) * std+ mean$$ 1. `std`: The standard deviation of the property label -1. `mean`: The average value of the property label +2. `mean`: The average value of the property label diff --git a/doc/model/train-fitting-tensor.md b/doc/model/train-fitting-tensor.md index 0eef9d9432..29c95b2d68 100644 --- a/doc/model/train-fitting-tensor.md +++ b/doc/model/train-fitting-tensor.md @@ -1,14 +1,14 @@ # Fit `tensor` like `Dipole` and `Polarizability` {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: Unlike `energy`, which is a scalar, one may want to fit some high dimensional physical quantity, like `dipole` (vector) and `polarizability` (matrix, shorted as `polar`). Deep Potential has provided different APIs to do this. In this example, we will show you how to train a model to fit a water system. 
A complete training input script of the examples can be found in -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash $deepmd_source_dir/examples/water_tensor/dipole/dipole_input.json @@ -17,7 +17,7 @@ $deepmd_source_dir/examples/water_tensor/polar/polar_input.json ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash $deepmd_source_dir/examples/water_tensor/dipole/dipole_input_torch.json @@ -69,13 +69,15 @@ The total tensor $\boldsymbol{T}$ (total dipole $\boldsymbol{T}^{(1)}$ or total The tensorial models can be used to calculate IR spectrum and Raman spectrum.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + ## The fitting Network The {ref}`fitting_net ` section tells DP which fitting net to use. 
-::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} The JSON of `dipole` type should be provided like @@ -107,7 +109,7 @@ The JSON of `polar` type should be provided like ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} The JSON of `dipole` type should be provided like @@ -181,9 +183,9 @@ In this case, please check the file name of the label. The training command is the same as `ener` mode, i.e. -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash dp train input.json @@ -191,7 +193,7 @@ dp train input.json ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash dp --pt train input.json @@ -246,5 +248,3 @@ During training, at each step when the `lcurve.out` is printed, the system used To only fit against a subset of atomic types, in the TensorFlow backend, {ref}`fitting_net/sel_type ` should be set to selected types; in other backends, {ref}`atom_exclude_types ` should be set to excluded types. The TensorFlow backend does not support {ref}`numb_fparam ` and {ref}`numb_aparam `. - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-hybrid.md b/doc/model/train-hybrid.md index 843e6eac3d..d565af5c9a 100644 --- a/doc/model/train-hybrid.md +++ b/doc/model/train-hybrid.md @@ -1,6 +1,6 @@ # Descriptor `"hybrid"` {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} ::: @@ -21,6 +21,8 @@ A hybrid descriptor $\mathcal{D}^i_\text{hyb}$ concatenates multiple kinds of de The list of descriptors can be different types or the same descriptors with different parameters. This way, one can set the different cutoff radii for different descriptors.[^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
+ ## Instructions To use the descriptor in DeePMD-kit, one firstly set the {ref}`type ` to {ref}`hybrid `, then provide the definitions of the descriptors by the items in the `list`, @@ -56,5 +58,3 @@ In other backends, each descriptor has its own type embedding and their paramete ## Model compression Model compression is supported if all sub-descriptors support model compression. - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-se-a-mask.md b/doc/model/train-se-a-mask.md index a453e1d3dc..ff1ee76c12 100644 --- a/doc/model/train-se-a-mask.md +++ b/doc/model/train-se-a-mask.md @@ -1,6 +1,6 @@ # Descriptor `"se_a_mask"` {{ tensorflow_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }} ::: @@ -68,7 +68,7 @@ To make the `aparam.npy` used for descriptor `se_a_mask`, two variables in `fitt - {ref}`use_aparam_as_mask ` is set to `true` to use the `aparam.npy` as the mask of the atoms in the descriptor `se_a_mask`. Finally, to make a reasonable fitting task with `se_a_mask` descriptor for DP/MM simulations, the loss function with `se_a_mask` is designed to include the atomic forces difference in specific atoms of the input particles only. 
-More details about the selection of the specific atoms can be found in paper \[DP/MM\](left to be filled). +More details about the selection of the specific atoms can be found in paper [DP/MM](left to be filled). Thus, `atom_pref.npy` ( [ nframes * natoms ] ) is required as the indicator of the specific atoms in the input particles. And the `loss` section in the training input script should be set as follows. diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md index 52e9b114b9..2e0c236cf6 100644 --- a/doc/model/train-se-atten.md +++ b/doc/model/train-se-atten.md @@ -1,6 +1,6 @@ # Descriptor `"se_atten"` {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, Paddle {{ paddle_icon }}, DP {{ dpmodel_icon }} ::: @@ -64,6 +64,8 @@ Then layer normalization is added in a residual way to finally obtain the self-a \mathcal{G}^{i,l} = \mathcal{G}^{i,l-1} + \mathrm{LayerNorm}(A(\mathcal{Q}^{i,l}, \mathcal{K}^{i,l}, \mathcal{V}^{i,l}, \mathcal{R}^{i,l})). ``` +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
+ ## Descriptor `"se_atten"` Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found in `examples/water/se_atten/input.json`. @@ -132,7 +134,7 @@ You can use descriptor `"se_atten_v2"` and is not allowed to set `tebd_input_mod Practical evidence demonstrates that `"se_atten_v2"` offers better and more stable performance compared to `"se_atten"`. -:::\{note} +:::{note} Model compression support differs across backends. See [Model compression](#model-compression) for backend-specific requirements. ::: @@ -195,5 +197,3 @@ Model compression is supported for any {ref}`attn_layer out_file value = filename filename = The file name for the model deviation output. Default is model_devi.out @@ -92,7 +91,7 @@ compute 1 all ke/atom ### Description -Evaluate the interaction of the system by using [Deep Potential][dp] or [Deep Potential Smooth Edition][dp-se]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. +Evaluate the interaction of the system by using [Deep Potential][DP] or [Deep Potential Smooth Edition][DP-SE]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep potential defined in a model file that usually has .pb/.pth/.savedmodel extensions. The model can be trained and frozen from multiple backends by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit), which can have either double or single float precision interface. @@ -138,7 +137,7 @@ pair_style deepspin models ... keyword value ... and the model deviation will be computed among all models every `out_freq` timesteps. 
- keyword = _out_file_ or _out_freq_ or _fparam_ or _fparam_from_compute_ or _aparam_from_compute_ or _atomic_ or _relative_ or _aparam_ or _ttm_ -:::\{note} +:::{note} Please note that the virial and atomic virial are not currently supported in spin models. ::: @@ -181,7 +180,7 @@ compute 1 all ke/atom ### Description -Evaluate the interaction of the system with spin by using [DeepSPIN][dpspin] models. It is noticed that deep spin model is not a "pairwise" interaction, but a multi-body interaction. +Evaluate the interaction of the system with spin by using [DeepSPIN][DPSPIN] models. It is noticed that deep spin model is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep spin model defined in a model file that usually has .pb/.pth/.savedmodel extensions. The model can be trained and frozen from multiple backends by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit), which can have either double or single float precision interface. @@ -190,7 +189,7 @@ The unit follows [LAMMPS units](#units) and the [scale factor](https://docs.lamm Other settings and output for this pair style is the same as `deepmd` pair style, please see the detailed description [above](#pair_style-deepmd). -:::\{note} +:::{note} Please note that the virial and atomic virial are not currently supported in spin models. ::: @@ -261,7 +260,7 @@ compute ID group-ID centroid/stress/atom NULL virial see [LAMMPS doc page](https://docs.lammps.org/compute_stress_atom.html#thompson2) for more details on the meaning of the keywords. -:::\{versionchanged} v2.2.3 +:::{versionchanged} v2.2.3 v2.2.2 or previous versions passed per-atom stress (`cvatom`) with the per-atom pressure tensor, which is inconsistent with [LAMMPS's definition](https://docs.lammps.org/compute_stress_atom.html). LAMMPS defines per-atom stress as the negative of the per-atom pressure tensor. Such behavior is corrected in v2.2.3. 
::: @@ -305,9 +304,13 @@ compute flux all heat/flux ke pe stress If you use these features please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, R. Car, S. Baroni - arXiv preprint arXiv:2108.10850, 2021](https://arxiv.org/abs/2108.10850) +[DP]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 +[DP-SE]: https://dl.acm.org/doi/10.5555/3327345.3327356 +[DPSPIN]: https://doi.org/10.1103/PhysRevB.110.064427 + ### D3 dispersion -:::\{note} +:::{note} Requires LAMMPS version 4Feb2025 or newer. ::: @@ -319,7 +322,3 @@ pair_style hybrid/overlay deepmd water.pb dispersion/d3 original pbe0 30.0 20.0 pair_coeff * * deepmd O H pair_coeff * * dispersion/d3 O H ``` - -[dp]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 -[dp-se]: https://dl.acm.org/doi/10.5555/3327345.3327356 -[dpspin]: https://doi.org/10.1103/PhysRevB.110.064427 diff --git a/doc/train/finetuning.md b/doc/train/finetuning.md index 142a12cc2e..78a7b8a71e 100644 --- a/doc/train/finetuning.md +++ b/doc/train/finetuning.md @@ -1,6 +1,6 @@ # Finetune the pre-trained model {{ tensorflow_icon }} {{ pytorch_icon }} {{ paddle_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, Paddle {{ paddle_icon }} ::: @@ -28,7 +28,7 @@ $ dp train input.json --finetune pretrained.pb The command above will change the energy bias in the last layer of the fitting net in `pretrained.pb`, according to the training dataset in input.json. -:::\{warning} +:::{warning} Note that in TensorFlow, model parameters including the `type_map` will be overwritten based on those in the pre-trained model. Please ensure you are familiar with the configurations in the pre-trained model, especially `type_map`, before starting the fine-tuning process. The elements in the training dataset must be contained in the pre-trained dataset. 
@@ -105,7 +105,7 @@ one can select a specific branch (e.g., `CHOOSEN_BRANCH`) included in `multitask $ dp --pt train input.json --finetune multitask_pretrained.pt --model-branch CHOOSEN_BRANCH ``` -:::\{note} +:::{note} One can check the available model branches in multi-task pre-trained model by referring to the documentation of the pre-trained model or by using the following command: ```bash @@ -131,9 +131,9 @@ Then, prepare a suitable input script for multitask fine-tuning `multi_input.jso 1. Refer to the [`multi-task-training`](./multi-task-training) document to prepare a multitask training script for two systems, ideally extracting parts (i.e. {ref}`model_dict `, {ref}`loss_dict `, {ref}`data_dict ` and {ref}`model_prob ` parts) corresponding to `PRE_DATA1` and `PRE_DATA2` directly from the training script of the pre-trained model. -1. For `DOWNSTREAM_DATA`, select a desired branch to fine-tune from (e.g., `PRE_DATA1`), copy the configurations of `PRE_DATA1` as the configuration for `DOWNSTREAM_DATA` and insert the corresponding data path into the {ref}`data_dict `, +2. For `DOWNSTREAM_DATA`, select a desired branch to fine-tune from (e.g., `PRE_DATA1`), copy the configurations of `PRE_DATA1` as the configuration for `DOWNSTREAM_DATA` and insert the corresponding data path into the {ref}`data_dict `, thereby generating a three-system multitask training script. -1. In the {ref}`model_dict ` for `DOWNSTREAM_DATA`, specify the branch from which `DOWNSTREAM_DATA` is to fine-tune using: +3. In the {ref}`model_dict ` for `DOWNSTREAM_DATA`, specify the branch from which `DOWNSTREAM_DATA` is to fine-tune using: `"finetune_head": "PRE_DATA1"`. The complete `multi_input.json` should appear as follows ("..." 
means copied from input script of pre-trained model): @@ -251,7 +251,7 @@ one can select a specific branch (e.g., `CHOOSEN_BRANCH`) included in `multitask $ dp --pd train input.json --finetune multitask_pretrained.pd --model-branch CHOOSEN_BRANCH ``` -:::\{note} +:::{note} One can check the available model branches in multi-task pre-trained model by refering to the documentation of the pre-trained model or by using the following command: ```bash diff --git a/doc/train/gpu-limitations.md b/doc/train/gpu-limitations.md index eb4f2bc373..44c9697dd4 100644 --- a/doc/train/gpu-limitations.md +++ b/doc/train/gpu-limitations.md @@ -3,7 +3,7 @@ If you use DeePMD-kit in a GPU environment, the acceptable value range of some variables is additionally restricted compared to the CPU environment due to the software's GPU implementations: 1. The number of atom types of a given system must be less than 128. -1. The maximum distance between an atom and its neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. -1. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is limited by the GPU memory size currently, usually within 1000,000 atoms even in the model compression mode. -1. The total sel value of training parameters(in `model[standard]/descriptor` section) must be less than 4096. -1. The size of the last layer of the embedding net must be less than 1024 during the model compression process. +2. The maximum distance between an atom and its neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. +3. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is limited by the GPU memory size currently, usually within 1000,000 atoms even in the model compression mode. +4. 
The total sel value of training parameters(in `model[standard]/descriptor` section) must be less than 4096. +5. The size of the last layer of the embedding net must be less than 1024 during the model compression process. diff --git a/doc/train/multi-task-training.md b/doc/train/multi-task-training.md index f7cc2e8a69..115c463cc2 100644 --- a/doc/train/multi-task-training.md +++ b/doc/train/multi-task-training.md @@ -1,10 +1,10 @@ # Multi-task training {{ pytorch_icon }} -:::\{note} +:::{note} **Supported backends**: PyTorch {{ pytorch_icon }} ::: -:::\{warning} +:::{warning} We have deprecated TensorFlow backend multi-task training, please use the PyTorch one. ::: @@ -26,6 +26,8 @@ and the Adam optimizer is executed to minimize $L^{(t)}$ for one step to update In the case of multi-GPU parallel training, different GPUs will independently select their tasks. In the DPA-2 model, this multi-task training framework is adopted.[^1] +[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + Compared with the previous TensorFlow implementation, the new support in PyTorch is more flexible and efficient. 
In particular, it makes multi-GPU parallel training and even tasks beyond DFT possible, enabling larger-scale and more general multi-task training to obtain more general pre-trained models. @@ -46,7 +48,6 @@ Specifically, there are several parts that need to be modified: - {ref}`model/model_dict `: The core definition of the model part and the explanation of sharing rules, starting with user-defined model name keys `model_key`, such as `my_model_1`. Each model part needs to align with the components of the single-task training {ref}`model `, but with the following sharing rules: - - If you want to share the current model component with other tasks, which should be part of the {ref}`model/shared_dict `, you can directly fill in the corresponding `part_key`, such as `"descriptor": "my_descriptor", ` @@ -62,7 +63,7 @@ Specifically, there are several parts that need to be modified: - For fitting nets, we only support the default `shared_level`=0, where all parameters will be shared except for `bias_atom_e` and `case_embd`. - To conduct multitask training, there are two typical approaches: 1. **Descriptor sharing only**: Share the descriptor with `shared_level`=0. See [here](../../examples/water_multi_task/pytorch_example/input_torch.json) for an example. - 1. **Descriptor and fitting network sharing with data identification**: + 2. **Descriptor and fitting network sharing with data identification**: - Share the descriptor and the fitting network with `shared_level`=0. - {ref}`dim_case_embd ` must be set to the number of model branches, which will distinguish different data tasks using a one-hot embedding. - See [here](../../examples/water_multi_task/pytorch_example/input_torch_sharefit.json) for an example. 
@@ -83,10 +84,8 @@ Specifically, there are several parts that need to be modified: An example input for multi-task training two models in water system is shown as following: ```{literalinclude} ../../examples/water_multi_task/pytorch_example/input_torch.json ---- -language: json -linenos: ---- +:language: json +:linenos: ``` ## Finetune from the pre-trained multi-task model @@ -96,7 +95,7 @@ users can refer to [this section](./finetuning.md#fine-tuning-from-a-multi-task- ## Multi-task specific parameters -:::\{note} +:::{note} Details of some parameters that are the same as [the regular parameters](./train-input.rst) are not shown below. ::: @@ -105,5 +104,3 @@ Details of some parameters that are the same as [the regular parameters](./train :module: deepmd.utils.argcheck :func: gen_args_multi_task ``` - -[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md index 2867e825bc..998f1c3bec 100644 --- a/doc/train/parallel-training.md +++ b/doc/train/parallel-training.md @@ -1,6 +1,6 @@ # Parallel training {{ tensorflow_icon }} {{ pytorch_icon }} {{ paddle_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, Paddle {{ paddle_icon }} ::: @@ -105,10 +105,9 @@ We utilize the PyTorch framework and have designed and implemented a multiproces First, we establish a DeepmdData class for each system, which is consistent with the TensorFlow version in this level. Then, we create a dataloader for each system, resulting in the same number of dataloaders as the number of systems. Next, we create a dataset for the dataloaders obtained in the previous step. This allows us to query the data for each system through this dataset, while the iteration pointers for each system are maintained by their respective dataloaders. Finally, a dataloader is created for the outermost dataset. -We achieve custom sampling methods using a weighted sampler. The length of the sampler is set to total_batch_num * num_workers.The parameter "num_workers" defines the number of threads involved in multi-threaded loading, which can be modified by setting the environment variable NUM_WORKERS (default: min(8, ncpus)). +We achieve custom sampling methods using a weighted sampler. The length of the sampler is set to total_batch_num \* num_workers.The parameter "num_workers" defines the number of threads involved in multi-threaded loading, which can be modified by setting the environment variable NUM_WORKERS (default: min(8, ncpus)). -> [!NOTE] -> The underlying dataloader will use a distributed sampler to ensure that each GPU receives batches with different content in parallel mode, which will use sequential sampler in serial mode. 
In the TensorFlow version, Horovod shuffles the dataset using different random seeds for the same purpose.. +> **Note** The underlying dataloader will use a distributed sampler to ensure that each GPU receives batches with different content in parallel mode, which will use sequential sampler in serial mode. In the TensorFlow version, Horovod shuffles the dataset using different random seeds for the same purpose.. ```mermaid flowchart LR @@ -184,11 +183,9 @@ torchrun --rdzv_endpoint=node0:12321 --nnodes=2 --nproc_per_node=4 --node_rank=0 torchrun --rdzv_endpoint=node0:12321 --nnodes=2 --nproc_per_node=4 --node_rank=1 --no_python dp --pt train tests/water/se_e2_a.json ``` -> [!NOTE] -> Set environment variables to tune [CPU specific optimizations](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) in advance. +> **Note** Set environment variables to tune [CPU specific optimizations](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) in advance. -> [!NOTE] -> for developers: `torchrun` by default passes settings as environment variables [(list here)](https://pytorch.org/docs/stable/elastic/run.html#environment-variables). +> **Note** for developers: `torchrun` by default passes settings as environment variables [(list here)](https://pytorch.org/docs/stable/elastic/run.html#environment-variables). > To check forward, backward, and communication time, please set env var `TORCH_CPP_LOG_LEVEL=INFO TORCH_DISTRIBUTED_DEBUG=DETAIL`. More details can be found [here](https://pytorch.org/docs/stable/distributed.html#logging). @@ -236,7 +233,7 @@ Then, run the script on the first node with: mpirun run_pp.sh ``` -:::\{note} +:::{note} If `NUM_WORKERS` is too large, it may cause the program to be terminated by the system; if it is too small, it may slow down data reading. You can try adjusting it to an appropriate size. 
diff --git a/doc/train/tensorboard.md b/doc/train/tensorboard.md index b271a80093..3c45ebba34 100644 --- a/doc/train/tensorboard.md +++ b/doc/train/tensorboard.md @@ -1,6 +1,6 @@ # TensorBoard Usage {{ tensorflow_icon }} {{ pytorch_icon }} {{ paddle_icon }} -:::\{note} +:::{note} **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, Paddle {{ paddle_icon }} ::: diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index e60312a8b0..af4b4b31d9 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -21,6 +21,8 @@ where $\tau \in \mathbb{N}$ is the index of the training step, $\gamma^0 \in \m where $\tau^{\text{stop}} \in \mathbb{N}$, $\gamma^{\text{stop}} \in \mathbb{R}$, and $s \in \mathbb{N}$ are the stopping step, the stopping learning rate, and the decay steps, respectively, all of which are hyperparameters provided in advance. [^1] +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
+ ### Instructions The {ref}`learning_rate ` section in `input.json` is given as follows @@ -36,9 +38,7 @@ The {ref}`learning_rate ` section in `input.json` is given as fol ``` - {ref}`start_lr ` gives the learning rate at the beginning of the training. - - {ref}`stop_lr ` gives the learning rate at the end of the training. It should be small enough to ensure that the network parameters satisfactorily converge. - - During the training, the learning rate decays exponentially from {ref}`start_lr ` to {ref}`stop_lr ` following the formula: ``` @@ -182,5 +182,3 @@ dp freeze -o frozen_model_adjusted_sel.pb Two models should give the same result when the input satisfies both constraints. Note: At this time, this feature is only supported by [`se_e2_a`](../model/train-se-e2-a.md) descriptor with [`set_davg_true`](./train-input.rst) enabled, or `hybrid` composed of the above descriptors. - -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/train/training.md b/doc/train/training.md index 04ed29af03..6ccb43bbd7 100644 --- a/doc/train/training.md +++ b/doc/train/training.md @@ -8,9 +8,9 @@ $ cd $deepmd_source_dir/examples/water/se_e2_a/ After switching to that directory, the training can be invoked by -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash $ dp --tf train input.json @@ -18,7 +18,7 @@ $ dp --tf train input.json ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash $ dp --pt train input.json @@ -26,7 +26,7 @@ $ dp --pt train input.json ::: -:::\{tab-item} Paddle {{ paddle_icon }} +:::{tab-item} Paddle {{ paddle_icon }} ```bash # training model @@ -69,12 +69,12 @@ During the training, the error of the model is tested every {ref}`disp_freq ` every {ref}`save_freq ` training steps. -:::\{warning} +:::{warning} It is warned that the example water data (in folder `examples/water/data`) is of very limited amount, is provided only for testing purposes, and should not be used to train a production model. ::: diff --git a/doc/troubleshooting/howtoset_num_nodes.md b/doc/troubleshooting/howtoset_num_nodes.md index 1748bd1598..b09fb80cb6 100644 --- a/doc/troubleshooting/howtoset_num_nodes.md +++ b/doc/troubleshooting/howtoset_num_nodes.md @@ -8,14 +8,14 @@ One should make sure the product of the parallel numbers is less than or equal t Parallelism for MPI is optional and used for multiple nodes, multiple GPU cards, or sometimes multiple CPU cores. -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} To enable MPI support for training in the TensorFlow interface, one should [install horovod](../install/install-from-source.md#install-horovod-and-mpi4py) in advance. 
::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} Multiprocessing support for training in the PyTorch backend is implemented with [torchrun](https://pytorch.org/docs/stable/elastic/run.html). @@ -68,14 +68,14 @@ export OMP_NUM_THREADS=2 There are several other environment variables for OpenMP, such as `KMP_BLOCKTIME`. -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} See [Intel documentation](https://www.intel.com/content/www/us/en/developer/articles/technical/maximize-tensorflow-performance-on-cpu-considerations-and-recommendations-for-inference.html) for detailed information. ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} See [PyTorch documentation](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html) for detailed information. @@ -89,9 +89,9 @@ There is no one general parallel configuration that works for all situations, so Here are some empirical examples. If you wish to use 3 cores of 2 CPUs on one node, you may set the environment variables and run DeePMD-kit as follows: -::::\{tab-set} +::::{tab-set} -:::\{tab-item} TensorFlow {{ tensorflow_icon }} +:::{tab-item} TensorFlow {{ tensorflow_icon }} ```bash export OMP_NUM_THREADS=3 @@ -102,7 +102,7 @@ dp --tf train input.json ::: -:::\{tab-item} PyTorch {{ pytorch_icon }} +:::{tab-item} PyTorch {{ pytorch_icon }} ```bash export OMP_NUM_THREADS=3 diff --git a/examples/property/train/README.md b/examples/property/train/README.md index 6e9345395c..e4dc9ed704 100644 --- a/examples/property/train/README.md +++ b/examples/property/train/README.md @@ -1,5 +1,5 @@ Some explanations of the parameters in `input.json`: 1. `fitting_net/property_name` is the name of the property to be predicted. It should be consistent with the property name in the dataset. 
In each system, code will read `set.*/{property_name}.npy` file as prediction label if you use NumPy format data. -1. `fitting_net/task_dim` is the dimension of model output. It should be consistent with the property dimension in the dataset, which means if the shape of data stored in `set.*/{property_name}.npy` is `batch size * 3`, `fitting_net/task_dim` should be set to 3. -1. `fitting/intensive` indicates whether the fitting property is intensive. If `intensive` is `true`, the model output is the average of the property contribution of each atom. If `intensive` is `false`, the model output is the sum of the property contribution of each atom. +2. `fitting_net/task_dim` is the dimension of model output. It should be consistent with the property dimension in the dataset, which means if the shape of data stored in `set.*/{property_name}.npy` is `batch size * 3`, `fitting_net/task_dim` should be set to 3. +3. `fitting/intensive` indicates whether the fitting property is intensive. If `intensive` is `true`, the model output is the average of the property contribution of each atom. If `intensive` is `false`, the model output is the sum of the property contribution of each atom. 
diff --git a/source/3rdparty/README.md b/source/3rdparty/README.md index 9de615d505..ac9cfd4edc 100644 --- a/source/3rdparty/README.md +++ b/source/3rdparty/README.md @@ -1,7 +1,7 @@ # 3rd-party source codes -| Name | Repository | Version | License | -| ---------------- | ---------------------------------- | ------- | ------- | -| json | https://github.com/nlohmann/json | 3.9.1 | MIT | -| Implib.so | https://github.com/yugr/Implib.so | 0ddaa71 | MIT | -| coverage_plugins | https://github.com/pytorch/pytorch | 2.2.0 | BSD-3 | +| Name | Repository | Version | License | +| ------------------------- | ---------------------------------- | ------- | ------- | +| json | https://github.com/nlohmann/json | 3.9.1 | MIT | +| Implib.so | https://github.com/yugr/Implib.so | 0ddaa71 | MIT | +| coverage_plugins | https://github.com/pytorch/pytorch | 2.2.0 | BSD-3 | diff --git a/source/3rdparty/implib/arch/e2k/README.md b/source/3rdparty/implib/arch/e2k/README.md index 9c9fc02487..eb87f54f85 100644 --- a/source/3rdparty/implib/arch/e2k/README.md +++ b/source/3rdparty/implib/arch/e2k/README.md @@ -1,5 +1,4 @@ Reference materials: - -- Руководство по эффективному программированию на платформе «Эльбрус» (http://www.mcst.ru/files/5ed39a/dd0cd8/50506b/000000/elbrus_prog_2020-05-30.pdf) -- Микропроцессоры и вычислительные комплексы семейства Эльбрус (http://www.mcst.ru/doc/book_121130.pdf) -- https://github.com/OpenE2K + * Руководство по эффективному программированию на платформе «Эльбрус» (http://www.mcst.ru/files/5ed39a/dd0cd8/50506b/000000/elbrus_prog_2020-05-30.pdf) + * Микропроцессоры и вычислительные комплексы семейства Эльбрус (http://www.mcst.ru/doc/book_121130.pdf) + * https://github.com/OpenE2K diff --git a/source/nodejs/README.md b/source/nodejs/README.md index e6b2c8d476..ad78359761 100644 --- a/source/nodejs/README.md +++ b/source/nodejs/README.md @@ -23,8 +23,8 @@ When using CMake to build DeePMD-kit, set argument `BUILD_NODEJS_IF=ON` and `NOD ```sh cmake -D 
BUILD_NODEJS_IF=ON \ - -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ - .. # and other arguments + -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ + .. # and other arguments make make install ``` From 5eb91160cd9a907c01d597c0cfe8c04c274a3cad Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Feb 2026 03:29:33 +0800 Subject: [PATCH 4/6] use a developing version of mdformat-myst --- .pre-commit-config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3fc33c8b98..2a971b4238 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -59,7 +59,9 @@ repos: hooks: - id: mdformat additional_dependencies: - - mdformat-myst==0.3.0 + # - mdformat-myst==0.3.0 + # See https://github.com/executablebooks/mdformat-myst/issues/13 + - "git+https://github.com/brunobeltran/mdformat-myst@4b3ff3a800e2877b78c11a9f5d713d667b0ecccf#egg=mdformat-myst" - mdformat-ruff==0.1.3 - mdformat-web==0.2.0 - mdformat-config==0.2.1 From 57ab0bd1525ed86749a43c60e17b4d9c63cf101d Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Mon, 9 Feb 2026 03:31:17 +0800 Subject: [PATCH 5/6] add exclude --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2a971b4238..4607a1c491 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,6 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks +exclude: ^source/3rdparty/.+/ repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 From e310debc5e0fd4a1cf651aa5c5c7d6f18670cbf1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Feb 2026 19:34:13 +0000 Subject: [PATCH 6/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .pre-commit-config.yaml | 20 ++-- AGENTS.md | 18 +-- 
CONTRIBUTING.md | 23 ++-- README.md | 2 +- doc/data/data-conv.md | 2 +- doc/data/system.md | 50 ++++----- doc/development/cmake.md | 2 +- doc/development/create-a-model-pt.md | 6 +- doc/development/create-a-model-tf.md | 6 +- doc/development/type-embedding.md | 4 +- doc/env.md | 12 -- doc/freeze/compress.md | 9 +- doc/freeze/freeze.md | 6 - doc/inference/nodejs.md | 8 +- doc/install/build-conda.md | 2 + doc/install/easy-install.md | 71 +++++------- doc/install/install-from-c-library.md | 2 - doc/install/install-from-source.md | 94 ++-------------- doc/install/install-gromacs.md | 9 +- doc/install/install-lammps.md | 1 - doc/install/install-nodejs.md | 4 +- doc/model/change-bias.md | 4 - doc/model/dpa3.md | 4 - doc/model/dplr.md | 8 +- doc/model/dprc.md | 128 +++++++++++++++++----- doc/model/overall.md | 16 +-- doc/model/pairtab.md | 4 +- doc/model/sel.md | 8 -- doc/model/show-model-info.md | 7 ++ doc/model/train-energy-hessian.md | 14 +-- doc/model/train-energy-spin.md | 26 ++--- doc/model/train-energy.md | 4 +- doc/model/train-fitting-dos.md | 8 -- doc/model/train-fitting-property.md | 10 +- doc/model/train-fitting-tensor.md | 17 +-- doc/model/train-hybrid.md | 4 +- doc/model/train-se-a-mask.md | 2 +- doc/model/train-se-atten.md | 4 +- doc/model/train-se-e2-a-tebd.md | 4 +- doc/model/train-se-e2-a.md | 4 +- doc/model/train-se-e2-r.md | 4 +- doc/model/train-se-e3.md | 4 +- doc/nvnmd/nvnmd.md | 48 +++++--- doc/test/model-deviation.md | 4 +- doc/third-party/ase.md | 6 - doc/third-party/lammps-command.md | 13 ++- doc/train/finetuning.md | 6 +- doc/train/gpu-limitations.md | 8 +- doc/train/multi-task-training.md | 17 +-- doc/train/parallel-training.md | 13 ++- doc/train/training-advanced.md | 6 +- doc/train/training.md | 18 +-- doc/troubleshooting/howtoset_num_nodes.md | 16 +-- examples/property/train/README.md | 4 +- source/3rdparty/README.md | 10 +- source/nodejs/README.md | 4 +- 56 files changed, 369 insertions(+), 439 deletions(-) diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4607a1c491..e5f1e63dc9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,16 +58,16 @@ repos: - repo: https://github.com/hukkin/mdformat rev: 1.0.0 hooks: - - id: mdformat - additional_dependencies: - # - mdformat-myst==0.3.0 - # See https://github.com/executablebooks/mdformat-myst/issues/13 - - "git+https://github.com/brunobeltran/mdformat-myst@4b3ff3a800e2877b78c11a9f5d713d667b0ecccf#egg=mdformat-myst" - - mdformat-ruff==0.1.3 - - mdformat-web==0.2.0 - - mdformat-config==0.2.1 - - mdformat-beautysh==1.0.0 - - mdformat-gfm-alerts==2.0.0 + - id: mdformat + additional_dependencies: + # - mdformat-myst==0.3.0 + # See https://github.com/executablebooks/mdformat-myst/issues/13 + - "git+https://github.com/brunobeltran/mdformat-myst@4b3ff3a800e2877b78c11a9f5d713d667b0ecccf#egg=mdformat-myst" + - mdformat-ruff==0.1.3 + - mdformat-web==0.2.0 + - mdformat-config==0.2.1 + - mdformat-beautysh==1.0.0 + - mdformat-gfm-alerts==2.0.0 # C++ - repo: https://github.com/pre-commit/mirrors-clang-format rev: v21.1.8 diff --git a/AGENTS.md b/AGENTS.md index c629a08def..bcac9f1514 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -23,8 +23,8 @@ DeePMD-kit is a deep learning package for many-body potential energy representat ### Lint and Format Code - Install linter: `uv pip install ruff` -- Run linting: `ruff check .` -- takes <1 second -- Format code: `ruff format .` -- takes <1 second +- Run linting: `ruff check .` -- takes \<1 second +- Format code: `ruff format .` -- takes \<1 second - **Always run `ruff check .` and `ruff format .` before committing changes or the CI will fail.** ### Training and Validation @@ -40,19 +40,19 @@ DeePMD-kit is a deep learning package for many-body potential energy representat ### Basic Functionality Validation 1. **CLI Interface**: Run `dp --version` and `dp -h` to verify installation -2. 
**Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` -3. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` +1. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` +1. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` ### Training Workflow Validation 1. **TensorFlow Training**: `cd examples/water/se_e2_a && timeout 60 dp train input.json --skip-neighbor-stat` -- should start training and show decreasing loss -2. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss -3. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values +1. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss +1. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values ### Test-Based Validation 1. **Core Tests**: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- should pass in ~10 seconds -2. **Multi-backend**: Test both TensorFlow and PyTorch components work +1. 
**Multi-backend**: Test both TensorFlow and PyTorch components work ## Common Commands and Timing @@ -127,8 +127,8 @@ source/ # C++ source code and tests ### Linting and Formatting -- **Ruff check**: <1 second -- **Ruff format**: <1 second +- **Ruff check**: \<1 second +- **Ruff format**: \<1 second - **Pre-commit hooks**: May have network issues, use individual tools ### Commit Messages and PR Titles diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 67491672e8..a8378350e4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -47,7 +47,7 @@ Please perform the following steps to create your Pull Request to this repositor ### Step 1: Fork the repository 1. Visit the project: -2. Click the **Fork** button on the top right and wait it to finish. +1. Click the **Fork** button on the top right and wait it to finish. ### Step 2: Clone the forked repository to local storage and set configurations @@ -58,7 +58,7 @@ Please perform the following steps to create your Pull Request to this repositor # Replace `$username` with your GitHub ID ``` -2. Add deepmodeling's repo as your remote repo, we can name it "upstream". And fetch upstream's latest codes to your workstation. +1. Add deepmodeling's repo as your remote repo, we can name it "upstream". And fetch upstream's latest codes to your workstation. ```bash git remote add upstream https://github.com/deepmodeling/deepmd-kit.git @@ -70,9 +70,9 @@ Please perform the following steps to create your Pull Request to this repositor git merge upstream/master ``` -3. Modify your codes and design unit tests. +1. Modify your codes and design unit tests. -4. Commit your changes to a new branch +1. Commit your changes to a new branch ```bash git checkout -b branch1 @@ -81,7 +81,8 @@ Please perform the following steps to create your Pull Request to this repositor git commit -m "commit-message: update the xx" ``` -5. Push the changed codes to your original repo on github. +1. Push the changed codes to your original repo on github. 
+ ```bash git push origin branch1 ``` @@ -97,15 +98,15 @@ Please perform the following steps to create your Pull Request to this repositor git rebase upstream/master ``` -2. Create a new branch based on the master branch. +1. Create a new branch based on the master branch. ```bash git checkout -b new-branch-name ``` -3. Modify your codes and design unit tests. +1. Modify your codes and design unit tests. -4. Commit your changes +1. Commit your changes ```bash git status # Checks the local status @@ -113,7 +114,7 @@ Please perform the following steps to create your Pull Request to this repositor git commit -m "commit-message: update the xx" ``` -5. Keep your branch in sync with upstream/master +1. Keep your branch in sync with upstream/master ```bash # While on your new branch @@ -121,7 +122,7 @@ Please perform the following steps to create your Pull Request to this repositor git rebase upstream/master ``` -6. Push your changes to the remote +1. Push your changes to the remote ```bash git push -u origin new-branch-name # "-u" is used to track the remote branch from origin @@ -130,7 +131,7 @@ Please perform the following steps to create your Pull Request to this repositor ### Step 3: Create a pull request 1. Visit your fork at (replace `$username` with your GitHub ID) -2. Click `pull requests`, followed by `New pull request` and `Compare & pull request` to create your PR. +1. Click `pull requests`, followed by `New pull request` and `Compare & pull request` to create your PR. Now, your PR is successfully submitted! After this PR is merged, you will automatically become a contributor to DeePMD-kit. 
diff --git a/README.md b/README.md index 143ed1b0ab..58ec1fec7f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ [DeePMD-kit logo](./doc/logo.md) ---- +______________________________________________________________________ # DeePMD-kit diff --git a/doc/data/data-conv.md b/doc/data/data-conv.md index 56ce526480..30be98bcfe 100644 --- a/doc/data/data-conv.md +++ b/doc/data/data-conv.md @@ -57,7 +57,7 @@ In the raw format, the property of one frame is provided per line, ending with ` ```bash $ cat force.raw -0.724 2.039 -0.951 0.841 -0.464 0.363 - 6.737 1.554 -5.587 -2.803 0.062 2.222 +6.737 1.554 -5.587 -2.803 0.062 2.222 -1.968 -0.163 1.020 -0.225 -0.789 0.343 ``` diff --git a/doc/data/system.md b/doc/data/system.md index f6da7b534b..e809016420 100644 --- a/doc/data/system.md +++ b/doc/data/system.md @@ -12,29 +12,29 @@ A system should contain system properties, input frame properties, and labeled f The input frame properties contain the following property, the first axis of which is the number of frames: -| ID | Property | Raw file | Unit | Required/Optional | Shape | Description | -| --------- | --------------------------------------------------- | ---------- | ---- | -------------------- | ------------------------ | ----------------------------------------- | -| coord | Atomic coordinates | coord.raw | Å | Required | Nframes \* Natoms \* 3 | -| box | Boxes | box.raw | Å | Required if periodic | Nframes \* 3 \* 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| fparam | Extra frame parameters | fparam.raw | Any | Optional | Nframes \* Any | -| aparam | Extra atomic parameters | aparam.raw | Any | Optional | Nframes \* aparam \* Any | -| numb_copy | Each frame is copied by the `numb_copy` (int) times | prob.raw | 1 | Optional | Nframes | Integer; Default is 1 for all frames | +| ID | Property | Raw file | Unit | Required/Optional | Shape | Description | +| --------- | --------------------------------------------------- | ---------- | ---- | 
-------------------- | ---------------------- | ----------------------------------------- | +| coord | Atomic coordinates | coord.raw | Å | Required | Nframes * Natoms * 3 | | +| box | Boxes | box.raw | Å | Required if periodic | Nframes * 3 * 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| fparam | Extra frame parameters | fparam.raw | Any | Optional | Nframes * Any | | +| aparam | Extra atomic parameters | aparam.raw | Any | Optional | Nframes * aparam * Any | | +| numb_copy | Each frame is copied by the `numb_copy` (int) times | prob.raw | 1 | Optional | Nframes | Integer; Default is 1 for all frames | The labeled frame properties are listed as follows, all of which will be used for training if and only if the loss function contains such property: -| ID | Property | Raw file | Unit | Shape | Description | -| --------------------- | -------------------------------------------------------------------------------- | ------------------------- | ------ | ------------------------------------- | ----------------------------------------- | -| energy | Frame energies | energy.raw | eV | Nframes | -| force | Atomic forces | force.raw | eV/Å | Nframes \* Natoms \* 3 | -| virial | Frame virial | virial.raw | eV | Nframes \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| hessian | Frame energy Hessian matrices | hessian.raw | eV/Å^2 | Nframes \* Natoms \* 3 \* Natoms \* 3 | full Hessian matrices | -| atom_ener | Atomic energies | atom_ener.raw | eV | Nframes \* Natoms | -| atom_pref | Weights of atomic forces | atom_pref.raw | 1 | Nframes \* Natoms | -| dipole | Frame dipole | dipole.raw | Any | Nframes \* 3 | -| atomic_dipole | Atomic dipole | atomic_dipole.raw | Any | Nframes \* Natoms \* 3 | -| polarizability | Frame polarizability | polarizability.raw | Any | Nframes \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| atomic_polarizability | Atomic polarizability | atomic_polarizability.raw | Any | Nframes \* Natoms \* 9 | in the order `XX XY XZ YX YY YZ ZX ZY 
ZZ` | -| drdq | Partial derivative of atomic coordinates with respect to generalized coordinates | drdq.raw | 1 | Nframes \* Natoms \* 3 \* Ngen_coords | +| ID | Property | Raw file | Unit | Shape | Description | +| --------------------- | -------------------------------------------------------------------------------- | ------------------------- | ------ | ---------------------------------- | ----------------------------------------- | +| energy | Frame energies | energy.raw | eV | Nframes | | +| force | Atomic forces | force.raw | eV/Å | Nframes * Natoms * 3 | | +| virial | Frame virial | virial.raw | eV | Nframes * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| hessian | Frame energy Hessian matrices | hessian.raw | eV/Å^2 | Nframes * Natoms * 3 * Natoms * 3 | full Hessian matrices | +| atom_ener | Atomic energies | atom_ener.raw | eV | Nframes * Natoms | | +| atom_pref | Weights of atomic forces | atom_pref.raw | 1 | Nframes * Natoms | | +| dipole | Frame dipole | dipole.raw | Any | Nframes * 3 | | +| atomic_dipole | Atomic dipole | atomic_dipole.raw | Any | Nframes * Natoms * 3 | | +| polarizability | Frame polarizability | polarizability.raw | Any | Nframes * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| atomic_polarizability | Atomic polarizability | atomic_polarizability.raw | Any | Nframes * Natoms * 9 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| drdq | Partial derivative of atomic coordinates with respect to generalized coordinates | drdq.raw | 1 | Nframes * Natoms * 3 * Ngen_coords | | In general, we always use the following convention of units: @@ -73,11 +73,11 @@ set.*/real_atom_types.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of element types contained in all frames is `Ntypes`. 
Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | File | Required/Optional | Shape | Description | -| -------- | -------------------------------- | ------------------- | ----------------- | ----------------- | ------------------------------------------------------------------------------------------------------------------------ | -| / | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input | -| type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to atom type contained in all the frames, which is unnecessart to be contained in the periodic table | -| type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes \* Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map. `-1` means virtual atoms. | +| ID | Property | File | Required/Optional | Shape | Description | +| -------- | -------------------------------- | ------------------- | ----------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | +| / | Atom type indexes (place holder) | type.raw | Required | Natoms | All zeros to fake the type input | +| type_map | Atom type names | type_map.raw | Required | Ntypes | Atom names that map to atom type contained in all the frames, which is unnecessart to be contained in the periodic table | +| type | Atom type indexes of each frame | real_atom_types.npy | Required | Nframes * Natoms | Integers that describe atom types in each frame, corresponding to indexes in type_map. `-1` means virtual atoms. | With these edited files, one can put together frames with the same `Natoms`, instead of the same formula (like `H2O`). 
diff --git a/doc/development/cmake.md b/doc/development/cmake.md index f8508d8992..5654d0cea2 100644 --- a/doc/development/cmake.md +++ b/doc/development/cmake.md @@ -6,7 +6,7 @@ After DeePMD-kit C/C++ library is installed, one can find DeePMD-kit from CMake: find_package(DeePMD REQUIRED) ``` -Note that you may need to add ${deepmd_root} to the cached CMake variable `CMAKE_PREFIX_PATH`. +Note that you may need to add \$\{deepmd_root} to the cached CMake variable `CMAKE_PREFIX_PATH`. To link against the C interface library, using diff --git a/doc/development/create-a-model-pt.md b/doc/development/create-a-model-pt.md index 7eb75b7026..16c973c29d 100644 --- a/doc/development/create-a-model-pt.md +++ b/doc/development/create-a-model-pt.md @@ -11,9 +11,9 @@ If you'd like to create a new model that isn't covered by the existing DeePMD-ki To incorporate your custom model you'll need to: 1. Register and implement new components (e.g. descriptor) in a Python file. -2. Register new arguments for user inputs. -3. Package new codes into a Python package. -4. Test new models. +1. Register new arguments for user inputs. +1. Package new codes into a Python package. +1. Test new models. ## Design a new component diff --git a/doc/development/create-a-model-tf.md b/doc/development/create-a-model-tf.md index cc7ad1999d..7720ced0ca 100644 --- a/doc/development/create-a-model-tf.md +++ b/doc/development/create-a-model-tf.md @@ -5,9 +5,9 @@ If you'd like to create a new model that isn't covered by the existing DeePMD-ki To incorporate your custom model you'll need to: 1. Register and implement new components (e.g. descriptor) in a Python file. You may also want to register new TensorFlow OPs if necessary. -2. Register new arguments for user inputs. -3. Package new codes into a Python package. -4. Test new models. +1. Register new arguments for user inputs. +1. Package new codes into a Python package. +1. Test new models. 
## Design a new component diff --git a/doc/development/type-embedding.md b/doc/development/type-embedding.md index 10eeed6ee9..f03bda0888 100644 --- a/doc/development/type-embedding.md +++ b/doc/development/type-embedding.md @@ -66,7 +66,7 @@ In trainer.py, it will parse the parameter from the input JSON file. If a `type_ ### model (model/ener.py) -When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of [$\text{ntypes} \times \text{nchanl}$] dimensions). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. +When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of \[$\text{ntypes} \times \text{nchanl}$\] dimensions). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. ### embedding net (descriptor/se\*.py) @@ -84,7 +84,7 @@ build -> _pass_filter -> _filter -> _filter_lower ### fitting net (fit/ener.py) -In `fitting net`, it takes the descriptor vector as input, whose dimension is [natoms, $M_1\times M_2$]. Because we need to involve information on the centric atom in this step, we need to generate a matrix named `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. 
The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch them. After that, call `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input goes through `fitting` net` to get predicted energy. +In `fitting net`, it takes the descriptor vector as input, whose dimension is \[natoms, $M_1\times M_2$\]. Because we need to involve information on the centric atom in this step, we need to generate a matrix named `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of the centric atom. In the build phase of the fitting net, it will check whether type embedding exists in `input_dict` and fetch them. After that, call `embed_atom_type` function to look up the embedding vector for the type vector of the centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input goes through `fitting` net\` to get predicted energy. :::{note} You can't apply the compression method while using atom-type embedding. 
diff --git a/doc/env.md b/doc/env.md index 1688e0af9c..0c14e785b6 100644 --- a/doc/env.md +++ b/doc/env.md @@ -7,7 +7,6 @@ For build-time environment variables, see [Install from source code](./install/i ## All interfaces :::{envvar} DP_INTER_OP_PARALLELISM_THREADS - **Alias**: `TF_INTER_OP_PARALLELISM_THREADS` **Default**: `0` @@ -16,7 +15,6 @@ See [How to control the parallelism of a job](./troubleshooting/howtoset_num_nod ::: :::{envvar} DP_INTRA_OP_PARALLELISM_THREADS - **Alias**: `TF_INTRA_OP_PARALLELISM_THREADS`\*\* **Default**: `0` @@ -36,47 +34,40 @@ See [How to control the parallelism of a job](./troubleshooting/howtoset_num_nod ## Python interface only :::{envvar} DP_INTERFACE_PREC - **Choices**: `high`, `low`; **Default**: `high` Control high (double) or low (float) precision of training. ::: :::{envvar} DP_AUTO_PARALLELIZATION - **Choices**: `0`, `1`; **Default**: `0` {{ tensorflow_icon }} Enable auto parallelization for CPU operators. ::: :::{envvar} DP_JIT - **Choices**: `0`, `1`; **Default**: `0` {{ tensorflow_icon }} Enable JIT. Note that this option may either improve or decrease the performance. Requires TensorFlow to support JIT. ::: :::{envvar} DP_INFER_BATCH_SIZE - **Default**: `1024` on CPUs and as maximum as possible until out-of-memory on GPUs Inference batch size, calculated by multiplying the number of frames with the number of atoms. ::: :::{envvar} DP_BACKEND - **Default**: `tensorflow` Default backend. ::: :::{envvar} NUM_WORKERS - **Default**: 4 or the number of cores (whichever is smaller) {{ pytorch_icon }} Number of subprocesses to use for data loading in the PyTorch backend. See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for details. - ::: ## C++ interface only @@ -84,14 +75,12 @@ See [PyTorch documentation](https://pytorch.org/docs/stable/data.html) for detai These environment variables also apply to third-party programs using the C++ interface, such as [LAMMPS](./third-party/lammps-command.md). 
:::{envvar} DP_PLUGIN_PATH - **Type**: List of paths, split by `:` on Unix and `;` on Windows List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows. ::: :::{envvar} DP_PROFILER - {{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend. **Type**: string (output file stem) @@ -120,5 +109,4 @@ Tips: - Large runs can generate sizable JSON files; consider limiting numbers of MD steps, like 20. - Currently this feature only supports single process, or multi-process runs where each process uses a distinct GPU on the same node. - ::: diff --git a/doc/freeze/compress.md b/doc/freeze/compress.md index d827c71525..7a75e7bcf5 100644 --- a/doc/freeze/compress.md +++ b/doc/freeze/compress.md @@ -58,8 +58,6 @@ If the number of neighbors of an atom is smaller than $N_c$, the corresponding p In practice, if the real number of neighbors is significantly smaller than $N_c$, a notable operation is spent on the multiplication of padding zeros. In the compressed DP model, the number of neighbors is precisely indexed at the tabulated inference stage, further saving computational costs.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Instructions Once the frozen model is obtained from DeePMD-kit, we can get the neural network structure and its parameters (weights, biases, etc.) from the trained model, and compress it in the following way: @@ -67,20 +65,17 @@ Once the frozen model is obtained from DeePMD-kit, we can get the neural network ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash dp compress -i graph.pb -o graph-compress.pb ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt compress -i model.pth -o model-compress.pth ``` - ::: + :::: where `-i` gives the original frozen model, `-o` gives the compressed model. Several other command line options can be passed to `dp compress`, which can be checked with @@ -129,3 +124,5 @@ See the documentation of a specific descriptor to see whether it supports model When compressing models in the PyTorch backend, the customized OP library for the Python interface must be installed when [freezing the model](../freeze/freeze.md). The customized OP library for the Python interface can be installed by setting environment variable {envvar}`DP_ENABLE_PYTORCH` to `1` during [installation](../install/install-from-source.md). + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. 
Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/freeze/freeze.md b/doc/freeze/freeze.md index 20f02177c6..adb3ba74ab 100644 --- a/doc/freeze/freeze.md +++ b/doc/freeze/freeze.md @@ -6,18 +6,15 @@ To freeze a model, typically one does ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash $ dp freeze -o model.pb ``` in the folder where the model is trained. The output model is called `model.pb`. The idea and part of our code are from [Morgan](https://blog.metaflow.fr/tensorflow-how-to-freeze-a-model-and-serve-it-with-a-python-api-d4f3596b3adc). - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash $ dp --pt freeze -o model.pth ``` @@ -32,11 +29,9 @@ $ dp --pt freeze -o model_branch1.pth --head CHOSEN_BRANCH ``` The output model is called `model_branch1.pth`, which is the specifically frozen model with the `CHOSEN_BRANCH` head. - ::: :::{tab-item} Paddle {{ paddle_icon }} - ```bash $ dp --pd freeze -o model ``` @@ -51,7 +46,6 @@ $ dp --pd freeze -o model_branch1 --head CHOSEN_BRANCH ``` The output model is called `model_branch1.json`, which is the specifically frozen model with the `CHOSEN_BRANCH` head. 
- ::: :::: diff --git a/doc/inference/nodejs.md b/doc/inference/nodejs.md index abe9dc36ab..a33429f270 100644 --- a/doc/inference/nodejs.md +++ b/doc/inference/nodejs.md @@ -32,12 +32,12 @@ energy = dp.compute(energy, v_forces, v_virials, v_coord, v_atype, v_cell); console.log("energy:", energy); console.log( - "forces:", - [...Array(v_forces.size()).keys()].map((i) => v_forces.get(i)), + "forces:", + [...Array(v_forces.size()).keys()].map((i) => v_forces.get(i)), ); console.log( - "virials:", - [...Array(v_virials.size()).keys()].map((i) => v_virials.get(i)), + "virials:", + [...Array(v_virials.size()).keys()].map((i) => v_virials.get(i)), ); ``` diff --git a/doc/install/build-conda.md b/doc/install/build-conda.md index e54849f75d..b6f571f47e 100644 --- a/doc/install/build-conda.md +++ b/doc/install/build-conda.md @@ -1,8 +1,10 @@ # Building conda packages ::::{danger} + :::{deprecated} 3.0.0 The official channel has been deprecated since 3.0.0. Refer to [conda-forge documentation](https://conda-forge.org/docs/maintainer/adding_pkgs/) for how to contribute and build packages locally. ::: + :::: diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md index 32650b4a80..c3d5eef04a 100644 --- a/doc/install/easy-install.md +++ b/doc/install/easy-install.md @@ -58,11 +58,13 @@ Read [conda-forge FAQ](https://conda-forge.org/docs/user/tipsandtricks.html#inst ### Official channel (deprecated) ::::{danger} + :::{deprecated} 3.0.0 The official channel has been deprecated since 3.0.0, due to the challenging work of building dependencies for [multiple backends](../backend.md). Old packages will still be available at https://conda.deepmodeling.com. Maintainers will build packages in the conda-forge organization together with other conda-forge members. 
::: + :::: ## Install with docker @@ -85,93 +87,79 @@ docker pull ghcr.io/deepmodeling/deepmd-kit:2.2.8_cuda12.0_gpu [Create a new environment](https://docs.deepmodeling.com/faq/conda.html#how-to-create-a-new-conda-pip-environment), and then execute the following command: -:::::::{tab-set} - -::::::{tab-item} TensorFlow {{ tensorflow_icon }} +::::::{tab-set} -:::::{tab-set} +:::::{tab-item} TensorFlow {{ tensorflow_icon }} -::::{tab-item} CUDA 12 +::::{tab-set} +:::{tab-item} CUDA 12 ```bash pip install deepmd-kit[gpu,cu12] ``` `cu12` is required only when CUDA Toolkit and cuDNN were not installed. +::: -:::: - -::::{tab-item} CPU - +:::{tab-item} CPU ```bash pip install deepmd-kit[cpu] ``` +::: :::: ::::: -:::::: - -::::::{tab-item} PyTorch {{ pytorch_icon }} - -:::::{tab-set} +:::::{tab-item} PyTorch {{ pytorch_icon }} -::::{tab-item} CUDA 12 +::::{tab-set} +:::{tab-item} CUDA 12 ```bash pip install deepmd-kit[torch] ``` +::: -:::: - -::::{tab-item} CPU - +:::{tab-item} CPU ```bash pip install torch --index-url https://download.pytorch.org/whl/cpu pip install deepmd-kit ``` +::: :::: ::::: -:::::: - -::::::{tab-item} JAX {{ jax_icon }} +:::::{tab-item} JAX {{ jax_icon }} -:::::{tab-set} - -::::{tab-item} CUDA 12 +::::{tab-set} +:::{tab-item} CUDA 12 ```bash pip install deepmd-kit[jax] jax[cuda12] ``` +::: -:::: - -::::{tab-item} CPU - +:::{tab-item} CPU ```bash pip install deepmd-kit[jax] ``` +::: :::: -::::: - To generate a SavedModel and use [the LAMMPS module](../third-party/lammps-command.md) and [the i-PI driver](../third-party/ipi.md), you need to install the TensorFlow. Switch to the TensorFlow {{ tensorflow_icon }} tab for more information. 
+::::: -:::::: - -::::::{tab-item} Paddle {{ paddle_icon }} - -:::::{tab-set} +:::::{tab-item} Paddle {{ paddle_icon }} -::::{tab-item} CUDA 12.6 +::::{tab-set} +:::{tab-item} CUDA 12.6 ```bash # release version pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ @@ -179,11 +167,9 @@ pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/ # pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ pip install deepmd-kit ``` +::: -:::: - -::::{tab-item} CPU - +:::{tab-item} CPU ```bash # release version pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ @@ -191,6 +177,7 @@ pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stab # pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ pip install deepmd-kit ``` +::: :::: @@ -198,8 +185,6 @@ pip install deepmd-kit :::::: -::::::: - The supported platform includes Linux x86-64 and aarch64 with GNU C Library 2.28 or above, macOS x86-64 and arm64, and Windows x86-64. :::{Warning} diff --git a/doc/install/install-from-c-library.md b/doc/install/install-from-c-library.md index 4568cdb6c9..a9e944eaae 100644 --- a/doc/install/install-from-c-library.md +++ b/doc/install/install-from-c-library.md @@ -31,9 +31,7 @@ Then the i-PI driver `dp_ipi` will be built and installed. One can also follow the manual [Install LAMMPS](./install-lammps.md) and/or [Install GROMACS](./install-gromacs.md). :::{cmake:variable} DEEPMD_C_ROOT - **Type**: `Path` Prefix to the pre-compiled C library. 
- ::: diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index 1e03563c66..5b731b5ed4 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -39,7 +39,6 @@ pip install --upgrade pip ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - The full instruction to install TensorFlow can be found on the official [TensorFlow website](https://www.tensorflow.org/install/pip). TensorFlow 2.7 or later is supported. ```bash @@ -61,11 +60,9 @@ python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1 ``` One can also [build the TensorFlow Python interface from source](https://www.tensorflow.org/install/source) for customized hardware optimization, such as CUDA, ROCM, or OneDNN support. - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - To install PyTorch, run ```sh @@ -75,11 +72,9 @@ pip install torch Follow [PyTorch documentation](https://pytorch.org/get-started/locally/) to install PyTorch built against different CUDA versions or without CUDA. One can also [use conda](https://docs.deepmodeling.com/faq/conda.html) to install PyTorch from [conda-forge](https://conda-forge.org). - ::: :::{tab-item} JAX {{ jax_icon }} - To install [JAX AI Stack](https://github.com/jax-ml/jax-ai-stack), run ```sh @@ -90,11 +85,9 @@ One can also install packages in JAX AI Stack manually. Follow [JAX documentation](https://jax.readthedocs.io/en/latest/installation.html) to install JAX built against different CUDA versions or without CUDA. One can also [use conda](https://docs.deepmodeling.com/faq/conda.html) to install JAX from [conda-forge](https://conda-forge.org). 
- ::: :::{tab-item} Paddle {{ paddle_icon }} - To install Paddle, run ```sh @@ -110,7 +103,6 @@ pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stab # nightly-build version # pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ ``` - ::: :::: @@ -149,16 +141,12 @@ The backend package may use a higher C++ standard version, and thus require a hi ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - Note that TensorFlow may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by TensorFlow. It is recommended to use [the same compiler version as TensorFlow](https://www.tensorflow.org/install/source#tested_build_configurations), which can be printed by `python -c "import tensorflow;print(tensorflow.version.COMPILER_VERSION)"`. - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - You can set the environment variable `export DP_ENABLE_PYTORCH=1` to enable customized C++ OPs in the PyTorch backend. Note that PyTorch may have specific requirements for the compiler version to support the C++ standard version and [`_GLIBCXX_USE_CXX11_ABI`](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) used by PyTorch. - ::: :::: @@ -173,71 +161,60 @@ pip install . One may set the following environment variables before executing `pip`: :::{envvar} DP_VARIANT - **Choices**: `cpu`, `cuda`, `rocm`; **Default**: `cpu` Build CPU variant or GPU variant with CUDA or ROCM support. ::: :::{envvar} CUDAToolkit_ROOT - **Type**: Path; **Default**: Detected automatically The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. ::: :::{envvar} ROCM_ROOT - **Type**: Path; **Default**: Detected automatically The path to the ROCM toolkit directory. 
If `ROCM_ROOT` is not set, it will look for `ROCM_PATH`; if `ROCM_PATH` is also not set, it will be detected using `hipconfig --rocmpath`. - ::: :::{envvar} DP_ENABLE_TENSORFLOW - **Choices**: `0`, `1`; **Default**: `1` {{ tensorflow_icon }} Enable the TensorFlow backend. ::: :::{envvar} DP_ENABLE_PYTORCH - **Choices**: `0`, `1`; **Default**: `0` {{ pytorch_icon }} Enable customized C++ OPs for the PyTorch backend. PyTorch can still run without customized C++ OPs, but features will be limited. ::: :::{envvar} TENSORFLOW_ROOT - **Type**: Path; **Default**: Detected automatically {{ tensorflow_icon }} The path to TensorFlow Python library. If not given, by default the installer only finds TensorFlow under user site-package directory (`site.getusersitepackages()`) or system site-package directory (`sysconfig.get_path("purelib")`) due to limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest TensorFlow (or the environment variable `TENSORFLOW_VERSION` if given) from PyPI will be built against. ::: :::{envvar} PYTORCH_ROOT - **Type**: Path; **Default**: Detected automatically {{ pytorch_icon }} The path to PyTorch Python library. If not given, by default, the installer only finds PyTorch under the user site-package directory (`site.getusersitepackages()`) or the system site-package directory (`sysconfig.get_path("purelib")`) due to the limitation of [PEP-517](https://peps.python.org/pep-0517/). If not found, the latest PyTorch (or the environment variable `PYTORCH_VERSION` if given) from PyPI will be built against. ::: :::{envvar} DP_ENABLE_NATIVE_OPTIMIZATION - **Choices**: `0`, `1`; **Default**: `0` Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. ::: :::{envvar} CMAKE_ARGS - **Type**: string Additional CMake arguments. 
::: :::{envvar} FLAGS - ``=`CXX`, `CUDA` or `HIP` **Type**: string @@ -295,20 +272,20 @@ $ horovodrun --check-build Horovod v0.22.1: Available Frameworks: - [X] TensorFlow - [X] PyTorch - [ ] MXNet +[X] TensorFlow +[X] PyTorch +[ ] MXNet Available Controllers: - [X] MPI - [X] Gloo +[X] MPI +[X] Gloo Available Tensor Operations: - [X] NCCL - [ ] DDL - [ ] CCL - [X] MPI - [X] Gloo +[X] NCCL +[ ] DDL +[ ] CCL +[X] MPI +[X] Gloo ``` Since version 2.0.1, Horovod and mpi4py with MPICH support are shipped with the installer. @@ -324,7 +301,6 @@ If one does not need to use DeePMD-kit with LAMMPS or i-PI, then the python inte ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} - The C++ interfaces of both TensorFlow and JAX backends are based on the TensorFlow C++ library. Since TensorFlow 2.12, TensorFlow C++ library (`libtensorflow_cc`) is packaged inside the Python library. Thus, you can skip building TensorFlow C++ library manually. If that does not work for you, you can still build it manually. @@ -332,27 +308,21 @@ Since TensorFlow 2.12, TensorFlow C++ library (`libtensorflow_cc`) is packaged i The C++ interface of DeePMD-kit was tested with compiler GCC >= 4.8. It is noticed that the i-PI support is only compiled with GCC >= 4.8. Note that TensorFlow may have specific requirements for the compiler version. First, the C++ interface of TensorFlow should be installed. It is noted that the version of TensorFlow should be consistent with the python interface. You may follow [the instruction](install-tf.2.12.md) or run the script `$deepmd_source_dir/source/install/build_tf.py` to install the corresponding C++ interface. - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - If you have installed PyTorch using pip, you can use libtorch inside the PyTorch Python package. You can also download libtorch prebuilt library from the [PyTorch website](https://pytorch.org/get-started/locally/). 
- ::: :::{tab-item} JAX {{ jax_icon }} - The JAX backend only depends on the TensorFlow C API, which is included in both TensorFlow C++ library and [TensorFlow C library](https://www.tensorflow.org/install/lang_c). If you want to use the TensorFlow C++ library, just enable the TensorFlow backend (which depends on the TensorFlow C++ library) and nothing else needs to do. If you want to use the TensorFlow C library and disable the TensorFlow backend, download the TensorFlow C library from [this page](https://www.tensorflow.org/install/lang_c#download_and_extract). - ::: :::{tab-item} Paddle {{ paddle_icon }} - If you want to use C++ interface of Paddle, you need to compile the Paddle inference library(C++ interface) manually from the [linux-compile-by-make](https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/install/compile/linux-compile-by-make.html), then use the `.so` and `.a` files in `Paddle/build/paddle_inference_install_dir/`. We also provide a weekly-build Paddle C++ inference library for Linux x86_64 with CUDA 12.3/CPU below: @@ -360,7 +330,6 @@ We also provide a weekly-build Paddle C++ inference library for Linux x86_64 wit CUDA 12.3: [Cuda123_cudnn900_Trt8616_D1/latest/paddle_inference.tgz](https://paddle-qa.bj.bcebos.com/paddle-pipeline/GITHUB_Docker_Compile_Test_Cuda123_cudnn900_Trt8616_D1/latest/paddle_inference.tgz) CPU: [GITHUB_Docker_Compile_Test_Cpu_Mkl_Avx_D1/latest/paddle_inference.tgz](https://paddle-qa.bj.bcebos.com/paddle-pipeline/GITHUB_Docker_Compile_Test_Cpu_Mkl_Avx_D1/latest/paddle_inference.tgz) - ::: :::: @@ -388,7 +357,6 @@ We recommend using [conda packages](https://docs.deepmodeling.com/faq/conda.html ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} / JAX {{ jax_icon }} - I assume you have activated the TensorFlow Python environment and want to install DeePMD-kit into path `$deepmd_root`, then execute CMake ```bash @@ -396,11 +364,9 @@ cmake -DENABLE_TENSORFLOW=TRUE -DUSE_TF_PYTHON_LIBS=TRUE 
-DCMAKE_INSTALL_PREFIX= ``` If you specify `-DUSE_TF_PYTHON_LIBS=FALSE`, you need to give the location where TensorFlow's C++ interface is installed to `-DTENSORFLOW_ROOT=${tensorflow_root}`. - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - I assume you have installed the PyTorch (either Python or C++ interface) to `$torch_root`, then execute CMake ```bash @@ -412,28 +378,23 @@ You can specify `-DUSE_PT_PYTHON_LIBS=TRUE` to use libtorch from the Python inst ```bash cmake -DENABLE_PYTORCH=TRUE -DUSE_PT_PYTHON_LIBS=TRUE -DCMAKE_INSTALL_PREFIX=$deepmd_root .. ``` - ::: :::{tab-item} JAX {{ jax_icon }} - If you want to use the TensorFlow C++ library, just enable the TensorFlow backend and nothing else needs to do. If you want to use the TensorFlow C library and disable the TensorFlow backend, set {cmake:variable}`ENABLE_JAX` to `ON` and `CMAKE_PREFIX_PATH` to the root directory of the [TensorFlow C library](https://www.tensorflow.org/install/lang_c). ```bash cmake -DENABLE_JAX=ON -D CMAKE_PREFIX_PATH=${tensorflow_c_root} .. ``` - ::: :::{tab-item} Paddle {{ paddle_icon }} - I assume you have get the Paddle inference library(C++ interface) to `$PADDLE_INFERENCE_DIR`, then execute CMake ```bash cmake -DENABLE_PADDLE=ON -DPADDLE_INFERENCE_DIR=$PADDLE_INFERENCE_DIR -DCMAKE_INSTALL_PREFIX=$deepmd_root .. ``` - ::: :::: @@ -441,150 +402,119 @@ cmake -DENABLE_PADDLE=ON -DPADDLE_INFERENCE_DIR=$PADDLE_INFERENCE_DIR -DCMAKE_IN One may add the following CMake variables to `cmake` using the [`-D =` option](https://cmake.org/cmake/help/latest/manual/cmake.1.html#cmdoption-cmake-D): :::{cmake:variable} ENABLE_TENSORFLOW - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` {{ tensorflow_icon }} {{ jax_icon }} Whether building the TensorFlow backend and the JAX backend. Setting this option to `ON` will also set {cmake:variable}`ENABLE_JAX` to `ON`. 
- ::: :::{cmake:variable} ENABLE_PYTORCH - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` {{ pytorch_icon }} Whether building the PyTorch backend. - ::: :::{cmake:variable} ENABLE_JAX - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` {{ jax_icon }} Build the JAX backend. If {cmake:variable}`ENABLE_TENSORFLOW` is `ON`, the TensorFlow C++ library is used to build the JAX backend; If {cmake:variable}`ENABLE_TENSORFLOW` is `OFF`, the TensorFlow C library is used to build the JAX backend. - ::: :::{cmake:variable} ENABLE_PADDLE - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` {{ paddle_icon }} Whether building the Paddle backend. - ::: :::{cmake:variable} TENSORFLOW_ROOT - **Type**: `PATH` {{ tensorflow_icon }} {{ jax_icon }} The Path to TensorFlow's C++ interface. - ::: :::{cmake:variable} PADDLE_INFERENCE_DIR - **Type**: `PATH` {{ paddle_icon }} The Path to Paddle's C++ inference directory, such as `/path/to/paddle_inference_install_dir` or `/path/to/paddle_inference`. - ::: :::{cmake:variable} CMAKE_INSTALL_PREFIX - **Type**: `PATH` The Path where DeePMD-kit will be installed. See also [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAKE_INSTALL_PREFIX.html). - ::: :::{cmake:variable} USE_CUDA_TOOLKIT - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` If `TRUE`, Build GPU support with CUDA toolkit. - ::: :::{cmake:variable} CUDAToolkit_ROOT - **Type**: `PATH`, **Default**: [Search automatically](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) The path to the CUDA toolkit directory. CUDA 9.0 or later is supported. NVCC is required. See also [CMake documentation](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html). - ::: :::{cmake:variable} USE_ROCM_TOOLKIT - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` If `TRUE`, Build GPU support with ROCM toolkit. 
- ::: :::{cmake:variable} CMAKE_HIP_COMPILER_ROCM_ROOT - **Type**: `PATH`, **Default**: [Search automatically](https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html) The path to the ROCM toolkit directory. See also [ROCm documentation](https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html). - ::: :::{cmake:variable} LAMMPS_SOURCE_ROOT - **Type**: `PATH` Only necessary for using [LAMMPS plugin mode](./install-lammps.md#install-lammps-plugin-mode). The path to the [LAMMPS source code](install-lammps.md). LAMMPS 8Apr2021 or later is supported. If not assigned, the plugin mode will not be enabled. - ::: :::{cmake:variable} USE_TF_PYTHON_LIBS - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` {{ tensorflow_icon }} If `TRUE`, Build C++ interface with TensorFlow's Python libraries (TensorFlow's Python Interface is required). There's no need for building TensorFlow's C++ interface. - ::: :::{cmake:variable} USE_PT_PYTHON_LIBS - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` {{ pytorch_icon }} If `TRUE`, Build C++ interface with PyTorch's Python libraries (PyTorch's Python Interface is required). There's no need for downloading PyTorch's C++ libraries. - ::: :::{cmake:variable} ENABLE_NATIVE_OPTIMIZATION - **Type**: `BOOL` (`ON`/`OFF`), Default: `OFF` Enable compilation optimization for the native machine's CPU type. Do not enable it if generated code will run on different CPUs. - ::: -:::{cmake:variable} CMAKE__FLAGS +:::{cmake:variable} CMAKE__FLAGS (``=`CXX`, `CUDA` or `HIP`) **Type**: `STRING` Default compilation flags to be used when compiling `` files. See also [CMake documentation](https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_FLAGS.html). 
- ::: ---- +______________________________________________________________________ If the CMake has been executed successfully, then run the following make commands to build the package: diff --git a/doc/install/install-gromacs.md b/doc/install/install-gromacs.md index 90ed73841c..02b0b4b194 100644 --- a/doc/install/install-gromacs.md +++ b/doc/install/install-gromacs.md @@ -15,6 +15,7 @@ where `deepmd_kit_root` is the directory where the latest version of DeePMD-kit + ## Compile GROMACS with deepmd-kit @@ -30,10 +31,10 @@ mkdir build cd build cmake3 .. -DCMAKE_CXX_STANDARD=14 \ # not required, but c++14 seems to be more compatible with higher version of tensorflow - -DGMX_MPI=ON \ - -DGMX_GPU=CUDA \ # Gromacs on ROCm has not been fully developed yet - -DCUDAToolkit_ROOT=/path/to/cuda \ - -DCMAKE_INSTALL_PREFIX=/path/to/gromacs-2020.2-deepmd +-DGMX_MPI=ON \ + -DGMX_GPU=CUDA \ # Gromacs on ROCm has not been fully developed yet +-DCUDAToolkit_ROOT=/path/to/cuda \ + -DCMAKE_INSTALL_PREFIX=/path/to/gromacs-2020.2-deepmd make -j make install ``` diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md index cb65188002..20ecdde56e 100644 --- a/doc/install/install-lammps.md +++ b/doc/install/install-lammps.md @@ -125,5 +125,4 @@ If `${tensorflow_root}`, `${deepmd_root}`, or the path to TensorFlow Python pack ```sh patchelf --add-rpath "${tensorflow_root}/lib" liblammps.so ``` - ::: diff --git a/doc/install/install-nodejs.md b/doc/install/install-nodejs.md index 7137723c31..c6ec70d5cb 100644 --- a/doc/install/install-nodejs.md +++ b/doc/install/install-nodejs.md @@ -18,8 +18,8 @@ When using CMake to [build DeePMD-kit from source](./install-from-source.md), se ```sh cmake -D BUILD_NODEJS_IF=ON \ - -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ - .. # and other arguments + -D NODEJS_INCLUDE_DIRS=/path/to/nodejs/include \ + .. 
# and other arguments make make install ``` diff --git a/doc/model/change-bias.md b/doc/model/change-bias.md index 2a9b098606..81964d78f8 100644 --- a/doc/model/change-bias.md +++ b/doc/model/change-bias.md @@ -15,7 +15,6 @@ The `dp change-bias` command supports the following methods for adjusting the bi ::::{tab-set} :::{tab-item} TensorFlow Backend {{ tensorflow_icon }} - **Changing bias using provided systems for trained checkpoint:** ```sh @@ -27,11 +26,9 @@ dp --tf change-bias model.ckpt -s data_dir -o model_updated.pb ```sh dp --tf change-bias model.ckpt -b -92.523 -187.66 -o model_updated.pb ``` - ::: :::{tab-item} PyTorch Backend {{ pytorch_icon }} - **Changing bias using provided systems for trained `.pt`/`.pth` models:** ```sh @@ -49,7 +46,6 @@ For multitask models, where `--model-branch` must be specified: ```sh dp --pt change-bias multi_model.pt -s data_dir -o model_updated.pt --model-branch model_1 ``` - ::: :::: diff --git a/doc/model/dpa3.md b/doc/model/dpa3.md index 0ff46c438f..d706b2817c 100644 --- a/doc/model/dpa3.md +++ b/doc/model/dpa3.md @@ -45,7 +45,6 @@ Note that we set `float32` in all DPA3 models, while `float64` in other models b ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - To run the DPA3 model on LAMMPS via source code installation (users can skip this step if using [easy installation](../install/easy-install.md)), the custom OP library for Python interface integration must be compiled and linked @@ -56,11 +55,9 @@ The customized OP library for the Python interface can be installed by setting e If one runs LAMMPS with MPI, the customized OP library for the C++ interface should be compiled against the same MPI library as the runtime MPI. 
If one runs LAMMPS with MPI and CUDA devices, it is recommended to compile the customized OP library for the C++ interface with a [CUDA-Aware MPI](https://developer.nvidia.com/mpi-solutions-gpus) library and CUDA, otherwise the communication between GPU cards falls back to the slower CPU implementation. - ::: :::{tab-item} Paddle {{ paddle_icon }} - The customized OP library for the Python interface can be installed by ```sh @@ -71,7 +68,6 @@ python setup.py install If one runs LAMMPS with MPI, the customized OP library for the C++ interface should be compiled against the same MPI library as the runtime MPI. If one runs LAMMPS with MPI and CUDA devices, it is recommended to compile the customized OP library for the C++ interface with a [CUDA-Aware MPI](https://developer.nvidia.com/mpi-solutions-gpus) library and CUDA, otherwise the communication between GPU cards falls back to the slower CPU implementation. - ::: :::: diff --git a/doc/model/dplr.md b/doc/model/dplr.md index 61327bb55e..64d7006dd2 100644 --- a/doc/model/dplr.md +++ b/doc/model/dplr.md @@ -36,8 +36,6 @@ $L$ is the cutoff in Fourier space and $S(m)$, the structure factor, is given by where $\imath = \sqrt{-1}$ denotes the imaginary unit, $\boldsymbol r_i$ indicates ion coordinates, $q_i$ is the charge of the ion $i$, and $W_n$ is the $n$-th Wannier centroid (WC) which can be obtained from a separated [dipole model](./train-fitting-tensor.md). 
It can be proved that the error in the electrostatic energy introduced by the Gaussian approximations is dominated by a summation of dipole-quadrupole interactions that decay as $r^{-4}$, where $r$ is the distance between the dipole and quadrupole.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Train a deep Wannier model for Wannier centroids We use the deep Wannier model (DW) to represent the relative position of the Wannier centroid (WC) with the atom with which it is associated. One may consult the introduction of the [dipole model](train-fitting-tensor.md) for a detailed introduction. An example input `wc.json` and a small dataset `data` for tutorial purposes can be found in @@ -192,9 +190,11 @@ fix ID group-ID style_name keyword value ... ``` + + - ID, group-ID are documented in :doc:`fix ` command -- style\_name = _dplr_ +- style_name = _dplr_ - three or more keyword/value pairs may be appended ``` @@ -266,4 +266,6 @@ The MD simulation lasts for only 20 steps. If one runs a longer simulation, it w Another restriction that should be noted is that the energies printed at the zero steps are not correct. This is because at the zero steps the position of the WC has not been updated with the DW model. 
The energies printed in later steps are correct. +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). + [1]: https://arxiv.org/abs/2112.13327 diff --git a/doc/model/dprc.md b/doc/model/dprc.md index 9f3eee244d..1cc2276197 100644 --- a/doc/model/dprc.md +++ b/doc/model/dprc.md @@ -44,8 +44,6 @@ The fitting network is revised to remove energy bias from MM atoms: where $\mathbf{0}$ is a zero matrix. It is worth mentioning that usage of DPRc is not limited to its initial design for QM/MM correction and can be expanded to any similar interaction.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - See the [JCTC paper](https://doi.org/10.1021/acs.jctc.1c00201) for details. ## Training data @@ -71,7 +69,6 @@ As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\te ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```json "descriptor" :{ "type": "hybrid", @@ -100,11 +97,9 @@ As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\te ] } ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```json "descriptor" :{ "type": "hybrid", @@ -135,7 +130,6 @@ As described in the paper, the DPRc model only corrects $E_\text{QM}$ and $E_\te ] } ``` - ::: :::: @@ -200,9 +194,18 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used { "model": { "type": "pairwise_dprc", - "type_map": ["C", "P", "O", "H", "OW", "HW"], + "type_map": [ + "C", + "P", + "O", + "H", + "OW", + "HW" + ], "type_embedding": { - "neuron": [8], + "neuron": [ + 8 + ], "precision": "float32" }, "qm_model": { @@ -212,7 +215,11 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used "rcut_smth": 0.5, "rcut": 9.0, "attn_layer": 0, - "neuron": [25, 50, 100], + "neuron": [ + 25, + 50, + 100 + ], "resnet_dt": false, "axis_neuron": 12, "precision": "float32", @@ -220,10 +227,21 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used }, "fitting_net": { "type": "ener", - "neuron": [240, 240, 240], + "neuron": [ + 240, + 240, + 240 + ], "resnet_dt": true, "precision": "float32", - "atom_ener": [null, null, null, null, 0.0, 0.0], + "atom_ener": [ + null, + null, + null, + null, + 0.0, + 0.0 + ], "seed": 1 } }, @@ -234,35 +252,89 @@ It is noted that the [`se_atten` descriptor](./train-se-atten.md) should be used "rcut_smth": 0.5, "rcut": 6.0, "attn_layer": 0, - "neuron": [25, 50, 100], + "neuron": [ + 25, + 50, 
+ 100 + ], "resnet_dt": false, "axis_neuron": 12, "set_davg_zero": true, "exclude_types": [ - [0, 0], - [0, 1], - [0, 2], - [0, 3], - [1, 1], - [1, 2], - [1, 3], - [2, 2], - [2, 3], - [3, 3], - [4, 4], - [4, 5], - [5, 5] + [ + 0, + 0 + ], + [ + 0, + 1 + ], + [ + 0, + 2 + ], + [ + 0, + 3 + ], + [ + 1, + 1 + ], + [ + 1, + 2 + ], + [ + 1, + 3 + ], + [ + 2, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 3 + ], + [ + 4, + 4 + ], + [ + 4, + 5 + ], + [ + 5, + 5 + ] ], "precision": "float32", "seed": 1 }, "fitting_net": { "type": "ener", - "neuron": [240, 240, 240], + "neuron": [ + 240, + 240, + 240 + ], "resnet_dt": true, "seed": 1, "precision": "float32", - "atom_ener": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + "atom_ener": [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] } } } @@ -273,3 +345,5 @@ The pairwise model needs information for MM residues. The model uses [`aparam`](../data/system.md) with the shape of `nframes x natoms` to get the residue index. The QM residue should always use `0` as the index. For example, `0 0 0 1 1 1 2 2 2` means these 9 atoms are grouped into one QM residue and two MM residues. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/overall.md b/doc/model/overall.md index cc72aa3887..2922cc9ccf 100644 --- a/doc/model/overall.md +++ b/doc/model/overall.md @@ -24,8 +24,6 @@ From the above equation, one may compute the global property of the system by where $N$ is the number of atoms in a frame. For example, if $y_i$ represents the potential energy contribution of atom $i$, then $y$ gives the total potential energy of the frame.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Instructions A model has two parts, a descriptor that maps atomic configuration to a set of symmetry invariant features, and a fitting net that takes descriptor as input and predicts the atomic contribution to the target physical property. It's defined in the {ref}`model ` section of the `input.json`, for example, @@ -49,11 +47,11 @@ The {ref}`type_map ` is optional, which provides the element nam DeePMD-kit implements the following descriptors: 1. [`se_e2_a`](train-se-e2-a.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes the distance between atoms as input. -2. [`se_e2_r`](train-se-e2-r.md): DeepPot-SE constructed from radial information of atomic configurations. 
The embedding takes the distance between atoms as input. -3. [`se_e3`](train-se-e3.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes angles between two neighboring atoms as input. -4. [`se_a_mask`](train-se-a-mask.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The input frames in one system can have a varied number of atoms. Input particles are padded with virtual particles of the same length. -5. `loc_frame`: Defines a local frame at each atom and compute the descriptor as local coordinates under this frame. -6. [`hybrid`](train-hybrid.md): Concate a list of descriptors to form a new descriptor. +1. [`se_e2_r`](train-se-e2-r.md): DeepPot-SE constructed from radial information of atomic configurations. The embedding takes the distance between atoms as input. +1. [`se_e3`](train-se-e3.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The embedding takes angles between two neighboring atoms as input. +1. [`se_a_mask`](train-se-a-mask.md): DeepPot-SE constructed from all information (both angular and radial) of atomic configurations. The input frames in one system can have a varied number of atoms. Input particles are padded with virtual particles of the same length. +1. `loc_frame`: Defines a local frame at each atom and computes the descriptor as local coordinates under this frame. +1. [`hybrid`](train-hybrid.md): Concatenate a list of descriptors to form a new descriptor. The fitting of the following physical properties is supported @@ -64,4 +62,6 @@ Due to the restrictions of torch jit script, the models trained with hessian are ::: 2. [`dipole`](train-fitting-tensor.md): The dipole moment. -3. [`polar`](train-fitting-tensor.md): The polarizability. 
+ +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/pairtab.md b/doc/model/pairtab.md index 57fe23f5e9..106134ab06 100644 --- a/doc/model/pairtab.md +++ b/doc/model/pairtab.md @@ -45,8 +45,6 @@ In the range $[r_a, r_b]$, the DP model smoothly switched off and the pairwise p where the scale $\alpha_s$ is a tunable scale of the interatomic distance $r_{ij}$. The pairwise potential $u^{\textrm{pair}}(r)$ is defined by a user-defined table that provides the value of $u^{\textrm{pair}}$ on an evenly discretized grid from 0 to the cutoff distance.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - DeePMD-kit also supports combination with a pairwise potential {{ tensorflow_icon }}: ```math @@ -110,3 +108,5 @@ However, it is more efficient to train a model using plain DFT calculations with Training against data with dispersion directly is discouraged. See the [D3 dispersion section](../third-party/lammps-command.md#d3-dispersion) for details. ::: + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
diff --git a/doc/model/sel.md b/doc/model/sel.md index 5b85318dd9..67fe6f2c29 100644 --- a/doc/model/sel.md +++ b/doc/model/sel.md @@ -9,35 +9,27 @@ To determine a proper `sel`, one can calculate the neighbor stat of the training ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```sh dp --tf neighbor-stat -s data -r 6.0 -t O H ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```sh dp --pt neighbor-stat -s data -r 6.0 -t O H ``` - ::: :::{tab-item} JAX {{ jax_icon }} - ```sh dp --jax neighbor-stat -s data -r 6.0 -t O H ``` - ::: :::{tab-item} Paddle {{ paddle_icon }} - ```sh dp --pd neighbor-stat -s data -r 6.0 -t O H ``` - ::: :::: diff --git a/doc/model/show-model-info.md b/doc/model/show-model-info.md index 67d82610de..5bffacb871 100644 --- a/doc/model/show-model-info.md +++ b/doc/model/show-model-info.md @@ -33,28 +33,35 @@ dp show frozen_model.pth type-map descriptor fitting-net size Depending on the provided attributes and the model type, the output includes: - **Model Type** + - Logs whether the loaded model is a _singletask_ or _multitask_ model. - **model-branch** + - _Only available for multitask models._ - Lists all available model branches and the special `"RANDOM"` branch, which refers to a randomly initialized fitting net. - **type-map** + - For multitask models: Shows the type map for each branch. - For singletask models: Shows the model's type map. - **descriptor** + - For multitask models: Displays the descriptor parameter for each branch. - For singletask models: Displays the descriptor parameter. - **fitting-net** + - For multitask models: Shows the fitting network parameters for each branch. - For singletask models: Shows the fitting network parameters. - **size** + - Prints the number of parameters for each component (`descriptor`, `fitting-net`, etc.), as well as the total parameter count. - **observed-type** + - Displays the count and list of observed element types of the model during data statistics. 
- For multitask models, it shows the observed types for each branch. - Note: This info shows the types observed during training data statistics, which may differ from the type map. diff --git a/doc/model/train-energy-hessian.md b/doc/model/train-energy-hessian.md index d77e7f3e88..442ffc021d 100644 --- a/doc/model/train-energy-hessian.md +++ b/doc/model/train-energy-hessian.md @@ -43,9 +43,9 @@ set.*/hessian.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of elements contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | Raw file | Unit | Shape | Description | -| ------- | ---------------- | ----------- | ------ | --------------------------------------- | ------------------------------------------------------- | -| hessian | Hessian matrices | hessian.npy | eV/Å^2 | Nframes \* (Natoms \* 3 \* Natoms \* 3) | Second-order derivatives of energies w.r.t coordinates. | +| ID | Property | Raw file | Unit | Shape | Description | +| ------- | ---------------- | ----------- | ------ | ----------------------------------- | ------------------------------------------------------- | +| hessian | Hessian matrices | hessian.npy | eV/Å^2 | Nframes * (Natoms * 3 * Natoms * 3) | Second-order derivatives of energies w.r.t coordinates. | Note that the `hessian.npy` should contain the **full** Hessian matrices with shape of `(3Natoms * 3Natoms)` for each frame, rather than the upper or lower triangular matrices with shape of `(3Natoms * (3Natoms + 1) / 2)` for each frame. @@ -56,11 +56,9 @@ There are two approaches to training a Hessian model. 
The first method involves ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt train input.json ``` - ::: :::: @@ -70,11 +68,9 @@ The second approach is to train a Hessian model from a pretrained energy model, ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt train input.json --finetune pretrained_energy.pt ``` - ::: :::: @@ -102,14 +98,12 @@ If one do freeze and test a Hessian model using the commands: ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt freeze -o frozen_model.pth dp --pt test -m frozen_model.pth -s test_system -d ${output_prefix} -a -n 1 ``` - ::: :::: @@ -126,12 +120,10 @@ If one intends to use the trained model for Hessian predictions, then he/she is ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt test -m model.pt -s test_system -d ${output_prefix} -a -n 1 ``` - ::: :::: diff --git a/doc/model/train-energy-spin.md b/doc/model/train-energy-spin.md index 52a470f2a6..3f1a97e4f7 100644 --- a/doc/model/train-energy-spin.md +++ b/doc/model/train-energy-spin.md @@ -17,7 +17,7 @@ Note that when adding spin into the model, there will be some implicit modificat while in other descriptors with mixed types (such as `dpa1` or `dpa2`), the sel number will not be changed for clarity. If you are using descriptors with mixed types, to achieve better performance, you should manually extend your sel number (maybe double) depending on the balance between performance and efficiency. - ::: +::: ## Spin @@ -68,7 +68,7 @@ See `se_e2_a` examples in `$deepmd_source_dir/examples/spin/se_e2_a/input_torch. between a virtual atom representing spin and its corresponding real atom for each atom type with spin. This factor is defined as the virtual distance divided by the magnitude of atomic spin for each atom type with spin. - The virtual coordinate is defined as the real coordinate plus spin \* virtual_scale. 
+ The virtual coordinate is defined as the real coordinate plus spin * virtual_scale. List of float values with shape of `ntypes` or `ntypes_spin` or one single float value for all types, only used when {ref}`use_spin ` is True for each atom type. @@ -148,13 +148,13 @@ where $\bm{R}_{i^p}$, $\bm{R}_i$, and $\bm{S}_i$ denote the virtual atomic coord We list the details about spin system data format in TensorFlow backend: -| ID | Property | Raw file | Unit | Shape | Description | -| ------ | -------------------------- | ---------- | ---- | --------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | -| type | Atom type indexes | type.raw | \ | Natoms + Nspins | Integers that start with 0. The first `Natoms` entries represent real atom types, followed by `Nspins` entries representing virtual atom types. | -| coord | Coordinates | coord.raw | Å | Nframes \* (Natoms + Nspins) \* 3 | The first `3 \* Natoms` columns represent the coordinates of real atoms, followed by `3 \* Nspins` columns representing the coordinates of virtual atoms. | -| box | Boxes | box.raw | Å | Nframes \* 3 \* 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | -| energy | Frame energies | energy.raw | eV | Nframes | -| force | Atomic and magnetic forces | force.raw | eV/Å | Nframes \* (Natoms + Nspins) \* 3 | The first `3 \* Natoms` columns represent atomic forces, followed by `3 \* Nspins` columns representing magnetic forces. | +| ID | Property | Raw file | Unit | Shape | Description | +| ------ | -------------------------- | ---------- | ---- | ------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| type | Atom type indexes | type.raw | \\ | Natoms + Nspins | Integers that start with 0. 
The first `Natoms` entries represent real atom types, followed by `Nspins` entries representing virtual atom types. | +| coord | Coordinates | coord.raw | Å | Nframes * (Natoms + Nspins) * 3 | The first `3 \* Natoms` columns represent the coordinates of real atoms, followed by `3 \* Nspins` columns representing the coordinates of virtual atoms. | +| box | Boxes | box.raw | Å | Nframes * 3 * 3 | in the order `XX XY XZ YX YY YZ ZX ZY ZZ` | +| energy | Frame energies | energy.raw | eV | Nframes | | +| force | Atomic and magnetic forces | force.raw | eV/Å | Nframes * (Natoms + Nspins) * 3 | The first `3 \* Natoms` columns represent atomic forces, followed by `3 \* Nspins` columns representing magnetic forces. | ### Spin data format in PyTorch/DP @@ -172,7 +172,7 @@ set.*/force_mag.npy This system contains `Nframes` frames with the same atom number `Natoms`, the total number of element contained in all frames is `Ntypes`. Most files are the same as those in [standard formats](../data/system.md), here we only list the distinct ones: -| ID | Property | Raw file | Unit | Shape | Description | -| -------------- | ---------------- | ------------- | ------- | ---------------------- | ------------------------------------------------------------------- | -| spin | Magnetic moments | spin.raw | $\mu_B$ | Nframes \* Natoms \* 3 | Spin for magnetic atoms and zero for non-magnetic atoms. | -| magnetic force | Magnetic forces | force_mag.raw | eV/Å | Nframes \* Natoms \* 3 | Magnetic forces for magnetic atoms and zero for non-magnetic atoms. | +| ID | Property | Raw file | Unit | Shape | Description | +| -------------- | ---------------- | ------------- | ------- | -------------------- | ------------------------------------------------------------------- | +| spin | Magnetic moments | spin.raw | $\mu_B$ | Nframes * Natoms * 3 | Spin for magnetic atoms and zero for non-magnetic atoms. 
| +| magnetic force | Magnetic forces | force_mag.raw | eV/Å | Nframes * Natoms * 3 | Magnetic forces for magnetic atoms and zero for non-magnetic atoms. | diff --git a/doc/model/train-energy.md b/doc/model/train-energy.md index 128779ee16..56a2d6b1e3 100644 --- a/doc/model/train-energy.md +++ b/doc/model/train-energy.md @@ -75,8 +75,6 @@ where $\nu$ is a small constant used to protect an atom where the magnitude of $\boldsymbol{F}^\ast_k$ is small from having a large $L^r_F$. Benefiting from the relative force loss, small forces can be fitted more accurately.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## The fitting network The construction of the fitting net is given by section {ref}`fitting_net ` @@ -125,3 +123,5 @@ The {ref}`loss ` section in the `input.json` is The options {ref}`start_pref_e `, {ref}`limit_pref_e `, {ref}`start_pref_f `, {ref}`limit_pref_f `, {ref}`start_pref_v ` and {ref}`limit_pref_v ` determine the start and limit prefactors of energy, force and virial, respectively. If one does not want to train with virial, then he/she may set the virial prefactors {ref}`start_pref_v ` and {ref}`limit_pref_v ` to 0. 
+ +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-fitting-dos.md b/doc/model/train-fitting-dos.md index fb4a3677e5..7e7ff73ab6 100644 --- a/doc/model/train-fitting-dos.md +++ b/doc/model/train-fitting-dos.md @@ -85,19 +85,15 @@ The training command is the same as `ener` mode, i.e. 
::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash dp --tf train input.json ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt train input.json ``` - ::: :::: @@ -136,25 +132,21 @@ In this earlier version, we can use `dp test` to infer the electronic density of ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash dp --tf freeze -o frozen_model.pb dp --tf test -m frozen_model.pb -s ../data/111/$k -d ${output_prefix} -a -n 100 ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt freeze -o frozen_model.pth dp --pt test -m frozen_model.pth -s ../data/111/$k -d ${output_prefix} -a -n 100 ``` - ::: :::: diff --git a/doc/model/train-fitting-property.md b/doc/model/train-fitting-property.md index be1b63bf6f..2b76913cae 100644 --- a/doc/model/train-fitting-property.md +++ b/doc/model/train-fitting-property.md @@ -121,11 +121,9 @@ The training command is the same as `ener` mode, i.e. ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt train input.json ``` - ::: :::: @@ -155,14 +153,12 @@ We can use `dp test` to infer the properties for given frames. ::::{tab-set} :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt freeze -o frozen_model.pth dp --pt test -m frozen_model.pth -s ../data/data_0/ -d ${output_prefix} -n 100 ``` - ::: :::: @@ -185,10 +181,10 @@ for `*.property.out.*`, it contains matrix with shape of `(2, task_dim)`, ## Data Normalization When `fitting_net/type` is `ener`, the energy bias layer “$e_{bias}$” adds a constant bias to the atomic energy contribution according to the atomic number.i.e., -$$e_{bias} (Z_i) (MLP(D_i))= MLP(D_i) + e_{bias} (Z_i)$$ +$$e_{bias} (Z_i) (MLP(D_i))= MLP(D_i) + e_{bias} (Z_i)$$ But when `fitting_net/type` is `property`. The property bias layer is used to normalize the property output of the model.i.e., -$$p_{bias} (MLP(D_i))= MLP(D_i) * std+ mean$$ +$$p_{bias} (MLP(D_i))= MLP(D_i) * std + mean$$ 1. 
`std`: The standard deviation of the property label -2. `mean`: The average value of the property label +1. `mean`: The average value of the property label diff --git a/doc/model/train-fitting-tensor.md b/doc/model/train-fitting-tensor.md index 29c95b2d68..d3dc1efdfd 100644 --- a/doc/model/train-fitting-tensor.md +++ b/doc/model/train-fitting-tensor.md @@ -9,21 +9,17 @@ Unlike `energy`, which is a scalar, one may want to fit some high dimensional ph ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash $deepmd_source_dir/examples/water_tensor/dipole/dipole_input.json $deepmd_source_dir/examples/water_tensor/polar/polar_input.json ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash $deepmd_source_dir/examples/water_tensor/dipole/dipole_input_torch.json $deepmd_source_dir/examples/water_tensor/polar/polar_input_torch.json ``` - ::: :::: @@ -69,8 +65,6 @@ The total tensor $\boldsymbol{T}$ (total dipole $\boldsymbol{T}^{(1)}$ or total The tensorial models can be used to calculate IR spectrum and Raman spectrum.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## The fitting Network The {ref}`fitting_net ` section tells DP which fitting net to use. 
@@ -78,7 +72,6 @@ The {ref}`fitting_net ` section tells DP which fitt ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - The JSON of `dipole` type should be provided like ```json @@ -106,11 +99,9 @@ The JSON of `polar` type should be provided like - `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see [here](train-se-e2-a.md). - `sel_type` is a list specifying which type of atoms have the quantity you want to fit. For example, in the water system, `sel_type` is `[0]` since `0` represents atom `O`. If left unset, all types of atoms will be fitted. - The rest arguments have the same meaning as they do in `ener` mode. - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - The JSON of `dipole` type should be provided like ```json @@ -142,7 +133,7 @@ The JSON of `polar` type should be provided like - `type` specifies which type of fitting net should be used. It should be either `dipole` or `polar`. Note that `global_polar` mode in version 1.x is already **deprecated** and is merged into `polar`. To specify whether a system is global or atomic, please see [here](train-se-e2-a.md). - `atom_exclude_types` is a list specifying the which type of atoms have the quantity you want to set to zero. For example, in the water system, `atom_exclude_types` is `[1]` since `1` represents atom `H`. - The rest arguments have the same meaning as they do in `ener` mode. - ::: +::: :::: @@ -186,19 +177,15 @@ The training command is the same as `ener` mode, i.e. 
::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash dp train input.json ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash dp --pt train input.json ``` - ::: :::: @@ -248,3 +235,5 @@ During training, at each step when the `lcurve.out` is printed, the system used To only fit against a subset of atomic types, in the TensorFlow backend, {ref}`fitting_net/sel_type ` should be set to selected types; in other backends, {ref}`atom_exclude_types ` should be set to excluded types. The TensorFlow backend does not support {ref}`numb_fparam ` and {ref}`numb_aparam `. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-hybrid.md b/doc/model/train-hybrid.md index d565af5c9a..ca4b285c3a 100644 --- a/doc/model/train-hybrid.md +++ b/doc/model/train-hybrid.md @@ -21,8 +21,6 @@ A hybrid descriptor $\mathcal{D}^i_\text{hyb}$ concatenates multiple kinds of de The list of descriptors can be different types or the same descriptors with different parameters. 
This way, one can set the different cutoff radii for different descriptors.[^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Instructions To use the descriptor in DeePMD-kit, one firstly set the {ref}`type ` to {ref}`hybrid `, then provide the definitions of the descriptors by the items in the `list`, @@ -58,3 +56,5 @@ In other backends, each descriptor has its own type embedding and their paramete ## Model compression Model compression is supported if all sub-descriptors support model compression. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/model/train-se-a-mask.md b/doc/model/train-se-a-mask.md index ff1ee76c12..1356cdd566 100644 --- a/doc/model/train-se-a-mask.md +++ b/doc/model/train-se-a-mask.md @@ -68,7 +68,7 @@ To make the `aparam.npy` used for descriptor `se_a_mask`, two variables in `fitt - {ref}`use_aparam_as_mask ` is set to `true` to use the `aparam.npy` as the mask of the atoms in the descriptor `se_a_mask`. Finally, to make a reasonable fitting task with `se_a_mask` descriptor for DP/MM simulations, the loss function with `se_a_mask` is designed to include the atomic forces difference in specific atoms of the input particles only. -More details about the selection of the specific atoms can be found in paper [DP/MM](left to be filled). +More details about the selection of the specific atoms can be found in paper \[DP/MM\](left to be filled). Thus, `atom_pref.npy` ( [ nframes * natoms ] ) is required as the indicator of the specific atoms in the input particles. And the `loss` section in the training input script should be set as follows. diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md index 2e0c236cf6..e504207ac2 100644 --- a/doc/model/train-se-atten.md +++ b/doc/model/train-se-atten.md @@ -64,8 +64,6 @@ Then layer normalization is added in a residual way to finally obtain the self-a \mathcal{G}^{i,l} = \mathcal{G}^{i,l-1} + \mathrm{LayerNorm}(A(\mathcal{Q}^{i,l}, \mathcal{K}^{i,l}, \mathcal{V}^{i,l}, \mathcal{R}^{i,l})). 
``` -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ## Descriptor `"se_atten"` Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found in `examples/water/se_atten/input.json`. @@ -197,3 +195,5 @@ Model compression is supported for any {ref}`attn_layer out_file value = filename filename = The file name for the model deviation output. Default is model_devi.out @@ -91,7 +92,7 @@ compute 1 all ke/atom ### Description -Evaluate the interaction of the system by using [Deep Potential][DP] or [Deep Potential Smooth Edition][DP-SE]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. +Evaluate the interaction of the system by using [Deep Potential][dp] or [Deep Potential Smooth Edition][dp-se]. It is noticed that deep potential is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep potential defined in a model file that usually has .pb/.pth/.savedmodel extensions. 
The model can be trained and frozen from multiple backends by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit), which can have either double or single float precision interface. @@ -180,7 +181,7 @@ compute 1 all ke/atom ### Description -Evaluate the interaction of the system with spin by using [DeepSPIN][DPSPIN] models. It is noticed that deep spin model is not a "pairwise" interaction, but a multi-body interaction. +Evaluate the interaction of the system with spin by using [DeepSPIN][dpspin] models. It is noticed that deep spin model is not a "pairwise" interaction, but a multi-body interaction. This pair style takes the deep spin model defined in a model file that usually has .pb/.pth/.savedmodel extensions. The model can be trained and frozen from multiple backends by package [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit), which can have either double or single float precision interface. @@ -304,10 +305,6 @@ compute flux all heat/flux ke pe stress If you use these features please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, R. Car, S. 
Baroni - arXiv preprint arXiv:2108.10850, 2021](https://arxiv.org/abs/2108.10850) -[DP]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 -[DP-SE]: https://dl.acm.org/doi/10.5555/3327345.3327356 -[DPSPIN]: https://doi.org/10.1103/PhysRevB.110.064427 - ### D3 dispersion :::{note} @@ -322,3 +319,7 @@ pair_style hybrid/overlay deepmd water.pb dispersion/d3 original pbe0 30.0 20.0 pair_coeff * * deepmd O H pair_coeff * * dispersion/d3 O H ``` + +[dp]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 +[dp-se]: https://dl.acm.org/doi/10.5555/3327345.3327356 +[dpspin]: https://doi.org/10.1103/PhysRevB.110.064427 diff --git a/doc/train/finetuning.md b/doc/train/finetuning.md index 78a7b8a71e..563aa76e08 100644 --- a/doc/train/finetuning.md +++ b/doc/train/finetuning.md @@ -111,7 +111,6 @@ One can check the available model branches in multi-task pre-trained model by re ```bash $ dp --pt show multitask_pretrained.pt model-branch ``` - ::: This command will start fine-tuning based on the pre-trained model's descriptor and the selected branch's fitting net. @@ -131,9 +130,9 @@ Then, prepare a suitable input script for multitask fine-tuning `multi_input.jso 1. Refer to the [`multi-task-training`](./multi-task-training) document to prepare a multitask training script for two systems, ideally extracting parts (i.e. {ref}`model_dict `, {ref}`loss_dict `, {ref}`data_dict ` and {ref}`model_prob ` parts) corresponding to `PRE_DATA1` and `PRE_DATA2` directly from the training script of the pre-trained model. -2. For `DOWNSTREAM_DATA`, select a desired branch to fine-tune from (e.g., `PRE_DATA1`), copy the configurations of `PRE_DATA1` as the configuration for `DOWNSTREAM_DATA` and insert the corresponding data path into the {ref}`data_dict `, +1. 
For `DOWNSTREAM_DATA`, select a desired branch to fine-tune from (e.g., `PRE_DATA1`), copy the configurations of `PRE_DATA1` as the configuration for `DOWNSTREAM_DATA` and insert the corresponding data path into the {ref}`data_dict `, thereby generating a three-system multitask training script. -3. In the {ref}`model_dict ` for `DOWNSTREAM_DATA`, specify the branch from which `DOWNSTREAM_DATA` is to fine-tune using: +1. In the {ref}`model_dict ` for `DOWNSTREAM_DATA`, specify the branch from which `DOWNSTREAM_DATA` is to fine-tune using: `"finetune_head": "PRE_DATA1"`. The complete `multi_input.json` should appear as follows ("..." means copied from input script of pre-trained model): @@ -257,7 +256,6 @@ One can check the available model branches in multi-task pre-trained model by re ```bash $ dp --pd show multitask_pretrained.pd model-branch ``` - ::: This command will start fine-tuning based on the pre-trained model's descriptor and the selected branch's fitting net. diff --git a/doc/train/gpu-limitations.md b/doc/train/gpu-limitations.md index 44c9697dd4..eb4f2bc373 100644 --- a/doc/train/gpu-limitations.md +++ b/doc/train/gpu-limitations.md @@ -3,7 +3,7 @@ If you use DeePMD-kit in a GPU environment, the acceptable value range of some variables is additionally restricted compared to the CPU environment due to the software's GPU implementations: 1. The number of atom types of a given system must be less than 128. -2. The maximum distance between an atom and its neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. -3. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is limited by the GPU memory size currently, usually within 1000,000 atoms even in the model compression mode. -4. The total sel value of training parameters(in `model[standard]/descriptor` section) must be less than 4096. -5. 
The size of the last layer of the embedding net must be less than 1024 during the model compression process. +1. The maximum distance between an atom and its neighbors must be less than 128. It can be controlled by setting the rcut value of training parameters. +1. Theoretically, the maximum number of atoms that a single GPU can accept is about 10,000,000. However, this value is limited by the GPU memory size currently, usually within 1000,000 atoms even in the model compression mode. +1. The total sel value of training parameters(in `model[standard]/descriptor` section) must be less than 4096. +1. The size of the last layer of the embedding net must be less than 1024 during the model compression process. diff --git a/doc/train/multi-task-training.md b/doc/train/multi-task-training.md index 115c463cc2..72f02e9e62 100644 --- a/doc/train/multi-task-training.md +++ b/doc/train/multi-task-training.md @@ -26,8 +26,6 @@ and the Adam optimizer is executed to minimize $L^{(t)}$ for one step to update In the case of multi-GPU parallel training, different GPUs will independently select their tasks. In the DPA-2 model, this multi-task training framework is adopted.[^1] -[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). 
- Compared with the previous TensorFlow implementation, the new support in PyTorch is more flexible and efficient. In particular, it makes multi-GPU parallel training and even tasks beyond DFT possible, enabling larger-scale and more general multi-task training to obtain more general pre-trained models. @@ -48,6 +46,7 @@ Specifically, there are several parts that need to be modified: - {ref}`model/model_dict `: The core definition of the model part and the explanation of sharing rules, starting with user-defined model name keys `model_key`, such as `my_model_1`. Each model part needs to align with the components of the single-task training {ref}`model `, but with the following sharing rules: + - If you want to share the current model component with other tasks, which should be part of the {ref}`model/shared_dict `, you can directly fill in the corresponding `part_key`, such as `"descriptor": "my_descriptor", ` @@ -63,7 +62,7 @@ Specifically, there are several parts that need to be modified: - For fitting nets, we only support the default `shared_level`=0, where all parameters will be shared except for `bias_atom_e` and `case_embd`. - To conduct multitask training, there are two typical approaches: 1. **Descriptor sharing only**: Share the descriptor with `shared_level`=0. See [here](../../examples/water_multi_task/pytorch_example/input_torch.json) for an example. - 2. **Descriptor and fitting network sharing with data identification**: + 1. **Descriptor and fitting network sharing with data identification**: - Share the descriptor and the fitting network with `shared_level`=0. - {ref}`dim_case_embd ` must be set to the number of model branches, which will distinguish different data tasks using a one-hot embedding. - See [here](../../examples/water_multi_task/pytorch_example/input_torch_sharefit.json) for an example. 
@@ -84,8 +83,10 @@ Specifically, there are several parts that need to be modified:
An example input for multi-task training two models in water system is shown as following:
```{literalinclude} ../../examples/water_multi_task/pytorch_example/input_torch.json
-:language: json
-:linenos:
+---
+language: json
+linenos:
+---
```
## Finetune from the pre-trained multi-task model
@@ -101,6 +102,8 @@ Details of some parameters that are the same as [the regular parameters](./train
```{eval-rst}
.. dargs::
-   :module: deepmd.utils.argcheck
-   :func: gen_args_multi_task
+   :module: deepmd.utils.argcheck
+   :func: gen_args_multi_task
```
+
+[^1]: Duo Zhang, Xinzijian Liu, Xiangyu Zhang, Chengqian Zhang, Chun Cai, Hangrui Bi, Yiming Du, Xuejian Qin, Anyang Peng, Jiameng Huang, Bowen Li, Yifan Shan, Jinzhe Zeng, Yuzhi Zhang, Siyuan Liu, Yifan Li, Junhan Chang, Xinyan Wang, Shuo Zhou, Jianchuan Liu, Xiaoshan Luo, Zhenyu Wang, Wanrun Jiang, Jing Wu, Yudi Yang, Jiyuan Yang, Manyi Yang, Fu-Qiang Gong, Linshuang Zhang, Mengchao Shi, Fu-Zhi Dai, Darrin M. York, Shi Liu, Tong Zhu, Zhicheng Zhong, Jian Lv, Jun Cheng, Weile Jia, Mohan Chen, Guolin Ke, Weinan E, Linfeng Zhang, Han Wang, DPA-2: a large atomic model as a multi-task learner. npj Comput Mater 10, 293 (2024). [DOI: 10.1038/s41524-024-01493-2](https://doi.org/10.1038/s41524-024-01493-2) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/).
diff --git a/doc/train/parallel-training.md b/doc/train/parallel-training.md
index 998f1c3bec..0f5af03460 100644
--- a/doc/train/parallel-training.md
+++ b/doc/train/parallel-training.md
@@ -105,9 +105,10 @@ We utilize the PyTorch framework and have designed and implemented a multiproces
First, we establish a DeepmdData class for each system, which is consistent with the TensorFlow version in this level. Then, we create a dataloader for each system, resulting in the same number of dataloaders as the number of systems.
Next, we create a dataset for the dataloaders obtained in the previous step. This allows us to query the data for each system through this dataset, while the iteration pointers for each system are maintained by their respective dataloaders. Finally, a dataloader is created for the outermost dataset.
-We achieve custom sampling methods using a weighted sampler. The length of the sampler is set to total_batch_num \* num_workers.The parameter "num_workers" defines the number of threads involved in multi-threaded loading, which can be modified by setting the environment variable NUM_WORKERS (default: min(8, ncpus)).
+We achieve custom sampling methods using a weighted sampler. The length of the sampler is set to total_batch_num * num_workers. The parameter "num_workers" defines the number of threads involved in multi-threaded loading, which can be modified by setting the environment variable NUM_WORKERS (default: min(8, ncpus)).
-> **Note** The underlying dataloader will use a distributed sampler to ensure that each GPU receives batches with different content in parallel mode, which will use sequential sampler in serial mode. In the TensorFlow version, Horovod shuffles the dataset using different random seeds for the same purpose..
+> [!NOTE]
+> The underlying dataloader will use a distributed sampler to ensure that each GPU receives batches with different content in parallel mode, which will use sequential sampler in serial mode. In the TensorFlow version, Horovod shuffles the dataset using different random seeds for the same purpose.
```mermaid
flowchart LR
@@ -183,9 +184,11 @@ torchrun --rdzv_endpoint=node0:12321 --nnodes=2 --nproc_per_node=4 --node_rank=0
torchrun --rdzv_endpoint=node0:12321 --nnodes=2 --nproc_per_node=4 --node_rank=1 --no_python dp --pt train tests/water/se_e2_a.json
```
-> **Note** Set environment variables to tune [CPU specific optimizations](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) in advance.
+> [!NOTE] +> Set environment variables to tune [CPU specific optimizations](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html#cpu-specific-optimizations) in advance. -> **Note** for developers: `torchrun` by default passes settings as environment variables [(list here)](https://pytorch.org/docs/stable/elastic/run.html#environment-variables). +> [!NOTE] +> for developers: `torchrun` by default passes settings as environment variables [(list here)](https://pytorch.org/docs/stable/elastic/run.html#environment-variables). > To check forward, backward, and communication time, please set env var `TORCH_CPP_LOG_LEVEL=INFO TORCH_DISTRIBUTED_DEBUG=DETAIL`. More details can be found [here](https://pytorch.org/docs/stable/distributed.html#logging). @@ -234,8 +237,6 @@ mpirun run_pp.sh ``` :::{note} - If `NUM_WORKERS` is too large, it may cause the program to be terminated by the system; if it is too small, it may slow down data reading. You can try adjusting it to an appropriate size. - ::: diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index af4b4b31d9..e60312a8b0 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -21,8 +21,6 @@ where $\tau \in \mathbb{N}$ is the index of the training step, $\gamma^0 \in \m where $\tau^{\text{stop}} \in \mathbb{N}$, $\gamma^{\text{stop}} \in \mathbb{R}$, and $s \in \mathbb{N}$ are the stopping step, the stopping learning rate, and the decay steps, respectively, all of which are hyperparameters provided in advance. [^1] -[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. 
Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). - ### Instructions The {ref}`learning_rate ` section in `input.json` is given as follows @@ -38,7 +36,9 @@ The {ref}`learning_rate ` section in `input.json` is given as fol ``` - {ref}`start_lr ` gives the learning rate at the beginning of the training. + - {ref}`stop_lr ` gives the learning rate at the end of the training. It should be small enough to ensure that the network parameters satisfactorily converge. + - During the training, the learning rate decays exponentially from {ref}`start_lr ` to {ref}`stop_lr ` following the formula: ``` @@ -182,3 +182,5 @@ dp freeze -o frozen_model_adjusted_sel.pb Two models should give the same result when the input satisfies both constraints. Note: At this time, this feature is only supported by [`se_e2_a`](../model/train-se-e2-a.md) descriptor with [`set_davg_true`](./train-input.rst) enabled, or `hybrid` composed of the above descriptors. + +[^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 
159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/). diff --git a/doc/train/training.md b/doc/train/training.md index 6ccb43bbd7..7724c9951e 100644 --- a/doc/train/training.md +++ b/doc/train/training.md @@ -11,23 +11,18 @@ After switching to that directory, the training can be invoked by ::::{tab-set} :::{tab-item} TensorFlow {{ tensorflow_icon }} - ```bash $ dp --tf train input.json ``` - ::: :::{tab-item} PyTorch {{ pytorch_icon }} - ```bash $ dp --pt train input.json ``` - ::: :::{tab-item} Paddle {{ paddle_icon }} - ```bash # training model $ dp --pd train input.json @@ -39,7 +34,6 @@ $ CINN=1 dp --pd train input.json ## If the shape(s) of batch input data are fixed during training, e.g., examples/water. $ CINN=1 CINN_ALLOW_DYNAMIC_SHAPE=0 dp --pd train input.json ``` - ::: :::: @@ -69,12 +63,12 @@ During the training, the error of the model is tested every {ref}`disp_freq