From 9e91fb76d776ea4c97a604ae061f8a33b7cd9dde Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Thu, 5 Feb 2026 14:36:57 -0500
Subject: [PATCH 01/22] Set-up preprocessor macro for defining RKB ERIs

---
 CMakeLists.txt                    | 203 ++++++++++++++++--------------
 cmake/modules/int_am.cmake        |   3 +-
 include/libint2/config.h.cmake.in |  19 +++
 include/libint2/cxxapi.h          |   2 +-
 src/bin/libint/build_libint.cc    |   5 +
 5 files changed, 134 insertions(+), 98 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d767bdcc..5b88cf73b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,28 +11,28 @@ endif ()
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules)
 include(DynamicVersion)
 dynamic_version(
-  PROJECT_PREFIX Libint2Compiler_
-  GIT_ARCHIVAL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/.git_archival.txt
-  VERSION_FULL_MODE POST
-  OUTPUT_COMMIT LibintRepository_COMMIT
-  OUTPUT_VERSION LibintRepository_VERSION
-  OUTPUT_DESCRIBE LibintRepository_DESCRIBE
-  OUTPUT_DISTANCE LibintRepository_DISTANCE
-  OUTPUT_SHORT_HASH LibintRepository_SHORT_HASH
-  OUTPUT_VERSION_FULL LibintRepository_VERSION_FULL
-  )
+        PROJECT_PREFIX Libint2Compiler_
+        GIT_ARCHIVAL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/.git_archival.txt
+        VERSION_FULL_MODE POST
+        OUTPUT_COMMIT LibintRepository_COMMIT
+        OUTPUT_VERSION LibintRepository_VERSION
+        OUTPUT_DESCRIBE LibintRepository_DESCRIBE
+        OUTPUT_DISTANCE LibintRepository_DISTANCE
+        OUTPUT_SHORT_HASH LibintRepository_SHORT_HASH
+        OUTPUT_VERSION_FULL LibintRepository_VERSION_FULL
+)
 
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build.")  # foil Ninja Debug on Windows
 
 project(
-  Libint2Compiler
-  VERSION ${LibintRepository_VERSION}
-  DESCRIPTION
-    "A library for the evaluation of molecular integrals of many-body operators over Gaussian functions"
-  HOMEPAGE_URL "http://libint.valeyev.net"
-  LANGUAGES CXX
-  )
-  # * http://libint.valeyev.net/ redirects to https://github.com/evaleev/libint
+        Libint2Compiler
+        VERSION ${LibintRepository_VERSION}
+        DESCRIPTION
+        "A library for the evaluation of molecular integrals of many-body operators over Gaussian functions"
+        HOMEPAGE_URL "http://libint.valeyev.net"
+        LANGUAGES CXX
+)
+# * http://libint.valeyev.net/ redirects to https://github.com/evaleev/libint
 
 set(${PROJECT_NAME}_AUTHORS "Edward F. Valeev")
 set(${PROJECT_NAME}_LICENSE "GPL-3.0-only for generator; LGPL-3.0-only for generated library")
@@ -109,64 +109,67 @@ option_with_default(CMAKE_BUILD_TYPE "Build type (Release or Debug)" Release)
 
 ### compiler-only
 option_with_print(LIBINT2_BUILD_LIBRARY_AS_SUBPROJECT
-  "[EXPERT] Build generated library as a subproject: if FALSE will configure and build separately" OFF)
+        "[EXPERT] Build generated library as a subproject: if FALSE will configure and build separately" OFF)
 
 ### library-only
 option_with_print(LIBINT2_REQUIRE_CXX_API
-  "C++11 Libint API: define library targets + test (requires Eigen3, Boost is optional but strongly recommended)" ON)
+        "C++11 Libint API: define library targets + test (requires Eigen3, Boost is optional but strongly recommended)" ON)
 option_with_print(LIBINT2_REQUIRE_CXX_API_COMPILED
-  "Build C++11 Compiled (not just header-only) targets (requires Eigen3, Boost strongly recommended)" ON)
+        "Build C++11 Compiled (not just header-only) targets (requires Eigen3, Boost strongly recommended)" ON)
 option_with_print(LIBINT2_ENABLE_FORTRAN
-  "Build Fortran03+ Libint interface (requires Fortran)" OFF)
+        "Build Fortran03+ Libint interface (requires Fortran)" OFF)
 option_with_print(LIBINT2_ENABLE_PYTHON
-  "Build Python bindings (requires Python and Pybind11 and Eigen3)" OFF)
+        "Build Python bindings (requires Python and Pybind11 and Eigen3)" OFF)
 option_with_print(LIBINT2_PREFIX_PYTHON_INSTALL
-  "For LIBINT2_ENABLE_PYTHON=ON, whether to install the Python module in the Linux manner to CMAKE_INSTALL_PREFIX or to not install it. See target libint2-python-wheel for alternate installation in the Python manner to Python_EXECUTABLE's site-packages." OFF)
+        "For LIBINT2_ENABLE_PYTHON=ON, whether to install the Python module in the Linux manner to CMAKE_INSTALL_PREFIX or to not install it. See target libint2-python-wheel for alternate installation in the Python manner to Python_EXECUTABLE's site-packages." OFF)
 option_with_print(BUILD_SHARED_LIBS
-  "Build Libint library as shared, not static" OFF)
+        "Build Libint library as shared, not static" OFF)
 option_with_print(LIBINT2_BUILD_SHARED_AND_STATIC_LIBS
-  "Build both shared and static Libint libraries in one shot. Uses -fPIC." OFF)
+        "Build both shared and static Libint libraries in one shot. Uses -fPIC." OFF)
 option_with_print(LIBINT2_ENABLE_MPFR
-  "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
+        "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
 
 #  <<<  Which Integrals Classes, Which Derivative Levels  >>>
 
 option_with_default(LIBINT2_ENABLE_ONEBODY
-  "Compile with support for up to N-th derivatives of 1-body integrals (-1 for OFF)" 0)
+        "Compile with support for up to N-th derivatives of 1-body integrals (-1 for OFF)" 0)
 option_with_default(LIBINT2_ENABLE_ERI
-  "Compile with support for up to N-th derivatives of 4-center electron repulsion integrals (-1 for OFF)" 0)
+        "Compile with support for up to N-th derivatives of 4-center electron repulsion integrals (-1 for OFF)" 0)
 option_with_default(LIBINT2_ENABLE_ERI3
-  "Compile with support for up to N-th derivatives of 3-center electron repulsion integrals (-1 for OFF)" -1)
+        "Compile with support for up to N-th derivatives of 3-center electron repulsion integrals (-1 for OFF)" -1)
 option_with_default(LIBINT2_ENABLE_ERI2
-  "Compile with support for up to N-th derivatives of 2-center electron repulsion integrals (-1 for OFF)" -1)
+        "Compile with support for up to N-th derivatives of 2-center electron repulsion integrals (-1 for OFF)" -1)
+option_with_default(LIBINT2_ENABLE_RKB_ERI
+        "Compile with support for up to N-th derivatives of relativistic restricted kinetic
+         balance (RKB) 4-center electron repulsion integrals (-1 for OFF)" 0)
 option_with_default(LIBINT2_ENABLE_G12
-  "Compile with support for N-th derivatives of MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
+        "Compile with support for N-th derivatives of MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
 option_with_default(LIBINT2_ENABLE_G12DKH
-  "Compile with support for N-th derivatives of DKH-MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
+        "Compile with support for N-th derivatives of DKH-MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
 
 option_with_print(LIBINT2_DISABLE_ONEBODY_PROPERTY_DERIVS
-  "Disable geometric derivatives of 1-body property integrals (all but overlap, kinetic, elecpot).
+        "Disable geometric derivatives of 1-body property integrals (all but overlap, kinetic, elecpot).
    These derivatives are disabled by default to save compile time. (enable with OFF)
    Note that the libtool build won't enable this- if forcibly enabled, build_libint balks." ON)
 option_with_print(LIBINT2_ENABLE_T1G12
-  "Enable [Ti,G12] integrals when G12 integrals are enabled. Irrelevant when `LIBINT2_ENABLE_G12=OFF`. (disable with OFF)" ON)
+        "Enable [Ti,G12] integrals when G12 integrals are enabled. Irrelevant when `LIBINT2_ENABLE_G12=OFF`. (disable with OFF)" ON)
 
 #  <<<  Ordering Conventions  >>>
 
 option_with_default(LIBINT2_SHGAUSS_ORDERING
-  "Ordering for shells of solid harmonic Gaussians:
+        "Ordering for shells of solid harmonic Gaussians:
     standard -- standard ordering (-l, -l+1 ... l)
     gaussian -- the Gaussian ordering (0, 1, -1, 2, -2, ... l, -l)
    See https://github.com/evaleev/libint/blob/master/INSTALL.md#solid-harmonic-ordering-scope-and-history ." standard)
 option_with_default(LIBINT2_CARTGAUSS_ORDERING
-  "Orderings for shells of cartesian Gaussians:
+        "Orderings for shells of cartesian Gaussians:
     standard -- standard ordering (xxx, xxy, xxz, xyy, xyz, xzz, yyy, ...)
     intv3  -- intv3 ordering (yyy, yyz, yzz, zzz, xyy, xyz, xzz, xxy, xxz, xxx)
     gamess -- GAMESS ordering (xxx, yyy, zzz, xxy, xxz, yyx, yyz, zzx, zzy, xyz)
     orca -- ORCA ordering (hydrid between GAMESS and standard)
     bagel -- axis-permuted version of intv3 (xxx, xxy, xyy, yyy, xxz, xyz, yyz, xzz, yzz, zzz)" standard)
 option_with_default(LIBINT2_SHELL_SET
-  "Support computation of shell sets sets subject to these restrictions:
+        "Support computation of shell sets sets subject to these restrictions:
     standard -- standard ordering:
       for (ab|cd):
         l(a) >= l(b),
@@ -195,99 +198,107 @@ option_with_default(LIBINT2_SHELL_SET
 #    `export CMAKE_BUILD_PARALLEL_LEVEL=N`.
 
 option_with_default(LIBINT2_MAX_AM
-  "Support Gaussians of angular momentum up to N.
+        "Support Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string.
    If ERI3 ints are enabled, this option also controls the AM of the paired centers." 4)
 option_with_default(LIBINT2_OPT_AM
-  "Optimize maximally for up to angular momentum N (N <= max-am).
+        "Optimize maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string. (default: (libint_max_am/2)+1)" -1)
 
 option_with_default(LIBINT2_MULTIPOLE_MAX_ORDER
-  "Maximum order of spherical multipole integrals. There is no maximum" 4)
+        "Maximum order of spherical multipole integrals. There is no maximum" 4)
 option_with_default(LIBINT2_ONEBODY_MAX_AM
-  "Support 1-body ints for Gaussians of angular momentum up to N.
+        "Support 1-body ints for Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
 option_with_default(LIBINT2_ONEBODY_OPT_AM
-  "Optimize 1-body ints maximally for up to angular momentum N (N <= max-am).
+        "Optimize 1-body ints maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
 
 option_with_default(LIBINT2_ERI_MAX_AM
-  "Support 4-center ERIs for Gaussians of angular momentum up to N.
+        "Support 4-center ERIs for Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
 option_with_default(LIBINT2_ERI_OPT_AM
-  "Optimize 4-center ERIs maximally for up to angular momentum N (N <= max-am).
+        "Optimize 4-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
 
+option_with_default(LIBINT2_RKB_ERI_MAX_AM
+        "Support relativistic restricted kinetic balance (RKB) 4-center ERIs for Gaussians of angular momentum up to N.
+   Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
+option_with_default(LIBINT2_RKB_ERI_OPT_AM
+        "Optimize relativistic restricted kinetic balance (RKB) 4-center ERIs maximally for up to angular momentum N (N <= max-am).
+   Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
+
+
 option_with_default(LIBINT2_ERI3_MAX_AM
-  "Support 3-center ERIs for Gaussians of angular momentum up to N.
+        "Support 3-center ERIs for Gaussians of angular momentum up to N.
    Can specify values for each derivative level as a semicolon-separated string. (default: max_am)
    This option controls only the single fitting center. The paired centers use LIBINT2_MAX_AM." -1)
 option_with_default(LIBINT2_ERI3_OPT_AM
-  "Optimize 3-center ERIs maximally for up to angular momentum N (N <= max-am).
+        "Optimize 3-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string. (default: (max_am/2)+1)" -1)
 option_with_print(LIBINT2_ERI3_PURE_SH
-  "Assume the 'unpaired' center of 3-center ERIs will be transformed to pure solid harmonics" OFF)
+        "Assume the 'unpaired' center of 3-center ERIs will be transformed to pure solid harmonics" OFF)
 
 option_with_default(LIBINT2_ERI2_MAX_AM
-  "Support 2-center ERIs for Gaussians of angular momentum up to N.
+        "Support 2-center ERIs for Gaussians of angular momentum up to N.
     Can specify values for each derivative level as a semicolon-separated string. (default: max_am)" -1)
 option_with_default(LIBINT2_ERI2_OPT_AM
-  "Optimize 2-center ERIs maximally for up to angular momentum N (N <= max-am).
+        "Optimize 2-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string. (default: (max_am/2)+1)" -1)
 option_with_print(LIBINT2_ERI2_PURE_SH
-  "Assume the 2-center ERIs will be transformed to pure solid harmonics" OFF)
+        "Assume the 2-center ERIs will be transformed to pure solid harmonics" OFF)
 
 option_with_default(LIBINT2_G12_MAX_AM
-  "Support integrals for G12 methods of angular momentum up to N. (default: max_am)" -1)
+        "Support integrals for G12 methods of angular momentum up to N. (default: max_am)" -1)
 option_with_default(LIBINT2_G12_OPT_AM
-  "Optimize G12 integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
+        "Optimize G12 integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
 
 option_with_default(LIBINT2_G12DKH_MAX_AM
-  "Support integrals for relativistic G12 methods of angular momentum up to N. (default: max_am)" -1)
+        "Support integrals for relativistic G12 methods of angular momentum up to N. (default: max_am)" -1)
 option_with_default(LIBINT2_G12DKH_OPT_AM
-  "Optimize G12DKH integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
+        "Optimize G12DKH integrals for up to angular momentum N (N <= max-am). (default: (max_am/2)+1)" -1)
 
 #  <<<  Miscellaneous  >>>
 
 option_with_print(LIBINT2_CONTRACTED_INTS
-  "Turn on support for contracted integrals." ON)
+        "Turn on support for contracted integrals." ON)
 option_with_default(LIBINT2_ERI_STRATEGY
-  "(EXPERT) Compute ERIs using the following strategy. (0 for OS, 1 for HGP, 2 for HL)" 1)
+        "(EXPERT) Compute ERIs using the following strategy. (0 for OS, 1 for HGP, 2 for HL)" 1)
 option_with_print(LIBINT2_USE_COMPOSITE_EVALUATORS
-  "Libint will use composite evaluators (i.e. every evaluator will compute one integral type only)" ON)
+        "Libint will use composite evaluators (i.e. every evaluator will compute one integral type only)" ON)
 option_with_print(LIBINT2_SINGLE_EVALTYPE
-  "Generate single evaluator type (i.e. all tasks use the same evaluator). OFF is NYI" ON)
+        "Generate single evaluator type (i.e. all tasks use the same evaluator). OFF is NYI" ON)
 option_with_default(LIBINT2_ENABLE_UNROLLING
-  "Unroll shell sets into integrals (will unroll shell sets larger than N) (0 for never, N for N, 1000000000 for always)" 100)
+        "Unroll shell sets into integrals (will unroll shell sets larger than N) (0 for never, N for N, 1000000000 for always)" 100)
 option_with_default(LIBINT2_ALIGN_SIZE
-  "(EXPERT) if posix_memalign is available, this will specify alignment of Libint data, in units of
+        "(EXPERT) if posix_memalign is available, this will specify alignment of Libint data, in units of
    sizeof(LIBINT2_REALTYPE). Default is to use built-in heuristics: system-determined for vectorization off (default) or veclen * sizeof(LIBINT2_REALTYPE) for vectorization on." 0)
 mark_as_advanced(LIBINT2_ALIGN_SIZE)
 option_with_default(LIBINT2_REALTYPE
-  "Specifies the floating-point data type used by the library. Consumed at library build-time." double)
+        "Specifies the floating-point data type used by the library. Consumed at library build-time." double)
 option_with_print(LIBINT2_USER_DEFINED_REAL_INCLUDES
-  "Additional #includes necessary to use the real type." OFF)
+        "Additional #includes necessary to use the real type." OFF)
 include(int_userreal)
 option_with_print(LIBINT2_GENERATE_FMA
-  "Generate FMA (fused multiply-add) instructions (to benefit must have FMA-capable hardware and compiler)" OFF)
+        "Generate FMA (fused multiply-add) instructions (to benefit must have FMA-capable hardware and compiler)" OFF)
 option_with_print(LIBINT2_ENABLE_GENERIC_CODE
-  "Use manually-written generic code" OFF)
+        "Use manually-written generic code" OFF)
 option_with_print(LIBINT2_API_PREFIX
-  "Prepend this string to every name in the library API (except for the types)." OFF)
+        "Prepend this string to every name in the library API (except for the types)." OFF)
 option_with_print(LIBINT2_VECTOR_LENGTH
-  "Compute integrals in vectors of length N." OFF)
+        "Compute integrals in vectors of length N." OFF)
 option_with_default(LIBINT2_VECTOR_METHOD
-  "Specifies how to vectorize integrals. Irrelevant when `LIBINT2_VECTOR_LENGTH=OFF. Allowed values are 'block' (default), and 'line'." block)
+        "Specifies how to vectorize integrals. Irrelevant when `LIBINT2_VECTOR_LENGTH=OFF. Allowed values are 'block' (default), and 'line'." block)
 option_with_print(LIBINT2_ACCUM_INTS
-  "Accumulate integrals to the buffer, rather than copy (OFF for copy, ON for accum)." OFF)
+        "Accumulate integrals to the buffer, rather than copy (OFF for copy, ON for accum)." OFF)
 option_with_print(LIBINT2_FLOP_COUNT
-  "Support (approximate) FLOP counting by the library. (Generated code will require C++11!)" OFF)
+        "Support (approximate) FLOP counting by the library. (Generated code will require C++11!)" OFF)
 option_with_print(LIBINT2_PROFILE
-  "Turn on profiling instrumentation of the library. (Generated code will require C++11!)" OFF)
+        "Turn on profiling instrumentation of the library. (Generated code will require C++11!)" OFF)
 option_with_print(LIBINT2_ENABLE_MPFR
-  "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
+        "Use GNU MPFR library for high-precision testing (EXPERTS ONLY). Consumed at library build-time." OFF)
 option_with_default(LIBINT2_EXPORT_COMPRESSOR
-  "Export tarball with compression gzip or bzip2" gzip)
+        "Export tarball with compression gzip or bzip2" gzip)
 # next one defined by `include(CTest)`
 message(STATUS "Showing option BUILD_TESTING: ${BUILD_TESTING}")
 
@@ -304,13 +315,13 @@ include(int_am)
 check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
 if (NOT HAVE_POSIX_MEMALIGN)
     message(FATAL_ERROR "did not find posix_memalign ... this SHOULD NOT happen. Cannot proceed.")
-endif()
+endif ()
 
 check_include_file_cxx(stdint.h HAVE_STDINT_H)  # limits.h?
 
 if (cxx_std_11 IN_LIST CMAKE_CXX_COMPILE_FEATURES)
     set(LIBINT_HAS_CXX11 1)
-endif()
+endif ()
 
 booleanize01(LIBINT2_ERI3_PURE_SH)
 booleanize01(LIBINT2_ERI2_PURE_SH)
@@ -331,9 +342,9 @@ if (LIBINT2_EXPORT_COMPRESSOR STREQUAL "gzip")
 elseif (LIBINT2_EXPORT_COMPRESSOR STREQUAL "bzip2")
     set(LIBINT_EXPORT_COMPRESSOR_CMD "jcf")
     set(LIBINT_EXPORT_COMPRESSOR_EXT "tbz2")
-else()
+else ()
     message(FATAL_ERROR "No valid compressor; invoke CMake with -DLIBINT2_EXPORT_COMPRESSOR=gzip|bzip2")
-endif()
+endif ()
 
 
 ################################## Dependencies #################################
@@ -344,9 +355,9 @@ if (LIBINT2_ENABLE_MPFR)
     # mpfr detected in CMakeLists.txt.export at appropriate time for library, but prechecking here
     find_package(Multiprecision MODULE REQUIRED COMPONENTS gmpxx mpfr)
     set(LIBINT_HAS_MPFR 1)
-else()
+else ()
     find_package(Multiprecision MODULE REQUIRED COMPONENTS gmpxx)
-endif()
+endif ()
 
 get_property(_loc TARGET Multiprecision::gmp PROPERTY LOCATION)
 message(VERBOSE "${Cyan}Found GMP${ColourReset}: ${_loc}")
@@ -355,12 +366,12 @@ message(VERBOSE "${Cyan}Found GMPXX${ColourReset}: ${_loc}")
 if (TARGET Multiprecision::mpfr)
     get_property(_loc TARGET Multiprecision::mpfr PROPERTY LOCATION)
     message(VERBOSE "${Cyan}Found MPFR${ColourReset}: ${_loc} (found version ${MPFR_VERSION})")
-endif()
+endif ()
 
 find_package(Boost 1.57 REQUIRED)
 if (TARGET Boost::headers)
     set(LIBINT_HAS_SYSTEM_BOOST_PREPROCESSOR_VARIADICS 1)
-endif()
+endif ()
 
 # deferring find_package(Eigen3) to library (CMakeLists.txt.export)
 
@@ -370,9 +381,9 @@ endif()
 set(EXPORT_STAGE_DIR ${PROJECT_BINARY_DIR}/libint-${LIBINT_EXT_VERSION})
 
 configure_file(
-  cmake/modules/int_computed.cmake.in
-  cmake/modules/int_computed.cmake
-  @ONLY)
+        cmake/modules/int_computed.cmake.in
+        cmake/modules/int_computed.cmake
+        @ONLY)
 
 # CMake data transmitted to C++ via config.h for generator/compiler (_EXPORT_MODE=0).
 #   Same info is positioned for the library export, but _EXPORT_MODE=1 turns on
@@ -380,28 +391,28 @@ configure_file(
 #   library build time.
 set(_EXPORT_MODE 0)
 # convert user-facing LIBINT2_ variables to LIBINT_ internal variables
-foreach(_var API_PREFIX;ERI3_PURE_SH;ERI2_PURE_SH;DISABLE_ONEBODY_PROPERTY_DERIVS;ENABLE_UNROLLING;ENABLE_GENERIC_CODE;VECTOR_LENGTH;VECTOR_METHOD;ALIGN_SIZE;USER_DEFINED_REAL;USER_DEFINED_REAL_INCLUDES;GENERATE_FMA;ACCUM_INTS;FLOP_COUNT;PROFILE;CONTRACTED_INTS;SINGLE_EVALTYPE;USE_COMPOSITE_EVALUATORS;ERI_STRATEGY;MULTIPOLE_MAX_ORDER)
+foreach (_var API_PREFIX;ERI3_PURE_SH;ERI2_PURE_SH;DISABLE_ONEBODY_PROPERTY_DERIVS;ENABLE_UNROLLING;ENABLE_GENERIC_CODE;VECTOR_LENGTH;VECTOR_METHOD;ALIGN_SIZE;USER_DEFINED_REAL;USER_DEFINED_REAL_INCLUDES;GENERATE_FMA;ACCUM_INTS;FLOP_COUNT;PROFILE;CONTRACTED_INTS;SINGLE_EVALTYPE;USE_COMPOSITE_EVALUATORS;ERI_STRATEGY;MULTIPOLE_MAX_ORDER)
     if (DEFINED LIBINT2_${_var})
         if (DEFINED LIBINT_${_var})
             message(FATAL_ERROR "renaming user-facing LIBINT2_${_var} variable but internal variable LIBINT_${_var} already exists")
         else ()
             set(LIBINT_${_var} ${LIBINT2_${_var}})
-        endif()
-    endif()
+        endif ()
+    endif ()
 endforeach ()
 configure_file(
-  include/libint2/config.h.cmake.in
-  include/libint2/config.h
-  @ONLY)
+        include/libint2/config.h.cmake.in
+        include/libint2/config.h
+        @ONLY)
 set(_EXPORT_MODE 1)
 configure_file(
-  include/libint2/config.h.cmake.in
-  ${EXPORT_STAGE_DIR}/include/libint2/config.h
-  @ONLY)
+        include/libint2/config.h.cmake.in
+        ${EXPORT_STAGE_DIR}/include/libint2/config.h
+        @ONLY)
 configure_file(
-  include/libint2/config2.h.cmake.in
-  ${EXPORT_STAGE_DIR}/include/libint2/config2.h.cmake.in
-  COPYONLY)
+        include/libint2/config2.h.cmake.in
+        ${EXPORT_STAGE_DIR}/include/libint2/config2.h.cmake.in
+        COPYONLY)
 
 add_subdirectory(src)
 
diff --git a/cmake/modules/int_am.cmake b/cmake/modules/int_am.cmake
index c1d61cd55..782048aaf 100644
--- a/cmake/modules/int_am.cmake
+++ b/cmake/modules/int_am.cmake
@@ -357,6 +357,7 @@ endmacro()
 
 process_integrals_class(ONEBODY)
 process_integrals_class(ERI)
+process_integrals_class(RKB_ERI)
 process_integrals_class(ERI3)
 process_integrals_class(ERI2)
 # unlike above, these classes (1) don't do AM_LIST and (2) require value in config.h if enabled
@@ -396,7 +397,7 @@ list(REVERSE _amlist)
 list(APPEND Libint2_ERI_COMPONENTS "${_amlist}")
 message(VERBOSE "setting components ${_amlist}")
 
-foreach(_cls ONEBODY;ERI;ERI3;ERI2;G12;G12DKH)
+foreach(_cls ONEBODY;ERI;RKB_ERI;ERI3;ERI2;G12;G12DKH)
     if((_cls STREQUAL G12) OR (_cls STREQUAL G12DKH))
         add_feature_info(
           "integral class ${_cls}"
diff --git a/include/libint2/config.h.cmake.in b/include/libint2/config.h.cmake.in
index 640e68099..6018873e5 100644
--- a/include/libint2/config.h.cmake.in
+++ b/include/libint2/config.h.cmake.in
@@ -71,6 +71,13 @@
 #undef LIBINT_INCLUDE_ERI
 #endif
 
+/* Support RKB (restricted kinetic balance) 4-center ERI derivatives up to this order */
+#define LIBINT_INCLUDE_RKB_ERI @LIBINT_INCLUDE_RKB_ERI@
+#if @LIBINT_INCLUDE_RKB_ERI@ == -1
+#undef LIBINT_INCLUDE_RKB_ERI
+#endif
+
+
 /* Support 3-center ERI derivatives up to this order */
 #define LIBINT_INCLUDE_ERI3 @LIBINT_INCLUDE_ERI3@
 #if @LIBINT_INCLUDE_ERI3@ == -1
@@ -122,6 +129,18 @@
 /* Max optimized AM for ERI and its derivatives */
 #cmakedefine LIBINT_ERI_OPT_AM_LIST "@LIBINT_ERI_OPT_AM_LIST@"
 
+/* Max AM for RKB_ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI_MAX_AM_LIST) */
+#cmakedefine LIBINT_RKB_ERI_MAX_AM @LIBINT_RKB_ERI_MAX_AM@
+
+/* Max AM for RKB_ERI and its derivatives */
+#cmakedefine LIBINT_RKB_ERI_MAX_AM_LIST "@LIBINT_RKB_ERI_MAX_AM_LIST@"
+
+/* Max optimized AM for RKB_ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI_OPT_AM_LIST) */
+#cmakedefine LIBINT_RKB_ERI_OPT_AM @LIBINT_RKB_ERI_OPT_AM@
+
+/* Max optimized AM for RKB_ERI and its derivatives */
+#cmakedefine LIBINT_RKB_ERI_OPT_AM_LIST "@LIBINT_RKB_ERI_OPT_AM_LIST@"
+
 /* Max AM for 3-center ERI (same for all derivatives; if not defined see LIBINT_ERI3_MAX_AM_LIST) */
 #cmakedefine LIBINT_ERI3_MAX_AM @LIBINT_ERI3_MAX_AM@
 
diff --git a/include/libint2/cxxapi.h b/include/libint2/cxxapi.h
index 0aceb509b..a54802817 100644
--- a/include/libint2/cxxapi.h
+++ b/include/libint2/cxxapi.h
@@ -35,7 +35,7 @@
 
 #if !defined(LIBINT_INCLUDE_ONEBODY) ||                              \
     !(defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_ERI3) || \
-      defined(LIBINT_INCLUDE_ERI2))
+      defined(LIBINT_INCLUDE_ERI2) || defined(LIBINT_INCLUDE_RKB_ERI))
 #error \
     "C++ API is only supported if both 1-body and some (eri, eri3, eri2) 2-body integrals are enabled"
 #endif
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 7baef7f1c..c0a69c6bd 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -567,6 +567,7 @@ void try_main(int argc, char* argv[]) {
     taskmgr.add(task_label("eri", d));
   }
 #endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
     taskmgr.add(task_label("3eri", d));
@@ -985,8 +986,12 @@ void print_config(std::ostream& os) {
 #ifdef LIBINT_INCLUDE_G12DKH
   os << "Will support G12DKH" << endl;
 #endif
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  os << "RKB works " << std::endl;
+#endif
 }
 
+
 #ifdef LIBINT_INCLUDE_ERI
 void build_TwoPRep_2b_2k(std::ostream& os,
                          const std::shared_ptr<CompilationParameters>& cparams,

From 7941816503701ba47473d9660e9380bac7297c9a Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sat, 7 Feb 2026 14:32:26 -0500
Subject: [PATCH 02/22] =?UTF-8?q?Can=20generate=20code=20for=20`(LL|SS)`?=
 =?UTF-8?q?=20type=20integral,=20i.e.,=20`(=CE=BC=20=CE=BD=20|=20(=CF=83.p?=
 =?UTF-8?q?)=CE=BA=20(=CF=83.p)=CE=BB)`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/bin/libint/build_libint.cc                | 186 ++++++++++++++----
 .../comp_11_Coulomb\317\203p\317\203p_11.h"   | 167 ++++++++++++++++
 .../libint/comp_1_\317\203pV\317\203p_1.h"    |   4 +-
 src/bin/libint/master_ints_list.h             |   6 +-
 src/bin/libint/master_rrs_list.h              |   3 +
 src/bin/libint/oper.h                         |  58 +++++-
 src/bin/libint/strategy.cc                    |   9 +
 7 files changed, 383 insertions(+), 50 deletions(-)
 create mode 100644 "src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"

diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index c0a69c6bd..9ff8f5710 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -193,12 +193,16 @@ static void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 #ifdef LIBINT_INCLUDE_ERI
 #define USE_GENERIC_ERI_BUILD 1
 #if !USE_GENERIC_ERI_BUILD
+template <typename OperType>
 static void build_TwoPRep_2b_2k(
-    std::ostream& os, const std::shared_ptr<CompilationParameters>& cparams,
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
     std::shared_ptr<Libint2Iface>& iface);
 #else
+template <typename OperType>
 static void build_TwoPRep_2b_2k(
-    std::ostream& os, const std::shared_ptr<CompilationParameters>& cparams,
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
     std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level);
 #endif
 #endif
@@ -568,6 +572,23 @@ void try_main(int argc, char* argv[]) {
   }
 #endif
 
+#ifdef LIBINT_INCLUDE_RKB_ERI
+#define BOOST_PP_RKB_ERI_TASK_TUPLE (coulomb_opop)
+#define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE (CoulombσpσpOper)
+#define BOOST_PP_RKB_ERI_TASK_LIST \
+  BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_TUPLE)
+#define BOOST_PP_RKB_ERI_TASK_OPER_LIST \
+  BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_OPER_TUPLE)
+
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#define BOOST_PP_RKB_ERI_MCR1(r, data, elem) \
+  taskmgr.add(task_label(BOOST_PP_STRINGIZE(elem), d));
+
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR1, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR1
+  }
+#endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
     taskmgr.add(task_label("3eri", d));
@@ -669,6 +690,46 @@ void try_main(int argc, char* argv[]) {
     cparams->num_bf(task_label("eri", d), 4);
   }
 #endif
+
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#if defined(LIBINT_RKB_ERI_MAX_AM_LIST)
+#define BOOST_PP_RKB_ERI_MCR2(r, data, elem)   \
+  cparams->max_am(                             \
+      task_label(BOOST_PP_STRINGIZE(elem), d), \
+                 token<unsigned int>(LIBINT_RKB_ERI_MAX_AM_LIST, ',', d));
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR2, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR2
+#elif defined(LIBINT_RKB_ERI_MAX_AM)
+#define BOOST_PP_RKB_ERI_MCR3(r, data, elem)               \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                             LIBINT_RKB_ERI_MAX_AM);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR3, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR3
+#endif
+#if defined(LIBINT_RKB_ERI_OPT_AM_LIST)
+#define BOOST_PP_RKB_ERI_MCR4(r, data, elem)   \
+  cparams->max_am_opt(                         \
+      task_label(BOOST_PP_STRINGIZE(elem), d), \
+                 token<unsigned int>(LIBINT_RKB_ERI_OPT_AM_LIST, ',', d));
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR4, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR4
+#elif defined(LIBINT_RKB_ERI_OPT_AM)
+#define BOOST_PP_RKB_ERI_MCR5(r, data, elem)                   \
+  cparams->max_am_opt(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                                 LIBINT_RKB_ERI_OPT_AM);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR5, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR5
+#endif
+  }
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#define BOOST_PP_RKB_ERI_MCR6(r, data, elem) \
+  cparams->num_bf(task_label(BOOST_PP_STRINGIZE(elem), d), 4);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI_MCR6, _, BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR6
+  }
+#endif  // LIBINT_INCLUDE_RKB_ERI
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
 #if defined(LIBINT_ERI3_MAX_AM_LIST)
@@ -853,6 +914,9 @@ void try_main(int argc, char* argv[]) {
 #ifdef LIBINT_INCLUDE_ERI
   max_deriv = std::max(LIBINT_INCLUDE_ERI, max_deriv);
 #endif
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  max_deriv = std::max(LIBINT_INCLUDE_RKB_ERI, max_deriv);
+#endif
 #ifdef LIBINT_INCLUDE_ERI3
   max_deriv = std::max(LIBINT_INCLUDE_ERI3, max_deriv);
 #endif
@@ -880,13 +944,25 @@ void try_main(int argc, char* argv[]) {
 #endif
 #ifdef LIBINT_INCLUDE_ERI
 #if !USE_GENERIC_ERI_BUILD
-  build_TwoPRep_2b_2k(os, cparams, iface);
+  build_TwoPRep_2b_2k<TwoPRep>(os, "eri", cparams, iface);
 #else
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI; ++d) {
-    build_TwoPRep_2b_2k(os, cparams, iface, d);
+    build_TwoPRep_2b_2k<TwoPRep>(os, "eri", cparams, iface, d);
   }
 #endif
 #endif
+
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI; ++d) {
+#define BOOST_PP_RKB_ERI_MCR7(r, data, i, elem)                              \
+  build_TwoPRep_2b_2k<BOOST_PP_LIST_AT(BOOST_PP_RKB_ERI_TASK_OPER_LIST, i)>( \
+      os, BOOST_PP_STRINGIZE(elem), cparams, iface, d);
+    BOOST_PP_LIST_FOR_EACH_I(BOOST_PP_RKB_ERI_MCR7, _,
+                             BOOST_PP_RKB_ERI_TASK_LIST)
+#undef BOOST_PP_RKB_ERI_MCR7
+  }
+#endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
     build_TwoPRep_1b_2k(os, cparams, iface, d);
@@ -987,18 +1063,22 @@ void print_config(std::ostream& os) {
   os << "Will support G12DKH" << endl;
 #endif
 #ifdef LIBINT_INCLUDE_RKB_ERI
-  os << "RKB works " << std::endl;
+  os << "Will support restricted kinetically balanced (RKB) 4-center ERIs "
+     << std::endl;
 #endif
 }
 
-
 #ifdef LIBINT_INCLUDE_ERI
-void build_TwoPRep_2b_2k(std::ostream& os,
+template <typename OperType>
+void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
                          const std::shared_ptr<CompilationParameters>& cparams,
                          std::shared_ptr<Libint2Iface>& iface,
                          unsigned int deriv_level) {
-  const std::string task = task_label("eri", deriv_level);
-  typedef TwoPRep_11_11_sq TwoPRep_sh_11_11;
+  typedef GenIntegralSet_11_11<CGShell, OperType, mType> TwoBody_sh_11_11;
+  typedef typename OperType::Descriptor OperDescrType;
+
+  const std::string task = task_label(label, deriv_level);
+
   vector<CGShell*> shells;
   unsigned int lmax = cparams->max_am(task);
   for (unsigned int l = 0; l <= lmax; l++) {
@@ -1010,6 +1090,7 @@ void build_TwoPRep_2b_2k(std::ostream& os,
   taskmgr.current(task);
   iface->to_params(iface->macro_define(std::string("MAX_AM_") + task, lmax));
 
+  const auto nullaux = typename TwoBody_sh_11_11::AuxIndexType(0u);
   //
   // Construct graphs for each desired target integral and
   // 1) generate source code for the found traversal path
@@ -1041,9 +1122,25 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           const int lim = 1;
           if (!(la == lim && lb == lim && lc == lim && ld == lim)) continue;
 #endif
+          // this will hold all target shell sets
+          std::vector<std::shared_ptr<TwoBody_sh_11_11>> targets;
+
+          /////////////////////////////////
+          // loop over operator components
+          /////////////////////////////////
+          // most operators have 1 component ...
+          std::vector<OperDescrType> descrs(1);  // operator descriptors
+          // important EXCEPTION: Coulombσpσp has 4 quaternion components
+          if constexpr (std::is_same<OperType, CoulombσpσpOper>::value) {
+            // reset descriptors array
+            descrs.resize(0);
+            // one descriptor per quaternion component
+            for (int p = 0; p != 4; ++p) {
+              descrs.emplace_back(make_descr<OperDescrType>(p));
+            }
+          }
 
-          // unroll only if max_am <= cparams->max_am_opt(task)
-          using std::max;
+          // unroll only if max_am <= cparams->max_am_opt(task) using std::max;
           const unsigned int max_am = max(max(la, lb), max(lc, ld));
           const bool need_to_optimize = (max_am <= cparams->max_am_opt(task));
           const bool need_to_unroll =
@@ -1072,7 +1169,7 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           ////////////
           // NB translational invariance is now handled by CR_DerivGauss
           CartesianDerivIterator<4> diter(deriv_level);
-          std::vector<std::shared_ptr<TwoPRep_sh_11_11>> targets;
+
           bool last_deriv = false;
           do {
             CGShell a(la);
@@ -1089,18 +1186,22 @@ void build_TwoPRep_2b_2k(std::ostream& os,
               }
             }
 
-            std::shared_ptr<TwoPRep_sh_11_11> abcd =
-                TwoPRep_sh_11_11::Instance(a, b, c, d, mType(0u));
-            targets.push_back(abcd);
+            // operator component loop
+            for (unsigned int op = 0; op != descrs.size(); ++op) {
+              OperType oper(descrs[op]);
+
+              std::shared_ptr<TwoBody_sh_11_11> abcd =
+                  TwoBody_sh_11_11::Instance(a, b, c, d, nullaux, oper);
+              targets.push_back(abcd);
+            }
+
             last_deriv = diter.last();
             if (!last_deriv) diter.next();
           } while (!last_deriv);
           // append all derivatives as targets to the graph
-          for (std::vector<std::shared_ptr<TwoPRep_sh_11_11>>::const_iterator
-                   t = targets.begin();
-               t != targets.end(); ++t) {
+          for (auto it = targets.begin(); it != targets.end(); ++it) {
             std::shared_ptr<DGVertex> t_ptr =
-                std::dynamic_pointer_cast<DGVertex, TwoPRep_sh_11_11>(*t);
+                std::dynamic_pointer_cast<DGVertex, TwoBody_sh_11_11>(*it);
             dg_xxxx->append_target(t_ptr);
           }
 
@@ -1112,23 +1213,35 @@ void build_TwoPRep_2b_2k(std::ostream& os,
             CGShell b(lb);
             CGShell c(lc);
             CGShell d(ld);
-            std::shared_ptr<TwoPRep_sh_11_11> abcd =
-                TwoPRep_sh_11_11::Instance(a, b, c, d, mType(0u));
-            abcd_label = abcd->label();
+
+            if constexpr (!std::is_same<OperType, CoulombσpσpOper>::value) {
+              // single-component operator: reuse the label of its only target
+              OperType oper(descrs[0]);
+              std::shared_ptr<TwoBody_sh_11_11> abcd =
+                  TwoBody_sh_11_11::Instance(a, b, c, d, nullaux, oper);
+              abcd_label = abcd->label();
+            } else {
+              // label by task; api_prefix is added once, in eval_label below
+              std::ostringstream oss;
+              oss << "_" << a.label() << "_" << b.label();
+              oss << "_" << label;
+              oss << "_" << c.label() << "_" << d.label();
+              abcd_label = oss.str();
+            }
           }
           // + derivative level (if deriv_level > 0)
-          std::string label;
+          std::string eval_label;
           {
-            label = cparams->api_prefix();
+            eval_label = cparams->api_prefix();
             if (deriv_level != 0) {
               std::ostringstream oss;
               oss << "deriv" << deriv_level;
-              label += oss.str();
+              eval_label += oss.str();
             }
-            label += abcd_label;
+            eval_label += abcd_label;
           }
 
-          std::cout << "working on " << label << " ... ";
+          std::cout << "working on " << eval_label << " ... ";
           std::cout.flush();
 
           std::string prefix(cparams->source_directory());
@@ -1138,7 +1251,8 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           // this will generate code for these targets, and potentially generate
           // code for its prerequisites
           GenerateCode(dg_xxxx, context, cparams, strat, tactic, memman,
-                       decl_filenames, def_filenames, prefix, label, false);
+                       decl_filenames, def_filenames, prefix, eval_label,
+                       false);
 
           // update max stack size and # of targets
           const std::shared_ptr<TaskParameters>& tparams =
@@ -1153,7 +1267,7 @@ void build_TwoPRep_2b_2k(std::ostream& os,
           oss << context->label_to_name(cparams->api_prefix())
               << "libint2_build_" << task << "[" << la << "][" << lb << "]["
               << lc << "][" << ld
-              << "] = " << context->label_to_name(label_to_funcname(label))
+              << "] = " << context->label_to_name(label_to_funcname(eval_label))
               << context->end_of_stat() << endl;
           iface->to_static_init(oss.str());
 
@@ -1369,10 +1483,9 @@ void build_TwoPRep_1b_2k(std::ostream& os,
         iface->to_static_init(oss.str());
 
         // need to declare this function internally
-        for (std::deque<std::string>::const_iterator i = decl_filenames.begin();
-             i != decl_filenames.end(); ++i) {
+        for (auto& decl_filename : decl_filenames) {
           oss.str("");
-          oss << "#include <" << *i << ">" << endl;
+          oss << "#include <" << decl_filename << ">" << endl;
           iface->to_int_iface(oss.str());
         }
 
@@ -1381,7 +1494,7 @@ void build_TwoPRep_1b_2k(std::ostream& os,
 #endif
         dg_xxx->reset();
         memman->reset();
-
+        std::cout << "done" << std::endl;
       }  // end of d loop
     }    // end of c loop
   }      // end of bra loop
@@ -1566,10 +1679,9 @@ void build_TwoPRep_1b_1k(std::ostream& os,
       iface->to_static_init(oss.str());
 
       // need to declare this function internally
-      for (std::deque<std::string>::const_iterator i = decl_filenames.begin();
-           i != decl_filenames.end(); ++i) {
+      for (auto& decl_filename : decl_filenames) {
         oss.str("");
-        oss << "#include <" << *i << ">" << endl;
+        oss << "#include <" << decl_filename << ">" << endl;
         iface->to_int_iface(oss.str());
       }
 
@@ -1578,7 +1690,7 @@ void build_TwoPRep_1b_1k(std::ostream& os,
 #endif
       dg_xxx->reset();
       memman->reset();
-
+      std::cout << "done" << std::endl;
     }  // end of ket loop
   }    // end of bra loop
 }
diff --git "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
new file mode 100644
index 000000000..252eabf5d
--- /dev/null
+++ "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (C) 2004-2024 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_11_COULOMBΣPΣP_11_H
+#define LIBINT_COMP_11_COULOMBΣPΣP_11_H
+
+#include <gaussoper.h>
+#include <generic_rr.h>
+#include <twoprep_11_11.h>
+
+namespace libint2 {
+
+/**
+ * this computes integrals of
+ * \f$ \frac{1}{r_{ij}} \sigma \cdot \hat{p}_1 \sigma \cdot \hat{p}_2 \f$ over
+ * CGShell/CGF by rewriting them as linear combinations of integrals of
+ * \f$ \frac{1}{r_{ij}} \f$ over differentiated Gaussians
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_11_Coulombσpσp_11
+    : public GenericRecurrenceRelation<
+          CR_11_Coulombσpσp_11<F>, F,
+          GenIntegralSet_11_11<F, CoulombσpσpOper, mType>> {
+ public:
+  typedef CR_11_Coulombσpσp_11<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef CoulombσpσpOper OperType;
+  typedef GenIntegralSet_11_11<F, CoulombσpσpOper, mType> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 100;  // TODO figure out
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private, used by ParentType::Instance that maintains
+  /// registry of these objects
+  CR_11_Coulombσpσp_11(const std::shared_ptr<TargetType> &, unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+};
+
+template <typename F>
+CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11(
+    const std::shared_ptr<TargetType> &Tint, unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_bra(/* particle */ 1) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 1) == 1);
+
+  F a(Tint->bra(0, 0));
+  F b(Tint->ket(0, 0));
+  F c(Tint->bra(1, 0));
+  F d(Tint->ket(1, 0));
+
+  const auto &oper = Tint->oper();
+
+  // can express integrals of Coulombσpσp in terms of
+  // derivative integrals of 1/r12 for primitive Gaussians
+  // only
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  const mType zero_m(0u);
+
+  ChildFactory<ThisType,
+               GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
+      factory(this);
+
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  F c_x{c};
+  c_x.deriv().inc(x);  // d(c)/dx = c_x
+  F c_y{c};
+  c_y.deriv().inc(y);  // d(c)/dy = c_y
+  F c_z{c};
+  c_z.deriv().inc(z);  // d(c)/dz = c_z
+
+  F d_x{d};
+  d_x.deriv().inc(x);  // d(d)/dx = d_x
+  F d_y{d};
+  d_y.deriv().inc(y);  // d(d)/dy = d_y
+  F d_z{d};
+  d_z.deriv().inc(z);  // d(d)/dz = d_z
+
+  // Component wise generation for quaternion ( a b | 1/r12 | (σ.p) c (σ.p) d )
+  switch (oper->descr().quaternion_index()) {
+    case 0: {
+      // zeroth component = (a b | c_x d_x) + (a b | c_y d_y) + (a b | c_z d_z)
+      auto a_b_cx_dx = factory.make_child(a, b, c_x, d_x, zero_m);
+      auto a_b_cy_dy = factory.make_child(a, b, c_y, d_y, zero_m);
+      auto a_b_cz_dz = factory.make_child(a, b, c_z, d_z, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cx_dx + a_b_cy_dy + a_b_cz_dz;
+        nflops_ += 2;
+      }
+    } break;
+    case 1: {
+      // x component = (a b | c_y d_z) - (a b | c_z d_y)
+      auto a_b_cy_dz = factory.make_child(a, b, c_y, d_z, zero_m);
+      auto a_b_cz_dy = factory.make_child(a, b, c_z, d_y, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cy_dz - a_b_cz_dy;
+        nflops_ += 1;
+      }
+    } break;
+    case 2: {
+      // y component = (a b | c_z d_x) - (a b | c_x d_z)
+      auto a_b_cz_dx = factory.make_child(a, b, c_z, d_x, zero_m);
+      auto a_b_cx_dz = factory.make_child(a, b, c_x, d_z, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cz_dx - a_b_cx_dz;
+        nflops_ += 1;
+      }
+    } break;
+    case 3: {
+      // z component = (a b | c_x d_y) - (a b | c_y d_x)
+      auto a_b_cx_dy = factory.make_child(a, b, c_x, d_y, zero_m);
+      auto a_b_cy_dx = factory.make_child(a, b, c_y, d_x, zero_m);
+      if (is_simple()) {
+        expr_ = a_b_cx_dy - a_b_cy_dx;
+        nflops_ += 1;
+      }
+    } break;
+    default:
+      throw std::runtime_error(
+          "CR_11_Coulombσpσp_11: invalid quaternionic index");
+  }
+
+}  // CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11
+};  // namespace libint2
+
+#endif  // LIBINT_COMP_11_COULOMBΣPΣP_11_H
diff --git "a/src/bin/libint/comp_1_\317\203pV\317\203p_1.h" "b/src/bin/libint/comp_1_\317\203pV\317\203p_1.h"
index cb131ebb8..9fbdef361 100644
--- "a/src/bin/libint/comp_1_\317\203pV\317\203p_1.h"
+++ "b/src/bin/libint/comp_1_\317\203pV\317\203p_1.h"
@@ -107,7 +107,7 @@ CR_1_σpVσp_1<F>::CR_1_σpVσp_1(const std::shared_ptr<TargetType> &Tint,
 
   // (a|W0|b) = (d a/dAx | V | d b/dBx) + (d a/dAy | V | d b/dBy) + (d a/dAz | V
   // | d b/dBz)
-  switch (oper->descr().pauli_index()) {
+  switch (oper->descr().quaternion_index()) {
     case 0: {
       auto Dx_a_V_Dx_b = factory.make_child(Dx_a, Dx_b, zero_m);
       auto Dy_a_V_Dy_b = factory.make_child(Dy_a, Dy_b, zero_m);
@@ -146,7 +146,7 @@ CR_1_σpVσp_1<F>::CR_1_σpVσp_1(const std::shared_ptr<TargetType> &Tint,
       }
     } break;
     default:
-      throw std::runtime_error("CR_1_σpVσp_1: invalid Pauli index");
+      throw std::runtime_error("CR_1_σpVσp_1: invalid quaternionic index");
   }
 
 }  // CR_1_σpVσp_1<F>::CR_1_σpVσp_1
diff --git a/src/bin/libint/master_ints_list.h b/src/bin/libint/master_ints_list.h
index 1aa8c3e64..c5a6f4655 100644
--- a/src/bin/libint/master_ints_list.h
+++ b/src/bin/libint/master_ints_list.h
@@ -106,6 +106,9 @@ typedef GenIntegralSet_1_1<CGF1d<CartesianAxis_Z>, CartesianMultipoleOper<1u>,
 //////////////////////////
 typedef GenIntegralSet_11_11<CGShell, TwoPRep, mType> TwoPRep_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, TwoPRep, mType> TwoPRep_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, CoulombσpσpOper, mType>
+    Coulombσpσp_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, CoulombσpσpOper, mType> Coulombσpσp_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kG12, mType> R12kG12_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, R12kG12, mType> R12kG12_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kR12lG12, EmptySet>
@@ -144,7 +147,8 @@ typedef boost::mpl::list<
     CMultipole_1_1_int_y, CMultipole_1_1_int_z, SMultipole_1_1_sh,
     SMultipole_1_1_int,
 #endif
-    TwoPRep_11_11_sq, TwoPRep_11_11_int, R12kG12_11_11_sq, R12kG12_11_11_int,
+    TwoPRep_11_11_sq, TwoPRep_11_11_int, Coulombσpσp_11_11_sq,
+    Coulombσpσp_11_11_int, R12kG12_11_11_sq, R12kG12_11_11_int,
     R12kR12lG12_11_11_sq, R12kR12lG12_11_11_int, TiG12_11_11_sq,
     TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
     DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index f3ec4e2d2..32e56878f 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -21,6 +21,7 @@
 #ifndef _libint2_src_bin_libint_masterrrslist_h_
 #define _libint2_src_bin_libint_masterrrslist_h_
 
+#include <comp_11_Coulombσpσp_11.h>
 #include <comp_11_DivG12prime_xTx_11.h>
 #include <comp_11_g12tig12_11.h>
 #include <comp_11_r12kr12lg12_11.h>
@@ -266,6 +267,8 @@ typedef CR_DerivGauss<TwoPRep_11_11_int, 1, InKet, trinvskip2_part,
                       trinvskip2_where>
     Deriv_d_11_TwoPRep_11_int;
 
+typedef CR_11_Coulombσpσp_11<CGShell> CR_11_Coulombσpσp_11_sh;
+typedef CR_11_Coulombσpσp_11<CGF> CR_11_Coulombσpσp_11_int;
 };  // namespace libint2
 
 #endif  // header guard
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index 091df3ac0..daa6aa644 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -289,22 +289,22 @@ BOOST_PP_LIST_FOR_EACH(BOOST_PP_DECLARE_HERMITIAN_ONEBODY_DESCRIPTOR,
 struct σpVσp_Descr : public Contractable<σpVσp_Descr> {
   typedef MultiplicativeODep1Body_Props Properties;
 
-  σpVσp_Descr() : pauli_index_(0) {}
-  σpVσp_Descr(int pauli_index) : pauli_index_(pauli_index) {
-    assert(pauli_index <= 3);
+  σpVσp_Descr() : quaternion_index_(0) {}
+  σpVσp_Descr(int quaternion_index) : quaternion_index_(quaternion_index) {
+    assert(quaternion_index <= 3);
   }
 
   static const unsigned int max_key = 4;
-  unsigned int key() const { return pauli_index(); }
+  unsigned int key() const { return quaternion_index(); }
   std::string description() const {
     std::string descr("opVop[");
-    if (pauli_index() == 0)
+    if (quaternion_index() == 0)
       descr += "0";
-    else if (pauli_index() == 1)
+    else if (quaternion_index() == 1)
       descr += "Z";
-    else if (pauli_index() == 2)
+    else if (quaternion_index() == 2)
       descr += "X";
-    else if (pauli_index() == 3)
+    else if (quaternion_index() == 3)
       descr += "Y";
     else
       abort();
@@ -314,10 +314,10 @@ struct σpVσp_Descr : public Contractable<σpVσp_Descr> {
   int psymm(int i, int j) const { abort(); }
   int hermitian(int i) const { return +1; }
 
-  int pauli_index() const { return pauli_index_; }
+  int quaternion_index() const { return quaternion_index_; }
 
  private:
-  const int pauli_index_ = -1;
+  const int quaternion_index_ = -1;
 };
 typedef GenOper<σpVσp_Descr> σpVσpOper;
 
@@ -399,6 +399,44 @@ struct TwoPRep_Descr : public Contractable<TwoPRep_Descr> {
 };
 typedef GenOper<TwoPRep_Descr> TwoPRep;
 
+/** Coulombσpσp: σ·p-dressed Coulomb operator with 4 quaternion components.
+ *  NOTE(review): labels 1,2,3 → Z,X,Y; CR_11_Coulombσpσp_11 uses x,y,z. */
+struct Coulombσpσp_Descr : public Contractable<Coulombσpσp_Descr> {
+  typedef MultiplicativeSymm2Body_Props Properties;
+
+  Coulombσpσp_Descr() : quaternion_index_(0) {}
+  Coulombσpσp_Descr(int quaternion_index)
+      : quaternion_index_(quaternion_index) {
+    assert(quaternion_index >= 0 && quaternion_index <= 3);
+  }
+
+  static const unsigned int max_key = 4;
+  unsigned int key() const { return quaternion_index(); }
+  std::string description() const {
+    std::string descr("coulomb_opop[");
+    if (quaternion_index() == 0)
+      descr += "0";
+    else if (quaternion_index() == 1)
+      descr += "Z";
+    else if (quaternion_index() == 2)
+      descr += "X";
+    else if (quaternion_index() == 3)
+      descr += "Y";
+    else
+      abort();  // unreachable when the constructor's range check holds
+    return descr + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int quaternion_index() const { return quaternion_index_; }
+
+ private:
+  const int quaternion_index_ = -1;
+};
+typedef GenOper<Coulombσpσp_Descr> CoulombσpσpOper;
+
 /** GTG_1d is the two-body 1-dimensional Gaussian geminal
  */
 struct GTG_1d_Descr : public Contractable<GTG_1d_Descr> {
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index bbfc3fe21..bcab9dcad 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -115,6 +115,15 @@ struct MasterStrategy<TwoPRep_11_11_int> {
 };
 #endif
 
+template <>
+struct MasterStrategy<Coulombσpσp_11_11_sq> {
+  typedef boost::mpl::list<CR_11_Coulombσpσp_11_sh> value;
+};
+template <>
+struct MasterStrategy<Coulombσpσp_11_11_int> {
+  typedef boost::mpl::list<CR_11_Coulombσpσp_11_int> value;
+};
+
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>
 struct MasterStrategy<R12kG12_11_11_sq> {

From 05e9ff5c31bc4f3670f6b165f4851ef9169a2112 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Mon, 9 Feb 2026 11:24:33 -0500
Subject: [PATCH 03/22] =?UTF-8?q?bugfix:=20added=20the=20missing=20`make?=
 =?UTF-8?q?=5Fdescr<Coulomb=CF=83p=CF=83p=5FDescr>`?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cmake/modules/int_am.cmake     |  2 +
 include/libint2/cxxapi.h       |  2 +-
 include/libint2/engine.h       | 12 ++++++
 include/libint2/engine.impl.h  | 74 +++++++++++++++++++---------------
 src/bin/libint/build_libint.cc |  6 +++
 5 files changed, 62 insertions(+), 34 deletions(-)

diff --git a/cmake/modules/int_am.cmake b/cmake/modules/int_am.cmake
index 782048aaf..cc86b7aa7 100644
--- a/cmake/modules/int_am.cmake
+++ b/cmake/modules/int_am.cmake
@@ -434,6 +434,8 @@ foreach(_cls ONEBODY;ERI;RKB_ERI;ERI3;ERI2;G12;G12DKH)
                     list(APPEND _amlist         "onebody_${_am${_l}}${_am${_l}}_d${_d}")
                 elseif (_cls STREQUAL "G12")
                     list(APPEND _amlist             "g12_${_am${_l}}${_am${_l}}${_am${_l}}${_am${_l}}_d${_d}")
+                elseif (_cls STREQUAL "RKB_ERI")
+                    list(APPEND _amlist             "rkb_eri_${_am${_l}}${_am${_l}}${_am${_l}}${_am${_l}}_d${_d}")
                 endif()
             endforeach()
             if (_cls STREQUAL "ERI3")
diff --git a/include/libint2/cxxapi.h b/include/libint2/cxxapi.h
index a54802817..22686f958 100644
--- a/include/libint2/cxxapi.h
+++ b/include/libint2/cxxapi.h
@@ -37,7 +37,7 @@
     !(defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_ERI3) || \
       defined(LIBINT_INCLUDE_ERI2) || defined(LIBINT_INCLUDE_RKB_ERI))
 #error \
-    "C++ API is only supported if both 1-body and some (eri, eri3, eri2) 2-body integrals are enabled"
+    "C++ API is only supported if both 1-body and some (eri, eri3, eri2, rkb_eri) 2-body integrals are enabled"
 #endif
 
 #include <libint2/atom.h>
diff --git a/include/libint2/engine.h b/include/libint2/engine.h
index 7e29eb710..81048b049 100644
--- a/include/libint2/engine.h
+++ b/include/libint2/engine.h
@@ -153,6 +153,9 @@ enum class Operator {
   coulomb,
   /// alias for Operator::coulomb
   r12_m1 = coulomb,
+  /// (2-body) \f$ r_{12}^{-1} (\sigma\cdot p_{k1})(\sigma\cdot p_{k2}) \f$,
+  /// where k1 and k2 are the centers of ket1 and ket2, respectively
+  coulomb_opop,
   /// contracted Gaussian geminal
   cgtg,
   /// contracted Gaussian geminal times Coulomb
@@ -246,6 +249,7 @@ struct operator_traits<Operator::nuclear>
   typedef const libint2::FmEval_Reference<scalar_type> core_eval_type;
 #endif
 };
+
 template <>
 struct operator_traits<Operator::opVop>
     : public operator_traits<Operator::nuclear> {
@@ -346,6 +350,14 @@ struct operator_traits<Operator::coulomb>
   typedef const libint2::FmEval_Reference<scalar_type> core_eval_type;
 #endif
 };
+
+template <>
+struct operator_traits<Operator::coulomb_opop>
+    : public operator_traits<Operator::coulomb> {
+  static constexpr auto nopers = 4;  // one per quaternion component
+  static constexpr auto intrinsic_deriv_order = 2;  // added to mmax in compute2
+};
+
 namespace detail {
 template <int K>
 struct cgtg_operator_traits : public detail::default_operator_traits {
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 05d5808ea..8e107449b 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -70,30 +70,31 @@ typename std::remove_all_extents<T>::type* to_ptr1(T (&a)[N]) {
 /// These MUST appear in the same order as in Operator.
 /// You must also update BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX when you add
 /// one-body ints
-#define BOOST_PP_NBODY_OPERATOR_LIST              \
-  (overlap,                  /* overlap */        \
-   (kinetic,                 /* kinetic */        \
-    (elecpot,                /* nuclear */        \
-     (elecpot,               /* erf_nuclear */    \
-      (elecpot,              /* erfc_nuclear */   \
-       (elecpot,             /* erfx_nuclear */   \
-        (1emultipole,        /* emultipole1 */    \
-         (2emultipole,       /* emultipole2 */    \
-          (3emultipole,      /* emultipole3 */    \
-           (sphemultipole,   /* sphemultipole */  \
-            (opVop,          /* opVop */          \
-             (eri,           /* delta */          \
-              (eri,          /* coulomb */        \
-               (eri,         /* cgtg */           \
-                (eri,        /* cgtg_x_coulomb */ \
-                 (eri,       /* delcgtg2 */       \
-                  (eri,      /* r12 */            \
-                   (eri,     /* erf_coulomb */    \
-                    (eri,    /* erfc_coulomb */   \
-                     (eri,   /* erfx_coulomb */   \
-                      (eri,  /* stg */            \
-                       (eri, /* yukawa */         \
-                        BOOST_PP_NIL))))))))))))))))))))))
+#define BOOST_PP_NBODY_OPERATOR_LIST               \
+  (overlap,                   /* overlap */        \
+   (kinetic,                  /* kinetic */        \
+    (elecpot,                 /* nuclear */        \
+     (elecpot,                /* erf_nuclear */    \
+      (elecpot,               /* erfc_nuclear */   \
+       (elecpot,              /* erfx_nuclear */   \
+        (1emultipole,         /* emultipole1 */    \
+         (2emultipole,        /* emultipole2 */    \
+          (3emultipole,       /* emultipole3 */    \
+           (sphemultipole,    /* sphemultipole */  \
+            (opVop,           /* opVop */          \
+             (eri,            /* delta */          \
+              (eri,           /* coulomb */        \
+               (eri,          /* coulomb_opop */   \
+                (eri,         /* cgtg */           \
+                 (eri,        /* cgtg_x_coulomb */ \
+                  (eri,       /* delcgtg2 */       \
+                   (eri,      /* r12 */            \
+                    (eri,     /* erf_coulomb */    \
+                     (eri,    /* erfc_coulomb */   \
+                      (eri,   /* erfx_coulomb */   \
+                       (eri,  /* stg */            \
+                        (eri, /* yukawa */         \
+                         BOOST_PP_NIL)))))))))))))))))))))))
 
 #define BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE \
   BOOST_PP_MAKE_TUPLE(BOOST_PP_LIST_SIZE(BOOST_PP_NBODY_OPERATOR_LIST))
@@ -663,23 +664,23 @@ __libint2_engine_inline void Engine::initialize(size_t max_nprim) {
   // validate braket
 #ifndef LIBINT_INCLUDE_ONEBODY
   assert(braket_ != BraKet::x_x &&
-         "this braket type not supported by the library; give --enable-1body "
-         "to configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT_INCLUDE_ONEBODY >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI
   assert(braket_ != BraKet::xx_xx &&
-         "this braket type not supported by the library; give --enable-eri to "
-         "configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT_INCLUDE_ERI >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI3
   assert((braket_ != BraKet::xs_xx && braket_ != BraKet::xx_xs) &&
-         "this braket type not supported by the library; give --enable-eri3 to "
-         "configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT_INCLUDE_ERI3 >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI2
   assert(braket_ != BraKet::xs_xs &&
-         "this braket type not supported by the library; give --enable-eri2 to "
-         "configure");
+         "this braket type not supported by the library; configure with "
+         "-DLIBINT_INCLUDE_ERI2 >= 0");
 #endif
 
   // make sure it's no default initialized
@@ -1421,7 +1422,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
             const scalar_type rho = gammap * gammaq * oogammapq;
             const scalar_type T = PQ2 * rho;
             auto* gm_ptr = &(primdata.LIBINT_T_SS_EREP_SS(0)[0]);
-            const auto mmax = l + deriv_order_;
+            const auto mmax = l + deriv_order_ + intrinsic_deriv_order();
 
             if (!skip_core_ints) {
               switch (oper_) {
@@ -1432,6 +1433,13 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                           .first();
                   core_eval_ptr->eval(gm_ptr, T, mmax);
                 } break;
+                case Operator::coulomb_opop: {
+                  const auto& core_eval_ptr =
+                      any_cast<const detail::core_eval_pack_type<
+                          Operator::coulomb_opop>&>(core_eval_pack_)
+                          .first();
+                  core_eval_ptr->eval(gm_ptr, T, mmax);
+                } break;
                 case Operator::cgtg_x_coulomb: {
                   const auto& core_eval_ptr =
                       any_cast<const detail::core_eval_pack_type<
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 9ff8f5710..1a3a93b95 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -278,6 +278,12 @@ template <>
 σpVσp_Descr make_descr<σpVσp_Descr>(int p, int, int) {
   return σpVσp_Descr(p);
 }
+
+template <>
+Coulombσpσp_Descr make_descr<Coulombσpσp_Descr>(int p, int, int) {
+  return Coulombσpσp_Descr(p);
+}
+
 }  // namespace
 
 template <typename _OperType>

From f3274a99e927bfaa40837831d64056e3762c2e55 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Tue, 10 Feb 2026 12:03:00 -0500
Subject: [PATCH 04/22] `Engine` can initialize for coulomb_opop operator

---
 include/libint2/engine.impl.h  |  4 ++--
 src/bin/libint/build_libint.cc | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 8e107449b..40d50763b 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -84,7 +84,7 @@ typename std::remove_all_extents<T>::type* to_ptr1(T (&a)[N]) {
             (opVop,           /* opVop */          \
              (eri,            /* delta */          \
               (eri,           /* coulomb */        \
-               (eri,          /* coulomb_opop */   \
+               (coulomb_opop, /* coulomb_opop */   \
                 (eri,         /* cgtg */           \
                  (eri,        /* cgtg_x_coulomb */ \
                   (eri,       /* delcgtg2 */       \
@@ -1462,7 +1462,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                           const detail::core_eval_pack_type<Operator::cgtg>&>(
                           core_eval_pack_)
                           .first();
-                  const auto& core_ints_params =
+                  ` const auto& core_ints_params =
                       any_cast<const typename operator_traits<
                           Operator::cgtg>::oper_params_type&>(
                           core_ints_params_);
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 1a3a93b95..8acb3814d 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -1071,6 +1071,9 @@ void print_config(std::ostream& os) {
 #ifdef LIBINT_INCLUDE_RKB_ERI
   os << "Will support restricted kinetically balance (RKB) 4-center ERIs "
      << std::endl;
+  if (LIBINT_INCLUDE_RKB_ERI > 0)
+    os << "(deriv order = " << LIBINT_INCLUDE_RKB_ERI << ")";
+  os << endl;
 #endif
 }
 
@@ -2262,6 +2265,12 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
   iface->to_params(iface->macro_define("DERIV_ERI_ORDER", LIBINT_INCLUDE_ERI));
   max_deriv_order = std::max(max_deriv_order, LIBINT_INCLUDE_ERI);
 #endif
+#ifdef LIBINT_INCLUDE_RKB_ERI
+  iface->to_params(iface->macro_define("SUPPORT_RKB_ERI", 1));
+  iface->to_params(
+      iface->macro_define("DERIV_RKB_ERI_ORDER", LIBINT_INCLUDE_RKB_ERI));
+  max_deriv_order = std::max(max_deriv_order, LIBINT_INCLUDE_RKB_ERI);
+#endif
 #ifdef LIBINT_INCLUDE_ERI3
   iface->to_params(iface->macro_define("SUPPORT_ERI3", 1));
   iface->to_params(
@@ -2317,8 +2326,9 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 
       {  // 2-body ints
 
-#define BOOST_PP_TWOBODY_TASKOPER_TUPLE \
-  ("eri", "r12kg12", "r12_0_g12", "r12_2_g12", "g12_T1_g12", "g12dkh")
+#define BOOST_PP_TWOBODY_TASKOPER_TUPLE                                      \
+  ("eri", "coulomb_opop", "r12kg12", "r12_0_g12", "r12_2_g12", "g12_T1_g12", \
+   "g12dkh")
 #define BOOST_PP_TWOBODY_TASKOPER_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_TWOBODY_TASKOPER_TUPLE)
 

From 2bf62f641d175110b367c7ed0217cfc748b96fbb Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Tue, 10 Feb 2026 21:20:58 -0500
Subject: [PATCH 05/22] use and invoke correct `buildfnptr`s if engine is
 initialized with `Operator::coulomb_opop`

---
 include/libint2/engine.impl.h                           | 7 ++++---
 "src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 40d50763b..3f5669d96 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -1462,7 +1462,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                           const detail::core_eval_pack_type<Operator::cgtg>&>(
                           core_eval_pack_)
                           .first();
-                  ` const auto& core_ints_params =
+                  const auto& core_ints_params =
                       any_cast<const typename operator_traits<
                           Operator::cgtg>::oper_params_type&>(
                           core_ints_params_);
@@ -1754,7 +1754,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
 #endif
 
               // prefactors for derivative ERI relations
-              if (deriv_order_ > 0) {
+              if (deriv_order_ + intrinsic_deriv_order() > 0) {
 #if LIBINT2_DEFINED(eri, alpha1_rho_over_zeta2)
                 primdata.alpha1_rho_over_zeta2[0] =
                     alpha0 * (oogammap * gammaq_o_gammapgammaq);
@@ -1837,7 +1837,8 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
   }
 
   // compute directly (ss|ss)
-  const auto compute_directly = lmax == 0 && deriv_order_ == 0;
+  const auto compute_directly =
+      lmax == 0 && deriv_order_ == 0 && intrinsic_deriv_order() == 0;
 
   if (compute_directly) {
 #ifdef LIBINT2_ENGINE_TIMERS
diff --git "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
index 252eabf5d..82ee512ef 100644
--- "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
@@ -1,5 +1,5 @@
 /*
- *  Copyright (C) 2004-2024 Edward F. Valeev
+ *  Copyright (C) 2004-2026 Edward F. Valeev
  *
  *  This file is part of Libint compiler.
  *

From b6c376be3812b1195b323c3c895c9230f40be1eb Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Wed, 11 Feb 2026 13:18:47 -0500
Subject: [PATCH 06/22] Apply @loriab 's review suggestions

---
 include/libint2/config.h.cmake.in | 4 ++--
 include/libint2/engine.impl.h     | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/libint2/config.h.cmake.in b/include/libint2/config.h.cmake.in
index 6018873e5..3eb32e2f0 100644
--- a/include/libint2/config.h.cmake.in
+++ b/include/libint2/config.h.cmake.in
@@ -129,13 +129,13 @@
 /* Max optimized AM for ERI and its derivatives */
 #cmakedefine LIBINT_ERI_OPT_AM_LIST "@LIBINT_ERI_OPT_AM_LIST@"
 
-/* Max AM for RKB_ERI (same for all derivatives; if not defined see LIBINT_ERI_MAX_AM_LIST) */
+/* Max AM for RKB_ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI_MAX_AM_LIST) */
 #cmakedefine LIBINT_RKB_ERI_MAX_AM @LIBINT_RKB_ERI_MAX_AM@
 
 /* Max AM for RKB_ERI and its derivatives */
 #cmakedefine LIBINT_RKB_ERI_MAX_AM_LIST "@LIBINT_RKB_ERI_MAX_AM_LIST@"
 
-/* Max optimized AM for ERI (same for all derivatives; if not defined see LIBINT_ERI_OPT_AM_LIST) */
+/* Max optimized AM for RKB_ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI_OPT_AM_LIST) */
 #cmakedefine LIBINT_RKB_ERI_OPT_AM @LIBINT_RKB_ERI_OPT_AM@
 
 /* Max optimized AM for ERI and its derivatives */
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 3f5669d96..9381b972a 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -665,22 +665,22 @@ __libint2_engine_inline void Engine::initialize(size_t max_nprim) {
 #ifndef LIBINT_INCLUDE_ONEBODY
   assert(braket_ != BraKet::x_x &&
          "this braket type not supported by the library; configure with "
-         "-DLIBINT_INCLUDE_ONEBODY >= 0");
+         "-DLIBINT2_ENABLE_ONEBODY >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI
   assert(braket_ != BraKet::xx_xx &&
          "this braket type not supported by the library; configure with "
-         "-DLIBINT_INCLUDE_ERI >= 0");
+         "-DLIBINT2_ENABLE_ERI >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI3
   assert((braket_ != BraKet::xs_xx && braket_ != BraKet::xx_xs) &&
          "this braket type not supported by the library; configure with "
-         "-DLIBINT_INCLUDE_ERI3 >= 0");
+         "-DLIBINT2_ENABLE_ERI3 >= 0");
 #endif
 #ifndef LIBINT_INCLUDE_ERI2
   assert(braket_ != BraKet::xs_xs &&
          "this braket type not supported by the library; configure with "
-         "-DLIBINT_INCLUDE_ERI2 >= 0");
+         "-DLIBINT2_ENABLE_ERI2 >= 0");
 #endif
 
   // make sure it's no default initialized

From 0aa7b83f6116d8140563eace3d8119fa777dc5f5 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Fri, 13 Feb 2026 19:16:53 -0500
Subject: [PATCH 07/22] Added unittest for `Operator::coulomb_opop` and fixed
 logic errors in unique am shell sets and phase change for this operator

---
 export/tests/unit/test-2body.cc | 167 ++++++++++++++++++++++++++++++--
 include/libint2/engine.impl.h   |  38 +++++---
 src/bin/libint/build_libint.cc  |  15 ++-
 src/bin/libint/oper.h           |  12 +--
 4 files changed, 198 insertions(+), 34 deletions(-)

diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index fd602a910..ee08ed972 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -1,5 +1,5 @@
 /*
- *  Copyright (C) 2004-2024 Edward F. Valeev
+ *  Copyright (C) 2004-2026 Edward F. Valeev
  *
  *  This file is part of Libint library.
  *
@@ -344,6 +344,153 @@ TEST_CASE("eri geometric derivatives", "[engine][2-body]") {
   }
 }
 
+TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
+  std::vector<Shell> obs{
+      // pseudorandom s
+      Shell{{1.0, 0.3}, {{0, false, {0.9, 0.3}}}, {{0.0, 0.0, 0.0}}},
+      // pseudorandom p
+      Shell{{2.0, 0.4}, {{1, false, {0.8, -0.2}}}, {{1.0, 1.0, 1.0}}}};
+
+  const auto max_nprim = libint2::max_nprim(obs);
+  const auto max_l = libint2::max_l(obs);
+  typedef std::array<unsigned int, 12> der_idx;
+
+  SECTION("Coulombσpσp") {
+    Engine engine;
+    try {
+      engine = Engine(Operator::coulomb_opop, max_nprim, max_l, 0);
+      // TODO: need another unit test for derivatives of RKB ERIs
+    } catch (
+        Engine::lmax_exceeded &) {  // skip the test if lmax exceeded or libint2
+                                    // not configured with RKB support
+      return;
+    }
+
+    const auto nshell = obs.size();
+    for (int s0 = 0; s0 != nshell; ++s0) {
+      for (int s1 = 0; s1 != nshell; ++s1) {
+        for (int s2 = 0; s2 != nshell; ++s2) {
+          for (int s3 = 0; s3 != nshell; ++s3) {
+            const auto &results =
+                engine.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            assert(results.size() ==
+                   4);  // we get 4 buffers for each quaternion component
+
+            LIBINT2_REF_REALTYPE Aref[3];
+            for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
+            LIBINT2_REF_REALTYPE Bref[3];
+            for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
+            LIBINT2_REF_REALTYPE Cref[3];
+            for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
+            LIBINT2_REF_REALTYPE Dref[3];
+            for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
+
+            int ijkl = 0;
+
+            int l0, m0, n0;
+            FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
+
+            int l1, m1, n1;
+            FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
+
+            int l2, m2, n2;
+            FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
+
+            int l3, m3, n3;
+            FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
+
+            std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0, 0.0,
+                                                                 0.0};
+            uint p0123 = 0;
+            for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
+              for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
+                for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
+                  for (uint p3 = 0; p3 < obs[s3].nprim(); p3++, p0123++) {
+                    const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
+                    const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
+                    const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
+                    const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
+
+                    const LIBINT2_REF_REALTYPE c0 = obs[s0].contr[0].coeff[p0];
+                    const LIBINT2_REF_REALTYPE c1 = obs[s1].contr[0].coeff[p1];
+                    const LIBINT2_REF_REALTYPE c2 = obs[s2].contr[0].coeff[p2];
+                    const LIBINT2_REF_REALTYPE c3 = obs[s3].contr[0].coeff[p3];
+                    const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
+
+                    auto eri_drr = [&](der_idx d_rr) {
+                      return eri(d_rr.data(), l0, m0, n0, alpha0, Aref, l1, m1,
+                                 n1, alpha1, Bref, l2, m2, n2, alpha2, Cref, l3,
+                                 m3, n3, alpha3, Dref, 0);
+                    };
+
+                    // e.g. d_xx maps the derivative index of derivative w.r.t x
+                    // coord of ket1 and x coord of ket2 in Chemist notation.
+                    der_idx d_xx = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0};
+                    der_idx d_yy = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
+                    der_idx d_zz = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1};
+                    ref_coulomb_opop[0] +=
+                        c0123 * (eri_drr(d_xx) + eri_drr(d_yy) + eri_drr(d_zz));
+
+                    der_idx d_yz = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1};
+                    der_idx d_zy = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0};
+                    ref_coulomb_opop[1] +=
+                        c0123 * (eri_drr(d_yz) - eri_drr(d_zy));
+
+                    der_idx d_zx = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
+                    der_idx d_xz = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
+                    ref_coulomb_opop[2] +=
+                        c0123 * (eri_drr(d_zx) - eri_drr(d_xz));
+
+                    der_idx d_xy = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0};
+                    der_idx d_yx = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0};
+                    ref_coulomb_opop[3] +=
+                        c0123 * (eri_drr(d_xy) - eri_drr(d_yx));
+                  }
+                }
+              }
+            }
+
+            const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
+            const double RELATIVE_DEVIATION_THRESHOLD =
+                1.0E-9;  // For more detail on choice of these thresholds, see
+                         // the comments in the TEST_CASE "eri geometric
+                         // derivatives"
+
+            std::array<LIBINT2_REF_REALTYPE, 4> abs_errs;
+            std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs;
+
+            for (auto comp = 0; comp < 4; ++comp) {
+              abs_errs[comp] =
+                  abs(ref_coulomb_opop[comp] - results[comp][ijkl]);
+              rel_abs_errs[comp] = abs(abs_errs[comp] / ref_coulomb_opop[comp]);
+              bool not_ok = rel_abs_errs[comp] > RELATIVE_DEVIATION_THRESHOLD &&
+                            abs_errs[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
+              // no 3^n prefactor here since the intrinsic deriv order is 2
+              if (not_ok) {
+                std::cout << "(l0 l1| l2 l3) = "
+                          << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
+                          << ") "
+                          << "Elem " << ijkl << " comp= " << comp
+                          << " : ref = " << ref_coulomb_opop[comp]
+                          << " libint = " << results[comp][ijkl]
+                          << " relabs_error = " << rel_abs_errs[comp]
+                          << " abs_error = " << abs_errs[comp] << std::endl;
+              }
+              REQUIRE(!not_ok);
+            }
+
+            ++ijkl;
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+          }
+        }
+      }
+    }
+  }
+}
+
 TEST_CASE("Erfx_Coulomb integrals", "[engine][2-body]") {
   // pseudorandom s shells
   std::vector<Shell> obs{
@@ -374,12 +521,12 @@ TEST_CASE("Erfx_Coulomb integrals", "[engine][2-body]") {
       REQUIRE(results[0] != nullptr);
       switch (k) {
         /* VALIDATION WOLFRAM CODE:
-(* Integral of Coulomb kernel damped by (\[Lambda] Erf[\[Omega] r] + \
-\[Sigma] Erfc[\[Omega] r]), over unit-normalized s functions, \
-see Eq 52 in DOI 10.1039/b605188j *)
-F0[T_] := If[T == 0, 1, Sqrt[\[Pi]/T]*Erf[Sqrt[T]]/2];
-sN[a_] := ((2 a)/\[Pi])^(3/4);
-VVeeErfx[\[Alpha]1_, A1_List, \[Alpha]2_, A2_List, \[Beta]1_,
+ (* Integral of Coulomb kernel damped by (\[Lambda] Erf[\[Omega] r] + \
+ \[Sigma] Erfc[\[Omega] r]), over unit-normalized s functions, \
+ see Eq 52 in DOI 10.1039/b605188j *)
+ F0[T_] := If[T == 0, 1, Sqrt[\[Pi]/T]*Erf[Sqrt[T]]/2];
+ sN[a_] := ((2 a)/\[Pi])^(3/4);
+ VVeeErfx[\[Alpha]1_, A1_List, \[Alpha]2_, A2_List, \[Beta]1_,
    B1_List, \[Beta]2_, B2_List, \[Omega]_, \[Lambda]_, \[Sigma]_] :=
   Module[{\[Gamma]1, \[Gamma]2, P1, P2, K1, K2, T, result, \[Rho]},
    \[Gamma]1 = \[Alpha]1 + \[Beta]1;
@@ -397,13 +544,13 @@ VVeeErfx[\[Alpha]1_, A1_List, \[Alpha]2_, A2_List, \[Beta]1_,
            T]) sN[\[Alpha]1] sN[\[Alpha]2] sN[\[Beta]1] sN[\[Beta]2];
    Return[result];
    ];
-Print[CForm[
+ Print[CForm[
   N[VVeeErfx[1, {0, 0, 0},  3, {2, 2, 2}, 2, {1, 1, 1}, 4, {3, 3, 3},
     1.1, 1, 0], 20]]]
-Print[CForm[
+ Print[CForm[
   N[VVeeErfx[1, {0, 0, 0},  3, {2, 2, 2}, 2, {1, 1, 1}, 4, {3, 3, 3},
     1.1, 0, 1], 20]]]
-Print[CForm[
+ Print[CForm[
   N[VVeeErfx[1, {0, 0, 0},  3, {2, 2, 2}, 2, {1, 1, 1}, 4, {3, 3, 3},
     1.1, 2, 3], 20]]]
          */
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 9381b972a..ec570c045 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -1215,17 +1215,21 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
     LIBINT2_SHELLQUARTET_SET_STANDARD  // standard angular momentum ordering
   const auto swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
   const auto swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
-  const auto swap_braket =
-      ((braket_ == BraKet::xx_xx) && (tbra1.contr[0].l + tbra2.contr[0].l >
-                                      tket1.contr[0].l + tket2.contr[0].l)) ||
-      braket_ == BraKet::xx_xs;
+  const auto swap_braket = ((braket_ == BraKet::xx_xx) &&
+                            (tbra1.contr[0].l + tbra2.contr[0].l <
+                             tket1.contr[0].l + tket2.contr[0].l) &&
+                            (oper_ != Operator::coulomb_opop)) ||
+                           braket_ == BraKet::xx_xs;
+  // N.B. cannot swap bra and ket for coulomb_opop since the ket is mutated by
+  // this operator
 #else  // orca angular momentum ordering
   const auto swap_tbra = (tbra1.contr[0].l > tbra2.contr[0].l);
   const auto swap_tket = (tket1.contr[0].l > tket2.contr[0].l);
-  const auto swap_braket =
-      ((braket_ == BraKet::xx_xx) && (tbra1.contr[0].l + tbra2.contr[0].l <
-                                      tket1.contr[0].l + tket2.contr[0].l)) ||
-      braket_ == BraKet::xx_xs;
+  const auto swap_braket = ((braket_ == BraKet::xx_xx) &&
+                            (tbra1.contr[0].l + tbra2.contr[0].l <
+                             tket1.contr[0].l + tket2.contr[0].l) &&
+                            (oper_ != Operator::coulomb_opop)) ||
+                           braket_ == BraKet::xx_xs;
   assert(false && "feature not implemented");
   abort();
 #endif
@@ -1659,7 +1663,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
               const auto Wz =
                   (gammap_o_gammapgammaq * P[2] + gammaq_o_gammapgammaq * Q[2]);
 
-              if (deriv_order_ > 0 || lmax_bra > 0) {
+              if (deriv_order_ + intrinsic_deriv_order() > 0 || lmax_bra > 0) {
 #if LIBINT2_DEFINED(eri, WP_x)
                 primdata.WP_x[0] = Wx - P[0];
 #endif
@@ -1670,7 +1674,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 primdata.WP_z[0] = Wz - P[2];
 #endif
               }
-              if (deriv_order_ > 0 || lmax_ket > 0) {
+              if (deriv_order_ + intrinsic_deriv_order() > 0 || lmax_ket > 0) {
 #if LIBINT2_DEFINED(eri, WQ_x)
                 primdata.WQ_x[0] = Wx - Q[0];
 #endif
@@ -1916,8 +1920,10 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                "the angular momentum limit is exceeded");
         assert(ket2.contr[0].l <= ket_lmax &&
                "the angular momentum limit is exceeded");
+
         buildfnidx = (bra1.contr[0].l * ket_lmax + ket1.contr[0].l) * ket_lmax +
                      ket2.contr[0].l;
+
 #ifdef LIBINT_ERI3_PURE_SH
         if (bra1.contr[0].l > 1)
           assert(bra1.contr[0].pure &&
@@ -2110,9 +2116,15 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 const auto tgt_row_idx =
                     !swap_tbra ? r1 * nr2 + r2 : r2 * nr1 + r1;
                 Map tgt_blk_mat(tgt_ptr + tgt_row_idx * ncol, nc1_tgt, nc2_tgt);
-                if (swap_tket)
-                  tgt_blk_mat = src_blk_mat.transpose();
-                else
+                if (swap_tket) {
+                  Shell::real_t oper_cart_component_phase = 1.0;
+                  if (oper_ == Operator::coulomb_opop && s > 0)
+                    oper_cart_component_phase =
+                        -1.0;  // x,y,z quaternion components flip sign on
+                               // swapping ket for coulomb_opop
+                  tgt_blk_mat =
+                      oper_cart_component_phase * src_blk_mat.transpose();
+                } else
                   tgt_blk_mat = src_blk_mat;
               }
             }  // end of loop
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 8acb3814d..dfb48fbd0 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -1118,10 +1118,13 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
-                  LIBINT_SHELL_SET)>::value(la, lb, lc, ld))
-            continue;
-
+          if (std::is_same<OperType, CoulombσpσpOper>::value) {
+            if (!(la >= lb && lc >= ld)) continue;
+          } else {
+            if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
+                    LIBINT_SHELL_SET)>::value(la, lb, lc, ld))
+              continue;
+          }
           // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(la+lb >
           // lc+ld ? false : true));
           std::shared_ptr<Tactic> tactic(
@@ -1223,7 +1226,7 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
             CGShell c(lc);
             CGShell d(ld);
 
-            if constexpr (!std::is_same<OperType, CoulombσpσpOper>::value) {
+            if constexpr (std::is_same<OperType, TwoPRep>::value) {
               OperType oper;
               oper = OperType(descrs[0]);
               std::shared_ptr<TwoBody_sh_11_11> abcd =
@@ -1236,6 +1239,8 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
               oss << "_" << label;
               oss << "_" << c.label() << "_" << d.label();
               abcd_label = oss.str();
+              std::cout << "(" << a.label() << " " << b.label() << " | "
+                        << c.label() << " " << d.label() << ") ";
             }
           }
           // + derivative level (if deriv_level > 0)
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index daa6aa644..36292613a 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -301,11 +301,11 @@ struct σpVσp_Descr : public Contractable<σpVσp_Descr> {
     if (quaternion_index() == 0)
       descr += "0";
     else if (quaternion_index() == 1)
-      descr += "Z";
-    else if (quaternion_index() == 2)
       descr += "X";
-    else if (quaternion_index() == 3)
+    else if (quaternion_index() == 2)
       descr += "Y";
+    else if (quaternion_index() == 3)
+      descr += "Z";
     else
       abort();
     return descr + "]";
@@ -417,11 +417,11 @@ struct Coulombσpσp_Descr : public Contractable<Coulombσpσp_Descr> {
     if (quaternion_index() == 0)
       descr += "0";
     else if (quaternion_index() == 1)
-      descr += "Z";
-    else if (quaternion_index() == 2)
       descr += "X";
-    else if (quaternion_index() == 3)
+    else if (quaternion_index() == 2)
       descr += "Y";
+    else if (quaternion_index() == 3)
+      descr += "Z";
     else
       abort();
     return descr + "]";

From 62142d9671e87c397e7938f6535eff0972e127ee Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sat, 14 Feb 2026 10:54:02 -0500
Subject: [PATCH 08/22] bugfix: revert incorrect braket swapping criteria for
 `Operator::coulomb`

---
 include/libint2/engine.impl.h  | 2 +-
 src/bin/libint/build_libint.cc | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index ec570c045..834fd9c6d 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -1216,7 +1216,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
   const auto swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
   const auto swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
   const auto swap_braket = ((braket_ == BraKet::xx_xx) &&
-                            (tbra1.contr[0].l + tbra2.contr[0].l <
+                            (tbra1.contr[0].l + tbra2.contr[0].l >
                              tket1.contr[0].l + tket2.contr[0].l) &&
                             (oper_ != Operator::coulomb_opop)) ||
                            braket_ == BraKet::xx_xs;
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index dfb48fbd0..f70d16644 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -77,8 +77,9 @@ struct ShellQuartetSetPredicate {
 };
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_Standard> {
-  static bool value(int la, int lb, int lc, int ld) {
-    return la >= lb && lc >= ld && la + lb <= lc + ld;
+  static bool value(int la, int lb, int lc, int ld,
+                    bool p1p2_swappable = true) {
+    return la >= lb && lc >= ld && (!p1p2_swappable || la + lb <= lc + ld);
   }
 };
 template <>
@@ -1118,7 +1119,7 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          if (std::is_same<OperType, CoulombσpσpOper>::value) {
+          if constexpr (std::is_same<OperType, CoulombσpσpOper>::value) {
             if (!(la >= lb && lc >= ld)) continue;
           } else {
             if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
@@ -1239,8 +1240,6 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
               oss << "_" << label;
               oss << "_" << c.label() << "_" << d.label();
               abcd_label = oss.str();
-              std::cout << "(" << a.label() << " " << b.label() << " | "
-                        << c.label() << " " << d.label() << ") ";
             }
           }
           // + derivative level (if deriv_level > 0)

From b2b79ab24c02f9405614e28d4262f86eaacdc0e0 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Wed, 18 Feb 2026 17:19:34 -0500
Subject: [PATCH 09/22] generate code for missing contracted kernels for
 `deriv(>0)-eri` + cleanup

---
 src/bin/libint/build_libint.cc                | 34 +++++++++----------
 .../comp_11_Coulomb\317\203p\317\203p_11.h"   |  8 ++---
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index f70d16644..c892dd618 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -73,7 +73,7 @@ enum ShellSetType {
 template <ShellSetType ShSet>
 struct ShellQuartetSetPredicate {
   // return true if this set of angular momenta is included
-  static bool value(int la, int lb, int lc, int ld);
+  static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true);
 };
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_Standard> {
@@ -84,8 +84,10 @@ struct ShellQuartetSetPredicate<ShellSetType_Standard> {
 };
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_ORCA> {
-  static bool value(int la, int lb, int lc, int ld) {
-    return la <= lb && lc <= ld && (la < lc || (la == lc && lb <= ld));
+  static bool value(int la, int lb, int lc, int ld,
+                    bool p1p2_swappable = true) {
+    return la <= lb && lc <= ld &&
+           (!p1p2_swappable || (la < lc || (la == lc && lb <= ld)));
   }
 };
 template <ShellSetType ShSet>
@@ -1115,17 +1117,16 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
   std::shared_ptr<CodeContext> context(new CppCodeContext(cparams));
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
+  bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value;
+
   for (unsigned int la = 0; la <= lmax; la++) {
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          if constexpr (std::is_same<OperType, CoulombσpσpOper>::value) {
-            if (!(la >= lb && lc >= ld)) continue;
-          } else {
-            if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
-                    LIBINT_SHELL_SET)>::value(la, lb, lc, ld))
-              continue;
-          }
+          if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
+                  LIBINT_SHELL_SET)>::value(la, lb, lc, ld, p1_p2_swappable))
+            continue;
+
           // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(la+lb >
           // lc+ld ? false : true));
           std::shared_ptr<Tactic> tactic(
@@ -1141,9 +1142,7 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           /////////////////////////////////
           // loop over operator components
           /////////////////////////////////
-          // most important operators have 1 component ...
-          std::vector<OperDescrType> descrs(1);  // operator descriptors
-          // important EXCEPTION: multipole moments
+          std::vector<OperDescrType> descrs(1);
           if (std::is_same<OperType, CoulombσpσpOper>::value) {
             // reset descriptors array
             descrs.resize(0);
@@ -1156,8 +1155,9 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           // unroll only if max_am <= cparams->max_am_opt(task) using std::max;
           const unsigned int max_am = max(max(la, lb), max(lc, ld));
           const bool need_to_optimize = (max_am <= cparams->max_am_opt(task));
+          const auto nopers = descrs.size();
           const bool need_to_unroll =
-              l_to_cgshellsize(la) * l_to_cgshellsize(lb) *
+              nopers * l_to_cgshellsize(la) * l_to_cgshellsize(lb) *
                   l_to_cgshellsize(lc) * l_to_cgshellsize(ld) <=
               cparams->unroll_threshold();
           const unsigned int unroll_threshold =
@@ -1285,11 +1285,9 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           iface->to_static_init(oss.str());
 
           // need to declare this function internally
-          for (std::deque<std::string>::const_iterator i =
-                   decl_filenames.begin();
-               i != decl_filenames.end(); ++i) {
+          for (auto& decl_filename : decl_filenames) {
             oss.str("");
-            oss << "#include <" << *i << ">" << endl;
+            oss << "#include <" << decl_filename << ">" << endl;
             iface->to_int_iface(oss.str());
           }
 
diff --git "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
index 82ee512ef..14621936b 100644
--- "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
@@ -83,11 +83,9 @@ CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11(
 
   const auto &oper = Tint->oper();
 
-  // can express integrals of Coulombσpσp in terms of
-  // derivative integrals of 1/r12 for primitive Gaussians
-  // only
-  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
-    return;
+  // TODO: need to do this only for uncontracted gaussians
+  //  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+  //    return;
 
   using namespace libint2::algebra;
   using namespace libint2::prefactor;

From d4247b0563b541a2b4ffce48fb16578c550a3e5f Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Fri, 20 Feb 2026 16:56:32 -0500
Subject: [PATCH 10/22] =?UTF-8?q?reverted=20allowing=20contracted=20shells?=
 =?UTF-8?q?=20for=20Coulomb=CF=83p=CF=83p=20and=20use=20additional=20diffe?=
 =?UTF-8?q?rentiator=20when=20on=20MacOS?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/libint2.h                             | 17 ++++++++-------
 .../comp_11_Coulomb\317\203p\317\203p_11.h"   |  5 ++---
 src/bin/libint/gauss.cc                       | 21 ++++++++++++++++---
 3 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/include/libint2.h b/include/libint2.h
index 59d17ef65..25a231be7 100644
--- a/include/libint2.h
+++ b/include/libint2.h
@@ -22,18 +22,19 @@
 #define _libint2_header_
 
 #define LIBINT_T_SS_EREP_SS(mValue) \
-  _aB_s___0__s___1___TwoPRep_s___0__s___1___Ab__up_##mValue
+  _aB_s____0__s____1___TwoPRep_s____0__s____1___Ab__up_##mValue
 #define LIBINT_T_SS_Km1G12_SS(mValue) \
-  _aB_s___0__s___1___r12_minus_1_g12_s___0__s___1___Ab__up_##mValue
+  _aB_s____0__s____1___r12_minus_1_g12_s____0__s____1___Ab__up_##mValue
 #define LIBINT_T_SS_K0G12_SS_0 \
-  _aB_s___0__s___1___r12_0_g12_s___0__s___1___Ab__up_0
+  _aB_s____0__s____1___r12_0_g12_s____0__s____1___Ab__up_0
 #define LIBINT_T_SS_K2G12_SS_0 \
-  _aB_s___0__s___1___r12_2_g12_s___0__s___1___Ab__up_0
+  _aB_s____0__s____1___r12_2_g12_s____0__s____1___Ab__up_0
 #define LIBINT_T_SS_K4G12_SS_0 \
-  _aB_s___0__s___1___r12_4_g12_s___0__s___1___Ab__up_0
-#define LIBINT_T_S_OVERLAP_S _aB_s___0___Overlap_s___0___Ab__up_
-#define LIBINT_T_S_KINETIC_S _aB_s___0___Kinetic_s___0___Ab__up_
-#define LIBINT_T_S_ELECPOT_S(mValue) _aB_s___0___ElecPot_s___0___Ab__up_##mValue
+  _aB_s____0__s____1___r12_4_g12_s____0__s____1___Ab__up_0
+#define LIBINT_T_S_OVERLAP_S _aB_s____0___Overlap_s____0___Ab__up_
+#define LIBINT_T_S_KINETIC_S _aB_s____0___Kinetic_s____0___Ab__up_
+#define LIBINT_T_S_ELECPOT_S(mValue) \
+  _aB_s____0___ElecPot_s____0___Ab__up_##mValue
 
 #include <libint2/util/configuration.h>
 #include <libint2/util/generated/libint2_params.h>
diff --git "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
index 14621936b..315135994 100644
--- "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
@@ -83,9 +83,8 @@ CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11(
 
   const auto &oper = Tint->oper();
 
-  // TODO: need to do this only for uncontracted gaussians
-  //  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
-  //    return;
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
 
   using namespace libint2::algebra;
   using namespace libint2::prefactor;
diff --git a/src/bin/libint/gauss.cc b/src/bin/libint/gauss.cc
index 3899283ab..aa60b1de7 100644
--- a/src/bin/libint/gauss.cc
+++ b/src/bin/libint/gauss.cc
@@ -115,8 +115,15 @@ std::string CGF::label() const {
   unsigned int am = qn_[0] + qn_[1] + qn_[2];
   std::string deriv_label;
   if (!deriv_.zero()) deriv_label = deriv_.label();
-  const std::string am_string = am_to_symbol(am, contracted());
+  std::string am_string = am_to_symbol(am, contracted());
   std::ostringstream oss;
+
+  // Some OSs can have case-insensitive filesystem e.g., MacOS. So here we add
+  // additional identifier for primitive function labels
+  if (!this->contracted()) {
+    am_string += "_";
+  }
+
   oss << (pure_sh_ && am > 0 ? "W" : "") << am_string << deriv_label << "_";
   if (am == 0) return oss.str();
 
@@ -223,8 +230,16 @@ CGShell::~CGShell() {}
 
 std::string CGShell::label() const {
   if (is_unit()) return "unit";
-  std::string result = std::string(pure_sh_ && qn_[0] > 0 ? "W" : "") +
-                       am_to_symbol(qn_[0], contracted());
+  std::string am_symbol = am_to_symbol(qn_[0], contracted());
+
+  // Some OSs can have case-insensitive filesystem e.g., MacOS. So here we add
+  // additional identifier for primitive shell labels
+  if (!this->contracted()) {
+    am_symbol += "_";
+  }
+
+  std::string result =
+      std::string(pure_sh_ && qn_[0] > 0 ? "W" : "") + am_symbol;
   if (!deriv_.zero()) result += deriv_.label();
   return result;
 }

From 054b8eba4a03140eeb3ceb66be90d4f3d355da5a Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sat, 21 Feb 2026 11:51:43 -0500
Subject: [PATCH 11/22] can generate code for all small component RKB ERIs
 i.e., `(SS|SS)`

---
 src/bin/libint/build_libint.cc                |  15 +-
 ...3p\317\203pCoulomb\317\203p\317\203p_11.h" | 264 ++++++++++++++++++
 src/bin/libint/master_ints_list.h             |  12 +-
 src/bin/libint/master_rrs_list.h              |   4 +
 src/bin/libint/oper.h                         |  36 +++
 src/bin/libint/strategy.cc                    |   8 +
 6 files changed, 330 insertions(+), 9 deletions(-)
 create mode 100644 "src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"

diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index c892dd618..8712580a5 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -287,6 +287,11 @@ Coulombσpσp_Descr make_descr<Coulombσpσp_Descr>(int p, int, int) {
   return Coulombσpσp_Descr(p);
 }
 
+template <>
+σpσpCoulombσpσp_Descr make_descr<σpσpCoulombσpσp_Descr>(int p, int, int) {
+  return σpσpCoulombσpσp_Descr(p);
+}
+
 }  // namespace
 
 template <typename _OperType>
@@ -582,8 +587,8 @@ void try_main(int argc, char* argv[]) {
 #endif
 
 #ifdef LIBINT_INCLUDE_RKB_ERI
-#define BOOST_PP_RKB_ERI_TASK_TUPLE (coulomb_opop)
-#define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE (CoulombσpσpOper)
+#define BOOST_PP_RKB_ERI_TASK_TUPLE (coulomb_opop, opop_coulomb_opop)
+#define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE (CoulombσpσpOper, σpσpCoulombσpσpOper)
 #define BOOST_PP_RKB_ERI_TASK_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_TUPLE)
 #define BOOST_PP_RKB_ERI_TASK_OPER_LIST \
@@ -2328,9 +2333,9 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 
       {  // 2-body ints
 
-#define BOOST_PP_TWOBODY_TASKOPER_TUPLE                                      \
-  ("eri", "coulomb_opop", "r12kg12", "r12_0_g12", "r12_2_g12", "g12_T1_g12", \
-   "g12dkh")
+#define BOOST_PP_TWOBODY_TASKOPER_TUPLE                                \
+  ("eri", "coulomb_opop", "opop_coulomb_opop", "r12kg12", "r12_0_g12", \
+   "r12_2_g12", "g12_T1_g12", "g12dkh")
 #define BOOST_PP_TWOBODY_TASKOPER_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_TWOBODY_TASKOPER_TUPLE)
 
diff --git "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
new file mode 100644
index 000000000..1cacc7b3a
--- /dev/null
+++ "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
@@ -0,0 +1,264 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
+#define LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
+
+#include <gaussoper.h>
+#include <generic_rr.h>
+#include <twoprep_11_11.h>
+
+namespace libint2 {
+
+/**
+ * this computes integral of \f$ \sigma \cdot \hat{p}_1 \sigma \cdot \hat{p}_2
+ * \frac{1}{r_{ij}} \sigma \cdot \hat{p}_3 \sigma \cdot \hat{p}_4 \f$ over
+ * CGShell/CGF by rewriting it as a linear combination of integrals over
+ * derivatives of \f$ \frac{1}{r_{ij}} \f$
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_11_σpσpCoulombσpσp_11
+    : public GenericRecurrenceRelation<
+          CR_11_σpσpCoulombσpσp_11<F>, F,
+          GenIntegralSet_11_11<F, σpσpCoulombσpσpOper, mType>> {
+ public:
+  typedef CR_11_σpσpCoulombσpσp_11<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef σpσpCoulombσpσpOper OperType;
+  typedef GenIntegralSet_11_11<F, σpσpCoulombσpσpOper, mType> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 100;  // TODO figure out
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private, used by ParentType::Instance that maintains
+  /// registry of these objects
+  CR_11_σpσpCoulombσpσp_11(const std::shared_ptr<TargetType> &,
+                           unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+};
+
+template <typename F>
+CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11(
+    const std::shared_ptr<TargetType> &Tint, unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_bra(/* particle */ 1) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 1) == 1);
+
+  F a(Tint->bra(0, 0));
+  F b(Tint->ket(0, 0));
+  F c(Tint->bra(1, 0));
+  F d(Tint->ket(1, 0));
+
+  const auto &oper = Tint->oper();
+
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  const mType zero_m(0u);
+
+  ChildFactory<ThisType,
+               GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
+      factory(this);
+
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  auto mc = [&](const int r1, const int r2, const int r3, const int r4) {
+    F a_r1{a};
+    a_r1.deriv().inc(r1);
+    F b_r2{b};
+    b_r2.deriv().inc(r2);
+    F c_r3{c};
+    c_r3.deriv().inc(r3);
+    F d_r4{d};
+    d_r4.deriv().inc(r4);
+    return factory.make_child(a_r1, b_r2, c_r3, d_r4, zero_m);
+  };
+
+  // Component wise generation for quaternion :
+  // ( (σ.p) a (σ.p)b | 1/r12 | (σ.p) c (σ.p) d )
+  switch (oper->descr().quaternion_index()) {
+    case 0: {
+      // zeroth component =
+      // x1 x2 x3 x4 + y1 y2 x3 x4 - y1 x2 y3 x4 + x1 y2 y3 x4 + y1 x2 x3 y4 -
+      // x1 y2 x3 y4 + x1 x2 y3 y4 + y1 y2 y3 y4 + z1 z2 x3 x4 + z1 z2 y3 y4 -
+      // z1 x2 z3 x4 - z1 y2 z3 y4 + x1 z2 z3 x4 + y1 z2 z3 y4 + z1 x2 x3 z4 +
+      // z1 y2 y3 z4 - x1 z2 x3 z4 - y1 z2 y3 z4 + x1 x2 z3 z4 + y1 y2 z3 z4 +
+      // z1 z2 z3 z4
+      auto xxxx = mc(x, x, x, x);
+      auto yyxx = mc(y, y, x, x);
+      auto yxyx = mc(y, x, y, x);
+      auto xyyx = mc(x, y, y, x);
+      auto yxxy = mc(y, x, x, y);
+      auto xyxy = mc(x, y, x, y);
+      auto xxyy = mc(x, x, y, y);
+      auto yyyy = mc(y, y, y, y);
+      auto zzxx = mc(z, z, x, x);
+      auto zzyy = mc(z, z, y, y);
+      auto zxzx = mc(z, x, z, x);
+      auto zyzy = mc(z, y, z, y);
+      auto xzzx = mc(x, z, z, x);
+      auto yzzy = mc(y, z, z, y);
+      auto zxxz = mc(z, x, x, z);
+      auto zyyz = mc(z, y, y, z);
+      auto xzxz = mc(x, z, x, z);
+      auto yzyz = mc(y, z, y, z);
+      auto xxzz = mc(x, x, z, z);
+      auto yyzz = mc(y, y, z, z);
+      auto zzzz = mc(z, z, z, z);
+      if (is_simple()) {
+        expr_ = xxxx + yyxx - yxyx + xyyx + yxxy - xyxy + xxyy + yyyy + zzxx +
+                zzyy - zxzx - zyzy + xzzx + yzzy + zxxz + zyyz - xzxz - yzyz +
+                xxzz + yyzz + zzzz;
+        nflops_ += 20;
+      }
+    } break;
+    case 1: {
+      // x component =
+      // - z1 y2 x3 x4 + z1 x2 y3 x4 - z1 x2 x3 y4 - z1 y2 y3 y4 + y1 z2 x3 x4 -
+      // x1 z2 y3 x4 + x1 z2 x3 y4 + y1 z2 y3 y4 - y1 x2 z3 x4 + x1 y2 z3 x4 -
+      // x1 x2 z3 y4 - y1 y2 z3 y4 - z1 z2 z3 y4 + y1 x2 x3 z4 - x1 y2 x3 z4 +
+      // x1 x2 y3 z4 + y1 y2 y3 z4 + z1 z2 y3 z4 - z1 y2 z3 z4 + y1 z2 z3 z4
+      auto zyxx = mc(z, y, x, x);
+      auto zxyx = mc(z, x, y, x);
+      auto zxxy = mc(z, x, x, y);
+      auto zyyy = mc(z, y, y, y);
+      auto yzxx = mc(y, z, x, x);
+      auto xzyx = mc(x, z, y, x);
+      auto xzxy = mc(x, z, x, y);
+      auto yzyy = mc(y, z, y, y);
+      auto yxzx = mc(y, x, z, x);
+      auto xyzx = mc(x, y, z, x);
+      auto xxzy = mc(x, x, z, y);
+      auto yyzy = mc(y, y, z, y);
+      auto zzzy = mc(z, z, z, y);
+      auto yxxz = mc(y, x, x, z);
+      auto xyxz = mc(x, y, x, z);
+      auto xxyz = mc(x, x, y, z);
+      auto yyyz = mc(y, y, y, z);
+      auto zzyz = mc(z, z, y, z);
+      auto zyzz = mc(z, y, z, z);
+      auto yzzz = mc(y, z, z, z);
+      if (is_simple()) {
+        // swapped order of first two terms compiler does not like negative sign
+        // in front of first term
+        expr_ = zxyx - zyxx - zxxy - zyyy + yzxx - xzyx + xzxy + yzyy - yxzx +
+                xyzx - xxzy - yyzy - zzzy + yxxz - xyxz + xxyz + yyyz + zzyz -
+                zyzz + yzzz;
+        nflops_ += 19;
+      }
+    } break;
+    case 2: {
+      // y component =
+      // z1 x2 x3 x4 + z1 y2 y3 x4 - z1 y2 x3 y4 + z1 x2 y3 y4 - x1 z2 x3 x4 -
+      // y1 z2 y3 x4 + y1 z2 x3 y4 - x1 z2 y3 y4 + x1 x2 z3 x4 + y1 y2 z3 x4 -
+      // y1 x2 z3 y4 + x1 y2 z3 y4 + z1 z2 z3 x4 - x1 x2 x3 z4 - y1 y2 x3 z4 +
+      // y1 x2 y3 z4 - x1 y2 y3 z4 - z1 z2 x3 z4 + z1 x2 z3 z4 - x1 z2 z3 z4
+      auto zxxx = mc(z, x, x, x);
+      auto zyyx = mc(z, y, y, x);
+      auto zyxy = mc(z, y, x, y);
+      auto zxyy = mc(z, x, y, y);
+      auto xzxx = mc(x, z, x, x);
+      auto yzyx = mc(y, z, y, x);
+      auto yzxy = mc(y, z, x, y);
+      auto xzyy = mc(x, z, y, y);
+      auto xxzx = mc(x, x, z, x);
+      auto yyzx = mc(y, y, z, x);
+      auto yxzy = mc(y, x, z, y);
+      auto xyzy = mc(x, y, z, y);
+      auto zzzx = mc(z, z, z, x);
+      auto xxxz = mc(x, x, x, z);
+      auto yyxz = mc(y, y, x, z);
+      auto yxyz = mc(y, x, y, z);
+      auto xyyz = mc(x, y, y, z);
+      auto zzxz = mc(z, z, x, z);
+      auto zxzz = mc(z, x, z, z);
+      auto xzzz = mc(x, z, z, z);
+
+      if (is_simple()) {
+        expr_ = zxxx + zyyx - zyxy + zxyy - xzxx - yzyx + yzxy - xzyy + xxzx +
+                yyzx - yxzy + xyzy + zzzx - xxxz - yyxz + yxyz - xyyz - zzxz +
+                zxzz - xzzz;
+        nflops_ += 19;
+      }
+    } break;
+    case 3: {
+      // z component =
+      // - y1 x2 x3 x4 + x1 y2 x3 x4 - x1 x2 y3 x4 - y1 y2 y3 x4 + x1 x2 x3 y4 +
+      // y1 y2 x3 y4 - y1 x2 y3 y4 + x1 y2 y3 y4 - z1 z2 y3 x4 + z1 z2 x3 y4 +
+      // z1 y2 z3 x4 - z1 x2 z3 y4 - y1 z2 z3 x4 + x1 z2 z3 y4 - z1 y2 x3 z4 +
+      // z1 x2 y3 z4 + y1 z2 x3 z4 - x1 z2 y3 z4 - y1 x2 z3 z4 + x1 y2 z3 z4
+      auto yxxx = mc(y, x, x, x);
+      auto xyxx = mc(x, y, x, x);
+      auto xxyx = mc(x, x, y, x);
+      auto yyyx = mc(y, y, y, x);
+      auto xxxy = mc(x, x, x, y);
+      auto yyxy = mc(y, y, x, y);
+      auto yxyy = mc(y, x, y, y);
+      auto xyyy = mc(x, y, y, y);
+      auto zzyx = mc(z, z, y, x);
+      auto zzxy = mc(z, z, x, y);
+      auto zyzx = mc(z, y, z, x);
+      auto zxzy = mc(z, x, z, y);
+      auto yzzx = mc(y, z, z, x);
+      auto xzzy = mc(x, z, z, y);
+      auto zyxz = mc(z, y, x, z);
+      auto zxyz = mc(z, x, y, z);
+      auto yzxz = mc(y, z, x, z);
+      auto xzyz = mc(x, z, y, z);
+      auto yxzz = mc(y, x, z, z);
+      auto xyzz = mc(x, y, z, z);
+      if (is_simple()) {
+        expr_ = xyxx - yxxx - xxyx - yyyx + xxxy + yyxy - yxyy + xyyy - zzyx +
+                zzxy + zyzx - zxzy - yzzx + xzzy - zyxz + zxyz + yzxz - xzyz -
+                yxzz + xyzz;
+        nflops_ += 19;
+      }
+    } break;
+    default:
+      throw std::runtime_error(
+          "CR_11_σpσpCoulombσpσp_11: invalid quaternionic index");
+  }
+
+}  // CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11
+};  // namespace libint2
+
+#endif  // LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
diff --git a/src/bin/libint/master_ints_list.h b/src/bin/libint/master_ints_list.h
index c5a6f4655..37bfa29a7 100644
--- a/src/bin/libint/master_ints_list.h
+++ b/src/bin/libint/master_ints_list.h
@@ -109,6 +109,10 @@ typedef GenIntegralSet_11_11<CGF, TwoPRep, mType> TwoPRep_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, CoulombσpσpOper, mType>
     Coulombσpσp_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, CoulombσpσpOper, mType> Coulombσpσp_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, σpσpCoulombσpσpOper, mType>
+    σpσpCoulombσpσp_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, σpσpCoulombσpσpOper, mType>
+    σpσpCoulombσpσp_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kG12, mType> R12kG12_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, R12kG12, mType> R12kG12_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kR12lG12, EmptySet>
@@ -148,10 +152,10 @@ typedef boost::mpl::list<
     SMultipole_1_1_int,
 #endif
     TwoPRep_11_11_sq, TwoPRep_11_11_int, Coulombσpσp_11_11_sq,
-    Coulombσpσp_11_11_int, R12kG12_11_11_sq, R12kG12_11_11_int,
-    R12kR12lG12_11_11_sq, R12kR12lG12_11_11_int, TiG12_11_11_sq,
-    TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
-    DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
+    Coulombσpσp_11_11_int, σpσpCoulombσpσp_11_11_sq, σpσpCoulombσpσp_11_11_int,
+    R12kG12_11_11_sq, R12kG12_11_11_int, R12kR12lG12_11_11_sq,
+    R12kR12lG12_11_11_int, TiG12_11_11_sq, TiG12_11_11_int, G12TiG12_11_11_sq,
+    G12TiG12_11_11_int, DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
     DummySymmIntegral_11_11_sq, DummySymmIntegral_11_11_int>
     MasterIntegralTypeList;
 
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index 32e56878f..d55cfa301 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -26,6 +26,7 @@
 #include <comp_11_g12tig12_11.h>
 #include <comp_11_r12kr12lg12_11.h>
 #include <comp_11_tig12_11.h>
+#include <comp_11_σpσpCoulombσpσp_11.h>
 #include <comp_1_σpVσp_1.h>
 #include <comp_deriv_gauss.h>
 #include <comp_xyz.h>
@@ -269,6 +270,9 @@ typedef CR_DerivGauss<TwoPRep_11_11_int, 1, InKet, trinvskip2_part,
 
 typedef CR_11_Coulombσpσp_11<CGShell> CR_11_Coulombσpσp_11_sh;
 typedef CR_11_Coulombσpσp_11<CGF> CR_11_Coulombσpσp_11_int;
+
+typedef CR_11_σpσpCoulombσpσp_11<CGShell> CR_11_σpσpCoulombσpσp_11_sh;
+typedef CR_11_σpσpCoulombσpσp_11<CGF> CR_11_σpσpCoulombσpσp_11_int;
 };  // namespace libint2
 
 #endif  // header guard
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index 36292613a..cecbda72b 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -437,6 +437,42 @@ struct Coulombσpσp_Descr : public Contractable<Coulombσpσp_Descr> {
 };
 typedef GenOper<Coulombσpσp_Descr> CoulombσpσpOper;
 
+/// Descriptor of the σpσpCoulombσpσp operator, i.e. the "(SS|SS)" RKB ERIs;
+/// each instance selects one quaternion component: 0 (labelled "0"),
+/// 1 ("X"), 2 ("Y"), or 3 ("Z").
+struct σpσpCoulombσpσp_Descr : public Contractable<σpσpCoulombσpσp_Descr> {
+  typedef MultiplicativeSymm2Body_Props Properties;
+
+  /// default-constructed descriptor selects component 0
+  σpσpCoulombσpσp_Descr() : quaternion_index_(0) {}
+  /// @param quaternion_index component to select, must be in [0,3]
+  σpσpCoulombσpσp_Descr(int quaternion_index)
+      : quaternion_index_(quaternion_index) {
+    // negative values must be rejected too: key() converts to unsigned
+    assert(quaternion_index >= 0 && quaternion_index <= 3);
+  }
+
+  static const unsigned int max_key = 4;
+  unsigned int key() const { return quaternion_index(); }
+  /// @return "opop_coulomb_opop[c]" with c = 0, X, Y, or Z (aborts otherwise)
+  std::string description() const {
+    static const char* const component[] = {"0", "X", "Y", "Z"};
+    if (quaternion_index() < 0 || quaternion_index() > 3) abort();
+    return std::string("opop_coulomb_opop[") + component[quaternion_index()] +
+           "]";
+  }
+  std::string label() const { return description(); }
+  /// not implemented: aborts if called
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int quaternion_index() const { return quaternion_index_; }
+
+ private:
+  const int quaternion_index_ = -1;
+};
+typedef GenOper<σpσpCoulombσpσp_Descr> σpσpCoulombσpσpOper;
+
 /** GTG_1d is the two-body 1-dimensional Gaussian geminal
  */
 struct GTG_1d_Descr : public Contractable<GTG_1d_Descr> {
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index bcab9dcad..58fb8d2bd 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -123,6 +123,14 @@ template <>
 struct MasterStrategy<Coulombσpσp_11_11_int> {
   typedef boost::mpl::list<CR_11_Coulombσpσp_11_int> value;
 };
+template <>
+struct MasterStrategy<σpσpCoulombσpσp_11_11_sq> {
+  typedef boost::mpl::list<CR_11_σpσpCoulombσpσp_11_sh> value;
+};
+template <>
+struct MasterStrategy<σpσpCoulombσpσp_11_11_int> {
+  typedef boost::mpl::list<CR_11_σpσpCoulombσpσp_11_int> value;
+};
 
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>

From 60e2117db25cbdc062a930c1dcb9e83837dfb21f Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sat, 28 Feb 2026 21:38:37 -0500
Subject: [PATCH 12/22] Can evaluate `(SS|SS)` integrals. Only for `STANDARD`
 shell quartet for now.

---
 export/tests/unit/test-2body.cc               | 346 ++++++++++++------
 include/libint2/engine.h                      |  30 +-
 include/libint2/engine.impl.h                 |  75 ++--
 src/bin/libint/build_libint.cc                |  80 ++--
 ...3p\317\203pCoulomb\317\203p\317\203p_11.h" | 132 ++-----
 5 files changed, 383 insertions(+), 280 deletions(-)

diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index ee08ed972..bf3b6a60d 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -345,20 +345,83 @@ TEST_CASE("eri geometric derivatives", "[engine][2-body]") {
 }
 
 TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
-  std::vector<Shell> obs{
-      // pseudorandom s
-      Shell{{1.0, 0.3}, {{0, false, {0.9, 0.3}}}, {{0.0, 0.0, 0.0}}},
-      // pseudorandom p
-      Shell{{2.0, 0.4}, {{1, false, {0.8, -0.2}}}, {{1.0, 1.0, 1.0}}}};
+  std::vector<Shell> obs{// pseudorandom s
+                         Shell{{1.0}, {{0, false, {1.0}}}, {{0.0, 0.0, 0.0}}},
+                         // pseudorandom p
+                         Shell{{2.0}, {{1, false, {1.0}}}, {{1.0, 1.0, 1.0}}}};
 
   const auto max_nprim = libint2::max_nprim(obs);
   const auto max_l = libint2::max_l(obs);
   typedef std::array<unsigned int, 12> der_idx;
 
-  SECTION("Coulombσpσp") {
-    Engine engine;
+  // e.g. d_xx maps the derivative index of derivative w.r.t x
+  // coord of ket1 and x coord of ket2 in Chemist notation.
+  // deriv indices for (LL|SS)
+  der_idx d_xx = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0};
+  der_idx d_yy = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
+  der_idx d_zz = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1};
+  der_idx d_yz = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1};
+  der_idx d_zy = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0};
+  der_idx d_zx = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
+  der_idx d_xz = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
+  der_idx d_xy = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0};
+  der_idx d_yx = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0};
+
+  // deriv indices for (SS|SS)
+  // 0th component
+  der_idx xxxx = {1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
+  der_idx yyxx = {0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0};
+  der_idx zzxx = {0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0};
+  der_idx yxyx = {0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
+  der_idx xyyx = {1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0};
+  der_idx yxxy = {0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
+  der_idx xyxy = {1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0};
+  der_idx xxyy = {1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
+  der_idx yyyy = {0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0};
+  der_idx zzyy = {0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0};
+  der_idx xxzz = {1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1};
+  der_idx yyzz = {0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
+  der_idx zzzz = {0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1};
+
+  // x-component
+  der_idx zxzx = {0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0};
+  der_idx xzzx = {1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
+  der_idx zyzy = {0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0};
+  der_idx yzzy = {0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0};
+  der_idx zxxz = {0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1};
+  der_idx xzxz = {1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1};
+  der_idx zyyz = {0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1};
+  der_idx yzyz = {0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1};
+
+  // y-component
+  der_idx zyzx = {0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0};
+  der_idx yzzx = {0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0};
+  der_idx zxzy = {0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0};
+  der_idx xzzy = {1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0};
+  der_idx zyxz = {0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1};
+  der_idx yzxz = {0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1};
+  der_idx zxyz = {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1};
+  der_idx xzyz = {1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1};
+
+  // z-component
+  der_idx yxxx = {0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0};
+  der_idx xyxx = {1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0};
+  der_idx xxyx = {1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0};
+  der_idx yyyx = {0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0};
+  der_idx zzyx = {0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0};
+  der_idx xxxy = {1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0};
+  der_idx yyxy = {0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0};
+  der_idx zzxy = {0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0};
+  der_idx yxyy = {0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0};
+  der_idx xyyy = {1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0};
+  der_idx yxzz = {0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1};
+  der_idx xyzz = {1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1};
+
+  SECTION("Coulombσpσp and σpσpCoulombσpσp") {
+    Engine engine_llss, engine_ssss;
     try {
-      engine = Engine(Operator::coulomb_opop, max_nprim, max_l, 0);
+      engine_llss = Engine(Operator::coulomb_opop, max_nprim, max_l, 0);
+      engine_ssss = Engine(Operator::opop_coulomb_opop, max_nprim, max_l, 0);
       // TODO: need another unit test for derivatives of RKB ERIs
     } catch (
         Engine::lmax_exceeded &) {  // skip the test if lmax exceeded or libint2
@@ -371,119 +434,172 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
       for (int s1 = 0; s1 != nshell; ++s1) {
         for (int s2 = 0; s2 != nshell; ++s2) {
           for (int s3 = 0; s3 != nshell; ++s3) {
-            const auto &results =
-                engine.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
-            assert(results.size() ==
-                   4);  // we get 4 buffers for each quaternion component
-
-            LIBINT2_REF_REALTYPE Aref[3];
-            for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
-            LIBINT2_REF_REALTYPE Bref[3];
-            for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
-            LIBINT2_REF_REALTYPE Cref[3];
-            for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
-            LIBINT2_REF_REALTYPE Dref[3];
-            for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
-
-            int ijkl = 0;
-
-            int l0, m0, n0;
-            FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
-
-            int l1, m1, n1;
-            FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
-
-            int l2, m2, n2;
-            FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
-
-            int l3, m3, n3;
-            FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
-
-            std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0, 0.0,
-                                                                 0.0};
-            uint p0123 = 0;
-            for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
-              for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
-                for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
-                  for (uint p3 = 0; p3 < obs[s3].nprim(); p3++, p0123++) {
-                    const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
-                    const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
-                    const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
-                    const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
-
-                    const LIBINT2_REF_REALTYPE c0 = obs[s0].contr[0].coeff[p0];
-                    const LIBINT2_REF_REALTYPE c1 = obs[s1].contr[0].coeff[p1];
-                    const LIBINT2_REF_REALTYPE c2 = obs[s2].contr[0].coeff[p2];
-                    const LIBINT2_REF_REALTYPE c3 = obs[s3].contr[0].coeff[p3];
-                    const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
-
-                    auto eri_drr = [&](der_idx d_rr) {
-                      return eri(d_rr.data(), l0, m0, n0, alpha0, Aref, l1, m1,
-                                 n1, alpha1, Bref, l2, m2, n2, alpha2, Cref, l3,
-                                 m3, n3, alpha3, Dref, 0);
-                    };
-
-                    // e.g. d_xx maps the derivative index of derivative w.r.t x
-                    // coord of ket1 and x coord of ket2 in Chemist notation.
-                    der_idx d_xx = {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0};
-                    der_idx d_yy = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0};
-                    der_idx d_zz = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1};
-                    ref_coulomb_opop[0] +=
-                        c0123 * (eri_drr(d_xx) + eri_drr(d_yy) + eri_drr(d_zz));
-
-                    der_idx d_yz = {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1};
-                    der_idx d_zy = {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0};
-                    ref_coulomb_opop[1] +=
-                        c0123 * (eri_drr(d_yz) - eri_drr(d_zy));
-
-                    der_idx d_zx = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0};
-                    der_idx d_xz = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1};
-                    ref_coulomb_opop[2] +=
-                        c0123 * (eri_drr(d_zx) - eri_drr(d_xz));
-
-                    der_idx d_xy = {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0};
-                    der_idx d_yx = {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0};
-                    ref_coulomb_opop[3] +=
-                        c0123 * (eri_drr(d_xy) - eri_drr(d_yx));
+            if (s0 == 0 && s1 == 1 && s2 == 0 && s3 == 1) {
+              const auto &results_llss =
+                  engine_llss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+              const auto &results_ssss =
+                  engine_ssss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+              assert(results_llss.size() ==
+                     4);  // we get 4 buffers for each quaternion component
+
+              LIBINT2_REF_REALTYPE Aref[3];
+              for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
+              LIBINT2_REF_REALTYPE Bref[3];
+              for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
+              LIBINT2_REF_REALTYPE Cref[3];
+              for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
+              LIBINT2_REF_REALTYPE Dref[3];
+              for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
+
+              int ijkl = 0;
+
+              int l0, m0, n0;
+              FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
+
+              int l1, m1, n1;
+              FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
+
+              int l2, m2, n2;
+              FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
+
+              int l3, m3, n3;
+              FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
+
+              std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0,
+                                                                   0.0, 0.0};
+              std::array<LIBINT2_REF_REALTYPE, 4> ref_opop_coulomb_opop{
+                  0.0, 0.0, 0.0, 0.0};
+              uint p0123 = 0;
+              for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
+                for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
+                  for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
+                    for (uint p3 = 0; p3 < obs[s3].nprim(); p3++, p0123++) {
+                      const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
+                      const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
+                      const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
+                      const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
+
+                      const LIBINT2_REF_REALTYPE c0 =
+                          obs[s0].contr[0].coeff[p0];
+                      const LIBINT2_REF_REALTYPE c1 =
+                          obs[s1].contr[0].coeff[p1];
+                      const LIBINT2_REF_REALTYPE c2 =
+                          obs[s2].contr[0].coeff[p2];
+                      const LIBINT2_REF_REALTYPE c3 =
+                          obs[s3].contr[0].coeff[p3];
+                      const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
+
+                      auto eri_drrrr = [&](der_idx d_rrrr) {
+                        return eri(d_rrrr.data(), l0, m0, n0, alpha0, Aref, l1,
+                                   m1, n1, alpha1, Bref, l2, m2, n2, alpha2,
+                                   Cref, l3, m3, n3, alpha3, Dref, 0);
+                      };
+
+                      // (LL|SS)
+                      ref_coulomb_opop[0] +=
+                          c0123 *
+                          (eri_drrrr(d_xx) + eri_drrrr(d_yy) + eri_drrrr(d_zz));
+                      ref_coulomb_opop[1] +=
+                          c0123 * (eri_drrrr(d_yz) - eri_drrrr(d_zy));
+                      ref_coulomb_opop[2] +=
+                          c0123 * (eri_drrrr(d_zx) - eri_drrrr(d_xz));
+                      ref_coulomb_opop[3] +=
+                          c0123 * (eri_drrrr(d_xy) - eri_drrrr(d_yx));
+
+                      // (SS|SS)
+                      ref_opop_coulomb_opop[0] +=
+                          c0123 *
+                          (eri_drrrr(xxxx) + eri_drrrr(yyxx) + eri_drrrr(zzxx) -
+                           eri_drrrr(yxyx) + eri_drrrr(xyyx) + eri_drrrr(yxxy) -
+                           eri_drrrr(xyxy) + eri_drrrr(xxyy) + eri_drrrr(yyyy) +
+                           eri_drrrr(zzyy) + eri_drrrr(xxzz) + eri_drrrr(yyzz) +
+                           eri_drrrr(zzzz));
+                      ref_opop_coulomb_opop[1] +=
+                          c0123 *
+                          (eri_drrrr(zxzx) - eri_drrrr(xzzx) - eri_drrrr(zyzy) +
+                           eri_drrrr(yzzy) - eri_drrrr(zxxz) + eri_drrrr(xzxz) +
+                           eri_drrrr(zyyz) - eri_drrrr(yzyz));
+                      ref_opop_coulomb_opop[2] +=
+                          c0123 *
+                          (-eri_drrrr(zyzx) + eri_drrrr(yzzx) -
+                           eri_drrrr(zxzy) + eri_drrrr(xzzy) + eri_drrrr(zyxz) -
+                           eri_drrrr(yzxz) + eri_drrrr(zxyz) - eri_drrrr(xzyz));
+                      ref_opop_coulomb_opop[3] +=
+                          c0123 *
+                          (-eri_drrrr(yxxx) + eri_drrrr(xyxx) -
+                           eri_drrrr(xxyx) - eri_drrrr(yyyx) - eri_drrrr(zzyx) +
+                           eri_drrrr(xxxy) + eri_drrrr(yyxy) + eri_drrrr(zzxy) -
+                           eri_drrrr(yxyy) + eri_drrrr(xyyy) - eri_drrrr(yxzz) +
+                           eri_drrrr(xyzz));
+                    }
                   }
                 }
               }
-            }
 
-            const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
-            const double RELATIVE_DEVIATION_THRESHOLD =
-                1.0E-9;  // For more detail on choice of these thresholds, see
-                         // the comments in the TEST_CASE "eri geometric
-                         // derivatives"
-
-            std::array<LIBINT2_REF_REALTYPE, 4> abs_errs;
-            std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs;
-
-            for (auto comp = 0; comp < 4; ++comp) {
-              abs_errs[comp] =
-                  abs(ref_coulomb_opop[comp] - results[comp][ijkl]);
-              rel_abs_errs[comp] = abs(abs_errs[comp] / ref_coulomb_opop[comp]);
-              bool not_ok = rel_abs_errs[comp] > RELATIVE_DEVIATION_THRESHOLD &&
-                            abs_errs[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
-              // no 3^n prefactor here since the intrinsic deriv order is 2
-              if (not_ok) {
-                std::cout << "(l0 l1| l2 l3) = "
-                          << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
-                          << ") "
-                          << "Elem " << ijkl << " comp= " << comp
-                          << " : ref = " << ref_coulomb_opop[comp]
-                          << " libint = " << results[comp][ijkl]
-                          << " relabs_error = " << rel_abs_errs[comp]
-                          << " abs_error = " << abs_errs[comp] << std::endl;
+              const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
+              const double RELATIVE_DEVIATION_THRESHOLD =
+                  1.0E-9;  // For more detail on choice of these thresholds, see
+                           // the comments in the TEST_CASE "eri geometric
+                           // derivatives"
+
+              std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_llss;
+              std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_llss;
+
+              std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_ssss;
+              std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_ssss;
+
+              for (auto comp = 0; comp < 4; ++comp) {
+                abs_errs_llss[comp] =
+                    abs(ref_coulomb_opop[comp] - results_llss[comp][ijkl]);
+                rel_abs_errs_llss[comp] =
+                    abs(abs_errs_llss[comp] / ref_coulomb_opop[comp]);
+
+                abs_errs_ssss[comp] =
+                    abs(ref_opop_coulomb_opop[comp] - results_ssss[comp][ijkl]);
+                rel_abs_errs_ssss[comp] =
+                    abs(abs_errs_ssss[comp] / ref_opop_coulomb_opop[comp]);
+
+                bool llss_not_ok =
+                    rel_abs_errs_llss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
+                    abs_errs_llss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
+
+                bool ssss_not_ok =
+                    rel_abs_errs_ssss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
+                    abs_errs_ssss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
+
+                // no 3^n prefactor: intrinsic deriv order is 2 (LLSS), 4 (SSSS)
+                if (llss_not_ok) {
+                  std::cout << "(l0 l1| l2 l3) = "
+                            << "(" << s0 << " " << s1 << " | " << s2 << " "
+                            << s3 << ") "
+                            << "Elem " << ijkl << " comp= " << comp
+                            << " : ref = " << ref_coulomb_opop[comp]
+                            << " libint = " << results_llss[comp][ijkl]
+                            << " relabs_error = " << rel_abs_errs_llss[comp]
+                            << " abs_error = " << abs_errs_llss[comp]
+                            << std::endl;
+                }
+                if (ssss_not_ok) {
+                  std::cout << "(l0 l1| l2 l3) = "
+                            << "(" << s0 << " " << s1 << " | " << s2 << " "
+                            << s3 << ") "
+                            << "Elem " << ijkl << " comp= " << comp
+                            << " : ref = " << ref_opop_coulomb_opop[comp]
+                            << " libint = " << results_ssss[comp][ijkl]
+                            << " relabs_error = " << rel_abs_errs_ssss[comp]
+                            << " abs_error = " << abs_errs_ssss[comp]
+                            << std::endl;
+                }
+                REQUIRE(!llss_not_ok);
+                REQUIRE(!ssss_not_ok);
               }
-              REQUIRE(!not_ok);
-            }
 
-            ++ijkl;
-            END_FOR_CART
-            END_FOR_CART
-            END_FOR_CART
-            END_FOR_CART
+              ++ijkl;
+              END_FOR_CART
+              END_FOR_CART
+              END_FOR_CART
+              END_FOR_CART
+            }
           }
         }
       }
diff --git a/include/libint2/engine.h b/include/libint2/engine.h
index 81048b049..7253ce417 100644
--- a/include/libint2/engine.h
+++ b/include/libint2/engine.h
@@ -156,6 +156,10 @@ enum class Operator {
   /// (2-body) \f$ r_{12}^{-1} (σ.p_{k1})(σ.p_{k2})\f$ where k1  & k2 are
   /// centers of ket1 and ket2, respectively
   coulomb_opop,
+  /// (2-body) \f$ (σ.p_{b1})(σ.p_{b2}) r_{12}^{-1} (σ.p_{k1})(σ.p_{k2})\f$
+  /// where b1 & b2 are centers of bra1 and bra2 and k1  & k2 are centers of
+  /// ket1 and ket2, respectively
+  opop_coulomb_opop,
   /// contracted Gaussian geminal
   cgtg,
   /// contracted Gaussian geminal times Coulomb
@@ -357,6 +361,12 @@ struct operator_traits<Operator::coulomb_opop>
   static constexpr auto nopers = 4;
   static constexpr auto intrinsic_deriv_order = 2;
 };
+template <>
+struct operator_traits<Operator::opop_coulomb_opop>
+    : public operator_traits<Operator::coulomb> {
+  static constexpr auto nopers = 4;  // quaternion components
+  static constexpr auto intrinsic_deriv_order = 4;  // four σ.p factors
+};
 
 namespace detail {
 template <int K>
@@ -851,16 +861,16 @@ class Engine {
       const Shell& ket2, const ShellPair* spbra, const ShellPair* spket);
 
   // clang-format off
-  /** this specifies target precision for computing the integrals, i.e.
-   *  the target absolute (i.e., not relative) error of the integrals.
-   *  It is used to screen out primitive integrals. For some screening
-   *  methods precision can be almost guaranteed (due to finite precision
-   *  of the precomputed interpolation tables used to evaluate the core integrals
-   *  it is not in general possible to guarantee precision rigorously).
-   *
-   *  @param[in] prec the target precision
-   *  @sa ScreeningMethod
-   */
+ /** this specifies target precision for computing the integrals, i.e.
+  *  the target absolute (i.e., not relative) error of the integrals.
+  *  It is used to screen out primitive integrals. For some screening
+  *  methods precision can be almost guaranteed (due to finite precision
+  *  of the precomputed interpolation tables used to evaluate the core integrals
+  *  it is not in general possible to guarantee precision rigorously).
+  *
+  *  @param[in] prec the target precision
+  *  @sa ScreeningMethod
+  */
   // clang-format on
   Engine& set_precision(scalar_type prec) {
     if (prec <= 0.) {
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 834fd9c6d..da5275c4c 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -70,31 +70,32 @@ typename std::remove_all_extents<T>::type* to_ptr1(T (&a)[N]) {
 /// These MUST appear in the same order as in Operator.
 /// You must also update BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX when you add
 /// one-body ints
-#define BOOST_PP_NBODY_OPERATOR_LIST               \
-  (overlap,                   /* overlap */        \
-   (kinetic,                  /* kinetic */        \
-    (elecpot,                 /* nuclear */        \
-     (elecpot,                /* erf_nuclear */    \
-      (elecpot,               /* erfc_nuclear */   \
-       (elecpot,              /* erfx_nuclear */   \
-        (1emultipole,         /* emultipole1 */    \
-         (2emultipole,        /* emultipole2 */    \
-          (3emultipole,       /* emultipole3 */    \
-           (sphemultipole,    /* sphemultipole */  \
-            (opVop,           /* opVop */          \
-             (eri,            /* delta */          \
-              (eri,           /* coulomb */        \
-               (coulomb_opop, /* coulomb_opop */   \
-                (eri,         /* cgtg */           \
-                 (eri,        /* cgtg_x_coulomb */ \
-                  (eri,       /* delcgtg2 */       \
-                   (eri,      /* r12 */            \
-                    (eri,     /* erf_coulomb */    \
-                     (eri,    /* erfc_coulomb */   \
-                      (eri,   /* erfx_coulomb */   \
-                       (eri,  /* stg */            \
-                        (eri, /* yukawa */         \
-                         BOOST_PP_NIL)))))))))))))))))))))))
+#define BOOST_PP_NBODY_OPERATOR_LIST                        \
+  (overlap,                         /* overlap */           \
+   (kinetic,                        /* kinetic */           \
+    (elecpot,                       /* nuclear */           \
+     (elecpot,                      /* erf_nuclear */       \
+      (elecpot,                     /* erfc_nuclear */      \
+       (elecpot,                    /* erfx_nuclear */      \
+        (1emultipole,               /* emultipole1 */       \
+         (2emultipole,              /* emultipole2 */       \
+          (3emultipole,             /* emultipole3 */       \
+           (sphemultipole,          /* sphemultipole */     \
+            (opVop,                 /* opVop */             \
+             (eri,                  /* delta */             \
+              (eri,                 /* coulomb */           \
+               (coulomb_opop,       /* coulomb_opop */      \
+                (opop_coulomb_opop, /* opop_coulomb_opop */ \
+                 (eri,              /* cgtg */              \
+                  (eri,             /* cgtg_x_coulomb */    \
+                   (eri,            /* delcgtg2 */          \
+                    (eri,           /* r12 */               \
+                     (eri,          /* erf_coulomb */       \
+                      (eri,         /* erfc_coulomb */      \
+                       (eri,        /* erfx_coulomb */      \
+                        (eri,       /* stg */               \
+                         (eri,      /* yukawa */            \
+                          BOOST_PP_NIL))))))))))))))))))))))))
 
 #define BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE \
   BOOST_PP_MAKE_TUPLE(BOOST_PP_LIST_SIZE(BOOST_PP_NBODY_OPERATOR_LIST))
@@ -702,6 +703,7 @@ __libint2_engine_inline void Engine::initialize(size_t max_nprim) {
     // target indices.
     const auto permutable_targets =
         deriv_order_ > 0 &&
+
         (braket_ == BraKet::xx_xx || braket_ == BraKet::xs_xx ||
          braket_ == BraKet::xx_xs);
     if (permutable_targets)
@@ -1213,13 +1215,21 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
 
 #if LIBINT2_SHELLQUARTET_SET == \
     LIBINT2_SHELLQUARTET_SET_STANDARD  // standard angular momentum ordering
-  const auto swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
-  const auto swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
   const auto swap_braket = ((braket_ == BraKet::xx_xx) &&
                             (tbra1.contr[0].l + tbra2.contr[0].l >
                              tket1.contr[0].l + tket2.contr[0].l) &&
                             (oper_ != Operator::coulomb_opop)) ||
                            braket_ == BraKet::xx_xs;
+  bool swap_tbra, swap_tket;
+  if (oper_ == Operator::opop_coulomb_opop) {
+    bool swap_p1p2 = swap_braket ? (tbra1.contr[0].l < tbra2.contr[0].l)
+                                 : (tket1.contr[0].l < tket2.contr[0].l);
+    swap_tbra = swap_tket = swap_p1p2;
+  } else {
+    swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
+    swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
+  }
+
   // N.B. cannot swap bra and ket for coulomb_opop since the ket is mutated by
   // this operator
 #else  // orca angular momentum ordering
@@ -1444,6 +1454,13 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                           .first();
                   core_eval_ptr->eval(gm_ptr, T, mmax);
                 } break;
+                case Operator::opop_coulomb_opop: {
+                  const auto& core_eval_ptr =
+                      any_cast<const detail::core_eval_pack_type<
+                          Operator::opop_coulomb_opop>&>(core_eval_pack_)
+                          .first();
+                  core_eval_ptr->eval(gm_ptr, T, mmax);
+                } break;
                 case Operator::cgtg_x_coulomb: {
                   const auto& core_eval_ptr =
                       any_cast<const detail::core_eval_pack_type<
@@ -2118,6 +2135,10 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 Map tgt_blk_mat(tgt_ptr + tgt_row_idx * ncol, nc1_tgt, nc2_tgt);
                 if (swap_tket) {
                   Shell::real_t oper_cart_component_phase = 1.0;
+                  if (oper_ == Operator::opop_coulomb_opop && s == 3)
+                    oper_cart_component_phase =
+                        -1.0;  // z quaternion components flip sign on
+                               //  swapping ket
                   if (oper_ == Operator::coulomb_opop && s > 0)
                     oper_cart_component_phase =
                         -1.0;  // x,y,z quaternion components flip sign on
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 8712580a5..278e2668a 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -75,19 +75,35 @@ struct ShellQuartetSetPredicate {
   // return true if this set of angular momenta is included
   static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true);
 };
+
+/**
+ * standard ordering for angular momenta la, lb, lc, ld
+ * @param p1p2_swappable whether operator allows swaps of particle 1 and 2
+ * functions (e.g., not allowed for Coulombσpσp but allowed
+ * for Coulomb (TwoPRep)).
+ * @param bra_ket_coswappable whether swaps within bra and within ket must be
+ * performed together; swapping within only the bra or only the ket is not
+ * allowed (e.g., for σpσpCoulombσpσp)
+ */
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_Standard> {
-  static bool value(int la, int lb, int lc, int ld,
-                    bool p1p2_swappable = true) {
-    return la >= lb && lc >= ld && (!p1p2_swappable || la + lb <= lc + ld);
+  static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true,
+                    bool bra_ket_coswappable = false) {
+    if (bra_ket_coswappable)
+      return (la + lb <= lc + ld) && lc >= ld;
+    else
+      return la >= lb && lc >= ld && (!p1p2_swappable || la + lb <= lc + ld);
   }
 };
 template <>
 struct ShellQuartetSetPredicate<ShellSetType_ORCA> {
-  static bool value(int la, int lb, int lc, int ld,
-                    bool p1p2_swappable = true) {
-    return la <= lb && lc <= ld &&
-           (!p1p2_swappable || (la < lc || (la == lc && lb <= ld)));
+  static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true,
+                    bool bra_ket_coswappable = false) {
+    if (bra_ket_coswappable)  // NOTE(review): no ket ordering, unlike Standard
+      return (la < lc || (la == lc && lb <= ld));
+    else
+      return la <= lb && lc <= ld &&
+             (!p1p2_swappable || (la < lc || (la == lc && lb <= ld)));
   }
 };
 template <ShellSetType ShSet>
@@ -1123,13 +1139,17 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
   bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value;
+  bool bra_ket_coswappable = std::is_same<OperType, σpσpCoulombσpσpOper>::value;
 
+  // Note: la, lb, lc, ld generate code for chemist notation (ab|O|cd), where O
+  // is a two-body operator.
   for (unsigned int la = 0; la <= lmax; la++) {
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
           if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
-                  LIBINT_SHELL_SET)>::value(la, lb, lc, ld, p1_p2_swappable))
+                  LIBINT_SHELL_SET)>::value(la, lb, lc, ld, p1_p2_swappable,
+                                            bra_ket_coswappable))
             continue;
 
           // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(la+lb >
@@ -1148,7 +1168,8 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           // loop over operator components
           /////////////////////////////////
           std::vector<OperDescrType> descrs(1);
-          if (std::is_same<OperType, CoulombσpσpOper>::value) {
+          if (std::is_same<OperType, CoulombσpσpOper>::value ||
+              std::is_same<OperType, σpσpCoulombσpσpOper>::value) {
             // reset descriptors array
             descrs.resize(0);
             // iterate over quaternion components
@@ -1157,7 +1178,8 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
             }
           }
 
-          // unroll only if max_am <= cparams->max_am_opt(task) using std::max;
+          // unroll only if max_am <= cparams->max_am_opt(task) using
+          // std::max;
           const unsigned int max_am = max(max(la, lb), max(lc, ld));
           const bool need_to_optimize = (max_am <= cparams->max_am_opt(task));
           const auto nopers = descrs.size();
@@ -1266,8 +1288,8 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           std::deque<std::string> decl_filenames;
           std::deque<std::string> def_filenames;
 
-          // this will generate code for these targets, and potentially generate
-          // code for its prerequisites
+          // this will generate code for these targets, and potentially
+          // generate code for its prerequisites
           GenerateCode(dg_xxxx, context, cparams, strat, tactic, memman,
                        decl_filenames, def_filenames, prefix, eval_label,
                        false);
@@ -1359,9 +1381,9 @@ void build_TwoPRep_1b_2k(std::ostream& os,
                 LIBINT_SHELL_SET)>::value(lbra, lc, ld))
           continue;
 
-        // I will use 4-center recurrence relations and integrals, and have one
-        // center carry an s function unfortunately, depending on the direction
-        // in which the build goes it must be A(0) or B(1)
+        // I will use 4-center recurrence relations and integrals, and have
+        // one center carry an s function unfortunately, depending on the
+        // direction in which the build goes it must be A(0) or B(1)
         const unsigned int dummy_center =
             (LIBINT_SHELL_SET == LIBINT_SHELL_SET_ORCA) ? 0 : 1;
 
@@ -1560,8 +1582,8 @@ void build_TwoPRep_1b_1k(std::ostream& os,
       const unsigned int dummy_center2 =
           (LIBINT_SHELL_SET == LIBINT_SHELL_SET_ORCA) ? 2 : 3;
 
-      // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(lbra > lket
-      // ? false : true));
+      // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(lbra >
+      // lket ? false : true));
       std::shared_ptr<Tactic> tactic(new FourCenter_OS_Tactic(
           dummy_center1 == 0 ? 0 : lbra, dummy_center1 == 1 ? 0 : lbra,
           dummy_center2 == 2 ? 0 : lket, dummy_center2 == 3 ? 0 : lket));
@@ -1674,8 +1696,8 @@ void build_TwoPRep_1b_1k(std::ostream& os,
       std::deque<std::string> decl_filenames;
       std::deque<std::string> def_filenames;
 
-      // this will generate code for this targets, and potentially generate code
-      // for its prerequisites
+      // this will generate code for this targets, and potentially generate
+      // code for its prerequisites
       GenerateCode(dg_xxx, context, cparams, strat, tactic, memman,
                    decl_filenames, def_filenames, prefix, label, false);
 
@@ -1684,7 +1706,8 @@ void build_TwoPRep_1b_1k(std::ostream& os,
           taskmgr.current().params();
       tparams->max_stack_size(max_am, memman->max_memory_used());
       tparams->max_ntarget(targets.size());
-      // os << " Max memory used = " << memman->max_memory_used() << std::endl;
+      // os << " Max memory used = " << memman->max_memory_used() <<
+      // std::endl;
 
       // set pointer to the top-level evaluator function
       ostringstream oss;
@@ -1875,8 +1898,8 @@ void build_R12kG12_2b_2k(std::ostream& os,
           std::deque<std::string> decl_filenames;
           std::deque<std::string> def_filenames;
 
-          // this will generate code for this targets, and potentially generate
-          // code for its prerequisites
+          // this will generate code for this targets, and potentially
+          // generate code for its prerequisites
           GenerateCode(dg_xxxx, context, cparams, strat, tactic, memman,
                        decl_filenames, def_filenames, prefix, label, false);
 
@@ -2230,11 +2253,11 @@ void build_G12DKH_2b_2k(std::ostream& os,
           oss << "#include <" << decl_filename << ">" << endl;
           iface->to_int_iface(oss.str());
 
-          // For the most expensive (i.e. presumably complete) graph extract all
-          // precomputed quantities -- these will be members of the evaluator
-          // structure also extract all RRs -- need to keep track of these to
-          // figure out which external symbols appearing in RR code belong to
-          // this task also
+          // For the most expensive (i.e. presumably complete) graph extract
+          // all precomputed quantities -- these will be members of the
+          // evaluator structure also extract all RRs -- need to keep track of
+          // these to figure out which external symbols appearing in RR code
+          // belong to this task also
           if (la == lmax && lb == lmax && lc == lmax && ld == lmax)
             extract_symbols(dg_xxxx);
 
@@ -2307,7 +2330,8 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
   // generated tasks declare all tasks in a range of valid tasks as defined or
   // not
   LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
-  // the range is defined by max # of centers, max deriv order, and operator set
+  // the range is defined by max # of centers, max deriv order, and operator
+  // set
   const size_t max_ncenter = 4;
   for (unsigned int ncenter = 0; ncenter <= max_ncenter; ++ncenter) {
     std::stringstream oss;
diff --git "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
index 1cacc7b3a..7bf0a4b9a 100644
--- "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
@@ -114,143 +114,75 @@ CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11(
   };
 
   // Component wise generation for quaternion :
-  // ( (σ.p) a (σ.p)b | 1/r12 | (σ.p) c (σ.p) d )
+  // ( (σ.p) a (σ.p) b | 1/r12 | (σ.p) c (σ.p) d )
   switch (oper->descr().quaternion_index()) {
     case 0: {
-      // zeroth component =
-      // x1 x2 x3 x4 + y1 y2 x3 x4 - y1 x2 y3 x4 + x1 y2 y3 x4 + y1 x2 x3 y4 -
-      // x1 y2 x3 y4 + x1 x2 y3 y4 + y1 y2 y3 y4 + z1 z2 x3 x4 + z1 z2 y3 y4 -
-      // z1 x2 z3 x4 - z1 y2 z3 y4 + x1 z2 z3 x4 + y1 z2 z3 y4 + z1 x2 x3 z4 +
-      // z1 y2 y3 z4 - x1 z2 x3 z4 - y1 z2 y3 z4 + x1 x2 z3 z4 + y1 y2 z3 z4 +
-      // z1 z2 z3 z4
       auto xxxx = mc(x, x, x, x);
       auto yyxx = mc(y, y, x, x);
+      auto zzxx = mc(z, z, x, x);
       auto yxyx = mc(y, x, y, x);
       auto xyyx = mc(x, y, y, x);
       auto yxxy = mc(y, x, x, y);
       auto xyxy = mc(x, y, x, y);
       auto xxyy = mc(x, x, y, y);
       auto yyyy = mc(y, y, y, y);
-      auto zzxx = mc(z, z, x, x);
       auto zzyy = mc(z, z, y, y);
-      auto zxzx = mc(z, x, z, x);
-      auto zyzy = mc(z, y, z, y);
-      auto xzzx = mc(x, z, z, x);
-      auto yzzy = mc(y, z, z, y);
-      auto zxxz = mc(z, x, x, z);
-      auto zyyz = mc(z, y, y, z);
-      auto xzxz = mc(x, z, x, z);
-      auto yzyz = mc(y, z, y, z);
       auto xxzz = mc(x, x, z, z);
       auto yyzz = mc(y, y, z, z);
       auto zzzz = mc(z, z, z, z);
       if (is_simple()) {
-        expr_ = xxxx + yyxx - yxyx + xyyx + yxxy - xyxy + xxyy + yyyy + zzxx +
-                zzyy - zxzx - zyzy + xzzx + yzzy + zxxz + zyyz - xzxz - yzyz +
-                xxzz + yyzz + zzzz;
-        nflops_ += 20;
+        expr_ = xxxx + yyxx + zzxx - yxyx + xyyx + yxxy - xyxy + xxyy + yyyy +
+                zzyy + xxzz + yyzz + zzzz;
+        nflops_ += 12;
       }
     } break;
     case 1: {
-      // x component =
-      // - z1 y2 x3 x4 + z1 x2 y3 x4 - z1 x2 x3 y4 - z1 y2 y3 y4 + y1 z2 x3 x4 -
-      // x1 z2 y3 x4 + x1 z2 x3 y4 + y1 z2 y3 y4 - y1 x2 z3 x4 + x1 y2 z3 x4 -
-      // x1 x2 z3 y4 - y1 y2 z3 y4 - z1 z2 z3 y4 + y1 x2 x3 z4 - x1 y2 x3 z4 +
-      // x1 x2 y3 z4 + y1 y2 y3 z4 + z1 z2 y3 z4 - z1 y2 z3 z4 + y1 z2 z3 z4
-      auto zyxx = mc(z, y, x, x);
-      auto zxyx = mc(z, x, y, x);
-      auto zxxy = mc(z, x, x, y);
-      auto zyyy = mc(z, y, y, y);
-      auto yzxx = mc(y, z, x, x);
-      auto xzyx = mc(x, z, y, x);
-      auto xzxy = mc(x, z, x, y);
-      auto yzyy = mc(y, z, y, y);
-      auto yxzx = mc(y, x, z, x);
-      auto xyzx = mc(x, y, z, x);
-      auto xxzy = mc(x, x, z, y);
-      auto yyzy = mc(y, y, z, y);
-      auto zzzy = mc(z, z, z, y);
-      auto yxxz = mc(y, x, x, z);
-      auto xyxz = mc(x, y, x, z);
-      auto xxyz = mc(x, x, y, z);
-      auto yyyz = mc(y, y, y, z);
-      auto zzyz = mc(z, z, y, z);
-      auto zyzz = mc(z, y, z, z);
-      auto yzzz = mc(y, z, z, z);
+      auto zxzx = mc(z, x, z, x);
+      auto xzzx = mc(x, z, z, x);
+      auto zyzy = mc(z, y, z, y);
+      auto yzzy = mc(y, z, z, y);
+      auto zxxz = mc(z, x, x, z);
+      auto xzxz = mc(x, z, x, z);
+      auto zyyz = mc(z, y, y, z);
+      auto yzyz = mc(y, z, y, z);
       if (is_simple()) {
-        // swapped order of first two terms compiler does not like negative sign
-        // in front of first term
-        expr_ = zxyx - zyxx - zxxy - zyyy + yzxx - xzyx + xzxy + yzyy - yxzx +
-                xyzx - xxzy - yyzy - zzzy + yxxz - xyxz + xxyz + yyyz + zzyz -
-                zyzz + yzzz;
-        nflops_ += 19;
+        expr_ = zxzx - xzzx - zyzy + yzzy - zxxz + xzxz + zyyz - yzyz;
+        nflops_ += 7;
       }
     } break;
     case 2: {
-      // y component =
-      // z1 x2 x3 x4 + z1 y2 y3 x4 - z1 y2 x3 y4 + z1 x2 y3 y4 - x1 z2 x3 x4 -
-      // y1 z2 y3 x4 + y1 z2 x3 y4 - x1 z2 y3 y4 + x1 x2 z3 x4 + y1 y2 z3 x4 -
-      // y1 x2 z3 y4 + x1 y2 z3 y4 + z1 z2 z3 x4 - x1 x2 x3 z4 - y1 y2 x3 z4 +
-      // y1 x2 y3 z4 - x1 y2 y3 z4 - z1 z2 x3 z4 + z1 x2 z3 z4 - x1 z2 z3 z4
-      auto zxxx = mc(z, x, x, x);
-      auto zyyx = mc(z, y, y, x);
-      auto zyxy = mc(z, y, x, y);
-      auto zxyy = mc(z, x, y, y);
-      auto xzxx = mc(x, z, x, x);
-      auto yzyx = mc(y, z, y, x);
-      auto yzxy = mc(y, z, x, y);
-      auto xzyy = mc(x, z, y, y);
-      auto xxzx = mc(x, x, z, x);
-      auto yyzx = mc(y, y, z, x);
-      auto yxzy = mc(y, x, z, y);
-      auto xyzy = mc(x, y, z, y);
-      auto zzzx = mc(z, z, z, x);
-      auto xxxz = mc(x, x, x, z);
-      auto yyxz = mc(y, y, x, z);
-      auto yxyz = mc(y, x, y, z);
-      auto xyyz = mc(x, y, y, z);
-      auto zzxz = mc(z, z, x, z);
-      auto zxzz = mc(z, x, z, z);
-      auto xzzz = mc(x, z, z, z);
-
+      auto zyzx = mc(z, y, z, x);
+      auto yzzx = mc(y, z, z, x);
+      auto zxzy = mc(z, x, z, y);
+      auto xzzy = mc(x, z, z, y);
+      auto zyxz = mc(z, y, x, z);
+      auto yzxz = mc(y, z, x, z);
+      auto zxyz = mc(z, x, y, z);
+      auto xzyz = mc(x, z, y, z);
       if (is_simple()) {
-        expr_ = zxxx + zyyx - zyxy + zxyy - xzxx - yzyx + yzxy - xzyy + xxzx +
-                yyzx - yxzy + xyzy + zzzx - xxxz - yyxz + yxyz - xyyz - zzxz +
-                zxzz - xzzz;
-        nflops_ += 19;
+        // swapped order of first two terms compiler does not like negative sign
+        // in front of first term
+        expr_ = yzzx - zyzx - zxzy + xzzy + zyxz - yzxz + zxyz - xzyz;
+        nflops_ += 7;
       }
     } break;
     case 3: {
-      // z component =
-      // - y1 x2 x3 x4 + x1 y2 x3 x4 - x1 x2 y3 x4 - y1 y2 y3 x4 + x1 x2 x3 y4 +
-      // y1 y2 x3 y4 - y1 x2 y3 y4 + x1 y2 y3 y4 - z1 z2 y3 x4 + z1 z2 x3 y4 +
-      // z1 y2 z3 x4 - z1 x2 z3 y4 - y1 z2 z3 x4 + x1 z2 z3 y4 - z1 y2 x3 z4 +
-      // z1 x2 y3 z4 + y1 z2 x3 z4 - x1 z2 y3 z4 - y1 x2 z3 z4 + x1 y2 z3 z4
       auto yxxx = mc(y, x, x, x);
       auto xyxx = mc(x, y, x, x);
       auto xxyx = mc(x, x, y, x);
       auto yyyx = mc(y, y, y, x);
+      auto zzyx = mc(z, z, y, x);
       auto xxxy = mc(x, x, x, y);
       auto yyxy = mc(y, y, x, y);
+      auto zzxy = mc(z, z, x, y);
       auto yxyy = mc(y, x, y, y);
       auto xyyy = mc(x, y, y, y);
-      auto zzyx = mc(z, z, y, x);
-      auto zzxy = mc(z, z, x, y);
-      auto zyzx = mc(z, y, z, x);
-      auto zxzy = mc(z, x, z, y);
-      auto yzzx = mc(y, z, z, x);
-      auto xzzy = mc(x, z, z, y);
-      auto zyxz = mc(z, y, x, z);
-      auto zxyz = mc(z, x, y, z);
-      auto yzxz = mc(y, z, x, z);
-      auto xzyz = mc(x, z, y, z);
       auto yxzz = mc(y, x, z, z);
       auto xyzz = mc(x, y, z, z);
       if (is_simple()) {
-        expr_ = xyxx - yxxx - xxyx - yyyx + xxxy + yyxy - yxyy + xyyy - zzyx +
-                zzxy + zyzx - zxzy - yzzx + xzzy - zyxz + zxyz + yzxz - xzyz -
-                yxzz + xyzz;
-        nflops_ += 19;
+        expr_ = xyxx - yxxx - xxyx - yyyx - zzyx + xxxy + yyxy + zzxy - yxyy +
+                xyyy - yxzz + xyzz;
+        nflops_ += 11;
       }
     } break;
     default:

From e29428ab8005654eb789860c1be9b666b946bba1 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Mon, 9 Mar 2026 17:29:27 -0400
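Subject: [PATCH 13/22] cleanup: Remove debugging statements from `RKB Coulomb
 integrals` test and more cleanup

Besides removing the single-quartet debugging filter from the test, this
patch makes two non-cosmetic changes in src/bin/libint/build_libint.cc:

- restore the `using std::max;` declaration that had been folded into the
  preceding "unroll only if ..." comment;
- stop prepending cparams->api_prefix() to the shell-quartet label and to
  the evaluator label.
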
---
 export/tests/unit/test-2body.cc | 313 ++++++++++++++++----------------
 src/bin/libint/build_libint.cc  |  17 +-
 2 files changed, 161 insertions(+), 169 deletions(-)

diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index bf3b6a60d..2f0383fc1 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -434,172 +434,165 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
       for (int s1 = 0; s1 != nshell; ++s1) {
         for (int s2 = 0; s2 != nshell; ++s2) {
           for (int s3 = 0; s3 != nshell; ++s3) {
-            if (s0 == 0 && s1 == 1 && s2 == 0 && s3 == 1) {
-              const auto &results_llss =
-                  engine_llss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
-              const auto &results_ssss =
-                  engine_ssss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
-              assert(results_llss.size() ==
-                     4);  // we get 4 buffers for each quaternion component
-
-              LIBINT2_REF_REALTYPE Aref[3];
-              for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
-              LIBINT2_REF_REALTYPE Bref[3];
-              for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
-              LIBINT2_REF_REALTYPE Cref[3];
-              for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
-              LIBINT2_REF_REALTYPE Dref[3];
-              for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
-
-              int ijkl = 0;
-
-              int l0, m0, n0;
-              FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
-
-              int l1, m1, n1;
-              FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
-
-              int l2, m2, n2;
-              FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
-
-              int l3, m3, n3;
-              FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
-
-              std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0,
-                                                                   0.0, 0.0};
-              std::array<LIBINT2_REF_REALTYPE, 4> ref_opop_coulomb_opop{
-                  0.0, 0.0, 0.0, 0.0};
-              uint p0123 = 0;
-              for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
-                for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
-                  for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
-                    for (uint p3 = 0; p3 < obs[s3].nprim(); p3++, p0123++) {
-                      const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
-                      const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
-                      const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
-                      const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
-
-                      const LIBINT2_REF_REALTYPE c0 =
-                          obs[s0].contr[0].coeff[p0];
-                      const LIBINT2_REF_REALTYPE c1 =
-                          obs[s1].contr[0].coeff[p1];
-                      const LIBINT2_REF_REALTYPE c2 =
-                          obs[s2].contr[0].coeff[p2];
-                      const LIBINT2_REF_REALTYPE c3 =
-                          obs[s3].contr[0].coeff[p3];
-                      const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
-
-                      auto eri_drrrr = [&](der_idx d_rrrr) {
-                        return eri(d_rrrr.data(), l0, m0, n0, alpha0, Aref, l1,
-                                   m1, n1, alpha1, Bref, l2, m2, n2, alpha2,
-                                   Cref, l3, m3, n3, alpha3, Dref, 0);
-                      };
-
-                      // (LL|SS)
-                      ref_coulomb_opop[0] +=
-                          c0123 *
-                          (eri_drrrr(d_xx) + eri_drrrr(d_yy) + eri_drrrr(d_zz));
-                      ref_coulomb_opop[1] +=
-                          c0123 * (eri_drrrr(d_yz) - eri_drrrr(d_zy));
-                      ref_coulomb_opop[2] +=
-                          c0123 * (eri_drrrr(d_zx) - eri_drrrr(d_xz));
-                      ref_coulomb_opop[3] +=
-                          c0123 * (eri_drrrr(d_xy) - eri_drrrr(d_yx));
-
-                      // (SS|SS)
-                      ref_opop_coulomb_opop[0] +=
-                          c0123 *
-                          (eri_drrrr(xxxx) + eri_drrrr(yyxx) + eri_drrrr(zzxx) -
-                           eri_drrrr(yxyx) + eri_drrrr(xyyx) + eri_drrrr(yxxy) -
-                           eri_drrrr(xyxy) + eri_drrrr(xxyy) + eri_drrrr(yyyy) +
-                           eri_drrrr(zzyy) + eri_drrrr(xxzz) + eri_drrrr(yyzz) +
-                           eri_drrrr(zzzz));
-                      ref_opop_coulomb_opop[1] +=
-                          c0123 *
-                          (eri_drrrr(zxzx) - eri_drrrr(xzzx) - eri_drrrr(zyzy) +
-                           eri_drrrr(yzzy) - eri_drrrr(zxxz) + eri_drrrr(xzxz) +
-                           eri_drrrr(zyyz) - eri_drrrr(yzyz));
-                      ref_opop_coulomb_opop[2] +=
-                          c0123 *
-                          (-eri_drrrr(zyzx) + eri_drrrr(yzzx) -
-                           eri_drrrr(zxzy) + eri_drrrr(xzzy) + eri_drrrr(zyxz) -
-                           eri_drrrr(yzxz) + eri_drrrr(zxyz) - eri_drrrr(xzyz));
-                      ref_opop_coulomb_opop[3] +=
-                          c0123 *
-                          (-eri_drrrr(yxxx) + eri_drrrr(xyxx) -
-                           eri_drrrr(xxyx) - eri_drrrr(yyyx) - eri_drrrr(zzyx) +
-                           eri_drrrr(xxxy) + eri_drrrr(yyxy) + eri_drrrr(zzxy) -
-                           eri_drrrr(yxyy) + eri_drrrr(xyyy) - eri_drrrr(yxzz) +
-                           eri_drrrr(xyzz));
-                    }
+            const auto &results_llss =
+                engine_llss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            const auto &results_ssss =
+                engine_ssss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            assert(results_llss.size() ==
+                   4);  // we get 4 buffers for each quaternion component
+
+            LIBINT2_REF_REALTYPE Aref[3];
+            for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
+            LIBINT2_REF_REALTYPE Bref[3];
+            for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
+            LIBINT2_REF_REALTYPE Cref[3];
+            for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
+            LIBINT2_REF_REALTYPE Dref[3];
+            for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
+
+            int ijkl = 0;
+
+            int l0, m0, n0;
+            FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
+
+            int l1, m1, n1;
+            FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
+
+            int l2, m2, n2;
+            FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
+
+            int l3, m3, n3;
+            FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
+
+            std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0, 0.0,
+                                                                 0.0};
+            std::array<LIBINT2_REF_REALTYPE, 4> ref_opop_coulomb_opop{0.0, 0.0,
+                                                                      0.0, 0.0};
+            uint p0123 = 0;
+            for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
+              for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
+                for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
+                  for (uint p3 = 0; p3 < obs[s3].nprim(); p3++, p0123++) {
+                    const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
+                    const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
+                    const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
+                    const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
+
+                    const LIBINT2_REF_REALTYPE c0 = obs[s0].contr[0].coeff[p0];
+                    const LIBINT2_REF_REALTYPE c1 = obs[s1].contr[0].coeff[p1];
+                    const LIBINT2_REF_REALTYPE c2 = obs[s2].contr[0].coeff[p2];
+                    const LIBINT2_REF_REALTYPE c3 = obs[s3].contr[0].coeff[p3];
+                    const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
+
+                    auto eri_drrrr = [&](der_idx d_rrrr) {
+                      return eri(d_rrrr.data(), l0, m0, n0, alpha0, Aref, l1,
+                                 m1, n1, alpha1, Bref, l2, m2, n2, alpha2, Cref,
+                                 l3, m3, n3, alpha3, Dref, 0);
+                    };
+
+                    // (LL|SS)
+                    ref_coulomb_opop[0] +=
+                        c0123 *
+                        (eri_drrrr(d_xx) + eri_drrrr(d_yy) + eri_drrrr(d_zz));
+                    ref_coulomb_opop[1] +=
+                        c0123 * (eri_drrrr(d_yz) - eri_drrrr(d_zy));
+                    ref_coulomb_opop[2] +=
+                        c0123 * (eri_drrrr(d_zx) - eri_drrrr(d_xz));
+                    ref_coulomb_opop[3] +=
+                        c0123 * (eri_drrrr(d_xy) - eri_drrrr(d_yx));
+
+                    // (SS|SS)
+                    ref_opop_coulomb_opop[0] +=
+                        c0123 *
+                        (eri_drrrr(xxxx) + eri_drrrr(yyxx) + eri_drrrr(zzxx) -
+                         eri_drrrr(yxyx) + eri_drrrr(xyyx) + eri_drrrr(yxxy) -
+                         eri_drrrr(xyxy) + eri_drrrr(xxyy) + eri_drrrr(yyyy) +
+                         eri_drrrr(zzyy) + eri_drrrr(xxzz) + eri_drrrr(yyzz) +
+                         eri_drrrr(zzzz));
+                    ref_opop_coulomb_opop[1] +=
+                        c0123 *
+                        (eri_drrrr(zxzx) - eri_drrrr(xzzx) - eri_drrrr(zyzy) +
+                         eri_drrrr(yzzy) - eri_drrrr(zxxz) + eri_drrrr(xzxz) +
+                         eri_drrrr(zyyz) - eri_drrrr(yzyz));
+                    ref_opop_coulomb_opop[2] +=
+                        c0123 *
+                        (-eri_drrrr(zyzx) + eri_drrrr(yzzx) - eri_drrrr(zxzy) +
+                         eri_drrrr(xzzy) + eri_drrrr(zyxz) - eri_drrrr(yzxz) +
+                         eri_drrrr(zxyz) - eri_drrrr(xzyz));
+                    ref_opop_coulomb_opop[3] +=
+                        c0123 *
+                        (-eri_drrrr(yxxx) + eri_drrrr(xyxx) - eri_drrrr(xxyx) -
+                         eri_drrrr(yyyx) - eri_drrrr(zzyx) + eri_drrrr(xxxy) +
+                         eri_drrrr(yyxy) + eri_drrrr(zzxy) - eri_drrrr(yxyy) +
+                         eri_drrrr(xyyy) - eri_drrrr(yxzz) + eri_drrrr(xyzz));
                   }
                 }
               }
+            }
 
-              const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
-              const double RELATIVE_DEVIATION_THRESHOLD =
-                  1.0E-9;  // For more detail on choice of these thresholds, see
-                           // the comments in the TEST_CASE "eri geometric
-                           // derivatives"
-
-              std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_llss;
-              std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_llss;
-
-              std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_ssss;
-              std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_ssss;
-
-              for (auto comp = 0; comp < 4; ++comp) {
-                abs_errs_llss[comp] =
-                    abs(ref_coulomb_opop[comp] - results_llss[comp][ijkl]);
-                rel_abs_errs_llss[comp] =
-                    abs(abs_errs_llss[comp] / ref_coulomb_opop[comp]);
-
-                abs_errs_ssss[comp] =
-                    abs(ref_opop_coulomb_opop[comp] - results_ssss[comp][ijkl]);
-                rel_abs_errs_ssss[comp] =
-                    abs(abs_errs_ssss[comp] / ref_opop_coulomb_opop[comp]);
-
-                bool llss_not_ok =
-                    rel_abs_errs_llss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
-                    abs_errs_llss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
-
-                bool ssss_not_ok =
-                    rel_abs_errs_ssss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
-                    abs_errs_ssss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
-
-                // no 3^n prefactor here since the intrinsic deriv order is 2
-                if (llss_not_ok) {
-                  std::cout << "(l0 l1| l2 l3) = "
-                            << "(" << s0 << " " << s1 << " | " << s2 << " "
-                            << s3 << ") "
-                            << "Elem " << ijkl << " comp= " << comp
-                            << " : ref = " << ref_coulomb_opop[comp]
-                            << " libint = " << results_llss[comp][ijkl]
-                            << " relabs_error = " << rel_abs_errs_llss[comp]
-                            << " abs_error = " << abs_errs_llss[comp]
-                            << std::endl;
-                }
-                if (ssss_not_ok) {
-                  std::cout << "(l0 l1| l2 l3) = "
-                            << "(" << s0 << " " << s1 << " | " << s2 << " "
-                            << s3 << ") "
-                            << "Elem " << ijkl << " comp= " << comp
-                            << " : ref = " << ref_opop_coulomb_opop[comp]
-                            << " libint = " << results_ssss[comp][ijkl]
-                            << " relabs_error = " << rel_abs_errs_ssss[comp]
-                            << " abs_error = " << abs_errs_ssss[comp]
-                            << std::endl;
-                }
-                REQUIRE(!llss_not_ok);
-                REQUIRE(!ssss_not_ok);
+            const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
+            const double RELATIVE_DEVIATION_THRESHOLD =
+                1.0E-9;  // For more detail on choice of these thresholds, see
+                         // the comments in the TEST_CASE "eri geometric
+                         // derivatives"
+
+            std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_llss;
+            std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_llss;
+
+            std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_ssss;
+            std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_ssss;
+
+            for (auto comp = 0; comp < 4; ++comp) {
+              abs_errs_llss[comp] =
+                  abs(ref_coulomb_opop[comp] - results_llss[comp][ijkl]);
+              rel_abs_errs_llss[comp] =
+                  abs(abs_errs_llss[comp] / ref_coulomb_opop[comp]);
+
+              abs_errs_ssss[comp] =
+                  abs(ref_opop_coulomb_opop[comp] - results_ssss[comp][ijkl]);
+              rel_abs_errs_ssss[comp] =
+                  abs(abs_errs_ssss[comp] / ref_opop_coulomb_opop[comp]);
+
+              bool llss_not_ok =
+                  rel_abs_errs_llss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
+                  abs_errs_llss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
+
+              bool ssss_not_ok =
+                  rel_abs_errs_ssss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
+                  abs_errs_ssss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
+
+              // no 3^n prefactor here since the intrinsic deriv order is 2
+              if (llss_not_ok) {
+                std::cout << "(l0 l1| l2 l3) = "
+                          << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
+                          << ") "
+                          << "Elem " << ijkl << " comp= " << comp
+                          << " : ref = " << ref_coulomb_opop[comp]
+                          << " libint = " << results_llss[comp][ijkl]
+                          << " relabs_error = " << rel_abs_errs_llss[comp]
+                          << " abs_error = " << abs_errs_llss[comp]
+                          << std::endl;
               }
-
-              ++ijkl;
-              END_FOR_CART
-              END_FOR_CART
-              END_FOR_CART
-              END_FOR_CART
+              if (ssss_not_ok) {
+                std::cout << "(l0 l1| l2 l3) = "
+                          << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
+                          << ") "
+                          << "Elem " << ijkl << " comp= " << comp
+                          << " : ref = " << ref_opop_coulomb_opop[comp]
+                          << " libint = " << results_ssss[comp][ijkl]
+                          << " relabs_error = " << rel_abs_errs_ssss[comp]
+                          << " abs_error = " << abs_errs_ssss[comp]
+                          << std::endl;
+              }
+              REQUIRE(!llss_not_ok);
+              REQUIRE(!ssss_not_ok);
             }
+
+            ++ijkl;
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
           }
         }
       }
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 4bf4bf383..c2a9de6cd 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -1178,8 +1178,8 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
             }
           }
 
-          // unroll only if max_am <= cparams->max_am_opt(task) using
-          // std::max;
+          // unroll only if max_am <= cparams->max_am_opt(task)
+          using std::max;
           const unsigned int max_am = max(max(la, lb), max(lc, ld));
           const bool need_to_optimize = (max_am <= cparams->max_am_opt(task));
           const auto nopers = descrs.size();
@@ -1262,7 +1262,6 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
               abcd_label = abcd->label();
             } else {
               std::ostringstream oss;
-              oss << cparams->api_prefix();
               oss << "_" << a.label() << "_" << b.label();
               oss << "_" << label;
               oss << "_" << c.label() << "_" << d.label();
@@ -1272,7 +1271,7 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           // + derivative level (if deriv_level > 0)
           std::string eval_label;
           {
-            eval_label = cparams->api_prefix();
+            eval_label = "";
             if (deriv_level != 0) {
               std::ostringstream oss;
               oss << "deriv" << deriv_level;
@@ -1380,9 +1379,9 @@ void build_TwoPRep_1b_2k(std::ostream& os,
                 LIBINT_SHELL_SET)>::value(lbra, lc, ld))
           continue;
 
-        // I will use 4-center recurrence relations and integrals, and have
-        // one center carry an s function unfortunately, depending on the
-        // direction in which the build goes it must be A(0) or B(1)
+        // I will use 4-center recurrence relations and integrals, and have one
+        // center carry an s function unfortunately, depending on the direction
+        // in which the build goes it must be A(0) or B(1)
         const unsigned int dummy_center =
             (LIBINT_SHELL_SET == LIBINT_SHELL_SET_ORCA) ? 0 : 1;
 
@@ -1581,8 +1580,8 @@ void build_TwoPRep_1b_1k(std::ostream& os,
       const unsigned int dummy_center2 =
           (LIBINT_SHELL_SET == LIBINT_SHELL_SET_ORCA) ? 2 : 3;
 
-      // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(lbra >
-      // lket ? false : true));
+      // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(lbra > lket
+      // ? false : true));
       std::shared_ptr<Tactic> tactic(new FourCenter_OS_Tactic(
           dummy_center1 == 0 ? 0 : lbra, dummy_center1 == 1 ? 0 : lbra,
           dummy_center2 == 2 ? 0 : lket, dummy_center2 == 3 ? 0 : lket));

From 921da582fe6c830bcd142f778d313ad5132a2c04 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sat, 21 Mar 2026 14:00:30 -0400
Subject: [PATCH 14/22] Optimize RKB integral code generation: braket symmetry
 + disable CSE + progress bar + sign fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ShellQuartetSetPredicate: add braket-swap tiebreaker for bra_ket_coswappable
  operators (σpσpCoulombσpσp). When la+lb == lc+ld, use max(la,lb) <= lc to
  pick one canonical representative, reducing duplicate quartet generation.

- Engine (engine.impl.h): update swap_braket logic for opop_coulomb_opop to
  match the new predicate tiebreaker. Add coupled-swap sign correction in the
  swap_braket branch (was missing — exposed by d-shell testing).

- build_libint.cc: disable CSE (do_cse/condense_expr) for multi-component
  operators since their 16 components share no intermediates at the expression
  level. This eliminates the superlinear optimize_rr_out bottleneck (e.g.,
  8.8s → 71ms for (ss|ds) prerequisite DAG).

- build_libint.cc: fix compilation when only LIBINT_INCLUDE_RKB_ERI is defined
  (without LIBINT_INCLUDE_ERI): extend #ifdef guards for build_TwoPRep_2b_2k,
  add forward declaration, move make_descr to detail namespace, use if constexpr
  for component descriptor construction.

- buildtest.h: add CodeGenProgress spinner showing elapsed time, function count,
  and current task name on stderr during code generation.

- int_am.cmake: fix typo in OPT_AM variable reference.
---
 cmake/modules/int_am.cmake                    |   2 +-
 export/tests/unit/test-2body.cc               | 126 ++++++---
 include/libint2/engine.h                      |   4 +-
 include/libint2/engine.impl.h                 | 117 ++++++--
 src/bin/libint/build_libint.cc                | 101 ++++---
 src/bin/libint/buildtest.h                    |  50 +++-
 ...3p\317\203pCoulomb\317\203p\317\203p_11.h" | 259 ++++++++++++++----
 src/bin/libint/oper.h                         |  31 ++-
 8 files changed, 522 insertions(+), 168 deletions(-)

diff --git a/cmake/modules/int_am.cmake b/cmake/modules/int_am.cmake
index cc86b7aa7..350924f49 100644
--- a/cmake/modules/int_am.cmake
+++ b/cmake/modules/int_am.cmake
@@ -262,7 +262,7 @@ macro(process_integrals_class class)
             if (LIBINT2_${class}_OPT_AM EQUAL -1)
                 set(LIBINT_${class}_OPT_AM "")
             else()
-                set($LIBINT_{class}_OPT_AM ${LIBINT2_${class}_OPT_AM})
+                set(LIBINT_${class}_OPT_AM ${LIBINT2_${class}_OPT_AM})
             endif()
         endif()
         if (LIBINT_OPT_AM_LIST)
diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index 2f0383fc1..708190b90 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -439,7 +439,7 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
             const auto &results_ssss =
                 engine_ssss.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
             assert(results_llss.size() ==
-                   4);  // we get 4 buffers for each quaternion component
+                   4);  // 4 buffers for single-spin quaternion components
 
             LIBINT2_REF_REALTYPE Aref[3];
             for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
@@ -466,8 +466,8 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
 
             std::array<LIBINT2_REF_REALTYPE, 4> ref_coulomb_opop{0.0, 0.0, 0.0,
                                                                  0.0};
-            std::array<LIBINT2_REF_REALTYPE, 4> ref_opop_coulomb_opop{0.0, 0.0,
-                                                                      0.0, 0.0};
+            std::array<LIBINT2_REF_REALTYPE, 16> ref_opop_coulomb_opop{};
+            ref_opop_coulomb_opop.fill(0.0);
             uint p0123 = 0;
             for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
               for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
@@ -490,6 +490,22 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
                                  l3, m3, n3, alpha3, Dref, 0);
                     };
 
+                    // helper: build der_idx from 4 derivative directions
+                    // (0=x, 1=y, 2=z) for centers A, B, C, D
+                    constexpr int X = 0, Y = 1, Z = 2;
+                    auto didx = [](int a, int b, int c, int d) -> der_idx {
+                      der_idx r = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+                      r[a] = 1;
+                      r[3 + b] = 1;
+                      r[6 + c] = 1;
+                      r[9 + d] = 1;
+                      return r;
+                    };
+                    // shorthand: evaluate derivative ERI from 4 directions
+                    auto D = [&](int a, int b, int c, int d) {
+                      return eri_drrrr(didx(a, b, c, d));
+                    };
+
                     // (LL|SS)
                     ref_coulomb_opop[0] +=
                         c0123 *
@@ -501,30 +517,60 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
                     ref_coulomb_opop[3] +=
                         c0123 * (eri_drrrr(d_xy) - eri_drrrr(d_yx));
 
-                    // (SS|SS)
+                    // (SS|SS) — 16 components, Option A: index = 4*bra + ket
+                    // 0:SS  1:SX  2:SY  3:SZ
                     ref_opop_coulomb_opop[0] +=
-                        c0123 *
-                        (eri_drrrr(xxxx) + eri_drrrr(yyxx) + eri_drrrr(zzxx) -
-                         eri_drrrr(yxyx) + eri_drrrr(xyyx) + eri_drrrr(yxxy) -
-                         eri_drrrr(xyxy) + eri_drrrr(xxyy) + eri_drrrr(yyyy) +
-                         eri_drrrr(zzyy) + eri_drrrr(xxzz) + eri_drrrr(yyzz) +
-                         eri_drrrr(zzzz));
+                        c0123 * (D(X, X, X, X) + D(X, X, Y, Y) + D(X, X, Z, Z) +
+                                 D(Y, Y, X, X) + D(Y, Y, Y, Y) + D(Y, Y, Z, Z) +
+                                 D(Z, Z, X, X) + D(Z, Z, Y, Y) + D(Z, Z, Z, Z));
                     ref_opop_coulomb_opop[1] +=
-                        c0123 *
-                        (eri_drrrr(zxzx) - eri_drrrr(xzzx) - eri_drrrr(zyzy) +
-                         eri_drrrr(yzzy) - eri_drrrr(zxxz) + eri_drrrr(xzxz) +
-                         eri_drrrr(zyyz) - eri_drrrr(yzyz));
+                        c0123 * (D(X, X, Y, Z) - D(X, X, Z, Y) + D(Y, Y, Y, Z) -
+                                 D(Y, Y, Z, Y) + D(Z, Z, Y, Z) - D(Z, Z, Z, Y));
                     ref_opop_coulomb_opop[2] +=
-                        c0123 *
-                        (-eri_drrrr(zyzx) + eri_drrrr(yzzx) - eri_drrrr(zxzy) +
-                         eri_drrrr(xzzy) + eri_drrrr(zyxz) - eri_drrrr(yzxz) +
-                         eri_drrrr(zxyz) - eri_drrrr(xzyz));
+                        c0123 * (D(X, X, Z, X) - D(X, X, X, Z) + D(Y, Y, Z, X) -
+                                 D(Y, Y, X, Z) + D(Z, Z, Z, X) - D(Z, Z, X, Z));
                     ref_opop_coulomb_opop[3] +=
-                        c0123 *
-                        (-eri_drrrr(yxxx) + eri_drrrr(xyxx) - eri_drrrr(xxyx) -
-                         eri_drrrr(yyyx) - eri_drrrr(zzyx) + eri_drrrr(xxxy) +
-                         eri_drrrr(yyxy) + eri_drrrr(zzxy) - eri_drrrr(yxyy) +
-                         eri_drrrr(xyyy) - eri_drrrr(yxzz) + eri_drrrr(xyzz));
+                        c0123 * (D(X, X, X, Y) - D(X, X, Y, X) + D(Y, Y, X, Y) -
+                                 D(Y, Y, Y, X) + D(Z, Z, X, Y) - D(Z, Z, Y, X));
+                    // 4:XS  5:XX  6:XY  7:XZ
+                    ref_opop_coulomb_opop[4] +=
+                        c0123 * (D(Y, Z, X, X) - D(Z, Y, X, X) + D(Y, Z, Y, Y) -
+                                 D(Z, Y, Y, Y) + D(Y, Z, Z, Z) - D(Z, Y, Z, Z));
+                    ref_opop_coulomb_opop[5] +=
+                        c0123 * (-D(Y, Z, Y, Z) + D(Y, Z, Z, Y) +
+                                 D(Z, Y, Y, Z) - D(Z, Y, Z, Y));
+                    ref_opop_coulomb_opop[6] +=
+                        c0123 * (-D(Y, Z, Z, X) + D(Y, Z, X, Z) +
+                                 D(Z, Y, Z, X) - D(Z, Y, X, Z));
+                    ref_opop_coulomb_opop[7] +=
+                        c0123 * (-D(Y, Z, X, Y) + D(Y, Z, Y, X) +
+                                 D(Z, Y, X, Y) - D(Z, Y, Y, X));
+                    // 8:YS  9:YX  10:YY  11:YZ
+                    ref_opop_coulomb_opop[8] +=
+                        c0123 * (D(Z, X, X, X) - D(X, Z, X, X) + D(Z, X, Y, Y) -
+                                 D(X, Z, Y, Y) + D(Z, X, Z, Z) - D(X, Z, Z, Z));
+                    ref_opop_coulomb_opop[9] +=
+                        c0123 * (-D(Z, X, Y, Z) + D(Z, X, Z, Y) +
+                                 D(X, Z, Y, Z) - D(X, Z, Z, Y));
+                    ref_opop_coulomb_opop[10] +=
+                        c0123 * (-D(Z, X, Z, X) + D(Z, X, X, Z) +
+                                 D(X, Z, Z, X) - D(X, Z, X, Z));
+                    ref_opop_coulomb_opop[11] +=
+                        c0123 * (-D(Z, X, X, Y) + D(Z, X, Y, X) +
+                                 D(X, Z, X, Y) - D(X, Z, Y, X));
+                    // 12:ZS  13:ZX  14:ZY  15:ZZ
+                    ref_opop_coulomb_opop[12] +=
+                        c0123 * (D(X, Y, X, X) - D(Y, X, X, X) + D(X, Y, Y, Y) -
+                                 D(Y, X, Y, Y) + D(X, Y, Z, Z) - D(Y, X, Z, Z));
+                    ref_opop_coulomb_opop[13] +=
+                        c0123 * (-D(X, Y, Y, Z) + D(X, Y, Z, Y) +
+                                 D(Y, X, Y, Z) - D(Y, X, Z, Y));
+                    ref_opop_coulomb_opop[14] +=
+                        c0123 * (-D(X, Y, Z, X) + D(X, Y, X, Z) +
+                                 D(Y, X, Z, X) - D(Y, X, X, Z));
+                    ref_opop_coulomb_opop[15] +=
+                        c0123 * (-D(X, Y, X, Y) + D(X, Y, Y, X) +
+                                 D(Y, X, X, Y) - D(Y, X, Y, X));
                   }
                 }
               }
@@ -539,29 +585,17 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
             std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_llss;
             std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_llss;
 
-            std::array<LIBINT2_REF_REALTYPE, 4> abs_errs_ssss;
-            std::array<LIBINT2_REF_REALTYPE, 4> rel_abs_errs_ssss;
-
+            // (LL|SS) has 4 components
             for (auto comp = 0; comp < 4; ++comp) {
               abs_errs_llss[comp] =
                   abs(ref_coulomb_opop[comp] - results_llss[comp][ijkl]);
               rel_abs_errs_llss[comp] =
                   abs(abs_errs_llss[comp] / ref_coulomb_opop[comp]);
 
-              abs_errs_ssss[comp] =
-                  abs(ref_opop_coulomb_opop[comp] - results_ssss[comp][ijkl]);
-              rel_abs_errs_ssss[comp] =
-                  abs(abs_errs_ssss[comp] / ref_opop_coulomb_opop[comp]);
-
               bool llss_not_ok =
                   rel_abs_errs_llss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
                   abs_errs_llss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
 
-              bool ssss_not_ok =
-                  rel_abs_errs_ssss[comp] > RELATIVE_DEVIATION_THRESHOLD &&
-                  abs_errs_ssss[comp] > ABSOLUTE_DEVIATION_THRESHOLD;
-
-              // no 3^n prefactor here since the intrinsic deriv order is 2
               if (llss_not_ok) {
                 std::cout << "(l0 l1| l2 l3) = "
                           << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
@@ -573,6 +607,20 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
                           << " abs_error = " << abs_errs_llss[comp]
                           << std::endl;
               }
+              REQUIRE(!llss_not_ok);
+            }
+
+            // (SS|SS) has 16 components (two independent spin spaces)
+            for (auto comp = 0; comp < 16; ++comp) {
+              auto abs_err_ssss =
+                  abs(ref_opop_coulomb_opop[comp] - results_ssss[comp][ijkl]);
+              auto rel_abs_err_ssss =
+                  abs(abs_err_ssss / ref_opop_coulomb_opop[comp]);
+
+              bool ssss_not_ok =
+                  rel_abs_err_ssss > RELATIVE_DEVIATION_THRESHOLD &&
+                  abs_err_ssss > ABSOLUTE_DEVIATION_THRESHOLD;
+
               if (ssss_not_ok) {
                 std::cout << "(l0 l1| l2 l3) = "
                           << "(" << s0 << " " << s1 << " | " << s2 << " " << s3
@@ -580,11 +628,9 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
                           << "Elem " << ijkl << " comp= " << comp
                           << " : ref = " << ref_opop_coulomb_opop[comp]
                           << " libint = " << results_ssss[comp][ijkl]
-                          << " relabs_error = " << rel_abs_errs_ssss[comp]
-                          << " abs_error = " << abs_errs_ssss[comp]
-                          << std::endl;
+                          << " relabs_error = " << rel_abs_err_ssss
+                          << " abs_error = " << abs_err_ssss << std::endl;
               }
-              REQUIRE(!llss_not_ok);
               REQUIRE(!ssss_not_ok);
             }
 
diff --git a/include/libint2/engine.h b/include/libint2/engine.h
index 94077da63..fae7d1785 100644
--- a/include/libint2/engine.h
+++ b/include/libint2/engine.h
@@ -364,7 +364,9 @@ struct operator_traits<Operator::coulomb_opop>
 template <>
 struct operator_traits<Operator::opop_coulomb_opop>
     : public operator_traits<Operator::coulomb> {
-  static constexpr auto nopers = 4;
+  /// 16 components: tensor product of two independent spin-space quaternions
+  /// index = 4 * bra_spin + ket_spin, where spin in {S=0, X=1, Y=2, Z=3}
+  static constexpr auto nopers = 16;
   static constexpr auto intrinsic_deriv_order = 4;
 };
 
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index a39244e7e..ff143d254 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -1216,17 +1216,39 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
 
 #if LIBINT2_SHELLQUARTET_SET == \
     LIBINT2_SHELLQUARTET_SET_STANDARD  // standard angular momentum ordering
-  const auto swap_braket = ((braket_ == BraKet::xx_xx) &&
-                            (tbra1.contr[0].l + tbra2.contr[0].l >
-                             tket1.contr[0].l + tket2.contr[0].l) &&
-                            (oper_ != Operator::coulomb_opop)) ||
-                           braket_ == BraKet::xx_xs;
+  bool swap_braket;
   bool swap_tbra, swap_tket;
   if (oper_ == Operator::opop_coulomb_opop) {
-    bool swap_p1p2 = swap_braket ? (tbra1.contr[0].l < tbra2.contr[0].l)
-                                 : (tket1.contr[0].l < tket2.contr[0].l);
-    swap_tbra = swap_tket = swap_p1p2;
+    // For σpσpCoulombσpσp: (ab|cd) = (cd|ab) = (ba|dc)* = (dc|ba)*
+    // Canonical form: lc >= ld (bra is swapped in lockstep with the ket),
+    // la+lb <= lc+ld (or max(la,lb) <= lc when sums equal)
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    const auto bra_max = std::max(tbra1.contr[0].l, tbra2.contr[0].l);
+    const auto ket_max = std::max(tket1.contr[0].l, tket2.contr[0].l);
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (bra_total > ket_total ||
+                    (bra_total == ket_total && bra_max > ket_max))) ||
+                  braket_ == BraKet::xx_xs;
+    // Coupled swap: after braket swap, sort the pair that ends up in ket
+    // position to ensure lc >= ld; the bra is swapped together with the ket
+    if (swap_braket) {
+      // After braket swap: new ket = original bra, new bra = original ket.
+      // Coupled swap sorts new ket (ensure lc >= ld).
+      const bool swap_p1p2 = (tbra1.contr[0].l < tbra2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    } else {
+      // No braket swap: ket stays as original ket.
+      // Coupled swap sorts ket (ensure lc >= ld).
+      const bool swap_p1p2 = (tket1.contr[0].l < tket2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    }
   } else {
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (tbra1.contr[0].l + tbra2.contr[0].l >
+                    tket1.contr[0].l + tket2.contr[0].l) &&
+                   (oper_ != Operator::coulomb_opop)) ||
+                  braket_ == BraKet::xx_xs;
     swap_tbra = (tbra1.contr[0].l < tbra2.contr[0].l);
     swap_tket = (tket1.contr[0].l < tket2.contr[0].l);
   }
@@ -1234,15 +1256,35 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
   // N.B. cannot swap bra and ket for coulomb_opop since the ket is mutated by
   // this operator
 #else  // orca angular momentum ordering
-  const auto swap_tbra = (tbra1.contr[0].l > tbra2.contr[0].l);
-  const auto swap_tket = (tket1.contr[0].l > tket2.contr[0].l);
-  const auto swap_braket = ((braket_ == BraKet::xx_xx) &&
-                            (tbra1.contr[0].l + tbra2.contr[0].l <
-                             tket1.contr[0].l + tket2.contr[0].l) &&
-                            (oper_ != Operator::coulomb_opop)) ||
-                           braket_ == BraKet::xx_xs;
-  assert(false && "feature not implemented");
-  abort();
+  bool swap_braket;
+  bool swap_tbra, swap_tket;
+  if (oper_ == Operator::opop_coulomb_opop) {
+    // ORCA canonical for σpσpCoulombσpσp: lc <= ld (bra swapped in lockstep),
+    // la+lb >= lc+ld (or min(la,lb) >= lc when sums equal)
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    const auto bra_min = std::min(tbra1.contr[0].l, tbra2.contr[0].l);
+    const auto ket_min = std::min(tket1.contr[0].l, tket2.contr[0].l);
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (bra_total < ket_total ||
+                    (bra_total == ket_total && bra_min < ket_min))) ||
+                  braket_ == BraKet::xx_xs;
+    if (swap_braket) {
+      const bool swap_p1p2 = (tbra1.contr[0].l > tbra2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    } else {
+      const bool swap_p1p2 = (tket1.contr[0].l > tket2.contr[0].l);
+      swap_tbra = swap_tket = swap_p1p2;
+    }
+  } else {
+    swap_tbra = (tbra1.contr[0].l > tbra2.contr[0].l);
+    swap_tket = (tket1.contr[0].l > tket2.contr[0].l);
+    swap_braket = ((braket_ == BraKet::xx_xx) &&
+                   (tbra1.contr[0].l + tbra2.contr[0].l <
+                    tket1.contr[0].l + tket2.contr[0].l) &&
+                   (oper_ != Operator::coulomb_opop)) ||
+                  braket_ == BraKet::xx_xs;
+  }
 #endif
   const auto& bra1 =
       swap_braket ? (swap_tket ? tket2 : tket1) : (swap_tbra ? tbra2 : tbra1);
@@ -2124,10 +2166,21 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                     tgt_ptr + tgt_col_idx, nr1_tgt, nr2_tgt,
                     Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>(
                         nr2_tgt * ncol_tgt, ncol_tgt));
+                // Coupled swap sign correction for multi-component operators
+                Shell::real_t oper_cart_component_phase = 1.0;
+                if (swap_tket && oper_ == Operator::opop_coulomb_opop) {
+                  const bool bra_is_spin = (s / 4) > 0;
+                  const bool ket_is_spin = (s % 4) > 0;
+                  if (bra_is_spin != ket_is_spin)
+                    oper_cart_component_phase = -1.0;
+                }
+                if (swap_tket && oper_ == Operator::coulomb_opop && s > 0)
+                  oper_cart_component_phase = -1.0;
                 if (swap_tbra)
-                  tgt_blk_mat = src_blk_mat.transpose();
+                  tgt_blk_mat =
+                      oper_cart_component_phase * src_blk_mat.transpose();
                 else
-                  tgt_blk_mat = src_blk_mat;
+                  tgt_blk_mat = oper_cart_component_phase * src_blk_mat;
               } else {
                 // source row {r1,r2} is mapped to target row {r1,r2} if
                 // !swap_tbra, else to {r2,r1}
@@ -2136,10 +2189,16 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 Map tgt_blk_mat(tgt_ptr + tgt_row_idx * ncol, nc1_tgt, nc2_tgt);
                 if (swap_tket) {
                   Shell::real_t oper_cart_component_phase = 1.0;
-                  if (oper_ == Operator::opop_coulomb_opop && s == 3)
-                    oper_cart_component_phase =
-                        -1.0;  // z quaternion components flip sign on
-                               //  swapping ket
+                  if (oper_ == Operator::opop_coulomb_opop) {
+                    // Option A ordering: index = 4*bra + ket
+                    // Coupled swap (a<->b AND c<->d) flips sign when exactly
+                    // one of bra/ket is a cross product (spin != S):
+                    //   bra_spin = s/4, ket_spin = s%4 (0=S, 1-3=X/Y/Z)
+                    const bool bra_is_spin = (s / 4) > 0;
+                    const bool ket_is_spin = (s % 4) > 0;
+                    if (bra_is_spin != ket_is_spin)
+                      oper_cart_component_phase = -1.0;
+                  }
                   if (oper_ == Operator::coulomb_opop && s > 0)
                     oper_cart_component_phase =
                         -1.0;  // x,y,z quaternion components flip sign on
@@ -2171,7 +2230,17 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                                                     // to primdata_[0].targets
           targets_[s_target] = source;
         }
-      }     // loop over shellsets
+      }  // loop over shellsets
+
+      // For opop_coulomb_opop with swap_braket: swapping particles remaps
+      // component (α,β) → (β,α). With Option A ordering (index=4*bra+ket),
+      // this is a matrix transpose: s_new = 4*(s%4) + (s/4).
+      if (permute && oper_ == Operator::opop_coulomb_opop && swap_braket) {
+        std::array<const value_type*, 16> temp;
+        for (auto s = 0; s != ntargets; ++s) temp[s] = targets_[s];
+        for (auto s = 0; s != ntargets; ++s)
+          targets_[4 * (s % 4) + (s / 4)] = temp[s];
+      }
     }       // if need_scratch => needed to transpose and/or tform
     else {  // did not use scratch? may still need to update targets_
       if (set_targets_) {
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index c2a9de6cd..4e9b10152 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -31,7 +31,9 @@
   */
 
 #include <boost/preprocessor.hpp>
+#include <chrono>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <limits>
 #include <vector>
@@ -66,6 +68,8 @@
 using namespace std;
 using namespace libint2;
 
+CodeGenProgress g_progress;
+
 enum ShellSetType {
   ShellSetType_Standard = LIBINT_SHELL_SET_STANDARD,
   ShellSetType_ORCA = LIBINT_SHELL_SET_ORCA
@@ -90,7 +94,8 @@ struct ShellQuartetSetPredicate<ShellSetType_Standard> {
   static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true,
                     bool bra_ket_coswappable = false) {
     if (bra_ket_coswappable)
-      return (la + lb <= lc + ld) && lc >= ld;
+      return lc >= ld && (la + lb < lc + ld ||
+                          (la + lb == lc + ld && std::max(la, lb) <= lc));
     else
       return la >= lb && lc >= ld && (!p1p2_swappable || la + lb <= lc + ld);
   }
@@ -100,7 +105,8 @@ struct ShellQuartetSetPredicate<ShellSetType_ORCA> {
   static bool value(int la, int lb, int lc, int ld, bool p1p2_swappable = true,
                     bool bra_ket_coswappable = false) {
     if (bra_ket_coswappable)
-      return (la < lc || (la == lc && lb <= ld));
+      return lc <= ld && (la + lb > lc + ld ||
+                          (la + lb == lc + ld && std::min(la, lb) >= lc));
     else
       return la <= lb && lc <= ld &&
              (!p1p2_swappable || (la < lc || (la == lc && lb <= ld)));
@@ -211,7 +217,9 @@ static void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 
 #ifdef LIBINT_INCLUDE_ERI
 #define USE_GENERIC_ERI_BUILD 1
-#if !USE_GENERIC_ERI_BUILD
+#endif
+#if defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_RKB_ERI)
+#if defined(USE_GENERIC_ERI_BUILD) && !USE_GENERIC_ERI_BUILD
 template <typename OperType>
 static void build_TwoPRep_2b_2k(
     std::ostream& os, std::string label,
@@ -222,7 +230,7 @@ template <typename OperType>
 static void build_TwoPRep_2b_2k(
     std::ostream& os, std::string label,
     const std::shared_ptr<CompilationParameters>& cparams,
-    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level);
+    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level = 0);
 #endif
 #endif
 
@@ -272,6 +280,8 @@ struct AuxQuantaType {
   typedef EmptySet type;
 };
 
+}  // namespace
+
 template <typename OperDescrType>
 OperDescrType make_descr(int, int = 0, int = 0) {
   return OperDescrType();
@@ -308,8 +318,6 @@ template <>
   return σpσpCoulombσpσp_Descr(p);
 }
 
-}  // namespace
-
 template <typename _OperType>
 void build_onebody_1b_1k(std::ostream& os, std::string label,
                          const std::shared_ptr<CompilationParameters>& cparams,
@@ -514,8 +522,8 @@ void build_onebody_1b_1k(std::ostream& os, std::string label,
         eval_label = oss.str();
       }
 
-      std::cout << "working on " << eval_label << " ... ";
-      std::cout.flush();
+      g_progress.current_task = eval_label;
+      g_progress.print();
 
       std::string prefix(cparams->source_directory());
       std::deque<std::string> decl_filenames;
@@ -555,8 +563,6 @@ void build_onebody_1b_1k(std::ostream& os, std::string label,
       dg->reset();
       memman->reset();
 
-      std::cout << "done" << std::endl;
-
     }  // end of b loop
   }    // end of a loop
 }
@@ -962,6 +968,8 @@ void try_main(int argc, char* argv[]) {
 #endif
   cparams->print(os);
 
+  g_progress.start();
+
 #ifdef LIBINT_INCLUDE_ONEBODY
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ONEBODY; ++d) {
 #define BOOST_PP_ONEBODY_MCR7(r, data, i, elem)                              \
@@ -1020,6 +1028,8 @@ void try_main(int argc, char* argv[]) {
   build_G12DKH_2b_2k(os, cparams, iface);
 #endif
 
+  g_progress.finish();
+
   // Generate code for the set-level RRs
   std::deque<std::string> decl_filenames, def_filenames;
   generate_rr_code(os, cparams, decl_filenames, def_filenames);
@@ -1101,12 +1111,12 @@ void print_config(std::ostream& os) {
 #endif
 }
 
-#ifdef LIBINT_INCLUDE_ERI
+#if defined(LIBINT_INCLUDE_ERI) || defined(LIBINT_INCLUDE_RKB_ERI)
 template <typename OperType>
-void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
-                         const std::shared_ptr<CompilationParameters>& cparams,
-                         std::shared_ptr<Libint2Iface>& iface,
-                         unsigned int deriv_level) {
+static void build_TwoPRep_2b_2k(
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
+    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level) {
   typedef GenIntegralSet_11_11<CGShell, OperType, mType> TwoBody_sh_11_11;
   typedef typename OperType::Descriptor OperDescrType;
 
@@ -1168,13 +1178,20 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           // loop over operator components
           /////////////////////////////////
           std::vector<OperDescrType> descrs(1);
-          if (std::is_same<OperType, CoulombσpσpOper>::value ||
-              std::is_same<OperType, σpσpCoulombσpσpOper>::value) {
+          if constexpr (std::is_same<OperType, CoulombσpσpOper>::value) {
             // reset descriptors array
             descrs.resize(0);
-            // iterate over quaternion components
+            // iterate over 4 quaternion components (single spin space)
             for (int p = 0; p != 4; ++p) {
-              descrs.emplace_back(make_descr<OperDescrType>(p));
+              descrs.emplace_back(OperDescrType(p));
+            }
+          }
+          if constexpr (std::is_same<OperType, σpσpCoulombσpσpOper>::value) {
+            // reset descriptors array
+            descrs.resize(0);
+            // iterate over 16 components (tensor product of two spin spaces)
+            for (int p = 0; p != 16; ++p) {
+              descrs.emplace_back(OperDescrType(p));
             }
           }
 
@@ -1192,10 +1209,14 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
                   ? std::numeric_limits<unsigned int>::max()
                   : 0;
           dg_xxxx->registry()->unroll_threshold(unroll_threshold);
-          dg_xxxx->registry()->do_cse(need_to_optimize);
-          dg_xxxx->registry()->condense_expr(condense_expr(
-              cparams->unroll_threshold(), cparams->max_vector_length() > 1));
-          // dg_xxxx->registry()->condense_expr(true);
+          // For multi-component operators (RKB), components share no
+          // intermediates, so CSE/condense_expr is pure overhead — disable.
+          const bool do_optimize = (nopers > 1) ? false : need_to_optimize;
+          dg_xxxx->registry()->do_cse(do_optimize);
+          dg_xxxx->registry()->condense_expr(
+              do_optimize ? condense_expr(cparams->unroll_threshold(),
+                                          cparams->max_vector_length() > 1)
+                          : false);
           //  Need to accumulate integrals?
           dg_xxxx->registry()->accumulate_targets(
               cparams->accumulate_targets());
@@ -1238,13 +1259,6 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
             last_deriv = diter.last();
             if (!last_deriv) diter.next();
           } while (!last_deriv);
-          // append all derivatives as targets to the graph
-          for (auto it = targets.begin(); it != targets.end(); ++it) {
-            std::shared_ptr<DGVertex> t_ptr =
-                std::dynamic_pointer_cast<DGVertex, TwoBody_sh_11_11>(*it);
-            dg_xxxx->append_target(t_ptr);
-          }
-
           // make label that characterizes this set of targets
           // use the label of the nondifferentiated integral as a base
           std::string abcd_label;
@@ -1280,13 +1294,20 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
             eval_label += abcd_label;
           }
 
-          std::cout << "working on " << eval_label << " ... ";
-          std::cout.flush();
+          g_progress.current_task = eval_label;
+          g_progress.print();
 
           std::string prefix(cparams->source_directory());
           std::deque<std::string> decl_filenames;
           std::deque<std::string> def_filenames;
 
+          // append all targets to the graph
+          for (auto it = targets.begin(); it != targets.end(); ++it) {
+            std::shared_ptr<DGVertex> t_ptr =
+                std::dynamic_pointer_cast<DGVertex, TwoBody_sh_11_11>(*it);
+            dg_xxxx->append_target(t_ptr);
+          }
+
           // this will generate code for these targets, and potentially
           // generate code for its prerequisites
           GenerateCode(dg_xxxx, context, cparams, strat, tactic, memman,
@@ -1322,7 +1343,8 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
           dg_xxxx->reset();
           memman->reset();
 
-          std::cout << "done" << std::endl;
+          ++g_progress.done;
+          g_progress.print();
 
         }  // end of d loop
       }    // end of c loop
@@ -1330,7 +1352,7 @@ void build_TwoPRep_2b_2k(std::ostream& os, std::string label,
   }        // end of a loop
 }
 
-#endif  // LIBINT_INCLUDE_ERI
+#endif  // LIBINT_INCLUDE_ERI || LIBINT_INCLUDE_RKB_ERI
 
 #ifdef LIBINT_INCLUDE_ERI3
 
@@ -1490,8 +1512,8 @@ void build_TwoPRep_1b_2k(std::ostream& os,
           label += abcd_label;
         }
 
-        std::cout << "working on " << label << " ... ";
-        std::cout.flush();
+        g_progress.current_task = label;
+        g_progress.print();
 
         std::string prefix(cparams->source_directory());
         std::deque<std::string> decl_filenames;
@@ -1530,7 +1552,8 @@ void build_TwoPRep_1b_2k(std::ostream& os,
 #endif
         dg_xxx->reset();
         memman->reset();
-        std::cout << "done" << std::endl;
+        ++g_progress.done;
+        g_progress.print();
       }  // end of d loop
     }    // end of c loop
   }      // end of bra loop
@@ -1687,7 +1710,8 @@ void build_TwoPRep_1b_1k(std::ostream& os,
         label += abcd_label;
       }
 
-      std::cout << "working on " << label << " ... ";
+      g_progress.current_task = label;
+      g_progress.print();
       std::cout.flush();
 
       std::string prefix(cparams->source_directory());
@@ -1727,7 +1751,6 @@ void build_TwoPRep_1b_1k(std::ostream& os,
 #endif
       dg_xxx->reset();
       memman->reset();
-      std::cout << "done" << std::endl;
     }  // end of ket loop
   }    // end of bra loop
 }
diff --git a/src/bin/libint/buildtest.h b/src/bin/libint/buildtest.h
index a4923c022..0ea5b571e 100644
--- a/src/bin/libint/buildtest.h
+++ b/src/bin/libint/buildtest.h
@@ -30,13 +30,57 @@
 #include <libint2/config.h>
 #include <strategy.h>
 
+#include <chrono>
 #include <deque>
 #include <fstream>
+#include <iomanip>
 #include <iostream>
 #include <iterator>
 #include <sstream>
 #include <string>
 
+/// Progress tracker for code generation.
+struct CodeGenProgress {
+  unsigned int done = 0;
+  std::string current_task;
+  std::chrono::steady_clock::time_point start_time;
+  bool started = false;
+
+  void start() {
+    start_time = std::chrono::steady_clock::now();
+    started = true;
+  }
+
+  void print() const {
+    if (!started) return;
+    static const char spinner[] = "|/-\\";
+    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+                             std::chrono::steady_clock::now() - start_time)
+                             .count();
+    const auto mins = elapsed / 60;
+    const auto secs = elapsed % 60;
+    std::cerr << "\r  " << spinner[done % 4] << "  " << std::setfill('0')
+              << std::setw(2) << mins << ":" << std::setw(2) << secs
+              << std::setfill(' ') << "  [" << done << " functions generated]  "
+              << current_task << "                    " << std::flush;
+  }
+
+  void finish() {
+    if (!started) return;
+    const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+                             std::chrono::steady_clock::now() - start_time)
+                             .count();
+    const auto mins = elapsed / 60;
+    const auto secs = elapsed % 60;
+    std::cerr << "\r  done  " << std::setfill('0') << std::setw(2) << mins
+              << ":" << std::setw(2) << secs << std::setfill(' ') << "  ["
+              << done << " functions generated]"
+              << "                                        " << std::endl;
+    started = false;
+  }
+};
+extern CodeGenProgress g_progress;
+
 namespace libint2 {
 
 // defined in buildtest.cc
@@ -263,7 +307,6 @@ void GenerateCode(const std::shared_ptr<DirectedGraph>& dg,
   // if there are missing prerequisites -- make a list of them
   PrerequisitesExtractor pe;
   if (dg->missing_prerequisites()) {
-    // std::cout << "missing some prerequisites!" << std::endl;
     dg->foreach (pe);
   }
   std::deque<std::shared_ptr<DGVertex> > prereq_list = pe.vertices;
@@ -296,6 +339,11 @@ void GenerateCode(const std::shared_ptr<DirectedGraph>& dg,
   // extract all external symbols
   extract_symbols(dg);
 
+  // Update progress
+  ++g_progress.done;
+  g_progress.current_task = label;
+  g_progress.print();
+
 #if PRINT_DAG_GRAPHVIZ
   {
     std::basic_ofstream<char> dotfile(dg->label() + ".symb.dot");
diff --git "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
index 7bf0a4b9a..606418d0b 100644
--- "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
@@ -28,10 +28,38 @@
 namespace libint2 {
 
 /**
- * this computes integral of
- * \sigma \cdot \hat{p}_1 \sigma \cdot \hat{p}_2 \f$ \frac{1}{r_{ij}} \sigma
- * \cdot \hat{p}_3 \sigma \cdot \hat{p}_4 \f$ over CGShell/CGF by rewriting it
- * as a linear combination of integrals over derivatives of \frac{1}{r_{ij}}
+ * Computes integral of
+ * \f$ (\sigma_1 \cdot \hat{p}_a)(\sigma_1 \cdot \hat{p}_b)
+ *     \frac{1}{r_{12}}
+ *     (\sigma_2 \cdot \hat{p}_c)(\sigma_2 \cdot \hat{p}_d) \f$
+ * over CGShell/CGF by rewriting it as a linear combination of integrals
+ * over derivatives of \f$ \frac{1}{r_{12}} \f$.
+ *
+ * The two sigma operators act on independent spin spaces (electron 1 and
+ * electron 2). Using the Dirac identity (see e.g. Eq. 1.27 of I. P. Grant,
+ * "Relativistic Quantum Theory of Atoms and Molecules", Springer, 2007):
+ *   \f$ (\sigma \cdot a)(\sigma \cdot b) = (a \cdot b)I
+ *       + i\sigma \cdot (a \times b) \f$
+ * applied independently to each particle's spin space gives a tensor product
+ * of two quaternions with \f$ 4 \times 4 = 16 \f$ components:
+ *
+ *   index = 4 * bra_spin_index + ket_spin_index
+ *
+ * where spin indices are: 0=S (scalar/dot product), 1=X, 2=Y, 3=Z
+ * (cross product components).
+ *
+ * The 16 components map to:
+ *   T1 (index 0):        SS = (a.b)(c.d)               [scalar x scalar]
+ *   T2 (indices 1-3):    SX,SY,SZ = (a.b)(cxd)_{x,y,z} [scalar x spin]
+ *   T3 (indices 4,8,12): XS,YS,ZS = (axb)_{x,y,z}(c.d) [spin x scalar]
+ *   T4 (5-7,9-11,13-15): XX..ZZ = -(axb)_i(cxd)_j      [spin x spin]
+ *
+ * Sign convention: T4 components include the minus sign from \f$ i^2 = -1 \f$,
+ * arising from the product of two \f$ i \f$ factors in the Dirac identity:
+ *   \f$ [i\sigma_1 \cdot (a \times b)] \otimes [i\sigma_2 \cdot (c \times d)]
+ *     = -\sigma_{1,i} \otimes \sigma_{2,j}\; (a \times b)_i\, (c \times d)_j
+ *   \f$
+ *
  * @tparam F basis function type. valid choices are CGShell or CGF
  */
 template <typename F>
@@ -113,81 +141,216 @@ CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11(
     return factory.make_child(a_r1, b_r2, c_r3, d_r4, zero_m);
   };
 
-  // Component wise generation for quaternion :
-  // ( (σ.p) a (σ.p) b | 1/r12 | (σ.p) c (σ.p) d )
+  // 16-component generation for two independent spin spaces:
+  // ( (σ₁.p) a (σ₁.p) b | 1/r12 | (σ₂.p) c (σ₂.p) d )
+  //
+  // Tensor-product ordering: index = 4 * bra_spin + ket_spin
+  //   bra_spin = index / 4,  ket_spin = index % 4
+  //   spin indices: S=0, X=1, Y=2, Z=3
+  //
+  // Row bra=S: 0=SS, 1=SX, 2=SY, 3=SZ          (T1 + T2)
+  // Row bra=X: 4=XS, 5=XX, 6=XY, 7=XZ          (T3 + T4)
+  // Row bra=Y: 8=YS, 9=YX, 10=YY, 11=YZ        (T3 + T4)
+  // Row bra=Z: 12=ZS, 13=ZX, 14=ZY, 15=ZZ      (T3 + T4)
+  //
+  // T4 components include minus sign from i^2 = -1.
   switch (oper->descr().quaternion_index()) {
+    // ===== 0: SS = (a.b)(c.d) =====
     case 0: {
       auto xxxx = mc(x, x, x, x);
-      auto yyxx = mc(y, y, x, x);
-      auto zzxx = mc(z, z, x, x);
-      auto yxyx = mc(y, x, y, x);
-      auto xyyx = mc(x, y, y, x);
-      auto yxxy = mc(y, x, x, y);
-      auto xyxy = mc(x, y, x, y);
       auto xxyy = mc(x, x, y, y);
-      auto yyyy = mc(y, y, y, y);
-      auto zzyy = mc(z, z, y, y);
       auto xxzz = mc(x, x, z, z);
+      auto yyxx = mc(y, y, x, x);
+      auto yyyy = mc(y, y, y, y);
       auto yyzz = mc(y, y, z, z);
+      auto zzxx = mc(z, z, x, x);
+      auto zzyy = mc(z, z, y, y);
       auto zzzz = mc(z, z, z, z);
       if (is_simple()) {
-        expr_ = xxxx + yyxx + zzxx - yxyx + xyyx + yxxy - xyxy + xxyy + yyyy +
-                zzyy + xxzz + yyzz + zzzz;
-        nflops_ += 12;
+        expr_ = xxxx + xxyy + xxzz + yyxx + yyyy + yyzz + zzxx + zzyy + zzzz;
+        nflops_ += 8;
       }
     } break;
+    // ===== 1: SX = (a.b)(c×d)_x =====
     case 1: {
-      auto zxzx = mc(z, x, z, x);
-      auto xzzx = mc(x, z, z, x);
-      auto zyzy = mc(z, y, z, y);
+      auto xxyz = mc(x, x, y, z);
+      auto xxzy = mc(x, x, z, y);
+      auto yyyz = mc(y, y, y, z);
+      auto yyzy = mc(y, y, z, y);
+      auto zzyz = mc(z, z, y, z);
+      auto zzzy = mc(z, z, z, y);
+      if (is_simple()) {
+        expr_ = xxyz - xxzy + yyyz - yyzy + zzyz - zzzy;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 2: SY = (a.b)(c×d)_y =====
+    case 2: {
+      auto xxzx = mc(x, x, z, x);
+      auto xxxz = mc(x, x, x, z);
+      auto yyzx = mc(y, y, z, x);
+      auto yyxz = mc(y, y, x, z);
+      auto zzzx = mc(z, z, z, x);
+      auto zzxz = mc(z, z, x, z);
+      if (is_simple()) {
+        expr_ = xxzx - xxxz + yyzx - yyxz + zzzx - zzxz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 3: SZ = (a.b)(c×d)_z =====
+    case 3: {
+      auto xxxy = mc(x, x, x, y);
+      auto xxyx = mc(x, x, y, x);
+      auto yyxy = mc(y, y, x, y);
+      auto yyyx = mc(y, y, y, x);
+      auto zzxy = mc(z, z, x, y);
+      auto zzyx = mc(z, z, y, x);
+      if (is_simple()) {
+        expr_ = xxxy - xxyx + yyxy - yyyx + zzxy - zzyx;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 4: XS = (a×b)_x(c.d) =====
+    case 4: {
+      auto yzxx = mc(y, z, x, x);
+      auto zyxx = mc(z, y, x, x);
+      auto yzyy = mc(y, z, y, y);
+      auto zyyy = mc(z, y, y, y);
+      auto yzzz = mc(y, z, z, z);
+      auto zyzz = mc(z, y, z, z);
+      if (is_simple()) {
+        expr_ = yzxx - zyxx + yzyy - zyyy + yzzz - zyzz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 5: XX = -(a×b)_x(c×d)_x (minus from i²=-1) =====
+    case 5: {
+      auto yzyz = mc(y, z, y, z);
       auto yzzy = mc(y, z, z, y);
-      auto zxxz = mc(z, x, x, z);
-      auto xzxz = mc(x, z, x, z);
       auto zyyz = mc(z, y, y, z);
-      auto yzyz = mc(y, z, y, z);
+      auto zyzy = mc(z, y, z, y);
       if (is_simple()) {
-        expr_ = zxzx - xzzx - zyzy + yzzy - zxxz + xzxz + zyyz - yzyz;
-        nflops_ += 7;
+        expr_ = yzzy - yzyz + zyyz - zyzy;
+        nflops_ += 3;
       }
     } break;
-    case 2: {
-      auto zyzx = mc(z, y, z, x);
+    // ===== 6: XY = -(a×b)_x(c×d)_y =====
+    case 6: {
       auto yzzx = mc(y, z, z, x);
-      auto zxzy = mc(z, x, z, y);
-      auto xzzy = mc(x, z, z, y);
-      auto zyxz = mc(z, y, x, z);
       auto yzxz = mc(y, z, x, z);
+      auto zyzx = mc(z, y, z, x);
+      auto zyxz = mc(z, y, x, z);
+      if (is_simple()) {
+        expr_ = yzxz - yzzx + zyzx - zyxz;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 7: XZ = -(a×b)_x(c×d)_z =====
+    case 7: {
+      auto yzxy = mc(y, z, x, y);
+      auto yzyx = mc(y, z, y, x);
+      auto zyxy = mc(z, y, x, y);
+      auto zyyx = mc(z, y, y, x);
+      if (is_simple()) {
+        expr_ = yzyx - yzxy + zyxy - zyyx;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 8: YS = (a×b)_y(c.d) =====
+    case 8: {
+      auto zxxx = mc(z, x, x, x);
+      auto xzxx = mc(x, z, x, x);
+      auto zxyy = mc(z, x, y, y);
+      auto xzyy = mc(x, z, y, y);
+      auto zxzz = mc(z, x, z, z);
+      auto xzzz = mc(x, z, z, z);
+      if (is_simple()) {
+        expr_ = zxxx - xzxx + zxyy - xzyy + zxzz - xzzz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 9: YX = -(a×b)_y(c×d)_x =====
+    case 9: {
       auto zxyz = mc(z, x, y, z);
+      auto zxzy = mc(z, x, z, y);
       auto xzyz = mc(x, z, y, z);
+      auto xzzy = mc(x, z, z, y);
       if (is_simple()) {
-        // swapped order of first two terms compiler does not like negative sign
-        // in front of first term
-        expr_ = yzzx - zyzx - zxzy + xzzy + zyxz - yzxz + zxyz - xzyz;
-        nflops_ += 7;
+        expr_ = zxzy - zxyz + xzyz - xzzy;
+        nflops_ += 3;
       }
     } break;
-    case 3: {
-      auto yxxx = mc(y, x, x, x);
+    // ===== 10: YY = -(a×b)_y(c×d)_y =====
+    case 10: {
+      auto zxzx = mc(z, x, z, x);
+      auto zxxz = mc(z, x, x, z);
+      auto xzzx = mc(x, z, z, x);
+      auto xzxz = mc(x, z, x, z);
+      if (is_simple()) {
+        expr_ = zxxz - zxzx + xzzx - xzxz;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 11: YZ = -(a×b)_y(c×d)_z =====
+    case 11: {
+      auto zxxy = mc(z, x, x, y);
+      auto zxyx = mc(z, x, y, x);
+      auto xzxy = mc(x, z, x, y);
+      auto xzyx = mc(x, z, y, x);
+      if (is_simple()) {
+        expr_ = zxyx - zxxy + xzxy - xzyx;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 12: ZS = (a×b)_z(c.d) =====
+    case 12: {
       auto xyxx = mc(x, y, x, x);
-      auto xxyx = mc(x, x, y, x);
-      auto yyyx = mc(y, y, y, x);
-      auto zzyx = mc(z, z, y, x);
-      auto xxxy = mc(x, x, x, y);
-      auto yyxy = mc(y, y, x, y);
-      auto zzxy = mc(z, z, x, y);
-      auto yxyy = mc(y, x, y, y);
+      auto yxxx = mc(y, x, x, x);
       auto xyyy = mc(x, y, y, y);
-      auto yxzz = mc(y, x, z, z);
+      auto yxyy = mc(y, x, y, y);
       auto xyzz = mc(x, y, z, z);
+      auto yxzz = mc(y, x, z, z);
+      if (is_simple()) {
+        expr_ = xyxx - yxxx + xyyy - yxyy + xyzz - yxzz;
+        nflops_ += 5;
+      }
+    } break;
+    // ===== 13: ZX = -(a×b)_z(c×d)_x =====
+    case 13: {
+      auto xyyz = mc(x, y, y, z);
+      auto xyzy = mc(x, y, z, y);
+      auto yxyz = mc(y, x, y, z);
+      auto yxzy = mc(y, x, z, y);
+      if (is_simple()) {
+        expr_ = xyzy - xyyz + yxyz - yxzy;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 14: ZY = -(a×b)_z(c×d)_y =====
+    case 14: {
+      auto xyzx = mc(x, y, z, x);
+      auto xyxz = mc(x, y, x, z);
+      auto yxzx = mc(y, x, z, x);
+      auto yxxz = mc(y, x, x, z);
+      if (is_simple()) {
+        expr_ = xyxz - xyzx + yxzx - yxxz;
+        nflops_ += 3;
+      }
+    } break;
+    // ===== 15: ZZ = -(a×b)_z(c×d)_z =====
+    case 15: {
+      auto xyxy = mc(x, y, x, y);
+      auto xyyx = mc(x, y, y, x);
+      auto yxxy = mc(y, x, x, y);
+      auto yxyx = mc(y, x, y, x);
       if (is_simple()) {
-        expr_ = xyxx - yxxx - xxyx - yyyx - zzyx + xxxy + yyxy + zzxy - yxyy +
-                xyyy - yxzz + xyzz;
-        nflops_ += 11;
+        expr_ = xyyx - xyxy + yxxy - yxyx;
+        nflops_ += 3;
       }
     } break;
     default:
       throw std::runtime_error(
-          "CR_11_σpσpCoulombσpσp_11: invalid quaternionic index");
+          "CR_11_σpσpCoulombσpσp_11: invalid component index (expected 0-15)");
   }
 
 }  // CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index cecbda72b..678180c28 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -443,24 +443,27 @@ struct σpσpCoulombσpσp_Descr : public Contractable<σpσpCoulombσpσp_Descr
   σpσpCoulombσpσp_Descr() : quaternion_index_(0) {}
   σpσpCoulombσpσp_Descr(int quaternion_index)
       : quaternion_index_(quaternion_index) {
-    assert(quaternion_index <= 3);
+    assert(quaternion_index >= 0 && quaternion_index <= 15);
   }
 
-  static const unsigned int max_key = 4;
+  /// 16 components from tensor product of two independent spin spaces:
+  /// index = 4 * bra_spin_index + ket_spin_index
+  /// where spin indices are: 0=S (scalar), 1=X, 2=Y, 3=Z (cross product)
+  static const unsigned int max_key = 16;
   unsigned int key() const { return quaternion_index(); }
   std::string description() const {
-    std::string descr("opop_coulomb_opop[");
-    if (quaternion_index() == 0)
-      descr += "0";
-    else if (quaternion_index() == 1)
-      descr += "X";
-    else if (quaternion_index() == 2)
-      descr += "Y";
-    else if (quaternion_index() == 3)
-      descr += "Z";
-    else
-      abort();
-    return descr + "]";
+    // clang-format off
+    // Tensor-product order: index = 4 * bra_spin + ket_spin
+    static const char* labels[] = {
+        "SS", "SX", "SY", "SZ",
+        "XS", "XX", "XY", "XZ",
+        "YS", "YX", "YY", "YZ",
+        "ZS", "ZX", "ZY", "ZZ"
+    };
+    // clang-format on
+    const auto qi = quaternion_index();
+    if (qi < 0 || qi > 15) abort();  // labels[] only has entries 0..15
+    return std::string("opop_coulomb_opop[") + labels[qi] + "]";
   }
   std::string label() const { return description(); }
   int psymm(int i, int j) const { abort(); }

From 98d7d375a0e9bd040a14236f3e9a79a798bd42e1 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Thu, 2 Apr 2026 19:28:27 -0400
Subject: [PATCH 15/22] DerivGaussV2: HRR-like code-sharing for derivative
 Gaussians

Apply the same optimization as HRR: since differentiation of a Gaussian
depends only on that shell's quanta (not spectators), generate code once
per unique differentiated shell and pass spectator dims at the call site.

Verified: 295,944 assertions pass (Coulomb + ERI derivs + RKB integrals).
---
 src/bin/libint/comp_deriv_gauss_v2.h | 547 +++++++++++++++++++++++++++
 src/bin/libint/master_rrs_list.h     |  46 +++
 src/bin/libint/strategy.cc           |  62 +--
 3 files changed, 631 insertions(+), 24 deletions(-)
 create mode 100644 src/bin/libint/comp_deriv_gauss_v2.h

diff --git a/src/bin/libint/comp_deriv_gauss_v2.h b/src/bin/libint/comp_deriv_gauss_v2.h
new file mode 100644
index 000000000..970bab0ae
--- /dev/null
+++ b/src/bin/libint/comp_deriv_gauss_v2.h
@@ -0,0 +1,547 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef _libint2_src_bin_libint_compderivgaussv2_h_
+#define _libint2_src_bin_libint_compderivgaussv2_h_
+
+#include <algebra.h>
+#include <context.h>
+#include <default_params.h>
+#include <dgvertex.h>
+#include <dims.h>
+#include <integral.h>
+#include <prefactors.h>
+#include <rr.h>
+#include <task.h>
+
+#include <cassert>
+#include <iostream>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace libint2 {
+
+/** Optimized compute relation for (geometric) derivative Gaussian integrals.
+ *
+ * Like CR_DerivGauss, this expands derivative Gaussians via:
+ *   d/dr_i G(a) = 2*alpha * G(a + 1_i) - a_i * G(a - 1_i)
+ *
+ * Unlike CR_DerivGauss, this uses the HRR-like code-sharing optimization:
+ * since differentiation of a Gaussian at position (part, where) depends only
+ * on that shell's quanta (not spectator shells), we generate code once per
+ * unique differentiated shell and pass spectator dimensions at the call site.
+ *
+ * @tparam IntType integral type
+ * @tparam part particle index of the function to be differentiated
+ * @tparam where position of the function to be differentiated (InBra/InKet)
+ * @tparam trans_inv_part if non-negative, particle index of the function that
+ *         is expanded via translational invariance instead (needs !odep)
+ * @tparam trans_inv_where position (InBra/InKet) of that function
+ */
+template <class IntType, int part, FunctionPosition where,
+          int trans_inv_part = -1, FunctionPosition trans_inv_where = InBra>
+class DerivGaussV2 : public RecurrenceRelation {
+ private:
+  static constexpr auto trans_inv_oper =
+      not IntType::OperType::Properties::odep;
+  static constexpr auto using_trans_inv =
+      trans_inv_oper && (part == trans_inv_part) && (where == trans_inv_where);
+
+ public:
+  typedef RecurrenceRelation ParentType;
+  typedef typename IntType::BasisFunctionType BasisFunctionType;
+  typedef DerivGaussV2 ThisType;
+  typedef IntType TargetType;
+  typedef IntType ChildType;
+  typedef RecurrenceRelation::ExprType ExprType;
+
+  static const unsigned int max_nchildren_ =
+      using_trans_inv ? (IntType::num_bf - 1) : 2u;
+
+  static std::shared_ptr<ThisType> Instance(
+      const std::shared_ptr<TargetType>& Tint, unsigned int dir = 0);
+  virtual ~DerivGaussV2() {}
+
+  /// always directional
+  static bool directional() { return true; }
+
+  unsigned int num_children() const override { return nchildren_; }
+  std::shared_ptr<DGVertex> rr_target() const override {
+    return std::static_pointer_cast<DGVertex, TargetType>(target_);
+  }
+  std::shared_ptr<DGVertex> rr_child(unsigned int i) const override {
+    return children_.at(i);
+  }
+  bool is_simple() const override {
+    return TrivialBFSet<BasisFunctionType>::result;
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override;
+
+ private:
+  DerivGaussV2(const std::shared_ptr<TargetType>& Tint, unsigned int dir);
+
+  unsigned int dir_;  ///< direction given to Instance() / the constructor
+  std::shared_ptr<TargetType> target_;  ///< integral being expanded
+  std::vector<std::shared_ptr<DGVertex>> children_;  ///< unique children
+  unsigned int nchildren_;  ///< number of entries in children_
+
+  std::string generate_label() const override;
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override;
+  bool register_with_rrstack() const;
+  bool expl_high_dim() const;
+  bool expl_low_dim() const;
+
+  /// add child unless already present (compared by pointer); returns it
+  const std::shared_ptr<DGVertex>& add_child(
+      const std::shared_ptr<DGVertex>& child) {
+    for (auto& c : children_) {
+      if (c == child) return c;
+    }
+    children_.push_back(child);
+    ++nchildren_;
+    return children_.back();
+  }
+};
+
+//
+// Implementation
+//
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::shared_ptr<
+    DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>>
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::Instance(
+    const std::shared_ptr<TargetType>& Tint, unsigned int dir) {
+  std::shared_ptr<ThisType> rr(new ThisType(Tint, dir));
+  // an RR that produced no children is reported to the caller as a null
+  // pointer and is not passed to register_with_rrstack()
+  if (rr->num_children() == 0) return std::shared_ptr<ThisType>();
+  rr->register_with_rrstack();
+  return rr;
+}
+
+/** Expands \c Tint with respect to one derivative quantum in direction \c dir.
+ *
+ * No children are produced (so Instance() yields null) if the function at
+ * (part, where) has no derivative quantum along \c dir, or if it is contracted
+ * and translational invariance is not used. The scratch bra/ket copies have
+ * automatic storage, so early returns and exceptions cannot leak them.
+ */
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::
+    DerivGaussV2(const std::shared_ptr<TargetType>& Tint, unsigned int dir)
+    : dir_(dir), target_(Tint), nchildren_(0) {
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using namespace libint2::braket;
+  typedef BasisFunctionType F;
+  const F& _1 = unit<F>(is_simple() ? dir : 0);
+
+  const typename IntType::AuxQuantaType& aux = Tint->aux();
+  const typename IntType::OperType& oper = Tint->oper();
+
+  children_.reserve(max_nchildren_);
+
+  // the Gaussian must be differentiated in direction dir
+  {
+    if (where == InBra && Tint->bra(part, 0).deriv().d(dir) == 0) return;
+    if (where == InKet && Tint->ket(part, 0).deriv().d(dir) == 0) return;
+  }
+
+  // if not using translational invariance, can only expand primitives
+  if (not using_trans_inv) {
+    if (where == InBra && Tint->bra(part, 0).contracted()) return;
+    if (where == InKet && Tint->ket(part, 0).contracted()) return;
+  }
+
+  typedef typename IntType::BraType IBraType;
+  typedef typename IntType::KetType IKetType;
+  IBraType bra(Tint->bra());  // scratch copies, edited to build each child
+  IKetType ket(Tint->ket());
+
+  if (not using_trans_inv) {  // differentiate
+
+    if (where == InBra) {
+      F a(bra.member(part, 0));
+
+      // add a+1
+      F ap1(bra.member(part, 0) + _1);
+      ap1.deriv().dec(dir);
+      bra.set_member(ap1, part, 0);
+      auto int_ap1 = add_child(IntType::Instance(bra, ket, aux, oper));
+      bra.set_member(a, part, 0);
+      if (is_simple()) {
+        std::ostringstream oss;
+        oss << "two_alpha" << part << "_bra";
+        expr_ = Scalar(oss.str()) * int_ap1;
+        nflops_ += 1;
+      }
+
+      // See if a-1 exists
+      F am1(bra.member(part, 0) - _1);
+      if (exists(am1)) {
+        am1.deriv().dec(dir);
+        bra.set_member(am1, part, 0);
+        auto int_am1 = add_child(IntType::Instance(bra, ket, aux, oper));
+        bra.set_member(a, part, 0);
+        if (is_simple()) {
+          expr_ -= Scalar(a[dir]) * int_am1;
+          nflops_ += 2;
+        }
+      }
+      return;
+    }
+
+    if (where == InKet) {
+      F a(ket.member(part, 0));
+
+      // add a+1
+      F ap1(ket.member(part, 0) + _1);
+      ap1.deriv().dec(dir);
+      ket.set_member(ap1, part, 0);
+      auto int_ap1 = add_child(IntType::Instance(bra, ket, aux, oper));
+      ket.set_member(a, part, 0);
+      if (is_simple()) {
+        std::ostringstream oss;
+        oss << "two_alpha" << part << "_ket";
+        expr_ = Scalar(oss.str()) * int_ap1;
+        nflops_ += 1;
+      }
+
+      // See if a-1 exists
+      F am1(ket.member(part, 0) - _1);
+      if (exists(am1)) {
+        am1.deriv().dec(dir);
+        ket.set_member(am1, part, 0);
+        auto int_am1 = add_child(IntType::Instance(bra, ket, aux, oper));
+        ket.set_member(a, part, 0);
+        if (is_simple()) {
+          expr_ -= Scalar(a[dir]) * int_am1;
+          nflops_ += 2;
+        }
+      }
+      return;
+    }
+
+  } else {  // use translational invariance
+
+    // remove one deriv quantum from the target function
+    if (where == InBra) bra.member(part, 0).deriv().dec(dir);
+    if (where == InKet) ket.member(part, 0).deriv().dec(dir);
+
+    int term_count = 0;
+    for (int p = 0; p != IntType::num_particles; ++p) {
+      typedef BasisFunctionType F;
+      if (p != trans_inv_part || trans_inv_where != InBra) {
+        F a(bra.member(p, 0));
+        if (not a.is_unit()) {
+          F da(a);
+          da.deriv().inc(dir);
+          bra.set_member(da, p, 0);
+          auto int_da = add_child(IntType::Instance(bra, ket, aux, oper));
+          bra.set_member(a, p, 0);
+          if (is_simple()) {
+            if (term_count == 0)
+              expr_ = Scalar(-1) * int_da;
+            else
+              expr_ -= int_da;
+            ++term_count;
+            nflops_ += 1;
+          }
+        }
+      }
+      if (p != trans_inv_part || trans_inv_where != InKet) {
+        F a(ket.member(p, 0));
+        if (not a.is_unit()) {
+          F da(a);
+          da.deriv().inc(dir);
+          ket.set_member(da, p, 0);
+          auto int_da = add_child(IntType::Instance(bra, ket, aux, oper));
+          ket.set_member(a, p, 0);
+          if (is_simple()) {
+            if (term_count == 0)
+              expr_ = Scalar(-1) * int_da;
+            else
+              expr_ -= int_da;
+            ++term_count;
+            nflops_ += 1;
+          }
+        }
+      }
+    }
+  }
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+bool DerivGaussV2<IntType, part, where, trans_inv_part,
+                  trans_inv_where>::register_with_rrstack() const {
+  using std::swap;
+
+  // only register RRs for shell sets (not individual integrals): report false
+  if (TrivialBFSet<BasisFunctionType>::result) return false;
+
+  // translational invariance: no code sharing (yet), register this RR as-is
+  if (using_trans_inv) {
+    std::shared_ptr<RRStack> rrstack = RRStack::Instance();
+    std::shared_ptr<ThisType> this_ptr =
+        std::const_pointer_cast<ThisType, const ThisType>(
+            std::static_pointer_cast<const ThisType, const ParentType>(
+                std::enable_shared_from_this<ParentType>::shared_from_this()));
+    rrstack->find(this_ptr);
+    return true;
+  }
+
+  typedef typename IntType::BraType IBraType;
+  typedef typename IntType::KetType IKetType;
+  const IBraType& bra = target_->bra();
+  const IKetType& ket = target_->ket();
+
+  // look for spectators with nonzero quanta or nonzero derivative index
+  bool nonzero_quanta = false;
+  unsigned const int npart = IntType::OperatorType::Properties::np;
+  for (unsigned int p = 0; p < npart; p++) {
+    int nfbra = bra.num_members(p);
+    for (int f = 0; f < nfbra; f++) {
+      // skip the differentiated position
+      if (static_cast<int>(p) == part && where == InBra) continue;
+      if (!bra.member(p, f).zero() || !bra.member(p, f).deriv().zero())
+        nonzero_quanta = true;
+    }
+    int nfket = ket.num_members(p);
+    for (int f = 0; f < nfket; f++) {
+      if (static_cast<int>(p) == part && where == InKet) continue;  // ditto
+      if (!ket.member(p, f).zero() || !ket.member(p, f).deriv().zero())
+        nonzero_quanta = true;
+    }
+  }
+
+  // if all spectators are zero, register this instance directly
+  if (!nonzero_quanta) {
+    std::shared_ptr<RRStack> rrstack = RRStack::Instance();
+    std::shared_ptr<ThisType> this_ptr =
+        std::const_pointer_cast<ThisType, const ThisType>(
+            std::static_pointer_cast<const ThisType, const ParentType>(
+                std::enable_shared_from_this<ParentType>::shared_from_this()));
+    rrstack->find(this_ptr);
+    return true;
+  }
+
+  // Otherwise reset every nonzero spectator to a default-constructed function
+  IBraType bra_zero(bra);
+  IKetType ket_zero(ket);
+  for (unsigned int p = 0; p < npart; p++) {
+    int nfbra = bra_zero.num_members(p);
+    for (int f = 0; f < nfbra; f++) {
+      if (static_cast<int>(p) == part && where == InBra) continue;
+      typedef typename IBraType::bfs_type bfs_type;
+      typedef typename IBraType::bfs_ref bfs_ref;
+      bfs_ref bfs = bra_zero.member(p, f);
+      if (!bfs.zero() || !bfs.deriv().zero()) {
+        bfs_type null_bfs;
+        swap(bfs, null_bfs);
+      }
+    }
+    int nfket = ket_zero.num_members(p);
+    for (int f = 0; f < nfket; f++) {
+      if (static_cast<int>(p) == part && where == InKet) continue;
+      typedef typename IKetType::bfs_type bfs_type;
+      typedef typename IKetType::bfs_ref bfs_ref;
+      bfs_ref bfs = ket_zero.member(p, f);
+      if (!bfs.zero() || !bfs.deriv().zero()) {
+        bfs_type null_bfs;
+        swap(bfs, null_bfs);
+      }
+    }
+  }
+
+  // wrap the zeroed bra/ket in a generic integral with a dummy operator
+  typedef GenOper<GenMultSymmOper_Descr<IntType::OperatorType::Properties::np>>
+      DummyOper;
+  typedef EmptySet DummyQuanta;
+  typedef GenIntegralSet<DummyOper, IncableBFSet, IBraType, IKetType,
+                         DummyQuanta>
+      DummyIntegral;
+  DummyOper dummy_oper;
+  DummyQuanta dummy_quanta(std::vector<int>(0, 0));
+  std::shared_ptr<DummyIntegral> dummy_integral =
+      DummyIntegral::Instance(bra_zero, ket_zero, dummy_quanta, dummy_oper);
+
+  // register a DerivGaussV2 over the dummy integral (default trans_inv_*)
+  typedef DerivGaussV2<DummyIntegral, part, where> DummyDerivGaussV2;
+  std::shared_ptr<DummyDerivGaussV2> dummy_rr =
+      DummyDerivGaussV2::Instance(dummy_integral, dir_);
+  std::shared_ptr<RRStack> rrstack = RRStack::Instance();
+  rrstack->find(dummy_rr);
+  return true;
+}
+
+/// Builds the label of this RR. With translational invariance the label
+/// spells out the whole integral; otherwise it is built from (part, where)
+/// and the differentiated shell alone, so targets that differ only in their
+/// spectators get the same label.
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::string DerivGaussV2<IntType, part, where, trans_inv_part,
+                         trans_inv_where>::generate_label() const {
+  if constexpr (using_trans_inv) {
+    // children depend on ALL shells here, so the label has to carry the
+    // full integral info (no code sharing)
+    using mType = typename TargetType::AuxIndexType;
+    static std::shared_ptr<mType> aux0(new mType(0u));
+
+    std::ostringstream full;
+    full << "CR_DerivGauss"
+         << "P" << part << to_string(where)
+         << genintegralset_label(target_->bra(), target_->ket(), aux0,
+                                 target_->oper());
+    return full.str();
+  } else {
+    // direct differentiation: encode only the differentiated shell, taken
+    // uncontracted, and leave the spectators out
+    BasisFunctionType sh(where == InBra
+                             ? BasisFunctionType(target_->bra(part, 0))
+                             : BasisFunctionType(target_->ket(part, 0)));
+    sh.uncontract();
+
+    std::ostringstream shared;
+    shared << "DerivGaussV2 P" << part << " " << to_string(where) << " "
+           << sh.label();
+    return shared.str();
+  }
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::string
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::
+    spfunction_call(const std::shared_ptr<CodeContext>& context,
+                    const std::shared_ptr<ImplicitDimensions>& dims) const {
+  std::ostringstream call;
+  call << context->label_to_function_name(label()) << "(inteval, "
+       << context->value_to_pointer(rr_target()->symbol());
+  for (unsigned int c = 0, nc = num_children(); c < nc; c++) {
+    call << ", " << context->value_to_pointer(rr_child(c)->symbol());
+  }
+
+  // Spectator dimensions, with functions taken in canonical order (for each
+  // particle p: bra, then ket); the differentiated function is skipped:
+  //   hsr = product of dims before (part, where)
+  //   lsr = product of dims after (part, where)
+  unsigned int hsr = 1;
+  unsigned int lsr = 1;
+  const int np = static_cast<int>(IntType::OperType::Properties::np);
+  for (int p = 0; p < np; p++) {
+    const bool bra_before = p < part || (p == part && where == InKet);
+    const bool bra_after = p > part;
+    const unsigned int nbra = target_->bra().num_members(p);
+    assert(nbra == 1);
+    for (unsigned int i = 0; i < nbra; i++) {
+      SubIterator* iter = target_->bra().member_subiter(p, i);
+      if (bra_before) hsr *= iter->num_iter();
+      if (bra_after) lsr *= iter->num_iter();
+      delete iter;
+    }
+    const bool ket_before = p < part;
+    const bool ket_after = p > part || (p == part && where == InBra);
+    const unsigned int nket = target_->ket().num_members(p);
+    assert(nket == 1);
+    for (unsigned int i = 0; i < nket; i++) {
+      SubIterator* iter = target_->ket().member_subiter(p, i);
+      if (ket_before) hsr *= iter->num_iter();
+      if (ket_after) lsr *= iter->num_iter();
+      delete iter;
+    }
+  }
+
+  // Use TaskParameters to keep track of maximum ranks
+  LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+  taskmgr.current().params()->max_hrr_hsrank(hsr);
+
+  if (expl_high_dim()) call << "," << hsr;
+  if (expl_low_dim()) call << "," << lsr;
+  call << ")" << context->end_of_stat() << std::endl;
+  return call.str();
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+bool DerivGaussV2<IntType, part, where, trans_inv_part,
+                  trans_inv_where>::expl_high_dim() const {
+  // An explicit high dimension is passed unless there is nothing to share
+  // (translational invariance => no code sharing) or no function precedes
+  // the differentiated one (it is the bra of particle 0).
+  constexpr bool first_position = (part == 0 && where == InBra);
+  return !using_trans_inv && !first_position;
+}
+
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+bool DerivGaussV2<IntType, part, where, trans_inv_part,
+                  trans_inv_where>::expl_low_dim() const {
+  // An explicit low dimension is passed unless there is nothing to share
+  // (translational invariance => no code sharing) or no function follows the
+  // differentiated one (it is the ket of the last particle). 1-particle
+  // operators need no special case: for them only the bra reaches the end,
+  // and it always takes an explicit low dimension.
+  const int np = static_cast<int>(IntType::OperType::Properties::np);
+  const bool last_position = (np - 1 == part && where == InKet);
+  return !using_trans_inv && !last_position;
+}
+
+/// Produces the ImplicitDimensions used for this RR: where expl_high_dim() /
+/// expl_low_dim() hold, the high/low dimension of \p dims is replaced by a
+/// fresh entity; the vector dimension is always taken over from \p dims.
+template <class IntType, int part, FunctionPosition where, int trans_inv_part,
+          FunctionPosition trans_inv_where>
+std::shared_ptr<ImplicitDimensions>
+DerivGaussV2<IntType, part, where, trans_inv_part, trans_inv_where>::
+    adapt_dims_(const std::shared_ptr<ImplicitDimensions>& dims) const {
+  typedef std::shared_ptr<Entity> EntityPtr;
+  typedef RTimeEntity<EntityTypes::Int> RTimeInt;
+
+  const bool explicit_high = expl_high_dim();
+  const bool explicit_low = expl_low_dim();
+
+  // A dimension that spfunction_call() passes explicitly becomes a new
+  // RTimeEntity named "highdim"/"lowdim"; otherwise the corresponding
+  // dimension of the incoming dims is reused.
+  EntityPtr high_dim = explicit_high ? EntityPtr(new RTimeInt("highdim"))
+                                     : EntityPtr(dims->high());
+  EntityPtr low_dim = explicit_low ? EntityPtr(new RTimeInt("lowdim"))
+                                   : EntityPtr(dims->low());
+
+  return std::shared_ptr<ImplicitDimensions>(
+      new ImplicitDimensions(high_dim, low_dim, dims->vecdim()));
+}
+
+};  // namespace libint2
+
+#endif
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index d55cfa301..5517d7c5c 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -29,6 +29,7 @@
 #include <comp_11_σpσpCoulombσpσp_11.h>
 #include <comp_1_σpVσp_1.h>
 #include <comp_deriv_gauss.h>
+#include <comp_deriv_gauss_v2.h>
 #include <comp_xyz.h>
 #include <generic_rr.h>
 #include <hrr.h>
@@ -268,6 +269,51 @@ typedef CR_DerivGauss<TwoPRep_11_11_int, 1, InKet, trinvskip2_part,
                       trinvskip2_where>
     Deriv_d_11_TwoPRep_11_int;
 
+// DerivGaussV2 for TwoPRep (shell sets)
+typedef DerivGaussV2<TwoPRep_11_11_sq, 0, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_a_11_TwoPRep_11_sh;
+typedef DerivGaussV2<TwoPRep_11_11_sq, 0, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_b_11_TwoPRep_11_sh;
+typedef DerivGaussV2<TwoPRep_11_11_sq, 1, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_c_11_TwoPRep_11_sh;
+typedef DerivGaussV2<TwoPRep_11_11_sq, 1, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_d_11_TwoPRep_11_sh;
+// DerivGaussV2 for TwoPRep (individual integrals)
+typedef DerivGaussV2<TwoPRep_11_11_int, 0, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_a_11_TwoPRep_11_int;
+typedef DerivGaussV2<TwoPRep_11_11_int, 0, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_b_11_TwoPRep_11_int;
+typedef DerivGaussV2<TwoPRep_11_11_int, 1, InBra, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_c_11_TwoPRep_11_int;
+typedef DerivGaussV2<TwoPRep_11_11_int, 1, InKet, trinvskip2_part,
+                     trinvskip2_where>
+    DerivV2_d_11_TwoPRep_11_int;
+
+// DerivGaussV2 for DummySymmIntegral (used by register_with_rrstack)
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 0, InBra>
+    DerivV2_a_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 0, InKet>
+    DerivV2_b_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 1, InBra>
+    DerivV2_c_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_sq, 1, InKet>
+    DerivV2_d_11_Dummy_11_sh;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 0, InBra>
+    DerivV2_a_11_Dummy_11_int;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 0, InKet>
+    DerivV2_b_11_Dummy_11_int;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 1, InBra>
+    DerivV2_c_11_Dummy_11_int;
+typedef DerivGaussV2<DummySymmIntegral_11_11_int, 1, InKet>
+    DerivV2_d_11_Dummy_11_int;
+
 typedef CR_11_Coulombσpσp_11<CGShell> CR_11_Coulombσpσp_11_sh;
 typedef CR_11_Coulombσpσp_11<CGF> CR_11_Coulombσpσp_11_int;
 
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index 58fb8d2bd..4e4804da7 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -70,47 +70,51 @@ struct MasterStrategy;
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>
 struct MasterStrategy<TwoPRep_11_11_sq> {
-  typedef boost::mpl::list<HRR_ab_11_TwoPRep_11_sh, HRR_cd_11_TwoPRep_11_sh,
-                           Deriv_a_11_TwoPRep_11_sh, Deriv_b_11_TwoPRep_11_sh,
-                           Deriv_c_11_TwoPRep_11_sh, Deriv_d_11_TwoPRep_11_sh,
+  typedef boost::mpl::list<
+      HRR_ab_11_TwoPRep_11_sh, HRR_cd_11_TwoPRep_11_sh,
+      DerivV2_a_11_TwoPRep_11_sh, DerivV2_b_11_TwoPRep_11_sh,
+      DerivV2_c_11_TwoPRep_11_sh, DerivV2_d_11_TwoPRep_11_sh,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_a_11_TwoPRep_11_sh, ITR_c_11_TwoPRep_11_sh,
+      ITR_a_11_TwoPRep_11_sh, ITR_c_11_TwoPRep_11_sh,
 #endif
-                           VRR_a_11_TwoPRep_11_sh, VRR_c_11_TwoPRep_11_sh>
+      VRR_a_11_TwoPRep_11_sh, VRR_c_11_TwoPRep_11_sh>
       value;
 };
 template <>
 struct MasterStrategy<TwoPRep_11_11_int> {
-  typedef boost::mpl::list<HRR_ab_11_TwoPRep_11_int, HRR_cd_11_TwoPRep_11_int,
-                           Deriv_a_11_TwoPRep_11_int, Deriv_b_11_TwoPRep_11_int,
-                           Deriv_c_11_TwoPRep_11_int, Deriv_d_11_TwoPRep_11_int,
+  typedef boost::mpl::list<
+      HRR_ab_11_TwoPRep_11_int, HRR_cd_11_TwoPRep_11_int,
+      DerivV2_a_11_TwoPRep_11_int, DerivV2_b_11_TwoPRep_11_int,
+      DerivV2_c_11_TwoPRep_11_int, DerivV2_d_11_TwoPRep_11_int,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_a_11_TwoPRep_11_int, ITR_c_11_TwoPRep_11_int,
+      ITR_a_11_TwoPRep_11_int, ITR_c_11_TwoPRep_11_int,
 #endif
-                           VRR_a_11_TwoPRep_11_int, VRR_c_11_TwoPRep_11_int>
+      VRR_a_11_TwoPRep_11_int, VRR_c_11_TwoPRep_11_int>
       value;
 };
 #else  // 0B0D strategy
 template <>
 struct MasterStrategy<TwoPRep_11_11_sq> {
-  typedef boost::mpl::list<HRR_ba_11_TwoPRep_11_sh, HRR_dc_11_TwoPRep_11_sh,
-                           Deriv_a_11_TwoPRep_11_sh, Deriv_b_11_TwoPRep_11_sh,
-                           Deriv_c_11_TwoPRep_11_sh, Deriv_d_11_TwoPRep_11_sh,
+  typedef boost::mpl::list<
+      HRR_ba_11_TwoPRep_11_sh, HRR_dc_11_TwoPRep_11_sh,
+      DerivV2_a_11_TwoPRep_11_sh, DerivV2_b_11_TwoPRep_11_sh,
+      DerivV2_c_11_TwoPRep_11_sh, DerivV2_d_11_TwoPRep_11_sh,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_b_11_TwoPRep_11_sh, ITR_d_11_TwoPRep_11_sh,
+      ITR_b_11_TwoPRep_11_sh, ITR_d_11_TwoPRep_11_sh,
 #endif
-                           VRR_b_11_TwoPRep_11_sh, VRR_d_11_TwoPRep_11_sh>
+      VRR_b_11_TwoPRep_11_sh, VRR_d_11_TwoPRep_11_sh>
       value;
 };
 template <>
 struct MasterStrategy<TwoPRep_11_11_int> {
-  typedef boost::mpl::list<HRR_ba_11_TwoPRep_11_int, HRR_dc_11_TwoPRep_11_int,
-                           Deriv_a_11_TwoPRep_11_int, Deriv_b_11_TwoPRep_11_int,
-                           Deriv_c_11_TwoPRep_11_int, Deriv_d_11_TwoPRep_11_int,
+  typedef boost::mpl::list<
+      HRR_ba_11_TwoPRep_11_int, HRR_dc_11_TwoPRep_11_int,
+      DerivV2_a_11_TwoPRep_11_int, DerivV2_b_11_TwoPRep_11_int,
+      DerivV2_c_11_TwoPRep_11_int, DerivV2_d_11_TwoPRep_11_int,
 #if LIBINT_ERI_STRATEGY == 2
-                           ITR_b_11_TwoPRep_11_int, ITR_d_11_TwoPRep_11_int,
+      ITR_b_11_TwoPRep_11_int, ITR_d_11_TwoPRep_11_int,
 #endif
-                           VRR_b_11_TwoPRep_11_int, VRR_d_11_TwoPRep_11_int>
+      VRR_b_11_TwoPRep_11_int, VRR_d_11_TwoPRep_11_int>
       value;
 };
 #endif
@@ -214,21 +218,31 @@ struct MasterStrategy<DivG12prime_xTx_11_11_int> {
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_sq> {
-  typedef boost::mpl::list<HRR_ab_11_Dummy_11_sh, HRR_cd_11_Dummy_11_sh> value;
+  typedef boost::mpl::list<HRR_ab_11_Dummy_11_sh, HRR_cd_11_Dummy_11_sh,
+                           DerivV2_a_11_Dummy_11_sh, DerivV2_b_11_Dummy_11_sh,
+                           DerivV2_c_11_Dummy_11_sh, DerivV2_d_11_Dummy_11_sh>
+      value;
 };
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_int> {
-  typedef boost::mpl::list<HRR_ab_11_Dummy_11_int, HRR_cd_11_Dummy_11_int>
+  typedef boost::mpl::list<HRR_ab_11_Dummy_11_int, HRR_cd_11_Dummy_11_int,
+                           DerivV2_a_11_Dummy_11_int, DerivV2_b_11_Dummy_11_int,
+                           DerivV2_c_11_Dummy_11_int, DerivV2_d_11_Dummy_11_int>
       value;
 };
 #else  // 0B0D strategy
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_sq> {
-  typedef boost::mpl::list<HRR_ba_11_Dummy_11_sh, HRR_dc_11_Dummy_11_sh> value;
+  typedef boost::mpl::list<HRR_ba_11_Dummy_11_sh, HRR_dc_11_Dummy_11_sh,
+                           DerivV2_a_11_Dummy_11_sh, DerivV2_b_11_Dummy_11_sh,
+                           DerivV2_c_11_Dummy_11_sh, DerivV2_d_11_Dummy_11_sh>
+      value;
 };
 template <>
 struct MasterStrategy<DummySymmIntegral_11_11_int> {
-  typedef boost::mpl::list<HRR_ba_11_Dummy_11_int, HRR_dc_11_Dummy_11_int>
+  typedef boost::mpl::list<HRR_ba_11_Dummy_11_int, HRR_dc_11_Dummy_11_int,
+                           DerivV2_a_11_Dummy_11_int, DerivV2_b_11_Dummy_11_int,
+                           DerivV2_c_11_Dummy_11_int, DerivV2_d_11_Dummy_11_int>
       value;
 };
 #endif

From 9b50b0b46010916acb168d8f6bf77b1d3b7fa070 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Fri, 10 Apr 2026 14:02:01 -0400
Subject: [PATCH 16/22] Fix DerivGaussV2 target indexing for size-1 integral
 sets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

handle_trivial_nodes() used default_dims() (hardcoded "1") before
adapt_dims_() provided correct runtime dims ("lowdim"/"highdim").
Pass localdims through optimize_rr_out → handle_trivial_nodes.
---
 src/bin/libint/dg.cc | 10 +++++-----
 src/bin/libint/dg.h  |  9 +++++++--
 src/bin/libint/rr.cc |  6 ++++--
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/bin/libint/dg.cc b/src/bin/libint/dg.cc
index eded6a54f..9e03277ae 100644
--- a/src/bin/libint/dg.cc
+++ b/src/bin/libint/dg.cc
@@ -499,10 +499,11 @@ void DirectedGraph::apply_to(const std::shared_ptr<DGVertex>& vertex,
 
 // Optimize out simple recurrence relations
 void DirectedGraph::optimize_rr_out(
-    const std::shared_ptr<CodeContext>& context) {
+    const std::shared_ptr<CodeContext>& context,
+    const std::shared_ptr<ImplicitDimensions>& dims) {
   replace_rr_with_expr();
   remove_trivial_arithmetics();
-  handle_trivial_nodes(context);
+  handle_trivial_nodes(context, dims);
   remove_disconnected_vertices();
   find_subtrees();
 }
@@ -797,7 +798,8 @@ inline std::string to_vector_symbol(const std::shared_ptr<DGVertex>& v) {
 // refer to another node so that no code is generated for it.
 //
 void DirectedGraph::handle_trivial_nodes(
-    const std::shared_ptr<CodeContext>& context) {
+    const std::shared_ptr<CodeContext>& context,
+    const std::shared_ptr<ImplicitDimensions>& dims) {
   typedef vertices::iterator iter;
   for (iter v = stack_.begin(); v != stack_.end(); ++v) {
     const ver_ptr& vptr = vertex_ptr(*v);
@@ -821,8 +823,6 @@ void DirectedGraph::handle_trivial_nodes(
           // if (child->symbol_set() == false)
           {
             const std::string stack_name("stack");
-            const std::shared_ptr<ImplicitDimensions>& dims =
-                ImplicitDimensions::default_dims();
             std::string low_rank = dims->low_label();
             std::string veclen = dims->vecdim_label();
 
diff --git a/src/bin/libint/dg.h b/src/bin/libint/dg.h
index 0f85abc8f..19e96074c 100644
--- a/src/bin/libint/dg.h
+++ b/src/bin/libint/dg.h
@@ -22,6 +22,7 @@
 #define _libint2_src_bin_libint_dg_h_
 
 #include <dgvertex.h>
+#include <dims.h>
 #include <exception.h>
 #include <global_macros.h>
 #include <key.h>
@@ -253,7 +254,9 @@ class DirectedGraph : public std::enable_shared_from_this<DirectedGraph> {
       optimized away. optimize_rr_out() will replace all simple recurrence
      relations with code representing them.
    */
-  void optimize_rr_out(const std::shared_ptr<CodeContext>& context);
+  void optimize_rr_out(const std::shared_ptr<CodeContext>& context,
+                       const std::shared_ptr<ImplicitDimensions>& dims =
+                           ImplicitDimensions::default_dims());
 
   /** after all apply's have been called, traverse()
       construct a heuristic order of traversal for the graph.
@@ -438,7 +441,9 @@ class DirectedGraph : public std::enable_shared_from_this<DirectedGraph> {
   to their equivalents (such as (ss|ss) shell quartet can only be connected to
   (ss|ss) integral)
    */
-  void handle_trivial_nodes(const std::shared_ptr<CodeContext>& context);
+  void handle_trivial_nodes(const std::shared_ptr<CodeContext>& context,
+                            const std::shared_ptr<ImplicitDimensions>& dims =
+                                ImplicitDimensions::default_dims());
   /// This functions removes vertices not connected to other vertices
   void remove_disconnected_vertices();
   /** Finds (binary) subtrees. The subtrees correspond to a single-line code (no
diff --git a/src/bin/libint/rr.cc b/src/bin/libint/rr.cc
index 32bbb9df9..a653bf4b2 100644
--- a/src/bin/libint/rr.cc
+++ b/src/bin/libint/rr.cc
@@ -127,8 +127,11 @@ void RecurrenceRelation::generate_code(
   // Assign symbols for the target and source integral sets
   std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
   assign_symbols_(symbols);
+  // Compute local dimensions before optimize_rr_out so that
+  // handle_trivial_nodes uses the correct dims (e.g., "lowdim" instead of "1")
+  std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
   // Traverse the graph
-  dg->optimize_rr_out(context);
+  dg->optimize_rr_out(context, localdims);
   dg->traverse();
 #if PRINT_DAG_GRAPHVIZ
   {
@@ -138,7 +141,6 @@ void RecurrenceRelation::generate_code(
 #endif
   // Generate code
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
-  std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
   dg->generate_code(context, memman, localdims, symbols, funcname, decl, def);
 
   // extract all external symbols -- these will be members of the evaluator

From 1db978df470b632c98039c19484d2b77249ac696 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Fri, 10 Apr 2026 14:49:46 -0400
Subject: [PATCH 17/22] =?UTF-8?q?RKB=20CR=20code=20sharing:=20deduplicate?=
 =?UTF-8?q?=20generated=20Coulomb=CF=83p=CF=83p/=CF=83p=CF=83pCoulomb?=
 =?UTF-8?q?=CF=83p=CF=83p=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add code-sharing overrides (generate_label, spfunction_call, adapt_dims_,
generate_code) to CR_11_Coulombσpσp_11 and CR_11_σpσpCoulombσpσp_11.
Shell quartets with the same quaternion component share a single function
parameterized by highdim. Hand-emits element-wise loops to avoid TwoPRep
particle-swap child deduplication issues in DAG-based codegen.
---
 .../comp_11_Coulomb\317\203p\317\203p_11.h"   | 103 ++++++++++++++-
 ...3p\317\203pCoulomb\317\203p\317\203p_11.h" | 120 +++++++++++++++++-
 2 files changed, 215 insertions(+), 8 deletions(-)

diff --git "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
index 315135994..db28cb7ea 100644
--- "a/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_Coulomb\317\203p\317\203p_11.h"
@@ -21,8 +21,11 @@
 #ifndef LIBINT_COMP_11_COULOMBΣPΣP_11_H
 #define LIBINT_COMP_11_COULOMBΣPΣP_11_H
 
+#include <dims.h>
+#include <entity.h>
 #include <gaussoper.h>
 #include <generic_rr.h>
+#include <task.h>
 #include <twoprep_11_11.h>
 
 namespace libint2 {
@@ -60,16 +63,108 @@ class CR_11_Coulombσpσp_11
   using ParentType::RecurrenceRelation::expr_;
   using ParentType::RecurrenceRelation::nflops_;
 
-  /// Constructor is private, used by ParentType::Instance that maintains
+  /// Constructor is private, used by Instance that maintains
   /// registry of these objects
-  CR_11_Coulombσpσp_11(const std::shared_ptr<TargetType> &, unsigned int = 0);
+  CR_11_Coulombσpσp_11(const std::shared_ptr<TargetType>&, unsigned int = 0);
 
   static std::string descr() { return "CR"; }
+
+  // --- Code sharing overrides ---
+  // All shell combos with the same quaternion component share one function.
+
+  std::string generate_label() const override {
+    return "CR_Coulombopop_" +
+           std::to_string(target_->oper()->descr().quaternion_index());
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    std::ostringstream os;
+    os << context->label_to_function_name(this->label()) << "(inteval, "
+       << context->value_to_pointer(this->rr_target()->symbol());
+    const unsigned int nc = this->num_children();
+    for (unsigned int c = 0; c < nc; c++) {
+      os << ", " << context->value_to_pointer(this->rr_child(c)->symbol());
+    }
+    // total_dim = product of all shell dims (all 4 shells are spectators)
+    unsigned int total_dim = 1;
+    for (unsigned int p = 0; p < 2; p++) {
+      // member_subiter() returns an owning raw pointer: hold it in a
+      // unique_ptr so it is released even if num_iter() throws
+      std::unique_ptr<SubIterator> bra_si(target_->bra().member_subiter(p, 0));
+      total_dim *= bra_si->num_iter();
+      std::unique_ptr<SubIterator> ket_si(target_->ket().member_subiter(p, 0));
+      total_dim *= ket_si->num_iter();
+    }
+    os << "," << total_dim;
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    taskmgr.current().params()->max_hrr_hsrank(total_dim);
+    os << ")" << context->end_of_stat() << std::endl;
+    return os.str();
+  }
+
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    auto high_dim = std::make_shared<RTimeEntity<EntityTypes::Int>>("highdim");
+    return std::make_shared<ImplicitDimensions>(high_dim, dims->low(),
+                                                dims->vecdim());
+  }
+
+  /// Hand-emit a simple element-wise loop function.
+  /// comp 0: target = src0 + src1 + src2 (dot product)
+  /// comp 1-3: target = src0 - src1 (cross product components)
+  void generate_code(const std::shared_ptr<CodeContext>& context,
+                     const std::shared_ptr<ImplicitDimensions>& dims,
+                     const std::string& funcname, std::ostream& decl,
+                     std::ostream& def) override {
+    // the emitter below only knows the 3-term (dot) and 2-term (cross) forms
+    const unsigned int nc = this->num_children();
+    assert(nc == 2 || nc == 3);
+    // declare_function lives in dg.cc
+    extern std::string declare_function(
+        const std::shared_ptr<CodeContext>& context,
+        const std::shared_ptr<ImplicitDimensions>& dims,
+        const std::shared_ptr<CodeSymbols>& args, const std::string& tlabel,
+        const std::string& function_descr, std::ostream& decl);
+
+    std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
+    // inline assign_symbols_: name target/children and fill CodeSymbols
+    std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
+    this->rr_target()->set_symbol("target");
+    symbols->append_symbol("target");
+    for (unsigned int c = 0; c < this->num_children(); c++) {
+      std::string symb = "src" + std::to_string(c);
+      this->rr_child(c)->set_symbol(symb);
+      symbols->append_symbol(symb);
+    }
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    const std::string tlabel = taskmgr.current().label();
+    const std::string func_decl =
+        declare_function(context, localdims, symbols, tlabel, funcname, decl);
+    def << context->std_header();
+    def << "#include <" << context->label_to_name(funcname) << ".h>\n\n";
+    def << context->code_prefix();
+    def << func_decl << context->open_block() << std::endl;
+    def << context->std_function_header();
+    def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
+    def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
+    def << "target[hsi] = ";
+    if (nc == 3) {
+      def << "src0[hsi] + src1[hsi] + src2[hsi]";
+    } else if (nc == 2) {
+      def << "src0[hsi] - src1[hsi]";
+    }
+    def << ";\n}\n";
+    def << "/** Number of flops = " << ((nc > 1) ? nc - 1 : 0) << " */\n";
+    def << context->close_block() << std::endl;
+    def << context->code_postfix();
+  }
 };
 
 template <typename F>
 CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11(
-    const std::shared_ptr<TargetType> &Tint, unsigned int)
+    const std::shared_ptr<TargetType>& Tint, unsigned int)
     : ParentType(Tint, 0) {
   assert(Tint->num_func_bra(/* particle */ 0) == 1);
   assert(Tint->num_func_bra(/* particle */ 1) == 1);
@@ -81,7 +176,7 @@ CR_11_Coulombσpσp_11<F>::CR_11_Coulombσpσp_11(
   F c(Tint->bra(1, 0));
   F d(Tint->ket(1, 0));
 
-  const auto &oper = Tint->oper();
+  const auto& oper = Tint->oper();
 
   if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
     return;
diff --git "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h" "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
index 606418d0b..64d1fdee6 100644
--- "a/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
+++ "b/src/bin/libint/comp_11_\317\203p\317\203pCoulomb\317\203p\317\203p_11.h"
@@ -21,8 +21,11 @@
 #ifndef LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
 #define LIBINT_COMP_11_ΣPΣPCOULOMBΣPΣP_11_H
 
+#include <dims.h>
+#include <entity.h>
 #include <gaussoper.h>
 #include <generic_rr.h>
+#include <task.h>
 #include <twoprep_11_11.h>
 
 namespace libint2 {
@@ -88,17 +91,126 @@ class CR_11_σpσpCoulombσpσp_11
   using ParentType::RecurrenceRelation::expr_;
   using ParentType::RecurrenceRelation::nflops_;
 
-  /// Constructor is private, used by ParentType::Instance that maintains
+  /// Constructor is private, used by Instance that maintains
   /// registry of these objects
-  CR_11_σpσpCoulombσpσp_11(const std::shared_ptr<TargetType> &,
+  CR_11_σpσpCoulombσpσp_11(const std::shared_ptr<TargetType>&,
                            unsigned int = 0);
 
   static std::string descr() { return "CR"; }
+
+  // --- Code sharing overrides ---
+  // All shell combos with the same quaternion component share one function.
+
+  std::string generate_label() const override {
+    return "CR_opopCoulombopop_" +
+           std::to_string(target_->oper()->descr().quaternion_index());
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    std::ostringstream os;
+    os << context->label_to_function_name(this->label()) << "(inteval, "
+       << context->value_to_pointer(this->rr_target()->symbol());
+    const unsigned int nc = this->num_children();
+    for (unsigned int c = 0; c < nc; c++) {
+      os << ", " << context->value_to_pointer(this->rr_child(c)->symbol());
+    }
+    // total_dim = product of all shell dims (all 4 shells are spectators)
+    unsigned int total_dim = 1;
+    for (unsigned int p = 0; p < 2; p++) {
+      // member_subiter() returns an owning raw pointer: hold it in a
+      // unique_ptr so it is released even if num_iter() throws
+      std::unique_ptr<SubIterator> bra_si(target_->bra().member_subiter(p, 0));
+      total_dim *= bra_si->num_iter();
+      std::unique_ptr<SubIterator> ket_si(target_->ket().member_subiter(p, 0));
+      total_dim *= ket_si->num_iter();
+    }
+    os << "," << total_dim;
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    taskmgr.current().params()->max_hrr_hsrank(total_dim);
+    os << ")" << context->end_of_stat() << std::endl;
+    return os.str();
+  }
+
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    auto high_dim = std::make_shared<RTimeEntity<EntityTypes::Int>>("highdim");
+    return std::make_shared<ImplicitDimensions>(high_dim, dims->low(),
+                                                dims->vecdim());
+  }
+
+  /// Hand-emit a simple element-wise loop function.
+  /// Cannot use S-shell dummy because TwoPRep particle-swap canonicalization
+  /// deduplicates children (e.g., (S_x S_x|S_y S_y) = (S_y S_y|S_x S_x)),
+  /// giving fewer children than the real instance.
+  void generate_code(const std::shared_ptr<CodeContext>& context,
+                     const std::shared_ptr<ImplicitDimensions>& dims,
+                     const std::string& funcname, std::ostream& decl,
+                     std::ostream& def) override {
+    // the emitter below only knows the 9-, 6- and 4-term sign patterns
+    const unsigned int nc = this->num_children();
+    assert(nc == 9 || nc == 6 || nc == 4);
+    // declare_function lives in dg.cc
+    extern std::string declare_function(
+        const std::shared_ptr<CodeContext>& context,
+        const std::shared_ptr<ImplicitDimensions>& dims,
+        const std::shared_ptr<CodeSymbols>& args, const std::string& tlabel,
+        const std::string& function_descr, std::ostream& decl);
+
+    std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
+    // inline assign_symbols_: name target/children and fill CodeSymbols
+    std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
+    this->rr_target()->set_symbol("target");
+    symbols->append_symbol("target");
+    for (unsigned int c = 0; c < this->num_children(); c++) {
+      std::string symb = "src" + std::to_string(c);
+      this->rr_child(c)->set_symbol(symb);
+      symbols->append_symbol(symb);
+    }
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    const std::string tlabel = taskmgr.current().label();
+    const std::string func_decl =
+        declare_function(context, localdims, symbols, tlabel, funcname, decl);
+    def << context->std_header();
+    def << "#include <" << context->label_to_name(funcname) << ".h>\n\n";
+    def << context->code_prefix();
+    def << func_decl << context->open_block() << std::endl;
+    def << context->std_function_header();
+    // Sign patterns for each component, indexed by child order in constructor.
+    // comp 0 (SS): 9 children, all +1
+    // comp 1-3 (SX,SY,SZ): 6 children, alternating +1,-1
+    // comp 4,8,12 (XS,YS,ZS): 6 children, alternating +1,-1
+    // comp 5-7,9-11,13-15 (XX..ZZ): 4 children, pattern -1,+1,+1,-1
+    def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
+    def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
+    def << "target[hsi] = ";
+    if (nc == 9) {
+      // SS component: all positive
+      for (unsigned int c = 0; c < 9; c++) {
+        if (c > 0) def << " + ";
+        def << "src" << c << "[hsi]";
+      }
+    } else if (nc == 6) {
+      // SX,SY,SZ,XS,YS,ZS: alternating +/-
+      for (unsigned int c = 0; c < 6; c++) {
+        def << ((c % 2 == 0) ? " + " : " - ");
+        def << "src" << c << "[hsi]";
+      }
+    } else if (nc == 4) {
+      // XX,XY,...,ZZ: -src0 + src1 + src2 - src3
+      def << "- src0[hsi] + src1[hsi] + src2[hsi] - src3[hsi]";
+    }
+    def << ";\n}\n";
+    def << "/** Number of flops = " << ((nc > 1) ? nc - 1 : 0) << " */\n";
+    def << context->close_block() << std::endl;
+    def << context->code_postfix();
+  }
 };
 
 template <typename F>
 CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11(
-    const std::shared_ptr<TargetType> &Tint, unsigned int)
+    const std::shared_ptr<TargetType>& Tint, unsigned int)
     : ParentType(Tint, 0) {
   assert(Tint->num_func_bra(/* particle */ 0) == 1);
   assert(Tint->num_func_bra(/* particle */ 1) == 1);
@@ -110,7 +222,7 @@ CR_11_σpσpCoulombσpσp_11<F>::CR_11_σpσpCoulombσpσp_11(
   F c(Tint->bra(1, 0));
   F d(Tint->ket(1, 0));
 
-  const auto &oper = Tint->oper();
+  const auto& oper = Tint->oper();
 
   if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
     return;

From 5104e816d9ac650f83f5031dba9ac3a41bf63323 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Fri, 17 Apr 2026 16:20:02 -0400
Subject: [PATCH 18/22] Add op_coulomb_op: 9-component Gaunt LS bilinear
 integral
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

(μ ∂_a ν | 1/r12 | κ ∂_b λ) for a, b ∈ {x,y,z}. Needed for Gaunt LS
Fock where coulomb_opop's 4 folded outputs are rank-deficient. Only
2-fold bra↔ket symmetry (p1_p2_swappable, no within-side swap); uses
dedicated predicate la+lb <= lc+ld. Also: LIBINT2_SIMPLE_CORE_EVAL_CASE
macro to compactify Coulomb-family dispatch in engine.impl.h.
---
 export/tests/unit/test-2body.cc         | 103 +++++++++++
 include/libint2/engine.h                |  16 ++
 include/libint2/engine.impl.h           | 119 +++++++------
 src/bin/libint/build_libint.cc          |  51 ++++--
 src/bin/libint/comp_11_opCoulombop_11.h | 216 ++++++++++++++++++++++++
 src/bin/libint/master_ints_list.h       |  10 +-
 src/bin/libint/master_rrs_list.h        |   4 +
 src/bin/libint/oper.h                   |  49 ++++++
 src/bin/libint/strategy.cc              |   8 +
 9 files changed, 516 insertions(+), 60 deletions(-)
 create mode 100644 src/bin/libint/comp_11_opCoulombop_11.h

diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index 708190b90..cee9b0800 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -644,6 +644,109 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
       }
     }
   }
+
+  SECTION("op_coulomb_op") {
+    Engine engine_opCop;
+    try {
+      engine_opCop = Engine(Operator::op_coulomb_op, max_nprim, max_l, 0);
+    } catch (Engine::lmax_exceeded &) {
+      return;
+    }
+
+    const auto nshell = obs.size();
+    for (int s0 = 0; s0 != nshell; ++s0) {
+      for (int s1 = 0; s1 != nshell; ++s1) {
+        for (int s2 = 0; s2 != nshell; ++s2) {
+          for (int s3 = 0; s3 != nshell; ++s3) {
+            const auto &results =
+                engine_opCop.compute(obs[s0], obs[s1], obs[s2], obs[s3]);
+            REQUIRE(results.size() == 9);
+
+            LIBINT2_REF_REALTYPE Aref[3], Bref[3], Cref[3], Dref[3];
+            for (int i = 0; i < 3; ++i) Aref[i] = obs[s0].O[i];
+            for (int i = 0; i < 3; ++i) Bref[i] = obs[s1].O[i];
+            for (int i = 0; i < 3; ++i) Cref[i] = obs[s2].O[i];
+            for (int i = 0; i < 3; ++i) Dref[i] = obs[s3].O[i];
+
+            int ijkl = 0;
+
+            int l0, m0, n0;
+            FOR_CART(l0, m0, n0, obs[s0].contr[0].l)
+            int l1, m1, n1;
+            FOR_CART(l1, m1, n1, obs[s1].contr[0].l)
+            int l2, m2, n2;
+            FOR_CART(l2, m2, n2, obs[s2].contr[0].l)
+            int l3, m3, n3;
+            FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
+
+            std::array<LIBINT2_REF_REALTYPE, 9> ref_op_coulomb_op{};
+            ref_op_coulomb_op.fill(0.0);
+
+            for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
+              for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
+                for (uint p2 = 0; p2 < obs[s2].nprim(); p2++) {
+                  for (uint p3 = 0; p3 < obs[s3].nprim(); p3++) {
+                    const LIBINT2_REF_REALTYPE alpha0 = obs[s0].alpha[p0];
+                    const LIBINT2_REF_REALTYPE alpha1 = obs[s1].alpha[p1];
+                    const LIBINT2_REF_REALTYPE alpha2 = obs[s2].alpha[p2];
+                    const LIBINT2_REF_REALTYPE alpha3 = obs[s3].alpha[p3];
+                    const LIBINT2_REF_REALTYPE c0 = obs[s0].contr[0].coeff[p0];
+                    const LIBINT2_REF_REALTYPE c1 = obs[s1].contr[0].coeff[p1];
+                    const LIBINT2_REF_REALTYPE c2 = obs[s2].contr[0].coeff[p2];
+                    const LIBINT2_REF_REALTYPE c3 = obs[s3].contr[0].coeff[p3];
+                    const LIBINT2_REF_REALTYPE c0123 = c0 * c1 * c2 * c3;
+
+                    // One derivative on ν (center B, der_idx slots 3..5) and
+                    // one on λ (center D, der_idx slots 9..11).
+                    auto didx_bd = [](int a, int b) -> der_idx {
+                      der_idx r = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+                      r[3 + a] = 1;
+                      r[9 + b] = 1;
+                      return r;
+                    };
+                    auto D = [&](int a, int b) {
+                      auto di = didx_bd(a, b);
+                      return eri(di.data(), l0, m0, n0, alpha0, Aref, l1, m1,
+                                 n1, alpha1, Bref, l2, m2, n2, alpha2, Cref, l3,
+                                 m3, n3, alpha3, Dref, 0);
+                    };
+                    for (int a = 0; a < 3; ++a)
+                      for (int b = 0; b < 3; ++b)
+                        ref_op_coulomb_op[3 * a + b] += c0123 * D(a, b);
+                  }
+                }
+              }
+            }
+
+            const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
+            const double RELATIVE_DEVIATION_THRESHOLD = 1.0E-9;
+            for (auto comp = 0; comp < 9; ++comp) {
+              auto abs_err = abs(ref_op_coulomb_op[comp] - results[comp][ijkl]);
+              auto rel_abs_err = abs(abs_err / ref_op_coulomb_op[comp]);
+              bool not_ok = rel_abs_err > RELATIVE_DEVIATION_THRESHOLD &&
+                            abs_err > ABSOLUTE_DEVIATION_THRESHOLD;
+              if (not_ok) {
+                std::cout << "(l0 l1| l2 l3) = (" << s0 << " " << s1 << " | "
+                          << s2 << " " << s3 << ") Elem " << ijkl
+                          << " comp= " << comp
+                          << " : ref = " << ref_op_coulomb_op[comp]
+                          << " libint = " << results[comp][ijkl]
+                          << " relabs_error = " << rel_abs_err
+                          << " abs_error = " << abs_err << std::endl;
+              }
+              REQUIRE(!not_ok);
+            }
+
+            ++ijkl;
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+            END_FOR_CART
+          }
+        }
+      }
+    }
+  }
 }
 
 TEST_CASE("Erfx_Coulomb integrals", "[engine][2-body]") {
diff --git a/include/libint2/engine.h b/include/libint2/engine.h
index fae7d1785..f99c6a898 100644
--- a/include/libint2/engine.h
+++ b/include/libint2/engine.h
@@ -160,6 +160,13 @@ enum class Operator {
   /// where b1 & b2 are centers of bra1 and bra2 and k1  & k2 are centers of
   /// ket1 and ket2, respectively
   opop_coulomb_opop,
+  /// (2-body) \f$ (σ.p_{b2}) r_{12}^{-1} (σ.p_{k2}) \f$ where b2 is the center
+  /// of bra2 and k2 is the center of ket2; Gaunt LS "bilinear" integral.
+  /// Produces 9 components (outer product of two Cartesian directions),
+  /// indexed as `3*a + b` with `a` = bra-side direction, `b` = ket-side
+  /// direction, and `a,b ∈ {x=0, y=1, z=2}`. Unlike coulomb_opop, the 9
+  /// components are NOT contracted via σ·σ — all are kept independent.
+  op_coulomb_op,
   /// contracted Gaussian geminal
   cgtg,
   /// contracted Gaussian geminal times Coulomb
@@ -369,6 +376,15 @@ struct operator_traits<Operator::opop_coulomb_opop>
   static constexpr auto nopers = 16;
   static constexpr auto intrinsic_deriv_order = 4;
 };
+template <>
+struct operator_traits<Operator::op_coulomb_op>
+    : public operator_traits<Operator::coulomb> {
+  /// 9 components: Cartesian dyadic of the two (σ·p) directions.
+  /// index = 3 * a + b, with a = bra-side direction, b = ket-side direction,
+  /// a,b ∈ {x=0, y=1, z=2}.
+  static constexpr auto nopers = 9;
+  static constexpr auto intrinsic_deriv_order = 2;
+};
 
 namespace detail {
 template <int K>
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index ff143d254..27187cbcc 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -70,32 +70,33 @@ typename std::remove_all_extents<T>::type* to_ptr1(T (&a)[N]) {
 /// These MUST appear in the same order as in Operator.
 /// You must also update BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX when you add
 /// one-body ints
-#define BOOST_PP_NBODY_OPERATOR_LIST                     \
-  (overlap,                         /* overlap */        \
-   (kinetic,                        /* kinetic */        \
-    (elecpot,                       /* nuclear */        \
-     (elecpot,                      /* erf_nuclear */    \
-      (elecpot,                     /* erfc_nuclear */   \
-       (elecpot,                    /* erfx_nuclear */   \
-        (1emultipole,               /* emultipole1 */    \
-         (2emultipole,              /* emultipole2 */    \
-          (3emultipole,             /* emultipole3 */    \
-           (sphemultipole,          /* sphemultipole */  \
-            (opVop,                 /* opVop */          \
-             (eri,                  /* delta */          \
-              (eri,                 /* coulomb */        \
-               (coulomb_opop,       /* coulomb_opop */   \
-                (opop_coulomb_opop, /* coulomb_opop */   \
-                 (eri,              /* cgtg */           \
-                  (eri,             /* cgtg_x_coulomb */ \
-                   (eri,            /* delcgtg2 */       \
-                    (eri,           /* r12 */            \
-                     (eri,          /* erf_coulomb */    \
-                      (eri,         /* erfc_coulomb */   \
-                       (eri,        /* erfx_coulomb */   \
-                        (eri,       /* stg */            \
-                         (eri,      /* yukawa */         \
-                          BOOST_PP_NIL))))))))))))))))))))))))
+#define BOOST_PP_NBODY_OPERATOR_LIST                        \
+  (overlap,                         /* overlap */           \
+   (kinetic,                        /* kinetic */           \
+    (elecpot,                       /* nuclear */           \
+     (elecpot,                      /* erf_nuclear */       \
+      (elecpot,                     /* erfc_nuclear */      \
+       (elecpot,                    /* erfx_nuclear */      \
+        (1emultipole,               /* emultipole1 */       \
+         (2emultipole,              /* emultipole2 */       \
+          (3emultipole,             /* emultipole3 */       \
+           (sphemultipole,          /* sphemultipole */     \
+            (opVop,                 /* opVop */             \
+             (eri,                  /* delta */             \
+              (eri,                 /* coulomb */           \
+               (coulomb_opop,       /* coulomb_opop */      \
+                (opop_coulomb_opop, /* opop_coulomb_opop */ \
+                 (op_coulomb_op,    /* op_coulomb_op */     \
+                  (eri,             /* cgtg */              \
+                   (eri,            /* cgtg_x_coulomb */    \
+                    (eri,           /* delcgtg2 */          \
+                     (eri,          /* r12 */               \
+                      (eri,         /* erf_coulomb */       \
+                       (eri,        /* erfc_coulomb */      \
+                        (eri,       /* erfx_coulomb */      \
+                         (eri,      /* stg */               \
+                          (eri,     /* yukawa */            \
+                           BOOST_PP_NIL)))))))))))))))))))))))))
 
 #define BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE \
   BOOST_PP_MAKE_TUPLE(BOOST_PP_LIST_SIZE(BOOST_PP_NBODY_OPERATOR_LIST))
@@ -1243,6 +1244,18 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
       const bool swap_p1p2 = (tket1.contr[0].l < tket2.contr[0].l);
       swap_tbra = swap_tket = swap_p1p2;
     }
+  } else if (oper_ == Operator::op_coulomb_op) {
+    // opCoulombop: only bra↔ket (particle 1↔2) swap is a symmetry (with
+    // (a,b)↔(b,a) component remap). Within-side swap is NOT a symmetry
+    // because σ·p attaches to one specific function per side; moving it to
+    // the other function changes the integral in a way IBP cannot recover
+    // across electrons. Canonical form: la+lb <= lc+ld only.
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    swap_braket = ((braket_ == BraKet::xx_xx) && (bra_total > ket_total)) ||
+                  braket_ == BraKet::xx_xs;
+    swap_tbra = false;
+    swap_tket = false;
   } else {
     swap_braket = ((braket_ == BraKet::xx_xx) &&
                    (tbra1.contr[0].l + tbra2.contr[0].l >
@@ -1276,6 +1289,15 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
       const bool swap_p1p2 = (tket1.contr[0].l > tket2.contr[0].l);
       swap_tbra = swap_tket = swap_p1p2;
     }
+  } else if (oper_ == Operator::op_coulomb_op) {
+    // opCoulombop: only bra↔ket swap is a symmetry (with (a,b)↔(b,a) remap).
+    // ORCA canonical form: la+lb >= lc+ld only.
+    const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
+    const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
+    swap_braket = ((braket_ == BraKet::xx_xx) && (bra_total < ket_total)) ||
+                  braket_ == BraKet::xx_xs;
+    swap_tbra = false;
+    swap_tket = false;
   } else {
     swap_tbra = (tbra1.contr[0].l > tbra2.contr[0].l);
     swap_tket = (tket1.contr[0].l > tket2.contr[0].l);
@@ -1482,28 +1504,21 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
             const auto mmax = l + deriv_order_ + intrinsic_deriv_order();
 
             if (!skip_core_ints) {
+// Simple core-eval dispatch: just `core_eval_ptr->eval(gm_ptr, T, mmax)`.
+// Applies to Coulomb-family operators whose core integral is the bare Fm.
+#define LIBINT2_SIMPLE_CORE_EVAL_CASE(OP)                           \
+  case Operator::OP: {                                              \
+    const auto& core_eval_ptr =                                     \
+        any_cast<const detail::core_eval_pack_type<Operator::OP>&>( \
+            core_eval_pack_)                                        \
+            .first();                                               \
+    core_eval_ptr->eval(gm_ptr, T, mmax);                           \
+  } break
               switch (oper_) {
-                case Operator::coulomb: {
-                  const auto& core_eval_ptr =
-                      any_cast<const detail::core_eval_pack_type<
-                          Operator::coulomb>&>(core_eval_pack_)
-                          .first();
-                  core_eval_ptr->eval(gm_ptr, T, mmax);
-                } break;
-                case Operator::coulomb_opop: {
-                  const auto& core_eval_ptr =
-                      any_cast<const detail::core_eval_pack_type<
-                          Operator::coulomb_opop>&>(core_eval_pack_)
-                          .first();
-                  core_eval_ptr->eval(gm_ptr, T, mmax);
-                } break;
-                case Operator::opop_coulomb_opop: {
-                  const auto& core_eval_ptr =
-                      any_cast<const detail::core_eval_pack_type<
-                          Operator::opop_coulomb_opop>&>(core_eval_pack_)
-                          .first();
-                  core_eval_ptr->eval(gm_ptr, T, mmax);
-                } break;
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(coulomb);
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(coulomb_opop);
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(opop_coulomb_opop);
+                LIBINT2_SIMPLE_CORE_EVAL_CASE(op_coulomb_op);
                 case Operator::cgtg_x_coulomb: {
                   const auto& core_eval_ptr =
                       any_cast<const detail::core_eval_pack_type<
@@ -1622,6 +1637,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                   assert(false && "missing case in a switch");  // unreachable
                   abort();
               }
+#undef LIBINT2_SIMPLE_CORE_EVAL_CASE
             }
 
             for (auto m = 0; m != mmax + 1; ++m) {
@@ -2241,6 +2257,15 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
         for (auto s = 0; s != ntargets; ++s)
           targets_[4 * (s % 4) + (s / 4)] = temp[s];
       }
+      // For op_coulomb_op with swap_braket: (a,b) → (b,a) because bra↔ket swap
+      // exchanges which side each σ·p direction came from. Layout is
+      // index = 3*a + b, so the remap is s_new = 3*(s%3) + (s/3).
+      if (permute && oper_ == Operator::op_coulomb_op && swap_braket) {
+        std::array<const value_type*, 9> temp;
+        for (auto s = 0; s != ntargets; ++s) temp[s] = targets_[s];
+        for (auto s = 0; s != ntargets; ++s)
+          targets_[3 * (s % 3) + (s / 3)] = temp[s];
+      }
     }       // if need_scratch => needed to transpose and/or tform
     else {  // did not use scratch? may still need to update targets_
       if (set_targets_) {
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 4e9b10152..fbc8ac07b 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -609,8 +609,10 @@ void try_main(int argc, char* argv[]) {
 #endif
 
 #ifdef LIBINT_INCLUDE_RKB_ERI
-#define BOOST_PP_RKB_ERI_TASK_TUPLE (coulomb_opop, opop_coulomb_opop)
-#define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE (CoulombσpσpOper, σpσpCoulombσpσpOper)
+#define BOOST_PP_RKB_ERI_TASK_TUPLE \
+  (coulomb_opop, opop_coulomb_opop, op_coulomb_op)
+#define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE \
+  (CoulombσpσpOper, σpσpCoulombσpσpOper, opCoulombopOper)
 #define BOOST_PP_RKB_ERI_TASK_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_TUPLE)
 #define BOOST_PP_RKB_ERI_TASK_OPER_LIST \
@@ -1148,7 +1150,14 @@ static void build_TwoPRep_2b_2k(
   std::shared_ptr<CodeContext> context(new CppCodeContext(cparams));
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
-  bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value;
+  // opCoulombop has only a 2-fold bra↔ket-swap symmetry (with (a,b)↔(b,a)
+  // component remap). Within-side particle swap is NOT a symmetry because σ·p
+  // attaches to one specific function per side (ν on bra, λ on ket); swapping
+  // moves the operator to a different physical center that IBP cannot recover
+  // when centers differ. Emit code for every (la,lb,lc,ld) combination to
+  // avoid triggering any within-side swap at runtime.
+  bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value &&
+                         !std::is_same<OperType, opCoulombopOper>::value;
   bool bra_ket_coswappable = std::is_same<OperType, σpσpCoulombσpσpOper>::value;
 
   // Note: la, lb, lc, ld generate code for chemist notation (ab|O|cd), where O
@@ -1157,10 +1166,23 @@ static void build_TwoPRep_2b_2k(
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
-                  LIBINT_SHELL_SET)>::value(la, lb, lc, ld, p1_p2_swappable,
-                                            bra_ket_coswappable))
-            continue;
+          // opCoulombop has only a bra↔ket (particle 1↔2) swap symmetry;
+          // within-side swap is NOT a symmetry (σ·p would move to the wrong
+          // physical center). Canonical form: la+lb <= lc+ld only
+          // (ORCA: la+lb >= lc+ld). Use a dedicated predicate so within-side
+          // orderings are not reduced away.
+          if constexpr (std::is_same<OperType, opCoulombopOper>::value) {
+#if LIBINT_SHELL_SET == LIBINT_SHELL_SET_STANDARD
+            if (!(la + lb <= lc + ld)) continue;
+#else
+            if (!(la + lb >= lc + ld)) continue;
+#endif
+          } else {
+            if (!ShellQuartetSetPredicate<static_cast<ShellSetType>(
+                    LIBINT_SHELL_SET)>::value(la, lb, lc, ld, p1_p2_swappable,
+                                              bra_ket_coswappable))
+              continue;
+          }
 
           // std::shared_ptr<Tactic> tactic(new ParticleDirectionTactic(la+lb >
           // lc+ld ? false : true));
@@ -1194,6 +1216,15 @@ static void build_TwoPRep_2b_2k(
               descrs.emplace_back(OperDescrType(p));
             }
           }
+          if constexpr (std::is_same<OperType, opCoulombopOper>::value) {
+            // reset descriptors array
+            descrs.resize(0);
+            // iterate over 9 components (3x3 Cartesian dyadic: bra-dir ×
+            // ket-dir)
+            for (int p = 0; p != 9; ++p) {
+              descrs.emplace_back(OperDescrType(p));
+            }
+          }
 
           // unroll only if max_am <= cparams->max_am_opt(task)
           using std::max;
@@ -2376,9 +2407,9 @@ void config_to_api(const std::shared_ptr<CompilationParameters>& cparams,
 
       {  // 2-body ints
 
-#define BOOST_PP_TWOBODY_TASKOPER_TUPLE                                \
-  ("eri", "coulomb_opop", "opop_coulomb_opop", "r12kg12", "r12_0_g12", \
-   "r12_2_g12", "g12_T1_g12", "g12dkh")
+#define BOOST_PP_TWOBODY_TASKOPER_TUPLE                                    \
+  ("eri", "coulomb_opop", "opop_coulomb_opop", "op_coulomb_op", "r12kg12", \
+   "r12_0_g12", "r12_2_g12", "g12_T1_g12", "g12dkh")
 #define BOOST_PP_TWOBODY_TASKOPER_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_TWOBODY_TASKOPER_TUPLE)
 
diff --git a/src/bin/libint/comp_11_opCoulombop_11.h b/src/bin/libint/comp_11_opCoulombop_11.h
new file mode 100644
index 000000000..a6cbb7744
--- /dev/null
+++ b/src/bin/libint/comp_11_opCoulombop_11.h
@@ -0,0 +1,216 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_11_OPCOULOMBOP_11_H
+#define LIBINT_COMP_11_OPCOULOMBOP_11_H
+
+#include <dims.h>
+#include <entity.h>
+#include <gaussoper.h>
+#include <generic_rr.h>
+#include <task.h>
+#include <twoprep_11_11.h>
+
+namespace libint2 {
+
+/**
+ * Computes the "Gaunt LS bilinear" integral
+ *   \f$ (\mu\, \sigma\cdot\hat{p}\,\nu | 1/r_{12} | \kappa\,
+ * \sigma\cdot\hat{p}\,\lambda ) \f$ by rewriting each of the 9 (a,b) components
+ * as a single derivative Coulomb integral \f$ ( \mu \cdot \partial_a \nu |
+ * 1/r_{12} | \kappa \cdot \partial_b \lambda ) \f$ with a ∈ {x,y,z} on
+ * bra-function-1 (ν) and b ∈ {x,y,z} on ket-function-1 (λ).
+ *
+ * Unlike Coulombσpσp (which folds 9 → 4 via σ·σ on the ket pair), all 9
+ * components are exposed independently, since the two σ's here act on different
+ * particles and their contraction cannot be absorbed locally.
+ *
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_11_opCoulombop_11
+    : public GenericRecurrenceRelation<
+          CR_11_opCoulombop_11<F>, F,
+          GenIntegralSet_11_11<F, opCoulombopOper, mType>> {
+ public:
+  typedef CR_11_opCoulombop_11<F> ThisType;
+  typedef F BasisFunctionType;
+  typedef opCoulombopOper OperType;
+  typedef GenIntegralSet_11_11<F, opCoulombopOper, mType> TargetType;
+  typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
+      ParentType;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 1;
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private, used by Instance that maintains
+  /// registry of these objects
+  CR_11_opCoulombop_11(const std::shared_ptr<TargetType>&, unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+
+  // --- Code sharing overrides (mirror Coulombσpσp pattern) ---
+  // All shell quartets with the same quaternion component share one function.
+
+  std::string generate_label() const override {
+    return "CR_opCoulombop_" +
+           std::to_string(target_->oper()->descr().cartesian_index());
+  }
+
+  std::string spfunction_call(
+      const std::shared_ptr<CodeContext>& context,
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    std::ostringstream os;
+    os << context->label_to_function_name(this->label()) << "(inteval, "
+       << context->value_to_pointer(this->rr_target()->symbol());
+    const unsigned int nc = this->num_children();
+    for (unsigned int c = 0; c < nc; c++) {
+      os << ", " << context->value_to_pointer(this->rr_child(c)->symbol());
+    }
+    // total_dim = product of all shell dims (all 4 shells are spectators)
+    unsigned int total_dim = 1;
+    for (unsigned int p = 0; p < 2; p++) {
+      SubIterator* si = target_->bra().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+      si = target_->ket().member_subiter(p, 0);
+      total_dim *= si->num_iter();
+      delete si;
+    }
+    os << "," << total_dim;
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    taskmgr.current().params()->max_hrr_hsrank(total_dim);
+    os << ")" << context->end_of_stat() << std::endl;
+    return os.str();
+  }
+
+  std::shared_ptr<ImplicitDimensions> adapt_dims_(
+      const std::shared_ptr<ImplicitDimensions>& dims) const override {
+    auto high_dim = std::make_shared<RTimeEntity<EntityTypes::Int>>("highdim");
+    return std::make_shared<ImplicitDimensions>(high_dim, dims->low(),
+                                                dims->vecdim());
+  }
+
+  /// Each of the 9 components is a single deriv-ERI child ⇒ trivial passthrough
+  /// loop.
+  void generate_code(const std::shared_ptr<CodeContext>& context,
+                     const std::shared_ptr<ImplicitDimensions>& dims,
+                     const std::string& funcname, std::ostream& decl,
+                     std::ostream& def) override {
+    extern std::string declare_function(
+        const std::shared_ptr<CodeContext>& context,
+        const std::shared_ptr<ImplicitDimensions>& dims,
+        const std::shared_ptr<CodeSymbols>& args, const std::string& tlabel,
+        const std::string& function_descr, std::ostream& decl);
+
+    std::shared_ptr<ImplicitDimensions> localdims = adapt_dims_(dims);
+    std::shared_ptr<CodeSymbols> symbols(new CodeSymbols);
+    this->rr_target()->set_symbol("target");
+    symbols->append_symbol("target");
+    for (unsigned int c = 0; c < this->num_children(); c++) {
+      std::string symb = "src" + std::to_string(c);
+      this->rr_child(c)->set_symbol(symb);
+      symbols->append_symbol(symb);
+    }
+    LibraryTaskManager& taskmgr = LibraryTaskManager::Instance();
+    const std::string tlabel = taskmgr.current().label();
+    const std::string func_decl =
+        declare_function(context, localdims, symbols, tlabel, funcname, decl);
+    def << context->std_header();
+    def << "#include <" << context->label_to_name(funcname) << ".h>\n\n";
+    def << context->code_prefix();
+    def << func_decl << context->open_block() << std::endl;
+    def << context->std_function_header();
+    def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
+    def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
+    def << "target[hsi] = src0[hsi];\n";
+    def << "}\n";
+    def << "/** Number of flops = 0 */\n";
+    def << context->close_block() << std::endl;
+    def << context->code_postfix();
+  }
+};
+
+template <typename F>
+CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
+    const std::shared_ptr<TargetType>& Tint, unsigned int)
+    : ParentType(Tint, 0) {
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_bra(/* particle */ 1) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 1) == 1);
+
+  F a(Tint->bra(0, 0));
+  F b(Tint->ket(0, 0));
+  F c(Tint->bra(1, 0));
+  F d(Tint->ket(1, 0));
+
+  const auto& oper = Tint->oper();
+
+  if (a.contracted() || b.contracted() || c.contracted() || d.contracted())
+    return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  const mType zero_m(0u);
+
+  ChildFactory<ThisType,
+               GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
+      factory(this);
+
+  // Chemist notation: (a b | op c op d) — σ·p acts on one function per
+  // electron. Target component is indexed (a_dir, b_dir) where
+  //   a_dir = direction of σ·p on electron 1 (applied to ket(0,0) = b)
+  //   b_dir = direction of σ·p on electron 2 (applied to ket(1,0) = d)
+  // Mirrors Coulombσpσp which places BOTH derivatives on electron 2 (c and d);
+  // here we place ONE derivative on each electron (b on el-1, d on el-2).
+  const int a_dir = oper->descr().cart_a();
+  const int b_dir = oper->descr().cart_b();
+
+  F b_deriv{b};
+  b_deriv.deriv().inc(a_dir);
+  F d_deriv{d};
+  d_deriv.deriv().inc(b_dir);
+
+  auto child = factory.make_child(a, b_deriv, c, d_deriv, zero_m);
+  if (is_simple()) {
+    // Wrap single child in a trivial sum to satisfy expr_'s AlgebraicOperator
+    // type (same pattern as vrr_1_onep_1.h:261).
+    expr_ = Scalar(0u) + child;
+    nflops_ += 0;
+  }
+
+}  // CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11
+
+}  // namespace libint2
+
+#endif  // LIBINT_COMP_11_OPCOULOMBOP_11_H
diff --git a/src/bin/libint/master_ints_list.h b/src/bin/libint/master_ints_list.h
index 37bfa29a7..ee9ca39d4 100644
--- a/src/bin/libint/master_ints_list.h
+++ b/src/bin/libint/master_ints_list.h
@@ -113,6 +113,9 @@ typedef GenIntegralSet_11_11<CGShell, σpσpCoulombσpσpOper, mType>
     σpσpCoulombσpσp_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, σpσpCoulombσpσpOper, mType>
     σpσpCoulombσpσp_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, opCoulombopOper, mType>
+    opCoulombop_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, opCoulombopOper, mType> opCoulombop_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kG12, mType> R12kG12_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, R12kG12, mType> R12kG12_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kR12lG12, EmptySet>
@@ -153,9 +156,10 @@ typedef boost::mpl::list<
 #endif
     TwoPRep_11_11_sq, TwoPRep_11_11_int, Coulombσpσp_11_11_sq,
     Coulombσpσp_11_11_int, σpσpCoulombσpσp_11_11_sq, σpσpCoulombσpσp_11_11_int,
-    R12kG12_11_11_sq, R12kG12_11_11_int, R12kR12lG12_11_11_sq,
-    R12kR12lG12_11_11_int, TiG12_11_11_sq, TiG12_11_11_int, G12TiG12_11_11_sq,
-    G12TiG12_11_11_int, DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
+    opCoulombop_11_11_sq, opCoulombop_11_11_int, R12kG12_11_11_sq,
+    R12kG12_11_11_int, R12kR12lG12_11_11_sq, R12kR12lG12_11_11_int,
+    TiG12_11_11_sq, TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
+    DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
     DummySymmIntegral_11_11_sq, DummySymmIntegral_11_11_int>
     MasterIntegralTypeList;
 
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index 5517d7c5c..62a7bfb08 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -24,6 +24,7 @@
 #include <comp_11_Coulombσpσp_11.h>
 #include <comp_11_DivG12prime_xTx_11.h>
 #include <comp_11_g12tig12_11.h>
+#include <comp_11_opCoulombop_11.h>
 #include <comp_11_r12kr12lg12_11.h>
 #include <comp_11_tig12_11.h>
 #include <comp_11_σpσpCoulombσpσp_11.h>
@@ -319,6 +320,9 @@ typedef CR_11_Coulombσpσp_11<CGF> CR_11_Coulombσpσp_11_int;
 
 typedef CR_11_σpσpCoulombσpσp_11<CGShell> CR_11_σpσpCoulombσpσp_11_sh;
 typedef CR_11_σpσpCoulombσpσp_11<CGF> CR_11_σpσpCoulombσpσp_11_int;
+
+typedef CR_11_opCoulombop_11<CGShell> CR_11_opCoulombop_11_sh;
+typedef CR_11_opCoulombop_11<CGF> CR_11_opCoulombop_11_int;
 };  // namespace libint2
 
 #endif  // header guard
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index 678180c28..c6d307dd1 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -476,6 +476,55 @@ struct σpσpCoulombσpσp_Descr : public Contractable<σpσpCoulombσpσp_Descr
 };
 typedef GenOper<σpσpCoulombσpσp_Descr> σpσpCoulombσpσpOper;
 
+/** opCoulombop: (μ σ·p ν | 1/r_{12} | κ σ·p λ).
+ *  Gaunt LS "bilinear" operator with one σ·p on each side.
+ *  Exposes the full 3×3 gradient-gradient tensor as 9 independent components
+ *  (indexed `3*a + b`, with a,b ∈ {x=0,y=1,z=2}), unlike Coulombσpσp which
+ *  collapses 9 → 4 via σ·σ identity on one side only.
+ */
+struct opCoulombop_Descr : public Contractable<opCoulombop_Descr> {
+  typedef MultiplicativeSymm2Body_Props Properties;
+
+  opCoulombop_Descr() : cartesian_index_(0) {}
+  opCoulombop_Descr(int cartesian_index) : cartesian_index_(cartesian_index) {
+    assert(cartesian_index >= 0 && cartesian_index <= 8);
+  }
+
+  /// 9 components = σ_a(1) ⊗ σ_b(2) bilinear, indexed as 3*a + b,
+  /// where a = bra-side derivative direction, b = ket-side derivative
+  /// direction, and a, b ∈ {x=0, y=1, z=2}. Component layout is the outer
+  /// product of two Cartesian unit vectors — a dyadic — analogous in spirit to
+  /// libint's CartesianMultipole index, but over two independent direction
+  /// indices.
+  static const unsigned int max_key = 9;
+  unsigned int key() const { return cartesian_index(); }
+  std::string description() const {
+    // clang-format off
+    static const char* labels[] = {
+        "XX", "XY", "XZ",
+        "YX", "YY", "YZ",
+        "ZX", "ZY", "ZZ"
+    };
+    // clang-format on
+    const auto ci = cartesian_index();
+    if (ci > 8) abort();
+    return std::string("op_coulomb_op[") + labels[ci] + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+  int cartesian_index() const { return cartesian_index_; }
+  /// bra-side (first σ) derivative direction ∈ {0=x, 1=y, 2=z}
+  int cart_a() const { return cartesian_index_ / 3; }
+  /// ket-side (second σ) derivative direction ∈ {0=x, 1=y, 2=z}
+  int cart_b() const { return cartesian_index_ % 3; }
+
+ private:
+  const int cartesian_index_ = -1;
+};
+typedef GenOper<opCoulombop_Descr> opCoulombopOper;
+
 /** GTG_1d is the two-body 1-dimensional Gaussian geminal
  */
 struct GTG_1d_Descr : public Contractable<GTG_1d_Descr> {
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index 4e4804da7..e59f6e92f 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -135,6 +135,14 @@ template <>
 struct MasterStrategy<σpσpCoulombσpσp_11_11_int> {
   typedef boost::mpl::list<CR_11_σpσpCoulombσpσp_11_int> value;
 };
+template <>
+struct MasterStrategy<opCoulombop_11_11_sq> {
+  typedef boost::mpl::list<CR_11_opCoulombop_11_sh> value;
+};
+template <>
+struct MasterStrategy<opCoulombop_11_11_int> {
+  typedef boost::mpl::list<CR_11_opCoulombop_11_int> value;
+};
 
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0
 template <>

From 590ce42f1711386318b8f1cd56adc94c5296849d Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sun, 26 Apr 2026 14:41:43 -0400
Subject: [PATCH 19/22] op_coulomb_op: redesign 9 components as SO(3) irreps

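Replace the raw 3*a+b dyadic layout with 9 SO(3) irreducible components:
1 scalar (trace) + 3 antisymmetric + 5 symmetric-traceless (sym-TL).
Bra<->ket swap is now a per-component sign flip on the antisymmetric
components (was: index transpose). p1_p2_swappable=true.
Tests pass: 2,112,533 assertions.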
---
 export/tests/unit/test-2body.cc         |  33 ++++-
 include/libint2/engine.impl.h           |  18 +--
 src/bin/libint/build_libint.cc          |  30 ++---
 src/bin/libint/comp_11_opCoulombop_11.h | 171 +++++++++++++++++++-----
 src/bin/libint/oper.h                   |  64 +++++----
 5 files changed, 229 insertions(+), 87 deletions(-)

diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index cee9b0800..60b6cf812 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -679,8 +679,12 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
             int l3, m3, n3;
             FOR_CART(l3, m3, n3, obs[s3].contr[0].l)
 
-            std::array<LIBINT2_REF_REALTYPE, 9> ref_op_coulomb_op{};
-            ref_op_coulomb_op.fill(0.0);
+            // Raw 3x3 dyadic T_{ab} = (a b∂_a | c d∂_b), accumulated in
+            // chemist-notation index 3*a+b. After the primitive loop we
+            // project into the 9 SO(3) irreducible components that the engine
+            // returns: Scalar, AntisymX/Y/Z, SymTLDiagA/B, SymTLOffXY/XZ/YZ.
+            std::array<LIBINT2_REF_REALTYPE, 9> ref_raw{};
+            ref_raw.fill(0.0);
 
             for (uint p0 = 0; p0 < obs[s0].nprim(); p0++) {
               for (uint p1 = 0; p1 < obs[s1].nprim(); p1++) {
@@ -712,12 +716,35 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
                     };
                     for (int a = 0; a < 3; ++a)
                       for (int b = 0; b < 3; ++b)
-                        ref_op_coulomb_op[3 * a + b] += c0123 * D(a, b);
+                        ref_raw[3 * a + b] += c0123 * D(a, b);
                   }
                 }
               }
             }
 
+            // Project raw dyadic into the 9 SO(3) irrep components used by
+            // op_coulomb_op (must match opCoulombop_Descr::Component order).
+            const auto Txx = ref_raw[0];
+            const auto Txy = ref_raw[1];
+            const auto Txz = ref_raw[2];
+            const auto Tyx = ref_raw[3];
+            const auto Tyy = ref_raw[4];
+            const auto Tyz = ref_raw[5];
+            const auto Tzx = ref_raw[6];
+            const auto Tzy = ref_raw[7];
+            const auto Tzz = ref_raw[8];
+            std::array<LIBINT2_REF_REALTYPE, 9> ref_op_coulomb_op{
+                Txx + Tyy + Tzz,        // Scalar
+                Tyz - Tzy,              // AntisymX
+                Tzx - Txz,              // AntisymY
+                Txy - Tyx,              // AntisymZ
+                Txx - Tyy,              // SymTLDiagA
+                2.0 * Tzz - Txx - Tyy,  // SymTLDiagB
+                Txy + Tyx,              // SymTLOffXY
+                Txz + Tzx,              // SymTLOffXZ
+                Tyz + Tzy,              // SymTLOffYZ
+            };
+
             const double ABSOLUTE_DEVIATION_THRESHOLD = 5.0E-14;
             const double RELATIVE_DEVIATION_THRESHOLD = 1.0E-9;
             for (auto comp = 0; comp < 9; ++comp) {
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 27187cbcc..12cf89303 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -2192,6 +2192,12 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 }
                 if (swap_tket && oper_ == Operator::coulomb_opop && s > 0)
                   oper_cart_component_phase = -1.0;
+                // op_coulomb_op irrep layout under bra↔ket swap: antisym
+                // components (s ∈ {1,2,3}) flip sign; scalar (0) and sym-TL
+                // (4..8) are invariant. swap_tket is always false for this
+                // operator; the sign correction applies on swap_braket alone.
+                if (oper_ == Operator::op_coulomb_op && s >= 1 && s <= 3)
+                  oper_cart_component_phase = -1.0;
                 if (swap_tbra)
                   tgt_blk_mat =
                       oper_cart_component_phase * src_blk_mat.transpose();
@@ -2257,15 +2263,9 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
         for (auto s = 0; s != ntargets; ++s)
           targets_[4 * (s % 4) + (s / 4)] = temp[s];
       }
-      // For op_coulomb_op with swap_braket: (a,b) → (b,a) because bra↔ket swap
-      // exchanges which side each σ·p direction came from. Layout is
-      // index = 3*a + b, so the remap is s_new = 3*(s%3) + (s/3).
-      if (permute && oper_ == Operator::op_coulomb_op && swap_braket) {
-        std::array<const value_type*, 9> temp;
-        for (auto s = 0; s != ntargets; ++s) temp[s] = targets_[s];
-        for (auto s = 0; s != ntargets; ++s)
-          targets_[3 * (s % 3) + (s / 3)] = temp[s];
-      }
+      // op_coulomb_op irrep layout: bra↔ket swap is handled in-place by
+      // oper_cart_component_phase above (sign flip on antisym components,
+      // identity on scalar / sym-TL); no pointer remap is needed.
     }       // if need_scratch => needed to transpose and/or tform
     else {  // did not use scratch? may still need to update targets_
       if (set_targets_) {
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index fbc8ac07b..5f799ea1c 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -1150,14 +1150,14 @@ static void build_TwoPRep_2b_2k(
   std::shared_ptr<CodeContext> context(new CppCodeContext(cparams));
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
-  // opCoulombop has only a 2-fold bra↔ket-swap symmetry (with (a,b)↔(b,a)
-  // component remap). Within-side particle swap is NOT a symmetry because σ·p
-  // attaches to one specific function per side (ν on bra, λ on ket); swapping
-  // moves the operator to a different physical center that IBP cannot recover
-  // when centers differ. Emit code for every (la,lb,lc,ld) combination to
-  // avoid triggering any within-side swap at runtime.
-  bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value &&
-                         !std::is_same<OperType, opCoulombopOper>::value;
+  // opCoulombop has only a 2-fold bra↔ket-swap symmetry; under that swap the
+  // Antisym* components flip sign while Scalar/SymTL* are invariant. This is
+  // captured by p1_p2_swappable=true plus a dedicated predicate that
+  // canonicalizes only la+lb<=lc+ld and emits code for *all* within-side
+  // orderings. Within-side swap is NOT a symmetry: σ·p attaches to one
+  // specific function per side, and IBP cannot recover the sign across the
+  // 1/r12 coupling.
+  bool p1_p2_swappable = !std::is_same<OperType, CoulombσpσpOper>::value;
   bool bra_ket_coswappable = std::is_same<OperType, σpσpCoulombσpσpOper>::value;
 
   // Note: la, lb, lc, ld generate code for chemist notation (ab|O|cd), where O
@@ -1166,11 +1166,11 @@ static void build_TwoPRep_2b_2k(
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          // opCoulombop has only a bra↔ket (particle 1↔2) swap symmetry;
-          // within-side swap is NOT a symmetry (σ·p would move to the wrong
-          // physical center). Canonical form: la+lb <= lc+ld only
-          // (ORCA: la+lb >= lc+ld). Use a dedicated predicate so within-side
-          // orderings are not reduced away.
+          // opCoulombop: only bra↔ket (particle 1↔2) swap is a symmetry.
+          // Within-side swap is NOT (σ·p would move to a different physical
+          // center; IBP cannot repair the sign across 1/r12). Dedicated
+          // predicate canonicalizes la+lb<=lc+ld only (ORCA: >=) and accepts
+          // all within-side orderings.
           if constexpr (std::is_same<OperType, opCoulombopOper>::value) {
 #if LIBINT_SHELL_SET == LIBINT_SHELL_SET_STANDARD
             if (!(la + lb <= lc + ld)) continue;
@@ -1219,8 +1219,8 @@ static void build_TwoPRep_2b_2k(
           if constexpr (std::is_same<OperType, opCoulombopOper>::value) {
             // reset descriptors array
             descrs.resize(0);
-            // iterate over 9 components (3x3 Cartesian dyadic: bra-dir ×
-            // ket-dir)
+            // iterate over 9 SO(3) irrep components: 1 scalar trace + 3
+            // antisym (curl-curl) + 5 sym-traceless
             for (int p = 0; p != 9; ++p) {
               descrs.emplace_back(OperDescrType(p));
             }
diff --git a/src/bin/libint/comp_11_opCoulombop_11.h b/src/bin/libint/comp_11_opCoulombop_11.h
index a6cbb7744..38ae4763e 100644
--- a/src/bin/libint/comp_11_opCoulombop_11.h
+++ b/src/bin/libint/comp_11_opCoulombop_11.h
@@ -33,14 +33,12 @@ namespace libint2 {
 /**
  * Computes the "Gaunt LS bilinear" integral
  *   \f$ (\mu\, \sigma\cdot\hat{p}\,\nu | 1/r_{12} | \kappa\,
- * \sigma\cdot\hat{p}\,\lambda ) \f$ by rewriting each of the 9 (a,b) components
- * as a single derivative Coulomb integral \f$ ( \mu \cdot \partial_a \nu |
- * 1/r_{12} | \kappa \cdot \partial_b \lambda ) \f$ with a ∈ {x,y,z} on
- * bra-function-1 (ν) and b ∈ {x,y,z} on ket-function-1 (λ).
- *
- * Unlike Coulombσpσp (which folds 9 → 4 via σ·σ on the ket pair), all 9
- * components are exposed independently, since the two σ's here act on different
- * particles and their contraction cannot be absorbed locally.
+ * \sigma\cdot\hat{p}\,\lambda ) \f$ in the SO(3) irreducible decomposition of
+ * the rank-2 tensor \f$ T_{ab} = ( \mu \cdot \partial_a \nu | 1/r_{12} | \kappa
+ * \cdot \partial_b \lambda ) \f$: 1 scalar trace + 3 antisymmetric (curl-curl)
+ * + 5 symmetric-traceless = 9 components total. Each output is a small linear
+ * combination of raw deriv-TwoPRep children, mirroring
+ * comp_11_Coulombσpσp_11.h's pattern of trace/antisym emission.
  *
  * @tparam F basis function type. valid choices are CGShell or CGF
  */
@@ -58,7 +56,7 @@ class CR_11_opCoulombop_11
       ParentType;
   friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
                                          TargetType>;
-  static const unsigned int max_nchildren = 1;
+  static const unsigned int max_nchildren = 3;
 
   using ParentType::Instance;
 
@@ -81,7 +79,7 @@ class CR_11_opCoulombop_11
 
   std::string generate_label() const override {
     return "CR_opCoulombop_" +
-           std::to_string(target_->oper()->descr().cartesian_index());
+           std::to_string(target_->oper()->descr().component_index());
   }
 
   std::string spfunction_call(
@@ -118,8 +116,8 @@ class CR_11_opCoulombop_11
                                                 dims->vecdim());
   }
 
-  /// Each of the 9 components is a single deriv-ERI child ⇒ trivial passthrough
-  /// loop.
+  /// Hand-emit the per-component irrep linear combination over deriv-ERI
+  /// children. The combination depends on the target's component index.
   void generate_code(const std::shared_ptr<CodeContext>& context,
                      const std::shared_ptr<ImplicitDimensions>& dims,
                      const std::string& funcname, std::ostream& decl,
@@ -150,9 +148,38 @@ class CR_11_opCoulombop_11
     def << context->std_function_header();
     def << "#ifdef __INTEL_COMPILER\n#pragma ivdep\n#endif\n";
     def << "for(int hsi = 0; hsi<highdim; hsi++) {\n";
-    def << "target[hsi] = src0[hsi];\n";
-    def << "}\n";
-    def << "/** Number of flops = 0 */\n";
+
+    const int comp = this->target_->oper()->descr().component_index();
+    std::string rhs;
+    unsigned int nflops = 0;
+    switch (comp) {
+      case opCoulombop_Descr::Scalar:
+        rhs = "src0[hsi] + src1[hsi] + src2[hsi]";
+        nflops = 2;
+        break;
+      case opCoulombop_Descr::AntisymX:
+      case opCoulombop_Descr::AntisymY:
+      case opCoulombop_Descr::AntisymZ:
+      case opCoulombop_Descr::SymTLDiagA:
+        rhs = "src0[hsi] - src1[hsi]";
+        nflops = 1;
+        break;
+      case opCoulombop_Descr::SymTLDiagB:
+        rhs = "2.0*src0[hsi] - src1[hsi] - src2[hsi]";
+        nflops = 3;
+        break;
+      case opCoulombop_Descr::SymTLOffXY:
+      case opCoulombop_Descr::SymTLOffXZ:
+      case opCoulombop_Descr::SymTLOffYZ:
+        rhs = "src0[hsi] + src1[hsi]";
+        nflops = 1;
+        break;
+      default:
+        throw std::runtime_error(
+            "CR_11_opCoulombop_11::generate_code: invalid component index");
+    }
+    def << "target[hsi] = " << rhs << ";\n}\n";
+    def << "/** Number of flops = " << nflops << " */\n";
     def << context->close_block() << std::endl;
     def << context->code_postfix();
   }
@@ -187,26 +214,100 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
                GenIntegralSet_11_11<BasisFunctionType, TwoPRep, mType>>
       factory(this);
 
-  // Chemist notation: (a b | op c op d) — σ·p acts on one function per
-  // electron. Target component is indexed (a_dir, b_dir) where
-  //   a_dir = direction of σ·p on electron 1 (applied to ket(0,0) = b)
-  //   b_dir = direction of σ·p on electron 2 (applied to ket(1,0) = d)
-  // Mirrors Coulombσpσp which places BOTH derivatives on electron 2 (c and d);
-  // here we place ONE derivative on each electron (b on el-1, d on el-2).
-  const int a_dir = oper->descr().cart_a();
-  const int b_dir = oper->descr().cart_b();
-
-  F b_deriv{b};
-  b_deriv.deriv().inc(a_dir);
-  F d_deriv{d};
-  d_deriv.deriv().inc(b_dir);
-
-  auto child = factory.make_child(a, b_deriv, c, d_deriv, zero_m);
-  if (is_simple()) {
-    // Wrap single child in a trivial sum to satisfy expr_'s AlgebraicOperator
-    // type (same pattern as vrr_1_onep_1.h:261).
-    expr_ = Scalar(0u) + child;
-    nflops_ += 0;
+  // Chemist notation: (a b | op c op d). σ·p acts on one function per electron
+  // — direction `i` on ket(0,0) = b (electron 1), direction `j` on ket(1,0) = d
+  // (electron 2). Each output is an SO(3) irrep combination of the raw 3×3
+  // T_{ij} dyadic; case bodies build only the children each combination needs.
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  auto T = [&](int i, int j) {
+    F b_d{b};
+    b_d.deriv().inc(i);
+    F d_d{d};
+    d_d.deriv().inc(j);
+    return factory.make_child(a, b_d, c, d_d, zero_m);
+  };
+
+  switch (oper->descr().component_index()) {
+    case opCoulombop_Descr::Scalar: {
+      auto Txx = T(x, x);
+      auto Tyy = T(y, y);
+      auto Tzz = T(z, z);
+      if (is_simple()) {
+        expr_ = Txx + Tyy + Tzz;
+        nflops_ += 2;
+      }
+    } break;
+    case opCoulombop_Descr::AntisymX: {
+      auto Tyz = T(y, z);
+      auto Tzy = T(z, y);
+      if (is_simple()) {
+        expr_ = Tyz - Tzy;
+        nflops_ += 1;
+      }
+    } break;
+    case opCoulombop_Descr::AntisymY: {
+      auto Tzx = T(z, x);
+      auto Txz = T(x, z);
+      if (is_simple()) {
+        expr_ = Tzx - Txz;
+        nflops_ += 1;
+      }
+    } break;
+    case opCoulombop_Descr::AntisymZ: {
+      auto Txy = T(x, y);
+      auto Tyx = T(y, x);
+      if (is_simple()) {
+        expr_ = Txy - Tyx;
+        nflops_ += 1;
+      }
+    } break;
+    case opCoulombop_Descr::SymTLDiagA: {
+      auto Txx = T(x, x);
+      auto Tyy = T(y, y);
+      if (is_simple()) {
+        expr_ = Txx - Tyy;
+        nflops_ += 1;
+      }
+    } break;
+    case opCoulombop_Descr::SymTLDiagB: {
+      // 2·T_zz − T_xx − T_yy: child order (Tzz, Txx, Tyy) matches generate_code
+      auto Tzz = T(z, z);
+      auto Txx = T(x, x);
+      auto Tyy = T(y, y);
+      if (is_simple()) {
+        expr_ = Scalar(2) * Tzz - Txx - Tyy;
+        nflops_ += 3;
+      }
+    } break;
+    case opCoulombop_Descr::SymTLOffXY: {
+      auto Txy = T(x, y);
+      auto Tyx = T(y, x);
+      if (is_simple()) {
+        expr_ = Txy + Tyx;
+        nflops_ += 1;
+      }
+    } break;
+    case opCoulombop_Descr::SymTLOffXZ: {
+      auto Txz = T(x, z);
+      auto Tzx = T(z, x);
+      if (is_simple()) {
+        expr_ = Txz + Tzx;
+        nflops_ += 1;
+      }
+    } break;
+    case opCoulombop_Descr::SymTLOffYZ: {
+      auto Tyz = T(y, z);
+      auto Tzy = T(z, y);
+      if (is_simple()) {
+        expr_ = Tyz + Tzy;
+        nflops_ += 1;
+      }
+    } break;
+    default:
+      throw std::runtime_error("CR_11_opCoulombop_11: invalid component index");
   }
 
 }  // CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index c6d307dd1..8e8a4f567 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -478,35 +478,43 @@ typedef GenOper<σpσpCoulombσpσp_Descr> σpσpCoulombσpσpOper;
 
 /** opCoulombop: (μ σ·p ν | 1/r_{12} | κ σ·p λ).
  *  Gaunt LS "bilinear" operator with one σ·p on each side.
- *  Exposes the full 3×3 gradient-gradient tensor as 9 independent components
- *  (indexed `3*a + b`, with a,b ∈ {x=0,y=1,z=2}), unlike Coulombσpσp which
- *  collapses 9 → 4 via σ·σ identity on one side only.
+ *  Outputs the SO(3) irreducible decomposition of the 3×3 gradient-gradient
+ *  tensor T_{ab} = (μ ∂_a ν | 1/r_{12} | κ ∂_b λ): 1 scalar trace +
+ *  3 antisymmetric (curl-curl) + 5 symmetric-traceless = 9 components. Same
+ *  storage as the raw dyadic, but indexed by physically meaningful irreps so
+ *  consumers do not need to hand-build trace/antisym/sym-TL combinations at
+ *  every contraction site.
  */
 struct opCoulombop_Descr : public Contractable<opCoulombop_Descr> {
   typedef MultiplicativeSymm2Body_Props Properties;
 
-  opCoulombop_Descr() : cartesian_index_(0) {}
-  opCoulombop_Descr(int cartesian_index) : cartesian_index_(cartesian_index) {
-    assert(cartesian_index >= 0 && cartesian_index <= 8);
+  /// SO(3) irreducible components of the rank-2 Cartesian tensor T_{ab}.
+  enum Component : int {
+    Scalar = 0,      ///< T_xx + T_yy + T_zz (trace, ∇·∇)
+    AntisymX = 1,    ///< T_yz − T_zy = (∇×∇)_x
+    AntisymY = 2,    ///< T_zx − T_xz = (∇×∇)_y
+    AntisymZ = 3,    ///< T_xy − T_yx = (∇×∇)_z
+    SymTLDiagA = 4,  ///< T_xx − T_yy
+    SymTLDiagB = 5,  ///< 2·T_zz − T_xx − T_yy
+    SymTLOffXY = 6,  ///< T_xy + T_yx
+    SymTLOffXZ = 7,  ///< T_xz + T_zx
+    SymTLOffYZ = 8,  ///< T_yz + T_zy
+  };
+
+  opCoulombop_Descr() : component_index_(0) {}
+  opCoulombop_Descr(int component_index) : component_index_(component_index) {
+    assert(component_index >= 0 && component_index <= 8);
   }
 
-  /// 9 components = σ_a(1) ⊗ σ_b(2) bilinear, indexed as 3*a + b,
-  /// where a = bra-side derivative direction, b = ket-side derivative
-  /// direction, and a, b ∈ {x=0, y=1, z=2}. Component layout is the outer
-  /// product of two Cartesian unit vectors — a dyadic — analogous in spirit to
-  /// libint's CartesianMultipole index, but over two independent direction
-  /// indices.
   static const unsigned int max_key = 9;
-  unsigned int key() const { return cartesian_index(); }
+  unsigned int key() const { return component_index(); }
   std::string description() const {
-    // clang-format off
     static const char* labels[] = {
-        "XX", "XY", "XZ",
-        "YX", "YY", "YZ",
-        "ZX", "ZY", "ZZ"
+        "scalar",       "antisym_x",    "antisym_y",
+        "antisym_z",    "symtl_diag_a", "symtl_diag_b",
+        "symtl_off_xy", "symtl_off_xz", "symtl_off_yz",
     };
-    // clang-format on
-    const auto ci = cartesian_index();
+    const auto ci = component_index();
     if (ci > 8) abort();
     return std::string("op_coulomb_op[") + labels[ci] + "]";
   }
@@ -514,14 +522,20 @@ struct opCoulombop_Descr : public Contractable<opCoulombop_Descr> {
   int psymm(int i, int j) const { abort(); }
   int hermitian(int i) const { return +1; }
 
-  int cartesian_index() const { return cartesian_index_; }
-  /// bra-side (first σ) derivative direction ∈ {0=x, 1=y, 2=z}
-  int cart_a() const { return cartesian_index_ / 3; }
-  /// ket-side (second σ) derivative direction ∈ {0=x, 1=y, 2=z}
-  int cart_b() const { return cartesian_index_ % 3; }
+  int component_index() const { return component_index_; }
+
+  bool is_scalar() const { return component_index_ == Scalar; }
+  bool is_antisym() const {
+    return component_index_ >= AntisymX && component_index_ <= AntisymZ;
+  }
+  bool is_sym_tl() const {
+    return component_index_ >= SymTLDiagA && component_index_ <= SymTLOffYZ;
+  }
+  /// for antisym components: 0=x, 1=y, 2=z (only valid if is_antisym())
+  int antisym_cart() const { return component_index_ - AntisymX; }
 
  private:
-  const int cartesian_index_ = -1;
+  const int component_index_ = -1;
 };
 typedef GenOper<opCoulombop_Descr> opCoulombopOper;
 

From b463ca8d43632444e950971e454623345a32cd8e Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Sun, 26 Apr 2026 14:43:40 -0400
Subject: [PATCH 20/22] =?UTF-8?q?Add=20=CF=83pR=CF=83p=20(oprop):=201-body?=
 =?UTF-8?q?=20=CF=83=C2=B7p=20=C2=B7=20r=20=C2=B7=20=CF=83=C2=B7p=20integr?=
 =?UTF-8?q?al?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds 12 components = 3 dipole directions × 4 Pauli-quaternion components
(trace + 3 antisym), mirroring σpVσp's fold. The dipole origin is passed to
the Engine via set_params. The master integral list is split via
mpl::joint_view to stay within the boost::mpl list50 limit.
---
 export/tests/unit/test-1body.cc               |  66 +++++++
 include/libint2/engine.h                      |  15 +-
 include/libint2/engine.impl.h                 |  61 +++----
 src/bin/libint/build_libint.cc                |  17 +-
 .../libint/comp_1_\317\203pR\317\203p_1.h"    | 161 ++++++++++++++++++
 src/bin/libint/master_ints_list.h             |  18 +-
 src/bin/libint/master_rrs_list.h              |   4 +
 src/bin/libint/oper.h                         |  39 +++++
 src/bin/libint/strategy.cc                    |   8 +
 9 files changed, 352 insertions(+), 37 deletions(-)
 create mode 100644 "src/bin/libint/comp_1_\317\203pR\317\203p_1.h"

diff --git a/export/tests/unit/test-1body.cc b/export/tests/unit/test-1body.cc
index 57c65f428..54c0cc14d 100644
--- a/export/tests/unit/test-1body.cc
+++ b/export/tests/unit/test-1body.cc
@@ -244,6 +244,72 @@ TEST_CASE_METHOD(libint2::unit::DefaultFixture, "W correctness",
 #endif  // LIBINT2_SUPPORT_ONEBODY
 }
 
+TEST_CASE_METHOD(libint2::unit::DefaultFixture, "σpRσp correctness",
+                 "[engine][1-body]") {
+#if defined(LIBINT2_SUPPORT_ONEBODY)
+  if (LIBINT_SHGSHELL_ORDERING != LIBINT_SHGSHELL_ORDERING_STANDARD) return;
+
+  // Two contracted Gaussian shells at distinct centers. We exercise
+  // `(σ·p) r (σ·p)` over the (s|d) and (d|s) shell pairs and validate
+  // Hermiticity: trace components (q=0) are symmetric under bra↔ket swap,
+  // antisym components (q=1,2,3) are antisymmetric.
+  std::vector<Shell> obs{
+      Shell{{1.0, 3.0}, {{0, true, {1.0, 0.3}}}, {{0.0, 0.0, 0.0}}},
+      Shell{{2.0, 5.0}, {{2, true, {1.0, 0.2}}}, {{1.0, 1.0, 1.0}}}};
+
+  const auto lmax = std::min(2, LIBINT2_MAX_AM_oprop);
+  if (lmax < 2) return;
+
+  auto engine = Engine(Operator::oprop, 2, lmax);
+  engine.set_params(std::array<double, 3>{{0.0, 0.0, 0.0}});
+
+  // (s|σpRσp|d) and (d|σpRσp|s)
+  engine.compute(obs[0], obs[1]);
+  std::array<std::vector<double>, 12> ab;
+  for (int c = 0; c < 12; ++c) {
+    const auto* buf = engine.results()[c];
+    REQUIRE(buf != nullptr);
+    ab[c].assign(buf, buf + (1 * 5));  // n_s × n_d_pure = 1 × 5
+  }
+
+  engine.compute(obs[1], obs[0]);
+  std::array<std::vector<double>, 12> ba;
+  for (int c = 0; c < 12; ++c) {
+    const auto* buf = engine.results()[c];
+    REQUIRE(buf != nullptr);
+    ba[c].assign(buf, buf + (5 * 1));  // n_d_pure × n_s
+  }
+
+  // Hermiticity check: σpRσp is Hermitian, but the Pauli identity routes the
+  // imaginary i factor on the antisym pieces into a real-stored sign flip.
+  //   q=0 trace: matrix is symmetric  ⇒  ab[4k+0][i,j] ==  ba[4k+0][j,i]
+  //   q=1..3   : matrix is antisym    ⇒  ab[4k+q][i,j] == -ba[4k+q][j,i]
+  // (Indices: ab is laid out (i=0..n_s-1, j=0..n_d-1); ba is (j, i).)
+  const double tol = 1.0e-12;
+  for (int k = 0; k < 3; ++k) {
+    for (int q = 0; q < 4; ++q) {
+      const int comp = 4 * k + q;
+      const double expected_sign = (q == 0) ? +1.0 : -1.0;
+      for (int i = 0; i < 1; ++i) {
+        for (int j = 0; j < 5; ++j) {
+          const double v_ab = ab[comp][i * 5 + j];
+          const double v_ba = ba[comp][j * 1 + i];
+          REQUIRE(std::isfinite(v_ab));
+          REQUIRE(std::abs(v_ab - expected_sign * v_ba) < tol);
+        }
+      }
+    }
+  }
+
+  // Sanity: not every component is identically zero (would mask codegen bugs).
+  bool any_nonzero = false;
+  for (int c = 0; c < 12; ++c)
+    for (double v : ab[c])
+      if (std::abs(v) > 1.0e-10) any_nonzero = true;
+  REQUIRE(any_nonzero);
+#endif  // LIBINT2_SUPPORT_ONEBODY
+}
+
 // verify that python/tests/test_libint2.py:test_integrals is correct
 TEST_CASE_METHOD(libint2::unit::DefaultFixture, "python correctness",
                  "[engine][1-body]") {
diff --git a/include/libint2/engine.h b/include/libint2/engine.h
index f99c6a898..d90c6709c 100644
--- a/include/libint2/engine.h
+++ b/include/libint2/engine.h
@@ -147,6 +147,12 @@ enum class Operator {
   sphemultipole,
   /// The four components of σp . V . σp, where V is the nuclear potential.
   opVop,
+  /// (1-body) σp . r . σp, the σ·p-on-both-sides analog of the dipole moment.
+  /// Produces 12 components = 3 dipole directions × 4 Pauli quaternion
+  /// components (trace + 3 antisym), indexed as `4*k + q` with `k ∈ {x,y,z}`
+  /// the dipole direction and `q ∈ {0=trace, 1=σ_x, 2=σ_y, 3=σ_z}` the Pauli
+  /// piece. Origin set via `engine.set_params(std::array<double,3>)`.
+  oprop,
   /// \f$ \delta(\vec{r}_1 - \vec{r}_2) \f$
   delta,
   /// (2-body) Coulomb operator = \f$ r_{12}^{-1} \f$
@@ -199,7 +205,7 @@ enum class Operator {
   // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!keep this
   // updated!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   first_1body_oper = overlap,
-  last_1body_oper = opVop,
+  last_1body_oper = oprop,
   first_2body_oper = delta,
   last_2body_oper = stg_x_coulomb,
   first_oper = first_1body_oper,
@@ -352,6 +358,13 @@ struct operator_traits<Operator::sphemultipole>
       (LIBINT_MULTIPOLE_MAX_ORDER + 1) * (LIBINT_MULTIPOLE_MAX_ORDER + 1);
 };
 
+template <>
+struct operator_traits<Operator::oprop>
+    : public operator_traits<Operator::emultipole1> {
+  static constexpr auto nopers = 12;
+  static constexpr auto intrinsic_deriv_order = 2;
+};
+
 template <>
 struct operator_traits<Operator::coulomb>
     : public detail::default_operator_traits {
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 12cf89303..a431f8ad1 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -70,40 +70,41 @@ typename std::remove_all_extents<T>::type* to_ptr1(T (&a)[N]) {
 /// These MUST appear in the same order as in Operator.
 /// You must also update BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX when you add
 /// one-body ints
-#define BOOST_PP_NBODY_OPERATOR_LIST                        \
-  (overlap,                         /* overlap */           \
-   (kinetic,                        /* kinetic */           \
-    (elecpot,                       /* nuclear */           \
-     (elecpot,                      /* erf_nuclear */       \
-      (elecpot,                     /* erfc_nuclear */      \
-       (elecpot,                    /* erfx_nuclear */      \
-        (1emultipole,               /* emultipole1 */       \
-         (2emultipole,              /* emultipole2 */       \
-          (3emultipole,             /* emultipole3 */       \
-           (sphemultipole,          /* sphemultipole */     \
-            (opVop,                 /* opVop */             \
-             (eri,                  /* delta */             \
-              (eri,                 /* coulomb */           \
-               (coulomb_opop,       /* coulomb_opop */      \
-                (opop_coulomb_opop, /* opop_coulomb_opop */ \
-                 (op_coulomb_op,    /* op_coulomb_op */     \
-                  (eri,             /* cgtg */              \
-                   (eri,            /* cgtg_x_coulomb */    \
-                    (eri,           /* delcgtg2 */          \
-                     (eri,          /* r12 */               \
-                      (eri,         /* erf_coulomb */       \
-                       (eri,        /* erfc_coulomb */      \
-                        (eri,       /* erfx_coulomb */      \
-                         (eri,      /* stg */               \
-                          (eri,     /* yukawa */            \
-                           BOOST_PP_NIL)))))))))))))))))))))))))
+#define BOOST_PP_NBODY_OPERATOR_LIST                         \
+  (overlap,                          /* overlap */           \
+   (kinetic,                         /* kinetic */           \
+    (elecpot,                        /* nuclear */           \
+     (elecpot,                       /* erf_nuclear */       \
+      (elecpot,                      /* erfc_nuclear */      \
+       (elecpot,                     /* erfx_nuclear */      \
+        (1emultipole,                /* emultipole1 */       \
+         (2emultipole,               /* emultipole2 */       \
+          (3emultipole,              /* emultipole3 */       \
+           (sphemultipole,           /* sphemultipole */     \
+            (opVop,                  /* opVop */             \
+             (oprop,                 /* oprop */             \
+              (eri,                  /* delta */             \
+               (eri,                 /* coulomb */           \
+                (coulomb_opop,       /* coulomb_opop */      \
+                 (opop_coulomb_opop, /* opop_coulomb_opop */ \
+                  (op_coulomb_op,    /* op_coulomb_op */     \
+                   (eri,             /* cgtg */              \
+                    (eri,            /* cgtg_x_coulomb */    \
+                     (eri,           /* delcgtg2 */          \
+                      (eri,          /* r12 */               \
+                       (eri,         /* erf_coulomb */       \
+                        (eri,        /* erfc_coulomb */      \
+                         (eri,       /* erfx_coulomb */      \
+                          (eri,      /* stg */               \
+                           (eri,     /* yukawa */            \
+                            BOOST_PP_NIL))))))))))))))))))))))))))
 
 #define BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE \
   BOOST_PP_MAKE_TUPLE(BOOST_PP_LIST_SIZE(BOOST_PP_NBODY_OPERATOR_LIST))
 #define BOOST_PP_NBODY_OPERATOR_INDEX_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_NBODY_OPERATOR_INDEX_TUPLE)
 #define BOOST_PP_NBODY_OPERATOR_LAST_ONEBODY_INDEX \
-  10  // opVop, the 11th member of BOOST_PP_NBODY_OPERATOR_LIST, is the last
+  11  // oprop, the 12th member of BOOST_PP_NBODY_OPERATOR_LIST, is the last
       // 1-body operator
 
 // make list of braket indices for n-body ints
@@ -1028,7 +1029,7 @@ __libint2_engine_inline void Engine::compute_primdata(Libint_t& primdata,
   //  }
 
   if (oper_ == Operator::emultipole1 || oper_ == Operator::emultipole2 ||
-      oper_ == Operator::emultipole3) {
+      oper_ == Operator::emultipole3 || oper_ == Operator::oprop) {
     const auto& O = any_cast<
         const operator_traits<Operator::emultipole1>::oper_params_type&>(
         params_);  // same as emultipoleX
@@ -1076,7 +1077,7 @@ __libint2_engine_inline void Engine::compute_primdata(Libint_t& primdata,
   primdata._0_Overlap_0_z[0] = ovlp_ss_z;
 
   if (oper_ == Operator::kinetic || (deriv_order_ > 0) ||
-      oper_ == Operator::opVop) {
+      oper_ == Operator::opVop || oper_ == Operator::oprop) {
 #if LIBINT2_DEFINED(eri, two_alpha0_bra)
     primdata.two_alpha0_bra[0] = 2.0 * alpha1;
 #endif
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 5f799ea1c..206cd43c4 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -308,6 +308,11 @@ template <>
   return σpVσp_Descr(p);
 }
 
+template <>
+σpRσp_Descr make_descr<σpRσp_Descr>(int p, int, int) {
+  return σpRσp_Descr(p);
+}
+
 template <>
 Coulombσpσp_Descr make_descr<Coulombσpσp_Descr>(int p, int, int) {
   return Coulombσpσp_Descr(p);
@@ -445,6 +450,14 @@ void build_onebody_1b_1k(std::ostream& os, std::string label,
           descrs.emplace_back(make_descr<OperDescrType>(p));
         }
       }
+      if (std::is_same<_OperType, σpRσpOper>::value) {
+        // reset descriptors array
+        descrs.resize(0);
+        // iterate over 12 = 3 dipole directions × 4 Pauli components
+        for (int p = 0; p != 12; ++p) {
+          descrs.emplace_back(make_descr<OperDescrType>(p));
+        }
+      }
 
       // derivative index is the outermost (slowest running)
       // operator component is second slowest
@@ -584,11 +597,11 @@ void try_main(int argc, char* argv[]) {
 // overlap, kinetic, elecpot cannot be omitted
 #define BOOST_PP_ONEBODY_TASK_TUPLE                                  \
   (overlap, kinetic, elecpot, 1emultipole, 2emultipole, 3emultipole, \
-   sphemultipole, opVop)
+   sphemultipole, opVop, oprop)
 #define BOOST_PP_ONEBODY_TASK_OPER_TUPLE                              \
   (OverlapOper, KineticOper, ElecPotOper, CartesianMultipoleOper<3u>, \
    CartesianMultipoleOper<3u>, CartesianMultipoleOper<3u>,            \
-   SphericalMultipoleOper, σpVσpOper)
+   SphericalMultipoleOper, σpVσpOper, σpRσpOper)
 #define BOOST_PP_ONEBODY_TASK_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_ONEBODY_TASK_TUPLE)
 #define BOOST_PP_ONEBODY_TASK_OPER_LIST \
diff --git "a/src/bin/libint/comp_1_\317\203pR\317\203p_1.h" "b/src/bin/libint/comp_1_\317\203pR\317\203p_1.h"
new file mode 100644
index 000000000..e698c9047
--- /dev/null
+++ "b/src/bin/libint/comp_1_\317\203pR\317\203p_1.h"
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (C) 2004-2026 Edward F. Valeev
+ *
+ *  This file is part of Libint compiler.
+ *
+ *  Libint compiler is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  Libint compiler is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with Libint compiler.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#ifndef LIBINT_COMP_1_ΣPRΣP_1_H
+#define LIBINT_COMP_1_ΣPRΣP_1_H
+
+#include <generic_rr.h>
+
+namespace libint2 {
+
+/**
+ * Computes the integral of \f$ \sigma \cdot \hat{p}\, r_k\, \sigma \cdot
+ * \hat{p} \f$ over CGShell/CGF. For each dipole direction \f$ k \f$ the 9
+ * raw dyadics \f$ \sigma_a \partial_a r_k \sigma_b \partial_b \f$ fold to 4
+ * Pauli-quaternion components via \f$ \sigma_a \sigma_b = \delta_{ab} +
+ * i\epsilon_{abc}\sigma_c \f$ (12 outputs in total); this mirrors the σpVσp
+ * fold with a Cartesian dipole in place of the electrostatic potential V.
+ *
+ * @tparam F basis function type. valid choices are CGShell or CGF
+ */
+template <typename F>
+class CR_1_σpRσp_1
+    : public GenericRecurrenceRelation<
+          CR_1_σpRσp_1<F>, F, GenIntegralSet_1_1<F, σpRσpOper, EmptySet>> {
+ public:
+  using ThisType = CR_1_σpRσp_1<F>;
+  using BasisFunctionType = F;
+  using OperType = σpRσpOper;
+  using TargetType = GenIntegralSet_1_1<F, σpRσpOper, EmptySet>;
+  using ParentType =
+      GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>;
+  friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
+                                         TargetType>;
+  static const unsigned int max_nchildren = 100;
+
+  using ParentType::Instance;
+
+  static bool directional() { return false; }
+
+ private:
+  using ParentType::is_simple;
+  using ParentType::target_;
+  using ParentType::RecurrenceRelation::expr_;
+  using ParentType::RecurrenceRelation::nflops_;
+
+  /// Constructor is private; presumably reached only through Instance
+  CR_1_σpRσp_1(const std::shared_ptr<TargetType> &, unsigned int = 0);
+
+  static std::string descr() { return "CR"; }
+};
+
+/// Constructs the relation for target integral \c Tint by expressing it as a
+/// signed sum of dipole (CartesianMultipoleOper<3u>) integrals over copies of
+/// the bra and ket functions on which deriv().inc() has been applied once.
+/// The dipole direction and the Pauli component are read from the operator
+/// descriptor of \c Tint. The unnamed unsigned int argument is ignored; 0 is
+/// forwarded to the parent constructor.
+template <typename F>
+CR_1_σpRσp_1<F>::CR_1_σpRσp_1(const std::shared_ptr<TargetType> &Tint,
+                              unsigned int)
+    : ParentType(Tint, 0) {
+  // exactly one function is expected in the bra and one in the ket
+  assert(Tint->num_func_bra(/* particle */ 0) == 1);
+  assert(Tint->num_func_ket(/* particle */ 0) == 1);
+  const auto &bra = Tint->bra(0, 0);
+  const auto &ket = Tint->ket(0, 0);
+  const auto &oper = Tint->oper();
+
+  // σ·p r_k σ·p is expressed through derivative integrals of the dipole
+  // operator r_k for primitive Gaussians only; bail out otherwise
+  if (bra.contracted() || ket.contracted()) return;
+
+  using namespace libint2::algebra;
+  using namespace libint2::prefactor;
+  using libint2::algebra::operator*;
+
+  // children: dipole integrals over the differentiated basis functions
+  typedef GenIntegralSet_1_1<BasisFunctionType, CartesianMultipoleOper<3u>,
+                             EmptySet>
+      ChildType;
+  ChildFactory<ThisType, ChildType> factory(this);
+
+  constexpr auto x = 0;
+  constexpr auto y = 1;
+  constexpr auto z = 2;
+
+  // D_bra[c] / D_ket[c]: copy of the bra / ket function differentiated along c
+  F D_bra[] = {F{bra}, F{bra}, F{bra}};
+  F D_ket[] = {F{ket}, F{ket}, F{ket}};
+  for (int c = x; c <= z; ++c) {
+    D_bra[c].deriv().inc(c);
+    D_ket[c].deriv().inc(c);
+  }
+
+  // Build the dipole multipole descriptor for direction k:
+  // r_k = (kx,ky,kz) with k_k = 1, others 0
+  const auto k = oper->descr().dipole_dir();
+  CartesianMultipole_Descr<3u> mu_k;
+  mu_k.inc(k, 1);
+
+  // child integral (∂_i μ | r_k | ∂_j ν)
+  auto dRd = [&](int i, int j) {
+    return factory.make_child(D_bra[i], D_ket[j], EmptySet(), mu_k);
+  };
+
+  // Pauli quaternion fold per (k, q):
+  //   q=0: trace δ_ab → Σ_a (∂_a μ | r_k | ∂_a ν)
+  //   q=1: σ_x antisym → (∂_y μ | r_k | ∂_z ν) − (∂_z μ | r_k | ∂_y ν)
+  //   q=2: σ_y antisym → (∂_z μ | r_k | ∂_x ν) − (∂_x μ | r_k | ∂_z ν)
+  //   q=3: σ_z antisym → (∂_x μ | r_k | ∂_y ν) − (∂_y μ | r_k | ∂_x ν)
+  const auto q = oper->descr().quaternion_index();
+  switch (q) {
+    case 0: {
+      auto xx = dRd(x, x);
+      auto yy = dRd(y, y);
+      auto zz = dRd(z, z);
+      if (is_simple()) {
+        expr_ = xx + yy + zz;
+        nflops_ += 2;
+      }
+    } break;
+    case 1:
+    case 2:
+    case 3: {
+      // the antisymmetric components are cyclic permutations of each other:
+      // (i, j) = (y, z), (z, x), (x, y) for q = 1, 2, 3
+      const int i = q % 3;
+      const int j = (q + 1) % 3;
+      auto ij = dRd(i, j);
+      auto ji = dRd(j, i);
+      if (is_simple()) {
+        expr_ = ij - ji;
+        nflops_ += 1;
+      }
+    } break;
+    default:
+      throw std::runtime_error("CR_1_σpRσp_1: invalid quaternionic index");
+  }
+
+}  // CR_1_σpRσp_1<F>::CR_1_σpRσp_1
+
+};  // namespace libint2
+
+#endif  // LIBINT_COMP_1_ΣPRΣP_1_H
diff --git a/src/bin/libint/master_ints_list.h b/src/bin/libint/master_ints_list.h
index ee9ca39d4..2a6a7cd0f 100644
--- a/src/bin/libint/master_ints_list.h
+++ b/src/bin/libint/master_ints_list.h
@@ -21,12 +21,15 @@
 #ifndef _libint2_src_bin_libint_masterintslist_h_
 #define _libint2_src_bin_libint_masterintslist_h_
 
-// need extra-long mpl list
+// need extra-long mpl list — split across two sub-lists and joined via
+// boost::mpl::joint_view, since boost only ships pre-generated list headers up
+// to size 50.
 #define BOOST_MPL_CFG_NO_PREPROCESSED_HEADERS
 #define BOOST_MPL_LIMIT_LIST_SIZE 50
 #include <bfset.h>
 #include <oper.h>
 
+#include <boost/mpl/joint_view.hpp>
 #include <boost/mpl/list.hpp>
 #if LIBINT_SUPPORT_ONEBODYINTS
 #include <integral_1_1.h>
@@ -47,6 +50,8 @@ typedef GenIntegralSet_1_1<CGShell, ElecPotOper, mType> ElecPot_1_1_sh;
 typedef GenIntegralSet_1_1<CGF, ElecPotOper, mType> ElecPot_1_1_int;
 typedef GenIntegralSet_1_1<CGShell, σpVσpOper, EmptySet> σpVσp_1_1_sh;
 typedef GenIntegralSet_1_1<CGF, σpVσpOper, EmptySet> σpVσp_1_1_int;
+typedef GenIntegralSet_1_1<CGShell, σpRσpOper, EmptySet> σpRσp_1_1_sh;
+typedef GenIntegralSet_1_1<CGF, σpRσpOper, EmptySet> σpRσp_1_1_int;
 typedef GenIntegralSet_1_1<CGShell, CartesianMultipoleOper<3u>, EmptySet>
     CMultipole_1_1_sh;
 typedef GenIntegralSet_1_1<CGF, CartesianMultipoleOper<3u>, EmptySet>
@@ -148,12 +153,14 @@ typedef boost::mpl::list<
     Overlap_1_1_sh_y, Overlap_1_1_int_y, Overlap_1_1_sh_z, Overlap_1_1_int_z,
     Kinetic_1_1_sh, Kinetic_1_1_int, Kinetic_1_1_sh_x, Kinetic_1_1_int_x,
     Kinetic_1_1_sh_y, Kinetic_1_1_int_y, Kinetic_1_1_sh_z, Kinetic_1_1_int_z,
-    ElecPot_1_1_sh, ElecPot_1_1_int, σpVσp_1_1_sh, σpVσp_1_1_int,
-    CMultipole_1_1_sh, CMultipole_1_1_int, CMultipole_1_1_sh_x,
+    ElecPot_1_1_sh, ElecPot_1_1_int, σpVσp_1_1_sh, σpVσp_1_1_int, σpRσp_1_1_sh,
+    σpRσp_1_1_int, CMultipole_1_1_sh, CMultipole_1_1_int, CMultipole_1_1_sh_x,
     CMultipole_1_1_sh_y, CMultipole_1_1_sh_z, CMultipole_1_1_int_x,
     CMultipole_1_1_int_y, CMultipole_1_1_int_z, SMultipole_1_1_sh,
-    SMultipole_1_1_int,
+    SMultipole_1_1_int
 #endif
+    > MasterIntegralTypeList_1body_part;  // empty if one-body ints are off
+typedef boost::mpl::list<
     TwoPRep_11_11_sq, TwoPRep_11_11_int, Coulombσpσp_11_11_sq,
     Coulombσpσp_11_11_int, σpσpCoulombσpσp_11_11_sq, σpσpCoulombσpσp_11_11_int,
     opCoulombop_11_11_sq, opCoulombop_11_11_int, R12kG12_11_11_sq,
@@ -161,6 +168,9 @@ typedef boost::mpl::list<
     TiG12_11_11_sq, TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
     DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
     DummySymmIntegral_11_11_sq, DummySymmIntegral_11_11_int>
+    MasterIntegralTypeList_2body_part;
+typedef boost::mpl::joint_view<MasterIntegralTypeList_1body_part,
+                               MasterIntegralTypeList_2body_part>
     MasterIntegralTypeList;
 
 };  // namespace libint2
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index 62a7bfb08..067ccddf3 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -28,6 +28,7 @@
 #include <comp_11_r12kr12lg12_11.h>
 #include <comp_11_tig12_11.h>
 #include <comp_11_σpσpCoulombσpσp_11.h>
+#include <comp_1_σpRσp_1.h>
 #include <comp_1_σpVσp_1.h>
 #include <comp_deriv_gauss.h>
 #include <comp_deriv_gauss_v2.h>
@@ -182,6 +183,9 @@ typedef VRR_1_ElecPot_1<CGF, InKet> VRR_b_1_ElecPot_1_int;
 typedef CR_1_σpVσp_1<CGShell> CR_1_σpVσp_1_sh;
 typedef CR_1_σpVσp_1<CGF> CR_1_σpVσp_1_int;
 
+typedef CR_1_σpRσp_1<CGShell> CR_1_σpRσp_1_sh;
+typedef CR_1_σpRσp_1<CGF> CR_1_σpRσp_1_int;
+
 // TODO investigate whether need to stay away from HRR for now to be sure that
 // multipoles are computed as precisely as possible
 typedef HRR<SMultipole_1_1_sh, CGShell, 0, InBra, 0, InKet, 0>
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index 8e8a4f567..b231d80cc 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -321,6 +321,45 @@ struct σpVσp_Descr : public Contractable<σpVσp_Descr> {
 };
 typedef GenOper<σpVσp_Descr> σpVσpOper;
 
+/** opRop: (μ σ·p | r | σ·p ν), the one-body dipole analog with σ·p on both
+ *  sides. Via σ_a σ_b = δ_ab + iε_abc σ_c the 9 raw σ_a∂_a r_k σ_b∂_b dyadics
+ *  per dipole direction k fold to 4 Pauli-quaternion components (trace + 3
+ *  antisym), like σpVσp's fold of σ·p V σ·p. This gives 3 dipole directions
+ *  × 4 Pauli components = 12 outputs, addressed by composite_index = 4·k + q.
+ */
+struct σpRσp_Descr : public Contractable<σpRσp_Descr> {
+  typedef MultiplicativeODep1Body_Props Properties;
+  static const unsigned int max_key = 12;
+
+  σpRσp_Descr() : composite_index_(0) {}
+  σpRσp_Descr(int composite_index) : composite_index_(composite_index) {
+    assert(composite_index >= 0 && composite_index < 12);
+  }
+
+  int composite_index() const { return composite_index_; }
+  /// dipole direction ∈ {0=x, 1=y, 2=z}
+  int dipole_dir() const { return composite_index_ / 4; }
+  /// Pauli quaternion ∈ {0=trace, 1=σ_x, 2=σ_y, 3=σ_z}
+  int quaternion_index() const { return composite_index_ % 4; }
+
+  unsigned int key() const { return composite_index(); }
+  /// "opRop[<dipole label>,<Pauli label>]"; aborts if the index is not 0..11
+  std::string description() const {
+    if (composite_index_ < 0 || composite_index_ >= 12) abort();
+    static const char* dipole_lbl[] = {"x", "y", "z"};
+    static const char* pauli_lbl[] = {"0", "X", "Y", "Z"};
+    return std::string("opRop[") + dipole_lbl[dipole_dir()] + "," +
+           pauli_lbl[quaternion_index()] + "]";
+  }
+  std::string label() const { return description(); }
+  int psymm(int i, int j) const { abort(); }
+  int hermitian(int i) const { return +1; }
+
+ private:
+  const int composite_index_ = -1;
+};
+typedef GenOper<σpRσp_Descr> σpRσpOper;
+
 /// cartesian multipole operator in \c NDIM dimensions
 /// \f$ \hat{O}(\vec{k}) \equiv \vec{r}^{\cdot \vec{k}} = r_1^{k_1} r_2^{k_2}
 /// \cdots \f$ \internal OriginDerivative<NDIM> is used to store cartesian
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index e59f6e92f..3f0c36b0c 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -315,6 +315,14 @@ struct MasterStrategy<σpVσp_1_1_int> {
   typedef boost::mpl::list<CR_1_σpVσp_1_int> value;
 };
 template <>
+struct MasterStrategy<σpRσp_1_1_sh> {
+  typedef boost::mpl::list<CR_1_σpRσp_1_sh> value;
+};
+template <>
+struct MasterStrategy<σpRσp_1_1_int> {
+  typedef boost::mpl::list<CR_1_σpRσp_1_int> value;
+};
+template <>
 struct MasterStrategy<Kinetic_1_1_sh> {
   typedef boost::mpl::list<CR_XYZ_1_1<CGShell, KineticOper>> value;
 };

From 86486371071fcbc41c36b92665c13a56ebabe1ed Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Thu, 30 Apr 2026 14:53:03 -0400
Subject: [PATCH 21/22] =?UTF-8?q?Rename=20opCoulombop=20to=20=CF=83pCoulom?=
 =?UTF-8?q?b=CF=83p=20for=20naming=20consistency=20with=20sibling=20RKB=20?=
 =?UTF-8?q?integrals?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 export/tests/unit/test-2body.cc               |  2 +-
 include/libint2/engine.impl.h                 |  4 +-
 src/bin/libint/build_libint.cc                | 10 +--
 .../comp_11_\317\203pCoulomb\317\203p_11.h"   | 66 +++++++++----------
 src/bin/libint/master_ints_list.h             |  8 +--
 src/bin/libint/master_rrs_list.h              |  6 +-
 src/bin/libint/oper.h                         | 10 +--
 src/bin/libint/strategy.cc                    |  8 +--
 8 files changed, 57 insertions(+), 57 deletions(-)
 rename src/bin/libint/comp_11_opCoulombop_11.h => "src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h" (85%)

diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index 60b6cf812..dc0c01b73 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -723,7 +723,7 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
             }
 
             // Project raw dyadic into the 9 SO(3) irrep components used by
-            // op_coulomb_op (must match opCoulombop_Descr::Component order).
+            // op_coulomb_op (must match σpCoulombσp_Descr::Component order).
             const auto Txx = ref_raw[0];
             const auto Txy = ref_raw[1];
             const auto Txz = ref_raw[2];
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index abf0db4e7..01269538d 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -1296,7 +1296,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
       swap_tbra = swap_tket = swap_p1p2;
     }
   } else if (oper_ == Operator::op_coulomb_op) {
-    // opCoulombop: only bra↔ket (particle 1↔2) swap is a symmetry (with
+    // σpCoulombσp: only bra↔ket (particle 1↔2) swap is a symmetry (with
     // (a,b)↔(b,a) component remap). Within-side swap is NOT a symmetry
     // because σ·p attaches to one specific function per side; moving it to
     // the other function changes the integral in a way IBP cannot recover
@@ -1341,7 +1341,7 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
       swap_tbra = swap_tket = swap_p1p2;
     }
   } else if (oper_ == Operator::op_coulomb_op) {
-    // opCoulombop: only bra↔ket swap is a symmetry (with (a,b)↔(b,a) remap).
+    // σpCoulombσp: only bra↔ket swap is a symmetry (with (a,b)↔(b,a) remap).
     // ORCA canonical form: la+lb >= lc+ld only.
     const auto bra_total = tbra1.contr[0].l + tbra2.contr[0].l;
     const auto ket_total = tket1.contr[0].l + tket2.contr[0].l;
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 206cd43c4..67700a630 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -625,7 +625,7 @@ void try_main(int argc, char* argv[]) {
 #define BOOST_PP_RKB_ERI_TASK_TUPLE \
   (coulomb_opop, opop_coulomb_opop, op_coulomb_op)
 #define BOOST_PP_RKB_ERI_TASK_OPER_TUPLE \
-  (CoulombσpσpOper, σpσpCoulombσpσpOper, opCoulombopOper)
+  (CoulombσpσpOper, σpσpCoulombσpσpOper, σpCoulombσpOper)
 #define BOOST_PP_RKB_ERI_TASK_LIST \
   BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI_TASK_TUPLE)
 #define BOOST_PP_RKB_ERI_TASK_OPER_LIST \
@@ -1163,7 +1163,7 @@ static void build_TwoPRep_2b_2k(
   std::shared_ptr<CodeContext> context(new CppCodeContext(cparams));
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
-  // opCoulombop has a 2-fold bra↔ket-swap symmetry, with per-component sign
+  // σpCoulombσp has a 2-fold bra↔ket-swap symmetry, with per-component sign
   // flips under the swap (Antisym* flip sign; Scalar/SymTL* invariant) — this
   // is captured by p1_p2_swappable=true plus a dedicated predicate that
   // canonicalizes only la+lb<=lc+ld and emits code for *all* within-side
@@ -1179,12 +1179,12 @@ static void build_TwoPRep_2b_2k(
     for (unsigned int lb = 0; lb <= lmax; lb++) {
       for (unsigned int lc = 0; lc <= lmax; lc++) {
         for (unsigned int ld = 0; ld <= lmax; ld++) {
-          // opCoulombop: only bra↔ket (particle 1↔2) swap is a symmetry.
+          // σpCoulombσp: only bra↔ket (particle 1↔2) swap is a symmetry.
           // Within-side swap is NOT (σ·p would move to a different physical
           // center; IBP cannot repair the sign across 1/r12). Dedicated
           // predicate canonicalizes la+lb<=lc+ld only (ORCA: >=) and accepts
           // all within-side orderings.
-          if constexpr (std::is_same<OperType, opCoulombopOper>::value) {
+          if constexpr (std::is_same<OperType, σpCoulombσpOper>::value) {
 #if LIBINT_SHELL_SET == LIBINT_SHELL_SET_STANDARD
             if (!(la + lb <= lc + ld)) continue;
 #else
@@ -1229,7 +1229,7 @@ static void build_TwoPRep_2b_2k(
               descrs.emplace_back(OperDescrType(p));
             }
           }
-          if constexpr (std::is_same<OperType, opCoulombopOper>::value) {
+          if constexpr (std::is_same<OperType, σpCoulombσpOper>::value) {
             // reset descriptors array
             descrs.resize(0);
             // iterate over 9 SO(3) irrep components: 1 scalar trace + 3
diff --git a/src/bin/libint/comp_11_opCoulombop_11.h "b/src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h"
similarity index 85%
rename from src/bin/libint/comp_11_opCoulombop_11.h
rename to "src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h"
index 38ae4763e..7391d3d31 100644
--- a/src/bin/libint/comp_11_opCoulombop_11.h
+++ "b/src/bin/libint/comp_11_\317\203pCoulomb\317\203p_11.h"
@@ -18,8 +18,8 @@
  *
  */
 
-#ifndef LIBINT_COMP_11_OPCOULOMBOP_11_H
-#define LIBINT_COMP_11_OPCOULOMBOP_11_H
+#ifndef LIBINT_COMP_11_ΣPCOULOMBΣP_11_H
+#define LIBINT_COMP_11_ΣPCOULOMBΣP_11_H
 
 #include <dims.h>
 #include <entity.h>
@@ -43,15 +43,15 @@ namespace libint2 {
  * @tparam F basis function type. valid choices are CGShell or CGF
  */
 template <typename F>
-class CR_11_opCoulombop_11
+class CR_11_σpCoulombσp_11
     : public GenericRecurrenceRelation<
-          CR_11_opCoulombop_11<F>, F,
-          GenIntegralSet_11_11<F, opCoulombopOper, mType>> {
+          CR_11_σpCoulombσp_11<F>, F,
+          GenIntegralSet_11_11<F, σpCoulombσpOper, mType>> {
  public:
-  typedef CR_11_opCoulombop_11<F> ThisType;
+  typedef CR_11_σpCoulombσp_11<F> ThisType;
   typedef F BasisFunctionType;
-  typedef opCoulombopOper OperType;
-  typedef GenIntegralSet_11_11<F, opCoulombopOper, mType> TargetType;
+  typedef σpCoulombσpOper OperType;
+  typedef GenIntegralSet_11_11<F, σpCoulombσpOper, mType> TargetType;
   typedef GenericRecurrenceRelation<ThisType, BasisFunctionType, TargetType>
       ParentType;
   friend class GenericRecurrenceRelation<ThisType, BasisFunctionType,
@@ -70,7 +70,7 @@ class CR_11_opCoulombop_11
 
   /// Constructor is private, used by Instance that maintains
   /// registry of these objects
-  CR_11_opCoulombop_11(const std::shared_ptr<TargetType>&, unsigned int = 0);
+  CR_11_σpCoulombσp_11(const std::shared_ptr<TargetType>&, unsigned int = 0);
 
   static std::string descr() { return "CR"; }
 
@@ -78,7 +78,7 @@ class CR_11_opCoulombop_11
   // All shell quartets with the same quaternion component share one function.
 
   std::string generate_label() const override {
-    return "CR_opCoulombop_" +
+    return "CR_σpCoulombσp_" +
            std::to_string(target_->oper()->descr().component_index());
   }
 
@@ -153,30 +153,30 @@ class CR_11_opCoulombop_11
     std::string rhs;
     unsigned int nflops = 0;
     switch (comp) {
-      case opCoulombop_Descr::Scalar:
+      case σpCoulombσp_Descr::Scalar:
         rhs = "src0[hsi] + src1[hsi] + src2[hsi]";
         nflops = 2;
         break;
-      case opCoulombop_Descr::AntisymX:
-      case opCoulombop_Descr::AntisymY:
-      case opCoulombop_Descr::AntisymZ:
-      case opCoulombop_Descr::SymTLDiagA:
+      case σpCoulombσp_Descr::AntisymX:
+      case σpCoulombσp_Descr::AntisymY:
+      case σpCoulombσp_Descr::AntisymZ:
+      case σpCoulombσp_Descr::SymTLDiagA:
         rhs = "src0[hsi] - src1[hsi]";
         nflops = 1;
         break;
-      case opCoulombop_Descr::SymTLDiagB:
+      case σpCoulombσp_Descr::SymTLDiagB:
         rhs = "2.0*src0[hsi] - src1[hsi] - src2[hsi]";
         nflops = 3;
         break;
-      case opCoulombop_Descr::SymTLOffXY:
-      case opCoulombop_Descr::SymTLOffXZ:
-      case opCoulombop_Descr::SymTLOffYZ:
+      case σpCoulombσp_Descr::SymTLOffXY:
+      case σpCoulombσp_Descr::SymTLOffXZ:
+      case σpCoulombσp_Descr::SymTLOffYZ:
         rhs = "src0[hsi] + src1[hsi]";
         nflops = 1;
         break;
       default:
         throw std::runtime_error(
-            "CR_11_opCoulombop_11::generate_code: invalid component index");
+            "CR_11_σpCoulombσp_11::generate_code: invalid component index");
     }
     def << "target[hsi] = " << rhs << ";\n}\n";
     def << "/** Number of flops = " << nflops << " */\n";
@@ -186,7 +186,7 @@ class CR_11_opCoulombop_11
 };
 
 template <typename F>
-CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
+CR_11_σpCoulombσp_11<F>::CR_11_σpCoulombσp_11(
     const std::shared_ptr<TargetType>& Tint, unsigned int)
     : ParentType(Tint, 0) {
   assert(Tint->num_func_bra(/* particle */ 0) == 1);
@@ -231,7 +231,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
   };
 
   switch (oper->descr().component_index()) {
-    case opCoulombop_Descr::Scalar: {
+    case σpCoulombσp_Descr::Scalar: {
       auto Txx = T(x, x);
       auto Tyy = T(y, y);
       auto Tzz = T(z, z);
@@ -240,7 +240,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 2;
       }
     } break;
-    case opCoulombop_Descr::AntisymX: {
+    case σpCoulombσp_Descr::AntisymX: {
       auto Tyz = T(y, z);
       auto Tzy = T(z, y);
       if (is_simple()) {
@@ -248,7 +248,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 1;
       }
     } break;
-    case opCoulombop_Descr::AntisymY: {
+    case σpCoulombσp_Descr::AntisymY: {
       auto Tzx = T(z, x);
       auto Txz = T(x, z);
       if (is_simple()) {
@@ -256,7 +256,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 1;
       }
     } break;
-    case opCoulombop_Descr::AntisymZ: {
+    case σpCoulombσp_Descr::AntisymZ: {
       auto Txy = T(x, y);
       auto Tyx = T(y, x);
       if (is_simple()) {
@@ -264,7 +264,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 1;
       }
     } break;
-    case opCoulombop_Descr::SymTLDiagA: {
+    case σpCoulombσp_Descr::SymTLDiagA: {
       auto Txx = T(x, x);
       auto Tyy = T(y, y);
       if (is_simple()) {
@@ -272,7 +272,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 1;
       }
     } break;
-    case opCoulombop_Descr::SymTLDiagB: {
+    case σpCoulombσp_Descr::SymTLDiagB: {
       // 2·T_zz − T_xx − T_yy: child order (Tzz, Txx, Tyy) matches generate_code
       auto Tzz = T(z, z);
       auto Txx = T(x, x);
@@ -282,7 +282,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 3;
       }
     } break;
-    case opCoulombop_Descr::SymTLOffXY: {
+    case σpCoulombσp_Descr::SymTLOffXY: {
       auto Txy = T(x, y);
       auto Tyx = T(y, x);
       if (is_simple()) {
@@ -290,7 +290,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 1;
       }
     } break;
-    case opCoulombop_Descr::SymTLOffXZ: {
+    case σpCoulombσp_Descr::SymTLOffXZ: {
       auto Txz = T(x, z);
       auto Tzx = T(z, x);
       if (is_simple()) {
@@ -298,7 +298,7 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
         nflops_ += 1;
       }
     } break;
-    case opCoulombop_Descr::SymTLOffYZ: {
+    case σpCoulombσp_Descr::SymTLOffYZ: {
       auto Tyz = T(y, z);
       auto Tzy = T(z, y);
       if (is_simple()) {
@@ -307,11 +307,11 @@ CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11(
       }
     } break;
     default:
-      throw std::runtime_error("CR_11_opCoulombop_11: invalid component index");
+      throw std::runtime_error("CR_11_σpCoulombσp_11: invalid component index");
   }
 
-}  // CR_11_opCoulombop_11<F>::CR_11_opCoulombop_11
+}  // CR_11_σpCoulombσp_11<F>::CR_11_σpCoulombσp_11
 
 }  // namespace libint2
 
-#endif  // LIBINT_COMP_11_OPCOULOMBOP_11_H
+#endif  // LIBINT_COMP_11_ΣPCOULOMBΣP_11_H
diff --git a/src/bin/libint/master_ints_list.h b/src/bin/libint/master_ints_list.h
index 2a6a7cd0f..158052fba 100644
--- a/src/bin/libint/master_ints_list.h
+++ b/src/bin/libint/master_ints_list.h
@@ -118,9 +118,9 @@ typedef GenIntegralSet_11_11<CGShell, σpσpCoulombσpσpOper, mType>
     σpσpCoulombσpσp_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, σpσpCoulombσpσpOper, mType>
     σpσpCoulombσpσp_11_11_int;
-typedef GenIntegralSet_11_11<CGShell, opCoulombopOper, mType>
-    opCoulombop_11_11_sq;
-typedef GenIntegralSet_11_11<CGF, opCoulombopOper, mType> opCoulombop_11_11_int;
+typedef GenIntegralSet_11_11<CGShell, σpCoulombσpOper, mType>
+    σpCoulombσp_11_11_sq;
+typedef GenIntegralSet_11_11<CGF, σpCoulombσpOper, mType> σpCoulombσp_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kG12, mType> R12kG12_11_11_sq;
 typedef GenIntegralSet_11_11<CGF, R12kG12, mType> R12kG12_11_11_int;
 typedef GenIntegralSet_11_11<CGShell, R12kR12lG12, EmptySet>
@@ -163,7 +163,7 @@ typedef boost::mpl::list<
 typedef boost::mpl::list<
     TwoPRep_11_11_sq, TwoPRep_11_11_int, Coulombσpσp_11_11_sq,
     Coulombσpσp_11_11_int, σpσpCoulombσpσp_11_11_sq, σpσpCoulombσpσp_11_11_int,
-    opCoulombop_11_11_sq, opCoulombop_11_11_int, R12kG12_11_11_sq,
+    σpCoulombσp_11_11_sq, σpCoulombσp_11_11_int, R12kG12_11_11_sq,
     R12kG12_11_11_int, R12kR12lG12_11_11_sq, R12kR12lG12_11_11_int,
     TiG12_11_11_sq, TiG12_11_11_int, G12TiG12_11_11_sq, G12TiG12_11_11_int,
     DivG12prime_xTx_11_11_sq, DivG12prime_xTx_11_11_int,
diff --git a/src/bin/libint/master_rrs_list.h b/src/bin/libint/master_rrs_list.h
index 067ccddf3..ae4e32311 100644
--- a/src/bin/libint/master_rrs_list.h
+++ b/src/bin/libint/master_rrs_list.h
@@ -24,7 +24,7 @@
 #include <comp_11_Coulombσpσp_11.h>
 #include <comp_11_DivG12prime_xTx_11.h>
 #include <comp_11_g12tig12_11.h>
-#include <comp_11_opCoulombop_11.h>
+#include <comp_11_σpCoulombσp_11.h>
 #include <comp_11_r12kr12lg12_11.h>
 #include <comp_11_tig12_11.h>
 #include <comp_11_σpσpCoulombσpσp_11.h>
@@ -325,8 +325,8 @@ typedef CR_11_Coulombσpσp_11<CGF> CR_11_Coulombσpσp_11_int;
 typedef CR_11_σpσpCoulombσpσp_11<CGShell> CR_11_σpσpCoulombσpσp_11_sh;
 typedef CR_11_σpσpCoulombσpσp_11<CGF> CR_11_σpσpCoulombσpσp_11_int;
 
-typedef CR_11_opCoulombop_11<CGShell> CR_11_opCoulombop_11_sh;
-typedef CR_11_opCoulombop_11<CGF> CR_11_opCoulombop_11_int;
+typedef CR_11_σpCoulombσp_11<CGShell> CR_11_σpCoulombσp_11_sh;
+typedef CR_11_σpCoulombσp_11<CGF> CR_11_σpCoulombσp_11_int;
 };  // namespace libint2
 
 #endif  // header guard
diff --git a/src/bin/libint/oper.h b/src/bin/libint/oper.h
index b231d80cc..29b367378 100644
--- a/src/bin/libint/oper.h
+++ b/src/bin/libint/oper.h
@@ -515,7 +515,7 @@ struct σpσpCoulombσpσp_Descr : public Contractable<σpσpCoulombσpσp_Descr
 };
 typedef GenOper<σpσpCoulombσpσp_Descr> σpσpCoulombσpσpOper;
 
-/** opCoulombop: (μ σ·p ν | 1/r_{12} | κ σ·p λ).
+/** σpCoulombσp: (μ σ·p ν | 1/r_{12} | κ σ·p λ).
  *  Gaunt LS "bilinear" operator with one σ·p on each side.
  *  Outputs the SO(3) irreducible decomposition of the 3×3 gradient-gradient
  *  tensor T_{ab} = ∂_a ∂_b (μν|κλ): 1 scalar trace + 3 antisymmetric
@@ -524,7 +524,7 @@ typedef GenOper<σpσpCoulombσpσp_Descr> σpσpCoulombσpσpOper;
  *  need to hand-build trace/antisym/sym-TL combinations at every contraction
  *  site.
  */
-struct opCoulombop_Descr : public Contractable<opCoulombop_Descr> {
+struct σpCoulombσp_Descr : public Contractable<σpCoulombσp_Descr> {
   typedef MultiplicativeSymm2Body_Props Properties;
 
   /// SO(3) irreducible components of the rank-2 Cartesian tensor T_{ab}.
@@ -540,8 +540,8 @@ struct opCoulombop_Descr : public Contractable<opCoulombop_Descr> {
     SymTLOffYZ = 8,  ///< T_yz + T_zy
   };
 
-  opCoulombop_Descr() : component_index_(0) {}
-  opCoulombop_Descr(int component_index) : component_index_(component_index) {
+  σpCoulombσp_Descr() : component_index_(0) {}
+  σpCoulombσp_Descr(int component_index) : component_index_(component_index) {
     assert(component_index >= 0 && component_index <= 8);
   }
 
@@ -576,7 +576,7 @@ struct opCoulombop_Descr : public Contractable<opCoulombop_Descr> {
  private:
   const int component_index_ = -1;
 };
-typedef GenOper<opCoulombop_Descr> opCoulombopOper;
+typedef GenOper<σpCoulombσp_Descr> σpCoulombσpOper;
 
 /** GTG_1d is the two-body 1-dimensional Gaussian geminal
  */
diff --git a/src/bin/libint/strategy.cc b/src/bin/libint/strategy.cc
index 3f0c36b0c..e01f76a7d 100644
--- a/src/bin/libint/strategy.cc
+++ b/src/bin/libint/strategy.cc
@@ -136,12 +136,12 @@ struct MasterStrategy<σpσpCoulombσpσp_11_11_int> {
   typedef boost::mpl::list<CR_11_σpσpCoulombσpσp_11_int> value;
 };
 template <>
-struct MasterStrategy<opCoulombop_11_11_sq> {
-  typedef boost::mpl::list<CR_11_opCoulombop_11_sh> value;
+struct MasterStrategy<σpCoulombσp_11_11_sq> {
+  typedef boost::mpl::list<CR_11_σpCoulombσp_11_sh> value;
 };
 template <>
-struct MasterStrategy<opCoulombop_11_11_int> {
-  typedef boost::mpl::list<CR_11_opCoulombop_11_int> value;
+struct MasterStrategy<σpCoulombσp_11_11_int> {
+  typedef boost::mpl::list<CR_11_σpCoulombσp_11_int> value;
 };
 
 #if LIBINT_SHELLQUARTET_STRATEGY == LIBINT_SHELLQUARTET_STRATEGY_A0C0

From e5464698cf1a3f1a61f9c27d7fda87af6ed52827 Mon Sep 17 00:00:00 2001
From: Kshitij Surjuse <kshitijsurjuse100@gmail.com>
Date: Tue, 12 May 2026 13:18:18 -0400
Subject: [PATCH 22/22] 3-center integrals in RKB basis

---
 CMakeLists.txt                    |  12 ++
 cmake/modules/int_am.cmake        |  15 +-
 export/tests/unit/test-2body.cc   | 141 ++++++++++++++++++
 include/libint2/config.h.cmake.in |  19 +++
 include/libint2/engine.impl.h     |  31 +++-
 src/bin/libint/build_libint.cc    | 235 +++++++++++++++++++++++++-----
 6 files changed, 408 insertions(+), 45 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90b641733..cd2d49359 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -142,6 +142,10 @@ option_with_default(LIBINT2_ENABLE_ERI2
 option_with_default(LIBINT2_ENABLE_RKB_ERI
         "Compile with support for up to N-th derivatives of relativistic restricted kinetic
          balance (RKB) 4-center electron repulsion integrals (-1 for OFF)" 0)
+option_with_default(LIBINT2_ENABLE_RKB_ERI3
+        "Compile with support for up to N-th derivatives of relativistic restricted kinetic
+         balance (RKB) 3-center electron repulsion integrals (-1 for OFF).
+         σ·p acts on the two paired (AO) centers; the unpaired/fitting center is a spectator." -1)  # -1 = OFF, per the help text
 option_with_default(LIBINT2_ENABLE_G12
         "Compile with support for N-th derivatives of MP2-F12 energies with Gaussian factors (-1 for OFF)" -1)
 option_with_default(LIBINT2_ENABLE_G12DKH
@@ -228,6 +232,14 @@ option_with_default(LIBINT2_RKB_ERI_OPT_AM
         "Optimize relativistic restricted kinetic balance (RKB) 4-center ERIs maximally for up to angular momentum N (N <= max-am).
    Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
 
+option_with_default(LIBINT2_RKB_ERI3_MAX_AM
+        "Support relativistic restricted kinetic balance (RKB) 3-center ERIs for Gaussians of angular momentum up to N.
+   Can specify values for each derivative level as a semicolon-separated string. (default: max_am)
+   This option controls only the single fitting center. The paired centers (on which σ·p acts) use LIBINT2_MAX_AM." -1)
+option_with_default(LIBINT2_RKB_ERI3_OPT_AM
+        "Optimize relativistic restricted kinetic balance (RKB) 3-center ERIs maximally for up to angular momentum N (N <= max-am).
+   Can specify values for each derivative level as a semicolon-separated string (default: (max_am/2)+1)" -1)
+
 
 option_with_default(LIBINT2_ERI3_MAX_AM
         "Support 3-center ERIs for Gaussians of angular momentum up to N.
diff --git a/cmake/modules/int_am.cmake b/cmake/modules/int_am.cmake
index 350924f49..2dbd1904a 100644
--- a/cmake/modules/int_am.cmake
+++ b/cmake/modules/int_am.cmake
@@ -358,6 +358,7 @@ endmacro()
 process_integrals_class(ONEBODY)
 process_integrals_class(ERI)
 process_integrals_class(RKB_ERI)
+process_integrals_class(RKB_ERI3)
 process_integrals_class(ERI3)
 process_integrals_class(ERI2)
 # unlike above, these classes (1) don't do AM_LIST and (2) require value in config.h if enabled
@@ -397,7 +398,7 @@ list(REVERSE _amlist)
 list(APPEND Libint2_ERI_COMPONENTS "${_amlist}")
 message(VERBOSE "setting components ${_amlist}")
 
-foreach(_cls ONEBODY;ERI;RKB_ERI;ERI3;ERI2;G12;G12DKH)
+foreach(_cls ONEBODY;ERI;RKB_ERI;RKB_ERI3;ERI3;ERI2;G12;G12DKH)
     if((_cls STREQUAL G12) OR (_cls STREQUAL G12DKH))
         add_feature_info(
           "integral class ${_cls}"
@@ -448,6 +449,18 @@ foreach(_cls ONEBODY;ERI;RKB_ERI;ERI3;ERI2;G12;G12DKH)
                     endforeach()
                 endforeach()
             endif()
+            if (_cls STREQUAL "RKB_ERI3")
+                # Mirror ERI3 component naming: fitting (single) center on bra,
+                # paired (AO) centers on ket where σ·p acts. Paired-center AM
+                # tracks LIBINT_MAX_AM via _eri3_candidate0_d${_d}.
+                foreach(_lfit RANGE ${LIBINT_HARD_MIN_AM} ${_candidate_${_cls}_d${_d}})  # LIBINT_RKB_ERI3_MAX_AM[_LIST], fitting
+                    foreach(_lpr RANGE ${LIBINT_HARD_MIN_AM} ${_eri3_candidate0_d${_d}})  # LIBINT_MAX_AM[_LIST], paired
+                        if (_lfit GREATER_EQUAL _lpr)
+                            list(APPEND _amlist     "rkb_eri_${_am${_lpr}}${_am${_lpr}}${_AM${_lfit}}_d${_d}")
+                        endif()
+                    endforeach()
+                endforeach()
+            endif()
             list(REVERSE _amlist)
             list(APPEND Libint2_ERI_COMPONENTS "${_amlist}")
             message(VERBOSE "setting components ${_amlist}")
diff --git a/export/tests/unit/test-2body.cc b/export/tests/unit/test-2body.cc
index dc0c01b73..e37a2bd99 100644
--- a/export/tests/unit/test-2body.cc
+++ b/export/tests/unit/test-2body.cc
@@ -774,6 +774,147 @@ TEST_CASE("RKB Coulomb integrals", "[engine][2-body]") {
       }
     }
   }
+
+  SECTION("Coulombσpσp 3-center xs_xx") {
+    // 3-center RKB Coulombσpσp integral, (P | σ·p_μ σ·p_ν / r12) with P on
+    // the bra (fitting/DF) and σ·p acting on the AO pair (μ,ν) on the ket.
+    //
+    // Reference: the same operator and BraKet computed via the 4-center
+    // engine with Shell::unit() at the dummy bra position. The 4-center
+    // engine path is already validated by the "Coulombσpσp and
+    // σpσpCoulombσpσp" SECTION above against a primitive-eri reference, so
+    // by transitivity the 4-center engine is a trustworthy reference for the
+    // 3-center engine. Both code paths share the dummy-shell trick
+    // internally, but they are independently generated (different task
+    // labels, different dispatch tables), so this catches dispatch-table /
+    // codegen wiring mistakes in the new 3-center path.
+
+    Shell dfsh{{1.5}, {{0, false, {1.0}}}, {{-1.0, 0.5, 0.0}}};
+    std::vector<Shell> dfs{dfsh};
+    const auto &unitshell = libint2::Shell::unit();
+
+    Engine engine_3c, engine_4c_ref;
+    try {
+      engine_3c = Engine(Operator::coulomb_opop,
+                         std::max(max_nprim, libint2::max_nprim(dfs)),
+                         std::max(max_l, libint2::max_l(dfs)), 0);
+      engine_3c.set(BraKet::xs_xx);
+      // 4-center reference — same operator, default BraKet::xx_xx
+      engine_4c_ref = Engine(Operator::coulomb_opop,
+                             std::max(max_nprim, libint2::max_nprim(dfs)),
+                             std::max(max_l, libint2::max_l(dfs)), 0);
+    } catch (Engine::lmax_exceeded &) {
+      // skip if libint not configured with -DLIBINT2_ENABLE_RKB_ERI3 >= 0
+      // (or the 4-center RKB ERI was disabled)
+      return;
+    }
+
+    const auto nshell = obs.size();
+    for (size_t sa = 0; sa != nshell; ++sa) {
+      for (size_t sb = 0; sb != nshell; ++sb) {
+        const auto &results_3c =
+            engine_3c.compute(dfsh, obs[sa], obs[sb]);
+        // 4-center reference: (dfsh, unit | sh_a, sh_b). σ·p still acts on
+        // (C, D) = (sh_a, sh_b), matching the 3-center xs_xx mapping.
+        const auto &results_4c_ref =
+            engine_4c_ref.compute(dfsh, unitshell, obs[sa], obs[sb]);
+        REQUIRE(results_3c.size() == 4);  // checked even with NDEBUG
+        REQUIRE(results_4c_ref.size() == 4);
+
+        const auto n_df = dfsh.size();
+        const auto n_a = obs[sa].size();
+        const auto n_b = obs[sb].size();
+        const auto n_total = n_df * n_a * n_b;
+        // 4-center buffer is (df, unit, sa, sb); n_unit = 1; so the linear
+        // layout coincides with the 3-center (df, sa, sb) layout.
+
+        const double ABS_TOL = 5.0E-14;
+        const double REL_TOL = 1.0E-9;
+        for (auto comp = 0; comp < 4; ++comp) {
+          for (size_t k = 0; k < n_total; ++k) {
+            const auto v_3c = results_3c[comp][k];
+            const auto v_ref = results_4c_ref[comp][k];
+            const auto abs_err = std::abs(v_3c - v_ref);
+            const auto rel_err =
+                std::abs(v_ref) > 1e-30 ? std::abs(abs_err / v_ref) : 0.0;
+            bool not_ok = rel_err > REL_TOL && abs_err > ABS_TOL;
+            if (not_ok) {
+              std::cout << "(df | sa=" << sa << " sb=" << sb
+                        << ") comp=" << comp << " elem=" << k
+                        << ": 3c=" << v_3c << " 4c_ref=" << v_ref
+                        << " abs_err=" << abs_err << " rel_err=" << rel_err
+                        << std::endl;
+            }
+            REQUIRE(!not_ok);
+          }
+        }
+      }
+    }
+  }
+
+  SECTION("Coulombσpσp 3-center xx_xs alias") {
+    // The xx_xs braket re-routes through the same xs_xx kernel via the
+    // Engine::compute2 bra↔ket swap. Verify the alias yields the same
+    // integral values, with the output buffer in the user-requested
+    // (sh_a, sh_b | dfsh) index layout.
+
+    Shell dfsh{{1.5}, {{0, false, {1.0}}}, {{-1.0, 0.5, 0.0}}};
+    std::vector<Shell> dfs{dfsh};
+
+    Engine eng_xs, eng_xxs;
+    try {
+      eng_xs = Engine(Operator::coulomb_opop,
+                      std::max(max_nprim, libint2::max_nprim(dfs)),
+                      std::max(max_l, libint2::max_l(dfs)), 0);
+      eng_xs.set(BraKet::xs_xx);
+      eng_xxs = Engine(Operator::coulomb_opop,
+                       std::max(max_nprim, libint2::max_nprim(dfs)),
+                       std::max(max_l, libint2::max_l(dfs)), 0);
+      eng_xxs.set(BraKet::xx_xs);
+    } catch (Engine::lmax_exceeded &) {
+      return;
+    }
+
+    for (size_t sa = 0; sa != obs.size(); ++sa) {
+      for (size_t sb = 0; sb != obs.size(); ++sb) {
+        const auto n_df = dfsh.size();
+        const auto n_a = obs[sa].size();
+        const auto n_b = obs[sb].size();
+
+        const auto &res_xs = eng_xs.compute(dfsh, obs[sa], obs[sb]);
+        // defensively re-assert xx_xs (redundant with the set() call above)
+        eng_xxs.set(BraKet::xx_xs);
+        const auto &res_xxs = eng_xxs.compute(obs[sa], obs[sb], dfsh);
+
+        for (auto c = 0; c < 4; ++c) {
+          for (size_t i_df = 0; i_df < n_df; ++i_df) {
+            for (size_t i_a = 0; i_a < n_a; ++i_a) {
+              for (size_t i_b = 0; i_b < n_b; ++i_b) {
+                const auto v_xs =
+                    res_xs[c][i_df * n_a * n_b + i_a * n_b + i_b];
+                const auto v_xxs =
+                    res_xxs[c][i_a * n_b * n_df + i_b * n_df + i_df];
+                const auto abs_err = std::abs(v_xs - v_xxs);
+                const auto rel_err = std::abs(v_xs) > 1e-30
+                                         ? double(std::abs(abs_err / v_xs))
+                                         : 0.0;
+                bool not_ok = rel_err > 1.0E-9 && abs_err > 5.0E-14;
+                if (not_ok) {
+                  std::cout << "xx_xs alias mismatch: (sa=" << sa
+                            << " sb=" << sb << ") comp=" << c
+                            << " idx(df=" << i_df << ",a=" << i_a
+                            << ",b=" << i_b << "): xs=" << v_xs
+                            << " xxs=" << v_xxs << " abs_err=" << abs_err
+                            << std::endl;
+                }
+                REQUIRE(!not_ok);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
 TEST_CASE("Erfx_Coulomb integrals", "[engine][2-body]") {
diff --git a/include/libint2/config.h.cmake.in b/include/libint2/config.h.cmake.in
index 3eb32e2f0..e07f3cbc8 100644
--- a/include/libint2/config.h.cmake.in
+++ b/include/libint2/config.h.cmake.in
@@ -77,6 +77,12 @@
 #undef LIBINT_INCLUDE_RKB_ERI
 #endif
 
+/* Support 3-center RKB ERI derivatives up to this order */
+#define LIBINT_INCLUDE_RKB_ERI3 @LIBINT_INCLUDE_RKB_ERI3@
+#if @LIBINT_INCLUDE_RKB_ERI3@ == -1
+#undef LIBINT_INCLUDE_RKB_ERI3
+#endif
+
 
 /* Support 3-center ERI derivatives up to this order */
 #define LIBINT_INCLUDE_ERI3 @LIBINT_INCLUDE_ERI3@
@@ -141,6 +147,19 @@
 /* Max optimized AM for ERI and its derivatives */
 #cmakedefine LIBINT_RKB_ERI_OPT_AM_LIST "@LIBINT_RKB_ERI_OPT_AM_LIST@"
 
+/* Max AM for 3-center RKB ERI fitting center (same for all derivatives; if not defined see LIBINT_RKB_ERI3_MAX_AM_LIST).
+   The paired AO centers (on which σ·p acts) use LIBINT_MAX_AM. */
+#cmakedefine LIBINT_RKB_ERI3_MAX_AM @LIBINT_RKB_ERI3_MAX_AM@
+
+/* Max AM for 3-center RKB ERI and its derivatives */
+#cmakedefine LIBINT_RKB_ERI3_MAX_AM_LIST "@LIBINT_RKB_ERI3_MAX_AM_LIST@"
+
+/* Max optimized AM for 3-center RKB ERI (same for all derivatives; if not defined see LIBINT_RKB_ERI3_OPT_AM_LIST) */
+#cmakedefine LIBINT_RKB_ERI3_OPT_AM @LIBINT_RKB_ERI3_OPT_AM@
+
+/* Max optimized AM for 3-center RKB ERI and its derivatives */
+#cmakedefine LIBINT_RKB_ERI3_OPT_AM_LIST "@LIBINT_RKB_ERI3_OPT_AM_LIST@"
+
 /* Max AM for 3-center ERI (same for all derivatives; if not defined see LIBINT_ERI3_MAX_AM_LIST) */
 #cmakedefine LIBINT_ERI3_MAX_AM @LIBINT_ERI3_MAX_AM@
 
diff --git a/include/libint2/engine.impl.h b/include/libint2/engine.impl.h
index 01269538d..85a744768 100644
--- a/include/libint2/engine.impl.h
+++ b/include/libint2/engine.impl.h
@@ -174,9 +174,22 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute(
     if (nargs == 2)
       return (this->*compute_ptr)(shells[0], Shell::unit(), shells[1],
                                   Shell::unit(), nullptr, nullptr);
-    if (nargs == 3)
+    if (nargs == 3) {
+      // The 3-arg user form depends on which 3-center BraKet was set:
+      //   xs_xx — bra = (fitting, unit), ket = (AO, AO).
+      //           User passes (fitting, AO, AO), kernel sees
+      //           (fitting, unit, AO, AO).
+      //   xx_xs — bra = (AO, AO),       ket = (fitting, unit).
+      //           User passes (AO, AO, fitting), kernel sees
+      //           (AO, AO, fitting, unit).
+      // The non-xx_xs branch is the default (covers xs_xx and any future
+      // 3-center variants that follow the xs_xx convention).
+      if (braket_ == BraKet::xx_xs)
+        return (this->*compute_ptr)(shells[0], shells[1], shells[2],
+                                    Shell::unit(), nullptr, nullptr);
       return (this->*compute_ptr)(shells[0], Shell::unit(), shells[1],
                                   shells[2], nullptr, nullptr);
+    }
     if (nargs == 4)
       return (this->*compute_ptr)(shells[0], shells[1], shells[2], shells[3],
                                   nullptr, nullptr);
@@ -2015,9 +2028,11 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
         break;
 
       case BraKet::xx_xs:
-        assert(false && "this braket is not supported");
-        abort();
-        break;
+        // xx_xs is always canonicalized to xs_xx by swap_braket=true in the
+        // shell-aliasing logic above; the swapped shells (bra1, ket1, ket2)
+        // are already in xs_xx layout by this point. Fall through to the
+        // xs_xx case to compute buildfnidx from those swapped shells.
+        [[fallthrough]];
       case BraKet::xs_xx: {
         /// lmax might be center dependent
         int ket_lmax = hard_lmax_;
@@ -2243,6 +2258,14 @@ __libint2_engine_inline const Engine::target_ptr_vec& Engine::compute2(
                 }
                 if (swap_tket && oper_ == Operator::coulomb_opop && s > 0)
                   oper_cart_component_phase = -1.0;
+                // coulomb_opop in BraKet::xx_xs (3-center alias of xs_xx):
+                // swap_braket maps the user's bra (AO pair) onto the kernel's
+                // ket, where σ·p acts. If swap_tbra also reorders that pair,
+                // the antisymmetric x,y,z components (s > 0) change sign; the
+                // scalar s = 0 does not. Guarded to xx_xs, the case above.
+                if (braket_ == BraKet::xx_xs && swap_tbra &&
+                    oper_ == Operator::coulomb_opop && s > 0)
+                  oper_cart_component_phase = -1.0;
                 // op_coulomb_op irrep layout under bra↔ket swap: antisym
                 // components (s ∈ {1,2,3}) flip sign; scalar (0) and sym-TL
                 // (4..8) are invariant. swap_tket is always false for this
diff --git a/src/bin/libint/build_libint.cc b/src/bin/libint/build_libint.cc
index 67700a630..5737e167a 100644
--- a/src/bin/libint/build_libint.cc
+++ b/src/bin/libint/build_libint.cc
@@ -234,9 +234,11 @@ static void build_TwoPRep_2b_2k(
 #endif
 #endif
 
-#ifdef LIBINT_INCLUDE_ERI3
+#if defined(LIBINT_INCLUDE_ERI3) || defined(LIBINT_INCLUDE_RKB_ERI3)
+template <typename OperType>
 static void build_TwoPRep_1b_2k(
-    std::ostream& os, const std::shared_ptr<CompilationParameters>& cparams,
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
     std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level);
 #endif
 
@@ -640,6 +642,32 @@ void try_main(int argc, char* argv[]) {
   }
 #endif
 
+#ifdef LIBINT_INCLUDE_RKB_ERI3
+  // 3-center RKB ERIs. σ·p acts on the AO pair (ket); the fitting center is a
+  // spectator. Task labels are namespaced with the "3" ncenter prefix so they
+  // match the BOOST_PP_NBODYENGINE_MCR3_task expansion in engine.impl.h:
+  //   ncenter("3") + oper("coulomb_opop") + deriv("") = "3coulomb_opop".
+  // The macro tuples mirror BOOST_PP_RKB_ERI_TASK_TUPLE but only the operators
+  // that make sense in xs_xx geometry are listed (opop_coulomb_opop requires
+  // σ·p on both bra and ket, which is incompatible with one side being a
+  // dummy-s fitting shell; it is intentionally omitted).
+#define BOOST_PP_RKB_ERI3_TASK_TUPLE (3coulomb_opop)
+#define BOOST_PP_RKB_ERI3_TASK_OPER_TUPLE (CoulombσpσpOper)
+#define BOOST_PP_RKB_ERI3_TASK_LIST \
+  BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI3_TASK_TUPLE)
+#define BOOST_PP_RKB_ERI3_TASK_OPER_LIST \
+  BOOST_PP_TUPLE_TO_LIST(BOOST_PP_RKB_ERI3_TASK_OPER_TUPLE)
+
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI3; ++d) {
+#define BOOST_PP_RKB_ERI3_MCR1(r, data, elem) \
+  taskmgr.add(task_label(BOOST_PP_STRINGIZE(elem), d));
+
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR1, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR1
+  }
+#endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
     taskmgr.add(task_label("3eri", d));
@@ -781,6 +809,75 @@ void try_main(int argc, char* argv[]) {
   }
 #endif  // LIBINT_INCLUDE_RKB_ERI
 
+#ifdef LIBINT_INCLUDE_RKB_ERI3
+  // Per-task CompilationParameters for 3-center RKB ERIs. Parallel structure
+  // to the LIBINT_INCLUDE_ERI3 block below:
+  //   - center 0 (fitting):    LIBINT_RKB_ERI3_MAX_AM[_LIST]
+  //   - centers 1, 2 (AO pair): LIBINT_MAX_AM[_LIST] (paired-center default)
+  //   - num_bf = 3 (3-center dispatch table is 3-dimensional)
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI3; ++d) {
+#if defined(LIBINT_RKB_ERI3_MAX_AM_LIST)
+#define BOOST_PP_RKB_ERI3_MCR_MAXAM(r, data, elem)                    \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d),            \
+                  token<unsigned int>(LIBINT_RKB_ERI3_MAX_AM_LIST,    \
+                                      ',', d));
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_MAXAM, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_MAXAM
+#elif defined(LIBINT_RKB_ERI3_MAX_AM)
+#define BOOST_PP_RKB_ERI3_MCR_MAXAM(r, data, elem)         \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                  LIBINT_RKB_ERI3_MAX_AM);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_MAXAM, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_MAXAM
+#endif
+#if defined(LIBINT_RKB_ERI3_OPT_AM_LIST)
+#define BOOST_PP_RKB_ERI3_MCR_OPTAM(r, data, elem)                       \
+  cparams->max_am_opt(task_label(BOOST_PP_STRINGIZE(elem), d),           \
+                      token<unsigned int>(LIBINT_RKB_ERI3_OPT_AM_LIST,   \
+                                          ',', d));
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_OPTAM, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_OPTAM
+#elif defined(LIBINT_RKB_ERI3_OPT_AM)
+#define BOOST_PP_RKB_ERI3_MCR_OPTAM(r, data, elem)             \
+  cparams->max_am_opt(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                      LIBINT_RKB_ERI3_OPT_AM);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_OPTAM, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_OPTAM
+#endif
+    // Paired centers (1, 2) follow the default basis AM.
+#if defined(LIBINT_MAX_AM_LIST)
+#define BOOST_PP_RKB_ERI3_MCR_PAIRED(r, data, elem)                       \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d),                \
+                  cparams->max_am(task_label("default", d)), 1);          \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d),                \
+                  cparams->max_am(task_label("default", d)), 2);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_PAIRED, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_PAIRED
+#else
+#define BOOST_PP_RKB_ERI3_MCR_PAIRED(r, data, elem)        \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                  cparams->max_am("default"), 1);          \
+  cparams->max_am(task_label(BOOST_PP_STRINGIZE(elem), d), \
+                  cparams->max_am("default"), 2);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_PAIRED, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_PAIRED
+#endif
+  }
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI3; ++d) {
+#define BOOST_PP_RKB_ERI3_MCR_NBF(r, data, elem) \
+  cparams->num_bf(task_label(BOOST_PP_STRINGIZE(elem), d), 3);
+    BOOST_PP_LIST_FOR_EACH(BOOST_PP_RKB_ERI3_MCR_NBF, _,
+                           BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR_NBF
+  }
+#endif  // LIBINT_INCLUDE_RKB_ERI3
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
 #if defined(LIBINT_ERI3_MAX_AM_LIST)
@@ -1016,9 +1113,20 @@ void try_main(int argc, char* argv[]) {
   }
 #endif
 
+#ifdef LIBINT_INCLUDE_RKB_ERI3
+  for (unsigned int d = 0; d <= LIBINT_INCLUDE_RKB_ERI3; ++d) {
+#define BOOST_PP_RKB_ERI3_MCR7(r, data, i, elem)                              \
+  build_TwoPRep_1b_2k<BOOST_PP_LIST_AT(BOOST_PP_RKB_ERI3_TASK_OPER_LIST, i)>( \
+      os, BOOST_PP_STRINGIZE(elem), cparams, iface, d);
+    BOOST_PP_LIST_FOR_EACH_I(BOOST_PP_RKB_ERI3_MCR7, _,
+                             BOOST_PP_RKB_ERI3_TASK_LIST)
+#undef BOOST_PP_RKB_ERI3_MCR7
+  }
+#endif
+
 #ifdef LIBINT_INCLUDE_ERI3
   for (unsigned int d = 0; d <= LIBINT_INCLUDE_ERI3; ++d) {
-    build_TwoPRep_1b_2k(os, cparams, iface, d);
+    build_TwoPRep_1b_2k<TwoPRep>(os, "3eri", cparams, iface, d);
   }
 #if LIBINT_ERI3_PURE_SH
   iface->to_params(iface->macro_define("ERI3_PURE_SH", 1));
@@ -1398,14 +1506,22 @@ static void build_TwoPRep_2b_2k(
 
 #endif  // LIBINT_INCLUDE_ERI || LIBINT_INCLUDE_RKB_ERI
 
-#ifdef LIBINT_INCLUDE_ERI3
+#if defined(LIBINT_INCLUDE_ERI3) || defined(LIBINT_INCLUDE_RKB_ERI3)
 
-void build_TwoPRep_1b_2k(std::ostream& os,
-                         const std::shared_ptr<CompilationParameters>& cparams,
-                         std::shared_ptr<Libint2Iface>& iface,
-                         unsigned int deriv_level) {
-  const std::string task = task_label("3eri", deriv_level);
-  typedef TwoPRep_11_11_sq TwoPRep_sh_11_11;
+// Templated on OperType so the same dummy-center "3-center via 4-center"
+// machinery serves plain TwoPRep (ERI3) and RKB CoulombσpσpOper (RKB_ERI3).
+// The composite recurrence (CR_11_Coulombσpσp_11) decomposes σ·p_c σ·p_d into
+// derivatives of plain TwoPRep_11_11 children; those children naturally route
+// through DerivGaussV2 via the strategy registered for TwoPRep_11_11 — so no
+// special wiring is needed here for RKB.
+template <typename OperType>
+static void build_TwoPRep_1b_2k(
+    std::ostream& os, std::string label,
+    const std::shared_ptr<CompilationParameters>& cparams,
+    std::shared_ptr<Libint2Iface>& iface, unsigned int deriv_level) {
+  const std::string task = task_label(label, deriv_level);
+  typedef GenIntegralSet_11_11<CGShell, OperType, mType> TwoBody_sh_11_11;
+  typedef typename OperType::Descriptor OperDescrType;
   vector<CGShell*> shells;
   const unsigned int lmax = cparams->max_am(task);
   const unsigned int lmax_default =
@@ -1437,6 +1553,8 @@ void build_TwoPRep_1b_2k(std::ostream& os,
   std::shared_ptr<CodeContext> context(new CppCodeContext(cparams));
   std::shared_ptr<MemoryManager> memman(new WorstFitMemoryManager());
 
+  const auto nullaux = typename TwoBody_sh_11_11::AuxIndexType(0u);
+
   for (unsigned int lbra = 0; lbra <= lmax; lbra++) {
     for (unsigned int lc = 0; lc <= lmax_default; lc++) {
       for (unsigned int ld = 0; ld <= lmax_default; ld++) {
@@ -1462,11 +1580,19 @@ void build_TwoPRep_1b_2k(std::ostream& os,
         if (!(lbra == lim && lc == lim && ld == lim)) continue;
 #endif
 
+        // operator components: 1 for plain TwoPRep, 4 for RKB CoulombσpσpOper
+        std::vector<OperDescrType> descrs(1);
+        if constexpr (std::is_same<OperType, CoulombσpσpOper>::value) {
+          descrs.resize(0);
+          for (int p = 0; p != 4; ++p) descrs.emplace_back(OperDescrType(p));
+        }
+        const auto nopers = descrs.size();
+
         // unroll only if max_am <= cparams->max_am_opt(task)
         using std::max;
         const unsigned int max_am = max(max(lc, ld), lbra);
         const bool need_to_optimize = (max_am <= cparams->max_am_opt(task));
-        const bool need_to_unroll = l_to_cgshellsize(lbra) *
+        const bool need_to_unroll = nopers * l_to_cgshellsize(lbra) *
                                         l_to_cgshellsize(lc) *
                                         l_to_cgshellsize(ld) <=
                                     cparams->unroll_threshold();
@@ -1475,10 +1601,15 @@ void build_TwoPRep_1b_2k(std::ostream& os,
                 ? std::numeric_limits<unsigned int>::max()
                 : 0;
         dg_xxx->registry()->unroll_threshold(unroll_threshold);
-        dg_xxx->registry()->do_cse(need_to_optimize);
-        dg_xxx->registry()->condense_expr(condense_expr(
-            cparams->unroll_threshold(), cparams->max_vector_length() > 1));
-        // dg_xxx->registry()->condense_expr(true);
+        // For multi-component operators (RKB), components share no
+        // intermediates, so CSE/condense_expr is pure overhead — disable
+        // (mirrors build_TwoPRep_2b_2k).
+        const bool do_optimize = (nopers > 1) ? false : need_to_optimize;
+        dg_xxx->registry()->do_cse(do_optimize);
+        dg_xxx->registry()->condense_expr(
+            do_optimize ? condense_expr(cparams->unroll_threshold(),
+                                        cparams->max_vector_length() > 1)
+                        : false);
         //  Need to accumulate integrals?
         dg_xxx->registry()->accumulate_targets(cparams->accumulate_targets());
 
@@ -1487,7 +1618,7 @@ void build_TwoPRep_1b_2k(std::ostream& os,
         ////////////
         // NB translational invariance is now handled by CR_DerivGauss
         CartesianDerivIterator<3> diter(deriv_level);
-        std::vector<std::shared_ptr<TwoPRep_sh_11_11>> targets;
+        std::vector<std::shared_ptr<TwoBody_sh_11_11>> targets;
         bool last_deriv = false;
         do {
           CGShell a = (dummy_center == 0) ? CGShell::unit() : CGShell(lbra);
@@ -1495,8 +1626,12 @@ void build_TwoPRep_1b_2k(std::ostream& os,
           CGShell c(lc);
           CGShell d(ld);
 #if LIBINT_ERI3_PURE_SH
-          if (dummy_center == 1 && deriv_level == 0) a.pure_sh(true);
-          if (dummy_center == 0 && deriv_level == 0) b.pure_sh(true);
+          // pure-SH on the fitting center is meaningful only for plain ERI3;
+          // RKB CoulombσpσpOper does not assume the fitting center is pure SH.
+          if constexpr (std::is_same<OperType, TwoPRep>::value) {
+            if (dummy_center == 1 && deriv_level == 0) a.pure_sh(true);
+            if (dummy_center == 0 && deriv_level == 0) b.pure_sh(true);
+          }
 #endif
 
           unsigned int center = 0;
@@ -1511,19 +1646,21 @@ void build_TwoPRep_1b_2k(std::ostream& os,
             ++center;
           }
 
-          // use 4-center integrals
-          std::shared_ptr<TwoPRep_sh_11_11> abcd =
-              TwoPRep_sh_11_11::Instance(a, b, c, d, mType(0u));
-          targets.push_back(abcd);
+          // emit one target per operator component (1 for plain ERI3, 4 for
+          // RKB CoulombσpσpOper)
+          for (unsigned int op = 0; op != descrs.size(); ++op) {
+            OperType oper(descrs[op]);
+            std::shared_ptr<TwoBody_sh_11_11> abcd =
+                TwoBody_sh_11_11::Instance(a, b, c, d, nullaux, oper);
+            targets.push_back(abcd);
+          }
           last_deriv = diter.last();
           if (!last_deriv) diter.next();
         } while (!last_deriv);
         // append all derivatives as targets to the graph
-        for (std::vector<std::shared_ptr<TwoPRep_sh_11_11>>::const_iterator t =
-                 targets.begin();
-             t != targets.end(); ++t) {
+        for (auto t = targets.begin(); t != targets.end(); ++t) {
           std::shared_ptr<DGVertex> t_ptr =
-              std::dynamic_pointer_cast<DGVertex, TwoPRep_sh_11_11>(*t);
+              std::dynamic_pointer_cast<DGVertex, TwoBody_sh_11_11>(*t);
           dg_xxx->append_target(t_ptr);
         }
 
@@ -1536,27 +1673,44 @@ void build_TwoPRep_1b_2k(std::ostream& os,
           CGShell c(lc);
           CGShell d(ld);
 #if LIBINT_ERI3_PURE_SH
-          if (dummy_center == 1 && deriv_level == 0) a.pure_sh(true);
-          if (dummy_center == 0 && deriv_level == 0) b.pure_sh(true);
+          if constexpr (std::is_same<OperType, TwoPRep>::value) {
+            if (dummy_center == 1 && deriv_level == 0) a.pure_sh(true);
+            if (dummy_center == 0 && deriv_level == 0) b.pure_sh(true);
+          }
 #endif
-          std::shared_ptr<TwoPRep_sh_11_11> abcd =
-              TwoPRep_sh_11_11::Instance(a, b, c, d, mType(0u));
-          abcd_label = abcd->label();
+          if constexpr (std::is_same<OperType, TwoPRep>::value) {
+            OperType oper(descrs[0]);
+            std::shared_ptr<TwoBody_sh_11_11> abcd =
+                TwoBody_sh_11_11::Instance(a, b, c, d, nullaux, oper);
+            abcd_label = abcd->label();
+          } else {
+            // For RKB operators, build the label by hand (matches the
+            // build_TwoPRep_2b_2k convention) so that the operator name is
+            // embedded in the source-file name without depending on the
+            // TwoPRep label format.
+            std::ostringstream oss;
+            oss << "_" << a.label() << "_" << b.label();
+            oss << "_" << label;
+            oss << "_" << c.label() << "_" << d.label();
+            abcd_label = oss.str();
+          }
         }
         // + derivative level (if deriv_level > 0)
-        std::string label;
+        std::string eval_label;
         {
-          label = "";
+          eval_label = "";
           if (deriv_level != 0) {
             std::ostringstream oss;
             oss << "deriv" << deriv_level;
-            label += oss.str();
+            eval_label += oss.str();
+          }
+          if constexpr (std::is_same<OperType, TwoPRep>::value) {
+            eval_label += "eri3";
           }
-          label += "eri3";
-          label += abcd_label;
+          eval_label += abcd_label;
         }
 
-        g_progress.current_task = label;
+        g_progress.current_task = eval_label;
         g_progress.print();
 
         std::string prefix(cparams->source_directory());
@@ -1566,7 +1720,7 @@ void build_TwoPRep_1b_2k(std::ostream& os,
         // this will generate code for this targets, and potentially generate
         // code for its prerequisites
         GenerateCode(dg_xxx, context, cparams, strat, tactic, memman,
-                     decl_filenames, def_filenames, prefix, label, false);
+                     decl_filenames, def_filenames, prefix, eval_label, false);
 
         // update max stack size and # of targets
         const std::shared_ptr<TaskParameters>& tparams =
@@ -1580,7 +1734,7 @@ void build_TwoPRep_1b_2k(std::ostream& os,
         ostringstream oss;
         oss << context->label_to_function_name("libint2_build_" + task) << "["
             << lbra << "][" << lc << "][" << ld
-            << "] = " << context->label_to_function_name(label)
+            << "] = " << context->label_to_function_name(eval_label)
             << context->end_of_stat() << endl;
         iface->to_static_init(oss.str());
 
@@ -1602,7 +1756,8 @@ void build_TwoPRep_1b_2k(std::ostream& os,
     }    // end of c loop
   }      // end of bra loop
 }
-#endif  // LIBINT_INCLUDE_ERI3
+
+#endif  // LIBINT_INCLUDE_ERI3 || LIBINT_INCLUDE_RKB_ERI3
 
 #ifdef LIBINT_INCLUDE_ERI2