From bf55959d68100def224b47a7aa1a922e77475f06 Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Fri, 24 Apr 2026 16:18:24 -0700
Subject: [PATCH 1/7] [claude] Port of SNLSTrDlDenseG solver to ExaNewtonSolver
 Largely had Claude drive this port as it the original SNLS trust region
 dogleg solver was relatively straightforward. Most of it would map over to
 MFEM's framework and wasn't overly complicated. The big boon though from
 having Claude do this is I also had it add the necessary mult transpose
 operators for the nonlinearform integrators and in particular this was done
 for the PA forms as the EA was trivial and the full matrix version already
 implemented it. I was also surprised that it was able to do the BBar PA Grad
 implementation of everything as well. The math behind it was actually more
 straightforward than I was expecting, and it was cool to see how it could
 derive all of it using different relationship it had discovered from looking
 at the full integration form.

---
 src/CMakeLists.txt                          |   2 +
 src/fem_operators/mechanics_integrators.cpp | 543 ++++++++++++++++++++
 src/fem_operators/mechanics_integrators.hpp | 109 +++-
 src/options/option_enum.cpp                 |   7 +-
 src/options/option_parser_v2.cpp            |  40 ++
 src/options/option_parser_v2.hpp            | 112 +++-
 src/options/option_solvers.cpp              | 152 +++++-
 src/solvers/trust_region_solver.cpp         | 350 +++++++++++++
 src/solvers/trust_region_solver.hpp         | 357 +++++++++++++
 src/system_driver.cpp                       |  40 +-
 10 files changed, 1698 insertions(+), 14 deletions(-)
 create mode 100644 src/solvers/trust_region_solver.cpp
 create mode 100644 src/solvers/trust_region_solver.hpp

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 24e830a..af8cd1a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -21,6 +21,7 @@ set(EXACONSTIT_HEADERS
     postprocessing/mechanics_lightup.hpp
     sim_state/simulation_state.hpp
     solvers/mechanics_solver.hpp
+    solvers/trust_region_solver.hpp
     utilities/dynamic_function_loader.hpp
     utilities/mechanics_kernels.hpp
     utilities/mechanics_log.hpp
@@ -59,6 +60,7 @@ set(EXACONSTIT_SOURCES
     postprocessing/mechanics_lightup.cpp
     sim_state/simulation_state.cpp
     solvers/mechanics_solver.cpp
+    solvers/trust_region_solver.cpp
     utilities/mechanics_kernels.cpp
     utilities/unified_logger.cpp
     )
diff --git a/src/fem_operators/mechanics_integrators.cpp b/src/fem_operators/mechanics_integrators.cpp
index 9ade98d..2bc9d9d 100644
--- a/src/fem_operators/mechanics_integrators.cpp
+++ b/src/fem_operators/mechanics_integrators.cpp
@@ -667,6 +667,113 @@ void ExaNLFIntegrator::AddMultGradPA(const mfem::Vector& x, mfem::Vector& y) con
     } // End of if statement
 }
 
+// -----------------------------------------------------------------------------
+// ExaNLFIntegrator::AddMultTransposeGradPA
+//
+// Native PA kernel computing y += K^T * x where K = B^T D B is the standard
+// (non-BBar) tangent stiffness. Mirrors AddMultGradPA exactly except for the
+// contraction order against the assembled 4th-order tensor D.
+//
+// Algorithm per element, per quadrature point:
+//   1. Compute physical velocity gradient from input vector and shape function
+//      derivatives:
+//         Gx(i,k) = sum_a Gt(a,i,qpt) * X(a,k,elem)
+//      This is the same operation as the forward kernel since B is independent
+//      of the gradient transposition.
+//
+//   2. Apply the TRANSPOSED D tensor contraction:
+//         T(l,n) = sum_{i,k} D(i,k,l,n,qpt,elem) * Gx(i,k)
+//      whereas the forward kernel does
+//         T(i,k) = sum_{l,n} D(i,k,l,n,qpt,elem) * Gx(l,n)
+//      The difference is *which pair* of D's indices are summed against Gx.
+//      For symmetric C, D has major symmetry D(i,k,l,n) = D(l,n,i,k) and the
+//      two contractions agree; for non-symmetric C they disagree.
+//
+//   3. Apply test-function gradients (same operation as forward kernel):
+//         Y(a,n) += sum_l Gt(a,l,qpt) * T(l,n)
+//
+// All quadrature weights and Jacobian determinants are baked into D from the
+// AssembleGradPA step, so this kernel does not need to reapply them.
+// -----------------------------------------------------------------------------
+void ExaNLFIntegrator::AddMultTransposeGradPA(const mfem::Vector &x,
+                                              mfem::Vector &y) const
+{
+    CALI_CXX_MARK_SCOPE("enlfi_amTGPA");
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+    else {
+        const int dim = 3;
+        const int DIM3 = 3;
+        const int DIM6 = 6;
+
+        std::array<RAJA::idx_t, DIM3> perm3 {{ 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM6> perm6 {{ 5, 4, 3, 2, 1, 0 } };
+
+        // D tensor from AssembleGradPA: D(elem, qpt, i, k, l, n)
+        // The leading dim being elem matches the ordering used in the forward kernel.
+        RAJA::Layout<DIM6> layout_tensor =
+            RAJA::make_permuted_layout({{ dim, dim, dim, dim, nqpts, nelems } }, perm6);
+        RAJA::View<const double, RAJA::Layout<DIM6, RAJA::Index_type, 0> > D(pa_dmat.Read(),
+                                                                            layout_tensor);
+
+        // Field variables: input/output E-vectors
+        RAJA::Layout<DIM3> layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > X(x.Read(), layout_field);
+        RAJA::View<double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Y(y.ReadWrite(), layout_field);
+
+        // Reference shape function derivatives: Gt(node, dim, qpt)
+        RAJA::Layout<DIM3> layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Gt(grad.Read(), layout_grads);
+
+        const int nqpts_ = nqpts;
+        const int dim_ = dim;
+        const int nnodes_ = nnodes;
+
+        mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int i_elems) {
+            for (int j_qpts = 0; j_qpts < nqpts_; j_qpts++) {
+            // Step 1: Compute velocity gradient at this quadrature point
+            //   Gx(i, k) = sum_a Gt(a, i, qpt) * X(a, k, elem)
+            double Gx[3][3];
+            for (int ii = 0; ii < dim_; ii++) {
+                for (int kk = 0; kk < dim_; kk++) {
+                    Gx[ii][kk] = 0.0;
+                    for (int a = 0; a < nnodes_; a++) {
+                        Gx[ii][kk] += Gt(a, ii, j_qpts) * X(a, kk, i_elems);
+                    }
+                }
+            }
+
+            // Step 2: Apply TRANSPOSED D contraction
+            //   T(l, n) = sum_{i,k} D(i, k, l, n, qpt, elem) * Gx(i, k)
+            // Compare to forward kernel:
+            //   T(i, k) = sum_{l,n} D(i, k, l, n, qpt, elem) * Gx(l, n)
+            double T[3][3];
+            for (int ll = 0; ll < dim_; ll++) {
+                for (int nn = 0; nn < dim_; nn++) {
+                    T[ll][nn] = 0.0;
+                    for (int ii = 0; ii < dim_; ii++) {
+                        for (int kk = 0; kk < dim_; kk++) {
+                        T[ll][nn] += D(i_elems, j_qpts, ii, kk, ll, nn) * Gx[ii][kk];
+                        }
+                    }
+                }
+            }
+
+            // Step 3: Apply test-function gradients (same as forward kernel)
+            //   Y(a, n) += sum_l Gt(a, l, qpt) * T(l, n)
+            for (int nn = 0; nn < dim_; nn++) {
+                for (int ll = 0; ll < dim_; ll++) {
+                    for (int a = 0; a < nnodes_; a++) {
+                        Y(a, nn, i_elems) += Gt(a, ll, j_qpts) * T[ll][nn];
+                    }
+                }
+            }
+            } // End of nqpts
+        }); // End of nelems
+    } // End of else (3D path)
+}
+
 // This assembles the diagonal of our LHS which can be used as a preconditioner
 void ExaNLFIntegrator::AssembleGradDiagonalPA(mfem::Vector& diag) const {
     CALI_CXX_MARK_SCOPE("enlfi_AssembleGradDiagonalPA");
@@ -1257,6 +1364,70 @@ void ICExaNLFIntegrator::AssembleElementGrad(const mfem::FiniteElement& el,
     return;
 }
 
+// -----------------------------------------------------------------------------
+// ICExaNLFIntegrator::AssembleGradPA
+//
+// Sets up geometric data and ensures element-averaged derivatives are ready.
+// The B-bar gradient PA does NOT pre-assemble a D tensor (unlike the base
+// class) because the volumetric correction couples element-constant data
+// (volume-averaged derivatives N̄) with per-quadrature-point data (C, adj(J))
+// in a way that does not fold cleanly into a single pre-assembled tensor.
+// Instead, AddMultGradPA / AddMultTransposeGradPA access C directly from the
+// quadrature function and apply the B-bar action on the fly in physical space.
+// -----------------------------------------------------------------------------
+void ICExaNLFIntegrator::AssembleGradPA(const mfem::Vector &/* x */,
+                                        const mfem::FiniteElementSpace &fes)
+{
+    this->AssembleGradPA(fes);
+}
+
+void ICExaNLFIntegrator::AssembleGradPA(const mfem::FiniteElementSpace &fes)
+{
+    CALI_CXX_MARK_SCOPE("icenlfi_assembleGradPA");
+
+    mfem::Mesh *mesh = fes.GetMesh();
+    const mfem::FiniteElement &el = *fes.GetFE(0);
+    space_dims = el.GetDim();
+    const mfem::IntegrationRule *ir =
+        &(mfem::IntRules.Get(el.GetGeomType(), 2 * el.GetOrder() + 1));
+
+    nqpts = ir->GetNPoints();
+    nnodes = el.GetDof();
+    nelems = fes.GetNE();
+
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+
+    // Cache geometric factors (Jacobians at quadrature points)
+    geom = mesh->GetGeometricFactors(*ir, mfem::GeometricFactors::JACOBIANS);
+
+    // Cache reference shape function derivatives
+    if (grad.Size() != (nqpts * space_dims * nnodes)) {
+        grad.SetSize(nqpts * space_dims * nnodes, mfem::Device::GetMemoryType());
+        {
+            mfem::DenseMatrix DSh;
+            const int offset = nnodes * space_dims;
+            double *qpts_dshape_data = grad.HostReadWrite();
+            for (int i = 0; i < nqpts; i++) {
+            const mfem::IntegrationPoint &ip = ir->IntPoint(i);
+            DSh.UseExternalData(&qpts_dshape_data[offset * i], nnodes, space_dims);
+            el.CalcDShape(ip, DSh);
+            }
+        }
+        grad.UseDevice(true);
+    }
+
+    // Element-averaged derivatives N̄(a, k, elem) are computed by AssemblePA().
+    // If they have not been computed yet, force a call now so the gradient PA
+    // kernels can use them. The AssemblePA path is idempotent and safe to call
+    // even if it has been called previously (it re-zeroes and recomputes).
+    if (elem_deriv_shapes.Size() != (nnodes * space_dims * nelems)) {
+        this->AssemblePA(fes);
+    }
+}
+
+
 /// Method defining element assembly.
 /** The result of the element assembly is added and stored in the @a emat
     Vector. */
@@ -1265,6 +1436,7 @@ void ICExaNLFIntegrator::AssembleGradEA(const mfem::Vector& /*x*/,
                                         mfem::Vector& emat) {
     AssembleEA(fes, emat);
 }
+
 void ICExaNLFIntegrator::AssembleEA(const mfem::FiniteElementSpace& fes, mfem::Vector& emat) {
     CALI_CXX_MARK_SCOPE("icenlfi_assembleEA");
     const mfem::FiniteElement& el = *fes.GetFE(0);
@@ -2014,6 +2186,377 @@ void ICExaNLFIntegrator::AssemblePA(const mfem::FiniteElementSpace& fes) {
     } // End of space dims if else
 }
 
+// -----------------------------------------------------------------------------
+// ICExaNLFIntegrator::AddMultGradPA
+//
+// Native B-bar tangent stiffness PA action: y += K̄ * x where
+//   K̄ = ∫ B̄^T C B̄ dΩ
+// and B̄ is the B-bar strain-displacement matrix from Hughes (1980).
+//
+// Because B̄ couples element-constant volume-averaged data with per-qpt data,
+// we work in physical space and access C directly from the simulation state's
+// tangent stiffness quadrature function.
+//
+// Algorithm per element, per quadrature point (q):
+//   1. Hoist tr_bar (element-constant) outside the qpt loop:
+//        tr_bar = sum_{a,k} N̄(a,k) * V(a,k)
+//      This is the volume-averaged trace of the velocity gradient that B̄
+//      uses in place of the per-qpt trace.
+//
+//   2. Compute the adjugate matrix and Jacobian determinant from the cached
+//      Jacobian. Adjugate is used to transform reference derivatives Gt to
+//      physical derivatives:
+//        dN(a,j) = (1/detJ) * sum_k Gt(a,k,q) * adj(j,k)
+//      (Adjugate uses inverse-transpose convention; same as in the standard
+//      ExaNLFIntegrator AssembleGradPA kernel.)
+//
+//   3. Compute physical velocity gradient:
+//        L(i,j) = sum_a dN(a,j) * V(a,i)
+//
+//   4. Compute B-bar trace correction:
+//        Δtr = (tr_bar - tr(L)) / 3
+//      and modified velocity gradient:
+//        L̄(i,j) = L(i,j) + δ_ij * Δtr
+//      which replaces the volumetric trace of L with tr_bar (Hughes' B-bar).
+//
+//   5. Apply material tangent (forward direction):
+//        σ'(j,k) = sum_{l,m} C(j,k,l,m) * L̄(l,m)
+//      C is fetched on the fly from the tangent_stiffness quadrature function.
+//
+//   6. Compute pressure (volumetric) part of σ':
+//        p' = (1/3) * tr(σ')
+//
+//   7. Accumulate into Y with B-bar test side. The test side replaces the
+//      pressure contribution to nodal forces using the volume-averaged
+//      derivatives N̄ in place of the per-qpt dN:
+//        Y(a,k) += [sum_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p'] * w * detJ
+//      The first term is the standard B^T σ' force, the second redirects the
+//      pressure piece through N̄.
+//
+// Verification properties:
+//   - For symmetric C, the result must equal the forward action of any
+//     symmetric formulation (B̄^T C B̄ is symmetric).
+//   - For a uniform-Jacobian mesh where tr_bar agrees with the per-qpt
+//     trace, Δtr → 0 at every qpt and the result must match the standard
+//     (non-B-bar) result.
+// -----------------------------------------------------------------------------
+void ICExaNLFIntegrator::AddMultGradPA(const mfem::Vector &x,
+                                       mfem::Vector &y) const
+{
+    CALI_CXX_MARK_SCOPE("icenlfi_amGPA");
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+    else {
+        const int dim = 3;
+        const int DIM3 = 3;
+        const int DIM4 = 4;
+        const int DIM6 = 6;
+
+        std::array<RAJA::idx_t, DIM3> perm3 {{ 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM4> perm4 {{ 3, 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM6> perm6 {{ 5, 4, 3, 2, 1, 0 } };
+
+        // Input / output E-vectors
+        RAJA::Layout<DIM3> layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > X(x.Read(), layout_field);
+        RAJA::View<double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Y(y.ReadWrite(), layout_field);
+
+        // Reference shape function derivatives Gt(node, dim, qpt)
+        RAJA::Layout<DIM3> layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Gt(grad.Read(), layout_grads);
+
+        // Element-averaged derivatives N̄(node, dim, elem)
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Nbar(elem_deriv_shapes.Read(),
+                                                                                layout_field);
+
+        // Mesh Jacobians J(dim, dim, qpt, elem) — column-major mfem convention
+        RAJA::Layout<DIM4> layout_jac = RAJA::make_permuted_layout({{ dim, dim, nqpts, nelems } }, perm4);
+        RAJA::View<const double, RAJA::Layout<DIM4, RAJA::Index_type, 0> > J_data(geom->J.Read(), layout_jac);
+
+        // Material tangent C(j, k, l, m, qpt, elem) from quadrature function
+        auto tangent_qf = m_sim_state->GetQuadratureFunction("tangent_stiffness");
+        RAJA::Layout<DIM6> layout_C = RAJA::make_permuted_layout(
+            {{ dim, dim, dim, dim, nqpts, nelems } }, perm6);
+        RAJA::View<const double, RAJA::Layout<DIM6, RAJA::Index_type, 0> > C(tangent_qf->Read(), layout_C);
+
+        // Integration weights from the tangent stiffness QF integration rule
+        const mfem::IntegrationRule &ir =
+            tangent_qf->GetSpace()->GetIntRule(0);
+        auto W = ir.GetWeights().Read();
+
+        const int nqpts_ = nqpts;
+        const int dim_ = dim;
+        const int nnodes_ = nnodes;
+
+        mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int e) {
+            // Step 1: Hoist tr_bar outside the qpt loop (element-constant)
+            double tr_bar = 0.0;
+            for (int a = 0; a < nnodes_; a++) {
+            for (int k = 0; k < dim_; k++) {
+                tr_bar += Nbar(a, k, e) * X(a, k, e);
+            }
+            }
+
+            for (int q = 0; q < nqpts_; q++) {
+            // Step 2: Compute adjugate and Jacobian determinant
+            const double J11 = J_data(0, 0, q, e), J12 = J_data(1, 0, q, e),
+                            J13 = J_data(2, 0, q, e);
+            const double J21 = J_data(0, 1, q, e), J22 = J_data(1, 1, q, e),
+                            J23 = J_data(2, 1, q, e);
+            const double J31 = J_data(0, 2, q, e), J32 = J_data(1, 2, q, e),
+                            J33 = J_data(2, 2, q, e);
+
+            double adj[9];
+            adj[0] = (J22 * J33) - (J23 * J32); // 0,0
+            adj[1] = (J23 * J31) - (J21 * J33); // 0,1
+            adj[2] = (J21 * J32) - (J22 * J31); // 0,2
+            adj[3] = (J13 * J32) - (J12 * J33); // 1,0
+            adj[4] = (J11 * J33) - (J13 * J31); // 1,1
+            adj[5] = (J12 * J31) - (J11 * J32); // 1,2
+            adj[6] = (J12 * J23) - (J13 * J22); // 2,0
+            adj[7] = (J13 * J21) - (J11 * J23); // 2,1
+            adj[8] = (J11 * J22) - (J12 * J21); // 2,2
+
+            const double detJ = J11 * adj[0] + J21 * adj[3] + J31 * adj[6];
+            const double idetJ = 1.0 / detJ;
+            const double w_detJ = W[q] * detJ;
+
+            // Step 3: Physical velocity gradient L(i,j) = sum_a dN(a,j) * V(a,i)
+            // We compute dN(a, :) on-the-fly from Gt and adj.
+            double L[3][3] = {{ 0.0 } };
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int i = 0; i < dim_; i++) {
+                    for (int j = 0; j < dim_; j++) {
+                        L[i][j] += dNa[j] * X(a, i, e);
+                    }
+                }
+            }
+
+            // Step 4: B-bar trace correction
+            const double tr_std = L[0][0] + L[1][1] + L[2][2];
+            const double dtr = (tr_bar - tr_std) / 3.0;
+
+            double Lbar[3][3];
+            for (int i = 0; i < dim_; i++) {
+                for (int j = 0; j < dim_; j++) {
+                    Lbar[i][j] = L[i][j];
+                }
+            }
+            Lbar[0][0] += dtr;
+            Lbar[1][1] += dtr;
+            Lbar[2][2] += dtr;
+
+            // Step 5: Apply material tangent — forward contraction
+            //   σ'(j, k) = sum_{l,m} C(j, k, l, m) * L̄(l, m)
+            double sigma[3][3] = {{ 0.0 } };
+            for (int j = 0; j < dim_; j++) {
+                for (int k = 0; k < dim_; k++) {
+                    for (int l = 0; l < dim_; l++) {
+                        for (int m = 0; m < dim_; m++) {
+                        sigma[j][k] += C(j, k, l, m, q, e) * Lbar[l][m];
+                        }
+                    }
+                }
+            }
+
+            // Step 6: Pressure (volumetric) part of σ'
+            const double p = (sigma[0][0] + sigma[1][1] + sigma[2][2]) / 3.0;
+
+            // Step 7: Accumulate forces with B-bar test side
+            //   Y(a, k) += [sum_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p] * w * detJ
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int k = 0; k < dim_; k++) {
+                    double f_std = 0.0;
+                    for (int j = 0; j < dim_; j++) {
+                        f_std += dNa[j] * sigma[j][k];
+                    }
+                    double f_bbar = (Nbar(a, k, e) - dNa[k]) * p;
+                    Y(a, k, e) += (f_std + f_bbar) * w_detJ;
+                }
+            }
+            } // End of qpts
+        }); // End of nelems
+    } // End of else (3D path)
+}
+
+
+// -----------------------------------------------------------------------------
+// ICExaNLFIntegrator::AddMultTransposeGradPA
+//
+// Native transposed B-bar tangent stiffness PA action: y += K̄^T * x.
+//
+// This is structurally IDENTICAL to AddMultGradPA except for one line: the
+// material tangent contraction uses C(l,m,j,k) instead of C(j,k,l,m). The
+// B-bar geometry (N̄, dN, trace correction, pressure redirection) is the
+// same on both sides of K̄ = B̄^T C B̄ because:
+//   (B̄^T C B̄)^T = B̄^T C^T B̄
+// — only the middle factor C transposes; the outer B̄^T and B̄ remain in
+// place.
+//
+// For symmetric C, this kernel produces results identical to AddMultGradPA
+// (a useful verification check). For non-symmetric C (crystal plasticity
+// with non-associated flow or non-symmetric Schmid coupling) it produces
+// genuinely different results, as required for correct trust-region
+// Cauchy point computation.
+// -----------------------------------------------------------------------------
+void ICExaNLFIntegrator::AddMultTransposeGradPA(const mfem::Vector &x,
+                                                mfem::Vector &y) const
+{
+    CALI_CXX_MARK_SCOPE("icenlfi_amTGPA");
+    if ((space_dims == 1) || (space_dims == 2)) {
+        MFEM_ABORT("Dimensions of 1 or 2 not supported.");
+    }
+    else {
+        const int dim = 3;
+        const int DIM3 = 3;
+        const int DIM4 = 4;
+        const int DIM6 = 6;
+
+        std::array<RAJA::idx_t, DIM3> perm3 {{ 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM4> perm4 {{ 3, 2, 1, 0 } };
+        std::array<RAJA::idx_t, DIM6> perm6 {{ 5, 4, 3, 2, 1, 0 } };
+
+        RAJA::Layout<DIM3> layout_field = RAJA::make_permuted_layout({{ nnodes, dim, nelems } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > X(x.Read(), layout_field);
+        RAJA::View<double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Y(y.ReadWrite(), layout_field);
+
+        RAJA::Layout<DIM3> layout_grads = RAJA::make_permuted_layout({{ nnodes, dim, nqpts } }, perm3);
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Gt(grad.Read(), layout_grads);
+
+        RAJA::View<const double, RAJA::Layout<DIM3, RAJA::Index_type, 0> > Nbar(elem_deriv_shapes.Read(),
+                                                                                layout_field);
+
+        RAJA::Layout<DIM4> layout_jac = RAJA::make_permuted_layout({{ dim, dim, nqpts, nelems } }, perm4);
+        RAJA::View<const double, RAJA::Layout<DIM4, RAJA::Index_type, 0> > J_data(geom->J.Read(), layout_jac);
+
+        auto tangent_qf = m_sim_state->GetQuadratureFunction("tangent_stiffness");
+        RAJA::Layout<DIM6> layout_C = RAJA::make_permuted_layout(
+            {{ dim, dim, dim, dim, nqpts, nelems } }, perm6);
+        RAJA::View<const double, RAJA::Layout<DIM6, RAJA::Index_type, 0> > C(tangent_qf->Read(), layout_C);
+
+        const mfem::IntegrationRule &ir =
+            tangent_qf->GetSpace()->GetIntRule(0);
+        auto W = ir.GetWeights().Read();
+
+        const int nqpts_ = nqpts;
+        const int dim_ = dim;
+        const int nnodes_ = nnodes;
+
+        mfem::forall(nelems, [=] MFEM_HOST_DEVICE(int e) {
+            // Step 1: Hoist tr_bar (element-constant)
+            double tr_bar = 0.0;
+            for (int a = 0; a < nnodes_; a++) {
+            for (int k = 0; k < dim_; k++) {
+                tr_bar += Nbar(a, k, e) * X(a, k, e);
+            }
+            }
+
+            for (int q = 0; q < nqpts_; q++) {
+            // Step 2: Adjugate and Jacobian determinant
+            const double J11 = J_data(0, 0, q, e), J12 = J_data(1, 0, q, e),
+                            J13 = J_data(2, 0, q, e);
+            const double J21 = J_data(0, 1, q, e), J22 = J_data(1, 1, q, e),
+                            J23 = J_data(2, 1, q, e);
+            const double J31 = J_data(0, 2, q, e), J32 = J_data(1, 2, q, e),
+                            J33 = J_data(2, 2, q, e);
+
+            double adj[9];
+            adj[0] = (J22 * J33) - (J23 * J32);
+            adj[1] = (J23 * J31) - (J21 * J33);
+            adj[2] = (J21 * J32) - (J22 * J31);
+            adj[3] = (J13 * J32) - (J12 * J33);
+            adj[4] = (J11 * J33) - (J13 * J31);
+            adj[5] = (J12 * J31) - (J11 * J32);
+            adj[6] = (J12 * J23) - (J13 * J22);
+            adj[7] = (J13 * J21) - (J11 * J23);
+            adj[8] = (J11 * J22) - (J12 * J21);
+
+            const double detJ = J11 * adj[0] + J21 * adj[3] + J31 * adj[6];
+            const double idetJ = 1.0 / detJ;
+            const double w_detJ = W[q] * detJ;
+
+            // Step 3: Physical velocity gradient
+            double L[3][3] = {{ 0.0 } };
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int i = 0; i < dim_; i++) {
+                    for (int j = 0; j < dim_; j++) {
+                        L[i][j] += dNa[j] * X(a, i, e);
+                    }
+                }
+            }
+
+            // Step 4: B-bar trace correction
+            const double tr_std = L[0][0] + L[1][1] + L[2][2];
+            const double dtr = (tr_bar - tr_std) / 3.0;
+
+            double Lbar[3][3];
+            for (int i = 0; i < dim_; i++) {
+                for (int j = 0; j < dim_; j++) {
+                    Lbar[i][j] = L[i][j];
+                }
+            }
+            Lbar[0][0] += dtr;
+            Lbar[1][1] += dtr;
+            Lbar[2][2] += dtr;
+
+            // Step 5: TRANSPOSED material tangent contraction
+            //   σ'(j, k) = sum_{l,m} C(l, m, j, k) * L̄(l, m)
+            // (Compare to forward: C(j, k, l, m) * L̄(l, m))
+            double sigma[3][3] = {{ 0.0 } };
+            for (int j = 0; j < dim_; j++) {
+                for (int k = 0; k < dim_; k++) {
+                    for (int l = 0; l < dim_; l++) {
+                        for (int m = 0; m < dim_; m++) {
+                        sigma[j][k] += C(l, m, j, k, q, e) * Lbar[l][m];
+                        }
+                    }
+                }
+            }
+
+            // Step 6: Pressure
+            const double p = (sigma[0][0] + sigma[1][1] + sigma[2][2]) / 3.0;
+
+            // Step 7: Accumulate with B-bar test side (same as forward kernel)
+            for (int a = 0; a < nnodes_; a++) {
+                double dNa[3];
+                for (int j = 0; j < dim_; j++) {
+                    dNa[j] = idetJ * (Gt(a, 0, q) * adj[j * 3 + 0] +
+                                    Gt(a, 1, q) * adj[j * 3 + 1] +
+                                    Gt(a, 2, q) * adj[j * 3 + 2]);
+                }
+                for (int k = 0; k < dim_; k++) {
+                    double f_std = 0.0;
+                    for (int j = 0; j < dim_; j++) {
+                        f_std += dNa[j] * sigma[j][k];
+                    }
+                    double f_bbar = (Nbar(a, k, e) - dNa[k]) * p;
+                    Y(a, k, e) += (f_std + f_bbar) * w_detJ;
+                }
+            }
+            } // End of qpts
+        }); // End of nelems
+    } // End of else (3D path)
+}
+
 // Here we're applying the following action operation using the assembled "D" 2nd order
 // tensor found above:
 // y_{ik} = \nabla_{ij}\phi^T_{\epsilon} D_{jk}
diff --git a/src/fem_operators/mechanics_integrators.hpp b/src/fem_operators/mechanics_integrators.hpp
index fb7d4f7..0a761ec 100644
--- a/src/fem_operators/mechanics_integrators.hpp
+++ b/src/fem_operators/mechanics_integrators.hpp
@@ -351,6 +351,35 @@ class ExaNLFIntegrator : public mfem::NonlinearFormIntegrator {
      */
     virtual void AddMultGradPA(const mfem::Vector& x, mfem::Vector& y) const override;
 
+    /**
+     * @brief Apply transposed gradient action via partial assembly.
+     *
+     * @param x Input vector for transposed Jacobian-vector product
+     * @param y Output vector for accumulated result
+     *
+     * Native PA kernel computing y += K^T * x where K = B^T D B is the
+     * tangent stiffness operator. The only computational difference from
+     * AddMultGradPA is the contraction order with the assembled 4th-order
+     * tensor D:
+     *
+     *   Forward (AddMultGradPA):
+     *     T(i,k) = D(i,k,l,n,qpt,elem) * Gx(l,n)   — contract last pair
+     *     Y(a,k) += Gt(a,i,qpt) * T(i,k)
+     *
+     *   Transpose (this method):
+     *     T(l,n) = D(i,k,l,n,qpt,elem) * Gx(i,k)   — contract first pair
+     *     Y(a,n) += Gt(a,l,qpt) * T(l,n)
+     *
+     * For symmetric material tangent C, the two operations are identical.
+     * For non-symmetric C (crystal plasticity), they differ. The transpose
+     * is required for trust-region dogleg solver Cauchy point computation
+     * where the merit function gradient is g = J^T * r, not J * r.
+     *
+     * @note GPU-compatible via mfem::forall
+     * @note Requires prior AssembleGradPA() call for the D tensor
+     */
+    virtual void AddMultTransposeGradPA(const mfem::Vector &x, mfem::Vector &y) const override;
+
     using mfem::NonlinearFormIntegrator::AssemblePA;
     /**
      * @brief Initialize partial assembly data structures for residual operations.
@@ -723,10 +752,82 @@ class ICExaNLFIntegrator : public ExaNLFIntegrator {
                                      const mfem::Vector& /*elfun*/,
                                      mfem::DenseMatrix& elmat) override;
 
-    // This method doesn't easily extend to PA formulation, so we're punting on
-    // it for now.
-    using ExaNLFIntegrator::AddMultGradPA;
-    using ExaNLFIntegrator::AssembleGradPA;
+    /**
+     * @brief Initialize partial assembly data structures for B-bar gradient operations.
+     *
+     * @param fes Finite element space providing mesh and element information
+     *
+     * Sets up the geometric data needed by AddMultGradPA() and
+     * AddMultTransposeGradPA() for the B-bar tangent stiffness operator.
+     *
+     * Unlike the base class AssembleGradPA() which pre-assembles a 4th-order
+     * tensor D, the B-bar version stores only the geometric data (Jacobians,
+     * reference shape function derivatives, and element-averaged derivatives)
+     * and applies the material tangent C on-the-fly inside the kernel. This
+     * is because the B-bar correction couples element-constant data (the
+     * volume-averaged derivatives) with quadrature-point-local data (C and
+     * adj(J)) in a way that doesn't fold cleanly into a single pre-assembled
+     * tensor.
+     *
+     * Setup steps:
+     *   1. Cache space_dims, nqpts, nnodes, nelems from the FES
+     *   2. Get geometric factors (Jacobians at quadrature points) from the mesh
+     *   3. Compute and cache reference shape function derivatives Gt(a, k, qpt)
+     *   4. Ensure element-averaged derivatives N̄(a, k, elem) are available
+     *      (calling AssemblePA() if not yet computed)
+     *
+     * @note Must be called before AddMultGradPA() or AddMultTransposeGradPA()
+     * @note Material tangent C is accessed directly from the simulation state
+     *       quadrature function during the AddMult kernels
+     */
+    virtual void AssembleGradPA(const mfem::FiniteElementSpace &fes) override;
+
+    /// State-ful overload that ignores the state vector @a x.
+    virtual void AssembleGradPA(const mfem::Vector &x, const mfem::FiniteElementSpace &fes) override;
+
+    /**
+     * @brief Apply partial-assembly B-bar tangent stiffness action.
+     *
+     * @param x Input E-vector (nodal velocities)
+     * @param y Output E-vector (accumulated)
+     *
+     * Computes y += K̄ * x where K̄ = ∫ B̄^T C B̄ dΩ is the B-bar tangent.
+     *
+     * Algorithm per element, per quadrature point:
+     *   1. Compute adj(J) and detJ from the cached Jacobian
+     *   2. Compute physical derivatives dN(a,j) on-the-fly from Gt and adj(J)
+     *   3. Compute physical velocity gradient L(i,j) = dN(a,j) V(a,i)
+     *   4. Compute B-bar trace correction Δtr = (tr_bar - tr(L)) / 3
+     *      where tr_bar = N̄(a,k) V(a,k) is element-constant (hoisted)
+     *   5. Modified velocity gradient L̄ = L + δ_ij * Δtr
+     *   6. Apply C: σ'(j,k) = C(j,k,l,m) * L̄(l,m)
+     *   7. Pressure correction p' = (1/3) tr(σ')
+     *   8. Accumulate into Y: standard force + B-bar pressure redirection
+     *      Y(a,k) += [Σ_j dN(a,j) σ'(j,k) + (N̄(a,k) - dN(a,k)) p'] * w * detJ
+     *
+     * @note GPU-compatible via mfem::forall
+     * @note Requires prior AssembleGradPA() call
+     */
+    virtual void AddMultGradPA(const mfem::Vector &x, mfem::Vector &y) const override;
+
+    /**
+     * @brief Apply transposed B-bar tangent stiffness action.
+     *
+     * @param x Input E-vector
+     * @param y Output E-vector (accumulated)
+     *
+     * Computes y += K̄^T * x. Identical to AddMultGradPA except the C
+     * contraction order is swapped:
+     *   Forward:   σ'(j,k) = C(j,k,l,m) * L̄(l,m)
+     *   Transpose: σ'(j,k) = C(l,m,j,k) * L̄(l,m)
+     *
+     * The B-bar geometry (N̄, dN, trace correction, pressure redirection)
+     * is identical for both directions because B̄ appears on both the
+     * trial and test sides of K̄ = B̄^T C B̄, and (B̄^T C B̄)^T = B̄^T C^T B̄.
+     *
+     * @note For symmetric C, this produces identical results to AddMultGradPA
+     */
+    virtual void AddMultTransposeGradPA(const mfem::Vector &x, mfem::Vector &y) const override;
 
     /**
      * @brief Initialize partial assembly data structures for B-bar residual operations.
diff --git a/src/options/option_enum.cpp b/src/options/option_enum.cpp
index 6ae4b99..8749ad7 100644
--- a/src/options/option_enum.cpp
+++ b/src/options/option_enum.cpp
@@ -106,12 +106,15 @@ LinearSolverType string_to_linear_solver_type(const std::string& str) {
 
 /**
  * @brief Convert string to NonlinearSolverType enum
- * @param str String representation of nonlinear solver type ("NR", "NRLS")
+ * @param str String representation of nonlinear solver type ("NR", "NRLS", "TRDOG")
  * @return Corresponding NonlinearSolverType enum value
  */
 NonlinearSolverType string_to_nonlinear_solver_type(const std::string& str) {
     static const std::map<std::string, NonlinearSolverType> mapping = {
-        {"NR", NonlinearSolverType::NR}, {"NRLS", NonlinearSolverType::NRLS}};
+        {"NR",    NonlinearSolverType::NR},
+        {"NRLS",  NonlinearSolverType::NRLS},
+        {"TRDOG", NonlinearSolverType::TRDOG}
+    };
 
     return string_to_enum(str, mapping, NonlinearSolverType::NOTYPE, "nonlinear solver");
 }
diff --git a/src/options/option_parser_v2.cpp b/src/options/option_parser_v2.cpp
index efac46e..cb29e98 100644
--- a/src/options/option_parser_v2.cpp
+++ b/src/options/option_parser_v2.cpp
@@ -790,6 +790,9 @@ void ExaOptions::print_solver_options() const {
     case NonlinearSolverType::NRLS:
         std::cout << "Newton-Raphson with line search\n";
         break;
+    case NonlinearSolverType::TRDOG:
+        std::cout << "Trust-region dogleg (SNLS port)\n";
+        break;
     default:
         std::cout << "Unknown\n";
         break;
@@ -798,6 +801,43 @@ void ExaOptions::print_solver_options() const {
     std::cout << "    Maximum iterations: " << solvers.nonlinear_solver.iter << "\n";
     std::cout << "    Relative tolerance: " << solvers.nonlinear_solver.rel_tol << "\n";
     std::cout << "    Absolute tolerance: " << solvers.nonlinear_solver.abs_tol << "\n";
+
+    // Trust-region parameters: print if either the solver is TRDOG or the user
+    // supplied a [trust_region] sub-table. The latter case is informational —
+    // it lets the user spot misconfigurations where they set TR options without
+    // selecting the TRDOG solver.
+    const bool is_trdog = (solvers.nonlinear_solver.nl_solver == NonlinearSolverType::TRDOG);
+    const bool tr_supplied = solvers.nonlinear_solver.trust_region.has_value();
+
+    if (is_trdog || tr_supplied) {
+        std::cout << "\n    Trust-region parameters";
+        if (is_trdog && !tr_supplied) {
+            std::cout << " (using defaults)";
+        }
+        else if (!is_trdog && tr_supplied) {
+            std::cout << " (WARNING: supplied but solver is not TRDOG)";
+        }
+        std::cout << ":\n";
+
+        // Use the supplied options if present, otherwise default-construct
+        // a TrustRegionOptions to print the defaults
+        const TrustRegionOptions tr_opts = tr_supplied
+            ? solvers.nonlinear_solver.trust_region.value()
+            : TrustRegionOptions{};
+
+        std::cout << "      delta_init      = " << tr_opts.delta_init      << "\n";
+        std::cout << "      delta_min       = " << tr_opts.delta_min       << "\n";
+        std::cout << "      delta_max       = " << tr_opts.delta_max       << "\n";
+        std::cout << "      xi_lg           = " << tr_opts.xi_lg           << "\n";
+        std::cout << "      xi_ug           = " << tr_opts.xi_ug           << "\n";
+        std::cout << "      xi_lo           = " << tr_opts.xi_lo           << "\n";
+        std::cout << "      xi_uo           = " << tr_opts.xi_uo           << "\n";
+        std::cout << "      xi_inc          = " << tr_opts.xi_inc          << "\n";
+        std::cout << "      xi_dec          = " << tr_opts.xi_dec          << "\n";
+        std::cout << "      xi_forced_inc   = " << tr_opts.xi_forced_inc   << "\n";
+        std::cout << "      reject_increase = "
+                  << (tr_opts.reject_increase ? "true" : "false") << "\n";
+    }
 }
 
 void ExaOptions::print_material_options() const {
diff --git a/src/options/option_parser_v2.hpp b/src/options/option_parser_v2.hpp
index d38fac7..9f77d13 100644
--- a/src/options/option_parser_v2.hpp
+++ b/src/options/option_parser_v2.hpp
@@ -97,9 +97,10 @@ enum class LinearSolverType {
  * @brief Enumeration for nonlinear solver types
  */
 enum class NonlinearSolverType {
-    NR,    /**< Newton-Raphson method */
-    NRLS,  /**< Newton-Raphson with line search */
-    NOTYPE /**< Uninitialized or invalid nonlinear solver type */
+    NR,     /**< Newton-Raphson method */
+    NRLS,   /**< Newton-Raphson with line search */
+    TRDOG,  /**< Trust-region dogleg method (ported from SNLS) */
+    NOTYPE  /**< Uninitialized or invalid nonlinear solver type */
 };
 
 /**
@@ -623,6 +624,103 @@ struct LinearSolverOptions {
     static LinearSolverOptions from_toml(const toml::value& toml_input);
 };
 
+/**
+ * @brief Trust-region dogleg solver configuration
+ *
+ * @details Controls the trust-region radius management and dogleg step
+ * computation for the ExaTrustRegionSolver. Parameters are ported from
+ * SNLS's TrDeltaControl with sane defaults suitable for solid mechanics
+ * applications. Power users can tune these for difficult crystal plasticity
+ * problems.
+ *
+ * The trust-region radius delta is updated based on the ratio
+ *     rho = actual_residual_change / predicted_residual_change
+ * where predicted change comes from the linearized model at the current iterate.
+ *
+ * Acceptance/rejection bands:
+ *   - "Good" band [xi_lg, xi_ug]: increase delta when rho falls here
+ *   - "OK"  band [xi_lo, xi_uo]: keep delta when rho falls here (outside good)
+ *   - Outside [xi_lo, xi_uo]: decrease delta
+ *
+ * TOML configuration example:
+ * @code
+ * [Solvers.NR.trust_region]
+ *     delta_init      = 1.0
+ *     delta_min       = 1e-12
+ *     delta_max       = 1e4
+ *     xi_lg           = 0.75
+ *     xi_ug           = 1.4
+ *     xi_lo           = 0.35
+ *     xi_uo           = 5.0
+ *     xi_inc          = 1.5
+ *     xi_dec          = 0.25
+ *     xi_forced_inc   = 1.2
+ *     reject_increase = true
+ * @endcode
+ */
+struct TrustRegionOptions {
+    /**
+     * @brief Initial trust-region radius
+     */
+    double delta_init = 1.0;
+
+    /**
+     * @brief Minimum allowed trust-region radius. Solver fails if delta drops below this.
+     */
+    double delta_min = 1e-12;
+
+    /**
+     * @brief Maximum allowed trust-region radius
+     */
+    double delta_max = 1e4;
+
+    /**
+     * @brief Lower bound of the "good" rho band (increase delta when rho > xi_lg)
+     */
+    double xi_lg = 0.75;
+
+    /**
+     * @brief Upper bound of the "good" rho band
+     */
+    double xi_ug = 1.4;
+
+    /**
+     * @brief Lower bound of the "ok" rho band (decrease delta when rho < xi_lo)
+     */
+    double xi_lo = 0.35;
+
+    /**
+     * @brief Upper bound of the "ok" rho band (decrease delta when rho > xi_uo)
+     */
+    double xi_uo = 5.0;
+
+    /**
+     * @brief Factor used to increase delta when a step is accepted in the "good" band
+     */
+    double xi_inc = 1.5;
+
+    /**
+     * @brief Factor used to decrease delta when a step quality is outside the "ok" band
+     */
+    double xi_dec = 0.25;
+
+    /**
+     * @brief Forced-increase factor when the predicted residual change is exactly zero
+     */
+    double xi_forced_inc = 1.2;
+
+    /**
+     * @brief Whether to reject steps that increase the residual norm
+     */
+    bool reject_increase = true;
+
+    // Validation
+    bool validate() const;
+
+    // Conversion from toml
+    static TrustRegionOptions from_toml(const toml::value& toml_input);
+};
+
 /**
  * @brief Nonlinear solver configuration
  */
@@ -647,6 +745,14 @@ struct NonlinearSolverOptions {
      */
     NonlinearSolverType nl_solver = NonlinearSolverType::NR;
 
+    /**
+     * @brief Trust-region configuration (only used when nl_solver == TRDOG).
+     *
+     * If left empty, default TrustRegionOptions values are used. Users with
+     * difficult convergence problems should provide custom values.
+     */
+    std::optional<TrustRegionOptions> trust_region;
+
     // Validation
     bool validate() const;
 
diff --git a/src/options/option_solvers.cpp b/src/options/option_solvers.cpp
index b5f8af7..817b64c 100644
--- a/src/options/option_solvers.cpp
+++ b/src/options/option_solvers.cpp
@@ -39,6 +39,63 @@ LinearSolverOptions LinearSolverOptions::from_toml(const toml::value& toml_input
     return options;
 }
 
+/**
+ * @brief Parse trust-region options from a TOML sub-table.
+ *
+ * Each field is optional — if not present in the TOML, the struct's default
+ * value is preserved. This lets users override only the parameters they need
+ * to tune.
+ */
+TrustRegionOptions TrustRegionOptions::from_toml(const toml::value& toml_input) {
+    TrustRegionOptions options;
+
+    if (toml_input.contains("delta_init")) {
+        options.delta_init = toml::find<double>(toml_input, "delta_init");
+    }
+
+    if (toml_input.contains("delta_min")) {
+        options.delta_min = toml::find<double>(toml_input, "delta_min");
+    }
+
+    if (toml_input.contains("delta_max")) {
+        options.delta_max = toml::find<double>(toml_input, "delta_max");
+    }
+
+    if (toml_input.contains("xi_lg")) {
+        options.xi_lg = toml::find<double>(toml_input, "xi_lg");
+    }
+
+    if (toml_input.contains("xi_ug")) {
+        options.xi_ug = toml::find<double>(toml_input, "xi_ug");
+    }
+
+    if (toml_input.contains("xi_lo")) {
+        options.xi_lo = toml::find<double>(toml_input, "xi_lo");
+    }
+
+    if (toml_input.contains("xi_uo")) {
+        options.xi_uo = toml::find<double>(toml_input, "xi_uo");
+    }
+
+    if (toml_input.contains("xi_inc")) {
+        options.xi_inc = toml::find<double>(toml_input, "xi_inc");
+    }
+
+    if (toml_input.contains("xi_dec")) {
+        options.xi_dec = toml::find<double>(toml_input, "xi_dec");
+    }
+
+    if (toml_input.contains("xi_forced_inc")) {
+        options.xi_forced_inc = toml::find<double>(toml_input, "xi_forced_inc");
+    }
+
+    if (toml_input.contains("reject_increase")) {
+        options.reject_increase = toml::find<bool>(toml_input, "reject_increase");
+    }
+
+    return options;
+}
+
 NonlinearSolverOptions NonlinearSolverOptions::from_toml(const toml::value& toml_input) {
     NonlinearSolverOptions options;
 
@@ -59,6 +116,14 @@ NonlinearSolverOptions NonlinearSolverOptions::from_toml(const toml::value& toml
             toml::find<std::string>(toml_input, "nl_solver"));
     }
 
+    // Parse the optional trust-region sub-table when using the dogleg solver.
+    // We always parse the table if present (regardless of nl_solver) so that
+    // options validation can flag inconsistent configurations later.
+    if (toml_input.contains("trust_region")) {
+        options.trust_region = TrustRegionOptions::from_toml(
+            toml::find(toml_input, "trust_region"));
+    }
+
     return options;
 }
 
@@ -123,6 +188,75 @@ bool LinearSolverOptions::validate() const {
     return true;
 }
 
+/**
+ * @brief Validate trust-region option ranges and consistency.
+ *
+ * Step-by-step verification:
+ *   1. Trust-region radius bounds: delta_min must be positive and delta_max
+ *      must exceed delta_min
+ *   2. Initial radius must lie within [delta_min, delta_max]
+ *   3. The "good" rho band [xi_lg, xi_ug] must lie inside the "ok" band
+ *      [xi_lo, xi_uo] — otherwise the radius update logic is inconsistent
+ *   4. Increase factors must be > 1 and decrease factor must be in (0, 1)
+ *
+ * Each failure is reported with WARNING_0_OPT pointing to the offending field.
+ */
+bool TrustRegionOptions::validate() const {
+    if (delta_min <= 0.0) {
+        WARNING_0_OPT("Error: TrustRegion table provided a non-positive delta_min");
+        return false;
+    }
+
+    if (delta_max <= delta_min) {
+        WARNING_0_OPT("Error: TrustRegion table provided delta_max <= delta_min");
+        return false;
+    }
+
+    if (delta_init < delta_min || delta_init > delta_max) {
+        WARNING_0_OPT("Error: TrustRegion table provided delta_init outside [delta_min, delta_max]");
+        return false;
+    }
+
+    if (xi_lg <= xi_lo) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_lg > xi_lo "
+                      "(good band must lie inside ok band)");
+        return false;
+    }
+
+    if (xi_ug >= xi_uo) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_ug < xi_uo "
+                      "(good band must lie inside ok band)");
+        return false;
+    }
+
+    if (xi_lg >= xi_ug) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_lg < xi_ug");
+        return false;
+    }
+
+    if (xi_lo >= xi_uo) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_lo < xi_uo");
+        return false;
+    }
+
+    if (xi_inc <= 1.0) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_inc > 1.0");
+        return false;
+    }
+
+    if (xi_dec <= 0.0 || xi_dec >= 1.0) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_dec in (0, 1)");
+        return false;
+    }
+
+    if (xi_forced_inc <= 1.0) {
+        WARNING_0_OPT("Error: TrustRegion table requires xi_forced_inc > 1.0");
+        return false;
+    }
+
+    return true;
+}
+
 bool NonlinearSolverOptions::validate() const {
     if (iter < 1) {
         WARNING_0_OPT("Error: NonLinearSolver table did not provide a positive iteration count");
@@ -139,13 +273,23 @@ bool NonlinearSolverOptions::validate() const {
         return false;
     }
 
-    if (nl_solver != NonlinearSolverType::NR && nl_solver != NonlinearSolverType::NRLS) {
-        WARNING_0_OPT("Error: NonLinearSolver table did not provide a valid nl_solver option (`NR` "
-                      "or `NRLS`)");
+    if (nl_solver != NonlinearSolverType::NR &&
+        nl_solver != NonlinearSolverType::NRLS &&
+        nl_solver != NonlinearSolverType::TRDOG) {
+        WARNING_0_OPT("Error: NonLinearSolver table did not provide a valid nl_solver option "
+                      "(`NR`, `NRLS`, or `TRDOG`)");
         return false;
     }
 
-    // Implement validation logic
+    // If trust-region parameters were supplied, verify they are self-consistent.
+    // We allow a TRDOG solver without a [trust_region] sub-table — the defaults
+    // are applied in that case.
+    if (trust_region.has_value()) {
+        if (!trust_region->validate()) {
+            return false;
+        }
+    }
+
     return true;
 }
 
diff --git a/src/solvers/trust_region_solver.cpp b/src/solvers/trust_region_solver.cpp
new file mode 100644
index 0000000..77fd9fe
--- /dev/null
+++ b/src/solvers/trust_region_solver.cpp
@@ -0,0 +1,350 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and
+// other ExaConstit Project Developers. See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: MIT
+
+#include "solvers/trust_region_solver.hpp"
+
+#include "utilities/mechanics_log.hpp"
+#include "utilities/unified_logger.hpp"
+
+#include "mfem.hpp"
+#include "mfem/general/globals.hpp"
+#include "mfem/linalg/linalg.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+
+/**
+ * @brief Compute the Powell dogleg step inside the trust region.
+ *
+ * @details Step-by-step algorithm:
+ *
+ * 1. **Full Newton step inside trust region**:
+ *    If ||s_N|| <= delta, take the full Newton step. The predicted residual
+ *    is zero (the linear model F + J*s_N = 0 is exactly satisfied).
+ *
+ * 2. **Cauchy point outside trust region**:
+ *    Compute the Cauchy point parameters:
+ *       - alpha = ||g||^2 / ||J*g||^2   (optimal scaling along steepest descent)
+ *       - ||s_sd_opt|| = alpha * ||g||  (norm of the optimal Cauchy step)
+ *    If ||s_sd_opt|| >= delta, the optimal Cauchy point is outside the trust
+ *    region. Step along the steepest descent direction to the boundary:
+ *       delx = -delta * g / ||g||
+ *    The predicted residual norm is computed from the linear model evaluated
+ *    at this truncated Cauchy step.
+ *
+ * 3. **Dogleg interpolation (second leg)**:
+ *    Otherwise, interpolate along the line segment from the Cauchy point to
+ *    the Newton point, finding the parameter beta in [0, 1] such that the
+ *    interpolated step lies on the trust-region boundary. The intersection
+ *    is found by solving a quadratic:
+ *       delx(beta) = beta * s_N - (1 - beta) * alpha * g
+ *       ||delx(beta)||^2 = delta^2
+ *    yielding qa*beta^2 - 2*qb*beta + qc = 0 where:
+ *       qa = ||p||^2,   qb = alpha * (p . g),   qc = ||s_sd_opt||^2 - delta^2
+ *       and p = s_N + alpha * g.
+ *    Beta is taken from the larger root and clamped to [0, 1] for safety.
+ */
+void ExaTrustRegionSolver::Dogleg(double delta, double res_0, double nr_norm,
+                                  double Jg_2, const mfem::Vector &grad,
+                                  const mfem::Vector &nrStep, mfem::Vector &delx,
+                                  double &pred_resid, bool &use_nr) const
+{
+   use_nr = false;
+
+   // --- Case 1: Full Newton step fits inside the trust region ---
+   if (nr_norm <= delta) {
+      use_nr = true;
+      delx = nrStep;
+      pred_resid = 0.0;
+
+      if (print_level > 0) {
+         mfem::out << "TR dogleg: taking full Newton step (||s_N|| = "
+                   << nr_norm << " <= delta = " << delta << ")\n";
+      }
+      return;
+   }
+
+   // Cauchy point parameters using MPI-aware dot products
+   const double norm2_grad = Dot(grad, grad);
+   const double norm_grad = std::sqrt(norm2_grad);
+
+   const double alpha = (Jg_2 > 0.0) ? (norm2_grad / Jg_2) : 1.0;
+   const double norm_grad_inv = (norm_grad > 0.0) ? (1.0 / norm_grad) : 1.0;
+   const double norm_s_sd_opt = alpha * norm_grad;
+
+   // --- Case 2: Cauchy point is outside the trust region ---
+   // Take a step along the steepest descent direction to the trust-region boundary
+   if (norm_s_sd_opt >= delta) {
+      // delx = -delta * (grad / ||grad||)
+      const double factor = -delta * norm_grad_inv;
+      delx = grad;
+      delx *= factor;
+
+      // Predicted residual from linear model at the truncated Cauchy step
+      const double val = -(delta * norm_grad) +
+                         0.5 * delta * delta * Jg_2 *
+                         (norm_grad_inv * norm_grad_inv);
+      pred_resid = std::sqrt(std::max(2.0 * val + res_0 * res_0, 0.0));
+
+      if (print_level > 0) {
+         mfem::out << "TR dogleg: stepping along first leg (steepest descent)\n";
+      }
+   }
+   // --- Case 3: Cauchy inside, Newton outside; interpolate along the second leg ---
+   else {
+      // Reuse delx as workspace for p = nrStep + alpha * grad
+      mfem::Vector &p = delx;
+      add(nrStep, alpha, grad, p);
+
+      // Quadratic coefficients for the trust-region boundary intersection
+      double qa = Dot(p, p);
+      double qb = Dot(p, grad) * alpha;
+      double qc = norm_s_sd_opt * norm_s_sd_opt - delta * delta;
+
+      double discriminant = qb * qb - qa * qc;
+      double beta = (qa > 0.0)
+         ? (qb + std::sqrt(std::max(discriminant, 0.0))) / qa
+         : 0.0;
+
+      // Clamp beta to [0, 1] to handle any roundoff at the boundary
+      beta = std::max(0.0, std::min(1.0, beta));
+
+      // delx = beta * nrStep - (1 - beta) * alpha * grad
+      const double omb = 1.0 - beta;
+      const double omba = omb * alpha;
+      add(beta, nrStep, -omba, grad, delx);
+
+      // Predicted residual from linear model at the dogleg step
+      const double res_cauchy = (Jg_2 > 0.0)
+         ? std::sqrt(std::max(res_0 * res_0 - alpha * norm2_grad, 0.0))
+         : res_0;
+      pred_resid = omb * res_cauchy;
+
+      if (print_level > 0) {
+         mfem::out << "TR dogleg: stepping along second leg (beta = "
+                   << beta << ")\n";
+      }
+   }
+}
+
+/**
+ * @brief Trust-region dogleg Newton iteration implementation.
+ *
+ * @details Step-by-step algorithm for solving F(x) = b:
+ *
+ * **Initial setup**:
+ *   1. Validate that operator (oper_mech), preconditioner (prec_mech), and
+ *      delta_ctrl are properly configured
+ *   2. Allocate all device-aware working vectors (nrStep, grad, delx, Jg_temp,
+ *      x_prev) once before the iteration loop
+ *   3. Evaluate initial residual r = F(x) - b and compute its norm
+ *   4. Set the convergence threshold norm_max = max(rel_tol * res, abs_tol)
+ *   5. Initialize trust-region radius delta from delta_ctrl.deltaInit
+ *
+ * **Main iteration loop** (until convergence or max_iter):
+ *   1. If the previous step was *not* rejected, recompute Newton machinery:
+ *      a. Get Jacobian J = oper_mech->GetGradient(x). The material state is
+ *         consistent with x because Mult(x, r) was just evaluated.
+ *      b. Compute steepest descent: grad = J^T * r (gradient of f = 0.5 ||F||^2)
+ *      c. Compute Jg_2 = ||J * grad||^2 for the optimal Cauchy step length
+ *      d. Solve the Newton system J*c = r via the Krylov solver (prec_mech),
+ *         then negate: nrStep = -c. The negation matches SNLS convention where
+ *         the Newton update is x += nrStep (whereas ExaNewtonSolver uses x -= c).
+ *      e. Compute nr_norm = ||nrStep||
+ *      If the previous step *was* rejected, all of this data is still valid
+ *      from the last accepted iteration and we just recompute the dogleg with
+ *      the smaller delta.
+ *   2. Save x_prev = x for potential rollback on rejection
+ *   3. Compute the dogleg step delx via Dogleg() helper
+ *   4. Apply the trial step: x = x_prev + delx
+ *   5. Evaluate residual at the trial point: r = F(x) - b
+ *   6. Check convergence: if ||r|| <= norm_max, accept and exit
+ *   7. Update delta via delta_ctrl.UpdateDelta() based on actual vs predicted
+ *      reduction. This may also flag the step for rejection.
+ *   8. If rejected: restore x = x_prev, restore residual norm, set reject_prev.
+ *      The material state inside the model handles itself analogously to the
+ *      ExaNewtonLSSolver line-search behavior — when Mult() is called again at
+ *      the next trial point, the model recomputes from the beginning-step state.
+ *
+ * **Performance Profiling**:
+ *   - "TR_dogleg_solver" scope for overall trust-region solver performance
+ *   - "TR_newton_setup" scope for J^T*r and J*g computations
+ *   - "TR_gradient_transpose" scope for the J^T*r call specifically
+ *   - "TR_newton_solve" scope for the Krylov inner solve
+ *   - "TR_trial_eval" scope for residual evaluations at trial points
+ *   - "krylov_solver" scope for the actual Krylov solver call
+ *
+ * @note All scalar quantities (norms, dot products) use MFEM's MPI-aware
+ *       Norm() and Dot() functions through the IterativeSolver base class
+ */
+void ExaTrustRegionSolver::Mult(const mfem::Vector &b, mfem::Vector &x) const
+{
+   CALI_CXX_MARK_SCOPE("TR_dogleg_solver");
+   MFEM_ASSERT_0(oper_mech, "the Operator is not set (use SetOperator).");
+   MFEM_ASSERT_0(prec_mech, "the Solver is not set (use SetSolver).");
+   MFEM_ASSERT(delta_ctrl.Validate(), "TrDeltaControl parameters are invalid.");
+
+   const bool have_b = (b.Size() == Height());
+
+   // --- Allocate working vectors once, reused across iterations ---
+   mfem::Vector nrStep(width, mfem::Device::GetMemoryType());
+   mfem::Vector grad(width, mfem::Device::GetMemoryType());
+   mfem::Vector delx(width, mfem::Device::GetMemoryType());
+   mfem::Vector Jg_temp(width, mfem::Device::GetMemoryType());
+   mfem::Vector x_prev(width, mfem::Device::GetMemoryType());
+
+   nrStep.UseDevice(true);
+   grad.UseDevice(true);
+   delx.UseDevice(true);
+   Jg_temp.UseDevice(true);
+   x_prev.UseDevice(true);
+
+   // --- Initial residual evaluation: r = F(x) - b ---
+   oper_mech->Mult(x, r);
+   if (have_b) { r -= b; }
+
+   double res = Norm(r);
+   double res_0 = res;
+   const double norm_max = std::max(rel_tol * res, abs_tol);
+
+   if (print_level >= 0) {
+      mfem::out << "TR dogleg: initial ||r|| = " << res << "\n";
+   }
+
+   if (res <= norm_max) {
+      converged = true;
+      final_iter = 0;
+      final_norm = res;
+      return;
+   }
+
+   // --- Initialize trust-region state ---
+   double delta = delta_ctrl.deltaInit;
+   double rho = 0.0;
+   bool reject_prev = false;
+
+   // Persisted across iterations when a step is not rejected
+   double Jg_2 = 0.0;
+   double nr_norm = 0.0;
+
+   int it = 0;
+   converged = false;
+
+   // --- Main iteration loop ---
+   while (it < max_iter) {
+      it++;
+
+      // If the previous step was not rejected, recompute Newton direction
+      // and steepest descent direction at the current x. The Jacobian data
+      // is current because oper_mech->Mult(x, r) was just called.
+      if (!reject_prev) {
+         CALI_CXX_MARK_SCOPE("TR_newton_setup");
+
+         mfem::Operator &J = oper_mech->GetGradient(x);
+
+         // Steepest descent direction: grad = J^T * r
+         // This is the gradient of the merit function f(x) = 0.5 * ||F(x)||^2
+         {
+            CALI_CXX_MARK_SCOPE("TR_gradient_transpose");
+            J.MultTranspose(r, grad);
+         }
+
+         // Compute ||J * grad||^2 for the optimal Cauchy step length
+         //    alpha_cauchy = ||grad||^2 / ||J*grad||^2
+         {
+            J.Mult(grad, Jg_temp);
+            Jg_2 = Dot(Jg_temp, Jg_temp);
+         }
+
+         // Solve Newton system: J * c = r, then nrStep = -c
+         // CGSolver follows the same convention as ExaNewtonSolver where the
+         // Krylov solve produces c such that the Newton update would be x -= c.
+         // For the dogleg we need nrStep = -J^{-1}*r, so we negate after the solve.
+         {
+            CALI_CXX_MARK_SCOPE("TR_newton_solve");
+            c = 0.0;
+            this->CGSolver(J, r, c);
+            nrStep = c;
+            nrStep.Neg();
+         }
+
+         nr_norm = Norm(nrStep);
+      }
+
+      // Save state for potential step rejection
+      x_prev = x;
+
+      // Compute the dogleg step
+      double pred_resid = 0.0;
+      bool use_nr = false;
+      Dogleg(delta, res_0, nr_norm, Jg_2, grad, nrStep,
+             delx, pred_resid, use_nr);
+
+      // Apply the trial step: x = x_prev + delx
+      x = x_prev;
+      x += delx;
+
+      // Evaluate residual at the trial point
+      reject_prev = false;
+      {
+         CALI_CXX_MARK_SCOPE("TR_trial_eval");
+         oper_mech->Mult(x, r);
+         if (have_b) { r -= b; }
+      }
+
+      res = Norm(r);
+
+      if (print_level >= 0) {
+         mfem::out << "TR dogleg: iter " << it
+                   << ", ||r|| = " << res
+                   << ", delta = " << delta
+                   << (use_nr ? " [NR]" : " [DL]")
+                   << "\n";
+      }
+
+      // Check convergence
+      if (res <= norm_max) {
+         converged = true;
+         break;
+      }
+
+      // Update delta from actual vs predicted reduction. May flag for rejection.
+      bool delta_ok = delta_ctrl.UpdateDelta(
+         delta, res, res_0, pred_resid, reject_prev,
+         use_nr, nr_norm, rho, print_level);
+
+      if (!delta_ok) {
+         if (print_level >= 0) {
+            mfem::out << "TR dogleg: delta control failure at iter " << it << "\n";
+         }
+         converged = false;
+         break;
+      }
+
+      // If the step is rejected, revert x and residual.
+      // On the next iteration, reject_prev == true so we skip the Newton solve
+      // and recompute the dogleg with the updated (smaller) delta. The Jacobian,
+      // grad, nrStep, and Jg_2 are still valid from the last accepted state.
+      if (reject_prev) {
+         if (print_level > 0) {
+            mfem::out << "TR dogleg: rejecting step, reverting to previous state\n";
+         }
+         x = x_prev;
+         res = res_0;
+      }
+
+      res_0 = res;
+   }
+
+   final_iter = it;
+   final_norm = res;
+
+   if (!converged && print_level >= 0) {
+      mfem::out << "TR dogleg: failed to converge in " << it
+                << " iterations, final ||r|| = " << res << "\n";
+   }
+}
\ No newline at end of file
diff --git a/src/solvers/trust_region_solver.hpp b/src/solvers/trust_region_solver.hpp
new file mode 100644
index 0000000..43950d3
--- /dev/null
+++ b/src/solvers/trust_region_solver.hpp
@@ -0,0 +1,357 @@
+// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and
+// other ExaConstit Project Developers. See the top-level LICENSE file for details.
+//
+// SPDX-License-Identifier: MIT
+#pragma once
+
+#include "solvers/mechanics_solver.hpp"
+
+#include "mfem.hpp"
+#include "mfem/linalg/solvers.hpp"
+
+#include <cmath>
+#include <algorithm>
+#include <memory>
+
+/**
+ * @brief Trust-region radius control parameters for the dogleg solver.
+ *
+ * @details Ported from SNLS's TrDeltaControl. Controls how the trust-region
+ * radius delta is updated based on the ratio rho = actual_reduction / predicted_reduction.
+ *
+ * The update logic:
+ * - If rho is in the "good" band [xiLG, xiUG] and the step reduced the residual,
+ *   increase delta (unless the full Newton step was taken)
+ * - If rho is outside the "ok" band [xiLO, xiUO], decrease delta
+ * - If the predicted change is zero and delta is not at max, force a small increase
+ * - If the residual actually increased, reject the step
+ *
+ * @ingroup ExaConstit_solvers
+ */
+struct TrDeltaControl
+{
+   /// @brief Lower bound of the "good" rho interval (increase delta when rho > xiLG)
+   double xiLG = 0.75;
+   /// @brief Upper bound of the "good" rho interval
+   double xiUG = 1.4;
+   /// @brief Factor by which to increase delta
+   double xiIncDelta = 1.5;
+   /// @brief Lower bound of the "ok" rho interval (decrease delta when rho < xiLO)
+   double xiLO = 0.35;
+   /// @brief Upper bound of the "ok" rho interval (decrease delta when rho > xiUO)
+   double xiUO = 5.0;
+   /// @brief Factor by which to decrease delta
+   double xiDecDelta = 0.25;
+   /// @brief Forced increase factor when predicted change is zero
+   double xiForcedIncDelta = 1.2;
+   /// @brief Initial trust-region radius
+   double deltaInit = 1.0;
+   /// @brief Minimum allowed trust-region radius (solver fails if hit)
+   double deltaMin = 1e-12;
+   /// @brief Maximum allowed trust-region radius
+   double deltaMax = 1e4;
+   /// @brief Whether to reject steps that increase the residual
+   bool rejectResIncrease = true;
+
+   /**
+    * @brief Validate that the control parameters are self-consistent.
+    *
+    * @return true if all parameter relationships are valid, false otherwise
+    *
+    * Verifies the following invariants:
+    * - deltaMin > 0 and deltaMax > deltaMin
+    * - The "good" rho band [xiLG, xiUG] sits inside the "ok" band [xiLO, xiUO]
+    * - The increase factor (xiIncDelta) is greater than 1
+    * - The decrease factor (xiDecDelta) is in (0, 1)
+    * - The forced-increase factor is greater than 1
+    */
+   bool Validate() const
+   {
+      return (deltaMin > 0.0) &&
+             (deltaMax > deltaMin) &&
+             (xiLG > xiLO) &&
+             (xiUG < xiUO) &&
+             (xiIncDelta > 1.0) &&
+             (xiDecDelta > 0.0 && xiDecDelta < 1.0) &&
+             (xiForcedIncDelta > 1.0);
+   }
+
+   /**
+    * @brief Decrease the trust-region radius after a rejected/poor step.
+    *
+    * @param[in,out] delta Current radius, modified on output
+    * @param[in] norm_full Norm of the full Newton step
+    * @param[in] took_full Whether the full Newton step was used at the last iteration
+    * @param[in] print_level Verbosity level for output
+    * @return true if delta is still above deltaMin, false if solver should fail
+    *
+    * @details If the full Newton step was taken, uses a geometric mean blend of
+    * the current delta and the Newton step norm scaled by xiDecDelta. Otherwise
+    * just multiplies delta by xiDecDelta. Returns false (and sets delta to deltaMin)
+    * if the resulting delta drops below the minimum allowed value.
+    */
+   bool DecrDelta(double &delta, double norm_full, bool took_full,
+                  int print_level = 0) const
+   {
+      if (took_full) {
+         double tempa = delta * xiDecDelta;
+         double tempb = norm_full * xiDecDelta;
+         delta = std::sqrt(tempa * tempb);
+      }
+      else {
+         delta *= xiDecDelta;
+      }
+
+      if (delta < deltaMin) {
+         delta = deltaMin;
+         if (print_level >= 0) {
+            mfem::out << "TR: delta at minimum " << delta << "\n";
+         }
+         return false;
+      }
+
+      if (print_level > 0) {
+         mfem::out << "TR: decreased delta to " << delta << "\n";
+      }
+      return true;
+   }
+
+   /**
+    * @brief Increase the trust-region radius after a successful step.
+    *
+    * @param[in,out] delta Current radius, modified on output
+    * @param[in] print_level Verbosity level for output
+    *
+    * @details Multiplies delta by xiIncDelta and clamps at deltaMax.
+    */
+   void IncrDelta(double &delta, int print_level = 0) const
+   {
+      delta *= xiIncDelta;
+      if (delta > deltaMax) {
+         delta = deltaMax;
+         if (print_level > 0) {
+            mfem::out << "TR: delta at maximum " << delta << "\n";
+         }
+      }
+      else if (print_level > 0) {
+         mfem::out << "TR: increased delta to " << delta << "\n";
+      }
+   }
+
+   /**
+    * @brief Update trust-region radius based on actual vs predicted residual change.
+    *
+    * @param[in,out] delta Trust-region radius, modified on output
+    * @param[in] res New residual norm (after the candidate step)
+    * @param[in] res_0 Previous residual norm (before the candidate step)
+    * @param[in] pred_resid Predicted residual norm from the dogleg model
+    * @param[out] reject Whether the step should be rejected (residual increased)
+    * @param[in] took_full Whether the full Newton step was taken
+    * @param[in] norm_full Norm of the full Newton step
+    * @param[out] rho Actual / predicted reduction ratio (output for diagnostics)
+    * @param[in] print_level Verbosity level for output
+    * @return true if the delta update succeeded, false if the solver should fail
+    *
+    * @details Algorithm (ported from SNLS TrDeltaControl::updateDelta):
+    *   1. Compute actual_change = res - res_0 and pred_change = pred_resid - res_0
+    *   2. If pred_change is exactly zero, force delta larger (or fail if at max)
+    *   3. Otherwise compute rho = actual_change / pred_change
+    *   4. If rho is in the "good" band [xiLG, xiUG] and the residual decreased,
+    *      increase delta (unless the full Newton step was already taken)
+    *   5. If rho is outside the "ok" band [xiLO, xiUO], decrease delta
+    *   6. If the residual increased and rejectResIncrease is set, mark for rejection
+    */
+   bool UpdateDelta(double &delta, double res, double res_0,
+                    double pred_resid, bool &reject, bool took_full,
+                    double norm_full, double &rho,
+                    int print_level = 0) const
+   {
+      bool success = true;
+      double actual_change = res - res_0;
+      double pred_change = pred_resid - res_0;
+
+      if (pred_change == 0.0) {
+         if (delta >= deltaMax) {
+            if (print_level >= 0) {
+               mfem::out << "TR: predicted change is zero and delta at max\n";
+            }
+            success = false;
+         }
+         else {
+            if (print_level > 0) {
+               mfem::out << "TR: predicted change is zero, forcing delta larger\n";
+            }
+            delta = std::min(delta * xiForcedIncDelta, deltaMax);
+         }
+      }
+      else {
+         rho = actual_change / pred_change;
+         if (print_level > 0) {
+            mfem::out << "TR: rho = " << rho << "\n";
+         }
+
+         if ((rho > xiLG) && (actual_change < 0.0) && (rho < xiUG)) {
+            // Step is in the "good" band and residual actually decreased
+            if (!took_full) {
+               IncrDelta(delta, print_level);
+            }
+         }
+         else if ((rho < xiLO) || (rho > xiUO)) {
+            // Step quality is outside the acceptable band; shrink delta
+            success = DecrDelta(delta, norm_full, took_full, print_level);
+         }
+      }
+
+      reject = false;
+      // Do not make this >=, may have res and res_0 both zero and that is ok
+      if ((actual_change > 0.0) && rejectResIncrease) {
+         reject = true;
+      }
+
+      return success;
+   }
+};
+
+/**
+ * @brief Trust-region dogleg solver for nonlinear solid mechanics problems.
+ *
+ * @details This class implements a Powell-dogleg trust-region method for solving
+ * nonlinear systems F(x) = b. It extends ExaNewtonSolver and reuses the same
+ * Krylov solver infrastructure (prec_mech) for computing the Newton direction.
+ *
+ * The trust-region method augments standard Newton with a globalization strategy
+ * that interpolates between the steepest descent direction and the full Newton
+ * step, constrained to a trust-region radius delta. Step quality is monitored
+ * via the ratio rho = actual_reduction / predicted_reduction, and delta is
+ * adjusted up or down accordingly.
+ *
+ * This is a direct port of SNLS's SNLSTrDlDenseG solver, lifted from the
+ * material-point dense system to the global FE system.
+ *
+ * Algorithm at each iteration:
+ *   1. Compute steepest descent direction g = J^T * r (gradient of merit f = 0.5 ||F||^2)
+ *   2. Compute ||J*g||^2 for the optimal Cauchy step length
+ *   3. Solve J * c = r for the full Newton direction (using prec_mech Krylov solver)
+ *   4. Compute the dogleg step within the trust region
+ *   5. Evaluate the residual at the trial point
+ *   6. Accept or reject based on the rho ratio; update delta accordingly
+ *
+ * Requirements:
+ * - The gradient operator must support MultTranspose (for J^T*r computation).
+ *   This means the assembly mode must be EA, FA, or PA with the native PA
+ *   transpose kernels enabled.
+ *
+ * @ingroup ExaConstit_solvers
+ */
+class ExaTrustRegionSolver : public ExaNewtonSolver
+{
+   public:
+      /**
+       * @brief Default constructor
+       *
+       * @details Creates an ExaTrustRegionSolver instance for single-processor
+       * execution. The operator and linear solver must be set separately using
+       * SetOperator() and SetSolver(), and the trust-region control parameters
+       * may be customized via SetTrustRegionControl().
+       */
+      ExaTrustRegionSolver() { }
+
+#ifdef MFEM_USE_MPI
+      /**
+       * @brief MPI constructor
+       *
+       * @param _comm MPI communicator for parallel execution
+       *
+       * @details Creates an ExaTrustRegionSolver instance for parallel execution
+       * using the specified MPI communicator. All trust-region scalar quantities
+       * (norms, dot products) use MPI-aware reductions through MFEM's Dot/Norm.
+       */
+      ExaTrustRegionSolver(MPI_Comm _comm) : ExaNewtonSolver(_comm) { }
+#endif
+
+      /** @brief Use parent class SetOperator methods */
+      using ExaNewtonSolver::SetOperator;
+
+      /** @brief Use parent class SetSolver methods */
+      using ExaNewtonSolver::SetSolver;
+
+      /** @brief Use parent class CGSolver method (Krylov solve wrapper) */
+      using ExaNewtonSolver::CGSolver;
+
+      /**
+       * @brief Set trust-region control parameters.
+       *
+       * @param ctrl TrDeltaControl struct with all tuning parameters
+       *
+       * @details Replaces the internal control parameters with a user-supplied
+       * configuration. Typically called after construction (and before Mult())
+       * to wire up parameters parsed from the TOML configuration file.
+       */
+      void SetTrustRegionControl(const TrDeltaControl &ctrl)
+      {
+         delta_ctrl = ctrl;
+      }
+
+      /**
+       * @brief Get a mutable reference to the trust-region control parameters.
+       * @return Reference to the internal TrDeltaControl
+       */
+      TrDeltaControl& GetTrustRegionControl() { return delta_ctrl; }
+
+      /**
+       * @brief Get a const reference to the trust-region control parameters.
+       * @return Const reference to the internal TrDeltaControl
+       */
+      const TrDeltaControl& GetTrustRegionControl() const { return delta_ctrl; }
+
+      /**
+       * @brief Solve the nonlinear system F(x) = b using trust-region dogleg method.
+       *
+       * @param b Right-hand side vector (if b.Size() != Height(), assumes b = 0)
+       * @param x Solution vector (input: initial guess, output: converged solution)
+       *
+       * @details Implements the trust-region dogleg algorithm. See class-level
+       * documentation for the algorithm description. The Newton direction is
+       * computed by the Krylov solver wired in via SetSolver(); J^T*r is
+       * computed by calling MultTranspose() on the gradient operator.
+       *
+       * @pre SetOperator() and SetSolver() must be called before Mult()
+       * @pre The gradient operator must support MultTranspose (EA/FA mode, or
+       *      PA mode with native transpose kernels)
+       *
+       * @post final_iter contains the number of iterations performed
+       * @post final_norm contains the final residual norm
+       * @post converged flag indicates whether the solver converged
+       */
+      virtual void Mult(const mfem::Vector &b, mfem::Vector &x) const;
+
+   private:
+      /**
+       * @brief Compute the dogleg step given the current trust-region radius.
+       *
+       * @param[in] delta Trust-region radius
+       * @param[in] res_0 Current residual norm
+       * @param[in] nr_norm Norm of the full Newton step
+       * @param[in] Jg_2 ||J*g||^2 where g is the steepest descent direction
+       * @param[in] grad Steepest descent direction g = J^T * r
+       * @param[in] nrStep Full Newton step
+       * @param[out] delx The computed dogleg step
+       * @param[out] pred_resid Predicted residual norm after the step
+       * @param[out] use_nr Whether the full Newton step was taken
+       *
+       * @details Ported from SNLS's dogleg() kernel. The dogleg path interpolates
+       * between the steepest descent direction (Cauchy point) and the full Newton
+       * step. Three cases are handled:
+       *   - Newton step inside delta: take full Newton step
+       *   - Cauchy point outside delta: step along steepest descent to boundary
+       *   - Cauchy inside, Newton outside: solve quadratic for the dogleg leg
+       *     intersection with the trust-region boundary
+       */
+      void Dogleg(double delta, double res_0, double nr_norm,
+                  double Jg_2, const mfem::Vector &grad,
+                  const mfem::Vector &nrStep, mfem::Vector &delx,
+                  double &pred_resid, bool &use_nr) const;
+
+      /// @brief Trust-region control parameters (mutable to allow tuning)
+      mutable TrDeltaControl delta_ctrl;
+};
\ No newline at end of file
diff --git a/src/system_driver.cpp b/src/system_driver.cpp
index 15f4e2b..955146f 100644
--- a/src/system_driver.cpp
+++ b/src/system_driver.cpp
@@ -3,6 +3,7 @@
 
 #include "boundary_conditions/BCData.hpp"
 #include "boundary_conditions/BCManager.hpp"
+#include "solvers/trust_region_solver.hpp"
 #include "utilities/mechanics_kernels.hpp"
 #include "utilities/mechanics_log.hpp"
 #include "utilities/unified_logger.hpp"
@@ -358,10 +359,47 @@ SystemDriver::SystemDriver(std::shared_ptr<SimulationState> sim_state)
     if (nonlinear_solver.nl_solver == NonlinearSolverType::NR) {
         newton_solver = std::make_unique<ExaNewtonSolver>(
             m_sim_state->GetMeshParFiniteElementSpace()->GetComm());
-    } else if (nonlinear_solver.nl_solver == NonlinearSolverType::NRLS) {
+    }
+    else if (nonlinear_solver.nl_solver == NonlinearSolverType::NRLS) {
         newton_solver = std::make_unique<ExaNewtonLSSolver>(
             m_sim_state->GetMeshParFiniteElementSpace()->GetComm());
     }
+    else if (nonlinear_solver.nl_solver == NonlinearSolverType::TRDOG) {
+        // Build the trust-region dogleg solver and configure delta-control
+        // parameters from the parsed TOML options. If the user did not supply
+        // a [trust_region] sub-table, the solver's internal defaults (matching
+        // SNLS's TrDeltaControl defaults) are used.
+        auto tr_solver = std::make_unique<ExaTrustRegionSolver>(
+            m_sim_state->GetMeshParFiniteElementSpace()->GetComm());
+
+        if (nonlinear_solver.trust_region.has_value()) {
+            const auto& tr_opts = nonlinear_solver.trust_region.value();
+            TrDeltaControl ctrl;
+            ctrl.deltaInit         = tr_opts.delta_init;
+            ctrl.deltaMin          = tr_opts.delta_min;
+            ctrl.deltaMax          = tr_opts.delta_max;
+            ctrl.xiLG              = tr_opts.xi_lg;
+            ctrl.xiUG              = tr_opts.xi_ug;
+            ctrl.xiLO              = tr_opts.xi_lo;
+            ctrl.xiUO              = tr_opts.xi_uo;
+            ctrl.xiIncDelta        = tr_opts.xi_inc;
+            ctrl.xiDecDelta        = tr_opts.xi_dec;
+            ctrl.xiForcedIncDelta  = tr_opts.xi_forced_inc;
+            ctrl.rejectResIncrease = tr_opts.reject_increase;
+            tr_solver->SetTrustRegionControl(ctrl);
+        }
+
+        newton_solver = std::move(tr_solver);
+
+        // Sanity check: TRDOG requires gradient transpose support (J^T*r). For
+        // PA mode, this requires the native PA transpose kernels in the
+        // integrator. EA and FULL always support transpose. We warn rather than
+        // hard-fail here because PA support exists once the kernels are wired.
+        if (options.solvers.assembly == AssemblyType::PA) {
+            mfem::out << "Note: TRDOG with PA assembly requires native PA transpose "
+                      << "kernels in the gradient operator.\n";
+        }
+    }
 
     // Set the newton solve parameters
     newton_solver->iterative_mode = true;

From f9cf59de33e1d5c2343f000b326549b495ca924d Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Mon, 27 Apr 2026 13:24:38 -0700
Subject: [PATCH 2/7] Minor update of BC types as std::array<bool> is better
 supported than mfem::Array<bool> Was getting some odd GPU failures here at
 one point and just moved over to std::array<bool, 3> to get rid of them as I
 didn't want to deal with the odd memory issues I was hitting. The MFEM stuff
 was still fine. I just think I was hitting an odd bug in some GPU kernel, but
 moving to the std::array is ultimately the better approach as it honestly
 makes more sense for this stuff...

---
 src/boundary_conditions/BCData.cpp    |  2 +-
 src/boundary_conditions/BCData.hpp    |  3 ++-
 src/boundary_conditions/BCManager.cpp | 28 +++++++++++----------------
 3 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/boundary_conditions/BCData.cpp b/src/boundary_conditions/BCData.cpp
index 3714bc1..334e650 100644
--- a/src/boundary_conditions/BCData.cpp
+++ b/src/boundary_conditions/BCData.cpp
@@ -64,7 +64,7 @@ void BCData::SetScales() {
     }
 }
 
-void BCData::GetComponents(int id, mfem::Array<bool>& component) {
+void BCData::GetComponents(int id, std::array<bool, 3>& component) {
     switch (id) {
     case 0:
         component[0] = false;
diff --git a/src/boundary_conditions/BCData.hpp b/src/boundary_conditions/BCData.hpp
index 075e46b..184cb5e 100644
--- a/src/boundary_conditions/BCData.hpp
+++ b/src/boundary_conditions/BCData.hpp
@@ -5,6 +5,7 @@
 #include "mfem.hpp"
 #include "mfem/linalg/vector.hpp"
 
+#include <array>
 #include <fstream>
 
 /**
@@ -101,6 +102,6 @@ class BCData {
      * - id = 6: (true, false, true)
      * - id = 7: (true, true, true)
      */
-    static void GetComponents(int id, mfem::Array<bool>& component);
+    static void GetComponents(int id, std::array<bool, 3>& component);
 };
 #endif
diff --git a/src/boundary_conditions/BCManager.cpp b/src/boundary_conditions/BCManager.cpp
index 5f0e7db..312a685 100644
--- a/src/boundary_conditions/BCManager.cpp
+++ b/src/boundary_conditions/BCManager.cpp
@@ -13,14 +13,12 @@ void BCManager::UpdateBCData(std::unordered_map<std::string, mfem::Array<int>>&
     ess_bdr["total"] = 0;
     scale = 0.0;
 
-    auto ess_comp = map_ess_comp["total"].find(step)->second;
-    auto ess_id = map_ess_id["total"].find(step)->second;
+    const auto& ess_comp = map_ess_comp["total"].find(step)->second;
+    const auto& ess_id = map_ess_id["total"].find(step)->second;
 
-    mfem::Array<bool> cmp_row;
-    cmp_row.SetSize(3);
+    std::array<bool, 3> cmp_row;
 
     component["total"] = false;
-    cmp_row = false;
 
     for (size_t i = 0; i < ess_id.size(); ++i) {
         // set the active boundary attributes
@@ -48,19 +46,17 @@ void BCManager::UpdateBCData(mfem::Array<int>& ess_bdr,
 
     // The size here is set explicitly
     component.SetSize(ess_bdr.Size(), 3);
-    mfem::Array<bool> cmp_row;
-    cmp_row.SetSize(3);
+    std::array<bool, 3> cmp_row;
 
     component = false;
-    cmp_row = false;
 
     if (map_ess_vel.find(step) == map_ess_vel.end()) {
         return;
     }
 
-    auto ess_vel = map_ess_vel.find(step)->second;
-    auto ess_comp = map_ess_comp["ess_vel"].find(step)->second;
-    auto ess_id = map_ess_id["ess_vel"].find(step)->second;
+    const auto& ess_vel = map_ess_vel.find(step)->second;
+    const auto& ess_comp = map_ess_comp["ess_vel"].find(step)->second;
+    const auto& ess_id = map_ess_id["ess_vel"].find(step)->second;
 
     for (size_t i = 0; i < ess_id.size(); ++i) {
         // set the active boundary attributes
@@ -111,19 +107,17 @@ void BCManager::UpdateBCData(mfem::Array<int>& ess_bdr,
 
     // The size here is set explicitly
     component.SetSize(ess_bdr.Size(), 3);
-    mfem::Array<bool> cmp_row;
-    cmp_row.SetSize(3);
+    std::array<bool, 3> cmp_row;
 
     component = false;
-    cmp_row = false;
 
     if (map_ess_vgrad.find(step) == map_ess_vgrad.end()) {
         return;
     }
 
-    auto ess_vgrad = map_ess_vgrad.find(step)->second;
-    auto ess_comp = map_ess_comp["ess_vgrad"].find(step)->second;
-    auto ess_id = map_ess_id["ess_vgrad"].find(step)->second;
+    const auto& ess_vgrad = map_ess_vgrad.find(step)->second;
+    const auto& ess_comp = map_ess_comp["ess_vgrad"].find(step)->second;
+    const auto& ess_id = map_ess_id["ess_vgrad"].find(step)->second;
 
     for (size_t i = 0; i < ess_vgrad.size(); ++i) {
         data[i] = ess_vgrad.at(i);

From eb02364a8570f58d956d365889437644266744ba Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Mon, 27 Apr 2026 13:33:21 -0700
Subject: [PATCH 3/7] [codex] Changes to work with MFEM v4.9+ Had codex help
 work out what changes were needed to get ExaConstit updated to work with MFEM
 v4.9+ as I wanted to make sure we could use newer versions of Hypre as
 apparently that works better with ROCm v7+

---
 src/mfem_expt/partial_qspace.cpp             |  2 ++
 src/postprocessing/postprocessing_driver.cpp |  2 +-
 src/system_driver.cpp                        | 28 ++++----------------
 src/utilities/mechanics_kernels.hpp          |  4 +--
 4 files changed, 10 insertions(+), 26 deletions(-)

diff --git a/src/mfem_expt/partial_qspace.cpp b/src/mfem_expt/partial_qspace.cpp
index 2e0261f..3230313 100644
--- a/src/mfem_expt/partial_qspace.cpp
+++ b/src/mfem_expt/partial_qspace.cpp
@@ -43,6 +43,8 @@ const mfem::Vector& PartialQuadratureSpace::GetGeometricFactorWeights() const {
 void PartialQuadratureSpace::ConstructOffsets() {
     // Set up offsets based on our partial element set
     const int num_partial_elem = local2global.Size();
+    ne = num_partial_elem;
+    full_offset_cache.SetSize(0);
     offsets.SetSize(num_partial_elem + 1);
     int offset = 0;
     for (int i = 0; i < num_partial_elem; i++) {
diff --git a/src/postprocessing/postprocessing_driver.cpp b/src/postprocessing/postprocessing_driver.cpp
index 8212eb2..d1f8025 100644
--- a/src/postprocessing/postprocessing_driver.cpp
+++ b/src/postprocessing/postprocessing_driver.cpp
@@ -1281,7 +1281,7 @@ void PostProcessingDriver::CalcElementAvg(mfem::expt::PartialQuadratureFunction*
 
     // KEY DIFFERENCE: Get the local-to-global element mapping for partial space
     auto l2g = pqs->GetLocal2Global().Read();    // Maps local element index to global element index
-    auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout
+    auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout
     // auto global_offsets = (pqs->GetGlobalOffset().Size() > 1) ?
     //                        pqs->GetGlobalOffset().Read() : loc_offsets; // Offsets for global
     //                        data layout
diff --git a/src/system_driver.cpp b/src/system_driver.cpp
index 955146f..c4b2797 100644
--- a/src/system_driver.cpp
+++ b/src/system_driver.cpp
@@ -291,29 +291,11 @@ SystemDriver::SystemDriver(std::shared_ptr<SimulationState> sim_state)
     } else {
         if (linear_solvers.preconditioner == PreconditionerType::AMG) {
             auto prec_amg = std::make_shared<mfem::HypreBoomerAMG>();
-            HYPRE_Solver h_amg = static_cast<HYPRE_Solver>(*prec_amg);
-            HYPRE_Real st_val = 0.90;
-            HYPRE_Real rt_val = -10.0;
-            // HYPRE_Real om_val = 1.0;
-            //
-            [[maybe_unused]] int ml = HYPRE_BoomerAMGSetMaxLevels(h_amg, 30);
-            ml = HYPRE_BoomerAMGSetCoarsenType(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetMeasureType(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetStrongThreshold(h_amg, st_val);
-            ml = HYPRE_BoomerAMGSetNumSweeps(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetRelaxType(h_amg, 8);
-            // int rwt = HYPRE_BoomerAMGSetRelaxWt(h_amg, rt_val);
-            // int ro = HYPRE_BoomerAMGSetOuterWt(h_amg, om_val);
-            // Dimensionality of our problem
-            ml = HYPRE_BoomerAMGSetNumFunctions(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetSmoothType(h_amg, 6);
-            ml = HYPRE_BoomerAMGSetSmoothNumLevels(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetSmoothNumSweeps(h_amg, 3);
-            ml = HYPRE_BoomerAMGSetVariant(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetOverlap(h_amg, 0);
-            ml = HYPRE_BoomerAMGSetDomainType(h_amg, 1);
-            ml = HYPRE_BoomerAMGSetSchwarzRlxWeight(h_amg, rt_val);
-
+            const int problem_dim = m_sim_state->GetMesh()->SpaceDimension();
+            const bool order_bynodes = (fe_space->GetOrdering() == mfem::Ordering::byNODES);
+            // Use MFEM's supported systems-AMG configuration so Hypre sees
+            // the correct vector-valued DOF ordering on newer MFEM/Hypre builds.
+            prec_amg->SetSystemsOptions(problem_dim, order_bynodes);
             prec_amg->SetPrintLevel(linear_solvers.print_level);
             J_prec = prec_amg;
         } else if (linear_solvers.preconditioner == PreconditionerType::ILU) {
diff --git a/src/utilities/mechanics_kernels.hpp b/src/utilities/mechanics_kernels.hpp
index e7d139a..bcb21cf 100644
--- a/src/utilities/mechanics_kernels.hpp
+++ b/src/utilities/mechanics_kernels.hpp
@@ -542,7 +542,7 @@ double ComputeVolAvgTensorFilterFromPartial(const mfem::expt::PartialQuadratureF
 
     // Get the local-to-global element mapping and data layout info
     auto l2g = pqs->GetLocal2Global().Read();    // Maps local element index to global element index
-    auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout
+    auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout
     auto global_offsets = (pqs->GetGlobalOffset().Size() > 1)
                               ? pqs->GetGlobalOffset().Read()
                               : loc_offsets; // Offsets for global data layout
@@ -763,7 +763,7 @@ double ComputeVolAvgTensorFromPartial(const mfem::expt::PartialQuadratureFunctio
 
     // Get the local-to-global element mapping and data layout info
     auto l2g = pqs->GetLocal2Global().Read();    // Maps local element index to global element index
-    auto loc_offsets = pqs->getOffsets().Read(); // Offsets for local data layout
+    auto loc_offsets = pqs->Offsets(mfem::QSpaceOffsetStorage::COMPRESSED).Read(); // Offsets for local data layout
     auto global_offsets = (pqs->GetGlobalOffset().Size() > 1)
                               ? pqs->GetGlobalOffset().Read()
                               : loc_offsets; // Offsets for global data layout

From 7c06599013b969ca0e5eb3f0d52538d51b5a74d9 Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Mon, 27 Apr 2026 14:43:35 -0700
Subject: [PATCH 4/7] [codex partially] Fix a couple different fun bugs in
 post-processing Found a couple bugs in the post-processing: First one is that
 volume average values were getting outputted every time step designated for
 the viz files and the viz files were getting outputted every time step. So
 caught and fixed that issue. Next found in certain cases there could be
 segfaults due to some models not having a variable defined and thus a vdim
 equal to 0 causing segfaults which was a fun bug to run down... Finally this
 was one I had codex help dissect and chase down, we had some fun MPI stalls
 occurring and it was due to how data was being solved requiring the global
 communicator rather than a region defined communicator... Didn't notice this
 in earlier testing as our regions were usually on every rank... I'd still
 blame this one though on how MFEM is handling the communicators for meshes /
 par finite element spaces and in turn how this relates to the data
 collections as well...

---
 src/postprocessing/postprocessing_driver.cpp  | 37 ++++++++++++++-----
 .../postprocessing_file_manager.hpp           | 18 ++++++---
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/src/postprocessing/postprocessing_driver.cpp b/src/postprocessing/postprocessing_driver.cpp
index d1f8025..138e828 100644
--- a/src/postprocessing/postprocessing_driver.cpp
+++ b/src/postprocessing/postprocessing_driver.cpp
@@ -531,17 +531,15 @@ void PostProcessingDriver::UpdateFields([[maybe_unused]] const int step,
 void PostProcessingDriver::Update(const int step, const double time) {
     CALI_CXX_MARK_SCOPE("postprocessing_update");
     UpdateFields(step, time);
-    // Check if we should output volume averages at this step
-    if (ShouldOutputAtStep(step)) {
-        PrintVolValues(time, m_aggregation_mode);
-        ClearVolumeAverageCache();
-    }
 
     // Update data collections for visualization
-    if (m_enable_visualization) {
+    if (ShouldOutputAtStep(step) && m_enable_visualization) {
         UpdateDataCollections(step, time);
     }
 
+    PrintVolValues(time, m_aggregation_mode);
+    ClearVolumeAverageCache();
+
     if (m_light_up_instances.size() > 0) {
         UpdateLightUpAnalysis();
     }
@@ -1393,6 +1391,9 @@ void PostProcessingDriver::InitializeGridFunctions() {
                     const auto gf_name = GetGridFunctionName(reg.display_name, reg_int);
                     // Determine vector dimension from quadrature function
                     const int vdim = reg.region_length[region];
+                    if (vdim < 1) {
+                        continue;
+                    }
                     max_vdim = (vdim > max_vdim) ? vdim : max_vdim;
                     auto fe_space = GetParFiniteElementSpace(reg_int, vdim);
                     m_map_gfs.emplace(gf_name,
@@ -1467,18 +1468,31 @@ void PostProcessingDriver::InitializeDataCollections(ExaOptions& options) {
         return input.substr(0, pos);
     };
 
+    auto has_registered_fields = [this](const std::string& display_region_postfix) {
+        for (const auto& [key, value] : m_map_gfs) {
+            (void)value;
+            if (key.find(display_region_postfix) != std::string::npos) {
+                return true;
+            }
+        }
+        return false;
+    };
+
     if (m_aggregation_mode == AggregationMode::PER_REGION ||
         m_aggregation_mode == AggregationMode::BOTH) {
         for (int region = 0; region < static_cast<int>(m_num_regions); ++region) {
             auto mesh = m_map_submesh[region];
             std::string region_postfix = "region_" + std::to_string(region + 1);
             std::string display_region_postfix = " " + m_sim_state->GetRegionDisplayName(region);
+            if (!has_registered_fields(display_region_postfix)) {
+                continue;
+            }
             fs::path output_dir = output_dir_base / region_postfix;
             fs::path output_dir_vizs = output_dir / m_file_manager->GetBaseFilename();
-            if (m_sim_state->IsRegionActive(region)) {
-                auto region_comm = m_sim_state->GetRegionCommunicator(region);
-                m_file_manager->EnsureDirectoryExists(output_dir, region_comm);
-            }
+            // The subsequent DataCollection::Save() is a parallel operation on the submesh's
+            // communicator, which is still the parent MPI communicator. Prepare directories on
+            // that same communicator so all participating ranks observe the same path state.
+            m_file_manager->EnsureDirectoryExists(output_dir, MPI_COMM_WORLD);
             std::vector<std::string> dcs_keys;
             if (options.visualization.visit) {
                 std::string key = visit_key + region_postfix;
@@ -1534,6 +1548,9 @@ void PostProcessingDriver::InitializeDataCollections(ExaOptions& options) {
 
         std::string region_postfix = "global";
         std::string display_region_postfix = " " + m_sim_state->GetRegionDisplayName(-1);
+        if (!has_registered_fields(display_region_postfix)) {
+            return;
+        }
         fs::path output_dir = output_dir_base / region_postfix;
         fs::path output_dir_vizs = output_dir / m_file_manager->GetBaseFilename();
         m_file_manager->EnsureDirectoryExists(output_dir);
diff --git a/src/postprocessing/postprocessing_file_manager.hpp b/src/postprocessing/postprocessing_file_manager.hpp
index f070029..3784f31 100644
--- a/src/postprocessing/postprocessing_file_manager.hpp
+++ b/src/postprocessing/postprocessing_file_manager.hpp
@@ -186,7 +186,7 @@ class PostProcessingFileManager {
         auto filepath = GetVolumeAverageFilePath(calc_type, region, region_name);
 
         bool file_exists = fs::exists(filepath);
-        auto file = CreateOutputFile(filepath, true);
+        auto file = CreateOutputFile(filepath, true, comm);
 
         if (file && file->is_open()) {
             if (!file_exists) {
@@ -452,6 +452,7 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
     int rank;
     MPI_Comm_rank(comm, &rank);
     bool success = false;
+    std::string path_str;
     if (rank == 0) {
         try {
             // Use weakly_canonical to resolve as much as possible
@@ -474,6 +475,7 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
                 } else {
                     std::cout << "Using existing directory: " << canonical_path << std::endl;
                     output_dir = canonical_path;
+                    path_str = canonical_path.string();
                     success = true;
                 }
             } else {
@@ -482,6 +484,8 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
                 success = fs::create_directories(canonical_path);
                 if (success) {
                     output_dir = canonical_path;
+                    path_str = canonical_path.string();
+
                 } else {
                     std::cerr << "Warning: Failed to create output directory: " << canonical_path
                               << std::endl;
@@ -513,15 +517,17 @@ inline bool PostProcessingFileManager::EnsureDirectoryExists(fs::path& output_di
     }
 
     // Broadcast the potentially updated output_dir to all ranks
-    std::string path_str = output_dir.string();
     int dir_length = static_cast<int>(path_str.length());
     MPI_Bcast(&dir_length, 1, MPI_INT, 0, comm);
-    path_str.resize(static_cast<size_t>(dir_length));
-    MPI_Bcast(&path_str[0], dir_length, MPI_CHAR, 0, comm);
-    output_dir = path_str;
+    if (dir_length > 0) {
+        path_str.resize(static_cast<size_t>(dir_length));
+        MPI_Bcast(path_str.data(), dir_length, MPI_CHAR, 0, comm);
+        output_dir = path_str;
+    }
 
     bool success_t = false;
-    MPI_Allreduce(&success, &success_t, 1, MPI_C_BOOL, MPI_LOR, comm);
+    MPI_Bcast(&success, 1, MPI_C_BOOL, 0, comm);
+    success_t = success;
     return success_t;
 }
 

From 5dd1ca99afef6d19153a6a8cb906401650545480 Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Mon, 27 Apr 2026 14:55:17 -0700
Subject: [PATCH 5/7] [codex] Nasty GPU-bug fix... Was running into a nasty
 GPU-bug where on newer versions of MFEM if we used > 1 GPUs we got different
 answers than our CPU runs as certain terms were just 0.0. I could not for the
 life of me figure it out other than it was likely due to some thing in the
 velocity field near the time the boundary conditions were being applied was
 not getting set. I threw codex at the problem and it was able to over a
 couple iterations of debugging work out the error and find a suitable new
 MFEM API that we could use that fixed the GPU error and kept our answers on
 the CPU the same.

---
 src/fem_operators/mechanics_operator.cpp | 11 ++++++++++-
 src/system_driver.cpp                    | 11 +++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/fem_operators/mechanics_operator.cpp b/src/fem_operators/mechanics_operator.cpp
index b95cd74..93b1ebe 100644
--- a/src/fem_operators/mechanics_operator.cpp
+++ b/src/fem_operators/mechanics_operator.cpp
@@ -13,6 +13,15 @@
 #include <iostream>
 #include <stdexcept>
 
+namespace {
+void GetTrueDofsParallel(const mfem::ParGridFunction& gf, mfem::Vector& true_dofs) {
+    // used to do something like:
+    // gf.GetTrueDofs(true_dofs);
+    // but looks like there are issues with that on the GPUs with newer versions of MFEM
+    gf.ParallelAverage(true_dofs);
+}
+} // namespace
+
 NonlinearMechOperator::NonlinearMechOperator(mfem::Array<int>& ess_bdr,
                                              mfem::Array2D<bool>& ess_bdr_comp,
                                              std::shared_ptr<SimulationState> sim_state)
@@ -259,7 +268,7 @@ void NonlinearMechOperator::CalculateDeformationGradient(mfem::QuadratureFunctio
 
     mfem::Vector x_true(fe_space->TrueVSize(), mfem::Device::GetMemoryType());
 
-    x_cur->GetTrueDofs(x_true);
+    GetTrueDofsParallel(*x_cur, x_true);
     // Takes in k vector and transforms into into our E-vector array
     P->Mult(x_true, px);
     elem_restrict_lex->Mult(px, el_x);
diff --git a/src/system_driver.cpp b/src/system_driver.cpp
index c4b2797..d7a6934 100644
--- a/src/system_driver.cpp
+++ b/src/system_driver.cpp
@@ -46,6 +46,13 @@ void DirBdrFunc(int attr_id, mfem::Vector& y) {
 
 namespace {
 
+void GetTrueDofsParallel(const mfem::ParGridFunction& gf, mfem::Vector& true_dofs) {
+    // used to do something like:
+    // gf.GetTrueDofs(true_dofs);
+    // but looks like there are issues with that on the GPUs with newer versions of MFEM
+    gf.ParallelAverage(true_dofs);
+}
+
 /**
  * @brief Helper function to find mesh bounding box for velocity gradient calculations
  *
@@ -518,7 +525,7 @@ void SystemDriver::UpdateVelocity() {
                                                         // pulled off the
                                                         // VectorFunctionRestrictedCoefficient
         // populate the solution vector, v_sol, with the true dofs entries in v_cur.
-        velocity->GetTrueDofs(*vel_tdofs);
+        GetTrueDofsParallel(*velocity, *vel_tdofs);
     }
 
     if (ess_bdr["ess_vgrad"].Sum() > 0) {
@@ -607,7 +614,7 @@ void SystemDriver::UpdateVelocity() {
             mfem::Vector vel_tdof_tmp(*vel_tdofs);
             vel_tdof_tmp.UseDevice(true);
             vel_tdof_tmp = 0.0;
-            velocity->GetTrueDofs(vel_tdof_tmp);
+            GetTrueDofsParallel(*velocity, vel_tdof_tmp);
 
             mfem::Array<int> ess_tdofs(mech_operator->GetEssentialTrueDofs());
             if (!mono_def_flag) {

From 457bfbcc522810b14d67bbcc68231b028f3a454c Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Mon, 27 Apr 2026 15:01:43 -0700
Subject: [PATCH 6/7] Use Umpire as memory pool manager if MFEM was built with
 it

---
 src/mechanics_driver.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mechanics_driver.cpp b/src/mechanics_driver.cpp
index 0e9520e..4f3efca 100644
--- a/src/mechanics_driver.cpp
+++ b/src/mechanics_driver.cpp
@@ -211,7 +211,17 @@ int main(int argc, char* argv[]) {
          */
         mfem::Device device;
         if (toml_opt.solvers.rtmodel == RTModel::GPU) {
+#if defined(MFEM_USE_UMPIRE)
+            device.SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE);
+#else
             device.SetMemoryTypes(mfem::MemoryType::HOST_64, mfem::MemoryType::DEVICE);
+#endif
+        } else {
+#if defined(MFEM_USE_UMPIRE)
+            device.SetMemoryTypes(mfem::MemoryType::HOST_UMPIRE, mfem::MemoryType::DEVICE_UMPIRE);
+#else
+            device.SetMemoryTypes(mfem::MemoryType::HOST_64, mfem::MemoryType::DEVICE);
+#endif
         }
         device.Configure(device_config.c_str());
 

From 12657c1757c836bee1dedbedfb67a93f8058db43 Mon Sep 17 00:00:00 2001
From: Robert Carson <carson16@llnl.gov>
Date: Sat, 2 May 2026 11:35:07 -0700
Subject: [PATCH 7/7] update install script dependencies

---
 scripts/install/common/dependency_versions.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/scripts/install/common/dependency_versions.sh b/scripts/install/common/dependency_versions.sh
index 86f22fe..94b5180 100644
--- a/scripts/install/common/dependency_versions.sh
+++ b/scripts/install/common/dependency_versions.sh
@@ -2,25 +2,25 @@
 # Central version control for all dependencies
 
 # Portability libraries
-export CAMP_VER="v2025.09.2"
-export RAJA_VER="v2025.09.1"
+export CAMP_VER="v2025.12.0"
+export RAJA_VER="v2025.12.2"
 #export UMPIRE_VER="v2025.09.0"
 # For now we need something a little pass the v2025.09.0 release
 # for Umpire as we need a small bug fix for any build with Umpire
-export UMPIRE_VER="54a1909e91ce9604328977974e9b1002bf9f8781"
-export CHAI_VER="v2025.09.1"
+export UMPIRE_VER="v2025.12.0"
+export CHAI_VER="v2025.12.0"
 
 # Material models
 export EXACMECH_REPO="https://github.com/LLNL/ExaCMech.git"
 export EXACMECH_BRANCH="develop"
 
 # FEM infrastructure
-export HYPRE_VER="v2.32.0"
+export HYPRE_VER="v3.1.0"
 export METIS_VER="5.1.0"
 export METIS_URL="https://mfem.github.io/tpls/metis-${METIS_VER}.tar.gz"
 
 export MFEM_REPO="https://github.com/rcarson3/mfem.git"
-export MFEM_BRANCH="exaconstit-dev"
+export MFEM_BRANCH="exaconstit-latest"
 
 # Main application
 export EXACONSTIT_REPO="https://github.com/llnl/ExaConstit.git"