ValeevGroup · evaleev · May 23, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -448,6 +448,7 @@ set(SeQuant_optimize_src
         SeQuant/core/optimize/common_subexpression_elimination.hpp
         SeQuant/core/optimize/fusion.cpp
         SeQuant/core/optimize/fusion.hpp
+        SeQuant/core/optimize/options.hpp
         SeQuant/core/optimize/optimize.cpp
         SeQuant/core/optimize/optimize.hpp
         SeQuant/core/optimize/single_term.hpp

diff --git a/SeQuant/core/optimize/optimize.cpp b/SeQuant/core/optimize/optimize.cpp
@@ -4,105 +4,196 @@
 #include <SeQuant/core/eval/eval_expr.hpp>
 #include <SeQuant/core/expr.hpp>
 #include <SeQuant/core/hash.hpp>
+#include <SeQuant/core/index.hpp>
 #include <SeQuant/core/optimize/optimize.hpp>
 #include <SeQuant/core/optimize/single_term.hpp>
 #include <SeQuant/core/optimize/sum.hpp>
+#include <SeQuant/core/runtime.hpp>
 #include <SeQuant/core/utility/indices.hpp>
 #include <SeQuant/core/utility/macros.hpp>
 
 #include <range/v3/algorithm/all_of.hpp>
 #include <range/v3/iterator/basic_iterator.hpp>
 #include <range/v3/range/access.hpp>
 #include <range/v3/range/conversion.hpp>
-#include <range/v3/view/tail.hpp>
-#include <range/v3/view/transform.hpp>
-#include <range/v3/view/view.hpp>
+#include <range/v3/view/iota.hpp>
 
 #include <cstddef>
-#include <type_traits>
+#include <string>
+#include <string_view>
 #include <utility>
 
 namespace sequant {
 
-namespace opt {
-
-///
-/// \param expr  Expression to be optimized.
-/// \param idxsz An invocable object that maps an Index object to size.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-/// \return Optimized expression for lower evaluation cost.
-template <typename IdxToSize, typename = std::enable_if_t<std::is_invocable_r_v<
-                                  size_t, IdxToSize, const Index&>>>
-ExprPtr optimize(ExprPtr const& expr, IdxToSize const& idx2size,
-                 bool reorder_sum) {
-  using ranges::views::transform;
+namespace {
+
+index_to_extent_t default_idx_to_size() {
+  return [](Index const& ix) { return ix.space().approximate_size(); };
+}
+
+/// Optimize a Product that contains only Tensor and scalar factors.
+ExprPtr opt_pure_product(Product const& prod, index_to_extent_t const& idx2size,
+                         OptFor opt_for) {
+  if (opt_for == OptFor::Flops)
+    return opt::single_term_opt<OptFor::Flops>(prod, idx2size);
+  SEQUANT_ASSERT(opt_for == OptFor::Memsize);
+  return opt::single_term_opt<OptFor::Memsize>(prod, idx2size);
+}
+
+/// Deliberately non-identifier label prefix used to stand in for non-Tensor,
+/// non-scalar factors during single-term optimization. Chosen so that no
+/// user-defined tensor label can collide with it.
+inline constexpr std::wstring_view placeholder_label_prefix = L"@__opt_";
+
+/// Optimize a Product that contains some non-Tensor, non-scalar factors by
+/// substituting placeholder tensors with target indices, optimizing the
+/// resulting tensor-only product, then swapping the originals back in.
+ExprPtr opt_mixed_product(Product const& prod,
+                          index_to_extent_t const& idx2size, OptFor opt_for) {
+  container::svector<ExprPtr> non_tensors(prod.size());
+  container::svector<ExprPtr> new_factors;
+  new_factors.reserve(prod.size());
+
+  for (std::size_t i = 0; i < prod.size(); ++i) {
+    auto&& f = prod.factor(i);
+    if (f->is<Tensor>() || f->is_scalar()) {
+      new_factors.emplace_back(f);
+    } else {
+      non_tensors[i] = f;
+      auto target_idxs = get_unique_indices(f);
+      new_factors.emplace_back(ex<Tensor>(
+          std::wstring(placeholder_label_prefix) + std::to_wstring(i),
+          bra(target_idxs.bra), ket(target_idxs.ket), aux(target_idxs.aux)));
+    }
+  }
+
+  auto result = opt_pure_product(
+      Product{prod.scalar(), new_factors, Product::Flatten::No}, idx2size,
+      opt_for);
+
+  auto replacer = [&non_tensors](ExprPtr& out) {
+    if (!out->is<Tensor>()) return;
+    auto label = out->as<Tensor>().label();
+    if (!label.starts_with(placeholder_label_prefix)) return;
+
+    // The placeholder prefix is internal; anything carrying it must have been
+    // emitted by this function, with a pure-decimal suffix indexing
+    // non_tensors. Any deviation is a programming error.
+    auto suffix_view = label.substr(placeholder_label_prefix.size());
+    SEQUANT_ASSERT(!suffix_view.empty());
+    std::size_t suffix = 0;
+    for (wchar_t c : suffix_view) {
+      SEQUANT_ASSERT(c >= L'0' && c <= L'9');
+      suffix = suffix * 10 + static_cast<std::size_t>(c - L'0');
+    }
+    SEQUANT_ASSERT(suffix < non_tensors.size() && non_tensors[suffix]);
+    out = non_tensors[suffix].clone();
+  };
+
+  result->visit(replacer, /* atoms_only = */ true);
+  return result;
+}
+
+/// Recursive workhorse. \p parallel_outer controls whether the (single)
+/// outermost Sum's summands are processed in parallel; nested recursive
+/// calls always run sequentially to avoid `sequant::for_each` oversubscription.
+ExprPtr optimize_impl(ExprPtr const& expr, index_to_extent_t const& idx2size,
+                      OptFor opt_for, bool reorder, bool parallel_outer) {
   if (expr->is<Product>()) {
-    if (ranges::all_of(*expr, [](auto&& x) {
-          return x->template is<Tensor>() || x->is_scalar();
-        }))
-      return opt::single_term_opt(expr->as<Product>(), idx2size);
-    else {
-      auto const& prod = expr->as<Product>();
-
-      container::svector<ExprPtr> non_tensors(prod.size());
-      container::svector<ExprPtr> new_factors;
-
-      for (auto i = 0; i < prod.size(); ++i) {
-        auto&& f = prod.factor(i);
-        if (f->is<Tensor>() || f->is_scalar())
-          new_factors.emplace_back(f);
-        else {
-          non_tensors[i] = f;
-          auto target_idxs = get_unique_indices(f);
-          new_factors.emplace_back(
-              ex<Tensor>(L"I_" + std::to_wstring(i), bra(target_idxs.bra),
-                         ket(target_idxs.ket), aux(target_idxs.aux)));
-        }
-      }
-
-      auto result = opt::single_term_opt(
-          Product(prod.scalar(), new_factors, Product::Flatten::No), idx2size);
-
-      auto replacer = [&non_tensors](ExprPtr& out) {
-        if (!out->is<Tensor>()) return;
-        auto const& tnsr = out->as<Tensor>();
-        auto&& label = tnsr.label();
-        if (label.at(0) == L'I' && label.at(1) == L'_') {
-          size_t suffix = std::stoi(std::wstring(label.data() + 2));
-          out = non_tensors[suffix].clone();
-        }
-      };
-
-      result->visit(replacer, /* atoms_only = */ true);
-      return result;
+    auto const& prod = expr->as<Product>();
+    bool pure = ranges::all_of(prod, [](auto&& x) {
+      return x->template is<Tensor>() || x->is_scalar();
+    });
+    return pure ? opt_pure_product(prod, idx2size, opt_for)
+                : opt_mixed_product(prod, idx2size, opt_for);
+  }
+
+  if (expr->is<Sum>()) {
+    auto const& in_sum = expr->as<Sum>();
+    Sum::summands_type new_smands(in_sum.size());
+
+    auto do_term = [&](std::size_t i) {
+      new_smands[i] =
+          optimize_impl(in_sum.summand(i), idx2size, opt_for,
+                        /*reorder=*/false, /*parallel_outer=*/false);
+    };
+
+    // Thread-safety of the parallel branch rests on two invariants; do NOT
+    // break them without re-auditing:
+    //   1. Each task writes a distinct, pre-allocated new_smands[i] slot, and
+    //      the work below (single_term_opt ->
+    //      TensorNetwork::canonicalize_slots) operates on per-task *clones* of
+    //      the input tensors. The lazily populated `mutable` caches on
+    //      Expr/Index (hash_value_, label_, ...) are unsynchronized, so
+    //      concurrent work must never read/write them on a shared (non-cloned)
+    //      node. Index comparison touches only immutable members, so building
+    //      index sets over shared indices is safe.
+    //   2. The binarize() pass below DOES read Index::label() (a lazy cache
+    //      write) on the optimized summands, so it is run *sequentially, after*
+    //      for_each() has joined -- never inside do_term().
+    // The default Context and cardinal_tensor_labels must also be configured
+    // before entering here (their writes are unsynchronized unless
+    // SEQUANT_CONTEXT_MANIPULATION_THREADSAFE); optimize() only reads them.
+    if (parallel_outer && in_sum.size() > 1) {
+      auto indices = ranges::views::iota(std::size_t{0}, in_sum.size());
+      sequant::for_each(indices, do_term);
+    } else {
+      for (std::size_t i = 0; i < in_sum.size(); ++i) do_term(i);
     }
-  } else if (expr->is<Sum>()) {
-    auto smands = *expr | transform([&idx2size](auto&& s) {
-      return optimize(s, idx2size, /* reorder_sum= */ false);
-    }) | ranges::to_vector;
-    auto sum = Sum{smands.begin(), smands.end()};
-    return reorder_sum ? ex<Sum>(opt::reorder(sum)) : ex<Sum>(std::move(sum));
-  } else
-    return expr->clone();
+
+    Sum new_sum(std::move(new_smands), Sum::move_only_tag{});
+    if (!reorder) return ex<Sum>(std::move(new_sum));
+
+    // Binarize once per optimized summand and hand the nodes to reorder()
+    // so they aren't re-built inside clusters(). NOTE: this runs sequentially
+    // by design -- see invariant (2) above.
+    container::vector<FullBinaryNode<EvalExpr>> nodes;
+    nodes.reserve(new_sum.size());
+    for (auto const& s : new_sum.summands()) nodes.push_back(binarize(s));
+    return ex<Sum>(opt::reorder(new_sum, nodes));
+  }
+
+  return expr->clone();
 }
 
-}  // namespace opt
+}  // namespace
+
+ExprPtr optimize(ExprPtr const& expr, OptimizeOptions opts) {
+  auto idx2size = opts.idx_to_extent ? std::move(opts.idx_to_extent)
+                                     : default_idx_to_size();
+  return optimize_impl(expr, idx2size, opts.opt_for,
+                       opts.reorder == ReorderSum::Reorder,
+                       /*parallel_outer=*/true);
+}
+
+ResultExpr& optimize(ResultExpr& expr, OptimizeOptions opts) {
+  expr.expression() = optimize(expr.expression(), std::move(opts));
+  return expr;
+}
+
+ResultExpr& optimize(ResultExpr&& expr, OptimizeOptions opts) {
+  return optimize(expr, std::move(opts));
+}
+
+// backwards compatibility overloads
+
+namespace {
+inline OptimizeOptions compatibility_opts(bool reorder_sum) {
+  return OptimizeOptions{.reorder = reorder_sum ? ReorderSum::Reorder
+                                                : ReorderSum::NoReorder};
+}
+}  // namespace
 
 ExprPtr optimize(ExprPtr const& expr, bool reorder_sum) {
-  return opt::optimize(
-      expr, [](Index const& ix) { return ix.space().approximate_size(); },
-      reorder_sum);
+  return optimize(expr, compatibility_opts(reorder_sum));
 }
 
 ResultExpr& optimize(ResultExpr& expr, bool reorder_sum) {
-  expr.expression() = optimize(expr.expression(), reorder_sum);
-
-  return expr;
+  return optimize(expr, compatibility_opts(reorder_sum));
 }
 
 ResultExpr& optimize(ResultExpr&& expr, bool reorder_sum) {
-  return optimize(expr, reorder_sum);
+  return optimize(std::move(expr), compatibility_opts(reorder_sum));
 }
 
 }  // namespace sequant
diff --git a/SeQuant/core/optimize/optimize.hpp b/SeQuant/core/optimize/optimize.hpp
@@ -2,39 +2,53 @@
 #define SEQUANT_OPTIMIZE_OPTIMIZE_HPP
 
 #include <SeQuant/core/expr_fwd.hpp>
+#include <SeQuant/core/optimize/options.hpp>
 
 namespace sequant {
 
-///
-/// Optimize the expression using IndexSpace::approximate_size() for reference
-/// index extent.
+/// Optimize the expression for lower evaluation cost.
 ///
 /// \param expr  Expression to be optimized.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-///                    True by default.
-/// \return Optimized expression for lower evaluation cost.
-ExprPtr optimize(ExprPtr const& expr, bool reorder_sum = true);
-
-/// Optimize the expression using IndexSpace::approximate_size() for reference
-/// index extent.
+/// \param opts  Optimization parameters; see \c OptimizeOptions. By default:
+///              the cost metric is flop count, index extents are taken from
+///              \c IndexSpace::approximate_size(), and the summands of a sum
+///              are reordered to cluster terms that share intermediates.
+/// \return Optimized expression.
+ExprPtr optimize(ExprPtr const& expr, OptimizeOptions opts = {});
+
+/// \copydoc optimize(ExprPtr const&, OptimizeOptions)
+ResultExpr& optimize(ResultExpr& expr, OptimizeOptions opts = {});
+
+/// \copydoc optimize(ExprPtr const&, OptimizeOptions)
+[[nodiscard]] ResultExpr& optimize(ResultExpr&& expr,
+                                   OptimizeOptions opts = {});
+
+// Overloads for backwards compatibility
+
+/// \deprecated Use the \c OptimizeOptions overload instead.
 ///
-/// \param expr  Expression to be optimized.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-///                    True by default.
-/// \return Optimized expression for lower evaluation cost.
-ResultExpr& optimize(ResultExpr& expr, bool reorder_sum = true);
-
-/// Optimize the expression using IndexSpace::approximate_size() for reference
-/// index extent.
+/// Equivalent to calling the primary overload with default \c OptimizeOptions
+/// and \c OptimizeOptions::reorder_sum set to \p reorder_sum.
 ///
-/// \param expr  Expression to be optimized.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-///                    True by default.
-/// \return Optimized expression for lower evaluation cost.
-[[nodiscard]] ResultExpr& optimize(ResultExpr&& expr, bool reorder_sum = true);
+/// \param expr         Expression to be optimized.
+/// \param reorder_sum  If true, reorder the summands of a sum to cluster terms
+///                     that share intermediates.
+/// \return Optimized expression.
+[[deprecated(
+    "use the OptimizeOptions"
+    " overload of optimize() instead")]] ExprPtr
+optimize(ExprPtr const& expr, bool reorder_sum);
+
+/// \copydoc optimize(ExprPtr const&, bool)
+[[deprecated(
+    "use the OptimizeOptions"
+    " overload of optimize() instead")]] ResultExpr&
+optimize(ResultExpr& expr, bool reorder_sum);
+
+/// \copydoc optimize(ExprPtr const&, bool)
+[[nodiscard, deprecated("use the OptimizeOptions"
+                        " overload of optimize() instead")]] ResultExpr&
+optimize(ResultExpr&& expr, bool reorder_sum);
 
 }  // namespace sequant
 

diff --git a/SeQuant/core/optimize/options.hpp b/SeQuant/core/optimize/options.hpp
@@ -0,0 +1,42 @@
+#ifndef SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP
+#define SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP
+
+#include <cstddef>
+#include <functional>
+
+namespace sequant {
+
+class Index;
+
+/// Cost metric to optimize for in single-term and top-level optimize routines.
+enum class OptFor { Flops, Memsize };
+
+/// Whether to reorder summands so terms with shared intermediates appear
+/// closer to each other.
+enum class ReorderSum { Reorder, NoReorder };
+
+/// A type-erased provider mapping an Index to its extent. Used by the public
+/// optimize() API. Callers reaching for the templated opt::single_term_opt
+/// overloads (constrained by \ref opt::has_index_extent) should pass the
+/// callable directly instead of wrapping it here — that keeps the cost
+/// function's call site inlineable, whereas a value of this alias goes
+/// through std::function's type-erased dispatch on every Index lookup.
+using index_to_extent_t = std::function<std::size_t(Index const&)>;
+
+/// Options that control behavior of \ref sequant::optimize.
+struct OptimizeOptions {
+  /// Cost metric to minimize.
+  OptFor opt_for = OptFor::Flops;
+
+  /// Whether to reorder summands so terms with shared intermediates appear
+  /// closer to each other.
+  ReorderSum reorder = ReorderSum::Reorder;
+
+  /// Caller-supplied Index to extent provider. If empty, defaults to
+  /// \c IndexSpace::approximate_size().
+  index_to_extent_t idx_to_extent = {};
+};
+
+}  // namespace sequant
+
+#endif  // SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP