diff --git a/CMakeLists.txt b/CMakeLists.txt
index d5da7c30ab..384717c9d3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -448,6 +448,7 @@ set(SeQuant_optimize_src
         SeQuant/core/optimize/common_subexpression_elimination.hpp
         SeQuant/core/optimize/fusion.cpp
         SeQuant/core/optimize/fusion.hpp
+        SeQuant/core/optimize/options.hpp
         SeQuant/core/optimize/optimize.cpp
         SeQuant/core/optimize/optimize.hpp
         SeQuant/core/optimize/single_term.hpp
diff --git a/SeQuant/core/optimize/optimize.cpp b/SeQuant/core/optimize/optimize.cpp
index bb73493b88..aaeab5e4ba 100644
--- a/SeQuant/core/optimize/optimize.cpp
+++ b/SeQuant/core/optimize/optimize.cpp
@@ -4,9 +4,11 @@
 #include <SeQuant/core/eval/eval_expr.hpp>
 #include <SeQuant/core/expr.hpp>
 #include <SeQuant/core/hash.hpp>
+#include <SeQuant/core/index.hpp>
 #include <SeQuant/core/optimize/optimize.hpp>
 #include <SeQuant/core/optimize/single_term.hpp>
 #include <SeQuant/core/optimize/sum.hpp>
+#include <SeQuant/core/runtime.hpp>
 #include <SeQuant/core/utility/indices.hpp>
 #include <SeQuant/core/utility/macros.hpp>
 
@@ -14,95 +16,184 @@
 #include <range/v3/iterator/basic_iterator.hpp>
 #include <range/v3/range/access.hpp>
 #include <range/v3/range/conversion.hpp>
-#include <range/v3/view/tail.hpp>
-#include <range/v3/view/transform.hpp>
-#include <range/v3/view/view.hpp>
+#include <range/v3/view/iota.hpp>
 
 #include <cstddef>
-#include <type_traits>
+#include <string>
+#include <string_view>
 #include <utility>
 
 namespace sequant {
 
-namespace opt {
-
-///
-/// \param expr  Expression to be optimized.
-/// \param idxsz An invocable object that maps an Index object to size.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-/// \return Optimized expression for lower evaluation cost.
-template <typename IdxToSize, typename = std::enable_if_t<std::is_invocable_r_v<
-                                  size_t, IdxToSize, const Index&>>>
-ExprPtr optimize(ExprPtr const& expr, IdxToSize const& idx2size,
-                 bool reorder_sum) {
-  using ranges::views::transform;
+namespace {
+
+index_to_extent_t default_idx_to_size() {
+  return [](Index const& ix) { return ix.space().approximate_size(); };
+}
+
+/// Optimize a Product that contains only Tensor and scalar factors.
+ExprPtr opt_pure_product(Product const& prod, index_to_extent_t const& idx2size,
+                         OptFor opt_for) {
+  if (opt_for == OptFor::Flops)
+    return opt::single_term_opt<OptFor::Flops>(prod, idx2size);
+  SEQUANT_ASSERT(opt_for == OptFor::Memsize);
+  return opt::single_term_opt<OptFor::Memsize>(prod, idx2size);
+}
+
+/// Deliberately non-identifier label prefix used to stand in for non-Tensor,
+/// non-scalar factors during single-term optimization. Chosen so that no
+/// user-defined tensor label can collide with it.
+inline constexpr std::wstring_view placeholder_label_prefix = L"@__opt_";
+
+/// Optimize a Product that contains some non-Tensor, non-scalar factors by
+/// substituting placeholder tensors with target indices, optimizing the
+/// resulting tensor-only product, then swapping the originals back in.
+ExprPtr opt_mixed_product(Product const& prod,
+                          index_to_extent_t const& idx2size, OptFor opt_for) {
+  container::svector<ExprPtr> non_tensors(prod.size());
+  container::svector<ExprPtr> new_factors;
+  new_factors.reserve(prod.size());
+
+  for (std::size_t i = 0; i < prod.size(); ++i) {
+    auto&& f = prod.factor(i);
+    if (f->is<Tensor>() || f->is_scalar()) {
+      new_factors.emplace_back(f);
+    } else {
+      non_tensors[i] = f;
+      auto target_idxs = get_unique_indices(f);
+      new_factors.emplace_back(ex<Tensor>(
+          std::wstring(placeholder_label_prefix) + std::to_wstring(i),
+          bra(target_idxs.bra), ket(target_idxs.ket), aux(target_idxs.aux)));
+    }
+  }
+
+  auto result = opt_pure_product(
+      Product{prod.scalar(), new_factors, Product::Flatten::No}, idx2size,
+      opt_for);
+
+  auto replacer = [&non_tensors](ExprPtr& out) {
+    if (!out->is<Tensor>()) return;
+    auto label = out->as<Tensor>().label();
+    if (!label.starts_with(placeholder_label_prefix)) return;
+
+    // The placeholder prefix is internal; anything carrying it must have been
+    // emitted by this function, with a pure-decimal suffix indexing
+    // non_tensors. Any deviation is a programming error.
+    auto suffix_view = label.substr(placeholder_label_prefix.size());
+    SEQUANT_ASSERT(!suffix_view.empty());
+    std::size_t suffix = 0;
+    for (wchar_t c : suffix_view) {
+      SEQUANT_ASSERT(c >= L'0' && c <= L'9');
+      suffix = suffix * 10 + static_cast<std::size_t>(c - L'0');
+    }
+    SEQUANT_ASSERT(suffix < non_tensors.size() && non_tensors[suffix]);
+    out = non_tensors[suffix].clone();
+  };
+
+  result->visit(replacer, /* atoms_only = */ true);
+  return result;
+}
+
+/// Recursive workhorse. \p parallel_outer controls whether the (single)
+/// outermost Sum's summands are processed in parallel; nested recursive
+/// calls always run sequentially to avoid `sequant::for_each` oversubscription.
+ExprPtr optimize_impl(ExprPtr const& expr, index_to_extent_t const& idx2size,
+                      OptFor opt_for, bool reorder, bool parallel_outer) {
   if (expr->is<Product>()) {
-    if (ranges::all_of(*expr, [](auto&& x) {
-          return x->template is<Tensor>() || x->is_scalar();
-        }))
-      return opt::single_term_opt(expr->as<Product>(), idx2size);
-    else {
-      auto const& prod = expr->as<Product>();
-
-      container::svector<ExprPtr> non_tensors(prod.size());
-      container::svector<ExprPtr> new_factors;
-
-      for (auto i = 0; i < prod.size(); ++i) {
-        auto&& f = prod.factor(i);
-        if (f->is<Tensor>() || f->is_scalar())
-          new_factors.emplace_back(f);
-        else {
-          non_tensors[i] = f;
-          auto target_idxs = get_unique_indices(f);
-          new_factors.emplace_back(
-              ex<Tensor>(L"I_" + std::to_wstring(i), bra(target_idxs.bra),
-                         ket(target_idxs.ket), aux(target_idxs.aux)));
-        }
-      }
-
-      auto result = opt::single_term_opt(
-          Product(prod.scalar(), new_factors, Product::Flatten::No), idx2size);
-
-      auto replacer = [&non_tensors](ExprPtr& out) {
-        if (!out->is<Tensor>()) return;
-        auto const& tnsr = out->as<Tensor>();
-        auto&& label = tnsr.label();
-        if (label.at(0) == L'I' && label.at(1) == L'_') {
-          size_t suffix = std::stoi(std::wstring(label.data() + 2));
-          out = non_tensors[suffix].clone();
-        }
-      };
-
-      result->visit(replacer, /* atoms_only = */ true);
-      return result;
+    auto const& prod = expr->as<Product>();
+    bool pure = ranges::all_of(prod, [](auto&& x) {
+      return x->template is<Tensor>() || x->is_scalar();
+    });
+    return pure ? opt_pure_product(prod, idx2size, opt_for)
+                : opt_mixed_product(prod, idx2size, opt_for);
+  }
+
+  if (expr->is<Sum>()) {
+    auto const& in_sum = expr->as<Sum>();
+    Sum::summands_type new_smands(in_sum.size());
+
+    auto do_term = [&](std::size_t i) {
+      new_smands[i] =
+          optimize_impl(in_sum.summand(i), idx2size, opt_for,
+                        /*reorder=*/false, /*parallel_outer=*/false);
+    };
+
+    // Thread-safety of the parallel branch rests on two invariants; do NOT
+    // break them without re-auditing:
+    //   1. Each task writes a distinct, pre-allocated new_smands[i] slot, and
+    //      the work below (single_term_opt ->
+    //      TensorNetwork::canonicalize_slots) operates on per-task *clones* of
+    //      the input tensors. The lazily populated `mutable` caches on
+    //      Expr/Index (hash_value_, label_, ...) are unsynchronized, so
+    //      concurrent work must never read/write them on a shared (non-cloned)
+    //      node. Index comparison touches only immutable members, so building
+    //      index sets over shared indices is safe.
+    //   2. The binarize() pass below DOES read Index::label() (a lazy cache
+    //      write) on the optimized summands, so it is run *sequentially, after*
+    //      for_each() has joined -- never inside do_term().
+    // The default Context and cardinal_tensor_labels must also be configured
+    // before entering here (their writes are unsynchronized unless
+    // SEQUANT_CONTEXT_MANIPULATION_THREADSAFE); optimize() only reads them.
+    if (parallel_outer && in_sum.size() > 1) {
+      auto indices = ranges::views::iota(std::size_t{0}, in_sum.size());
+      sequant::for_each(indices, do_term);
+    } else {
+      for (std::size_t i = 0; i < in_sum.size(); ++i) do_term(i);
     }
-  } else if (expr->is<Sum>()) {
-    auto smands = *expr | transform([&idx2size](auto&& s) {
-      return optimize(s, idx2size, /* reorder_sum= */ false);
-    }) | ranges::to_vector;
-    auto sum = Sum{smands.begin(), smands.end()};
-    return reorder_sum ? ex<Sum>(opt::reorder(sum)) : ex<Sum>(std::move(sum));
-  } else
-    return expr->clone();
+
+    Sum new_sum(std::move(new_smands), Sum::move_only_tag{});
+    if (!reorder) return ex<Sum>(std::move(new_sum));
+
+    // Binarize once per optimized summand and hand the nodes to reorder()
+    // so they aren't re-built inside clusters(). NOTE: this runs sequentially
+    // by design -- see invariant (2) above.
+    container::vector<FullBinaryNode<EvalExpr>> nodes;
+    nodes.reserve(new_sum.size());
+    for (auto const& s : new_sum.summands()) nodes.push_back(binarize(s));
+    return ex<Sum>(opt::reorder(new_sum, nodes));
+  }
+
+  return expr->clone();
 }
 
-}  // namespace opt
+}  // namespace
+
+ExprPtr optimize(ExprPtr const& expr, OptimizeOptions opts) {
+  auto idx2size = opts.idx_to_extent ? std::move(opts.idx_to_extent)
+                                     : default_idx_to_size();
+  return optimize_impl(expr, idx2size, opts.opt_for,
+                       opts.reorder == ReorderSum::Reorder,
+                       /*parallel_outer=*/true);
+}
+
+ResultExpr& optimize(ResultExpr& expr, OptimizeOptions opts) {
+  expr.expression() = optimize(expr.expression(), std::move(opts));
+  return expr;
+}
+
+ResultExpr& optimize(ResultExpr&& expr, OptimizeOptions opts) {
+  return optimize(expr, std::move(opts));
+}
+
+// backwards compatibility overloads
+
+namespace {
+inline OptimizeOptions compatibility_opts(bool reorder_sum) {
+  return OptimizeOptions{.reorder = reorder_sum ? ReorderSum::Reorder
+                                                : ReorderSum::NoReorder};
+}
+}  // namespace
 
 ExprPtr optimize(ExprPtr const& expr, bool reorder_sum) {
-  return opt::optimize(
-      expr, [](Index const& ix) { return ix.space().approximate_size(); },
-      reorder_sum);
+  return optimize(expr, compatibility_opts(reorder_sum));
 }
 
 ResultExpr& optimize(ResultExpr& expr, bool reorder_sum) {
-  expr.expression() = optimize(expr.expression(), reorder_sum);
-
-  return expr;
+  return optimize(expr, compatibility_opts(reorder_sum));
 }
 
 ResultExpr& optimize(ResultExpr&& expr, bool reorder_sum) {
-  return optimize(expr, reorder_sum);
+  return optimize(std::move(expr), compatibility_opts(reorder_sum));
 }
 
 }  // namespace sequant
diff --git a/SeQuant/core/optimize/optimize.hpp b/SeQuant/core/optimize/optimize.hpp
index 77d7571409..caa6952d44 100644
--- a/SeQuant/core/optimize/optimize.hpp
+++ b/SeQuant/core/optimize/optimize.hpp
@@ -2,39 +2,53 @@
 #define SEQUANT_OPTIMIZE_OPTIMIZE_HPP
 
 #include <SeQuant/core/expr_fwd.hpp>
+#include <SeQuant/core/optimize/options.hpp>
 
 namespace sequant {
 
-///
-/// Optimize the expression using IndexSpace::approximate_size() for reference
-/// index extent.
+/// Optimize the expression for lower evaluation cost.
 ///
 /// \param expr  Expression to be optimized.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-///                    True by default.
-/// \return Optimized expression for lower evaluation cost.
-ExprPtr optimize(ExprPtr const& expr, bool reorder_sum = true);
-
-/// Optimize the expression using IndexSpace::approximate_size() for reference
-/// index extent.
+/// \param opts  Optimization parameters; see \c OptimizeOptions. By default:
+///              the cost metric is flop count, index extents are taken from
+///              \c IndexSpace::approximate_size(), and the summands of a sum
+///              are reordered to cluster terms that share intermediates.
+/// \return Optimized expression.
+ExprPtr optimize(ExprPtr const& expr, OptimizeOptions opts = {});
+
+/// \copydoc optimize(ExprPtr const&, OptimizeOptions)
+ResultExpr& optimize(ResultExpr& expr, OptimizeOptions opts = {});
+
+/// \copydoc optimize(ExprPtr const&, OptimizeOptions)
+[[nodiscard]] ResultExpr& optimize(ResultExpr&& expr,
+                                   OptimizeOptions opts = {});
+
+// Overloads for backwards compatibility
+
+/// \deprecated Use the \c OptimizeOptions overload instead.
 ///
-/// \param expr  Expression to be optimized.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-///                    True by default.
-/// \return Optimized expression for lower evaluation cost.
-ResultExpr& optimize(ResultExpr& expr, bool reorder_sum = true);
-
-/// Optimize the expression using IndexSpace::approximate_size() for reference
-/// index extent.
+/// Equivalent to calling the primary overload with default \c OptimizeOptions
+/// and \c OptimizeOptions::reorder_sum set to \p reorder_sum.
 ///
-/// \param expr  Expression to be optimized.
-/// \param reorder_sum If true, the summands are reordered so that terms with
-///                    common sub-expressions appear closer to each other.
-///                    True by default.
-/// \return Optimized expression for lower evaluation cost.
-[[nodiscard]] ResultExpr& optimize(ResultExpr&& expr, bool reorder_sum = true);
+/// \param expr         Expression to be optimized.
+/// \param reorder_sum  If true, reorder the summands of a sum to cluster terms
+///                     that share intermediates.
+/// \return Optimized expression.
+[[deprecated(
+    "use the OptimizeOptions"
+    " overload of optimize() instead")]] ExprPtr
+optimize(ExprPtr const& expr, bool reorder_sum);
+
+/// \copydoc optimize(ExprPtr const&, bool)
+[[deprecated(
+    "use the OptimizeOptions"
+    " overload of optimize() instead")]] ResultExpr&
+optimize(ResultExpr& expr, bool reorder_sum);
+
+/// \copydoc optimize(ExprPtr const&, bool)
+[[nodiscard, deprecated("use the OptimizeOptions"
+                        " overload of optimize() instead")]] ResultExpr&
+optimize(ResultExpr&& expr, bool reorder_sum);
 
 }  // namespace sequant
 
diff --git a/SeQuant/core/optimize/options.hpp b/SeQuant/core/optimize/options.hpp
new file mode 100644
index 0000000000..8c3669eccc
--- /dev/null
+++ b/SeQuant/core/optimize/options.hpp
@@ -0,0 +1,42 @@
+#ifndef SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP
+#define SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP
+
+#include <cstddef>
+#include <functional>
+
+namespace sequant {
+
+class Index;
+
+/// Cost metric to optimize for in single-term and top-level optimize routines.
+enum class OptFor { Flops, Memsize };
+
+/// Whether to reorder summands so terms with shared intermediates appear
+/// closer to each other.
+enum class ReorderSum { Reorder, NoReorder };
+
+/// A type-erased provider mapping an Index to its extent. Used by the public
+/// optimize() API. Callers reaching for the templated opt::single_term_opt
+/// overloads (constrained by \ref opt::has_index_extent) should pass the
+/// callable directly instead of wrapping it here — that keeps the cost
+/// function's call site inlineable, whereas a value of this alias goes
+/// through std::function's type-erased dispatch on every Index lookup.
+using index_to_extent_t = std::function<std::size_t(Index const&)>;
+
+/// Options that control behavior of \ref sequant::optimize.
+struct OptimizeOptions {
+  /// Cost metric to minimize.
+  OptFor opt_for = OptFor::Flops;
+
+  /// Whether to reorder summands so terms with shared intermediates appear
+  /// closer to each other.
+  ReorderSum reorder = ReorderSum::Reorder;
+
+  /// Caller-supplied Index to extent provider. If empty, defaults to
+  /// \c IndexSpace::approximate_size().
+  index_to_extent_t idx_to_extent = {};
+};
+
+}  // namespace sequant
+
+#endif  // SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP
diff --git a/SeQuant/core/optimize/single_term.hpp b/SeQuant/core/optimize/single_term.hpp
index 2d5b72ed15..636c8378cd 100644
--- a/SeQuant/core/optimize/single_term.hpp
+++ b/SeQuant/core/optimize/single_term.hpp
@@ -4,6 +4,7 @@
 #include <SeQuant/core/algorithm.hpp>
 #include <SeQuant/core/container.hpp>
 #include <SeQuant/core/expr.hpp>
+#include <SeQuant/core/optimize/options.hpp>
 #include <SeQuant/core/tensor_canonicalizer.hpp>
 #include <SeQuant/core/tensor_network.hpp>
 #include <SeQuant/core/utility/indices.hpp>
@@ -23,26 +24,72 @@
 #include <type_traits>
 
 namespace sequant::opt {
-///
-/// \param idxsz An invocable that returns size_t for Index argument.
-/// \param idxs Index objects.
-/// \return flops count
-///
+
 template <typename F>
 concept has_index_extent = std::is_invocable_r_v<size_t, F, Index const&>;
+
 namespace detail {
 
-auto constexpr flops_counter(has_index_extent auto&& ixex) {
-  return [ixex](meta::range_of<Index> auto const& lhs,
-                meta::range_of<Index> auto const& rhs,
-                meta::range_of<Index> auto const& result) {
+/// \brief Cost function returning the flop count of a binary tensor
+/// contraction.
+///
+/// The returned callable computes the product of extents over the union of
+/// indices on \c lhs, \c rhs, and \c result. A product of 1 (scalar
+/// contraction) is reported as zero flops.
+///
+/// \param ixex Invocable mapping an Index to its extent.
+/// \return A callable <tt>(lhs, rhs, result) -> double</tt> yielding the
+/// flop count of the contraction.
+auto flops_counter(has_index_extent auto&& ixex) {
+  return [ixex = std::forward<decltype(ixex)>(ixex)](
+             meta::range_of<Index> auto const& lhs,
+             meta::range_of<Index> auto const& rhs,
+             meta::range_of<Index> auto const& result) -> double {
+    using ranges::views::concat;
+    // <IndexSet> is required here: concatenating the three operands repeats
+    // every contracted/shared index, so it must be deduplicated before taking
+    // the extent product (cf. memsize_counter, which processes each operand
+    // separately and so can use the default vector container).
+    auto tot_idxs = tot_indices<IndexSet>(concat(lhs, rhs, result));
+    double total_flops = ranges::accumulate(
+        concat(tot_idxs.outer, tot_idxs.inner), 1., std::multiplies{}, ixex);
+    // A product of exactly 1. means the index set was empty (the accumulation
+    // init value), i.e. a scalar contraction => zero flops. Extents are
+    // integer-valued, so this equality is exact.
+    return total_flops == 1. ? 0. : total_flops;
+  };
+}
+
+/// \brief Cost function returning the total memory footprint of a binary
+/// tensor contraction.
+///
+/// The returned callable sums, over \c lhs, \c rhs, and \c result, the
+/// product of extents of each operand's indices. Operands whose extent
+/// product is 1 contribute zero.
+///
+/// \param ixex Invocable mapping an Index to its extent.
+/// \return A callable <tt>(lhs, rhs, result) -> double</tt> yielding the
+/// summed memory size of the three operands.
+auto memsize_counter(has_index_extent auto&& ixex) {
+  return [ixex = std::forward<decltype(ixex)>(ixex)](
+             meta::range_of<Index> auto const& lhs,
+             meta::range_of<Index> auto const& rhs,
+             meta::range_of<Index> auto const& result) -> double {
     using ranges::views::concat;
-    auto tot_idxs = tot_indices<container::set<Index, Index::LabelCompare>>(
-        concat(lhs, rhs, result));
-    double ops = 1.;
-    for (auto&& idx : concat(tot_idxs.outer, tot_idxs.inner)) ops *= ixex(idx);
-    // ops == 1 implies zero flops
-    return ops == 1. ? 0. : ops;
+    double total_mem{0};
+    // Each operand is sized independently, so the default (vector) container of
+    // tot_indices suffices -- a single operand's index list has no duplicates,
+    // unlike the concatenated set flops_counter must dedup.
+    for (auto&& tot_idxs :
+         {tot_indices(lhs), tot_indices(rhs), tot_indices(result)}) {
+      double mem = ranges::accumulate(concat(tot_idxs.outer, tot_idxs.inner),
+                                      1., std::multiplies{}, ixex);
+      // mem == 1. means this operand had no indices (the accumulation init
+      // value), i.e. a scalar; it contributes no memory. Same exact-equality
+      // convention as flops_counter above.
+      if (mem != 1.) total_mem += mem;
+    }
+    return total_mem;
   };
 }
 
@@ -54,8 +101,8 @@ struct OptRes {
   /// Free indices remaining upon evaluation
   IndexSet indices;
 
-  /// The flops count of evaluation
-  double flops;
+  /// The operations count of evaluation
+  double ops;
 
   /// The evaluation sequence
   EvalSequence sequence;
@@ -83,6 +130,84 @@ struct SubNetEqual {
   }
 };
 
+/// \brief Seeds the DP table with per-subset open indices and singleton/empty
+/// ops counts.
+///
+/// For each subset \c i of the input tensors, computes the indices that remain
+/// open after evaluating that subset and stores them in \c results[i].indices.
+/// Initializes \c results[i].ops to 0 for empty and singleton subsets, and
+/// to \c max as a sentinel for subsets that will later be filled in by the DP.
+/// Singleton subsets also get their one-element \c sequence pre-populated.
+template <typename TIdxs>
+void init_results(TensorNetwork const& network, TIdxs const& tidxs,
+                  container::vector<OptRes>& results) {
+  using IndexContainer = decltype(OptRes::indices);
+  auto tensor_indices = network.tensors()          //
+                        | ranges::views::indirect  //
+                        | ranges::views::transform(slots);
+  auto imed_indices = subset_target_indices(tensor_indices, tidxs);
+  SEQUANT_ASSERT(ranges::distance(imed_indices) == ranges::distance(results));
+  for (size_t i = 0; i < results.size(); ++i) {
+    results[i].indices =
+        imed_indices[i] | ranges::views::move | ranges::to<IndexContainer>;
+    results[i].ops = std::popcount(i) > 1
+                         ? std::numeric_limits<decltype(OptRes::ops)>::max()
+                         : 0;
+    if (std::popcount(i) == 1)
+      results[i].sequence.emplace_back(std::countr_zero(i));
+    // else results[i].sequence is left uninitialized
+  }
+}
+
+struct SubnetMetadata {
+  /// meta_ids[n] is the canonical-subnet id of subset n, or
+  /// numeric_limits<size_t>::max() for subsets with popcount < 2.
+  container::vector<size_t> meta_ids;
+  /// Cost of evaluating one representative of each canonical subnet id,
+  /// indexed by id. Populated lazily during the DP.
+  container::vector<double> unique_meta_costs;
+};
+
+/// \brief Precomputes canonical-subnet identifiers for every subset of size
+/// >= 2 so that structurally equivalent subnetworks share a CSE id.
+///
+/// Builds a `TensorNetwork` for each subset, canonicalizes it, and assigns a
+/// dense integer id to each distinct canonical form. The returned
+/// `unique_meta_costs` is sized to the number of distinct ids and zero-filled;
+/// it is populated during the DP as each canonical subnet's optimal cost
+/// becomes known.
+///
+/// Side effect: `results[n].indices` may be reordered by `canonicalize_slots`.
+inline SubnetMetadata build_subnet_metadata(
+    TensorNetwork const& network, container::vector<OptRes>& results) {
+  SubnetMetadata out;
+  // Use max as sentinel for entries with popcount < 2 (singletons/empty),
+  // which are skipped below and never assigned a real meta ID.
+  out.meta_ids.resize(results.size(), std::numeric_limits<size_t>::max());
+  container::unordered_map<TensorNetwork::SlotCanonicalizationMetadata, size_t,
+                           SubNetHash, SubNetEqual>
+      meta_to_id;
+
+  for (size_t n = 0; n < results.size(); ++n) {
+    if (std::popcount(n) < 2) continue;
+    auto ts = bits::on_bits_index(n) | bits::sieve(network.tensors());
+    container::vector<ExprPtr> ts_expr;
+    for (auto&& t : ts)
+      ts_expr.emplace_back(std::dynamic_pointer_cast<Tensor>(t)->clone());
+
+    auto tn = TensorNetwork{ts_expr};
+    auto meta = tn.canonicalize_slots(
+        TensorCanonicalizer::cardinal_tensor_labels(), &results[n].indices);
+
+    auto [it, inserted] = meta_to_id.try_emplace(std::move(meta), 0);
+    if (inserted) it->second = meta_to_id.size() - 1;
+
+    out.meta_ids[n] = it->second;
+  }
+  out.unique_meta_costs.resize(meta_to_id.size(), 0.0);
+  return out;
+}
+
 /// \brief Finds the optimal evaluation sequence for a single-term tensor
 /// contraction.
 ///
@@ -130,34 +255,12 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network,
                                   meta::range_of<Index> auto const& tidxs,
                                   CostFn&& cost_fn, bool subnet_cse) {
   using ranges::views::concat;
-  using ranges::views::indirect;
-  using ranges::views::transform;
-  using IndexContainer = decltype(OptRes::indices);
   auto const nt = network.tensors().size();
   if (nt == 1) return EvalSequence{0};
   if (nt == 2) return EvalSequence{0, 1, -1};
 
   container::vector<OptRes> results((size_t{1} << nt));
-
-  // initialize the intermediate results
-  {
-    auto tensor_indices = network.tensors()  //
-                          | indirect         //
-                          | transform(slots);
-    auto imed_indices = subset_target_indices(tensor_indices, tidxs);
-    SEQUANT_ASSERT(ranges::distance(imed_indices) == ranges::distance(results));
-    for (size_t i = 0; i < results.size(); ++i) {
-      results[i].indices =
-          imed_indices[i] | ranges::views::move | ranges::to<IndexContainer>;
-      results[i].flops =
-          std::popcount(i) > 1
-              ? std::numeric_limits<decltype(OptRes::flops)>::max()
-              : 0;
-      if (std::popcount(i) == 1)
-        results[i].sequence.emplace_back(std::countr_zero(i));
-      // else results[i].sequence is left uninitialized
-    }
-  }
+  init_results(network, tidxs, results);
 
   // precompute all subnet_meta if subnet_cse is true
   // Note: the O(2^n) cost is bounded in practice — subset_target_indices above
@@ -165,37 +268,15 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network,
   container::vector<size_t> meta_ids;
   container::vector<double> unique_meta_costs;
   if (subnet_cse) {
-    // Use max as sentinel for entries with popcount < 2 (singletons/empty),
-    // which are skipped below and never assigned a real meta ID.
-    meta_ids.resize(results.size(), std::numeric_limits<size_t>::max());
-    container::unordered_map<TensorNetwork::SlotCanonicalizationMetadata,
-                             size_t, SubNetHash, SubNetEqual>
-        meta_to_id;
-
-    for (size_t n = 0; n < results.size(); ++n) {
-      if (std::popcount(n) < 2) continue;
-      auto ts = bits::on_bits_index(n) | bits::sieve(network.tensors());
-      container::vector<ExprPtr> ts_expr;
-      for (auto&& t : ts) {
-        ts_expr.emplace_back(std::dynamic_pointer_cast<Tensor>(t)->clone());
-      }
-      auto tn = TensorNetwork{ts_expr};
-      auto meta = tn.canonicalize_slots(
-          TensorCanonicalizer::cardinal_tensor_labels(), &results[n].indices);
-
-      auto [it, inserted] = meta_to_id.try_emplace(std::move(meta), 0);
-      if (inserted) {
-        it->second = meta_to_id.size() - 1;
-      }
-      meta_ids[n] = it->second;
-    }
-    unique_meta_costs.resize(meta_to_id.size(), 0.0);
+    auto md = build_subnet_metadata(network, results);
+    meta_ids = std::move(md.meta_ids);
+    unique_meta_costs = std::move(md.unique_meta_costs);
   }
 
   // find the optimal evaluation sequence
   for (size_t n = 0; n < results.size(); ++n) {
     if (std::popcount(n) < 2) continue;
-    for (auto& curr_cost = results[n].flops;
+    for (auto& curr_cost = results[n].ops;
          auto&& [lp, rp] : bits::bipartitions(n)) {
       // do nothing with the trivial bipartition
       // i.e. one subset is the empty set and the other full
@@ -219,7 +300,7 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network,
         new_cost = cost_fn(results[lp].indices,  //
                            results[rp].indices,  //
                            results[n].indices)   //
-                   + results[lp].flops + results[rp].flops;
+                   + results[lp].ops + results[rp].ops;
       }
 
       if (new_cost <= curr_cost) {
@@ -259,34 +340,43 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network,
 }
 
 ///
+/// \tparam OptFor Cost metric to optimize for (Flops or Memsize).
 /// \tparam IdxToSz
 /// \param network A TensorNetwork object.
 /// \param idxsz An invocable on Index, that maps Index to its dimension.
 /// \param subnet_cse Whether to recognize equivalent subnetworks to try
 /// minimizing the ops counts.
-/// \return Optimal evaluation sequence that
-/// minimizes flops. If there are
-///         equivalent optimal sequences then the result is the one that keeps
-///         the order of tensors in the network as original as possible.
+/// \return Optimal evaluation sequence under the chosen cost metric. If there
+///         are equivalent optimal sequences then the result is the one that
+///         keeps the order of tensors in the network as original as possible.
 ///
-template <has_index_extent IdxToSz>
+template <OptFor Metric, has_index_extent IdxToSz>
 EvalSequence single_term_opt(TensorNetwork const& network, IdxToSz&& idxsz,
                              bool subnet_cse) {
-  auto cost_fn = flops_counter(std::forward<IdxToSz>(idxsz));
   decltype(OptRes::indices) tidxs{};
-  return single_term_opt_impl(network, tidxs, cost_fn, subnet_cse);
+  if constexpr (Metric == OptFor::Flops) {
+    auto cost_fn = flops_counter(std::forward<IdxToSz>(idxsz));
+    return single_term_opt_impl(network, tidxs, cost_fn, subnet_cse);
+  } else {
+    static_assert(Metric == OptFor::Memsize,
+                  "Only Flops and Memsize OptFor supported.");
+    auto cost_fn = memsize_counter(std::forward<IdxToSz>(idxsz));
+    return single_term_opt_impl(network, tidxs, cost_fn, subnet_cse);
+  }
 }
 
 }  // namespace detail
 
 ///
+/// \tparam Metric Cost metric to optimize for (Flops by default; Memsize
+///         minimizes total operand memory rather than flops).
 /// \param prod  Product to be optimized.
 /// \param idxsz An invocable object that maps an Index object to size.
 /// \return Parenthesized product expression.
 ///
 /// @note @c prod is assumed to consist of only Tensor expressions
 ///
-template <has_index_extent IdxToSz>
+template <OptFor Metric = OptFor::Flops, has_index_extent IdxToSz>
 ExprPtr single_term_opt(Product const& prod, IdxToSz&& idxsz,
                         bool subnet_cse = false) {
   using ranges::views::filter;
@@ -297,8 +387,8 @@ ExprPtr single_term_opt(Product const& prod, IdxToSz&& idxsz,
                                prod.factors().end(), Product::Flatten::No});
   auto const tensors =
       prod | filter(&ExprPtr::template is<Tensor>) | ranges::to_vector;
-  auto seq = detail::single_term_opt(TensorNetwork{tensors},
-                                     std::forward<IdxToSz>(idxsz), subnet_cse);
+  auto seq = detail::single_term_opt<Metric>(
+      TensorNetwork{tensors}, std::forward<IdxToSz>(idxsz), subnet_cse);
   auto result = container::svector<ExprPtr>{};
   for (auto i : seq)
     if (i == -1) {
diff --git a/SeQuant/core/optimize/sum.cpp b/SeQuant/core/optimize/sum.cpp
index 048720b3da..d0edd09745 100644
--- a/SeQuant/core/optimize/sum.cpp
+++ b/SeQuant/core/optimize/sum.cpp
@@ -1,7 +1,10 @@
+#include <SeQuant/core/binary_node.hpp>
 #include <SeQuant/core/container.hpp>
 #include <SeQuant/core/eval/eval_expr.hpp>
 #include <SeQuant/core/expr.hpp>
+#include <SeQuant/core/hash.hpp>
 #include <SeQuant/core/optimize/sum.hpp>
+#include <SeQuant/core/utility/macros.hpp>
 
 #include <range/v3/range/operations.hpp>
 #include <range/v3/view/map.hpp>
@@ -24,7 +27,9 @@ bool has_only_single_atom(const ExprPtr& term) {
   return term->size() == 1 && has_only_single_atom(*term->begin());
 }
 
-container::vector<container::vector<size_t>> clusters(Sum const& expr) {
+container::vector<container::vector<size_t>> clusters(
+    Sum const& expr, container::vector<FullBinaryNode<EvalExpr>> const& nodes) {
+  SEQUANT_ASSERT(nodes.size() == expr.size());
   using ranges::views::tail;
   using ranges::views::transform;
   using hash_t = size_t;
@@ -44,7 +49,7 @@ container::vector<container::vector<size_t>> clusters(Sum const& expr) {
     };
 
     for (auto const& term : expr) {
-      auto const node = binarize(term);
+      auto const& node = nodes[pos];
       if (has_only_single_atom(term)) {
         visitor(node);
       } else {
@@ -93,14 +98,29 @@ container::vector<container::vector<size_t>> clusters(Sum const& expr) {
   return result;
 }
 
-Sum reorder(Sum const& sum) {
+container::vector<container::vector<size_t>> clusters(Sum const& expr) {
+  container::vector<FullBinaryNode<EvalExpr>> nodes;
+  nodes.reserve(expr.size());
+  for (auto const& term : expr) nodes.push_back(binarize(term));
+  return clusters(expr, nodes);
+}
+
+Sum reorder(Sum const& sum,
+            container::vector<FullBinaryNode<EvalExpr>> const& nodes) {
   Sum result;
 
-  for (auto const& clstr : clusters(sum))
+  for (auto const& clstr : clusters(sum, nodes))
     for (auto p : clstr) result.append(sum.at(p));
 
   SEQUANT_ASSERT(result.size() == sum.size());
   return result;
 }
 
+Sum reorder(Sum const& sum) {
+  container::vector<FullBinaryNode<EvalExpr>> nodes;
+  nodes.reserve(sum.size());
+  for (auto const& term : sum) nodes.push_back(binarize(term));
+  return reorder(sum, nodes);
+}
+
 }  // namespace sequant::opt
diff --git a/SeQuant/core/optimize/sum.hpp b/SeQuant/core/optimize/sum.hpp
index 70b548f0b0..b45ecf67ad 100644
--- a/SeQuant/core/optimize/sum.hpp
+++ b/SeQuant/core/optimize/sum.hpp
@@ -1,7 +1,9 @@
 #ifndef SEQUANT_CORE_OPTIMIZE_SUM_HPP
 #define SEQUANT_CORE_OPTIMIZE_SUM_HPP
 
+#include <SeQuant/core/binary_node.hpp>
 #include <SeQuant/core/container.hpp>
+#include <SeQuant/core/eval/eval_expr.hpp>
 #include <SeQuant/core/expr_fwd.hpp>
 
 namespace sequant::opt {
@@ -16,6 +18,12 @@ namespace sequant::opt {
 ///
 container::vector<container::vector<size_t>> clusters(Sum const& expr);
 
+/// Same as \ref clusters(Sum const&) but uses precomputed eval nodes for the
+/// summands, avoiding an internal \c binarize pass. \p nodes[i] must be the
+/// binarized form of \c expr.at(i).
+container::vector<container::vector<size_t>> clusters(
+    Sum const& expr, container::vector<FullBinaryNode<EvalExpr>> const& nodes);
+
 ///
 /// \brief Reorder summands so that terms having common intermediates appear
 ///        closer.
@@ -25,6 +33,11 @@ container::vector<container::vector<size_t>> clusters(Sum const& expr);
 ///
 Sum reorder(Sum const& sum);
 
+/// Same as \ref reorder(Sum const&) but uses precomputed eval nodes for the
+/// summands. \p nodes[i] must be the binarized form of \c sum.at(i).
+Sum reorder(Sum const& sum,
+            container::vector<FullBinaryNode<EvalExpr>> const& nodes);
+
 }  // namespace sequant::opt
 
 #endif  // SEQUANT_CORE_OPTIMIZE_SUM_HPP
diff --git a/tests/unit/test_optimize.cpp b/tests/unit/test_optimize.cpp
index 9b4e1f232f..530f961deb 100644
--- a/tests/unit/test_optimize.cpp
+++ b/tests/unit/test_optimize.cpp
@@ -12,6 +12,7 @@
 #include <SeQuant/core/optimize/common_subexpression_elimination.hpp>
 #include <SeQuant/core/optimize/optimize.hpp>
 #include <SeQuant/core/optimize/single_term.hpp>
+#include <SeQuant/core/runtime.hpp>
 #include <SeQuant/core/space.hpp>
 #include <SeQuant/domain/mbpt/convention.hpp>
 
@@ -27,6 +28,15 @@ sequant::ExprPtr extract(sequant::ExprPtr expr,
   return result;
 }
 
+// number of Tensor leaves in a (binarized) expression tree
+size_t count_tensor_leaves(sequant::ExprPtr const& expr) {
+  using namespace sequant;
+  size_t n = 0;
+  expr->visit([&n](auto const& x) { n += x->template is<Tensor>() ? 1 : 0; },
+              /*atoms_only=*/true);
+  return n;
+}
+
 TEST_CASE("optimize", "[optimize]") {
   using namespace sequant;
 
@@ -227,6 +237,63 @@ TEST_CASE("optimize", "[optimize]") {
 
       aux->approximate_size(aux_sz);
     }
+
+    SECTION("OptimizeOptions: cost metric and reorder knobs") {
+      auto const prod = parse_expr_antisymm(
+          L"g_{i3,i4}^{a3,a4} t_{a1,a2}^{i3,i4} t_{a3,a4}^{i1,i2}");
+
+      // both metrics must binarize the 3-tensor product into a binary tree:
+      // a 2-factor top product whose leaves are the 3 original tensors
+      for (auto opt_for : {OptFor::Flops, OptFor::Memsize}) {
+        CAPTURE(static_cast<int>(opt_for));
+        auto res = optimize(prod, OptimizeOptions{.opt_for = opt_for});
+        REQUIRE(res->is<Product>());
+        REQUIRE(res->as<Product>().factors().size() == 2);
+        REQUIRE(count_tensor_leaves(res) == 3);
+      }
+
+      // reorder knob: a two-summand sum is optimized either way, and the
+      // optimize() default (reorder) matches an explicit Reorder request
+      auto const sum = parse_expr_antisymm(
+          L"g_{i3,i4}^{a3,a4} t_{a1,a2}^{i3,i4} t_{a3,a4}^{i1,i2}"
+          L" + g_{i3,i4}^{a3,a4} t_{a3,a4}^{i1,i2} t_{a1}^{i3} t_{a2}^{i4}");
+      REQUIRE(sum->is<Sum>());
+
+      auto no_reorder =
+          optimize(sum, OptimizeOptions{.reorder = ReorderSum::NoReorder});
+      auto reorder =
+          optimize(sum, OptimizeOptions{.reorder = ReorderSum::Reorder});
+      REQUIRE(no_reorder->is<Sum>());
+      REQUIRE(reorder->is<Sum>());
+      REQUIRE(no_reorder->as<Sum>().size() == sum->as<Sum>().size());
+      REQUIRE(reorder->as<Sum>().size() == sum->as<Sum>().size());
+      // default options == explicit Reorder
+      REQUIRE(*optimize(sum) == *reorder);
+    }
+
+    SECTION("Parallel optimization of summands matches sequential") {
+      // exercise optimize_impl(..., parallel_outer=true): a multi-summand sum
+      // optimized concurrently must yield the same result as single-threaded.
+      auto const sum = parse_expr_antisymm(
+          L"g_{i3,i4}^{a3,a4} t_{a1,a2}^{i3,i4} t_{a3,a4}^{i1,i2}"
+          L" + g_{i3,i4}^{a3,a4} t_{a3,a4}^{i1,i2} t_{a1}^{i3} t_{a2}^{i4}"
+          L" + g_{i3,i4}^{a3,a4} t_{a1}^{i3} t_{a2}^{i4} t_{a3,a4}^{i1,i2}");
+      REQUIRE(sum->is<Sum>());
+      REQUIRE(sum->as<Sum>().size() > 1);
+
+      auto const nthreads_save = num_threads();
+      struct ThreadGuard {
+        int n;
+        ~ThreadGuard() { set_num_threads(n); }
+      } guard{nthreads_save};
+
+      set_num_threads(1);
+      auto const seq = optimize(sum);
+      set_num_threads(4);
+      auto const par = optimize(sum);
+
+      REQUIRE(*seq == *par);
+    }
   }
 
   SECTION("CSE") {