diff --git a/CMakeLists.txt b/CMakeLists.txt index d5da7c30ab..384717c9d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -448,6 +448,7 @@ set(SeQuant_optimize_src SeQuant/core/optimize/common_subexpression_elimination.hpp SeQuant/core/optimize/fusion.cpp SeQuant/core/optimize/fusion.hpp + SeQuant/core/optimize/options.hpp SeQuant/core/optimize/optimize.cpp SeQuant/core/optimize/optimize.hpp SeQuant/core/optimize/single_term.hpp diff --git a/SeQuant/core/optimize/optimize.cpp b/SeQuant/core/optimize/optimize.cpp index bb73493b88..aaeab5e4ba 100644 --- a/SeQuant/core/optimize/optimize.cpp +++ b/SeQuant/core/optimize/optimize.cpp @@ -4,9 +4,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -14,95 +16,184 @@ #include #include #include -#include -#include -#include +#include #include -#include +#include +#include #include namespace sequant { -namespace opt { - -/// -/// \param expr Expression to be optimized. -/// \param idxsz An invocable object that maps an Index object to size. -/// \param reorder_sum If true, the summands are reordered so that terms with -/// common sub-expressions appear closer to each other. -/// \return Optimized expression for lower evaluation cost. -template >> -ExprPtr optimize(ExprPtr const& expr, IdxToSize const& idx2size, - bool reorder_sum) { - using ranges::views::transform; +namespace { + +index_to_extent_t default_idx_to_size() { + return [](Index const& ix) { return ix.space().approximate_size(); }; +} + +/// Optimize a Product that contains only Tensor and scalar factors. +ExprPtr opt_pure_product(Product const& prod, index_to_extent_t const& idx2size, + OptFor opt_for) { + if (opt_for == OptFor::Flops) + return opt::single_term_opt(prod, idx2size); + SEQUANT_ASSERT(opt_for == OptFor::Memsize); + return opt::single_term_opt(prod, idx2size); +} + +/// Deliberately non-identifier label prefix used to stand in for non-Tensor, +/// non-scalar factors during single-term optimization. Chosen so that no +/// user-defined tensor label can collide with it. +inline constexpr std::wstring_view placeholder_label_prefix = L"@__opt_"; + +/// Optimize a Product that contains some non-Tensor, non-scalar factors by +/// substituting placeholder tensors with target indices, optimizing the +/// resulting tensor-only product, then swapping the originals back in. +ExprPtr opt_mixed_product(Product const& prod, + index_to_extent_t const& idx2size, OptFor opt_for) { + container::svector non_tensors(prod.size()); + container::svector new_factors; + new_factors.reserve(prod.size()); + + for (std::size_t i = 0; i < prod.size(); ++i) { + auto&& f = prod.factor(i); + if (f->is() || f->is_scalar()) { + new_factors.emplace_back(f); + } else { + non_tensors[i] = f; + auto target_idxs = get_unique_indices(f); + new_factors.emplace_back(ex( + std::wstring(placeholder_label_prefix) + std::to_wstring(i), + bra(target_idxs.bra), ket(target_idxs.ket), aux(target_idxs.aux))); + } + } + + auto result = opt_pure_product( + Product{prod.scalar(), new_factors, Product::Flatten::No}, idx2size, + opt_for); + + auto replacer = [&non_tensors](ExprPtr& out) { + if (!out->is()) return; + auto label = out->as().label(); + if (!label.starts_with(placeholder_label_prefix)) return; + + // The placeholder prefix is internal; anything carrying it must have been + // emitted by this function, with a pure-decimal suffix indexing + // non_tensors. Any deviation is a programming error. + auto suffix_view = label.substr(placeholder_label_prefix.size()); + SEQUANT_ASSERT(!suffix_view.empty()); + std::size_t suffix = 0; + for (wchar_t c : suffix_view) { + SEQUANT_ASSERT(c >= L'0' && c <= L'9'); + suffix = suffix * 10 + static_cast(c - L'0'); + } + SEQUANT_ASSERT(suffix < non_tensors.size() && non_tensors[suffix]); + out = non_tensors[suffix].clone(); + }; + + result->visit(replacer, /* atoms_only = */ true); + return result; +} + +/// Recursive workhorse. \p parallel_outer controls whether the (single) +/// outermost Sum's summands are processed in parallel; nested recursive +/// calls always run sequentially to avoid `sequant::for_each` oversubscription. +ExprPtr optimize_impl(ExprPtr const& expr, index_to_extent_t const& idx2size, + OptFor opt_for, bool reorder, bool parallel_outer) { if (expr->is()) { - if (ranges::all_of(*expr, [](auto&& x) { - return x->template is() || x->is_scalar(); - })) - return opt::single_term_opt(expr->as(), idx2size); - else { - auto const& prod = expr->as(); - - container::svector non_tensors(prod.size()); - container::svector new_factors; - - for (auto i = 0; i < prod.size(); ++i) { - auto&& f = prod.factor(i); - if (f->is() || f->is_scalar()) - new_factors.emplace_back(f); - else { - non_tensors[i] = f; - auto target_idxs = get_unique_indices(f); - new_factors.emplace_back( - ex(L"I_" + std::to_wstring(i), bra(target_idxs.bra), - ket(target_idxs.ket), aux(target_idxs.aux))); - } - } - - auto result = opt::single_term_opt( - Product(prod.scalar(), new_factors, Product::Flatten::No), idx2size); - - auto replacer = [&non_tensors](ExprPtr& out) { - if (!out->is()) return; - auto const& tnsr = out->as(); - auto&& label = tnsr.label(); - if (label.at(0) == L'I' && label.at(1) == L'_') { - size_t suffix = std::stoi(std::wstring(label.data() + 2)); - out = non_tensors[suffix].clone(); - } - }; - - result->visit(replacer, /* atoms_only = */ true); - return result; + auto const& prod = expr->as(); + bool pure = ranges::all_of(prod, [](auto&& x) { + return x->template is() || x->is_scalar(); + }); + return pure ? opt_pure_product(prod, idx2size, opt_for) + : opt_mixed_product(prod, idx2size, opt_for); + } + + if (expr->is()) { + auto const& in_sum = expr->as(); + Sum::summands_type new_smands(in_sum.size()); + + auto do_term = [&](std::size_t i) { + new_smands[i] = + optimize_impl(in_sum.summand(i), idx2size, opt_for, + /*reorder=*/false, /*parallel_outer=*/false); + }; + + // Thread-safety of the parallel branch rests on two invariants; do NOT + // break them without re-auditing: + // 1. Each task writes a distinct, pre-allocated new_smands[i] slot, and + // the work below (single_term_opt -> + // TensorNetwork::canonicalize_slots) operates on per-task *clones* of + // the input tensors. The lazily populated `mutable` caches on + // Expr/Index (hash_value_, label_, ...) are unsynchronized, so + // concurrent work must never read/write them on a shared (non-cloned) + // node. Index comparison touches only immutable members, so building + // index sets over shared indices is safe. + // 2. The binarize() pass below DOES read Index::label() (a lazy cache + // write) on the optimized summands, so it is run *sequentially, after* + // for_each() has joined -- never inside do_term(). + // The default Context and cardinal_tensor_labels must also be configured + // before entering here (their writes are unsynchronized unless + // SEQUANT_CONTEXT_MANIPULATION_THREADSAFE); optimize() only reads them. + if (parallel_outer && in_sum.size() > 1) { + auto indices = ranges::views::iota(std::size_t{0}, in_sum.size()); + sequant::for_each(indices, do_term); + } else { + for (std::size_t i = 0; i < in_sum.size(); ++i) do_term(i); } - } else if (expr->is()) { - auto smands = *expr | transform([&idx2size](auto&& s) { - return optimize(s, idx2size, /* reorder_sum= */ false); - }) | ranges::to_vector; - auto sum = Sum{smands.begin(), smands.end()}; - return reorder_sum ? ex(opt::reorder(sum)) : ex(std::move(sum)); - } else - return expr->clone(); + + Sum new_sum(std::move(new_smands), Sum::move_only_tag{}); + if (!reorder) return ex(std::move(new_sum)); + + // Binarize once per optimized summand and hand the nodes to reorder() + // so they aren't re-built inside clusters(). NOTE: this runs sequentially + // by design -- see invariant (2) above. + container::vector> nodes; + nodes.reserve(new_sum.size()); + for (auto const& s : new_sum.summands()) nodes.push_back(binarize(s)); + return ex(opt::reorder(new_sum, nodes)); + } + + return expr->clone(); } -} // namespace opt +} // namespace + +ExprPtr optimize(ExprPtr const& expr, OptimizeOptions opts) { + auto idx2size = opts.idx_to_extent ? std::move(opts.idx_to_extent) + : default_idx_to_size(); + return optimize_impl(expr, idx2size, opts.opt_for, + opts.reorder == ReorderSum::Reorder, + /*parallel_outer=*/true); +} + +ResultExpr& optimize(ResultExpr& expr, OptimizeOptions opts) { + expr.expression() = optimize(expr.expression(), std::move(opts)); + return expr; +} + +ResultExpr& optimize(ResultExpr&& expr, OptimizeOptions opts) { + return optimize(expr, std::move(opts)); +} + +// backwards compatibility overloads + +namespace { +inline OptimizeOptions compatibility_opts(bool reorder_sum) { + return OptimizeOptions{.reorder = reorder_sum ? ReorderSum::Reorder + : ReorderSum::NoReorder}; +} +} // namespace ExprPtr optimize(ExprPtr const& expr, bool reorder_sum) { - return opt::optimize( - expr, [](Index const& ix) { return ix.space().approximate_size(); }, - reorder_sum); + return optimize(expr, compatibility_opts(reorder_sum)); } ResultExpr& optimize(ResultExpr& expr, bool reorder_sum) { - expr.expression() = optimize(expr.expression(), reorder_sum); - - return expr; + return optimize(expr, compatibility_opts(reorder_sum)); } ResultExpr& optimize(ResultExpr&& expr, bool reorder_sum) { - return optimize(expr, reorder_sum); + return optimize(std::move(expr), compatibility_opts(reorder_sum)); } } // namespace sequant diff --git a/SeQuant/core/optimize/optimize.hpp b/SeQuant/core/optimize/optimize.hpp index 77d7571409..caa6952d44 100644 --- a/SeQuant/core/optimize/optimize.hpp +++ b/SeQuant/core/optimize/optimize.hpp @@ -2,39 +2,53 @@ #define SEQUANT_OPTIMIZE_OPTIMIZE_HPP #include +#include namespace sequant { -/// -/// Optimize the expression using IndexSpace::approximate_size() for reference -/// index extent. +/// Optimize the expression for lower evaluation cost. /// /// \param expr Expression to be optimized. -/// \param reorder_sum If true, the summands are reordered so that terms with -/// common sub-expressions appear closer to each other. -/// True by default. -/// \return Optimized expression for lower evaluation cost. -ExprPtr optimize(ExprPtr const& expr, bool reorder_sum = true); - -/// Optimize the expression using IndexSpace::approximate_size() for reference -/// index extent. +/// \param opts Optimization parameters; see \c OptimizeOptions. By default: +/// the cost metric is flop count, index extents are taken from +/// \c IndexSpace::approximate_size(), and the summands of a sum +/// are reordered to cluster terms that share intermediates. +/// \return Optimized expression. +ExprPtr optimize(ExprPtr const& expr, OptimizeOptions opts = {}); + +/// \copydoc optimize(ExprPtr const&, OptimizeOptions) +ResultExpr& optimize(ResultExpr& expr, OptimizeOptions opts = {}); + +/// \copydoc optimize(ExprPtr const&, OptimizeOptions) +[[nodiscard]] ResultExpr& optimize(ResultExpr&& expr, + OptimizeOptions opts = {}); + +// Overloads for backwards compatibility + +/// \deprecated Use the \c OptimizeOptions overload instead. /// -/// \param expr Expression to be optimized. -/// \param reorder_sum If true, the summands are reordered so that terms with -/// common sub-expressions appear closer to each other. -/// True by default. -/// \return Optimized expression for lower evaluation cost. -ResultExpr& optimize(ResultExpr& expr, bool reorder_sum = true); - -/// Optimize the expression using IndexSpace::approximate_size() for reference -/// index extent. +/// Equivalent to calling the primary overload with default \c OptimizeOptions +/// and \c OptimizeOptions::reorder_sum set to \p reorder_sum. /// -/// \param expr Expression to be optimized. -/// \param reorder_sum If true, the summands are reordered so that terms with -/// common sub-expressions appear closer to each other. -/// True by default. -/// \return Optimized expression for lower evaluation cost. -[[nodiscard]] ResultExpr& optimize(ResultExpr&& expr, bool reorder_sum = true); +/// \param expr Expression to be optimized. +/// \param reorder_sum If true, reorder the summands of a sum to cluster terms +/// that share intermediates. +/// \return Optimized expression. +[[deprecated( + "use the OptimizeOptions" + " overload of optimize() instead")]] ExprPtr +optimize(ExprPtr const& expr, bool reorder_sum); + +/// \copydoc optimize(ExprPtr const&, bool) +[[deprecated( + "use the OptimizeOptions" + " overload of optimize() instead")]] ResultExpr& +optimize(ResultExpr& expr, bool reorder_sum); + +/// \copydoc optimize(ExprPtr const&, bool) +[[nodiscard, deprecated("use the OptimizeOptions" + " overload of optimize() instead")]] ResultExpr& +optimize(ResultExpr&& expr, bool reorder_sum); } // namespace sequant diff --git a/SeQuant/core/optimize/options.hpp b/SeQuant/core/optimize/options.hpp new file mode 100644 index 0000000000..8c3669eccc --- /dev/null +++ b/SeQuant/core/optimize/options.hpp @@ -0,0 +1,42 @@ +#ifndef SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP +#define SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP + +#include +#include + +namespace sequant { + +class Index; + +/// Cost metric to optimize for in single-term and top-level optimize routines. +enum class OptFor { Flops, Memsize }; + +/// Whether to reorder summands so terms with shared intermediates appear +/// closer to each other. +enum class ReorderSum { Reorder, NoReorder }; + +/// A type-erased provider mapping an Index to its extent. Used by the public +/// optimize() API. Callers reaching for the templated opt::single_term_opt +/// overloads (constrained by \ref opt::has_index_extent) should pass the +/// callable directly instead of wrapping it here — that keeps the cost +/// function's call site inlineable, whereas a value of this alias goes +/// through std::function's type-erased dispatch on every Index lookup. +using index_to_extent_t = std::function; + +/// Options that control behavior of \ref sequant::optimize. +struct OptimizeOptions { + /// Cost metric to minimize. + OptFor opt_for = OptFor::Flops; + + /// Whether to reorder summands so terms with shared intermediates appear + /// closer to each other. + ReorderSum reorder = ReorderSum::Reorder; + + /// Caller-supplied Index to extent provider. If empty, defaults to + /// \c IndexSpace::approximate_size(). + index_to_extent_t idx_to_extent = {}; +}; + +} // namespace sequant + +#endif // SEQUANT_CORE_OPTIMIZE_OPTIONS_HPP diff --git a/SeQuant/core/optimize/single_term.hpp b/SeQuant/core/optimize/single_term.hpp index 2d5b72ed15..636c8378cd 100644 --- a/SeQuant/core/optimize/single_term.hpp +++ b/SeQuant/core/optimize/single_term.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -23,26 +24,72 @@ #include namespace sequant::opt { -/// -/// \param idxsz An invocable that returns size_t for Index argument. -/// \param idxs Index objects. -/// \return flops count -/// + template concept has_index_extent = std::is_invocable_r_v; + namespace detail { -auto constexpr flops_counter(has_index_extent auto&& ixex) { - return [ixex](meta::range_of auto const& lhs, - meta::range_of auto const& rhs, - meta::range_of auto const& result) { +/// \brief Cost function returning the flop count of a binary tensor +/// contraction. +/// +/// The returned callable computes the product of extents over the union of +/// indices on \c lhs, \c rhs, and \c result. A product of 1 (scalar +/// contraction) is reported as zero flops. +/// +/// \param ixex Invocable mapping an Index to its extent. +/// \return A callable (lhs, rhs, result) -> double yielding the +/// flop count of the contraction. +auto flops_counter(has_index_extent auto&& ixex) { + return [ixex = std::forward(ixex)]( + meta::range_of auto const& lhs, + meta::range_of auto const& rhs, + meta::range_of auto const& result) -> double { + using ranges::views::concat; + // is required here: concatenating the three operands repeats + // every contracted/shared index, so it must be deduplicated before taking + // the extent product (cf. memsize_counter, which processes each operand + // separately and so can use the default vector container). + auto tot_idxs = tot_indices(concat(lhs, rhs, result)); + double total_flops = ranges::accumulate( + concat(tot_idxs.outer, tot_idxs.inner), 1., std::multiplies{}, ixex); + // A product of exactly 1. means the index set was empty (the accumulation + // init value), i.e. a scalar contraction => zero flops. Extents are + // integer-valued, so this equality is exact. + return total_flops == 1. ? 0. : total_flops; + }; +} + +/// \brief Cost function returning the total memory footprint of a binary +/// tensor contraction. +/// +/// The returned callable sums, over \c lhs, \c rhs, and \c result, the +/// product of extents of each operand's indices. Operands whose extent +/// product is 1 contribute zero. +/// +/// \param ixex Invocable mapping an Index to its extent. +/// \return A callable (lhs, rhs, result) -> double yielding the +/// summed memory size of the three operands. +auto memsize_counter(has_index_extent auto&& ixex) { + return [ixex = std::forward(ixex)]( + meta::range_of auto const& lhs, + meta::range_of auto const& rhs, + meta::range_of auto const& result) -> double { using ranges::views::concat; - auto tot_idxs = tot_indices>( - concat(lhs, rhs, result)); - double ops = 1.; - for (auto&& idx : concat(tot_idxs.outer, tot_idxs.inner)) ops *= ixex(idx); - // ops == 1 implies zero flops - return ops == 1. ? 0. : ops; + double total_mem{0}; + // Each operand is sized independently, so the default (vector) container of + // tot_indices suffices -- a single operand's index list has no duplicates, + // unlike the concatenated set flops_counter must dedup. + for (auto&& tot_idxs : + {tot_indices(lhs), tot_indices(rhs), tot_indices(result)}) { + double mem = ranges::accumulate(concat(tot_idxs.outer, tot_idxs.inner), + 1., std::multiplies{}, ixex); + // mem == 1. means this operand had no indices (the accumulation init + // value), i.e. a scalar; it contributes no memory. Same exact-equality + // convention as flops_counter above. + if (mem != 1.) total_mem += mem; + } + return total_mem; }; } @@ -54,8 +101,8 @@ struct OptRes { /// Free indices remaining upon evaluation IndexSet indices; - /// The flops count of evaluation - double flops; + /// The operations count of evaluation + double ops; /// The evaluation sequence EvalSequence sequence; @@ -83,6 +130,84 @@ struct SubNetEqual { } }; +/// \brief Seeds the DP table with per-subset open indices and singleton/empty +/// ops counts. +/// +/// For each subset \c i of the input tensors, computes the indices that remain +/// open after evaluating that subset and stores them in \c results[i].indices. +/// Initializes \c results[i].ops to 0 for empty and singleton subsets, and +/// to \c max as a sentinel for subsets that will later be filled in by the DP. +/// Singleton subsets also get their one-element \c sequence pre-populated. +template +void init_results(TensorNetwork const& network, TIdxs const& tidxs, + container::vector& results) { + using IndexContainer = decltype(OptRes::indices); + auto tensor_indices = network.tensors() // + | ranges::views::indirect // + | ranges::views::transform(slots); + auto imed_indices = subset_target_indices(tensor_indices, tidxs); + SEQUANT_ASSERT(ranges::distance(imed_indices) == ranges::distance(results)); + for (size_t i = 0; i < results.size(); ++i) { + results[i].indices = + imed_indices[i] | ranges::views::move | ranges::to; + results[i].ops = std::popcount(i) > 1 + ? std::numeric_limits::max() + : 0; + if (std::popcount(i) == 1) + results[i].sequence.emplace_back(std::countr_zero(i)); + // else results[i].sequence is left uninitialized + } +} + +struct SubnetMetadata { + /// meta_ids[n] is the canonical-subnet id of subset n, or + /// numeric_limits::max() for subsets with popcount < 2. + container::vector meta_ids; + /// Cost of evaluating one representative of each canonical subnet id, + /// indexed by id. Populated lazily during the DP. + container::vector unique_meta_costs; +}; + +/// \brief Precomputes canonical-subnet identifiers for every subset of size +/// >= 2 so that structurally equivalent subnetworks share a CSE id. +/// +/// Builds a `TensorNetwork` for each subset, canonicalizes it, and assigns a +/// dense integer id to each distinct canonical form. The returned +/// `unique_meta_costs` is sized to the number of distinct ids and zero-filled; +/// it is populated during the DP as each canonical subnet's optimal cost +/// becomes known. +/// +/// Side effect: `results[n].indices` may be reordered by `canonicalize_slots`. +inline SubnetMetadata build_subnet_metadata( + TensorNetwork const& network, container::vector& results) { + SubnetMetadata out; + // Use max as sentinel for entries with popcount < 2 (singletons/empty), + // which are skipped below and never assigned a real meta ID. + out.meta_ids.resize(results.size(), std::numeric_limits::max()); + container::unordered_map + meta_to_id; + + for (size_t n = 0; n < results.size(); ++n) { + if (std::popcount(n) < 2) continue; + auto ts = bits::on_bits_index(n) | bits::sieve(network.tensors()); + container::vector ts_expr; + for (auto&& t : ts) + ts_expr.emplace_back(std::dynamic_pointer_cast(t)->clone()); + + auto tn = TensorNetwork{ts_expr}; + auto meta = tn.canonicalize_slots( + TensorCanonicalizer::cardinal_tensor_labels(), &results[n].indices); + + auto [it, inserted] = meta_to_id.try_emplace(std::move(meta), 0); + if (inserted) it->second = meta_to_id.size() - 1; + + out.meta_ids[n] = it->second; + } + out.unique_meta_costs.resize(meta_to_id.size(), 0.0); + return out; +} + /// \brief Finds the optimal evaluation sequence for a single-term tensor /// contraction. /// @@ -130,34 +255,12 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network, meta::range_of auto const& tidxs, CostFn&& cost_fn, bool subnet_cse) { using ranges::views::concat; - using ranges::views::indirect; - using ranges::views::transform; - using IndexContainer = decltype(OptRes::indices); auto const nt = network.tensors().size(); if (nt == 1) return EvalSequence{0}; if (nt == 2) return EvalSequence{0, 1, -1}; container::vector results((size_t{1} << nt)); - - // initialize the intermediate results - { - auto tensor_indices = network.tensors() // - | indirect // - | transform(slots); - auto imed_indices = subset_target_indices(tensor_indices, tidxs); - SEQUANT_ASSERT(ranges::distance(imed_indices) == ranges::distance(results)); - for (size_t i = 0; i < results.size(); ++i) { - results[i].indices = - imed_indices[i] | ranges::views::move | ranges::to; - results[i].flops = - std::popcount(i) > 1 - ? std::numeric_limits::max() - : 0; - if (std::popcount(i) == 1) - results[i].sequence.emplace_back(std::countr_zero(i)); - // else results[i].sequence is left uninitialized - } - } + init_results(network, tidxs, results); // precompute all subnet_meta if subnet_cse is true // Note: the O(2^n) cost is bounded in practice — subset_target_indices above @@ -165,37 +268,15 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network, container::vector meta_ids; container::vector unique_meta_costs; if (subnet_cse) { - // Use max as sentinel for entries with popcount < 2 (singletons/empty), - // which are skipped below and never assigned a real meta ID. - meta_ids.resize(results.size(), std::numeric_limits::max()); - container::unordered_map - meta_to_id; - - for (size_t n = 0; n < results.size(); ++n) { - if (std::popcount(n) < 2) continue; - auto ts = bits::on_bits_index(n) | bits::sieve(network.tensors()); - container::vector ts_expr; - for (auto&& t : ts) { - ts_expr.emplace_back(std::dynamic_pointer_cast(t)->clone()); - } - auto tn = TensorNetwork{ts_expr}; - auto meta = tn.canonicalize_slots( - TensorCanonicalizer::cardinal_tensor_labels(), &results[n].indices); - - auto [it, inserted] = meta_to_id.try_emplace(std::move(meta), 0); - if (inserted) { - it->second = meta_to_id.size() - 1; - } - meta_ids[n] = it->second; - } - unique_meta_costs.resize(meta_to_id.size(), 0.0); + auto md = build_subnet_metadata(network, results); + meta_ids = std::move(md.meta_ids); + unique_meta_costs = std::move(md.unique_meta_costs); } // find the optimal evaluation sequence for (size_t n = 0; n < results.size(); ++n) { if (std::popcount(n) < 2) continue; - for (auto& curr_cost = results[n].flops; + for (auto& curr_cost = results[n].ops; auto&& [lp, rp] : bits::bipartitions(n)) { // do nothing with the trivial bipartition // i.e. one subset is the empty set and the other full @@ -219,7 +300,7 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network, new_cost = cost_fn(results[lp].indices, // results[rp].indices, // results[n].indices) // - + results[lp].flops + results[rp].flops; + + results[lp].ops + results[rp].ops; } if (new_cost <= curr_cost) { @@ -259,34 +340,43 @@ EvalSequence single_term_opt_impl(TensorNetwork const& network, } /// +/// \tparam OptFor Cost metric to optimize for (Flops or Memsize). /// \tparam IdxToSz /// \param network A TensorNetwork object. /// \param idxsz An invocable on Index, that maps Index to its dimension. /// \param subnet_cse Whether to recognize equivalent subnetworks to try /// minimizing the ops counts. -/// \return Optimal evaluation sequence that -/// minimizes flops. If there are -/// equivalent optimal sequences then the result is the one that keeps -/// the order of tensors in the network as original as possible. +/// \return Optimal evaluation sequence under the chosen cost metric. If there +/// are equivalent optimal sequences then the result is the one that +/// keeps the order of tensors in the network as original as possible. /// -template +template EvalSequence single_term_opt(TensorNetwork const& network, IdxToSz&& idxsz, bool subnet_cse) { - auto cost_fn = flops_counter(std::forward(idxsz)); decltype(OptRes::indices) tidxs{}; - return single_term_opt_impl(network, tidxs, cost_fn, subnet_cse); + if constexpr (Metric == OptFor::Flops) { + auto cost_fn = flops_counter(std::forward(idxsz)); + return single_term_opt_impl(network, tidxs, cost_fn, subnet_cse); + } else { + static_assert(Metric == OptFor::Memsize, + "Only Flops and Memsize OptFor supported."); + auto cost_fn = memsize_counter(std::forward(idxsz)); + return single_term_opt_impl(network, tidxs, cost_fn, subnet_cse); + } } } // namespace detail /// +/// \tparam Metric Cost metric to optimize for (Flops by default; Memsize +/// minimizes total operand memory rather than flops). /// \param prod Product to be optimized. /// \param idxsz An invocable object that maps an Index object to size. /// \return Parenthesized product expression. /// /// @note @c prod is assumed to consist of only Tensor expressions /// -template +template ExprPtr single_term_opt(Product const& prod, IdxToSz&& idxsz, bool subnet_cse = false) { using ranges::views::filter; @@ -297,8 +387,8 @@ ExprPtr single_term_opt(Product const& prod, IdxToSz&& idxsz, prod.factors().end(), Product::Flatten::No}); auto const tensors = prod | filter(&ExprPtr::template is) | ranges::to_vector; - auto seq = detail::single_term_opt(TensorNetwork{tensors}, - std::forward(idxsz), subnet_cse); + auto seq = detail::single_term_opt( + TensorNetwork{tensors}, std::forward(idxsz), subnet_cse); auto result = container::svector{}; for (auto i : seq) if (i == -1) { diff --git a/SeQuant/core/optimize/sum.cpp b/SeQuant/core/optimize/sum.cpp index 048720b3da..d0edd09745 100644 --- a/SeQuant/core/optimize/sum.cpp +++ b/SeQuant/core/optimize/sum.cpp @@ -1,7 +1,10 @@ +#include #include #include #include +#include #include +#include #include #include @@ -24,7 +27,9 @@ bool has_only_single_atom(const ExprPtr& term) { return term->size() == 1 && has_only_single_atom(*term->begin()); } -container::vector> clusters(Sum const& expr) { +container::vector> clusters( + Sum const& expr, container::vector> const& nodes) { + SEQUANT_ASSERT(nodes.size() == expr.size()); using ranges::views::tail; using ranges::views::transform; using hash_t = size_t; @@ -44,7 +49,7 @@ container::vector> clusters(Sum const& expr) { }; for (auto const& term : expr) { - auto const node = binarize(term); + auto const& node = nodes[pos]; if (has_only_single_atom(term)) { visitor(node); } else { @@ -93,14 +98,29 @@ container::vector> clusters(Sum const& expr) { return result; } -Sum reorder(Sum const& sum) { +container::vector> clusters(Sum const& expr) { + container::vector> nodes; + nodes.reserve(expr.size()); + for (auto const& term : expr) nodes.push_back(binarize(term)); + return clusters(expr, nodes); +} + +Sum reorder(Sum const& sum, + container::vector> const& nodes) { Sum result; - for (auto const& clstr : clusters(sum)) + for (auto const& clstr : clusters(sum, nodes)) for (auto p : clstr) result.append(sum.at(p)); SEQUANT_ASSERT(result.size() == sum.size()); return result; } +Sum reorder(Sum const& sum) { + container::vector> nodes; + nodes.reserve(sum.size()); + for (auto const& term : sum) nodes.push_back(binarize(term)); + return reorder(sum, nodes); +} + } // namespace sequant::opt diff --git a/SeQuant/core/optimize/sum.hpp b/SeQuant/core/optimize/sum.hpp index 70b548f0b0..b45ecf67ad 100644 --- a/SeQuant/core/optimize/sum.hpp +++ b/SeQuant/core/optimize/sum.hpp @@ -1,7 +1,9 @@ #ifndef SEQUANT_CORE_OPTIMIZE_SUM_HPP #define SEQUANT_CORE_OPTIMIZE_SUM_HPP +#include #include +#include #include namespace sequant::opt { @@ -16,6 +18,12 @@ namespace sequant::opt { /// container::vector> clusters(Sum const& expr); +/// Same as \ref clusters(Sum const&) but uses precomputed eval nodes for the +/// summands, avoiding an internal \c binarize pass. \p nodes[i] must be the +/// binarized form of \c expr.at(i). +container::vector> clusters( + Sum const& expr, container::vector> const& nodes); + /// /// \brief Reorder summands so that terms having common intermediates appear /// closer. @@ -25,6 +33,11 @@ container::vector> clusters(Sum const& expr); /// Sum reorder(Sum const& sum); +/// Same as \ref reorder(Sum const&) but uses precomputed eval nodes for the +/// summands. \p nodes[i] must be the binarized form of \c sum.at(i). +Sum reorder(Sum const& sum, + container::vector> const& nodes); + } // namespace sequant::opt #endif // SEQUANT_CORE_OPTIMIZE_SUM_HPP diff --git a/tests/unit/test_optimize.cpp b/tests/unit/test_optimize.cpp index 9b4e1f232f..530f961deb 100644 --- a/tests/unit/test_optimize.cpp +++ b/tests/unit/test_optimize.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,15 @@ sequant::ExprPtr extract(sequant::ExprPtr expr, return result; } +// number of Tensor leaves in a (binarized) expression tree +size_t count_tensor_leaves(sequant::ExprPtr const& expr) { + using namespace sequant; + size_t n = 0; + expr->visit([&n](auto const& x) { n += x->template is() ? 1 : 0; }, + /*atoms_only=*/true); + return n; +} + TEST_CASE("optimize", "[optimize]") { using namespace sequant; @@ -227,6 +237,63 @@ TEST_CASE("optimize", "[optimize]") { aux->approximate_size(aux_sz); } + + SECTION("OptimizeOptions: cost metric and reorder knobs") { + auto const prod = parse_expr_antisymm( + L"g_{i3,i4}^{a3,a4} t_{a1,a2}^{i3,i4} t_{a3,a4}^{i1,i2}"); + + // both metrics must binarize the 3-tensor product into a binary tree: + // a 2-factor top product whose leaves are the 3 original tensors + for (auto opt_for : {OptFor::Flops, OptFor::Memsize}) { + CAPTURE(static_cast(opt_for)); + auto res = optimize(prod, OptimizeOptions{.opt_for = opt_for}); + REQUIRE(res->is()); + REQUIRE(res->as().factors().size() == 2); + REQUIRE(count_tensor_leaves(res) == 3); + } + + // reorder knob: a two-summand sum is optimized either way, and the + // optimize() default (reorder) matches an explicit Reorder request + auto const sum = parse_expr_antisymm( + L"g_{i3,i4}^{a3,a4} t_{a1,a2}^{i3,i4} t_{a3,a4}^{i1,i2}" + L" + g_{i3,i4}^{a3,a4} t_{a3,a4}^{i1,i2} t_{a1}^{i3} t_{a2}^{i4}"); + REQUIRE(sum->is()); + + auto no_reorder = + optimize(sum, OptimizeOptions{.reorder = ReorderSum::NoReorder}); + auto reorder = + optimize(sum, OptimizeOptions{.reorder = ReorderSum::Reorder}); + REQUIRE(no_reorder->is()); + REQUIRE(reorder->is()); + REQUIRE(no_reorder->as().size() == sum->as().size()); + REQUIRE(reorder->as().size() == sum->as().size()); + // default options == explicit Reorder + REQUIRE(*optimize(sum) == *reorder); + } + + SECTION("Parallel optimization of summands matches sequential") { + // exercise optimize_impl(..., parallel_outer=true): a multi-summand sum + // optimized concurrently must yield the same result as single-threaded. + auto const sum = parse_expr_antisymm( + L"g_{i3,i4}^{a3,a4} t_{a1,a2}^{i3,i4} t_{a3,a4}^{i1,i2}" + L" + g_{i3,i4}^{a3,a4} t_{a3,a4}^{i1,i2} t_{a1}^{i3} t_{a2}^{i4}" + L" + g_{i3,i4}^{a3,a4} t_{a1}^{i3} t_{a2}^{i4} t_{a3,a4}^{i1,i2}"); + REQUIRE(sum->is()); + REQUIRE(sum->as().size() > 1); + + auto const nthreads_save = num_threads(); + struct ThreadGuard { + int n; + ~ThreadGuard() { set_num_threads(n); } + } guard{nthreads_save}; + + set_num_threads(1); + auto const seq = optimize(sum); + set_num_threads(4); + auto const par = optimize(sum); + + REQUIRE(*seq == *par); + } } SECTION("CSE") {