diff --git a/.gitignore b/.gitignore
index 5112ca62ed..09398e2bf3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,7 @@
# IDEs
*.idea
*.vscode
+
+
+build/*
+cmake-build*
diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt
index 14da71efae..febe78c57a 100644
--- a/examples/device/CMakeLists.txt
+++ b/examples/device/CMakeLists.txt
@@ -25,7 +25,7 @@
if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)
- foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device)
+ foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device ta_dense_um_tensor ta_vector_um_tensor)
# Add executable
add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")
diff --git a/examples/device/ta_dense_um_tensor.cpp b/examples/device/ta_dense_um_tensor.cpp
new file mode 100644
index 0000000000..6b94e315f3
--- /dev/null
+++ b/examples/device/ta_dense_um_tensor.cpp
@@ -0,0 +1,203 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2026 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Ajay Melekamburath
+ * Department of Chemistry, Virginia Tech
+ */
+
+// Dense matrix-multiply benchmark using the native UMTensor tile type
+// (TA::Tensor backed by device_um_allocator). Companion to the btas-based
+// ta_dense_device.cpp; same shape + reporting, but the tile is bare
+// `UMTensor` -- no `TA::Tile<>` wrapper -- and the data flows through
+// the device tile-op overloads in src/TiledArray/device/tensor.h.
+//
+// Usage:
+// ta_dense_um_tensor Nm Bm Nn Bn Nk Bk [nrepeat=5]
+//
+// Computes c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn) with each dimension blocked by
+// Bm/Bn/Bk. Default scalar type is double; nrepeat iterations are timed
+// for an average GFLOPS reading.
+
+#include
+#include
+
+#ifdef TILEDARRAY_HAS_CUDA
+#include
+#endif
+
+#include
+#include
+#include
+
+namespace {
+
+/// Dense GEMM benchmark on UMTensor tiles: c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn).
+///
+/// Fills a and b with constants, prefetches them to the device, times
+/// `nrepeat` contractions (each followed by a fence), prints per-iteration and
+/// average time / GFLOPS on rank 0, and finally compares the elements of the
+/// tiles reached by iterating over c against Nk * val_a * val_b.
+///
+/// \param world   world in which the arrays are created
+/// \param Nm      number of rows of a and c
+/// \param Bm      tile extent used along the Nm dimension
+/// \param Nn      number of columns of b and c
+/// \param Bn      tile extent used along the Nn dimension
+/// \param Nk      contracted extent (columns of a, rows of b)
+/// \param Bk      tile extent used along the Nk dimension
+/// \param nrepeat number of timed iterations
+template
+void run(TiledArray::World &world, long Nm, long Bm, long Nn, long Bn, long Nk,
+         long Bk, long nrepeat) {
+  using TA::DistArray;
+  using TA::TiledRange;
+  using TA::TiledRange1;
+  using TA::UMTensor;
+  using TileT = UMTensor;
+  using ArrayT = DistArray;
+
+  constexpr bool complex_T = TA::detail::is_complex_v;
+  // GEMM flops: 2 * M * N * K (8 * for complex).
+  const std::int64_t nflops =
+      (complex_T ? 8 : 2) * static_cast(Nm) *
+      static_cast(Nn) * static_cast(Nk);
+
+  // Tile boundaries 0, B, 2B, ... always terminated by N itself. An extent that
+  // is not a multiple of B therefore gets a shorter last tile instead of being
+  // silently truncated to the largest multiple of B (and N < B no longer
+  // collapses to a single boundary that describes no tile at all). For N
+  // divisible by B the boundaries are unchanged.
+  auto blocking = [](long N, long B) {
+    std::vector v;
+    long i = 0;
+    while (true) {
+      v.push_back(static_cast(i));
+      if (i >= N) break;
+      // step by B, but never past N (written to avoid overflowing i + B)
+      i = (N - i > B ? i + B : N);
+    }
+    return v;
+  };
+  auto blk_m = blocking(Nm, Bm);
+  auto blk_n = blocking(Nn, Bn);
+  auto blk_k = blocking(Nk, Bk);
+
+  TiledRange trange_a({TiledRange1(blk_m.begin(), blk_m.end()),
+                       TiledRange1(blk_k.begin(), blk_k.end())});
+  TiledRange trange_b({TiledRange1(blk_k.begin(), blk_k.end()),
+                       TiledRange1(blk_n.begin(), blk_n.end())});
+  TiledRange trange_c({TiledRange1(blk_m.begin(), blk_m.end()),
+                       TiledRange1(blk_n.begin(), blk_n.end())});
+
+  if (world.rank() == 0)
+    std::cout << "TiledArray UMTensor dense matrix multiply\n"
+              << " Nodes = " << world.size() << "\n"
+              << " A = " << Nm << " x " << Nk << " ("
+              << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)\n"
+              << " B = " << Nk << " x " << Nn << " ("
+              << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)\n"
+              << " C = " << Nm << " x " << Nn << " ("
+              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)\n"
+              << " Tile A,B,C = " << Bm << "x" << Bk << ", " << Bk << "x"
+              << Bn << ", " << Bm << "x" << Bn << "\n"
+              << " Iterations = " << nrepeat << "\n";
+
+  ArrayT a(world, trange_a);
+  ArrayT b(world, trange_b);
+  ArrayT c(world, trange_c);
+
+  const T val_a = T(0.03);
+  const T val_b = T(0.02);
+  a.fill(val_a);
+  b.fill(val_b);
+  world.gop.fence();
+
+  // Prefetch inputs to the device once before the timed loop -- the per-tile
+  // ops will also prefetch lazily, but doing it up front keeps the timing
+  // focused on the GEMM kernel cost.
+  TA::to_device(a);
+  TA::to_device(b);
+
+#ifdef TILEDARRAY_HAS_CUDA
+  cudaProfilerStart();
+#endif
+
+  double total_time = 0.0;
+  double total_gflops = 0.0;
+  for (long i = 0; i < nrepeat; ++i) {
+    const double t0 = madness::wall_time();
+    c("m,n") = a("m,k") * b("k,n");
+    // the fence is inside the timed region, so dt spans the whole contraction
+    world.gop.fence();
+    const double t1 = madness::wall_time();
+    const double dt = t1 - t0;
+    const double gflops = static_cast(nflops) / (dt * 1.0e9);
+    total_time += dt;
+    total_gflops += gflops;
+    if (world.rank() == 0)
+      std::cout << " iter " << (i + 1) << " time=" << dt
+                << " s gflops=" << gflops << "\n";
+  }
+
+#ifdef TILEDARRAY_HAS_CUDA
+  cudaProfilerStop();
+#endif
+
+  if (world.rank() == 0)
+    std::cout << " Average time = " << (total_time / double(nrepeat))
+              << " s\n Average gflops = " << (total_gflops / double(nrepeat))
+              << "\n";
+
+  // Verify: every result element should be Nk * val_a * val_b.
+  const T expected = T(Nk) * val_a * val_b;
+  const auto eps = std::numeric_limits>::epsilon();
+  // tolerance grows with Nk, the number of products summed per element
+  const auto tolerance = std::abs(expected) * static_cast(Nk) *
+                         static_cast(8) * eps;
+  TA::to_host(c);
+  // NOTE(review): `ok` is decided only from the tiles this rank iterates over
+  // and is not reduced across ranks; messages are printed by rank 0 only --
+  // confirm that is sufficient for multi-rank runs.
+  bool ok = true;
+  for (auto it = c.begin(); it != c.end(); ++it) {
+    const auto tile = it->get();
+    for (std::size_t k = 0; k < tile.size(); ++k) {
+      if (std::abs(tile.data()[k] - expected) > tolerance) {
+        ok = false;
+        if (world.rank() == 0)
+          std::cout << " MISMATCH at tile " << it.index() << " element " << k
+                    << ": got " << tile.data()[k] << " expected " << expected
+                    << "\n";
+        break;
+      }
+    }
+    // stop at the first mismatching tile
+    if (!ok) break;
+  }
+  if (world.rank() == 0)
+    std::cout << (ok ? " Verification PASSED\n" : " Verification FAILED\n");
+}
+
+} // namespace
+
+/// Initializes TiledArray, parses `Nm Bm Nn Bn Nk Bk [nrepeat]` from the command
+/// line and runs the benchmark.
+/// \return 0 on success, 1 if arguments are missing or not strictly positive
+int try_main(int argc, char **argv) {
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
+
+  // diagnostics are emitted by rank 0 only
+  const auto on_root = [&world] { return world.rank() == 0; };
+  const auto as_long = [argv](int pos) { return std::atol(argv[pos]); };
+
+  // six positional arguments are mandatory, the seventh is optional
+  if (argc < 7) {
+    if (on_root())
+      std::cerr
+          << "Usage: " << argv[0] << " Nm Bm Nn Bn Nk Bk [nrepeat=5]\n"
+          << " Computes c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn) with UMTensor tiles\n";
+    return 1;
+  }
+
+  const long Nm = as_long(1);
+  const long Bm = as_long(2);
+  const long Nn = as_long(3);
+  const long Bn = as_long(4);
+  const long Nk = as_long(5);
+  const long Bk = as_long(6);
+  const long nrepeat = (argc >= 8 ? as_long(7) : 5);
+
+  // atol yields 0 for non-numeric text, which this check rejects as well
+  const bool all_positive = Nm > 0 && Nn > 0 && Nk > 0 && Bm > 0 && Bn > 0 &&
+                            Bk > 0 && nrepeat > 0;
+  if (!all_positive) {
+    if (on_root())
+      std::cerr << "All sizes / blocks / nrepeat must be positive\n";
+    return 1;
+  }
+
+  run(world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+  return 0;
+}
+
+/// Program entry point: runs try_main and converts any escaping exception into
+/// a message on stderr and exit status 1.
+int main(int argc, char **argv) {
+  int status = 1;  // stays 1 whenever an exception is caught
+  try {
+    status = try_main(argc, argv);
+  } catch (const std::exception &err) {
+    std::cerr << "exception: " << err.what() << "\n";
+  } catch (...) {
+    std::cerr << "unknown exception\n";
+  }
+  return status;
+}
diff --git a/examples/device/ta_vector_um_tensor.cpp b/examples/device/ta_vector_um_tensor.cpp
new file mode 100644
index 0000000000..603a3bd15f
--- /dev/null
+++ b/examples/device/ta_vector_um_tensor.cpp
@@ -0,0 +1,156 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2026 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Ajay Melekamburath
+ * Department of Chemistry, Virginia Tech
+ */
+
+// Element-wise vector-op benchmarks (add, scale, permute, Hadamard) using
+// the native UMTensor tile type. Companion to ta_vector_device.cpp.
+//
+// Usage:
+// ta_vector_um_tensor Nm Bm Nn Bn [nrepeat=5]
+//
+// Times each op for nrepeat iterations and reports the average wall time
+// and effective bandwidth (counting one read + one write per element for
+// in-place ops, two reads + one write for binary ops).
+
+#include
+#include
+
+#include
+#include
+#include
+
+namespace {
+
+/// Element-wise benchmark on Nm x Nn UMTensor matrices.
+///
+/// Builds arrays a, b, c (Nm x Nn) and t (Nn x Nm), fills them with constants,
+/// prefetches a and b to the device and then times, for `nrepeat` iterations
+/// each, the expressions add, subt, scale, hadamard, permute and axpy. Rank 0
+/// prints the average time and the bandwidth derived from the byte counts
+/// passed to `bench`.
+///
+/// \param world   world in which the arrays are created
+/// \param Nm      number of rows
+/// \param Bm      tile extent used along the Nm dimension
+/// \param Nn      number of columns
+/// \param Bn      tile extent used along the Nn dimension
+/// \param nrepeat number of timed iterations per operation
+template
+void run(TiledArray::World &world, long Nm, long Bm, long Nn, long Bn,
+         long nrepeat) {
+  using TA::DistArray;
+  using TA::TiledRange;
+  using TA::TiledRange1;
+  using TA::UMTensor;
+  using TileT = UMTensor;
+  using ArrayT = DistArray;
+
+  // Tile boundaries 0, B, 2B, ... always terminated by N itself. An extent that
+  // is not a multiple of B therefore gets a shorter last tile instead of being
+  // silently truncated to the largest multiple of B (and N < B no longer
+  // collapses to a single boundary that describes no tile at all). For N
+  // divisible by B the boundaries are unchanged.
+  auto blocking = [](long N, long B) {
+    std::vector v;
+    long i = 0;
+    while (true) {
+      v.push_back(static_cast(i));
+      if (i >= N) break;
+      // step by B, but never past N (written to avoid overflowing i + B)
+      i = (N - i > B ? i + B : N);
+    }
+    return v;
+  };
+  auto blk_m = blocking(Nm, Bm);
+  auto blk_n = blocking(Nn, Bn);
+
+  TiledRange trange({TiledRange1(blk_m.begin(), blk_m.end()),
+                     TiledRange1(blk_n.begin(), blk_n.end())});
+  TiledRange trange_T({TiledRange1(blk_n.begin(), blk_n.end()),
+                       TiledRange1(blk_m.begin(), blk_m.end())});
+
+  if (world.rank() == 0)
+    std::cout << "TiledArray UMTensor vector-op benchmark\n"
+              << " Nodes = " << world.size() << "\n"
+              << " Matrix = " << Nm << " x " << Nn << " ("
+              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)\n"
+              << " Tile = " << Bm << " x " << Bn << "\n"
+              << " Iterations = " << nrepeat << "\n";
+
+  ArrayT a(world, trange);
+  ArrayT b(world, trange);
+  ArrayT c(world, trange);
+  ArrayT t(world, trange_T);  // transposed-shape result for permute test
+
+  a.fill(T(0.03));
+  b.fill(T(0.02));
+  c.fill(T(0.0));
+  t.fill(T(0.0));
+  world.gop.fence();
+  TA::to_device(a);
+  TA::to_device(b);
+
+  const double bytes_per_elem = static_cast(sizeof(T));
+  const double n_elems = static_cast(Nm) * static_cast(Nn);
+
+  // Times `op` nrepeat times (each run followed by a fence, inside the timed
+  // region) and reports the mean time and bytes_per_iter / mean time.
+  auto bench = [&](const char *name, double bytes_per_iter, auto &&op) {
+    double total_time = 0.0;
+    for (long i = 0; i < nrepeat; ++i) {
+      const double t0 = madness::wall_time();
+      op();
+      world.gop.fence();
+      const double t1 = madness::wall_time();
+      total_time += t1 - t0;
+    }
+    const double avg = total_time / static_cast(nrepeat);
+    const double bw_gbs = bytes_per_iter / (avg * 1.0e9);
+    if (world.rank() == 0)
+      std::cout << " " << name << ": avg=" << avg << " s bw=" << bw_gbs
+                << " GB/s\n";
+  };
+
+  // Binary read-read-write: 3 element accesses per element.
+  const double rw3_bytes = 3.0 * n_elems * bytes_per_elem;
+  // Unary read-write: 2 element accesses per element.
+  const double rw2_bytes = 2.0 * n_elems * bytes_per_elem;
+
+  bench("add(c=a+b)", rw3_bytes, [&] { c("m,n") = a("m,n") + b("m,n"); });
+  bench("subt(c=a-b)", rw3_bytes, [&] { c("m,n") = a("m,n") - b("m,n"); });
+  bench("scale(c=2*a)", rw2_bytes, [&] { c("m,n") = 2.0 * a("m,n"); });
+  bench("hadamard(c=a*b)", rw3_bytes, [&] { c("m,n") = a("m,n") * b("m,n"); });
+  bench("permute(t=a^T)", rw2_bytes, [&] { t("n,m") = a("m,n"); });
+  bench("axpy(c+=a)", rw3_bytes, [&] { c("m,n") += a("m,n"); });
+
+  world.gop.fence();
+}
+
+} // namespace
+
+/// Initializes TiledArray, parses `Nm Bm Nn Bn [nrepeat]` from the command line
+/// and runs the benchmark.
+/// \return 0 on success, 1 if arguments are missing or not strictly positive
+int try_main(int argc, char **argv) {
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
+
+  // diagnostics are emitted by rank 0 only
+  const auto on_root = [&world] { return world.rank() == 0; };
+  const auto as_long = [argv](int pos) { return std::atol(argv[pos]); };
+
+  // four positional arguments are mandatory, the fifth is optional
+  if (argc < 5) {
+    if (on_root())
+      std::cerr
+          << "Usage: " << argv[0] << " Nm Bm Nn Bn [nrepeat=5]\n"
+          << " Times element-wise vector ops on Nm x Nn UMTensor matrices\n";
+    return 1;
+  }
+
+  const long Nm = as_long(1);
+  const long Bm = as_long(2);
+  const long Nn = as_long(3);
+  const long Bn = as_long(4);
+  const long nrepeat = (argc >= 6 ? as_long(5) : 5);
+
+  // atol yields 0 for non-numeric text, which this check rejects as well
+  const bool all_positive =
+      Nm > 0 && Nn > 0 && Bm > 0 && Bn > 0 && nrepeat > 0;
+  if (!all_positive) {
+    if (on_root())
+      std::cerr << "All sizes / blocks / nrepeat must be positive\n";
+    return 1;
+  }
+
+  run(world, Nm, Bm, Nn, Bn, nrepeat);
+  return 0;
+}
+
+/// Program entry point: runs try_main and converts any escaping exception into
+/// a message on stderr and exit status 1.
+int main(int argc, char **argv) {
+  int status = 1;  // stays 1 whenever an exception is caught
+  try {
+    status = try_main(argc, argv);
+  } catch (const std::exception &err) {
+    std::cerr << "exception: " << err.what() << "\n";
+  } catch (...) {
+    std::cerr << "unknown exception\n";
+  }
+  return status;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e0edad8c7e..28c1eb0bf1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -234,6 +234,7 @@ if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA)
TiledArray/device/blas.h
TiledArray/device/btas.h
TiledArray/device/btas_um_tensor.h
+ TiledArray/device/tensor.h
TiledArray/device/device_task_fn.h
TiledArray/device/kernel/mult_kernel.h
TiledArray/device/kernel/reduce_kernel.h
@@ -267,6 +268,7 @@ if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)
set(TILEDARRAY_DEVICE_SOURCE_FILES
TiledArray/device/btas_um_tensor.cpp
+ TiledArray/device/tensor.cpp
)
if(TILEDARRAY_HAS_CUDA)
diff --git a/src/TiledArray/device/tensor.cpp b/src/TiledArray/device/tensor.cpp
new file mode 100644
index 0000000000..94dc4d3cc0
--- /dev/null
+++ b/src/TiledArray/device/tensor.cpp
@@ -0,0 +1,42 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2026 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Ajay Melekamburath
+ * Department of Chemistry, Virginia Tech
+ */
+
+#include
+
+#include
+
+namespace TiledArray {
+
+// Explicit instantiations of the UMTensor class for the standard numeric
+// types. Without these, every TU including device/tensor.h would instantiate
+// the full UMTensor (TA::Tensor using device_um_allocator) class body.
+// Mirrors the host-side set in tensor/tensor.cpp; paired with the
+// `extern template` declarations in device/tensor.h.
+// Six explicit instantiation definitions follow, one per element type.
+template class Tensor>;
+template class Tensor>;
+template class Tensor,
+ device_um_allocator>>;
+template class Tensor,
+ device_um_allocator>>;
+template class Tensor>;
+template class Tensor>;
+
+} // namespace TiledArray
diff --git a/src/TiledArray/device/tensor.h b/src/TiledArray/device/tensor.h
new file mode 100644
index 0000000000..4488734191
--- /dev/null
+++ b/src/TiledArray/device/tensor.h
@@ -0,0 +1,946 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2026 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Ajay Melekamburath
+ * Department of Chemistry, Virginia Tech
+ */
+
+#ifndef TILEDARRAY_DEVICE_TENSOR_H
+#define TILEDARRAY_DEVICE_TENSOR_H
+
+#include
+
+#ifdef TILEDARRAY_HAS_DEVICE
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace TiledArray {
+namespace detail {
+
+/// UMTensor lives in unified memory; it is identified as a device_tile and
+/// the expression engine must route its tile ops through
+/// madness::add_device_task.
+///
+/// Specialization of the is_device_tile trait for UMTensor; its value is
+/// supplied by the std::bool_constant base class.
+template
+struct is_device_tile>
+ : public std::bool_constant> {};
+
+/// Prefetch a UMTensor's storage to the device associated with its tile range.
+///
+/// Does nothing for an empty tile. The prefetch is only issued when the device
+/// environment reports concurrent managed access; otherwise the call returns
+/// without touching the data.
+/// \param tile tensor whose storage is prefetched
+template
+ requires TiledArray::detail::is_numeric_v
+inline void to_device(const TiledArray::UMTensor& tile) {
+ if (tile.empty()) return;
+ // the stream (and its device) is chosen from the tile's range
+ auto stream = device::stream_for(tile.range());
+ if (deviceEnv::instance()->concurrent_managed_access()) {
+ // byte count uses total_size(), not size()
+ DeviceSafeCall(device::memPrefetchAsync(tile.data(),
+ tile.total_size() * sizeof(T),
+ stream.device, stream.stream));
+ }
+}
+
+/// Prefetch a UMTensor's storage back to the host.
+///
+/// Does nothing for an empty tile. The prefetch is only issued when the device
+/// environment reports concurrent managed access.
+/// \param tile tensor whose storage is prefetched
+template
+ requires TiledArray::detail::is_numeric_v
+inline void to_host(const TiledArray::UMTensor& tile) {
+ if (tile.empty()) return;
+ // the stream is chosen from the tile's range, as in to_device
+ auto stream = device::stream_for(tile.range());
+ if (deviceEnv::instance()->concurrent_managed_access()) {
+ // destination is device::CpuDeviceId; the request is queued on stream.stream
+ DeviceSafeCall(
+ device::memPrefetchAsync(tile.data(), tile.total_size() * sizeof(T),
+ device::CpuDeviceId, stream.stream));
+ }
+}
+
+} // namespace detail
+
+// clang-format off
+/// Tile-op overloads for UMTensor.
+///
+/// Each overload sits in `namespace TiledArray` so ADL finds it from the
+/// expression engine and from the tile_op layer's free-function defaults.
+/// More-specialized concrete-type overloads win against the generic
+/// forwarder in `tile_op/tile_interface.h`:
+/// \code
+/// template <typename Left, typename Right>
+/// auto add(Left&& left, Right&& right) {
+/// return left.add(right);
+/// }
+/// \endcode
+/// so we never fall back to the CPU member functions for UMTensor.
+///
+/// All overloads follow the stream/queue contract:
+/// 1. Resolve a queue via `blasqueue_for(range)`. Inside a device task
+/// this is the same queue everyone else in the task uses (see
+/// `external/device.h:899-907`); outside one, it round-robins.
+/// 2. Prefetch every input + the result to the device.
+/// 3. Call into BLAS++ / device kernels on that queue.
+/// 4. `sync_madness_task_with(stream)` so the enclosing MADNESS device
+/// task waits for the queue to drain before completing.
+///
+/// In-place ops provide both an lvalue and an rvalue overload: the lvalue
+/// overload does the work, the rvalue overload forwards to it.
+///
+/// nbatch_ > 1 is not yet supported; the host-side tile
+/// ops don't support them either.
+// clang-format on
+
+/// result[i] = arg[i]
+///
+/// Copies `arg` element-wise into a newly constructed tensor with the same
+/// range, using a BLAS copy on the queue selected for `arg.range()`.
+/// \param arg tensor to copy; must be non-empty with nbatch() == 1
+/// \return the new tensor holding a copy of arg's elements
+template
+  requires TiledArray::detail::is_numeric_v
+inline UMTensor clone(const UMTensor& arg) {
+  TA_ASSERT(!arg.empty());
+  TA_ASSERT(arg.nbatch() == 1);
+
+  auto& queue = blasqueue_for(arg.range());
+  const device::Stream stream(queue.device(), queue.stream());
+  DeviceSafeCall(device::setDevice(stream.device));
+
+  UMTensor result(arg.range());
+
+  detail::to_device(arg);
+  detail::to_device(result);
+
+  // fully qualified like every other BLAS++ call in this header, so lookup
+  // cannot pick up a nested `blas` namespace instead of the global one
+  ::blas::copy(result.size(), arg.data(), 1, result.data(), 1, queue);
+
+  // registers the stream with the enclosing MADNESS device task
+  device::sync_madness_task_with(stream);
+  return result;
+}
+
+namespace detail {
+
+/// Apply a scaling factor in-place on the device, replicating the
+/// ComplexConjugate handling from device/btas.h::scale. Real-valued kernels
+/// reduce to a single `blas::scal`; conjugation+scale on complex tiles
+/// requires a custom kernel that we have not implemented yet.
+///
+/// \param data   pointer to the elements to scale
+/// \param n      number of elements
+/// \param factor scaling factor (plain number or a ComplexConjugate tag)
+/// \param queue  BLAS++ queue the scal call is issued on
+/// \note The branches are selected at compile time; when none of them
+/// matches, the function does nothing.
+template
+inline void apply_scale_factor(T* data, std::size_t n, const Scalar factor,
+ ::blas::Queue& queue) {
+ if constexpr (TiledArray::detail::is_blas_numeric_v ||
+ std::is_arithmetic_v) {
+ // plain numeric factor: hand it straight to scal
+ ::blas::scal(n, factor, data, 1, queue);
+ } else if constexpr (TiledArray::detail::is_complex_v) {
+ // reached only when the first branch did not match; raised at run time
+ TA_EXCEPTION(
+ "UMTensor scale with ComplexConjugate factor on complex T is not "
+ "implemented (requires a fused conjugation kernel)");
+ } else if constexpr (std::is_same_v<
+ Scalar,
+ TiledArray::detail::ComplexConjugate>) {
+ // conjugation on a real tensor is a no-op
+ } else if constexpr (std::is_same_v>) {
+ // this branch multiplies every element by -1
+ ::blas::scal(n, static_cast(-1), data, 1, queue);
+ }
+}
+
+} // namespace detail
+
+/// result[i] = arg[i] * factor
+///
+/// Implemented as clone(arg) followed by an in-place apply_scale_factor on the
+/// copy; `arg` itself is not modified.
+/// \param arg    tensor to scale
+/// \param factor scaling factor
+/// \return new tensor holding the scaled copy
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor scale(const UMTensor& arg, const Scalar factor) {
+ auto result = clone(arg);
+ // queue looked up again from the copy's range
+ auto& queue = blasqueue_for(result.range());
+ const device::Stream stream(queue.device(), queue.stream());
+ detail::apply_scale_factor(result.data(), result.size(), factor, queue);
+ device::sync_madness_task_with(stream);
+ return result;
+}
+
+/// result[i] *= factor (in-place)
+///
+/// \param result tensor scaled in place; must be non-empty with nbatch() == 1
+/// \param factor scaling factor
+/// \return reference to `result`
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor& scale_to(UMTensor& result, const Scalar factor) {
+ TA_ASSERT(!result.empty());
+ TA_ASSERT(result.nbatch() == 1);
+ auto& queue = blasqueue_for(result.range());
+ const device::Stream stream(queue.device(), queue.stream());
+ DeviceSafeCall(device::setDevice(stream.device));
+ // prefetch before the scaling is queued
+ detail::to_device(result);
+ detail::apply_scale_factor(result.data(), result.size(), factor, queue);
+ device::sync_madness_task_with(stream);
+ return result;
+}
+
+/// Rvalue overload of scale_to: forwards to the lvalue overload. The returned
+/// reference refers to the object bound to `result`.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor& scale_to(UMTensor&& result, const Scalar factor) {
+ // `result` is an lvalue here, so this selects the UMTensor& overload
+ return scale_to(result, factor);
+}
+
+/// result[i] = -arg[i]
+///
+/// Implemented as scale(arg, T(-1)); `arg` is not modified.
+/// \return new tensor holding the negated elements
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor neg(const UMTensor& arg) {
+ return scale(arg, T(-1));
+}
+
+/// arg[i] = -arg[i] (in-place)
+///
+/// Implemented as scale_to(arg, T(-1)).
+/// \return reference to `arg`
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor& neg_to(UMTensor& arg) {
+ return scale_to(arg, T(-1));
+}
+
+/// Rvalue overload of neg_to: forwards to the lvalue overload. The returned
+/// reference refers to the object bound to `arg`.
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor& neg_to(UMTensor&& arg) {
+ return neg_to(arg);
+}
+
+/// result[i] = arg1[i] + arg2[i]
+///
+/// The result takes arg1's range. It is computed on the queue selected for
+/// that range as a BLAS copy of arg1 followed by an axpy with coefficient 1.
+/// \param arg1 left operand; must be non-empty with nbatch() == 1
+/// \param arg2 right operand; must be non-empty, nbatch() == 1 and hold as
+///             many elements as arg1
+/// \return new tensor holding the element-wise sum
+template
+  requires TiledArray::detail::is_numeric_v
+inline UMTensor add(const UMTensor& arg1, const UMTensor& arg2) {
+  TA_ASSERT(!arg1.empty());
+  TA_ASSERT(!arg2.empty());
+  TA_ASSERT(arg1.nbatch() == 1 && arg2.nbatch() == 1);
+  // axpy below reads result.size() elements from arg2, so a shorter arg2
+  // would be read out of bounds; same precondition as dot()
+  TA_ASSERT(arg1.size() == arg2.size());
+
+  auto& queue = blasqueue_for(arg1.range());
+  const device::Stream stream(queue.device(), queue.stream());
+  DeviceSafeCall(device::setDevice(stream.device));
+
+  UMTensor result(arg1.range());
+
+  detail::to_device(arg1);
+  detail::to_device(arg2);
+  detail::to_device(result);
+
+  // result = arg1, then result += 1 * arg2
+  ::blas::copy(result.size(), arg1.data(), 1, result.data(), 1, queue);
+  ::blas::axpy(result.size(), T(1), arg2.data(), 1, result.data(), 1, queue);
+
+  device::sync_madness_task_with(stream);
+  return result;
+}
+
+/// result[i] = (arg1[i] + arg2[i]) * factor
+///
+/// Computes add(arg1, arg2) and then scales that temporary in place.
+/// \return new tensor holding the scaled sum
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor add(const UMTensor& arg1, const UMTensor& arg2,
+ const Scalar factor) {
+ auto result = add(arg1, arg2);
+ // scale_to returns a reference to `result`; the return value is built from it
+ return scale_to(result, factor);
+}
+
+/// result[i] += arg[i]
+///
+/// Performed in place as an axpy with coefficient 1 on the queue selected for
+/// `result.range()`.
+/// \param result tensor updated in place; must be non-empty, nbatch() == 1
+/// \param arg    addend; must be non-empty, nbatch() == 1 and hold as many
+///               elements as `result`
+/// \return reference to `result`
+template
+  requires TiledArray::detail::is_numeric_v
+inline UMTensor& add_to(UMTensor& result, const UMTensor& arg) {
+  TA_ASSERT(!result.empty());
+  TA_ASSERT(!arg.empty());
+  TA_ASSERT(result.nbatch() == 1 && arg.nbatch() == 1);
+  // axpy below reads result.size() elements from arg, so a shorter arg would
+  // be read out of bounds; same precondition as dot()
+  TA_ASSERT(result.size() == arg.size());
+
+  auto& queue = blasqueue_for(result.range());
+  const device::Stream stream(queue.device(), queue.stream());
+  DeviceSafeCall(device::setDevice(stream.device));
+
+  detail::to_device(result);
+  detail::to_device(arg);
+
+  ::blas::axpy(result.size(), T(1), arg.data(), 1, result.data(), 1, queue);
+
+  device::sync_madness_task_with(stream);
+  return result;
+}
+
+/// Rvalue overload of add_to: forwards to the lvalue overload. The returned
+/// reference refers to the object bound to `result`.
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor& add_to(UMTensor&& result, const UMTensor& arg) {
+ return add_to(result, arg);
+}
+
+/// result[i] = (result[i] + arg[i]) * factor
+/// Matches TA::Tensor::add_to(right, factor) semantics: `(l += r) *= factor`.
+///
+/// Two in-place steps: add_to(result, arg), then scale_to(result, factor).
+/// \return reference to `result`
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor& add_to(UMTensor& result, const UMTensor& arg,
+ const Scalar factor) {
+ add_to(result, arg);
+ return scale_to(result, factor);
+}
+
+/// Rvalue overload of the scaled add_to: forwards to the lvalue overload. The
+/// returned reference refers to the object bound to `result`.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor& add_to(UMTensor&& result, const UMTensor& arg,
+ const Scalar factor) {
+ return add_to(result, arg, factor);
+}
+
+/// result[i] = arg1[i] - arg2[i]
+///
+/// The result takes arg1's range. It is computed on the queue selected for
+/// that range as a BLAS copy of arg1 followed by an axpy with coefficient -1.
+/// \param arg1 minuend; must be non-empty with nbatch() == 1
+/// \param arg2 subtrahend; must be non-empty, nbatch() == 1 and hold as many
+///             elements as arg1
+/// \return new tensor holding the element-wise difference
+template
+  requires TiledArray::detail::is_numeric_v
+inline UMTensor subt(const UMTensor& arg1, const UMTensor& arg2) {
+  TA_ASSERT(!arg1.empty());
+  TA_ASSERT(!arg2.empty());
+  TA_ASSERT(arg1.nbatch() == 1 && arg2.nbatch() == 1);
+  // axpy below reads result.size() elements from arg2, so a shorter arg2
+  // would be read out of bounds; same precondition as dot()
+  TA_ASSERT(arg1.size() == arg2.size());
+
+  auto& queue = blasqueue_for(arg1.range());
+  const device::Stream stream(queue.device(), queue.stream());
+  DeviceSafeCall(device::setDevice(stream.device));
+
+  UMTensor result(arg1.range());
+
+  detail::to_device(arg1);
+  detail::to_device(arg2);
+  detail::to_device(result);
+
+  // result = arg1, then result += (-1) * arg2
+  ::blas::copy(result.size(), arg1.data(), 1, result.data(), 1, queue);
+  ::blas::axpy(result.size(), T(-1), arg2.data(), 1, result.data(), 1, queue);
+
+  device::sync_madness_task_with(stream);
+  return result;
+}
+
+/// result[i] = (arg1[i] - arg2[i]) * factor
+///
+/// Computes subt(arg1, arg2) and then scales that temporary in place.
+/// \return new tensor holding the scaled difference
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor subt(const UMTensor& arg1, const UMTensor& arg2,
+ const Scalar factor) {
+ auto result = subt(arg1, arg2);
+ // scale_to returns a reference to `result`; the return value is built from it
+ return scale_to(result, factor);
+}
+
+/// result[i] -= arg[i]
+///
+/// Performed in place as an axpy with coefficient -1 on the queue selected for
+/// `result.range()`.
+/// \param result tensor updated in place; must be non-empty, nbatch() == 1
+/// \param arg    subtrahend; must be non-empty, nbatch() == 1 and hold as many
+///               elements as `result`
+/// \return reference to `result`
+template
+  requires TiledArray::detail::is_numeric_v
+inline UMTensor& subt_to(UMTensor& result, const UMTensor& arg) {
+  TA_ASSERT(!result.empty());
+  TA_ASSERT(!arg.empty());
+  TA_ASSERT(result.nbatch() == 1 && arg.nbatch() == 1);
+  // axpy below reads result.size() elements from arg, so a shorter arg would
+  // be read out of bounds; same precondition as dot()
+  TA_ASSERT(result.size() == arg.size());
+
+  auto& queue = blasqueue_for(result.range());
+  const device::Stream stream(queue.device(), queue.stream());
+  DeviceSafeCall(device::setDevice(stream.device));
+
+  detail::to_device(result);
+  detail::to_device(arg);
+
+  ::blas::axpy(result.size(), T(-1), arg.data(), 1, result.data(), 1, queue);
+
+  device::sync_madness_task_with(stream);
+  return result;
+}
+
+/// Rvalue overload of subt_to: forwards to the lvalue overload. The returned
+/// reference refers to the object bound to `result`.
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor& subt_to(UMTensor&& result, const UMTensor& arg) {
+ return subt_to(result, arg);
+}
+
+/// result[i] = (result[i] - arg[i]) * factor
+/// Matches TA::Tensor::subt_to(right, factor) semantics: `(l -= r) *= factor`.
+///
+/// Two in-place steps: subt_to(result, arg), then scale_to(result, factor).
+/// \return reference to `result`
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor& subt_to(UMTensor& result, const UMTensor& arg,
+ const Scalar factor) {
+ subt_to(result, arg);
+ return scale_to(result, factor);
+}
+
+/// Rvalue overload of the scaled subt_to: forwards to the lvalue overload. The
+/// returned reference refers to the object bound to `result`.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v
+inline UMTensor& subt_to(UMTensor&& result, const UMTensor& arg,
+ const Scalar factor) {
+ return subt_to(result, arg, factor);
+}
+
+/// dot product: scalar = sum_i arg1[i] * arg2[i]
+///
+/// \param arg1,arg2 operands; both must be non-empty, have nbatch() == 1 and
+///                  hold the same number of elements
+/// \return the value written by ::blas::dot into a local of type T
+/// NOTE(review): whether ::blas::dot conjugates an operand for complex T is not
+/// visible here -- confirm against the intended dot semantics.
+template
+ requires TiledArray::detail::is_numeric_v
+inline T dot(const UMTensor& arg1, const UMTensor& arg2) {
+ TA_ASSERT(!arg1.empty());
+ TA_ASSERT(!arg2.empty());
+ TA_ASSERT(arg1.nbatch() == 1 && arg2.nbatch() == 1);
+ TA_ASSERT(arg1.size() == arg2.size());
+
+ // queue is selected from arg1's range only
+ auto& queue = blasqueue_for(arg1.range());
+ const device::Stream stream(queue.device(), queue.stream());
+ DeviceSafeCall(device::setDevice(stream.device));
+
+ detail::to_device(arg1);
+ detail::to_device(arg2);
+
+ // the address of this local is handed to ::blas::dot as the output location
+ T result(0);
+ ::blas::dot(arg1.size(), arg1.data(), 1, arg2.data(), 1, &result, queue);
+
+ device::sync_madness_task_with(stream);
+ return result;
+}
+
+/// scalar = sum_i arg[i] * arg[i]
+///
+/// Implemented as dot(arg, arg), so it shares dot's preconditions and return
+/// type.
+template
+ requires TiledArray::detail::is_numeric_v
+inline auto squared_norm(const UMTensor& arg) {
+ return dot(arg, arg);
+}
+
+/// scalar = sqrt(squared_norm(arg))
+///
+/// The square root is converted to ResultType (a scalar_t alias) before being
+/// returned.
+template
+ requires TiledArray::detail::is_numeric_v
+inline auto norm(const UMTensor& arg) {
+ // unqualified sqrt call with std::sqrt made visible
+ using std::sqrt;
+ using ResultType = TiledArray::detail::scalar_t;
+ return static_cast(sqrt(squared_norm(arg)));
+}
+
+/// result[perm(i)] = arg[i]
+///
+/// \param arg  tensor to permute; must be non-empty with nbatch() == 1
+/// \param perm permutation whose size must equal the rank of arg's range
+/// \return new tensor over the range `perm * arg.range()`
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor permute(const UMTensor& arg,
+ const TiledArray::Permutation& perm) {
+ TA_ASSERT(!arg.empty());
+ TA_ASSERT(arg.nbatch() == 1);
+ TA_ASSERT(perm.size() == arg.range().rank());
+
+ // the queue is selected from the permuted (result) range, not from arg's
+ auto result_range = perm * arg.range();
+ auto& queue = blasqueue_for(result_range);
+ const device::Stream stream(queue.device(), queue.stream());
+ DeviceSafeCall(device::setDevice(stream.device));
+
+ UMTensor result(result_range);
+
+ detail::to_device(arg);
+ detail::to_device(result);
+
+ // librett operates on the original (unpermuted) range and writes into the
+ // permuted layout; pointers go in as-is.
+ // NOTE(review): the const_cast suggests librett_permute takes a non-const
+ // input pointer -- confirm it never writes through it.
+ librett_permute(const_cast(arg.data()), result.data(), arg.range(), perm,
+ stream.stream);
+
+ device::sync_madness_task_with(stream);
+ return result;
+}
+
+/// BipartitePermutation -> plain Permutation forward.
+///
+/// Asserts that the inner part of `perm` is empty and permutes with its outer
+/// part only.
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor permute(const UMTensor& arg,
+ const TiledArray::BipartitePermutation& perm) {
+ TA_ASSERT(inner_size(perm) == 0); // UMTensor is a non-nested tile
+ return permute(arg, outer(perm));
+}
+
+/// result[perm(i)] = arg[i] * factor
+///
+/// Two passes: scale(arg, factor) into a temporary, then permute that
+/// temporary.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_permutation_v
+inline UMTensor scale(const UMTensor& arg, const Scalar factor,
+ const Perm& perm) {
+ auto scaled = scale(arg, factor);
+ return permute(scaled, perm);
+}
+
+/// result[perm(i)] = -arg[i]
+///
+/// Two passes: neg(arg) into a temporary, then permute that temporary.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_permutation_v
+inline UMTensor neg(const UMTensor& arg, const Perm& perm) {
+ return permute(neg(arg), perm);
+}
+
+/// result[perm(i)] = arg1[i] + arg2[i]
+///
+/// Two passes: add(arg1, arg2) into a temporary, then permute that temporary.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_permutation_v
+inline UMTensor add(const UMTensor& arg1, const UMTensor& arg2,
+ const Perm& perm) {
+ return permute(add(arg1, arg2), perm);
+}
+
+/// result[perm(i)] = (arg1[i] + arg2[i]) * factor
+///
+/// Computes the scaled sum add(arg1, arg2, factor) first, then permutes it.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_permutation_v
+inline UMTensor add(const UMTensor& arg1, const UMTensor& arg2,
+ const Scalar factor, const Perm& perm) {
+ return permute(add(arg1, arg2, factor), perm);
+}
+
+/// result[perm(i)] = arg1[i] - arg2[i]
+///
+/// Two passes: subt(arg1, arg2) into a temporary, then permute that temporary.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_permutation_v
+inline UMTensor subt(const UMTensor& arg1, const UMTensor& arg2,
+ const Perm& perm) {
+ return permute(subt(arg1, arg2), perm);
+}
+
+/// result[perm(i)] = (arg1[i] - arg2[i]) * factor
+///
+/// Computes the scaled difference subt(arg1, arg2, factor) first, then
+/// permutes it.
+template
+ requires TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_numeric_v &&
+ TiledArray::detail::is_permutation_v
+inline UMTensor subt(const UMTensor& arg1, const UMTensor& arg2,
+ const Scalar factor, const Perm& perm) {
+ return permute(subt(arg1, arg2, factor), perm);
+}
+
+/// shift: result has arg's data, range shifted by bound_shift.
+///
+/// \param arg         tensor to copy; must be non-empty with nbatch() == 1
+/// \param bound_shift shift applied to a copy of arg's range
+/// \return new tensor over the shifted range holding a copy of arg's elements
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor shift(const UMTensor& arg, const Index& bound_shift) {
+ TA_ASSERT(!arg.empty());
+ TA_ASSERT(arg.nbatch() == 1);
+
+ // shift a copy of the range; arg's own range is left untouched
+ TiledArray::Range result_range(arg.range());
+ result_range.inplace_shift(bound_shift);
+
+ // the queue is selected from the shifted (result) range
+ auto& queue = blasqueue_for(result_range);
+ const device::Stream stream(queue.device(), queue.stream());
+ DeviceSafeCall(device::setDevice(stream.device));
+
+ UMTensor result(result_range);
+
+ detail::to_device(arg);
+ detail::to_device(result);
+
+ ::blas::copy(result.size(), arg.data(), 1, result.data(), 1, queue);
+
+ device::sync_madness_task_with(stream);
+ return result;
+}
+
+/// shift_to: in-place range shift, no data movement.
+///
+/// Delegates to the tensor's own shift_to member; no queue or stream is used
+/// here.
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor& shift_to(UMTensor& arg, const Index& bound_shift) {
+ return arg.shift_to(bound_shift);
+}
+
+/// Rvalue overload of shift_to: forwards to the lvalue overload. The returned
+/// reference refers to the object bound to `arg`.
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor& shift_to(UMTensor&& arg, const Index& bound_shift) {
+ return shift_to(arg, bound_shift);
+}
+
+/// result[i] = arg1[i] * arg2[i] (element-wise / Hadamard)
+template
+ requires TiledArray::detail::is_numeric_v
+inline UMTensor mult(const UMTensor