ValeevGroup · ajay-mk · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,7 @@
 # IDEs
 *.idea
 *.vscode
+
+
+build/*
+cmake-build*
diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt
@@ -25,7 +25,7 @@
 
 if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)
 
-    foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device)
+    foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device ta_dense_um_tensor ta_vector_um_tensor)
 
         # Add executable
         add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")

diff --git a/examples/device/ta_dense_um_tensor.cpp b/examples/device/ta_dense_um_tensor.cpp
@@ -0,0 +1,203 @@
+/*
+ *  This file is a part of TiledArray.
+ *  Copyright (C) 2026  Virginia Tech
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Ajay Melekamburath
+ *  Department of Chemistry, Virginia Tech
+ */
+
+// Dense matrix-multiply benchmark using the native UMTensor tile type
+// (TA::Tensor backed by device_um_allocator). Companion to the btas-based
+// ta_dense_device.cpp; same shape + reporting, but the tile is bare
+// `UMTensor<T>` -- no `TA::Tile<>` wrapper -- and the data flows through
+// the device tile-op overloads in src/TiledArray/device/tensor.h.
+//
+// Usage:
+//   ta_dense_um_tensor Nm Bm Nn Bn Nk Bk [nrepeat=5]
+//
+// Computes c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn) with each dimension blocked by
+// Bm/Bn/Bk. Default scalar type is double; nrepeat iterations are timed
+// for an average GFLOPS reading.
+
+#include <TiledArray/device/tensor.h>
+#include <tiledarray.h>
+
+#ifdef TILEDARRAY_HAS_CUDA
+#include <cuda_profiler_api.h>
+#endif
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+namespace {
+
+/// Benchmarks and verifies the dense product c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn)
+/// on DistArrays whose tiles are bare TA::UMTensor<T>.
+///
+/// \tparam T scalar element type of the tiles
+/// \param world the World in which the arrays are constructed
+/// \param Nm,Nn,Nk matrix extents (callers pass values > 0)
+/// \param Bm,Bn,Bk target tile extents (callers pass values > 0); an extent
+///        need not be a multiple of its tile size, in which case the trailing
+///        tile is narrower
+/// \param nrepeat number of timed multiplications (callers pass a value > 0)
+///
+/// Rank 0 prints the problem description, the per-iteration and average
+/// time / GFLOPS, and the verification verdict aggregated over all ranks.
+template <typename T>
+void run(TiledArray::World &world, long Nm, long Bm, long Nn, long Bn, long Nk,
+         long Bk, long nrepeat) {
+  using TA::DistArray;
+  using TA::TiledRange;
+  using TA::TiledRange1;
+  using TA::UMTensor;
+  using TileT = UMTensor<T>;
+  using ArrayT = DistArray<TileT, TA::DensePolicy>;
+
+  constexpr bool complex_T = TA::detail::is_complex_v<T>;
+  // GEMM flops: 2 * M * N * K (8 * for complex).
+  const std::int64_t nflops =
+      (complex_T ? 8 : 2) * static_cast<std::int64_t>(Nm) *
+      static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nk);
+
+  // Tile boundaries 0, B, 2B, ... always terminated by N itself, so the tiled
+  // range spans exactly N elements even when B does not divide N (narrower
+  // trailing tile) or when B exceeds N (a single tile). The flop count, the
+  // size report and the verification below all assume the full extents.
+  auto blocking = [](long N, long B) {
+    std::vector<unsigned int> v;
+    for (long i = 0; i < N; i += B) v.push_back(static_cast<unsigned int>(i));
+    v.push_back(static_cast<unsigned int>(N));
+    return v;
+  };
+  auto blk_m = blocking(Nm, Bm);
+  auto blk_n = blocking(Nn, Bn);
+  auto blk_k = blocking(Nk, Bk);
+
+  TiledRange trange_a({TiledRange1(blk_m.begin(), blk_m.end()),
+                       TiledRange1(blk_k.begin(), blk_k.end())});
+  TiledRange trange_b({TiledRange1(blk_k.begin(), blk_k.end()),
+                       TiledRange1(blk_n.begin(), blk_n.end())});
+  TiledRange trange_c({TiledRange1(blk_m.begin(), blk_m.end()),
+                       TiledRange1(blk_n.begin(), blk_n.end())});
+
+  // Storage of a rows x cols matrix of T in GB; the product is formed in
+  // double so that very large extents cannot overflow integer arithmetic.
+  auto gigabytes = [](long rows, long cols) {
+    return static_cast<double>(rows) * static_cast<double>(cols) *
+           static_cast<double>(sizeof(T)) / 1.0e9;
+  };
+
+  if (world.rank() == 0)
+    std::cout << "TiledArray UMTensor dense matrix multiply\n"
+              << "  Nodes        = " << world.size() << "\n"
+              << "  A            = " << Nm << " x " << Nk << " ("
+              << gigabytes(Nm, Nk) << " GB)\n"
+              << "  B            = " << Nk << " x " << Nn << " ("
+              << gigabytes(Nk, Nn) << " GB)\n"
+              << "  C            = " << Nm << " x " << Nn << " ("
+              << gigabytes(Nm, Nn) << " GB)\n"
+              << "  Tile A,B,C   = " << Bm << "x" << Bk << ", " << Bk << "x"
+              << Bn << ", " << Bm << "x" << Bn << "\n"
+              << "  Iterations   = " << nrepeat << "\n";
+
+  ArrayT a(world, trange_a);
+  ArrayT b(world, trange_b);
+  ArrayT c(world, trange_c);
+
+  const T val_a = T(0.03);
+  const T val_b = T(0.02);
+  a.fill(val_a);
+  b.fill(val_b);
+  world.gop.fence();
+
+  // Prefetch inputs to the device once before the timed loop -- the per-tile
+  // ops will also prefetch lazily, but doing it up front keeps the timing
+  // focused on the GEMM kernel cost.
+  TA::to_device(a);
+  TA::to_device(b);
+
+#ifdef TILEDARRAY_HAS_CUDA
+  cudaProfilerStart();
+#endif
+
+  double total_time = 0.0;
+  double total_gflops = 0.0;
+  for (long i = 0; i < nrepeat; ++i) {
+    const double t0 = madness::wall_time();
+    c("m,n") = a("m,k") * b("k,n");
+    // The fence is inside the timed region so dt covers the whole product.
+    world.gop.fence();
+    const double t1 = madness::wall_time();
+    const double dt = t1 - t0;
+    const double gflops = static_cast<double>(nflops) / (dt * 1.0e9);
+    total_time += dt;
+    total_gflops += gflops;
+    if (world.rank() == 0)
+      std::cout << "  iter " << (i + 1) << "  time=" << dt
+                << " s  gflops=" << gflops << "\n";
+  }
+
+#ifdef TILEDARRAY_HAS_CUDA
+  cudaProfilerStop();
+#endif
+
+  if (world.rank() == 0)
+    std::cout << "  Average time   = " << (total_time / double(nrepeat))
+              << " s\n  Average gflops = " << (total_gflops / double(nrepeat))
+              << "\n";
+
+  // Verify: every result element should be Nk * val_a * val_b.
+  const T expected = T(Nk) * val_a * val_b;
+  const auto eps = std::numeric_limits<TA::detail::scalar_t<T>>::epsilon();
+  // Allow a rounding error that grows with the length Nk of each dot product.
+  const auto tolerance = std::abs(expected) * static_cast<decltype(eps)>(Nk) *
+                         static_cast<decltype(eps)>(8) * eps;
+  TA::to_host(c);
+  bool ok = true;
+  for (auto it = c.begin(); it != c.end(); ++it) {
+    const auto tile = it->get();
+    for (std::size_t k = 0; k < tile.size(); ++k) {
+      if (std::abs(tile.data()[k] - expected) > tolerance) {
+        ok = false;
+        // Reported by whichever rank holds the offending tile; restricting
+        // this to rank 0 would hide mismatches found on the other ranks.
+        std::cout << "  MISMATCH (rank " << world.rank() << ") at tile "
+                  << it.index() << " element " << k << ": got "
+                  << tile.data()[k] << " expected " << expected << "\n";
+        break;
+      }
+    }
+    if (!ok) break;
+  }
+  // The loop above inspects only the tiles this rank iterates over, so the
+  // verdict is combined across ranks before rank 0 reports it. The reduction
+  // is collective: every rank reaches it, mismatch or not.
+  long n_failed_ranks = ok ? 0 : 1;
+  world.gop.sum(n_failed_ranks);
+  if (world.rank() == 0)
+    std::cout << (n_failed_ranks == 0 ? "  Verification PASSED\n"
+                                      : "  Verification FAILED\n");
+}
+
+}  // namespace
+
+/// Initializes TiledArray, parses `Nm Bm Nn Bn Nk Bk [nrepeat=5]` from the
+/// command line and runs the double-precision benchmark.
+///
+/// \return 0 after a completed run; 1 if fewer than six arguments were given
+///         or any parsed value is not positive (rank 0 prints the reason)
+int try_main(int argc, char **argv) {
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
+  const bool is_root = (world.rank() == 0);
+
+  if (argc < 7) {
+    if (is_root) {
+      std::cerr
+          << "Usage: " << argv[0] << " Nm Bm Nn Bn Nk Bk [nrepeat=5]\n"
+          << "  Computes c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn) with UMTensor tiles\n";
+    }
+    return 1;
+  }
+
+  // Positional arguments are read with atol; text that does not start with a
+  // number yields 0 and is rejected by the positivity check below.
+  const auto arg_as_long = [argv](int pos) { return std::atol(argv[pos]); };
+  const long Nm = arg_as_long(1);
+  const long Bm = arg_as_long(2);
+  const long Nn = arg_as_long(3);
+  const long Bn = arg_as_long(4);
+  const long Nk = arg_as_long(5);
+  const long Bk = arg_as_long(6);
+  const long nrepeat = (argc < 8) ? 5 : arg_as_long(7);
+
+  const long inputs[] = {Nm, Nn, Nk, Bm, Bn, Bk, nrepeat};
+  bool all_positive = true;
+  for (const long value : inputs) {
+    if (value <= 0) all_positive = false;
+  }
+  if (!all_positive) {
+    if (is_root) {
+      std::cerr << "All sizes / blocks / nrepeat must be positive\n";
+    }
+    return 1;
+  }
+
+  run<double>(world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+  return 0;
+}
+
+/// Program entry point: forwards to try_main and turns any escaping exception
+/// into a diagnostic on stderr plus exit status 1.
+int main(int argc, char **argv) {
+  int status = 1;  // stays 1 unless try_main returns normally
+  try {
+    status = try_main(argc, argv);
+  } catch (const std::exception &err) {
+    std::cerr << "exception: " << err.what() << "\n";
+  } catch (...) {
+    std::cerr << "unknown exception\n";
+  }
+  return status;
+}
diff --git a/examples/device/ta_vector_um_tensor.cpp b/examples/device/ta_vector_um_tensor.cpp
@@ -0,0 +1,156 @@
+/*
+ *  This file is a part of TiledArray.
+ *  Copyright (C) 2026  Virginia Tech
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *  Ajay Melekamburath
+ *  Department of Chemistry, Virginia Tech
+ */
+
+// Element-wise vector-op benchmarks (add, scale, permute, Hadamard) using
+// the native UMTensor tile type. Companion to ta_vector_device.cpp.
+//
+// Usage:
+//   ta_vector_um_tensor Nm Bm Nn Bn [nrepeat=5]
+//
+// Times each op for nrepeat iterations and reports the average wall time
+// and effective bandwidth (counting one read + one write per element for
+// unary ops such as scale and permute, and two reads + one write for binary
+// ops and for the in-place accumulate c += a).
+
+#include <TiledArray/device/tensor.h>
+#include <tiledarray.h>
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+namespace {
+
+/// Times element-wise operations (add, subtract, scale, Hadamard product,
+/// permute, accumulate) on Nm x Nn DistArrays whose tiles are bare
+/// TA::UMTensor<T>.
+///
+/// \tparam T scalar element type of the tiles
+/// \param world the World in which the arrays are constructed
+/// \param Nm,Nn matrix extents (callers pass values > 0)
+/// \param Bm,Bn target tile extents (callers pass values > 0); an extent need
+///        not be a multiple of its tile size, in which case the trailing tile
+///        is narrower
+/// \param nrepeat number of timed repetitions per operation (callers pass a
+///        value > 0)
+///
+/// For every operation rank 0 prints the average wall time and the effective
+/// bandwidth derived from a nominal per-element access count.
+template <typename T>
+void run(TiledArray::World &world, long Nm, long Bm, long Nn, long Bn,
+         long nrepeat) {
+  using TA::DistArray;
+  using TA::TiledRange;
+  using TA::TiledRange1;
+  using TA::UMTensor;
+  using TileT = UMTensor<T>;
+  using ArrayT = DistArray<TileT, TA::DensePolicy>;
+
+  // Tile boundaries 0, B, 2B, ... always terminated by N itself, so the tiled
+  // range spans exactly N elements even when B does not divide N (narrower
+  // trailing tile) or when B exceeds N (a single tile). The bandwidth figures
+  // below assume the arrays really hold Nm * Nn elements.
+  auto blocking = [](long N, long B) {
+    std::vector<unsigned int> v;
+    for (long i = 0; i < N; i += B) v.push_back(static_cast<unsigned int>(i));
+    v.push_back(static_cast<unsigned int>(N));
+    return v;
+  };
+  auto blk_m = blocking(Nm, Bm);
+  auto blk_n = blocking(Nn, Bn);
+
+  TiledRange trange({TiledRange1(blk_m.begin(), blk_m.end()),
+                     TiledRange1(blk_n.begin(), blk_n.end())});
+  TiledRange trange_T({TiledRange1(blk_n.begin(), blk_n.end()),
+                       TiledRange1(blk_m.begin(), blk_m.end())});
+
+  const double bytes_per_elem = static_cast<double>(sizeof(T));
+  const double n_elems = static_cast<double>(Nm) * static_cast<double>(Nn);
+
+  // The size is formed in double so that very large extents cannot overflow
+  // integer arithmetic.
+  if (world.rank() == 0)
+    std::cout << "TiledArray UMTensor vector-op benchmark\n"
+              << "  Nodes        = " << world.size() << "\n"
+              << "  Matrix       = " << Nm << " x " << Nn << " ("
+              << n_elems * bytes_per_elem / 1.0e9 << " GB)\n"
+              << "  Tile         = " << Bm << " x " << Bn << "\n"
+              << "  Iterations   = " << nrepeat << "\n";
+
+  ArrayT a(world, trange);
+  ArrayT b(world, trange);
+  ArrayT c(world, trange);
+  ArrayT t(world, trange_T);  // transposed-shape result for permute test
+
+  a.fill(T(0.03));
+  b.fill(T(0.02));
+  c.fill(T(0.0));
+  t.fill(T(0.0));
+  world.gop.fence();
+  TA::to_device(a);
+  TA::to_device(b);
+
+  // Runs `op` nrepeat times, fencing inside the timed region each time, and
+  // lets rank 0 print the average time and bytes_per_iter / average time.
+  auto bench = [&](const char *name, double bytes_per_iter, auto &&op) {
+    double total_time = 0.0;
+    for (long i = 0; i < nrepeat; ++i) {
+      const double t0 = madness::wall_time();
+      op();
+      world.gop.fence();
+      const double t1 = madness::wall_time();
+      total_time += t1 - t0;
+    }
+    const double avg = total_time / static_cast<double>(nrepeat);
+    const double bw_gbs = bytes_per_iter / (avg * 1.0e9);
+    if (world.rank() == 0)
+      std::cout << "  " << name << ":  avg=" << avg << " s  bw=" << bw_gbs
+                << " GB/s\n";
+  };
+
+  // Binary read-read-write: 3 element accesses per element.
+  const double rw3_bytes = 3.0 * n_elems * bytes_per_elem;
+  // Unary read-write: 2 element accesses per element.
+  const double rw2_bytes = 2.0 * n_elems * bytes_per_elem;
+
+  bench("add(c=a+b)", rw3_bytes, [&] { c("m,n") = a("m,n") + b("m,n"); });
+  bench("subt(c=a-b)", rw3_bytes, [&] { c("m,n") = a("m,n") - b("m,n"); });
+  bench("scale(c=2*a)", rw2_bytes, [&] { c("m,n") = 2.0 * a("m,n"); });
+  bench("hadamard(c=a*b)", rw3_bytes, [&] { c("m,n") = a("m,n") * b("m,n"); });
+  bench("permute(t=a^T)", rw2_bytes, [&] { t("n,m") = a("m,n"); });
+  // Accumulate reads c and a and writes c, hence the 3-access count.
+  bench("axpy(c+=a)", rw3_bytes, [&] { c("m,n") += a("m,n"); });
+
+  world.gop.fence();
+}
+
+}  // namespace
+
+/// Initializes TiledArray, parses `Nm Bm Nn Bn [nrepeat=5]` from the command
+/// line and runs the double-precision vector-op benchmark.
+///
+/// \return 0 after a completed run; 1 if fewer than four arguments were given
+///         or any parsed value is not positive (rank 0 prints the reason)
+int try_main(int argc, char **argv) {
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
+  const bool is_root = (world.rank() == 0);
+
+  if (argc < 5) {
+    if (is_root) {
+      std::cerr
+          << "Usage: " << argv[0] << " Nm Bm Nn Bn [nrepeat=5]\n"
+          << "  Times element-wise vector ops on Nm x Nn UMTensor matrices\n";
+    }
+    return 1;
+  }
+
+  // Positional arguments are read with atol; text that does not start with a
+  // number yields 0 and is rejected by the positivity check below.
+  const auto arg_as_long = [argv](int pos) { return std::atol(argv[pos]); };
+  const long Nm = arg_as_long(1);
+  const long Bm = arg_as_long(2);
+  const long Nn = arg_as_long(3);
+  const long Bn = arg_as_long(4);
+  const long nrepeat = (argc < 6) ? 5 : arg_as_long(5);
+
+  const long inputs[] = {Nm, Nn, Bm, Bn, nrepeat};
+  bool all_positive = true;
+  for (const long value : inputs) {
+    if (value <= 0) all_positive = false;
+  }
+  if (!all_positive) {
+    if (is_root) {
+      std::cerr << "All sizes / blocks / nrepeat must be positive\n";
+    }
+    return 1;
+  }
+
+  run<double>(world, Nm, Bm, Nn, Bn, nrepeat);
+  return 0;
+}
+
+/// Program entry point: forwards to try_main and turns any escaping exception
+/// into a diagnostic on stderr plus exit status 1.
+int main(int argc, char **argv) {
+  int status = 1;  // stays 1 unless try_main returns normally
+  try {
+    status = try_main(argc, argv);
+  } catch (const std::exception &err) {
+    std::cerr << "exception: " << err.what() << "\n";
+  } catch (...) {
+    std::cerr << "unknown exception\n";
+  }
+  return status;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -234,6 +234,7 @@ if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA)
           TiledArray/device/blas.h
           TiledArray/device/btas.h
           TiledArray/device/btas_um_tensor.h
+          TiledArray/device/tensor.h
           TiledArray/device/device_task_fn.h
           TiledArray/device/kernel/mult_kernel.h
           TiledArray/device/kernel/reduce_kernel.h
@@ -267,6 +268,7 @@ if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)
 
   set(TILEDARRAY_DEVICE_SOURCE_FILES
           TiledArray/device/btas_um_tensor.cpp
+          TiledArray/device/tensor.cpp
   )
 
   if(TILEDARRAY_HAS_CUDA)
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,3 +43,7 @@ @@
     # IDEs
     *.idea
     *.vscode
+    build/*
+    cmake-build*