Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ff878bb
device: scaffold UMTensor type + is_device_tile specialization
ajay-mk May 12, 2026
20a35bb
device: tier-1 tile-op overloads for UMTensor
ajay-mk May 12, 2026
f8e870e
device: add permute / shift / mult + perm-variants for UMTensor
ajay-mk May 12, 2026
ca5c9df
device: fix in-place op dispatch + correct scale semantics, add expre…
ajay-mk May 12, 2026
d7add87
device: expand UMTensor expression tests (in-place ops, blocks, einsu…
ajay-mk May 12, 2026
c53eab4
device: add UMTensor archive support + array-level helpers
ajay-mk May 12, 2026
32f6131
device: explicit instantiations of UMTensor for standard numeric types
ajay-mk May 12, 2026
4dc2eeb
device: add UMTensor dense + vector example programs
ajay-mk May 12, 2026
f0c6ec7
device: drop redundant compile_test_tier1 instantiation probe
ajay-mk May 13, 2026
a00c93a
device: drop unused UMTensorArg concept; reword scaffolding comments
ajay-mk May 13, 2026
1ad8c69
chore: update .gitignore to avoid build directories
ajay-mk May 13, 2026
05d7e55
device: update host-device conversion helpers
ajay-mk May 13, 2026
ee31fa9
device: constrain UMTensor tile ops to numeric element types
ajay-mk May 13, 2026
bab98d3
refactor: move static_asserts to tests and fixup comments in device/t…
ajay-mk May 21, 2026
6e570cd
device: tidy UMTensor tile-op overloads (requires-only constraints, T…
ajay-mk May 21, 2026
3a6039a
test: expand and cleanup UMTensor expression coverage; tighten tolera…
ajay-mk May 21, 2026
69611e4
Merge remote-tracking branch 'origin/master' into ajay/feature/umtensor
ajay-mk May 21, 2026
2b5137f
device: add missing congruence asserts to UMTensor gemm overloads
ajay-mk May 21, 2026
3fabd11
device: sync after async to_host prefetch in UM->host paths
ajay-mk May 21, 2026
ba5efa2
device: small UMTensor cleanups
ajay-mk May 21, 2026
7730bb4
chore: reformat source files
ajay-mk May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,7 @@
# IDEs
*.idea
*.vscode


build/*
cmake-build*
2 changes: 1 addition & 1 deletion examples/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)

foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device)
foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device ta_dense_um_tensor ta_vector_um_tensor)

# Add executable
add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")
Expand Down
203 changes: 203 additions & 0 deletions examples/device/ta_dense_um_tensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*
* This file is a part of TiledArray.
* Copyright (C) 2026 Virginia Tech
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Ajay Melekamburath
* Department of Chemistry, Virginia Tech
*/

// Dense matrix-multiply benchmark using the native UMTensor tile type
// (TA::Tensor backed by device_um_allocator). Companion to the btas-based
// ta_dense_device.cpp; same shape + reporting, but the tile is bare
// `UMTensor<T>` -- no `TA::Tile<>` wrapper -- and the data flows through
// the device tile-op overloads in src/TiledArray/device/tensor.h.
//
// Usage:
// ta_dense_um_tensor Nm Bm Nn Bn Nk Bk [nrepeat=5]
//
// Computes c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn) with each dimension blocked by
// Bm/Bn/Bk. Default scalar type is double; nrepeat iterations are timed
// for an average GFLOPS reading.

#include <TiledArray/device/tensor.h>
#include <tiledarray.h>

#ifdef TILEDARRAY_HAS_CUDA
#include <cuda_profiler_api.h>
#endif

#include <cstdint>
#include <iostream>
#include <vector>

namespace {

/// Dense matrix-multiply benchmark on arrays tiled with `TA::UMTensor<T>`.
///
/// Builds a(Nm,Nk), b(Nk,Nn) and c(Nm,Nn), times `nrepeat` evaluations of
/// c("m,n") = a("m,k") * b("k,n"), prints per-iteration and average timings
/// on rank 0, and finally checks every locally held element of c against the
/// analytic value Nk * val_a * val_b.
///
/// \tparam T tile element type
/// \param world world in which the arrays are created
/// \param Nm extent of the m dimension (rows of a and c)
/// \param Bm tile extent along m
/// \param Nn extent of the n dimension (columns of b and c)
/// \param Bn tile extent along n
/// \param Nk extent of the contracted k dimension
/// \param Bk tile extent along k
/// \param nrepeat number of timed iterations
///
/// All extents, tile extents and `nrepeat` are expected to be positive (the
/// caller validates this). An extent that is not a multiple of its tile
/// extent gets a smaller trailing tile so that the arrays always span the
/// requested sizes.
template <typename T>
void run(TiledArray::World &world, long Nm, long Bm, long Nn, long Bn, long Nk,
         long Bk, long nrepeat) {
  using TA::DistArray;
  using TA::TiledRange;
  using TA::TiledRange1;
  using TA::UMTensor;
  using TileT = UMTensor<T>;
  using ArrayT = DistArray<TileT, TA::DensePolicy>;

  constexpr bool complex_T = TA::detail::is_complex_v<T>;
  // GEMM flops: 2 * M * N * K for real element types, 8 * M * N * K for
  // complex ones.
  const std::int64_t nflops =
      (complex_T ? 8 : 2) * static_cast<std::int64_t>(Nm) *
      static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nk);

  // Tile boundaries 0, B, 2B, ... closed by N itself. Always ending at N
  // keeps the tiled extent equal to the requested extent even when N is not a
  // multiple of B (or B > N); otherwise the array would silently shrink while
  // the flop count and the verification below still assume the full extent.
  auto blocking = [](long N, long B) {
    std::vector<unsigned int> v;
    for (long i = 0; i < N; i += B) v.push_back(static_cast<unsigned int>(i));
    v.push_back(static_cast<unsigned int>(N));
    return v;
  };
  auto blk_m = blocking(Nm, Bm);
  auto blk_n = blocking(Nn, Bn);
  auto blk_k = blocking(Nk, Bk);

  TiledRange trange_a({TiledRange1(blk_m.begin(), blk_m.end()),
                       TiledRange1(blk_k.begin(), blk_k.end())});
  TiledRange trange_b({TiledRange1(blk_k.begin(), blk_k.end()),
                       TiledRange1(blk_n.begin(), blk_n.end())});
  TiledRange trange_c({TiledRange1(blk_m.begin(), blk_m.end()),
                       TiledRange1(blk_n.begin(), blk_n.end())});

  if (world.rank() == 0)
    std::cout << "TiledArray UMTensor dense matrix multiply\n"
              << " Nodes = " << world.size() << "\n"
              << " A = " << Nm << " x " << Nk << " ("
              << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)\n"
              << " B = " << Nk << " x " << Nn << " ("
              << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)\n"
              << " C = " << Nm << " x " << Nn << " ("
              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)\n"
              << " Tile A,B,C = " << Bm << "x" << Bk << ", " << Bk << "x"
              << Bn << ", " << Bm << "x" << Bn << "\n"
              << " Iterations = " << nrepeat << "\n";

  ArrayT a(world, trange_a);
  ArrayT b(world, trange_b);
  ArrayT c(world, trange_c);

  const T val_a = T(0.03);
  const T val_b = T(0.02);
  a.fill(val_a);
  b.fill(val_b);
  world.gop.fence();

  // Prefetch inputs to the device once before the timed loop -- the per-tile
  // ops will also prefetch lazily, but doing it up front keeps the timing
  // focused on the GEMM kernel cost.
  TA::to_device(a);
  TA::to_device(b);

#ifdef TILEDARRAY_HAS_CUDA
  cudaProfilerStart();
#endif

  double total_time = 0.0;
  double total_gflops = 0.0;
  for (long i = 0; i < nrepeat; ++i) {
    const double t0 = madness::wall_time();
    c("m,n") = a("m,k") * b("k,n");
    // The fence makes the measured interval cover the whole evaluation.
    world.gop.fence();
    const double t1 = madness::wall_time();
    const double dt = t1 - t0;
    const double gflops = static_cast<double>(nflops) / (dt * 1.0e9);
    total_time += dt;
    total_gflops += gflops;
    if (world.rank() == 0)
      std::cout << " iter " << (i + 1) << " time=" << dt
                << " s gflops=" << gflops << "\n";
  }

#ifdef TILEDARRAY_HAS_CUDA
  cudaProfilerStop();
#endif

  if (world.rank() == 0)
    std::cout << " Average time = " << (total_time / double(nrepeat))
              << " s\n Average gflops = " << (total_gflops / double(nrepeat))
              << "\n";

  // Verify: every result element should be Nk * val_a * val_b.
  const T expected = T(Nk) * val_a * val_b;
  const auto eps = std::numeric_limits<TA::detail::scalar_t<T>>::epsilon();
  // Allow a rounding error that grows linearly with the contraction length.
  const auto tolerance = std::abs(expected) * static_cast<decltype(eps)>(Nk) *
                         static_cast<decltype(eps)>(8) * eps;
  TA::to_host(c);

  // Each rank inspects only the tiles it iterates over, so the outcome is
  // first recorded per rank and then summed over all ranks; a mismatch seen
  // anywhere must fail the run. Only the first mismatch per rank is printed.
  bool local_ok = true;
  for (auto it = c.begin(); it != c.end(); ++it) {
    const auto tile = it->get();
    for (std::size_t k = 0; k < tile.size(); ++k) {
      if (std::abs(tile.data()[k] - expected) > tolerance) {
        local_ok = false;
        // Report from the detecting rank: rank 0 never sees remote tiles.
        std::cout << " MISMATCH on rank " << world.rank() << " at tile "
                  << it.index() << " element " << k << ": got "
                  << tile.data()[k] << " expected " << expected << "\n";
        break;
      }
    }
    if (!local_ok) break;
  }
  long num_failed_ranks = local_ok ? 0 : 1;
  world.gop.sum(&num_failed_ranks, 1);
  const bool ok = (num_failed_ranks == 0);
  if (world.rank() == 0)
    std::cout << (ok ? " Verification PASSED\n" : " Verification FAILED\n");
}

}  // namespace

// Initializes TiledArray, reads the problem sizes from the command line and
// runs the double-precision dense benchmark.
//
// Expected arguments: Nm Bm Nn Bn Nk Bk [nrepeat=5]
// Returns 0 on success and 1 when the arguments are missing or not positive.
int try_main(int argc, char **argv) {
  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);

  // Six positional arguments are mandatory; the seventh is optional.
  if (argc < 7) {
    if (world.rank() == 0) {
      std::cerr
          << "Usage: " << argv[0] << " Nm Bm Nn Bn Nk Bk [nrepeat=5]\n"
          << " Computes c(Nm,Nn) = a(Nm,Nk) * b(Nk,Nn) with UMTensor tiles\n";
    }
    return 1;
  }

  const auto parse_arg = [argv](int position) {
    return std::atol(argv[position]);
  };
  const long Nm = parse_arg(1);
  const long Bm = parse_arg(2);
  const long Nn = parse_arg(3);
  const long Bn = parse_arg(4);
  const long Nk = parse_arg(5);
  const long Bk = parse_arg(6);
  long nrepeat = 5;
  if (argc >= 8) nrepeat = parse_arg(7);

  // std::atol yields 0 for non-numeric text, so this also rejects garbage.
  const bool all_positive = Nm > 0 && Nn > 0 && Nk > 0 && Bm > 0 && Bn > 0 &&
                            Bk > 0 && nrepeat > 0;
  if (!all_positive) {
    if (world.rank() == 0) {
      std::cerr << "All sizes / blocks / nrepeat must be positive\n";
    }
    return 1;
  }

  run<double>(world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
  return 0;
}

// Program entry point: forwards to try_main and converts any escaping
// exception into a message on stderr plus a non-zero exit status.
int main(int argc, char **argv) {
  int exit_code = 1;  // stays 1 unless try_main returns normally
  try {
    exit_code = try_main(argc, argv);
  } catch (const std::exception &err) {
    std::cerr << "exception: " << err.what() << "\n";
  } catch (...) {
    std::cerr << "unknown exception\n";
  }
  return exit_code;
}
156 changes: 156 additions & 0 deletions examples/device/ta_vector_um_tensor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/*
* This file is a part of TiledArray.
* Copyright (C) 2026 Virginia Tech
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Ajay Melekamburath
* Department of Chemistry, Virginia Tech
*/

// Element-wise vector-op benchmarks (add, scale, permute, Hadamard) using
// the native UMTensor tile type. Companion to ta_vector_device.cpp.
//
// Usage:
// ta_vector_um_tensor Nm Bm Nn Bn [nrepeat=5]
//
// Times each op for nrepeat iterations and reports the average wall time
// and effective bandwidth (counting one read + one write per element for
// unary ops such as scale and permute, and two reads + one write per element
// for binary ops and for the in-place accumulate c += a).

#include <TiledArray/device/tensor.h>
#include <tiledarray.h>

#include <cstdint>
#include <iostream>
#include <vector>

namespace {

/// Element-wise operation benchmark on Nm x Nn arrays tiled with
/// `TA::UMTensor<T>`.
///
/// Times add, subtract, scale, Hadamard product, permutation (transpose) and
/// in-place accumulate, each for `nrepeat` iterations, and prints the average
/// wall time and effective bandwidth of every operation on rank 0.
///
/// \tparam T tile element type
/// \param world world in which the arrays are created
/// \param Nm extent of the first dimension
/// \param Bm tile extent along the first dimension
/// \param Nn extent of the second dimension
/// \param Bn tile extent along the second dimension
/// \param nrepeat number of timed iterations per operation
///
/// All extents, tile extents and `nrepeat` are expected to be positive (the
/// caller validates this). An extent that is not a multiple of its tile
/// extent gets a smaller trailing tile so that the arrays always span the
/// requested sizes.
template <typename T>
void run(TiledArray::World &world, long Nm, long Bm, long Nn, long Bn,
         long nrepeat) {
  using TA::DistArray;
  using TA::TiledRange;
  using TA::TiledRange1;
  using TA::UMTensor;
  using TileT = UMTensor<T>;
  using ArrayT = DistArray<TileT, TA::DensePolicy>;

  // Tile boundaries 0, B, 2B, ... closed by N itself. Always ending at N
  // keeps the tiled extent equal to the requested extent even when N is not a
  // multiple of B (or B > N); otherwise the arrays would silently shrink and
  // the bandwidth below, which is derived from Nm * Nn, would be overstated.
  auto blocking = [](long N, long B) {
    std::vector<unsigned int> v;
    for (long i = 0; i < N; i += B) v.push_back(static_cast<unsigned int>(i));
    v.push_back(static_cast<unsigned int>(N));
    return v;
  };
  auto blk_m = blocking(Nm, Bm);
  auto blk_n = blocking(Nn, Bn);

  TiledRange trange({TiledRange1(blk_m.begin(), blk_m.end()),
                     TiledRange1(blk_n.begin(), blk_n.end())});
  TiledRange trange_T({TiledRange1(blk_n.begin(), blk_n.end()),
                       TiledRange1(blk_m.begin(), blk_m.end())});

  if (world.rank() == 0)
    std::cout << "TiledArray UMTensor vector-op benchmark\n"
              << " Nodes = " << world.size() << "\n"
              << " Matrix = " << Nm << " x " << Nn << " ("
              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)\n"
              << " Tile = " << Bm << " x " << Bn << "\n"
              << " Iterations = " << nrepeat << "\n";

  ArrayT a(world, trange);
  ArrayT b(world, trange);
  ArrayT c(world, trange);
  ArrayT t(world, trange_T);  // transposed-shape result for permute test

  a.fill(T(0.03));
  b.fill(T(0.02));
  c.fill(T(0.0));
  t.fill(T(0.0));
  world.gop.fence();
  TA::to_device(a);
  TA::to_device(b);

  const double bytes_per_elem = static_cast<double>(sizeof(T));
  const double n_elems = static_cast<double>(Nm) * static_cast<double>(Nn);

  // Runs `op` nrepeat times, fencing after each run so the measured interval
  // covers the whole evaluation, then reports the average time and the
  // bandwidth implied by `bytes_per_iter`.
  auto bench = [&](const char *name, double bytes_per_iter, auto &&op) {
    double total_time = 0.0;
    for (long i = 0; i < nrepeat; ++i) {
      const double t0 = madness::wall_time();
      op();
      world.gop.fence();
      const double t1 = madness::wall_time();
      total_time += t1 - t0;
    }
    const double avg = total_time / static_cast<double>(nrepeat);
    const double bw_gbs = bytes_per_iter / (avg * 1.0e9);
    if (world.rank() == 0)
      std::cout << " " << name << ": avg=" << avg << " s bw=" << bw_gbs
                << " GB/s\n";
  };

  // Binary read-read-write: 3 element accesses per element.
  const double rw3_bytes = 3.0 * n_elems * bytes_per_elem;
  // Unary read-write: 2 element accesses per element.
  const double rw2_bytes = 2.0 * n_elems * bytes_per_elem;

  bench("add(c=a+b)", rw3_bytes, [&] { c("m,n") = a("m,n") + b("m,n"); });
  bench("subt(c=a-b)", rw3_bytes, [&] { c("m,n") = a("m,n") - b("m,n"); });
  bench("scale(c=2*a)", rw2_bytes, [&] { c("m,n") = 2.0 * a("m,n"); });
  bench("hadamard(c=a*b)", rw3_bytes, [&] { c("m,n") = a("m,n") * b("m,n"); });
  bench("permute(t=a^T)", rw2_bytes, [&] { t("n,m") = a("m,n"); });
  // c += a reads c and a and writes c, hence three accesses per element.
  bench("axpy(c+=a)", rw3_bytes, [&] { c("m,n") += a("m,n"); });

  world.gop.fence();
}

}  // namespace

// Initializes TiledArray, reads the problem sizes from the command line and
// runs the double-precision vector-op benchmark.
//
// Expected arguments: Nm Bm Nn Bn [nrepeat=5]
// Returns 0 on success and 1 when the arguments are missing or not positive.
int try_main(int argc, char **argv) {
  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);

  // Four positional arguments are mandatory; the fifth is optional.
  if (argc < 5) {
    if (world.rank() == 0) {
      std::cerr
          << "Usage: " << argv[0] << " Nm Bm Nn Bn [nrepeat=5]\n"
          << " Times element-wise vector ops on Nm x Nn UMTensor matrices\n";
    }
    return 1;
  }

  const auto parse_arg = [argv](int position) {
    return std::atol(argv[position]);
  };
  const long Nm = parse_arg(1);
  const long Bm = parse_arg(2);
  const long Nn = parse_arg(3);
  const long Bn = parse_arg(4);
  long nrepeat = 5;
  if (argc >= 6) nrepeat = parse_arg(5);

  // std::atol yields 0 for non-numeric text, so this also rejects garbage.
  const bool all_positive =
      Nm > 0 && Nn > 0 && Bm > 0 && Bn > 0 && nrepeat > 0;
  if (!all_positive) {
    if (world.rank() == 0) {
      std::cerr << "All sizes / blocks / nrepeat must be positive\n";
    }
    return 1;
  }

  run<double>(world, Nm, Bm, Nn, Bn, nrepeat);
  return 0;
}

// Program entry point: forwards to try_main and converts any escaping
// exception into a message on stderr plus a non-zero exit status.
int main(int argc, char **argv) {
  int exit_code = 1;  // stays 1 unless try_main returns normally
  try {
    exit_code = try_main(argc, argv);
  } catch (const std::exception &err) {
    std::cerr << "exception: " << err.what() << "\n";
  } catch (...) {
    std::cerr << "unknown exception\n";
  }
  return exit_code;
}
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA)
TiledArray/device/blas.h
TiledArray/device/btas.h
TiledArray/device/btas_um_tensor.h
TiledArray/device/tensor.h
TiledArray/device/device_task_fn.h
TiledArray/device/kernel/mult_kernel.h
TiledArray/device/kernel/reduce_kernel.h
Expand Down Expand Up @@ -267,6 +268,7 @@ if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)

set(TILEDARRAY_DEVICE_SOURCE_FILES
TiledArray/device/btas_um_tensor.cpp
TiledArray/device/tensor.cpp
)

if(TILEDARRAY_HAS_CUDA)
Expand Down
Loading
Loading