From 6811a6035bf9cc4a04d1eaab61e91d0e3f8a1506 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Mon, 23 Feb 2026 11:35:15 +0100
Subject: [PATCH 1/5] Add tile-indexed methods for existing atomic operations

---
 src/compiler/intrinsics/atomics.jl | 116 ++++++++++++++++++++++
 src/language/atomics.jl            | 147 ++++++++++++++++++++++++++++
 test/codegen/operations.jl         |  58 +++++++++++
 test/execution/atomics.jl          | 148 +++++++++++++++++++++++++++++
 4 files changed, 469 insertions(+)

diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl
index 9c480bf..2a84539 100644
--- a/src/compiler/intrinsics/atomics.jl
+++ b/src/compiler/intrinsics/atomics.jl
@@ -177,3 +177,119 @@ efunc(::typeof(Intrinsics.atomic_add), effects::CC.Effects) =
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args)
     emit_atomic_rmw!(ctx, args, AtomicADD)
 end
+
+# ============================================================================
+# Tile-indexed atomic operations
+# These take pre-computed pointer tiles, value tiles, and masks.
+# Used by the public API for tile-indexed atomic operations.
+# ============================================================================
+
+# Shared codegen helper for tile-indexed atomic RMW operations
+function emit_atomic_rmw_tile!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode)
+    cb = ctx.cb
+    tt = ctx.tt
+
+    # args: (ptr_tile, val, mask, memory_order, memory_scope)
+    ptr_tv = emit_value!(ctx, args[1])
+    ptr_tv === nothing && throw(IRError("tile-indexed atomic RMW requires ptr_tile"))
+    val_tv = emit_value!(ctx, args[2])
+    val_tv === nothing && throw(IRError("tile-indexed atomic RMW requires value"))
+    mask_tv = emit_value!(ctx, args[3])
+    mask_tv === nothing && throw(IRError("tile-indexed atomic RMW requires mask"))
+
+    memory_order = @something get_constant(ctx, args[4]) throw(IRError("tile-indexed atomic RMW requires constant memory_order"))
+    memory_scope = @something get_constant(ctx, args[5]) throw(IRError("tile-indexed atomic RMW requires constant memory_scope"))
+
+    shape = val_tv.shape
+    elem_type = eltype(val_tv.jltype)
+
+    dtype = julia_to_tile_dtype!(tt, elem_type)
+    result_tile_type = tile_type!(tt, dtype, collect(shape))
+    token_type = Token(tt)
+
+    # Auto-promote integer ADD to float ADD for floating-point types
+    actual_mode = mode
+    if mode == AtomicADD && elem_type <: AbstractFloat
+        actual_mode = AtomicADDF
+    end
+
+    mem_ordering = memory_order_to_semantics(memory_order)
+    mem_scope = memory_scope_to_scope(memory_scope)
+
+    old_val, new_token = encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type,
+                                                 ptr_tv.v, val_tv.v, actual_mode;
+                                                 mask=mask_tv.v,
+                                                 token=ctx.token,
+                                                 memory_ordering=mem_ordering,
+                                                 memory_scope=mem_scope)
+    ctx.token = new_token
+
+    CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape))
+end
+
+# Tile-indexed atomic exchange
+@intrinsic atomic_xchg_tile(ptr_tile, val, mask, memory_order, memory_scope)
+function tfunc(𝕃, ::typeof(Intrinsics.atomic_xchg_tile), @nospecialize(ptrs), @nospecialize(val), @nospecialize args...)
+    CC.widenconst(val)
+end
+efunc(::typeof(Intrinsics.atomic_xchg_tile), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg_tile), args)
+    emit_atomic_rmw_tile!(ctx, args, AtomicXCHG)
+end
+
+# Tile-indexed atomic addition
+@intrinsic atomic_add_tile(ptr_tile, val, mask, memory_order, memory_scope)
+function tfunc(𝕃, ::typeof(Intrinsics.atomic_add_tile), @nospecialize(ptrs), @nospecialize(val), @nospecialize args...)
+    CC.widenconst(val)
+end
+efunc(::typeof(Intrinsics.atomic_add_tile), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add_tile), args)
+    emit_atomic_rmw_tile!(ctx, args, AtomicADD)
+end
+
+# Tile-indexed atomic compare-and-swap
+@intrinsic atomic_cas_tile(ptr_tile, expected, desired, mask, memory_order, memory_scope)
+function tfunc(𝕃, ::typeof(Intrinsics.atomic_cas_tile), @nospecialize(ptrs), @nospecialize(expected), @nospecialize args...)
+    CC.widenconst(expected)
+end
+efunc(::typeof(Intrinsics.atomic_cas_tile), effects::CC.Effects) =
+    CC.Effects(effects; effect_free=CC.ALWAYS_FALSE)
+function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas_tile), args)
+    cb = ctx.cb
+    tt = ctx.tt
+
+    # args: (ptr_tile, expected, desired, mask, memory_order, memory_scope)
+    ptr_tv = emit_value!(ctx, args[1])
+    ptr_tv === nothing && throw(IRError("tile-indexed atomic CAS requires ptr_tile"))
+    expected_tv = emit_value!(ctx, args[2])
+    expected_tv === nothing && throw(IRError("tile-indexed atomic CAS requires expected value"))
+    desired_tv = emit_value!(ctx, args[3])
+    desired_tv === nothing && throw(IRError("tile-indexed atomic CAS requires desired value"))
+    mask_tv = emit_value!(ctx, args[4])
+    mask_tv === nothing && throw(IRError("tile-indexed atomic CAS requires mask"))
+
+    memory_order = @something get_constant(ctx, args[5]) throw(IRError("tile-indexed atomic CAS requires constant memory_order"))
+    memory_scope = @something get_constant(ctx, args[6]) throw(IRError("tile-indexed atomic CAS requires constant memory_scope"))
+
+    shape = expected_tv.shape
+    elem_type = eltype(expected_tv.jltype)
+
+    dtype = julia_to_tile_dtype!(tt, elem_type)
+    result_tile_type = tile_type!(tt, dtype, collect(shape))
+    token_type = Token(tt)
+
+    mem_ordering = memory_order_to_semantics(memory_order)
+    mem_scope = memory_scope_to_scope(memory_scope)
+
+    old_val, new_token = encode_AtomicCASPtrOp!(cb, result_tile_type, token_type,
+                                                 ptr_tv.v, expected_tv.v, desired_tv.v;
+                                                 mask=mask_tv.v,
+                                                 token=ctx.token,
+                                                 memory_ordering=mem_ordering,
+                                                 memory_scope=mem_scope)
+    ctx.token = new_token
+
+    CGVal(old_val, result_tile_type, Tile{elem_type, Tuple{shape...}}, collect(shape))
+end
diff --git a/src/language/atomics.jl b/src/language/atomics.jl
index 5405449..c3ee839 100644
--- a/src/language/atomics.jl
+++ b/src/language/atomics.jl
@@ -80,3 +80,150 @@ old_val = ct.atomic_add(counters, idx, Int32(1))
                             memory_scope::Int=MemScope.Device) where {T}
     Intrinsics.atomic_add(array, index - One(), val, memory_order, memory_scope)
 end
+
+# ============================================================================
+# Tile-indexed atomic operations (scatter-gather style indexing)
+# These accept Tile indices to perform atomic operations on multiple elements.
+# ============================================================================
+
+# --- Pointer/mask helpers (same pattern as gather/scatter in operations.jl) ---
+
+@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{I}) where {T, I <: Integer}
+    indices_0 = indices .- one(I)
+    indices_i32 = convert(Tile{Int32}, indices_0)
+    ptr_tile = Intrinsics.offset(array.ptr, indices_i32)
+    zero_0d = Tile(Int32(0))
+    size_0d = Tile(size(array, 1))
+    mask = (indices_i32 .>= zero_0d) .& (indices_i32 .< size_0d)
+    (ptr_tile, mask, size(indices))
+end
+
+@inline function _atomic_ptrs_mask(array::TileArray{T, 2},
+                                    indices::Tuple{Tile{I0}, Tile{I1}}) where {T, I0 <: Integer, I1 <: Integer}
+    idx0_0 = indices[1] .- one(I0)
+    idx1_0 = indices[2] .- one(I1)
+
+    S = broadcast_shape(size(indices[1]), size(indices[2]))
+    idx0_bc = broadcast_to(idx0_0, S)
+    idx1_bc = broadcast_to(idx1_0, S)
+
+    idx0_i32 = convert(Tile{Int32}, idx0_bc)
+    idx1_i32 = convert(Tile{Int32}, idx1_bc)
+
+    stride0_0d = Tile(array.strides[1])
+    stride1_0d = Tile(array.strides[2])
+    stride0 = broadcast_to(stride0_0d, S)
+    stride1 = broadcast_to(stride1_0d, S)
+
+    linear_idx = idx0_i32 .* stride0 + idx1_i32 .* stride1
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+
+    zero_0d = Tile(Int32(0))
+    zero_bc = broadcast_to(zero_0d, S)
+    size0_bc = broadcast_to(Tile(size(array, 1)), S)
+    size1_bc = broadcast_to(Tile(size(array, 2)), S)
+
+    mask0 = (idx0_i32 .>= zero_bc) .& (idx0_i32 .< size0_bc)
+    mask1 = (idx1_i32 .>= zero_bc) .& (idx1_i32 .< size1_bc)
+    mask = mask0 .& mask1
+
+    (ptr_tile, mask, S)
+end
+
+# --- RMW operations (atomic_add, atomic_xchg) ---
+
+const _ATOMIC_RMW_OPS = (
+    (:add,  :atomic_add_tile),
+    (:xchg, :atomic_xchg_tile),
+)
+
+for (op, intrinsic) in _ATOMIC_RMW_OPS
+    fname = Symbol(:atomic_, op)
+
+    # 1D with scalar value
+    @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::T;
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T, I <: Integer}
+        ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+        val_tile = broadcast_to(Tile(val), S)
+        Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope)
+    end
+
+    # 1D with tile value
+    @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::Tile{T};
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T, I <: Integer}
+        ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices)
+        Intrinsics.$intrinsic(ptr_tile, val, mask, memory_order, memory_scope)
+    end
+
+    # 2D with scalar value
+    @eval @inline function $fname(array::TileArray{T, 2},
+                                   indices::Tuple{Tile{I0}, Tile{I1}}, val::T;
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
+        ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+        val_tile = broadcast_to(Tile(val), S)
+        Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope)
+    end
+
+    # 2D with tile value
+    @eval @inline function $fname(array::TileArray{T, 2},
+                                   indices::Tuple{Tile{I0}, Tile{I1}}, val::Tile{T};
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
+        ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+        val_bc = broadcast_to(val, S)
+        Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope)
+    end
+end
+
+# --- CAS operations (separate due to different signature) ---
+
+# 1D with scalar expected/desired
+@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I},
+                            expected::T, desired::T;
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, I <: Integer}
+    ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+    expected_tile = broadcast_to(Tile(expected), S)
+    desired_tile = broadcast_to(Tile(desired), S)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask,
+                               memory_order, memory_scope)
+end
+
+# 1D with tile expected/desired
+@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I},
+                            expected::Tile{T}, desired::Tile{T};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, I <: Integer}
+    ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask,
+                               memory_order, memory_scope)
+end
+
+# 2D with scalar expected/desired
+@inline function atomic_cas(array::TileArray{T, 2},
+                            indices::Tuple{Tile{I0}, Tile{I1}},
+                            expected::T, desired::T;
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
+    ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+    expected_tile = broadcast_to(Tile(expected), S)
+    desired_tile = broadcast_to(Tile(desired), S)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask,
+                               memory_order, memory_scope)
+end
+
+# 2D with tile expected/desired
+@inline function atomic_cas(array::TileArray{T, 2},
+                            indices::Tuple{Tile{I0}, Tile{I1}},
+                            expected::Tile{T}, desired::Tile{T};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
+    ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+    expected_bc = broadcast_to(expected, S)
+    desired_bc = broadcast_to(desired, S)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask,
+                               memory_order, memory_scope)
+end
diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl
index 8da55a9..57a2f31 100644
--- a/test/codegen/operations.jl
+++ b/test/codegen/operations.jl
@@ -1418,6 +1418,64 @@
                 end
             end
         end
+
+        @testset "tile-indexed atomic_cas_tko" begin
+            spec = ct.ArraySpec{1}(16, true)
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr
+                    @check "iota"
+                    indices = ct.arange((16,), Int)
+                    @check "offset"
+                    @check "atomic_cas_tko"
+                    ct.atomic_cas(arr, indices, Int32(0), Int32(1))
+                    return
+                end
+            end
+        end
+
+        @testset "tile-indexed atomic_rmw_tko" begin
+            spec = ct.ArraySpec{1}(16, true)
+            # xchg
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr
+                    @check "iota"
+                    indices = ct.arange((16,), Int)
+                    @check "offset"
+                    @check "atomic_rmw_tko"
+                    ct.atomic_xchg(arr, indices, Int32(42))
+                    return
+                end
+            end
+
+            # add (integer)
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr
+                    @check "iota"
+                    indices = ct.arange((16,), Int)
+                    @check "offset"
+                    @check "atomic_rmw_tko"
+                    ct.atomic_add(arr, indices, Int32(1))
+                    return
+                end
+            end
+
+            # add (float)
+            spec_f32 = ct.ArraySpec{1}(16, true)
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do arr
+                    @check "iota"
+                    indices = ct.arange((16,), Int)
+                    @check "offset"
+                    @check "atomic_rmw_tko"
+                    ct.atomic_add(arr, indices, 1.5f0)
+                    return
+                end
+            end
+        end
     end
 
     #=========================================================================
diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl
index 81ffe19..60c93d4 100644
--- a/test/execution/atomics.jl
+++ b/test/execution/atomics.jl
@@ -166,6 +166,154 @@ end
     @test result == n_blocks
 end
 
+# ============================================================================
+# Tile-indexed atomic operations (scatter-gather style indexing)
+# ============================================================================
+
+@testset "atomic_add tile-indexed 1D" begin
+    function atomic_add_tile_kernel(arr::ct.TileArray{Int,1}, TILE::Int)
+        bid = ct.bid(1)
+        base = (bid - 1) * TILE
+        indices = base .+ ct.arange((TILE,), Int)
+        ct.atomic_add(arr, indices, 1;
+                     memory_order=ct.MemoryOrder.AcqRel)
+        return
+    end
+
+    tile_size = 16
+    n = 256
+    n_blocks = div(n, tile_size)
+    arr = CUDA.zeros(Int, n)
+
+    ct.launch(atomic_add_tile_kernel, n_blocks, arr, ct.Constant(tile_size))
+
+    @test all(Array(arr) .== 1)
+end
+
+@testset "atomic_add tile-indexed returns old values" begin
+    function atomic_add_return_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1})
+        indices = ct.arange((16,), Int)
+        old_vals = ct.atomic_add(arr, indices, 1;
+                                memory_order=ct.MemoryOrder.AcqRel)
+        ct.scatter(out, indices, old_vals)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 16)
+    out = CUDA.fill(Int(-1), 16)
+
+    ct.launch(atomic_add_return_kernel, 1, arr, out)
+
+    @test all(Array(out) .== 0)
+    @test all(Array(arr) .== 1)
+end
+
+@testset "atomic_add tile-indexed Float32" begin
+    function atomic_add_f32_tile_kernel(arr::ct.TileArray{Float32,1}, TILE::Int)
+        bid = ct.bid(1)
+        base = (bid - 1) * TILE
+        indices = base .+ ct.arange((TILE,), Int)
+        ct.atomic_add(arr, indices, 1.5f0;
+                     memory_order=ct.MemoryOrder.AcqRel)
+        return
+    end
+
+    tile_size = 16
+    n = 256
+    n_blocks = div(n, tile_size)
+    arr = CUDA.zeros(Float32, n)
+
+    ct.launch(atomic_add_f32_tile_kernel, n_blocks, arr, ct.Constant(tile_size))
+
+    @test all(isapprox.(Array(arr), 1.5f0))
+end
+
+@testset "atomic_add tile-indexed with tile values" begin
+    function atomic_add_tile_val_kernel(arr::ct.TileArray{Int,1},
+                                        vals::ct.TileArray{Int,1})
+        indices = ct.arange((16,), Int)
+        val_tile = ct.gather(vals, indices)
+        ct.atomic_add(arr, indices, val_tile;
+                     memory_order=ct.MemoryOrder.AcqRel)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 16)
+    vals = CuArray(collect(Int, 1:16))
+
+    ct.launch(atomic_add_tile_val_kernel, 1, arr, vals)
+
+    @test Array(arr) == collect(1:16)
+end
+
+@testset "atomic_xchg tile-indexed" begin
+    function atomic_xchg_tile_kernel(arr::ct.TileArray{Int,1})
+        indices = ct.arange((16,), Int)
+        ct.atomic_xchg(arr, indices, 42;
+                      memory_order=ct.MemoryOrder.AcqRel)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 16)
+
+    ct.launch(atomic_xchg_tile_kernel, 1, arr)
+
+    @test all(Array(arr) .== 42)
+end
+
+@testset "atomic_cas tile-indexed success" begin
+    function atomic_cas_tile_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1})
+        indices = ct.arange((16,), Int)
+        old_vals = ct.atomic_cas(arr, indices, 0, 1;
+                                memory_order=ct.MemoryOrder.AcqRel)
+        ct.scatter(out, indices, old_vals)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 16)
+    out = CUDA.fill(Int(-1), 16)
+
+    ct.launch(atomic_cas_tile_kernel, 1, arr, out)
+
+    @test all(Array(out) .== 0)
+    @test all(Array(arr) .== 1)
+end
+
+@testset "atomic_cas tile-indexed failure" begin
+    function atomic_cas_fail_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1})
+        indices = ct.arange((16,), Int)
+        old_vals = ct.atomic_cas(arr, indices, 0, 2;
+                                memory_order=ct.MemoryOrder.AcqRel)
+        ct.scatter(out, indices, old_vals)
+        return
+    end
+
+    arr = CUDA.fill(Int(1), 16)
+    out = CUDA.fill(Int(-1), 16)
+
+    ct.launch(atomic_cas_fail_kernel, 1, arr, out)
+
+    @test all(Array(out) .== 1)   # old values returned
+    @test all(Array(arr) .== 1)   # unchanged (CAS failed)
+end
+
+@testset "atomic_add tile-indexed out-of-bounds" begin
+    function atomic_add_oob_kernel(arr::ct.TileArray{Int,1})
+        # Index tile is larger than array — OOB elements should be masked
+        indices = ct.arange((16,), Int)
+        ct.atomic_add(arr, indices, 1;
+                     memory_order=ct.MemoryOrder.AcqRel)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 8)
+
+    ct.launch(atomic_add_oob_kernel, 1, arr)
+
+    # Only first 8 elements should be updated
+    @test all(Array(arr) .== 1)
+end
+
 @testset "1D gather - simple" begin
     # Simple 1D gather: copy first 16 elements using gather
     function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1})

From 732bf63f6b1b1691286177c3cf7aef125211ef9f Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Mon, 23 Feb 2026 11:53:02 +0100
Subject: [PATCH 2/5] Generalize tile-indexed atomics to N dimensions

---
 src/language/atomics.jl    | 147 ++++++++++++++++---------------------
 test/codegen/operations.jl |  17 +++++
 test/execution/atomics.jl  |  18 +++++
 3 files changed, 100 insertions(+), 82 deletions(-)

diff --git a/src/language/atomics.jl b/src/language/atomics.jl
index c3ee839..893c0be 100644
--- a/src/language/atomics.jl
+++ b/src/language/atomics.jl
@@ -82,54 +82,48 @@ old_val = ct.atomic_add(counters, idx, Int32(1))
 end
 
 # ============================================================================
-# Tile-indexed atomic operations (scatter-gather style indexing)
+# Tile-indexed atomic operations
 # These accept Tile indices to perform atomic operations on multiple elements.
 # ============================================================================
 
-# --- Pointer/mask helpers (same pattern as gather/scatter in operations.jl) ---
+# --- Pointer/mask helper (N-dimensional) ---
 
-@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{I}) where {T, I <: Integer}
-    indices_0 = indices .- one(I)
-    indices_i32 = convert(Tile{Int32}, indices_0)
-    ptr_tile = Intrinsics.offset(array.ptr, indices_i32)
-    zero_0d = Tile(Int32(0))
-    size_0d = Tile(size(array, 1))
-    mask = (indices_i32 .>= zero_0d) .& (indices_i32 .< size_0d)
-    (ptr_tile, mask, size(indices))
-end
-
-@inline function _atomic_ptrs_mask(array::TileArray{T, 2},
-                                    indices::Tuple{Tile{I0}, Tile{I1}}) where {T, I0 <: Integer, I1 <: Integer}
-    idx0_0 = indices[1] .- one(I0)
-    idx1_0 = indices[2] .- one(I1)
+@inline function _atomic_ptrs_mask(array::TileArray{T, N},
+                                    indices::NTuple{N, Tile{<:Integer}}) where {T, N}
+    # Convert each index to 0-indexed
+    indices_0 = ntuple(Val(N)) do d
+        indices[d] .- one(eltype(indices[d]))
+    end
 
-    S = broadcast_shape(size(indices[1]), size(indices[2]))
-    idx0_bc = broadcast_to(idx0_0, S)
-    idx1_bc = broadcast_to(idx1_0, S)
+    # Broadcast all index tiles to a common shape
+    S = reduce(broadcast_shape, ntuple(d -> size(indices[d]), Val(N)))
 
-    idx0_i32 = convert(Tile{Int32}, idx0_bc)
-    idx1_i32 = convert(Tile{Int32}, idx1_bc)
+    # Broadcast and convert to Int32
+    indices_i32 = ntuple(Val(N)) do d
+        convert(Tile{Int32}, broadcast_to(indices_0[d], S))
+    end
 
-    stride0_0d = Tile(array.strides[1])
-    stride1_0d = Tile(array.strides[2])
-    stride0 = broadcast_to(stride0_0d, S)
-    stride1 = broadcast_to(stride1_0d, S)
+    # Linear index: sum(idx[d] * stride[d])
+    linear_idx = reduce(.+, ntuple(Val(N)) do d
+        indices_i32[d] .* broadcast_to(Tile(array.strides[d]), S)
+    end)
 
-    linear_idx = idx0_i32 .* stride0 + idx1_i32 .* stride1
     ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
 
-    zero_0d = Tile(Int32(0))
-    zero_bc = broadcast_to(zero_0d, S)
-    size0_bc = broadcast_to(Tile(size(array, 1)), S)
-    size1_bc = broadcast_to(Tile(size(array, 2)), S)
-
-    mask0 = (idx0_i32 .>= zero_bc) .& (idx0_i32 .< size0_bc)
-    mask1 = (idx1_i32 .>= zero_bc) .& (idx1_i32 .< size1_bc)
-    mask = mask0 .& mask1
+    # Bounds mask: 0 <= idx[d] < size[d] for all d
+    zero_bc = broadcast_to(Tile(Int32(0)), S)
+    mask = reduce(.&, ntuple(Val(N)) do d
+        (indices_i32[d] .>= zero_bc) .& (indices_i32[d] .< broadcast_to(Tile(size(array, d)), S))
+    end)
 
     (ptr_tile, mask, S)
 end
 
+# 1D convenience: single Tile -> 1-tuple
+@inline function _atomic_ptrs_mask(array::TileArray{T, 1}, indices::Tile{<:Integer}) where {T}
+    _atomic_ptrs_mask(array, (indices,))
+end
+
 # --- RMW operations (atomic_add, atomic_xchg) ---
 
 const _ATOMIC_RMW_OPS = (
@@ -140,51 +134,48 @@ const _ATOMIC_RMW_OPS = (
 for (op, intrinsic) in _ATOMIC_RMW_OPS
     fname = Symbol(:atomic_, op)
 
-    # 1D with scalar value
-    @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::T;
+    # N-D with scalar value
+    @eval @inline function $fname(array::TileArray{T, N},
+                                   indices::NTuple{N, Tile{<:Integer}}, val::T;
                                    memory_order::Int=MemoryOrder.AcqRel,
-                                   memory_scope::Int=MemScope.Device) where {T, I <: Integer}
+                                   memory_scope::Int=MemScope.Device) where {T, N}
         ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
         val_tile = broadcast_to(Tile(val), S)
         Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope)
     end
 
-    # 1D with tile value
-    @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{I}, val::Tile{T};
+    # N-D with tile value
+    @eval @inline function $fname(array::TileArray{T, N},
+                                   indices::NTuple{N, Tile{<:Integer}}, val::Tile{T};
                                    memory_order::Int=MemoryOrder.AcqRel,
-                                   memory_scope::Int=MemScope.Device) where {T, I <: Integer}
-        ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices)
-        Intrinsics.$intrinsic(ptr_tile, val, mask, memory_order, memory_scope)
+                                   memory_scope::Int=MemScope.Device) where {T, N}
+        ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+        val_bc = broadcast_to(val, S)
+        Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope)
     end
 
-    # 2D with scalar value
-    @eval @inline function $fname(array::TileArray{T, 2},
-                                   indices::Tuple{Tile{I0}, Tile{I1}}, val::T;
+    # 1D convenience: single Tile index
+    @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{<:Integer}, val::T;
                                    memory_order::Int=MemoryOrder.AcqRel,
-                                   memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
-        ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
-        val_tile = broadcast_to(Tile(val), S)
-        Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope)
+                                   memory_scope::Int=MemScope.Device) where {T}
+        $fname(array, (indices,), val; memory_order, memory_scope)
     end
 
-    # 2D with tile value
-    @eval @inline function $fname(array::TileArray{T, 2},
-                                   indices::Tuple{Tile{I0}, Tile{I1}}, val::Tile{T};
+    @eval @inline function $fname(array::TileArray{T, 1}, indices::Tile{<:Integer}, val::Tile{T};
                                    memory_order::Int=MemoryOrder.AcqRel,
-                                   memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
-        ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
-        val_bc = broadcast_to(val, S)
-        Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope)
+                                   memory_scope::Int=MemScope.Device) where {T}
+        $fname(array, (indices,), val; memory_order, memory_scope)
     end
 end
 
 # --- CAS operations (separate due to different signature) ---
 
-# 1D with scalar expected/desired
-@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I},
+# N-D with scalar expected/desired
+@inline function atomic_cas(array::TileArray{T, N},
+                            indices::NTuple{N, Tile{<:Integer}},
                             expected::T, desired::T;
                             memory_order::Int=MemoryOrder.AcqRel,
-                            memory_scope::Int=MemScope.Device) where {T, I <: Integer}
+                            memory_scope::Int=MemScope.Device) where {T, N}
     ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
     expected_tile = broadcast_to(Tile(expected), S)
     desired_tile = broadcast_to(Tile(desired), S)
@@ -192,38 +183,30 @@ end
                                memory_order, memory_scope)
 end
 
-# 1D with tile expected/desired
-@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I},
+# N-D with tile expected/desired
+@inline function atomic_cas(array::TileArray{T, N},
+                            indices::NTuple{N, Tile{<:Integer}},
                             expected::Tile{T}, desired::Tile{T};
                             memory_order::Int=MemoryOrder.AcqRel,
-                            memory_scope::Int=MemScope.Device) where {T, I <: Integer}
-    ptr_tile, mask, _ = _atomic_ptrs_mask(array, indices)
-    Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask,
+                            memory_scope::Int=MemScope.Device) where {T, N}
+    ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
+    expected_bc = broadcast_to(expected, S)
+    desired_bc = broadcast_to(desired, S)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask,
                                memory_order, memory_scope)
 end
 
-# 2D with scalar expected/desired
-@inline function atomic_cas(array::TileArray{T, 2},
-                            indices::Tuple{Tile{I0}, Tile{I1}},
+# 1D convenience: single Tile index
+@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{<:Integer},
                             expected::T, desired::T;
                             memory_order::Int=MemoryOrder.AcqRel,
-                            memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
-    ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
-    expected_tile = broadcast_to(Tile(expected), S)
-    desired_tile = broadcast_to(Tile(desired), S)
-    Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask,
-                               memory_order, memory_scope)
+                            memory_scope::Int=MemScope.Device) where {T}
+    atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope)
 end
 
-# 2D with tile expected/desired
-@inline function atomic_cas(array::TileArray{T, 2},
-                            indices::Tuple{Tile{I0}, Tile{I1}},
+@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{<:Integer},
                             expected::Tile{T}, desired::Tile{T};
                             memory_order::Int=MemoryOrder.AcqRel,
-                            memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer}
-    ptr_tile, mask, S = _atomic_ptrs_mask(array, indices)
-    expected_bc = broadcast_to(expected, S)
-    desired_bc = broadcast_to(desired, S)
-    Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask,
-                               memory_order, memory_scope)
+                            memory_scope::Int=MemScope.Device) where {T}
+    atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope)
 end
diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl
index 57a2f31..89782c1 100644
--- a/test/codegen/operations.jl
+++ b/test/codegen/operations.jl
@@ -1434,6 +1434,23 @@
             end
         end
 
+        @testset "tile-indexed 3D atomic_add" begin  # codegen check for a rank-3 TileArray
+            spec3d = ct.ArraySpec{3}(16, true)
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,3,spec3d}}) do arr
+                    @check "iota"
+                    i = ct.arange((4,), Int)  # one length-4 index tile per array dimension
+                    j = ct.arange((4,), Int)
+                    k = ct.arange((4,), Int)
+                    @check "offset"
+                    @check "atomic_rmw_tko"
+                    ct.atomic_add(arr, (i, j, k), Int32(1))  # scalar value, 3-tuple of index tiles
+                    return
+                end
+            end
+        end
+
         @testset "tile-indexed atomic_rmw_tko" begin
             spec = ct.ArraySpec{1}(16, true)
             # xchg
diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl
index 60c93d4..e757ed8 100644
--- a/test/execution/atomics.jl
+++ b/test/execution/atomics.jl
@@ -314,6 +314,24 @@ end
     @test all(Array(arr) .== 1)
 end
 
+@testset "atomic_add tile-indexed 3D" begin
+    function atomic_add_3d_kernel(arr::ct.TileArray{Int,3})
+        # Index tiles shaped (4,1,1), (1,4,1), (1,1,4) — each holds 4 indices, will broadcast to (4,4,4) = 64 elements
+        i = ct.reshape(ct.arange((4,), Int), (4, 1, 1))
+        j = ct.reshape(ct.arange((4,), Int), (1, 4, 1))
+        k = ct.reshape(ct.arange((4,), Int), (1, 1, 4))
+        ct.atomic_add(arr, (i, j, k), 1;  # scalar 1 is the value added at every indexed element
+                     memory_order=ct.MemoryOrder.AcqRel)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 4, 4, 4)  # same 4×4×4 extent as the broadcast index tiles
+
+    ct.launch(atomic_add_3d_kernel, 1, arr)  # grid of 1: the kernel runs once
+
+    @test all(Array(arr) .== 1)
+end
+
 @testset "1D gather - simple" begin
     # Simple 1D gather: copy first 16 elements using gather
     function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1})

From 55e69b1757b28b89017f58899c72f793c035e61e Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Mon, 23 Feb 2026 13:57:18 +0100
Subject: [PATCH 3/5] Add tile-space atomic operations

---
 bench/atomic_vs_store.jl   | 232 +++++++++++++++++++++++++++++++++++++
 src/language/atomics.jl    |  76 ++++++++++++
 test/codegen/operations.jl |  15 +++
 test/execution/atomics.jl  | 148 +++++++++++++++++++++++
 4 files changed, 471 insertions(+)
 create mode 100644 bench/atomic_vs_store.jl

diff --git a/bench/atomic_vs_store.jl b/bench/atomic_vs_store.jl
new file mode 100644
index 0000000..59bcb7a
--- /dev/null
+++ b/bench/atomic_vs_store.jl
@@ -0,0 +1,232 @@
+#!/usr/bin/env julia
+#
+# Benchmark: ct.store vs ct.atomic_add (tile-space)
+#
+# 1D: Compares atomic_add vs store when there's no contention.
+# 2D: Compares flattened 1D (N*N) vs native 2D (NxN) tile-space atomic_add.
+#
+# Usage:
+#   julia --project=test bench/atomic_vs_store.jl
+
+using cuTile
+import cuTile as ct
+using CUDA
+
+# --- 1D Kernels ---
+
+function store_1d_kernel(dst::ct.TileArray{Float32,1}, TILE::Int)
+    bid = ct.bid(1)
+    tile = ct.full((TILE,), 1.0f0, Float32)
+    ct.store(dst, bid, tile)
+    return
+end
+
+function atomic_add_1d_kernel(dst::ct.TileArray{Float32,1}, TILE::Int)
+    bid = ct.bid(1)
+    tile = ct.full((TILE,), 1.0f0, Float32)
+    ct.atomic_add(dst, bid, tile)
+    return
+end
+
+# --- 2D Kernels ---
+
+# Flat: treat N*N as a 1D array, one tile per block
+function atomic_add_flat_kernel(dst::ct.TileArray{Float32,1}, TILE::Int)
+    bid = ct.bid(1)
+    tile = ct.full((TILE,), 1.0f0, Float32)
+    ct.atomic_add(dst, bid, tile)
+    return
+end
+
+# Native 2D: NxN tile-space index
+function atomic_add_2d_kernel(dst::ct.TileArray{Float32,2}, TILE_R::Int, NCOLS::Int)
+    bid = ct.bid(1)
+    # bid is linear over the 2D tile grid; convert to (row_tile, col_tile)
+    row = (bid - Int32(1)) ÷ NCOLS + Int32(1)
+    col = (bid - Int32(1)) % NCOLS + Int32(1)
+    tile = ct.full((TILE_R, TILE_R), 1.0f0, Float32)
+    ct.atomic_add(dst, (row, col), tile)
+    return
+end
+
+# --- Benchmark harness ---
+
+function bench(f, grid, args...; warmup=5, iters=100, reset=nothing, kwargs...)
+    for _ in 1:warmup
+        reset !== nothing && reset()
+        ct.launch(f, grid, args...; kwargs...)
+    end
+    CUDA.synchronize()
+
+    times = Float64[]
+    for _ in 1:iters
+        reset !== nothing && reset()
+        CUDA.synchronize()
+        t = CUDA.@elapsed begin
+            ct.launch(f, grid, args...; kwargs...)
+        end
+        push!(times, t)
+    end
+
+    sort!(times)
+    trim = max(1, iters ÷ 10)
+    trimmed = times[trim+1:end-trim]
+
+    return (
+        median = trimmed[length(trimmed) ÷ 2] * 1e6,
+        mean   = sum(trimmed) / length(trimmed) * 1e6,
+        min    = times[1] * 1e6,
+        max    = times[end] * 1e6,
+    )
+end
+
+function print_result(label, t; reference=nothing)
+    line = "  $(rpad(label, 14))$(lpad(round(t.median, digits=2), 8))μs  (min $(round(t.min, digits=2))μs)"
+    if reference !== nothing
+        ratio = t.median / reference.median
+        line *= "  $(round(ratio, digits=2))x"
+    end
+    println(line)
+end
+
+# --- 1D benchmark ---
+
+function bench_1d()
+    TILE = 128
+    println("=" ^ 60)
+    println("1D: ct.store vs ct.atomic_add (no contention)")
+    println("   Each block writes its own tile of $TILE Float32s")
+    println("=" ^ 60)
+    println()
+
+    for n_tiles in [64, 256, 1024, 4096, 16384, 65536]
+        n = n_tiles * TILE
+
+        dst_store = CUDA.zeros(Float32, n)
+        dst_atomic = CUDA.zeros(Float32, n)
+
+        # Correctness
+        ct.launch(store_1d_kernel, n_tiles, dst_store, ct.Constant(TILE))
+        ct.launch(atomic_add_1d_kernel, n_tiles, dst_atomic, ct.Constant(TILE))
+        CUDA.synchronize()
+        @assert all(Array(dst_store) .== 1.0f0)
+        @assert all(Array(dst_atomic) .== 1.0f0)
+
+        t_store = bench(store_1d_kernel, n_tiles, dst_store, ct.Constant(TILE))
+        t_atomic = bench(atomic_add_1d_kernel, n_tiles, dst_atomic, ct.Constant(TILE);
+                         reset=() -> CUDA.fill!(dst_atomic, 0))
+
+        println("$(lpad(n_tiles, 4)) tiles × $TILE = $(lpad(n, 7)) elements")
+        print_result("store", t_store)
+        print_result("atomic_add", t_atomic; reference=t_store)
+        println()
+    end
+end
+
+# --- 2D benchmark ---
+
+function bench_2d()
+    println("=" ^ 60)
+    println("2D: flat 1D (N*N) vs native 2D (NxN) atomic_add")
+    println("   Same total elements, different indexing strategies")
+    println("=" ^ 60)
+    println()
+
+    for (tile_r, grid_r) in [(8, 8), (8, 16), (16, 16), (16, 32), (32, 32), (32, 64), (32, 128), (64, 64), (64, 128)]
+        n_rows = tile_r * grid_r
+        n_cols = n_rows
+        n = n_rows * n_cols
+        n_tiles_flat = n ÷ (tile_r * tile_r)   # total tiles when flattened
+        n_col_tiles = n_cols ÷ tile_r
+
+        dst_flat = CUDA.zeros(Float32, n)
+        dst_2d = CUDA.zeros(Float32, n_rows, n_cols)
+
+        # Correctness
+        ct.launch(atomic_add_flat_kernel, n_tiles_flat, dst_flat, ct.Constant(tile_r * tile_r))
+        ct.launch(atomic_add_2d_kernel, n_tiles_flat, dst_2d,
+                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
+        CUDA.synchronize()
+        @assert all(Array(dst_flat) .== 1.0f0) "flat failed at $(n_rows)×$(n_cols)"
+        @assert all(Array(dst_2d) .== 1.0f0) "2D failed at $(n_rows)×$(n_cols)"
+
+        t_flat = bench(atomic_add_flat_kernel, n_tiles_flat, dst_flat, ct.Constant(tile_r * tile_r);
+                       reset=() -> CUDA.fill!(dst_flat, 0))
+        t_2d = bench(atomic_add_2d_kernel, n_tiles_flat, dst_2d,
+                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
+                     reset=() -> CUDA.fill!(dst_2d, 0))
+
+        println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements  ($(tile_r)×$(tile_r) tiles, $(n_tiles_flat) blocks)")
+        print_result("flat 1D", t_flat)
+        print_result("native 2D", t_2d; reference=t_flat)
+        println()
+    end
+end
+
+# --- 3D trailing singleton benchmark ---
+
+# NxN tile in a 2D array
+function atomic_add_2d_ref_kernel(dst::ct.TileArray{Float32,2}, TILE_R::Int, NCOLS::Int)
+    bid = ct.bid(1)
+    row = (bid - Int32(1)) ÷ NCOLS + Int32(1)
+    col = (bid - Int32(1)) % NCOLS + Int32(1)
+    tile = ct.full((TILE_R, TILE_R), 1.0f0, Float32)
+    ct.atomic_add(dst, (row, col), tile)
+    return
+end
+
+# NxNx1 tile in a 3D array (trailing singleton)
+function atomic_add_3d_singleton_kernel(dst::ct.TileArray{Float32,3}, TILE_R::Int, NCOLS::Int)
+    bid = ct.bid(1)
+    row = (bid - Int32(1)) ÷ NCOLS + Int32(1)
+    col = (bid - Int32(1)) % NCOLS + Int32(1)
+    tile = ct.full((TILE_R, TILE_R, 1), 1.0f0, Float32)
+    ct.atomic_add(dst, (row, col, Int32(1)), tile)
+    return
+end
+
+function bench_trailing_singleton()
+    println("=" ^ 60)
+    println("Trailing singleton: 2D (NxN) vs 3D (NxNx1) atomic_add")
+    println("   Same data, extra singleton dimension in 3D")
+    println("=" ^ 60)
+    println()
+
+    for (tile_r, grid_r) in [(8, 8), (8, 16), (16, 16), (16, 32), (32, 32), (32, 64), (32, 128), (64, 64), (64, 128)]
+        n_rows = tile_r * grid_r
+        n_cols = n_rows
+        n = n_rows * n_cols
+        n_tiles = n ÷ (tile_r * tile_r)
+        n_col_tiles = n_cols ÷ tile_r
+
+        dst_2d = CUDA.zeros(Float32, n_rows, n_cols)
+        dst_3d = CUDA.zeros(Float32, n_rows, n_cols, 1)
+
+        # Correctness
+        ct.launch(atomic_add_2d_ref_kernel, n_tiles, dst_2d,
+                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
+        ct.launch(atomic_add_3d_singleton_kernel, n_tiles, dst_3d,
+                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
+        CUDA.synchronize()
+        @assert all(Array(dst_2d) .== 1.0f0) "2D failed"
+        @assert all(Array(dst_3d) .== 1.0f0) "3D singleton failed"
+
+        t_2d = bench(atomic_add_2d_ref_kernel, n_tiles, dst_2d,
+                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
+                     reset=() -> CUDA.fill!(dst_2d, 0))
+        t_3d = bench(atomic_add_3d_singleton_kernel, n_tiles, dst_3d,
+                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
+                     reset=() -> CUDA.fill!(dst_3d, 0))
+
+        println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements  ($(tile_r)×$(tile_r) tiles, $(n_tiles) blocks)")
+        print_result("2D (NxN)", t_2d)
+        print_result("3D (NxNx1)", t_3d; reference=t_2d)
+        println()
+    end
+end
+
+# --- Run ---
+
+bench_1d()
+bench_2d()
+bench_trailing_singleton()
diff --git a/src/language/atomics.jl b/src/language/atomics.jl
index 893c0be..ba02e96 100644
--- a/src/language/atomics.jl
+++ b/src/language/atomics.jl
@@ -166,6 +166,24 @@ for (op, intrinsic) in _ATOMIC_RMW_OPS
                                    memory_scope::Int=MemScope.Device) where {T}
         $fname(array, (indices,), val; memory_order, memory_scope)
     end
+
+    # Tile-space: N-D tuple index + tile value (like store)
+    @eval @inline function $fname(array::TileArray{T, N},
+                                   index::NTuple{N, Integer}, tile::Tile{T};
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T, N}
+        reshaped = _reshape_to_rank(tile, Val(N))
+        ptr_tile, mask = _tile_space_ptrs_mask(array, index, Val(size(reshaped)))
+        Intrinsics.$intrinsic(ptr_tile, reshaped, mask, memory_order, memory_scope)
+    end
+
+    # Tile-space: 1D convenience (scalar index)
+    @eval @inline function $fname(array::TileArray{T, 1},
+                                   index::Integer, tile::Tile{T};
+                                   memory_order::Int=MemoryOrder.AcqRel,
+                                   memory_scope::Int=MemScope.Device) where {T}
+        $fname(array, (index,), tile; memory_order, memory_scope)
+    end
 end
 
 # --- CAS operations (separate due to different signature) ---
@@ -210,3 +228,61 @@ end
                             memory_scope::Int=MemScope.Device) where {T}
     atomic_cas(array, (indices,), expected, desired; memory_order, memory_scope)
 end
+
+# ============================================================================
+# Tile-space atomic operations
+# These accept tile-space integer indices (like store) to atomically operate
+# on contiguous tile-shaped blocks of an array.
+# ============================================================================
+
+# --- Pointer/mask helper for tile-space indexing ---
+
+@inline function _tile_space_ptrs_mask(array::TileArray{T, N},
+                                        index::NTuple{N, Integer},
+                                        ::Val{Shape}) where {T, N, Shape}  # Shape: static tile extents, indexed per dim
+    # Build per-dimension element index tiles (1-indexed) for the tile selected by tile-space `index`
+    # For dim d: arange [1..Shape[d]], reshaped to extent Shape[d] on axis d (1 elsewhere), plus base offset
+    idx_tiles = ntuple(Val(N)) do d
+        bcast_shape = ntuple(i -> i == d ? Shape[d] : 1, Val(N))
+        base = Int32((index[d] - 1) * Shape[d])  # elements preceding this tile along dim d; NOTE(review): Int32 range — confirm for very large arrays
+        reshape(arange((Shape[d],), Int32), bcast_shape) .+ Tile(base)
+    end
+
+    # 0-indexed linear offset: sum((idx[d] - 1) * stride[d]), combined across dims by broadcasting
+    linear_idx = reduce(.+, ntuple(Val(N)) do d
+        (idx_tiles[d] .- Tile(Int32(1))) .* Tile(array.strides[d])
+    end)
+
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)  # pointer tile derived from array.ptr and the element offsets
+
+    # Bounds mask: 1 <= idx[d] <= size(array, d) for all d; elements past an array edge get a false entry
+    mask = reduce(.&, ntuple(Val(N)) do d
+        (idx_tiles[d] .>= Tile(Int32(1))) .& (idx_tiles[d] .<= Tile(size(array, d)))
+    end)
+
+    (ptr_tile, mask)
+end
+
+# --- Tile-space CAS ---
+
+# N-D tuple index
+@inline function atomic_cas(array::TileArray{T, N},
+                            index::NTuple{N, Integer},
+                            expected::Tile{T}, desired::Tile{T};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, N}
+    expected_r = _reshape_to_rank(expected, Val(N))
+    desired_r = _reshape_to_rank(desired, Val(N))
+    ptr_tile, mask = _tile_space_ptrs_mask(array, index, Val(size(expected_r)))
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_r, desired_r, mask,
+                               memory_order, memory_scope)
+end
+
+# 1D convenience
+@inline function atomic_cas(array::TileArray{T, 1},
+                            index::Integer,
+                            expected::Tile{T}, desired::Tile{T};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T}
+    atomic_cas(array, (index,), expected, desired; memory_order, memory_scope)
+end
diff --git a/test/codegen/operations.jl b/test/codegen/operations.jl
index 89782c1..b030aa6 100644
--- a/test/codegen/operations.jl
+++ b/test/codegen/operations.jl
@@ -1493,6 +1493,21 @@
                 end
             end
         end
+
+        @testset "tile-space atomic_add" begin  # codegen check for the scalar-index + tile-value method
+            spec = ct.ArraySpec{1}(16, true)
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Int32,1,spec}, Int}) do arr, bid  # bid is a plain Int argument here
+                    @check "iota"
+                    tile = ct.full((16,), Int32(1), Int32)  # 16-element tile of ones
+                    @check "offset"
+                    @check "atomic_rmw_tko"
+                    ct.atomic_add(arr, bid, tile)  # integer index + tile value
+                    return
+                end
+            end
+        end
     end
 
     #=========================================================================
diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl
index e757ed8..b16cba2 100644
--- a/test/execution/atomics.jl
+++ b/test/execution/atomics.jl
@@ -332,6 +332,154 @@ end
     @test all(Array(arr) .== 1)
 end
 
+# Tile-space atomic operations (block-level indexing, like store)
+
+@testset "atomic_add tile-space 1D" begin
+    function atomic_add_ts_kernel(arr::ct.TileArray{Int,1}, TILE::Int)
+        bid = ct.bid(1)  # block id doubles as the tile-space index
+        tile = ct.full((TILE,), 1, Int)
+        ct.atomic_add(arr, bid, tile)  # scalar-index form of the tile-space method
+        return
+    end
+
+    TILE = 16
+    arr = CUDA.zeros(Int, 64)  # 4 tiles of 16 elements
+
+    ct.launch(atomic_add_ts_kernel, 4, arr, ct.Constant(TILE))  # one block per tile
+
+    @test all(Array(arr) .== 1)  # every element incremented exactly once
+end
+
+@testset "atomic_add tile-space returns old values" begin
+    function atomic_add_ts_old_kernel(arr::ct.TileArray{Int,1},
+                                      out::ct.TileArray{Int,1})
+        bid = ct.bid(1)
+        tile = ct.full((16,), 1, Int)
+        old = ct.atomic_add(arr, bid, tile)  # result tile is expected to hold the pre-update contents
+        ct.store(out, bid, old)  # written back so the host can inspect it
+        return
+    end
+
+    arr = CUDA.zeros(Int, 16)
+    out = CUDA.fill(Int(-1), 16)  # -1 sentinel: an unwritten slot cannot be mistaken for an old value of 0
+
+    ct.launch(atomic_add_ts_old_kernel, 1, arr, out)
+
+    @test all(Array(out) .== 0)  # old values were 0
+    @test all(Array(arr) .== 1)  # now incremented
+end
+
+@testset "atomic_xchg tile-space" begin
+    function atomic_xchg_ts_kernel(arr::ct.TileArray{Int,1})
+        bid = ct.bid(1)
+        tile = ct.full((16,), 42, Int)
+        ct.atomic_xchg(arr, bid, tile)
+        return
+    end
+
+    arr = CUDA.zeros(Int, 32)
+
+    ct.launch(atomic_xchg_ts_kernel, 2, arr)
+
+    @test all(Array(arr) .== 42)
+end
+
+@testset "atomic_cas tile-space" begin
+    function atomic_cas_ts_kernel(arr::ct.TileArray{Int32,1})
+        bid = ct.bid(1)
+        expected = ct.full((16,), Int32(0), Int32)
+        desired = ct.full((16,), Int32(1), Int32)
+        ct.atomic_cas(arr, bid, expected, desired)
+        return
+    end
+
+    arr = CUDA.zeros(Int32, 32)
+
+    ct.launch(atomic_cas_ts_kernel, 2, arr)
+
+    @test all(Array(arr) .== 1)
+end
+
+@testset "atomic_add tile-space 1D tuple index" begin
+    # Test the N-D path with a 1-tuple index (not the scalar convenience)
+    function atomic_add_ts_tuple1d_kernel(arr::ct.TileArray{Int,1})
+        bid = ct.bid(1)
+        tile = ct.full((16,), 1, Int)
+        ct.atomic_add(arr, (bid,), tile)  # tuple index, so the scalar-index method is not the one called
+        return
+    end
+
+    arr = CUDA.zeros(Int, 32)  # 2 tiles of 16 elements
+
+    ct.launch(atomic_add_ts_tuple1d_kernel, 2, arr)  # one block per tile
+
+    @test all(Array(arr) .== 1)
+end
+
+@testset "atomic_add tile-space 2D" begin
+    function atomic_add_ts_2d_kernel(arr::ct.TileArray{Int,2})
+        bid = ct.bid(1)
+        tile = ct.full((4, 4), 1, Int)
+        ct.atomic_add(arr, (bid, Int32(1)), tile)  # tile index (bid, 1); column tile is fixed to 1
+        return
+    end
+
+    arr = CUDA.zeros(Int, 4, 8)  # 4 rows, 8 cols = 2 col-tiles of width 4
+
+    ct.launch(atomic_add_ts_2d_kernel, 1, arr)  # grid of 1, so only one 4×4 tile is written
+
+    result = Array(arr)
+    @test all(result[:, 1:4] .== 1)   # first col-tile updated
+    @test all(result[:, 5:8] .== 0)   # second col-tile untouched
+end
+
+@testset "atomic_add tile-space 2D both dims" begin
+    # 2 blocks: block 1 writes tile (1,1), block 2 writes tile (1,2)
+    function atomic_add_ts_2d_both_kernel(arr::ct.TileArray{Int,2})
+        bid = ct.bid(1)
+        tile = ct.full((4, 4), 1, Int)
+        ct.atomic_add(arr, (Int32(1), bid), tile)  # block id selects the column tile; row tile is fixed to 1
+        return
+    end
+
+    arr = CUDA.zeros(Int, 4, 8)  # two 4×4 tiles side by side along dim 2
+
+    ct.launch(atomic_add_ts_2d_both_kernel, 2, arr)
+
+    @test all(Array(arr) .== 1)  # both column tiles updated
+end
+
+@testset "atomic_add tile-space 3D" begin
+    function atomic_add_ts_3d_kernel(arr::ct.TileArray{Int,3})
+        bid = ct.bid(1)
+        tile = ct.full((2, 2, 2), 1, Int)
+        ct.atomic_add(arr, (bid, Int32(1), Int32(1)), tile)  # block id selects the tile along dim 1
+        return
+    end
+
+    arr = CUDA.zeros(Int, 4, 2, 2)  # 2 tiles along dim 1
+
+    ct.launch(atomic_add_ts_3d_kernel, 2, arr)  # one block per 2×2×2 tile
+
+    @test all(Array(arr) .== 1)
+end
+
+@testset "atomic_add tile-space trailing singleton" begin
+    # Rank-2 (4, 4) tile added into a rank-3 array — tile should be auto-reshaped to (4, 4, 1)
+    function atomic_add_ts_trailing_kernel(arr::ct.TileArray{Int,3})
+        bid = ct.bid(1)
+        tile = ct.full((4, 4), 1, Int)  # 2D tile
+        ct.atomic_add(arr, (bid, Int32(1), Int32(1)), tile)  # 3D index
+        return
+    end
+
+    arr = CUDA.zeros(Int, 8, 4, 1)  # 2 tiles along dim 1
+
+    ct.launch(atomic_add_ts_trailing_kernel, 2, arr)  # one block per tile
+
+    @test all(Array(arr) .== 1)
+end
+
 @testset "1D gather - simple" begin
     # Simple 1D gather: copy first 16 elements using gather
     function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1})

From 3f249c1da2e1aa53b1f60bbefa42798ef5e9ed14 Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Mon, 23 Feb 2026 14:07:24 +0100
Subject: [PATCH 4/5] Remove benchmark script

---
 bench/atomic_vs_store.jl | 232 ---------------------------------------
 1 file changed, 232 deletions(-)
 delete mode 100644 bench/atomic_vs_store.jl

diff --git a/bench/atomic_vs_store.jl b/bench/atomic_vs_store.jl
deleted file mode 100644
index 59bcb7a..0000000
--- a/bench/atomic_vs_store.jl
+++ /dev/null
@@ -1,232 +0,0 @@
-#!/usr/bin/env julia
-#
-# Benchmark: ct.store vs ct.atomic_add (tile-space)
-#
-# 1D: Compares atomic_add vs store when there's no contention.
-# 2D: Compares flattened 1D (N*N) vs native 2D (NxN) tile-space atomic_add.
-#
-# Usage:
-#   julia --project=test bench/atomic_vs_store.jl
-
-using cuTile
-import cuTile as ct
-using CUDA
-
-# --- 1D Kernels ---
-
-function store_1d_kernel(dst::ct.TileArray{Float32,1}, TILE::Int)
-    bid = ct.bid(1)
-    tile = ct.full((TILE,), 1.0f0, Float32)
-    ct.store(dst, bid, tile)
-    return
-end
-
-function atomic_add_1d_kernel(dst::ct.TileArray{Float32,1}, TILE::Int)
-    bid = ct.bid(1)
-    tile = ct.full((TILE,), 1.0f0, Float32)
-    ct.atomic_add(dst, bid, tile)
-    return
-end
-
-# --- 2D Kernels ---
-
-# Flat: treat N*N as a 1D array, one tile per block
-function atomic_add_flat_kernel(dst::ct.TileArray{Float32,1}, TILE::Int)
-    bid = ct.bid(1)
-    tile = ct.full((TILE,), 1.0f0, Float32)
-    ct.atomic_add(dst, bid, tile)
-    return
-end
-
-# Native 2D: NxN tile-space index
-function atomic_add_2d_kernel(dst::ct.TileArray{Float32,2}, TILE_R::Int, NCOLS::Int)
-    bid = ct.bid(1)
-    # bid is linear over the 2D tile grid; convert to (row_tile, col_tile)
-    row = (bid - Int32(1)) ÷ NCOLS + Int32(1)
-    col = (bid - Int32(1)) % NCOLS + Int32(1)
-    tile = ct.full((TILE_R, TILE_R), 1.0f0, Float32)
-    ct.atomic_add(dst, (row, col), tile)
-    return
-end
-
-# --- Benchmark harness ---
-
-function bench(f, grid, args...; warmup=5, iters=100, reset=nothing, kwargs...)
-    for _ in 1:warmup
-        reset !== nothing && reset()
-        ct.launch(f, grid, args...; kwargs...)
-    end
-    CUDA.synchronize()
-
-    times = Float64[]
-    for _ in 1:iters
-        reset !== nothing && reset()
-        CUDA.synchronize()
-        t = CUDA.@elapsed begin
-            ct.launch(f, grid, args...; kwargs...)
-        end
-        push!(times, t)
-    end
-
-    sort!(times)
-    trim = max(1, iters ÷ 10)
-    trimmed = times[trim+1:end-trim]
-
-    return (
-        median = trimmed[length(trimmed) ÷ 2] * 1e6,
-        mean   = sum(trimmed) / length(trimmed) * 1e6,
-        min    = times[1] * 1e6,
-        max    = times[end] * 1e6,
-    )
-end
-
-function print_result(label, t; reference=nothing)
-    line = "  $(rpad(label, 14))$(lpad(round(t.median, digits=2), 8))μs  (min $(round(t.min, digits=2))μs)"
-    if reference !== nothing
-        ratio = t.median / reference.median
-        line *= "  $(round(ratio, digits=2))x"
-    end
-    println(line)
-end
-
-# --- 1D benchmark ---
-
-function bench_1d()
-    TILE = 128
-    println("=" ^ 60)
-    println("1D: ct.store vs ct.atomic_add (no contention)")
-    println("   Each block writes its own tile of $TILE Float32s")
-    println("=" ^ 60)
-    println()
-
-    for n_tiles in [64, 256, 1024, 4096, 16384, 65536]
-        n = n_tiles * TILE
-
-        dst_store = CUDA.zeros(Float32, n)
-        dst_atomic = CUDA.zeros(Float32, n)
-
-        # Correctness
-        ct.launch(store_1d_kernel, n_tiles, dst_store, ct.Constant(TILE))
-        ct.launch(atomic_add_1d_kernel, n_tiles, dst_atomic, ct.Constant(TILE))
-        CUDA.synchronize()
-        @assert all(Array(dst_store) .== 1.0f0)
-        @assert all(Array(dst_atomic) .== 1.0f0)
-
-        t_store = bench(store_1d_kernel, n_tiles, dst_store, ct.Constant(TILE))
-        t_atomic = bench(atomic_add_1d_kernel, n_tiles, dst_atomic, ct.Constant(TILE);
-                         reset=() -> CUDA.fill!(dst_atomic, 0))
-
-        println("$(lpad(n_tiles, 4)) tiles × $TILE = $(lpad(n, 7)) elements")
-        print_result("store", t_store)
-        print_result("atomic_add", t_atomic; reference=t_store)
-        println()
-    end
-end
-
-# --- 2D benchmark ---
-
-function bench_2d()
-    println("=" ^ 60)
-    println("2D: flat 1D (N*N) vs native 2D (NxN) atomic_add")
-    println("   Same total elements, different indexing strategies")
-    println("=" ^ 60)
-    println()
-
-    for (tile_r, grid_r) in [(8, 8), (8, 16), (16, 16), (16, 32), (32, 32), (32, 64), (32, 128), (64, 64), (64, 128)]
-        n_rows = tile_r * grid_r
-        n_cols = n_rows
-        n = n_rows * n_cols
-        n_tiles_flat = n ÷ (tile_r * tile_r)   # total tiles when flattened
-        n_col_tiles = n_cols ÷ tile_r
-
-        dst_flat = CUDA.zeros(Float32, n)
-        dst_2d = CUDA.zeros(Float32, n_rows, n_cols)
-
-        # Correctness
-        ct.launch(atomic_add_flat_kernel, n_tiles_flat, dst_flat, ct.Constant(tile_r * tile_r))
-        ct.launch(atomic_add_2d_kernel, n_tiles_flat, dst_2d,
-                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
-        CUDA.synchronize()
-        @assert all(Array(dst_flat) .== 1.0f0) "flat failed at $(n_rows)×$(n_cols)"
-        @assert all(Array(dst_2d) .== 1.0f0) "2D failed at $(n_rows)×$(n_cols)"
-
-        t_flat = bench(atomic_add_flat_kernel, n_tiles_flat, dst_flat, ct.Constant(tile_r * tile_r);
-                       reset=() -> CUDA.fill!(dst_flat, 0))
-        t_2d = bench(atomic_add_2d_kernel, n_tiles_flat, dst_2d,
-                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
-                     reset=() -> CUDA.fill!(dst_2d, 0))
-
-        println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements  ($(tile_r)×$(tile_r) tiles, $(n_tiles_flat) blocks)")
-        print_result("flat 1D", t_flat)
-        print_result("native 2D", t_2d; reference=t_flat)
-        println()
-    end
-end
-
-# --- 3D trailing singleton benchmark ---
-
-# NxN tile in a 2D array
-function atomic_add_2d_ref_kernel(dst::ct.TileArray{Float32,2}, TILE_R::Int, NCOLS::Int)
-    bid = ct.bid(1)
-    row = (bid - Int32(1)) ÷ NCOLS + Int32(1)
-    col = (bid - Int32(1)) % NCOLS + Int32(1)
-    tile = ct.full((TILE_R, TILE_R), 1.0f0, Float32)
-    ct.atomic_add(dst, (row, col), tile)
-    return
-end
-
-# NxNx1 tile in a 3D array (trailing singleton)
-function atomic_add_3d_singleton_kernel(dst::ct.TileArray{Float32,3}, TILE_R::Int, NCOLS::Int)
-    bid = ct.bid(1)
-    row = (bid - Int32(1)) ÷ NCOLS + Int32(1)
-    col = (bid - Int32(1)) % NCOLS + Int32(1)
-    tile = ct.full((TILE_R, TILE_R, 1), 1.0f0, Float32)
-    ct.atomic_add(dst, (row, col, Int32(1)), tile)
-    return
-end
-
-function bench_trailing_singleton()
-    println("=" ^ 60)
-    println("Trailing singleton: 2D (NxN) vs 3D (NxNx1) atomic_add")
-    println("   Same data, extra singleton dimension in 3D")
-    println("=" ^ 60)
-    println()
-
-    for (tile_r, grid_r) in [(8, 8), (8, 16), (16, 16), (16, 32), (32, 32), (32, 64), (32, 128), (64, 64), (64, 128)]
-        n_rows = tile_r * grid_r
-        n_cols = n_rows
-        n = n_rows * n_cols
-        n_tiles = n ÷ (tile_r * tile_r)
-        n_col_tiles = n_cols ÷ tile_r
-
-        dst_2d = CUDA.zeros(Float32, n_rows, n_cols)
-        dst_3d = CUDA.zeros(Float32, n_rows, n_cols, 1)
-
-        # Correctness
-        ct.launch(atomic_add_2d_ref_kernel, n_tiles, dst_2d,
-                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
-        ct.launch(atomic_add_3d_singleton_kernel, n_tiles, dst_3d,
-                  ct.Constant(tile_r), ct.Constant(n_col_tiles))
-        CUDA.synchronize()
-        @assert all(Array(dst_2d) .== 1.0f0) "2D failed"
-        @assert all(Array(dst_3d) .== 1.0f0) "3D singleton failed"
-
-        t_2d = bench(atomic_add_2d_ref_kernel, n_tiles, dst_2d,
-                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
-                     reset=() -> CUDA.fill!(dst_2d, 0))
-        t_3d = bench(atomic_add_3d_singleton_kernel, n_tiles, dst_3d,
-                     ct.Constant(tile_r), ct.Constant(n_col_tiles);
-                     reset=() -> CUDA.fill!(dst_3d, 0))
-
-        println("$(n_rows)×$(n_cols) = $(lpad(n, 7)) elements  ($(tile_r)×$(tile_r) tiles, $(n_tiles) blocks)")
-        print_result("2D (NxN)", t_2d)
-        print_result("3D (NxNx1)", t_3d; reference=t_2d)
-        println()
-    end
-end
-
-# --- Run ---
-
-bench_1d()
-bench_2d()
-bench_trailing_singleton()

From df68a92ddc5445e6c5c82f470e790eb3573c798e Mon Sep 17 00:00:00 2001
From: AntonOresten <antonoresten@proton.me>
Date: Mon, 23 Feb 2026 14:14:35 +0100
Subject: [PATCH 5/5] Remove tile-space methods for all atomics except atomic_add

---
 src/language/atomics.jl   | 43 ++++++++++-----------------------------
 test/execution/atomics.jl | 31 ----------------------------
 2 files changed, 11 insertions(+), 63 deletions(-)

diff --git a/src/language/atomics.jl b/src/language/atomics.jl
index ba02e96..7abc444 100644
--- a/src/language/atomics.jl
+++ b/src/language/atomics.jl
@@ -167,23 +167,6 @@ for (op, intrinsic) in _ATOMIC_RMW_OPS
         $fname(array, (indices,), val; memory_order, memory_scope)
     end
 
-    # Tile-space: N-D tuple index + tile value (like store)
-    @eval @inline function $fname(array::TileArray{T, N},
-                                   index::NTuple{N, Integer}, tile::Tile{T};
-                                   memory_order::Int=MemoryOrder.AcqRel,
-                                   memory_scope::Int=MemScope.Device) where {T, N}
-        reshaped = _reshape_to_rank(tile, Val(N))
-        ptr_tile, mask = _tile_space_ptrs_mask(array, index, Val(size(reshaped)))
-        Intrinsics.$intrinsic(ptr_tile, reshaped, mask, memory_order, memory_scope)
-    end
-
-    # Tile-space: 1D convenience (scalar index)
-    @eval @inline function $fname(array::TileArray{T, 1},
-                                   index::Integer, tile::Tile{T};
-                                   memory_order::Int=MemoryOrder.AcqRel,
-                                   memory_scope::Int=MemScope.Device) where {T}
-        $fname(array, (index,), tile; memory_order, memory_scope)
-    end
 end
 
 # --- CAS operations (separate due to different signature) ---
@@ -263,26 +246,22 @@ end
     (ptr_tile, mask)
 end
 
-# --- Tile-space CAS ---
+# --- Tile-space atomic_add ---
 
-# N-D tuple index
-@inline function atomic_cas(array::TileArray{T, N},
-                            index::NTuple{N, Integer},
-                            expected::Tile{T}, desired::Tile{T};
+# N-D tile-space index (tuple of integers, like store) + tile value
+@inline function atomic_add(array::TileArray{T, N},
+                            index::NTuple{N, Integer}, tile::Tile{T};
                             memory_order::Int=MemoryOrder.AcqRel,
                             memory_scope::Int=MemScope.Device) where {T, N}
-    expected_r = _reshape_to_rank(expected, Val(N))
-    desired_r = _reshape_to_rank(desired, Val(N))
-    ptr_tile, mask = _tile_space_ptrs_mask(array, index, Val(size(expected_r)))
-    Intrinsics.atomic_cas_tile(ptr_tile, expected_r, desired_r, mask,
-                               memory_order, memory_scope)
+    reshaped = _reshape_to_rank(tile, Val(N))  # presumably brings the tile up to rank N — confirm in _reshape_to_rank
+    ptr_tile, mask = _tile_space_ptrs_mask(array, index, Val(size(reshaped)))  # pointer tile + bounds mask for this tile
+    Intrinsics.atomic_add_tile(ptr_tile, reshaped, mask, memory_order, memory_scope)  # its result is the return value
 end
 
-# 1D convenience
-@inline function atomic_cas(array::TileArray{T, 1},
-                            index::Integer,
-                            expected::Tile{T}, desired::Tile{T};
+# 1D convenience (scalar index): wraps the integer into a 1-tuple
+@inline function atomic_add(array::TileArray{T, 1},
+                            index::Integer, tile::Tile{T};
                             memory_order::Int=MemoryOrder.AcqRel,
                             memory_scope::Int=MemScope.Device) where {T}
-    atomic_cas(array, (index,), expected, desired; memory_order, memory_scope)
+    atomic_add(array, (index,), tile; memory_order, memory_scope)  # forwards to the tuple-index method
 end
diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl
index b16cba2..eebbb1e 100644
--- a/test/execution/atomics.jl
+++ b/test/execution/atomics.jl
@@ -369,37 +369,6 @@ end
     @test all(Array(arr) .== 1)  # now incremented
 end
 
-@testset "atomic_xchg tile-space" begin
-    function atomic_xchg_ts_kernel(arr::ct.TileArray{Int,1})
-        bid = ct.bid(1)
-        tile = ct.full((16,), 42, Int)
-        ct.atomic_xchg(arr, bid, tile)
-        return
-    end
-
-    arr = CUDA.zeros(Int, 32)
-
-    ct.launch(atomic_xchg_ts_kernel, 2, arr)
-
-    @test all(Array(arr) .== 42)
-end
-
-@testset "atomic_cas tile-space" begin
-    function atomic_cas_ts_kernel(arr::ct.TileArray{Int32,1})
-        bid = ct.bid(1)
-        expected = ct.full((16,), Int32(0), Int32)
-        desired = ct.full((16,), Int32(1), Int32)
-        ct.atomic_cas(arr, bid, expected, desired)
-        return
-    end
-
-    arr = CUDA.zeros(Int32, 32)
-
-    ct.launch(atomic_cas_ts_kernel, 2, arr)
-
-    @test all(Array(arr) .== 1)
-end
-
 @testset "atomic_add tile-space 1D tuple index" begin
     # Test the N-D path with a 1-tuple index (not the scalar convenience)
     function atomic_add_ts_tuple1d_kernel(arr::ct.TileArray{Int,1})