diff --git a/Project.toml b/Project.toml
index 607558519..a1f20ec60 100644
--- a/Project.toml
+++ b/Project.toml
@@ -55,7 +55,7 @@ Printf = "1"
 Random = "1"
 SafeTestsets = "0.1"
 ScopedValues = "1.3.0"
-Strided = "2"
+Strided = "2.3.4"
 TensorKitSectors = "0.3.6"
 TensorOperations = "5.1"
 Test = "1"
@@ -89,3 +89,6 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["ArgParse", "Adapt", "Aqua", "AllocCheck", "Combinatorics", "CUDA", "cuTENSOR", "GPUArrays", "JET", "LinearAlgebra", "SafeTestsets", "TensorOperations", "Test", "TestExtras", "ChainRulesCore", "ChainRulesTestUtils", "FiniteDifferences", "Zygote", "Mooncake"]
+
+[sources]
+Strided = {url = "https://github.com/QuantumKitHub/Strided.jl", rev = "ksh/copyto"}
diff --git a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
index f5efb98bb..4ee4865f1 100644
--- a/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
+++ b/ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -10,7 +10,7 @@ using TensorKit.Factorizations
 using TensorKit.Strided
 using TensorKit.Factorizations: AbstractAlgorithm
 using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check
-import TensorKit: randisometry, rand, randn
+import TensorKit: randisometry, rand, randn, _copyto!, _add_general_kernel_nonthreaded!, blocktype
 
 using TensorKit: MatrixAlgebraKit
 
diff --git a/ext/TensorKitCUDAExt/cutensormap.jl b/ext/TensorKitCUDAExt/cutensormap.jl
index f065c2ec1..a276cd1a8 100644
--- a/ext/TensorKitCUDAExt/cutensormap.jl
+++ b/ext/TensorKitCUDAExt/cutensormap.jl
@@ -6,6 +6,9 @@ const AdjointCuTensorMap{T, S, N₁, N₂} = AdjointTensorMap{T, S, N₁, N₂,
 function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂, A}
     return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t))
 end
+function TensorMap{T, S, N₁, N₂, DA}(t::TensorMap{T, S, N₁, N₂, HA}) where {T, S, N₁, N₂, DA <: CuArray{T}, HA <: Array{T}}
+    return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t))
+end
 
 # project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
 function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}}
@@ -101,18 +104,6 @@ function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
     return isempty(inds) ? zero(scalartype(t)) : @allowscalar @inbounds t.data[only(inds)]
 end
 
-function Base.convert(
-        TT::Type{CuTensorMap{T, S, N₁, N₂}},
-        t::AbstractTensorMap{<:Any, S, N₁, N₂}
-    ) where {T, S, N₁, N₂}
-    if typeof(t) === TT
-        return t
-    else
-        tnew = TT(undef, space(t))
-        return copy!(tnew, t)
-    end
-end
-
 function LinearAlgebra.isposdef(t::CuTensorMap)
     domain(t) == codomain(t) ||
         throw(SpaceMismatch("`isposdef` requires domain and codomain to be the same"))
@@ -138,10 +129,9 @@ function Base.promote_rule(
     return CuTensorMap{T, S, N₁, N₂}
 end
 
-TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
+TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
     CuArray{T, N, CUDA.default_memory}
 
-
 # CuTensorMap exponentation:
 function TensorKit.exp!(t::CuTensorMap)
     domain(t) == codomain(t) ||
@@ -168,3 +158,21 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
         return tf
     end
 end
+
+function TensorKit.add_kernel_nonthreaded!(
+        tdst::CuTensorMap, tsrc::CuTensorMap, p, transformer::TensorKit.GenericTreeTransformer, α, β, backend...
+    )
+    # preallocate buffers
+    buffers = TensorKit.allocate_buffers(tdst, tsrc, transformer)
+
+    for subtransformer in transformer.data
+        # Special case without intermediate buffers whenever there is only a single block
+        if length(subtransformer[1]) == 1
+            TensorKit._add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
+        else
+            cu_subtransformer = tuple(CUDA.adapt(CuArray, subtransformer[1]), subtransformer[2:end]...)
+            TensorKit._add_transform_multi!(tdst, tsrc, p, cu_subtransformer, buffers, α, β, backend...)
+        end
+    end
+    return nothing
+end
diff --git a/src/tensors/abstracttensor.jl b/src/tensors/abstracttensor.jl
index d7d520b43..8f9d0d8a1 100644
--- a/src/tensors/abstracttensor.jl
+++ b/src/tensors/abstracttensor.jl
@@ -53,9 +53,11 @@ storagetype(t) = storagetype(typeof(t))
 function storagetype(::Type{T}) where {T <: AbstractTensorMap}
     if T isa Union
         # attempt to be slightly more specific by promoting unions
-        Ma = storagetype(T.a)
-        Mb = storagetype(T.b)
-        return promote_storagetype(Ma, Mb)
+        return promote_storagetype(T.a, T.b)
+    elseif eltype(T) isa Union
+        # attempt to be slightly more specific by promoting unions
+        TU = eltype(T)
+        return promote_storagetype(TU.a, TU.b)
     else
         # fallback definition by using scalartype
         return similarstoragetype(scalartype(T))
@@ -103,11 +105,19 @@ similarstoragetype(X::Type, ::Type{T}) where {T <: Number} =
 
 # implement on tensors
 similarstoragetype(::Type{TT}) where {TT <: AbstractTensorMap} = similarstoragetype(storagetype(TT))
-similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number} =
-    similarstoragetype(storagetype(TT), T)
+function similarstoragetype(::Type{TT}, ::Type{T}) where {TT <: AbstractTensorMap, T <: Number}
+    return similarstoragetype(storagetype(TT), T)
+end
+function similarstoragetype(::Type{<:AbstractTensorMap{T, S, N₁, N₂}}, ::Type{TA}) where {T <: Number, TA <: DenseVector, S, N₁, N₂}
+    return similarstoragetype(TA, T)
+end
+function similarstoragetype(t::AbstractTensorMap{T, S, N₁, N₂}, ::Type{TA}) where {T <: Number, TA <: DenseVector, S, N₁, N₂}
+    return similarstoragetype(typeof(t), TA)
+end
 
 # implement on arrays
 similarstoragetype(::Type{A}) where {A <: DenseVector{<:Number}} = A
+similarstoragetype(::Type{A}, ::Type{A}) where {A <: DenseVector{<:Number}} = A
 Base.@assume_effects :foldable similarstoragetype(::Type{A}) where {A <: AbstractArray{<:Number}} =
     Core.Compiler.return_type(similar, Tuple{A, Int})
 Base.@assume_effects :foldable similarstoragetype(::Type{A}, ::Type{T}) where {A <: AbstractArray, T <: Number} =
diff --git a/src/tensors/adjoint.jl b/src/tensors/adjoint.jl
index dfc1a4471..382f309b5 100644
--- a/src/tensors/adjoint.jl
+++ b/src/tensors/adjoint.jl
@@ -22,6 +22,8 @@ Base.adjoint(t::AbstractTensorMap) = AdjointTensorMap(t)
 space(t::AdjointTensorMap) = adjoint(space(parent(t)))
 dim(t::AdjointTensorMap) = dim(parent(t))
 storagetype(::Type{AdjointTensorMap{T, S, N₁, N₂, TT}}) where {T, S, N₁, N₂, TT} = storagetype(TT)
+similarstoragetype(::AdjointTensorMap{T, S, N₁, N₂, TT}, ::Type{T′}) where {T, S, N₁, N₂, TT, T′ <: Number} = similarstoragetype(TT, T′)
+similarstoragetype(::AdjointTensorMap{T, S, N₁, N₂, TT}, ::Type{TA}) where {T, S, N₁, N₂, TT, TA <: DenseVector} = similarstoragetype(TT, TA)
 
 # Blocks and subblocks
 #----------------------
diff --git a/src/tensors/braidingtensor.jl b/src/tensors/braidingtensor.jl
index 3ff8a9abf..8d45c7dc6 100644
--- a/src/tensors/braidingtensor.jl
+++ b/src/tensors/braidingtensor.jl
@@ -145,12 +145,10 @@ function block(b::BraidingTensor, s::Sector)
     # TODO: probably always square?
     m = blockdim(codomain(b), s)
     n = blockdim(domain(b), s)
-    data = Matrix{eltype(b)}(undef, (m, n))
+    data = zeros(eltype(b), (m, n))
 
     length(data) == 0 && return data # s ∉ blocksectors(b)
 
-    data = fill!(data, zero(eltype(b)))
-
     V1, V2 = codomain(b)
     if sectortype(b) === Trivial
         d1, d2 = dim(V1), dim(V2)
@@ -182,12 +180,15 @@ end
 has_shared_permute(t::BraidingTensor, ::Index2Tuple) = false
 function add_transform!(
         tdst::AbstractTensorMap,
-        tsrc::BraidingTensor, (p₁, p₂)::Index2Tuple,
+        tsrc::BraidingTensor{T, S},
+        (p₁, p₂)::Index2Tuple,
         fusiontreetransform,
         α::Number, β::Number, backend::AbstractBackend...
-    )
+    ) where {T, S}
+    tsrc_map = similar(tdst, storagetype(tdst), space(tsrc))
+    copy!(tsrc_map, tsrc)
     return add_transform!(
-        tdst, TensorMap(tsrc), (p₁, p₂), fusiontreetransform, α, β,
+        tdst, tsrc_map, (p₁, p₂), fusiontreetransform, α, β,
         backend...
     )
 end
@@ -287,11 +288,15 @@ function planarcontract!(
         backend, allocator
     )
     # special case only defined for contracting 2 indices
-    length(oindB) == length(cindB) == 2 ||
+    if !(length(oindB) == length(cindB) == 2)
+        # horrible!!!!!
+        tB′ = TensorMap(B)
+        tB = TensorMapWithStorage{eltype(B), similarstoragetype(A, eltype(B)), spacetype(tB′), numout(tB′), numin(tB′)}(tB′)
         return planarcontract!(
-        C, A, (oindA, cindA), TensorMap(B), (cindB, oindB), (p1, p2),
-        α, β, backend, allocator
-    )
+            C, A, (oindA, cindA), tB, (cindB, oindB), (p1, p2),
+            α, β, backend, allocator
+        )
+    end
 
     codA, domA = codomainind(A), domainind(A)
     codB, domB = codomainind(B), domainind(B)
diff --git a/src/tensors/diagonal.jl b/src/tensors/diagonal.jl
index b2ac4134b..e73ad2787 100644
--- a/src/tensors/diagonal.jl
+++ b/src/tensors/diagonal.jl
@@ -280,7 +280,7 @@ end
 # ----------------
 function TO.tensoradd_type(TC, A::DiagonalTensorMap, ::Index2Tuple{1, 1}, ::Bool)
     M = similarstoragetype(A, TC)
-    return DiagonalTensorMap{TC, spacetype(A), M}
+    return DiagonalTensorMap{scalartype(M), spacetype(A), M}
 end
 
 function TO.tensorcontract_type(
diff --git a/src/tensors/indexmanipulations.jl b/src/tensors/indexmanipulations.jl
index 3108abb17..e45789b44 100644
--- a/src/tensors/indexmanipulations.jl
+++ b/src/tensors/indexmanipulations.jl
@@ -17,6 +17,8 @@ for (operation, manipulation) in (
         $promote_op(::Type{T}, ::Type{I}) where {T <: Number, I <: Sector} =
             sectorscalartype(I) <: Integer ? T :
             sectorscalartype(I) <: Real ? float(T) : complex(T)
+        $promote_op(::Type{TA}, ::Type{I}) where {TA <: DenseVector, I <: Sector} =
+            similarstoragetype(TA, $promote_op(eltype(TA), I))
         # TODO: currently the manipulations all use sectorscalartype, change to:
         # $manipulation_scalartype(I) <: Integer ? T :
         # $manipulation_scalartype(I) <: Real ? float(T) : complex(T)
@@ -342,11 +344,11 @@ See also [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i})
 """
 function insertleftunit(
         t::AbstractTensorMap, ::Val{i} = Val(numind(t) + 1);
-        copy::Bool = false, conj::Bool = false, dual::Bool = false
+        copy::Bool = false, conj::Bool = false, dual::Bool = false,
     ) where {i}
     W = insertleftunit(space(t), Val(i); conj, dual)
     if t isa TensorMap
-        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
+        return TensorMapWithStorage{scalartype(t), storagetype(t)}(copy ? Base.copy(t.data) : t.data, W)
     else
         tdst = similar(t, W)
         for (c, b) in blocks(t)
@@ -371,11 +373,11 @@ See also [`insertleftunit`](@ref insertleftunit(::AbstractTensorMap, ::Val{i}) w
 """
 function insertrightunit(
         t::AbstractTensorMap, ::Val{i} = Val(numind(t));
-        copy::Bool = false, conj::Bool = false, dual::Bool = false
+        copy::Bool = false, conj::Bool = false, dual::Bool = false,
     ) where {i}
     W = insertrightunit(space(t), Val(i); conj, dual)
     if t isa TensorMap
-        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
+        return TensorMapWithStorage{scalartype(t), storagetype(t)}(copy ? Base.copy(t.data) : t.data, W)
     else
         tdst = similar(t, W)
         for (c, b) in blocks(t)
@@ -400,7 +402,7 @@ and [`insertrightunit`](@ref insertrightunit(::AbstractTensorMap, ::Val{i}) wher
 function removeunit(t::AbstractTensorMap, ::Val{i}; copy::Bool = false) where {i}
     W = removeunit(space(t), Val(i))
     if t isa TensorMap
-        return TensorMap{scalartype(t)}(copy ? Base.copy(t.data) : t.data, W)
+        return TensorMapWithStorage{scalartype(t), storagetype(t)}(copy ? Base.copy(t.data) : t.data, W)
     else
         tdst = similar(t, W)
         for (c, b) in blocks(t)
diff --git a/test/cuda/tensors.jl b/test/cuda/tensors.jl
index 7bdd90f9d..90884f4d8 100644
--- a/test/cuda/tensors.jl
+++ b/test/cuda/tensors.jl
@@ -236,8 +236,8 @@ for V in spacelist
                     α = rand(T)
                     @test norm(t, 2) ≈ norm(TensorKit.to_cpu(t), 2)
                     @test dot(t2, t) ≈ dot(TensorKit.to_cpu(t2), TensorKit.to_cpu(t))
-                    @test TensorKit.to_cpu(α * t) ≈ α * TensorKit.to_cpu(t)
-                    @test TensorKit.to_cpu(t + t) ≈ 2 * TensorKit.to_cpu(t)
+                    @test adapt(Vector{T}, (α * t)) ≈ α * adapt(Vector{T}, t)
+                    @test adapt(Vector{T}, (t + t)) ≈ 2 * adapt(Vector{T}, t)
                 end
             end
             @timedtestset "Real and imaginary parts" begin
@@ -290,28 +290,29 @@ for V in spacelist
         @timedtestset "Permutations: test via inner product invariance" begin
             W = V1 ⊗ V2 ⊗ V3 ⊗ V4 ⊗ V5
             t = CUDA.rand(ComplexF64, W)
+            ht = adapt(Vector{ComplexF64}, t)
             t′ = CUDA.randn!(similar(t))
+            ht′ = adapt(Vector{ComplexF64}, t′)
+            dot_htt′ = dot(ht′, ht)
+            dot_tt′ = dot(t′, t)
+            @test dot_tt′ ≈ dot_htt′
+            norm_t = norm(t)
             for k in 0:5
                 for p in permutations(1:5)
                     p1 = ntuple(n -> p[n], k)
                     p2 = ntuple(n -> p[k + n], 5 - k)
-                    CUDA.@allowscalar begin
-                        t2 = @constinferred permute(t, (p1, p2))
-                        t2 = permute(t, (p1, p2))
-                        @test norm(t2) ≈ norm(t)
-                        t2′ = permute(t′, (p1, p2))
-                        @test dot(t2′, t2) ≈ dot(t′, t) ≈ dot(transpose(t2′), transpose(t2))
-                    end
-                end
-
-                CUDA.@allowscalar begin
-                    t3 = @constinferred repartition(t, $k)
-                    t3 = repartition(t, k)
-                    @test norm(t3) ≈ norm(t)
-                    t3′ = @constinferred repartition!(similar(t3), t′)
-                    @test norm(t3′) ≈ norm(t′)
-                    @test dot(t′, t) ≈ dot(t3′, t3)
+                    t2 = @constinferred permute(t, (p1, p2))
+                    t2′ = permute(t′, (p1, p2))
+                    @test norm(t2) ≈ norm_t
+                    @test dot(t2′, t2) ≈ dot_tt′
+                    @test dot(transpose(t2′), transpose(t2)) ≈ dot_tt′
                 end
+                t3 = @constinferred repartition(t, $k)
+                t3 = repartition(t, k)
+                t3′ = @constinferred repartition!(similar(t3), t′)
+                @test norm(t3) ≈ norm(t)
+                @test norm(t3′) ≈ norm(t′)
+                @test dot(t′, t) ≈ dot(t3′, t3)
             end
         end
         if BraidingStyle(I) isa SymmetricBraiding
@@ -322,34 +323,35 @@ for V in spacelist
                     for p in permutations(1:5)
                         p1 = ntuple(n -> p[n], k)
                         p2 = ntuple(n -> p[k + n], 5 - k)
-                        dt2 = CUDA.@allowscalar permute(t, (p1, p2))
-                        ht2 = permute(TensorKit.to_cpu(t), (p1, p2))
-                        @test ht2 == TensorKit.to_cpu(dt2)
+                        ht2 = permute(adapt(Vector{ComplexF64}, t), (p1, p2))
+                        dt2 = permute(t, (p1, p2))
+                        @test ht2 ≈ adapt(Vector{ComplexF64}, dt2)
+                        ht3 = transpose(adapt(Vector{ComplexF64}, dt2))
+                        dt3 = transpose(dt2)
+                        hht3 = adapt(Vector{ComplexF64}, dt3)
+                        @test ht3 ≈ hht3
                     end
-
-                    dt3 = CUDA.@allowscalar repartition(t, k)
-                    ht3 = repartition(TensorKit.to_cpu(t), k)
-                    @test ht3 == TensorKit.to_cpu(dt3)
+                    dt4 = repartition(t, k)
+                    ht4 = repartition(adapt(Vector{ComplexF64}, t), k)
+                    @test ht4 == adapt(Vector{ComplexF64}, dt4)
                 end
             end
         end
         @timedtestset "Full trace: test self-consistency" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V2 ⊗ V1')
-            CUDA.@allowscalar begin
-                t2 = permute(t, ((1, 2), (4, 3)))
-                s = @constinferred tr(t2)
-                @test conj(s) ≈ tr(t2')
-                if !isdual(V1)
-                    t2 = twist!(t2, 1)
-                end
-                if isdual(V2)
-                    t2 = twist!(t2, 2)
-                end
-                ss = tr(t2)
-                @tensor s2 = t[a, b, b, a]
-                @tensor t3[a, b] := t[a, c, c, b]
-                @tensor s3 = t3[a, a]
+            t2 = permute(t, ((1, 2), (4, 3)))
+            s = @constinferred tr(t2)
+            @test conj(s) ≈ tr(t2')
+            if !isdual(V1)
+                t2 = twist!(t2, 1)
+            end
+            if isdual(V2)
+                t2 = twist!(t2, 2)
             end
+            ss = tr(t2)
+            @tensor s2 = t[a, b, b, a]
+            @tensor t3[a, b] := t[a, c, c, b]
+            @tensor s3 = t3[a, a]
             @test ss ≈ s2
             @test ss ≈ s3
         end
@@ -363,24 +365,20 @@ for V in spacelist
         if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Trace: test via conversion" begin
                 t = CUDA.rand(ComplexF64, V1 ⊗ V2' ⊗ V3 ⊗ V2 ⊗ V1' ⊗ V3')
-                CUDA.@allowscalar begin
-                    @tensor t2[a, b] := t[c, d, b, d, c, a]
-                    @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
-                end
+                @tensor t2[a, b] := t[c, d, b, d, c, a]
+                @tensor t3[a, b] := ad(t)[c, d, b, d, c, a]
                 @test t3 ≈ ad(t2)
             end
         end
         @timedtestset "Trace and contraction" begin
             t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3)
             t2 = CUDA.rand(ComplexF64, V2' ⊗ V4 ⊗ V1')
-            CUDA.@allowscalar begin
-                t3 = t1 ⊗ t2
-                @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
-                @tensor tb[a, b] := t3[x, y, a, y, b, x]
-            end
+            t3 = t1 ⊗ t2
+            @tensor ta[a, b] := t1[x, y, a] * t2[y, b, x]
+            @tensor tb[a, b] := t3[x, y, a, y, b, x]
             @test ta ≈ tb
         end
-        #=if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
+        if BraidingStyle(I) isa Bosonic && hasfusiontensor(I)
             @timedtestset "Tensor contraction: test via CPU" begin
                 dA1 = CUDA.randn(ComplexF64, V1' * V2', V3')
                 dA2 = CUDA.randn(ComplexF64, V3 * V4, V5)
@@ -395,45 +393,39 @@ for V in spacelist
                     TensorKit.to_cpu(dH)[s1, s2, t1, t2]
                 @test TensorKit.to_cpu(dHrA12) ≈ hHrA12
             end
-        end=# # doesn't yet work because of AdjointTensor
+        end
         @timedtestset "Index flipping: test flipping inverse" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
             for i in 1:4
-                CUDA.@allowscalar begin
-                    @test t ≈ flip(flip(t, i), i; inv = true)
-                    @test t ≈ flip(flip(t, i; inv = true), i)
-                end
+                @test t ≈ flip(flip(t, i), i; inv = true)
+                @test t ≈ flip(flip(t, i; inv = true), i)
             end
         end
-        #=@timedtestset "Index flipping: test via explicit flip" begin
+        @timedtestset "Index flipping: test via explicit flip" begin
             t = CUDA.rand(ComplexF64, V1 ⊗ V1' ← V1' ⊗ V1)
-            F1 = unitary(flip(V1), V1)
+            F1 = adapt(CuArray{ComplexF64}, unitary(flip(V1), V1))
 
-            CUDA.@allowscalar begin
-                @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
-                @test flip(t, 1) ≈ tf
-                @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
-                @test twist!(flip(t, 2), 2) ≈ tf
-                @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
-                @test flip(t, 3) ≈ tf
-                @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
-                @test twist!(flip(t, 4), 4) ≈ tf
-            end
+            @tensor tf[a, b; c, d] := F1[a, a'] * t[a', b; c, d]
+            @test flip(t, 1) ≈ tf
+            @tensor tf[a, b; c, d] := conj(F1[b, b']) * t[a, b'; c, d]
+            @test twist!(flip(t, 2), 2) ≈ tf
+            @tensor tf[a, b; c, d] := F1[c, c'] * t[a, b; c', d]
+            @test flip(t, 3) ≈ tf
+            @tensor tf[a, b; c, d] := conj(F1[d, d']) * t[a, b; c, d']
+            @test twist!(flip(t, 4), 4) ≈ tf
         end
         @timedtestset "Index flipping: test via contraction" begin
             t1 = CUDA.rand(ComplexF64, V1 ⊗ V2 ⊗ V3 ← V4)
             t2 = CUDA.rand(ComplexF64, V2' ⊗ V5 ← V4' ⊗ V1)
-            CUDA.@allowscalar begin
-                @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
-                @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
-                @test ta ≈ tb
-                @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
-                @test ta ≈ tb
-                @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
-                @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
-                @test flip(ta, (1, 2)) ≈ tb
-            end
-        end=# # TODO
+            @tensor ta[a, b] := t1[x, y, a, z] * t2[y, b, z, x]
+            @tensor tb[a, b] := flip(t1, 1)[x, y, a, z] * flip(t2, 4)[y, b, z, x]
+            @test ta ≈ tb
+            @tensor tb[a, b] := flip(t1, (2, 4))[x, y, a, z] * flip(t2, (1, 3))[y, b, z, x]
+            @test ta ≈ tb
+            @tensor tb[a, b] := flip(t1, (1, 2, 4))[x, y, a, z] * flip(t2, (1, 3, 4))[y, b, z, x]
+            @tensor tb[a, b] := flip(t1, (1, 3))[x, y, a, z] * flip(t2, (2, 4))[y, b, z, x]
+            @test flip(ta, (1, 2)) ≈ tb
+        end
         @timedtestset "Multiplication of isometries: test properties" begin
             W2 = V4 ⊗ V5
             W1 = W2 ⊗ (oneunit(V1) ⊕ oneunit(V1))
@@ -567,9 +559,7 @@ for V in spacelist
             for T in (Float32, ComplexF64)
                 t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1, V1 ⊗ V2)
                 t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3, V1 ⊗ V1)
-                CUDA.@allowscalar begin
-                    t = @constinferred (t1 ⊗ t2)
-                end
+                t = @constinferred (t1 ⊗ t2)
                 @test norm(t) ≈ norm(t1) * norm(t2)
             end
         end
@@ -582,11 +572,9 @@ for V in spacelist
                     d2 = dim(codomain(t2))
                     d3 = dim(domain(t1))
                     d4 = dim(domain(t2))
-                    CUDA.@allowscalar begin
-                        t = @constinferred (t1 ⊗ t2)
-                        At = ad(t)
-                        @test ad(t) ≈ ad(t1) ⊗ ad(t2)
-                    end
+                    t = @constinferred (t1 ⊗ t2)
+                    At = ad(t)
+                    @test ad(t) ≈ ad(t1) ⊗ ad(t2)
                 end
             end
         end
@@ -594,11 +582,9 @@ for V in spacelist
             for T in (Float32, ComplexF64)
                 t1 = CUDA.rand(T, V2 ⊗ V3 ⊗ V1)
                 t2 = CUDA.rand(T, V2 ⊗ V1 ⊗ V3)
-                CUDA.@allowscalar begin
-                    t = @constinferred (t1 ⊗ t2)
-                    @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
-                    # @test t ≈ t′ # TODO broken for symmetry: Irrep[ℤ₃]
-                end
+                t = @constinferred (t1 ⊗ t2)
+                @tensor t′[1, 2, 3, 4, 5, 6] := t1[1, 2, 3] * t2[4, 5, 6]
+                # @test t ≈ t′ # TODO broken for symmetry: Irrep[ℤ₃]
             end
         end
     end
@@ -614,16 +600,12 @@ end
         for T in (Float32, ComplexF64)
             t1 = CUDA.rand(T, V1 ⊗ V2, V3' ⊗ V4)
             t2 = CUDA.rand(T, W2, W1 ⊗ W1')
-            CUDA.@allowscalar begin
-                t = @constinferred (t1 ⊠ t2)
-            end
+            t = @constinferred (t1 ⊠ t2)
             d1 = dim(codomain(t1))
             d2 = dim(codomain(t2))
             d3 = dim(domain(t1))
             d4 = dim(domain(t2))
-            CUDA.@allowscalar begin
-                @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2)
-            end
+            @test ad(t1) ⊠ ad(t2) ≈ ad(t1 ⊠ t2)
         end
     end
 end