From b657bb4fdfe8b02c387ab4aed5819974d0d179e4 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 26 Mar 2026 08:48:37 -0400
Subject: [PATCH 1/7] More tests for CUDA + MPS

---
 ext/MPSKitAdaptExt.jl  |   2 +
 test/cuda/operators.jl | 157 ++++++++++++++++++++++++++++++++++++++++-
 test/cuda/states.jl    |  40 +++++++++++
 3 files changed, 198 insertions(+), 1 deletion(-)

diff --git a/ext/MPSKitAdaptExt.jl b/ext/MPSKitAdaptExt.jl
index 6f0b0c7f2..78f4b41af 100644
--- a/ext/MPSKitAdaptExt.jl
+++ b/ext/MPSKitAdaptExt.jl
@@ -35,5 +35,7 @@ end
     MPSKit.JordanMPOTensor(space(W), adapt(to, W.A), adapt(to, W.B), adapt(to, W.C), adapt(to, W.D))
 @inline Adapt.adapt_structure(to, mpo::MPOHamiltonian) =
     MPOHamiltonian(map(x -> adapt(to, x), mpo.W))
+@inline Adapt.adapt_structure(to, ml::MPSKit.Multiline) = MPSKit.Multiline(map(adapt(to), ml.data))
+@inline Adapt.adapt_structure(to, pa::MPSKit.PeriodicArray) = MPSKit.PeriodicArray(map(adapt(to), pa.data))
 
 end
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index b3f03796c..47da1ed3e 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -5,12 +5,15 @@ using MPSKit: GeometryStyle, FiniteChainStyle, InfiniteChainStyle, OperatorStyle
 using TensorKit
 using MatrixAlgebraKit
 using TensorKit: ℙ, tensormaptype, TensorMapWithStorage
-using Adapt, CUDA, cuTENSOR
+using Adapt, CUDA, cuTENSOR, CUDA.CUBLAS
 
 # TODO revisit this once https://github.com/QuantumKitHub/MatrixAlgebraKit.jl/issues/176
 # is resolved
 MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
 
+pspaces = (ℙ^4, Rep[U₁](0 => 2), Rep[SU₂](1 => 1))
+vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5 // 2 => 1))
+
 @testset "CuFiniteMPO" for V in (ℂ^2, U1Space(0 => 1, 1 => 1))
     # start from random operators
     L = 4
@@ -87,3 +90,155 @@ MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
 
     @test dot(mpomps₁, mpomps₁) ≈ dot(mpo₁, mpo₁)
 end
+
+@testset "Finite CuMPOHamiltonian" begin
+    L = 3
+    T = ComplexF64
+    for T in (Float64, ComplexF64), V in (ℂ^2, U1Space(-1 => 1, 0 => 1, 1 => 1))
+        lattice = fill(V, L)
+        O₁ = randn(T, V, V)
+        O₁ += O₁'
+        E = id(storagetype(O₁), domain(O₁))
+        O₂ = randn(T, V^2 ← V^2)
+        O₂ += O₂'
+
+        H1 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, i => O₁ for i in 1:L))
+        H2 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, (i, i + 1) => O₂ for i in 1:(L - 1)))
+        H3 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, 1 => O₁, (2, 3) => O₂, (1, 3) => O₂))
+        @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
+        @test TensorKit.storagetype(H2) == CuVector{T, CUDA.DeviceMemory}
+        @test TensorKit.storagetype(H3) == CuVector{T, CUDA.DeviceMemory}
+
+        @test scalartype(H1) == scalartype(H2) == scalartype(H3) == T
+        #=if !(T <: Complex)
+            for H in (H1, H2, H3)
+                Hc = @constinferred complex(H)
+                @test scalartype(Hc) == complex(T)
+                @test TensorKit.storagetype(Hc) == CuVector{complex(T), CUDA.DeviceMemory}
+            end
+        end=# # TODO
+
+        # check if constructor works by converting back to tensormap
+        #= H1_tm = convert(TensorMap, H1)
+        operators = vcat(fill(E, L - 1), O₁)
+        @test H1_tm ≈ mapreduce(+, 1:L) do i
+            return reduce(⊗, circshift(operators, i))
+        end
+        operators = vcat(fill(E, L - 2), O₂)
+        @test convert(TensorMap, H2) ≈ mapreduce(+, 1:(L - 1)) do i
+            return reduce(⊗, circshift(operators, i))
+        end
+        @test convert(TensorMap, H3) ≈
+            O₁ ⊗ E ⊗ E + E ⊗ O₂ + permute(O₂ ⊗ E, ((1, 3, 2), (4, 6, 5))) =# # needs a fix in BlockTensorKit
+
+        # check if adding terms on the same site works
+        single_terms = Iterators.flatten(Iterators.repeated((i => O₁ / 2 for i in 1:L), 2))
+        H4 = adapt(CuArray, FiniteMPOHamiltonian(lattice, single_terms))
+        @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
+        @test H4 ≈ H1 atol = 1.0e-6
+        double_terms = Iterators.flatten(
+            Iterators.repeated(((i, i + 1) => O₂ / 2 for i in 1:(L - 1)), 2)
+        )
+        H5 = adapt(CuArray, FiniteMPOHamiltonian(lattice, double_terms))
+        @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
+        @test H5 ≈ H2 atol = 1.0e-6
+
+        # test linear algebra
+        @test H1 ≈
+            adapt(CuArray, FiniteMPOHamiltonian(lattice, 1 => O₁)) +
+            adapt(CuArray, FiniteMPOHamiltonian(lattice, 2 => O₁)) +
+            adapt(CuArray, FiniteMPOHamiltonian(lattice, 3 => O₁))
+        @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
+        #@test 0.8 * H1 + 0.2 * H1 ≈ H1 atol = 1.0e-6 # broken due to JordanMPOTensorMap
+        #=@test convert(TensorMap, H1 + H2) ≈ convert(TensorMap, H1) + convert(TensorMap, H2) atol = 1.0e-6
+        H1_trunc = changebonds(H1, SvdCut(; trscheme = truncrank(0)))
+        @test H1_trunc ≈ H1
+        @test all(left_virtualspace(H1_trunc) .== left_virtualspace(H1))=# # needs fix in BlockTensorKit
+
+        # test dot and application
+        state = rand(T, prod(lattice))
+        mps = adapt(CuArray, FiniteMPS(state))
+
+        #=@test convert(TensorMap, H1 * mps) ≈ H1_tm * state # needs fix in BlockTensorKit
+        @test H1 * state ≈ H1_tm * state
+        @test dot(mps, H2, mps) ≈ dot(mps, H2 * mps)=#
+
+        # test constructor from dictionary with mixed linear and Cartesian lattice indices as keys
+        grid = square = fill(V, 3, 3)
+
+        local_operators = Dict((I,) => O₁ for I in eachindex(grid))
+        I_vertical = CartesianIndex(1, 0)
+        vertical_operators = Dict(
+            (I, I + I_vertical) => O₂ for I in eachindex(IndexCartesian(), square) if I[1] < size(square, 1)
+        )
+        I_horizontal = CartesianIndex(0, 1)
+        horizontal_operators = Dict(
+            (I, I + I_horizontal) => O₂ for I in eachindex(IndexCartesian(), square) if I[2] < size(square, 1)
+        )
+        operators = merge(local_operators, vertical_operators, horizontal_operators)
+        H4 = adapt(CuArray, FiniteMPOHamiltonian(grid, operators))
+        @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
+
+        @test H4 ≈
+            adapt(CuArray, FiniteMPOHamiltonian(grid, local_operators)) +
+            adapt(CuArray, FiniteMPOHamiltonian(grid, vertical_operators)) +
+            adapt(CuArray, FiniteMPOHamiltonian(grid, horizontal_operators)) atol = 1.0e-4
+        @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
+
+        #H4′= H4 / 3 + 2H4 / 3
+        #@test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
+        #H5 = changebonds(H4′, SvdCut(; trscheme = trunctol(; atol = 1.0e-12)))
+        #@test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory} # more problems with arithmetic operations...
+        #psi = adapt(CuArray, FiniteMPS(physicalspace(H5), V ⊕ rightunitspace(V)))
+        #@test expectation_value(psi, H4) ≈ expectation_value(psi, H5)
+    end
+end
+
+@testset "CuInfiniteMPOHamiltonian $(sectortype(pspace))" for (pspace, Dspace) in zip(pspaces, vspaces)
+    # generate a 1-2-3 body interaction
+    operators = ntuple(3) do i
+        O = rand(ComplexF64, pspace^i, pspace^i)
+        return O += O'
+    end
+
+    H1 = adapt(CuVector{ComplexF64, CUDA.DeviceMemory}, InfiniteMPOHamiltonian(operators[1]))
+    H2 = adapt(CuVector{ComplexF64, CUDA.DeviceMemory}, InfiniteMPOHamiltonian(operators[2]))
+    H3 = adapt(CuVector{ComplexF64, CUDA.DeviceMemory}, repeat(InfiniteMPOHamiltonian(operators[3]), 2))
+
+    @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(H2) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(H3) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    # make a teststate to measure expectation values for
+    ψ1 = adapt(CuVector{ComplexF64, CUDA.DeviceMemory}, InfiniteMPS([pspace], [Dspace]))
+    ψ2 = adapt(CuVector{ComplexF64, CUDA.DeviceMemory}, InfiniteMPS([pspace, pspace], [Dspace, Dspace]))
+    @test TensorKit.storagetype(ψ1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(ψ2) == CuVector{ComplexF64, CUDA.DeviceMemory}
+
+    #=
+    e1 = expectation_value(ψ1, H1)
+    e2 = expectation_value(ψ1, H2)
+    =# # broken due to BraidingTensor
+
+    # H1 = 2 * H1 - CuArray([1]) # scalar indexing
+    # @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    # @test e1 * 2 - 1 ≈ expectation_value(ψ1, H1) atol = 1.0e-10 # broken due to BraidingTensor
+
+    H1 = H1 + H2
+    @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+
+    # @test e1 * 2 + e2 - 1 ≈ expectation_value(ψ1, H1) atol = 1.0e-10 # broken due to BraidingTensor
+
+    H1 = repeat(H1, 2)
+    @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+
+    #=e1 = expectation_value(ψ2, H1)
+    e3 = expectation_value(ψ2, H3)
+
+    @test e1 + e3 ≈ expectation_value(ψ2, H1 + H3) atol = 1.0e-10=# # broken due to BraidingTensor
+
+    #H4 = H1 + H3 # broken due to BraidingTensor
+    #@test TensorKit.storagetype(H4) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    #h4 = H4 * H4
+    #@test TensorKit.storagetype(h4) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    #@test real(expectation_value(ψ2, H4)) >= 0 # broken due to BraidingTensor
+end
diff --git a/test/cuda/states.jl b/test/cuda/states.jl
index 39627ac41..7c036ea09 100644
--- a/test/cuda/states.jl
+++ b/test/cuda/states.jl
@@ -50,3 +50,43 @@ using Adapt, CUDA, cuTENSOR
 
     @test norm(2 * ψ + ψ - 3 * ψ) ≈ 0.0 atol = sqrt(eps(real(ComplexF64)))
 end
+
+@testset "CuMultilineMPS ($(sectortype(D)), $elt)" for (D, d, elt) in
+    [(ℙ^10, ℙ^2, ComplexF64), (Rep[U₁](1 => 3), Rep[U₁](0 => 1), ComplexF32)]
+    tol = Float64(eps(real(elt)) * 100)
+    ψ = adapt(
+        CuVector{elt, CUDA.DeviceMemory}, MultilineMPS(
+            [
+                rand(elt, D * d, D) rand(elt, D * d, D)
+                rand(elt, D * d, D) rand(elt, D * d, D)
+            ]; tol
+        )
+    )
+
+    @test GeometryStyle(typeof(ψ)) == InfiniteChainStyle()
+    @test GeometryStyle(ψ) == InfiniteChainStyle()
+    @test TensorKit.storagetype(ψ) == CuVector{elt, CUDA.DeviceMemory}
+    @test TensorKit.sectortype(ψ) == sectortype(D)
+
+    @test !isfinite(typeof(ψ))
+
+    @test physicalspace(ψ) == fill(d, 2, 2)
+    @test all(x -> x ≾ D, left_virtualspace(ψ))
+    @test all(x -> x ≾ D, right_virtualspace(ψ))
+
+    for i in 1:size(ψ, 1), j in 1:size(ψ, 2)
+        @plansor difference[-1 -2; -3] := ψ.AL[i, j][-1 -2; 1] * ψ.C[i, j][1; -3] -
+            ψ.C[i, j - 1][-1; 1] * ψ.AR[i, j][1 -2; -3]
+        @test norm(difference, Inf) < tol * 10
+
+        @test l_LL(ψ, i, j) * TransferMatrix(ψ.AL[i, j], ψ.AL[i, j]) ≈ l_LL(ψ, i, j + 1)
+        @test l_LR(ψ, i, j) * TransferMatrix(ψ.AL[i, j], ψ.AR[i, j]) ≈ l_LR(ψ, i, j + 1)
+        @test l_RL(ψ, i, j) * TransferMatrix(ψ.AR[i, j], ψ.AL[i, j]) ≈ l_RL(ψ, i, j + 1)
+        @test l_RR(ψ, i, j) * TransferMatrix(ψ.AR[i, j], ψ.AR[i, j]) ≈ l_RR(ψ, i, j + 1)
+
+        @test TransferMatrix(ψ.AL[i, j], ψ.AL[i, j]) * r_LL(ψ, i, j) ≈ r_LL(ψ, i, j + 1)
+        @test TransferMatrix(ψ.AL[i, j], ψ.AR[i, j]) * r_LR(ψ, i, j) ≈ r_LR(ψ, i, j + 1)
+        @test TransferMatrix(ψ.AR[i, j], ψ.AL[i, j]) * r_RL(ψ, i, j) ≈ r_RL(ψ, i, j + 1)
+        @test TransferMatrix(ψ.AR[i, j], ψ.AR[i, j]) * r_RR(ψ, i, j) ≈ r_RR(ψ, i, j + 1)
+    end
+end

From 4a0d5055745397eeb7064014fd5bf0fa340cb29d Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Fri, 27 Mar 2026 05:57:50 -0400
Subject: [PATCH 2/7] Fix complex/real for JordanMPOTensor

---
 src/operators/jordanmpotensor.jl | 4 +++-
 test/cuda/operators.jl           | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/operators/jordanmpotensor.jl b/src/operators/jordanmpotensor.jl
index 04c4ee1cb..49704b048 100644
--- a/src/operators/jordanmpotensor.jl
+++ b/src/operators/jordanmpotensor.jl
@@ -205,7 +205,8 @@ end
 for f in (:real, :complex)
     @eval function Base.$f(W::JordanMPOTensor)
         E = $f(scalartype(W))
-        W′ = JordanMPOTensor{E}(undef, space(W))
+        TE = TensorKit.similarstoragetype(TensorKit.storagetype(W), E)
+        W′ = similar(W, TE)
         for (I, v) in nonzero_pairs(W.A)
             W′.A[I] = $f(v)
         end
@@ -218,6 +219,7 @@ for f in (:real, :complex)
         for (I, v) in nonzero_pairs(W.D)
             W′.D[I] = $f(v)
         end
+        @show TensorKit.storagetype(W), TensorKit.storagetype(W′)
         return W′
     end
 end
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index 47da1ed3e..f09cb2b4b 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -110,13 +110,13 @@ end
         @test TensorKit.storagetype(H3) == CuVector{T, CUDA.DeviceMemory}
 
         @test scalartype(H1) == scalartype(H2) == scalartype(H3) == T
-        #=if !(T <: Complex)
+        if !(T <: Complex)
             for H in (H1, H2, H3)
                 Hc = @constinferred complex(H)
                 @test scalartype(Hc) == complex(T)
                 @test TensorKit.storagetype(Hc) == CuVector{complex(T), CUDA.DeviceMemory}
             end
-        end=# # TODO
+        end
 
         # check if constructor works by converting back to tensormap
         #= H1_tm = convert(TensorMap, H1)

From 59dca6fa5b57eb16466012ceab4d7b2c6fa660e9 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 31 Mar 2026 08:34:35 -0400
Subject: [PATCH 3/7] Incremental

---
 src/operators/abstractmpo.jl     |  2 +-
 src/operators/jordanmpotensor.jl |  9 ++++++---
 src/operators/mpohamiltonian.jl  | 14 ++++++--------
 src/operators/ortho.jl           | 17 +++++++++--------
 4 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/src/operators/abstractmpo.jl b/src/operators/abstractmpo.jl
index 54ecf0358..01c49500c 100644
--- a/src/operators/abstractmpo.jl
+++ b/src/operators/abstractmpo.jl
@@ -99,7 +99,7 @@ Base.:\(α::Number, mpo::AbstractMPO) = scale(mpo, inv(α))
 
 function VectorInterface.scale(mpo::AbstractMPO, α::Number)
     T = VectorInterface.promote_scale(scalartype(mpo), scalartype(α))
-    dst = similar(mpo, T)
+    dst = similar(mpo, TensorKit.similarstoragetype(storagetype(mpo), T))
     return scale!(dst, mpo, α)
 end
 
diff --git a/src/operators/jordanmpotensor.jl b/src/operators/jordanmpotensor.jl
index 49704b048..2563496de 100644
--- a/src/operators/jordanmpotensor.jl
+++ b/src/operators/jordanmpotensor.jl
@@ -135,6 +135,10 @@ function jordanmpotensortype(::Type{O}) where {O <: AbstractTensorMap}
 end
 Base.similar(W::JordanMPOTensor, ::Type{TorA}) where {TorA} =
     jordanmpotensortype(spacetype(W), TorA)(undef, space(W))
+Base.similar(W::JordanMPOTensor, V::VectorSpace) =
+    jordanmpotensortype(spacetype(V), storagetype(W))(undef, V)
+Base.similar(W::JordanMPOTensor, V::VectorSpace, ::Type{TorA}) where {TorA} =
+    jordanmpotensortype(spacetype(V), TorA)(undef, V)
 
 # Properties
 # ----------
@@ -219,7 +223,6 @@ for f in (:real, :complex)
         for (I, v) in nonzero_pairs(W.D)
             W′.D[I] = $f(v)
         end
-        @show TensorKit.storagetype(W), TensorKit.storagetype(W′)
         return W′
     end
 end
@@ -244,7 +247,7 @@ end
     elseif 1 < i < size(W, 1) && 1 < j < size(W, 4)
         return W.A[i - 1, 1, 1, j - 1]
     else
-        return zeros(scalartype(W), eachspace(W)[i, 1, 1, j])
+        return zeros(storagetype(W), eachspace(W)[i, 1, 1, j])
     end
 end
 
@@ -359,7 +362,7 @@ function add_physical_charge(O::JordanMPOTensor, charge::Sector)
     Vdst = left_virtualspace(O) ⊗
         fuse(physicalspace(O), auxspace) ←
         fuse(physicalspace(O), auxspace) ⊗ right_virtualspace(O)
-    Odst = JordanMPOTensor{scalartype(O)}(undef, Vdst)
+    Odst = similar(O, Vdst)
     for (I, v) in nonzero_pairs(O)
         Odst[I] = add_physical_charge(v, charge)
     end
diff --git a/src/operators/mpohamiltonian.jl b/src/operators/mpohamiltonian.jl
index 1f2e7f980..bb544d0ae 100644
--- a/src/operators/mpohamiltonian.jl
+++ b/src/operators/mpohamiltonian.jl
@@ -503,10 +503,8 @@ function InfiniteMPOHamiltonian(lattice′::AbstractArray{<:VectorSpace}, local_
     end
 
     # partial sort by interaction range
-    local_mpos = sort!(
-        map(Base.Fix1(instantiate_operator, lattice), collect(local_operators));
-        by = x -> length(x[1])
-    )
+    unsorted_mpos = map(Base.Fix1(instantiate_operator, lattice), [local_operators...])
+    local_mpos = sort!(unsorted_mpos; by = x -> length(x[1]))
 
     for (sites, local_mpo) in local_mpos
         local key_R # trick to define key_R before the first iteration
@@ -701,8 +699,8 @@ end
 function Base.similar(H::MPOHamiltonian, ::Type{O}, L::Int) where {O <: MPOTensor}
     return MPOHamiltonian(similar(parent(H), O, L))
 end
-function Base.similar(H::MPOHamiltonian, ::Type{T}) where {T <: Number}
-    return MPOHamiltonian(similar.(parent(H), T))
+function Base.similar(H::MPOHamiltonian, ::Type{TorA}) where {TorA <: Union{Number, DenseVector}}
+    return MPOHamiltonian(similar.(parent(H), TorA))
 end
 
 # Linear Algebra
@@ -727,7 +725,7 @@ function Base.:+(
             ⊞(Vtriv, right_virtualspace(A), Vtriv)
         V = Vleft ⊗ physicalspace(A) ← physicalspace(A) ⊗ Vright
 
-        H[i] = eltype(H)(V, A, B, C, D)
+        H[i] = JordanMPOTensor(V, A, B, C, D)
     end
     return FiniteMPOHamiltonian(H)
 end
@@ -749,7 +747,7 @@ function Base.:+(
         Vright = ⊞(Vtriv, right_virtualspace(A), Vtriv)
         V = Vleft ⊗ physicalspace(A) ← physicalspace(A) ⊗ Vright
 
-        H[i] = eltype(H)(V, A, B, C, D)
+        H[i] = JordanMPOTensor(V, A, B, C, D)
     end
     return InfiniteMPOHamiltonian(H)
 end
diff --git a/src/operators/ortho.jl b/src/operators/ortho.jl
index d21985797..bbcaa164d 100644
--- a/src/operators/ortho.jl
+++ b/src/operators/ortho.jl
@@ -11,7 +11,7 @@ function left_canonicalize!(
     d = sqrt(dim(P))
 
     # orthogonalize second column against first
-    WI = removeunit(W[1, 1, 1, 1], 1)
+    WI = removeunit(W[1, 1, 1, 1], 1; storage_type = TensorKit.storagetype(H))
     @plansor t[l; r] := conj(WI[p; p' l]) * W.C[p; p' r]
     # TODO: the following is currently broken due to a TensorKit bug
     # @plansor C′[p; p' r] := W.C[p; p' r] - WI[p; p' l] * t[l; r]
@@ -36,7 +36,7 @@ function left_canonicalize!(
         end
         H[i] = JordanMPOTensor(codomain(W) ← physicalspace(W) ⊗ V, Q1, W.B, Q2, W.D)
     else
-        tmp = transpose(cat(insertleftunit(C′, 1), W.A; dims = 1), ((3, 1, 2), (4,)))
+        tmp = transpose(cat(insertleftunit(C′, 1; storage_type = TensorKit.storagetype(H)), W.A; dims = 1), ((3, 1, 2), (4,)))
         Q, R = left_orth!(tmp; alg)
 
         if dim(R) == 0 # fully truncated
@@ -65,7 +65,7 @@ function left_canonicalize!(
 
     if size(W′, 4) > 1
         @plansor C′[l p; p' r] := t[l; r'] * W′.A[r' p; p' r]
-        C′ = add!(removeunit(C′, 1), W′.C)
+        C′ = add!(removeunit(C′, 1; storage_type = TensorKit.storagetype(H)), W′.C)
     else
         C′ = W′.C # empty
     end
@@ -77,7 +77,7 @@ function left_canonicalize!(
     end
 
     @plansor D′[l p; p'] := t[l; r] * W′.B[r p; p']
-    D′ = add!(removeunit(D′, 1), W′.D)
+    D′ = add!(removeunit(D′, 1; storage_type = TensorKit.storagetype(H)), W′.D)
 
     H[i + 1] = JordanMPOTensor(
         right_virtualspace(H[i]) ⊗ physicalspace(W′) ← domain(W′),
@@ -99,7 +99,7 @@ function right_canonicalize!(
     d = sqrt(dim(P))
 
     # orthogonalize second row against last
-    WI = removeunit(W[end, 1, 1, end], 4)
+    WI = removeunit(W[end, 1, 1, end], 4; storage_type = TensorKit.storagetype(H))
     @plansor t[l; r] := conj(WI[r p; p']) * W.B[l p; p']
     # TODO: the following is currently broken due to a TensorKit bug
     # @plansor B′[l p; p'] := W.B[l p; p'] - WI[r p; p'] * t[l; r]
@@ -124,7 +124,8 @@ function right_canonicalize!(
         end
         H[i] = JordanMPOTensor(V ⊗ P ← domain(W), Q1, Q2, W.C, W.D)
     else
-        tmp = transpose(cat(insertleftunit(B′, 4), W.A; dims = 4), ((1,), (3, 4, 2)))
+        B′′ = insertleftunit(B′, 4; storage_type = TensorKit.storagetype(H))
+        tmp = transpose(cat(B′′, W.A; dims = 4), ((1,), (3, 4, 2)))
         R, Q = right_orth!(tmp; alg)
         if dim(R) == 0
             V = _rightunit ⊞ _rightunit
@@ -152,7 +153,7 @@ function right_canonicalize!(
 
     if size(W′, 1) > 1
         @plansor B′[l p; p' r] := W′.A[l p; p' r'] * t[r'; r]
-        B′ = add!(removeunit(B′, 4), W′.B)
+        B′ = add!(removeunit(B′, 4; storage_type = TensorKit.storagetype(H)), W′.B)
     else
         B′ = W′.B
     end
@@ -164,7 +165,7 @@ function right_canonicalize!(
     end
 
     @plansor D′[p; p' r] := W′.C[p; p' r'] * t[r'; r]
-    D′ = add!(removeunit(D′, 3), W′.D)
+    D′ = add!(removeunit(D′, 3; storage_type = TensorKit.storagetype(H)), W′.D)
     H[i - 1] = JordanMPOTensor(codomain(W′) ← physicalspace(W′) ⊗ V, A′, B′, C′, D′)
     return H
 end

From b79ceef808b8043a226fbfccf99846aa57858c0a Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 31 Mar 2026 08:41:52 -0400
Subject: [PATCH 4/7] Use branch sources

---
 Project.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Project.toml b/Project.toml
index 51e51efdf..a5320f170 100644
--- a/Project.toml
+++ b/Project.toml
@@ -76,3 +76,8 @@ cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1"
 
 [targets]
 test = ["Aqua", "Adapt", "CUDA", "cuTENSOR", "Pkg", "Test", "TestExtras", "Plots", "Combinatorics", "ParallelTestRunner", "TensorKitTensors"]
+
+[sources]
+TensorKit = {url = "https://github.com/QuantumKitHub/TensorKit.jl", rev = "ksh/cuda_tweaks"}
+BlockTensorKit = {url = "https://github.com/lkdvos/BlockTensorKit.jl", rev = "ksh/gpu"}
+

From 13ac278012cba8b6bdbdc6d535c50826e16f132e Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 31 Mar 2026 08:43:35 -0400
Subject: [PATCH 5/7] Fix URL

---
 Project.toml           |  2 +-
 test/cuda/operators.jl | 81 +++++++++++++++++++++++-------------------
 test/cuda/states.jl    |  3 +-
 3 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/Project.toml b/Project.toml
index a5320f170..41d87c327 100644
--- a/Project.toml
+++ b/Project.toml
@@ -79,5 +79,5 @@ test = ["Aqua", "Adapt", "CUDA", "cuTENSOR", "Pkg", "Test", "TestExtras", "Plots
 
 [sources]
 TensorKit = {url = "https://github.com/QuantumKitHub/TensorKit.jl", rev = "ksh/cuda_tweaks"}
-BlockTensorKit = {url = "https://github.com/lkdvos/BlockTensorKit.jl", rev = "ksh/gpu"}
+BlockTensorKit = {url = "https://github.com/kshyatt/BlockTensorKit.jl", rev = "ksh/gpu"}
 
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index f09cb2b4b..38d61eb53 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -10,10 +10,11 @@ using Adapt, CUDA, cuTENSOR, CUDA.CUBLAS
 # TODO revisit this once https://github.com/QuantumKitHub/MatrixAlgebraKit.jl/issues/176
 # is resolved
 MPSKit.Defaults.alg_svd() = CUSOLVER_QRIteration()
+MPSKit.Defaults.alg_lq() = LQViaTransposedQR(CUSOLVER_HouseholderQR())
 
 pspaces = (ℙ^4, Rep[U₁](0 => 2), Rep[SU₂](1 => 1))
 vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5 // 2 => 1))
-
+#=
 @testset "CuFiniteMPO" for V in (ℂ^2, U1Space(0 => 1, 1 => 1))
     # start from random operators
     L = 4
@@ -90,7 +91,7 @@ vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5
 
     @test dot(mpomps₁, mpomps₁) ≈ dot(mpo₁, mpo₁)
 end
-
+=#
 @testset "Finite CuMPOHamiltonian" begin
     L = 3
     T = ComplexF64
@@ -98,10 +99,12 @@ end
         lattice = fill(V, L)
         O₁ = randn(T, V, V)
         O₁ += O₁'
-        E = id(storagetype(O₁), domain(O₁))
+        E = id(CuVector{T, CUDA.DeviceMemory}, domain(O₁))
         O₂ = randn(T, V^2 ← V^2)
         O₂ += O₂'
 
+        dO₁ = adapt(CuVector{T, CUDA.DeviceMemory}, O₁)
+        dO₂ = adapt(CuVector{T, CUDA.DeviceMemory}, O₂)
         H1 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, i => O₁ for i in 1:L))
         H2 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, (i, i + 1) => O₂ for i in 1:(L - 1)))
         H3 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, 1 => O₁, (2, 3) => O₂, (1, 3) => O₂))
@@ -119,17 +122,17 @@ end
         end
 
         # check if constructor works by converting back to tensormap
-        #= H1_tm = convert(TensorMap, H1)
-        operators = vcat(fill(E, L - 1), O₁)
+        H1_tm = convert(TensorMap, H1)
+        operators = vcat(fill(E, L - 1), dO₁)
         @test H1_tm ≈ mapreduce(+, 1:L) do i
             return reduce(⊗, circshift(operators, i))
         end
-        operators = vcat(fill(E, L - 2), O₂)
+        operators = vcat(fill(E, L - 2), dO₂)
         @test convert(TensorMap, H2) ≈ mapreduce(+, 1:(L - 1)) do i
             return reduce(⊗, circshift(operators, i))
         end
         @test convert(TensorMap, H3) ≈
-            O₁ ⊗ E ⊗ E + E ⊗ O₂ + permute(O₂ ⊗ E, ((1, 3, 2), (4, 6, 5))) =# # needs a fix in BlockTensorKit
+            dO₁ ⊗ E ⊗ E + E ⊗ dO₂ + permute(dO₂ ⊗ E, ((1, 3, 2), (4, 6, 5)))
 
         # check if adding terms on the same site works
         single_terms = Iterators.flatten(Iterators.repeated((i => O₁ / 2 for i in 1:L), 2))
@@ -149,20 +152,27 @@ end
             adapt(CuArray, FiniteMPOHamiltonian(lattice, 2 => O₁)) +
             adapt(CuArray, FiniteMPOHamiltonian(lattice, 3 => O₁))
         @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
-        #@test 0.8 * H1 + 0.2 * H1 ≈ H1 atol = 1.0e-6 # broken due to JordanMPOTensorMap
-        #=@test convert(TensorMap, H1 + H2) ≈ convert(TensorMap, H1) + convert(TensorMap, H2) atol = 1.0e-6
+        #=
+        H1′a = 0.8 * H1
+        @test TensorKit.storagetype(H1′a) == CuVector{T, CUDA.DeviceMemory}
+        H1′b = 0.2 * H1
+        @test TensorKit.storagetype(H1′b) == CuVector{T, CUDA.DeviceMemory}
+        H1′ = H1′a + H1′b
+        @test TensorKit.storagetype(H1′) == CuVector{T, CUDA.DeviceMemory}
+        @test H1′ ≈ H1 atol = 1.0e-6
+        @test convert(TensorMap, H1 + H2) ≈ convert(TensorMap, H1) + convert(TensorMap, H2) atol = 1.0e-6
         H1_trunc = changebonds(H1, SvdCut(; trscheme = truncrank(0)))
         @test H1_trunc ≈ H1
-        @test all(left_virtualspace(H1_trunc) .== left_virtualspace(H1))=# # needs fix in BlockTensorKit
+        @test all(left_virtualspace(H1_trunc) .== left_virtualspace(H1))
 
         # test dot and application
         state = rand(T, prod(lattice))
         mps = adapt(CuArray, FiniteMPS(state))
 
-        #=@test convert(TensorMap, H1 * mps) ≈ H1_tm * state # needs fix in BlockTensorKit
+        @test convert(TensorMap, H1 * mps) ≈ H1_tm * state
         @test H1 * state ≈ H1_tm * state
-        @test dot(mps, H2, mps) ≈ dot(mps, H2 * mps)=#
-
+        @test dot(mps, H2, mps) ≈ dot(mps, H2 * mps)
+        =#
         # test constructor from dictionary with mixed linear and Cartesian lattice indices as keys
         grid = square = fill(V, 3, 3)
 
@@ -180,17 +190,17 @@ end
         @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
 
         @test H4 ≈
-            adapt(CuArray, FiniteMPOHamiltonian(grid, local_operators)) +
-            adapt(CuArray, FiniteMPOHamiltonian(grid, vertical_operators)) +
-            adapt(CuArray, FiniteMPOHamiltonian(grid, horizontal_operators)) atol = 1.0e-4
+            adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, local_operators)) +
+            adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, vertical_operators)) +
+            adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, horizontal_operators)) atol = 1.0e-4
         @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
 
-        #H4′= H4 / 3 + 2H4 / 3
-        #@test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
-        #H5 = changebonds(H4′, SvdCut(; trscheme = trunctol(; atol = 1.0e-12)))
-        #@test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory} # more problems with arithmetic operations...
-        #psi = adapt(CuArray, FiniteMPS(physicalspace(H5), V ⊕ rightunitspace(V)))
-        #@test expectation_value(psi, H4) ≈ expectation_value(psi, H5)
+        #H4′= H4 / 3 + 2H4 / 3 # trouble here with cat_t
+        #=@test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
+        H5 = changebonds(H4′, SvdCut(; trscheme = trunctol(; atol = 1.0e-12)))
+        @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
+        psi = adapt(CuArray, FiniteMPS(physicalspace(H5), V ⊕ rightunitspace(V)))
+        @test expectation_value(psi, H4) ≈ expectation_value(psi, H5)=#
     end
 end
 
@@ -214,31 +224,30 @@ end
     @test TensorKit.storagetype(ψ1) == CuVector{ComplexF64, CUDA.DeviceMemory}
     @test TensorKit.storagetype(ψ2) == CuVector{ComplexF64, CUDA.DeviceMemory}
 
-    #=
     e1 = expectation_value(ψ1, H1)
     e2 = expectation_value(ψ1, H2)
-    =# # broken due to BraidingTensor
 
-    # H1 = 2 * H1 - CuArray([1]) # scalar indexing
-    # @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
-    # @test e1 * 2 - 1 ≈ expectation_value(ψ1, H1) atol = 1.0e-10 # broken due to BraidingTensor
+    H1 = 2 * H1
+    @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    H1 -= [1]
+    @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test e1 * 2 - 1 ≈ expectation_value(ψ1, H1) atol = 1.0e-10
 
     H1 = H1 + H2
     @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
-
-    # @test e1 * 2 + e2 - 1 ≈ expectation_value(ψ1, H1) atol = 1.0e-10 # broken due to BraidingTensor
+    @test e1 * 2 + e2 - 1 ≈ expectation_value(ψ1, H1) atol = 1.0e-10
 
     H1 = repeat(H1, 2)
     @test TensorKit.storagetype(H1) == CuVector{ComplexF64, CUDA.DeviceMemory}
 
-    #=e1 = expectation_value(ψ2, H1)
+    e1 = expectation_value(ψ2, H1)
     e3 = expectation_value(ψ2, H3)
 
-    @test e1 + e3 ≈ expectation_value(ψ2, H1 + H3) atol = 1.0e-10=# # broken due to BraidingTensor
+    @test e1 + e3 ≈ expectation_value(ψ2, H1 + H3) atol = 1.0e-10
 
-    #H4 = H1 + H3 # broken due to BraidingTensor
-    #@test TensorKit.storagetype(H4) == CuVector{ComplexF64, CUDA.DeviceMemory}
-    #h4 = H4 * H4
-    #@test TensorKit.storagetype(h4) == CuVector{ComplexF64, CUDA.DeviceMemory}
-    #@test real(expectation_value(ψ2, H4)) >= 0 # broken due to BraidingTensor
+    H4 = H1 + H3
+    @test TensorKit.storagetype(H4) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    h4 = H4 * H4
+    @test TensorKit.storagetype(h4) == CuVector{ComplexF64, CUDA.DeviceMemory}
+    @test real(expectation_value(ψ2, H4)) >= 0
 end
diff --git a/test/cuda/states.jl b/test/cuda/states.jl
index 7c036ea09..a37712209 100644
--- a/test/cuda/states.jl
+++ b/test/cuda/states.jl
@@ -4,7 +4,7 @@ using MPSKit: GeometryStyle, InfiniteChainStyle, TransferMatrix
 using TensorKit
 using TensorKit: ℙ
 using Adapt, CUDA, cuTENSOR
-
+#=
 @testset "CuMPS ($(sectortype(D)), $elt)" for (D, d, elt) in
     [(ℙ^10, ℙ^2, ComplexF64), (Rep[U₁](1 => 3), Rep[U₁](0 => 1), ComplexF64)]
     tol = Float64(eps(real(elt)) * 100)
@@ -90,3 +90,4 @@ end
         @test TransferMatrix(ψ.AR[i, j], ψ.AR[i, j]) * r_RR(ψ, i, j) ≈ r_RR(ψ, i, j + 1)
     end
 end
+=#

From e3b0186990c07c131d249da48a14503712518f39 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Tue, 31 Mar 2026 13:42:15 -0400
Subject: [PATCH 6/7] Last fix?

---
 src/operators/abstractmpo.jl |  6 +++++-
 test/cuda/operators.jl       | 16 ++++++++--------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/operators/abstractmpo.jl b/src/operators/abstractmpo.jl
index 01c49500c..4e021eae6 100644
--- a/src/operators/abstractmpo.jl
+++ b/src/operators/abstractmpo.jl
@@ -152,7 +152,11 @@ Compute the mpo tensor that arises from multiplying MPOs.
 """
 function fuse_mul_mpo(O1, O2)
     TT = promote_type(scalartype(O1), scalartype(O2))
-    T = TensorKit.similarstoragetype(storagetype(O1), TT)
+    T = if O1 isa BraidingTensor
+            TensorKit.similarstoragetype(storagetype(O2), TT)
+        else
+            TensorKit.similarstoragetype(storagetype(O1), TT)
+        end
     F_left = fuser(T, left_virtualspace(O2), left_virtualspace(O1))
     F_right = fuser(T, right_virtualspace(O2), right_virtualspace(O1))
     return _fuse_mpo_mpo(O1, O2, F_left, F_right)
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index 38d61eb53..facd99682 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -14,7 +14,7 @@ MPSKit.Defaults.alg_lq() = LQViaTransposedQR(CUSOLVER_HouseholderQR())
 
 pspaces = (ℙ^4, Rep[U₁](0 => 2), Rep[SU₂](1 => 1))
 vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5 // 2 => 1))
-#=
+
 @testset "CuFiniteMPO" for V in (ℂ^2, U1Space(0 => 1, 1 => 1))
     # start from random operators
     L = 4
@@ -91,7 +91,7 @@ vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5
 
     @test dot(mpomps₁, mpomps₁) ≈ dot(mpo₁, mpo₁)
 end
-=#
+
 @testset "Finite CuMPOHamiltonian" begin
     L = 3
     T = ComplexF64
@@ -152,7 +152,7 @@ end
             adapt(CuArray, FiniteMPOHamiltonian(lattice, 2 => O₁)) +
             adapt(CuArray, FiniteMPOHamiltonian(lattice, 3 => O₁))
         @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
-        #=
+
         H1′a = 0.8 * H1
         @test TensorKit.storagetype(H1′a) == CuVector{T, CUDA.DeviceMemory}
         H1′b = 0.2 * H1
@@ -172,7 +172,7 @@ end
         @test convert(TensorMap, H1 * mps) ≈ H1_tm * state
         @test H1 * state ≈ H1_tm * state
         @test dot(mps, H2, mps) ≈ dot(mps, H2 * mps)
-        =#
+
         # test constructor from dictionary with mixed linear and Cartesian lattice indices as keys
         grid = square = fill(V, 3, 3)
 
@@ -195,15 +195,15 @@ end
             adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, horizontal_operators)) atol = 1.0e-4
         @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
 
-        #H4′= H4 / 3 + 2H4 / 3 # trouble here with cat_t
-        #=@test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
+        H4′= H4 / 3 + 2H4 / 3
+        @test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
         H5 = changebonds(H4′, SvdCut(; trscheme = trunctol(; atol = 1.0e-12)))
         @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
         psi = adapt(CuArray, FiniteMPS(physicalspace(H5), V ⊕ rightunitspace(V)))
-        @test expectation_value(psi, H4) ≈ expectation_value(psi, H5)=#
+        @test expectation_value(psi, H4) ≈ expectation_value(psi, H5)
     end
 end
-
+=#
 @testset "CuInfiniteMPOHamiltonian $(sectortype(pspace))" for (pspace, Dspace) in zip(pspaces, vspaces)
     # generate a 1-2-3 body interaction
     operators = ntuple(3) do i

From 9b224da4fc67b7bd47b6dbf5a01116b65d896752 Mon Sep 17 00:00:00 2001
From: Katharine Hyatt <khyatt@flatironinstitute.org>
Date: Thu, 2 Apr 2026 06:35:39 -0400
Subject: [PATCH 7/7] Working around BraidingTensor

---
 src/operators/abstractmpo.jl    |   8 +-
 src/operators/mpo.jl            |   2 +-
 src/operators/mpohamiltonian.jl |  10 +-
 src/operators/ortho.jl          |  30 +++--
 test/cuda/operators.jl          | 222 +++++++++++++++++---------------
 5 files changed, 151 insertions(+), 121 deletions(-)

diff --git a/src/operators/abstractmpo.jl b/src/operators/abstractmpo.jl
index 4e021eae6..85fb3e5a2 100644
--- a/src/operators/abstractmpo.jl
+++ b/src/operators/abstractmpo.jl
@@ -153,10 +153,10 @@ Compute the mpo tensor that arises from multiplying MPOs.
 function fuse_mul_mpo(O1, O2)
     TT = promote_type(scalartype(O1), scalartype(O2))
     T = if O1 isa BraidingTensor
-            TensorKit.similarstoragetype(storagetype(O2), TT)
-        else
-            TensorKit.similarstoragetype(storagetype(O1), TT)
-        end
+        TensorKit.similarstoragetype(storagetype(O2), TT)
+    else
+        TensorKit.similarstoragetype(storagetype(O1), TT)
+    end
     F_left = fuser(T, left_virtualspace(O2), left_virtualspace(O1))
     F_right = fuser(T, right_virtualspace(O2), right_virtualspace(O1))
     return _fuse_mpo_mpo(O1, O2, F_left, F_right)
diff --git a/src/operators/mpo.jl b/src/operators/mpo.jl
index 8630a0d56..7bf0d16ca 100644
--- a/src/operators/mpo.jl
+++ b/src/operators/mpo.jl
@@ -62,7 +62,7 @@ eachsite(mpo::InfiniteMPO) = PeriodicArray(eachindex(mpo))
 function Base.similar(mpo::MPO{<:MPOTensor}, ::Type{O}, L::Int) where {O <: MPOTensor}
     return MPO(similar(parent(mpo), O, L))
 end
-function Base.similar(mpo::MPO, ::Type{T}) where {T <: Number}
+function Base.similar(mpo::MPO, ::Type{T}) where {T <: Union{Number, DenseVector}}
     return MPO(similar.(parent(mpo), T))
 end
 
diff --git a/src/operators/mpohamiltonian.jl b/src/operators/mpohamiltonian.jl
index bb544d0ae..05fc36f1a 100644
--- a/src/operators/mpohamiltonian.jl
+++ b/src/operators/mpohamiltonian.jl
@@ -660,6 +660,8 @@ function isemptylevel(H::InfiniteMPOHamiltonian, i::Int)
 end
 
 function Base.convert(::Type{TensorMap}, H::FiniteMPOHamiltonian)
+    # TODO this FORCES the output tensor to have Vector storage
+    # in an uncontrollable way, should be fixed
     L = removeunit(H[1], 1)
     R = removeunit(H[end], 4)
     M = Tuple(H[2:(end - 1)])
@@ -853,7 +855,13 @@ end
 function Base.:*(H::FiniteMPOHamiltonian{<:MPOTensor}, x::AbstractTensorMap)
     @assert length(H) > 1
     @assert numout(x) == length(H)
-    L = removeunit(H[1], 1)
+    if H[1] isa BraidingTensor
+        L′ = removeunit(H[1], 1)
+        L = similar(L′, TensorKit.storagetype(H))
+        copy!(L, L′)
+    else
+        L = removeunit(H[1], 1)
+    end
     M = Tuple(H[2:(end - 1)])
     R = removeunit(H[end], 4)
     return TensorMap(_apply_finitempo(x, L, M, R))
diff --git a/src/operators/ortho.jl b/src/operators/ortho.jl
index bbcaa164d..88a37025d 100644
--- a/src/operators/ortho.jl
+++ b/src/operators/ortho.jl
@@ -11,7 +11,14 @@ function left_canonicalize!(
     d = sqrt(dim(P))
 
     # orthogonalize second column against first
-    WI = removeunit(W[1, 1, 1, 1], 1; storage_type = TensorKit.storagetype(H))
+    Wi = W[1, 1, 1, 1]
+    if Wi isa BraidingTensor
+        Wi′ = removeunit(Wi, 1)
+        WI = similar(Wi′, TensorKit.storagetype(H))
+        copy!(WI, Wi′)
+    else
+        WI = removeunit(Wi, 1)
+    end
     @plansor t[l; r] := conj(WI[p; p' l]) * W.C[p; p' r]
     # TODO: the following is currently broken due to a TensorKit bug
     # @plansor C′[p; p' r] := W.C[p; p' r] - WI[p; p' l] * t[l; r]
@@ -36,7 +43,7 @@ function left_canonicalize!(
         end
         H[i] = JordanMPOTensor(codomain(W) ← physicalspace(W) ⊗ V, Q1, W.B, Q2, W.D)
     else
-        tmp = transpose(cat(insertleftunit(C′, 1; storage_type = TensorKit.storagetype(H)), W.A; dims = 1), ((3, 1, 2), (4,)))
+        tmp = transpose(cat(insertleftunit(C′, 1), W.A; dims = 1), ((3, 1, 2), (4,)))
         Q, R = left_orth!(tmp; alg)
 
         if dim(R) == 0 # fully truncated
@@ -65,7 +72,7 @@ function left_canonicalize!(
 
     if size(W′, 4) > 1
         @plansor C′[l p; p' r] := t[l; r'] * W′.A[r' p; p' r]
-        C′ = add!(removeunit(C′, 1; storage_type = TensorKit.storagetype(H)), W′.C)
+        C′ = add!(removeunit(C′, 1), W′.C)
     else
         C′ = W′.C # empty
     end
@@ -77,7 +84,7 @@ function left_canonicalize!(
     end
 
     @plansor D′[l p; p'] := t[l; r] * W′.B[r p; p']
-    D′ = add!(removeunit(D′, 1; storage_type = TensorKit.storagetype(H)), W′.D)
+    D′ = add!(removeunit(D′, 1), W′.D)
 
     H[i + 1] = JordanMPOTensor(
         right_virtualspace(H[i]) ⊗ physicalspace(W′) ← domain(W′),
@@ -99,7 +106,14 @@ function right_canonicalize!(
     d = sqrt(dim(P))
 
     # orthogonalize second row against last
-    WI = removeunit(W[end, 1, 1, end], 4; storage_type = TensorKit.storagetype(H))
+    Wi = W[end, 1, 1, end]
+    if Wi isa BraidingTensor
+        Wi′ = removeunit(Wi, 4)
+        WI = similar(Wi′, TensorKit.storagetype(H))
+        copy!(WI, Wi′)
+    else
+        WI = removeunit(Wi, 4)
+    end
     @plansor t[l; r] := conj(WI[r p; p']) * W.B[l p; p']
     # TODO: the following is currently broken due to a TensorKit bug
     # @plansor B′[l p; p'] := W.B[l p; p'] - WI[r p; p'] * t[l; r]
@@ -124,7 +138,7 @@ function right_canonicalize!(
         end
         H[i] = JordanMPOTensor(V ⊗ P ← domain(W), Q1, Q2, W.C, W.D)
     else
-        B′′ = insertleftunit(B′, 4; storage_type = TensorKit.storagetype(H))
+        B′′ = insertleftunit(B′, 4)
         tmp = transpose(cat(B′′, W.A; dims = 4), ((1,), (3, 4, 2)))
         R, Q = right_orth!(tmp; alg)
         if dim(R) == 0
@@ -153,7 +167,7 @@ function right_canonicalize!(
 
     if size(W′, 1) > 1
         @plansor B′[l p; p' r] := W′.A[l p; p' r'] * t[r'; r]
-        B′ = add!(removeunit(B′, 4; storage_type = TensorKit.storagetype(H)), W′.B)
+        B′ = add!(removeunit(B′, 4), W′.B)
     else
         B′ = W′.B
     end
@@ -165,7 +179,7 @@ function right_canonicalize!(
     end
 
     @plansor D′[p; p' r] := W′.C[p; p' r'] * t[r'; r]
-    D′ = add!(removeunit(D′, 3; storage_type = TensorKit.storagetype(H)), W′.D)
+    D′ = add!(removeunit(D′, 3), W′.D)
     H[i - 1] = JordanMPOTensor(codomain(W′) ← physicalspace(W′) ⊗ V, A′, B′, C′, D′)
     return H
 end
diff --git a/test/cuda/operators.jl b/test/cuda/operators.jl
index facd99682..1913866ef 100644
--- a/test/cuda/operators.jl
+++ b/test/cuda/operators.jl
@@ -92,118 +92,126 @@ vspaces = (ℙ^10, Rep[U₁]((0 => 20)), Rep[SU₂](1 // 2 => 10, 3 // 2 => 5, 5
     @test dot(mpomps₁, mpomps₁) ≈ dot(mpo₁, mpo₁)
 end
 
-@testset "Finite CuMPOHamiltonian" begin
+@testset "Finite CuMPOHamiltonian T ($T), V ($(spacetype(V)))" for T in (Float64, ComplexF64), V in (ℂ^2, U1Space(-1 => 1, 0 => 1, 1 => 1))
     L = 3
-    T = ComplexF64
-    for T in (Float64, ComplexF64), V in (ℂ^2, U1Space(-1 => 1, 0 => 1, 1 => 1))
-        lattice = fill(V, L)
-        O₁ = randn(T, V, V)
-        O₁ += O₁'
-        E = id(CuVector{T, CUDA.DeviceMemory}, domain(O₁))
-        O₂ = randn(T, V^2 ← V^2)
-        O₂ += O₂'
-
-        dO₁ = adapt(CuVector{T, CUDA.DeviceMemory}, O₁)
-        dO₂ = adapt(CuVector{T, CUDA.DeviceMemory}, O₂)
-        H1 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, i => O₁ for i in 1:L))
-        H2 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, (i, i + 1) => O₂ for i in 1:(L - 1)))
-        H3 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, 1 => O₁, (2, 3) => O₂, (1, 3) => O₂))
-        @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
-        @test TensorKit.storagetype(H2) == CuVector{T, CUDA.DeviceMemory}
-        @test TensorKit.storagetype(H3) == CuVector{T, CUDA.DeviceMemory}
-
-        @test scalartype(H1) == scalartype(H2) == scalartype(H3) == T
-        if !(T <: Complex)
-            for H in (H1, H2, H3)
-                Hc = @constinferred complex(H)
-                @test scalartype(Hc) == complex(T)
-                @test TensorKit.storagetype(Hc) == CuVector{complex(T), CUDA.DeviceMemory}
-            end
+    lattice = fill(V, L)
+    O₁ = randn(T, V, V)
+    O₁ += O₁'
+    E = id(CuVector{T, CUDA.DeviceMemory}, domain(O₁))
+    O₂ = randn(T, V^2 ← V^2)
+    O₂ += O₂'
+
+    dO₁ = adapt(CuVector{T, CUDA.DeviceMemory}, O₁)
+    dO₂ = adapt(CuVector{T, CUDA.DeviceMemory}, O₂)
+    H1 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, i => O₁ for i in 1:L))
+    H2 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, (i, i + 1) => O₂ for i in 1:(L - 1)))
+    H3 = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(lattice, 1 => O₁, (2, 3) => O₂, (1, 3) => O₂))
+    @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(H2) == CuVector{T, CUDA.DeviceMemory}
+    @test TensorKit.storagetype(H3) == CuVector{T, CUDA.DeviceMemory}
+
+    @test scalartype(H1) == scalartype(H2) == scalartype(H3) == T
+    if !(T <: Complex)
+        for H in (H1, H2, H3)
+            Hc = @constinferred complex(H)
+            @test scalartype(Hc) == complex(T)
+            @test TensorKit.storagetype(Hc) == CuVector{complex(T), CUDA.DeviceMemory}
         end
+    end
 
-        # check if constructor works by converting back to tensormap
-        H1_tm = convert(TensorMap, H1)
-        operators = vcat(fill(E, L - 1), dO₁)
-        @test H1_tm ≈ mapreduce(+, 1:L) do i
-            return reduce(⊗, circshift(operators, i))
-        end
-        operators = vcat(fill(E, L - 2), dO₂)
-        @test convert(TensorMap, H2) ≈ mapreduce(+, 1:(L - 1)) do i
-            return reduce(⊗, circshift(operators, i))
-        end
-        @test convert(TensorMap, H3) ≈
-            dO₁ ⊗ E ⊗ E + E ⊗ dO₂ + permute(dO₂ ⊗ E, ((1, 3, 2), (4, 6, 5)))
-
-        # check if adding terms on the same site works
-        single_terms = Iterators.flatten(Iterators.repeated((i => O₁ / 2 for i in 1:L), 2))
-        H4 = adapt(CuArray, FiniteMPOHamiltonian(lattice, single_terms))
-        @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
-        @test H4 ≈ H1 atol = 1.0e-6
-        double_terms = Iterators.flatten(
-            Iterators.repeated(((i, i + 1) => O₂ / 2 for i in 1:(L - 1)), 2)
-        )
-        H5 = adapt(CuArray, FiniteMPOHamiltonian(lattice, double_terms))
-        @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
-        @test H5 ≈ H2 atol = 1.0e-6
-
-        # test linear algebra
-        @test H1 ≈
-            adapt(CuArray, FiniteMPOHamiltonian(lattice, 1 => O₁)) +
-            adapt(CuArray, FiniteMPOHamiltonian(lattice, 2 => O₁)) +
-            adapt(CuArray, FiniteMPOHamiltonian(lattice, 3 => O₁))
-        @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
-
-        H1′a = 0.8 * H1
-        @test TensorKit.storagetype(H1′a) == CuVector{T, CUDA.DeviceMemory}
-        H1′b = 0.2 * H1
-        @test TensorKit.storagetype(H1′b) == CuVector{T, CUDA.DeviceMemory}
-        H1′ = H1′a + H1′b
-        @test TensorKit.storagetype(H1′) == CuVector{T, CUDA.DeviceMemory}
-        @test H1′ ≈ H1 atol = 1.0e-6
-        @test convert(TensorMap, H1 + H2) ≈ convert(TensorMap, H1) + convert(TensorMap, H2) atol = 1.0e-6
-        H1_trunc = changebonds(H1, SvdCut(; trscheme = truncrank(0)))
-        @test H1_trunc ≈ H1
-        @test all(left_virtualspace(H1_trunc) .== left_virtualspace(H1))
-
-        # test dot and application
-        state = rand(T, prod(lattice))
-        mps = adapt(CuArray, FiniteMPS(state))
-
-        @test convert(TensorMap, H1 * mps) ≈ H1_tm * state
-        @test H1 * state ≈ H1_tm * state
-        @test dot(mps, H2, mps) ≈ dot(mps, H2 * mps)
-
-        # test constructor from dictionary with mixed linear and Cartesian lattice indices as keys
-        grid = square = fill(V, 3, 3)
-
-        local_operators = Dict((I,) => O₁ for I in eachindex(grid))
-        I_vertical = CartesianIndex(1, 0)
-        vertical_operators = Dict(
-            (I, I + I_vertical) => O₂ for I in eachindex(IndexCartesian(), square) if I[1] < size(square, 1)
-        )
-        I_horizontal = CartesianIndex(0, 1)
-        horizontal_operators = Dict(
-            (I, I + I_horizontal) => O₂ for I in eachindex(IndexCartesian(), square) if I[2] < size(square, 1)
-        )
-        operators = merge(local_operators, vertical_operators, horizontal_operators)
-        H4 = adapt(CuArray, FiniteMPOHamiltonian(grid, operators))
-        @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
-
-        @test H4 ≈
-            adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, local_operators)) +
-            adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, vertical_operators)) +
-            adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, horizontal_operators)) atol = 1.0e-4
-        @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
-
-        H4′= H4 / 3 + 2H4 / 3
-        @test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
-        H5 = changebonds(H4′, SvdCut(; trscheme = trunctol(; atol = 1.0e-12)))
-        @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
-        psi = adapt(CuArray, FiniteMPS(physicalspace(H5), V ⊕ rightunitspace(V)))
-        @test expectation_value(psi, H4) ≈ expectation_value(psi, H5)
+    # check if constructor works by converting back to tensormap
+    H1_tm = convert(TensorMap, H1)
+    @test TensorKit.storagetype(H1_tm) == CuVector{T, CUDA.DeviceMemory}
+    operators = vcat(fill(E, L - 1), dO₁)
+    @test H1_tm ≈ mapreduce(+, 1:L) do i
+        return reduce(⊗, circshift(operators, i))
+    end
+    operators = vcat(fill(E, L - 2), dO₂)
+    @test convert(TensorMap, H2) ≈ mapreduce(+, 1:(L - 1)) do i
+        return reduce(⊗, circshift(operators, i))
     end
+    @test convert(TensorMap, H3) ≈
+        dO₁ ⊗ E ⊗ E + E ⊗ dO₂ + permute(dO₂ ⊗ E, ((1, 3, 2), (4, 6, 5)))
+
+    # check if adding terms on the same site works
+    single_terms = Iterators.flatten(Iterators.repeated((i => O₁ / 2 for i in 1:L), 2))
+    H4 = adapt(CuArray, FiniteMPOHamiltonian(lattice, single_terms))
+    @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
+    @test H4 ≈ H1 atol = 1.0e-6
+    double_terms = Iterators.flatten(
+        Iterators.repeated(((i, i + 1) => O₂ / 2 for i in 1:(L - 1)), 2)
+    )
+    H5 = adapt(CuArray, FiniteMPOHamiltonian(lattice, double_terms))
+    @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
+    @test H5 ≈ H2 atol = 1.0e-6
+
+    # test linear algebra
+    @test H1 ≈
+        adapt(CuArray, FiniteMPOHamiltonian(lattice, 1 => O₁)) +
+        adapt(CuArray, FiniteMPOHamiltonian(lattice, 2 => O₁)) +
+        adapt(CuArray, FiniteMPOHamiltonian(lattice, 3 => O₁))
+    @test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
+
+    H1′a = 0.8 * H1
+    @test TensorKit.storagetype(H1′a) == CuVector{T, CUDA.DeviceMemory}
+    H1′b = 0.2 * H1
+    @test TensorKit.storagetype(H1′b) == CuVector{T, CUDA.DeviceMemory}
+    H1′ = H1′a + H1′b
+    @test TensorKit.storagetype(H1′) == CuVector{T, CUDA.DeviceMemory}
+    @test H1′ ≈ H1 atol = 1.0e-6
+    @test convert(TensorMap, H1 + H2) ≈ convert(TensorMap, H1) + convert(TensorMap, H2) atol = 1.0e-6
+    H1_trunc = changebonds(H1, SvdCut(; trscheme = truncrank(0)))
+    @test H1_trunc ≈ H1
+    @test all(left_virtualspace(H1_trunc) .== left_virtualspace(H1))
+
+    # test dot and application
+    state = adapt(CuVector{T, CUDA.DeviceMemory}, rand(T, prod(lattice)))
+    @test TensorKit.storagetype(state) == CuVector{T, CUDA.DeviceMemory}
+    mps = adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPS(state))
+    @test TensorKit.storagetype(mps) == CuVector{T, CUDA.DeviceMemory}
+
+    # TODO fix me!
+    #=@test TensorKit.storagetype(H1) == CuVector{T, CUDA.DeviceMemory}
+    H1mps = H1 * mps
+    @test TensorKit.storagetype(H1mps) == CuVector{T, CUDA.DeviceMemory}
+    H1tmstate = H1_tm * state 
+    @test TensorKit.storagetype(H1tmstate) == CuVector{T, CUDA.DeviceMemory}
+    H1mpstm = convert(TensorMap, H1mps)
+    @test TensorKit.storagetype(H1mpstm) == CuVector{T, CUDA.DeviceMemory}
+    @test H1mpstm ≈ H1tmstate
+    @test H1 * state ≈ H1tmstate
+    @test dot(mps, H2, mps) ≈ dot(mps, H2 * mps)=#
+
+    # test constructor from dictionary with mixed linear and Cartesian lattice indices as keys
+    grid = square = fill(V, 3, 3)
+
+    local_operators = Dict((I,) => O₁ for I in eachindex(grid))
+    I_vertical = CartesianIndex(1, 0)
+    vertical_operators = Dict(
+        (I, I + I_vertical) => O₂ for I in eachindex(IndexCartesian(), square) if I[1] < size(square, 1)
+    )
+    I_horizontal = CartesianIndex(0, 1)
+    horizontal_operators = Dict(
+        (I, I + I_horizontal) => O₂ for I in eachindex(IndexCartesian(), square) if I[2] < size(square, 1)
+    )
+    operators = merge(local_operators, vertical_operators, horizontal_operators)
+    H4 = adapt(CuArray, FiniteMPOHamiltonian(grid, operators))
+    @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
+
+    @test H4 ≈
+        adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, local_operators)) +
+        adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, vertical_operators)) +
+        adapt(CuVector{T, CUDA.DeviceMemory}, FiniteMPOHamiltonian(grid, horizontal_operators)) atol = 1.0e-4
+    @test TensorKit.storagetype(H4) == CuVector{T, CUDA.DeviceMemory}
+
+    H4′ = H4 / 3 + 2H4 / 3
+    @test TensorKit.storagetype(H4′) == CuVector{T, CUDA.DeviceMemory}
+    #=H5 = changebonds(H4′, SvdCut(; trscheme = trunctol(; atol = 1.0e-12)))
+    @test TensorKit.storagetype(H5) == CuVector{T, CUDA.DeviceMemory}
+    psi = adapt(CuArray, FiniteMPS(physicalspace(H5), V ⊕ rightunitspace(V)))
+    @test expectation_value(psi, H4) ≈ expectation_value(psi, H5)=# # TODO
 end
-=#
+
 @testset "CuInfiniteMPOHamiltonian $(sectortype(pspace))" for (pspace, Dspace) in zip(pspaces, vspaces)
     # generate a 1-2-3 body interaction
     operators = ntuple(3) do i