3 changes: 2 additions & 1 deletion ext/TensorKitCUDAExt/TensorKitCUDAExt.jl
@@ -10,13 +10,14 @@ using TensorKit.Factorizations
using TensorKit.Strided
using TensorKit.Factorizations: AbstractAlgorithm
using TensorKit: SectorDict, tensormaptype, scalar, similarstoragetype, AdjointTensorMap, scalartype, project_symmetric_and_check
import TensorKit: randisometry, rand, randn
import TensorKit: randisometry, rand, randn, _copyto!, _add_general_kernel_nonthreaded!, blocktype

using TensorKit: MatrixAlgebraKit

using Random

include("cutensormap.jl")
include("truncation.jl")
include("auxiliary.jl")

end
28 changes: 28 additions & 0 deletions ext/TensorKitCUDAExt/auxiliary.jl
@@ -0,0 +1,28 @@
function TensorKit._copyto!(A::StridedView{TA, 1, <:CuArray{TA}}, B::StridedView{TB, 2, <:CuArray{TB}}) where {TA, TB}
Member:

Does this make sense to include, and should this not simply fall back to the default copyto!?
This really is just a performance optimization to avoid a bunch of the overhead of Strided.jl, but I would be surprised if building the index arrays like this really gives an improvement over just a regular strided copyto!.

I think this entire thing should boil down to the following, which is not obvious and I should have added a comment/fallback definition: (up to some off-by-one errors though)

A[A.offset:stride(A, 1):end] .= B.op.(view(B, div(B.offset, stride(B, 2)):stride(B, 1):size(B, 1), 1:stride(B, 2):size(B, 2)))

Member Author:

It seems to be necessary to avoid scalar indexing sadness 🤷 . Happy to use the fallback, though!

    length(A) == length(B) || throw(DimensionMismatch(lazy"length of A ($(length(A))) does not match length of B ($(length(B)))"))

    Adata = parent(A)
    Astr = stride(A, 1)
    IA = A.offset

    Bdata = parent(B)
    Bstr = strides(B)

    IB_1 = B.offset
    # build index arrays
    IAs = Int[]
    IBs = Int[]
    @inbounds for _ in axes(B, 2)
        IB = IB_1
        for _ in axes(B, 1)
            IA += Astr
            append!(IAs, IA)
            IB += Bstr[1]
            append!(IBs, IB)
        end
        IB_1 += Bstr[2]
    end
    Adata[IAs] .= Bdata[IBs]

    return A
end
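
For illustration, the index-gather pattern above can be reproduced on plain CuArrays. The snippet below is a standalone, hypothetical example (not part of the extension); it shows how a single vectorized assignment through precomputed index vectors replaces many per-element (scalar) GPU accesses:

# Hypothetical standalone illustration of the index-gather copy used above.
using CUDA

function gather_copy!(dst::CuVector, dst_inds::Vector{Int}, src::CuVector, src_inds::Vector{Int})
    # one vectorized gather/scatter instead of per-element (scalar) GPU indexing
    dst[dst_inds] .= src[src_inds]
    return dst
end

a = CUDA.zeros(Float32, 6)
b = CuArray(Float32[1, 2, 3, 4, 5, 6])
gather_copy!(a, [1, 3, 5], b, [2, 4, 6])  # a now holds [2, 0, 4, 0, 6, 0]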
53 changes: 50 additions & 3 deletions ext/TensorKitCUDAExt/cutensormap.jl
@@ -7,6 +7,17 @@ function CuTensorMap(t::TensorMap{T, S, N₁, N₂, A}) where {T, S, N₁, N₂,
return CuTensorMap{T, S, N₁, N₂}(CuArray{T}(t.data), space(t))
end

#=function TensorKit.TensorMap{T, S₁, N₁, N₂, A}(
Member:

leftover?

Member Author:

😰

        ::UndefInitializer, space::TensorMapSpace{S₂, N₁, N₂}
    ) where {T, S₁, S₂ <: TensorKit.ElementarySpace, N₁, N₂, A <: CuVector{T}}
    d = TensorKit.fusionblockstructure(space).totaldim
    data = A(undef, d)
    if !isbitstype(T)
        zerovector!(data)
    end
    return TensorKit.TensorMap{T, S₂, A}(data, space)
end=#

# project_symmetric! doesn't yet work for GPU types, so do this on the host, then copy
function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::AbstractArray, V::TensorMapSpace; tol = sqrt(eps(real(float(eltype(data)))))) where {T, A <: CuVector{T}}
h_t = TensorKit.TensorMapWithStorage{T, Vector{T}}(undef, V)
@@ -17,6 +28,10 @@ function TensorKit.project_symmetric_and_check(::Type{T}, ::Type{A}, data::Abstr
return TensorKit.TensorMapWithStorage{T, A}(A(h_t.data), V)
end

function TensorKit.blocktype(::Type{<:CuTensorMap{T, S}}) where {T, S}
return SubArray{T, 1, CuVector{T, CUDA.DeviceMemory}, Tuple{UnitRange{Int}}, true}
Member:

I somehow had expected the blocktype to be CuMatrix, with the way that CUDA handles views. If this isn't the case, can we force it to be?

Member Author:

Actually it wanted it to be ReshapedArray of this SubArray 😱 . Really painful. I can swap this to just being a CuMatrix.

end
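
If the block type does get swapped to a dense CUDA matrix, as discussed in the thread above, a minimal sketch of that alternative could look like the following. This is an assumption about where the discussion lands, not part of the diff; blocks would then have to be materialized rather than returned as reshaped views:

# Hypothetical alternative definition, assuming blocks are exposed as dense
# device matrices instead of reshaped SubArray views into the CuVector storage:
function TensorKit.blocktype(::Type{<:CuTensorMap{T, S}}) where {T, S}
    return CuMatrix{T, CUDA.DeviceMemory}
end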

for (fname, felt) in ((:zeros, :zero), (:ones, :one))
@eval begin
function CUDA.$fname(
@@ -102,9 +117,21 @@ function TensorKit.scalar(t::CuTensorMap{T, S, 0, 0}) where {T, S}
end

function Base.convert(
Member:

I'm again a bit confused by the necessity of this function, is that not the same definition as the regular TensorMap one?

TT::Type{CuTensorMap{T, S, N₁, N₂}},
t::AbstractTensorMap{<:Any, S, N₁, N₂}
) where {T, S, N₁, N₂}
TT::Type{TensorMap{T, S, N₁, N₂, A}},
t::TensorMap{T, S, N₁, N₂, AA}
) where {T, S, N₁, N₂, A <: CuArray{T}, AA}
if typeof(t) === TT
return t
else
tnew = TT(undef, space(t))
return copy!(tnew, t)
end
end

function Base.convert(
Member:

Same comment here

TT::Type{TensorMap{T, S, N₁, N₂, A}},
t::AdjointTensorMap
) where {T, S, N₁, N₂, A <: CuArray{T}}
if typeof(t) === TT
return t
else
@@ -140,6 +167,8 @@ end

TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
CuArray{T, N, CUDA.default_memory}
TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{CuArray{T, N}}) where {T, N} =
CuArray{T, N, CUDA.default_memory}
Comment on lines 168 to +171
Member:

Suggested change
TensorKit.promote_storage_rule(::Type{CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
CuArray{T, N, CUDA.default_memory}
TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{CuArray{T, N}}) where {T, N} =
CuArray{T, N, CUDA.default_memory}
TensorKit.promote_storage_rule(::Type{<:CuArray{T, N}}, ::Type{<:CuArray{T, N}}) where {T, N} =
CuArray{T, N, CUDA.default_memory}

I should have written the rules in such a way that it is symmetric, so we shouldn't have to define both directions. However, I do think both sides need <: to account for the third type parameter being there, which I also missed in the last PR.
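
For reference, Base's promote_rule achieves the symmetry mentioned here by trying both argument orders. A hedged sketch of the same trick for a storage-promotion lookup (names hypothetical, not TensorKit's actual implementation) could look like:

# Hypothetical sketch of an order-insensitive promotion, mirroring Base.promote_type:
promote_storage_rule_sketch(::Type, ::Type) = Union{}   # fallback: no rule known
function promote_storage_type_sketch(::Type{A}, ::Type{B}) where {A, B}
    T = promote_storage_rule_sketch(A, B)
    T === Union{} || return T
    return promote_storage_rule_sketch(B, A)    # try the reversed order
end

With a lookup like that in place, only one direction of each rule would have to be defined.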



# CuTensorMap exponentiation:
@@ -168,3 +197,21 @@ for f in (:sqrt, :log, :asin, :acos, :acosh, :atanh, :acoth)
return tf
end
end

function TensorKit._add_general_kernel_nonthreaded!(
        tdst::CuTensorMap, tsrc::CuTensorMap, p, transformer::TensorKit.GenericTreeTransformer, α, β, backend...
    )
    # preallocate buffers
    buffers = TensorKit.allocate_buffers(tdst, tsrc, transformer)

    for subtransformer in transformer.data
        # Special case without intermediate buffers whenever there is only a single block
        if length(subtransformer[1]) == 1
            TensorKit._add_transform_single!(tdst, tsrc, p, subtransformer, α, β, backend...)
        else
            cu_subtransformer = tuple(CUDA.adapt(CuArray, subtransformer[1]), subtransformer[2:end]...)
            TensorKit._add_transform_multi!(tdst, tsrc, p, cu_subtransformer, buffers, α, β, backend...)
        end
    end
    return nothing
end
2 changes: 1 addition & 1 deletion src/auxiliary/auxiliary.jl
@@ -60,7 +60,7 @@ end
# Low-overhead implementation of `copyto!` for specific case of `stride(B, 1) < stride(B, 2)`
# used in indexmanipulations: avoids the overhead of Strided.jl
function _copyto!(A::StridedView{<:Any, 1}, B::StridedView{<:Any, 2})
length(A) == length(B) || throw(DimensionMismatch())
length(A) == length(B) || throw(DimensionMismatch(lazy"length of A ($(length(A))) does not match length of B ($(length(B)))"))

Adata = parent(A)
Astr = stride(A, 1)
9 changes: 6 additions & 3 deletions src/tensors/braidingtensor.jl
@@ -171,12 +171,15 @@ end
has_shared_permute(t::BraidingTensor, ::Index2Tuple) = false
function add_transform!(
tdst::AbstractTensorMap,
tsrc::BraidingTensor, (p₁, p₂)::Index2Tuple,
tsrc::BraidingTensor{T, S},
(p₁, p₂)::Index2Tuple,
fusiontreetransform,
α::Number, β::Number, backend::AbstractBackend...
)
) where {T, S}
tsrc_map = TensorMapWithStorage{scalartype(tdst), storagetype(tdst)}(undef, (tsrc.V2 ⊗ tsrc.V1) ← (tsrc.V1 ⊗ tsrc.V2))
Member:

Suggested change
tsrc_map = TensorMapWithStorage{scalartype(tdst), storagetype(tdst)}(undef, (tsrc.V2 ⊗ tsrc.V1) ← (tsrc.V1 ⊗ tsrc.V2))
tsrc_map = similar(tdst, storagetype(tdst), space(tsrc))

This might be a little cleaner and not use that many internals.

copy!(tsrc_map, tsrc)
return add_transform!(
tdst, TensorMap(tsrc), (p₁, p₂), fusiontreetransform, α, β,
tdst, tsrc_map, (p₁, p₂), fusiontreetransform, α, β,
backend...
)
end
10 changes: 6 additions & 4 deletions src/tensors/tensoroperations.jl
@@ -419,8 +419,10 @@ end
# Scalar implementation
#-----------------------
function scalar(t::AbstractTensorMap{T, S, 0, 0}) where {T, S}
Bs = collect(blocks(t))
inds = findall(!iszero ∘ last, Bs)
isempty(inds) && return zero(scalartype(t))
return only(last(Bs[only(inds)]))
Bs = blocks(t)
# collect the blocks on the host to avoid scalar GPU indexing
B_ends = collect.(map(last, Bs))
nz_B_ends = [any(!iszero, B) for B in B_ends]
valid_Bs = B_ends[nz_B_ends]
isempty(valid_Bs) && return zero(scalartype(t))
return only(first(valid_Bs))
end
2 changes: 1 addition & 1 deletion src/tensors/treetransformers.jl
@@ -46,7 +46,7 @@ function AbelianTreeTransformer(transform, p, Vdst, Vsrc)
end

const _GenericTransformerData{T, N} = Tuple{
Matrix{T},
DenseMatrix{T},
Member:

I think this change makes the types below abstractly typed, do we need this?

Member Author:

Yes, in order to allow device-side matrices to get passed in. Otherwise you get attempts to multiply CuMatrix * Matrix outside of constructors

Member:

Ok, but in that case we would really have to make that an additional type parameter in the GenericTreeTransformer struct -- these were introduced to hyper specialize and get maximal efficiency, so I don't think we can eat a type-instability here.

Member Author:

OK, it would have been helpful to have had a comment or anything noting that this was why they were there

Tuple{NTuple{N, Int}, Vector{Tuple{NTuple{N, Int}, Int}}},
Tuple{NTuple{N, Int}, Vector{Tuple{NTuple{N, Int}, Int}}},
}
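
A rough sketch of the direction suggested in this thread: carry the concrete matrix type as an extra type parameter so that both the CPU (Matrix) and GPU (CuMatrix) backends stay concretely typed. The names below are hypothetical, and the real GenericTreeTransformer layout is not shown in this diff:

# Sketch only; the actual GenericTreeTransformer struct may be laid out differently.
const _GenericTransformerDataSketch{T, N, M} = Tuple{
    M,
    Tuple{NTuple{N, Int}, Vector{Tuple{NTuple{N, Int}, Int}}},
    Tuple{NTuple{N, Int}, Vector{Tuple{NTuple{N, Int}, Int}}},
}

struct GenericTreeTransformerSketch{T, N, M <: DenseMatrix{T}}
    # concretely typed for Matrix{T} on the CPU and CuMatrix{T} on the GPU
    data::Vector{_GenericTransformerDataSketch{T, N, M}}
end

An instance would then be, e.g., GenericTreeTransformerSketch{Float64, 2, Matrix{Float64}} on the host and GenericTreeTransformerSketch{Float64, 2, CuMatrix{Float64}} on the device, avoiding the abstractly typed DenseMatrix{T} tuple element.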