figured out the way to do it, when I used just AbstractFFTs.jl naively as follows for a GPU/CPU agnostic implementation, it did not work:
function poisson_solve!(out_array::AbstractArray{T,2},
inp_array::AbstractArray{T,2},
coefficient::AbstractArray{T,2}) where T
println("Backend agnostic FFT for Poisson solve")
fourier_array = rfft(inp_array)
fourier_array .*= coefficient
out_array .= irfft(fourier_array, size(inp_array, 2))
return out_array
end
Apparently, you need to specify the backend and without it I was getting StackOverflow errors.
So, I tried the following with multiple dispatch and the Requires.jl library:
function __init__()
# CUDA implementation - only loaded if CUDA.jl is available
@require CUDA="052768ef-5323-5732-b1bb-66c8b64840ba" begin
function poisson_solve!(out_array::CUDA.CuArray{T,2},
inp_array::CUDA.CuArray{T,2},
coefficient::CUDA.CuArray{T,2}) where T
println("Using CUDA FFT for Poisson solve")
fourier_array = rfft(inp_array)
fourier_array .*= coefficient
out_array .= irfft(fourier_array, size(inp_array, 2))
return out_array
end
end
# AMDGPU implementation - only loaded if AMDGPU.jl is available
@require AMDGPU="21141c5a-9bdb-4563-92ae-f87d6854732e" begin
function poisson_solve!(out_array::AMDGPU.ROCArray{T,2},
inp_array::AMDGPU.ROCArray{T,2},
coefficient::AMDGPU.ROCArray{T,2}) where T
println("Using ROCm FFT for Poisson solve")
fourier_array = rfft(inp_array)
fourier_array .*= coefficient
out_array .= irfft(fourier_array, size(inp_array, 2))
return out_array
end
end
end
for CUDA and ROCm and the following for CPU, oneAPI and Metal:
function poisson_solve!(out_array::AbstractArray{T,2},
inp_array::AbstractArray{T,2},
coefficient::AbstractArray{T,2}) where T
# Convert inputs to CPU Arrays if they aren't already
# This handles oneAPI and Metal arrays that need conversion
inp_cpu = Array(inp_array)
coeff_cpu = Array(coefficient)
# Use FFTW's implementation explicitly
fourier_array = FFTW.rfft(inp_cpu)
# Multiply in Fourier space
fourier_array .*= coeff_cpu
# Perform inverse FFT
result = FFTW.irfft(fourier_array, size(inp_cpu, 2))
# Copy back to output array (handles conversion back to original array type)
out_array .= result
return out_array
end
Dependancies: using AbstractFFTs
, using FFTW
, using Requires
, using AMDGPU
or using CUDA
Now, it works regardless of backend. But maybe folks have suggestions for a cleaner implementation.