added first CUDA steps
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
This commit is contained in:
parent
fa643b8b27
commit
35ba685da0
|
@ -1,5 +1,31 @@
|
||||||
module ExpressionExecutorCuda

include("Interpreter.jl")

export interpret_gpu
export evaluate_gpu
export test

# Evaluate expressions on the GPU.
#
# NOTE(review): all four entry points below are still stubs. They previously
# fell through and returned `nothing`, which — combined with the declared
# `::Matrix{...}` return type — would surface at runtime as a confusing
# `convert(Matrix{...}, nothing)` MethodError. They now fail fast with an
# explicit, descriptive error instead.

"""
    interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}

Interpret each expression in `exprs` on the GPU against the data matrix `X`,
using one parameter vector from `p` per expression. Not implemented yet.

# Throws
- `ErrorException`: always, until the GPU interpreter is implemented.
"""
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
    # Ensure that no two expressions are interpreted in the same "warp"
    error("interpret_gpu for Float64 is not implemented yet")
end

"""
    evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}

Convert the expressions to PTX code and execute that instead of interpreting
them. Not implemented yet.

# Throws
- `ErrorException`: always, until PTX generation is implemented.
"""
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
    # Look into this to maybe speed up PTX generation:
    # https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
    error("evaluate_gpu for Float64 is not implemented yet")
end

# TODO: See if it is feasible to make 32-bit versions too (mostly because 32 is faster than 64)
# If AMD GPU support gets added, it might even be a good idea to add 16-bit floats, since they
# are even faster than 32 bit. On Nvidia, 16 bit is either slower or equal in performance to 32 bit.

"""
    interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}

Single-precision variant of [`interpret_gpu`](@ref). Not implemented yet.
"""
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    error("interpret_gpu for Float32 is not implemented yet")
end

"""
    evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}

Single-precision variant of [`evaluate_gpu`](@ref). Not implemented yet.
"""
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    error("evaluate_gpu for Float32 is not implemented yet")
end

"""
    test()

Run the CUDA smoke test from the `Interpreter` submodule.
"""
function test()
    return Interpreter.CudaTest()
end

end
|
||||||
|
|
54
package/src/Interpreter.jl
Normal file
54
package/src/Interpreter.jl
Normal file
|
@ -0,0 +1,54 @@
|
||||||
|
module Interpreter

using CUDA

export CudaTest

# Closed set of operations the kernel understands; an enum keeps a single
# kernel entry point instead of one kernel per operation.
@enum Operators Add=1 Subtract=2

"""
    CudaTest()

Smoke test for the GPU path: fills two length-`2^20` `Float32` device vectors
(`x .= 1`, `y .= 2`), launches the interpreter kernel once with `Add`
(expected `y .== 3`) and once with `Subtract` after resetting `y`
(expected `y .== -1`), printing `y[1]` after each launch.
"""
function CudaTest()
    N = 2^20
    x = CUDA.fill(1.0f0, N)
    y = CUDA.fill(2.0f0, N)

    # Compile without launching so the occupancy-based launch configuration
    # can be queried first.
    kernelAdd = @cuda launch=false InterpretExplicit!(Add, x, y)
    config = launch_configuration(kernelAdd.fun)
    threads = min(length(y), config.threads)
    blocks = cld(length(y), threads)

    kernelAdd(Add, x, y; threads, blocks)
    println(y[1])  # expected 3.0f0 (scalar indexing synchronizes the device)

    kernelSubtract = @cuda launch=false InterpretExplicit!(Subtract, x, y)
    configSub = launch_configuration(kernelSubtract.fun)
    threadsSub = min(length(y), configSub.threads)
    blocksSub = cld(length(y), threadsSub)
    CUDA.fill!(y, 2.0f0)

    # BUG FIX: the original call used `; threadsSub, blocksSub`, which is
    # Julia's keyword shorthand and passes kwargs *named* `threadsSub` and
    # `blocksSub`. A compiled @cuda kernel only accepts `threads`/`blocks`
    # launch kwargs, so that call would throw.
    kernelSubtract(Subtract, x, y; threads=threadsSub, blocks=blocksSub)
    println(y[1])  # expected -1.0f0

    return nothing
end

"""
    InterpretExplicit!(op::Operators, x, y)

GPU kernel: applies `op` element-wise, updating `y` in place
(`y .+= x` for `Add`, `y .-= x` for `Subtract`) via a grid-stride loop.
Must be launched with `@cuda`.
"""
function InterpretExplicit!(op::Operators, x, y)
    # Grid-stride loop: each thread starts at its global index and advances
    # by the total number of threads in the grid.
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x

    if op == Add
        @cuprintln("Performing Addition") # Will only be displayed when the GPU is synchronized
        for i = index:stride:length(y)
            @inbounds y[i] += x[i]
        end
        return
    elseif op == Subtract
        @cuprintln("Performing Subtraction") # Will only be displayed when the GPU is synchronized
        for i = index:stride:length(y)
            @inbounds y[i] -= x[i]
        end
        return
    end
end

end
|
Loading…
Reference in New Issue
Block a user