From 35ba685da0e8ae657364105bdb042ae793da9b66 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Sun, 7 Jul 2024 15:50:46 +0200
Subject: [PATCH] added first CUDA steps

---
 package/src/ExpressionExecutorCuda.jl | 28 +++++++++++++-
 package/src/Interpreter.jl            | 54 +++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 package/src/Interpreter.jl

diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl
index f581922..cebebc4 100644
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@@ -1,5 +1,31 @@
 module ExpressionExecutorCuda
+include("Interpreter.jl")
 
-# Write your package code here.
+export interpret_gpu
+export evaluate_gpu
+export test
+
+# Evaluate Expressions on the GPU
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
+    # Ensure that no two expressions are interpreted in the same "warp"
+    expr1 = exprs[1]
+end
+
+# Convert Expressions to PTX Code and execute that instead
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
+    # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
+end
+
+
+# TODO: See if it is feasible to make Float32 versions too (mostly because Float32 is faster than Float64)
+# If AMD GPU support gets added, it might even be a good idea to add 16 bit floats, since they are even faster than 32 bit. On Nvidia, 16 bit is either slower than or equal in performance to 32 bit
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+end
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+end
+
+function test()
+    Interpreter.CudaTest()
+end
 
 end
diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl
new file mode 100644
index 0000000..b364f2e
--- /dev/null
+++ b/package/src/Interpreter.jl
@@ -0,0 +1,54 @@
+module Interpreter
+using CUDA
+
+export CudaTest
+
+@enum Operators Add=1 Subtract=2
+
+function CudaTest()
+    N = 2^20
+    x = CUDA.fill(1.0f0, N)
+    y = CUDA.fill(2.0f0, N)
+
+    kernelAdd = @cuda launch=false InterpretExplicit!(Add, x, y)
+    config = launch_configuration(kernelAdd.fun)
+    threads = min(length(y), config.threads)
+    blocks = cld(length(y), threads)
+
+    kernelAdd(Add, x, y; threads, blocks)
+    println(y[1])
+    # @test all(Array(y) .== 3.0f0)
+
+    kernelSubtract = @cuda launch=false InterpretExplicit!(Subtract, x, y)
+    configSub = launch_configuration(kernelSubtract.fun)
+    threadsSub = min(length(y), configSub.threads)
+    blocksSub = cld(length(y), threadsSub)
+    CUDA.fill!(y, 2.0f0)
+
+    kernelSubtract(Subtract, x, y; threads=threadsSub, blocks=blocksSub)
+    # @test all(Array(y) .== -1.0f0)
+    println(y[1])
+end
+
+
+# Kernel
+function InterpretExplicit!(op::Operators, x, y)
+    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    stride = gridDim().x * blockDim().x
+
+    if op == Add
+        @cuprintln("Performing Addition") # Will only be displayed when the GPU is synchronized
+        for i = index:stride:length(y)
+            @inbounds y[i] += x[i]
+        end
+        return
+    elseif op == Subtract
+        @cuprintln("Performing Subtraction") # Will only be displayed when the GPU is synchronized
+        for i = index:stride:length(y)
+            @inbounds y[i] -= x[i]
+        end
+        return
+    end
+end
+
+end
\ No newline at end of file
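
Note: the launch sequence in CudaTest() follows the occupancy-API pattern from the CUDA.jl introduction tutorial linked in evaluate_gpu (compile with @cuda launch=false, query launch_configuration, then launch with the suggested threads/blocks). A minimal self-contained sketch of that pattern, with an illustrative add_kernel! standing in for InterpretExplicit!, would look roughly like this:

using CUDA

# Grid-stride addition kernel, mirroring the Add branch of InterpretExplicit!.
function add_kernel!(y, x)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    for i = index:stride:length(y)
        @inbounds y[i] += x[i]
    end
    return
end

x = CUDA.fill(1.0f0, 2^20)
y = CUDA.fill(2.0f0, 2^20)

# Compile without launching, ask the occupancy API for a good configuration,
# then launch with it -- the same three steps CudaTest() performs.
kernel = @cuda launch=false add_kernel!(y, x)
config = launch_configuration(kernel.fun)
threads = min(length(y), config.threads)
blocks = cld(length(y), threads)
kernel(y, x; threads, blocks)

@assert all(Array(y) .== 3.0f0)  # every element should now be 1.0 + 2.0

The grid-stride loop is what makes the launch configuration flexible: each thread steps through the array in strides of gridDim().x * blockDim().x, so whatever threads/blocks combination the occupancy API suggests still covers all N elements.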