added first CUDA steps

Daniel 2024-07-07 15:50:46 +02:00
parent fa643b8b27
commit 35ba685da0
2 changed files with 81 additions and 1 deletion


@@ -1,5 +1,31 @@
module ExpressionExecutorCuda
include("Interpreter.jl")

export interpret_gpu
export evaluate_gpu
export test

# Evaluate Expressions on the GPU
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
    # Ensure that no two expressions are interpreted in the same "warp"
    expr1 = exprs[1]
end

# Convert Expressions to PTX code and execute that instead
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
    # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
end

# TODO: See if it is feasible to add Float32 versions as well (mostly because Float32 is faster than Float64 on GPUs).
# If AMD GPU support gets added, it might even be a good idea to add 16-bit floats, since they are even faster than 32-bit.
# On Nvidia, Float16 is either slower than or on par with Float32.
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
end

function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
end

function test()
    Interpreter.CudaTest()
end

end # module
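A minimal host-side sketch of how the exported interface above is meant to be called once the Float64 interpreter is implemented. The layout of X (one row per variable, one column per data point) and the role of p (one coefficient vector per expression) are assumptions read from the signatures, not something this commit defines:

using ExpressionExecutorCuda

# Hypothetical inputs; how an Expr refers to variables and parameters is not fixed yet in this commit
exprs = [:(x1 + 2.0), :(x1 * x2)]   # expressions to evaluate
X = rand(Float64, 2, 1000)          # assumed layout: variables × data points
p = [Float64[], Float64[]]          # assumed: one parameter vector per expression

# Expected to return a single Matrix{Float64} with the evaluation results
results = interpret_gpu(exprs, X, p)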


@@ -0,0 +1,54 @@
module Interpreter
using CUDA

export CudaTest

@enum Operators Add=1 Subtract=2

function CudaTest()
    N = 2^20
    x = CUDA.fill(1.0f0, N)
    y = CUDA.fill(2.0f0, N)

    # Compile the kernel without launching it, then ask the occupancy API for a good launch configuration
    kernelAdd = @cuda launch=false InterpretExplicit!(Add, x, y)
    config = launch_configuration(kernelAdd.fun)
    threads = min(length(y), config.threads)
    blocks = cld(length(y), threads)

    kernelAdd(Add, x, y; threads, blocks)
    CUDA.@allowscalar println(y[1]) # scalar indexing needs @allowscalar outside interactive sessions
    # @test all(Array(y) .== 3.0f0)

    kernelSubtract = @cuda launch=false InterpretExplicit!(Subtract, x, y)
    configSub = launch_configuration(kernelSubtract.fun)
    threadsSub = min(length(y), configSub.threads)
    blocksSub = cld(length(y), threadsSub)

    CUDA.fill!(y, 2.0f0) # reset y before the second launch
    kernelSubtract(Subtract, x, y; threads=threadsSub, blocks=blocksSub)
    # @test all(Array(y) .== 1.0f0)
    CUDA.@allowscalar println(y[1])
end

# Kernel: grid-stride loop that applies the selected operator element-wise
function InterpretExplicit!(op::Operators, x, y)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x

    if op == Add
        @cuprintln("Performing Addition") # Will only be displayed when the GPU is synchronized
        for i = index:stride:length(y)
            @inbounds y[i] += x[i]
        end
        return
    elseif op == Subtract
        @cuprintln("Performing Subtraction") # Will only be displayed when the GPU is synchronized
        for i = index:stride:length(y)
            @inbounds y[i] -= x[i]
        end
        return
    end
end

end # module
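A small usage sketch for the test above, assuming Interpreter.jl is loaded directly rather than through the package; the explicit CUDA.synchronize() only ensures that the @cuprintln output from the kernels is flushed before the script exits:

using CUDA

include("Interpreter.jl") # assumed path; in the package it is pulled in by ExpressionExecutorCuda
using .Interpreter

CudaTest()         # launches the Add and Subtract kernels and prints y[1] after each one
CUDA.synchronize() # flush any pending @cuprintln output from the device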