From 35ba685da0e8ae657364105bdb042ae793da9b66 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Sun, 7 Jul 2024 15:50:46 +0200
Subject: [PATCH] added first CUDA steps

---
 package/src/ExpressionExecutorCuda.jl | 28 +++++++++++++-
 package/src/Interpreter.jl            | 54 +++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 package/src/Interpreter.jl

diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl
index f581922..cebebc4 100644
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@@ -1,5 +1,31 @@
 module ExpressionExecutorCuda
+include("Interpreter.jl")
 
-# Write your package code here.
+export interpret_gpu
+export evaluate_gpu
+export test
+
+# Evaluate Expressions on the GPU
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
+    # Ensure that no two expressions are interpreted in the same "warp"
+    expr1 = exprs[1]
+end
+
+# Convert Expressions to PTX Code and execute that instead
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
+    # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
+end
+
+
+# TODO: See if it is feasible to make Float32 versions too (mostly because Float32 is faster than Float64)
+# If AMD GPU support gets added, it might even be a good idea to add 16 bit floats, since they are even faster than 32 bit. On Nvidia, 16 bit is either slower than or equal in performance to 32 bit
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+end
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+end
+
+function test()
+    Interpreter.CudaTest()
+end
 
 end
diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl
new file mode 100644
index 0000000..b364f2e
--- /dev/null
+++ b/package/src/Interpreter.jl
@@ -0,0 +1,54 @@
+module Interpreter
+using CUDA
+
+export CudaTest
+
+@enum Operators Add=1 Subtract=2
+
+function CudaTest()
+    N = 2^20
+    x = CUDA.fill(1.0f0, N)
+    y = CUDA.fill(2.0f0, N)
+
+    kernelAdd = @cuda launch=false InterpretExplicit!(Add, x, y)
+    config = launch_configuration(kernelAdd.fun)
+    threads = min(length(y), config.threads)
+    blocks = cld(length(y), threads)
+
+    kernelAdd(Add, x, y; threads, blocks)
+    println(y[1])
+    # @test all(Array(y) .== 3.0f0)
+
+    kernelSubtract = @cuda launch=false InterpretExplicit!(Subtract, x, y)
+    configSub = launch_configuration(kernelSubtract.fun)
+    threadsSub = min(length(y), configSub.threads)
+    blocksSub = cld(length(y), threadsSub)
+    CUDA.fill!(y, 2.0f0)
+
+    kernelSubtract(Subtract, x, y; threads=threadsSub, blocks=blocksSub)
+    # @test all(Array(y) .== -1.0f0)
+    println(y[1])
+end
+
+
+# Kernel
+function InterpretExplicit!(op::Operators, x, y)
+    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    stride = gridDim().x * blockDim().x
+
+    if op == Add
+        @cuprintln("Performing Addition") # Will only be displayed when the GPU is synchronized
+        for i = index:stride:length(y)
+            @inbounds y[i] += x[i]
+        end
+        return
+    elseif op == Subtract
+        @cuprintln("Performing Subtraction") # Will only be displayed when the GPU is synchronized
+        for i = index:stride:length(y)
+            @inbounds y[i] -= x[i]
+        end
+        return
+    end
+end
+
+end
\ No newline at end of file
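
Note: the launch sequence in CudaTest() follows the occupancy-API pattern from the CUDA.jl introduction tutorial linked in evaluate_gpu (compile with @cuda launch=false, query launch_configuration, then launch with the suggested threads/blocks). A minimal self-contained sketch of that pattern, with an illustrative add_kernel! standing in for InterpretExplicit!, would look roughly like this:

using CUDA

# Grid-stride addition kernel, mirroring the Add branch of InterpretExplicit!.
function add_kernel!(y, x)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x
    for i = index:stride:length(y)
        @inbounds y[i] += x[i]
    end
    return
end

x = CUDA.fill(1.0f0, 2^20)
y = CUDA.fill(2.0f0, 2^20)

# Compile without launching, ask the occupancy API for a good configuration,
# then launch with it -- the same three steps CudaTest() performs.
kernel = @cuda launch=false add_kernel!(y, x)
config = launch_configuration(kernel.fun)
threads = min(length(y), config.threads)
blocks = cld(length(y), threads)
kernel(y, x; threads, blocks)

@assert all(Array(y) .== 3.0f0)  # every element should now be 1.0 + 2.0

The grid-stride loop is what makes the launch configuration flexible: each thread steps through the array in strides of gridDim().x * blockDim().x, so whatever threads/blocks combination the occupancy API suggests still covers all N elements.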