updated all to 32-bit to save registers and boost performance

2024-11-01 11:23:58 +01:00
parent 9fc55c4c15
commit 68cedd75fc
7 changed files with 113 additions and 106 deletions
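Since this commit switches the public signatures from Float64 to Float32, callers that still hold Float64 data need a one-time conversion on the CPU. A minimal, hypothetical caller-side sketch (the placeholder data and the commented-out call are illustrative only; interpret_gpu is still a stub at this point):

# Hypothetical caller-side migration after this commit: convert existing
# Float64 inputs once on the CPU before calling the updated Float32 functions.
X = rand(Float64, 3, 2)                  # placeholder data a caller might already have
p = [rand(Float64, 1), Float64[]]

X32 = Float32.(X)                        # Matrix{Float64} -> Matrix{Float32}
p32 = [Float32.(pv) for pv in p]         # Vector{Vector{Float64}} -> Vector{Vector{Float32}}
# results = interpret_gpu(exprs, X32, p32)   # exprs::Vector{Expr}, see the diff below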


@@ -6,8 +6,6 @@ export interpret_gpu
export evaluate_gpu
export test
-# const SymbolTable64 = Dict{Tuple{Expr, Symbol},Float64}
-#
# Some assertions:
# Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
# each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
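A minimal sketch of the layout these assertions pin down, with made-up expressions and values. How the rows of a column map onto x1, x2, ... is not spelled out in the comments, so that part is an assumption; the call at the end is commented out because interpret_gpu is still a stub here:

exprs = [:(x1 + p1), :(x1 * 2.0f0)]   # naming starts at 1: x1/p1, never x0/p0

# column i of X belongs to exprs[i]; assumption: row j of that column feeds xj
X = Float32[1.0 4.0;
            2.0 5.0]

p = [Float32[0.5], Float32[]]         # p[i] are the parameters of exprs[i]

# results = interpret_gpu(exprs, X, p)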
@@ -15,22 +13,14 @@ export test
#
# Evaluate Expressions on the GPU
-function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Ensure that no two expressions are interpreted in the same "warp"
exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
end
# Convert Expressions to PTX Code and execute that instead
-function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
-	# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
-end
-# TODO: See if it is feasible to make 32 versions too (mostly because 32 is significantly faster than 64)
-# If AMD GPU support gets added, it might even be a good idea to add 16 bit floats, since they are even faster than 32 bit. On Nvidia 16 is either slower or equal in performance to 32 bit
-function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
-end
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
end
end
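The two ideas in this hunk (keeping expressions out of each other's warps, and generating/executing PTX) can be illustrated with plain CUDA.jl, independently of this package. A rough sketch, assuming a throwaway kernel dummy_expr_kernel! that is not part of the repo and an NVIDIA GPU to actually run it:

using CUDA

# Placeholder kernel: thread 1 of the block tags the result slot of "its" expression.
function dummy_expr_kernel!(results, exprIdx)
    if threadIdx().x == 1
        @inbounds results[exprIdx] = Float32(exprIdx)
    end
    return nothing
end

nExprs  = 4
results = CUDA.zeros(Float32, nExprs)

# One launch (and therefore one block) per expression: no two expressions ever share a warp.
for i in 1:nExprs
    @cuda threads=32 blocks=1 dummy_expr_kernel!(results, i)
end

# CUDA.jl can dump the PTX it generates for a launch, which is handy when
# experimenting with the "convert to PTX and execute" route:
@device_code_ptx @cuda threads=32 dummy_expr_kernel!(results, 1)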
@@ -45,4 +35,4 @@ end
#
# The following can be done on the CPU
# convert expression to postfix notation (mandatory)
# replace every variable with the corresponding value from X and p (reduce extensive memory access on the GPU)
# optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)
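The CPU-side steps listed above start with postfix conversion. A small, repo-independent sketch of that step; the function name to_postfix is illustrative and not the package's ExpressionProcessing API:

# Generic postfix conversion for call expressions: operands first, operator last.
function to_postfix(ex)::Vector{Any}
    ex isa Expr || return Any[ex]          # leaf: a number, or an x_i / p_i symbol
    @assert ex.head == :call "only call expressions are handled in this sketch"
    out = Any[]
    for arg in ex.args[2:end]
        append!(out, to_postfix(arg))      # operands
    end
    push!(out, ex.args[1])                 # operator
    return out
end

to_postfix(:(x1 + p1 * 2.0f0))             # Any[:x1, :p1, 2.0f0, :*, :+]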