benchmarking: tuned block sizes; performance improved slightly; the main improvement is in the standard deviation

2025-04-12 13:20:50 +02:00
parent c6e2ce47aa
commit 101ccef67b
5 changed files with 8 additions and 7 deletions
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@@ -33,7 +33,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	@inbounds for i in eachindex(exprs)
 		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
 		# config = launch_configuration(kernel.fun)
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 128)
 		blocks = cld(variableCols, threads)

 		kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 96)
 		blocks = cld(variableCols, threads)

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)