benchmarking: updated blocksize to 256 with moderate improvements

2025-03-30 13:56:25 +02:00
parent 1dc0c1898d
commit d9c83caad9
5 changed files with 129 additions and 100 deletions
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -55,8 +55,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet

 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for i in eachindex(kernels)
-		config = launch_configuration(kernels[i])
-		threads = min(variableCols, config.threads)
+		# config = launch_configuration(kernels[i])
+		threads = min(variableCols, 256)
 		blocks = cld(variableCols, threads)

 		cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)