benchmarking: tuned interpreter blocksize
@@ -25,7 +25,7 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
 
     # Start kernel for each expression to ensure that no warp is working on different expressions
     @inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
-        numThreads = min(variableColumns, 256)
+        numThreads = min(variableColumns, 121)
         numBlocks = cld(variableColumns, numThreads)
 
         @cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
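The change itself only lowers the per-block thread cap from 256 to the tuned value 121; the launch-configuration pattern stays the same (cap threads per block at the block size, then derive the block count with ceiling division so every column is covered). Below is a minimal sketch of that pattern in isolation, assuming CUDA.jl; square_kernel and launch_squared are hypothetical names introduced only for illustration, and only the min/cld/@cuda lines mirror the code in this diff.

using CUDA

# Hypothetical element-wise kernel, used only to show the launch configuration.
function square_kernel(out, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = x[i] * x[i]
    end
    return nothing
end

function launch_squared(x::CuArray; blocksize::Integer = 121)
    out = similar(x)
    n = length(x)
    # Same pattern as in the diff: cap threads per block at the tuned block
    # size and use ceiling division so every element gets a thread.
    numThreads = min(n, blocksize)
    numBlocks = cld(n, numThreads)
    @cuda threads=numThreads blocks=numBlocks square_kernel(out, x)
    return out
end

# Example usage: launch_squared(CUDA.rand(Float32, 1000))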