benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation

2025-04-12 13:20:50 +02:00
parent c6e2ce47aa
commit 101ccef67b
5 changed files with 8 additions and 7 deletions
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@@ -33,7 +33,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	@inbounds for i in eachindex(exprs)
 		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
 		# config = launch_configuration(kernel.fun)
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 128)
 		blocks = cld(variableCols, threads)

 		kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 96)
 		blocks = cld(variableCols, threads)

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@@ -4,7 +4,7 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter

-const BENCHMARKS_RESULTS_PATH = "./results"
+const BENCHMARKS_RESULTS_PATH = "./results-fh"
 exprsCPU = [
 	# CPU interpreter requires an anonymous function and array refs
 	:(p[1] * x[1] + p[2]), # 5 op
@@ -69,7 +69,7 @@ end
 # Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs)
 # University setup at 10.20.1.7 if needed

-compareWithCPU = false
+compareWithCPU = true


 suite = BenchmarkGroup()
@@ -143,9 +143,9 @@ if compareWithCPU
 	println(gpuiVsGPUT_median)
 	println(gpuiVsGPUT_std)
 	
-	# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
+	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results)
 else
-	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
+	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
 	
 	medianGPUI_old = median(resultsOld["GPUI"])
 	stdGPUI_old = std(resultsOld["GPUI"])
--- a/package/test/PerformanceTuning.jl
+++ b/package/test/PerformanceTuning.jl
@@ -26,5 +26,5 @@ end


@testset "Transpiler Tuning" begin
-    CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
+    # CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
 end
--- a/package/test/results-fh/3-tuned-blocksize_I128_T96.json
+++ b/package/test/results-fh/3-tuned-blocksize_I128_T96.json