benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation

benchmarking: redid performance tests on uni pc
2025-04-12 13:20:50 +02:00 · 2025-04-12 12:13:28 +02:00
9 changed files with 13 additions and 9 deletions
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -33,7 +33,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	@inbounds for i in eachindex(exprs)
 		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
 		# config = launch_configuration(kernel.fun)
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 128)
 		blocks = cld(variableCols, threads)

 		kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 96)
 		blocks = cld(variableCols, threads)

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@ -4,7 +4,7 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter

-const BENCHMARKS_RESULTS_PATH = "./results"
+const BENCHMARKS_RESULTS_PATH = "./results-fh"
 exprsCPU = [
 	# CPU interpreter requires an anonymous function and array refs
 	:(p[1] * x[1] + p[2]), # 5 op
@ -69,7 +69,7 @@ end
 # Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on FH PCs)
 # University setup at 10.20.1.7 if needed

-compareWithCPU = false
+compareWithCPU = true


 suite = BenchmarkGroup()
@ -143,9 +143,9 @@ if compareWithCPU
 	println(gpuiVsGPUT_median)
 	println(gpuiVsGPUT_std)
 	
-	# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
+	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results)
 else
-	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
+	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
 	
 	medianGPUI_old = median(resultsOld["GPUI"])
 	stdGPUI_old = std(resultsOld["GPUI"])
--- a/package/test/PerformanceTuning.jl
+++ b/package/test/PerformanceTuning.jl
@ -26,5 +26,5 @@ end


@testset "Transpiler Tuning" begin
-    CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
+    # CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
 end
--- a/package/test/results-fh/0-initial_results.json
+++ b/package/test/results-fh/0-initial_results.json
--- a/package/test/results-fh/1-256_blocksize.json
+++ b/package/test/results-fh/1-256_blocksize.json
--- a/package/test/results-fh/2-using_inbounds.json
+++ b/package/test/results-fh/2-using_inbounds.json
--- a/package/test/results-fh/3-tuned-blocksize_I128_T96.json
+++ b/package/test/results-fh/3-tuned-blocksize_I128_T96.json
--- a/package/test/runtests.jl
+++ b/package/test/runtests.jl
@ -19,6 +19,6 @@ end
 # end

@testset "Performance tests" begin
-	include("PerformanceTuning.jl")
-	# include("PerformanceTests.jl")
+	# include("PerformanceTuning.jl")
+	include("PerformanceTests.jl")
 end
Author	SHA1	Message	Date
Wiplinger Daniel - s2310454043	101ccef67b	benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation Some checks are pending CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run Details	2025-04-12 13:20:50 +02:00
Wiplinger Daniel - s2310454043	c6e2ce47aa	benchmarking: redid performance tests on uni pc Some checks are pending CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run Details	2025-04-12 12:13:28 +02:00