From d7e18f183d7f7546e57088f6541d408f5d99efdf Mon Sep 17 00:00:00 2001
From: Daniel
Date: Thu, 15 May 2025 16:25:32 +0200
Subject: [PATCH] benchmarking: updated benchmarking suite and prepared for taking the benchmarks

---
 package/src/Transpiler.jl           | 42 ++++++++++-------------
 package/test/CpuInterpreterTests.jl | 32 +++++++++++++++++++++-
 package/test/PerformanceTests.jl    | 23 ++++++----------
 package/test/params.json            |  2 +-
 package/test/runtests.jl            |  4 +--
 thesis/chapters/conceptdesign.tex   |  2 +-
 6 files changed, 57 insertions(+), 48 deletions(-)

diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl
index c6f368e..7df1620 100644
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -56,40 +56,26 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	threads = min(variableCols, 256)
 	blocks = cld(variableCols, threads)
-
+
+	kernelName = "evaluate_gpu"
 	# TODO: Implement batching as a middle ground between "transpile everything and then run" and "transpile one, run one", even though cudacall is async
-	@inbounds for i in eachindex(expressions)
+	@inbounds Threads.@threads for i in eachindex(expressions)
 		# if haskey(resultCache, expressions[i])
 		# 	kernels[i] = resultCache[expressions[i]]
 		# 	continue
 		# end
-
-		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
-		# try
-		linker = CuLink()
-		add_data!(linker, "ExpressionProcessing", kernel)
-
-		image = complete(linker)
-
-		mod = CuModule(image)
+		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
+		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
-		compiledKernel = CuFunction(mod, "ExpressionProcessing")
-		cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-
-		# kernels[i] = CuFunction(mod, "ExpressionProcessing")
-		# resultCache[expressions[i]] = kernels[i]
-		# catch
-		# 	dump(expressions[i]; maxdepth=10)
-		# 	println()
-		# 	println()
-		# 	println(kernel)
-		# 	println()
-		# 	println()
-		# 	error(current_exceptions())
-		# end
+		linker = CuLink()
+		add_data!(linker, kernelName, kernel)
+
+		image = complete(linker)
+		mod = CuModule(image)
+		compiledKernel = CuFunction(mod, kernelName)
+		cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 	end
 
 	# for kernel in kernels
@@ -107,13 +93,13 @@ end
 - param ```expressionIndex```: The 0-based index of the expression
 "
 function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
-	nrOfVariableSets::Integer, expressionIndex::Integer)::String
+	nrOfVariableSets::Integer, expressionIndex::Integer, kernelName::String)::String
 	exitJumpLocationMarker = "L__BB0_2"
 	ptxBuffer = IOBuffer()
 	regManager = Utils.RegisterManager(Dict(), Dict())
 
 	# TODO: Suboptimal solution. get_kernel_signature should also return the names of the registers used for the parameters, so that further below we do not have to hard-code them
-	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
+	signature, paramLoading = get_kernel_signature(kernelName, [Float32, Float32, Float32], regManager) # Vars, Params, Results
 	guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
 
 	println(ptxBuffer, get_cuda_header())
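Note on the hunk above: transpile and CuFunction now share kernelName, since CuFunction retrieves the kernel by the .visible .entry symbol that transpile emits into the PTX (the transpile docstring could eventually gain a matching param entry for kernelName). Below is a minimal sketch of the link-and-load flow, reusing only the CUDA.jl calls already present in this hunk; the helper name load_ptx_kernel is hypothetical, not part of the package.

using CUDA

# Hypothetical helper mirroring the body of the loop above: turn one
# transpiled PTX string into a callable kernel object.
function load_ptx_kernel(ptx::String, kernelName::String)::CuFunction
    linker = CuLink()
    add_data!(linker, kernelName, ptx)  # hand the PTX source to the linker
    image = complete(linker)            # link into a loadable binary image
    mod = CuModule(image)
    return CuFunction(mod, kernelName)  # name must match the PTX .entry symbol
end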
diff --git a/package/test/CpuInterpreterTests.jl b/package/test/CpuInterpreterTests.jl
index 1e76b1b..6f1f9cf 100644
--- a/package/test/CpuInterpreterTests.jl
+++ b/package/test/CpuInterpreterTests.jl
@@ -78,4 +78,34 @@ function test_cpu_interpreter_nikuradse()
 
 end
 
-@test test_cpu_interpreter_nikuradse()
+# @test test_cpu_interpreter_nikuradse()
+
+data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+X = convert(Matrix{Float32}, data)
+X_t = permutedims(X) # for gpu
+
+exprs = Expr[]
+parameters = Vector{Vector{Float32}}()
+varnames = ["x$i" for i in 1:10]
+paramnames = ["p$i" for i in 1:20]
+# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
+# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
+GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
+	for line in eachline(io)
+		expr, p = parse_infix(line, varnames, paramnames)
+
+		push!(exprs, expr)
+		push!(parameters, randn(Float32, length(p)))
+	end
+end
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, not X)
+
+suite = BenchmarkGroup()
+suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
+
+suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
+
+loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+
+results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
+BenchmarkTools.save("./results-fh-new/cpu.json", results)
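One caveat for the @benchmarkable definitions above and below: exprs, X, and parameters are untyped globals and are not interpolated, so every sample pays the usual global-lookup overhead. With runtimes in the seconds-to-hours range this is negligible, but should it ever matter, BenchmarkTools' $-interpolation is the standard remedy; a sketch:

using BenchmarkTools

# $-interpolation embeds the current values into the benchmark body,
# avoiding untyped-global access inside the timed code.
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu($exprs, $X, $parameters;
                                                           repetitions=$expr_reps, parallel=true)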
diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl
index aa2a8f0..91b43e7 100644
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@@ -50,37 +50,30 @@ expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially;
 
 # Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (do the tests on FH PCs)
 # University setup at 10.20.1.7 and 10.20.1.13
-compareWithCPU = false
+compareWithCPU = true
 
 suite = BenchmarkGroup()
-suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
 
-if compareWithCPU
-	suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
-	suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
-end
-
 # cacheInterpreter = Dict{Expr, PostfixType}()
 suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
 
 # cacheTranspilerFront = Dict{Expr, PostfixType}()
 # cacheTranspilerRes = Dict{Expr, CuFunction}()
-suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
 
-tune!(suite)
-BenchmarkTools.save("params.json", params(suite))
-
-throw("finished tuning")
+# tune!(suite)
+# BenchmarkTools.save("params.json", params(suite))
 
 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
 
-results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
+results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
+resultsCPU = BenchmarkTools.load("./results-fh-new/cpu.json")[1]
 
 if compareWithCPU
-	medianCPU = median(results["CPU"])
-	stdCPU = std(results["CPU"])
+	medianCPU = median(resultsCPU["CPU"])
+	stdCPU = std(resultsCPU["CPU"])
 	medianInterpreter = median(results["GPUI"])
 	stdInterpreter = std(results["GPUI"])
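Since the CPU baseline is now loaded from ./results-fh-new/cpu.json instead of being re-run, the two result sets can also be compared with BenchmarkTools' own estimators. A sketch, assuming both files end up holding the same benchmark key ("nikuradse_1"):

using BenchmarkTools

# Compare the fresh GPU interpreter median against the stored CPU median.
# judge() classifies the ratio as :regression, :improvement, or :invariant.
comparison = judge(median(results["GPUI"]["nikuradse_1"]),
                   median(resultsCPU["CPU"]["nikuradse_1"]))
println(comparison)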
varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]] \ No newline at end of file diff --git a/package/test/runtests.jl b/package/test/runtests.jl index f769550..9597f50 100644 --- a/package/test/runtests.jl +++ b/package/test/runtests.jl @@ -17,10 +17,10 @@ end @testset "CPU Interpreter" begin - # include("CpuInterpreterTests.jl") + include("CpuInterpreterTests.jl") end @testset "Performance tests" begin # include("PerformanceTuning.jl") - include("PerformanceTests.jl") + # include("PerformanceTests.jl") end \ No newline at end of file diff --git a/thesis/chapters/conceptdesign.tex b/thesis/chapters/conceptdesign.tex index 613457d..39e47ed 100644 --- a/thesis/chapters/conceptdesign.tex +++ b/thesis/chapters/conceptdesign.tex @@ -52,7 +52,7 @@ Based on the requirements and data structure above, the architecture of both pro \label{fig:kernel_architecture} \end{figure} -A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU. +A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. 
diff --git a/thesis/chapters/conceptdesign.tex b/thesis/chapters/conceptdesign.tex
index 613457d..39e47ed 100644
--- a/thesis/chapters/conceptdesign.tex
+++ b/thesis/chapters/conceptdesign.tex
@@ -52,7 +52,7 @@ Based on the requirements and data structure above, the architecture of both pro
 	\label{fig:kernel_architecture}
 \end{figure}
 
-A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU.
+A design decision made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch, as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model mitigates the negative effects of thread divergence, it is still best avoided where possible. For this use case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs can hold multiple resident grids, with modern GPUs accommodating 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which allows up to 128 kernels to run concurrently. Dispatching a kernel for each expression therefore further increases GPU utilisation. In the case of the interpreter, having a single kernel that is dispatched once per expression also simplifies the kernel itself: it can focus on evaluating one expression and needs no additional code to handle multiple expressions at once. Similarly, the transpiler can be simplified, as it generates many small kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only performs the operations as they occur in the expression itself, which further reduces the overhead on the GPU. One drawback of generating a kernel for each expression is the generation itself. Especially for small variable sets, transpiling an expression can take longer than evaluating it. For larger variable sets, however, this should not be a concern.
 
 \subsection{Pre-Processing}
 \label{sec:pre-processing}
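The drawback noted at the end of the added paragraph (transpilation possibly dominating evaluation for small variable sets) can be spot-checked with the suite from this patch. A sketch under assumptions: the inputs X_t_small and params_small and the argument values (2 variables, 1 parameter, 100 variable sets) are made up for illustration, and the transpile signature is the one introduced above.

using BenchmarkTools

# Hypothetical spot check: PTX generation time for one expression versus one
# full GPU evaluation of that expression on a small variable set.
expr = :(x1 + p1 * x2)
fmt  = ExpressionProcessing.expr_to_postfix(expr)

X_t_small    = randn(Float32, 2, 100)  # 2 variables, 100 variable sets (transposed layout)
params_small = randn(Float32, 1)       # a single parameter p1

t_gen  = @belapsed Transpiler.transpile($fmt, 2, 1, 100, 0, "evaluate_gpu")
t_eval = @belapsed evaluate_gpu([$expr], $X_t_small, [$params_small]; repetitions=1)

println("transpile: $(t_gen)s, evaluate: $(t_eval)s")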