benchmarking: updated benchmarking suite and prepared for taking the benchmarks

Author: Daniel
Date: 2025-05-15 16:25:32 +02:00
parent 3d80ae95e4
commit d7e18f183d
6 changed files with 57 additions and 48 deletions


@@ -57,39 +57,25 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
     threads = min(variableCols, 256)
     blocks = cld(variableCols, threads)
+    kernelName = "evaluate_gpu"
     # TODO: Implement batching as a middle ground between "transpile everything and then run" and "transpile one, run one", even though cudacall is async
-    @inbounds for i in eachindex(expressions)
+    @inbounds Threads.@threads for i in eachindex(expressions)
         # if haskey(resultCache, expressions[i])
         #     kernels[i] = resultCache[expressions[i]]
         #     continue
         # end
         formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because Julia is 1-based but PTX needs 0-based indexing
+        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
-        # try
         linker = CuLink()
-        add_data!(linker, "ExpressionProcessing", kernel)
+        add_data!(linker, kernelName, kernel)
         image = complete(linker)
         mod = CuModule(image)
-        compiledKernel = CuFunction(mod, "ExpressionProcessing")
+        compiledKernel = CuFunction(mod, kernelName)
-        # kernels[i] = CuFunction(mod, "ExpressionProcessing")
-        # resultCache[expressions[i]] = kernels[i]
-        # catch
-        #     dump(expressions[i]; maxdepth=10)
-        #     println()
-        #     println()
-        #     println(kernel)
-        #     println()
-        #     println()
-        #     error(current_exceptions())
-        # end
         cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
     end
     # for kernel in kernels
@@ -107,13 +93,13 @@ end
 - param ```expressionIndex```: The 0-based index of the expression
 "
 function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
-    nrOfVariableSets::Integer, expressionIndex::Integer)::String
+    nrOfVariableSets::Integer, expressionIndex::Integer, kernelName::String)::String
     exitJumpLocationMarker = "L__BB0_2"
     ptxBuffer = IOBuffer()
     regManager = Utils.RegisterManager(Dict(), Dict())
     # TODO: Suboptimal solution. get_kernel_signature should also return the name of the registers used for the parameters, so further below, we do not have to hard-code them
-    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
+    signature, paramLoading = get_kernel_signature(kernelName, [Float32, Float32, Float32], regManager) # Vars, Params, Results
     guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
     println(ptxBuffer, get_cuda_header())
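
Note on the batching TODO in the evaluate loop above: a middle ground between "transpile everything, then run" and "transpile one, run one" could chunk the expressions, transpile a whole chunk, then dispatch that chunk before moving on. The sketch below is only illustrative and not part of the commit; it assumes the package's ExpressionProcessing.expr_to_postfix, Utils.get_max_inner_length and transpile are available as shown in the diff, and the function name evaluate_batched and the batchsize default are made up for the example.

using CUDA # CuLink, add_data!, complete, CuModule, CuFunction, cudacall

# Illustrative only: batch-wise transpile-then-dispatch as a middle ground.
function evaluate_batched(expressions, varRows, parameters, variableCols,
                          cudaVars, cudaParams, cudaResults;
                          batchsize=32, threads=256, blocks=1, kernelName="evaluate_gpu")
    for chunk in Iterators.partition(eachindex(expressions), batchsize)
        # Phase 1: transpile every expression of the chunk to PTX.
        ptxKernels = map(chunk) do i
            formatted = ExpressionProcessing.expr_to_postfix(expressions[i])
            transpile(formatted, varRows, Utils.get_max_inner_length(parameters),
                      variableCols, i - 1, kernelName) # 0-based index for PTX
        end
        # Phase 2: link and launch the whole chunk. cudacall returns as soon as the
        # kernel is queued, so the GPU can still be working on this chunk while the
        # next chunk is being transpiled on the CPU.
        for ptx in ptxKernels
            linker = CuLink()
            add_data!(linker, kernelName, ptx)
            mod = CuModule(complete(linker))
            fn = CuFunction(mod, kernelName)
            cudacall(fn, (CuPtr{Float32}, CuPtr{Float32}, CuPtr{Float32}),
                     cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
        end
    end
end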


@@ -78,4 +78,34 @@ function test_cpu_interpreter_nikuradse()
 end
-@test test_cpu_interpreter_nikuradse()
+# @test test_cpu_interpreter_nikuradse()
+data, varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+X = convert(Matrix{Float32}, data)
+X_t = permutedims(X) # for gpu
+exprs = Expr[]
+parameters = Vector{Vector{Float32}}()
+varnames = ["x$i" for i in 1:10]
+paramnames = ["p$i" for i in 1:20]
+# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
+# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
+GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
+    for line in eachline(io)
+        expr, p = parse_infix(line, varnames, paramnames)
+        push!(exprs, expr)
+        push!(parameters, randn(Float32, length(p)))
+    end
+end
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
+suite = BenchmarkGroup()
+suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
+suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
+loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
+BenchmarkTools.save("./results-fh-new/cpu.json", results)
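
For context, the loadparams! call above relies on benchmark parameters that were tuned once and saved to params.json, so this CPU run and the GPU runs in the other script use identical settings. A minimal, self-contained BenchmarkTools.jl sketch of that tune-once/reuse pattern follows; the workload and file names are placeholders, not the project's.

using BenchmarkTools

suite = BenchmarkGroup()
suite["demo"] = BenchmarkGroup()
suite["demo"]["sum"] = @benchmarkable sum(x) setup=(x = rand(1000)) # placeholder workload

# One-time tuning run: determine per-benchmark evals/samples and persist them.
tune!(suite)
BenchmarkTools.save("tuned_params.json", params(suite))

# Later runs (possibly on another machine or for another backend) reuse the
# stored parameters instead of re-tuning, keeping results comparable.
loadparams!(suite, BenchmarkTools.load("tuned_params.json")[1], :evals, :samples)
results = run(suite, verbose=true)
BenchmarkTools.save("demo_results.json", results)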


@@ -50,37 +50,30 @@ expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially;
 # Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (do the tests on FH PCs)
 # University setup at 10.20.1.7 and 10.20.1.13
-compareWithCPU = false
+compareWithCPU = true
 suite = BenchmarkGroup()
-suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
-if compareWithCPU
-    suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
-    suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
-end
 # cacheInterpreter = Dict{Expr, PostfixType}()
 suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
 # cacheTranspilerFront = Dict{Expr, PostfixType}()
 # cacheTranspilerRes = Dict{Expr, CuFunction}()
-suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
-tune!(suite)
-BenchmarkTools.save("params.json", params(suite))
-throw("finished tuning")
+# tune!(suite)
+# BenchmarkTools.save("params.json", params(suite))
 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
-results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
+results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
+resultsCPU = BenchmarkTools.load("./results-fh-new/cpu.json")[1]
 if compareWithCPU
-    medianCPU = median(results["CPU"])
-    stdCPU = std(results["CPU"])
+    medianCPU = median(resultsCPU["CPU"])
+    stdCPU = std(resultsCPU["CPU"])
     medianInterpreter = median(results["GPUI"])
     stdInterpreter = std(results["GPUI"])
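
Once the stored CPU results and the fresh GPU results are available, the medians can be compared directly with BenchmarkTools. A possible continuation of the comparison block above, assuming both groups contain the "nikuradse_1" entry (the tolerance value is illustrative):

    # Compare the GPU interpreter against the CPU baseline for the shared benchmark key.
    gpuVsCpu = judge(medianInterpreter["nikuradse_1"], medianCPU["nikuradse_1"]; time_tolerance=0.05)
    println("GPU interpreter vs CPU: ", gpuVsCpu) # reports improvement, regression or invariant
    println("Speedup (CPU time / GPU time): ", ratio(medianCPU["nikuradse_1"], medianInterpreter["nikuradse_1"]))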


@@ -1 +1 @@
-[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
+[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]


@@ -17,10 +17,10 @@ end
 @testset "CPU Interpreter" begin
-    # include("CpuInterpreterTests.jl")
+    include("CpuInterpreterTests.jl")
 end
 @testset "Performance tests" begin
     # include("PerformanceTuning.jl")
-    include("PerformanceTests.jl")
+    # include("PerformanceTests.jl")
 end


@@ -52,7 +52,7 @@ Based on the requirements and data structure above, the architecture of both pro
     \label{fig:kernel_architecture}
 \end{figure}
-A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU.
+A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch, as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which therefore allows up to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression is the generation itself. Especially for smaller variable sets, it is possible that the time it takes to transpile an expression is greater than the time it takes to evaluate it. For larger variable sets, however, this should not be a concern.
 \subsection{Pre-Processing}
 \label{sec:pre-processing}
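
As an aside to the paragraph added in this hunk: the one-kernel-per-expression dispatch it describes can be illustrated with a small, self-contained CUDA.jl sketch that is independent of the project's transpiler. The placeholder kernel, the sizes, and the use of one Julia task per launch are assumptions for the example; CUDA.jl gives each Julia task its own stream, which is what lets the independent grids of different "expressions" overlap.

using CUDA

# Placeholder "expression kernel": the real transpiler emits one PTX kernel per
# expression; a simple Julia GPU kernel stands in for it here.
function expr_kernel(out, x, scale)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = scale * x[i] + 1.0f0
    end
    return nothing
end

vars    = CUDA.rand(Float32, 10_000)
nExprs  = 8
results = [CUDA.zeros(Float32, length(vars)) for _ in 1:nExprs]
threads = 256
blocks  = cld(length(vars), threads)

# One kernel dispatch per "expression"; each launch happens from its own task,
# and therefore on its own stream, so the grids may execute concurrently.
@sync for e in 1:nExprs
    Threads.@spawn @cuda threads=threads blocks=blocks expr_kernel(results[e], vars, Float32(e))
end
CUDA.device_synchronize() # wait until all grids have finished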