8 Commits

Author SHA1 Message Date
39302ffe9e benchmarking: added results for transpiler 2025-05-20 18:55:15 +02:00
250deb334c benchmarking: tuned interpreter blocksize 2025-05-20 09:05:35 +02:00
a9ffd5da63 benchmarking: fixed bugs introduced by modification of transpiler 2025-05-19 12:29:05 +02:00
e29199d865 benchmarking: moved compilation of kernel to evaluate function, as it required too much memory 2025-05-19 11:58:24 +02:00
f33551e25f benchmarking: updated transpiler to drastically reduce the number of transpilations at the expense of memory usage 2025-05-19 11:39:49 +02:00
33e7edd4c8 benchmarking: added results for first optimisation for transpiler 2025-05-19 09:14:16 +02:00
09ca7ac6c6 benchmarking: added initial results for transpiler 2025-05-19 09:12:06 +02:00
93516c54d4 evaluation: started describing benchmark environment 2025-05-18 14:17:47 +02:00
9 changed files with 710 additions and 94 deletions

View File

@@ -49,19 +49,25 @@ end
 # Convert Expressions to PTX Code and execute that instead
 function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
 	@assert axes(expressions) == axes(p)
-	variableCols = size(X, 2)
-	variableRows = size(X, 1)
+	numVariableSets = size(X, 2) # nr. of columns of X
+	variableSetSize = size(X, 1) # nr. of rows of X
 	variables = CuArray(X)
 	exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
+	largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into matrix. Will be nr. of rows in parameter matrix
+	ptxKernels = Vector{String}(undef, length(expressions))
+	kernelName = "evaluate_gpu"
 	@inbounds Threads.@threads for i in eachindex(expressions)
-		exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+		ex = ExpressionProcessing.expr_to_postfix(expressions[i])
+		ptxKernels[i] = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
 	end
-	results = Matrix{Float32}(undef, variableCols, length(exprs))
+	results = Matrix{Float32}(undef, numVariableSets, length(expressions))
 	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
-		results = Transpiler.evaluate(exprs, variables, variableCols, variableRows, p)
+		# evaluate
+		# results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p)
+		results = Transpiler.evaluate(ptxKernels, variables, numVariableSets, p, kernelName)
 	end
 	return results
@@ -103,7 +109,6 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
res
end
# Flow
# input: Vector expr == expressions contains e.g. 4 expressions
# Matrix X == |expr| columns, n rows. n == number of variables x1..xn; n is the same for all expressions --- WRONG
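For orientation, a minimal usage sketch of the updated entry point (toy inputs; assumes evaluate_gpu and its dependencies are in scope, so this is an illustration rather than repository code):

# 3 expressions over 5 variables and 362 variable sets (columns of X); one parameter vector per expression
exprs = [:(x1 + p1), :(x2 * p1 + p2), :(log(abs(x3)))]
X = randn(Float32, 5, 362)
p = [randn(Float32, 2) for _ in exprs]
# `results` has one row per variable set and one column per expression;
# `repetitions` simulates the sequential parameter-optimisation loop
results = evaluate_gpu(exprs, X, p; repetitions=100)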

View File

@@ -25,7 +25,7 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	@inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
-		numThreads = min(variableColumns, 256)
+		numThreads = min(variableColumns, 121)
 		numBlocks = cld(variableColumns, numThreads)
 		@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
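The new blocksize can be sanity-checked with a small helper: for the 362 variable sets per expression used in the benchmarks (a figure taken from the tuning notes further down), a blocksize of 121 launches 3 blocks with a single idle thread, whereas 256 launches 2 blocks with 150 idle threads:

# Idle threads launched for a given number of variable sets and blocksize
function wasted_threads(varsets, blocksize)
    threads = min(varsets, blocksize)
    blocks = cld(varsets, threads)
    return threads * blocks - varsets
end
wasted_threads(362, 256) # 150 idle threads (2 blocks of 256)
wasted_threads(362, 121) # 1 idle thread (3 blocks of 121)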

View File

@@ -14,37 +14,6 @@ const Operand = Union{Float32, String} # Operand is either fixed value or regist
 "
 function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVars::CuArray{Float32}, variableColumns::Integer, variableRows::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
-	# TODO: test this again with multiple threads. The first time I tried, I was using only one thread
-	# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
-	# Threads.@threads for i in eachindex(expressions)
-	#	cacheLock = ReentrantLock()
-	#	cacheHit = false
-	#	lock(cacheLock) do
-	#		if haskey(transpilerCache, expressions[i])
-	#			kernels[i] = transpilerCache[expressions[i]]
-	#			cacheHit = true
-	#		end
-	#	end
-	#	if cacheHit
-	#		continue
-	#	end
-	#	formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-	#	kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableColumns, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
-	#	linker = CuLink()
-	#	add_data!(linker, "ExpressionProcessing", kernel)
-	#	image = complete(linker)
-	#	mod = CuModule(image)
-	#	kernels[i] = CuFunction(mod, "ExpressionProcessing")
-	#	@lock cacheLock transpilerCache[expressions[i]] = kernels[i]
-	# end
 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
@@ -54,33 +23,45 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVar
 	blocks = cld(variableColumns, threads)
 	kernelName = "evaluate_gpu"
 	# TODO: Implement batching as a middle ground between "transpile everything and then run" and "transpile one, run one", even though cudacall is async
 	@inbounds Threads.@threads for i in eachindex(expressions)
-		# if haskey(resultCache, expressions[i])
-		#	kernels[i] = resultCache[expressions[i]]
-		#	continue
-		# end
-		# formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
 		kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
-		linker = CuLink()
-		add_data!(linker, kernelName, kernel)
-		image = complete(linker)
-		mod = CuModule(image)
-		compiledKernel = CuFunction(mod, kernelName)
+		compiledKernel = CompileKernel(kernel, kernelName)
 		cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 	end
-	# for kernel in kernels
-	#	cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-	# end
 	return cudaResults
 end
+"
+A simplified version of the evaluate function. It takes a list of already transpiled kernels to be executed. This should yield better performance in cases where the same expressions are evaluated multiple times, i.e. for parameter optimisation.
+"
+function evaluate(kernels::Vector{String}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}}, kernelName::String)::Matrix{Float32}
+	cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
+	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
+	cudaResults = CuArray{Float32}(undef, nrOfVariableSets, length(kernels))
+	threads = min(nrOfVariableSets, 256)
+	blocks = cld(nrOfVariableSets, threads)
+	@inbounds Threads.@threads for i in eachindex(kernels)
+		compiledKernel = CompileKernel(kernels[i], kernelName)
+		cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+	end
+	return cudaResults
+end
+function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction
+	linker = CuLink()
+	add_data!(linker, kernelName, ptxKernel)
+	image = complete(linker)
+	mod = CuModule(image)
+	return CuFunction(mod, kernelName)
+end
 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 # seekstart(buf1); write(buf2, buf1)
 "

View File

@@ -1,30 +1,38 @@
 using CUDA
 using DelimitedFiles
 using GZip
 using .Transpiler
 using .Interpreter
-varsets_medium = 10000
-X = randn(Float32, 5, varsets_medium)
+include("parser.jl") # to parse expressions from a file
-exprsGPU = [
-	# CPU interpreter requires an anonymous function and array refs
-	:(p1 * x1 + p2), # 5 op
-	:((((x1 + x2) + x3) + x4) + x5), # 9 op
-	:(log(abs(x1))), # 3 op
-	:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
-] # 30 op
-# p is the same for CPU and GPU
-p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr
+data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+X = permutedims(convert(Matrix{Float32}, data))
+exprs = Expr[]
+parameters = Vector{Vector{Float32}}()
+varnames = ["x$i" for i in 1:10]
+paramnames = ["p$i" for i in 1:20]
+# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
+# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
+GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io
+	for line in eachline(io)
+		expr, p = parse_infix(line, varnames, paramnames)
+		push!(exprs, expr)
+		push!(parameters, randn(Float32, length(p)))
+	end
+end
 expr_reps = 1
 @testset "Interpreter Tuning" begin
-	CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps)
+	# CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
 end
 @testset "Transpiler Tuning" begin
-	CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
+	CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
 end
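For quicker smoke tests, the loading loop above could be capped (a hypothetical variant; maxExprs is not in the repository):

maxExprs = 10_000 # hypothetical cap for fast runs
GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io
    for (n, line) in enumerate(eachline(io))
        n > maxExprs && break
        expr, p = parse_infix(line, varnames, paramnames)
        push!(exprs, expr)
        push!(parameters, randn(Float32, length(p)))
    end
end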

View File

@@ -1 +1,194 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1825331206,"gctimes":[1.8938185191e10,1.7792800779e10,1.8160529276e10,1.7946505031e10,1.77973843e10,1.7616008261e10,1.7620413248e10,1.768910028e10,1.772636066e10,1.7706216778e10,1.8173891003e10,1.7667273912e10,1.7526904901e10,1.749445276e10,1.7567194654e10,1.7649119926e10,1.7639951452e10,1.7533807088e10,1.7517726514e10,1.7626783198e10,1.7511788769e10,1.7492068732e10,1.7553945009e10,1.7478083952e10,1.7437663283e10,1.7472329594e10,1.7519969261e10,1.7519953931e10,1.7526082936e10,1.751558218e10,1.7402059945e10,1.7250338348e10,1.7250474046e10,1.7291033872e10,1.7551432788e10,1.7850397239e10,1.7847877387e10,1.7447038841e10,1.754309134e10,1.7566433958e10,1.7503437877e10,1.7647987775e10,1.7401002748e10,1.7385713445e10,1.7385171642e10,1.7348026466e10,1.7438744763e10,1.7309013112e10,1.7577725655e10,1.7432755306e10],"memory":115414870368,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[5.31951749725e11,5.31404501757e11,5.33657147801e11,5.31489160462e11,5.30386250505e11,5.30026023598e11,5.29887080071e11,5.34175638749e11,5.32476620162e11,5.32276123554e11,5.43002738488e11,5.30251592144e11,5.30190125835e11,5.28451973319e11,5.30828202555e11,5.29236820908e11,5.3205118374e11,5.30259980405e11,5.29369982343e11,5.29968522607e11,5.29094509442e11,5.3023736481e11,5.3026832017e11,5.30138026522e11,5.30291814111e11,5.28886430445e11,5.30786719418e11,5.31872294453e11,5.29735616869e11,5.32322531477e11,5.32945923244e11,5.28063077052e11,5.26379810748e11,5.2904720469e11,5.33989526381e11,5.37245240551e11,5.37790009675e11,5.30206196299e11,5.30276314709e11,5.30385782035e11,5.29114269928e11,5.31785585619e11,5.28768646361e11,5.27012226469e11,5.26681637262e11,5.28646301524e11,5.27917175176e11,5.28633753225e11,5.29807712794e11,5.27063144055e11]}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
[
{
"Julia": "1.11.5",
"BenchmarkTools": {
"major": 1,
"minor": 6,
"patch": 0,
"prerelease": [],
"build": []
}
},
[
[
"BenchmarkGroup",
{
"data": {
"GPUT": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 10537236713,
"gctimes": [
6.422630609021e12
],
"memory": 99746249534032,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 28800.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
5.4294504010681e13
]
}
]
},
"tags": [
"GPUTranspiler"
]
}
],
"GPUI": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 1825331206,
"gctimes": [
1.8938185191e10,
1.7792800779e10,
1.8160529276e10,
1.7946505031e10,
1.77973843e10,
1.7616008261e10,
1.7620413248e10,
1.768910028e10,
1.772636066e10,
1.7706216778e10,
1.8173891003e10,
1.7667273912e10,
1.7526904901e10,
1.749445276e10,
1.7567194654e10,
1.7649119926e10,
1.7639951452e10,
1.7533807088e10,
1.7517726514e10,
1.7626783198e10,
1.7511788769e10,
1.7492068732e10,
1.7553945009e10,
1.7478083952e10,
1.7437663283e10,
1.7472329594e10,
1.7519969261e10,
1.7519953931e10,
1.7526082936e10,
1.751558218e10,
1.7402059945e10,
1.7250338348e10,
1.7250474046e10,
1.7291033872e10,
1.7551432788e10,
1.7850397239e10,
1.7847877387e10,
1.7447038841e10,
1.754309134e10,
1.7566433958e10,
1.7503437877e10,
1.7647987775e10,
1.7401002748e10,
1.7385713445e10,
1.7385171642e10,
1.7348026466e10,
1.7438744763e10,
1.7309013112e10,
1.7577725655e10,
1.7432755306e10
],
"memory": 115414870368,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 28800.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
5.31951749725e11,
5.31404501757e11,
5.33657147801e11,
5.31489160462e11,
5.30386250505e11,
5.30026023598e11,
5.29887080071e11,
5.34175638749e11,
5.32476620162e11,
5.32276123554e11,
5.43002738488e11,
5.30251592144e11,
5.30190125835e11,
5.28451973319e11,
5.30828202555e11,
5.29236820908e11,
5.3205118374e11,
5.30259980405e11,
5.29369982343e11,
5.29968522607e11,
5.29094509442e11,
5.3023736481e11,
5.3026832017e11,
5.30138026522e11,
5.30291814111e11,
5.28886430445e11,
5.30786719418e11,
5.31872294453e11,
5.29735616869e11,
5.32322531477e11,
5.32945923244e11,
5.28063077052e11,
5.26379810748e11,
5.2904720469e11,
5.33989526381e11,
5.37245240551e11,
5.37790009675e11,
5.30206196299e11,
5.30276314709e11,
5.30385782035e11,
5.29114269928e11,
5.31785585619e11,
5.28768646361e11,
5.27012226469e11,
5.26681637262e11,
5.28646301524e11,
5.27917175176e11,
5.28633753225e11,
5.29807712794e11,
5.27063144055e11
]
}
]
},
"tags": [
"GPUInterpreter"
]
}
]
},
"tags": []
}
]
]
]

View File

@@ -1 +1,194 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":768768117,"gctimes":[1.1975019005e10,7.985238732e9,1.4256539541e10,8.877686056e9,1.4680883881e10,7.692335492e9,9.536354709e9,1.3536376614e10,1.4238839111e10,1.9925752838e10,9.025028453e9,1.5572506957e10,1.952938358e10,1.1815896105e10,1.3613672963e10,1.155423324e10,1.4004956257e10,8.806173097e9,8.174429914e9,1.3263383027e10,1.0794204698e10,1.5559450665e10,1.1655933294e10,1.0337481053e10,1.736781041e10,1.7557373752e10,1.0408159512e10,1.9575876788e10,1.1552463317e10,1.226612493e10,1.39046431e10,1.4741246638e10,1.3349550404e10,1.1029748223e10,1.2336413042e10,1.8974104972e10,1.62980404e10,1.7060266354e10,1.4275735627e10,1.1090002413e10,9.354486934e9,1.0120009791e10,1.2904978229e10,1.9392024576e10,1.4288312066e10,9.172039439e9,1.1963691856e10,1.7642492412e10,1.4929130699e10,1.5905152758e10],"memory":54082719144,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":43200.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[5.14174363969e11,5.18689077274e11,5.1025535864e11,5.10803229124e11,5.23299818383e11,5.16455770592e11,5.02350694438e11,5.0439224751e11,5.04269366358e11,5.06595858959e11,5.11724089224e11,5.1262595436e11,5.03168612131e11,5.21219083737e11,5.00099394667e11,5.11001185335e11,5.08254610458e11,5.15228010681e11,5.1538764885e11,5.00595179658e11,5.09523742228e11,5.09818545112e11,5.14655215639e11,5.14933349609e11,5.0169600001e11,5.12605187963e11,5.08668518972e11,4.99756633692e11,5.04657100071e11,4.96300433311e11,5.02859857609e11,5.00544153225e11,5.01888246474e11,5.10711561485e11,5.1255887708e11,5.03690773615e11,4.98071106526e11,5.14512763271e11,5.06840174712e11,5.18008421655e11,5.1741870342e11,5.01369775936e11,5.08726698998e11,5.04550273414e11,5.06774233833e11,5.16671635611e11,5.09574401096e11,5.03123609086e11,5.11987873937e11,5.03337347704e11]}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
[
{
"Julia": "1.11.5",
"BenchmarkTools": {
"major": 1,
"minor": 6,
"patch": 0,
"prerelease": [],
"build": []
}
},
[
[
"BenchmarkGroup",
{
"data": {
"GPUT": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 9578295211,
"gctimes": [
5.773640884485e12
],
"memory": 99694581250168,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
5.1630263257036e13
]
}
]
},
"tags": [
"GPUTranspiler"
]
}
],
"GPUI": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 768768117,
"gctimes": [
1.1975019005e10,
7.985238732e9,
1.4256539541e10,
8.877686056e9,
1.4680883881e10,
7.692335492e9,
9.536354709e9,
1.3536376614e10,
1.4238839111e10,
1.9925752838e10,
9.025028453e9,
1.5572506957e10,
1.952938358e10,
1.1815896105e10,
1.3613672963e10,
1.155423324e10,
1.4004956257e10,
8.806173097e9,
8.174429914e9,
1.3263383027e10,
1.0794204698e10,
1.5559450665e10,
1.1655933294e10,
1.0337481053e10,
1.736781041e10,
1.7557373752e10,
1.0408159512e10,
1.9575876788e10,
1.1552463317e10,
1.226612493e10,
1.39046431e10,
1.4741246638e10,
1.3349550404e10,
1.1029748223e10,
1.2336413042e10,
1.8974104972e10,
1.62980404e10,
1.7060266354e10,
1.4275735627e10,
1.1090002413e10,
9.354486934e9,
1.0120009791e10,
1.2904978229e10,
1.9392024576e10,
1.4288312066e10,
9.172039439e9,
1.1963691856e10,
1.7642492412e10,
1.4929130699e10,
1.5905152758e10
],
"memory": 54082719144,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
5.14174363969e11,
5.18689077274e11,
5.1025535864e11,
5.10803229124e11,
5.23299818383e11,
5.16455770592e11,
5.02350694438e11,
5.0439224751e11,
5.04269366358e11,
5.06595858959e11,
5.11724089224e11,
5.1262595436e11,
5.03168612131e11,
5.21219083737e11,
5.00099394667e11,
5.11001185335e11,
5.08254610458e11,
5.15228010681e11,
5.1538764885e11,
5.00595179658e11,
5.09523742228e11,
5.09818545112e11,
5.14655215639e11,
5.14933349609e11,
5.0169600001e11,
5.12605187963e11,
5.08668518972e11,
4.99756633692e11,
5.04657100071e11,
4.96300433311e11,
5.02859857609e11,
5.00544153225e11,
5.01888246474e11,
5.10711561485e11,
5.1255887708e11,
5.03690773615e11,
4.98071106526e11,
5.14512763271e11,
5.06840174712e11,
5.18008421655e11,
5.1741870342e11,
5.01369775936e11,
5.08726698998e11,
5.04550273414e11,
5.06774233833e11,
5.16671635611e11,
5.09574401096e11,
5.03123609086e11,
5.11987873937e11,
5.03337347704e11
]
}
]
},
"tags": [
"GPUInterpreter"
]
}
]
},
"tags": []
}
]
]
]

View File

@@ -0,0 +1,196 @@
[
{
"Julia": "1.11.5",
"BenchmarkTools": {
"major": 1,
"minor": 6,
"patch": 0,
"prerelease": [],
"build": []
}
},
[
[
"BenchmarkGroup",
{
"data": {
"GPUT": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 1534044518,
"gctimes": [
3.162096218033e12,
2.514920522839e12
],
"memory": 51380856414712,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
3.579290341907e13,
3.6476991686227e13
]
}
]
},
"tags": [
"GPUTranspiler"
]
}
],
"GPUI": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 768767740,
"gctimes": [
1.4209871071e10,
8.529233725e9,
8.165943693e9,
8.180014668e9,
8.231263428e9,
1.1110946388e10,
1.3136749872e10,
1.0515143897e10,
1.2978886885e10,
1.0709110363e10,
1.2408937103e10,
1.4486745203e10,
1.3229416582e10,
1.8353010658e10,
1.32173253e10,
1.1621004633e10,
1.1136122325e10,
9.614762707e9,
1.4564265563e10,
9.399404156e9,
1.063983064e10,
1.2513746965e10,
9.039906393e9,
1.2382209752e10,
1.3127092115e10,
1.2713843793e10,
1.1111974511e10,
1.5837882785e10,
1.5005237417e10,
1.2439743996e10,
9.607861366e9,
1.0680724758e10,
1.4012997282e10,
1.258804731e10,
1.020862355e10,
9.630750655e9,
1.5428270551e10,
1.746317266e10,
1.3141055589e10,
1.5009128259e10,
8.453648604e9,
1.6874341516e10,
1.1411307067e10,
1.2542892313e10,
1.1232296452e10,
1.3458245148e10,
1.0818032806e10,
9.239119183e9,
1.7897566617e10,
1.565065385e10
],
"memory": 54082712568,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
4.72169572882e11,
5.0409909815e11,
5.07815085942e11,
5.10453558146e11,
5.10478958938e11,
4.97262381193e11,
5.0260603513e11,
4.99542972531e11,
4.87993778737e11,
4.89021704445e11,
5.03746768492e11,
4.89869107858e11,
4.73146154356e11,
4.8171801387e11,
5.08579879922e11,
4.949573335e11,
4.72187897068e11,
4.99229768599e11,
4.60419913288e11,
4.69019613895e11,
4.50583091837e11,
4.72792727311e11,
4.72333754492e11,
4.65152305777e11,
4.82234976786e11,
4.72238483765e11,
4.73826923338e11,
4.76267120461e11,
4.87120033427e11,
5.04120244741e11,
4.69559064737e11,
4.72201757593e11,
4.69914031792e11,
4.93629873162e11,
4.71968584791e11,
5.01452793581e11,
4.80458931455e11,
4.83065538379e11,
4.99070229147e11,
4.71609869279e11,
4.71492369998e11,
4.58522950715e11,
4.80960881323e11,
4.91960762476e11,
4.73412762655e11,
4.69283546561e11,
4.66574358844e11,
4.67318993209e11,
4.5724723899e11,
4.7334516285e11
]
}
]
},
"tags": [
"GPUInterpreter"
]
}
]
},
"tags": []
}
]
]
]

View File

@@ -1,42 +1,82 @@
 \chapter{Evaluation}
 \label{cha:evaluation}
-The aim of this thesis is to determine whether at least one of the GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation. First, the environment in which the performance tests are performed is explained. Then the individual results for the GPU interpreter and the transpiler are presented. In addition, this part also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to the CPU evaluator in order to answer the research questions of this thesis.
+This thesis aims to determine whether one of the two GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation process. First, the environment in which the performance benchmarks are conducted is explained. Next, the results for the GPU interpreter and the transpiler are presented individually, together with the performance tuning steps taken to achieve them. Finally, the results of the GPU evaluators are compared to those of the CPU evaluator to answer the research questions of this thesis.
-\section{Test environment}
-Explain the hardware used, as well as the actual data (how many expressions, variables etc.)
+\section{Benchmark Environment}
+In this section, the benchmark environment used to evaluate the performance is outlined. To ensure the validity and reliability of the results, it is necessary to specify the details of this environment, including the hardware and software configuration as well as the performance evaluation process. This minimises the variance between results and allows for better reproducibility and comparability between the implementations.
+\subsection{Hardware Configuration}
+The hardware configuration is the most important aspect of the benchmark environment. The capabilities of both the CPU and GPU can have a significant impact on the resulting performance. The following sections outline the importance of the individual components as well as the hardware used for the benchmarks.
+\subsubsection{GPU}
+The GPU is especially important, as different microarchitectures typically require different optimisations. While the evaluators can generally run on any Nvidia GPU with a compute capability of at least 6.1, they are tuned for the Ampere microarchitecture with a compute capability of 8.6. More modern microarchitectures can be used as well; however, additional tuning is required to ensure the evaluators utilise the hardware to its fullest potential.
+\subsubsection{CPU}
+Although the GPU plays a crucial role, work is also carried out on the CPU. The interpreter mainly uses the CPU for data transfer and the pre-processing step and is therefore more GPU-bound. The transpiler, however, additionally needs the CPU to perform the transpilation step. This step produces a kernel for each expression and also involves sending these kernels to the driver for compilation, a process likewise performed by the CPU. By contrast, the interpreter has only one kernel, which needs to be converted into PTX and compiled by the driver only once. Consequently, the transpiler is much more CPU-bound, and variations in the CPU used have a much greater impact. Therefore, using a more powerful CPU benefits the transpiler more than the interpreter.
+\subsubsection{System Memory}
+In addition to the hardware configuration of the GPU and CPU, system memory (RAM) also plays a crucial role. While RAM does not directly contribute to the overall performance, it can have a noticeable indirect impact due to its role in caching. Insufficient RAM forces the operating system to use the page file, which is stored on a much slower SSD. This results in slower cache access, thereby reducing the overall performance of the application.
+As seen in the list below, only 16 GB of RAM were available during the benchmarking process. This amount is insufficient to utilise caching to the extent outlined in Chapter \ref{cha:implementation}, which means some caching had to be disabled; this is further explained in Section \ref{sec:results}.
+\subsubsection{Hardware}
+With the requirements explained above in mind, the following hardware is used to perform the benchmarks for the CPU-based evaluator, which serves as the baseline, as well as for the GPU-based evaluators:
+\begin{itemize}
+	\item Intel i5 12500
+	\item Nvidia RTX 3060 Ti
+	\item 16 GB 4400 MT/s DDR5 RAM
+\end{itemize}
+\subsection{Software Configuration}
+Apart from the hardware, the performance of the evaluators can also be significantly affected by the software. Primarily, the following three software components influence performance:
+\begin{itemize}
+	\item GPU Driver
+	\item Julia
+	\item CUDA.jl
+\end{itemize}
+Typically, newer versions of these components include performance improvements, among other things, which is why it is important to specify the versions used for benchmarking. The benchmarks use GPU driver version \emph{561.17}, Julia version \emph{1.11.5}, and CUDA.jl version \emph{5.8.1}. As with the hardware configuration, this ensures that the results are reproducible and comparable.
+\subsection{Performance Evaluation Process}
+% explain the actual data
+% Nikuradse dataset (flow rate through rough pipes (fact-check this again))
+% 250k expressions; ~300 variable sets; 100 parameter optimisation steps (simulated)
+% using BenchmarkTools.jl as a tried and tested benchmark suite (see the sketch below)
+% 50 samples to eliminate any run-to-run variance
-three scenarios -> few, normal and many variable sets; expr repetitions to simulate parameter optimisation
-Benchmarktools.jl -> 1000 samples per scenario
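The notes above could translate into a BenchmarkTools.jl suite along the following lines (a sketch under the stated 50-sample setup; the group names, tags and time budget mirror the result files above, while the exact wiring is an assumption):

using BenchmarkTools

suite = BenchmarkGroup()
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

# 50 samples per trial, one evaluation per sample, generous time budget (seconds=43200.0, as in the result files)
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu($exprs, $X, $parameters; repetitions=$expr_reps) samples=50 evals=1 seconds=43200.0
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu($exprs, $X, $parameters; repetitions=$expr_reps) samples=50 evals=1 seconds=43200.0

results = run(suite, verbose=true)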
 \section{Results}
-talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and a CPU interpreter)
+\label{sec:results}
+talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and the CPU interpreter)
+BECAUSE OF RAM CONSTRAINTS, CACHING IS NOT USED TO THE FULL EXTENT, IN CONTRAST TO HOW IT IS EXPLAINED IN THE IMPLEMENTATION CHAPTER. I hope I can cache the frontend. If only the finished kernels cannot be cached, move this explanation to the transpiler section below and update the reference in subsubsection "System Memory".
 \subsection{Interpreter}
 Results only for Interpreter (also contains the final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
-\subsection{Performance tuning}
+\subsection{Performance Tuning}
 Document the process of performance tuning
-Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in kernel)
+Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
+1.) Done before parameter optimisation loop: frontend, transmitting Exprs and Variables (improved runtime)
+2.) Tuned blocksize to have as few wasted threads as possible (new blocksize 121 -> 3 blocks -> 363 threads, but only 362 threads needed per expression; see the dispatch sketch below)
-1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
-2.) Using @inbounds -> noticeable improvement in 2 out of 3
-3.) Tuned blocksize with NSight Compute -> slight improvement
-4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably because of much more waiting time, "latency hiding not working", basically, or more type conversions happening on the GPU? Look at the generated PTX code and use that as an argument to describe why it is slower)
-5.) Reverted the previous change; used fastmath instead -> improvement (large var set is now faster than on the transpiler)
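To connect these notes to the code, here is an annotated copy of the tuned dispatch loop from the Interpreter.jl diff above (a sketch; the bounds-check removal, the blocksize of 121 and fastmath correspond to the tuning steps listed here):

@inbounds Threads.@threads for i in 1:numExprs # no bounds checks in the hot dispatch loop; multithreaded dispatch
	numThreads = min(variableColumns, 121) # tuned blocksize: 3 blocks x 121 = 363 threads for the 362 variable sets per expression
	numBlocks = cld(variableColumns, numThreads)
	# fastmath trades strict IEEE 754 semantics for faster floating-point instructions
	@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
end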
 \subsection{Transpiler}
 Results only for Transpiler (also contains the final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
-\subsection{Performance tuning}
+\subsection{Performance Tuning}
 Document the process of performance tuning
-Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled
+Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
-1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
-2.) Using @inbounds -> small improvement, only in CPU-side code
-3.) Tuned blocksize with NSight Compute -> slight improvement
-4.) Only changed things on the interpreter side
-5.) Only changed things on the interpreter side
+1.) Done before parameter optimisation loop: frontend, transmitting Exprs and Variables (improved runtime)
+2.) All expressions to execute are transpiled first (previously they were transpiled on every execution, even in parameter optimisation scenarios). Compilation is still done on every call because too little RAM was available; since compilation takes the most time, this is only a minor boost (see the sketch below)
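A condensed sketch of the split that step 2 describes, mirroring the evaluate_gpu and Transpiler.evaluate diffs above (names taken from those diffs): PTX generation is hoisted out of the parameter-optimisation loop, while driver compilation stays inside it because the compiled kernels cannot be cached in 16 GB of RAM:

# Hoisted out of the loop: transpile every expression to PTX exactly once
ptxKernels = [Transpiler.transpile(ExpressionProcessing.expr_to_postfix(e),
                  variableSetSize, largestParameterSetSize, numVariableSets,
                  i-1, kernelName)
              for (i, e) in enumerate(expressions)]
for step in 1:repetitions # parameter-optimisation loop
    # Still inside the loop: driver compilation (CompileKernel) and launch happen on every call
    results = Transpiler.evaluate(ptxKernels, variables, numVariableSets, p, kernelName)
end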
 \subsection{Comparison}
 Comparison of the Interpreter and Transpiler, as well as a comparison of the two with the CPU interpreter

Binary file not shown.