Compare commits: 5bada5ffcb ... 0-initial-
1 commit: 6bcc9000b1
@ -9,7 +9,6 @@ include("Code.jl")
|
|||||||
include("CpuInterpreter.jl")
|
include("CpuInterpreter.jl")
|
||||||
end
|
end
|
||||||
|
|
||||||
using CUDA
|
|
||||||
using ..ExpressionProcessing
|
using ..ExpressionProcessing
|
||||||
|
|
||||||
export interpret_gpu,interpret_cpu
|
export interpret_gpu,interpret_cpu
|
||||||
@ -23,51 +22,36 @@ export evaluate_gpu
 #

 # Evaluate Expressions on the GPU
-function interpret_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
-    @assert axes(expressions) == axes(p)
+    @assert axes(exprs) == axes(p)
-    variableCols = size(X, 2)
+    ncols = size(X, 2)
-    variableRows = size(X, 1)

-    variables = CuArray(X)
+    results = Matrix{Float32}(undef, ncols, length(exprs))
+    # TODO: create CuArray for variables here already, as they never change
+    # could/should be done even before calling this, but I guess it would be diminishing returns
+    # TODO: test how this would impact performance, if it gets faster, adapt implementation section
+    # TODO: create CuArray for expressions here already. They also do not change over the course of parameter optimisation and therefore a lot of unnecessary calls to expr_to_postfix can be save (even though a cache is used, this should still be faster)

-    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
-    @inbounds Threads.@threads for i in eachindex(expressions)
-        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
-    end
-    cudaExprs = Utils.create_cuda_array(exprs, ExpressionProcessing.ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression;
-    exprsLength = length(exprs)
-    exprsInnerLength = Utils.get_max_inner_length(exprs)

-    results = Matrix{Float32}(undef, variableCols, length(exprs))
     for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
-        results = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, variables, variableCols, variableRows, p)
+        results = Interpreter.interpret(exprs, X, p)
     end

     return results
 end

 # Convert Expressions to PTX Code and execute that instead
-function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
-    @assert axes(expressions) == axes(p)
+    @assert axes(exprs) == axes(p)
-    numVariableSets = size(X, 2) # nr. of columns of X
+    ncols = size(X, 2)
-    variableSetSize = size(X, 1) # nr. of rows of X

-    variables = CuArray(X)

-    largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into matrix. Will be nr. of rows in parameter matrix
+    results = Matrix{Float32}(undef, ncols, length(exprs))
+    # TODO: create CuArray for variables here already, as they never change
+    # could/should be done even before calling this, but I guess it would be diminishing returns
+    # TODO: test how this would impact performance, if it gets faster, adapt implementation section
+    # TODO: create CuArray for expressions here already. They also do not change over the course of parameter optimisation and therefore a lot of unnecessary calls to expr_to_postfix can be save (even though a cache is used, this should still be faster)

-    ptxKernels = Vector{String}(undef, length(expressions))
-    kernelName = "evaluate_gpu"
-    @inbounds Threads.@threads for i in eachindex(expressions)
-        ex = ExpressionProcessing.expr_to_postfix(expressions[i])
-        ptxKernels[i] = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
-    end

-    results = Matrix{Float32}(undef, numVariableSets, length(expressions))
     for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
-        # evaluate
+        results = Transpiler.evaluate(exprs, X, p)
-        # results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p)
-        results = Transpiler.evaluate(ptxKernels, variables, numVariableSets, p, kernelName)
     end

     return results
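For orientation, a short usage sketch of the two entry points shown above (not part of the diff; the expressions, data shapes and parameter vectors are made-up values):

    exprs = [:(x1 + p1), :(p1 * x1 + p2)]
    X = rand(Float32, 2, 362)                # 2 variables, one column per variable set
    p = [rand(Float32, 1), rand(Float32, 2)] # one parameter vector per expression
    res_i = interpret_gpu(exprs, X, p; repetitions=100)  # GPU interpreter, 100 simulated optimisation steps
    res_t = evaluate_gpu(exprs, X, p; repetitions=100)   # GPU transpiler
    # Both return a Matrix{Float32} with one row per variable set and one column per expression.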
@ -109,6 +93,7 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
     res
 end


 # Flow
 # input: Vector expr == expressions contains eg. 4 expressions
 # Matrix X == |expr| columns, n rows. n == number of variabls x1..xn; n is the same for all expressions --- WRONG
@ -22,6 +22,7 @@ const PostfixType = Vector{ExpressionElement}
 "
 Converts a julia expression to its postfix notation.
 NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision.
+NOTE: This function is not thread save, especially cache access is not thread save
 "
 function expr_to_postfix(expression::Expr)::PostfixType
     expr = expression
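For illustration, a minimal call of the frontend function documented above (not part of the diff; the example expression is invented):

    # Convert one Julia expression into its postfix form before sending it to the GPU.
    postfix = ExpressionProcessing.expr_to_postfix(:(x1 + 2.0 * p1))
    # postfix is a PostfixType, i.e. a Vector{ExpressionElement}; 64-bit constants such as
    # 2.0 are narrowed to Float32, and the call is not thread safe because of the shared cache.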
@ -8,37 +8,39 @@ export interpret

 "Interprets the given expressions with the values provided.
 # Arguments
-- cudaExprs::CuArray{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form and already sent to the GPU. The type information in the signature is missing, because creating a CuArray{ExpressionProcessing.PostfixType} results in a mor everbose type definition
+- expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
-- cudaVars::CuArray{Float32} : The variables to use. Each column is mapped to the variables x1..xn. The type information is missing due to the same reasons as cudaExprs
+- variables::Matrix{Float32} : The variables to use. Each column is mapped to the variables x1..xn
 - parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
 - kwparam ```frontendCache```: The cache that stores the (partial) results of the frontend
 "
-function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
+function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
-    cudaVars, variableColumns::Integer, variableRows::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
+    @inbounds for i in eachindex(expressions)
-    cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
+        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
-    # put into seperate cuArray, as this is static and would be inefficient to send seperatly to each kernel
-    cudaStepsize = CuArray([exprsInnerLength, Utils.get_max_inner_length(parameters), variableRows]) # max num of values per expression; max nam of parameters per expression; number of variables per expression

-    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
-    cudaResults = CuArray{Float32}(undef, variableColumns, numExprs)

-    # Start kernel for each expression to ensure that no warp is working on different expressions
-    numThreads = min(variableColumns, 121)
-    numBlocks = cld(variableColumns, numThreads)

-    Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
-        @cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
     end

-    # Reduce GC pressure https://cuda.juliagpu.org/stable/usage/memory/#Avoiding-GC-pressure
+    variableCols = size(variables, 2) # number of variable sets to use for each expression
-    CUDA.unsafe_free!(cudaParams)
+    cudaVars = CuArray(variables)
-    CUDA.unsafe_free!(cudaStepsize)
+    cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
+    cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression;
+    # put into seperate cuArray, as this is static and would be inefficient to send seperatly to each kernel
+    cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression

+    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
+    cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))

+    # Start kernel for each expression to ensure that no warp is working on different expressions
+    @inbounds Threads.@threads for i in eachindex(exprs)
+        numThreads = min(variableCols, 256)
+        numBlocks = cld(variableCols, numThreads)

+        @cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
+    end

     return cudaResults
 end

-const MAX_STACK_SIZE = 10 # The depth of the stack to store the values and intermediate results
+const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
     varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
     @inbounds variableCols = length(variables) / stepsize[3] # number of variable sets
@ -96,6 +98,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
     elseif opcode == SQRT
         operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
     elseif opcode == INV
+        # operationStack[operationStackTop] = 1f0 / operationStack[operationStackTop]
         operationStack[operationStackTop] = inv(operationStack[operationStackTop])
     end
 else
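A small worked example of the launch-configuration arithmetic behind the removed `numThreads = min(variableColumns, 121)` line (the 362 variable sets come from the Nikuradse dataset used in the benchmarks; the snippet itself is only illustrative):

    variableColumns = 362                          # variable sets per expression
    numThreads = min(variableColumns, 121)         # 121 threads per block
    numBlocks  = cld(variableColumns, numThreads)  # cld(362, 121) = 3 blocks
    # 3 * 121 = 363 launched threads, so only one thread per expression sits idle;
    # with 256-thread blocks, cld(362, 256) = 2 blocks launch 512 threads, leaving 150 idle.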
@ -12,54 +12,77 @@ const Operand = Union{Float32, String} # Operand is either fixed value or regist
 - kwparam ```frontendCache```: The cache that stores the (partial) results of the frontend, to speedup the pre-processing
 - kwparam ```frontendCache```: The cache that stores the result of the transpilation. Useful for parameter optimisation, as the same expression gets executed multiple times
 "
-function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVars::CuArray{Float32}, variableColumns::Integer, variableRows::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+    varRows = size(variables, 1)
+    variableCols = size(variables, 2)
+    # kernels = Vector{CuFunction}(undef, length(expressions))

+    # TODO: test this again with multiple threads. The first time I tried, I was using only one thread
+    # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
+    # Threads.@threads for i in eachindex(expressions)
+    #     cacheLock = ReentrantLock()
+    #     cacheHit = false
+    #     lock(cacheLock) do
+    #         if haskey(transpilerCache, expressions[i])
+    #             kernels[i] = transpilerCache[expressions[i]]
+    #             cacheHit = true
+    #         end
+    #     end

+    #     if cacheHit
+    #         continue
+    #     end

+    #     formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])

+    #     kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing

+    #     linker = CuLink()
+    #     add_data!(linker, "ExpressionProcessing", kernel)

+    #     image = complete(linker)

+    #     mod = CuModule(image)
+    #     kernels[i] = CuFunction(mod, "ExpressionProcessing")

+    #     @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
+    # end

+    cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
     cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

     # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
-    cudaResults = CuArray{Float32}(undef, variableColumns, length(expressions))
+    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))

-    threads = min(variableColumns, 256)
+    threads = min(variableCols, 256)
-    blocks = cld(variableColumns, threads)
+    blocks = cld(variableCols, threads)

     kernelName = "evaluate_gpu"
+    # TODO: Implement batching as a middleground between "transpile everything and then run" and "tranpile one run one" even though cudacall is async
     @inbounds Threads.@threads for i in eachindex(expressions)
-        kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
+        # if haskey(resultCache, expressions[i])
-        compiledKernel = CompileKernel(kernel, kernelName)
+        #     kernels[i] = resultCache[expressions[i]]
-        cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+        #     continue
-    end
+        # end

-    return cudaResults
-end

-"
-A simplified version of the evaluate function. It takes a list of already transpiled kernels to be executed. This should yield better performance, where the same expressions should be evaluated multiple times i.e. for parameter optimisation.
-"
-function evaluate(kernels::Vector{String}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}}, kernelName::String)::Matrix{Float32}

-    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

-    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
-    cudaResults = CuArray{Float32}(undef, nrOfVariableSets, length(kernels))

-    threads = min(nrOfVariableSets, 256)
-    blocks = cld(nrOfVariableSets, threads)

-    @inbounds Threads.@threads for i in eachindex(kernels)
-        compiledKernel = CompileKernel(kernels[i], kernelName)
-        cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-    end

-    return cudaResults
-end

-function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction
-    linker = CuLink()
-    add_data!(linker, kernelName, ptxKernel)

-    image = complete(linker)
+        formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-    mod = CuModule(image)
+        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
-    return CuFunction(mod, kernelName)
+        linker = CuLink()
+        add_data!(linker, kernelName, kernel)

+        image = complete(linker)
+        mod = CuModule(image)
+        compiledKernel = CuFunction(mod, kernelName)

+        cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+    end

+    # for kernel in kernels
+    #     cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+    # end

+    return cudaResults
 end

 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
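To illustrate the two-stage transpiler path described in the removed docstring ("takes a list of already transpiled kernels ... for parameter optimisation"), a sketch assembled from the signatures visible in this diff (not taken verbatim from the repository; variable names are assumptions):

    kernelName = "evaluate_gpu"
    postfix = ExpressionProcessing.expr_to_postfix.(expressions)
    # transpile each expression once, up front
    kernels = [Transpiler.transpile(postfix[i], varRows, Utils.get_max_inner_length(parameters), variableCols, i-1, kernelName)
               for i in eachindex(postfix)]
    cudaVars = CuArray(variables)
    for _ in 1:100  # simulated parameter optimisation: only the parameters change between runs
        results = Transpiler.evaluate(kernels, cudaVars, variableCols, parameters, kernelName)
    end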
@ -21,16 +21,8 @@ parameters[2][1] = 5.0
 parameters[2][2] = 0.0

 function testHelper(expression::Expr, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}, expectedResult)
-    exprs = [ExpressionProcessing.expr_to_postfix(expression)]
+    exprs = Vector([expression])
-    cudaExprs = Utils.create_cuda_array(exprs, ExpressionProcessing.ExpressionElement(EMPTY, 0))
+    result = Interpreter.interpret(exprs, variables, parameters)
-    exprsLength = length(exprs)
-    exprsInnerLength = Utils.get_max_inner_length(exprs)

-    X = CuArray(variables)
-    variableCols = size(variables, 2)
-    variableRows = size(variables, 1)

-    result = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, X, variableCols, variableRows, parameters)

     expectedResult32 = convert(Float32, expectedResult)
     @test isequal(result[1,1], expectedResult32)
@ -135,16 +127,8 @@ end
 expr1 = :((x1 + 5) * p1 - 3 / abs(x2) + (2^4) - log(8))
 expr2 = :(1 + 5 * x1 - 10^2 + (p1 - p2) / 9 + exp(x2))

-exprs = [ExpressionProcessing.expr_to_postfix(expr1), ExpressionProcessing.expr_to_postfix(expr2)]
+exprs = Vector([expr1, expr2])
-cudaExprs = Utils.create_cuda_array(exprs, ExpressionProcessing.ExpressionElement(EMPTY, 0))
+result = Interpreter.interpret(exprs, var, param)
-exprsLength = length(exprs)
-exprsInnerLength = Utils.get_max_inner_length(exprs)

-X = CuArray(var)
-variableCols = size(var, 2)
-variableRows = size(var, 1)

-result = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, X, variableCols, variableRows, param)

 # var set 1
 @test isapprox(result[1,1], 37.32, atol=0.01) # expr1
@ -10,7 +10,6 @@ using .ExpressionProcessing

 include("parser.jl") # to parse expressions from a file

-# ATTENTAION: Evaluation information at the very bottom
 const BENCHMARKS_RESULTS_PATH = "./results-fh-new"

 # Number of expressions can get really big (into millions)
@ -57,8 +56,11 @@ suite = BenchmarkGroup()
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

+# cacheInterpreter = Dict{Expr, PostfixType}()
 suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)

+# cacheTranspilerFront = Dict{Expr, PostfixType}()
+# cacheTranspilerRes = Dict{Expr, CuFunction}()
 suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)

 # tune!(suite)
@ -66,12 +68,10 @@ suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameter
|
|||||||
|
|
||||||
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
|
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
|
||||||
|
|
||||||
results = run(suite, verbose=true, seconds=43200) # 12 hour timeout
|
results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
|
||||||
resultsCPU = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/cpu.json")[1]
|
resultsCPU = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/cpu.json")[1]
|
||||||
|
|
||||||
if compareWithCPU
|
if compareWithCPU
|
||||||
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-interpreter-smaller-stack-less-threadblock-allocations.json", results)
|
|
||||||
|
|
||||||
medianCPU = median(resultsCPU["CPU"])
|
medianCPU = median(resultsCPU["CPU"])
|
||||||
stdCPU = std(resultsCPU["CPU"])
|
stdCPU = std(resultsCPU["CPU"])
|
||||||
|
|
||||||
@ -104,6 +104,7 @@ if compareWithCPU
|
|||||||
println(gpuiVsGPUT_median)
|
println(gpuiVsGPUT_median)
|
||||||
println(gpuiVsGPUT_std)
|
println(gpuiVsGPUT_std)
|
||||||
|
|
||||||
|
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/0-initial.json", results)
|
||||||
else
|
else
|
||||||
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
||||||
# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
||||||
@ -138,8 +139,3 @@ else
|
|||||||
println(oldVsGPUT_std)
|
println(oldVsGPUT_std)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Initial implementation:
|
|
||||||
# - Interpreter: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
|
|
||||||
# - Transpiler: no cahce; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
|
|
@ -1,38 +1,30 @@
 using CUDA
-using DelimitedFiles
-using GZip

 using .Transpiler
 using .Interpreter

-include("parser.jl") # to parse expressions from a file
+varsets_medium = 10000
+X = randn(Float32, 5, varsets_medium)

+exprsGPU = [
+    # CPU interpreter requires an anonymous function and array ref s
+    :(p1 * x1 + p2), # 5 op
+    :((((x1 + x2) + x3) + x4) + x5), # 9 op
+    :(log(abs(x1))), # 3 op
+    :(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
+] # 30 op

-data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+# p is the same for CPU and GPU
-X = permutedims(convert(Matrix{Float32}, data))
+p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr

-exprs = Expr[]
-parameters = Vector{Vector{Float32}}()
-varnames = ["x$i" for i in 1:10]
-paramnames = ["p$i" for i in 1:20]
-# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
-# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exrps
-GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io
-    for line in eachline(io)
-        expr, p = parse_infix(line, varnames, paramnames)

-        push!(exprs, expr)
-        push!(parameters, randn(Float32, length(p)))
-    end
-end
 expr_reps = 1



 @testset "Interpreter Tuning" begin
-    CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
+    CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps)
 end


 @testset "Transpiler Tuning" begin
-    # CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
+    CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
 end
@ -41,15 +41,19 @@ parameters[2][1] = 5.0
 parameters[2][2] = 0.0
 parameters[3][1] = 16.0

+@testset "TEMP" begin
+    return
+    exprs = [:(x1 + p1)]
+    vars = Matrix{Float32}(undef, 1, 1)
+    params = Vector{Vector{Float32}}(undef, 1)

+    vars[1, 1] = 1
+    params[1] = [1]
+    Transpiler.evaluate(exprs, vars, params)
+end

 @testset "Test transpiler evaluation" begin
-    variableCols = size(variables, 2)
+    results = Transpiler.evaluate(expressions, variables, parameters)
-    variableRows = size(variables, 1)
-    X = CuArray(variables)

-    exprs = [ExpressionProcessing.expr_to_postfix(expressions[1]), ExpressionProcessing.expr_to_postfix(expressions[2]), ExpressionProcessing.expr_to_postfix(expressions[3])]

-    results = Transpiler.evaluate(exprs, X, variableCols, variableRows, parameters)

     # dump(expressions[3]; maxdepth=10)
     # Expr 1:
@ -1,194 +0,0 @@
(Deleted file: BenchmarkTools.jl results JSON, recorded with Julia 1.11.5 and BenchmarkTools 1.6.0. It held a "GPUT" trial for "nikuradse_1" with 1 sample of about 5.16e13 ns, 9578295211 allocations and 99694581250168 bytes of memory, and a "GPUI" trial with 50 samples between roughly 4.96e11 and 5.23e11 ns, 768768117 allocations and 54082719144 bytes. Benchmark parameters: 50 samples, 1 eval, 43200 s timeout.)
@ -1,196 +0,0 @@
(Deleted file: BenchmarkTools.jl results JSON, Julia 1.11.5, BenchmarkTools 1.6.0. "GPUT" trial for "nikuradse_1": 2 samples of about 3.58e13 and 3.65e13 ns, 1534044518 allocations, 51380856414712 bytes. "GPUI" trial: 50 samples between roughly 4.51e11 and 5.10e11 ns, 768767740 allocations, 54082712568 bytes. Benchmark parameters: 50 samples, 1 eval, 43200 s timeout.)
@ -1 +0,0 @@
(Deleted file: single-line BenchmarkTools.jl results JSON, Julia 1.11.5, BenchmarkTools 1.6.0. The "GPUT" group is empty; the "GPUI" trial for "nikuradse_1" has 50 samples between roughly 4.19e11 and 5.01e11 ns, 768766234 allocations and 54082704216 bytes. Benchmark parameters: 50 samples, 1 eval, 43200 s timeout.)
@ -1,113 +1,42 @@
 \chapter{Evaluation}
 \label{cha:evaluation}

-This thesis aims to determine whether one of the two GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation process. First, the environment in which the performance benchmarks are conducted is explained. Next the individual results for the GPU interpreter and transpiler are presented individually. This section also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to those of the CPU evaluator to answer the research questions of this thesis.
+The aim of this thesis is to determine whether at least one of the GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation. First, the environment in which the performance tests are performed is explained. Then the individual results for the GPU interpreter and the transpiler are presented. In addition, this part also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to the CPU evaluator in order to answer the research questions of this thesis.

-\section{Benchmark Environment}
+\section{Test environment}
-In this section, the benchmark environment used to evaluate the performance is outlined. To ensure the validity and reliability of the results, it is necessary to specify the details of the environment. This includes a description of the hardware and software configuration as well as the performance evaluation process. With this, the variance between the results is minimised, which allows for better reproducibility and comparability between the implementations.
+Explain the hardware used, as well as the actual data (how many expressions, variables etc.)

-\subsection{Hardware Configuration}
+three scenarios -> few, normal and many variable sets;; expr repetitions to simulate parameter optimisation
-The hardware configuration is the most important aspect of the benchmark environment. The capabilities of both the CPU and GPU can have a significant impact on the resulting performance. The following sections outline the importance of the individual components as well as the hardware used for the benchmarks.
+Benchmarktools.jl -> 1000 samples per scenario

-\subsubsection{GPU}
-Especially the GPU is important, as different microarchitectures typically require different optimisations. While the evaluators can generally run on any Nvidia GPU with a compute capability of at least 6.1, they are tuned for the Ampere microarchitecture with a compute capability of 8.6. Despite the evaluators being tuned for this microarchitecture, more modern ones can be used as well. However, additional tuning is required to ensure the evaluators can utilise the hardware to its fullest potential.

-Tuning must also be done on a per-problem basis. Especially the number of variable sets can impact how well the hardware is utilised. Therefore, it is important to see which configuration performs the best. In Section \ref{sec:results} a strategy for tuning the configuration to a new problem is described.

-\subsubsection{CPU}
-Although the GPU plays a crucial role, work is also carried out on the CPU. The interpreter mainly uses the CPU for data transfer and the pre-processing step and is therefore more GPU-bound. However, the transpiler additionally needs the CPU to perform the transpilation step. This step produces a kernel for each expression and also involves sending these kernels to the driver for compilation, a process which is also performed by the CPU. By contrast, the interpreter only has one kernel that needs to be converted into PTX and compiled by the driver only once. Consequently, the transpiler is much more CPU-bound and variations in the used CPU have a much greater impact. Therefore, using a more powerful CPU benefits the transpiler more than the interpreter.

-\subsubsection{System Memory}
-In addition to the hardware configuration of the GPU and CPU, system memory (RAM) also plays a crucial role. While RAM does not directly contribute to the overall performance, it can have a noticeable indirect impact due to its role in caching. Insufficient RAM forces the operating system to use the page file, which is stored on a much slower SSD. This results in slower cache access, thereby reducing the overall performance of the application.

-As seen in the list below, only 16 GB of RAM were available during the benchmarking process. This amount is insufficient to utilise caching to the extent outlined in Chapter \ref{cha:implementation}. More RAM was not available, which means some caching had to be disabled, which will be further explained in Section \ref{sec:results}.

-\subsubsection{Hardware}
-With the requirements explained above in mind, the following hardware is used to perform the benchmarks for the CPU-based evaluator, which was used as the baseline, as well as for the GPU-based evaluators:
-\begin{itemize}
-	\item Intel i5 12500
-	\item Nvidia RTX 3060 Ti
-	\item 16 GB 4400 MT/s DDR5 RAM
-\end{itemize}


-\subsection{Software Configuration}
-Apart from the hardware, the performance of the evaluators can also be significantly affected by the software. Primarily these three software components or libraries are involved in the performance:
-\begin{itemize}
-	\item GPU Driver
-	\item Julia
-	\item CUDA.jl
-\end{itemize}

-Typically, newer versions of these components include performance improvements, among other things. This is why it is important to specify the version which is used for benchmarking. The GPU driver has version \emph{561.17}, Julia has version \emph{1.11.5}, and CUDA.jl has version \emph{5.8.1}. As with the hardware configuration, this ensures that the results are reproducible and comparable to each other.


-\subsection{Performance Evaluation Process}
-With the hardware and software configuration being set, the process of benchmarking the implementations can be described. The process is designed to simulate the load and scenario these evaluators will be used in. The Nikuradse dataset \parencite{nikuradse_laws_1950} has been chosen as the source of the data. The dataset itself models the laws of flow in rough pipes and provides $362$ variable sets, with each set containing two variables. This dataset has first been used by \textcite{guimera_bayesian_2020} to benchmark a symbolic regression algorithm.

-Because only the evaluators are benchmarked, the expressions to be evaluated, need to already exist. Generating the expressions is done, using the exhaustive symbolic regression algorithm proposed by \textcite{bartlett_exhaustive_2024} and the Nikuradse dataset. This ensures that the expressions are exemplary of what needs to be evaluated in a real use-case.

-With roughly $250\,000$ expressions, the second-largest set has been used as the first benchmark. This means that all $250\,000$ expressions are evaluated in a single run, which is much more than what would be evaluated in a typical run. This benchmark is designed to show how the evaluators can handle large amounts of data. However, evaluating such high amount of expressions also has some drawbacks as will be explained in Section \ref{sec:results}.

-A second benchmark with slight adaptations to the first one is also performed. Because GPUs are very good at executing work in parallel, the number of variable sets is increased in this benchmark. Therefore, the second benchmark consists of the same $250\,000$ expressions, but the number of variable sets has been increased by a factor of four to a total of $1\,4448$.

-Lastly a third benchmark will be performed. This benchmark should mimic a realistic load. Therefore, the number of expressions has been reduced to roughly $10\,000$ and the number of variable sets is again $362$. The reason for this benchmark is to demonstrate how the evaluators will most likely perform in a typical run.

-All three benchmarks will also simulate a parameter optimisation step, as this is the scenario, these evaluators will be used in. For parameter optimisation, $100$ steps have been used. This means, that all expressions will be evaluated $100$ times. During the benchmark, this process is simulated by re-transmitting the parameters instead of generating new ones. Generating new parameters is not part of the evaluators and is therefore not implemented. However, because the parameters are re-transmitted every time, the overhead of sending the data is taken into account. This is part of the evaluators and additional overhead the CPU implementation does not have and is therefore important to be measured.

-\subsubsection{Measuring Performance}
-The performance measurements are taken, using the BenchmarkTools.jl\footnote{\url{https://juliaci.github.io/BenchmarkTools.jl/stable/}} package. It is the standard for benchmarking applications in Julia, which makes it an obvious choice for measuring the performance of the evaluators.

-It offers extensive support for measuring and comparing results of different implementations and versions of the same implementation. Benchmark groups allow to categorise the different implementations, take performance measurements and compare them. When taking performance measurements, it also supports setting a timeout and most importantly, set the number of samples to be taken. This is especially important, as it ensures to produce stable results by combating run-to-run variance. For this thesis, a sample size of $50$ has been used. This means that each of the previously-mentioned benchmarks, gets executed $50$ times.

 \section{Results}
-\label{sec:results}
+talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and a CPU interpreter)
-talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and the CPU interpreter)

-BECAUSE OF RAM CONSTRAINTS, CACHING IS NOT USED TO THE FULL EXTEND AS IN CONTRAST TO HOW IT IS EXPLAINED IN THE IMPLEMENTATION CHAPTER. I hope I can cache the frontend. If only the finished kernels can not be cached, move this explanation to the transpiler section below and update the reference in subsubsection "System Memory"

-% TODO: Do one run with
-% - 250k expressions
-% - increase variables to be 4 times as large (nr. of varsets should be 362 * 4)
-% - compare CPU with interpreter (probably also transpiler, but only to see if it takes even longer, or roughly the same considering that resources are still available on the GPU)
-% - This should demonstrate that bigger varsets lead to better performance (although I kinda doubt considering that the hardware is already fully utilised)

-% TODO: Do another run with
-% - 10 000 expressions choose the file that is closest to these 10k
-% - nr. var sets stays the same
-% - compare CPU, interpreter and transpiler
-% - do a second run with kernel compilation being performed before parameter optimisation step (as 10 000 expressions shouldn't fill up the memory as much)
-% - depending on how much time I have, also do a run with 4 times as much var sets (if this is done, adapt the above subsection "Performance Evaluation Process")

 \subsection{Interpreter}
 Results only for Interpreter (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section)
-\subsection{Performance Tuning}
+\subsection{Performance tuning}
 Document the process of performance tuning

-Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
+Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in kernel)

-1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
+1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
-2.) tuned blocksize to have as little wasted threads as possible (new blocksize 121 -> 3-blocks -> 363 threads but 362 threads needed per expression) (128 should lead to the same results. Talk here a bit what to look out for, so block-size should be a multiple of 32 and should divide the nr. of varsets as best as possible to a whole number without going over)
+2.) Using @inbounds -> noticeable improvement in 2 out of 3
-3.) Minor optimisations. Reduced stacksize; reduced memory allocations on the CPU; reduced GC pressure
+3.) Tuned blocksize with NSight compute -> slight improvement
+4.) used int32 everywhere to reduce register usage -> significant performance drop (probably because a lot more waiting time "latency hiding not working basically", or more type conversions happening on GPU? look at generated PTX code and use that as an argument to describe why it is slower)
-CPU and GPU are almost all the time at 100\% utilisation (GPU every now and then drops to 70\%), meaning it is quite balanced.
+5.) reverted previous; used fastmath instead -> imporvement (large var set is now faster than on transpiler)
-Uncached but multithreaded frontend only makes up a small percentage of the total runtime (optimisations there are not really needed, which is good because enabling caching took up too much RAM)
-Most of the time is spent doing the parameter optimisation step

 \subsection{Transpiler}
 Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section
+\subsection{Performance tuning}


-\subsection{Performance Tuning}
 Document the process of performance tuning

-Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
+Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking enabled

-1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
+1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
-2.) All expressions to execute are transpiled first (before they were transpiled for every execution, even in parameter optimisation scenarios). Compilation is still done every time, because too little RAM was available (compilation takes the most time, so this is only a minor boost). Also tried blocksize of 121. However, kernel itself is very fast anyway, so this didn't make a difference (further proof that the CPU is the bottleneck here)
+2.) Using @inbounds -> small improvement only on CPU side code
+3.) Tuned blocksize with NSight compute -> slight improvement
-CPU at 100\% GPU at around 30\%. Heavily CPU bottlenecked. Mainly due to PTX compilation taking by far the longest (while kernels are finished more or less instantly)
+4.) Only changed things on interpreter side
+5.) Only changed things on interpreter side

 \subsection{Comparison}
 Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
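To make the measurement setup in the removed "Measuring Performance" subsection concrete, a minimal BenchmarkTools.jl sketch along the lines of the benchmark script shown earlier in this diff (the output file name and the surrounding data loading are assumptions):

    using BenchmarkTools

    suite = BenchmarkGroup()
    suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
    suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

    # one benchmark per evaluator; exprs, X_t, parameters and expr_reps are assumed to be prepared beforehand
    suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
    suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)

    # 50 samples per benchmark and a 12 hour timeout, as used in the runs above
    results = run(suite, verbose=true, samples=50, seconds=43200)
    BenchmarkTools.save("results.json", results)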
BIN thesis/main.pdf (Binary file not shown.)
@ -400,7 +400,6 @@
 author = {Winter, Martin and Parger, Mathias and Mlakar, Daniel and Steinberger, Markus},
 urldate = {2025-02-27},
 date = {2021-02-17},
-file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\UURX5BER\\Winter et al. - 2021 - Are dynamic memory managers on GPUs slow a survey and benchmarks.pdf:application/pdf},
 }

 @article{bartlett_exhaustive_2024,
@ -1254,28 +1253,6 @@
 author = {Faingnaert, Thomas and Besard, Tim and De Sutter, Bjorn},
 urldate = {2025-04-20},
 date = {2022-09},
-keywords = {Graphics processing units, Kernel, Programming, Instruction sets, Codes, graphics processors, high-level programming languages, Libraries, Matrix multiplication, Productivity},
+keywords = {Codes, Graphics processing units, graphics processors, high-level programming languages, Instruction sets, Kernel, Libraries, Matrix multiplication, Productivity, Programming},
 file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\QCJ6LSF3\\Faingnaert et al. - 2022 - Flexible Performant GEMM Kernels on GPUs.pdf:application/pdf},
 }

-@report{nikuradse_laws_1950,
-	title = {Laws of Flow in Rough Pipes},
-	url = {https://digital.library.unt.edu/ark:/67531/metadc63009/},
-	author = {Nikuradse, J.},
-	date = {1950-11},
-}

-@article{guimera_bayesian_2020,
-	title = {A Bayesian machine scientist to aid in the solution of challenging scientific problems},
-	volume = {6},
-	url = {https://www.science.org/doi/10.1126/sciadv.aav6971},
-	doi = {10.1126/sciadv.aav6971},
-	abstract = {Closed-form, interpretable mathematical models have been instrumental for advancing our understanding of the world; with the data revolution, we may now be in a position to uncover new such models for many systems from physics to the social sciences. However, to deal with increasing amounts of data, we need “machine scientists” that are able to extract these models automatically from data. Here, we introduce a Bayesian machine scientist, which establishes the plausibility of models using explicit approximations to the exact marginal posterior over models and establishes its prior expectations about models by learning from a large empirical corpus of mathematical expressions. It explores the space of models using Markov chain Monte Carlo. We show that this approach uncovers accurate models for synthetic and real data and provides out-of-sample predictions that are more accurate than those of existing approaches and of other nonparametric methods.},
-	pages = {eaav6971},
-	number = {5},
-	journaltitle = {Science Advances},
-	author = {Guimerà, Roger and Reichardt, Ignasi and Aguilar-Mogas, Antoni and Massucci, Francesco A. and Miranda, Manuel and Pallarès, Jordi and Sales-Pardo, Marta},
-	urldate = {2025-05-21},
-	date = {2020-01-31},
-	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\HLG9FD4H\\Guimerà et al. - 2020 - A Bayesian machine scientist to aid in the solution of challenging scientific problems.pdf:application/pdf},
-}