benchmarking: updated transpiler to drastically reduce the number of transpilations at the expense of memory usage

2025-05-19 11:39:49 +02:00
parent 33e7edd4c8
commit f33551e25f
4 changed files with 48 additions and 69 deletions
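In short: instead of transpiling every expression on each evaluate call, each expression is now transpiled and compiled to a CuFunction exactly once, before the parameter-optimisation loop, and only the cached kernels are launched inside it. A minimal sketch of the resulting calling pattern, using the names from this diff (postfixExprs stands in for the already pre-processed expressions):

# once, up front: postfix expression -> PTX -> CuFunction, one kernel per expression
compiledKernels = map(enumerate(postfixExprs)) do (i, ex)
    ptx = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, "evaluate_gpu")
    Transpiler.CompileKernel(ptx, "evaluate_gpu")
end

# many times, inside the local-search loop: launch only, no transpilation
for step in 1:repetitions
    results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p)
end

Because p changes in small steps between iterations, the iterations must run sequentially; that is why only the kernel launches remain inside the loop.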

View File

@@ -49,19 +49,26 @@ end
 # Convert Expressions to PTX Code and execute that instead
 function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
     @assert axes(expressions) == axes(p)
-    variableCols = size(X, 2)
-    variableRows = size(X, 1)
+    numVariableSets = size(X, 2) # nr. of columns of X
+    variableSetSize = size(X, 1) # nr. of rows of X
     variables = CuArray(X)

-    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
+    largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into a matrix; this will be the nr. of rows of the parameter matrix
+    compiledKernels = Vector{CuFunction}(undef, length(expressions))
+    kernelName = "evaluate_gpu"
     @inbounds Threads.@threads for i in eachindex(expressions)
-        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+        ex = ExpressionProcessing.expr_to_postfix(expressions[i])
+        ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
+        compiledKernels[i] = Transpiler.CompileKernel(ptxKernel, kernelName)
     end

-    results = Matrix{Float32}(undef, variableCols, length(exprs))
+    results = Matrix{Float32}(undef, numVariableSets, length(expressions))
     for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
-        results = Transpiler.evaluate(exprs, variables, variableCols, variableRows, p)
+        # results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p)
+        results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p)
     end

     return results
@@ -103,7 +110,6 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
     res
 end

-
 # Flow
 # input: Vector expr == expressions contains e.g. 4 expressions
 # Matrix X == |expr| columns, n rows. n == number of variables x1..xn; n is the same for all expressions --- WRONG
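From the caller's point of view nothing changes; the repetitions keyword still simulates the parameter-optimisation loop. A hypothetical call (the expression and variable naming here are assumptions about what the frontend accepts):

exprs = [:(x1 + p1 * x2), :(x1 * x2 - p1)]   # hypothetical expressions
X = rand(Float32, 2, 1000)                   # 2 variables per set, 1000 variable sets (columns)
p = [rand(Float32, 1), rand(Float32, 1)]     # one parameter vector per expression
results = evaluate_gpu(exprs, X, p; repetitions=100)   # 1000×2 Matrix{Float32}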

View File

@@ -14,37 +14,6 @@ const Operand = Union{Float32, String} # Operand is either fixed value or register
 "
 function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVars::CuArray{Float32}, variableColumns::Integer, variableRows::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
-    # TODO: test this again with multiple threads. The first time I tried, I was using only one thread
-    # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
-    # Threads.@threads for i in eachindex(expressions)
-    #     cacheLock = ReentrantLock()
-    #     cacheHit = false
-    #     lock(cacheLock) do
-    #         if haskey(transpilerCache, expressions[i])
-    #             kernels[i] = transpilerCache[expressions[i]]
-    #             cacheHit = true
-    #         end
-    #     end
-    #     if cacheHit
-    #         continue
-    #     end
-    #     formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-    #     kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableColumns, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
-    #     linker = CuLink()
-    #     add_data!(linker, "ExpressionProcessing", kernel)
-    #     image = complete(linker)
-    #     mod = CuModule(image)
-    #     kernels[i] = CuFunction(mod, "ExpressionProcessing")
-    #     @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
-    # end
     cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

     # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
@@ -54,33 +23,44 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVar
     blocks = cld(variableColumns, threads)

     kernelName = "evaluate_gpu"
-    # TODO: Implement batching as a middleground between "transpile everything, then run" and "transpile one, run one", even though cudacall is async
     @inbounds Threads.@threads for i in eachindex(expressions)
-        # if haskey(resultCache, expressions[i])
-        #     kernels[i] = resultCache[expressions[i]]
-        #     continue
-        # end
-        # formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
         kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
+        compiledKernel = CompileKernel(kernel, kernelName)
-        linker = CuLink()
-        add_data!(linker, kernelName, kernel)
-        image = complete(linker)
-        mod = CuModule(image)
-        compiledKernel = CuFunction(mod, kernelName)

         cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
     end

-    # for kernel in kernels
-    #     cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-    # end
+    return cudaResults
+end
+
+"
+A simplified version of the evaluate function. It takes a list of already compiled kernels to execute. This should yield better performance where the same expressions are evaluated multiple times, i.e. for parameter optimisation.
+"
+function evaluate(kernels::Vector{CuFunction}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
+
+    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
+    cudaResults = CuArray{Float32}(undef, nrOfVariableSets, length(kernels))
+
+    threads = min(nrOfVariableSets, 256)
+    blocks = cld(nrOfVariableSets, threads)
+
+    @inbounds Threads.@threads for i in eachindex(kernels)
+        cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+    end
     return cudaResults
 end

+function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction
+    linker = CuLink()
+    add_data!(linker, kernelName, ptxKernel)
+    image = complete(linker)
+    mod = CuModule(image)
+
+    return CuFunction(mod, kernelName)
+end
+
 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 # seekstart(buf1); write(buf2, buf1)
 "

View File

@@ -59,13 +59,10 @@ Results only for Interpreter (also contains final kernel configuration and proba
 \subsection{Performance Tuning}
 Document the process of performance tuning

-Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in the kernel)
-1.) Blocksize reduced to a maximum of 256 -> moderate improvement for the medium and large var sets
-2.) Using @inbounds -> noticeable improvement in 2 out of 3
-3.) Tuned blocksize with NSight Compute -> slight improvement
-4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably much more waiting time, i.e. latency hiding no longer working, or more type conversions happening on the GPU? Look at the generated PTX code and use that as an argument to describe why it is slower)
-5.) Reverted previous; used fastmath instead -> improvement (the large var set is now faster than on the transpiler)
+Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
+1.) Done before parameter optimisation loop: frontend, transmitting Exprs and Variables (improved runtime)

 \subsection{Transpiler}
 Results only for Transpiler (also contains final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
@@ -75,13 +72,9 @@ Results only for Transpiler (also contains final kernel configuration and probab
 \subsection{Performance Tuning}
 Document the process of performance tuning

-Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled
-1.) Blocksize reduced to a maximum of 256 -> moderate improvement for the medium and large var sets
-2.) Using @inbounds -> small improvement, only in CPU-side code
-3.) Tuned blocksize with NSight Compute -> slight improvement
-4.) Only changed things on the interpreter side
-5.) Only changed things on the interpreter side
+Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
+1.) Done before parameter optimisation loop: frontend, transmitting Exprs and Variables (improved runtime)

 \subsection{Comparison}
 Comparison of Interpreter and Transpiler, as well as comparing the two with the CPU interpreter
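For reference, the "256 blocksize" named in both Initial states is the launch configuration visible in the diff above, with one GPU thread per variable set; the arithmetic, with illustrative numbers:

nrOfVariableSets = 1000
threads = min(nrOfVariableSets, 256)     # 256 threads per block (fewer if there is less work)
blocks = cld(nrOfVariableSets, threads)  # cld(1000, 256) == 4 blocks, the last one partially idle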

Binary file not shown.