benchmarking: prepared tests for using actual data

2025-05-09 13:58:10 +02:00
parent 2c8a9cd2d8
commit 327e4ebf1b
3 changed files with 46 additions and 90 deletions
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -23,8 +23,8 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	variableCols = size(variables, 2) # number of variable sets to use for each expression
 	cudaVars = CuArray(variables)
 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
-	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression; TODO: replace this 0 with 'undef' if possible
-	# put into seperate cuArray, as this is static and would be inefficient to send seperatly to every kernel
+	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression;
+	# put into seperate cuArray, as this is static and would be inefficient to send seperatly to each kernel
 	cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression

 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
@ -32,9 +32,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame

 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	@inbounds for i in eachindex(exprs)
-		# TODO: Currently only the first expression gets evaluated. Either use a view on "cudaExprs" to determine the correct expression or extend cudaStepsize to include this information (this information was removed in a previous commit)
-		# If a "view" is used, then the ExpressionProcessing must be updated to always include the stop opcode at the end
-		numThreads = min(variableCols, 128)
+		numThreads = min(variableCols, 256)
 		numBlocks = cld(variableCols, numThreads)

 		@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@ -43,7 +41,6 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	return cudaResults
 end

-#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
 	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -56,8 +56,6 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
 		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
 		
-		# println(kernel)
-
 		linker = CuLink()
 		add_data!(linker, "ExpressionProcessing", kernel)
 		
@ -77,7 +75,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 96)
+		threads = min(variableCols, 256)
 		blocks = cld(variableCols, threads)

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
@ -99,7 +97,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	ptxBuffer = IOBuffer()
 	regManager = Utils.RegisterManager(Dict(), Dict())

-	# TODO: Suboptimal solution
+	# TODO: Suboptimal solution. get_kernel_signature should also return the name of the registers used for the parameters, so further below, we do not have to hard-code them
 	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
 	guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)

@ -123,7 +121,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	return generatedCode
 end

-# TODO: Make version, target and address_size configurable; also see what address_size means exactly
+# TODO: Make version, target and address_size configurable
 function get_cuda_header()::String
 	return "
 .version 8.5