benchmarking: prepared tests for using actual data

Daniel 2025-05-09 13:58:10 +02:00
parent 2c8a9cd2d8
commit 327e4ebf1b
3 changed files with 46 additions and 90 deletions

View File

@@ -23,8 +23,8 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	variableCols = size(variables, 2) # number of variable sets to use for each expression
 	cudaVars = CuArray(variables)
 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
-	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression; TODO: replace this 0 with 'undef' if possible
-	# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
+	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
+	# put into separate cuArray, as this is static and would be inefficient to send separately to each kernel
 	cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
@@ -32,9 +32,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	@inbounds for i in eachindex(exprs)
-		# TODO: Currently only the first expression gets evaluated. Either use a view on "cudaExprs" to determine the correct expression or extend cudaStepsize to include this information (this information was removed in a previous commit)
-		# If a "view" is used, then the ExpressionProcessing must be updated to always include the stop opcode at the end
-		numThreads = min(variableCols, 128)
+		numThreads = min(variableCols, 256)
 		numBlocks = cld(variableCols, numThreads)
 		@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@@ -43,7 +41,6 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	return cudaResults
 end

-#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
 	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
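Because numBlocks is rounded up with cld, the last block can hold threads whose varSetIndex points past the last variable set, so the index has to be guarded before any memory access. A self-contained toy sketch of that launch-and-guard pattern (kernel body assumed, not from this commit):

using CUDA

# Toy kernel: surplus threads in the rounded-up last block exit before touching memory
function guarded_kernel(results::CuDeviceArray{Float32}, n::Int)
	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # 1-based global index
	if varSetIndex > n
		return
	end
	@inbounds results[varSetIndex] = Float32(varSetIndex)
	return
end

n = 1000
results = CUDA.zeros(Float32, n)
numThreads = min(n, 256)
numBlocks = cld(n, numThreads) # rounds up, so numThreads * numBlocks >= n
@cuda threads=numThreads blocks=numBlocks guarded_kernel(results, n)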

View File

@@ -56,8 +56,6 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
 		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
-
-		# println(kernel)
 		linker = CuLink()
 		add_data!(linker, "ExpressionProcessing", kernel)
@@ -77,7 +75,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 96)
+		threads = min(variableCols, 256)
 		blocks = cld(variableCols, threads)
 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
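The commented-out launch_configuration call above points at the alternative to a hard-coded thread count: let CUDA.jl's occupancy API suggest one. A sketch under the assumption that kernel is the CuFunction produced by the linker:

using CUDA

# Derive the launch shape from the occupancy API instead of a fixed 96/256
function pick_launch(kernel::CuFunction, variableCols::Int)
	config = launch_configuration(kernel)       # driver-suggested upper bound
	threads = min(variableCols, config.threads) # never launch more threads than work items
	blocks = cld(variableCols, threads)         # round up so every variable set is covered
	return threads, blocks
end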
@@ -99,7 +97,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	ptxBuffer = IOBuffer()
 	regManager = Utils.RegisterManager(Dict(), Dict())

-	# TODO: Suboptimal solution
+	# TODO: Suboptimal solution. get_kernel_signature should also return the name of the registers used for the parameters, so further below, we do not have to hard-code them
 	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
 	guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
@@ -123,7 +121,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	return generatedCode
 end

-# TODO: Make version, target and address_size configurable; also see what address_size means exactly
+# TODO: Make version, target and address_size configurable
 function get_cuda_header()::String
 	return "
.version 8.5
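A PTX module header consists of the .version, .target and .address_size directives (.address_size declares 32- or 64-bit pointers). A sketch of the configurability the TODO asks for; only ".version 8.5" is visible in this hunk, so the default target and address size below are assumptions:

# Hypothetical configurable header; defaults are assumptions, not repo values
function get_cuda_header(; version = "8.5", target = "sm_61", address_size = 64)::String
	return "
.version $version
.target $target
.address_size $address_size
"
end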

View File

@@ -4,48 +4,40 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter

-const BENCHMARKS_RESULTS_PATH = "./results-fh"
+const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
-# TODO: Expressions can get much much bigger (into millions) (will be provided by Mr. Kronberger)
-# TODO: Variable-Sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
-exprsCPU = [
-	# CPU interpreter requires an anonymous function and array refs
-	:(p[1] * x[1] + p[2]), # 5 op
-	:((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
-	:(log(abs(x[1]))), # 3 op
-	:(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
-] # 30 op
-exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)
-
-exprsGPU = [
-	# CPU interpreter requires an anonymous function and array refs
-	:(p1 * x1 + p2), # 5 op
-	:((((x1 + x2) + x3) + x4) + x5), # 9 op
-	:(log(abs(x1))), # 3 op
-	:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
-] # 30 op
-
-# p is the same for CPU and GPU
-p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
+# Number of expressions can get really big (into millions)
+# Variable-Sets: 1000 can be considered the minimum; 100,000 can be considered the maximum
+data, varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+X = convert(Matrix{Float32}, data)
+
+exprs = Expr[]
+parameters = Vector{Vector{Float32}}()
+varnames = ["x$i" for i in 1:10]
+paramnames = ["p$i" for i in 1:20]
+# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
+# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
+GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
+	i = 0
+	for line in eachline(io)
+		expr, p = parse_infix(line, varnames, paramnames)
+		if i > 10
+			return
+		end
+		println(expr)
+		push!(exprs, expr)
+		push!(parameters, randn(Float32, length(p)))
+		i += 1
+	end
+end

 expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)

-@testset "CPU performance" begin
-	# warmup
-	# interpret_cpu(exprsCPU, X, p)
-	# @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
-	# @btime test_cpu_interpreter(1000)
-	# @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])
-	# test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
-	# @btime test_cpu_interpreter(10000)
-	# @btime test_cpu_interpreter(10000, parallel=true)
-end
-
-@testset "Interpreter Performance" begin
+# TODO: Tips for tuning:
 	# Put data in shared memory:
 	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
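The shared-memory tip above can be made concrete. A minimal CUDA.jl toy (not part of this commit; size and body assumed) that stages one value per thread in block-shared storage before writing it out:

using CUDA

# Each thread stages a value in shared memory; sync_threads makes the writes block-visible
function shared_demo(out::CuDeviceArray{Float32})
	buf = CuStaticSharedArray(Float32, 256) # statically sized, one slot per thread
	i = threadIdx().x
	buf[i] = Float32(i)
	sync_threads()
	out[(blockIdx().x - 1) * blockDim().x + i] = buf[i]
	return
end

out = CUDA.zeros(Float32, 512)
@cuda threads=256 blocks=2 shared_demo(out)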
@@ -54,62 +46,31 @@
 	# Memory management like in C++ might help with performance improvements
 	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
-end
-
-@testset "Transpiler Performance" begin
-	# Put data in shared memory:
-	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
-	# Make array const:
-	# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
-	# Memory management like in C++ might help with performance improvements
-	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
-end
-
-# After these tests have been redone, use Nsight Compute/Systems as described here:
-#https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
+# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
 # Systems and Compute installable via WSL. Compute UI can even be used inside wsl
-# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re-do the tests and perform them on FH PCs)
-# University setup at 10.20.1.7 if needed
+# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (do the tests on FH PCs)
+# University setup at 10.20.1.7 and 10.20.1.13

-compareWithCPU = false
+compareWithCPU = true

 suite = BenchmarkGroup()
 suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

-# TODO: see CpuInterpreterTests.jl to see how all data is loaded and implement this here
-varsets_small = 1000 # 1k should be absolute minimum
-varsets_medium = 10000
-varsets_large = 100000 # 100k should be absolute maximum (although not as strict as minimum)
-
 if compareWithCPU
-	X_small = randn(Float32, varsets_small, 5)
-	suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
-	X_medium = randn(Float32, varsets_medium, 5)
-	suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
-	X_large = randn(Float32, varsets_large, 5)
-	suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
+	suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
 end

-X_small_GPU = randn(Float32, 5, varsets_small) # column-major
-suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
-suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
-
-X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
-suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
-suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
-
-X_large_GPU = randn(Float32, 5, varsets_large) # column-major
-suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
-suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
-
-# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
-# tune!(suite)
-# BenchmarkTools.save("params.json", params(suite))
+# TODO: Most likely need to transpose X matrix here, as we are expecting a column-major matrix for more efficient memory access
+suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
+
+for i in 1:10
+	tune!(suite)
+end
+BenchmarkTools.save("params.json", params(suite))

 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
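The tune!/save/loadparams! sequence above mirrors the parameter-caching idiom from the BenchmarkTools manual; a compact sketch of that idiom (file name taken from the diff, the isfile guard is an assumption):

using BenchmarkTools

# Tune once, persist the parameters, and reuse them on later runs
if isfile("params.json")
	loadparams!(suite, BenchmarkTools.load("params.json")[1], :evals, :samples)
else
	tune!(suite)
	BenchmarkTools.save("params.json", params(suite))
end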
@@ -148,7 +109,7 @@ if compareWithCPU
 	println(gpuiVsGPUT_median)
 	println(gpuiVsGPUT_std)

-	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
+	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/0_initial.json", results)
 else
 	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
 	# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
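The printed medians above come from comparing the interpreter and transpiler groups; assuming results is the BenchmarkGroup returned by run(suite), such a comparison can be expressed with BenchmarkTools' median/judge machinery (a sketch, not the repo's exact code):

using BenchmarkTools

# Run the suite and judge GPU interpreter vs. GPU transpiler medians
results = run(suite, verbose=true)
gpuiVsGPUT = judge(median(results["GPUI"]), median(results["GPUT"]))
println(gpuiVsGPUT)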