benchmarking: updated benchmarking suite and prepared for taking the benchmarks

Author: Daniel
Date: 2025-05-15 16:25:32 +02:00
parent 3d80ae95e4
commit d7e18f183d
6 changed files with 57 additions and 48 deletions


@@ -57,39 +57,25 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
     threads = min(variableCols, 256)
     blocks = cld(variableCols, threads)
+    kernelName = "evaluate_gpu"
     # TODO: Implement batching as a middle ground between "transpile everything and then run" and "transpile one, run one", even though cudacall is async
-    @inbounds for i in eachindex(expressions)
+    @inbounds Threads.@threads for i in eachindex(expressions)
         # if haskey(resultCache, expressions[i])
         #     kernels[i] = resultCache[expressions[i]]
         #     continue
         # end
         formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because Julia is 1-based but PTX needs 0-based indexing
+        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
-        # try
         linker = CuLink()
-        add_data!(linker, "ExpressionProcessing", kernel)
+        add_data!(linker, kernelName, kernel)
         image = complete(linker)
         mod = CuModule(image)
-        compiledKernel = CuFunction(mod, "ExpressionProcessing")
+        compiledKernel = CuFunction(mod, kernelName)
-        # kernels[i] = CuFunction(mod, "ExpressionProcessing")
-        # resultCache[expressions[i]] = kernels[i]
-        # catch
-        #     dump(expressions[i]; maxdepth=10)
-        #     println()
-        #     println()
-        #     println(kernel)
-        #     println()
-        #     println()
-        #     error(current_exceptions())
-        # end
         cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
     end
     # for kernel in kernels
@@ -107,13 +93,13 @@ end
 - param ```expressionIndex```: The 0-based index of the expression
 "
 function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
-    nrOfVariableSets::Integer, expressionIndex::Integer)::String
+    nrOfVariableSets::Integer, expressionIndex::Integer, kernelName::String)::String
     exitJumpLocationMarker = "L__BB0_2"
     ptxBuffer = IOBuffer()
     regManager = Utils.RegisterManager(Dict(), Dict())
     # TODO: Suboptimal solution. get_kernel_signature should also return the name of the registers used for the parameters, so further below, we do not have to hard-code them
-    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
+    signature, paramLoading = get_kernel_signature(kernelName, [Float32, Float32, Float32], regManager) # Vars, Params, Results
     guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
     println(ptxBuffer, get_cuda_header())
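
Note on the batching TODO in the evaluate loop above: a middle ground between "transpile everything, then run" and "transpile one, run one" could chunk the expressions, transpile a whole chunk, then dispatch that chunk before moving on. The sketch below is only illustrative and not part of the commit; it assumes the package's ExpressionProcessing.expr_to_postfix, Utils.get_max_inner_length and transpile are available as shown in the diff, and the function name evaluate_batched and the batchsize default are made up for the example.

using CUDA # CuLink, add_data!, complete, CuModule, CuFunction, cudacall

# Illustrative only: batch-wise transpile-then-dispatch as a middle ground.
function evaluate_batched(expressions, varRows, parameters, variableCols,
                          cudaVars, cudaParams, cudaResults;
                          batchsize=32, threads=256, blocks=1, kernelName="evaluate_gpu")
    for chunk in Iterators.partition(eachindex(expressions), batchsize)
        # Phase 1: transpile every expression of the chunk to PTX.
        ptxKernels = map(chunk) do i
            formatted = ExpressionProcessing.expr_to_postfix(expressions[i])
            transpile(formatted, varRows, Utils.get_max_inner_length(parameters),
                      variableCols, i - 1, kernelName) # 0-based index for PTX
        end
        # Phase 2: link and launch the whole chunk. cudacall returns as soon as the
        # kernel is queued, so the GPU can still be working on this chunk while the
        # next chunk is being transpiled on the CPU.
        for ptx in ptxKernels
            linker = CuLink()
            add_data!(linker, kernelName, ptx)
            mod = CuModule(complete(linker))
            fn = CuFunction(mod, kernelName)
            cudacall(fn, (CuPtr{Float32}, CuPtr{Float32}, CuPtr{Float32}),
                     cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
        end
    end
end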


@@ -78,4 +78,34 @@ function test_cpu_interpreter_nikuradse()
 end
-@test test_cpu_interpreter_nikuradse()
+# @test test_cpu_interpreter_nikuradse()
+data, varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+X = convert(Matrix{Float32}, data)
+X_t = permutedims(X) # for gpu
+exprs = Expr[]
+parameters = Vector{Vector{Float32}}()
+varnames = ["x$i" for i in 1:10]
+paramnames = ["p$i" for i in 1:20]
+# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
+# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
+GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
+    for line in eachline(io)
+        expr, p = parse_infix(line, varnames, paramnames)
+        push!(exprs, expr)
+        push!(parameters, randn(Float32, length(p)))
+    end
+end
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
+suite = BenchmarkGroup()
+suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
+suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
+loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
+BenchmarkTools.save("./results-fh-new/cpu.json", results)
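
For context, the loadparams! call above relies on benchmark parameters that were tuned once and saved to params.json, so this CPU run and the GPU runs in the other script use identical settings. A minimal, self-contained BenchmarkTools.jl sketch of that tune-once/reuse pattern follows; the workload and file names are placeholders, not the project's.

using BenchmarkTools

suite = BenchmarkGroup()
suite["demo"] = BenchmarkGroup()
suite["demo"]["sum"] = @benchmarkable sum(x) setup=(x = rand(1000)) # placeholder workload

# One-time tuning run: determine per-benchmark evals/samples and persist them.
tune!(suite)
BenchmarkTools.save("tuned_params.json", params(suite))

# Later runs (possibly on another machine or for another backend) reuse the
# stored parameters instead of re-tuning, keeping results comparable.
loadparams!(suite, BenchmarkTools.load("tuned_params.json")[1], :evals, :samples)
results = run(suite, verbose=true)
BenchmarkTools.save("demo_results.json", results)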


@@ -50,37 +50,30 @@ expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially;
 # Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (do the tests on FH PCs)
 # University setup at 10.20.1.7 and 10.20.1.13
-compareWithCPU = false
+compareWithCPU = true
 suite = BenchmarkGroup()
-suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
-if compareWithCPU
-    suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
-    suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
-end
 # cacheInterpreter = Dict{Expr, PostfixType}()
 suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
 # cacheTranspilerFront = Dict{Expr, PostfixType}()
 # cacheTranspilerRes = Dict{Expr, CuFunction}()
-suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
-tune!(suite)
-BenchmarkTools.save("params.json", params(suite))
-throw("finished tuning")
+# tune!(suite)
+# BenchmarkTools.save("params.json", params(suite))
 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
-results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
+results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
+resultsCPU = BenchmarkTools.load("./results-fh-new/cpu.json")[1]
 if compareWithCPU
-    medianCPU = median(results["CPU"])
-    stdCPU = std(results["CPU"])
+    medianCPU = median(resultsCPU["CPU"])
+    stdCPU = std(resultsCPU["CPU"])
     medianInterpreter = median(results["GPUI"])
     stdInterpreter = std(results["GPUI"])
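
Once the stored CPU results and the fresh GPU results are available, the medians can be compared directly with BenchmarkTools. A possible continuation of the comparison block above, assuming both groups contain the "nikuradse_1" entry (the tolerance value is illustrative):

    # Compare the GPU interpreter against the CPU baseline for the shared benchmark key.
    gpuVsCpu = judge(medianInterpreter["nikuradse_1"], medianCPU["nikuradse_1"]; time_tolerance=0.05)
    println("GPU interpreter vs CPU: ", gpuVsCpu) # reports improvement, regression or invariant
    println("Speedup (CPU time / GPU time): ", ratio(medianCPU["nikuradse_1"], medianInterpreter["nikuradse_1"]))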


@@ -1 +1 @@
-[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
+[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]


@@ -17,10 +17,10 @@ end
 @testset "CPU Interpreter" begin
-    # include("CpuInterpreterTests.jl")
+    include("CpuInterpreterTests.jl")
 end
 @testset "Performance tests" begin
     # include("PerformanceTuning.jl")
-    include("PerformanceTests.jl")
+    # include("PerformanceTests.jl")
 end


@@ -52,7 +52,7 @@ Based on the requirements and data structure above, the architecture of both pro
     \label{fig:kernel_architecture}
 \end{figure}
-A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU.
+A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch, as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which therefore allows up to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression is the generation itself. Especially for smaller variable sets, it is possible that the time it takes to transpile an expression is greater than the time it takes to evaluate it. For larger variable sets, however, this should not be a concern.
 \subsection{Pre-Processing}
 \label{sec:pre-processing}
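
As an aside to the paragraph added in this hunk: the one-kernel-per-expression dispatch it describes can be illustrated with a small, self-contained CUDA.jl sketch that is independent of the project's transpiler. The placeholder kernel, the sizes, and the use of one Julia task per launch are assumptions for the example; CUDA.jl gives each Julia task its own stream, which is what lets the independent grids of different "expressions" overlap.

using CUDA

# Placeholder "expression kernel": the real transpiler emits one PTX kernel per
# expression; a simple Julia GPU kernel stands in for it here.
function expr_kernel(out, x, scale)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = scale * x[i] + 1.0f0
    end
    return nothing
end

vars    = CUDA.rand(Float32, 10_000)
nExprs  = 8
results = [CUDA.zeros(Float32, length(vars)) for _ in 1:nExprs]
threads = 256
blocks  = cld(length(vars), threads)

# One kernel dispatch per "expression"; each launch happens from its own task,
# and therefore on its own stream, so the grids may execute concurrently.
@sync for e in 1:nExprs
    Threads.@spawn @cuda threads=threads blocks=blocks expr_kernel(results[e], vars, Float32(e))
end
CUDA.device_synchronize() # wait until all grids have finished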