benchmarking: prepared tests for using actual data

Daniel 2025-05-09 13:58:10 +02:00
parent 2c8a9cd2d8
commit 327e4ebf1b
3 changed files with 46 additions and 90 deletions

View File

@@ -23,8 +23,8 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	variableCols = size(variables, 2) # number of variable sets to use for each expression
 	cudaVars = CuArray(variables)
 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
-	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression; TODO: replace this 0 with 'undef' if possible
-	# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
+	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
+	# put into separate cuArray, as this is static and would be inefficient to send separately to each kernel
 	cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
@@ -32,9 +32,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	@inbounds for i in eachindex(exprs)
-		# TODO: Currently only the first expression gets evaluated. Either use a view on "cudaExprs" to determine the correct expression or extend cudaStepsize to include this information (this information was removed in a previous commit)
-		# If a "view" is used, then the ExpressionProcessing must be updated to always include the stop opcode at the end
-		numThreads = min(variableCols, 128)
+		numThreads = min(variableCols, 256)
 		numBlocks = cld(variableCols, numThreads)
 		@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@@ -43,7 +41,6 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	return cudaResults
 end

-#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
 	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
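Because numBlocks is rounded up with cld, the last block can hold threads whose varSetIndex points past the last variable set, so the index has to be guarded before any memory access. A self-contained toy sketch of that launch-and-guard pattern (kernel body assumed, not from this commit):

using CUDA

# Toy kernel: surplus threads in the rounded-up last block exit before touching memory
function guarded_kernel(results::CuDeviceArray{Float32}, n::Int)
	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # 1-based global index
	if varSetIndex > n
		return
	end
	@inbounds results[varSetIndex] = Float32(varSetIndex)
	return
end

n = 1000
results = CUDA.zeros(Float32, n)
numThreads = min(n, 256)
numBlocks = cld(n, numThreads) # rounds up, so numThreads * numBlocks >= n
@cuda threads=numThreads blocks=numBlocks guarded_kernel(results, n)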

View File

@@ -56,8 +56,6 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
 		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
-
-		# println(kernel)
 		linker = CuLink()
 		add_data!(linker, "ExpressionProcessing", kernel)
@@ -77,7 +75,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 96)
+		threads = min(variableCols, 256)
 		blocks = cld(variableCols, threads)
 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
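The commented-out launch_configuration call above points at the alternative to a hard-coded thread count: let CUDA.jl's occupancy API suggest one. A sketch under the assumption that kernel is the CuFunction produced by the linker:

using CUDA

# Derive the launch shape from the occupancy API instead of a fixed 96/256
function pick_launch(kernel::CuFunction, variableCols::Int)
	config = launch_configuration(kernel)       # driver-suggested upper bound
	threads = min(variableCols, config.threads) # never launch more threads than work items
	blocks = cld(variableCols, threads)         # round up so every variable set is covered
	return threads, blocks
end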
@@ -99,7 +97,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	ptxBuffer = IOBuffer()
 	regManager = Utils.RegisterManager(Dict(), Dict())

-	# TODO: Suboptimal solution
+	# TODO: Suboptimal solution. get_kernel_signature should also return the name of the registers used for the parameters, so further below, we do not have to hard-code them
 	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
 	guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
@@ -123,7 +121,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	return generatedCode
 end

-# TODO: Make version, target and address_size configurable; also see what address_size means exactly
+# TODO: Make version, target and address_size configurable
 function get_cuda_header()::String
 	return "
.version 8.5
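A PTX module header consists of the .version, .target and .address_size directives (.address_size declares 32- or 64-bit pointers). A sketch of the configurability the TODO asks for; only ".version 8.5" is visible in this hunk, so the default target and address size below are assumptions:

# Hypothetical configurable header; defaults are assumptions, not repo values
function get_cuda_header(; version = "8.5", target = "sm_61", address_size = 64)::String
	return "
.version $version
.target $target
.address_size $address_size
"
end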

View File

@@ -4,48 +4,40 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter

-const BENCHMARKS_RESULTS_PATH = "./results-fh"
+const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
-# TODO: Expressions can get much much bigger (into millions) (will be provided by Mr. Kronberger)
-# TODO: Variable-Sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
-exprsCPU = [
-	# CPU interpreter requires an anonymous function and array refs
-	:(p[1] * x[1] + p[2]), # 5 op
-	:((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
-	:(log(abs(x[1]))), # 3 op
-	:(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
-] # 30 op
-exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)
-
-exprsGPU = [
-	# CPU interpreter requires an anonymous function and array refs
-	:(p1 * x1 + p2), # 5 op
-	:((((x1 + x2) + x3) + x4) + x5), # 9 op
-	:(log(abs(x1))), # 3 op
-	:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
-] # 30 op
-
-# p is the same for CPU and GPU
-p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
+# Number of expressions can get really big (into millions)
+# Variable-Sets: 1000 can be considered the minimum; 100,000 can be considered the maximum
+data, varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
+X = convert(Matrix{Float32}, data)
+
+exprs = Expr[]
+parameters = Vector{Vector{Float32}}()
+varnames = ["x$i" for i in 1:10]
+paramnames = ["p$i" for i in 1:20]
+# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
+# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
+GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
+	i = 0
+	for line in eachline(io)
+		expr, p = parse_infix(line, varnames, paramnames)
+		if i > 10
+			return
+		end
+		println(expr)
+		push!(exprs, expr)
+		push!(parameters, randn(Float32, length(p)))
+		i += 1
+	end
+end

 expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)

-@testset "CPU performance" begin
-	# warmup
-	# interpret_cpu(exprsCPU, X, p)
-	# @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
-	# @btime test_cpu_interpreter(1000)
-	# @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])
-	# test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
-	# @btime test_cpu_interpreter(10000)
-	# @btime test_cpu_interpreter(10000, parallel=true)
-end
-
-@testset "Interpreter Performance" begin
+# TODO: Tips for tuning:
 	# Put data in shared memory:
 	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
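The shared-memory tip above can be made concrete. A minimal CUDA.jl toy (not part of this commit; size and body assumed) that stages one value per thread in block-shared storage before writing it out:

using CUDA

# Each thread stages a value in shared memory; sync_threads makes the writes block-visible
function shared_demo(out::CuDeviceArray{Float32})
	buf = CuStaticSharedArray(Float32, 256) # statically sized, one slot per thread
	i = threadIdx().x
	buf[i] = Float32(i)
	sync_threads()
	out[(blockIdx().x - 1) * blockDim().x + i] = buf[i]
	return
end

out = CUDA.zeros(Float32, 512)
@cuda threads=256 blocks=2 shared_demo(out)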
@@ -54,62 +46,31 @@
 	# Memory management like in C++ might help with performance improvements
 	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
-end
-
-@testset "Transpiler Performance" begin
-	# Put data in shared memory:
-	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
-	# Make array const:
-	# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
-	# Memory management like in C++ might help with performance improvements
-	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
-end
-
-# After these tests have been redone, use Nsight Compute/Systems as described here:
-#https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
+# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
 # Systems and Compute installable via WSL. Compute UI can even be used inside wsl
-# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re-do the tests and perform them on FH PCs)
-# University setup at 10.20.1.7 if needed
+# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (do the tests on FH PCs)
+# University setup at 10.20.1.7 and 10.20.1.13

-compareWithCPU = false
+compareWithCPU = true

 suite = BenchmarkGroup()
 suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

-# TODO: see CpuInterpreterTests.jl to see how all data is loaded and implement this here
-varsets_small = 1000 # 1k should be absolute minimum
-varsets_medium = 10000
-varsets_large = 100000 # 100k should be absolute maximum (although not as strict as minimum)
-
 if compareWithCPU
-	X_small = randn(Float32, varsets_small, 5)
-	suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
-	X_medium = randn(Float32, varsets_medium, 5)
-	suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
-	X_large = randn(Float32, varsets_large, 5)
-	suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
+	suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
 end

-X_small_GPU = randn(Float32, 5, varsets_small) # column-major
-suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
-suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
-
-X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
-suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
-suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
-
-X_large_GPU = randn(Float32, 5, varsets_large) # column-major
-suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
-suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
-
-# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
-# tune!(suite)
-# BenchmarkTools.save("params.json", params(suite))
+# TODO: Most likely need to transpose X matrix here, as we are expecting a column-major matrix for more efficient memory access
+suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
+
+for i in 1:10
+	tune!(suite)
+end
+BenchmarkTools.save("params.json", params(suite))

 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
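The tune!/save/loadparams! sequence above mirrors the parameter-caching idiom from the BenchmarkTools manual; a compact sketch of that idiom (file name taken from the diff, the isfile guard is an assumption):

using BenchmarkTools

# Tune once, persist the parameters, and reuse them on later runs
if isfile("params.json")
	loadparams!(suite, BenchmarkTools.load("params.json")[1], :evals, :samples)
else
	tune!(suite)
	BenchmarkTools.save("params.json", params(suite))
end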
@@ -148,7 +109,7 @@ if compareWithCPU
 	println(gpuiVsGPUT_median)
 	println(gpuiVsGPUT_std)

-	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
+	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/0_initial.json", results)
 else
 	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
 	# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
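The printed medians above come from comparing the interpreter and transpiler groups; assuming results is the BenchmarkGroup returned by run(suite), such a comparison can be expressed with BenchmarkTools' median/judge machinery (a sketch, not the repo's exact code):

using BenchmarkTools

# Run the suite and judge GPU interpreter vs. GPU transpiler medians
results = run(suite, verbose=true)
gpuiVsGPUT = judge(median(results["GPUI"]), median(results["GPUT"]))
println(gpuiVsGPUT)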