benchmarking: tuned interpreter blocksize
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run

This commit is contained in:
2025-05-20 09:05:35 +02:00
parent a9ffd5da63
commit 250deb334c
5 changed files with 26 additions and 15 deletions

View File

@ -25,7 +25,7 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
# Start kernel for each expression to ensure that no warp is working on different expressions # Start kernel for each expression to ensure that no warp is working on different expressions
@inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance) @inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
numThreads = min(variableColumns, 256) numThreads = min(variableColumns, 121)
numBlocks = cld(variableColumns, numThreads) numBlocks = cld(variableColumns, numThreads)
@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i) @cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)

View File

@ -1,30 +1,38 @@
using CUDA using CUDA
using DelimitedFiles
using GZip
using .Transpiler using .Transpiler
using .Interpreter using .Interpreter
varsets_medium = 10000 include("parser.jl") # to parse expressions from a file
X = randn(Float32, 5, varsets_medium)
exprsGPU = [
# CPU interpreter requires an anonymous function and array refs
:(p1 * x1 + p2), # 5 op
:((((x1 + x2) + x3) + x4) + x5), # 9 op
:(log(abs(x1))), # 3 op
:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
] # 30 op
# p is the same for CPU and GPU data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr X = permutedims(convert(Matrix{Float32}, data))
exprs = Expr[]
parameters = Vector{Vector{Float32}}()
varnames = ["x$i" for i in 1:10]
paramnames = ["p$i" for i in 1:20]
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io
for line in eachline(io)
expr, p = parse_infix(line, varnames, paramnames)
push!(exprs, expr)
push!(parameters, randn(Float32, length(p)))
end
end
expr_reps = 1 expr_reps = 1
@testset "Interpreter Tuning" begin @testset "Interpreter Tuning" begin
CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps) # CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
end end
@testset "Transpiler Tuning" begin @testset "Transpiler Tuning" begin
CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps) CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
end end

View File

@ -0,0 +1 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":768767740,"gctimes":[1.4209871071e10,8.529233725e9,8.165943693e9,8.180014668e9,8.231263428e9,1.1110946388e10,1.3136749872e10,1.0515143897e10,1.2978886885e10,1.0709110363e10,1.2408937103e10,1.4486745203e10,1.3229416582e10,1.8353010658e10,1.32173253e10,1.1621004633e10,1.1136122325e10,9.614762707e9,1.4564265563e10,9.399404156e9,1.063983064e10,1.2513746965e10,9.039906393e9,1.2382209752e10,1.3127092115e10,1.2713843793e10,1.1111974511e10,1.5837882785e10,1.5005237417e10,1.2439743996e10,9.607861366e9,1.0680724758e10,1.4012997282e10,1.258804731e10,1.020862355e10,9.630750655e9,1.5428270551e10,1.746317266e10,1.3141055589e10,1.5009128259e10,8.453648604e9,1.6874341516e10,1.1411307067e10,1.2542892313e10,1.1232296452e10,1.3458245148e10,1.0818032806e10,9.239119183e9,1.7897566617e10,1.565065385e10],"memory":54082712568,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":43200.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.72169572882e11,5.0409909815e11,5.07815085942e11,5.10453558146e11,5.10478958938e11,4.97262381193e11,5.0260603513e11,4.99542972531e11,4.87993778737e11,4.89021704445e11,5.03746768492e11,4.89869107858e11,4.73146154356e11,4.8171801387e11,5.08579879922e11,4.949573335e11,4.72187897068e11,4.99229768599e11,4.60419913288e11,4.69019613895e11,4.50583091837e11,4.72792727311e11,4.72333754492e11,4.65152305777e11,4.82234976786e11,4.72238483765e11,4.73826923338e11,4.76267120461e11,4.87120033427e11,5.04120244741e11,4.69559064737e11,4.72201757593e11,4.69914031792e11,4.93629873162e11,4.71968584791e11,5.01452793581e11,4.80458931455e11,4.83065538379e11,4.99070229147e11,4.71609869279e11,4.71492369998e11,4.58522950715e11,4.80960881323e11,4.91960762476e11,4.
73412762655e11,4.69283546561e11,4.66574358844e11,4.67318993209e11,4.5724723899e11,4.7334516285e11]}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]

View File

@ -62,6 +62,7 @@ Document the process of performance tuning
Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) 1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
2.) Tuned blocksize to waste as few threads as possible (new blocksize 121 -> 3 blocks -> 363 threads launched, while 362 threads are needed per expression)
\subsection{Transpiler} \subsection{Transpiler}
@ -75,6 +76,7 @@ Document the process of performance tuning
Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) 1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
2.) All expressions to execute are transpiled first (previously they were transpiled on every execution, even in parameter optimisation scenarios). Compilation is still done every time because too little RAM was available (compilation takes the most time, so this is only a minor boost)
\subsection{Comparison} \subsection{Comparison}
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter

Binary file not shown.