benchmarking: tuned interpreter blocksize
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run

This commit is contained in:
2025-05-20 09:05:35 +02:00
parent a9ffd5da63
commit 250deb334c
5 changed files with 26 additions and 15 deletions

View File

@ -25,7 +25,7 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
# Start kernel for each expression to ensure that no warp is working on different expressions
@inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
numThreads = min(variableColumns, 256)
numThreads = min(variableColumns, 121)
numBlocks = cld(variableColumns, numThreads)
@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)

View File

@ -1,30 +1,38 @@
using CUDA
using DelimitedFiles
using GZip
using .Transpiler
using .Interpreter
varsets_medium = 10000
X = randn(Float32, 5, varsets_medium)
include("parser.jl") # to parse expressions from a file
exprsGPU = [
# CPU interpreter requires an anonymous function and array refs
:(p1 * x1 + p2), # 5 op
:((((x1 + x2) + x3) + x4) + x5), # 9 op
:(log(abs(x1))), # 3 op
:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
] # 30 op
# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
X = permutedims(convert(Matrix{Float32}, data))
exprs = Expr[]
parameters = Vector{Vector{Float32}}()
varnames = ["x$i" for i in 1:10]
paramnames = ["p$i" for i in 1:20]
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io
for line in eachline(io)
expr, p = parse_infix(line, varnames, paramnames)
push!(exprs, expr)
push!(parameters, randn(Float32, length(p)))
end
end
expr_reps = 1
@testset "Interpreter Tuning" begin
CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps)
# CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
end
@testset "Transpiler Tuning" begin
CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
end

View File

@ -0,0 +1 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":768767740,"gctimes":[1.4209871071e10,8.529233725e9,8.165943693e9,8.180014668e9,8.231263428e9,1.1110946388e10,1.3136749872e10,1.0515143897e10,1.2978886885e10,1.0709110363e10,1.2408937103e10,1.4486745203e10,1.3229416582e10,1.8353010658e10,1.32173253e10,1.1621004633e10,1.1136122325e10,9.614762707e9,1.4564265563e10,9.399404156e9,1.063983064e10,1.2513746965e10,9.039906393e9,1.2382209752e10,1.3127092115e10,1.2713843793e10,1.1111974511e10,1.5837882785e10,1.5005237417e10,1.2439743996e10,9.607861366e9,1.0680724758e10,1.4012997282e10,1.258804731e10,1.020862355e10,9.630750655e9,1.5428270551e10,1.746317266e10,1.3141055589e10,1.5009128259e10,8.453648604e9,1.6874341516e10,1.1411307067e10,1.2542892313e10,1.1232296452e10,1.3458245148e10,1.0818032806e10,9.239119183e9,1.7897566617e10,1.565065385e10],"memory":54082712568,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":43200.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.72169572882e11,5.0409909815e11,5.07815085942e11,5.10453558146e11,5.10478958938e11,4.97262381193e11,5.0260603513e11,4.99542972531e11,4.87993778737e11,4.89021704445e11,5.03746768492e11,4.89869107858e11,4.73146154356e11,4.8171801387e11,5.08579879922e11,4.949573335e11,4.72187897068e11,4.99229768599e11,4.60419913288e11,4.69019613895e11,4.50583091837e11,4.72792727311e11,4.72333754492e11,4.65152305777e11,4.82234976786e11,4.72238483765e11,4.73826923338e11,4.76267120461e11,4.87120033427e11,5.04120244741e11,4.69559064737e11,4.72201757593e11,4.69914031792e11,4.93629873162e11,4.71968584791e11,5.01452793581e11,4.80458931455e11,4.83065538379e11,4.99070229147e11,4.71609869279e11,4.71492369998e11,4.58522950715e11,4.80960881323e11,4.91960762476e11,4.
73412762655e11,4.69283546561e11,4.66574358844e11,4.67318993209e11,4.5724723899e11,4.7334516285e11]}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]

View File

@ -62,6 +62,7 @@ Document the process of performance tuning
Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
2.) tuned blocksize to have as few wasted threads as possible (new blocksize 121 -> 3 blocks -> 363 threads launched, while 362 threads are needed per expression)
\subsection{Transpiler}
@ -75,6 +76,7 @@ Document the process of performance tuning
Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
2.) All expressions to execute are transpiled first (before they were transpiled for every execution, even in parameter optimisation scenarios). Compilation is still done every time, because too little RAM was available (compilation takes the most time, so this is only a minor boost)
\subsection{Comparison}
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter

Binary file not shown.