benchmarking: tuned interpreter blocksize
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
This commit is contained in:
@ -25,7 +25,7 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
|
||||
|
||||
# Start kernel for each expression to ensure that no warp is working on different expressions
|
||||
@inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
|
||||
numThreads = min(variableColumns, 256)
|
||||
numThreads = min(variableColumns, 121)
|
||||
numBlocks = cld(variableColumns, numThreads)
|
||||
|
||||
@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
|
||||
|
@ -1,30 +1,38 @@
|
||||
using CUDA
|
||||
using DelimitedFiles
|
||||
using GZip
|
||||
|
||||
using .Transpiler
|
||||
using .Interpreter
|
||||
|
||||
varsets_medium = 10000
|
||||
X = randn(Float32, 5, varsets_medium)
|
||||
include("parser.jl") # to parse expressions from a file
|
||||
|
||||
exprsGPU = [
|
||||
# CPU interpreter requires an anonymous function and array refs
|
||||
:(p1 * x1 + p2), # 5 op
|
||||
:((((x1 + x2) + x3) + x4) + x5), # 9 op
|
||||
:(log(abs(x1))), # 3 op
|
||||
:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
|
||||
] # 30 op
|
||||
|
||||
# p is the same for CPU and GPU
|
||||
p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr
|
||||
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
|
||||
X = permutedims(convert(Matrix{Float32}, data))
|
||||
|
||||
exprs = Expr[]
|
||||
parameters = Vector{Vector{Float32}}()
|
||||
varnames = ["x$i" for i in 1:10]
|
||||
paramnames = ["p$i" for i in 1:20]
|
||||
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
|
||||
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
|
||||
GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io
|
||||
for line in eachline(io)
|
||||
expr, p = parse_infix(line, varnames, paramnames)
|
||||
|
||||
push!(exprs, expr)
|
||||
push!(parameters, randn(Float32, length(p)))
|
||||
end
|
||||
end
|
||||
expr_reps = 1
|
||||
|
||||
|
||||
|
||||
@testset "Interpreter Tuning" begin
|
||||
CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps)
|
||||
# CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
|
||||
end
|
||||
|
||||
|
||||
@testset "Transpiler Tuning" begin
|
||||
CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
|
||||
CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
|
||||
end
|
@ -0,0 +1 @@
|
||||
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":768767740,"gctimes":[1.4209871071e10,8.529233725e9,8.165943693e9,8.180014668e9,8.231263428e9,1.1110946388e10,1.3136749872e10,1.0515143897e10,1.2978886885e10,1.0709110363e10,1.2408937103e10,1.4486745203e10,1.3229416582e10,1.8353010658e10,1.32173253e10,1.1621004633e10,1.1136122325e10,9.614762707e9,1.4564265563e10,9.399404156e9,1.063983064e10,1.2513746965e10,9.039906393e9,1.2382209752e10,1.3127092115e10,1.2713843793e10,1.1111974511e10,1.5837882785e10,1.5005237417e10,1.2439743996e10,9.607861366e9,1.0680724758e10,1.4012997282e10,1.258804731e10,1.020862355e10,9.630750655e9,1.5428270551e10,1.746317266e10,1.3141055589e10,1.5009128259e10,8.453648604e9,1.6874341516e10,1.1411307067e10,1.2542892313e10,1.1232296452e10,1.3458245148e10,1.0818032806e10,9.239119183e9,1.7897566617e10,1.565065385e10],"memory":54082712568,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":43200.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.72169572882e11,5.0409909815e11,5.07815085942e11,5.10453558146e11,5.10478958938e11,4.97262381193e11,5.0260603513e11,4.99542972531e11,4.87993778737e11,4.89021704445e11,5.03746768492e11,4.89869107858e11,4.73146154356e11,4.8171801387e11,5.08579879922e11,4.949573335e11,4.72187897068e11,4.99229768599e11,4.60419913288e11,4.69019613895e11,4.50583091837e11,4.72792727311e11,4.72333754492e11,4.65152305777e11,4.82234976786e11,4.72238483765e11,4.73826923338e11,4.76267120461e11,4.87120033427e11,5.04120244741e11,4.69559064737e11,4.72201757593e11,4.69914031792e11,4.93629873162e11,4.71968584791e11,5.01452793581e11,4.80458931455e11,4.83065538379e11,4.99070229147e11,4.71609869279e11,4.71492369998e11,4.58522950715e11,4.80960881323e11,4.91960762476e11,4.
73412762655e11,4.69283546561e11,4.66574358844e11,4.67318993209e11,4.5724723899e11,4.7334516285e11]}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
|
@ -62,6 +62,7 @@ Document the process of performance tuning
|
||||
Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
|
||||
|
||||
1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
|
||||
2.) Tuned the blocksize to waste as few threads as possible (new blocksize 121 -> 3 blocks -> 363 threads, while 362 threads are needed per expression)
|
||||
|
||||
|
||||
\subsection{Transpiler}
|
||||
@ -75,6 +76,7 @@ Document the process of performance tuning
|
||||
Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
|
||||
|
||||
1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
|
||||
2.) All expressions to execute are transpiled first (previously they were transpiled on every execution, even in parameter optimisation scenarios). Compilation is still done every time because too little RAM was available (compilation takes the most time, so this is only a minor boost)
|
||||
|
||||
\subsection{Comparison}
|
||||
Comparison of the interpreter and the transpiler, as well as a comparison of both with the CPU interpreter
|
||||
|
BIN
thesis/main.pdf
BIN
thesis/main.pdf
Binary file not shown.
Reference in New Issue
Block a user