diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl index 2326092..a0138f8 100644 --- a/package/src/Interpreter.jl +++ b/package/src/Interpreter.jl @@ -25,7 +25,7 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer, # Start kernel for each expression to ensure that no warp is working on different expressions @inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance) - numThreads = min(variableColumns, 256) + numThreads = min(variableColumns, 121) numBlocks = cld(variableColumns, numThreads) @cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i) diff --git a/package/test/PerformanceTuning.jl b/package/test/PerformanceTuning.jl index 94bca31..fe64cc1 100644 --- a/package/test/PerformanceTuning.jl +++ b/package/test/PerformanceTuning.jl @@ -1,30 +1,38 @@ using CUDA +using DelimitedFiles +using GZip using .Transpiler using .Interpreter -varsets_medium = 10000 -X = randn(Float32, 5, varsets_medium) +include("parser.jl") # to parse expressions from a file -exprsGPU = [ - # CPU interpreter requires an anonymous function and array ref s - :(p1 * x1 + p2), # 5 op - :((((x1 + x2) + x3) + x4) + x5), # 9 op - :(log(abs(x1))), # 3 op - :(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op -] # 30 op -# p is the same for CPU and GPU -p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr +data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true); +X = permutedims(convert(Matrix{Float32}, data)) + +exprs = Expr[] +parameters = Vector{Vector{Float32}}() +varnames = ["x$i" for i in 1:10] +paramnames = ["p$i" for i in 1:20] +# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs +# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs +GZip.open("data/esr_nvar2_len10.txt.gz_3.txt.gz") do io + for line in eachline(io) + expr, p = 
parse_infix(line, varnames, paramnames) + + push!(exprs, expr) + push!(parameters, randn(Float32, length(p))) + end +end expr_reps = 1 - @testset "Interpreter Tuning" begin - CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps) + # CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps) end @testset "Transpiler Tuning" begin - CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps) + CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps) end \ No newline at end of file diff --git a/package/test/results-fh-new/2-i_blocksize_121__t_transpiling_only_once.json b/package/test/results-fh-new/2-i_blocksize_121__t_transpiling_only_once.json new file mode 100644 index 0000000..bc60587 --- /dev/null +++ b/package/test/results-fh-new/2-i_blocksize_121__t_transpiling_only_once.json @@ -0,0 +1 @@ +[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":768767740,"gctimes":[1.4209871071e10,8.529233725e9,8.165943693e9,8.180014668e9,8.231263428e9,1.1110946388e10,1.3136749872e10,1.0515143897e10,1.2978886885e10,1.0709110363e10,1.2408937103e10,1.4486745203e10,1.3229416582e10,1.8353010658e10,1.32173253e10,1.1621004633e10,1.1136122325e10,9.614762707e9,1.4564265563e10,9.399404156e9,1.063983064e10,1.2513746965e10,9.039906393e9,1.2382209752e10,1.3127092115e10,1.2713843793e10,1.1111974511e10,1.5837882785e10,1.5005237417e10,1.2439743996e10,9.607861366e9,1.0680724758e10,1.4012997282e10,1.258804731e10,1.020862355e10,9.630750655e9,1.5428270551e10,1.746317266e10,1.3141055589e10,1.5009128259e10,8.453648604e9,1.6874341516e10,1.1411307067e10,1.2542892313e10,1.1232296452e10,1.3458245148e10,1.0818032806e10,9.239119183e9,1.7897566617e10,1.565065385e10],"memory":54082712568,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false
,"samples":50,"evals":1,"gcsample":false,"seconds":43200.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.72169572882e11,5.0409909815e11,5.07815085942e11,5.10453558146e11,5.10478958938e11,4.97262381193e11,5.0260603513e11,4.99542972531e11,4.87993778737e11,4.89021704445e11,5.03746768492e11,4.89869107858e11,4.73146154356e11,4.8171801387e11,5.08579879922e11,4.949573335e11,4.72187897068e11,4.99229768599e11,4.60419913288e11,4.69019613895e11,4.50583091837e11,4.72792727311e11,4.72333754492e11,4.65152305777e11,4.82234976786e11,4.72238483765e11,4.73826923338e11,4.76267120461e11,4.87120033427e11,5.04120244741e11,4.69559064737e11,4.72201757593e11,4.69914031792e11,4.93629873162e11,4.71968584791e11,5.01452793581e11,4.80458931455e11,4.83065538379e11,4.99070229147e11,4.71609869279e11,4.71492369998e11,4.58522950715e11,4.80960881323e11,4.91960762476e11,4.73412762655e11,4.69283546561e11,4.66574358844e11,4.67318993209e11,4.5724723899e11,4.7334516285e11]}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]] \ No newline at end of file diff --git a/thesis/chapters/evaluation.tex b/thesis/chapters/evaluation.tex index d0fa1f3..1d28846 100644 --- a/thesis/chapters/evaluation.tex +++ b/thesis/chapters/evaluation.tex @@ -62,6 +62,7 @@ Document the process of performance tuning Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded 1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) +2.) Tuned the blocksize to waste as few threads as possible (new blocksize 121 -> 3 blocks -> 363 threads, while 362 threads are needed per expression) \subsection{Transpiler} @@ -75,6 +76,7 @@ Document the process of performance tuning Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded 1.) 
Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) +2.) All expressions to execute are transpiled first (previously, they were re-transpiled on every execution, even in parameter optimisation scenarios). Compilation is still done every time because too little RAM was available (compilation takes the most time, so this is only a minor boost) \subsection{Comparison} Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter diff --git a/thesis/main.pdf b/thesis/main.pdf index 13f357e..7eefe64 100644 Binary files a/thesis/main.pdf and b/thesis/main.pdf differ