benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
Wiplinger Daniel - s2310454043 2025-04-12 13:20:50 +02:00
parent c6e2ce47aa
commit 101ccef67b
5 changed files with 8 additions and 7 deletions

View File

@ -33,7 +33,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
@inbounds for i in eachindex(exprs)
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
# config = launch_configuration(kernel.fun)
threads = min(variableCols, 256)
threads = min(variableCols, 128)
blocks = cld(variableCols, threads)
kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)

View File

@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# execute each kernel (also try doing this with Threads.@threads; since we can have multiple grids, this might improve performance)
for kernel in kernels
# config = launch_configuration(kernels[i])
threads = min(variableCols, 256)
threads = min(variableCols, 96)
blocks = cld(variableCols, threads)
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)

View File

@ -4,7 +4,7 @@ using BenchmarkTools
using .Transpiler
using .Interpreter
const BENCHMARKS_RESULTS_PATH = "./results"
const BENCHMARKS_RESULTS_PATH = "./results-fh"
exprsCPU = [
# CPU interpreter requires an anonymous function and array refs
:(p[1] * x[1] + p[2]), # 5 op
@ -69,7 +69,7 @@ end
# Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (if this works well enough with my 1080, run it on my machine; otherwise redo the tests and perform them on the FH PCs)
# University setup at 10.20.1.7 if needed
compareWithCPU = false
compareWithCPU = true
suite = BenchmarkGroup()
@ -143,9 +143,9 @@ if compareWithCPU
println(gpuiVsGPUT_median)
println(gpuiVsGPUT_std)
# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results)
else
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
medianGPUI_old = median(resultsOld["GPUI"])
stdGPUI_old = std(resultsOld["GPUI"])

View File

@ -26,5 +26,5 @@ end
@testset "Transpiler Tuning" begin
CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
# CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
end

File diff suppressed because one or more lines are too long