2 Commits

Author SHA1 Message Date
101ccef67b benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
2025-04-12 13:20:50 +02:00
c6e2ce47aa benchmarking: redid performance tests on uni pc
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
2025-04-12 12:13:28 +02:00
9 changed files with 13 additions and 9 deletions

View File

@ -33,7 +33,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
@inbounds for i in eachindex(exprs)
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
# config = launch_configuration(kernel.fun)
threads = min(variableCols, 256)
threads = min(variableCols, 128)
blocks = cld(variableCols, threads)
kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)

View File

@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# Execute each kernel (also try doing this with Threads.@threads; since we can have multiple grids, this might improve performance)
for kernel in kernels
# config = launch_configuration(kernels[i])
threads = min(variableCols, 256)
threads = min(variableCols, 96)
blocks = cld(variableCols, threads)
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)

View File

@ -4,7 +4,7 @@ using BenchmarkTools
using .Transpiler
using .Interpreter
const BENCHMARKS_RESULTS_PATH = "./results"
const BENCHMARKS_RESULTS_PATH = "./results-fh"
exprsCPU = [
# CPU interpreter requires an anonymous function and array refs
:(p[1] * x[1] + p[2]), # 5 op
@ -69,7 +69,7 @@ end
# Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs)
# University setup at 10.20.1.7 if needed
compareWithCPU = false
compareWithCPU = true
suite = BenchmarkGroup()
@ -143,9 +143,9 @@ if compareWithCPU
println(gpuiVsGPUT_median)
println(gpuiVsGPUT_std)
# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results)
else
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
medianGPUI_old = median(resultsOld["GPUI"])
stdGPUI_old = std(resultsOld["GPUI"])

View File

@ -26,5 +26,5 @@ end
@testset "Transpiler Tuning" begin
CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
# CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
end

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -19,6 +19,6 @@ end
# end
@testset "Performance tests" begin
include("PerformanceTuning.jl")
# include("PerformanceTests.jl")
# include("PerformanceTuning.jl")
include("PerformanceTests.jl")
end