benchmarking: tuned block sizes; slightly improved performance; standard deviation improved in most cases
Parent: c6e2ce47aa · Commit: 101ccef67b
@@ -33,7 +33,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
     @inbounds for i in eachindex(exprs)
         kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
         # config = launch_configuration(kernel.fun)
-        threads = min(variableCols, 256)
+        threads = min(variableCols, 128)
         blocks = cld(variableCols, threads)

         kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
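For context on the hand-picked 128 threads: the commented-out launch_configuration call above is CUDA.jl's occupancy API, which can choose a thread count for the compiled kernel instead of a fixed constant. A minimal self-contained sketch of that pattern, with a toy kernel standing in for interpret_expression (illustrative only, not part of this commit):

using CUDA

# Toy elementwise kernel; interpret_expression plays this role in the package.
function add_one!(xs)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(xs)
        @inbounds xs[i] += 1.0f0
    end
    return nothing
end

xs = CUDA.zeros(Float32, 10_000)

# Compile without launching, ask the occupancy API for a configuration,
# then launch with the computed threads/blocks (same shape as the loop above).
kernel = @cuda launch=false add_one!(xs)
config = launch_configuration(kernel.fun)
threads = min(length(xs), config.threads)
blocks = cld(length(xs), threads)
kernel(xs; threads, blocks)

Hard-coding 128, as this commit does, trades that generality for a value measured to work well on the benchmark GPU.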
@@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
     # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
     for kernel in kernels
         # config = launch_configuration(kernels[i])
-        threads = min(variableCols, 256)
+        threads = min(variableCols, 96)
         blocks = cld(variableCols, threads)

         cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
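The 96 on the transpiler side is the kind of value one finds by timing a few candidates. A self-contained sketch of such a sweep with BenchmarkTools, again with a toy kernel in place of the transpiled expression kernels (names and workload are illustrative):

using CUDA, BenchmarkTools

# Toy kernel standing in for a transpiled expression kernel.
function axpy!(xs, ys)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(xs)
        @inbounds ys[i] = 2.0f0 * xs[i] + ys[i]
    end
    return nothing
end

xs = CUDA.rand(Float32, 100_000)
ys = CUDA.rand(Float32, 100_000)
n = length(xs)

# Time each candidate block size; this commit settles on 96 for this code path.
for threads in (64, 96, 128, 192, 256)
    blocks = cld(n, threads)
    run_once() = CUDA.@sync @cuda threads=threads blocks=blocks axpy!(xs, ys)
    t = @belapsed $run_once()
    println("threads = $threads: $(round(t * 1e6, digits = 2)) µs")
end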
@@ -4,7 +4,7 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter

-const BENCHMARKS_RESULTS_PATH = "./results"
+const BENCHMARKS_RESULTS_PATH = "./results-fh"
 exprsCPU = [
     # CPU interpreter requires an anonymous function and array refs
     :(p[1] * x[1] + p[2]), # 5 op
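The "anonymous function and array refs" comment refers to turning an Expr such as :(p[1] * x[1] + p[2]) into something callable for the CPU interpreter baseline. A minimal sketch of that general idea (not this package's actual interpreter):

# Wrap the expression in an anonymous function over variables x and parameters p,
# then eval once to obtain a compiled callable.
expr = :(p[1] * x[1] + p[2])
f = eval(:((x, p) -> $expr))

x = Float32[2.0]
p = Float32[3.0, 1.0]
println(Base.invokelatest(f, x, p))  # 7.0; invokelatest sidesteps world-age issues after eval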
@@ -69,7 +69,7 @@ end
 # Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re do the tests and perform them on FH PCs)
 # University setup at 10.20.1.7 if needed

-compareWithCPU = false
+compareWithCPU = true


 suite = BenchmarkGroup()
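suite = BenchmarkGroup() is BenchmarkTools' container for the grouped measurements this script compares. A sketch of how such a suite is typically assembled and run; the group keys GPUI and GPUT appear later in this diff, CPU is an assumption based on the compareWithCPU flag, and the benchmarked calls here are placeholders only:

using BenchmarkTools

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup()
suite["GPUI"] = BenchmarkGroup()
suite["GPUT"] = BenchmarkGroup()

# Placeholder workloads; the real suite benchmarks the interpreter and transpiler
# over exprsCPU/exprsGPU with the shared X and p inputs.
suite["CPU"]["eval"] = @benchmarkable sum(x .* 2.0f0) setup = (x = rand(Float32, 1_000))
suite["GPUI"]["eval"] = @benchmarkable sum(x .* 2.0f0) setup = (x = rand(Float32, 1_000))

tune!(suite)
results = run(suite, verbose = true)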
@@ -143,9 +143,9 @@ if compareWithCPU
     println(gpuiVsGPUT_median)
     println(gpuiVsGPUT_std)

-    # BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
+    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results)
 else
-    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
+    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]

     medianGPUI_old = median(resultsOld["GPUI"])
     stdGPUI_old = std(resultsOld["GPUI"])
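The save/load pair above is BenchmarkTools' standard mechanism for comparing runs across commits. A self-contained sketch of that round trip, using a stand-in suite so it runs on its own; judge is one common way to express the old-vs-new comparison and is not necessarily what the script itself prints:

using BenchmarkTools
using Statistics

# Stand-in suite with a single GPUI entry so the round trip below is runnable.
suite = BenchmarkGroup()
suite["GPUI"] = @benchmarkable sum(rand(Float32, 1_000))
results = run(suite)

# Persist the current run, then load it back as if it were an earlier commit's results.
BenchmarkTools.save("example-run.json", results)
resultsOld = BenchmarkTools.load("example-run.json")[1]

# Compare medians the same way the script compares GPUI across runs.
println(judge(median(results["GPUI"]), median(resultsOld["GPUI"])))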
@@ -26,5 +26,5 @@ end


 @testset "Transpiler Tuning" begin
-    CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
+    # CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
 end
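The commented-out CUDA.@profile call ties in with the ncu/nsys note earlier in this diff. A standalone sketch of such a profiled region, with a broadcast standing in for evaluate_gpu; behaviour depends on the CUDA.jl version (recent versions print an integrated host/device trace, while older ones, or external profiling, only delimit the region for Nsight):

using CUDA

xs = CUDA.rand(Float32, 1_000_000)

# Profile a representative GPU workload; evaluate_gpu(exprsGPU, X, p) plays this role in the test.
CUDA.@profile begin
    ys = xs .* 2.0f0 .+ 1.0f0
    CUDA.synchronize()
end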
New file: package/test/results-fh/3-tuned-blocksize_I128_T96.json (diff suppressed because one or more lines are too long)