benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
||||
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
|
||||
for kernel in kernels
|
||||
# config = launch_configuration(kernels[i])
|
||||
threads = min(variableCols, 256)
|
||||
threads = min(variableCols, 96)
|
||||
blocks = cld(variableCols, threads)
|
||||
|
||||
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
|
Reference in New Issue
Block a user