benchmarking: used int32 wherever possible; resulted in noticeable performance drop
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
Daniel 2025-04-13 11:32:54 +02:00
parent 4c60331288
commit af3b72f196
8 changed files with 29 additions and 23 deletions

View File

@ -1,5 +1,6 @@
module Interpreter module Interpreter
using CUDA using CUDA
using CUDA: i32
using StaticArrays using StaticArrays
using ..ExpressionProcessing using ..ExpressionProcessing
using ..Utils using ..Utils
@ -24,14 +25,14 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel # put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
cudaStepsize = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression cudaStepsize::CuArray{Int32} = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
cudaResults = CuArray{Float32}(undef, variableCols, length(exprs)) cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
# Start kernel for each expression to ensure that no warp is working on different expressions # Start kernel for each expression to ensure that no warp is working on different expressions
@inbounds for i in eachindex(exprs) @inbounds for i in eachindex(exprs)
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i) kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, convert(Int32, i))
# config = launch_configuration(kernel.fun) # config = launch_configuration(kernel.fun)
threads = min(variableCols, 128) threads = min(variableCols, 128)
blocks = cld(variableCols, threads) blocks = cld(variableCols, threads)
@ -44,8 +45,8 @@ end
#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking #TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int) function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int32}, exprIndex::Int32)
varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based) varSetIndex = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
@inbounds variableCols = length(variables) / stepsize[2] @inbounds variableCols = length(variables) / stepsize[2]
if varSetIndex > variableCols if varSetIndex > variableCols
@ -54,19 +55,19 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
# firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive # firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
# lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive # lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
@inbounds firstParamIndex = ((exprIndex - 1) * stepsize[1]) # Exclusive @inbounds firstParamIndex = ((exprIndex - 1i32) * stepsize[1]) # Exclusive
operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
operationStackTop = 0 # stores index of the last defined/valid value operationStackTop = 0i32 # stores index of the last defined/valid value
@inbounds firstVariableIndex = ((varSetIndex-1) * stepsize[2]) # Exclusive @inbounds firstVariableIndex = ((varSetIndex-1i32) * stepsize[2]) # Exclusive
@inbounds for expr in expressions @inbounds for expr in expressions
if expr.Type == EMPTY if expr.Type == EMPTY
break break
elseif expr.Type == INDEX elseif expr.Type == INDEX
val = expr.Value val = expr.Value
operationStackTop += 1 operationStackTop += 1i32
if val > 0 if val > 0
operationStack[operationStackTop] = variables[firstVariableIndex + val] operationStack[operationStackTop] = variables[firstVariableIndex + val]
@ -75,25 +76,25 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
operationStack[operationStackTop] = parameters[firstParamIndex + val] operationStack[operationStackTop] = parameters[firstParamIndex + val]
end end
elseif expr.Type == FLOAT32 elseif expr.Type == FLOAT32
operationStackTop += 1 operationStackTop += 1i32
operationStack[operationStackTop] = reinterpret(Float32, expr.Value) operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
elseif expr.Type == OPERATOR elseif expr.Type == OPERATOR
type = reinterpret(Operator, expr.Value) type = reinterpret(Operator, expr.Value)
if type == ADD if type == ADD
operationStackTop -= 1 operationStackTop -= 1i32
operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1] operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1i32]
elseif type == SUBTRACT elseif type == SUBTRACT
operationStackTop -= 1 operationStackTop -= 1i32
operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1] operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1i32]
elseif type == MULTIPLY elseif type == MULTIPLY
operationStackTop -= 1 operationStackTop -= 1i32
operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1] operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1i32]
elseif type == DIVIDE elseif type == DIVIDE
operationStackTop -= 1 operationStackTop -= 1i32
operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1] operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1i32]
elseif type == POWER elseif type == POWER
operationStackTop -= 1 operationStackTop -= 1i32
operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1] operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1i32]
elseif type == ABS elseif type == ABS
operationStack[operationStackTop] = abs(operationStack[operationStackTop]) operationStack[operationStackTop] = abs(operationStack[operationStackTop])
elseif type == LOG elseif type == LOG
@ -104,14 +105,14 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
operationStack[operationStackTop] = sqrt(operationStack[operationStackTop]) operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
end end
else else
operationStack[operationStackTop] = NaN operationStack[operationStackTop] = NaN32
break break
end end
end end
# "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column) # "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
# "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row) # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive resultIndex = convert(Int, (exprIndex - 1i32) * variableCols + varSetIndex) # Inclusive
@inbounds results[resultIndex] = operationStack[operationStackTop] @inbounds results[resultIndex] = operationStack[operationStackTop]
return return

View File

@ -4,7 +4,7 @@ using BenchmarkTools
using .Transpiler using .Transpiler
using .Interpreter using .Interpreter
const BENCHMARKS_RESULTS_PATH = "./results-fh" const BENCHMARKS_RESULTS_PATH = "./results"
exprsCPU = [ exprsCPU = [
# CPU interpreter requires an anonymous function and array refs # CPU interpreter requires an anonymous function and array refs
:(p[1] * x[1] + p[2]), # 5 op :(p[1] * x[1] + p[2]), # 5 op
@ -143,7 +143,7 @@ if compareWithCPU
println(gpuiVsGPUT_median) println(gpuiVsGPUT_median)
println(gpuiVsGPUT_std) println(gpuiVsGPUT_std)
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results) BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/4-interpreter_using_int32.json", results)
else else
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1] resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]

File diff suppressed because one or more lines are too long

View File

@ -21,6 +21,8 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking
1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large 1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> noticeable improvement in 2 out of 3 2.) Using @inbounds -> noticeable improvement in 2 out of 3
3.) Tuned blocksize with NSight compute -> slight improvement
4.) used Int32 everywhere to reduce register usage -> significant performance drop (probably because of more waiting time, or more type conversions happening on the GPU; would need to look at the PTX)
\subsection{Transpiler} \subsection{Transpiler}
Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section
@ -31,6 +33,8 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking
1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large 1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> small improvement only on CPU side code 2.) Using @inbounds -> small improvement only on CPU side code
3.) Tuned blocksize with NSight compute -> slight improvement
4.) Only changed things on interpreter side
\subsection{Comparison} \subsection{Comparison}
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter

Binary file not shown.