benchmarking: reverted previous; made interpreter use fast math
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
6d6874c7ba
commit
a5c34a53b7
|
@ -1,6 +1,5 @@
|
||||||
module Interpreter
|
module Interpreter
|
||||||
using CUDA
|
using CUDA
|
||||||
using CUDA: i32
|
|
||||||
using StaticArrays
|
using StaticArrays
|
||||||
using ..ExpressionProcessing
|
using ..ExpressionProcessing
|
||||||
using ..Utils
|
using ..Utils
|
||||||
|
@ -25,14 +24,14 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
|
||||||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
|
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
|
||||||
cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
|
cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
|
||||||
# put into a separate CuArray, as this is static and would be inefficient to send separately to every kernel
|
# put into a separate CuArray, as this is static and would be inefficient to send separately to every kernel
|
||||||
cudaStepsize::CuArray{Int32} = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
|
cudaStepsize = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
|
||||||
|
|
||||||
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
||||||
cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
|
cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
|
||||||
|
|
||||||
# Start kernel for each expression to ensure that no warp is working on different expressions
|
# Start kernel for each expression to ensure that no warp is working on different expressions
|
||||||
@inbounds for i in eachindex(exprs)
|
@inbounds for i in eachindex(exprs)
|
||||||
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, convert(Int32, i))
|
kernel = @cuda launch=false fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
|
||||||
# config = launch_configuration(kernel.fun)
|
# config = launch_configuration(kernel.fun)
|
||||||
threads = min(variableCols, 128)
|
threads = min(variableCols, 128)
|
||||||
blocks = cld(variableCols, threads)
|
blocks = cld(variableCols, threads)
|
||||||
|
@ -45,8 +44,8 @@ end
|
||||||
|
|
||||||
#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
|
#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
|
||||||
const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
|
const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
|
||||||
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int32}, exprIndex::Int32)
|
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
|
||||||
varSetIndex = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
|
varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
|
||||||
@inbounds variableCols = length(variables) / stepsize[2]
|
@inbounds variableCols = length(variables) / stepsize[2]
|
||||||
|
|
||||||
if varSetIndex > variableCols
|
if varSetIndex > variableCols
|
||||||
|
@ -55,19 +54,19 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
|
||||||
|
|
||||||
# firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
|
# firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
|
||||||
# lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
|
# lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
|
||||||
@inbounds firstParamIndex = ((exprIndex - 1i32) * stepsize[1]) # Exclusive
|
@inbounds firstParamIndex = ((exprIndex - 1) * stepsize[1]) # Exclusive
|
||||||
|
|
||||||
operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
|
operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
|
||||||
operationStackTop = 0i32 # stores index of the last defined/valid value
|
operationStackTop = 0 # stores index of the last defined/valid value
|
||||||
|
|
||||||
@inbounds firstVariableIndex = ((varSetIndex-1i32) * stepsize[2]) # Exclusive
|
@inbounds firstVariableIndex = ((varSetIndex-1) * stepsize[2]) # Exclusive
|
||||||
|
|
||||||
@inbounds for expr in expressions
|
@inbounds for expr in expressions
|
||||||
if expr.Type == EMPTY
|
if expr.Type == EMPTY
|
||||||
break
|
break
|
||||||
elseif expr.Type == INDEX
|
elseif expr.Type == INDEX
|
||||||
val = expr.Value
|
val = expr.Value
|
||||||
operationStackTop += 1i32
|
operationStackTop += 1
|
||||||
|
|
||||||
if val > 0
|
if val > 0
|
||||||
operationStack[operationStackTop] = variables[firstVariableIndex + val]
|
operationStack[operationStackTop] = variables[firstVariableIndex + val]
|
||||||
|
@ -76,25 +75,25 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
|
||||||
operationStack[operationStackTop] = parameters[firstParamIndex + val]
|
operationStack[operationStackTop] = parameters[firstParamIndex + val]
|
||||||
end
|
end
|
||||||
elseif expr.Type == FLOAT32
|
elseif expr.Type == FLOAT32
|
||||||
operationStackTop += 1i32
|
operationStackTop += 1
|
||||||
operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
|
operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
|
||||||
elseif expr.Type == OPERATOR
|
elseif expr.Type == OPERATOR
|
||||||
type = reinterpret(Operator, expr.Value)
|
type = reinterpret(Operator, expr.Value)
|
||||||
if type == ADD
|
if type == ADD
|
||||||
operationStackTop -= 1i32
|
operationStackTop -= 1
|
||||||
operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1i32]
|
operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1]
|
||||||
elseif type == SUBTRACT
|
elseif type == SUBTRACT
|
||||||
operationStackTop -= 1i32
|
operationStackTop -= 1
|
||||||
operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1i32]
|
operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1]
|
||||||
elseif type == MULTIPLY
|
elseif type == MULTIPLY
|
||||||
operationStackTop -= 1i32
|
operationStackTop -= 1
|
||||||
operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1i32]
|
operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1]
|
||||||
elseif type == DIVIDE
|
elseif type == DIVIDE
|
||||||
operationStackTop -= 1i32
|
operationStackTop -= 1
|
||||||
operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1i32]
|
operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1]
|
||||||
elseif type == POWER
|
elseif type == POWER
|
||||||
operationStackTop -= 1i32
|
operationStackTop -= 1
|
||||||
operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1i32]
|
operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1]
|
||||||
elseif type == ABS
|
elseif type == ABS
|
||||||
operationStack[operationStackTop] = abs(operationStack[operationStackTop])
|
operationStack[operationStackTop] = abs(operationStack[operationStackTop])
|
||||||
elseif type == LOG
|
elseif type == LOG
|
||||||
|
@ -112,7 +111,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
|
||||||
|
|
||||||
# "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
|
# "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
|
||||||
# "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
|
# "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
|
||||||
resultIndex = convert(Int, (exprIndex - 1i32) * variableCols + varSetIndex) # Inclusive
|
resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
|
||||||
@inbounds results[resultIndex] = operationStack[operationStackTop]
|
@inbounds results[resultIndex] = operationStack[operationStackTop]
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
|
@ -143,9 +143,10 @@ if compareWithCPU
|
||||||
println(gpuiVsGPUT_median)
|
println(gpuiVsGPUT_median)
|
||||||
println(gpuiVsGPUT_std)
|
println(gpuiVsGPUT_std)
|
||||||
|
|
||||||
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/4-interpreter_using_int32.json", results)
|
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
|
||||||
else
|
else
|
||||||
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
|
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
|
||||||
|
# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
||||||
|
|
||||||
medianGPUI_old = median(resultsOld["GPUI"])
|
medianGPUI_old = median(resultsOld["GPUI"])
|
||||||
stdGPUI_old = std(resultsOld["GPUI"])
|
stdGPUI_old = std(resultsOld["GPUI"])
|
||||||
|
|
|
@ -26,5 +26,5 @@ end
|
||||||
|
|
||||||
|
|
||||||
@testset "Transpiler Tuning" begin
|
@testset "Transpiler Tuning" begin
|
||||||
# CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
|
CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
|
||||||
end
|
end
|
1
package/test/results/5-interpreter_using_fastmath.json
Normal file
1
package/test/results/5-interpreter_using_fastmath.json
Normal file
File diff suppressed because one or more lines are too long
|
@ -2,8 +2,11 @@
|
||||||
\label{cha:conclusion}
|
\label{cha:conclusion}
|
||||||
|
|
||||||
Summarise the results
|
Summarise the results
|
||||||
|
talk again about how a typical input is often not complex enough (basically repeat that statement from the comparison section in the evaluation)
|
||||||
|
|
||||||
\section{Future Work}
|
\section{Future Work}
|
||||||
talk about what can be improved
|
talk about what can be improved
|
||||||
|
|
||||||
Transpiler: transpile expressions directly from the Julia AST -> would save time because no intermediate representation needs to be created (loses a step and gains performance, but also makes the transpiler itself more complex)
|
Transpiler: transpile expressions directly from the Julia AST -> would save time because no intermediate representation needs to be created (loses a step and gains performance, but also makes the transpiler itself more complex)
|
||||||
|
|
||||||
|
CPU Interpreter: It is probably more worthwhile to dive into parallelising the CPU interpreter itself (not really future work, as you wouldn't write a paper about that)
|
|
@ -22,7 +22,7 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking
|
||||||
1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
|
1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
|
||||||
2.) Using @inbounds -> noticeable improvement in 2 out of 3
|
2.) Using @inbounds -> noticeable improvement in 2 out of 3
|
||||||
3.) Tuned blocksize with NSight compute -> slight improvement
|
3.) Tuned blocksize with NSight compute -> slight improvement
|
||||||
4.) used int32 everywhere to reduce register usage -> significant performance drop (probably because a lot more waiting time, or more type conversions happening on GPU? would need to look at PTX)
|
4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably because of a lot more waiting time, i.e. latency hiding basically not working, or because more type conversions are happening on the GPU? Look at the generated PTX code and use that as an argument to describe why it is slower)
|
||||||
|
|
||||||
\subsection{Transpiler}
|
\subsection{Transpiler}
|
||||||
Results only for Transpiler (also contains final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
|
Results only for Transpiler (also contains final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
|
||||||
|
@ -37,4 +37,6 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking
|
||||||
4.) Only changed things on interpreter side
|
4.) Only changed things on interpreter side
|
||||||
|
|
||||||
\subsection{Comparison}
|
\subsection{Comparison}
|
||||||
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
|
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
|
||||||
|
|
||||||
|
talk about how the compute portion is just too small. Only more complex expressions with a higher var set count benefit well (make one or two performance evaluations with 10 larger expressions and at least 1k var sets, and present that here as evidence for that statement)
|
BIN
thesis/main.pdf
BIN
thesis/main.pdf
Binary file not shown.
Loading…
Reference in New Issue
Block a user