benchmarking: ran further tests; the transpiler seems to take extremely long and needs further investigation

Daniel 2025-05-11 16:54:19 +02:00
parent 5b31fbb270
commit 3d80ae95e4
6 changed files with 39 additions and 38 deletions

View File

@@ -34,6 +34,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
results = Interpreter.interpret(exprs, X, p)
println("got results")
end
return results
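For context, a minimal sketch (not part of this commit) of the sequential local-search pattern the comment above describes, with a hypothetical parameter update between repetitions; the helper name simulate_local_search and the perturbation step are assumptions, only Interpreter.interpret is taken from the code above.

# Sketch only: interpret_gpu keeps X and p fixed, this shows the intended pattern
# if p were actually changed in small steps between sequential repetitions.
function simulate_local_search(exprs::Vector{Expr}, X::Matrix{Float32},
                               p::Vector{Vector{Float32}}; repetitions::Int = 10)
    results = nothing
    for _ in 1:repetitions
        results = Interpreter.interpret(exprs, X, p)        # X stays the same
        for pvec in p
            pvec .+= 0.01f0 .* randn(Float32, length(pvec)) # hypothetical small step
        end
    end
    return results
end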

View File

@@ -27,15 +27,17 @@ NOTE: This function is not thread safe, especially cache access is not thread safe
function expr_to_postfix(expression::Expr)::PostfixType
expr = expression
if expression.head === :->
if typeof(expression.args[2]) == Float64
println()
println("Expression: $expression")
println("Expr: $expr")
println()
dump(expression; maxdepth=10)
end
# if typeof(expression.args[2]) == Float64
# println()
# println("Expression: $expression")
# println("Expr: $expr")
# println()
# dump(expression; maxdepth=10)
# end
# if the expression equals (x, p) -> (...) then the below statement extracts the expression to evaluate
if expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped
if typeof(expression.args[2]) == Float64
return [convert_to_ExpressionElement(expression.args[2])]
elseif expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped
expr = expression.args[2].args[2]
else # ... if they are generated with the parser, this node is not present and therefore doesn't need to be skipped
expr = expression.args[2]
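The extra :block node can be seen directly in the REPL; this illustration assumes the non-parser expressions come from Meta.parse or quoting, which is what inserts the block wrapper:

expr = Meta.parse("(x, p) -> x[1] + p[1]")
expr.head             # :->
expr.args[2].head     # :block (wrapper added by the Julia parser)
expr.args[2].args[2]  # :(x[1] + p[1]), the body that expr_to_postfix actually converts
# An Expr constructed directly, e.g. Expr(:->, :((x, p)), :(x[1] + p[1])),
# has no :block wrapper, so expression.args[2] is already the body.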
@@ -46,9 +48,7 @@ function expr_to_postfix(expression::Expr)::PostfixType
# return cache[expr]
# end
postfix = PostfixType()
# Special handling in the case where the expression is an array access
# This can happen if the token is a variable/parameter of the form x[n]/p[n]
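For reference, the AST shape this special handling targets; plain Julia, just showing the :ref node that x[n]/p[n] tokens produce:

dump(:(x[1]))
# Expr
#   head: Symbol ref
#   args: Array{Any}((2,))
#     1: Symbol x
#     2: Int64 1
# i.e. x[n]/p[n] arrive as Expr(:ref, :x, n) rather than as a bare Symbol.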

View File

@@ -18,7 +18,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
@inbounds for i in eachindex(expressions)
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
end
variableCols = size(variables, 2) # number of variable sets to use for each expression
cudaVars = CuArray(variables)
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
@@ -30,7 +30,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
# Start kernel for each expression to ensure that no warp is working on different expressions
@inbounds for i in eachindex(exprs)
@inbounds Threads.@threads for i in eachindex(exprs)
numThreads = min(variableCols, 256)
numBlocks = cld(variableCols, numThreads)
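The launch configuration above is the usual ceil-divide pattern; a small worked example (the numbers are illustrative and the kernel call is only sketched, the kernel name is a placeholder, not one from this repository):

using CUDA

variableCols = 10_000                         # example: number of variable sets
numThreads   = min(variableCols, 256)         # at most 256 threads per block
numBlocks    = cld(variableCols, numThreads)  # cld(10_000, 256) == 40 blocks
# one launch per expression, as in the loop above:
# @cuda threads=numThreads blocks=numBlocks some_interpreter_kernel(args...)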

View File

@@ -157,7 +157,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType},
println(signatureBuffer, "(")
for i in eachindex(parameters)
print(signatureBuffer, " .param .u64", " ", "param_", i)
print(signatureBuffer, " .param .u64 param_", i)
parametersLocation = Utils.get_next_free_register(regManager, "rd")
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
@@ -183,21 +183,21 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regMan
threadsPerCTA = Utils.get_next_free_register(regManager, "r")
currentThreadId = Utils.get_next_free_register(regManager, "r")
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
breakCondition = Utils.get_next_free_register(regManager, "p")
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
# branch to end if breakCondition is true
println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
# Convert threadIdReg to a 64-bit register. Not 64-bit from the start, as this would take up more registers. Performance tests can be performed to determine if it is faster doing this, or making everything 64-bit from the start
threadId64Reg = Utils.get_next_free_register(regManager, "rd")
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
return (String(take!(guardBuffer)), threadId64Reg)
end
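For readers less familiar with PTX: the guard computes the global thread id as ntid.x * ctaid.x + tid.x and branches to the exit label when it exceeds the number of variable sets. A rough CUDA.jl equivalent, a sketch rather than the transpiler's actual output (CUDA.jl intrinsics are 1-based):

using CUDA

function guarded_kernel_sketch(results, nrOfVarSets)
    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x  # global thread id, 1-based
    if idx > nrOfVarSets              # guard clause: surplus threads do nothing
        return nothing                # ~ "@breakCondition bra exitJumpLocation"
    end
    # ... evaluate the expression for variable set idx and write into results ...
    return nothing
end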
@@ -306,38 +306,38 @@ function get_operation(operator::Operator, regManager::Utils.RegisterManager, le
end
if operator == ADD
resultCode = "add.f32 $resultRegister, $left, $right;"
elseif operator == SUBTRACT
resultCode = "sub.f32 $resultRegister, $left, $right;"
elseif operator == MULTIPLY
resultCode = "mul.f32 $resultRegister, $left, $right;"
elseif operator == DIVIDE
resultCode = "div.approx.f32 $resultRegister, $left, $right;"
elseif operator == POWER
# x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
resultCode = "
// x^y:
lg2.approx.f32 $resultRegister, $left;
mul.f32 $resultRegister, $right, $resultRegister;
ex2.approx.f32 $resultRegister, $resultRegister;"
elseif operator == ABS
resultCode = "abs.f32 $resultRegister, $left;"
elseif operator == LOG
# log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
resultCode = "
// log(x):
lg2.approx.f32 $resultRegister, $left;
mul.f32 $resultRegister, $resultRegister, 0.693147182;"
elseif operator == EXP
# e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
resultCode = "
// e^x:
mul.f32 $resultRegister, $left, 1.44269502;
ex2.approx.f32 $resultRegister, $resultRegister;"
elseif operator == SQRT
resultCode = "sqrt.approx.f32 $resultRegister, $left;"
elseif operator == INV
resultCode = "rcp.approx.f32 $resultRegister, $left;"
else
throw(ArgumentError("Operator conversion to ptx not implemented for '$operator'"))
end
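The rewrites used for POWER, LOG and EXP rely on standard identities and can be sanity-checked on the CPU with plain Float32 math (the PTX lg2/ex2/rcp instructions are approximate, so the GPU results will only match up to a small error):

x, y = 3.0f0, 2.5f0
@assert x^y    ≈ 2.0f0^(y * log2(x))       # POWER: x^y == 2^(y*log2(x))
@assert log(x) ≈ log2(x) * 0.693147182f0   # LOG:   ln(x) == log2(x) * ln(2)
@assert exp(x) ≈ 2.0f0^(x * 1.44269502f0)  # EXP:   e^x == 2^(x/ln(2)), 1/ln(2) ≈ 1.44269502
@assert inv(x) ≈ 1.0f0 / x                 # INV:   what rcp.approx.f32 approximates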

View File

@@ -63,11 +63,11 @@ if compareWithCPU
end
# cacheInterpreter = Dict{Expr, PostfixType}()
# suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
# cacheTranspilerFront = Dict{Expr, PostfixType}()
# cacheTranspilerRes = Dict{Expr, CuFunction}()
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation
tune!(suite)
BenchmarkTools.save("params.json", params(suite))
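Given the commit note that the transpiler benchmark takes very long, a first step could be timing single calls outside the suite; a sketch that reuses exprs, X_t, parameters and expr_reps as set up earlier in this file (whether the cost is PTX generation/compilation or kernel execution is still an open question):

# Cold vs. warm call of the transpiler path; a large gap between the two would
# point at PTX generation / kernel compilation rather than at the kernels.
@time evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
@time evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
# interpreter on the same data for comparison:
@time interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)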

View File

@@ -1 +1 @@
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]