benchmarking: further tests done. The transpiler seems to take ages; this needs further investigation
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
5b31fbb270
commit
3d80ae95e4
|
@ -34,6 +34,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
|
|||
|
||||
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
|
||||
results = Interpreter.interpret(exprs, X, p)
|
||||
println("got results")
|
||||
end
|
||||
|
||||
return results
|
||||
|
|
|
@ -27,15 +27,17 @@ NOTE: This function is not thread safe, especially cache access is not thread sa
|
|||
function expr_to_postfix(expression::Expr)::PostfixType
|
||||
expr = expression
|
||||
if expression.head === :->
|
||||
if typeof(expression.args[2]) == Float64
|
||||
println()
|
||||
println("Expression: $expression")
|
||||
println("Expr: $expr")
|
||||
println()
|
||||
dump(expression; maxdepth=10)
|
||||
end
|
||||
# if typeof(expression.args[2]) == Float64
|
||||
# println()
|
||||
# println("Expression: $expression")
|
||||
# println("Expr: $expr")
|
||||
# println()
|
||||
# dump(expression; maxdepth=10)
|
||||
# end
|
||||
# if the expression equals (x, p) -> (...) then the below statement extracts the expression to evaluate
|
||||
if expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped
|
||||
if typeof(expression.args[2]) == Float64
|
||||
return [convert_to_ExpressionElement(expression.args[2])]
|
||||
elseif expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped
|
||||
expr = expression.args[2].args[2]
|
||||
else # ... if they are generated with the parser, this node is not present and therefore doesn't need to be skipped
|
||||
expr = expression.args[2]
|
||||
|
@ -46,9 +48,7 @@ function expr_to_postfix(expression::Expr)::PostfixType
|
|||
# return cache[expr]
|
||||
# end
|
||||
|
||||
postfix = PostfixType()
|
||||
|
||||
|
||||
postfix = PostfixType()
|
||||
|
||||
# Special handling in the case where the expression is an array access
|
||||
# This can happen if the token is a variable/parameter of the form x[n]/p[n]
|
||||
|
|
|
@ -18,7 +18,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
|
|||
@inbounds for i in eachindex(expressions)
|
||||
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
|
||||
end
|
||||
|
||||
|
||||
variableCols = size(variables, 2) # number of variable sets to use for each expression
|
||||
cudaVars = CuArray(variables)
|
||||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
|
||||
|
@ -30,7 +30,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
|
|||
cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
|
||||
|
||||
# Start kernel for each expression to ensure that no warp is working on different expressions
|
||||
@inbounds for i in eachindex(exprs)
|
||||
@inbounds Threads.@threads for i in eachindex(exprs)
|
||||
numThreads = min(variableCols, 256)
|
||||
numBlocks = cld(variableCols, numThreads)
|
||||
|
||||
|
|
|
@ -157,7 +157,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType},
|
|||
println(signatureBuffer, "(")
|
||||
|
||||
for i in eachindex(parameters)
|
||||
print(signatureBuffer, " .param .u64", " ", "param_", i)
|
||||
print(signatureBuffer, " .param .u64 param_", i)
|
||||
|
||||
parametersLocation = Utils.get_next_free_register(regManager, "rd")
|
||||
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
|
||||
|
@ -183,21 +183,21 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regMan
|
|||
threadsPerCTA = Utils.get_next_free_register(regManager, "r")
|
||||
currentThreadId = Utils.get_next_free_register(regManager, "r")
|
||||
|
||||
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
|
||||
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
|
||||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
|
||||
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
|
||||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||
|
||||
globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
|
||||
breakCondition = Utils.get_next_free_register(regManager, "p")
|
||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
|
||||
# branch to end if breakCondition is true
|
||||
println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
|
||||
# Convert threadIdReg to a 64 bit register. Not 64 bit from the start, as this would take up more registers. Performance tests can be performed to determine if it is faster doing this, or making everything 64-bit from the start
|
||||
threadId64Reg = Utils.get_next_free_register(regManager, "rd")
|
||||
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
|
||||
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
|
||||
|
||||
return (String(take!(guardBuffer)), threadId64Reg)
|
||||
end
|
||||
|
@ -306,38 +306,38 @@ function get_operation(operator::Operator, regManager::Utils.RegisterManager, le
|
|||
end
|
||||
|
||||
if operator == ADD
|
||||
resultCode = "add.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "add.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == SUBTRACT
|
||||
resultCode = "sub.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "sub.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == MULTIPLY
|
||||
resultCode = "mul.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "mul.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == DIVIDE
|
||||
resultCode = "div.approx.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "div.approx.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == POWER
|
||||
# x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
|
||||
resultCode = "
|
||||
// x^y:
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $right, $resultRegister;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $right, $resultRegister;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
elseif operator == ABS
|
||||
resultCode = "abs.f32 $resultRegister, $left;"
|
||||
resultCode = "abs.f32 $resultRegister, $left;"
|
||||
elseif operator == LOG
|
||||
# log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
|
||||
resultCode = "
|
||||
// log(x):
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $resultRegister, 0.693147182;"
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $resultRegister, 0.693147182;"
|
||||
elseif operator == EXP
|
||||
# e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
|
||||
resultCode = "
|
||||
// e^x:
|
||||
mul.f32 $resultRegister, $left, 1.44269502;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
mul.f32 $resultRegister, $left, 1.44269502;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
elseif operator == SQRT
|
||||
resultCode = "sqrt.approx.f32 $resultRegister, $left;"
|
||||
resultCode = "sqrt.approx.f32 $resultRegister, $left;"
|
||||
elseif operator == INV
|
||||
resultCode = "rcp.approx.f32 $resultRegister, $left;"
|
||||
resultCode = "rcp.approx.f32 $resultRegister, $left;"
|
||||
else
|
||||
throw(ArgumentError("Operator conversion to ptx not implemented for '$operator'"))
|
||||
end
|
||||
|
|
|
@ -63,11 +63,11 @@ if compareWithCPU
|
|||
end
|
||||
|
||||
# cacheInterpreter = Dict{Expr, PostfixType}()
|
||||
# suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||
|
||||
# cacheTranspilerFront = Dict{Expr, PostfixType}()
|
||||
# cacheTranspilerRes = Dict{Expr, CuFunction}()
|
||||
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation
|
||||
|
||||
tune!(suite)
|
||||
BenchmarkTools.save("params.json", params(suite))
|
||||
|
|
|
@ -1 +1 @@
|
|||
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
|
||||
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
|
Loading…
Reference in New Issue
Block a user