diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index 04ca11a..81c202d 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -34,6 +34,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl) results = Interpreter.interpret(exprs, X, p) + println("got results") end return results diff --git a/package/src/ExpressionProcessing.jl b/package/src/ExpressionProcessing.jl index 9d386d4..8c30f69 100644 --- a/package/src/ExpressionProcessing.jl +++ b/package/src/ExpressionProcessing.jl @@ -27,15 +27,17 @@ NOTE: This function is not thread save, especially cache access is not thread sa function expr_to_postfix(expression::Expr)::PostfixType expr = expression if expression.head === :-> - if typeof(expression.args[2]) == Float64 - println() - println("Expression: $expression") - println("Expr: $expr") - println() - dump(expression; maxdepth=10) - end + # if typeof(expression.args[2]) == Float64 + # println() + # println("Expression: $expression") + # println("Expr: $expr") + # println() + # dump(expression; maxdepth=10) + # end # if the expression equals (x, p) -> (...) then the below statement extracts the expression to evaluate - if expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped + if typeof(expression.args[2]) == Float64 + return [convert_to_ExpressionElement(expression.args[2])] + elseif expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped expr = expression.args[2].args[2] else # ... if the are generated with the parser, this node is not present and therefore doesn't need to be skipped expr = expression.args[2] @@ -46,9 +48,7 @@ function expr_to_postfix(expression::Expr)::PostfixType # return cache[expr] # end - postfix = PostfixType() - - + postfix = PostfixType() # Special handling in the case where the expression is an array access # This can happen if the token is a variable/parameter of the form x[n]/p[n] diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl index cf7b9bd..659c170 100644 --- a/package/src/Interpreter.jl +++ b/package/src/Interpreter.jl @@ -18,7 +18,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame @inbounds for i in eachindex(expressions) exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i]) end - + variableCols = size(variables, 2) # number of variable sets to use for each expression cudaVars = CuArray(variables) cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression @@ -30,7 +30,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame cudaResults = CuArray{Float32}(undef, variableCols, length(exprs)) # Start kernel for each expression to ensure that no warp is working on different expressions - @inbounds for i in eachindex(exprs) + @inbounds Threads.@threads for i in eachindex(exprs) numThreads = min(variableCols, 256) numBlocks = cld(variableCols, numThreads) diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index fe40619..c6f368e 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -157,7 +157,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, println(signatureBuffer, "(") for i in eachindex(parameters) - print(signatureBuffer, " .param .u64", " ", "param_", i) + print(signatureBuffer, " .param .u64 param_", i) parametersLocation = Utils.get_next_free_register(regManager, "rd") println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];") @@ -183,21 +183,21 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regMan threadsPerCTA = Utils.get_next_free_register(regManager, "r") currentThreadId = Utils.get_next_free_register(regManager, "r") - println(guardBuffer, "mov.u32 $threadIds, %ntid.x;") - println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;") - println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;") + println(guardBuffer, "mov.u32 $threadIds, %ntid.x;") + println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;") + println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;") globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set breakCondition = Utils.get_next_free_register(regManager, "p") - println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;") - println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets + println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;") + println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets # branch to end if breakCondition is true - println(guardBuffer, "@$breakCondition bra $exitJumpLocation;") + println(guardBuffer, "@$breakCondition bra $exitJumpLocation;") # Convert threadIdReg to a 64 bit register. Not 64 bit from the start, as this would take up more registers. Performance tests can be performed to determin if it is faster doing this, or making everything 64-bit from the start threadId64Reg = Utils.get_next_free_register(regManager, "rd") - print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;") + print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;") return (String(take!(guardBuffer)), threadId64Reg) end @@ -306,38 +306,38 @@ function get_operation(operator::Operator, regManager::Utils.RegisterManager, le end if operator == ADD - resultCode = "add.f32 $resultRegister, $left, $right;" + resultCode = "add.f32 $resultRegister, $left, $right;" elseif operator == SUBTRACT - resultCode = "sub.f32 $resultRegister, $left, $right;" + resultCode = "sub.f32 $resultRegister, $left, $right;" elseif operator == MULTIPLY - resultCode = "mul.f32 $resultRegister, $left, $right;" + resultCode = "mul.f32 $resultRegister, $left, $right;" elseif operator == DIVIDE - resultCode = "div.approx.f32 $resultRegister, $left, $right;" + resultCode = "div.approx.f32 $resultRegister, $left, $right;" elseif operator == POWER # x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)" resultCode = " // x^y: - lg2.approx.f32 $resultRegister, $left; - mul.f32 $resultRegister, $right, $resultRegister; - ex2.approx.f32 $resultRegister, $resultRegister;" + lg2.approx.f32 $resultRegister, $left; + mul.f32 $resultRegister, $right, $resultRegister; + ex2.approx.f32 $resultRegister, $resultRegister;" elseif operator == ABS - resultCode = "abs.f32 $resultRegister, $left;" + resultCode = "abs.f32 $resultRegister, $left;" elseif operator == LOG # log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)" resultCode = " // log(x): - lg2.approx.f32 $resultRegister, $left; - mul.f32 $resultRegister, $resultRegister, 0.693147182;" + lg2.approx.f32 $resultRegister, $left; + mul.f32 $resultRegister, $resultRegister, 0.693147182;" elseif operator == EXP # e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)" resultCode = " // e^x: - mul.f32 $resultRegister, $left, 1.44269502; - ex2.approx.f32 $resultRegister, $resultRegister;" + mul.f32 $resultRegister, $left, 1.44269502; + ex2.approx.f32 $resultRegister, $resultRegister;" elseif operator == SQRT - resultCode = "sqrt.approx.f32 $resultRegister, $left;" + resultCode = "sqrt.approx.f32 $resultRegister, $left;" elseif operator == INV - resultCode = "rcp.approx.f32 $resultRegister, $left;" + resultCode = "rcp.approx.f32 $resultRegister, $left;" else throw(ArgumentError("Operator conversion to ptx not implemented for '$operator'")) end diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl index edfb6c8..aa2a8f0 100644 --- a/package/test/PerformanceTests.jl +++ b/package/test/PerformanceTests.jl @@ -63,11 +63,11 @@ if compareWithCPU end # cacheInterpreter = Dict{Expr, PostfixType}() -# suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps) +suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps) # cacheTranspilerFront = Dict{Expr, PostfixType}() # cacheTranspilerRes = Dict{Expr, CuFunction}() -suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) +suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation tune!(suite) BenchmarkTools.save("params.json", params(suite)) diff --git a/package/test/params.json b/package/test/params.json index 4d79097..88e2d6d 100644 --- a/package/test/params.json +++ b/package/test/params.json @@ -1 +1 @@ -[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]] \ No newline at end of file +[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]] \ No newline at end of file