From 7283082699d5ea4007973a053c518a8ade222dfd Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 28 Sep 2024 11:41:13 +0200 Subject: [PATCH] added guard clause generation --- package/src/Interpreter.jl | 4 +- package/src/Transpiler.jl | 75 ++++++++++++++++++++++++++------ package/test/InterpreterTests.jl | 18 +------- package/test/TranspilerTests.jl | 32 ++++++++++++++ package/test/runtests.jl | 2 + 5 files changed, 98 insertions(+), 33 deletions(-) create mode 100644 package/test/TranspilerTests.jl diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl index 168d9c3..99b36ab 100644 --- a/package/src/Interpreter.jl +++ b/package/src/Interpreter.jl @@ -44,8 +44,8 @@ end #TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking const MAX_STACK_SIZE = 25 # The max number of values the expression can have. so Constant values, Variables and parameters function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float64}, parameters::CuDeviceArray{Float64}, results::CuDeviceArray{Float64}, stepsize::CuDeviceArray{Int}, exprIndex::Int) - index = (blockIdx().x - 1) * blockDim().x + threadIdx().x - stride = gridDim().x * blockDim().x + index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x + stride = gridDim().x * blockDim().x # nctaid.x * ntid.x firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 1162e12..b4f0d67 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -78,36 +78,83 @@ function culoadtest(N::Int32, op = "add.f32") @time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint}, d_a, d_b, d_c, N; threads=threadsPerBlock, blocks=blocksPerGrid) end +const exitJumpLocationMarker = "\$L__BB0_2" function 
transpile(expression::ExpressionProcessing.PostfixType) + ptxBuffer = IOBuffer() + println(ptxBuffer, get_cuda_header()) + println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64])) + println(ptxBuffer, "{") + + # Register definition + # Parameter loading + println(ptxBuffer, get_guard_clause()) + + # Code goes here + + # exit jump location + print(ptxBuffer, exitJumpLocationMarker) + println(ptxBuffer, ": ret;") + println(ptxBuffer, "}") + println(String(take!(ptxBuffer))) end # TODO: Make version, target and address_size configurable function get_cuda_header()::String return " - .version 7.1 - .target sm_52 - .address_size 64 - " +.version 7.1 +.target sm_52 +.address_size 64 +" end -function get_kernel_signature(kernelName::String, parameters::Vector{Type})::String - signature = ".visible .entry " * kernelName - - stringBuilder = IOBuffer() - print(stringBuilder, "(") +function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::String + signatureBuffer = IOBuffer() + print(signatureBuffer, ".visible .entry ") + print(signatureBuffer, kernelName) + println(signatureBuffer, "(") + for i in eachindex(parameters) type = type_to_cuda_type(parameters[i]) - print(stringBuilder, - ".param ", type, " ", kernelName, "_param_", i, ",") + print(signatureBuffer, + " .param ", type, " ", kernelName, "_param_", i) + if i != lastindex(parameters) + println(signatureBuffer, ",") + end end - print(stringBuilder, ")") - return String(take!(stringBuilder)) + print(signatureBuffer, ")") + return String(take!(signatureBuffer)) end -function type_to_cuda_type(type::Type)::String +" +Constructs the PTX code used for handling the case where too many threads are started. 
+ +Assumes the following: + - There are the unused ```32 bit``` registers ```r1, r2, r3, r4``` + - There is an unused ```predicate``` register ```p1``` + - The ```32 bit``` register ```r5``` contains the number of variable sets +" +function get_guard_clause()::String + guardBuffer = IOBuffer() + + println(guardBuffer, "mov.u32 %r1, %ntid.x;") # nr of threads per cta (block size) + println(guardBuffer, "mov.u32 %r2, %ctaid.x;") # id of the current cta (block index) + println(guardBuffer, "mov.u32 %r3, %tid.x;") # id of the current thread within its cta + + println(guardBuffer, "mad.lo.s32 %r4, %r1, %r2, %r3;") # the current index = ntid.x * ctaid.x + tid.x (basically index of variable set) + println(guardBuffer, "setp.ge.s32 %p1, %r4, %r5;") # guard clause (p1 = r4 >= r5 -> index >= nrOfVariableSets) + + # branch to end if p1 is true + print(guardBuffer, "@%p1 bra ") + print(guardBuffer, exitJumpLocationMarker) + println(guardBuffer, ";") + + return String(take!(guardBuffer)) +end + +function type_to_cuda_type(type::DataType)::String if type == Int64 return ".s64" elseif type == Float64 diff --git a/package/test/InterpreterTests.jl b/package/test/InterpreterTests.jl index 05bba4d..cf80e13 100644 --- a/package/test/InterpreterTests.jl +++ b/package/test/InterpreterTests.jl @@ -25,17 +25,7 @@ function testHelper(expression::Expr, variables::Matrix{Float64}, parameters::Ve @test isequal(result[1,1], expectedResult) end -@testset "Test TMP interpretation" begin - postfixExpr = expr_to_postfix(expressions[1]) - postfixExprs = Vector([postfixExpr]) - push!(postfixExprs, expr_to_postfix(expressions[2])) - - # CUDA.@sync interpret(postfixExprs, variables, parameters) -end - @testset "Test conversion to matrix" begin - return - reference = Matrix{Float64}(undef, 2, 2) reference[1,1] = 5.0 reference[2,1] = NaN64 @@ -43,14 +33,12 @@ end reference[2,2] = 0.0 # reference = Matrix([5.0, NaN], # [5.0, 0.0]) - CUDA.@sync result = Interpreter.convert_to_matrix(parameters, NaN64) + result = Interpreter.convert_to_matrix(parameters, NaN64) @test 
isequal(result, reference) end @testset "Test commutative interpretation" begin - return - var = Matrix{Float64}(undef, 2, 1) param = Vector{Vector{Float64}}(undef, 1) expectedResult = 8.0 # Not using "eval" because the variables are not stored in global scope @@ -71,8 +59,6 @@ end end @testset "Test non commutative interpretation" begin - return - var = Matrix{Float64}(undef, 2, 1) param = Vector{Vector{Float64}}(undef, 1) expectedResult = -2.0 # Not using "eval" because the variables are not stored in global scope @@ -103,8 +89,6 @@ end end @testset "Test single value operator interpretation" begin - return - var = Matrix{Float64}(undef, 1, 1) param = Vector{Vector{Float64}}(undef, 1) expectedResult = 3.0 # Not using "eval" because the variables are not stored in global scope diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl new file mode 100644 index 0000000..c244081 --- /dev/null +++ b/package/test/TranspilerTests.jl @@ -0,0 +1,32 @@ +using CUDA +using .ExpressionProcessing +using .Transpiler + +expressions = Vector{Expr}(undef, 2) +variables = Matrix{Float64}(undef, 2,2) +parameters = Vector{Vector{Float64}}(undef, 2) + +# Resulting value should be 10 for the first expression +expressions[1] = :(x1 + 1 * x2 + p1) +expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2) +variables[1,1] = 2.0 +variables[2,1] = 3.0 +variables[1,2] = 0.0 +variables[2,2] = 5.0 +parameters[1] = Vector{Float64}(undef, 1) +parameters[2] = Vector{Float64}(undef, 2) +parameters[1][1] = 5.0 +parameters[2][1] = 5.0 +parameters[2][2] = 0.0 + + +@testset "Test TMP transpiler" begin + postfixExpr = expr_to_postfix(expressions[1]) + postfixExprs = Vector([postfixExpr]) + push!(postfixExprs, expr_to_postfix(expressions[2])) + + Transpiler.transpile(postfixExpr) + # CUDA.@sync interpret(postfixExprs, variables, parameters) +end + +#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)" diff --git 
a/package/test/runtests.jl b/package/test/runtests.jl index da95594..fd6da3f 100644 --- a/package/test/runtests.jl +++ b/package/test/runtests.jl @@ -4,8 +4,10 @@ using Test const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda))) include(joinpath(baseFolder, "src", "ExpressionProcessing.jl")) include(joinpath(baseFolder, "src", "Interpreter.jl")) +include(joinpath(baseFolder, "src", "Transpiler.jl")) @testset "ExpressionExecutorCuda.jl" begin include("ExpressionProcessingTests.jl") include("InterpreterTests.jl") + include("TranspilerTests.jl") end