From 1e7f6e9010a008de03563843001b57e68c53fa28 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 7 Dec 2024 10:12:53 +0100 Subject: [PATCH] tried streamlining register management --- package/src/ExpressionExecutorCuda.jl | 1 - package/src/Transpiler.jl | 64 +++++++++++++++------------ 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index dfe72f1..934509d 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -14,7 +14,6 @@ export test # Evaluate Expressions on the GPU function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32} - # Ensure that no two expressions are interpreted in the same "warp" exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1]) end diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index bb34710..9ba416c 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -100,6 +100,32 @@ end # Note: Maybe make an additional function that transpiles and executed the code. This would then be the function the user calls # +# TODO: I guess I need to move this to different module because it seems like I can't access inner functions +let registers = Dict() # stores the count of the register already used. + global get_next_free_register + global get_used_registers + + # By convention these names correspond to the following types: + # - p -> pred + # - f32 -> float32 + # - b32 -> 32 bit + # - var -> float32 + # - param -> float32 !! although, they might get inserted as fixed number and not be sent to gpu? + function get_next_free_register(name::String)::String + if haskey(registers, name) + registers[name] += 1 + else + registers[name] = 1 + end + + return String(["%", name, registers[name]]) + end + + function get_used_registers() + return pairs(registers) + end +end + # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string const exitJumpLocationMarker = "\$L__BB0_2" function transpile(expression::ExpressionProcessing.PostfixType)::String @@ -133,7 +159,7 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String return generatedCode end -# TODO: Make version, target and address_size configurable +# TODO: Make version, target and address_size configurable; also see what address_size means exactly function get_cuda_header()::String return " .version 7.1 @@ -178,9 +204,9 @@ function get_guard_clause()::String println(guardBuffer, "mov.u32 %r2, %tid.x;") # id of the current thread println(guardBuffer, "mad.lo.s32 %r3, %r0, %r1, %r2;") # the current index (basically index of variable set) - println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets) + println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p0 = r3 > r4 -> index > nrOfVariableSets) - # branch to end if p1 is true + # branch to end if p0 is true print(guardBuffer, "@%p0 bra $exitJumpLocationMarker;") return String(take!(guardBuffer)) @@ -203,10 +229,6 @@ function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat32::Int)::St return String(take!(registersBuffer)) end -# TODO: Dont convert expression to postfix! It seems like this is not the best way since postfix evaluation assumes to be calculated in a stack -# where results get pushed back to the stack. This however is not the best behaviour for this kind of calculation. -# Probably do this: Get Expr -> traverse tree -> if child node is Expr: basically replace that node with the register containing the result of that Expr - # Current assumption: Expression only made out of constant values function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::Tuple{String, Int} codeBuffer = IOBuffer() @@ -220,37 +242,21 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType) if token.Type == FLOAT32 push!(operands, reinterpret(Float32, token.Value)) elseif token.Type == OPERATOR + # function call to see if operator is unary -> adapt below calculation; probably able to reuse register operator = get_ptx_operator(reinterpret(Operator, token.Value)) register = "%f$registerCounter" print(codeBuffer, " $operator $register, ") - # Ugly temporary proof of concept which is ignoring unary operators - # if length(operands) == 0 - # print(codeBuffer, "%f") - # print(codeBuffer, registerCounter - 2) # add result before previous result - # end - # print(codeBuffer, " ") - # if length(operands) <= 1 - # print(codeBuffer, "%f") - # print(codeBuffer, registerCounter - 1) # add previous result - # end - # print(codeBuffer, " ") - ops = last(operands, 2) pop!(operands);pop!(operands) - print(codeBuffer, join(ops, ", ")) # if operands has too few values it means the previous calculation is needed. So we need to use registerCounter - 1 or registerCounter - 2 previous registers + print(codeBuffer, join(ops, ", ")) println(codeBuffer, ";") - # empty!(operands) push!(operands, register) registerCounter += 1 + elseif token.Type == INDEX + # TODO end - - # read to operator - # add code for calculation - - # on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator) - # on all other operations either 1 or 2 (one if unary and two if binary operator) end return (String(take!(codeBuffer)), registerCounter) @@ -266,8 +272,7 @@ function type_to_ptx_type(type::DataType)::String end end -# TODO: Probably change this, to return the entire calculation not just the operator. Because for POWER and EXP we need multiple instructions to calculate them. -# Left out for now since I don't have register management yet +# TODO: Probably change this, to return the entire calculation not just the operator. Because for POWER and EXP we need multiple instructions to calculate them (seperation of concerns). function get_ptx_operator(operator::Operator)::String if operator == ADD return "add.f32" @@ -293,5 +298,6 @@ function get_ptx_operator(operator::Operator)::String end + end