From 0e24d74e54f7718102f30f6b24bc6317b33f7ba7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 26 Oct 2024 11:41:00 +0200 Subject: [PATCH] small increment and fixes --- PTX_understanding.md | 8 ++--- package/src/Transpiler.jl | 60 +++++++++++++++++++++++---------- package/test/TranspilerTests.jl | 11 +++++- 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/PTX_understanding.md b/PTX_understanding.md index c30b1cf..dd0b9e2 100644 --- a/PTX_understanding.md +++ b/PTX_understanding.md @@ -16,10 +16,10 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h ) { - .reg .pred %p<2>; -> predicate registers: p1 (needed for branching) - .reg .f32 %f<4>; -> float registers: f1 - f3 - .reg .b32 %r<6>; -> 32 bits registers: r1 - r5 (bits are actual raw bits without a type) - .reg .b64 %rd<11>; -> 64 bits registers: rd1 - rd10 + .reg .pred %p<2>; -> predicate registers: p0, p1 (needed for branching) + .reg .f32 %f<4>; -> float registers: f0 - f3 + .reg .b32 %r<6>; -> 32 bits registers: r0 - r5 (bits are actual raw bits without a type) + .reg .b64 %rd<11>; -> 64 bits registers: rd0 - rd10 ld.param.u64 %rd1, [VecAdd_kernel_param_0]; -> rd1 = Data1 ld.param.u64 %rd2, [VecAdd_kernel_param_1]; -> rd2 = Data2 diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 983c301..5b14eaa 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -95,26 +95,39 @@ end # Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment # With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case +# +# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing +# Note: Maybe make an additional function that transpiles and executed the code. This would then be the function the user calls +# const exitJumpLocationMarker = "\$L__BB0_2" -function transpile(expression::ExpressionProcessing.PostfixType) +function transpile(expression::ExpressionProcessing.PostfixType)::String ptxBuffer = IOBuffer() println(ptxBuffer, get_cuda_header()) println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64])) println(ptxBuffer, "{") - # Register definition - println(ptxBuffer, get_register_definitions(1, 5)) # TODO: Actually calculate the number of needed registers and extend to more register kinds - # Parameter loading + # TODO: Actually calculate the number of needed registers and extend to more register kinds + println(ptxBuffer, get_register_definitions(1, 5)) + # TODO: Parameter loading println(ptxBuffer, get_guard_clause()) + # top down create the code. keep track of the max number of variables/parameters used (needed for later iterations. See section "Plan" in "PTX_understanding.md") + # return this alongside the generated code + # Generate registers based off of the above number + # Variables have: %var0 to %varn - 1 + # Parameters have: %param0 to %paramn - 1 # Code goes here + println(ptxBuffer, generate_calculation_code(expression)) # exit jump location print(ptxBuffer, exitJumpLocationMarker); println(ptxBuffer, ": ret;") println(ptxBuffer, "}") - println(String(take!(ptxBuffer))) + + generatedCode = String(take!(ptxBuffer)) + println(generatedCode) + return generatedCode end # TODO: Make version, target and address_size configurable @@ -150,47 +163,60 @@ end Constructs the PTX code used for handling the case where too many threads are started. Assumes the following: - - There are the unused ```32 bit``` registers ```r1, r2, r3, r4 (index of the variable set)``` - - There is an unused ```predicate``` register ```p1``` - - The ```32 bit``` register ```r5``` contains the number of variable sets + - There are the unused ```32 bit``` registers ```r0, r1, r2, r3 (index of the variable set)``` + - There is an unused ```predicate``` register ```p0``` + - The ```32 bit``` register ```r4``` contains the number of variable sets " function get_guard_clause()::String guardBuffer = IOBuffer() - println(guardBuffer, "mov.u32 %r1, %ntid.x;") # nr of thread ids - println(guardBuffer, "mov.u32 %r2, %ctaid.x;") # nr of threads per cta - println(guardBuffer, "mov.u32 %r3, %tid.x;") # id of the current thread + println(guardBuffer, "mov.u32 %r0, %ntid.x;") # nr of thread ids + println(guardBuffer, "mov.u32 %r1, %ctaid.x;") # nr of threads per cta + println(guardBuffer, "mov.u32 %r2, %tid.x;") # id of the current thread - println(guardBuffer, "mad.lo.s32 %r4, %r1, %r2, %r3;") # the current index (basically index of variable set) - println(guardBuffer, "setp.ge.s32 %p1, %r4, %r5;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets) + println(guardBuffer, "mad.lo.s32 %r3, %r0, %r1, %r2;") # the current index (basically index of variable set) + println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets) # branch to end if p1 is true - print(guardBuffer, "@%p1 bra ") + print(guardBuffer, "@%p0 bra ") print(guardBuffer, exitJumpLocationMarker) println(guardBuffer, ";") return String(take!(guardBuffer)) end -function get_register_definitions(nrPred::Int, nr32Bit::Int):::String +function get_register_definitions(nrPred::Int, nr32Bit::Int)::String registersBuffer = IOBuffer() if nrPred > 0 print(registersBuffer, ".reg .pred") print(registersBuffer, " %p<") - print(registersBuffer, nrPred + 1) + print(registersBuffer, nrPred) println(registersBuffer, ">;") end if nr32Bit > 0 print(registersBuffer, ".reg .b32") print(registersBuffer, " %r<") - print(registersBuffer, nr32Bit + 1) + print(registersBuffer, nr32Bit) println(registersBuffer, ">;") end return String(take!(registersBuffer)) end +function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String + codeBuffer = IOBuffer() + + for i in eachindex(expression) + token = expression[i] + # read to operator + # add code for calculation + + # on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator) + # on all other operations either 1 or 2 (one if unary and two if binary operator) + end +end + function type_to_cuda_type(type::DataType)::String if type == Int64 return ".s64" diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl index c244081..5b396a6 100644 --- a/package/test/TranspilerTests.jl +++ b/package/test/TranspilerTests.jl @@ -25,8 +25,17 @@ parameters[2][2] = 0.0 postfixExprs = Vector([postfixExpr]) push!(postfixExprs, expr_to_postfix(expressions[2])) - Transpiler.transpile(postfixExpr) + generatedCode = Transpiler.transpile(postfixExpr) # CUDA.@sync interpret(postfixExprs, variables, parameters) + + # This is just here for testing. This will be called inside the execute method in the Transpiler module + linker = CuLink() + add_data!(linker, "ExpressionProcessing", generatedCode) + + image = complete(linker) + + mod = CuModule(image) + func = CuFunction(mod, "ExpressionProcessing") end #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"