small increment and fixes

Daniel 2024-10-26 11:41:00 +02:00
parent ee3c5001bd
commit 0e24d74e54
3 changed files with 57 additions and 22 deletions


@@ -16,10 +16,10 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
 )
 {
-.reg .pred %p<2>;  -> predicate registers: p1 (needed for branching)
-.reg .f32 %f<4>;   -> float registers: f1 - f3
-.reg .b32 %r<6>;   -> 32 bits registers: r1 - r5 (bits are actual raw bits without a type)
-.reg .b64 %rd<11>; -> 64 bits registers: rd1 - rd10
+.reg .pred %p<2>;  -> predicate registers: p0, p1 (needed for branching)
+.reg .f32 %f<4>;   -> float registers: f0 - f3
+.reg .b32 %r<6>;   -> 32 bits registers: r0 - r5 (bits are actual raw bits without a type)
+.reg .b64 %rd<11>; -> 64 bits registers: rd0 - rd10
 ld.param.u64 %rd1, [VecAdd_kernel_param_0]; -> rd1 = Data1
 ld.param.u64 %rd2, [VecAdd_kernel_param_1]; -> rd2 = Data2
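For reference, the <N> in a declaration such as .reg .b32 %r<6>; is a count: it makes registers %r0 through %r5 available. A throwaway Julia helper (illustrative only, not part of the repository) that spells out which names a declaration covers:

# Illustrative only: list the register names declared by ".reg ... %<prefix><n>;".
function declared_registers(prefix::String, n::Int)
    return ["%$(prefix)$(i)" for i in 0:(n - 1)]
end

declared_registers("p", 2)   # ["%p0", "%p1"]
declared_registers("r", 6)   # ["%r0", "%r1", "%r2", "%r3", "%r4", "%r5"]
declared_registers("rd", 11) # ["%rd0", ..., "%rd10"]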


@@ -95,26 +95,39 @@ end
 # Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
 # With 128 registers per thread, one SM can hold 16 warps ((128 * 32 = 4096) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
+#
+# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
+# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
+#
 const exitJumpLocationMarker = "\$L__BB0_2"
-function transpile(expression::ExpressionProcessing.PostfixType)
+function transpile(expression::ExpressionProcessing.PostfixType)::String
     ptxBuffer = IOBuffer()

     println(ptxBuffer, get_cuda_header())
     println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64]))
     println(ptxBuffer, "{")
-    # Register definition
-    println(ptxBuffer, get_register_definitions(1, 5)) # TODO: Actually calculate the number of needed registers and extend to more register kinds
-    # Parameter loading
+    # TODO: Actually calculate the number of needed registers and extend to more register kinds
+    println(ptxBuffer, get_register_definitions(1, 5))
+    # TODO: Parameter loading
     println(ptxBuffer, get_guard_clause())
+    # top down create the code. keep track of the max number of variables/parameters used (needed for later iterations. See section "Plan" in "PTX_understanding.md")
+    # return this alongside the generated code
+    # Generate registers based off of the above number
+    # Variables have: %var0 to %varn - 1
+    # Parameters have: %param0 to %paramn - 1
     # Code goes here
+    println(ptxBuffer, generate_calculation_code(expression))
     # exit jump location
     print(ptxBuffer, exitJumpLocationMarker); println(ptxBuffer, ": ret;")
     println(ptxBuffer, "}")

-    println(String(take!(ptxBuffer)))
+    generatedCode = String(take!(ptxBuffer))
+    println(generatedCode)
+    return generatedCode
 end

 # TODO: Make version, target and address_size configurable
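A possible shape for the execute function planned in the comments above (not part of this commit: the parameter-type Tuple and the launch configuration are assumptions that would have to match get_kernel_signature, while CuLink, add_data!, complete, CuModule, CuFunction and cudacall are the same CUDA.jl driver calls the test file already uses):

using CUDA

# Hypothetical sketch: transpile an expression, link the PTX and launch the kernel,
# one thread per variable set.
function execute(expression::ExpressionProcessing.PostfixType, data::CuArray{Float64}, nrOfVariableSets::Integer)
    generatedCode = transpile(expression)

    linker = CuLink()
    add_data!(linker, "ExpressionProcessing", generatedCode)
    image = complete(linker)

    mod = CuModule(image)
    func = CuFunction(mod, "ExpressionProcessing")

    threads = 128
    blocks = cld(nrOfVariableSets, threads)
    # Placeholder types: they must mirror get_kernel_signature("ExpressionProcessing", [Int64, Float64])
    cudacall(func, Tuple{Int64, CuPtr{Float64}},
             Int64(nrOfVariableSets), data;
             threads = threads, blocks = blocks)
end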
@@ -150,47 +163,60 @@ end
 Constructs the PTX code used for handling the case where too many threads are started.

 Assumes the following:
-- There are the unused ```32 bit``` registers ```r1, r2, r3, r4 (index of the variable set)```
-- There is an unused ```predicate``` register ```p1```
-- The ```32 bit``` register ```r5``` contains the number of variable sets
+- There are the unused ```32 bit``` registers ```r0, r1, r2, r3 (index of the variable set)```
+- There is an unused ```predicate``` register ```p0```
+- The ```32 bit``` register ```r4``` contains the number of variable sets
 "
 function get_guard_clause()::String
     guardBuffer = IOBuffer()

-    println(guardBuffer, "mov.u32 %r1, %ntid.x;") # nr of thread ids
-    println(guardBuffer, "mov.u32 %r2, %ctaid.x;") # nr of threads per cta
-    println(guardBuffer, "mov.u32 %r3, %tid.x;") # id of the current thread
-    println(guardBuffer, "mad.lo.s32 %r4, %r1, %r2, %r3;") # the current index (basically index of variable set)
-    println(guardBuffer, "setp.ge.s32 %p1, %r4, %r5;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)
+    println(guardBuffer, "mov.u32 %r0, %ntid.x;") # number of threads per block (ntid)
+    println(guardBuffer, "mov.u32 %r1, %ctaid.x;") # id of the current block (ctaid)
+    println(guardBuffer, "mov.u32 %r2, %tid.x;") # id of the current thread inside the block
+    println(guardBuffer, "mad.lo.s32 %r3, %r0, %r1, %r2;") # the current index (basically index of variable set)
+    println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p0 = r3 >= r4 -> index >= nrOfVariableSets)

     # branch to end if p0 is true
-    print(guardBuffer, "@%p1 bra ")
+    print(guardBuffer, "@%p0 bra ")
     print(guardBuffer, exitJumpLocationMarker)
     println(guardBuffer, ";")

     return String(take!(guardBuffer))
 end

-function get_register_definitions(nrPred::Int, nr32Bit::Int):::String
+function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
     registersBuffer = IOBuffer()

     if nrPred > 0
         print(registersBuffer, ".reg .pred")
         print(registersBuffer, " %p<")
-        print(registersBuffer, nrPred + 1)
+        print(registersBuffer, nrPred)
         println(registersBuffer, ">;")
     end
     if nr32Bit > 0
         print(registersBuffer, ".reg .b32")
         print(registersBuffer, " %r<")
-        print(registersBuffer, nr32Bit + 1)
+        print(registersBuffer, nr32Bit)
         println(registersBuffer, ">;")
     end

     return String(take!(registersBuffer))
 end

+function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
+    codeBuffer = IOBuffer()
+
+    for i in eachindex(expression)
+        token = expression[i]
+        # read to operator
+        # add code for calculation
+        # on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
+        # on all other operations either 1 or 2 (one if unary and two if binary operator)
+    end
+end
+
 function type_to_cuda_type(type::DataType)::String
     if type == Int64
         return ".s64"


@@ -25,8 +25,17 @@ parameters[2][2] = 0.0
     postfixExprs = Vector([postfixExpr])
     push!(postfixExprs, expr_to_postfix(expressions[2]))

-    Transpiler.transpile(postfixExpr)
+    generatedCode = Transpiler.transpile(postfixExpr)
     # CUDA.@sync interpret(postfixExprs, variables, parameters)
+
+    # This is just here for testing. This will be called inside the execute method in the Transpiler module
+    linker = CuLink()
+    add_data!(linker, "ExpressionProcessing", generatedCode)
+
+    image = complete(linker)
+
+    mod = CuModule(image)
+    func = CuFunction(mod, "ExpressionProcessing")
 end

 #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
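The TODO above can be answered directly with BenchmarkTools.jl; a minimal sketch (the buffer content is just a placeholder PTX-sized string, and evals=1 is needed because take! empties the buffer):

using BenchmarkTools

# Build a buffer roughly the size of a generated kernel.
function build_buffer()
    buf = IOBuffer()
    for i in 1:500
        println(buf, "add.f64 %fd$(i), %fd$(i - 1), %fd$(i - 2);")
    end
    return buf
end

@btime String(take!(b)) setup=(b = build_buffer()) evals=1
@btime take!(b) setup=(b = build_buffer()) evals=1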