small increment and fixes
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
Daniel 2024-10-26 11:41:00 +02:00
parent ee3c5001bd
commit 0e24d74e54
3 changed files with 57 additions and 22 deletions

View File

@ -16,10 +16,10 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
)
{
.reg .pred %p<2>; -> predicate registers: p1 (needed for branching)
.reg .f32 %f<4>; -> float registers: f1 - f3
.reg .b32 %r<6>; -> 32 bits registers: r1 - r5 (bits are actual raw bits without a type)
.reg .b64 %rd<11>; -> 64 bits registers: rd1 - rd10
.reg .pred %p<2>; -> predicate registers: p0, p1 (needed for branching)
.reg .f32 %f<4>; -> float registers: f0 - f3
.reg .b32 %r<6>; -> 32 bits registers: r0 - r5 (bits are actual raw bits without a type)
.reg .b64 %rd<11>; -> 64 bits registers: rd0 - rd10
ld.param.u64 %rd1, [VecAdd_kernel_param_0]; -> rd1 = Data1
ld.param.u64 %rd2, [VecAdd_kernel_param_1]; -> rd2 = Data2

View File

@ -95,26 +95,39 @@ end
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
# With 128 registers per thread, one SM can hold 16 warps (128 * 32 = 4096 registers per warp; 4096 * 16 == 64 * 1024 == max number of registers per SM). This means 512 threads per SM in the worst case
#
# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
#
# Label placed in front of the kernel's final `ret`; the guard clause branches to it so that surplus threads exit early.
const exitJumpLocationMarker = "\$L__BB0_2"
# Builds the PTX source for `expression` in an IOBuffer, in this order: CUDA header, kernel signature,
# register definitions, guard clause, calculation code, exit label with `ret`, closing brace.
# NOTE(review): this span looks like a rendered diff with removed and added lines interleaved
# (two `function transpile` headers, two register-definition calls, two ways of emitting the buffer).
# Confirm against the actual file which lines are live before relying on it.
function transpile(expression::ExpressionProcessing.PostfixType)
function transpile(expression::ExpressionProcessing.PostfixType)::String
ptxBuffer = IOBuffer()
println(ptxBuffer, get_cuda_header())
# The kernel is named "ExpressionProcessing" and is declared with one Int64 and one Float64 parameter
println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64]))
println(ptxBuffer, "{")
# Register definition
println(ptxBuffer, get_register_definitions(1, 5)) # TODO: Actually calculate the number of needed registers and extend to more register kinds
# Parameter loading
# TODO: Actually calculate the number of needed registers and extend to more register kinds
println(ptxBuffer, get_register_definitions(1, 5))
# TODO: Parameter loading
# Emit the early-exit check for threads whose index lies beyond the number of variable sets
println(ptxBuffer, get_guard_clause())
# Plan: create the code top down and keep track of the max number of variables/parameters used (needed for later iterations. See section "Plan" in "PTX_understanding.md")
# return this alongside the generated code
# Generate registers based off of the above number
# Variables have: %var0 to %varn - 1
# Parameters have: %param0 to %paramn - 1
# Code goes here
println(ptxBuffer, generate_calculation_code(expression))
# exit jump location: "<marker>: ret;" is the target of the guard clause's branch
print(ptxBuffer, exitJumpLocationMarker); println(ptxBuffer, ": ret;")
println(ptxBuffer, "}")
# NOTE(review): `take!` drains the buffer, so this line and the assignment below cannot both belong
# to the same version of the function (the second `take!` would yield an empty string).
println(String(take!(ptxBuffer)))
generatedCode = String(take!(ptxBuffer))
# Echo the generated PTX to stdout and hand it back to the caller
println(generatedCode)
return generatedCode
end
# TODO: Make version, target and address_size configurable
@ -150,47 +163,60 @@ end
Constructs the PTX code used for handling the case where too many threads are started.
Assumes the following:
- There are the unused ```32 bit``` registers ```r1, r2, r3, r4 (index of the variable set)```
- There is an unused ```predicate``` register ```p1```
- The ```32 bit``` register ```r5``` contains the number of variable sets
- There are the unused ```32 bit``` registers ```r0, r1, r2, r3 (index of the variable set)```
- There is an unused ```predicate``` register ```p0```
- The ```32 bit``` register ```r4``` contains the number of variable sets
"
function get_guard_clause()::String
# Returns PTX text that computes the global thread index and branches to `exitJumpLocationMarker`
# when that index is not smaller than the number of variable sets.
# NOTE(review): this span looks like a rendered diff; the r1..r5 / p1 lines and the r0..r4 / p0 lines
# appear to be the removed and the added variant of the same instructions. Confirm which set is live.
guardBuffer = IOBuffer()
println(guardBuffer, "mov.u32 %r1, %ntid.x;") # number of threads per CTA (block size)
println(guardBuffer, "mov.u32 %r2, %ctaid.x;") # id of the current CTA (block) within the grid
println(guardBuffer, "mov.u32 %r3, %tid.x;") # id of the current thread within its CTA
println(guardBuffer, "mov.u32 %r0, %ntid.x;") # number of threads per CTA (block size)
println(guardBuffer, "mov.u32 %r1, %ctaid.x;") # id of the current CTA (block) within the grid
println(guardBuffer, "mov.u32 %r2, %tid.x;") # id of the current thread within its CTA
println(guardBuffer, "mad.lo.s32 %r4, %r1, %r2, %r3;") # r4 = r1 * r2 + r3 -> the global thread index (basically index of variable set)
println(guardBuffer, "setp.ge.s32 %p1, %r4, %r5;") # guard clause (p1 = r4 >= r5 -> index >= nrOfVariableSets)
println(guardBuffer, "mad.lo.s32 %r3, %r0, %r1, %r2;") # r3 = r0 * r1 + r2 -> the global thread index (basically index of variable set)
println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p0 = r3 >= r4 -> index >= nrOfVariableSets)
# branch to the exit label if the predicate is true (this thread has no variable set to work on)
print(guardBuffer, "@%p1 bra ")
print(guardBuffer, "@%p0 bra ")
print(guardBuffer, exitJumpLocationMarker)
println(guardBuffer, ";")
return String(take!(guardBuffer))
end
# Builds the `.reg` declarations for the kernel: one line for predicate registers (%p) and one line
# for 32 bit registers (%r). A kind is omitted entirely when its requested count is not positive.
# In PTX, `%name<N>` declares the N registers name0 .. name(N-1), so the printed number is a count,
# not the highest index.
# NOTE(review): this span looks like a rendered diff; the `:::String` header and the `+ 1` prints
# appear to be the removed lines, the `::String` header and the plain counts the added ones. Confirm.
function get_register_definitions(nrPred::Int, nr32Bit::Int):::String
function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
registersBuffer = IOBuffer()
# Predicate registers, e.g. ".reg .pred %p<1>;" declares p0
if nrPred > 0
print(registersBuffer, ".reg .pred")
print(registersBuffer, " %p<")
print(registersBuffer, nrPred + 1)
print(registersBuffer, nrPred)
println(registersBuffer, ">;")
end
# Untyped 32 bit registers, e.g. ".reg .b32 %r<5>;" declares r0 - r4
if nr32Bit > 0
print(registersBuffer, ".reg .b32")
print(registersBuffer, " %r<")
print(registersBuffer, nr32Bit + 1)
print(registersBuffer, nr32Bit)
println(registersBuffer, ">;")
end
return String(take!(registersBuffer))
end
"""
    generate_calculation_code(expression)

Return the PTX instructions that evaluate the postfix `expression`.

The instruction generation itself is not implemented yet: every token is visited, but nothing is
written to the buffer, so the returned string is currently empty.
"""
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
    codeBuffer = IOBuffer()

    for i in eachindex(expression)
        token = expression[i]

        # TODO: read up to the next operator and add the code for the calculation
        # on the first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
        # on all other operations either 1 or 2 (one if unary and two if binary operator)
    end

    # A `for` loop evaluates to `nothing`; without this explicit return the declared `::String`
    # return type would try to convert `nothing` to a String and throw a MethodError.
    return String(take!(codeBuffer))
end
function type_to_cuda_type(type::DataType)::String
if type == Int64
return ".s64"

View File

@ -25,8 +25,17 @@ parameters[2][2] = 0.0
postfixExprs = Vector([postfixExpr])
push!(postfixExprs, expr_to_postfix(expressions[2]))
Transpiler.transpile(postfixExpr)
generatedCode = Transpiler.transpile(postfixExpr)
# CUDA.@sync interpret(postfixExprs, variables, parameters)
# This is just here for testing. This will be called inside the execute method in the Transpiler module
linker = CuLink()
add_data!(linker, "ExpressionProcessing", generatedCode)
image = complete(linker)
mod = CuModule(image)
func = CuFunction(mod, "ExpressionProcessing")
end
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"