small increment and fixes
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
ee3c5001bd
commit
0e24d74e54
|
@ -16,10 +16,10 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
|
|||
)
|
||||
|
||||
{
|
||||
.reg .pred %p<2>; -> predicate registers: p1 (needed for branching)
|
||||
.reg .f32 %f<4>; -> float registers: f1 - f3
|
||||
.reg .b32 %r<6>; -> 32 bits registers: r1 - r5 (bits are actual raw bits without a type)
|
||||
.reg .b64 %rd<11>; -> 64 bits registers: rd1 - rd10
|
||||
.reg .pred %p<2>; -> predicate registers: p0, p1 (needed for branching)
|
||||
.reg .f32 %f<4>; -> float registers: f0 - f3
|
||||
.reg .b32 %r<6>; -> 32 bits registers: r0 - r5 (bits are actual raw bits without a type)
|
||||
.reg .b64 %rd<11>; -> 64 bits registers: rd0 - rd10
|
||||
|
||||
ld.param.u64 %rd1, [VecAdd_kernel_param_0]; -> rd1 = Data1
|
||||
ld.param.u64 %rd2, [VecAdd_kernel_param_1]; -> rd2 = Data2
|
||||
|
|
|
@ -95,26 +95,39 @@ end
|
|||
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
|
||||
# With 128 registers per thread, one could have 16 warps on one SM ((128 * 32 = 4096 registers per warp) * 16 == 64 * 1024 == max number of registers per SM). This means 512 threads per SM in the worst case
|
||||
|
||||
#
|
||||
# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
|
||||
# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
|
||||
#
|
||||
|
||||
const exitJumpLocationMarker = "\$L__BB0_2"
|
||||
function transpile(expression::ExpressionProcessing.PostfixType)
|
||||
function transpile(expression::ExpressionProcessing.PostfixType)::String
|
||||
ptxBuffer = IOBuffer()
|
||||
|
||||
println(ptxBuffer, get_cuda_header())
|
||||
println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64]))
|
||||
println(ptxBuffer, "{")
|
||||
|
||||
# Register definition
|
||||
println(ptxBuffer, get_register_definitions(1, 5)) # TODO: Actually calculate the number of needed registers and extend to more register kinds
|
||||
# Parameter loading
|
||||
# TODO: Actually calculate the number of needed registers and extend to more register kinds
|
||||
println(ptxBuffer, get_register_definitions(1, 5))
|
||||
# TODO: Parameter loading
|
||||
println(ptxBuffer, get_guard_clause())
|
||||
|
||||
# Create the code top-down. Keep track of the max number of variables/parameters used (needed for later iterations; see section "Plan" in "PTX_understanding.md")
|
||||
# return this alongside the generated code
|
||||
# Generate registers based off of the above number
|
||||
# Variables have: %var0 to %var(n-1)
|
||||
# Parameters have: %param0 to %param(n-1)
|
||||
# Code goes here
|
||||
println(ptxBuffer, generate_calculation_code(expression))
|
||||
|
||||
# exit jump location
|
||||
print(ptxBuffer, exitJumpLocationMarker); println(ptxBuffer, ": ret;")
|
||||
println(ptxBuffer, "}")
|
||||
println(String(take!(ptxBuffer)))
|
||||
|
||||
generatedCode = String(take!(ptxBuffer))
|
||||
println(generatedCode)
|
||||
return generatedCode
|
||||
end
|
||||
|
||||
# TODO: Make version, target and address_size configurable
|
||||
|
@ -150,47 +163,60 @@ end
|
|||
Constructs the PTX code used for handling the case where too many threads are started.
|
||||
|
||||
Assumes the following:
|
||||
- There are the unused ```32 bit``` registers ```r1, r2, r3, r4 (index of the variable set)```
|
||||
- There is an unused ```predicate``` register ```p1```
|
||||
- The ```32 bit``` register ```r5``` contains the number of variable sets
|
||||
- There are the unused ```32 bit``` registers ```r0, r1, r2, r3 (index of the variable set)```
|
||||
- There is an unused ```predicate``` register ```p0```
|
||||
- The ```32 bit``` register ```r4``` contains the number of variable sets
|
||||
"
|
||||
function get_guard_clause()::String
|
||||
guardBuffer = IOBuffer()
|
||||
|
||||
println(guardBuffer, "mov.u32 %r1, %ntid.x;") # nr of thread ids
|
||||
println(guardBuffer, "mov.u32 %r2, %ctaid.x;") # nr of threads per cta
|
||||
println(guardBuffer, "mov.u32 %r3, %tid.x;") # id of the current thread
|
||||
println(guardBuffer, "mov.u32 %r0, %ntid.x;") # number of threads per CTA (block size in x)
|
||||
println(guardBuffer, "mov.u32 %r1, %ctaid.x;") # id of the current CTA (block index in x)
|
||||
println(guardBuffer, "mov.u32 %r2, %tid.x;") # id of the current thread
|
||||
|
||||
println(guardBuffer, "mad.lo.s32 %r4, %r1, %r2, %r3;") # the current index (basically index of variable set)
|
||||
println(guardBuffer, "setp.ge.s32 %p1, %r4, %r5;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)
|
||||
println(guardBuffer, "mad.lo.s32 %r3, %r0, %r1, %r2;") # the current index (basically index of variable set)
|
||||
println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p0 = r3 >= r4 -> index >= nrOfVariableSets)
|
||||
|
||||
# branch to end if p0 is true
|
||||
print(guardBuffer, "@%p1 bra ")
|
||||
print(guardBuffer, "@%p0 bra ")
|
||||
print(guardBuffer, exitJumpLocationMarker)
|
||||
println(guardBuffer, ";")
|
||||
|
||||
return String(take!(guardBuffer))
|
||||
end
|
||||
|
||||
function get_register_definitions(nrPred::Int, nr32Bit::Int):::String
|
||||
function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
|
||||
registersBuffer = IOBuffer()
|
||||
|
||||
if nrPred > 0
|
||||
print(registersBuffer, ".reg .pred")
|
||||
print(registersBuffer, " %p<")
|
||||
print(registersBuffer, nrPred + 1)
|
||||
print(registersBuffer, nrPred)
|
||||
println(registersBuffer, ">;")
|
||||
end
|
||||
if nr32Bit > 0
|
||||
print(registersBuffer, ".reg .b32")
|
||||
print(registersBuffer, " %r<")
|
||||
print(registersBuffer, nr32Bit + 1)
|
||||
print(registersBuffer, nr32Bit)
|
||||
println(registersBuffer, ">;")
|
||||
end
|
||||
|
||||
return String(take!(registersBuffer))
|
||||
end
|
||||
|
||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
|
||||
codeBuffer = IOBuffer()
|
||||
|
||||
for i in eachindex(expression)
|
||||
token = expression[i]
|
||||
# read to operator
|
||||
# add code for calculation
|
||||
|
||||
# on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
|
||||
# on all other operations either 1 or 2 (one if unary and two if binary operator)
|
||||
end
|
||||
end
|
||||
|
||||
function type_to_cuda_type(type::DataType)::String
|
||||
if type == Int64
|
||||
return ".s64"
|
||||
|
|
|
@ -25,8 +25,17 @@ parameters[2][2] = 0.0
|
|||
postfixExprs = Vector([postfixExpr])
|
||||
push!(postfixExprs, expr_to_postfix(expressions[2]))
|
||||
|
||||
Transpiler.transpile(postfixExpr)
|
||||
generatedCode = Transpiler.transpile(postfixExpr)
|
||||
# CUDA.@sync interpret(postfixExprs, variables, parameters)
|
||||
|
||||
# This is just here for testing. This will be called inside the execute method in the Transpiler module
|
||||
linker = CuLink()
|
||||
add_data!(linker, "ExpressionProcessing", generatedCode)
|
||||
|
||||
image = complete(linker)
|
||||
|
||||
mod = CuModule(image)
|
||||
func = CuFunction(mod, "ExpressionProcessing")
|
||||
end
|
||||
|
||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
|
||||
|
|
Loading…
Reference in New Issue
Block a user