added guard clause generation
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
CompatHelper / CompatHelper (push) Has been cancelled

This commit is contained in:
2024-09-28 11:41:13 +02:00
parent d875fc7325
commit 7283082699
5 changed files with 98 additions and 33 deletions

View File

@ -44,8 +44,8 @@ end
# TODO: Add @inbounds to all indexing once everything has been verified to work, see https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
const MAX_STACK_SIZE = 25 # The maximum number of values an expression can have, i.e. constant values, variables and parameters
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float64}, parameters::CuDeviceArray{Float64}, results::CuDeviceArray{Float64}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
stride = gridDim().x * blockDim().x
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x
stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive

View File

@ -78,36 +78,83 @@ function culoadtest(N::Int32, op = "add.f32")
@time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint}, d_a, d_b, d_c, N; threads=threadsPerBlock, blocks=blocksPerGrid)
end
# PTX label written in front of the kernel's final `ret`; the guard clause branches to it.
const exitJumpLocationMarker = "\$L__BB0_2"
"""
    transpile(expression::ExpressionProcessing.PostfixType)

Assemble the PTX skeleton of the kernel and print it to `stdout`.

The skeleton consists of the module header, the kernel signature, the guard clause and the
exit label followed by `ret`. `expression` is not used yet; nothing is returned.
"""
function transpile(expression::ExpressionProcessing.PostfixType)
    kernel = IOBuffer()

    # Prologue: module header, entry point signature and the opening brace of the body.
    prologue = (
        get_cuda_header(),
        get_kernel_signature("ExpressionProcessing", [Int64, Float64]),
        "{",
    )
    for part in prologue
        println(kernel, part)
    end

    # Register definition (not generated yet)
    # Parameter loading (not generated yet)
    println(kernel, get_guard_clause())
    # Code goes here (not generated yet)

    # Epilogue: the exit jump location the guard clause branches to, then the closing brace.
    println(kernel, exitJumpLocationMarker, ": ret;")
    println(kernel, "}")

    println(String(take!(kernel)))
end
# TODO: Make version, target and address_size configurable
function get_cuda_header()::String
return "
.version 7.1
.target sm_52
.address_size 64
"
.version 7.1
.target sm_52
.address_size 64
"
end
function get_kernel_signature(kernelName::String, parameters::Vector{Type})::String
signature = ".visible .entry " * kernelName
stringBuilder = IOBuffer()
print(stringBuilder, "(")
function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::String
signatureBuffer = IOBuffer()
print(signatureBuffer, ".visible .entry ")
print(signatureBuffer, kernelName)
println(signatureBuffer, "(")
for i in eachindex(parameters)
type = type_to_cuda_type(parameters[i])
print(stringBuilder,
".param ", type, " ", kernelName, "_param_", i, ",")
print(signatureBuffer,
" .param ", type, " ", kernelName, "_param_", i)
if i != lastindex(parameters)
println(signatureBuffer, ",")
end
end
print(stringBuilder, ")")
return String(take!(stringBuilder))
print(signatureBuffer, ")")
return String(take!(signatureBuffer))
end
function type_to_cuda_type(type::Type)::String
"""
    get_guard_clause()::String

Build the PTX code that handles the case where more threads are started than needed:
such threads branch straight to the exit jump location.

The generated code assumes the following:
- The `32 bit` registers `r1, r2, r3, r4` are unused
- The `predicate` register `p1` is unused
- The `32 bit` register `r5` contains the number of variable sets
"""
function get_guard_clause()::String
    instructions = (
        "mov.u32 %r1, %ntid.x;",          # number of threads per block (CTA)
        "mov.u32 %r2, %ctaid.x;",         # index of the current block (CTA)
        "mov.u32 %r3, %tid.x;",           # index of the current thread within its block
        "mad.lo.s32 %r4, %r1, %r2, %r3;", # r4 = ntid.x * ctaid.x + tid.x (index of the variable set)
        "setp.ge.s32 %p1, %r4, %r5;",     # p1 = r4 >= r5 -> index >= nrOfVariableSets
        "@%p1 bra " * exitJumpLocationMarker * ";", # branch to the exit if p1 is true
    )
    # Every instruction sits on its own line, including the last one.
    return join(instructions, "\n") * "\n"
end
function type_to_cuda_type(type::DataType)::String
if type == Int64
return ".s64"
elseif type == Float64