started implementing transpilation of expression
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled

This commit is contained in:
Daniel 2024-10-27 11:48:11 +01:00
parent 0e24d74e54
commit 9fc55c4c15
4 changed files with 78 additions and 26 deletions

View File

@ -26,9 +26,9 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
ld.param.u64 %rd3, [VecAdd_kernel_param_2]; -> rd3 = Result ld.param.u64 %rd3, [VecAdd_kernel_param_2]; -> rd3 = Result
ld.param.u32 %r2, [VecAdd_kernel_param_3]; -> r2 = N ld.param.u32 %r2, [VecAdd_kernel_param_3]; -> r2 = N
mov.u32 %r3, %ntid.x; mov.u32 %r3, %ntid.x; -> initialise r3 with ntid.x
mov.u32 %r4, %ctaid.x; mov.u32 %r4, %ctaid.x; -> same as above
mov.u32 %r5, %tid.x; mov.u32 %r5, %tid.x; -> same as above
mad.lo.s32 %r1, %r3, %r4, %r5; -> r3 * r4 -> keep the lower 32 bits of the 64-bit product -> add r5 -> r1 = low32Bits(r3*r4) + r5 mad.lo.s32 %r1, %r3, %r4, %r5; -> r3 * r4 -> keep the lower 32 bits of the 64-bit product -> add r5 -> r1 = low32Bits(r3*r4) + r5

View File

@ -22,12 +22,6 @@ function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variab
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
cudaResults = CuArray{Float64}(undef, variableCols, length(expressions)) cudaResults = CuArray{Float64}(undef, variableCols, length(expressions))
# println("cudaVars")
# println(cudaVars)
# println("cudaParams")
# println(cudaParams)
# println("cudaExprs")
# println(cudaExprs)
# Start kernel for each expression to ensure that no warp is working on different expressions # Start kernel for each expression to ensure that no warp is working on different expressions
for i in eachindex(expressions) for i in eachindex(expressions)
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i) kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@ -50,7 +44,6 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
# lastParamIndex = firstParamIndex + stepsize[2] - 1 # Inclusive (probably not needed)
variableCols = length(variables) / stepsize[3] variableCols = length(variables) / stepsize[3]
operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage

View File

@ -100,6 +100,7 @@ end
# Note: Maybe make an additional function that transpiles and executed the code. This would then be the function the user calls # Note: Maybe make an additional function that transpiles and executed the code. This would then be the function the user calls
# #
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
const exitJumpLocationMarker = "\$L__BB0_2" const exitJumpLocationMarker = "\$L__BB0_2"
function transpile(expression::ExpressionProcessing.PostfixType)::String function transpile(expression::ExpressionProcessing.PostfixType)::String
ptxBuffer = IOBuffer() ptxBuffer = IOBuffer()
@ -109,7 +110,7 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
println(ptxBuffer, "{") println(ptxBuffer, "{")
# TODO: Actually calculate the number of needed registers and extend to more register kinds # TODO: Actually calculate the number of needed registers and extend to more register kinds
println(ptxBuffer, get_register_definitions(1, 5)) println(ptxBuffer, get_register_definitions(1, 5, 1)) # apparently I can define registers anywhere. This might make things easier
# TODO: Parameter loading # TODO: Parameter loading
println(ptxBuffer, get_guard_clause()) println(ptxBuffer, get_guard_clause())
@ -147,7 +148,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
for i in eachindex(parameters) for i in eachindex(parameters)
type = type_to_cuda_type(parameters[i]) type = type_to_ptx_type(parameters[i])
print(signatureBuffer, print(signatureBuffer,
" .param ", type, " ", kernelName, "_param_", i) " .param ", type, " ", kernelName, "_param_", i)
if i != lastindex(parameters) if i != lastindex(parameters)
@ -178,46 +179,78 @@ function get_guard_clause()::String
println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets) println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)
# branch to end if p1 is true # branch to end if p1 is true
print(guardBuffer, "@%p0 bra ") print(guardBuffer, "@%p0 bra $exitJumpLocationMarker;")
print(guardBuffer, exitJumpLocationMarker)
println(guardBuffer, ";")
return String(take!(guardBuffer)) return String(take!(guardBuffer))
end end
function get_register_definitions(nrPred::Int, nr32Bit::Int)::String function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::String
registersBuffer = IOBuffer() registersBuffer = IOBuffer()
if nrPred > 0 if nrPred > 0
print(registersBuffer, ".reg .pred") println(registersBuffer, ".reg .pred %p<$nrPred>;")
print(registersBuffer, " %p<")
print(registersBuffer, nrPred)
println(registersBuffer, ">;")
end end
if nr32Bit > 0 if nr32Bit > 0
print(registersBuffer, ".reg .b32") println(registersBuffer, ".reg .b32 %r<$nr32Bit>;")
print(registersBuffer, " %r<") end
print(registersBuffer, nr32Bit) if nrFloat64 > 0
println(registersBuffer, ">;") println(registersBuffer, ".reg .f64 %f<$nrFloat64>;")
end end
return String(take!(registersBuffer)) return String(take!(registersBuffer))
end end
# TODO: Don't convert the expression to postfix! This does not seem to be the best approach, since postfix evaluation assumes a stack-based calculation
# where results get pushed back onto the stack. That, however, is not the best behaviour for this kind of calculation.
# Probably do this instead: Get Expr -> traverse tree -> if a child node is an Expr: basically replace that node with the register containing the result of that Expr
# Current assumption: The expression is only made up of constant values
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
codeBuffer = IOBuffer() codeBuffer = IOBuffer()
operands = Vector{Float64}()
registerCounter = 0
println(expression)
for i in eachindex(expression) for i in eachindex(expression)
token = expression[i] token = expression[i]
if token.Type == FLOAT64
push!(operands, reinterpret(Float64, token.Value))
elseif token.Type == OPERATOR
operator = get_ptx_operator(reinterpret(Operator, token.Value))
print(codeBuffer, " $operator %f$registerCounter ")
# Ugly temporary proof of concept which is ignoring unary operators
if length(operands) == 0
print(codeBuffer, "%f")
print(codeBuffer, registerCounter - 2) # add result before previous result
end
print(codeBuffer, " ")
if length(operands) <= 1
print(codeBuffer, "%f")
print(codeBuffer, registerCounter - 1) # add previous result
end
print(codeBuffer, " ")
ops = last(operands, 2)
pop!(operands);pop!(operands)
print(codeBuffer, join(ops, ", ")) # if operands has too few values it means the previous calculation is needed. So we need to use registerCounter - 1 or registerCounter - 2 previous registers
println(codeBuffer, ";")
# empty!(operands)
registerCounter += 1
end
# read to operator # read to operator
# add code for calculation # add code for calculation
# on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator) # on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
# on all other operations either 1 or 2 (one if unary and two if binary operator) # on all other operations either 1 or 2 (one if unary and two if binary operator)
end end
return String(take!(codeBuffer))
end end
function type_to_cuda_type(type::DataType)::String function type_to_ptx_type(type::DataType)::String
if type == Int64 if type == Int64
return ".s64" return ".s64"
elseif type == Float64 elseif type == Float64
@ -227,6 +260,32 @@ function type_to_cuda_type(type::DataType)::String
end end
end end
# TODO: Probably change this to return the entire calculation, not just the operator, because POWER and EXP need multiple instructions to calculate them.
# Left out for now since there is no register management yet.
"""
    get_ptx_operator(operator::Operator)::String

Return the PTX instruction mnemonic (including the `.f64` type suffix) used for `operator`.

# Returns
- The mnemonic as a `String`, e.g. `"add.f64"`.
- An empty string for `POWER` and `EXP`, which are placeholders until multi-instruction
  sequences can be generated.

# Throws
- `ArgumentError` if `operator` is none of the operators handled below.
"""
function get_ptx_operator(operator::Operator)::String
    if operator == ADD
        return "add.f64"
    elseif operator == SUBTRACT
        return "sub.f64"
    elseif operator == MULTIPLY
        return "mul.f64"
    elseif operator == DIVIDE
        # PTX defines the `.approx`/`.full` division variants only for `.f32`;
        # for `.f64` an explicit rounding modifier is required.
        return "div.rn.f64"
    elseif operator == POWER
        return "" # placeholder: needs more than one instruction (see TODO above)
    elseif operator == ABS
        return "abs.f64"
    elseif operator == LOG
        # NOTE(review): the PTX ISA documents `lg2` only as `lg2.approx{.ftz}.f32`, and it
        # computes the base-2 logarithm. A correct Float64 logarithm will most likely need a
        # multi-instruction sequence like POWER and EXP. Kept unchanged until that exists.
        return "lg2.approx.f64"
    elseif operator == EXP
        return "" # placeholder: needs more than one instruction (see TODO above)
    elseif operator == SQRT
        # `sqrt.approx` exists only for `.f32`; `.f64` requires a rounding modifier.
        return "sqrt.rn.f64"
    else
        throw(ArgumentError("Operator conversion to ptx not implemented for $operator"))
    end
end
end end

View File

@ -7,7 +7,7 @@ variables = Matrix{Float64}(undef, 2,2)
parameters = Vector{Vector{Float64}}(undef, 2) parameters = Vector{Vector{Float64}}(undef, 2)
# Resulting value should be 10 for the first expression # Resulting value should be 10 for the first expression
expressions[1] = :(x1 + 1 * x2 + p1) expressions[1] = :(1 + 3 * 5 / 7 - 1)
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2) expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
variables[1,1] = 2.0 variables[1,1] = 2.0
variables[2,1] = 3.0 variables[2,1] = 3.0