started implementing transpilation of expression
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
This commit is contained in:
parent
0e24d74e54
commit
9fc55c4c15
|
@ -26,9 +26,9 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
|
|||
ld.param.u64 %rd3, [VecAdd_kernel_param_2]; -> rd3 = Result
|
||||
ld.param.u32 %r2, [VecAdd_kernel_param_3]; -> r2 = N
|
||||
|
||||
mov.u32 %r3, %ntid.x;
|
||||
mov.u32 %r4, %ctaid.x;
|
||||
mov.u32 %r5, %tid.x;
|
||||
mov.u32 %r3, %ntid.x; -> initialise r3 with ntid.x
|
||||
mov.u32 %r4, %ctaid.x; -> same as above
|
||||
mov.u32 %r5, %tid.x; -> same as above
|
||||
|
||||
mad.lo.s32 %r1, %r3, %r4, %r5; -> r3 * r4 (64-bit intermediate product) -> keep the lower 32 bits (the lower half) -> add r5 -> r1 = lowest32Bits(r3*r4) + r5
|
||||
|
||||
|
|
|
@ -22,12 +22,6 @@ function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variab
|
|||
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
||||
cudaResults = CuArray{Float64}(undef, variableCols, length(expressions))
|
||||
|
||||
# println("cudaVars")
|
||||
# println(cudaVars)
|
||||
# println("cudaParams")
|
||||
# println(cudaParams)
|
||||
# println("cudaExprs")
|
||||
# println(cudaExprs)
|
||||
# Start kernel for each expression to ensure that no warp is working on different expressions
|
||||
for i in eachindex(expressions)
|
||||
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
|
||||
|
@ -50,7 +44,6 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
|
|||
firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
|
||||
lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
|
||||
firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
|
||||
# lastParamIndex = firstParamIndex + stepsize[2] - 1 # Inclusive (probably not needed)
|
||||
variableCols = length(variables) / stepsize[3]
|
||||
|
||||
operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage
|
||||
|
|
|
@ -100,6 +100,7 @@ end
|
|||
# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
|
||||
#
|
||||
|
||||
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
|
||||
const exitJumpLocationMarker = "\$L__BB0_2"
|
||||
function transpile(expression::ExpressionProcessing.PostfixType)::String
|
||||
ptxBuffer = IOBuffer()
|
||||
|
@ -109,7 +110,7 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
|
|||
println(ptxBuffer, "{")
|
||||
|
||||
# TODO: Actually calculate the number of needed registers and extend to more register kinds
|
||||
println(ptxBuffer, get_register_definitions(1, 5))
|
||||
println(ptxBuffer, get_register_definitions(1, 5, 1)) # apparently I can define registers anywhere. This might make things easier
|
||||
# TODO: Parameter loading
|
||||
println(ptxBuffer, get_guard_clause())
|
||||
|
||||
|
@ -147,7 +148,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
|
|||
|
||||
|
||||
for i in eachindex(parameters)
|
||||
type = type_to_cuda_type(parameters[i])
|
||||
type = type_to_ptx_type(parameters[i])
|
||||
print(signatureBuffer,
|
||||
" .param ", type, " ", kernelName, "_param_", i)
|
||||
if i != lastindex(parameters)
|
||||
|
@ -178,46 +179,78 @@ function get_guard_clause()::String
|
|||
println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)
|
||||
|
||||
# branch to end if p0 is true
|
||||
print(guardBuffer, "@%p0 bra ")
|
||||
print(guardBuffer, exitJumpLocationMarker)
|
||||
println(guardBuffer, ";")
|
||||
print(guardBuffer, "@%p0 bra $exitJumpLocationMarker;")
|
||||
|
||||
return String(take!(guardBuffer))
|
||||
end
|
||||
|
||||
function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
|
||||
function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::String
|
||||
registersBuffer = IOBuffer()
|
||||
|
||||
if nrPred > 0
|
||||
print(registersBuffer, ".reg .pred")
|
||||
print(registersBuffer, " %p<")
|
||||
print(registersBuffer, nrPred)
|
||||
println(registersBuffer, ">;")
|
||||
println(registersBuffer, ".reg .pred %p<$nrPred>;")
|
||||
end
|
||||
if nr32Bit > 0
|
||||
print(registersBuffer, ".reg .b32")
|
||||
print(registersBuffer, " %r<")
|
||||
print(registersBuffer, nr32Bit)
|
||||
println(registersBuffer, ">;")
|
||||
println(registersBuffer, ".reg .b32 %r<$nr32Bit>;")
|
||||
end
|
||||
if nrFloat64 > 0
|
||||
println(registersBuffer, ".reg .f64 %f<$nrFloat64>;")
|
||||
end
|
||||
|
||||
return String(take!(registersBuffer))
|
||||
end
|
||||
|
||||
# TODO: Don't convert the expression to postfix! It seems like this is not the best way, since postfix evaluation assumes that the calculation is done on a stack
|
||||
# where results get pushed back to the stack. This however is not the best behaviour for this kind of calculation.
|
||||
# Probably do this: Get Expr -> traverse tree -> if child node is Expr: basically replace that node with the register containing the result of that Expr
|
||||
|
||||
# Current assumption: Expression only made out of constant values
|
||||
"""
    generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String

Generate the PTX instructions that evaluate the postfix `expression`.

The expression is walked from left to right while an operand stack is maintained.
Constants are pushed as literals. Every operator pops its operands, emits one
instruction that writes to a fresh `%f<n>` register and pushes the name of that
register, so later operators can consume earlier results in the correct order.

Current assumption: the expression consists only of `FLOAT64` constants and
`OPERATOR` tokens. Tokens of any other type are skipped.

# Returns
One PTX instruction per operator, each terminated by `;` and a newline. The string is
empty if the expression contains no operator.

# Throws
- `ArgumentError`: if an operator does not find enough operands on the stack
  (malformed postfix expression).
"""
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
    codeBuffer = IOBuffer()
    # Each entry is either a constant literal or the name of the register that holds
    # the result of a previously emitted instruction.
    operands = String[]
    registerCounter = 0

    @debug "Generating calculation code" expression
    for i in eachindex(expression)
        token = expression[i]

        if token.Type == FLOAT64
            push!(operands, string(reinterpret(Float64, token.Value)))
        elseif token.Type == OPERATOR
            operator = reinterpret(Operator, token.Value)
            # ABS, LOG, EXP and SQRT take one operand, every other operator takes two
            arity = operator in (ABS, LOG, EXP, SQRT) ? 1 : 2
            if length(operands) < arity
                throw(ArgumentError(
                    "Malformed postfix expression: operator $operator needs $arity " *
                    "operand(s), but only $(length(operands)) are available"))
            end

            # Remove the topmost `arity` operands while keeping their left-to-right order
            firstArgument = length(operands) - arity + 1
            arguments = splice!(operands, firstArgument:lastindex(operands))

            resultRegister = "%f$registerCounter"
            println(codeBuffer, "  ", get_ptx_operator(operator), " ", resultRegister, ", ",
                join(arguments, ", "), ";")

            # The result becomes an operand for the operators that follow
            push!(operands, resultRegister)
            registerCounter += 1
        end
    end

    return String(take!(codeBuffer))
end
|
||||
|
||||
function type_to_cuda_type(type::DataType)::String
|
||||
function type_to_ptx_type(type::DataType)::String
|
||||
if type == Int64
|
||||
return ".s64"
|
||||
elseif type == Float64
|
||||
|
@ -227,6 +260,32 @@ function type_to_cuda_type(type::DataType)::String
|
|||
end
|
||||
end
|
||||
|
||||
# TODO: Probably change this, to return the entire calculation not just the operator. Because for POWER and EXP we need multiple instructions to calculate them.
|
||||
# Left out for now since I don't have register management yet
|
||||
"""
    get_ptx_operator(operator::Operator)::String

Return the PTX instruction mnemonic (including the `.f64` type suffix) for `operator`.

`POWER` and `EXP` currently return an empty string as a placeholder, because they
cannot be expressed with a single PTX instruction.

# Throws
- `ArgumentError`: if no conversion is implemented for `operator`.
"""
function get_ptx_operator(operator::Operator)::String
    if operator == ADD
        return "add.f64"
    elseif operator == SUBTRACT
        return "sub.f64"
    elseif operator == MULTIPLY
        return "mul.f64"
    elseif operator == DIVIDE
        # `div.approx` only exists for f32; f64 division requires a rounding modifier
        return "div.rn.f64"
    elseif operator == POWER
        return "" # placeholder: needs multiple instructions
    elseif operator == ABS
        return "abs.f64"
    elseif operator == LOG
        # NOTE(review): PTX only defines `lg2.approx` for f32 and it computes the
        # logarithm to base 2. A Float64 natural logarithm needs a multi-instruction
        # sequence, like POWER and EXP. Left unchanged until that exists.
        return "lg2.approx.f64"
    elseif operator == EXP
        return "" # placeholder: needs multiple instructions
    elseif operator == SQRT
        # `sqrt.approx` only exists for f32; f64 square root requires a rounding modifier
        return "sqrt.rn.f64"
    else
        throw(ArgumentError("Operator conversion to ptx not implemented for $operator"))
    end
end
|
||||
|
||||
|
||||
end
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ variables = Matrix{Float64}(undef, 2,2)
|
|||
parameters = Vector{Vector{Float64}}(undef, 2)
|
||||
|
||||
# Resulting value should be 10 for the first expression
|
||||
expressions[1] = :(x1 + 1 * x2 + p1)
|
||||
expressions[1] = :(1 + 3 * 5 / 7 - 1)
|
||||
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
|
||||
variables[1,1] = 2.0
|
||||
variables[2,1] = 3.0
|
||||
|
|
Loading…
Reference in New Issue
Block a user