diff --git a/PTX_understanding.md b/PTX_understanding.md
index dd0b9e2..3ef432b 100644
--- a/PTX_understanding.md
+++ b/PTX_understanding.md
@@ -26,9 +26,9 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
     ld.param.u64 %rd3, [VecAdd_kernel_param_2]; -> rd3 = Result
     ld.param.u32 %r2, [VecAdd_kernel_param_3]; -> r2 = N
-    mov.u32 %r3, %ntid.x;
-    mov.u32 %r4, %ctaid.x;
-    mov.u32 %r5, %tid.x;
+    mov.u32 %r3, %ntid.x; -> initialise r3 with ntid.x
+    mov.u32 %r4, %ctaid.x; -> same as above
+    mov.u32 %r5, %tid.x; -> same as above
     mad.lo.s32 %r1, %r3, %r4, %r5; -> r3 * r4 -> keep the lowest 32 bits of the 64-bit product -> add r5 -> r1 = lowest32Bits(r3*r4) + r5
diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl
index 99b36ab..44fd9d6 100644
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@@ -22,12 +22,6 @@ function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variab
     # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
     cudaResults = CuArray{Float64}(undef, variableCols, length(expressions))
 
-    # println("cudaVars")
-    # println(cudaVars)
-    # println("cudaParams")
-    # println(cudaParams)
-    # println("cudaExprs")
-    # println(cudaExprs)
     # Start kernel for each expression to ensure that no warp is working on different expressions
     for i in eachindex(expressions)
         kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@@ -50,7 +44,6 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
     firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
     lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
     firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
-    # lastParamIndex = firstParamIndex + stepsize[2] - 1 # Inclusive (probably not needed)
 
     variableCols = length(variables) / stepsize[3]
     operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage
diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl
index 5b14eaa..eeb4c31 100644
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -100,6 +100,7 @@ end
 
 # Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
 #
+# To increase performance, it would probably be best for all helper functions to return their IOBuffer and not a String
 const exitJumpLocationMarker = "\$L__BB0_2"
 function transpile(expression::ExpressionProcessing.PostfixType)::String
     ptxBuffer = IOBuffer()
@@ -109,7 +110,7 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
     println(ptxBuffer, "{")
 
     # TODO: Actually calculate the number of needed registers and extend to more register kinds
-    println(ptxBuffer, get_register_definitions(1, 5))
+    println(ptxBuffer, get_register_definitions(1, 5, 1)) # apparently I can define registers anywhere. This might make things easier
 
     # TODO: Parameter loading
     println(ptxBuffer, get_guard_clause())
@@ -147,7 +148,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
     for i in eachindex(parameters)
-        type = type_to_cuda_type(parameters[i])
+        type = type_to_ptx_type(parameters[i])
         print(signatureBuffer, " .param ", type, " ", kernelName, "_param_", i)
 
         if i != lastindex(parameters)
@@ -178,46 +179,78 @@ function get_guard_clause()::String
     println(guardBuffer, "setp.ge.s32 %p0, %r3, %r4;") # guard clause (p0 = r3 >= r4 -> index >= nrOfVariableSets)
 
     # branch to end if p0 is true
-    print(guardBuffer, "@%p0 bra ")
-    print(guardBuffer, exitJumpLocationMarker)
-    println(guardBuffer, ";")
+    print(guardBuffer, "@%p0 bra $exitJumpLocationMarker;")
 
     return String(take!(guardBuffer))
 end
 
-function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
+function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::String
     registersBuffer = IOBuffer()
 
     if nrPred > 0
-        print(registersBuffer, ".reg .pred")
-        print(registersBuffer, " %p<")
-        print(registersBuffer, nrPred)
-        println(registersBuffer, ">;")
+        println(registersBuffer, ".reg .pred %p<$nrPred>;")
     end
     if nr32Bit > 0
-        print(registersBuffer, ".reg .b32")
-        print(registersBuffer, " %r<")
-        print(registersBuffer, nr32Bit)
-        println(registersBuffer, ">;")
+        println(registersBuffer, ".reg .b32 %r<$nr32Bit>;")
+    end
+    if nrFloat64 > 0
+        println(registersBuffer, ".reg .f64 %f<$nrFloat64>;")
     end
 
     return String(take!(registersBuffer))
 end
 
+# TODO: Don't convert the expression to postfix! This does not seem to be the best approach, since postfix evaluation assumes a stack
+# where intermediate results are pushed back onto the stack. That is not the best behaviour for this kind of code generation.
+# Probably do this: Get Expr -> traverse tree -> if a child node is an Expr: replace that node with the register containing the result of that Expr
+
+# Current assumption: the expression is made up of constant values only
 function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
     codeBuffer = IOBuffer()
+    operands = Vector{Float64}()
+    registerCounter = 0
+    println(expression)
     for i in eachindex(expression)
         token = expression[i]
+
+        if token.Type == FLOAT64
+            push!(operands, reinterpret(Float64, token.Value))
+        elseif token.Type == OPERATOR
+            operator = get_ptx_operator(reinterpret(Operator, token.Value))
+            print(codeBuffer, " $operator %f$registerCounter ")
+
+            # Ugly temporary proof of concept that ignores unary operators
+            if length(operands) == 0
+                print(codeBuffer, "%f")
+                print(codeBuffer, registerCounter - 2) # add result before previous result
+            end
+            print(codeBuffer, " ")
+            if length(operands) <= 1
+                print(codeBuffer, "%f")
+                print(codeBuffer, registerCounter - 1) # add previous result
+            end
+            print(codeBuffer, " ")
+
+            ops = last(operands, 2)
+            pop!(operands);pop!(operands)
+            print(codeBuffer, join(ops, ", ")) # if operands has too few values, the result of a previous calculation is needed, i.e. the registers registerCounter - 1 or registerCounter - 2
+            println(codeBuffer, ";")
+            # empty!(operands)
+            registerCounter += 1
+        end
+
         # read to operator
         # add code for calculation
         # on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
         # on all other operations either 1 or 2 (one if unary and two if binary operator)
     end
+
+    return String(take!(codeBuffer))
 end
 
-function type_to_cuda_type(type::DataType)::String
+function type_to_ptx_type(type::DataType)::String
     if type == Int64
         return ".s64"
     elseif type == Float64
@@ -227,6 +260,32 @@ function type_to_cuda_type(type::DataType)::String
     end
 end
 
+# TODO: Probably change this to return the entire calculation and not just the operator, because POWER and EXP need multiple instructions to calculate them.
+# Left out for now since I don't have register management yet
+function get_ptx_operator(operator::Operator)::String
+    if operator == ADD
+        return "add.f64"
+    elseif operator == SUBTRACT
+        return "sub.f64"
+    elseif operator == MULTIPLY
+        return "mul.f64"
+    elseif operator == DIVIDE
+        return "div.approx.f64"
+    elseif operator == POWER
+        return ""
+    elseif operator == ABS
+        return "abs.f64"
+    elseif operator == LOG
+        return "lg2.approx.f64"
+    elseif operator == EXP
+        return ""
+    elseif operator == SQRT
+        return "sqrt.approx.f64"
+    else
+        throw(ArgumentError("Operator conversion to ptx not implemented for $operator"))
+    end
+end
+
 end
diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl
index 5b396a6..3f5b1ee 100644
--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@@ -7,7 +7,7 @@ variables = Matrix{Float64}(undef, 2,2)
 parameters = Vector{Vector{Float64}}(undef, 2)
 
 # Resulting value of the first expression should be 1 + 3*5/7 - 1 = 15/7 ≈ 2.143
-expressions[1] = :(x1 + 1 * x2 + p1)
+expressions[1] = :(1 + 3 * 5 / 7 - 1)
 expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
 variables[1,1] = 2.0
 variables[2,1] = 3.0
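
Note on the PTX_understanding.md hunk above: %ntid.x, %ctaid.x and %tid.x are the PTX special registers for blockDim.x, blockIdx.x and threadIdx.x, so the mov/mad sequence computes the global thread index. A small Julia illustration; the concrete values are made up for the example:

    # Illustration only, values are made up; mapping of the special registers:
    #   %ntid.x -> blockDim.x, %ctaid.x -> blockIdx.x, %tid.x -> threadIdx.x
    ntid  = 256                  # threads per block (%ntid.x  -> r3)
    ctaid = 3                    # block index       (%ctaid.x -> r4)
    tid   = 17                   # thread in block   (%tid.x   -> r5)
    index = ntid * ctaid + tid   # what mad.lo.s32 %r1, %r3, %r4, %r5 computes -> 785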
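
The TODO added to generate_calculation_code suggests dropping the postfix form and walking the Expr tree directly, replacing every child Expr with the register that holds its result. A minimal Julia sketch of that idea, assuming binary operators and constant leaves only; the operator table (taken from get_ptx_operator above), the function name and the register handling are illustrative and not part of the package:

    # Sketch of the tree-walking code generation described in the TODO above.
    # Assumptions: binary operators only, constant leaves, constants emitted as plain
    # decimal literals (real PTX needs proper f64 immediates and register management).
    const PTX_OPS = Dict(Symbol("+") => "add.f64", Symbol("-") => "sub.f64",
                         Symbol("*") => "mul.f64", Symbol("/") => "div.approx.f64")

    # Emits code for `ex` into `buffer` and returns the operand holding its value:
    # a register name for sub-expressions, a literal for constants.
    function generate_code!(buffer::IOBuffer, ex, registerCounter::Ref{Int})::String
        if ex isa Expr
            operands = [generate_code!(buffer, arg, registerCounter) for arg in ex.args[2:end]]
            resultRegister = "%f$(registerCounter[])"
            registerCounter[] += 1
            println(buffer, PTX_OPS[ex.args[1]], " ", resultRegister, ", ", join(operands, ", "), ";")
            return resultRegister
        else
            return string(Float64(ex))
        end
    end

    buffer = IOBuffer()
    generate_code!(buffer, :(1 + 3 * 5 / 7 - 1), Ref(0)) # the new first expression from TranspilerTests.jl
    print(String(take!(buffer)))
    # mul.f64 %f0, 3.0, 5.0;
    # div.approx.f64 %f1, %f0, 7.0;
    # add.f64 %f2, 1.0, %f1;
    # sub.f64 %f3, %f2, 1.0;

Because every sub-expression ends up in its own register, no operand stack is needed; register reuse can be added later once register management exists.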