started implementing transpilation of expression

2024-10-27 11:48:11 +01:00
parent 0e24d74e54
commit 9fc55c4c15
4 changed files with 78 additions and 26 deletions
--- a/PTX_understanding.md
+++ b/PTX_understanding.md
@ -26,9 +26,9 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
 	ld.param.u64    %rd3, [VecAdd_kernel_param_2];	-> rd3 = Result
 	ld.param.u32    %r2, [VecAdd_kernel_param_3]; 	-> r2 = N

-	mov.u32         %r3, %ntid.x;
-	mov.u32         %r4, %ctaid.x;
-	mov.u32         %r5, %tid.x;
+	mov.u32         %r3, %ntid.x;   -> initialise r3 with ntid.x
+	mov.u32         %r4, %ctaid.x;  -> same as above
+	mov.u32         %r5, %tid.x;    -> same as above 

 	mad.lo.s32      %r1, %r3, %r4, %r5;	-> r3 * r4 (64-bit intermediate) -> keep lowest 32 bits -> add r5 -> r1 = lowest32Bits(r3*r4) + r5

--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -22,12 +22,6 @@ function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variab
 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float64}(undef, variableCols, length(expressions))

-	# println("cudaVars")
-	# println(cudaVars)
-	# println("cudaParams")
-	# println(cudaParams)
-	# println("cudaExprs")
-	# println(cudaExprs)
 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	for i in eachindex(expressions)
 		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@ -50,7 +44,6 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
 	firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
 	lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
 	firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
-	# lastParamIndex = firstParamIndex + stepsize[2] - 1 # Inclusive (probably not needed)
 	variableCols = length(variables) / stepsize[3]

 	operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -100,6 +100,7 @@ end
 # Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
 #

+# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 const exitJumpLocationMarker = "\$L__BB0_2"
 function transpile(expression::ExpressionProcessing.PostfixType)::String
 	ptxBuffer = IOBuffer()
@ -109,7 +110,7 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
 	println(ptxBuffer, "{")

 	# TODO: Actually calculate the number of needed registers and extend to more register kinds
-	println(ptxBuffer, get_register_definitions(1, 5)) 
+	println(ptxBuffer, get_register_definitions(1, 5, 1))  # apparently I can define registers anywhere. This might make things easier
 	# TODO: Parameter loading
 	println(ptxBuffer, get_guard_clause())

@ -147,7 +148,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):

 	
 	for i in eachindex(parameters)
-		type = type_to_cuda_type(parameters[i])
+		type = type_to_ptx_type(parameters[i])
 		print(signatureBuffer, 
 			  "  .param ", type, " ", kernelName, "_param_", i)
 		if i != lastindex(parameters)
@ -178,46 +179,78 @@ function get_guard_clause()::String
 	println(guardBuffer, "setp.ge.s32    %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)

 	# branch to end if p0 is true
-	print(guardBuffer, "@%p0 bra    ")
-	print(guardBuffer, exitJumpLocationMarker)
-	println(guardBuffer, ";")
+	print(guardBuffer, "@%p0 bra    $exitJumpLocationMarker;")

 	return String(take!(guardBuffer))
 end

-function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
+function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::String
 	registersBuffer = IOBuffer()

 	if nrPred > 0
-		print(registersBuffer, ".reg .pred")
-		print(registersBuffer, "    %p<")
-		print(registersBuffer, nrPred)
-		println(registersBuffer, ">;")
+		println(registersBuffer, ".reg .pred   %p<$nrPred>;")
 	end
 	if nr32Bit > 0
-		print(registersBuffer, ".reg .b32")
-		print(registersBuffer, "    %r<")
-		print(registersBuffer, nr32Bit)
-		println(registersBuffer, ">;")
+		println(registersBuffer, ".reg .b32    %r<$nr32Bit>;")
+	end
+	if nrFloat64 > 0
+		println(registersBuffer, ".reg .f64    %f<$nrFloat64>;")
 	end

 	return String(take!(registersBuffer))
 end

+# TODO: Don't convert the expression to postfix! This does not seem to be the best way, since postfix evaluation assumes a stack
+# onto which intermediate results get pushed back. That, however, is not the best behaviour for this kind of calculation.
+# Probably do this: Get Expr -> traverse tree -> if a child node is an Expr: replace that node with the register containing the result of that Expr
+
+# Generates the PTX instructions that evaluate `expression` (given in postfix notation) and returns them as one string.
+# Every operator emits exactly one instruction of the form `<op> %f<n>, <left>, <right>;` where each source operand is
+# either a Float64 literal or the register holding the result of an earlier instruction.
+#
+# Current assumption: Expression only made out of constant values and binary operators (unary operators are not handled yet).
+# Tokens that are neither FLOAT64 nor OPERATOR are ignored.
+# Throws an ArgumentError if an operator is reached while fewer than two operands are available.
 function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
 	codeBuffer = IOBuffer()
+	# Operand stack of the postfix evaluation. Entries are already in PTX text form: a literal such as "3.0" or a result register such as "%f0"
+	operands = Vector{String}()
 
+	registerCounter = 0
+	@debug "Generating calculation code" expression
 	for i in eachindex(expression)
 		token = expression[i]
+
+		if token.Type == FLOAT64
+			push!(operands, string(reinterpret(Float64, token.Value)))
+		elseif token.Type == OPERATOR
+			operator = get_ptx_operator(reinterpret(Operator, token.Value))
+			if length(operands) < 2
+				throw(ArgumentError("Operator '$operator' needs two operands but only $(length(operands)) are available"))
+			end
+
+			# The right operand was pushed last, so it has to be popped first
+			right = pop!(operands)
+			left = pop!(operands)
+			resultRegister = "%f$registerCounter"
+			println(codeBuffer, "    $operator $resultRegister, $left, $right;")
+
+			# The result becomes an operand for the following operators
+			push!(operands, resultRegister)
+			registerCounter += 1
+		end
+
 		# read to operator
 		# add code for calculation
 
 		# on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
 		# on all other operations either 1 or 2 (one if unary and two if binary operator)
 	end
+
+	return String(take!(codeBuffer))
 end

-function type_to_cuda_type(type::DataType)::String
+function type_to_ptx_type(type::DataType)::String
 	if type == Int64
 		return ".s64"
 	elseif type == Float64
@ -227,6 +260,32 @@ function type_to_cuda_type(type::DataType)::String
 	end
 end

+# Maps `operator` to the mnemonic of the single PTX instruction that performs it on `.f64` operands.
+# Returns an empty string for operators that cannot be expressed as one f64 instruction (POWER, EXP, LOG).
+# Throws an ArgumentError for every operator that is not listed here.
+#
+# TODO: Probably change this, to return the entire calculation not just the operator. Because for POWER, EXP and LOG we need multiple instructions to calculate them.
+# Left out for now since I don't have register management yet
+function get_ptx_operator(operator::Operator)::String
+	if operator == ADD
+		return "add.f64"
+	elseif operator == SUBTRACT
+		return "sub.f64"
+	elseif operator == MULTIPLY
+		return "mul.f64"
+	elseif operator == DIVIDE
+		return "div.rn.f64" # f64 division has no .approx variant, a rounding modifier is mandatory
+	elseif operator == POWER
+		return "" # placeholder: needs multiple instructions
+	elseif operator == ABS
+		return "abs.f64"
+	elseif operator == LOG
+		# placeholder: lg2 only exists as lg2.approx.f32 and computes the base 2 logarithm, so more than one instruction is needed
+		# NOTE(review): presumably LOG is meant to be the natural logarithm - confirm when implementing
+		return ""
+	elseif operator == EXP
+		return "" # placeholder: needs multiple instructions
+	elseif operator == SQRT
+		return "sqrt.rn.f64" # f64 square root has no .approx variant, a rounding modifier is mandatory
+	else
+		throw(ArgumentError("Operator conversion to ptx not implemented for $operator"))
+	end
+end
+

 end

--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@ -7,7 +7,7 @@ variables = Matrix{Float64}(undef, 2,2)
 parameters = Vector{Vector{Float64}}(undef, 2)

 # Resulting value should be 15/7 (approx. 2.1429) for the first expression
-expressions[1] = :(x1 + 1 * x2 + p1)
+expressions[1] = :(1 + 3 * 5 / 7 - 1)
 expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
 variables[1,1] = 2.0
 variables[2,1] = 3.0