started implementing transpilation of expression

2024-10-27 11:48:11 +01:00
parent 0e24d74e54
commit 9fc55c4c15
4 changed files with 78 additions and 26 deletions
--- a/PTX_understanding.md
+++ b/PTX_understanding.md
@ -26,9 +26,9 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
 	ld.param.u64    %rd3, [VecAdd_kernel_param_2];	-> rd3 = Result
 	ld.param.u32    %r2, [VecAdd_kernel_param_3]; 	-> r2 = N
-	mov.u32         %r3, %ntid.x;
+	mov.u32         %r3, %ntid.x;   -> initialise r3 with ntid.x
-	mov.u32         %r4, %ctaid.x;
+	mov.u32         %r4, %ctaid.x;  -> same as above
-	mov.u32         %r5, %tid.x;
+	mov.u32         %r5, %tid.x;    -> same as above 
 	mad.lo.s32      %r1, %r3, %r4, %r5;	-> r3 * r4 -> keep the low half (lowest 32 bits) of the 64-bit product -> add r5 -> r1 = lowest32Bits(r3*r4) + r5
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -22,12 +22,6 @@ function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variab
 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float64}(undef, variableCols, length(expressions))
 	# println("cudaVars")
 	# println(cudaVars)
 	# println("cudaParams")
 	# println(cudaParams)
 	# println("cudaExprs")
 	# println(cudaExprs)
 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	for i in eachindex(expressions)
 		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
@ -50,7 +44,6 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
 	firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
 	lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
 	firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
 	# lastParamIndex = firstParamIndex + stepsize[2] - 1 # Inclusive (probably not needed)
 	variableCols = length(variables) / stepsize[3]
 	operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -100,6 +100,7 @@ end
 # Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
 #
 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 const exitJumpLocationMarker = "\$L__BB0_2"
 function transpile(expression::ExpressionProcessing.PostfixType)::String
 	ptxBuffer = IOBuffer()
@ -109,7 +110,7 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
 	println(ptxBuffer, "{")
 	# TODO: Actually calculate the number of needed registers and extend to more register kinds
-	println(ptxBuffer, get_register_definitions(1, 5)) 
+	println(ptxBuffer, get_register_definitions(1, 5, 1))  # apparently I can define registers anywhere. This might make things easier
 	# TODO: Parameter loading
 	println(ptxBuffer, get_guard_clause())
@ -147,7 +148,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
 	for i in eachindex(parameters)
-		type = type_to_cuda_type(parameters[i])
+		type = type_to_ptx_type(parameters[i])
 		print(signatureBuffer, 
 			  "  .param ", type, " ", kernelName, "_param_", i)
 		if i != lastindex(parameters)
@ -178,46 +179,78 @@ function get_guard_clause()::String
 	println(guardBuffer, "setp.ge.s32    %p0, %r3, %r4;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)
 	# branch to end if p1 is true
-	print(guardBuffer, "@%p0 bra    ")
+	print(guardBuffer, "@%p0 bra    $exitJumpLocationMarker;")
 	print(guardBuffer, exitJumpLocationMarker)
 	println(guardBuffer, ";")
 	return String(take!(guardBuffer))
 end
 # Builds the PTX `.reg` declaration lines and returns them as one String.
 # A declaration is only written for a register kind whose requested count is greater than zero:
 #   nrPred    -> `.pred` registers named %p
 #   nr32Bit   -> `.b32` registers named %r
 #   nrFloat64 -> `.f64` registers named %f
-function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
+function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::String
 	registersBuffer = IOBuffer()
 	if nrPred > 0
-		print(registersBuffer, ".reg .pred")
+		println(registersBuffer, ".reg .pred   %p<$nrPred>;")
 		# NOTE(review): as shown here, the three unmarked lines below would write the "%p<n>;" part a second time,
 		# right after the complete declaration above. Presumably they belong to the removed side of this hunk - confirm.
 		print(registersBuffer, "    %p<")
 		print(registersBuffer, nrPred)
 		println(registersBuffer, ">;")
 	end
 	if nr32Bit > 0
-		print(registersBuffer, ".reg .b32")
+		println(registersBuffer, ".reg .b32    %r<$nr32Bit>;")
-		print(registersBuffer, "    %r<")
+	end
-		print(registersBuffer, nr32Bit)
+	if nrFloat64 > 0
-		println(registersBuffer, ">;")
+		println(registersBuffer, ".reg .f64    %f<$nrFloat64>;")
 	end
 	# Hand the accumulated text back to the caller
 	return String(take!(registersBuffer))
 end
 # TODO: Don't convert the expression to postfix! This does not seem to be the best way, since postfix evaluation assumes a stack
 # onto which results get pushed back. That, however, is not the best behaviour for this kind of calculation.
 # Probably do this: Get Expr -> traverse tree -> if child node is Expr: basically replace that node with the register containing the result of that Expr
 # Current assumption: Expression only made out of constant values
 # Generates the PTX instructions that evaluate a postfix expression and returns them as a String.
 #
 # The expression is walked left to right with an operand stack that holds the *text* of each operand:
 # a FLOAT64 token pushes its literal, an OPERATOR token pops its sources, emits one instruction of the form
 #     <mnemonic> %f<n>, <source>[, <source>];
 # and pushes the destination register %f<n>, so that later operators consume intermediate results.
 # ABS, LOG, EXP and SQRT take one source, every other operator takes two.
 #
 # Current assumption (unchanged): the expression consists of constant values only. Tokens of any other type are skipped.
 #
 # Throws an ArgumentError if an operator has no PTX mnemonic or if the stack holds too few operands for it.
 function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
 	codeBuffer = IOBuffer()
 	# Operand texts: either a literal ("3.0") or the register that holds an earlier result ("%f0")
 	operands = Vector{String}()
 	registerCounter = 0
 	for token in expression
 		if token.Type == FLOAT64
 			push!(operands, string(reinterpret(Float64, token.Value)))
 		elseif token.Type == OPERATOR
 			operatorKind = reinterpret(Operator, token.Value)
 			mnemonic = get_ptx_operator(operatorKind)
 			# An empty mnemonic would produce a malformed instruction line, so fail loudly instead
 			isempty(mnemonic) && throw(ArgumentError("No PTX instruction available for operator $operatorKind"))
 			arity = operatorKind in (ABS, LOG, EXP, SQRT) ? 1 : 2
 			if length(operands) < arity
 				throw(ArgumentError("Operator $operatorKind needs $arity operand(s) but only $(length(operands)) are available"))
 			end
 			# Remove the sources from the top of the stack while keeping their left-to-right order
 			firstSource = length(operands) - arity + 1
 			sources = splice!(operands, firstSource:lastindex(operands))
 			destination = "%f$registerCounter"
 			println(codeBuffer, "    $mnemonic $destination, ", join(sources, ", "), ";")
 			# The result becomes an operand for the operators that follow
 			push!(operands, destination)
 			registerCounter += 1
 		end
 	end
 	return String(take!(codeBuffer))
 end
-function type_to_cuda_type(type::DataType)::String
+function type_to_ptx_type(type::DataType)::String
 	if type == Int64
 		return ".s64"
 	elseif type == Float64
@ -227,6 +260,32 @@ function type_to_cuda_type(type::DataType)::String
 	end
 end
 # TODO: Probably change this, to return the entire calculation not just the operator. Because for POWER and EXP we need multiple instructions to calculate them.
 # Left out for now since I don't have register management yet
 # Returns the PTX instruction mnemonic (including the `.f64` type suffix) for the given operator.
 #
 # Throws an ArgumentError for operators that cannot be expressed as a single instruction yet (POWER, EXP)
 # and for operators that are not known to this function.
 function get_ptx_operator(operator::Operator)::String
 	if operator == ADD
 		return "add.f64"
 	elseif operator == SUBTRACT
 		return "sub.f64"
 	elseif operator == MULTIPLY
 		return "mul.f64"
 	elseif operator == DIVIDE
 		# `div.approx` only exists for .f32; the .f64 form requires a rounding modifier
 		return "div.rn.f64"
 	elseif operator == ABS
 		return "abs.f64"
 	elseif operator == LOG
 		# NOTE(review): `lg2` is the base-2 logarithm and the PTX ISA seems to list it for .f32 only - confirm
 		# whether LOG needs a multi-instruction sequence as well.
 		return "lg2.approx.f64"
 	elseif operator == SQRT
 		# `sqrt.approx` only exists for .f32; the .f64 form requires a rounding modifier
 		return "sqrt.rn.f64"
 	elseif operator == POWER || operator == EXP
 		# Previously an empty string was returned here, which silently produced a malformed instruction line
 		throw(ArgumentError("Operator $operator needs multiple ptx instructions, which is not implemented yet"))
 	else
 		throw(ArgumentError("Operator conversion to ptx not implemented for $operator"))
 	end
 end
 end
--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@ -7,7 +7,7 @@ variables = Matrix{Float64}(undef, 2,2)
 parameters = Vector{Vector{Float64}}(undef, 2)
 # Resulting value should be 15/7 (approx. 2.142857) for the first expression (1 + 3 * 5 / 7 - 1)
-expressions[1] = :(x1 + 1 * x2 + p1)
+expressions[1] = :(1 + 3 * 5 / 7 - 1)
 expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
 variables[1,1] = 2.0
 variables[2,1] = 3.0