From 0e24d74e54f7718102f30f6b24bc6317b33f7ba7 Mon Sep 17 00:00:00 2001
From: Daniel <danwip@hotmail.com>
Date: Sat, 26 Oct 2024 11:41:00 +0200
Subject: [PATCH] small increment and fixes

---
 PTX_understanding.md            |  8 ++---
 package/src/Transpiler.jl       | 60 +++++++++++++++++++++++----------
 package/test/TranspilerTests.jl | 11 +++++-
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/PTX_understanding.md b/PTX_understanding.md
index c30b1cf..dd0b9e2 100644
--- a/PTX_understanding.md
+++ b/PTX_understanding.md
@@ -16,10 +16,10 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
 )
 
 {
-	.reg .pred      %p<2>;		-> predicate registers: p1 (needed for branching)
-	.reg .f32       %f<4>; 		-> float registers: f1 - f3
-	.reg .b32       %r<6>; 		-> 32 bits registers: r1 - r5 (bits are actual raw bits without a type)
-	.reg .b64       %rd<11>; 	-> 64 bits registers: rd1 - rd10
+	.reg .pred      %p<2>;		-> predicate registers: p0, p1 (needed for branching)
+	.reg .f32       %f<4>; 		-> float registers: f0 - f3
+	.reg .b32       %r<6>; 		-> 32 bits registers: r0 - r5 (bits are actual raw bits without a type)
+	.reg .b64       %rd<11>; 	-> 64 bits registers: rd0 - rd10
 
 	ld.param.u64    %rd1, [VecAdd_kernel_param_0];	-> rd1 = Data1
 	ld.param.u64    %rd2, [VecAdd_kernel_param_1];	-> rd2 = Data2
diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl
index 983c301..5b14eaa 100644
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -95,26 +95,39 @@ end
 # Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
 # With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case
 
+#
+# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
+# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
+#
 
 const exitJumpLocationMarker = "\$L__BB0_2"
-function transpile(expression::ExpressionProcessing.PostfixType)
+function transpile(expression::ExpressionProcessing.PostfixType)::String
 	ptxBuffer = IOBuffer()
 
 	println(ptxBuffer, get_cuda_header())
 	println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64]))
 	println(ptxBuffer, "{")
 
-	# Register definition
-	println(ptxBuffer, get_register_definitions(1, 5)) # TODO: Actually calculate the number of needed registers and extend to more register kinds
-	# Parameter loading
+	# TODO: Actually calculate the number of needed registers and extend to more register kinds
+	println(ptxBuffer, get_register_definitions(1, 5)) 
+	# TODO: Parameter loading
 	println(ptxBuffer, get_guard_clause())
 
+	# Plan: create the code top-down and keep track of the max number of variables/parameters used (needed for later iterations. See section "Plan" in "PTX_understanding.md")
+	# Return this number alongside the generated code
+	# Generate the registers based on this number
+	# Variables are named: %var0 to %var(n-1)
+	# Parameters are named: %param0 to %param(n-1)
 	# Code goes here
+	println(ptxBuffer, generate_calculation_code(expression))
 
 	# exit jump location
 	print(ptxBuffer, exitJumpLocationMarker); println(ptxBuffer, ": ret;")
 	println(ptxBuffer, "}")
-	println(String(take!(ptxBuffer)))
+
+	generatedCode = String(take!(ptxBuffer))
+	println(generatedCode) # print the generated PTX code to stdout
+	return generatedCode
 end
 
 # TODO: Make version, target and address_size configurable
@@ -150,47 +163,60 @@ end
 Constructs the PTX code used for handling the case where too many threads are started.
 
 Assumes the following:
- - There are the unused ```32 bit``` registers ```r1, r2, r3, r4 (index of the variable set)```
- - There is an unused ```predicate``` register ```p1```
- - The ```32 bit``` register ```r5``` contains the number of variable sets
+ - There are the unused ```32 bit``` registers ```r0, r1, r2, r3 (index of the variable set)```
+ - There is an unused ```predicate``` register ```p0```
+ - The ```32 bit``` register ```r4``` contains the number of variable sets
 "
 function get_guard_clause()::String
 	guardBuffer = IOBuffer()
 
-	println(guardBuffer, "mov.u32    %r1, %ntid.x;") # nr of thread ids
-	println(guardBuffer, "mov.u32    %r2, %ctaid.x;") # nr of threads per cta
-	println(guardBuffer, "mov.u32    %r3, %tid.x;") # id of the current thread
+	println(guardBuffer, "mov.u32    %r0, %ntid.x;") # nr of threads per cta
+	println(guardBuffer, "mov.u32    %r1, %ctaid.x;") # id of the current cta
+	println(guardBuffer, "mov.u32    %r2, %tid.x;") # id of the current thread inside its cta
 
-	println(guardBuffer, "mad.lo.s32    %r4, %r1, %r2, %r3;") # the current index (basically index of variable set)
-	println(guardBuffer, "setp.ge.s32    %p1, %r4, %r5;") # guard clause (p1 = r4 > r5 -> index > nrOfVariableSets)
+	println(guardBuffer, "mad.lo.s32    %r3, %r0, %r1, %r2;") # the current index: r3 = r0 * r1 + r2 (basically index of variable set)
+	println(guardBuffer, "setp.ge.s32    %p0, %r3, %r4;") # guard clause (p0 = r3 >= r4 -> index >= nrOfVariableSets)
 
 	# branch to end if p1 is true
-	print(guardBuffer, "@%p1 bra    ")
+	print(guardBuffer, "@%p0 bra    ")
 	print(guardBuffer, exitJumpLocationMarker)
 	println(guardBuffer, ";")
 
 	return String(take!(guardBuffer))
 end
 
-function get_register_definitions(nrPred::Int, nr32Bit::Int):::String
+function get_register_definitions(nrPred::Int, nr32Bit::Int)::String
 	registersBuffer = IOBuffer()
 
 	if nrPred > 0
 		print(registersBuffer, ".reg .pred")
 		print(registersBuffer, "    %p<")
-		print(registersBuffer, nrPred + 1)
+		print(registersBuffer, nrPred) # "%p<n>" declares the n registers %p0 to %p(n-1)
 		println(registersBuffer, ">;")
 	end
 	if nr32Bit > 0
 		print(registersBuffer, ".reg .b32")
 		print(registersBuffer, "    %r<")
-		print(registersBuffer, nr32Bit + 1)
+		print(registersBuffer, nr32Bit) # "%r<n>" declares the n registers %r0 to %r(n-1)
 		println(registersBuffer, ">;")
 	end
 
 	return String(take!(registersBuffer))
 end
 
+# Generates the PTX code for evaluating `expression`. Still a stub: no instructions are emitted yet, so "" is returned.
+function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
+	codeBuffer = IOBuffer()
+	for i in eachindex(expression)
+		token = expression[i]
+		# TODO: read up to the next operator, then add the code for that calculation to codeBuffer
+		# on first iteration this would be either 2 or 3 steps (two if unary and three if binary operator)
+		# on all other operations either 1 or 2 (one if unary and two if binary operator)
+	end
+	# Explicit return needed: a `for` loop evaluates to `nothing`, which cannot be converted to the declared `String`
+	return String(take!(codeBuffer))
+end
+
 function type_to_cuda_type(type::DataType)::String
 	if type == Int64
 		return ".s64"
diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl
index c244081..5b396a6 100644
--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@@ -25,8 +25,17 @@ parameters[2][2] = 0.0
 	postfixExprs = Vector([postfixExpr])
 	push!(postfixExprs, expr_to_postfix(expressions[2]))
 
-	Transpiler.transpile(postfixExpr)
+	generatedCode = Transpiler.transpile(postfixExpr)
 	# CUDA.@sync interpret(postfixExprs, variables, parameters)
+
+	# This is just here for testing. This will be called inside the execute method in the Transpiler module
+	linker = CuLink()
+	add_data!(linker, "ExpressionProcessing", generatedCode)
+
+	image = complete(linker)
+
+	mod = CuModule(image)
+	func = CuFunction(mod, "ExpressionProcessing")
 end
 
 #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"