From ee3c5001bdbf96a2de636187d8ad1903cd32a60f Mon Sep 17 00:00:00 2001
From: Daniel <danwip@hotmail.com>
Date: Sun, 20 Oct 2024 12:24:18 +0200
Subject: [PATCH] added information on how to best approach register assignment

---
 package/src/Transpiler.jl | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl
index 029a3d5..983c301 100644
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -78,6 +78,24 @@ function culoadtest(N::Int32, op = "add.f32")
 		@time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint}, d_a, d_b, d_c, N; threads=threadsPerBlock, blocks=blocksPerGrid)
 end
 
+# Number of threads per block/SM + max number of registers
+# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
+# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
+# One SM has at most 64*1024 32-bit registers
+# One thread can use at most 255 registers
+# Meaning each thread has access to at most 32 registers in the worst case (64*1024 / 2048 = 32). Using 64-bit values this number gets halved (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
+
+# I think I will go with max 16 registers for now and leave a better register allocation technique for future work
+# Maybe helpful for future tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
+
+# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
+# This states that using fewer registers allows more threads to reside on a single SM, which improves performance.
+# So I could use more registers at the expense of performance. Depending on how much this would simplify my algorithm, I might do this and leave further optimisation to future work
+
+# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
+# With 128 registers per thread, one could have 16 warps (of 32 threads each) on one SM ((128 * 32 = 4096) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
+
+
 const exitJumpLocationMarker = "\$L__BB0_2"
 function transpile(expression::ExpressionProcessing.PostfixType)
 	ptxBuffer = IOBuffer()