From ee3c5001bdbf96a2de636187d8ad1903cd32a60f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sun, 20 Oct 2024 12:24:18 +0200 Subject: [PATCH] added information on how to best approach register assignment --- package/src/Transpiler.jl | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 029a3d5..983c301 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -78,6 +78,24 @@ function culoadtest(N::Int32, op = "add.f32") @time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint}, d_a, d_b, d_c, N; threads=threadsPerBlock, blocks=blocksPerGrid) end +# Number of threads per block/SM + max number of registers +# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications +# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM) +# One SM can have 64*1024 32-bit registers at max +# One thread can at max use 255 registers +# Meaning each thread has access to at most 32 registers in the worst case (64*1024 / 2048 = 32). Using 64-bit values this number gets halved (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section)) + +# I think I will go with max 16 registers for now and leave a better register allocation technique for future work +# Maybe helpful for future tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread + +# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level +# This states that using fewer registers allows more threads to reside on a single SM which improves performance. +# So I could use more registers at the expense of performance. Depending on how this would simplify my algorithm, I might do this and leave more optimisation to future work + +# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. 
of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment +# With 128 registers per thread, one could have 16 warps on one SM ((128 * 32 = 4096 registers per warp) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case + + const exitJumpLocationMarker = "\$L__BB0_2" function transpile(expression::ExpressionProcessing.PostfixType) ptxBuffer = IOBuffer()