started implementing transpilation of expression

2024-10-27 11:48:11 +01:00
parent 0e24d74e54
commit 9fc55c4c15
4 changed files with 78 additions and 26 deletions
--- a/PTX_understanding.md
+++ b/PTX_understanding.md
@ -26,9 +26,9 @@ All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.h
 	ld.param.u64    %rd3, [VecAdd_kernel_param_2];	-> rd3 = Result
 	ld.param.u32    %r2, [VecAdd_kernel_param_3]; 	-> r2 = N

-	mov.u32         %r3, %ntid.x;
-	mov.u32         %r4, %ctaid.x;
-	mov.u32         %r5, %tid.x;
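+	mov.u32         %r3, %ntid.x;   -> r3 = ntid.x  (number of threads per block in x)
+	mov.u32         %r4, %ctaid.x;  -> r4 = ctaid.x (index of this block within the grid in x)
+	mov.u32         %r5, %tid.x;    -> r5 = tid.x   (index of this thread within its block in x)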

 	mad.lo.s32      %r1, %r3, %r4, %r5;	-> r3 * r4 (64-bit intermediate product) -> keep the low 32 bits -> add r5 -> r1 = lowest32Bits(r3*r4) + r5