continued understanding given PTX file and made plan on how to approach the transpiler part
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
				
			This commit is contained in:
		
							
								
								
									
										3
									
								
								.github/workflows/CI.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/CI.yml
									
									
									
									
										vendored
									
									
								
							| @ -32,7 +32,8 @@ jobs: | |||||||
|           - x64 |           - x64 | ||||||
|     steps: |     steps: | ||||||
|       - uses: actions/checkout@v4 |       - uses: actions/checkout@v4 | ||||||
|       - uses: cd /package |       - name: Go to package | ||||||
|  |         run: cd ./package | ||||||
|       - uses: julia-actions/setup-julia@v2 |       - uses: julia-actions/setup-julia@v2 | ||||||
|         with: |         with: | ||||||
|           version: ${{ matrix.version }} |           version: ${{ matrix.version }} | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								.github/workflows/CompatHelper.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/CompatHelper.yml
									
									
									
									
										vendored
									
									
								
							| @ -7,6 +7,8 @@ jobs: | |||||||
|   CompatHelper: |   CompatHelper: | ||||||
|     runs-on: ubuntu-latest |     runs-on: ubuntu-latest | ||||||
|     steps: |     steps: | ||||||
|  |       - name: Got to package folder | ||||||
|  |         run: cd ./package | ||||||
|       - name: Pkg.add("CompatHelper") |       - name: Pkg.add("CompatHelper") | ||||||
|         run: julia -e 'using Pkg; Pkg.add("CompatHelper")' |         run: julia -e 'using Pkg; Pkg.add("CompatHelper")' | ||||||
|       - name: CompatHelper.main() |       - name: CompatHelper.main() | ||||||
|  | |||||||
							
								
								
									
										71
									
								
								PTX_understanding.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										71
									
								
								PTX_understanding.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,71 @@ | |||||||
|  | All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions | ||||||
|  |  | ||||||
|  | ``` | ||||||
|  | .version 7.1 | ||||||
|  |  | ||||||
|  | .target sm_52 | ||||||
|  | .address_size 64 | ||||||
|  |  | ||||||
|  | 	// .globl       VecAdd_kernel | ||||||
|  |  | ||||||
|  | .visible .entry VecAdd_kernel( | ||||||
|  | 	.param .u64 VecAdd_kernel_param_0, | ||||||
|  | 	.param .u64 VecAdd_kernel_param_1, | ||||||
|  | 	.param .u64 VecAdd_kernel_param_2, | ||||||
|  | 	.param .u32 VecAdd_kernel_param_3 | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | { | ||||||
|  | 	.reg .pred      %p<2>;		-> predicate registers: p0 - p1 (only p1 is used; needed for branching) | ||||||
|  | 	.reg .f32       %f<4>; 		-> float registers: f0 - f3 (f1 - f3 are used) | ||||||
|  | 	.reg .b32       %r<6>; 		-> 32 bits registers: r0 - r5 (r1 - r5 are used; bits are actual raw bits without a type) | ||||||
|  | 	.reg .b64       %rd<11>; 	-> 64 bits registers: rd0 - rd10 (rd1 - rd10 are used) | ||||||
|  |  | ||||||
|  | 	ld.param.u64    %rd1, [VecAdd_kernel_param_0];	-> rd1 = Data1 | ||||||
|  | 	ld.param.u64    %rd2, [VecAdd_kernel_param_1];	-> rd2 = Data2 | ||||||
|  | 	ld.param.u64    %rd3, [VecAdd_kernel_param_2];	-> rd3 = Result | ||||||
|  | 	ld.param.u32    %r2, [VecAdd_kernel_param_3]; 	-> r2 = N | ||||||
|  |  | ||||||
|  | 	mov.u32         %r3, %ntid.x; | ||||||
|  | 	mov.u32         %r4, %ctaid.x; | ||||||
|  | 	mov.u32         %r5, %tid.x; | ||||||
|  |  | ||||||
|  | 	mad.lo.s32      %r1, %r3, %r4, %r5;	-> r3 * r4 -> keep the low 32 bits of the 64-bit product -> add r5 -> r1 = lo32(r3*r4) + r5 = ntid.x * ctaid.x + tid.x (global thread index i) | ||||||
|  |  | ||||||
|  | 	setp.ge.s32     %p1, %r1, %r2;	-> p1 = r1 >= r2 (setp would assign !p1 to second register if one was given) | ||||||
|  |  | ||||||
|  | 	(guard clause for the case when we start more threads than needed) | ||||||
|  | 	@%p1 bra        \$L__BB0_2;		-> if(p1) then {branch to \$L__BB0_2, i.e. skip the computation} else {fall through and execute}  | ||||||
|  |  | ||||||
|  | 	cvta.to.global.u64      %rd4, %rd1;	-> convert the generic address in rd1 to a global state space address and write it to rd4 | ||||||
|  |  | ||||||
|  | 	mul.wide.s32    %rd5, %r1, 4;			-> rd5 = r1 * 4 (64-bit result; byte offset of element i, because an f32 is 4 bytes) | ||||||
|  | 	add.s64         %rd6, %rd4, %rd5;		-> rd6 = rd4 + rd5 | ||||||
|  | 	cvta.to.global.u64      %rd7, %rd2;		-> same as above cvta | ||||||
|  | 	add.s64         %rd8, %rd7, %rd5;		-> rd8 = rd7 + rd5 | ||||||
|  |  | ||||||
|  | 	ld.global.f32   %f1, [%rd8];		-> f1 = *rd8 (load an f32 from global memory at address rd8, i.e. Data2[i]) | ||||||
|  | 	ld.global.f32   %f2, [%rd6];" * | ||||||
|  | 	op * | ||||||
|  | 	"               %f3, %f2, %f1;		-> custom binary operator | ||||||
|  | 	cvta.to.global.u64      %rd9, %rd3; -> convert the generic address of Result (rd3) to a global address in rd9 (no data is moved) | ||||||
|  |  | ||||||
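|  | 	(this computes the address of Result[i]: rd9 is the global address of Result, rd5 is the byte offset of element i) | ||||||
|  | 	add.s64         %rd10, %rd9, %rd5;	-> rd10 = rd9 + rd5  | ||||||
|  | 	st.global.f32   [%rd10], %f3;		-> *rd10 = f3 (store f3 to global memory: Result[i] = f3; every thread writes its own element, so nothing is overwritten) | ||||||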
|  |  | ||||||
|  | \$L__BB0_2: | ||||||
|  | 	ret; | ||||||
|  | } | ||||||
|  | ``` | ||||||
|  |  | ||||||
|  | The above calculates, for every thread index i = ntid.x * ctaid.x + tid.x with i < N: Result[i] = Data1[i]   CUSTOM_OPERATOR   Data2[i] (the address of each element is base + i * 4, because an f32 is 4 bytes) | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # Plan | ||||||
|  |  | ||||||
|  | 1. Generate PTX that only works with constant values and one expression | ||||||
|  | 1. Add support for loading variables and parameters (get vars/params as parameters -> Result still only one number) | ||||||
|  | 1. Add support for loading variables as matrix (params still only one value -> Result now a vector) | ||||||
|  | 1. Add support for loading parameters as "sparse" matrix (Not much should change) | ||||||
|  | 1. Add support for multiple expressions (Result is now a matrix) | ||||||
							
								
								
									
										76
									
								
								package/src/Transpiler.jl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										76
									
								
								package/src/Transpiler.jl
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,76 @@ | |||||||
|  |  | ||||||
|  | # culoadtest(N, rand(["add.f32", "sub.f32", "mul.f32", "div.approx.f32"])) | ||||||
|  | function culoadtest(N::Int32, op = "add.f32") | ||||||
|  |  | ||||||
|  | 	vadd_code = ".version 7.1 | ||||||
|  | 	 | ||||||
|  | 	.target sm_52 | ||||||
|  | 	.address_size 64 | ||||||
|  | 	 | ||||||
|  | 		// .globl       VecAdd_kernel | ||||||
|  | 	 | ||||||
|  | 	.visible .entry VecAdd_kernel( | ||||||
|  | 		.param .u64 VecAdd_kernel_param_0, | ||||||
|  | 		.param .u64 VecAdd_kernel_param_1, | ||||||
|  | 		.param .u64 VecAdd_kernel_param_2, | ||||||
|  | 		.param .u32 VecAdd_kernel_param_3 | ||||||
|  | 	) | ||||||
|  | 	 | ||||||
|  | 	{ | ||||||
|  | 		.reg .pred      %p<2>; | ||||||
|  | 		.reg .f32       %f<4>; | ||||||
|  | 		.reg .b32       %r<6>; | ||||||
|  | 		.reg .b64       %rd<11>; | ||||||
|  |  | ||||||
|  | 		ld.param.u64    %rd1, [VecAdd_kernel_param_0]; | ||||||
|  | 		ld.param.u64    %rd2, [VecAdd_kernel_param_1]; | ||||||
|  | 		ld.param.u64    %rd3, [VecAdd_kernel_param_2]; | ||||||
|  | 		ld.param.u32    %r2, [VecAdd_kernel_param_3]; | ||||||
|  |  | ||||||
|  | 		mov.u32         %r3, %ntid.x; | ||||||
|  | 		mov.u32         %r4, %ctaid.x; | ||||||
|  | 		mov.u32         %r5, %tid.x; | ||||||
|  |  | ||||||
|  | 		mad.lo.s32      %r1, %r3, %r4, %r5; | ||||||
|  |  | ||||||
|  | 		setp.ge.s32     %p1, %r1, %r2; | ||||||
|  |  | ||||||
|  | 		@%p1 bra        \$L__BB0_2; | ||||||
|  |  | ||||||
|  | 		cvta.to.global.u64      %rd4, %rd1; | ||||||
|  |  | ||||||
|  | 		mul.wide.s32    %rd5, %r1, 4; | ||||||
|  | 		add.s64         %rd6, %rd4, %rd5; | ||||||
|  | 		cvta.to.global.u64      %rd7, %rd2; | ||||||
|  | 		add.s64         %rd8, %rd7, %rd5; | ||||||
|  |  | ||||||
|  | 		ld.global.f32   %f1, [%rd8]; | ||||||
|  | 		ld.global.f32   %f2, [%rd6];" * | ||||||
|  | 		op * | ||||||
|  | 		"               %f3, %f2, %f1; | ||||||
|  | 		cvta.to.global.u64      %rd9, %rd3; | ||||||
|  | 		add.s64         %rd10, %rd9, %rd5; | ||||||
|  | 		st.global.f32   [%rd10], %f3; | ||||||
|  | 	 | ||||||
|  | 	\$L__BB0_2: | ||||||
|  | 		ret; | ||||||
|  | 	}" | ||||||
|  | 	 | ||||||
|  | 		linker = CuLink() | ||||||
|  | 		add_data!(linker, "VecAdd_kernel", vadd_code) | ||||||
|  | 	 | ||||||
|  | 		image = complete(linker) | ||||||
|  | 	 | ||||||
|  | 		mod = CuModule(image) | ||||||
|  | 		func = CuFunction(mod, "VecAdd_kernel") | ||||||
|  | 	 | ||||||
|  | 		d_a = CUDA.fill(1.0f0, N) | ||||||
|  | 		d_b = CUDA.fill(2.0f0, N) | ||||||
|  | 		d_c = CUDA.fill(0.0f0, N) | ||||||
|  | 	 | ||||||
|  | 		# Grid/Block configuration | ||||||
|  | 		threadsPerBlock = 256; | ||||||
|  | 		blocksPerGrid  = (N + threadsPerBlock - 1) ÷ threadsPerBlock; | ||||||
|  | 	 | ||||||
|  | 		@time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint}, d_a, d_b, d_c, N; threads=threadsPerBlock, blocks=blocksPerGrid) | ||||||
|  | 	end | ||||||
		Reference in New Issue
	
	Block a user