transpiler: trying to fix problem with writing to global memory; not yet fixed

2025-03-27 18:00:31 +01:00
parent eaee21ca75
commit 561b37160b
2 changed files with 108 additions and 43 deletions
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -3,23 +3,9 @@ using CUDA
 using ..ExpressionProcessing
 using ..Utils

-# Number of threads per block/SM + max number of registers
 # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
-# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
-# One SM can have 64*1024 32-bit registers at max
-# One thread can at max use 255 registers
-# Meaning one has access to at most 32 registers in the worst case. Using 64 bit values this number gets halfed (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
-
-# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
-
-# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
-# This states, that using fewer registers allows more threads to reside on a single SM which improves performance. 
-# So I could use more registers at the expense for performance. Depending on how this would simplify my algorithm, I might do this and leave more optimisation to future work
-
-# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
-# With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case
-

+const BYTES = sizeof(Float32)
 const Operand = Union{Float32, String} # Operand is either fixed value or register
 cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed

@ -41,6 +27,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# 	mod = CuModule(image)
 	# 	kernels[i] = CuFunction(mod, "ExpressionProcessing")
 	# end
+
 	for i in eachindex(expressions)
 		if haskey(cache, expressions[i])
 			kernels[i] = cache[expressions[i]]
@ -50,6 +37,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
 		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i)
 		
+		println(kernel)
+
 		linker = CuLink()
 		add_data!(linker, "ExpressionProcessing", kernel)
 		
@ -65,6 +54,12 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet

 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
+	# cudaResults = CUDA.zeros(variableCols * length(expressions))
+	# ptr = CuPtr{Float32}(C_NULL)
+	# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
+	# values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
+	# CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10)
+	# copyto!(cudaResults, z)

 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for i in eachindex(kernels)
@ -72,9 +67,14 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		threads = min(variableCols, config.threads)
 		blocks = cld(variableCols, threads)

-		cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-		break
+		# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+		# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+		launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
 	end
+	
+	println(Array(cudaResults))
+	# cudaResults = nothing
+	# CUDA.cuMemFree(ptr)
 end

 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
@ -89,16 +89,20 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	exitJumpLocationMarker = "\$L__BB0_2"
 	ptxBuffer = IOBuffer()

+	# TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation
+	reset_registers()
+	reset_symtable()
+
 	# TODO: Suboptimal solution
-	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params
-	guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
+	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results
+	guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets)

 	println(ptxBuffer, get_cuda_header())
 	println(ptxBuffer, signature)
 	println(ptxBuffer, "{")


-	calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2", 
+	calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0", 
 										  threadIdReg, expressionIndex, nrOfVariableSets)
 	println(ptxBuffer, get_register_definitions())
 	println(ptxBuffer, paramLoading)
@ -116,7 +120,7 @@ end
 # TODO: Make version, target and address_size configurable; also see what address_size means exactly
 function get_cuda_header()::String
 	return "
-.version 8.0
+.version 7.1
 .target sm_61
 .address_size 32
 "
@ -135,7 +139,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
 	for i in eachindex(parameters)
 		print(signatureBuffer, "  .param .u32", " ", "param_", i)

-		parametersLocation = get_next_free_register("r")
+		parametersLocation = get_next_free_register("i")
 		println(paramLoadingBuffer, "ld.param.u32   $parametersLocation, [param_$i];")
 		println(paramLoadingBuffer, "cvta.to.global.u32   $(get_next_free_register("parameter")), $parametersLocation;")
 		if i != lastindex(parameters)
@ -165,12 +169,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple
 	println(guardBuffer, "mov.u32    $currentThreadId, %tid.x;")

 	globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
-	breakCondition = get_next_free_register("p")
+	# breakCondition = get_next_free_register("p")
 	println(guardBuffer, "mad.lo.s32     $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
-	println(guardBuffer, "setp.ge.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
+	# println(guardBuffer, "setp.ge.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets

 	# branch to end if breakCondition is true
-	print(guardBuffer, "@$breakCondition bra    $exitJumpLocation;")
+	# print(guardBuffer, "@$breakCondition bra    $exitJumpLocation;")

 	return (String(take!(guardBuffer)), globalThreadId)
 end
@ -181,6 +185,9 @@ end
 function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer, 
 								   parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, 
 								   threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
+
+	return "st.global.f32  [$resultsLocation], 10.0;"
+
 	codeBuffer = IOBuffer()
 	operands = Vector{Operand}()

@ -223,15 +230,15 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,

 	# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case)
 	# resultsLocation[resultIndex] = "";
-	tempReg = get_next_free_register("i")
-	println(codeBuffer, "
-	add.u32        $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
-	mul.lo.u32     $tempReg, $tempReg, $(sizeof(expressionIndex));
-	add.u32        $tempReg, $resultsLocation, $tempReg;
-	st.global.f32  [$tempReg], $(pop!(operands));
-	")
+	# tempReg = get_next_free_register("i")
+	# println(codeBuffer, "
+	# add.u32        $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
+	# mul.lo.u32     $tempReg, $tempReg, $BYTES;
+	# add.u32        $tempReg, $resultsLocation, $tempReg;
+	# st.global.f32  [$tempReg], $(pop!(operands));
+	# ")
+	println(codeBuffer, "st.global.f32  [$resultsLocation], 10.0;")

-	println(operands)
 	return String(take!(codeBuffer))
 end

@ -248,10 +255,9 @@ function load_into_register(register::String, loadLocation::String, valueIndex::
 	# loadLocation + startIndex + valueIndex * bytes (4 in our case)
 	# startIndex: setIndex * setSize
 	tempReg = get_next_free_register("i")
-	# we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes
 	return "
 	mul.lo.u32 $tempReg, $setIndexReg, $setSize;
-	add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex));
+	add.u32 $tempReg, $tempReg, $(valueIndex*BYTES);
 	add.u32 $tempReg, $loadLocation, $tempReg;
 	ld.global.f32 $register, [$tempReg];"
 end
@ -316,6 +322,12 @@ end
 let registers = Dict() # stores the count of the register already used.
 	global get_next_free_register
 	global get_register_definitions
+	global reset_registers
+
+	# Removes all stored register counts so a new transpilation starts with a clean state.
+	# The existing Dict is emptied in place instead of rebinding `registers`: assigning to a
+	# variable captured from the enclosing `let` inside a closure makes Julia box that
+	# variable, which slows down every other closure that reads it.
+	# Returns the (now empty) Dict, like the previous implementation did.
+	function reset_registers()
+		return empty!(registers)
+	end
+

 	# By convention these names correspond to the following types:
 	# - p -> pred
@ -363,6 +375,11 @@ end

 let symtable = Dict()
 	global get_register_for_name
+	global reset_symtable
+
+	# Removes all entries from the symbol table so names from a previous transpilation are
+	# not reused. The existing Dict is emptied in place instead of rebinding `symtable`:
+	# assigning to a variable captured from the enclosing `let` inside a closure makes Julia
+	# box that variable, which slows down every other closure that reads it.
+	# Returns the (now empty) Dict, like the previous implementation did.
+	function reset_symtable()
+		return empty!(symtable)
+	end

 	"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
 	function get_register_for_name(varName::String)
--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@ -27,18 +27,18 @@ parameters[2][2] = 0.0
 	push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))

 	# generatedCode = Transpiler.transpile(postfixExpr)
-	generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
-	println(generatedCode)
+	# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
+	# println(generatedCode)
 	# CUDA.@sync interpret(postfixExprs, variables, parameters)

 	# This is just here for testing. This will be called inside the execute method in the Transpiler module
-	linker = CuLink()
-	add_data!(linker, "ExpressionProcessing", generatedCode)
+	# linker = CuLink()
+	# add_data!(linker, "ExpressionProcessing", generatedCode)

-	image = complete(linker)
+	# image = complete(linker)

-	mod = CuModule(image)
-	func = CuFunction(mod, "ExpressionProcessing")
+	# mod = CuModule(image)
+	# func = CuFunction(mod, "ExpressionProcessing")
 end

@testset "Test transpiler evaluation" begin
@ -46,7 +46,55 @@ end
 	# push!(postfixExprs, expressions[1])
 	# push!(postfixExprs, expressions[2])

-	@time Transpiler.evaluate(expressions, variables, parameters)
+	expr = Vector{Expr}()
+	push!(expr, expressions[1])
+	# @time Transpiler.evaluate(expr, variables, parameters)
 end

 #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
+
+# Minimal reproduction for writing to global memory from hand-written PTX:
+# the kernel stores the constant 10.0 into the first element of the result buffer.
+@testset "TEMP" begin
+	# The module declares `.address_size 64` and the host passes a CuPtr{Float32}, so the
+	# kernel parameter, the registers holding the address and the ld.param/cvta instructions
+	# all have to be 64 bit wide. Loading the pointer as .u32 truncates the address.
+	ptx = "
+	.version 7.1
+	.target sm_61
+	.address_size 64
+
+	.visible .entry ExpressionProcessing(
+	.param .u64 param_1)
+	{
+		.reg .u64   %parameter<1>;
+		.reg .u64   %i<1>;
+
+		ld.param.u64   %i0, [param_1];
+		cvta.to.global.u64   %parameter0, %i0;
+
+		st.global.f32  [%parameter0], 10.0;
+		ret;
+	}"
+
+	linker = CuLink()
+	add_data!(linker, "ExpressionProcessing", ptx)
+	
+	image = complete(linker)
+	
+	mod = CuModule(image)
+	func = CuFunction(mod, "ExpressionProcessing")
+
+	# A single element is enough: the kernel only ever writes to the start of the buffer.
+	cudaResults = CuArray{Float32}(undef, 1)
+
+	# One thread in one block; every thread would write the same value to the same address.
+	cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1)
+
+	# Copying the buffer back to the host and asserting on it makes the testset actually
+	# fail when the store did not reach global memory (previously it was only printed).
+	@test Array(cudaResults) == Float32[10.0]
+end
+
+
+# TODO: University setup at 10.20.1.7