transpiler: invalid memory access error finally fixed

2025-03-27 22:32:24 +01:00
parent 561b37160b
commit 9df78ca72e
7 changed files with 133 additions and 30 deletions
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -53,7 +53,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info)

 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
-	cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
+	cudaResults = CuArray{Float32}(undef, variableCols * length(expressions))
 	# cudaResults = CUDA.zeros(variableCols * length(expressions))
 	# ptr = CuPtr{Float32}(C_NULL)
 	# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
@@ -68,8 +68,9 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		blocks = cld(variableCols, threads)

 		# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+		cudacall(kernels[i], (CuPtr{Float32},), cudaResults; threads=threads, blocks=blocks)
 		# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-		launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
+		# launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
 	end
 	
 	println(Array(cudaResults))
@@ -120,9 +121,9 @@ end
 # TODO: Make version, target and address_size configurable; also see what address_size means exactly
 function get_cuda_header()::String
 	return "
-.version 7.1
+.version 8.5
 .target sm_61
-.address_size 32
+.address_size 64
 "
 end

@@ -137,11 +138,11 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
 	println(signatureBuffer, "(")
 	
 	for i in eachindex(parameters)
-		print(signatureBuffer, "  .param .u32", " ", "param_", i)
+		print(signatureBuffer, "  .param .u64", " ", "param_", i)

 		parametersLocation = get_next_free_register("i")
-		println(paramLoadingBuffer, "ld.param.u32   $parametersLocation, [param_$i];")
-		println(paramLoadingBuffer, "cvta.to.global.u32   $(get_next_free_register("parameter")), $parametersLocation;")
+		println(paramLoadingBuffer, "ld.param.u64   $parametersLocation, [param_$i];")
+		println(paramLoadingBuffer, "cvta.to.global.u64   $(get_next_free_register("parameter")), $parametersLocation;")
 		if i != lastindex(parameters)
 			println(signatureBuffer, ",")
 		end
@@ -169,12 +170,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple
 	println(guardBuffer, "mov.u32    $currentThreadId, %tid.x;")

 	globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
-	# breakCondition = get_next_free_register("p")
+	breakCondition = get_next_free_register("p")
 	println(guardBuffer, "mad.lo.s32     $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
-	# println(guardBuffer, "setp.ge.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
+	println(guardBuffer, "setp.gt.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets

 	# branch to end if breakCondition is true
-	# print(guardBuffer, "@$breakCondition bra    $exitJumpLocation;")
+	print(guardBuffer, "@$breakCondition bra    $exitJumpLocation;")

 	return (String(take!(guardBuffer)), globalThreadId)
 end
@@ -186,7 +187,7 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
 								   parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, 
 								   threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String

-	return "st.global.f32  [$resultsLocation], 10.0;"
+	# return "st.global.f32  [$resultsLocation], 10.0;"

 	codeBuffer = IOBuffer()
 	operands = Vector{Operand}()
@@ -360,9 +361,9 @@ let registers = Dict() # stores the count of the register already used.
 			elseif definition.first == "r"
 				regType = ".b32"
 			elseif definition.first == "parameter"
-				regType = ".u32"
+				regType = ".b64"
 			elseif definition.first == "i"
-				regType = ".u32"
+				regType = ".b64"
 			else
 				throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
 			end