transpiler: trying to fix problem with writing to global memory; not yet fixed
	
		
			
	
		
	
	
		
	
		
			Some checks are pending
		
		
	
	
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
				
			
		
		
	
	
				
					
				
			
		
			Some checks are pending
		
		
	
	CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
				
			This commit is contained in:
		@ -3,23 +3,9 @@ using CUDA
 | 
			
		||||
using ..ExpressionProcessing
 | 
			
		||||
using ..Utils
 | 
			
		||||
 | 
			
		||||
# Number of threads per block/SM + max number of registers
 | 
			
		||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
 | 
			
		||||
# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
 | 
			
		||||
# One SM can have 64*1024 32-bit registers at max
 | 
			
		||||
# One thread can at max use 255 registers
 | 
			
		||||
# Meaning one has access to at most 32 registers in the worst case. Using 64-bit values, this number gets halved (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
 | 
			
		||||
 | 
			
		||||
# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
 | 
			
		||||
 | 
			
		||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
 | 
			
		||||
# This states that using fewer registers allows more threads to reside on a single SM, which improves performance.
 | 
			
		||||
# So I could use more registers at the expense of performance. Depending on how much this would simplify my algorithm, I might do this and leave further optimisation to future work
 | 
			
		||||
 | 
			
		||||
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
 | 
			
		||||
# With 128 registers per thread, one could have 16 warps on one SM ((128 * 32 = 4096 registers per warp) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
const BYTES = sizeof(Float32)
 | 
			
		||||
const Operand = Union{Float32, String} # Operand is either fixed value or register
 | 
			
		||||
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
 | 
			
		||||
 | 
			
		||||
@ -41,6 +27,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 | 
			
		||||
	# 	mod = CuModule(image)
 | 
			
		||||
	# 	kernels[i] = CuFunction(mod, "ExpressionProcessing")
 | 
			
		||||
	# end
 | 
			
		||||
 | 
			
		||||
	for i in eachindex(expressions)
 | 
			
		||||
		if haskey(cache, expressions[i])
 | 
			
		||||
			kernels[i] = cache[expressions[i]]
 | 
			
		||||
@ -50,6 +37,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 | 
			
		||||
		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
 | 
			
		||||
		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i)
 | 
			
		||||
		
 | 
			
		||||
		println(kernel)
 | 
			
		||||
 | 
			
		||||
		linker = CuLink()
 | 
			
		||||
		add_data!(linker, "ExpressionProcessing", kernel)
 | 
			
		||||
		
 | 
			
		||||
@ -65,6 +54,12 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 | 
			
		||||
 | 
			
		||||
	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 | 
			
		||||
	cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
 | 
			
		||||
	# cudaResults = CUDA.zeros(variableCols * length(expressions))
 | 
			
		||||
	# ptr = CuPtr{Float32}(C_NULL)
 | 
			
		||||
	# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
 | 
			
		||||
	# values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
 | 
			
		||||
	# CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10)
 | 
			
		||||
	# copyto!(cudaResults, z)
 | 
			
		||||
 | 
			
		||||
	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 | 
			
		||||
	for i in eachindex(kernels)
 | 
			
		||||
@ -72,9 +67,14 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 | 
			
		||||
		threads = min(variableCols, config.threads)
 | 
			
		||||
		blocks = cld(variableCols, threads)
 | 
			
		||||
 | 
			
		||||
		cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
		break
 | 
			
		||||
		# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
		# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
		launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
	end
 | 
			
		||||
	
 | 
			
		||||
	println(Array(cudaResults))
 | 
			
		||||
	# cudaResults = nothing
 | 
			
		||||
	# CUDA.cuMemFree(ptr)
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 | 
			
		||||
@ -89,16 +89,20 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 | 
			
		||||
	exitJumpLocationMarker = "\$L__BB0_2"
 | 
			
		||||
	ptxBuffer = IOBuffer()
 | 
			
		||||
 | 
			
		||||
	# TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation
 | 
			
		||||
	reset_registers()
 | 
			
		||||
	reset_symtable()
 | 
			
		||||
 | 
			
		||||
	# TODO: Suboptimal solution
 | 
			
		||||
	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params
 | 
			
		||||
	guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
 | 
			
		||||
	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results
 | 
			
		||||
	guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets)
 | 
			
		||||
 | 
			
		||||
	println(ptxBuffer, get_cuda_header())
 | 
			
		||||
	println(ptxBuffer, signature)
 | 
			
		||||
	println(ptxBuffer, "{")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2", 
 | 
			
		||||
	calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0", 
 | 
			
		||||
										  threadIdReg, expressionIndex, nrOfVariableSets)
 | 
			
		||||
	println(ptxBuffer, get_register_definitions())
 | 
			
		||||
	println(ptxBuffer, paramLoading)
 | 
			
		||||
@ -116,7 +120,7 @@ end
 | 
			
		||||
# TODO: Make version, target and address_size configurable; also see what address_size means exactly
 | 
			
		||||
function get_cuda_header()::String
 | 
			
		||||
	return "
 | 
			
		||||
.version 8.0
 | 
			
		||||
.version 7.1
 | 
			
		||||
.target sm_61
 | 
			
		||||
.address_size 32
 | 
			
		||||
"
 | 
			
		||||
@ -135,7 +139,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
 | 
			
		||||
	for i in eachindex(parameters)
 | 
			
		||||
		print(signatureBuffer, "  .param .u32", " ", "param_", i)
 | 
			
		||||
 | 
			
		||||
		parametersLocation = get_next_free_register("r")
 | 
			
		||||
		parametersLocation = get_next_free_register("i")
 | 
			
		||||
		println(paramLoadingBuffer, "ld.param.u32   $parametersLocation, [param_$i];")
 | 
			
		||||
		println(paramLoadingBuffer, "cvta.to.global.u32   $(get_next_free_register("parameter")), $parametersLocation;")
 | 
			
		||||
		if i != lastindex(parameters)
 | 
			
		||||
@ -165,12 +169,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple
 | 
			
		||||
	println(guardBuffer, "mov.u32    $currentThreadId, %tid.x;")
 | 
			
		||||
 | 
			
		||||
	globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
 | 
			
		||||
	breakCondition = get_next_free_register("p")
 | 
			
		||||
	# breakCondition = get_next_free_register("p")
 | 
			
		||||
	println(guardBuffer, "mad.lo.s32     $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
 | 
			
		||||
	println(guardBuffer, "setp.ge.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
 | 
			
		||||
	# println(guardBuffer, "setp.ge.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
 | 
			
		||||
 | 
			
		||||
	# branch to end if breakCondition is true
 | 
			
		||||
	print(guardBuffer, "@$breakCondition bra    $exitJumpLocation;")
 | 
			
		||||
	# print(guardBuffer, "@$breakCondition bra    $exitJumpLocation;")
 | 
			
		||||
 | 
			
		||||
	return (String(take!(guardBuffer)), globalThreadId)
 | 
			
		||||
end
 | 
			
		||||
@ -181,6 +185,9 @@ end
 | 
			
		||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer, 
 | 
			
		||||
								   parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, 
 | 
			
		||||
								   threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
 | 
			
		||||
 | 
			
		||||
	return "st.global.f32  [$resultsLocation], 10.0;"
 | 
			
		||||
 | 
			
		||||
	codeBuffer = IOBuffer()
 | 
			
		||||
	operands = Vector{Operand}()
 | 
			
		||||
 | 
			
		||||
@ -223,15 +230,15 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
 | 
			
		||||
 | 
			
		||||
	# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case)
 | 
			
		||||
	# resultsLocation[resultIndex] = "";
 | 
			
		||||
	tempReg = get_next_free_register("i")
 | 
			
		||||
	println(codeBuffer, "
 | 
			
		||||
	add.u32        $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
 | 
			
		||||
	mul.lo.u32     $tempReg, $tempReg, $(sizeof(expressionIndex));
 | 
			
		||||
	add.u32        $tempReg, $resultsLocation, $tempReg;
 | 
			
		||||
	st.global.f32  [$tempReg], $(pop!(operands));
 | 
			
		||||
	")
 | 
			
		||||
	# tempReg = get_next_free_register("i")
 | 
			
		||||
	# println(codeBuffer, "
 | 
			
		||||
	# add.u32        $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
 | 
			
		||||
	# mul.lo.u32     $tempReg, $tempReg, $BYTES;
 | 
			
		||||
	# add.u32        $tempReg, $resultsLocation, $tempReg;
 | 
			
		||||
	# st.global.f32  [$tempReg], $(pop!(operands));
 | 
			
		||||
	# ")
 | 
			
		||||
	println(codeBuffer, "st.global.f32  [$resultsLocation], 10.0;")
 | 
			
		||||
 | 
			
		||||
	println(operands)
 | 
			
		||||
	return String(take!(codeBuffer))
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
@ -248,10 +255,9 @@ function load_into_register(register::String, loadLocation::String, valueIndex::
 | 
			
		||||
	# loadLocation + startIndex + valueIndex * bytes (4 in our case)
 | 
			
		||||
	# startIndex: setIndex * setSize
 | 
			
		||||
	tempReg = get_next_free_register("i")
 | 
			
		||||
	# we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes
 | 
			
		||||
	return "
 | 
			
		||||
	mul.lo.u32 $tempReg, $setIndexReg, $setSize;
 | 
			
		||||
	add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex));
 | 
			
		||||
	add.u32 $tempReg, $tempReg, $(valueIndex*BYTES);
 | 
			
		||||
	add.u32 $tempReg, $loadLocation, $tempReg;
 | 
			
		||||
	ld.global.f32 $register, [$tempReg];"
 | 
			
		||||
end
 | 
			
		||||
@ -316,6 +322,12 @@ end
 | 
			
		||||
let registers = Dict() # stores the count of the register already used.
 | 
			
		||||
	global get_next_free_register
 | 
			
		||||
	global get_register_definitions
 | 
			
		||||
	global reset_registers
 | 
			
		||||
 | 
			
		||||
	function reset_registers()
 | 
			
		||||
		registers = Dict()
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	# By convention these names correspond to the following types:
 | 
			
		||||
	# - p -> pred
 | 
			
		||||
@ -363,6 +375,11 @@ end
 | 
			
		||||
 | 
			
		||||
let symtable = Dict()
 | 
			
		||||
	global get_register_for_name
 | 
			
		||||
	global reset_symtable
 | 
			
		||||
 | 
			
		||||
	function reset_symtable()
 | 
			
		||||
		symtable = Dict()
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
	"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
 | 
			
		||||
	function get_register_for_name(varName::String)
 | 
			
		||||
 | 
			
		||||
@ -27,18 +27,18 @@ parameters[2][2] = 0.0
 | 
			
		||||
	push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
 | 
			
		||||
 | 
			
		||||
	# generatedCode = Transpiler.transpile(postfixExpr)
 | 
			
		||||
	generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
 | 
			
		||||
	println(generatedCode)
 | 
			
		||||
	# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
 | 
			
		||||
	# println(generatedCode)
 | 
			
		||||
	# CUDA.@sync interpret(postfixExprs, variables, parameters)
 | 
			
		||||
 | 
			
		||||
	# This is just here for testing. This will be called inside the execute method in the Transpiler module
 | 
			
		||||
	linker = CuLink()
 | 
			
		||||
	add_data!(linker, "ExpressionProcessing", generatedCode)
 | 
			
		||||
	# linker = CuLink()
 | 
			
		||||
	# add_data!(linker, "ExpressionProcessing", generatedCode)
 | 
			
		||||
 | 
			
		||||
	image = complete(linker)
 | 
			
		||||
	# image = complete(linker)
 | 
			
		||||
 | 
			
		||||
	mod = CuModule(image)
 | 
			
		||||
	func = CuFunction(mod, "ExpressionProcessing")
 | 
			
		||||
	# mod = CuModule(image)
 | 
			
		||||
	# func = CuFunction(mod, "ExpressionProcessing")
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
@testset "Test transpiler evaluation" begin
 | 
			
		||||
@ -46,7 +46,55 @@ end
 | 
			
		||||
	# push!(postfixExprs, expressions[1])
 | 
			
		||||
	# push!(postfixExprs, expressions[2])
 | 
			
		||||
 | 
			
		||||
	@time Transpiler.evaluate(expressions, variables, parameters)
 | 
			
		||||
	expr = Vector{Expr}()
 | 
			
		||||
	push!(expr, expressions[1])
 | 
			
		||||
	# @time Transpiler.evaluate(expr, variables, parameters)
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
 | 
			
		||||
 | 
			
		||||
@testset "TEMP" begin
 | 
			
		||||
	ptx = "
 | 
			
		||||
	.version 7.1
 | 
			
		||||
	.target sm_61
 | 
			
		||||
	.address_size 64
 | 
			
		||||
 | 
			
		||||
	.visible .entry ExpressionProcessing(
 | 
			
		||||
	.param .u32 param_1)
 | 
			
		||||
	{
 | 
			
		||||
		.reg .u32   %parameter<1>;
 | 
			
		||||
		.reg .u32   %i<1>;
 | 
			
		||||
 | 
			
		||||
		ld.param.u32   %i0, [param_1];
 | 
			
		||||
		cvta.to.global.u32   %parameter0, %i0;
 | 
			
		||||
 | 
			
		||||
		st.global.f32  [%parameter0], 10.0;
 | 
			
		||||
		ret;
 | 
			
		||||
	}"
 | 
			
		||||
 | 
			
		||||
	linker = CuLink()
 | 
			
		||||
	add_data!(linker, "ExpressionProcessing", ptx)
 | 
			
		||||
	
 | 
			
		||||
	image = complete(linker)
 | 
			
		||||
	
 | 
			
		||||
	mod = CuModule(image)
 | 
			
		||||
	func = CuFunction(mod, "ExpressionProcessing")
 | 
			
		||||
 | 
			
		||||
	variableCols = 2
 | 
			
		||||
	cudaResults = CuArray{Float32}(undef, 1)
 | 
			
		||||
	# cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32))
 | 
			
		||||
	# cudaResults = CUDA.fill(0f0, variableCols * length(expressions))
 | 
			
		||||
	# cudaResults = cu(zeros(Float32, variableCols * length(expressions)))
 | 
			
		||||
 | 
			
		||||
	config = launch_configuration(func)
 | 
			
		||||
	threads = min(variableCols, config.threads)
 | 
			
		||||
	blocks = cld(variableCols, threads)
 | 
			
		||||
 | 
			
		||||
	cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1)
 | 
			
		||||
	# launch(func, cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
 | 
			
		||||
	println(Array(cudaResults))
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# TODO: University setup at 10.20.1.7
 | 
			
		||||
		Reference in New Issue
	
	Block a user