transpiler: results are now written in results array; preparation for performance testing
	
		
			
	
		
	
	
		
	
		
			Some checks are pending
		
		
	
	
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
				
			
		
		
	
	
				
					
				
			
		
			Some checks are pending
		
		
	
	CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
				
			This commit is contained in:
		@ -19,19 +19,18 @@ using ..Utils
 | 
			
		||||
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
 | 
			
		||||
# With 128 registers per thread, one could have 16 warps on one SM ((128 * 32 = 4096) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
 | 
			
		||||
# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
const Operand = Union{Float32, String} # Operand is either fixed value or register
 | 
			
		||||
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
 | 
			
		||||
 | 
			
		||||
function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
 | 
			
		||||
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
 | 
			
		||||
	varRows = size(variables, 1)
 | 
			
		||||
	variableCols = size(variables, 2)
 | 
			
		||||
	kernels = Vector{CuFunction}(undef, length(expressions))
 | 
			
		||||
	
 | 
			
		||||
	# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
 | 
			
		||||
	# Threads.@threads for i in eachindex(expressions)
 | 
			
		||||
	#   TODO: Use cache
 | 
			
		||||
	# 	kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters))
 | 
			
		||||
 | 
			
		||||
	# 	linker = CuLink()
 | 
			
		||||
@ -43,15 +42,22 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variabl
 | 
			
		||||
	# 	kernels[i] = CuFunction(mod, "ExpressionProcessing")
 | 
			
		||||
	# end
 | 
			
		||||
	for i in eachindex(expressions)
 | 
			
		||||
		kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters))
 | 
			
		||||
		if haskey(cache, expressions[i])
 | 
			
		||||
			kernels[i] = cache[expressions[i]]
 | 
			
		||||
			continue
 | 
			
		||||
		end
 | 
			
		||||
 | 
			
		||||
		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
 | 
			
		||||
		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i)
 | 
			
		||||
		
 | 
			
		||||
		linker = CuLink()
 | 
			
		||||
		add_data!(linker, "ExpressionProcessing", kernel)
 | 
			
		||||
 | 
			
		||||
		
 | 
			
		||||
		image = complete(linker)
 | 
			
		||||
	
 | 
			
		||||
		
 | 
			
		||||
		mod = CuModule(image)
 | 
			
		||||
		kernels[i] = CuFunction(mod, "ExpressionProcessing")
 | 
			
		||||
		cache[expressions[i]] = kernels[i]
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
	cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info)
 | 
			
		||||
@ -61,13 +67,13 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variabl
 | 
			
		||||
	cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
 | 
			
		||||
 | 
			
		||||
	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 | 
			
		||||
	variableCols = size(variables, 2)
 | 
			
		||||
	for i in eachindex(kernels)
 | 
			
		||||
		config = launch_configuration(kernels[i])
 | 
			
		||||
		threads = min(variableCols, config.threads)
 | 
			
		||||
		blocks = cld(variableCols, threads)
 | 
			
		||||
 | 
			
		||||
		cudacall(kernels[i], Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
		cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 | 
			
		||||
		break
 | 
			
		||||
	end
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
@ -76,21 +82,24 @@ end
 | 
			
		||||
"
 | 
			
		||||
- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
 | 
			
		||||
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the number of rows is dependent on the longest parameter set
 | 
			
		||||
- param ```expressionIndex```: The 1-based index of the expression
 | 
			
		||||
"
 | 
			
		||||
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String
 | 
			
		||||
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer, 
 | 
			
		||||
				   nrOfVariableSets::Integer, expressionIndex::Integer)::String
 | 
			
		||||
	exitJumpLocationMarker = "\$L__BB0_2"
 | 
			
		||||
	ptxBuffer = IOBuffer()
 | 
			
		||||
 | 
			
		||||
	# TODO: Suboptimal solution
 | 
			
		||||
	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Int32, Float32, Float32]) # nrOfVarSets, Vars, Params
 | 
			
		||||
	guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, "%parameter0") # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
 | 
			
		||||
	signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params
 | 
			
		||||
	guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
 | 
			
		||||
 | 
			
		||||
	println(ptxBuffer, get_cuda_header())
 | 
			
		||||
	println(ptxBuffer, signature)
 | 
			
		||||
	println(ptxBuffer, "{")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	calc_code = generate_calculation_code(expression, "%parameter1", varSetSize, "%parameter2", paramSetSize, threadIdReg)
 | 
			
		||||
	calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2", 
 | 
			
		||||
										  threadIdReg, expressionIndex, nrOfVariableSets)
 | 
			
		||||
	println(ptxBuffer, get_register_definitions())
 | 
			
		||||
	println(ptxBuffer, paramLoading)
 | 
			
		||||
	println(ptxBuffer, guardClause)
 | 
			
		||||
@ -107,12 +116,15 @@ end
 | 
			
		||||
# TODO: Make version, target and address_size configurable; also see what address_size means exactly
 | 
			
		||||
function get_cuda_header()::String
 | 
			
		||||
	return "
 | 
			
		||||
.version 7.1
 | 
			
		||||
.version 8.0
 | 
			
		||||
.target sm_61
 | 
			
		||||
.address_size 32
 | 
			
		||||
"
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
"
 | 
			
		||||
param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result
 | 
			
		||||
"
 | 
			
		||||
function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::Tuple{String, String}
 | 
			
		||||
	signatureBuffer = IOBuffer()
 | 
			
		||||
	paramLoadingBuffer = IOBuffer()
 | 
			
		||||
@ -123,9 +135,9 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
 | 
			
		||||
	for i in eachindex(parameters)
 | 
			
		||||
		print(signatureBuffer, "  .param .u32", " ", "param_", i)
 | 
			
		||||
 | 
			
		||||
		parametersReg = get_next_free_register("r")
 | 
			
		||||
		println(paramLoadingBuffer, "ld.param.u32   $parametersReg, [param_$i];")
 | 
			
		||||
		println(paramLoadingBuffer, "cvta.to.global.u32   $(get_next_free_register("parameter")), $parametersReg;")
 | 
			
		||||
		parametersLocation = get_next_free_register("r")
 | 
			
		||||
		println(paramLoadingBuffer, "ld.param.u32   $parametersLocation, [param_$i];")
 | 
			
		||||
		println(paramLoadingBuffer, "cvta.to.global.u32   $(get_next_free_register("parameter")), $parametersLocation;")
 | 
			
		||||
		if i != lastindex(parameters)
 | 
			
		||||
			println(signatureBuffer, ",")
 | 
			
		||||
		end
 | 
			
		||||
@ -140,7 +152,7 @@ Constructs the PTX code used for handling the case where too many threads are st
 | 
			
		||||
 | 
			
		||||
- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
 | 
			
		||||
"
 | 
			
		||||
function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)::Tuple{String, String}
 | 
			
		||||
function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple{String, String}
 | 
			
		||||
	guardBuffer = IOBuffer()
 | 
			
		||||
 | 
			
		||||
	threadIds = get_next_free_register("r")
 | 
			
		||||
@ -154,8 +166,6 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)
 | 
			
		||||
 | 
			
		||||
	globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
 | 
			
		||||
	breakCondition = get_next_free_register("p")
 | 
			
		||||
	nrOfVarSets = get_next_free_register("i")
 | 
			
		||||
	println(guardBuffer, "ld.global.u32  $nrOfVarSets, [$nrOfVarSetsRegister];")
 | 
			
		||||
	println(guardBuffer, "mad.lo.s32     $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
 | 
			
		||||
	println(guardBuffer, "setp.ge.s32    $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
 | 
			
		||||
 | 
			
		||||
@ -168,8 +178,9 @@ end
 | 
			
		||||
"
 | 
			
		||||
- param ```parametersSetSize```: Size of the largest parameter set
 | 
			
		||||
"
 | 
			
		||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer, 
 | 
			
		||||
								   parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String
 | 
			
		||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer, 
 | 
			
		||||
								   parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, 
 | 
			
		||||
								   threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
 | 
			
		||||
	codeBuffer = IOBuffer()
 | 
			
		||||
	operands = Vector{Operand}()
 | 
			
		||||
 | 
			
		||||
@ -196,24 +207,37 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
 | 
			
		||||
			if token.Value > 0 # varaibles
 | 
			
		||||
				var, first_access = get_register_for_name("x$(token.Value)")
 | 
			
		||||
				if first_access
 | 
			
		||||
					println(codeBuffer, load_into_register(var, variablesReg, token.Value, threadIdReg, variablesSetSize))
 | 
			
		||||
					println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadIdReg, variablesSetSize))
 | 
			
		||||
				end
 | 
			
		||||
				push!(operands, var)
 | 
			
		||||
			else
 | 
			
		||||
				absVal = abs(token.Value)
 | 
			
		||||
				param, first_access = get_register_for_name("p$absVal")
 | 
			
		||||
				if first_access
 | 
			
		||||
					println(codeBuffer, load_into_register(param, parametersReg, absVal, threadIdReg, parametersSetSize))
 | 
			
		||||
					println(codeBuffer, load_into_register(param, parametersLocation, absVal, threadIdReg, parametersSetSize))
 | 
			
		||||
				end
 | 
			
		||||
				push!(operands, param)
 | 
			
		||||
			end
 | 
			
		||||
		end
 | 
			
		||||
	end
 | 
			
		||||
 | 
			
		||||
	# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case)
 | 
			
		||||
	# resultsLocation[resultIndex] = "";
 | 
			
		||||
	tempReg = get_next_free_register("i")
 | 
			
		||||
	println(codeBuffer, "
 | 
			
		||||
	add.u32        $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
 | 
			
		||||
	mul.lo.u32     $tempReg, $tempReg, $(sizeof(expressionIndex));
 | 
			
		||||
	add.u32        $tempReg, $resultsLocation, $tempReg;
 | 
			
		||||
	st.global.f32  [$tempReg], $(pop!(operands));
 | 
			
		||||
	")
 | 
			
		||||
 | 
			
		||||
	println(operands)
 | 
			
		||||
	return String(take!(codeBuffer))
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
"
 | 
			
		||||
Loads a value from a location into the given register. It is assumed that the location refers to a column-major matrix
 | 
			
		||||
 | 
			
		||||
- param ```register```: The register where the loaded value will be stored
 | 
			
		||||
- param ```loadLocation```: The location from where to load the value
 | 
			
		||||
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set
 | 
			
		||||
@ -263,6 +287,7 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
 | 
			
		||||
	elseif operator == POWER
 | 
			
		||||
		# x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
 | 
			
		||||
		resultCode = "
 | 
			
		||||
		// x^y:
 | 
			
		||||
		lg2.approx.f32   $resultRegister, $left;
 | 
			
		||||
		mul.f32          $resultRegister, $right, $resultRegister;
 | 
			
		||||
		ex2.approx.f32   $resultRegister, $resultRegister;"
 | 
			
		||||
@ -271,11 +296,13 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
 | 
			
		||||
	elseif operator == LOG
 | 
			
		||||
		# log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
 | 
			
		||||
		resultCode = "
 | 
			
		||||
		// log(x):
 | 
			
		||||
		lg2.approx.f32   $resultRegister, $left;
 | 
			
		||||
		mul.f32          $resultRegister, $resultRegister, 0.693147182;"
 | 
			
		||||
	elseif operator == EXP
 | 
			
		||||
		# e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
 | 
			
		||||
		resultCode = "
 | 
			
		||||
		// e^x:
 | 
			
		||||
		mul.f32          $resultRegister, $left, 1.44269502; 
 | 
			
		||||
		ex2.approx.f32   $resultRegister, $resultRegister;"
 | 
			
		||||
	elseif operator == SQRT
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,7 @@
 | 
			
		||||
module Utils
 | 
			
		||||
 | 
			
		||||
using CUDA
 | 
			
		||||
 | 
			
		||||
"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.
 | 
			
		||||
 | 
			
		||||
All entries that cannot be filled have ```invalidElement``` as their value
 | 
			
		||||
 | 
			
		||||
@ -24,11 +24,11 @@ parameters[2][2] = 0.0
 | 
			
		||||
	postfixExpr = expr_to_postfix(expressions[1])
 | 
			
		||||
	postfixExprs = Vector([postfixExpr])
 | 
			
		||||
	push!(postfixExprs, expr_to_postfix(expressions[2]))
 | 
			
		||||
	push!(postfixExprs, expr_to_postfix(:(5^3 + x1)))
 | 
			
		||||
	push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
 | 
			
		||||
 | 
			
		||||
	# generatedCode = Transpiler.transpile(postfixExpr)
 | 
			
		||||
	generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP
 | 
			
		||||
	# println(generatedCode)
 | 
			
		||||
	generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
 | 
			
		||||
	println(generatedCode)
 | 
			
		||||
	# CUDA.@sync interpret(postfixExprs, variables, parameters)
 | 
			
		||||
 | 
			
		||||
	# This is just here for testing. This will be called inside the execute method in the Transpiler module
 | 
			
		||||
@ -42,11 +42,11 @@ parameters[2][2] = 0.0
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
@testset "Test transpiler evaluation" begin
 | 
			
		||||
	postfixExprs = Vector{ExpressionProcessing.PostfixType}()
 | 
			
		||||
	push!(postfixExprs, expr_to_postfix(expressions[1]))
 | 
			
		||||
	push!(postfixExprs, expr_to_postfix(expressions[2]))
 | 
			
		||||
	# postfixExprs = Vector{Expr}()
 | 
			
		||||
	# push!(postfixExprs, expressions[1])
 | 
			
		||||
	# push!(postfixExprs, expressions[2])
 | 
			
		||||
 | 
			
		||||
	@time Transpiler.evaluate(postfixExprs, variables, parameters)
 | 
			
		||||
	@time Transpiler.evaluate(expressions, variables, parameters)
 | 
			
		||||
end
 | 
			
		||||
 | 
			
		||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
 | 
			
		||||
 | 
			
		||||
		Reference in New Issue
	
	Block a user