code: started finalising transpilation process and preparing for performance testing and tuning

2025-03-23 13:38:22 +01:00
parent db02e9f90f
commit baa37ea183
11 changed files with 149 additions and 60 deletions
--- a/package/Project.toml
+++ b/package/Project.toml
@ -1,6 +1,6 @@
 name = "ExpressionExecutorCuda"
 uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe"
-authors = ["Daniel Wiplinger"]
+authors = ["Daniel Roth"]
 version = "1.0.0-DEV"

 [deps]
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@ -1,4 +1,5 @@
 module ExpressionExecutorCuda
+include("Utils.jl")
 include("ExpressionProcessing.jl")
 include("Interpreter.jl")

@ -13,18 +14,26 @@ export test

 # Some assertions:
 # Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
+# Matrix X is column major
 # each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
 #     This assertion is made, because in julia, the first index doesn't have to be 1
 #

 # Evaluate Expressions on the GPU
 function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
-	exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
+	# exprs and p have to use the same indices, as every expression needs its own parameter vector
+	@assert axes(exprs) == axes(p)
+	ncols = size(X, 2)
+
+	# ncols x length(exprs) matrix. The interpreter is not called yet, so this uninitialised matrix is what gets returned
+	result = Matrix{Float32}(undef, ncols, length(exprs))
+	# interpret
 end

 # Convert Expressions to PTX Code and execute that instead
 function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
-	# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
+	# exprs and p have to use the same indices, as every expression needs its own parameter vector
+	@assert axes(exprs) == axes(p)
+	ncols = size(X, 2)
+
+	# ncols x length(exprs) matrix. The transpiler is not called yet, so this uninitialised matrix is what gets returned
+	result = Matrix{Float32}(undef, ncols, length(exprs))
 end


--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -2,6 +2,7 @@ module Interpreter
 using CUDA
 using StaticArrays
 using ..ExpressionProcessing
+using ..Utils

 export interpret

@ -14,10 +15,10 @@ export interpret
 function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
 	variableCols = size(variables, 2) # number of variable sets to use for each expression
 	cudaVars = CuArray(variables)
-	cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
-	cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
+	cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
+	cudaExprs = Utils.create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
 	# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
-	cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
+	cudaStepsize = CuArray([Utils.get_max_inner_length(expressions), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression

 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
@ -108,44 +109,4 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
 	return
 end

-
-"Retrieves the number of entries for the largest inner vector"
-function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
-	maxLength = 0
-	@inbounds for i in eachindex(vec)
-		if length(vec[i]) > maxLength
-			maxLength = length(vec[i])
-		end
-	end
-
-	return maxLength
-end
-
-"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
-function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
-	dataCols = get_max_inner_length(data)
-	dataRows = length(data)
-	dataMat = convert_to_matrix(data, invalidElement)
-	cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions
-	copyto!(cudaArr, dataMat)
-
-	return cudaArr
-end
-
-"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.
-
-All entries that cannot be filled have ```invalidElement``` as their value
-"
-function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
-	vecCols = get_max_inner_length(vec)
-	vecRows = length(vec)
-	vecMat = fill(invalidElement, vecCols, vecRows)
-	
-	for i in eachindex(vec)
-		vecMat[:,i] = copyto!(vecMat[:,i], vec[i])
-	end
-
-	return vecMat
-end
-
 end
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -1,6 +1,7 @@
 module Transpiler
 using CUDA
 using ..ExpressionProcessing
+using ..Utils

 # Number of threads per block/SM + max number of registers
 # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
@ -25,16 +26,57 @@ using ..ExpressionProcessing

 const Operand = Union{Float32, String} # Operand is either fixed value or register

-function evaluate(expression::ExpressionProcessing.PostfixType, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
-	# TODO: think of how to do this. Probably get all expressions. Transpile them in parallel and then execute the generatd code.
-	cudaVars = CuArray(variables)
+"
+Transpiles every expression to PTX, links each result into its own CUDA module and launches the resulting kernels one after another.
+
+- param ```expressions```: The expressions in postfix notation. One kernel is generated per expression
+- param ```variables```: Every column is one variable set, so the number of rows is the size of a variable set
+- param ```parameters```: One parameter vector per expression. They are padded with ```NaN32``` to the length of the longest one before being sent to the GPU
+
+The results stay in a GPU array for now; nothing is copied back or returned yet.
+"
+function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
+	varRows = size(variables, 1)
+	variableCols = size(variables, 2) # defined up front, as the result array below needs it for its size
+	maxParamSetSize = Utils.get_max_inner_length(parameters) # identical for every expression, so only determined once
+	kernels = Vector{CuFunction}(undef, length(expressions))
+	
+	# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
+	# Threads.@threads for i in eachindex(expressions)
+	# 	kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters))

-	#kernel = transpile(expression, )
-	# execute kernel.
+	# 	linker = CuLink()
+	# 	add_data!(linker, "ExpressionProcessing", kernel)
+
+	# 	image = complete(linker)
+	
+	# 	mod = CuModule(image)
+	# 	kernels[i] = CuFunction(mod, "ExpressionProcessing")
+	# end
+	for i in eachindex(expressions)
+		kernel = transpile(expressions[i], varRows, maxParamSetSize)
+
+		linker = CuLink()
+		add_data!(linker, "ExpressionProcessing", kernel)
+
+		image = complete(linker)
+	
+		mod = CuModule(image)
+		kernels[i] = CuFunction(mod, "ExpressionProcessing")
+	end
+
+	cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info)
+	cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info)
+
+	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
+	cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
+
+	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
+	for i in eachindex(kernels)
+		config = launch_configuration(kernels[i])
+		threads = min(variableCols, config.threads)
+		blocks = cld(variableCols, threads)
+
+		cudacall(kernels[i], Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+	end
 end

 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 # seekstart(buf1); write(buf2, buf1)
+"
+- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
+- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent on the longest parameter set
+"
 function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String
 	exitJumpLocationMarker = "\$L__BB0_2"
 	ptxBuffer = IOBuffer()
@ -59,7 +101,6 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	println(ptxBuffer, "}")

 	generatedCode = String(take!(ptxBuffer))
-	println(generatedCode)
 	return generatedCode
 end

@ -124,6 +165,9 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)
 	return (String(take!(guardBuffer)), globalThreadId)
 end

+"
+- param ```parametersSetSize```: Size of the largest parameter set
+"
 function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer, 
 								   parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String
 	codeBuffer = IOBuffer()
@ -174,7 +218,7 @@ end
 - param ```loadLocation```: The location from where to load the value
 - param ```valueIndex```: 0-based index of the value in the variable set/parameter set
 - param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```
+- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
 "
 function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String
 	# loadLocation + startIndex + valueIndex * bytes (4 in our case)
--- a/package/src/Utils.jl
+++ b/package/src/Utils.jl
@ -0,0 +1,42 @@
+module Utils
+using CUDA # needed for CuArray; a submodule does not inherit the imports of its parent module
+
+"Converts a vector of vectors into a matrix in which column ```i``` holds the entries of ```vec[i]```. The inner vectors do not need to have the same length.
+
+All entries that cannot be filled have ```invalidElement``` as their value
+"
+function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
+	matRows = get_max_inner_length(vec) # one row per entry of the longest inner vector
+	matCols = length(vec) # one column per inner vector
+	vecMat = fill(invalidElement, matRows, matCols)
+
+	for i in eachindex(vec)
+		# write through a view, as slicing with vecMat[:,i] would allocate a temporary copy of the column
+		copyto!(view(vecMat, :, i), vec[i])
+	end
+
+	return vecMat
+end
+
+"Retrieves the number of entries for the largest inner vector. Returns 0 if ```vec``` is empty"
+function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
+	return maximum(length, vec; init=0)
+end
+
+"Returns a CuArray filled with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
+function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
+	# column i of the resulting array corresponds to data[i]
+	return CuArray(convert_to_matrix(data, invalidElement))
+end
+
+end
--- a/package/test/InterpreterTests.jl
+++ b/package/test/InterpreterTests.jl
@ -1,6 +1,7 @@
 using CUDA
 using .ExpressionProcessing
 using .Interpreter
+using .Utils

 expressions = Vector{Expr}(undef, 2)
 variables = Matrix{Float32}(undef, 2,2)
@ -35,7 +36,7 @@ end
 	reference[2,2] = 0.0
 	# reference = Matrix([5.0, NaN],
 	# 				   [5.0, 0.0])
-	result = Interpreter.convert_to_matrix(parameters, NaN32)
+	result = Utils.convert_to_matrix(parameters, NaN32)

 	@test isequal(result, reference)
 end
--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@ -28,6 +28,7 @@ parameters[2][2] = 0.0

 	# generatedCode = Transpiler.transpile(postfixExpr)
 	generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP
+	# println(generatedCode)
 	# CUDA.@sync interpret(postfixExprs, variables, parameters)

 	# This is just here for testing. This will be called inside the execute method in the Transpiler module
@ -40,4 +41,12 @@ parameters[2][2] = 0.0
 	func = CuFunction(mod, "ExpressionProcessing")
 end

+# Smoke test: only checks that evaluate runs through for two expressions and prints its timing. There is no @test, so the results are not verified
+@testset "Test transpiler evaluation" begin
+	postfixExprs = Vector{ExpressionProcessing.PostfixType}()
+	push!(postfixExprs, expr_to_postfix(expressions[1]))
+	push!(postfixExprs, expr_to_postfix(expressions[2]))
+
+	@time Transpiler.evaluate(postfixExprs, variables, parameters)
+end
+
 #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
--- a/package/test/runtests.jl
+++ b/package/test/runtests.jl
@ -2,17 +2,33 @@ using ExpressionExecutorCuda
 using Test

 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
+include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
 include(joinpath(baseFolder, "src", "Interpreter.jl"))
 include(joinpath(baseFolder, "src", "Transpiler.jl"))

@testset "ExpressionExecutorCuda.jl" begin
-	include("ExpressionProcessingTests.jl")
-	include("InterpreterTests.jl")
+	# include("ExpressionProcessingTests.jl")
+	# include("InterpreterTests.jl")
 	include("TranspilerTests.jl")
 end


-@testset "CPU Interpreter" begin
-	include("CpuInterpreterTests.jl")
+#@testset "CPU Interpreter" begin
+#	include("CpuInterpreterTests.jl")
+#end
+
+@testset "Performance tests" begin
+	# TODO: make performance tests
+
+	# Put data in shared memory: 
+	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
+
+	# Make array const:
+	# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
+
+	# Memory management like in C++ might help with performance improvements
+	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
+
+	
 end