code: started finalising transpilation process and preparing for performance testing and tuning
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled

This commit is contained in:
Daniel 2025-03-23 13:38:22 +01:00
parent db02e9f90f
commit baa37ea183
11 changed files with 149 additions and 60 deletions

View File

@ -1,6 +1,6 @@
name = "ExpressionExecutorCuda"
uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe"
authors = ["Daniel Wiplinger"]
authors = ["Daniel Roth"]
version = "1.0.0-DEV"
[deps]

View File

@ -1,4 +1,5 @@
module ExpressionExecutorCuda
include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")
@ -13,18 +14,26 @@ export test
# Some assertions:
# Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
# Matrix X is column major
# each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
# This assertion is made, because in julia, the first index doesn't have to be 1
#
"""
    interpret_gpu(exprs, X, p)

Entry point for evaluating expressions with the GPU interpreter.

`exprs[i]` and `p[i]` belong together, which is why both must share the same axes.
The interpretation step itself is not implemented in this body yet: only the first
expression is converted to postfix form (the converted form is not used afterwards)
and an uninitialised result matrix is allocated and returned.

# Arguments
- `exprs`: the expressions to evaluate.
- `X`: variable matrix; the number of columns determines the number of result rows.
- `p`: one parameter vector per expression.

# Returns
An uninitialised `Matrix{Float32}` of size `(size(X, 2), length(exprs))`.
"""
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    # Conversion happens before the axes check, exactly as in the original statement order.
    ExpressionProcessing.expr_to_postfix(exprs[1])
    @assert axes(exprs) == axes(p)

    nrOfVarSets = size(X, 2)
    nrOfExprs = length(exprs)
    # interpret
    return Matrix{Float32}(undef, nrOfVarSets, nrOfExprs)
end
"""
    evaluate_gpu(exprs, X, p)

Entry point for converting expressions to PTX code and executing that instead of
interpreting them.

The transpilation and execution steps are not part of this body yet: after checking
that `exprs` and `p` share the same axes, an uninitialised result matrix is allocated
and returned.

# Arguments
- `exprs`: the expressions to evaluate.
- `X`: variable matrix; the number of columns determines the number of result rows.
- `p`: one parameter vector per expression.

# Returns
An uninitialised `Matrix{Float32}` of size `(size(X, 2), length(exprs))`.
"""
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
    @assert axes(exprs) == axes(p)

    nrOfVarSets = size(X, 2)
    nrOfExprs = length(exprs)
    return Matrix{Float32}(undef, nrOfVarSets, nrOfExprs)
end

View File

@ -2,6 +2,7 @@ module Interpreter
using CUDA
using StaticArrays
using ..ExpressionProcessing
using ..Utils
export interpret
@ -14,10 +15,10 @@ export interpret
function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
variableCols = size(variables, 2) # number of variable sets to use for each expression
cudaVars = CuArray(variables)
cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
cudaExprs = Utils.create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
cudaStepsize = CuArray([Utils.get_max_inner_length(expressions), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
@ -108,44 +109,4 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
return
end
"""
    get_max_inner_length(vec)

Retrieve the number of entries of the largest inner vector.

Returns `0` when `vec` is empty or when every inner vector is empty.
"""
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
    # `init=0` keeps the reduction well-defined for an empty outer vector.
    return maximum(length, vec; init=0)
end
"""
    create_cuda_array(data, invalidElement)

Return a `CuArray` filled with the data provided. The inner vectors do not have to
have the same length; all missing elements will be the value `invalidElement`.

The data is first padded into a host matrix by `convert_to_matrix` and then copied
into a device array with one row per entry of the longest inner vector and one column
per inner vector.
"""
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    hostMatrix = convert_to_matrix(data, invalidElement)

    longestInner = get_max_inner_length(data)
    nrOfInnerVectors = length(data) # length(parameters) == number of expressions
    deviceArray = CuArray{T}(undef, longestInner, nrOfInnerVectors)

    copyto!(deviceArray, hostMatrix)
    return deviceArray
end
"""
    convert_to_matrix(vec, invalidElement)

Convert a vector of vectors into a matrix. The inner vectors do not need to have the
same length.

Inner vector `i` is stored in column `i`; the number of rows equals the length of the
longest inner vector. All entries that cannot be filled have `invalidElement` as
their value. An empty `vec` yields a `0x0` matrix.
"""
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    # Length of the longest inner vector; 0 for an empty outer vector.
    nrOfRows = maximum(length, vec; init=0)
    matrix = fill(invalidElement, nrOfRows, length(vec))

    for (col, inner) in enumerate(vec)
        # Write through a view so no temporary copy of the column is allocated.
        # Only the first `length(inner)` entries are overwritten; the rest stay padded.
        copyto!(view(matrix, :, col), inner)
    end

    return matrix
end
end

View File

@ -1,6 +1,7 @@
module Transpiler
using CUDA
using ..ExpressionProcessing
using ..Utils
# Number of threads per block/SM + max number of registers
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
@ -25,16 +26,57 @@ using ..ExpressionProcessing
const Operand = Union{Float32, String} # Operand is either fixed value or register
"""
    evaluate(expressions, variables, parameters)

Transpile every expression to PTX code, link each result into its own CUDA module and
launch the resulting kernels.

# Arguments
- `expressions`: the expressions in postfix form; one kernel is generated per entry.
- `variables`: variable matrix. The number of rows is passed to `transpile` as the
  size of one variable set, the number of columns is the number of variable sets.
- `parameters`: one parameter vector per expression. The length of the longest one is
  passed to `transpile` as the parameter set size.

Each kernel is launched with the device pointers of the variables, the parameters and
the result array, using at most the thread count suggested by `launch_configuration`.

NOTE(review): the results stay in the device array allocated here and are not handed
back to the caller; this function returns `nothing`, as before.
"""
function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
    varRows = size(variables, 1)
    # Must be known before the result array is allocated further down.
    variableCols = size(variables, 2)
    # Identical for every expression, therefore computed once instead of per iteration.
    paramSetSize = Utils.get_max_inner_length(parameters)

    kernels = Vector{CuFunction}(undef, length(expressions))

    # Test a `Threads.@threads` version of this loop again when doing performance tests
    # (same loop body). With the simple "functionality" tests the parallel version took
    # 0.03 seconds while sequential took "0.00009" seconds
    for i in eachindex(expressions)
        kernel = transpile(expressions[i], varRows, paramSetSize)

        linker = CuLink()
        add_data!(linker, "ExpressionProcessing", kernel)

        image = complete(linker)

        mod = CuModule(image)
        kernels[i] = CuFunction(mod, "ExpressionProcessing")
    end

    cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info)
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info)

    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))

    # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
    for i in eachindex(kernels)
        config = launch_configuration(kernels[i])
        threads = min(variableCols, config.threads)
        blocks = cld(variableCols, threads)

        cudacall(kernels[i], Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
    end

    return nothing
end
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
# seekstart(buf1); write(buf2, buf1)
"
- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent on the longest parameter set
"
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String
exitJumpLocationMarker = "\$L__BB0_2"
ptxBuffer = IOBuffer()
@ -59,7 +101,6 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
println(ptxBuffer, "}")
generatedCode = String(take!(ptxBuffer))
println(generatedCode)
return generatedCode
end
@ -124,6 +165,9 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)
return (String(take!(guardBuffer)), globalThreadId)
end
"
- param ```parametersSetSize```: Size of the largest parameter set
"
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer,
parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String
codeBuffer = IOBuffer()
@ -174,7 +218,7 @@ end
- param ```loadLocation```: The location from where to load the value
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set
- param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
"
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String
# loadLocation + startIndex + valueIndex * bytes (4 in our case)

42
package/src/Utils.jl Normal file
View File

@ -0,0 +1,42 @@
module Utils
# `create_cuda_array` constructs a `CuArray`, so CUDA has to be loaded in this module.
using CUDA

"""
    convert_to_matrix(vec, invalidElement)

Convert a vector of vectors into a matrix. The inner vectors do not need to have the
same length.

Inner vector `i` is stored in column `i`; the number of rows equals the length of the
longest inner vector. All entries that cannot be filled have `invalidElement` as
their value. An empty `vec` yields a `0x0` matrix.
"""
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    nrOfRows = get_max_inner_length(vec)
    matrix = fill(invalidElement, nrOfRows, length(vec))

    for (col, inner) in enumerate(vec)
        # Write through a view so no temporary copy of the column is allocated.
        # Only the first `length(inner)` entries are overwritten; the rest stay padded.
        copyto!(view(matrix, :, col), inner)
    end

    return matrix
end

"""
    get_max_inner_length(vec)

Retrieve the number of entries of the largest inner vector.

Returns `0` when `vec` is empty or when every inner vector is empty.
"""
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
    # `init=0` keeps the reduction well-defined for an empty outer vector.
    return maximum(length, vec; init=0)
end

"""
    create_cuda_array(data, invalidElement)

Return a `CuArray` filled with the data provided. The inner vectors do not have to
have the same length; all missing elements will be the value `invalidElement`.

The data is first padded into a host matrix by `convert_to_matrix` and then copied
into a device array with one row per entry of the longest inner vector and one column
per inner vector.
"""
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    hostMatrix = convert_to_matrix(data, invalidElement)

    # length(parameters) == number of expressions
    deviceArray = CuArray{T}(undef, size(hostMatrix, 1), size(hostMatrix, 2))
    copyto!(deviceArray, hostMatrix)

    return deviceArray
end
end

View File

@ -1,6 +1,7 @@
using CUDA
using .ExpressionProcessing
using .Interpreter
using .Utils
expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
@ -35,7 +36,7 @@ end
reference[2,2] = 0.0
# reference = Matrix([5.0, NaN],
# [5.0, 0.0])
result = Interpreter.convert_to_matrix(parameters, NaN32)
result = Utils.convert_to_matrix(parameters, NaN32)
@test isequal(result, reference)
end

View File

@ -28,6 +28,7 @@ parameters[2][2] = 0.0
# generatedCode = Transpiler.transpile(postfixExpr)
generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP
# println(generatedCode)
# CUDA.@sync interpret(postfixExprs, variables, parameters)
# This is just here for testing. This will be called inside the execute method in the Transpiler module
@ -40,4 +41,12 @@ parameters[2][2] = 0.0
func = CuFunction(mod, "ExpressionProcessing")
end
@testset "Test transpiler evaluation" begin
postfixExprs = Vector{ExpressionProcessing.PostfixType}()
push!(postfixExprs, expr_to_postfix(expressions[1]))
push!(postfixExprs, expr_to_postfix(expressions[2]))
@time Transpiler.evaluate(postfixExprs, variables, parameters)
end
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"

View File

@ -2,17 +2,33 @@ using ExpressionExecutorCuda
using Test
const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
include(joinpath(baseFolder, "src", "Utils.jl"))
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
include(joinpath(baseFolder, "src", "Interpreter.jl"))
include(joinpath(baseFolder, "src", "Transpiler.jl"))
@testset "ExpressionExecutorCuda.jl" begin
include("ExpressionProcessingTests.jl")
include("InterpreterTests.jl")
# include("ExpressionProcessingTests.jl")
# include("InterpreterTests.jl")
include("TranspilerTests.jl")
end
@testset "CPU Interpreter" begin
include("CpuInterpreterTests.jl")
#@testset "CPU Interpreter" begin
# include("CpuInterpreterTests.jl")
#end
@testset "Performance tests" begin
# TODO: make performance tests
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
# Make array const:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
# Memory management like in C++ might help with performance improvements
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

View File

@ -9,5 +9,12 @@ Probably reference the performance evaluation papers for Julia and CUDA.jl
\section{Interpreter}
Talk about how the interpreter has been developed.
\subsection{Performance tuning}
Document the process of performance tuning
\section{Transpiler}
Talk about how the transpiler has been developed
Talk about how the transpiler has been developed
\subsection{Performance tuning}
Document the process of performance tuning

View File

@ -41,7 +41,7 @@ In order to answer the research questions, this thesis is divided into the follo
\item[Chapter 4: Implementation] \mbox{} \\
This chapter explains the implementation of the GPU interpreter and transpiler. The details of the implementation with the used technologies are covered, such as the interpretation process and the transpilation of the expressions into Parallel Thread Execution (PTX) code.
\item[Chapter 5: Evaluation] \mbox{} \\
The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons are outlined. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons is outlined. The comparison will not only include the time taken for the pure evaluation, but it will also include the overhead, like PTX code generation. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
\item[Chapter 6: Conclusion] \mbox{} \\
In the final chapter, the entire work is summarised. A brief overview of the implementation as well as the evaluation results will be provided. Additionally, an outlook of possible future research is given.
\end{description}

Binary file not shown.