code: started finalising transpilation process and preparing for performance testing and tuning
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
This commit is contained in:
parent
db02e9f90f
commit
baa37ea183
|
@ -1,6 +1,6 @@
|
|||
name = "ExpressionExecutorCuda"
|
||||
uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe"
|
||||
authors = ["Daniel Wiplinger"]
|
||||
authors = ["Daniel Roth"]
|
||||
version = "1.0.0-DEV"
|
||||
|
||||
[deps]
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
module ExpressionExecutorCuda
|
||||
include("Utils.jl")
|
||||
include("ExpressionProcessing.jl")
|
||||
include("Interpreter.jl")
|
||||
|
||||
|
@ -13,18 +14,26 @@ export test
|
|||
|
||||
# Some assertions:
|
||||
# Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
|
||||
# Matrix X is column major
|
||||
# each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
|
||||
# This assertion is made, because in julia, the first index doesn't have to be 1
|
||||
#
|
||||
|
||||
# Evaluate Expressions on the GPU
|
||||
"""
    interpret_gpu(exprs, X, p)

Entry point for evaluating `exprs` with the GPU interpreter (work in progress).

At the moment this converts the first expression to postfix notation, checks that `exprs`
and `p` share the same axes and allocates the result matrix. The interpretation itself is
not wired up yet, so the returned matrix is uninitialised.

# Arguments
- `exprs`: the expressions to evaluate.
- `X`: the variable matrix.
- `p`: one parameter vector per expression.

# Returns
An uninitialised `Matrix{Float32}` with `size(X, 2)` rows and `length(exprs)` columns.
"""
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    firstPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
    @assert axes(exprs) == axes(p)

    nrOfResultRows = size(X, 2)
    output = Matrix{Float32}(undef, nrOfResultRows, length(exprs))
    # interpret

    return output
end
|
||||
|
||||
# Convert Expressions to PTX Code and execute that instead
|
||||
"""
    evaluate_gpu(exprs, X, p)

Entry point for evaluating `exprs` by converting them to PTX code (work in progress).

At the moment this only checks that `exprs` and `p` share the same axes and allocates the
result matrix; the returned matrix is uninitialised.

# Arguments
- `exprs`: the expressions to evaluate.
- `X`: the variable matrix.
- `p`: one parameter vector per expression.

# Returns
An uninitialised `Matrix{Float32}` with `size(X, 2)` rows and `length(exprs)` columns.
"""
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
    @assert axes(exprs) == axes(p)

    nrOfResultRows = size(X, 2)
    return Matrix{Float32}(undef, nrOfResultRows, length(exprs))
end
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ module Interpreter
|
|||
using CUDA
|
||||
using StaticArrays
|
||||
using ..ExpressionProcessing
|
||||
using ..Utils
|
||||
|
||||
export interpret
|
||||
|
||||
|
@ -14,10 +15,10 @@ export interpret
|
|||
function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
|
||||
variableCols = size(variables, 2) # number of variable sets to use for each expression
|
||||
cudaVars = CuArray(variables)
|
||||
cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
|
||||
cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
|
||||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
|
||||
cudaExprs = Utils.create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
|
||||
# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
|
||||
cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
|
||||
cudaStepsize = CuArray([Utils.get_max_inner_length(expressions), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
|
||||
|
||||
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
||||
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
|
||||
|
@ -108,44 +109,4 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
|
|||
return
|
||||
end
|
||||
|
||||
|
||||
"""
    get_max_inner_length(vec)

Return the number of entries of the longest inner vector of `vec`.

Returns `0` when `vec` is empty or when every inner vector is empty.
"""
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
    # `init=0` keeps the reduction well-defined for an empty outer vector
    return maximum(length, vec; init=0)
end
|
||||
|
||||
"""
    create_cuda_array(data, invalidElement)

Return a `CuArray` filled with the data provided.

The inner vectors do not have to have the same length. Column `i` of the result holds
`data[i]`; all missing elements have the value `invalidElement`.
"""
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    # `convert_to_matrix` already pads every column to the longest inner vector, so the
    # padded host matrix fixes the shape; the constructor allocates and uploads it in one step
    paddedData = convert_to_matrix(data, invalidElement)

    return CuArray(paddedData)
end
|
||||
|
||||
"""
    convert_to_matrix(vec, invalidElement)

Convert a vector of vectors into a matrix in which column `i` holds `vec[i]`.

The inner vectors do not need to have the same length. The number of rows equals the length
of the longest inner vector; all entries that cannot be filled have `invalidElement` as
their value.
"""
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    nrOfRows = get_max_inner_length(vec) # the longest inner vector determines the column height
    vecMat = fill(invalidElement, nrOfRows, length(vec))

    for (col, inner) in enumerate(vec)
        # write straight into the column through a view; slicing with `vecMat[:, col]`
        # would allocate a temporary copy that has to be assigned back afterwards
        copyto!(view(vecMat, :, col), inner)
    end

    return vecMat
end
|
||||
|
||||
end
|
|
@ -1,6 +1,7 @@
|
|||
module Transpiler
|
||||
using CUDA
|
||||
using ..ExpressionProcessing
|
||||
using ..Utils
|
||||
|
||||
# Number of threads per block/SM + max number of registers
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
|
||||
|
@ -25,16 +26,57 @@ using ..ExpressionProcessing
|
|||
|
||||
const Operand = Union{Float32, String} # Operand is either fixed value or register
|
||||
|
||||
function evaluate(expression::ExpressionProcessing.PostfixType, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
|
||||
# TODO: think of how to do this. Probably get all expressions. Transpile them in parallel and then execute the generated code.
|
||||
cudaVars = CuArray(variables)
|
||||
"""
    evaluate(expressions, variables, parameters)

Transpile every expression to PTX code, link each one into a CUDA kernel and launch the
kernels over all variable sets.

# Arguments
- `expressions`: the expressions in postfix notation; one kernel is generated per expression.
- `variables`: the variable matrix. Its number of rows is passed to `transpile` as the size
  of one variable set, its number of columns is used as the number of variable sets.
- `parameters`: one parameter vector per expression. The vectors may differ in length.

# Returns
`nothing`. The results are written to a device array with `size(variables, 2)` rows and
`length(expressions)` columns, which is not copied back to the host yet.
"""
function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
    varRows = size(variables, 1)
    # has to be known before the result array is allocated further down
    variableCols = size(variables, 2)
    # identical for every expression, therefore computed once instead of once per kernel
    maxParamSetSize = Utils.get_max_inner_length(parameters)
    kernels = Vector{CuFunction}(undef, length(expressions))

    # Test a `Threads.@threads` version of this loop again when doing performance tests. With the simple "functionality" tests
    # the parallel version took 0.03 seconds while the sequential version took 0.00009 seconds
    for i in eachindex(expressions)
        ptxCode = transpile(expressions[i], varRows, maxParamSetSize)

        linker = CuLink()
        add_data!(linker, "ExpressionProcessing", ptxCode)

        image = complete(linker)

        mod = CuModule(image)
        kernels[i] = CuFunction(mod, "ExpressionProcessing")
    end

    cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info)
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info)

    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))

    # without variable sets there is nothing to launch, and `cld(0, 0)` below would throw a DivideError
    variableCols == 0 && return nothing

    # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
    for i in eachindex(kernels)
        config = launch_configuration(kernels[i])
        threads = min(variableCols, config.threads)
        blocks = cld(variableCols, threads)

        cudacall(kernels[i], Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
    end

    return nothing
end
|
||||
|
||||
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
|
||||
# seekstart(buf1); write(buf2, buf1)
|
||||
"
|
||||
- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
|
||||
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent on the longest parameter set
|
||||
"
|
||||
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String
|
||||
exitJumpLocationMarker = "\$L__BB0_2"
|
||||
ptxBuffer = IOBuffer()
|
||||
|
@ -59,7 +101,6 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
|
|||
println(ptxBuffer, "}")
|
||||
|
||||
generatedCode = String(take!(ptxBuffer))
|
||||
println(generatedCode)
|
||||
return generatedCode
|
||||
end
|
||||
|
||||
|
@ -124,6 +165,9 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)
|
|||
return (String(take!(guardBuffer)), globalThreadId)
|
||||
end
|
||||
|
||||
"
|
||||
- param ```parametersSetSize```: Size of the largest parameter set
|
||||
"
|
||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer,
|
||||
parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String
|
||||
codeBuffer = IOBuffer()
|
||||
|
@ -174,7 +218,7 @@ end
|
|||
- param ```loadLocation```: The location from where to load the value
|
||||
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set
|
||||
- param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
|
||||
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```
|
||||
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
|
||||
"
|
||||
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String
|
||||
# loadLocation + startIndex + valueIndex * bytes (4 in our case)
|
||||
|
|
42
package/src/Utils.jl
Normal file
42
package/src/Utils.jl
Normal file
|
@ -0,0 +1,42 @@
|
|||
module Utils

# `create_cuda_array` constructs a `CuArray`, so CUDA has to be loaded in this module as well
using CUDA

"""
    convert_to_matrix(vec, invalidElement)

Convert a vector of vectors into a matrix in which column `i` holds `vec[i]`.

The inner vectors do not need to have the same length. The number of rows equals the length
of the longest inner vector; all entries that cannot be filled have `invalidElement` as
their value.
"""
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    nrOfRows = get_max_inner_length(vec) # the longest inner vector determines the column height
    vecMat = fill(invalidElement, nrOfRows, length(vec))

    for (col, inner) in enumerate(vec)
        # write straight into the column through a view; slicing with `vecMat[:, col]`
        # would allocate a temporary copy that has to be assigned back afterwards
        copyto!(view(vecMat, :, col), inner)
    end

    return vecMat
end

"""
    get_max_inner_length(vec)

Return the number of entries of the longest inner vector of `vec`.

Returns `0` when `vec` is empty or when every inner vector is empty.
"""
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
    # `init=0` keeps the reduction well-defined for an empty outer vector
    return maximum(length, vec; init=0)
end

"""
    create_cuda_array(data, invalidElement)

Return a `CuArray` filled with the data provided.

The inner vectors do not have to have the same length. Column `i` of the result holds
`data[i]`; all missing elements have the value `invalidElement`.
"""
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    # the padded host matrix already has the final shape; the constructor allocates the
    # device array and uploads the data in one step
    paddedData = convert_to_matrix(data, invalidElement)

    return CuArray(paddedData)
end

end
|
|
@ -1,6 +1,7 @@
|
|||
using CUDA
|
||||
using .ExpressionProcessing
|
||||
using .Interpreter
|
||||
using .Utils
|
||||
|
||||
expressions = Vector{Expr}(undef, 2)
|
||||
variables = Matrix{Float32}(undef, 2,2)
|
||||
|
@ -35,7 +36,7 @@ end
|
|||
reference[2,2] = 0.0
|
||||
# reference = Matrix([5.0, NaN],
|
||||
# [5.0, 0.0])
|
||||
result = Interpreter.convert_to_matrix(parameters, NaN32)
|
||||
result = Utils.convert_to_matrix(parameters, NaN32)
|
||||
|
||||
@test isequal(result, reference)
|
||||
end
|
||||
|
|
|
@ -28,6 +28,7 @@ parameters[2][2] = 0.0
|
|||
|
||||
# generatedCode = Transpiler.transpile(postfixExpr)
|
||||
generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP
|
||||
# println(generatedCode)
|
||||
# CUDA.@sync interpret(postfixExprs, variables, parameters)
|
||||
|
||||
# This is just here for testing. This will be called inside the execute method in the Transpiler module
|
||||
|
@ -40,4 +41,12 @@ parameters[2][2] = 0.0
|
|||
func = CuFunction(mod, "ExpressionProcessing")
|
||||
end
|
||||
|
||||
# Smoke test: runs the first two expressions through `Transpiler.evaluate` and prints the
# time taken. No result values are checked.
@testset "Test transpiler evaluation" begin
    postfixExprs = ExpressionProcessing.PostfixType[
        expr_to_postfix(expressions[1]),
        expr_to_postfix(expressions[2]),
    ]

    @time Transpiler.evaluate(postfixExprs, variables, parameters)
end
|
||||
|
||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
|
||||
|
|
|
@ -2,17 +2,33 @@ using ExpressionExecutorCuda
|
|||
using Test
|
||||
|
||||
const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
|
||||
include(joinpath(baseFolder, "src", "Utils.jl"))
|
||||
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
|
||||
include(joinpath(baseFolder, "src", "Interpreter.jl"))
|
||||
include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
||||
|
||||
@testset "ExpressionExecutorCuda.jl" begin
|
||||
include("ExpressionProcessingTests.jl")
|
||||
include("InterpreterTests.jl")
|
||||
# include("ExpressionProcessingTests.jl")
|
||||
# include("InterpreterTests.jl")
|
||||
include("TranspilerTests.jl")
|
||||
end
|
||||
|
||||
|
||||
@testset "CPU Interpreter" begin
|
||||
include("CpuInterpreterTests.jl")
|
||||
#@testset "CPU Interpreter" begin
|
||||
# include("CpuInterpreterTests.jl")
|
||||
#end
|
||||
|
||||
@testset "Performance tests" begin
|
||||
# TODO: make performance tests
|
||||
|
||||
# Put data in shared memory:
|
||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
||||
|
||||
# Make array const:
|
||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
|
||||
|
||||
# Memory management like in C++ might help with performance improvements
|
||||
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
|
||||
|
||||
|
||||
end
|
|
@ -9,5 +9,12 @@ Probably reference the performance evaluation papers for Julia and CUDA.jl
|
|||
\section{Interpreter}
|
||||
Talk about how the interpreter has been developed.
|
||||
|
||||
\subsection{Performance tuning}
|
||||
Document the process of performance tuning
|
||||
|
||||
|
||||
\section{Transpiler}
|
||||
Talk about how the transpiler has been developed
|
||||
|
||||
\subsection{Performance tuning}
|
||||
Document the process of performance tuning
|
|
@ -41,7 +41,7 @@ In order to answer the research questions, this thesis is divided into the follo
|
|||
\item[Chapter 4: Implementation] \mbox{} \\
|
||||
This chapter explains the implementation of the GPU interpreter and transpiler. The details of the implementation with the used technologies are covered, such as the interpretation process and the transpilation of the expressions into Parallel Thread Execution (PTX) code.
|
||||
\item[Chapter 5: Evaluation] \mbox{} \\
|
||||
The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons are outlined. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
|
||||
The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons is outlined. The comparison will not only include the time taken for the pure evaluation, but also the overhead, such as PTX code generation. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
|
||||
\item[Chapter 6: Conclusion] \mbox{} \\
|
||||
In the final chapter, the entire work is summarised. A brief overview of the implementation as well as the evaluation results will be provided. Additionally, an outlook of possible future research is given.
|
||||
\end{description}
|
||||
|
|
BIN
thesis/main.pdf
BIN
thesis/main.pdf
Binary file not shown.
Loading…
Reference in New Issue
Block a user