diff --git a/package/Project.toml b/package/Project.toml index 7944c28..714f623 100644 --- a/package/Project.toml +++ b/package/Project.toml @@ -1,6 +1,6 @@ name = "ExpressionExecutorCuda" uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe" -authors = ["Daniel Wiplinger"] +authors = ["Daniel Roth"] version = "1.0.0-DEV" [deps] diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index 2192dcf..c5b8093 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -1,4 +1,5 @@ module ExpressionExecutorCuda +include("Utils.jl") include("ExpressionProcessing.jl") include("Interpreter.jl") @@ -13,18 +14,26 @@ export test # Some assertions: # Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0" +# Matrix X is column major # each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p # This assertion is made, because in julia, the first index doesn't have to be 1 # # Evaluate Expressions on the GPU function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32} - exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1]) + @assert axes(exprs) == axes(p) + ncols = size(X, 2) + + result = Matrix{Float32}(undef, ncols, length(exprs)) + # interpret end # Convert Expressions to PTX Code and execute that instead function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32} - # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU + @assert axes(exprs) == axes(p) + ncols = size(X, 2) + + result = Matrix{Float32}(undef, ncols, length(exprs)) end diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl index bbf58e9..3a74efa 100644 --- a/package/src/Interpreter.jl +++ 
b/package/src/Interpreter.jl @@ -2,6 +2,7 @@ module Interpreter using CUDA using StaticArrays using ..ExpressionProcessing +using ..Utils export interpret @@ -14,10 +15,10 @@ export interpret function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32} variableCols = size(variables, 2) # number of variable sets to use for each expression cudaVars = CuArray(variables) - cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression - cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression + cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression + cudaExprs = Utils.create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression # put into seperate cuArray, as this is static and would be inefficient to send seperatly to every kernel - cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression + cudaStepsize = CuArray([Utils.get_max_inner_length(expressions), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression # each expression has nr. of variable sets (nr. 
of columns of the variables) results and there are n expressions cudaResults = CuArray{Float32}(undef, variableCols, length(expressions)) @@ -108,44 +109,4 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var return end - -"Retrieves the number of entries for the largest inner vector" -function get_max_inner_length(vec::Vector{Vector{T}})::Int where T - maxLength = 0 - @inbounds for i in eachindex(vec) - if length(vec[i]) > maxLength - maxLength = length(vec[i]) - end - end - - return maxLength -end - -"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```" -function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T - dataCols = get_max_inner_length(data) - dataRows = length(data) - dataMat = convert_to_matrix(data, invalidElement) - cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions - copyto!(cudaArr, dataMat) - - return cudaArr -end - -"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length. 
- -All entries that cannot be filled have ```invalidElement``` as their value -" -function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T - vecCols = get_max_inner_length(vec) - vecRows = length(vec) - vecMat = fill(invalidElement, vecCols, vecRows) - - for i in eachindex(vec) - vecMat[:,i] = copyto!(vecMat[:,i], vec[i]) - end - - return vecMat -end - end \ No newline at end of file diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 81aa049..cd97400 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -1,6 +1,7 @@ module Transpiler using CUDA using ..ExpressionProcessing +using ..Utils # Number of threads per block/SM + max number of registers # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications @@ -25,16 +26,57 @@ using ..ExpressionProcessing const Operand = Union{Float32, String} # Operand is either fixed value or register -function evaluate(expression::ExpressionProcessing.PostfixType, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}) - # TODO: think of how to do this. Probably get all expressions. Transpile them in parallel and then execute the generatd code. - cudaVars = CuArray(variables) +function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}) + varRows = size(variables, 1) + kernels = Vector{CuFunction}(undef, length(expressions)) + + # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds + # Threads.@threads for i in eachindex(expressions) + # kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters)) - #kernel = transpile(expression, ) - # execute kernel. 
+ # linker = CuLink() + # add_data!(linker, "ExpressionProcessing", kernel) + + # image = complete(linker) + + # mod = CuModule(image) + # kernels[i] = CuFunction(mod, "ExpressionProcessing") + # end + for i in eachindex(expressions) + kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters)) + + linker = CuLink() + add_data!(linker, "ExpressionProcessing", kernel) + + image = complete(linker) + + mod = CuModule(image) + kernels[i] = CuFunction(mod, "ExpressionProcessing") + end + + cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info) + cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info) + + # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions + cudaResults = CuArray{Float32}(undef, variableCols, length(expressions)) + + # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance) + variableCols = size(variables, 2) + for i in eachindex(kernels) + config = launch_configuration(kernels[i]) + threads = min(variableCols, config.threads) + blocks = cld(variableCols, threads) + + cudacall(kernels[i], Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + end end # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string # seekstart(buf1); write(buf2, buf1) +" +- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix) +- param ```paramSetSize```: The size of the longest parameter set. 
As it has to be stored in a column major matrix, the nr of rows is dependent oon the longest parameter set +" function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String exitJumpLocationMarker = "\$L__BB0_2" ptxBuffer = IOBuffer() @@ -59,7 +101,6 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int println(ptxBuffer, "}") generatedCode = String(take!(ptxBuffer)) - println(generatedCode) return generatedCode end @@ -124,6 +165,9 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String) return (String(take!(guardBuffer)), globalThreadId) end +" +- param ```parametersSetSize```: Size of the largest parameter set +" function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer, parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String codeBuffer = IOBuffer() @@ -174,7 +218,7 @@ end - param ```loadLocation```: The location from where to load the value - param ```valueIndex```: 0-based index of the value in the variable set/parameter set - param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId -- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex``` +- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set)) " function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String # loadLocation + startIndex + valueIndex * bytes (4 in our case) diff --git a/package/src/Utils.jl b/package/src/Utils.jl new file mode 100644 index 0000000..595bd21 --- /dev/null +++ b/package/src/Utils.jl @@ -0,0 +1,42 @@ +module Utils + +"Converts a vector of vectors into a matrix. 
The inner vectors do not need to have the same length. + +All entries that cannot be filled have ```invalidElement``` as their value +" +function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T + vecCols = get_max_inner_length(vec) + vecRows = length(vec) + vecMat = fill(invalidElement, vecCols, vecRows) + + for i in eachindex(vec) + vecMat[:,i] = copyto!(vecMat[:,i], vec[i]) + end + + return vecMat +end + +"Retrieves the number of entries for the largest inner vector" +function get_max_inner_length(vec::Vector{Vector{T}})::Int where T + maxLength = 0 + @inbounds for i in eachindex(vec) + if length(vec[i]) > maxLength + maxLength = length(vec[i]) + end + end + + return maxLength +end + +"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```" +function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T + dataCols = Utils.get_max_inner_length(data) + dataRows = length(data) + dataMat = Utils.convert_to_matrix(data, invalidElement) + cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions + copyto!(cudaArr, dataMat) + + return cudaArr +end + +end \ No newline at end of file diff --git a/package/test/InterpreterTests.jl b/package/test/InterpreterTests.jl index 8af9063..d383c70 100644 --- a/package/test/InterpreterTests.jl +++ b/package/test/InterpreterTests.jl @@ -1,6 +1,7 @@ using CUDA using .ExpressionProcessing using .Interpreter +using .Utils expressions = Vector{Expr}(undef, 2) variables = Matrix{Float32}(undef, 2,2) @@ -35,7 +36,7 @@ end reference[2,2] = 0.0 # reference = Matrix([5.0, NaN], # [5.0, 0.0]) - result = Interpreter.convert_to_matrix(parameters, NaN32) + result = Utils.convert_to_matrix(parameters, NaN32) @test isequal(result, reference) end diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl index 01f859d..7234e2d 
100644 --- a/package/test/TranspilerTests.jl +++ b/package/test/TranspilerTests.jl @@ -28,6 +28,7 @@ parameters[2][2] = 0.0 # generatedCode = Transpiler.transpile(postfixExpr) generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP + # println(generatedCode) # CUDA.@sync interpret(postfixExprs, variables, parameters) # This is just here for testing. This will be called inside the execute method in the Transpiler module @@ -40,4 +41,12 @@ parameters[2][2] = 0.0 func = CuFunction(mod, "ExpressionProcessing") end +@testset "Test transpiler evaluation" begin + postfixExprs = Vector{ExpressionProcessing.PostfixType}() + push!(postfixExprs, expr_to_postfix(expressions[1])) + push!(postfixExprs, expr_to_postfix(expressions[2])) + + @time Transpiler.evaluate(postfixExprs, variables, parameters) +end + #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)" diff --git a/package/test/runtests.jl b/package/test/runtests.jl index ee68520..1fb88ef 100644 --- a/package/test/runtests.jl +++ b/package/test/runtests.jl @@ -2,17 +2,33 @@ using ExpressionExecutorCuda using Test const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda))) +include(joinpath(baseFolder, "src", "Utils.jl")) include(joinpath(baseFolder, "src", "ExpressionProcessing.jl")) include(joinpath(baseFolder, "src", "Interpreter.jl")) include(joinpath(baseFolder, "src", "Transpiler.jl")) @testset "ExpressionExecutorCuda.jl" begin - include("ExpressionProcessingTests.jl") - include("InterpreterTests.jl") + # include("ExpressionProcessingTests.jl") + # include("InterpreterTests.jl") include("TranspilerTests.jl") end -@testset "CPU Interpreter" begin - include("CpuInterpreterTests.jl") +#@testset "CPU Interpreter" begin +# include("CpuInterpreterTests.jl") +#end + +@testset "Performance tests" begin + # TODO: make performance tests + + # Put data in shared memory: + # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory + + # Make 
array const: + # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays + + # Memory management like in C++ might help with performance improvements + # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management + + end \ No newline at end of file diff --git a/thesis/chapters/implementation.tex b/thesis/chapters/implementation.tex index 1c01a40..bb01a59 100644 --- a/thesis/chapters/implementation.tex +++ b/thesis/chapters/implementation.tex @@ -9,5 +9,12 @@ Probably reference the performance evaluation papers for Julia and CUDA.jl \section{Interpreter} Talk about how the interpreter has been developed. +\subsection{Performance tuning} +Document the process of performance tuning + + \section{Transpiler} -Talk about how the transpiler has been developed \ No newline at end of file +Talk about how the transpiler has been developed + +\subsection{Performance tuning} +Document the process of performance tuning \ No newline at end of file diff --git a/thesis/chapters/introduction.tex b/thesis/chapters/introduction.tex index 2583d13..0a18f31 100644 --- a/thesis/chapters/introduction.tex +++ b/thesis/chapters/introduction.tex @@ -41,7 +41,7 @@ In order to answer the research questions, this thesis is divided into the follo \item[Chapter 4: Implementation] \mbox{} \\ This chapter explains the implementation of the GPU interpreter and transpiler. The details of the implementation with the used technologies are covered, such as the interpretation process and the transpilation of the expressions into Parallel Thread Execution (PTX) code. \item[Chapter 5: Evaluation] \mbox{} \\ - The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons are outlined. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance. 
+ The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons is outlined. The comparison will not only include the time taken for the pure evaluation, but it will also include the overhead, like PTX code generation. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance. \item[Chapter 6: Conclusion] \mbox{} \\ In the final chapter, the entire work is summarised. A brief overview of the implementation as well as the evaluation results will be provided. Additionally, an outlook of possible future research is given. \end{description} diff --git a/thesis/main.pdf b/thesis/main.pdf index eea60eb..56dabe2 100644 Binary files a/thesis/main.pdf and b/thesis/main.pdf differ