Files
master-thesis/package/src/ExpressionExecutorCuda.jl
Daniel 381a4819c9
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
evaluation: continued with interpreter benchmarking and performance tuning
2025-05-29 15:13:17 +02:00

114 lines
4.5 KiB
Julia

module ExpressionExecutorCuda
include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")
include("Transpiler.jl")
# Namespace for the CPU interpreter: the definitions from "Code.jl" and "CpuInterpreter.jl"
# are included into their own submodule and are accessed further down in this file as
# `CpuInterpreter.Interpreter` and `CpuInterpreter.interpret!`.
module CpuInterpreter
include("Code.jl")
include("CpuInterpreter.jl")
end
using CUDA
using ..ExpressionProcessing
export interpret_gpu,interpret_cpu
export evaluate_gpu
# Assumptions made by the functions below:
# - Variables and parameters are indexed starting at 1, meaning the first variable/parameter has to be "x1"/"p1" and not "x0"/"p0".
# - Matrix X is stored in column-major order.
# - Each index i in exprs has to have its matching values in column i of matrix X, so that X[:,i] contains the values for expr[i]. The same goes for p.
#   This is asserted explicitly because in Julia the first index of an array does not have to be 1.
#
"""
    interpret_gpu(expressions, X, p; repetitions=1) -> Matrix{Float32}

Evaluate `expressions` on the GPU using the interpreter.

Every expression is converted to postfix form with `ExpressionProcessing.expr_to_postfix`
(in parallel on the CPU threads), the postfix expressions are packed into a single CUDA
array, and `Interpreter.interpret` is called with that array, the variables `X` (uploaded
to the GPU once) and the parameters `p`.

# Arguments
- `expressions`: the expressions to evaluate.
- `X`: the variable values. `size(X, 2)` and `size(X, 1)` are forwarded to the interpreter
  as the number of variable columns and rows respectively.
- `p`: one parameter vector per expression; must have the same axes as `expressions`.

# Keywords
- `repetitions`: how many times the evaluation is run back to back (simulates parameter
  tuning). Must be at least 1.

# Returns
The result of the last `Interpreter.interpret` call, converted to `Matrix{Float32}`.
NOTE(review): presumably `size(X, 2) × length(expressions)` (one column per expression);
confirm against `Interpreter.interpret`.

# Throws
- `AssertionError` if `expressions` and `p` do not have the same axes.
- `ArgumentError` if `repetitions < 1`.
"""
function interpret_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(expressions) == axes(p)
    # Without at least one run there is no result; previously this case returned an
    # uninitialised matrix. Checked before any GPU work is started.
    repetitions >= 1 ||
        throw(ArgumentError("repetitions must be at least 1, got $repetitions"))

    variableCols = size(X, 2)
    variableRows = size(X, 1)
    variables = CuArray(X)

    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
    @inbounds Threads.@threads for i in eachindex(expressions)
        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
    end

    # column corresponds to data for one expression
    cudaExprs = Utils.create_cuda_array(exprs, ExpressionProcessing.ExpressionElement(EMPTY, 0))
    exprsLength = length(exprs)
    exprsInnerLength = Utils.get_max_inner_length(exprs)

    # Simulate parameter tuning -> local search (X remains the same, p gets changed in small
    # steps and must be performed sequentially, which it is with this impl)
    local results
    for _ in 1:repetitions
        results = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, variables, variableCols, variableRows, p)
    end

    return results
end
"""
    evaluate_gpu(expressions, X, p; repetitions=1) -> Matrix{Float32}

Convert `expressions` to PTX code and execute the resulting kernels on the GPU.

Each expression is converted to postfix form, transpiled with `Transpiler.transpile` and
compiled with `Transpiler.compile_kernel` (in parallel on the CPU threads). The compiled
kernels are then passed to `Transpiler.evaluate` together with the variables `X`
(uploaded to the GPU once) and the parameters `p`.

# Arguments
- `expressions`: the expressions to evaluate.
- `X`: the variable values; each column is one variable set, each row one variable.
- `p`: one parameter vector per expression; must have the same axes as `expressions`.

# Keywords
- `repetitions`: how many times the compiled kernels are evaluated back to back (simulates
  parameter tuning). Transpilation and compilation happen only once. Must be at least 1.

# Returns
The result of the last `Transpiler.evaluate` call, converted to `Matrix{Float32}`.
NOTE(review): presumably `size(X, 2) × length(expressions)` (one column per expression);
confirm against `Transpiler.evaluate`.

# Throws
- `AssertionError` if `expressions` and `p` do not have the same axes.
- `ArgumentError` if `repetitions < 1`.
"""
function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(expressions) == axes(p)
    # Without at least one run there is no result; previously this case returned an
    # uninitialised matrix. Checked before any GPU work is started.
    repetitions >= 1 ||
        throw(ArgumentError("repetitions must be at least 1, got $repetitions"))

    numVariableSets = size(X, 2) # nr. of columns of X
    variableSetSize = size(X, 1) # nr. of rows of X
    variables = CuArray(X)

    # parameters get transformed into matrix. Will be nr. of rows in parameter matrix
    largestParameterSetSize = Utils.get_max_inner_length(p)

    compiledKernels = Vector{CuFunction}(undef, length(expressions))
    kernelName = "evaluate_gpu"
    @inbounds Threads.@threads for i in eachindex(expressions)
        ex = ExpressionProcessing.expr_to_postfix(expressions[i])
        # i-1 because julia is 1-based but PTX needs 0-based indexing
        ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName)
        compiledKernels[i] = Transpiler.compile_kernel(ptxKernel, kernelName)
    end

    # Simulate parameter tuning -> local search (X remains the same, p gets changed in small
    # steps and must be performed sequentially, which it is with this impl)
    local results
    for _ in 1:repetitions
        results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p, kernelName)
    end

    return results
end
# Evaluate a single expression `repetitions` times, writing the result into `column`.
function _interpret_cpu_column!(column, expr::Expr, X::Matrix{Float32}, params::Vector{Float32}, repetitions)
    # The interpreter holds the postfix code and buffers for evaluation. It is costly to
    # create, so it is built once and reused for every repetition to reduce the number of
    # allocations.
    interpreter = CpuInterpreter.Interpreter{Float32}(expr, length(params))
    for _ in 1:repetitions
        CpuInterpreter.interpret!(column, interpreter, X, params)
    end
    return nothing
end

"""
    interpret_cpu(exprs, X, p; repetitions=1, parallel=false) -> Matrix{Float32}

Evaluate `exprs` on the CPU using `CpuInterpreter`.

# Arguments
- `exprs`: the expressions to evaluate.
- `X`: the variable values, passed unchanged to `CpuInterpreter.interpret!`.
- `p`: one parameter vector per expression; must have the same axes as `exprs`.

# Keywords
- `repetitions`: how many times each expression is evaluated (e.g. to simulate evaluating
  it for different parameters). Must be at least 1.
- `parallel`: if `true`, the expressions are distributed over the available threads with
  `Threads.@threads`; each expression writes only to its own result column.

# Returns
A `size(X, 1) × length(exprs)` matrix; column `i` holds the result for `exprs[i]`.
NOTE(review): the GPU entry points of this module use `size(X, 2)` as the number of
variable sets, whereas the result here has `size(X, 1)` rows - presumably the CPU
interpreter expects `X` in the transposed layout; confirm against callers.

# Throws
- `AssertionError` if `exprs` and `p` do not have the same axes.
- `ArgumentError` if `repetitions < 1`.
"""
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    # Without at least one run the result matrix would be returned uninitialised.
    repetitions >= 1 ||
        throw(ArgumentError("repetitions must be at least 1, got $repetitions"))

    nrows = size(X, 1)
    # each column of the matrix has the result for an expr
    res = Matrix{Float32}(undef, nrows, length(exprs))

    if parallel
        Threads.@threads for i in eachindex(exprs)
            _interpret_cpu_column!((@view res[:, i]), exprs[i], X, p[i], repetitions)
        end
    else
        for i in eachindex(exprs)
            _interpret_cpu_column!((@view res[:, i]), exprs[i], X, p[i], repetitions)
        end
    end

    return res
end
end