module ExpressionExecutorCuda

include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")
include("Transpiler.jl")

# The CPU interpreter lives in its own nested module.
module CpuInterpreter
include("Code.jl")
include("CpuInterpreter.jl")
end

using CUDA
using ..ExpressionProcessing

export interpret_gpu, interpret_cpu, evaluate_gpu

# Conventions assumed by the entry points below:
#   * Variables and parameters are numbered starting at 1, i.e. the first
#     variable/parameter must be written "x1"/"p1", not "x0"/"p0".
#   * Matrix X is column major.
#   * Each index i in the expression vector has to have its matching values in
#     column i of X, so that X[:,i] contains the values for expr[i]. The same
#     goes for p.
#   * `axes(expressions) == axes(p)` is asserted explicitly, because in Julia the
#     first index of an array does not have to be 1.

# Evaluate expressions on the GPU with the interpreter.
#
# Arguments
#   expressions - the expressions to evaluate
#   X           - variable matrix; it is uploaded to the GPU once via `CuArray`
#   p           - one parameter vector per expression (same axes as `expressions`)
#   repetitions - how many times `Interpreter.interpret` is called in sequence
#
# Returns the result of the last `Interpreter.interpret` call, converted to
# `Matrix{Float32}`. If `repetitions < 1` the interpreter is never called and
# the preallocated, uninitialised `size(X, 2) x length(expressions)` matrix is
# returned as is.
function interpret_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(expressions) == axes(p)

    variableRows, variableCols = size(X)
    variables = CuArray(X)

    # Convert every expression to its postfix form, one loop iteration per expression.
    postfixExprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
    @inbounds Threads.@threads for idx in eachindex(expressions)
        postfixExprs[idx] = ExpressionProcessing.expr_to_postfix(expressions[idx])
    end

    # Each column of the resulting array corresponds to the data of one expression.
    # NOTE(review): ExpressionElement(EMPTY, 0) is presumably the filler used for
    # expressions shorter than the longest one - confirm in Utils.create_cuda_array.
    emptyElement = ExpressionProcessing.ExpressionElement(EMPTY, 0)
    cudaExprs = Utils.create_cuda_array(postfixExprs, emptyElement)
    exprsLength = length(postfixExprs)
    exprsInnerLength = Utils.get_max_inner_length(postfixExprs)

    results = Matrix{Float32}(undef, variableCols, exprsLength)

    # Simulates parameter tuning (local search): X stays the same while p would be
    # changed in small steps, which has to happen sequentially - as it does here.
    for _ in 1:repetitions
        results = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, variables, variableCols, variableRows, p)
    end

    return results
end

# Convert the expressions to PTX code, compile them and execute the compiled
# kernels instead of interpreting the expressions.
#
# Arguments
#   expressions - the expressions to transpile and evaluate
#   X           - variable matrix; it is uploaded to the GPU once via `CuArray`
#   p           - one parameter vector per expression (same axes as `expressions`)
#   repetitions - how many times `Transpiler.evaluate` is called in sequence
#
# Returns the result of the last `Transpiler.evaluate` call, converted to
# `Matrix{Float32}`. If `repetitions < 1` the kernels are compiled but never
# run, and the preallocated, uninitialised `size(X, 2) x length(expressions)`
# matrix is returned as is.
function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(expressions) == axes(p)

    # rows of X = size of one variable set, columns of X = number of variable sets
    variableSetSize, numVariableSets = size(X)
    variables = CuArray(X)

    # The parameters get transformed into a matrix; this will be its number of rows.
    largestParameterSetSize = Utils.get_max_inner_length(p)

    kernelName = "evaluate_gpu"
    compiledKernels = Vector{CuFunction}(undef, length(expressions))

    # Transpile and compile one kernel per expression.
    @inbounds Threads.@threads for idx in eachindex(expressions)
        postfix = ExpressionProcessing.expr_to_postfix(expressions[idx])
        # idx-1 because Julia is 1-based but PTX needs 0-based indexing
        ptx = Transpiler.transpile(postfix, variableSetSize, largestParameterSetSize, numVariableSets, idx - 1, kernelName)
        compiledKernels[idx] = Transpiler.compile_kernel(ptx, kernelName)
    end

    results = Matrix{Float32}(undef, numVariableSets, length(expressions))

    # Simulates parameter tuning (local search): X stays the same while p would be
    # changed in small steps, which has to happen sequentially - as it does here.
    for _ in 1:repetitions
        results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p, kernelName)
    end

    return results
end

# Evaluate a single expression `repetitions` times on the CPU, writing into
# column `i` of `res`.
#
# The interpreter holds the postfix code and the buffers for evaluation and is
# costly to create, so it is built once here and reused for every repetition to
# reduce the number of allocations.
function _interpret_cpu_column!(res, i, expr, X, params, repetitions)
    interpreter = CpuInterpreter.Interpreter{Float32}(expr, length(params))
    for _ in 1:repetitions
        CpuInterpreter.interpret!((@view res[:, i]), interpreter, X, params)
    end
    return nothing
end

# Evaluate expressions on the CPU.
#
# Arguments
#   exprs       - the expressions to evaluate
#   X           - variable matrix, passed unchanged to `CpuInterpreter.interpret!`
#   p           - one parameter vector per expression (same axes as `exprs`)
#   repetitions - how many times each expression is evaluated
#   parallel    - if true, the loop over the expressions runs under `Threads.@threads`
#
# Returns a `size(X, 1) x length(exprs)` matrix in which column i is the buffer
# handed to `CpuInterpreter.interpret!` for `exprs[i]`. If `repetitions < 1`
# nothing is written and the matrix is returned uninitialised.
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
    @assert axes(exprs) == axes(p)

    # each column of the matrix has the result for an expr
    res = Matrix{Float32}(undef, size(X, 1), length(exprs))

    if parallel
        Threads.@threads for i in eachindex(exprs)
            _interpret_cpu_column!(res, i, exprs[i], X, p[i], repetitions)
        end
    else
        for i in eachindex(exprs)
            _interpret_cpu_column!(res, i, exprs[i], X, p[i], repetitions)
        end
    end

    return res
end

end