implementation: finished pre-processing section; updated code
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
2025-04-26 13:46:23 +02:00
parent ad2eab2e0a
commit e571fa5bd6
10 changed files with 238 additions and 46 deletions

View File

@@ -26,8 +26,9 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
ncols = size(X, 2)
results = Matrix{Float32}(undef, ncols, length(exprs))
# TODO: create CuArray for variables here already, as they never change
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
results = Interpreter.interpret(exprs, X, p)
end
@@ -40,8 +41,9 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
ncols = size(X, 2)
results = Matrix{Float32}(undef, ncols, length(exprs))
# TODO: create CuArray for variables here already, as they never change
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
results = Transpiler.evaluate(exprs, X, p)
end

View File

@@ -9,6 +9,7 @@ export ExpressionElement
@enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
@enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 INDEX=3
const binary_operators = [ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER]
const unary_operators = [ABS, LOG, EXP, SQRT]
struct ExpressionElement
@@ -17,12 +18,13 @@ struct ExpressionElement
end
const PostfixType = Vector{ExpressionElement}
const cache = Dict{Expr, PostfixType}()
"
Converts a julia expression to its postfix notation.
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision.
NOTE: This function is not thread safe, especially cache access is not thread safe
"
function expr_to_postfix(expr::Expr)::PostfixType
function expr_to_postfix(expr::Expr, cache::Dict{Expr, PostfixType})::PostfixType
if haskey(cache, expr)
return cache[expr]
end
@@ -34,7 +36,7 @@ function expr_to_postfix(expr::Expr)::PostfixType
arg = expr.args[j]
if typeof(arg) === Expr
append!(postfix, expr_to_postfix(arg))
append!(postfix, expr_to_postfix(arg, cache))
elseif typeof(arg) === Symbol # variables/parameters
# maybe TODO: replace the parameters with their respective values, as this might make the expr evaluation faster
exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
@@ -56,6 +58,8 @@ function expr_to_postfix(expr::Expr)::PostfixType
if operator in unary_operators
push!(postfix, convert_to_ExpressionElement(operator))
end
cache[expr] = postfix
return postfix
end

View File

@ -6,6 +6,8 @@ using ..Utils
export interpret
const cacheFrontend = Dict{Expr, PostfixType}()
"Interprets the given expressions with the values provided.
# Arguments
- expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
@@ -13,10 +15,9 @@ export interpret
- parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
"
function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
@inbounds for i in eachindex(expressions)
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
end
variableCols = size(variables, 2) # number of variable sets to use for each expression

View File

@@ -7,21 +7,23 @@ using ..Utils
const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
const cacheFrontend = Dict{Expr, PostfixType}()
const transpilerCache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
varRows = size(variables, 1)
variableCols = size(variables, 2)
kernels = Vector{CuFunction}(undef, length(expressions))
# TODO: test this again with multiple threads. The first time I tried, I was using only one thread
# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
# Threads.@threads for i in eachindex(expressions)
# cacheLock = ReentrantLock()
# cacheHit = false
# lock(cacheLock) do
# if haskey(cache, expressions[i])
# kernels[i] = cache[expressions[i]]
# if haskey(transpilerCache, expressions[i])
# kernels[i] = transpilerCache[expressions[i]]
# cacheHit = true
# end
# end
@@ -42,16 +44,16 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# mod = CuModule(image)
# kernels[i] = CuFunction(mod, "ExpressionProcessing")
# @lock cacheLock cache[expressions[i]] = kernels[i]
# @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
# end
@inbounds for i in eachindex(expressions)
if haskey(cache, expressions[i])
kernels[i] = cache[expressions[i]]
if haskey(transpilerCache, expressions[i])
kernels[i] = transpilerCache[expressions[i]]
continue
end
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
linker = CuLink()
@@ -61,7 +63,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
mod = CuModule(image)
kernels[i] = CuFunction(mod, "ExpressionProcessing")
cache[expressions[i]] = kernels[i]
transpilerCache[expressions[i]] = kernels[i]
end
cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
@@ -78,7 +80,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
end
return cudaResults
end

View File

@@ -73,16 +73,17 @@ end
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re do the tests and perform them on FH PCs)
# University setup at 10.20.1.7 if needed
compareWithCPU = true
compareWithCPU = false
suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
varsets_small = 100
varsets_medium = 1000
varsets_large = 10000
# TODO: see CpuInterpreterTests.jl to see how all data is loaded and implement this here
varsets_small = 1000 # 1k should be absolute minimum
varsets_medium = 10000
varsets_large = 100000 # 100k should be absolute maximum (although not as strict as minimum)
if compareWithCPU
X_small = randn(Float32, varsets_small, 5)
@@ -112,7 +113,7 @@ suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GP
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
results = run(suite, verbose=true, seconds=180)
results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
if compareWithCPU
medianCPU = median(results["CPU"])

View File

@@ -3,7 +3,7 @@ using CUDA
using .Transpiler
using .Interpreter
varsets_medium = 1000
varsets_medium = 10000
X = randn(Float32, 5, varsets_medium)
exprsGPU = [