benchmarking: removed caches to get an initial performance measurement; still some problems remain

Author: Daniel
Date: 2025-05-10 13:11:27 +02:00
parent 6d3c3164cf
commit 2ba1fef5ba
5 changed files with 68 additions and 54 deletions

@@ -8,9 +8,10 @@ using ..Utils
const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register
const cacheFrontend = Dict{Expr, PostfixType}()
const transpilerCache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
"
- kwparam ```frontendCache```: The cache that stores the (partial) results of the frontend, to speed up the pre-processing
- kwparam ```transpilerCache```: The cache that stores the result of the transpilation. Useful for parameter optimisation, as the same expression gets executed multiple times
"
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
varRows = size(variables, 1)
variableCols = size(variables, 2)
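
For context, the two Dict constants touched above implement a plain memoisation pattern keyed on the expression. A minimal, self-contained sketch of that pattern follows; the names exampleCache, fake_transpile and cached_transpile are illustrative only and not part of this repository:

# Sketch of the memoisation pattern behind cacheFrontend/transpilerCache.
const exampleCache = Dict{Expr, String}()

fake_transpile(ex::Expr) = string(ex)   # placeholder for the real, expensive work

function cached_transpile(ex::Expr)::String
    # get! runs the do-block only on a cache miss and stores the result under `ex`
    return get!(exampleCache, ex) do
        fake_transpile(ex)
    end
end

cached_transpile(:(x + 1.0f0))   # first call computes and caches
cached_transpile(:(x + 1.0f0))   # second call is served from the cache
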
@@ -48,22 +49,33 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# end
@inbounds for i in eachindex(expressions)
if haskey(transpilerCache, expressions[i])
kernels[i] = transpilerCache[expressions[i]]
continue
end
# if haskey(resultCache, expressions[i])
# kernels[i] = resultCache[expressions[i]]
# continue
# end
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because Julia is 1-based but PTX needs 0-based indexing
linker = CuLink()
add_data!(linker, "ExpressionProcessing", kernel)
image = complete(linker)
mod = CuModule(image)
kernels[i] = CuFunction(mod, "ExpressionProcessing")
transpilerCache[expressions[i]] = kernels[i]
# try
linker = CuLink()
add_data!(linker, "ExpressionProcessing", kernel)
image = complete(linker)
mod = CuModule(image)
kernels[i] = CuFunction(mod, "ExpressionProcessing")
# resultCache[expressions[i]] = kernels[i]
# catch
# dump(expressions[i]; maxdepth=10)
# println()
# println()
# println(kernel)
# println()
# println()
# error(current_exceptions())
# end
end
cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
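
For reference, the linking steps used in this loop (CuLink, add_data!, complete, CuModule, CuFunction) are CUDA.jl's driver-level API for JIT-compiling PTX text. Below is a minimal sketch of that flow in isolation; the PTX string is hand-written for illustration and is not output of this repository's transpiler, and a CUDA-capable device is assumed:

using CUDA

# Hand-written minimal PTX module with one empty kernel (illustrative only).
ptx = """
.version 6.0
.target sm_52
.address_size 64

.visible .entry Empty()
{
    ret;
}
"""

linker = CuLink()
add_data!(linker, "Empty", ptx)    # hand the PTX source to the JIT linker
image = complete(linker)           # finish linking into a loadable image
cumod = CuModule(image)            # load the image as a CUDA module
func = CuFunction(cumod, "Empty")  # look up the kernel by its .entry name
cudacall(func, Tuple{}; threads = 1, blocks = 1)   # launch once to confirm it loads
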
@@ -203,7 +215,12 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
for token in expression
if token.Type == FLOAT32
push!(operands, reinterpret(Float32, token.Value))
value = reinterpret(Float32, token.Value)
if isfinite(value)
push!(operands, value)
else
push!(operands, "0f" * string(token.Value, base = 16)) # otherwise, values like "Inf" would be written as "Inf" and therefore not understandable to the PTX compiler
end
elseif token.Type == OPERATOR
operator = reinterpret(Operator, token.Value)
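
The hex-literal branch above relies on PTX's "0f" form for single-precision constants (the prefix 0f followed by the eight hex digits of the Float32 bit pattern). A short sketch of what it produces for a non-finite value:

# Sketch: encoding a non-finite Float32 as a PTX hex float literal.
bits = reinterpret(UInt32, Inf32)        # 0x7f800000
lit  = "0f" * string(bits, base = 16)    # "0f7f800000" -- accepted by PTX, unlike "Inf"
# Non-finite Float32 values always have all exponent bits set, so string(bits, base = 16)
# always yields the full eight hex digits that the 0f literal form expects.
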