benchmarking: updated transpiler to drastically reduce the number of transpilations at the expense of memory usage

2025-05-19 11:39:49 +02:00
parent 33e7edd4c8
commit f33551e25f
4 changed files with 48 additions and 69 deletions
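In short: instead of transpiling every expression on each evaluate call, each expression is now transpiled and compiled to a CuFunction exactly once, before the parameter-optimisation loop, and only the cached kernels are launched inside it. A minimal sketch of the resulting calling pattern, using the names from this diff (postfixExprs stands in for the already pre-processed expressions):

# once, up front: postfix expression -> PTX -> CuFunction, one kernel per expression
compiledKernels = map(enumerate(postfixExprs)) do (i, ex)
    ptx = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, "evaluate_gpu")
    Transpiler.CompileKernel(ptx, "evaluate_gpu")
end

# many times, inside the local-search loop: launch only, no transpilation
for step in 1:repetitions
    results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p)
end

Because p changes in small steps between iterations, the iterations must run sequentially; that is why only the kernel launches remain inside the loop.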

View File

@@ -49,19 +49,26 @@ end
 # Convert Expressions to PTX Code and execute that instead
 function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
     @assert axes(expressions) == axes(p)
-    variableCols = size(X, 2)
-    variableRows = size(X, 1)
+    numVariableSets = size(X, 2) # nr. of columns of X
+    variableSetSize = size(X, 1) # nr. of rows of X
     variables = CuArray(X)

-    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
+    largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into a matrix; this will be the nr. of rows of the parameter matrix
+    compiledKernels = Vector{CuFunction}(undef, length(expressions))
+    kernelName = "evaluate_gpu"
     @inbounds Threads.@threads for i in eachindex(expressions)
-        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+        ex = ExpressionProcessing.expr_to_postfix(expressions[i])
+        ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
+        compiledKernels[i] = Transpiler.CompileKernel(ptxKernel, kernelName)
     end

-    results = Matrix{Float32}(undef, variableCols, length(exprs))
+    results = Matrix{Float32}(undef, numVariableSets, length(expressions))
     for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
-        results = Transpiler.evaluate(exprs, variables, variableCols, variableRows, p)
+        # results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p)
+        results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p)
     end

     return results
@@ -103,7 +110,6 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
     res
 end

-
 # Flow
 # input: Vector expr == expressions contains e.g. 4 expressions
 # Matrix X == |expr| columns, n rows. n == number of variables x1..xn; n is the same for all expressions --- WRONG
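From the caller's point of view nothing changes; the repetitions keyword still simulates the parameter-optimisation loop. A hypothetical call (the expression and variable naming here are assumptions about what the frontend accepts):

exprs = [:(x1 + p1 * x2), :(x1 * x2 - p1)]   # hypothetical expressions
X = rand(Float32, 2, 1000)                   # 2 variables per set, 1000 variable sets (columns)
p = [rand(Float32, 1), rand(Float32, 1)]     # one parameter vector per expression
results = evaluate_gpu(exprs, X, p; repetitions=100)   # 1000×2 Matrix{Float32}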

View File

@@ -14,37 +14,6 @@ const Operand = Union{Float32, String} # Operand is either fixed value or register
 "
 function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVars::CuArray{Float32}, variableColumns::Integer, variableRows::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
-    # TODO: test this again with multiple threads. The first time I tried, I was using only one thread
-    # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
-    # Threads.@threads for i in eachindex(expressions)
-    #     cacheLock = ReentrantLock()
-    #     cacheHit = false
-    #     lock(cacheLock) do
-    #         if haskey(transpilerCache, expressions[i])
-    #             kernels[i] = transpilerCache[expressions[i]]
-    #             cacheHit = true
-    #         end
-    #     end
-    #     if cacheHit
-    #         continue
-    #     end
-    #     formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-    #     kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableColumns, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
-    #     linker = CuLink()
-    #     add_data!(linker, "ExpressionProcessing", kernel)
-    #     image = complete(linker)
-    #     mod = CuModule(image)
-    #     kernels[i] = CuFunction(mod, "ExpressionProcessing")
-    #     @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
-    # end
     cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

     # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
@@ -54,33 +23,44 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVar
     blocks = cld(variableColumns, threads)

     kernelName = "evaluate_gpu"
-    # TODO: Implement batching as a middleground between "transpile everything, then run" and "transpile one, run one", even though cudacall is async
     @inbounds Threads.@threads for i in eachindex(expressions)
-        # if haskey(resultCache, expressions[i])
-        #     kernels[i] = resultCache[expressions[i]]
-        #     continue
-        # end
-        # formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
         kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because Julia is 1-based but PTX needs 0-based indexing
+        compiledKernel = CompileKernel(kernel, kernelName)
-        linker = CuLink()
-        add_data!(linker, kernelName, kernel)
-        image = complete(linker)
-        mod = CuModule(image)
-        compiledKernel = CuFunction(mod, kernelName)

         cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
     end

-    # for kernel in kernels
-    #     cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
-    # end
+    return cudaResults
+end
+
+"
+A simplified version of the evaluate function. It takes a list of already compiled kernels to execute. This should yield better performance where the same expressions are evaluated multiple times, i.e. for parameter optimisation.
+"
+function evaluate(kernels::Vector{CuFunction}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
+
+    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
+    cudaResults = CuArray{Float32}(undef, nrOfVariableSets, length(kernels))
+
+    threads = min(nrOfVariableSets, 256)
+    blocks = cld(nrOfVariableSets, threads)
+
+    @inbounds Threads.@threads for i in eachindex(kernels)
+        cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+    end
     return cudaResults
 end

+function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction
+    linker = CuLink()
+    add_data!(linker, kernelName, ptxKernel)
+    image = complete(linker)
+    mod = CuModule(image)
+
+    return CuFunction(mod, kernelName)
+end
+
 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 # seekstart(buf1); write(buf2, buf1)
 "

View File

@@ -59,13 +59,10 @@ Results only for Interpreter (also contains final kernel configuration and proba
 \subsection{Performance Tuning}
 Document the process of performance tuning

-Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in the kernel)
-1.) Blocksize reduced to a maximum of 256 -> moderate improvement for the medium and large var sets
-2.) Using @inbounds -> noticeable improvement in 2 out of 3
-3.) Tuned blocksize with NSight Compute -> slight improvement
-4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably much more waiting time, i.e. latency hiding no longer working, or more type conversions happening on the GPU? Look at the generated PTX code and use that as an argument to describe why it is slower)
-5.) Reverted previous; used fastmath instead -> improvement (the large var set is now faster than on the transpiler)
+Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded
+1.) Done before parameter optimisation loop: frontend, transmitting Exprs and Variables (improved runtime)

 \subsection{Transpiler}
 Results only for Transpiler (also contains final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
@@ -75,13 +72,9 @@ Results only for Transpiler (also contains final kernel configuration and probab
 \subsection{Performance Tuning}
 Document the process of performance tuning

-Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled
-1.) Blocksize reduced to a maximum of 256 -> moderate improvement for the medium and large var sets
-2.) Using @inbounds -> small improvement, only in CPU-side code
-3.) Tuned blocksize with NSight Compute -> slight improvement
-4.) Only changed things on the interpreter side
-5.) Only changed things on the interpreter side
+Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
+1.) Done before parameter optimisation loop: frontend, transmitting Exprs and Variables (improved runtime)

 \subsection{Comparison}
 Comparison of Interpreter and Transpiler, as well as comparing the two with the CPU interpreter
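For reference, the "256 blocksize" named in both Initial states is the launch configuration visible in the diff above, with one GPU thread per variable set; the arithmetic, with illustrative numbers:

nrOfVariableSets = 1000
threads = min(nrOfVariableSets, 256)     # 256 threads per block (fewer if there is less work)
blocks = cld(nrOfVariableSets, threads)  # cld(1000, 256) == 4 blocks, the last one partially idle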

Binary file not shown.