From f4f39ec47c170bbc29bc13ee4c2d24df979cf687 Mon Sep 17 00:00:00 2001 From: Gabriel Kronberger Date: Wed, 19 Feb 2025 17:18:43 +0100 Subject: [PATCH] Improvements / fixes. --- package/src/ExpressionExecutorCuda.jl | 11 +++++++++-- package/test/CpuInterpreterTests.jl | 12 ++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index 3296e7c..2192dcf 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -29,7 +29,7 @@ end # Evaluate Expressions on the CPU -function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32} +function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32} @assert axes(exprs) == axes(p) nrows = size(X, 1) @@ -37,7 +37,14 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector res = Matrix{Float32}(undef, nrows, length(exprs)) for i in eachindex(exprs) - CpuInterpreter.interpret!((@view res[:,i]), exprs[i], X, p[i]) + # The interpreter holds the postfix code and buffers for evaluation. It is costly to create, so it is built once per expression. + interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) + + # If an expression has to be evaluated multiple times (e.g. 
for different parameters), + # it is worthwhile to reuse the interpreter to reduce the number of allocations + for rep in 1:repetitions + CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i]) + end end res diff --git a/package/test/CpuInterpreterTests.jl b/package/test/CpuInterpreterTests.jl index 591ef21..356a4a6 100644 --- a/package/test/CpuInterpreterTests.jl +++ b/package/test/CpuInterpreterTests.jl @@ -1,3 +1,4 @@ +using LinearAlgebra function test_cpu_interpreter(nrows; parallel = false) exprs = [ @@ -13,12 +14,15 @@ function test_cpu_interpreter(nrows; parallel = false) # warmup interpret_cpu(exprs, X, p) + expr_reps = 100 # for each expr + reps= 100 + if parallel - t_sec = @elapsed Threads.@threads :static for i in 1:100 interpret_cpu(exprs, X, p) end - println("~ $(round(30 * 100 * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1, parallel=false) / 1e9, digits=2)) GFLOPS (peak, single-core))") + t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps]) + println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") else - t_sec = @elapsed for i in 1:100 interpret_cpu(exprs, X, p) end - println("~ $(round(30 * 100 * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1, parallel=false) / 1e9, digits=2)) GFLOPS (peak, single-core))") + t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end + println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") end true end