Improvements / fixes.

2025-02-19 17:18:43 +01:00
parent 942adb8612
commit f4f39ec47c
2 changed files with 17 additions and 6 deletions
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@ -29,7 +29,7 @@ end


 # Evaluate Expressions on the CPU
-function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
 	@assert axes(exprs) == axes(p)
 	nrows = size(X, 1)
 	
@ -37,7 +37,14 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
 	res = Matrix{Float32}(undef, nrows, length(exprs))

 	for i in eachindex(exprs) 
-		CpuInterpreter.interpret!((@view res[:,i]), exprs[i], X, p[i])
+		# The interpreter holds the postfix code and the evaluation buffers. It is costly to create, so it is built once per expression.
+		interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) 
+
+		# If an expression has to be evaluated multiple times (e.g. for different parameters),
+		# reusing the interpreter is worthwhile because it reduces the number of allocations.
+		for rep in 1:repetitions
+			CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
+		end
 	end

 	res
--- a/package/test/CpuInterpreterTests.jl
+++ b/package/test/CpuInterpreterTests.jl
@ -1,3 +1,4 @@
+using LinearAlgebra

 function test_cpu_interpreter(nrows; parallel = false)
    exprs = [
@ -13,12 +14,15 @@ function test_cpu_interpreter(nrows; parallel = false)
    
    # warmup
    interpret_cpu(exprs, X, p)
+    expr_reps = 100 # for each expr
+    reps= 100
+
    if parallel 
-        t_sec = @elapsed Threads.@threads :static for i in 1:100 interpret_cpu(exprs, X, p) end
-        println("~ $(round(30 * 100 * nrows  / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1, parallel=false) / 1e9, digits=2)) GFLOPS (peak, single-core))")
+        t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
+        println("~ $(round(30 * reps * expr_reps * nrows  / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    else
-        t_sec = @elapsed for i in 1:100 interpret_cpu(exprs, X, p) end
-        println("~ $(round(30 * 100 * nrows  / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1, parallel=false) / 1e9, digits=2)) GFLOPS (peak, single-core))")
+        t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
+        println("~ $(round(30 * reps * expr_reps * nrows  / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    end
    true
 end