Improvements / fixes.
This commit is contained in:
parent
942adb8612
commit
f4f39ec47c
|
@ -29,7 +29,7 @@ end
|
||||||
|
|
||||||
|
|
||||||
# Evaluate Expressions on the CPU
|
# Evaluate Expressions on the CPU
|
||||||
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
|
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
|
||||||
@assert axes(exprs) == axes(p)
|
@assert axes(exprs) == axes(p)
|
||||||
nrows = size(X, 1)
|
nrows = size(X, 1)
|
||||||
|
|
||||||
|
@ -37,7 +37,14 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
|
||||||
res = Matrix{Float32}(undef, nrows, length(exprs))
|
res = Matrix{Float32}(undef, nrows, length(exprs))
|
||||||
|
|
||||||
for i in eachindex(exprs)
|
for i in eachindex(exprs)
|
||||||
CpuInterpreter.interpret!((@view res[:,i]), exprs[i], X, p[i])
|
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
||||||
|
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
||||||
|
|
||||||
|
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
||||||
|
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
||||||
|
for rep in 1:repetitions
|
||||||
|
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
res
|
res
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
using LinearAlgebra
|
||||||
|
|
||||||
function test_cpu_interpreter(nrows; parallel = false)
|
function test_cpu_interpreter(nrows; parallel = false)
|
||||||
exprs = [
|
exprs = [
|
||||||
|
@ -13,12 +14,15 @@ function test_cpu_interpreter(nrows; parallel = false)
|
||||||
|
|
||||||
# warmup
|
# warmup
|
||||||
interpret_cpu(exprs, X, p)
|
interpret_cpu(exprs, X, p)
|
||||||
|
expr_reps = 100 # for each expr
|
||||||
|
reps= 100
|
||||||
|
|
||||||
if parallel
|
if parallel
|
||||||
t_sec = @elapsed Threads.@threads :static for i in 1:100 interpret_cpu(exprs, X, p) end
|
t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||||
println("~ $(round(30 * 100 * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1, parallel=false) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||||
else
|
else
|
||||||
t_sec = @elapsed for i in 1:100 interpret_cpu(exprs, X, p) end
|
t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
|
||||||
println("~ $(round(30 * 100 * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1, parallel=false) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||||
end
|
end
|
||||||
true
|
true
|
||||||
end
|
end
|
||||||
|
|
Loading…
Reference in New Issue
Block a user