benchmarking: started tuning the benchmarks; found some errors that still need fixing
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
Daniel 2025-05-09 19:19:53 +02:00
parent 327e4ebf1b
commit 7121329a17
5 changed files with 39 additions and 33 deletions

View File

@ -58,21 +58,34 @@ end
# Evaluate Expressions on the CPU
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
@assert axes(exprs) == axes(p)
nrows = size(X, 1)
# each column of the matrix has the result for an expr
res = Matrix{Float32}(undef, nrows, length(exprs))
for i in eachindex(exprs)
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
# If an expression has to be evaluated multiple times (e.g. for different parameters),
# it is worthwhile to reuse the interpreter to reduce the number of allocations
for rep in 1:repetitions
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
if parallel
Threads.@threads for i in eachindex(exprs)
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
# If an expression has to be evaluated multiple times (e.g. for different parameters),
# it is worthwhile to reuse the interpreter to reduce the number of allocations
for rep in 1:repetitions
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
end
end
else
for i in eachindex(exprs)
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
# If an expression has to be evaluated multiple times (e.g. for different parameters),
# it is worthwhile to reuse the interpreter to reduce the number of allocations
for rep in 1:repetitions
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
end
end
end

View File

@ -100,7 +100,7 @@ function get_operator(op::Symbol)::Operator
elseif op == :sqrt
return SQRT
else
throw("Operator unknown")
throw("Operator unknown. Operator was $op")
end
end

View File

@ -65,11 +65,6 @@ function test_cpu_interpreter_nikuradse()
i = 0
for line in eachline(io)
expr, p = parse_infix(line, varnames, paramnames)
if i > 10
return
end
println(expr)
push!(exprs, expr)
push!(parameters, randn(Float32, length(p)))

View File

@ -1,9 +1,13 @@
using LinearAlgebra
using BenchmarkTools
using DelimitedFiles
using GZip
using .Transpiler
using .Interpreter
include("parser.jl") # to parse expressions from a file
const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
# Number of expressions can get really big (into millions)
@ -11,6 +15,7 @@ const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
X = convert(Matrix{Float32}, data)
X_t = permutedims(X) # for gpu
exprs = Expr[]
parameters = Vector{Vector{Float32}}()
@ -19,24 +24,15 @@ paramnames = ["p$i" for i in 1:20]
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
i = 0
for line in eachline(io)
expr, p = parse_infix(line, varnames, paramnames)
if i > 10
return
end
println(expr)
push!(exprs, expr)
push!(parameters, randn(Float32, length(p)))
i += 1
end
end
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
# TODO: Tips for tuning:
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
@ -60,18 +56,20 @@ suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
if compareWithCPU
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
end
# TODO: Most likely need to transpose X matrix here, as we are expecting a column major matrix for more efficient memory access
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
for i in 1:10
for i in 1:2
tune!(suite)
end
BenchmarkTools.save("params.json", params(suite))
throw("finished tuning")
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed

View File

@ -10,9 +10,9 @@ include(joinpath(baseFolder, "src", "Interpreter.jl"))
include(joinpath(baseFolder, "src", "Transpiler.jl"))
@testset "Functionality tests" begin
include("ExpressionProcessingTests.jl")
include("InterpreterTests.jl")
include("TranspilerTests.jl")
# include("ExpressionProcessingTests.jl")
# include("InterpreterTests.jl")
# include("TranspilerTests.jl")
end
@ -22,5 +22,5 @@ end
@testset "Performance tests" begin
# include("PerformanceTuning.jl")
# include("PerformanceTests.jl")
include("PerformanceTests.jl")
end