From 7121329a1743a1332cc62477fd19bf74b1f30c7d Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 9 May 2025 19:19:53 +0200 Subject: [PATCH] benchmarking: started tuning benchmarking results. found some errors that need fixing --- package/src/ExpressionExecutorCuda.jl | 31 +++++++++++++++++++-------- package/src/ExpressionProcessing.jl | 2 +- package/test/CpuInterpreterTests.jl | 5 ----- package/test/PerformanceTests.jl | 26 +++++++++++----------- package/test/runtests.jl | 8 +++---- 5 files changed, 39 insertions(+), 33 deletions(-) diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index ac4cf64..ad8eef1 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -58,21 +58,34 @@ end # Evaluate Expressions on the CPU -function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32} +function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32} @assert axes(exprs) == axes(p) nrows = size(X, 1) # each column of the matrix has the result for an expr res = Matrix{Float32}(undef, nrows, length(exprs)) - for i in eachindex(exprs) - # The interpreter holds the postfix code and buffers for evaluation. It is costly to create - interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) - - # If an expression has to be evaluated multiple times (e.g. for different parameters), - # it is worthwhile to reuse the interpreter to reduce the number of allocations - for rep in 1:repetitions - CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i]) + if parallel + Threads.@threads for i in eachindex(exprs) + # The interpreter holds the postfix code and buffers for evaluation. It is costly to create + interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) + + # If an expression has to be evaluated multiple times (e.g. for different parameters), + # it is worthwhile to reuse the interpreter to reduce the number of allocations + for rep in 1:repetitions + CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i]) + end + end + else + for i in eachindex(exprs) + # The interpreter holds the postfix code and buffers for evaluation. It is costly to create + interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) + + # If an expression has to be evaluated multiple times (e.g. for different parameters), + # it is worthwhile to reuse the interpreter to reduce the number of allocations + for rep in 1:repetitions + CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i]) + end end end diff --git a/package/src/ExpressionProcessing.jl b/package/src/ExpressionProcessing.jl index c06d7b2..7bc05d6 100644 --- a/package/src/ExpressionProcessing.jl +++ b/package/src/ExpressionProcessing.jl @@ -100,7 +100,7 @@ function get_operator(op::Symbol)::Operator elseif op == :sqrt return SQRT else - throw("Operator unknown") + throw("Operator unknown. Operator was $op") end end diff --git a/package/test/CpuInterpreterTests.jl b/package/test/CpuInterpreterTests.jl index a5948c3..1e76b1b 100644 --- a/package/test/CpuInterpreterTests.jl +++ b/package/test/CpuInterpreterTests.jl @@ -65,11 +65,6 @@ function test_cpu_interpreter_nikuradse() i = 0 for line in eachline(io) expr, p = parse_infix(line, varnames, paramnames) - - if i > 10 - return - end - println(expr) push!(exprs, expr) push!(parameters, randn(Float32, length(p))) diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl index d51d276..1cc332b 100644 --- a/package/test/PerformanceTests.jl +++ b/package/test/PerformanceTests.jl @@ -1,9 +1,13 @@ using LinearAlgebra using BenchmarkTools +using DelimitedFiles +using GZip using .Transpiler using .Interpreter +include("parser.jl") # to parse expressions from a file + const BENCHMARKS_RESULTS_PATH = "./results-fh-new" # Number of expressions can get really big (into millions) @@ -11,6 +15,7 @@ const BENCHMARKS_RESULTS_PATH = "./results-fh-new" data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true); X = convert(Matrix{Float32}, data) +X_t = permutedims(X) # for gpu exprs = Expr[] parameters = Vector{Vector{Float32}}() @@ -19,24 +24,15 @@ paramnames = ["p$i" for i in 1:20] # data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs # data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exrps GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io - i = 0 for line in eachline(io) expr, p = parse_infix(line, varnames, paramnames) - - if i > 10 - return - end - println(expr) push!(exprs, expr) push!(parameters, randn(Float32, length(p))) - - i += 1 end end expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X) - # TODO: Tipps for tuning: # Put data in shared memory: # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory @@ -60,18 +56,20 @@ suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"]) suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"]) if compareWithCPU - suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps) + suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps) + suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true) end -# TODO: Most likely need to transpose X matrix here, as we are expecting a column major matrix for more efficient memory access -suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps) -suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps) +suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps) +suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) -for i in 1:10 +for i in 1:2 tune!(suite) end BenchmarkTools.save("params.json", params(suite)) +throw("finished tuning") + loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance) results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed diff --git a/package/test/runtests.jl b/package/test/runtests.jl index 7df38c2..f769550 100644 --- a/package/test/runtests.jl +++ b/package/test/runtests.jl @@ -10,9 +10,9 @@ include(joinpath(baseFolder, "src", "Interpreter.jl")) include(joinpath(baseFolder, "src", "Transpiler.jl")) @testset "Functionality tests" begin - include("ExpressionProcessingTests.jl") - include("InterpreterTests.jl") - include("TranspilerTests.jl") + # include("ExpressionProcessingTests.jl") + # include("InterpreterTests.jl") + # include("TranspilerTests.jl") end @@ -22,5 +22,5 @@ end @testset "Performance tests" begin # include("PerformanceTuning.jl") - # include("PerformanceTests.jl") + include("PerformanceTests.jl") end \ No newline at end of file