benchmarking: started tuning the benchmarks; found some errors that need fixing
parent 327e4ebf1b
commit 7121329a17
@@ -58,13 +58,25 @@ end

# Evaluate Expressions on the CPU
-function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    nrows = size(X, 1)

    # each column of the matrix has the result for an expr
    res = Matrix{Float32}(undef, nrows, length(exprs))

+    if parallel
+        Threads.@threads for i in eachindex(exprs)
+            # The interpreter holds the postfix code and buffers for evaluation. It is costly to create
+            interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
+
+            # If an expression has to be evaluated multiple times (e.g. for different parameters),
+            # it is worthwhile to reuse the interpreter to reduce the number of allocations
+            for rep in 1:repetitions
+                CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
+            end
+        end
+    else
    for i in eachindex(exprs)
        # The interpreter holds the postfix code and buffers for evaluation. It is costly to create
        interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))

@@ -75,6 +87,7 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
        CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
    end
+    end
    end

    res
end
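The new parallel flag above relies on Threads.@threads, so Julia must be started with more than one thread for it to have any effect. A minimal usage sketch (the file name, thread count, and the equality check are illustrative, not part of the commit):

# Hedged sketch: exercising the new parallel code path of interpret_cpu.
# Launch with multiple threads, e.g.:  julia --threads=8 run_benchmarks.jl
# exprs, X and parameters are built as in the benchmark script below.
res_serial   = interpret_cpu(exprs, X, parameters; repetitions=100)
res_parallel = interpret_cpu(exprs, X, parameters; repetitions=100, parallel=true)
@assert res_serial == res_parallel   # each column is computed independently, so scheduling does not change the values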
@@ -100,7 +100,7 @@ function get_operator(op::Symbol)::Operator
    elseif op == :sqrt
        return SQRT
    else
-        throw("Operator unknown")
+        throw("Operator unknown. Operator was $op")
    end
end

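An aside on the error path above (a sketch, not something the commit does): throwing a raw String works in Julia, but wrapping the message in error() raises an ErrorException, which downstream code can catch as an Exception:

# Sketch only: error() throws an ErrorException instead of a raw String.
function lookup(table::Dict{Symbol,Int}, op::Symbol)
    haskey(table, op) || error("Operator unknown. Operator was $op")
    return table[op]
end

lookup(Dict(:sqrt => 1), :sin)   # ERROR: Operator unknown. Operator was sin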
@@ -66,11 +66,6 @@ function test_cpu_interpreter_nikuradse()
    for line in eachline(io)
        expr, p = parse_infix(line, varnames, paramnames)

-        if i > 10
-            return
-        end
-        println(expr)
-
        push!(exprs, expr)
        push!(parameters, randn(Float32, length(p)))

@@ -1,9 +1,13 @@
using LinearAlgebra
using BenchmarkTools
using DelimitedFiles
using GZip

using .Transpiler
using .Interpreter

include("parser.jl") # to parse expressions from a file

const BENCHMARKS_RESULTS_PATH = "./results-fh-new"

# Number of expressions can get really big (into millions)
@@ -11,6 +15,7 @@ const BENCHMARKS_RESULTS_PATH = "./results-fh-new"

data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
X = convert(Matrix{Float32}, data)
+X_t = permutedims(X) # for gpu

exprs = Expr[]
parameters = Vector{Vector{Float32}}()
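A note on the added X_t line (the rationale is inferred from the transpose TODO further down, so treat it as an assumption): Julia stores matrices column-major, and permutedims materializes a new matrix, so each data point becomes a contiguous column before the data is handed to the GPU code, whereas transpose(X) would only create a lazy wrapper without changing the memory layout. A small self-contained check:

# Sketch: permutedims copies into a new column-major matrix, transpose(X) is a lazy view.
using LinearAlgebra

X   = rand(Float32, 4, 3)      # 4 data points x 3 variables (shapes are illustrative)
X_t = permutedims(X)           # 3 x 4, freshly allocated storage
@assert X_t isa Matrix{Float32}
@assert transpose(X) isa LinearAlgebra.Transpose   # no copy, just an index-swapping wrapper
@assert X_t == transpose(X)                        # same values either way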
@@ -19,24 +24,15 @@ paramnames = ["p$i" for i in 1:20]
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
-    i = 0
    for line in eachline(io)
        expr, p = parse_infix(line, varnames, paramnames)
-
-        if i > 10
-            return
-        end
-        println(expr)
-
        push!(exprs, expr)
        push!(parameters, randn(Float32, length(p)))
-
-        i += 1
    end
end
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)


# TODO: Tips for tuning:
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
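Regarding the shared-memory TODO above: a hedged CUDA.jl sketch of the idea, staging one column of the transposed data in shared memory before it is read repeatedly. The kernel name, sizes, and the column-sum workload are made up for illustration; CuStaticSharedArray is the current CUDA.jl spelling, while the linked v2.6 docs use the older @cuStaticSharedMem macro.

# Hedged sketch of the shared-memory idea from the TODO (not part of the commit).
using CUDA

function colsum_shared!(out, X_t)
    nvars = size(X_t, 1)
    col   = blockIdx().x                       # one block per data point (column of X_t)
    tid   = threadIdx().x
    tile  = CuStaticSharedArray(Float32, 32)   # assumes nvars <= 32

    if tid <= nvars
        @inbounds tile[tid] = X_t[tid, col]    # stage the column once in fast shared memory
    end
    sync_threads()

    if tid == 1                                # every thread could now reuse `tile`;
        s = 0.0f0                              # here thread 1 just sums it as a demo
        for j in 1:nvars
            s += tile[j]
        end
        @inbounds out[col] = s
    end
    return nothing
end

d_X   = CuArray(rand(Float32, 8, 1024))        # 8 variables x 1024 data points
d_out = CUDA.zeros(Float32, size(d_X, 2))
@cuda threads=32 blocks=size(d_X, 2) colsum_shared!(d_out, d_X)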
@@ -60,18 +56,20 @@ suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

if compareWithCPU
-    suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
+    suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
+    suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
end

-# TODO: Most likely need to transpose X matrix here, as we are expecting a column major matrix for more efficient memory access
-suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
-suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
+suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)

-for i in 1:10
+for i in 1:2
    tune!(suite)
end
BenchmarkTools.save("params.json", params(suite))

+throw("finished tuning")

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. let's see if more is needed
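The block above follows the usual BenchmarkTools two-phase pattern: tune the suite once, persist the evaluation parameters, and abort via the throw("finished tuning") line, so that later runs reload the tuned parameters with loadparams! instead of re-tuning. A self-contained sketch of the same flow (the suite contents are illustrative, not taken from the repository):

# Sketch of the tune-once / reuse-parameters pattern (BenchmarkTools.jl).
using BenchmarkTools

suite = BenchmarkGroup()
suite["sum"] = @benchmarkable sum($(rand(Float32, 1000)))

# Phase 1: tune evaluation counts once and persist them.
tune!(suite)
BenchmarkTools.save("params.json", params(suite))

# Phase 2 (later runs): reload the tuned parameters instead of re-tuning.
loadparams!(suite, BenchmarkTools.load("params.json")[1], :evals, :samples)
results = run(suite, verbose=true)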
@@ -10,9 +10,9 @@ include(joinpath(baseFolder, "src", "Interpreter.jl"))
include(joinpath(baseFolder, "src", "Transpiler.jl"))

@testset "Functionality tests" begin
-    include("ExpressionProcessingTests.jl")
-    include("InterpreterTests.jl")
-    include("TranspilerTests.jl")
+    # include("ExpressionProcessingTests.jl")
+    # include("InterpreterTests.jl")
+    # include("TranspilerTests.jl")
end

@@ -22,5 +22,5 @@ end

@testset "Performance tests" begin
    # include("PerformanceTuning.jl")
-    # include("PerformanceTests.jl")
+    include("PerformanceTests.jl")
end