benchmarking: started tuning benchmarking results. found some errors that need fixing
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
327e4ebf1b
commit
7121329a17
|
@ -58,21 +58,34 @@ end
|
||||||
|
|
||||||
|
|
||||||
# Evaluate Expressions on the CPU
|
# Evaluate Expressions on the CPU
|
||||||
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
|
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
|
||||||
@assert axes(exprs) == axes(p)
|
@assert axes(exprs) == axes(p)
|
||||||
nrows = size(X, 1)
|
nrows = size(X, 1)
|
||||||
|
|
||||||
# each column of the matrix has the result for an expr
|
# each column of the matrix has the result for an expr
|
||||||
res = Matrix{Float32}(undef, nrows, length(exprs))
|
res = Matrix{Float32}(undef, nrows, length(exprs))
|
||||||
|
|
||||||
for i in eachindex(exprs)
|
if parallel
|
||||||
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
Threads.@threads for i in eachindex(exprs)
|
||||||
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
||||||
|
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
||||||
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
|
||||||
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
||||||
for rep in 1:repetitions
|
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
||||||
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
for rep in 1:repetitions
|
||||||
|
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
||||||
|
end
|
||||||
|
end
|
||||||
|
else
|
||||||
|
for i in eachindex(exprs)
|
||||||
|
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
||||||
|
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
||||||
|
|
||||||
|
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
||||||
|
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
||||||
|
for rep in 1:repetitions
|
||||||
|
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
||||||
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -100,7 +100,7 @@ function get_operator(op::Symbol)::Operator
|
||||||
elseif op == :sqrt
|
elseif op == :sqrt
|
||||||
return SQRT
|
return SQRT
|
||||||
else
|
else
|
||||||
throw("Operator unknown")
|
throw("Operator unknown. Operator was $op")
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -65,11 +65,6 @@ function test_cpu_interpreter_nikuradse()
|
||||||
i = 0
|
i = 0
|
||||||
for line in eachline(io)
|
for line in eachline(io)
|
||||||
expr, p = parse_infix(line, varnames, paramnames)
|
expr, p = parse_infix(line, varnames, paramnames)
|
||||||
|
|
||||||
if i > 10
|
|
||||||
return
|
|
||||||
end
|
|
||||||
println(expr)
|
|
||||||
|
|
||||||
push!(exprs, expr)
|
push!(exprs, expr)
|
||||||
push!(parameters, randn(Float32, length(p)))
|
push!(parameters, randn(Float32, length(p)))
|
||||||
|
|
|
@ -1,9 +1,13 @@
|
||||||
using LinearAlgebra
|
using LinearAlgebra
|
||||||
using BenchmarkTools
|
using BenchmarkTools
|
||||||
|
using DelimitedFiles
|
||||||
|
using GZip
|
||||||
|
|
||||||
using .Transpiler
|
using .Transpiler
|
||||||
using .Interpreter
|
using .Interpreter
|
||||||
|
|
||||||
|
include("parser.jl") # to parse expressions from a file
|
||||||
|
|
||||||
const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
|
const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
|
||||||
|
|
||||||
# Number of expressions can get really big (into millions)
|
# Number of expressions can get really big (into millions)
|
||||||
|
@ -11,6 +15,7 @@ const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
|
||||||
|
|
||||||
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
|
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
|
||||||
X = convert(Matrix{Float32}, data)
|
X = convert(Matrix{Float32}, data)
|
||||||
|
X_t = permutedims(X) # for gpu
|
||||||
|
|
||||||
exprs = Expr[]
|
exprs = Expr[]
|
||||||
parameters = Vector{Vector{Float32}}()
|
parameters = Vector{Vector{Float32}}()
|
||||||
|
@ -19,24 +24,15 @@ paramnames = ["p$i" for i in 1:20]
|
||||||
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
|
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
|
||||||
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
|
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
|
||||||
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
|
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
|
||||||
i = 0
|
|
||||||
for line in eachline(io)
|
for line in eachline(io)
|
||||||
expr, p = parse_infix(line, varnames, paramnames)
|
expr, p = parse_infix(line, varnames, paramnames)
|
||||||
|
|
||||||
if i > 10
|
|
||||||
return
|
|
||||||
end
|
|
||||||
println(expr)
|
|
||||||
|
|
||||||
push!(exprs, expr)
|
push!(exprs, expr)
|
||||||
push!(parameters, randn(Float32, length(p)))
|
push!(parameters, randn(Float32, length(p)))
|
||||||
|
|
||||||
i += 1
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
|
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
|
||||||
|
|
||||||
|
|
||||||
# TODO: Tips for tuning:
|
# TODO: Tips for tuning:
|
||||||
# Put data in shared memory:
|
# Put data in shared memory:
|
||||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
||||||
|
@ -60,18 +56,20 @@ suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
|
||||||
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
|
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
|
||||||
|
|
||||||
if compareWithCPU
|
if compareWithCPU
|
||||||
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
|
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
|
||||||
|
suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
|
||||||
end
|
end
|
||||||
|
|
||||||
# TODO: Most likely need to transpose X matrix here, as we are expecting a column major matrix for more efficient memory access
|
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||||
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
|
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||||
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
|
|
||||||
|
|
||||||
for i in 1:10
|
for i in 1:2
|
||||||
tune!(suite)
|
tune!(suite)
|
||||||
end
|
end
|
||||||
BenchmarkTools.save("params.json", params(suite))
|
BenchmarkTools.save("params.json", params(suite))
|
||||||
|
|
||||||
|
throw("finished tuning")
|
||||||
|
|
||||||
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
|
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
|
||||||
|
|
||||||
results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
|
results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
|
||||||
|
|
|
@ -10,9 +10,9 @@ include(joinpath(baseFolder, "src", "Interpreter.jl"))
|
||||||
include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
||||||
|
|
||||||
@testset "Functionality tests" begin
|
@testset "Functionality tests" begin
|
||||||
include("ExpressionProcessingTests.jl")
|
# include("ExpressionProcessingTests.jl")
|
||||||
include("InterpreterTests.jl")
|
# include("InterpreterTests.jl")
|
||||||
include("TranspilerTests.jl")
|
# include("TranspilerTests.jl")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
||||||
|
@ -22,5 +22,5 @@ end
|
||||||
|
|
||||||
@testset "Performance tests" begin
|
@testset "Performance tests" begin
|
||||||
# include("PerformanceTuning.jl")
|
# include("PerformanceTuning.jl")
|
||||||
# include("PerformanceTests.jl")
|
include("PerformanceTests.jl")
|
||||||
end
|
end
|
Loading…
Reference in New Issue
Block a user