using LinearAlgebra
using BenchmarkTools
using DelimitedFiles
using GZip
using CUDA

using .Transpiler
using .Interpreter
using .ExpressionProcessing

include("parser.jl") # to parse expressions from a file

const BENCHMARKS_RESULTS_PATH = "./results-fh-new"

# Number of expressions can get really big (into the millions)
# Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum

data, varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
X = convert(Matrix{Float32}, data)
X_t = permutedims(X) # transposed copy for the GPU implementations

exprs = Expr[]
parameters = Vector{Vector{Float32}}()
varnames = ["x$i" for i in 1:10]   # names used by parse_infix below (overrides the CSV header names)
paramnames = ["p$i" for i in 1:20]
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
    for line in eachline(io)
        expr, p = parse_infix(line, varnames, paramnames)

        push!(exprs, expr)
        push!(parameters, randn(Float32, length(p)))
    end
end
expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, not X)

# TODO: Tips for tuning:

# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
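# Hedged sketch of the shared-memory idea above (not used by the benchmarks in this file).
# The kernel and its arguments are illustrative only, not this project's interpreter/transpiler
# kernels; on the older CUDA.jl that the v2.6 docs refer to, CuStaticSharedArray is the
# @cuStaticSharedMem macro instead.
function shared_memory_sketch(data, out)
    tile = CuStaticSharedArray(Float32, 256)        # per-block staging buffer in shared memory
    tid = threadIdx().x
    gid = (blockIdx().x - 1) * blockDim().x + tid
    if gid <= length(data)
        tile[tid] = data[gid]                       # coalesced load from global into shared memory
    end
    sync_threads()                                  # tile is now visible to the whole block
    if gid <= length(data)
        out[gid] = tile[tid] * tile[tid]            # placeholder computation on the staged values
    end
    return nothing
end
# launch example: @cuda threads=256 blocks=cld(size(X_t, 2), 256) shared_memory_sketch(d_data, d_out)
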
# Make array const:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
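# Hedged sketch of the Const idea above: wrapping a device array in CUDA.Const inside a kernel
# marks it as read-only, so loads can go through the read-only cache. Illustrative kernel only;
# not part of this project's code.
function const_array_sketch(data, out)
    cdata = CUDA.Const(data)                        # read-only view of the input array
    gid = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if gid <= length(out)
        out[gid] = 2.0f0 * cdata[gid]               # loads from cdata may use the read-only path
    end
    return nothing
end
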
# Manual memory management like in C++ might help improve performance
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
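# Hedged sketch of the manual-memory-management idea above: allocate device buffers once,
# reuse them, and free them eagerly with CUDA.unsafe_free! instead of waiting for the GC.
# Names and sizes are illustrative only.
function preallocation_sketch(X_host::Matrix{Float32})
    d_X = CuArray(X_host)                           # upload the data matrix once
    d_out = CUDA.zeros(Float32, size(X_host, 2))    # reusable result buffer (one value per column)
    # ... launch kernels that read d_X and write into d_out here ...
    result = Array(d_out)                           # copy results back only at the end
    CUDA.unsafe_free!(d_out)                        # release device memory eagerly
    CUDA.unsafe_free!(d_X)
    return result
end
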
# Profiling:
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
# Nsight Systems and Nsight Compute are installable via WSL; the Compute UI can even be used inside WSL
# Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (run the tests on the FH PCs)
# University setup at 10.20.1.7 and 10.20.1.13
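# Hedged sketch of a profiling workflow for the links above (the script name is a placeholder):
#   nsys profile --trace=cuda julia --project=. benchmarks.jl        # timeline view in Nsight Systems
#   ncu --set full --launch-count 1 julia --project=. benchmarks.jl  # per-kernel metrics in Nsight Compute
# Individual sections can also be wrapped in CUDA.@profile to limit what the profilers capture.
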
compareWithCPU = false

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

if compareWithCPU
    suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
    suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
end

# cacheInterpreter = Dict{Expr, PostfixType}()
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)

# cacheTranspilerFront = Dict{Expr, PostfixType}()
# cacheTranspilerRes = Dict{Expr, CuFunction}()
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)

tune!(suite)
BenchmarkTools.save("params.json", params(suite))

error("finished tuning") # abort here after (re-)tuning; remove this line to run the benchmarks below

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=3600) # 1 hour time limit because of the CPU benchmarks; let's see if more is needed

if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is the interpreter better than the CPU?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is the transpiler better than the CPU?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is the transpiler better than the interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is the interpreter better than the CPU?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is the transpiler better than the CPU?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is the transpiler better than the interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/0_initial.json", results)
else
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is the interpreter better than the old version?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old) # is the interpreter better than the old version?

    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is the transpiler better than the old version?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old) # is the transpiler better than the old version?

    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end