using LinearAlgebra
using BenchmarkTools

using .Transpiler
using .Interpreter

const BENCHMARKS_RESULTS_PATH = "./results-fh"
exprsCPU = [
    # the CPU interpreter requires an anonymous function and array refs
    :(p[1] * x[1] + p[2]), # 5 op
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
    :(log(abs(x[1]))), # 3 op
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]), p[3])) # 13 op
] # 30 op
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)
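# After this mapping each entry of exprsCPU is an anonymous-function expression;
# the first one, for example, is equivalent to :((x, p) -> p[1] * x[1] + p[2]).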
exprsGPU = [
    # the GPU evaluators use plain scalar names (x1, p1, ...) instead of array refs
    :(p1 * x1 + p2), # 5 op
    :((((x1 + x2) + x3) + x4) + x5), # 9 op
    :(log(abs(x1))), # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1), p3)) # 13 op
] # 30 op
# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expression
expr_reps = 100 # simulates 100 parameter optimisation steps

@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation steps
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

    # test_cpu_interpreter(1000, parallel=true) # start Julia with `julia -t 6` for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)
end
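# A minimal sketch of the multi-threaded CPU variant hinted at in the commented-out
# lines above, assuming `interpret_cpu` keeps the signature used throughout this file;
# `threaded_cpu_demo` and its defaults are illustrative assumptions, and the function
# is only defined here, not called.
function threaded_cpu_demo(exprs, X, p; repetitions=expr_reps, tasks=Threads.nthreads())
    # spawn one full evaluation per task and collect the results
    return fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=repetitions) for _ in 1:tasks])
end
# Example call (illustrative): threaded_cpu_demo(exprsCPU, randn(Float32, 100, 5), p)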
@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end
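# Hedged sketch of the two ideas noted in the testset above (staging data in shared
# memory and marking device inputs read-only), loosely following the linked docs.
# The kernel below is illustrative only and not part of this interpreter; the name
# `shared_eval_demo_kernel!` and the placeholder computation are assumptions. Newer
# CUDA.jl versions provide `CuStaticSharedArray`; the linked v2.6 docs use
# `@cuStaticSharedMem` instead.
using CUDA

function shared_eval_demo_kernel!(out, X)
    col = blockIdx().x                      # one block per data point (one column of X)
    tid = threadIdx().x
    vars = CuStaticSharedArray(Float32, 5)  # stage the 5 variables of this point in shared memory
    Xc = CUDA.Const(X)                      # read-only view of the input array
    if tid <= 5
        @inbounds vars[tid] = Xc[tid, col]  # cooperative load into shared memory
    end
    sync_threads()
    if tid == 1
        @inbounds out[col] = vars[1] + vars[2]  # placeholder computation on the staged values
    end
    return nothing
end
# Example launch (illustrative; varsets_small is defined further below):
# @cuda threads=32 blocks=varsets_small shared_eval_demo_kernel!(CUDA.zeros(Float32, varsets_small), CUDA.randn(Float32, 5, varsets_small))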
@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end
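# Hedged sketch of the C++-style memory management idea noted above: allocate device
# buffers once, reuse them across parameter-optimisation steps, and free them eagerly
# instead of waiting for the GC. `manual_memory_demo` and its sizes are illustrative
# assumptions, not part of the transpiler; the function is only defined here, not called.
function manual_memory_demo(n_points::Int, steps::Int)
    X_host  = randn(Float32, 5, n_points)
    X_dev   = CuArray{Float32}(undef, 5, n_points)  # allocated once, reused every step
    out_dev = CuArray{Float32}(undef, n_points)
    for _ in 1:steps
        copyto!(X_dev, X_host)                      # reuse the same device buffer
        # ... kernel launches writing into out_dev would go here ...
    end
    result = Array(out_dev)                         # copy back before freeing
    CUDA.unsafe_free!(X_dev)                        # release device memory eagerly
    CUDA.unsafe_free!(out_dev)
    return result
end
# Example call (illustrative): manual_memory_demo(varsets_small, expr_reps)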
# After these tests have been redone, use Nsight Compute/Systems as described here:
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
# Nsight Systems and Nsight Compute are installable via WSL; the Compute UI can even be used inside WSL.
# Add /usr/local/cuda/bin to PATH in .bashrc to get access to ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs).
# University setup at 10.20.1.7 if needed.
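# Hedged notes on the invocation (output name and script path are placeholders, not
# this file's actual location):
#   nsys profile --output=gpu_benchmarks julia --project=. <this benchmark script>
#   ncu --mode=launch julia --project=. <this benchmark script>
# Depending on the CUDA.jl version, wrapping the region of interest in CUDA.@profile
# (or CUDA.@profile external=true on newer versions) marks it for these external profilers.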
compareWithCPU = true
suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

if compareWithCPU
    X_small = randn(Float32, varsets_small, 5)
    suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)

    X_medium = randn(Float32, varsets_medium, 5)
    suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)

    X_large = randn(Float32, varsets_large, 5)
    suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end
X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
# tune!(suite)
# BenchmarkTools.save("params.json", params(suite))

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
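# Typical BenchmarkTools workflow for the two commented lines above: run `tune!(suite)`
# once, persist the tuned evaluation/sample parameters with
# `BenchmarkTools.save("params.json", params(suite))`, and from then on reload them via
# `loadparams!` so every run uses identical settings and results stay comparable.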
results = run(suite, verbose=true, seconds=180)
if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is the interpreter better than the CPU implementation?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is the transpiler better than the CPU implementation?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is the transpiler better than the interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is the interpreter better than the CPU implementation?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is the transpiler better than the CPU implementation?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is the transpiler better than the interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json", results)
else
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is the interpreter better than the old run?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old) # is the interpreter better than the old run?

    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is the transpiler better than the old run?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old) # is the transpiler better than the old run?

    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end