# Benchmark script: compares the CPU interpreter, the GPU interpreter, and the
# GPU transpiler back ends on a fixed set of symbolic expressions.
using LinearAlgebra
|
|
using BenchmarkTools
|
|
|
|
using .Transpiler
|
|
using .Interpreter
|
|
|
|
# Directory (relative to the working directory) where benchmark result JSON
# files are saved and where baseline results are loaded from for comparison.
const BENCHMARKS_RESULTS_PATH = "./results"

# University setup at 10.20.1.7 if needed
|
|
# Expressions benchmarked on the CPU. The CPU interpreter consumes anonymous
# functions of the form (x, p) -> expr with explicit array refs (x[i], p[i]);
# the wrapping happens below. Per-expression op counts are noted inline.
exprsCPU = [
    :(p[1] * x[1] + p[2]),                               # 5 op
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]),           # 9 op
    :(log(abs(x[1]))),                                   # 3 op
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3]))   # 13 op
] # 30 op

# Wrap each raw expression as an anonymous function (x, p) -> expr.
exprsCPU = [Expr(:->, :(x,p), body) for body in exprsCPU]
|
|
|
|
# GPU variants of the same expressions: variables and parameters appear as
# plain symbols (x1, p1, ...) instead of array refs, and no anonymous-function
# wrapping is needed. Per-expression op counts are noted inline.
exprsGPU = [
    :(p1 * x1 + p2),                            # 5 op
    :((((x1 + x2) + x3) + x4) + x5),            # 9 op
    :(log(abs(x1))),                            # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1),p3))    # 13 op
] # 30 op
|
|
|
|
# Parameter vectors are shared between the CPU and GPU benchmarks:
# one vector of 10 random Float32 parameters per expression.
p = [randn(Float32, 10) for _ in eachindex(exprsCPU)]

# Repetitions per benchmark evaluation — simulates ~100 parameter
# optimisation steps re-evaluating the same expressions.
expr_reps = 100
|
|
|
|
|
|
# Scratch test set: every CPU benchmark invocation here is kept as a
# commented-out one-off experiment. The measured CPU benchmarks are the
# ones registered in the BenchmarkGroup `suite` further down this file.
@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

    # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)
end
|
|
|
|
# Placeholder: optimisation ideas for the GPU interpreter, with links to the
# relevant CUDA.jl documentation. No measurements are performed here.
@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end
|
|
|
|
# Placeholder: optimisation ideas for the GPU transpiler, with links to the
# relevant CUDA.jl documentation. No measurements are performed here.
@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end
|
|
|
|
# true  -> benchmark the CPU interpreter alongside both GPU back ends;
# false -> skip the CPU runs and compare GPU results against a stored baseline.
compareWithCPU = true

# One sub-group per back end, tagged for filtering.
suite = BenchmarkGroup()
for (key, tag) in [("CPU", "CPUInterpreter"), ("GPUI", "GPUInterpreter"), ("GPUT", "GPUTranspiler")]
    suite[key] = BenchmarkGroup([tag])
end

# Number of variable sets (data points) per benchmark size class.
varsets_small  = 100
varsets_medium = 1000
varsets_large  = 10000
|
|
|
|
# Register the CPU benchmarks, one per input size class. Each X_* matrix is
# (varsets x 5): one row per variable set, one column per variable — note the
# GPU data below uses the transposed (5 x varsets) layout.
if compareWithCPU
    X_small = randn(Float32, varsets_small, 5)
    suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
    X_medium = randn(Float32, varsets_medium, 5)
    suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
    X_large = randn(Float32, varsets_large, 5)
    suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end
|
|
|
|
# Register the GPU benchmarks. Data is (5 x varsets) — one column per variable
# set, transposed relative to the CPU matrices above. Each size class gets one
# entry for the interpreter (GPUI) and one for the transpiler (GPUT).
X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# Manual smoke-test call, kept for reference:
# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
|
|
|
|
# Benchmark tuning parameters: reuse the cached "params.json" when available so
# that sample/eval counts stay identical across revisions and results remain
# comparable. Previously loadparams! ran unconditionally while the tune!/save
# lines that produce the cache were commented out, so a fresh checkout crashed
# on the missing file — now we tune and write the cache on the first run.
if isfile("params.json")
    loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
else
    tune!(suite)
    BenchmarkTools.save("params.json", params(suite))
end

# seconds=180 caps the total time budget per benchmark.
results = run(suite, verbose=true, seconds=180)
|
|
|
|
# Print one comparison section: a blank line, the heading, then the median and
# std judgements. Output format matches the surrounding script exactly.
function printcomparison(title, medianjudgement, stdjudgement)
    println()
    println(title)
    println(medianjudgement)
    println(stdjudgement)
end

if compareWithCPU
    # Summarise each back end's trial groups.
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    # judge(a, b) reports whether a improved or regressed relative to b.
    printcomparison("Is the interpreter better than the CPU implementation:",
        judge(medianInterpreter, medianCPU),
        judge(stdInterpreter, stdCPU))

    printcomparison("Is the transpiler better than the CPU implementation:",
        judge(medianTranspiler, medianCPU),
        judge(stdTranspiler, stdCPU))

    printcomparison("Is the transpiler better than the interpreter:",
        judge(medianTranspiler, medianInterpreter),
        judge(stdTranspiler, stdInterpreter))

    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
else
    # No CPU run: compare the current GPU results against the stored baseline.
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/256_blocksize.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    printcomparison("Is the interpreter better than the old implementation:",
        judge(medianInterpreter, medianGPUI_old),
        judge(stdInterpreter, stdGPUI_old))

    printcomparison("Is the transpiler better than the old implementation:",
        judge(medianTranspiler, medianGPUT_old),
        judge(stdTranspiler, stdGPUT_old))
end
|
|
|