# Performance benchmarks comparing the CPU interpreter, the GPU interpreter and the
# GPU transpiler on the same small set of expressions, for three data-set sizes.
#
# NOTE(review): `@testset` needs `Test`, and `interpret_cpu`, `interpret_gpu` and
# `evaluate_gpu` are not defined here -- presumably they are provided by the including
# file and by the `.Interpreter` / `.Transpiler` modules; confirm.
using LinearAlgebra
using BenchmarkTools

using .Transpiler
using .Interpreter

const BENCHMARKS_RESULTS_PATH = "./results"
# University setup at 10.20.1.7 if needed

exprsCPU = [
    # CPU interpreter requires an anonymous function and array refs
    :(p[1] * x[1] + p[2]), # 5 op
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
    :(log(abs(x[1]))), # 3 op
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
] # 30 op
# Wrap every expression body into an `(x, p) -> body` anonymous-function expression.
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)

exprsGPU = [
    # Same expressions as `exprsCPU`, written with flat symbols (x1, p1, ...) instead
    # of array refs and without the anonymous-function wrapper
    :(p1 * x1 + p2), # 5 op
    :((((x1 + x2) + x3) + x4) + x5), # 9 op
    :(log(abs(x1))), # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in eachindex(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # 100 parameter optimisation steps basically

@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

    # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)
end

@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

# true  -> benchmark CPU and GPU, compare them with each other and save the results
# false -> benchmark GPU only and compare against previously saved results
compareWithCPU = true

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

# Number of variable sets per data-set size
varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

# All values a benchmark depends on are interpolated with `$`, so the benchmarked call
# receives them as arguments instead of looking up untyped, non-const globals.
if compareWithCPU
    # CPU data is laid out as (varsets, 5); the 5 matches x[1]..x[5] in the expressions
    X_small = randn(Float32, varsets_small, 5)
    X_medium = randn(Float32, varsets_medium, 5)
    X_large = randn(Float32, varsets_large, 5)

    for (label, X) in (
        "small varset" => X_small,
        "medium varset" => X_medium,
        "large varset" => X_large,
    )
        suite["CPU"][label] = @benchmarkable interpret_cpu(
            $exprsCPU, $X, $p; repetitions=$expr_reps
        )
    end
end

# GPU data is laid out as (5, varsets), i.e. transposed relative to the CPU data
X_small_GPU = randn(Float32, 5, varsets_small)
X_medium_GPU = randn(Float32, 5, varsets_medium)
X_large_GPU = randn(Float32, 5, varsets_large)

for (label, X) in (
    "small varset" => X_small_GPU,
    "medium varset" => X_medium_GPU,
    "large varset" => X_large_GPU,
)
    suite["GPUI"][label] = @benchmarkable interpret_gpu(
        $exprsGPU, $X, $p; repetitions=$expr_reps
    )
    suite["GPUT"][label] = @benchmarkable evaluate_gpu(
        $exprsGPU, $X, $p; repetitions=$expr_reps
    )
end

# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# Uncomment the next two lines to re-tune the suite and regenerate params.json:
# tune!(suite)
# BenchmarkTools.save("params.json", params(suite))

# Reuse the previously tuned benchmark parameters instead of tuning on every run.
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=180)

# GPU statistics are needed by both comparison modes below.
medianInterpreter = median(results["GPUI"])
stdInterpreter = std(results["GPUI"])

medianTranspiler = median(results["GPUT"])
stdTranspiler = std(results["GPUT"])

if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

    # Make sure the target directory exists so a long benchmark run is not lost
    # to a failing save.
    mkpath(BENCHMARKS_RESULTS_PATH)
    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
else
    # Baseline: results saved by an earlier run
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/256_blocksize.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is interpreter better than old?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old) # is interpreter better than old?
    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is transpiler better than old?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old) # is transpiler better than old?

    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end