using LinearAlgebra
using BenchmarkTools
using Test # needed for @testset below; was missing from the original import block
using .Transpiler
using .Interpreter

const BENCHMARKS_RESULTS_PATH = "./results-fh"

# TODO: Expressions can get much much bigger (into millions) (will be provided by Mr. Kronberger)
# TODO: Variable-Sets: 1000 can be considered the minimum; 100.000 can be considered the maximum (will be provided by Mr. Kronberger)

# CPU interpreter requires an anonymous function and array refs
exprsCPU = [
    :(p[1] * x[1] + p[2]),                                  # 5 ops
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]),              # 9 ops
    :(log(abs(x[1]))),                                      # 3 ops
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]), p[3]))     # 13 ops
] # 30 ops
# Wrap each expression in an anonymous function (x, p) -> expr for the CPU interpreter.
exprsCPU = map(e -> Expr(:->, :(x, p), e), exprsCPU)

# GPU interpreter/transpiler take bare expressions with flat variable/parameter
# names (x1, p1, ...) instead of array refs — TODO confirm against the callee.
exprsGPU = [
    :(p1 * x1 + p2),                                        # 5 ops
    :((((x1 + x2) + x3) + x4) + x5),                        # 9 ops
    :(log(abs(x1))),                                        # 3 ops
    :(powabs(p2 - powabs(p1 + x1, 1/x1), p3))               # 13 ops
] # 30 ops

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)

@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])
    # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)
end

@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

# After these tests have been redone, use Nsight Compute/Systems as described here:
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
# Systems and Compute installable via WSL. Compute UI can even be used inside wsl
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re do the tests and perform them on FH PCs)
# University setup at 10.20.1.7 if needed

compareWithCPU = true

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

# NOTE: all benchmarked values are $-interpolated into @benchmarkable so the
# measurement does not include non-const-global lookup/dispatch overhead
# (see the BenchmarkTools manual on interpolating global values).
if compareWithCPU
    X_small = randn(Float32, varsets_small, 5)
    suite["CPU"]["small varset"] = @benchmarkable interpret_cpu($exprsCPU, $X_small, $p; repetitions=$expr_reps)
    X_medium = randn(Float32, varsets_medium, 5)
    suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu($exprsCPU, $X_medium, $p; repetitions=$expr_reps)
    X_large = randn(Float32, varsets_large, 5)
    suite["CPU"]["large varset"] = @benchmarkable interpret_cpu($exprsCPU, $X_large, $p; repetitions=$expr_reps)
end

X_small_GPU = randn(Float32, 5, varsets_small) # column-major
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu($exprsGPU, $X_small_GPU, $p; repetitions=$expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu($exprsGPU, $X_small_GPU, $p; repetitions=$expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu($exprsGPU, $X_medium_GPU, $p; repetitions=$expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu($exprsGPU, $X_medium_GPU, $p; repetitions=$expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large) # column-major
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu($exprsGPU, $X_large_GPU, $p; repetitions=$expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu($exprsGPU, $X_large_GPU, $p; repetitions=$expr_reps)

# To (re-)tune the benchmark parameters once and persist them, run:
#   tune!(suite)
#   BenchmarkTools.save("params.json", params(suite))
# Reuse the persisted tuning so every run measures with identical settings.
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=180)

if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU)    # is interpreter better than cpu?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU)     # is transpiler better than cpu?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU)             # is interpreter better than cpu?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU)              # is transpiler better than cpu?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter)     # is transpiler better than interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

    # Ensure the results directory exists before saving (save does not create it).
    mkpath(BENCHMARKS_RESULTS_PATH)
    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
else
    # Compare the fresh GPU results against a previously saved baseline run.
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is interpreter better than old?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old)          # is interpreter better than old?

    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old)  # is transpiler better than old?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old)           # is transpiler better than old?

    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end