From 690ee33db1df3c7f63f326b08d47cea0d72cf07f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 29 Mar 2025 12:01:06 +0100 Subject: [PATCH] benchmarks: started preparing benchmarks --- package/test/PerformanceTests.jl | 103 +++++++++++++++++++++---------- package/test/Project.toml | 2 + package/test/params.json | 1 + package/test/runtests.jl | 4 +- 4 files changed, 76 insertions(+), 34 deletions(-) create mode 100644 package/test/params.json diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl index 42b0b69..76cf2b0 100644 --- a/package/test/PerformanceTests.jl +++ b/package/test/PerformanceTests.jl @@ -1,45 +1,50 @@ +using LinearAlgebra +using BenchmarkTools +using BenchmarkPlots, StatsPlots + using .Transpiler using .Interpreter # University setup at 10.20.1.7 if needed +exprsCPU = [ + # CPU interpreter requires an anonymous function and array refs + :(p[1] * x[1] + p[2]), # 5 op + :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op + :(log(abs(x[1]))), # 3 op + :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op +] # 30 op +exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU) +exprsGPU = [ + # GPU variants of the same expressions: plain symbols (x1, p1, ...) instead of array refs, not wrapped in anonymous functions + :(p1 * x1 + p2), # 5 op + :((((x1 + x2) + x3) + x4) + x5), # 9 op + :(log(abs(x1))), # 3 op + :(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op +] # 30 op + +# p is the same for CPU and GPU +p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr +nrows = 1000 +X = randn(Float32, nrows, 5) + +expr_reps = 100 # 100 parameter optimisation steps basically @testset "CPU performance" begin - function test_cpu_interpreter(nrows; parallel = false) - exprs = [ - # CPU interpreter requires an anonymous function and array ref s - :(p[1] * x[1] + p[2]), # 5 op - :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op - :(log(abs(x[1]))), # 3 op - :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op - ] # 30 op - exprs = map(e -> Expr(:->, 
:(x,p), e), exprs) - X = randn(Float32, nrows, 10) - p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr - - # warmup - interpret_cpu(exprs, X, p) - expr_reps = 100 # for each expr - reps= 100 + # warmup + # interpret_cpu(exprsCPU, X, p) - if parallel - t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps]) - println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") - else - t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end - println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") - end - true - end + # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation + # @btime test_cpu_interpreter(1000) + # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps]) -LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops - -@test test_cpu_interpreter(1000) -@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads -@test test_cpu_interpreter(10000) -@test test_cpu_interpreter(10000, parallel=true) + # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads + # @btime test_cpu_interpreter(10000) + # @btime test_cpu_interpreter(10000, parallel=true) end +ncols = 1000 +X_GPU = randn(Float32, 5, ncols) @testset "Interpreter Performance" begin # Put data in shared memory: # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory @@ -60,4 +65,38 @@ end # Memory management like in C++ might help with performance improvements # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management -end 
\ No newline at end of file
+end
+
+# Benchmark suite: each entry times interpret_cpu over all CPU expressions for one data size.
+# Globals are interpolated with `$` so untyped-global lookups are not part of the measurement.
+suite = BenchmarkGroup()
+suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
+# suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
+# suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
+
+X_small = randn(Float32, 100, 5)
+suite["CPU"]["small varset"] = @benchmarkable interpret_cpu($exprsCPU, $X_small, $p; repetitions=$expr_reps)
+X_normal = randn(Float32, 1000, 5)
+suite["CPU"]["normal varset"] = @benchmarkable interpret_cpu($exprsCPU, $X_normal, $p; repetitions=$expr_reps)
+X_large = randn(Float32, 10000, 5)
+suite["CPU"]["large varset"] = @benchmarkable interpret_cpu($exprsCPU, $X_large, $p; repetitions=$expr_reps)
+
+# Benchmark parameters are cached in params.json next to this file; to regenerate, uncomment the next two lines.
+# tune!(suite)
+# BenchmarkTools.save(joinpath(@__DIR__, "params.json"), params(suite))
+loadparams!(suite, BenchmarkTools.load(joinpath(@__DIR__, "params.json"))[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+
+results = run(suite, verbose=true, seconds=180)
+# results2 = run(suite, verbose=true, seconds=180)
+
+medianCPU = median(results["CPU"])
+# medianCPU2 = median(results2["CPU"])
+# medianInterpreter = median(results["GPUI"])
+# medianTranspiler = median(results["GPUT"])
+
+# jud = judge(medianCPU, medianCPU2; time_tolerance=0.001)
+# println(jud)
+
+# judge(medianCPU, medianInterpreter; time_tolerance=0.001)
+# judge(medianCPU, medianTranspiler; time_tolerance=0.001)
+# judge(medianInterpreter, medianTranspiler; time_tolerance=0.001)
diff --git a/package/test/Project.toml b/package/test/Project.toml
index ec911b4..78d233d 100644
--- a/package/test/Project.toml
+++ b/package/test/Project.toml
@@ -1,6 +1,8 @@
 [deps]
+BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git 
a/package/test/params.json b/package/test/params.json new file mode 100644 index 0000000..b1c95dc --- /dev/null +++ b/package/test/params.json @@ -0,0 +1 @@ +[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"normal varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]] \ No newline at end of file diff --git a/package/test/runtests.jl b/package/test/runtests.jl index 9c1f232..e2b4ccb 100644 --- a/package/test/runtests.jl +++ b/package/test/runtests.jl @@ -10,7 +10,7 @@ include(joinpath(baseFolder, "src", "Transpiler.jl")) @testset "Functionality tests" begin # include("ExpressionProcessingTests.jl") # include("InterpreterTests.jl") - include("TranspilerTests.jl") + # include("TranspilerTests.jl") end @@ -19,5 +19,5 @@ end # end @testset "Performance tests" begin - # include("PerformanceTests.jl") + include("PerformanceTests.jl") end \ No newline at end of file