using .Transpiler
using .Interpreter

# University setup at 10.20.1.7 if needed

@testset "CPU performance" begin
    # Benchmark `interpret_cpu` on four fixed expressions over a random
    # `nrows`×10 Float32 matrix and print the measured throughput next to the
    # figure reported by `peakflops`. With `parallel = true` the repetitions are
    # spawned as tasks instead of being run in a sequential loop.
    #
    # Always returns `true`, so the `@test`s below can only fail if the
    # benchmark itself throws.
    function test_cpu_interpreter(nrows; parallel = false)
        # The CPU interpreter requires anonymous functions and array refs, so
        # the bodies index into `x` and `p` and are wrapped in `(x, p) -> body`.
        bodies = [
            :(p[1] * x[1] + p[2]),                                 # 5 op
            :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]),             # 9 op
            :(log(abs(x[1]))),                                     # 3 op
            :(powabs(p[2] - powabs(p[1] + x[1], 1 / x[1]), p[3])), # 13 op
        ]
        ops_per_pass = 30 # 5 + 9 + 3 + 13, the op counts annotated above
        exprs = [Expr(:->, :(x, p), body) for body in bodies]

        X = randn(Float32, nrows, 10)
        # 10 random parameter values for each expression
        p = [randn(Float32, 10) for _ in eachindex(exprs)]

        # Warmup call, kept outside the timed region.
        interpret_cpu(exprs, X, p)

        expr_reps = 100 # repetitions passed to the interpreter, for each expr
        reps = 100      # number of timed `interpret_cpu` calls

        t_sec = if parallel
            @elapsed begin
                tasks = [Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for _ in 1:reps]
                fetch.(tasks)
            end
        else
            @elapsed for _ in 1:reps
                interpret_cpu(exprs, X, p; repetitions=expr_reps)
            end
        end

        achieved = round(ops_per_pass * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)
        peak = round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)

        if parallel
            println("~ $(achieved) GFLOPS ($(Threads.nthreads()) threads) ($(peak) GFLOPS (peak, single-core))")
        else
            println("~ $(achieved) GFLOPS (single-core) ($(peak) GFLOPS (peak, single-core))")
        end

        return true
    end

    # Only use a single BLAS thread for peakflops.
    LinearAlgebra.BLAS.set_num_threads(1)

    @test test_cpu_interpreter(1000)
    @test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    @test test_cpu_interpreter(10000)
    @test test_cpu_interpreter(10000, parallel=true)
end

@testset "Interpreter Performance" begin
    # No tests yet. Ideas noted for performance work:
    #
    # Put data in shared memory:
    #   https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
    # Make array const:
    #   https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
    # Memory management like in C++ might help with performance improvements:
    #   https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

@testset "Transpiler Performance" begin
    # No tests yet. Ideas noted for performance work:
    #
    # Put data in shared memory:
    #   https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
    # Make array const:
    #   https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
    # Memory management like in C++ might help with performance improvements:
    #   https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end