benchmarks: started preparing benchmarks
@@ -1,45 +1,50 @@
using LinearAlgebra
using BenchmarkTools
using BenchmarkPlots, StatsPlots

using .Transpiler
using .Interpreter

# University setup at 10.20.1.7 if needed
exprsCPU = [
	# The CPU interpreter requires an anonymous function and array refs
	:(p[1] * x[1] + p[2]), # 5 op
	:((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
	:(log(abs(x[1]))), # 3 op
	:(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]), p[3])) # 13 op
] # 30 op
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)
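A minimal sketch of what this wrapping produces, assuming top-level eval (the name f below is illustrative, not from the commit): Expr(:->, :(x,p), e) turns each quoted expression into a two-argument anonymous function of (x, p).

f = eval(Expr(:->, :(x,p), :(p[1] * x[1] + p[2])))   # same wrapping as the map above
f(Float32[2.0], Float32[3.0, 1.0])                   # 3.0f0 * 2.0f0 + 1.0f0 == 7.0f0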

exprsGPU = [
	# The GPU evaluators use plain symbols (x1, p1, ...) instead of array refs
	:(p1 * x1 + p2), # 5 op
	:((((x1 + x2) + x3) + x4) + x5), # 9 op
	:(log(abs(x1))), # 3 op
	:(powabs(p2 - powabs(p1 + x1, 1/x1), p3)) # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
nrows = 1000
X = randn(Float32, nrows, 5)

expr_reps = 100 # roughly 100 parameter-optimisation steps
@testset "CPU performance" begin
	function test_cpu_interpreter(nrows; parallel = false)
		exprs = [
			# The CPU interpreter requires an anonymous function and array refs
			:(p[1] * x[1] + p[2]), # 5 op
			:((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
			:(log(abs(x[1]))), # 3 op
			:(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]), p[3])) # 13 op
		] # 30 op
		exprs = map(e -> Expr(:->, :(x,p), e), exprs)
		X = randn(Float32, nrows, 10)
		p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr

		# warmup
		interpret_cpu(exprs, X, p)
		expr_reps = 100 # for each expr
		reps = 100
	# warmup
	# interpret_cpu(exprsCPU, X, p)

		if parallel
			t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
			println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
		else
			t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
			println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
		end
		true
	end
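	# A rough reading of the GFLOPS estimate above, assuming the ~30 ops counted in
	# the expression comments: each call evaluates the four expressions on `nrows`
	# rows `expr_reps` times, and the timed loop repeats that `reps` times, so with
	# the defaults 30 * 100 * 100 * 1000 = 3.0e8 FLOP (about 0.3 GFLOP) per timed
	# run, which divided by `t_sec` yields the reported GFLOPS.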
	# @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
	# @btime test_cpu_interpreter(1000)
	# @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
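# Note on the baseline, assuming the stock LinearAlgebra API: peakflops(n; eltype,
# ntrials) times a dense n-by-n gemm, so with BLAS pinned to one thread above it
# serves as a rough single-core peak reference next to the interpreter GFLOPS
# printed by test_cpu_interpreter.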

@test test_cpu_interpreter(1000)
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
@test test_cpu_interpreter(10000)
@test test_cpu_interpreter(10000, parallel=true)
	# test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
	# @btime test_cpu_interpreter(10000)
	# @btime test_cpu_interpreter(10000, parallel=true)

end

ncols = 1000
X_GPU = randn(Float32, 5, ncols)
@testset "Interpreter Performance" begin
	# Put data in shared memory:
	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
@@ -60,4 +65,38 @@ end

	# Memory management like in C++ might help with performance improvements
	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end
end
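As a rough illustration of the shared-memory note in the testset above, a CUDA.jl kernel could stage the variable matrix into on-chip shared memory before evaluating expressions. This is a minimal sketch under assumptions, not code from the commit: the kernel name, block size of 256, and the toy computation are invented here, and CuStaticSharedArray assumes a current CUDA.jl release (the linked v2.6 docs describe an older shared-memory macro).

using CUDA

function staged_eval_kernel(X, out)           # hypothetical kernel, for illustration only
	tid = threadIdx().x
	buf = CuStaticSharedArray(Float32, 256)   # statically sized per-block buffer
	if tid <= size(X, 2)
		buf[tid] = X[1, tid]                  # stage the first variable row into shared memory
	end
	sync_threads()
	if tid <= length(out)
		out[tid] = buf[tid] * 2.0f0           # placeholder computation on the staged values
	end
	return nothing
end

X_d = CUDA.randn(Float32, 5, 256)             # same 5 x ncols layout as X_GPU above
out_d = CUDA.zeros(Float32, 256)
@cuda threads=256 staged_eval_kernel(X_d, out_d)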


suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
# suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
# suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

X_small = randn(Float32, 100, 5)
suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
X_normal = randn(Float32, 1000, 5)
suite["CPU"]["normal varset"] = @benchmarkable interpret_cpu(exprsCPU, X_normal, p; repetitions=expr_reps)
X_large = randn(Float32, 10000, 5)
suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)

# tune!(suite)

# BenchmarkTools.save("params.json", params(suite))
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
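# Workflow note (an inference from this commit, hedged): tune! and BenchmarkTools.save
# appear to have been run once to produce package/test/params.json (added below), and
# loadparams! now restores those sample/eval settings so repeated benchmark runs use
# identical parameters instead of re-tuning every time.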

results = run(suite, verbose=true, seconds=180)
# results2 = run(suite, verbose=true, seconds=180)

medianCPU = median(results["CPU"])
# medianCPU2 = median(results2["CPU"])
# medianInterpreter = median(results["GPUI"])
# medianTranspiler = median(results["GPUT"])

# jud = judge(medianCPU, medianCPU2; time_tolerance=0.001)
# println(jud)

# judge(medianCPU, medianInterpreter; time_tolerance=0.001)
# judge(medianCPU, medianTranspiler; time_tolerance=0.001)
# judge(medianInterpreter, medianTranspiler; time_tolerance=0.001)
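# Hedged note on the commented comparisons: judge(target, baseline) classifies the
# target's median time as :improvement, :regression or :invariant relative to the
# baseline within the given time_tolerance, so enabling the GPUI/GPUT groups above
# turns these three lines into the CPU-vs-interpreter-vs-transpiler comparison.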

@@ -1,6 +1,8 @@
[deps]
BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

package/test/params.json (1 line) Normal file
@@ -0,0 +1 @@
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"normal varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]

@@ -10,7 +10,7 @@ include(joinpath(baseFolder, "src", "Transpiler.jl"))
@testset "Functionality tests" begin
	# include("ExpressionProcessingTests.jl")
	# include("InterpreterTests.jl")
	include("TranspilerTests.jl")
	# include("TranspilerTests.jl")
end

@@ -19,5 +19,5 @@ end
# end

@testset "Performance tests" begin
	# include("PerformanceTests.jl")
	include("PerformanceTests.jl")
end