benchmarking: started tuning the benchmark suite; found some errors that need fixing

Daniel
2025-05-09 19:19:53 +02:00
parent 327e4ebf1b
commit 7121329a17
5 changed files with 39 additions and 33 deletions


@@ -1,9 +1,13 @@
using LinearAlgebra
using BenchmarkTools
using DelimitedFiles
using GZip
using .Transpiler
using .Interpreter
include("parser.jl") # to parse expressions from a file
const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
# Number of expressions can get really big (into millions)
@@ -11,6 +15,7 @@ const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
X = convert(Matrix{Float32}, data)
X_t = permutedims(X) # for gpu
exprs = Expr[]
parameters = Vector{Vector{Float32}}()
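
As a side note on the "# for gpu" transpose above: Julia stores matrices column-major, so after permutedims each data point's variables sit in one contiguous column, which is the access pattern the later TODO about column-major memory refers to. A minimal illustration on a toy matrix (the names below are placeholders, not part of the project):

X_toy = Float32[1 2; 3 4; 5 6]   # 3 data points x 2 variables
X_toy_t = permutedims(X_toy)     # 2 x 3: column j now holds all variables of data point j
@assert X_toy_t[:, 1] == X_toy[1, :]
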
@@ -19,24 +24,15 @@ paramnames = ["p$i" for i in 1:20]
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
    i = 0
    for line in eachline(io)
        expr, p = parse_infix(line, varnames, paramnames)
        if i > 10
            return
        end
        println(expr)
        push!(exprs, expr)
        push!(parameters, randn(Float32, length(p)))
        i += 1
    end
end
expr_reps = 100 # 100 parameter optimisation steps (local search; run sequentially; only p changes, X stays fixed)
# TODO: Tips for tuning:
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
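
A minimal sketch of the shared-memory tip from the linked docs, assuming CUDA.jl 3.x or newer (where the @cuStaticSharedMem macro from the v2.6 docs became CuStaticSharedArray); the kernel and data below are placeholders, not part of this benchmark:

using CUDA

function staged_square!(out, x)
    tile = CuStaticSharedArray(Float32, 256)              # one slot per thread in the block
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(x)
        tile[threadIdx().x] = x[i]                        # one global-memory read per element
    end
    sync_threads()
    if i <= length(x)
        out[i] = tile[threadIdx().x]^2                    # later reads hit fast on-chip memory
    end
    return nothing
end

x_d = CUDA.rand(Float32, 1024)
out_d = CUDA.zeros(Float32, 1024)
@cuda threads=256 blocks=cld(length(x_d), 256) staged_square!(out_d, x_d)
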
@@ -60,18 +56,20 @@ suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
if compareWithCPU
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
end
# TODO: Most likely need to transpose X matrix here, as we are expecting a column major matrix for more efficient memory access
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
for i in 1:10
for i in 1:2
    tune!(suite)
end
BenchmarkTools.save("params.json", params(suite))
throw("finished tuning")
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
results = run(suite, verbose=true, seconds=3600) # 1 hour because of the CPU; let's see if more is needed
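
For reference, a stand-alone sketch of the tune-once / load-later pattern the block above works toward (the benchmark body is a placeholder; only the params.json file name is taken from the diff):

using BenchmarkTools

suite = BenchmarkGroup()
suite["demo"] = @benchmarkable sum(rand(1000))

if !isfile("params.json")
    tune!(suite)                                           # expensive: determines evals/samples per benchmark
    BenchmarkTools.save("params.json", params(suite))
else
    loadparams!(suite, BenchmarkTools.load("params.json")[1], :evals, :samples)
end

results = run(suite, verbose=true)
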