benchmarking: started tuning benchmarking results. found some errors that need fixing

2025-05-09 19:19:53 +02:00
parent 327e4ebf1b
commit 7121329a17
5 changed files with 39 additions and 33 deletions
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@ -58,21 +58,34 @@ end
 # Evaluate Expressions on the CPU
-function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
 	@assert axes(exprs) == axes(p)
 	nrows = size(X, 1)
 	# each column of the matrix has the result for an expr
 	res = Matrix{Float32}(undef, nrows, length(exprs))
-	for i in eachindex(exprs) 
+	if parallel
-		# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
+		Threads.@threads for i in eachindex(exprs) 
-		interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) 
+			# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
-
+			interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) 
-		# If an expression has to be evaluated multiple times (e.g. for different parameters),
+	
-		# it is worthwhile to reuse the interpreter to reduce the number of allocations
+			# If an expression has to be evaluated multiple times (e.g. for different parameters),
-		for rep in 1:repetitions
+			# it is worthwhile to reuse the interpreter to reduce the number of allocations
-			CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
+			for rep in 1:repetitions
 				CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
 			end
 		end
 	else
 		for i in eachindex(exprs) 
 			# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
 			interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i])) 
 			# If an expression has to be evaluated multiple times (e.g. for different parameters),
 			# it is worthwhile to reuse the interpreter to reduce the number of allocations
 			for rep in 1:repetitions
 				CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
 			end
 		end
 	end
--- a/package/src/ExpressionProcessing.jl
+++ b/package/src/ExpressionProcessing.jl
@ -100,7 +100,7 @@ function get_operator(op::Symbol)::Operator
 	elseif op == :sqrt
 		return SQRT
 	else
-		throw("Operator unknown")
+		throw("Operator unknown. Operator was $op")
 	end
 end
--- a/package/test/CpuInterpreterTests.jl
+++ b/package/test/CpuInterpreterTests.jl
@ -65,11 +65,6 @@ function test_cpu_interpreter_nikuradse()
 		i = 0
        for line in eachline(io)
            expr, p = parse_infix(line, varnames, paramnames)
 			if i > 10
 				return
 			end
 			println(expr)
            push!(exprs, expr)
            push!(parameters, randn(Float32, length(p)))
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@ -1,9 +1,13 @@
 using LinearAlgebra
 using BenchmarkTools
 using DelimitedFiles
 using GZip
 using .Transpiler
 using .Interpreter
 include("parser.jl") # to parse expressions from a file
 const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
 # Number of expressions can get really big (into millions)
@ -11,6 +15,7 @@ const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
 data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
 X = convert(Matrix{Float32}, data)
 X_t = permutedims(X) # for gpu
 exprs = Expr[]
 parameters = Vector{Vector{Float32}}()
@ -19,24 +24,15 @@ paramnames = ["p$i" for i in 1:20]
 # data/esr_nvar2_len10.txt.gz_9.txt.gz has  ~250_000 exprs
 # data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
 GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io 
 	i = 0
 	for line in eachline(io)
 		expr, p = parse_infix(line, varnames, paramnames)
 		if i > 10
 			return
 		end
 		println(expr)
 		push!(exprs, expr)
 		push!(parameters, randn(Float32, length(p)))
 		i += 1
 	end
 end
 expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
 # TODO: Tips for tuning:
 	# Put data in shared memory: 
 	# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
@ -60,18 +56,20 @@ suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
 if compareWithCPU
-	suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprsCPU, X, parameters; repetitions=expr_reps)
+	suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
 	suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
 end
-# TODO: Most likely need to transpose X matrix here, as we are expecting a column major matrix for more efficient memory access
+suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
-suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
+suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
 suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprsGPU, X, parameters; repetitions=expr_reps)
-for i in 1:10
+for i in 1:2
 	tune!(suite)
 end
 BenchmarkTools.save("params.json", params(suite))
 throw("finished tuning")
 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
 results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
--- a/package/test/runtests.jl
+++ b/package/test/runtests.jl
@ -10,9 +10,9 @@ include(joinpath(baseFolder, "src", "Interpreter.jl"))
 include(joinpath(baseFolder, "src", "Transpiler.jl"))
@testset "Functionality tests" begin
-	include("ExpressionProcessingTests.jl")
+	# include("ExpressionProcessingTests.jl")
-	include("InterpreterTests.jl")
+	# include("InterpreterTests.jl")
-	include("TranspilerTests.jl")
+	# include("TranspilerTests.jl")
 end
@ -22,5 +22,5 @@ end
@testset "Performance tests" begin
 	# include("PerformanceTuning.jl")
-	# include("PerformanceTests.jl")
+	include("PerformanceTests.jl")
 end