implementation: finished pre-processing section; updated code
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
2025-04-26 13:46:23 +02:00
parent ad2eab2e0a
commit e571fa5bd6
10 changed files with 238 additions and 46 deletions

View File

@@ -26,8 +26,9 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
ncols = size(X, 2)
results = Matrix{Float32}(undef, ncols, length(exprs))
# TODO: create CuArray for variables here already, as they never change
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
results = Interpreter.interpret(exprs, X, p)
end
@@ -40,8 +41,9 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
ncols = size(X, 2)
results = Matrix{Float32}(undef, ncols, length(exprs))
# TODO: create CuArray for variables here already, as they never change
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
results = Transpiler.evaluate(exprs, X, p)
end

View File

@@ -9,6 +9,7 @@ export ExpressionElement
@enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
@enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 INDEX=3
const binary_operators = [ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER]
const unary_operators = [ABS, LOG, EXP, SQRT]
struct ExpressionElement
@@ -17,12 +18,13 @@ struct ExpressionElement
end
const PostfixType = Vector{ExpressionElement}
const cache = Dict{Expr, PostfixType}()
"
Converts a julia expression to its postfix notation.
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision.
NOTE: This function is not thread safe, especially cache access is not thread safe
"
function expr_to_postfix(expr::Expr)::PostfixType
function expr_to_postfix(expr::Expr, cache::Dict{Expr, PostfixType})::PostfixType
if haskey(cache, expr)
return cache[expr]
end
@@ -34,7 +36,7 @@ function expr_to_postfix(expr::Expr)::PostfixType
arg = expr.args[j]
if typeof(arg) === Expr
append!(postfix, expr_to_postfix(arg))
append!(postfix, expr_to_postfix(arg, cache))
elseif typeof(arg) === Symbol # variables/parameters
# maybe TODO: replace the parameters with their respective values, as this might make the expr evaluation faster
exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
@@ -56,6 +58,8 @@ function expr_to_postfix(expr::Expr)::PostfixType
if operator in unary_operators
push!(postfix, convert_to_ExpressionElement(operator))
end
cache[expr] = postfix
return postfix
end

View File

@ -6,6 +6,8 @@ using ..Utils
export interpret
const cacheFrontend = Dict{Expr, PostfixType}()
"Interprets the given expressions with the values provided.
# Arguments
- expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
@@ -13,10 +15,9 @@ export interpret
- parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
"
function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
@inbounds for i in eachindex(expressions)
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
end
variableCols = size(variables, 2) # number of variable sets to use for each expression

View File

@@ -7,21 +7,23 @@ using ..Utils
const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
const cacheFrontend = Dict{Expr, PostfixType}()
const transpilerCache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
varRows = size(variables, 1)
variableCols = size(variables, 2)
kernels = Vector{CuFunction}(undef, length(expressions))
# TODO: test this again with multiple threads. The first time I tried, I was using only one thread
# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
# Threads.@threads for i in eachindex(expressions)
# cacheLock = ReentrantLock()
# cacheHit = false
# lock(cacheLock) do
# if haskey(cache, expressions[i])
# kernels[i] = cache[expressions[i]]
# if haskey(transpilerCache, expressions[i])
# kernels[i] = transpilerCache[expressions[i]]
# cacheHit = true
# end
# end
@@ -42,16 +44,16 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# mod = CuModule(image)
# kernels[i] = CuFunction(mod, "ExpressionProcessing")
# @lock cacheLock cache[expressions[i]] = kernels[i]
# @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
# end
@inbounds for i in eachindex(expressions)
if haskey(cache, expressions[i])
kernels[i] = cache[expressions[i]]
if haskey(transpilerCache, expressions[i])
kernels[i] = transpilerCache[expressions[i]]
continue
end
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
linker = CuLink()
@@ -61,7 +63,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
mod = CuModule(image)
kernels[i] = CuFunction(mod, "ExpressionProcessing")
cache[expressions[i]] = kernels[i]
transpilerCache[expressions[i]] = kernels[i]
end
cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
@@ -78,7 +80,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
end
return cudaResults
end

View File

@@ -73,16 +73,17 @@ end
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re do the tests and perform them on FH PCs)
# University setup at 10.20.1.7 if needed
compareWithCPU = true
compareWithCPU = false
suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
varsets_small = 100
varsets_medium = 1000
varsets_large = 10000
# TODO: see CpuInterpreterTests.jl to see how all data is loaded and implement this here
varsets_small = 1000 # 1k should be absolute minimum
varsets_medium = 10000
varsets_large = 100000 # 100k should be absolute maximum (although not as strict as minimum)
if compareWithCPU
X_small = randn(Float32, varsets_small, 5)
@@ -112,7 +113,7 @@ suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GP
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
results = run(suite, verbose=true, seconds=180)
results = run(suite, verbose=true, seconds=3600) # 1 hour because of CPU. lets see if more is needed
if compareWithCPU
medianCPU = median(results["CPU"])

View File

@@ -3,7 +3,7 @@ using CUDA
using .Transpiler
using .Interpreter
varsets_medium = 1000
varsets_medium = 10000
X = randn(Float32, 5, varsets_medium)
exprsGPU = [