implementation: finished pre-processing section; updated code

2025-04-26 13:46:23 +02:00
parent ad2eab2e0a
commit e571fa5bd6
10 changed files with 238 additions and 46 deletions
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@ -26,8 +26,9 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
 	ncols = size(X, 2)

 	results = Matrix{Float32}(undef, ncols, length(exprs))
+	# TODO: create CuArray for variables here already, as they never change

-	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
+	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
 		results = Interpreter.interpret(exprs, X, p)
 	end

@ -40,8 +41,9 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
 	ncols = size(X, 2)

 	results = Matrix{Float32}(undef, ncols, length(exprs))
+	# TODO: create CuArray for variables here already, as they never change

-	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
+	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
 		results = Transpiler.evaluate(exprs, X, p)
 	end

--- a/package/src/ExpressionProcessing.jl
+++ b/package/src/ExpressionProcessing.jl
@ -9,6 +9,7 @@ export ExpressionElement
@enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
@enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 INDEX=3

+const binary_operators = [ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER]
 const unary_operators = [ABS, LOG, EXP, SQRT]

 struct ExpressionElement
@ -17,12 +18,13 @@ struct ExpressionElement
 end

 const PostfixType = Vector{ExpressionElement}
-const cache = Dict{Expr, PostfixType}()
+
 "
 Converts a julia expression to its postfix notation.
-NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
+NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision.
+NOTE: This function is not thread safe; in particular, access to the cache is not thread safe.
 "
-function expr_to_postfix(expr::Expr)::PostfixType
+function expr_to_postfix(expr::Expr, cache::Dict{Expr, PostfixType})::PostfixType
 	if haskey(cache, expr)
 		return cache[expr]
 	end
@ -34,7 +36,7 @@ function expr_to_postfix(expr::Expr)::PostfixType
 		arg = expr.args[j]

 		if typeof(arg) === Expr
-			append!(postfix, expr_to_postfix(arg))
+			append!(postfix, expr_to_postfix(arg, cache))
 		elseif typeof(arg) === Symbol # variables/parameters
 			# maybe TODO: replace the parameters with their respective values, as this might make the expr evaluation faster
 			exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
@ -56,6 +58,8 @@ function expr_to_postfix(expr::Expr)::PostfixType
 	if operator in unary_operators 
 		push!(postfix, convert_to_ExpressionElement(operator))
 	end
+
+	cache[expr] = postfix
 	return postfix
 end

--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -6,6 +6,8 @@ using ..Utils

 export interpret

+const cacheFrontend = Dict{Expr, PostfixType}()
+
 "Interprets the given expressions with the values provided.
 # Arguments
 - expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
@ -13,10 +15,9 @@ export interpret
 - parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
 "
 function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
-	
 	exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
 	@inbounds for i in eachindex(expressions)
-		exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+		exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
 	end
 	
 	variableCols = size(variables, 2) # number of variable sets to use for each expression
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -7,21 +7,23 @@ using ..Utils

 const BYTES = sizeof(Float32)
 const Operand = Union{Float32, String} # Operand is either fixed value or register
-cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
+
+const cacheFrontend = Dict{Expr, PostfixType}()
+const transpilerCache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed

 function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
 	varRows = size(variables, 1)
 	variableCols = size(variables, 2)
 	kernels = Vector{CuFunction}(undef, length(expressions))
-	
+
 	# TODO: test this again with multiple threads. The first time I tried, I was using only one thread
 	# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
 	# Threads.@threads for i in eachindex(expressions)
 	# 	cacheLock = ReentrantLock()
 	# 	cacheHit = false
 	# 	lock(cacheLock) do 
-	# 		if haskey(cache, expressions[i])
-	# 			kernels[i] = cache[expressions[i]]
+	# 		if haskey(transpilerCache, expressions[i])
+	# 			kernels[i] = transpilerCache[expressions[i]]
 	# 			cacheHit = true
 	# 		end
 	# 	end
@ -42,16 +44,16 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# 	mod = CuModule(image)
 	# 	kernels[i] = CuFunction(mod, "ExpressionProcessing")

-	# 	@lock cacheLock cache[expressions[i]] = kernels[i]
+	# 	@lock cacheLock transpilerCache[expressions[i]] = kernels[i]
 	# end

 	@inbounds for i in eachindex(expressions)
-		if haskey(cache, expressions[i])
-			kernels[i] = cache[expressions[i]]
+		if haskey(transpilerCache, expressions[i])
+			kernels[i] = transpilerCache[expressions[i]]
 			continue
 		end

-		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
+		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
 		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
 		
 		linker = CuLink()
@ -61,7 +63,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		
 		mod = CuModule(image)
 		kernels[i] = CuFunction(mod, "ExpressionProcessing")
-		cache[expressions[i]] = kernels[i]
+		transpilerCache[expressions[i]] = kernels[i]
 	end

 	cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
@ -78,7 +80,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 	end
-
+	
 	return cudaResults
 end