implementation: finished pre-processing section; updated code

2025-04-26 13:46:23 +02:00
parent ad2eab2e0a
commit e571fa5bd6
10 changed files with 238 additions and 46 deletions
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -7,21 +7,23 @@ using ..Utils

 const BYTES = sizeof(Float32)
 const Operand = Union{Float32, String} # Operand is either fixed value or register
-cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
+
+const cacheFrontend = Dict{Expr, PostfixType}()
+const transpilerCache = Dict{Expr, CuFunction}() # caches compiled kernels so that repeated runs with the same expr but different parameters skip transpilation

 function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
 	varRows = size(variables, 1)
 	variableCols = size(variables, 2)
 	kernels = Vector{CuFunction}(undef, length(expressions))
-	
+
 	# TODO: test this again with multiple threads. The first time I tried, I was using only one thread
 	# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
 	# Threads.@threads for i in eachindex(expressions)
 	# 	cacheLock = ReentrantLock()
 	# 	cacheHit = false
 	# 	lock(cacheLock) do 
-	# 		if haskey(cache, expressions[i])
-	# 			kernels[i] = cache[expressions[i]]
+	# 		if haskey(transpilerCache, expressions[i])
+	# 			kernels[i] = transpilerCache[expressions[i]]
 	# 			cacheHit = true
 	# 		end
 	# 	end
@ -42,16 +44,16 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# 	mod = CuModule(image)
 	# 	kernels[i] = CuFunction(mod, "ExpressionProcessing")

-	# 	@lock cacheLock cache[expressions[i]] = kernels[i]
+	# 	@lock cacheLock transpilerCache[expressions[i]] = kernels[i]
 	# end

 	@inbounds for i in eachindex(expressions)
-		if haskey(cache, expressions[i])
-			kernels[i] = cache[expressions[i]]
+		if haskey(transpilerCache, expressions[i])
+			kernels[i] = transpilerCache[expressions[i]]
 			continue
 		end

-		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
+		formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
 		kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
 		
 		linker = CuLink()
@ -61,7 +63,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 		
 		mod = CuModule(image)
 		kernels[i] = CuFunction(mod, "ExpressionProcessing")
-		cache[expressions[i]] = kernels[i]
+		transpilerCache[expressions[i]] = kernels[i]
 	end

 	cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
@ -78,7 +80,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 	end
-
+	
 	return cudaResults
 end