module ExpressionExecutorCuda
include("Interpreter.jl")
include("ExpressionProcessing.jl")

export interpret_gpu
export evaluate_gpu
export test

# const SymbolTable64 = Dict{Tuple{Expr, Symbol},Float64}
#
# Some assertions:
# Variables and parameters are numbered starting at 1, meaning the first variable/parameter has to be named "x1"/"p1" and not "x0"/"p0".
# Each index i in exprs has to have its matching values in column i of Matrix X, so that X[:,i] contains the values for exprs[i]. The same goes for p.
#     This assertion is made because in Julia the first index doesn't have to be 1.
#

"""
    interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})

Evaluate expressions on the GPU (work in progress).

At the moment only the first entry of `exprs` is converted to postfix form with
`ExpressionProcessing.expr_to_postfix`; `X` and `p` are not used yet. The result of that
conversion is what gets returned, subject to the declared `Matrix{Float64}` return type.
"""
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
	# Ensure that no two expressions are interpreted in the same "warp"
	firstPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
	return firstPostfix
end

"""
    evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})

Convert expressions to PTX code and execute that instead of interpreting them.

This method is not implemented yet. Calling it throws an `ErrorException` with an explicit
message, rather than failing on the conversion of `nothing` to `Matrix{Float64}` that an
empty body with this return type annotation would trigger.
"""
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
	# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
	error("evaluate_gpu is not implemented yet for Float64 inputs")
end


# TODO: See if it is feasible to provide 32-bit versions too (mostly because 32-bit is significantly faster than 64-bit)
# If AMD GPU support gets added, it might even be a good idea to add 16-bit floats, since they are even faster than 32-bit. On Nvidia, 16-bit is either slower than or equal in performance to 32-bit
"""
    interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})

32-bit counterpart of the GPU interpreter entry point.

This method is not implemented yet. Calling it throws an `ErrorException` with an explicit
message, rather than failing on the conversion of `nothing` to `Matrix{Float32}` that an
empty body with this return type annotation would trigger.
"""
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
	error("interpret_gpu is not implemented yet for Float32 inputs")
end
"""
    evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})

32-bit counterpart of the PTX-based GPU evaluation entry point.

This method is not implemented yet. Calling it throws an `ErrorException` with an explicit
message, rather than failing on the conversion of `nothing` to `Matrix{Float32}` that an
empty body with this return type annotation would trigger.
"""
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
	error("evaluate_gpu is not implemented yet for Float32 inputs")
end

"""
    test()

Call `Interpreter.CudaTest()` and return whatever it returns.
"""
test() = Interpreter.CudaTest()

"""
    preprocess_expressions!(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})

Perform the pre-processing steps on the expressions and return their postfix forms.

 - Every variable is replaced with the according value stored in `X` and `p`
   (done in place on the entries of `exprs` via `ExpressionProcessing.replace_variables!`).
 - Each expression is then transformed into postfix form with
   `ExpressionProcessing.expr_to_postfix`; the results are returned in the order of `exprs`.
"""
function preprocess_expressions!(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Array{String}
	lookup = ExpressionProcessing.construct_symtable(exprs, X, p)
	postfixForms = String[]

	# Test if multi threading provides a speedup and if it does, roughly determine the size at which it is beneficial.
	for current in exprs
		# Deep copy taken before the in-place rewrite and passed along as the third argument.
		# NOTE(review): presumably the untouched original is needed for symbol table lookups — confirm in ExpressionProcessing.
		pristine = deepcopy(current)
		ExpressionProcessing.replace_variables!(current, lookup, pristine)
		push!(postfixForms, ExpressionProcessing.expr_to_postfix(current))
	end

	return postfixForms
end

end



# Flow
# input: Vector expr    == expressions; contains e.g. 4 expressions
#        Matrix X       == |expr| columns, n rows. n == number of variables x1..xn; n is the same for all expressions --- WRONG
#        Matrix X       == k columns, n rows. k == number of variables in the expressions (every expression must have the same number of variables); n == number of different values for xk where k is the column
#        VectorVector p == vector of size |expr| containing vectors of size m. m == number of parameters per expression. p can be different for each expression
# 
# The following can be done on the CPU
#     convert expression to postfix notation (mandatory)
#     replace every variable with the according value from X and p (reduce extensive memory access on the GPU)