module ExpressionExecutorCuda
include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")

# Nested module that loads the sources "Code.jl" and "CpuInterpreter.jl" into their own namespace.
# Names defined in those files are reached from the enclosing module via the `CpuInterpreter.` prefix.
module CpuInterpreter
include("Code.jl")
include("CpuInterpreter.jl")
end

export interpret_gpu,interpret_cpu
export evaluate_gpu
export test

# Some assertions:
# Variable and parameter names are numbered starting at 1, meaning the first variable/parameter has to be "x1"/"p1" and not "x0"/"p0"
# Matrix X is column major
# Each index i in exprs has to have its matching values in column i of Matrix X, so that X[:,i] contains the values for exprs[i]. The same goes for p
#     This assertion is made because in Julia the first index doesn't have to be 1
#

# Evaluate Expressions on the GPU
# NOTE: currently a stub. It only checks that exprs and p share the same axes and allocates the
#       output buffer (size(X, 2) rows, one column per expression). Nothing is interpreted yet,
#       so the returned matrix contains uninitialised values.
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
	@assert axes(exprs) == axes(p)

	# TODO: interpret the expressions and fill the buffer
	out = Matrix{Float32}(undef, size(X, 2), length(exprs))
	return out
end

# Convert Expressions to PTX Code and execute that instead
# NOTE: currently a stub. It only checks that exprs and p share the same axes and allocates the
#       output buffer (size(X, 2) rows, one column per expression). No code is generated or
#       executed yet, so the returned matrix contains uninitialised values.
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
	@assert axes(exprs) == axes(p)

	# TODO: transpile the expressions, run them and fill the buffer
	out = Matrix{Float32}(undef, size(X, 2), length(exprs))
	return out
end


# Evaluate Expressions on the CPU
# Returns a matrix with size(X, 1) rows; column i is the output buffer handed to the interpreter
# for exprs[i] together with the parameters p[i].
# `repetitions` is the number of times each expression is interpreted with the same interpreter.
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
	@assert axes(exprs) == axes(p)

	# one result column per expression
	res = Matrix{Float32}(undef, size(X, 1), length(exprs))

	for (idx, expr) in pairs(exprs)
		params = p[idx]

		# The interpreter holds the postfix code and the evaluation buffers and is costly to
		# create, so it is built once per expression and reused for every repetition
		# (e.g. when evaluating with different parameters) to reduce the number of allocations
		interpreter = CpuInterpreter.Interpreter{Float32}(expr, length(params))
		column = @view res[:, idx]

		for _ in 1:repetitions
			CpuInterpreter.interpret!(column, interpreter, X, params)
		end
	end

	return res
end


# Flow
# input: Vector expr    == expressions; contains e.g. 4 expressions
#        Matrix X       == |expr| columns, n rows. n == number of variables x1..xn; n is the same for all expressions --- WRONG
#        Matrix X       == k columns, n rows. k == number of variables in the expressions (every expression must have the same number of variables); n == number of different values for xj, where j is the column
#        VectorVector p == vector of size |expr| containing vectors of size m. m == number of parameters per expression. p can be different for each expression
# 
# The following can be done on the CPU
#     convert expression to postfix notation (mandatory)
#     optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)

end