module ExpressionExecutorCuda
include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")
include("Transpiler.jl")

# CPU-based interpreter, wrapped in its own submodule. `interpret_cpu` accesses it as
# `CpuInterpreter.Interpreter{Float32}` and `CpuInterpreter.interpret!`; the separate namespace
# presumably keeps those names apart from the GPU `Interpreter` used by `interpret_gpu`.
module CpuInterpreter
include("Code.jl")
include("CpuInterpreter.jl")
end

using CUDA
using ..ExpressionProcessing

export interpret_gpu,interpret_cpu
export evaluate_gpu

# Assumptions made by the functions in this module:
# Variables and parameters are indexed starting at 1, i.e. the first variable/parameter has to be "x1"/"p1" and not "x0"/"p0".
# Matrix X is stored in column-major order.
# Each index i in exprs has to have its matching values in column i of matrix X, so that X[:,i] contains the values for expr[i]. The same goes for p.
#     This assumption is stated explicitly because in Julia the first index of an array does not have to be 1.
#

"""
    interpret_gpu(expressions, X, p; repetitions=1) -> Matrix{Float32}

Evaluate `expressions` on the GPU using the interpreter.

Every expression is converted to postfix form with `ExpressionProcessing.expr_to_postfix`,
the postfix expressions are packed into a single CUDA array, `X` is uploaded to the GPU and
`Interpreter.interpret` is called `repetitions` times. The result of the last call is returned.

# Arguments
- `expressions`: the expressions to evaluate.
- `X`: the variable matrix; it is uploaded to the GPU unchanged.
- `p`: one parameter vector per expression; must have the same axes as `expressions`.

# Keywords
- `repetitions`: how often the evaluation is repeated (simulates parameter tuning). Must be at least 1.

# Returns
The matrix produced by `Interpreter.interpret`
(presumably of size `(size(X, 2), length(expressions))` -- TODO confirm against `Interpreter.interpret`).

# Throws
- `AssertionError` if the axes of `expressions` and `p` differ.
- `ArgumentError` if `repetitions < 1`, because then there would be no result to return.
"""
function interpret_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
	@assert axes(expressions) == axes(p)
	# Without at least one evaluation there is nothing meaningful to return.
	repetitions >= 1 || throw(ArgumentError("repetitions must be at least 1, got $repetitions"))

	variableRows, variableCols = size(X)
	variables = CuArray(X)

	# Convert all expressions to postfix form; each iteration writes to its own slot
	exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
	@inbounds Threads.@threads for i in eachindex(expressions)
		exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
	end
	cudaExprs = Utils.create_cuda_array(exprs, ExpressionProcessing.ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression;
	exprsLength = length(exprs)
	exprsInnerLength = Utils.get_max_inner_length(exprs)

	# Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
	# The first evaluation initialises `results`, so no placeholder matrix has to be allocated.
	results = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, variables, variableCols, variableRows, p)
	for _ in 2:repetitions
		results = Interpreter.interpret(cudaExprs, exprsLength, exprsInnerLength, variables, variableCols, variableRows, p)
	end

	return results
end

"""
    evaluate_gpu(expressions, X, p; repetitions=1) -> Matrix{Float32}

Convert `expressions` to PTX code and execute the compiled kernels on the GPU.

Every expression is converted to postfix form, transpiled with `Transpiler.transpile` and
compiled with `Transpiler.compile_kernel`. Afterwards `Transpiler.evaluate` is called
`repetitions` times with the compiled kernels and the result of the last call is returned.

# Arguments
- `expressions`: the expressions to evaluate.
- `X`: the variable matrix; `size(X, 2)` is treated as the number of variable sets and
  `size(X, 1)` as the size of one variable set.
- `p`: one parameter vector per expression; must have the same axes as `expressions`.

# Keywords
- `repetitions`: how often the evaluation is repeated (simulates parameter tuning). Must be at least 1.

# Returns
The matrix produced by `Transpiler.evaluate`
(presumably of size `(size(X, 2), length(expressions))` -- TODO confirm against `Transpiler.evaluate`).

# Throws
- `AssertionError` if the axes of `expressions` and `p` differ.
- `ArgumentError` if `repetitions < 1`, because then there would be no result to return.
"""
function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
	@assert axes(expressions) == axes(p)
	# Without at least one evaluation there is nothing meaningful to return.
	repetitions >= 1 || throw(ArgumentError("repetitions must be at least 1, got $repetitions"))

	variableSetSize, numVariableSets = size(X) # nr. of rows / nr. of columns of X
	variables = CuArray(X)

	largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into matrix. Will be nr. of rows in parameter matrix

	# Transpile and compile one kernel per expression; each iteration writes to its own slot
	compiledKernels = Vector{CuFunction}(undef, length(expressions))
	kernelName = "evaluate_gpu"
	@inbounds Threads.@threads for i in eachindex(expressions)
		ex = ExpressionProcessing.expr_to_postfix(expressions[i])
		ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
		compiledKernels[i] = Transpiler.compile_kernel(ptxKernel, kernelName)
	end

	# Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
	# The first evaluation initialises `results`, so no placeholder matrix has to be allocated.
	results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p, kernelName)
	for _ in 2:repetitions
		results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p, kernelName)
	end

	return results
end


# Evaluate the single expression `expr` `repetitions` times with the parameters `params`, writing into `out`.
function _interpret_cpu_repeated!(out, expr::Expr, X::Matrix{Float32}, params::Vector{Float32}, repetitions)
	# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
	interpreter = CpuInterpreter.Interpreter{Float32}(expr, length(params))

	# If an expression has to be evaluated multiple times (e.g. for different parameters),
	# it is worthwhile to reuse the interpreter to reduce the number of allocations
	for _ in 1:repetitions
		CpuInterpreter.interpret!(out, interpreter, X, params)
	end

	return out
end

"""
    interpret_cpu(exprs, X, p; repetitions=1, parallel=false) -> Matrix{Float32}

Evaluate `exprs` on the CPU.

For every expression one `CpuInterpreter.Interpreter{Float32}` is created and reused for all
`repetitions` calls of `CpuInterpreter.interpret!`. Column `i` of the returned matrix holds the
result of `exprs[i]` evaluated with the parameters `p[i]`.

# Arguments
- `exprs`: the expressions to evaluate.
- `X`: the variable matrix; the result has `size(X, 1)` rows.
  NOTE(review): the GPU functions size their result by `size(X, 2)` instead -- confirm the
  layout of `X` expected by `CpuInterpreter.interpret!`.
- `p`: one parameter vector per expression; must have the same axes as `exprs`.

# Keywords
- `repetitions`: how often each expression is evaluated. Must be at least 1.
- `parallel`: if `true`, the expressions are distributed over threads with `Threads.@threads`.

# Returns
A `Matrix{Float32}` of size `(size(X, 1), length(exprs))`.

# Throws
- `AssertionError` if the axes of `exprs` and `p` differ.
- `ArgumentError` if `repetitions < 1`, because the result matrix would never be filled.
"""
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
	@assert axes(exprs) == axes(p)
	# `res` is allocated uninitialised and only filled by the evaluations below.
	repetitions >= 1 || throw(ArgumentError("repetitions must be at least 1, got $repetitions"))

	nrows = size(X, 1)

	# each column of the matrix has the result for an expr
	res = Matrix{Float32}(undef, nrows, length(exprs))

	if parallel
		# Every iteration writes to its own column of `res`
		Threads.@threads for i in eachindex(exprs)
			_interpret_cpu_repeated!((@view res[:, i]), exprs[i], X, p[i], repetitions)
		end
	else
		for i in eachindex(exprs)
			_interpret_cpu_repeated!((@view res[:, i]), exprs[i], X, p[i], repetitions)
		end
	end

	return res
end

end