diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl
index c5b8093..6670167 100644
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@@ -2,6 +2,7 @@ module ExpressionExecutorCuda
 include("Utils.jl")
 include("ExpressionProcessing.jl")
 include("Interpreter.jl")
+include("Transpiler.jl")
 
 module CpuInterpreter
 include("Code.jl")
@@ -20,20 +21,31 @@ export test
 #
 # Evaluate Expressions on the GPU
-function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
     @assert axes(exprs) == axes(p)
     ncols = size(X, 2)
-    result = Matrix{Float32}(undef, ncols, length(exprs))
-    # interpret
+    results = Matrix{Float32}(undef, ncols, length(exprs))
+
+    for i in 1:repetitions # Simulate parameter tuning
+        results = Interpreter.interpret(exprs, X, p)
+    end
+
+    return results
 end
 
 # Convert Expressions to PTX Code and execute that instead
-function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
     @assert axes(exprs) == axes(p)
     ncols = size(X, 2)
-    result = Matrix{Float32}(undef, ncols, length(exprs))
+    results = Matrix{Float32}(undef, ncols, length(exprs))
+
+    for i in 1:repetitions # Simulate parameter tuning
+        results = Transpiler.evaluate(exprs, X, p)
+    end
+
+    return results
 end
diff --git a/package/src/ExpressionProcessing.jl b/package/src/ExpressionProcessing.jl
index bbed13f..0b76445 100644
--- a/package/src/ExpressionProcessing.jl
+++ b/package/src/ExpressionProcessing.jl
@@ -71,6 +71,10 @@ function get_operator(op::Symbol)::Operator
         return EXP
     elseif op == :sqrt
         return SQRT
+    elseif op == :powabs
+        return POWER # TODO: Fix this
+    else
+        throw("Operator unknown")
     end
 end
diff --git a/package/src/Interpreter.jl b/package/src/Interpreter.jl
index 3a74efa..913fb9e 100644
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@@ -12,19 +12,25 @@ export interpret
 - variables::Matrix{Float32} : The variables to use. Each column is mapped to the variables x1..xn
 - parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn.
   The number of parameters can be different for every expression
 "
-function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+
+    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
+    for i in eachindex(expressions)
+        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+    end
+
     variableCols = size(variables, 2) # number of variable sets to use for each expression
     cudaVars = CuArray(variables)
     cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
-    cudaExprs = Utils.create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
+    cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
     # put into separate CuArray, as this is static and would be inefficient to send separately to every kernel
-    cudaStepsize = CuArray([Utils.get_max_inner_length(expressions), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
+    cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
 
     # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
-    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
+    cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
 
     # Start kernel for each expression to ensure that no warp is working on different expressions
-    for i in eachindex(expressions)
+    for i in eachindex(exprs)
         kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
         config = launch_configuration(kernel.fun)
         threads = min(variableCols, config.threads)
diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl
index 1cf6c52..57248e3 100644
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@@ -107,7 +107,7 @@ function get_cuda_header()::String
     return "
 .version 8.5
 .target sm_61
-.address_size 32
+.address_size 64
 "
 end
diff --git a/package/test/InterpreterTests.jl b/package/test/InterpreterTests.jl
index d383c70..c4e017b 100644
--- a/package/test/InterpreterTests.jl
+++ b/package/test/InterpreterTests.jl
@@ -21,8 +21,8 @@ parameters[2][1] = 5.0
 parameters[2][2] = 0.0
 
 function testHelper(expression::Expr, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}, expectedResult)
-    postfix = Vector([expr_to_postfix(expression)])
-    result = Interpreter.interpret(postfix, variables, parameters)
+    exprs = Vector([expression])
+    result = Interpreter.interpret(exprs, variables, parameters)
 
     expectedResult32 = convert(Float32, expectedResult)
     @test isequal(result[1,1], expectedResult32)
@@ -127,8 +127,8 @@ end
     expr1 = :((x1 + 5) * p1 - 3 / abs(x2) + (2^4) - log(8))
     expr2 = :(1 + 5 * x1 - 10^2 + (p1 - p2) / 9 + exp(x2))
 
-    postfix = Vector([expr_to_postfix(expr1), expr_to_postfix(expr2)])
-    result = Interpreter.interpret(postfix, var, param)
+    exprs = Vector([expr1, expr2])
+    result = Interpreter.interpret(exprs, var, param)
 
     # var set 1
     @test isapprox(result[1,1],
         37.32, atol=0.01) # expr1
diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl
index 76cf2b0..54d6519 100644
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@@ -1,6 +1,5 @@
 using LinearAlgebra
 using BenchmarkTools
-using BenchmarkPlots, StatsPlots
 
 using .Transpiler
 using .Interpreter
@@ -71,26 +70,40 @@ end
 suite = BenchmarkGroup()
 suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
-# suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
-# suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
+suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
+suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
+
+varsets_small = 100
+varsets_medium = 1000
+varsets_large = 10000
 
-X_small = randn(Float32, 100, 5)
+X_small = randn(Float32, varsets_small, 5)
 suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
-X_normal = randn(Float32, 1000, 5)
-suite["CPU"]["normal varset"] = @benchmarkable interpret_cpu(exprsCPU, X_normal, p; repetitions=expr_reps)
-X_large = randn(Float32, 10000, 5)
+X_medium = randn(Float32, varsets_medium, 5)
+suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
+X_large = randn(Float32, varsets_large, 5)
 suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 
-# tune!(suite)
+X_small_GPU = randn(Float32, 5, varsets_small)
+suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
+suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 
-# BenchmarkTools.save("params.json", params(suite))
-loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+X_medium_GPU = randn(Float32, 5, varsets_medium)
+suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
+suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 
-results = run(suite, verbose=true, seconds=180)
+X_large_GPU = randn(Float32, 5, varsets_large)
+suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
+suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
+
+tune!(suite)
+
+BenchmarkTools.save("params.json", params(suite))
+# loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+
+# results = run(suite, verbose=true, seconds=180)
 # results2 = run(suite, verbose=true, seconds=180)
 
-medianCPU = median(results["CPU"])
-# medianCPU2 = median(results2["CPU"])
+# medianCPU = median(results["CPU"])
 # medianInterpreter = median(results["GPUI"])
 # medianTranspiler = median(results["GPUT"])
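
Note (not part of the patch): with this commit, both GPU entry points take a `repetitions` keyword and convert the `Expr`s to postfix internally. A minimal usage sketch, assuming `interpret_gpu` and `evaluate_gpu` are in scope the same way the benchmarks use them; the expression, parameters and data below are made up. Mind the layout: the GPU paths take one variable set per column (5 x N in the benchmarks), while `interpret_cpu` takes one per row.

exprs = [:(x1 + p1 * x2)]               # illustrative expression with variables x1, x2 and parameter p1
p     = [[0.5f0]]                       # one parameter vector per expression
X     = randn(Float32, 2, 1000)         # 2 variables, 1000 variable sets (one set per column)

res_interp = interpret_gpu(exprs, X, p; repetitions=1)  # GPU interpreter path
res_transp = evaluate_gpu(exprs, X, p; repetitions=1)   # transpiled-PTX path
@assert size(res_interp) == (1000, 1)   # rows = variable sets, one column per expression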
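
Note on the `:powabs` branch in get_operator: mapping it to `POWER` (flagged with `TODO: Fix this`) silently changes semantics if `powabs` carries its usual meaning of a power applied to the absolute value of the base; that meaning is an assumption here, not something the patch states. A one-line sketch of the difference:

powabs(x, y) = abs(x)^y               # assumed semantics: exponentiate |x| so negative bases stay defined
powabs(-2.0f0, 0.5f0) ≈ sqrt(2.0f0)   # true, whereas plain (-2.0f0)^0.5f0 throws a DomainError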
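
Note on PerformanceTests.jl: this commit flips the file into tuning mode (`tune!` plus saving params.json) and comments out the measurement run. A follow-up measurement run would roughly mirror the commented-out lines:

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals)  # reuse the tuned parameters
results = run(suite, verbose=true, seconds=180)

medianCPU         = median(results["CPU"])
medianInterpreter = median(results["GPUI"])
medianTranspiler  = median(results["GPUT"])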