implementation: finished pre-processing section; updated code
@@ -26,8 +26,9 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
     ncols = size(X, 2)
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
     # TODO: create CuArray for variables here already, as they never change
 
-    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
         results = Interpreter.interpret(exprs, X, p)
     end
 
@@ -40,8 +41,9 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
     ncols = size(X, 2)
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
     # TODO: create CuArray for variables here already, as they never change
 
-    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
         results = Transpiler.evaluate(exprs, X, p)
     end
 
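As a side note, a minimal, self-contained sketch of what the "parameter tuning -> local search" comment in these two loops describes: X stays fixed while each parameter vector is nudged by a small step before the next evaluation, which is why the repetitions must run sequentially. The evaluate_batch helper is a hypothetical stand-in for Interpreter.interpret / Transpiler.evaluate, not part of the repository.

# Hypothetical stand-in for Interpreter.interpret / Transpiler.evaluate:
# evaluates every expression for every variable set (column of X).
evaluate_batch(exprs, X, p) = randn(Float32, size(X, 2), length(exprs))

function simulate_local_search(exprs, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=10, stepsize=0.01f0)
    results = Matrix{Float32}(undef, size(X, 2), length(exprs))
    for _ in 1:repetitions
        # X remains the same; only the parameters move in small steps,
        # so each evaluation depends on the previous tuning step.
        for pvec in p
            pvec .+= stepsize .* randn(Float32, length(pvec))
        end
        results = evaluate_batch(exprs, X, p)
    end
    return results
end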
@@ -9,6 +9,7 @@ export ExpressionElement
 @enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
 @enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 INDEX=3
 
 const binary_operators = [ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER]
 const unary_operators = [ABS, LOG, EXP, SQRT]
 
 struct ExpressionElement
@@ -17,12 +18,13 @@ struct ExpressionElement
 end
 
 const PostfixType = Vector{ExpressionElement}
-const cache = Dict{Expr, PostfixType}()
 
 "
 Converts a Julia expression to its postfix notation.
-NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
+NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision.
+NOTE: This function is not thread safe; in particular, the cache access is not thread safe.
 "
-function expr_to_postfix(expr::Expr)::PostfixType
+function expr_to_postfix(expr::Expr, cache::Dict{Expr, PostfixType})::PostfixType
     if haskey(cache, expr)
         return cache[expr]
     end
@@ -34,7 +36,7 @@ function expr_to_postfix(expr::Expr)::PostfixType
         arg = expr.args[j]
 
         if typeof(arg) === Expr
-            append!(postfix, expr_to_postfix(arg))
+            append!(postfix, expr_to_postfix(arg, cache))
         elseif typeof(arg) === Symbol # variables/parameters
             # maybe TODO: replace the parameters with their respective values, as this might make the expr evaluation faster
             exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
@@ -56,6 +58,8 @@ function expr_to_postfix(expr::Expr)::PostfixType
         if operator in unary_operators
             push!(postfix, convert_to_ExpressionElement(operator))
         end
 
+    cache[expr] = postfix
     return postfix
 end
 
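A rough sketch of the call pattern this signature change enables: the conversion module no longer owns a global cache; each frontend passes its own dictionary in, and the result is memoized per Expr. The demo_* names and the simplified Vector{Symbol} element type are stand-ins for the real ExpressionElement/PostfixType, chosen only to keep the example self-contained.

const DemoPostfix = Vector{Symbol}

function demo_expr_to_postfix(expr::Expr, cache::Dict{Expr, DemoPostfix})::DemoPostfix
    haskey(cache, expr) && return cache[expr]
    postfix = Symbol[]
    for arg in expr.args[2:end]                # operands first ...
        if arg isa Expr
            append!(postfix, demo_expr_to_postfix(arg, cache))  # recurse with the same cache
        elseif arg isa Symbol
            push!(postfix, arg)                # variables/parameters (numeric literals omitted for brevity)
        end
    end
    push!(postfix, expr.args[1])               # ... operator last -> postfix order
    cache[expr] = postfix                      # memoize for repeated conversions of the same Expr
    return postfix
end

# Each caller owns its cache, mirroring the cacheFrontend dictionaries added below:
frontendCache = Dict{Expr, DemoPostfix}()
demo_expr_to_postfix(:(x1 + p1 * x2), frontendCache)   # -> [:x1, :p1, :x2, :*, :+]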
@@ -6,6 +6,8 @@ using ..Utils
 
 export interpret
 
+const cacheFrontend = Dict{Expr, PostfixType}()
+
 "Interprets the given expressions with the values provided.
 # Arguments
 - expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
@@ -13,10 +15,9 @@ export interpret
 - parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
 "
 function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
-
     exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
     @inbounds for i in eachindex(expressions)
-        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
     end
 
     variableCols = size(variables, 2) # number of variable sets to use for each expression
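The docstring change above warns that expr_to_postfix and its cache access are not thread safe, so the shared cacheFrontend only works because this preprocessing loop runs sequentially. If this loop (or the commented-out parallel transpiler loop further down) were parallelized, the lookup-or-convert step would need a lock. A standalone sketch, using a hypothetical demoCache/demoCacheLock pair and a trivial stand-in conversion in place of cacheFrontend and expr_to_postfix:

const demoCache = Dict{Expr, Vector{Symbol}}()   # stands in for cacheFrontend
const demoCacheLock = ReentrantLock()

# Guarded lookup-or-compute; safe to call from multiple threads.
function cached_postfix(expr::Expr)
    lock(demoCacheLock) do
        get!(demoCache, expr) do
            Symbol[Symbol(expr.head)]            # stand-in for expr_to_postfix(expr, cache)
        end
    end
end

exprs = [:(x1 + p1), :(x2 * p2), :(x1 + p1)]
Threads.@threads for i in eachindex(exprs)
    cached_postfix(exprs[i])
end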
@@ -7,21 +7,23 @@ using ..Utils
 
 const BYTES = sizeof(Float32)
 const Operand = Union{Float32, String} # Operand is either fixed value or register
-cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
+const cacheFrontend = Dict{Expr, PostfixType}()
+const transpilerCache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
 
 function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
     varRows = size(variables, 1)
     variableCols = size(variables, 2)
     kernels = Vector{CuFunction}(undef, length(expressions))
 
     # TODO: test this again with multiple threads. The first time I tried, I was using only one thread
     # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
     # Threads.@threads for i in eachindex(expressions)
     #     cacheLock = ReentrantLock()
     #     cacheHit = false
     #     lock(cacheLock) do
-    #         if haskey(cache, expressions[i])
-    #             kernels[i] = cache[expressions[i]]
+    #         if haskey(transpilerCache, expressions[i])
+    #             kernels[i] = transpilerCache[expressions[i]]
     #             cacheHit = true
     #         end
     #     end
@@ -42,16 +44,16 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
     #     mod = CuModule(image)
     #     kernels[i] = CuFunction(mod, "ExpressionProcessing")
 
-    #     @lock cacheLock cache[expressions[i]] = kernels[i]
+    #     @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
     # end
 
     @inbounds for i in eachindex(expressions)
-        if haskey(cache, expressions[i])
-            kernels[i] = cache[expressions[i]]
+        if haskey(transpilerCache, expressions[i])
+            kernels[i] = transpilerCache[expressions[i]]
             continue
         end
 
-        formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
+        formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
         kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
 
         linker = CuLink()
@@ -61,7 +63,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 
         mod = CuModule(image)
         kernels[i] = CuFunction(mod, "ExpressionProcessing")
-        cache[expressions[i]] = kernels[i]
+        transpilerCache[expressions[i]] = kernels[i]
     end
 
     cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
@@ -78,7 +80,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 
         cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
     end
 
     return cudaResults
 end
 
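The haskey/continue-then-assign pattern used here for transpilerCache (and for cacheFrontend in the interpreter) could also be expressed with Base.get!, which runs the expensive step only on a cache miss. A sketch with a hypothetical compile_kernel stand-in for the transpile -> CuLink -> CuModule -> CuFunction pipeline:

# Hypothetical stand-in for the PTX transpilation and module loading;
# the only property the cache relies on is that the result is reusable per Expr.
compile_kernel(expr::Expr) = string("ptx for ", expr)

const demoTranspilerCache = Dict{Expr, String}()

function cached_kernel(expr::Expr)
    # Looks the expression up and only runs the do-block on a miss,
    # storing the result -- equivalent to the haskey/continue plus assignment above.
    get!(demoTranspilerCache, expr) do
        compile_kernel(expr)
    end
end

The behavior is identical; whether get! is preferable inside evaluate mostly depends on whether the kernel construction should stay inlined in the loop.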
@@ -73,16 +73,17 @@ end
 # Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs)
 # University setup at 10.20.1.7 if needed
 
-compareWithCPU = true
+compareWithCPU = false
 
 
 suite = BenchmarkGroup()
 suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
-varsets_small = 100
-varsets_medium = 1000
-varsets_large = 10000
+# TODO: see CpuInterpreterTests.jl to see how all data is loaded and implement this here
+varsets_small = 1000 # 1k should be the absolute minimum
+varsets_medium = 10000
+varsets_large = 100000 # 100k should be the absolute maximum (although not as strict as the minimum)
 
 if compareWithCPU
     X_small = randn(Float32, varsets_small, 5)
@@ -112,7 +113,7 @@ suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GP
 
 loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
 
-results = run(suite, verbose=true, seconds=180)
+results = run(suite, verbose=true, seconds=3600) # 1 hour because of the CPU; let's see if more is needed
 
 if compareWithCPU
     medianCPU = median(results["CPU"])
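For reference, the params.json consumed by loadparams! above is usually produced once by tuning the suite and saving its parameters, so long runs like this 3600-second one can skip the per-benchmark tuning phase. A sketch of that workflow, assuming the same suite object and file name used in this script:

using BenchmarkTools

# One-time tuning run: determine evals/samples for each benchmark and persist them.
tune!(suite)
BenchmarkTools.save("params.json", params(suite))

# Later runs reuse the stored parameters instead of re-tuning:
loadparams!(suite, BenchmarkTools.load("params.json")[1], :evals, :samples)
results = run(suite, verbose=true)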
@@ -3,7 +3,7 @@ using CUDA
 using .Transpiler
 using .Interpreter
 
-varsets_medium = 1000
+varsets_medium = 10000
 X = randn(Float32, 5, varsets_medium)
 
 exprsGPU = [