updated all to 32-bit to save registers and boost performance

2024-11-01 11:23:58 +01:00
parent 9fc55c4c15
commit 68cedd75fc
7 changed files with 113 additions and 106 deletions
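Since this commit switches the public signatures from Float64 to Float32, callers that still hold Float64 data need a one-time conversion on the CPU. A minimal, hypothetical caller-side sketch (the placeholder data and the commented-out call are illustrative only; interpret_gpu is still a stub at this point):

# Hypothetical caller-side migration after this commit: convert existing
# Float64 inputs once on the CPU before calling the updated Float32 functions.
X = rand(Float64, 3, 2)                  # placeholder data a caller might already have
p = [rand(Float64, 1), Float64[]]

X32 = Float32.(X)                        # Matrix{Float64} -> Matrix{Float32}
p32 = [Float32.(pv) for pv in p]         # Vector{Vector{Float64}} -> Vector{Vector{Float32}}
# results = interpret_gpu(exprs, X32, p32)   # exprs::Vector{Expr}, see the diff below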


@@ -6,8 +6,6 @@ export interpret_gpu
export evaluate_gpu
export test
-# const SymbolTable64 = Dict{Tuple{Expr, Symbol},Float64}
-#
# Some assertions:
# Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
# each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
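A minimal sketch of the layout these assertions pin down, with made-up expressions and values. How the rows of a column map onto x1, x2, ... is not spelled out in the comments, so that part is an assumption; the call at the end is commented out because interpret_gpu is still a stub here:

exprs = [:(x1 + p1), :(x1 * 2.0f0)]   # naming starts at 1: x1/p1, never x0/p0

# column i of X belongs to exprs[i]; assumption: row j of that column feeds xj
X = Float32[1.0 4.0;
            2.0 5.0]

p = [Float32[0.5], Float32[]]         # p[i] are the parameters of exprs[i]

# results = interpret_gpu(exprs, X, p)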
@@ -15,22 +13,14 @@ export test
#
# Evaluate Expressions on the GPU
-function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Ensure that no two expressions are interpreted in the same "warp"
exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
end
# Convert Expressions to PTX Code and execute that instead
-function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
-	# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
-end
-# TODO: See if it is feasible to make 32 versions too (mostly because 32 is significantly faster than 64)
-# If AMD GPU support gets added, it might even be a good idea to add 16 bit floats, since they are even faster than 32 bit. On Nvidia 16 is either slower or equal in performance to 32 bit
-function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
-end
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
end
end
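The two ideas in this hunk (keeping expressions out of each other's warps, and generating/executing PTX) can be illustrated with plain CUDA.jl, independently of this package. A rough sketch, assuming a throwaway kernel dummy_expr_kernel! that is not part of the repo and an NVIDIA GPU to actually run it:

using CUDA

# Placeholder kernel: thread 1 of the block tags the result slot of "its" expression.
function dummy_expr_kernel!(results, exprIdx)
    if threadIdx().x == 1
        @inbounds results[exprIdx] = Float32(exprIdx)
    end
    return nothing
end

nExprs  = 4
results = CUDA.zeros(Float32, nExprs)

# One launch (and therefore one block) per expression: no two expressions ever share a warp.
for i in 1:nExprs
    @cuda threads=32 blocks=1 dummy_expr_kernel!(results, i)
end

# CUDA.jl can dump the PTX it generates for a launch, which is handy when
# experimenting with the "convert to PTX and execute" route:
@device_code_ptx @cuda threads=32 dummy_expr_kernel!(results, 1)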
@@ -45,4 +35,4 @@ end
#
# The following can be done on the CPU
# convert expression to postfix notation (mandatory)
# replace every variable with the corresponding value from X and p (reduce extensive memory access on the GPU)
# optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)
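The CPU-side steps listed above start with postfix conversion. A small, repo-independent sketch of that step; the function name to_postfix is illustrative and not the package's ExpressionProcessing API:

# Generic postfix conversion for call expressions: operands first, operator last.
function to_postfix(ex)::Vector{Any}
    ex isa Expr || return Any[ex]          # leaf: a number, or an x_i / p_i symbol
    @assert ex.head == :call "only call expressions are handled in this sketch"
    out = Any[]
    for arg in ex.args[2:end]
        append!(out, to_postfix(arg))      # operands
    end
    push!(out, ex.args[1])                 # operator
    return out
end

to_postfix(:(x1 + p1 * 2.0f0))             # Any[:x1, :p1, 2.0f0, :*, :+]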