updated all to 32-bit to save registers and boost performance
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled

This commit is contained in:
Daniel 2024-11-01 11:23:58 +01:00
parent 9fc55c4c15
commit 68cedd75fc
7 changed files with 113 additions and 106 deletions

View File

@ -6,8 +6,6 @@ export interpret_gpu
export evaluate_gpu
export test
# const SymbolTable64 = Dict{Tuple{Expr, Symbol},Float64}
#
# Some assertions:
# Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
# each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
@ -15,22 +13,14 @@ export test
#
# Evaluate Expressions on the GPU
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Ensure that no two expressions are interpreted in the same "warp"
exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
end
# Convert Expressions to PTX Code and execute that instead
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float64}, p::Vector{Vector{Float64}})::Matrix{Float64}
# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
end
# TODO: See if it is feasible to make 32 versions too (mostly because 32 is significantly faster than 64)
# If AMD GPU support gets added, it might even be a good idea to add 16 bit floats, since they are even faster than 32 bit. On Nvidia 16 is either slower or equal in performance to 32 bit
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
end
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
end
end
@ -45,4 +35,4 @@ end
#
# The following can be done on the CPU
# convert expression to postfix notation (mandatory)
# replace every variable with the corresponding value from X and p (reduce extensive memory access on the GPU)
# optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)

View File

@ -3,20 +3,23 @@ module ExpressionProcessing
export expr_to_postfix
export PostfixType
export Operator, ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER, ABS, LOG, EXP, SQRT
export ElementType, EMPTY, FLOAT64, OPERATOR, INDEX
export ElementType, EMPTY, FLOAT32, OPERATOR, INDEX
export ExpressionElement
@enum Operator::Int64 ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
@enum ElementType EMPTY=0 FLOAT64=1 OPERATOR=2 INDEX=3
@enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
@enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 INDEX=3
struct ExpressionElement
Type::ElementType
Value::Int64 # Reinterpret the stored value to type "ElementType" when using it
Value::Int32 # Reinterpret the stored value to type "ElementType" when using it
end
const PostfixType = Vector{ExpressionElement}
"Converts a julia expression to its postfix notation"
"
Converts a julia expression to its postfix notation.
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
"
function expr_to_postfix(expr::Expr)::PostfixType
postfix = PostfixType()
operator = get_operator(expr.args[1])
@ -30,7 +33,7 @@ function expr_to_postfix(expr::Expr)::PostfixType
exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
push!(postfix, exprElement)
else
exprElement = convert_to_ExpressionElement(convert(Float64, arg))
exprElement = convert_to_ExpressionElement(convert(Float32, arg))
push!(postfix, exprElement)
end
@ -73,7 +76,7 @@ end
"Extracts the number from a variable/parameter and returns it. If the symbol is a parameter ```pn```, the resulting value will be negative.
```x0 and p0``` are not allowed."
function convert_var_to_int(var::Symbol)::Int
function convert_var_to_int(var::Symbol)::Int32
varStr = String(var)
number = parse(Int32, SubString(varStr, 2))
@ -84,33 +87,39 @@ function convert_var_to_int(var::Symbol)::Int
return number
end
function convert_to_ExpressionElement(element)::ExpressionElement
value = reinterpret(Int64, element)
if element isa Float64
return ExpressionElement(FLOAT64, value)
elseif element isa Int64
return ExpressionElement(INDEX, value)
elseif element isa Operator
return ExpressionElement(OPERATOR, value)
else
error("Element was of type '$(typeof(element))', which is not supported.")
end
"Wraps a 32-bit integer in an `ExpressionElement` tagged as `INDEX`. The value is stored unchanged."
function convert_to_ExpressionElement(element::Int32)::ExpressionElement
    # Reinterpreting an Int32 as Int32 keeps the bits as they are
    return ExpressionElement(INDEX, reinterpret(Int32, element))
end
"
Wraps a 64-bit integer in an `ExpressionElement` tagged as `INDEX`.
The value is narrowed to `Int32` with `convert` first, which throws an `InexactError` if it does not fit into 32 bits.
"
function convert_to_ExpressionElement(element::Int64)::ExpressionElement
    narrowed = convert(Int32, element)
    return ExpressionElement(INDEX, reinterpret(Int32, narrowed))
end
"Wraps a `Float32` in an `ExpressionElement` tagged as `FLOAT32`. The bit pattern of the float is stored in the `Int32` value field."
function convert_to_ExpressionElement(element::Float32)::ExpressionElement
    # Only the bits are carried over; the consumer has to reinterpret them back to Float32
    return ExpressionElement(FLOAT32, reinterpret(Int32, element))
end
"
Wraps a `Float64` in an `ExpressionElement` tagged as `FLOAT32`.
The value is converted to `Float32` first (precision may be lost) and the resulting bit pattern is stored in the `Int32` value field.
"
function convert_to_ExpressionElement(element::Float64)::ExpressionElement
    rounded = convert(Float32, element)
    return ExpressionElement(FLOAT32, reinterpret(Int32, rounded))
end
"Wraps an `Operator` in an `ExpressionElement` tagged as `OPERATOR`. The enum value is reinterpreted as `Int32` for storage."
function convert_to_ExpressionElement(element::Operator)::ExpressionElement
    # reinterpret needs matching bit sizes, so this relies on Operator being a 32-bit enum
    return ExpressionElement(OPERATOR, reinterpret(Int32, element))
end
#
# Everything below is currently not needed. Left here for potential future use
#
const SymbolTable64 = Dict{Tuple{Expr, Symbol},Float64}
const SymbolTable32 = Dict{Tuple{Expr, Symbol},Float32}
"Replaces all the variables and parameters of the given expression with their corresponding Value stored in the symtable
# Arguments
- `symtable::SymbolTable64`: Contains the values of all variables for each expression
- `symtable::SymbolTable32`: Contains the values of all variables for each expression
- `originalExpr::Expr`: Contains a deep copy of the original expression. It is used to link the expression and variables to their according Value stored in the symtable
"
function replace_variables!(ex::Expr, symtable::SymbolTable64, originalExpr::Expr)
function replace_variables!(ex::Expr, symtable::SymbolTable32, originalExpr::Expr)
for i in 1:length(ex.args)
arg = ex.args[i]
if typeof(arg) === Expr
@ -123,8 +132,8 @@ end
# TODO: Completely rewrite this function because I misunderstood it. Not every column is linked to an expression; therefore, all other functions need to be reworked as well. Probably can't replace the variables in Julia anymore; look into this. (See ExpressionExecutorCuda.jl for more info.)
# Before rewriting, proceed with just creating a postfix notation and sending the variable matrix as well as the parameter "matrix" to the GPU to perform first calculations
function construct_symtable(expressions::Vector{Expr}, mat::Matrix{Float64}, params::Vector{Vector{Float64}})::SymbolTable64
symtable = SymbolTable64()
function construct_symtable(expressions::Vector{Expr}, mat::Matrix{Float32}, params::Vector{Vector{Float32}})::SymbolTable32
symtable = SymbolTable32()
for i in eachindex(expressions)
expr = expressions[i]
@ -138,7 +147,7 @@ function construct_symtable(expressions::Vector{Expr}, mat::Matrix{Float64}, par
return symtable
end
function fill_symtable!(expr::Expr, symtable::SymbolTable64, values::Vector{Float64}, symbolPrefix::String)
function fill_symtable!(expr::Expr, symtable::SymbolTable32, values::Vector{Float32}, symbolPrefix::String)
varIndex = 1
for j in eachindex(values)
val = values[j]

View File

@ -8,19 +8,19 @@ export interpret
"Interprets the given expressions with the values provided.
# Arguments
- expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
- variables::Matrix{Float64} : The variables to use. Each column is mapped to the variables x1..xn
- parameters::Vector{Vector{Float64}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
- variables::Matrix{Float32} : The variables to use. Each column is mapped to the variables x1..xn
- parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
"
function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float64}, parameters::Vector{Vector{Float64}})::Matrix{Float64}
function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
variableCols = size(variables, 2) # number of sets of variables to use for each expression
cudaVars = CuArray(variables)
cudaParams = create_cuda_array(parameters, NaN64) # column corresponds to data for one expression
cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
cudaResults = CuArray{Float64}(undef, variableCols, length(expressions))
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
# Start kernel for each expression to ensure that no warp is working on different expressions
for i in eachindex(expressions)
@ -37,7 +37,7 @@ end
#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
const MAX_STACK_SIZE = 25 # The max number of values the expression can have. so Constant values, Variables and parameters
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float64}, parameters::CuDeviceArray{Float64}, results::CuDeviceArray{Float64}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x
stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
@ -46,7 +46,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
variableCols = length(variables) / stepsize[3]
operationStack = MVector{MAX_STACK_SIZE, Float64}(undef) # Try to get this to function with variable size too, to allow better memory usage
operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
operationStackTop = 0 # stores index of the last defined/valid value
for varSetIndex in index:stride
@ -65,9 +65,9 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
val = -val
operationStack[operationStackTop] = parameters[firstParamIndex + val]
end
elseif expressions[i].Type == FLOAT64
elseif expressions[i].Type == FLOAT32
operationStackTop += 1
operationStack[operationStackTop] = reinterpret(Float64, expressions[i].Value)
operationStack[operationStackTop] = reinterpret(Float32, expressions[i].Value)
elseif expressions[i].Type == OPERATOR
type = reinterpret(Operator, expressions[i].Value)
if type == ADD

View File

@ -106,11 +106,11 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
ptxBuffer = IOBuffer()
println(ptxBuffer, get_cuda_header())
println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int64, Float64]))
println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int32, Float32]))
println(ptxBuffer, "{")
# TODO: Actually calculate the number of needed registers and extend to more register kinds
println(ptxBuffer, get_register_definitions(1, 5, 1)) # apparently I can define registers anywhere. This might make things easier
println(ptxBuffer, get_register_definitions(1, 5, 0)) # apparently I can define registers anywhere. This might make things easier
# TODO: Parameter loading
println(ptxBuffer, get_guard_clause())
@ -120,7 +120,9 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
# Variables have: %var0 to %varn - 1
# Parameters have: %param0 to %paramn - 1
# Code goes here
println(ptxBuffer, generate_calculation_code(expression))
(calc_code, fRegisterCount) = generate_calculation_code(expression)
println(ptxBuffer, get_register_definitions(0, 0, fRegisterCount))
println(ptxBuffer, calc_code)
# exit jump location
print(ptxBuffer, exitJumpLocationMarker); println(ptxBuffer, ": ret;")
@ -184,7 +186,8 @@ function get_guard_clause()::String
return String(take!(guardBuffer))
end
function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::String
# TODO: refactor this for better usage. Maybe make this generate only one register definition and pass in the details
function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat32::Int)::String
registersBuffer = IOBuffer()
if nrPred > 0
@ -193,8 +196,8 @@ function get_register_definitions(nrPred::Int, nr32Bit::Int, nrFloat64::Int)::St
if nr32Bit > 0
println(registersBuffer, ".reg .b32 %r<$nr32Bit>;")
end
if nrFloat64 > 0
println(registersBuffer, ".reg .f64 %f<$nrFloat64>;")
if nrFloat32 > 0
println(registersBuffer, ".reg .f32 %f<$nrFloat32>;")
end
return String(take!(registersBuffer))
@ -205,38 +208,41 @@ end
# Probably do this: Get Expr -> traverse tree -> if child node is Expr: basically replace that node with the register containing the result of that Expr
# Current assumption: Expression only made out of constant values
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::Tuple{String, Int}
codeBuffer = IOBuffer()
operands = Vector{Float64}()
operands = Vector{Union{Float32, String}}() # Maybe make it of type ANY. Then I could put the register name "on the stack" instead and build up the code like that. Could also make it easier implementing variables/params
registerCounter = 0
println(expression)
for i in eachindex(expression)
token = expression[i]
if token.Type == FLOAT64
push!(operands, reinterpret(Float64, token.Value))
if token.Type == FLOAT32
push!(operands, reinterpret(Float32, token.Value))
elseif token.Type == OPERATOR
operator = get_ptx_operator(reinterpret(Operator, token.Value))
print(codeBuffer, " $operator %f$registerCounter ")
register = "%f$registerCounter"
print(codeBuffer, " $operator $register, ")
# Ugly temporary proof of concept which is ignoring unary operators
if length(operands) == 0
print(codeBuffer, "%f")
print(codeBuffer, registerCounter - 2) # add result before previous result
end
print(codeBuffer, " ")
if length(operands) <= 1
print(codeBuffer, "%f")
print(codeBuffer, registerCounter - 1) # add previous result
end
print(codeBuffer, " ")
# if length(operands) == 0
# print(codeBuffer, "%f")
# print(codeBuffer, registerCounter - 2) # add result before previous result
# end
# print(codeBuffer, " ")
# if length(operands) <= 1
# print(codeBuffer, "%f")
# print(codeBuffer, registerCounter - 1) # add previous result
# end
# print(codeBuffer, " ")
ops = last(operands, 2)
pop!(operands);pop!(operands)
print(codeBuffer, join(ops, ", ")) # if operands has too few values it means the previous calculation is needed. So we need to use registerCounter - 1 or registerCounter - 2 previous registers
println(codeBuffer, ";")
# empty!(operands)
push!(operands, register)
registerCounter += 1
end
@ -247,14 +253,14 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType)
# on all other operations either 1 or 2 (one if unary and two if binary operator)
end
return String(take!(codeBuffer))
return (String(take!(codeBuffer)), registerCounter)
end
function type_to_ptx_type(type::DataType)::String
if type == Int64
return ".s64"
elseif type == Float64
return ".f64"
elseif type == Float32
return ".f32"
else
return ".b64"
end
@ -264,23 +270,23 @@ end
# Left out for now since I don't have register management yet
function get_ptx_operator(operator::Operator)::String
if operator == ADD
return "add.f64"
return "add.f32"
elseif operator == SUBTRACT
return "sub.f64"
return "sub.f32"
elseif operator == MULTIPLY
return "mul.f64"
return "mul.f32"
elseif operator == DIVIDE
return "div.approx.f64"
return "div.approx.f32"
elseif operator == POWER
return ""
elseif operator == ABS
return "abs.f64"
return "abs.f32"
elseif operator == LOG
return "lg2.approx.f64"
return "lg2.approx.f32"
elseif operator == EXP
return ""
elseif operator == SQRT
return "sqrt.approx.f64"
return "sqrt.approx.f32"
else
throw(ArgumentError("Operator conversion to ptx not implemented for $operator"))
end

View File

@ -1,20 +1,20 @@
using .ExpressionProcessing
expressions = Vector{Expr}(undef, 1)
variables = Matrix{Float64}(undef, 1,2)
parameters = Vector{Vector{Float64}}(undef, 1)
variables = Matrix{Float32}(undef, 1,2)
parameters = Vector{Vector{Float32}}(undef, 1)
# Resulting value should be 10
expressions[1] = :(x1 + 1 * x2 + p1)
variables[1,1] = 2
variables[1,2] = 3
parameters[1] = Vector{Float64}(undef, 1)
parameters[1] = Vector{Float32}(undef, 1)
parameters[1][1] = 5
@testset "Test conversion expression element" begin
reference1 = ExpressionElement(FLOAT64, reinterpret(Int64, 1.0))
reference2 = ExpressionElement(INDEX, reinterpret(Int64, 1))
reference3 = ExpressionElement(OPERATOR, reinterpret(Int64, ADD))
reference1 = ExpressionElement(FLOAT32, reinterpret(Int32, 1f0))
reference2 = ExpressionElement(INDEX, reinterpret(Int32, Int32(1)))
reference3 = ExpressionElement(OPERATOR, reinterpret(Int32, ADD))
@test isequal(reference1, ExpressionProcessing.convert_to_ExpressionElement(1.0))
@test isequal(reference2, ExpressionProcessing.convert_to_ExpressionElement(1))

View File

@ -3,8 +3,8 @@ using .ExpressionProcessing
using .Interpreter
expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float64}(undef, 2,2)
parameters = Vector{Vector{Float64}}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
parameters = Vector{Vector{Float32}}(undef, 2)
# Resulting value should be 10 for the first expression
expressions[1] = :(x1 + 1 * x2 + p1)
@ -13,34 +13,36 @@ variables[1,1] = 2.0
variables[2,1] = 3.0
variables[1,2] = 0.0
variables[2,2] = 5.0
parameters[1] = Vector{Float64}(undef, 1)
parameters[2] = Vector{Float64}(undef, 2)
parameters[1] = Vector{Float32}(undef, 1)
parameters[2] = Vector{Float32}(undef, 2)
parameters[1][1] = 5.0
parameters[2][1] = 5.0
parameters[2][2] = 0.0
function testHelper(expression::Expr, variables::Matrix{Float64}, parameters::Vector{Vector{Float64}}, expectedResult::Float64)
function testHelper(expression::Expr, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}, expectedResult)
postfix = Vector([expr_to_postfix(expression)])
result = Interpreter.interpret(postfix, variables, parameters)
@test isequal(result[1,1], expectedResult)
expectedResult32 = convert(Float32, expectedResult)
@test isequal(result[1,1], expectedResult32)
end
@testset "Test conversion to matrix" begin
reference = Matrix{Float64}(undef, 2, 2)
reference = Matrix{Float32}(undef, 2, 2)
reference[1,1] = 5.0
reference[2,1] = NaN64
reference[2,1] = NaN32
reference[1,2] = 5.0
reference[2,2] = 0.0
# reference = Matrix([5.0, NaN],
# [5.0, 0.0])
result = Interpreter.convert_to_matrix(parameters, NaN64)
result = Interpreter.convert_to_matrix(parameters, NaN32)
@test isequal(result, reference)
end
@testset "Test commutative interpretation" begin
var = Matrix{Float64}(undef, 2, 1)
param = Vector{Vector{Float64}}(undef, 1)
var = Matrix{Float32}(undef, 2, 1)
param = Vector{Vector{Float32}}(undef, 1)
expectedResult = 8.0 # Not using "eval" because the variables are not stored in global scope
var[1,1] = 3.0
@ -59,8 +61,8 @@ end
end
@testset "Test non commutative interpretation" begin
var = Matrix{Float64}(undef, 2, 1)
param = Vector{Vector{Float64}}(undef, 1)
var = Matrix{Float32}(undef, 2, 1)
param = Vector{Vector{Float32}}(undef, 1)
expectedResult = -2.0 # Not using "eval" because the variables are not stored in global scope
var[1,1] = 3.0
@ -89,8 +91,8 @@ end
end
@testset "Test single value operator interpretation" begin
var = Matrix{Float64}(undef, 1, 1)
param = Vector{Vector{Float64}}(undef, 1)
var = Matrix{Float32}(undef, 1, 1)
param = Vector{Vector{Float32}}(undef, 1)
expectedResult = 3.0 # Not using "eval" because the variables are not stored in global scope
var[1,1] = -3.0
@ -108,8 +110,8 @@ end
end
@testset "Test complex expressions" begin
var = Matrix{Float64}(undef, 2, 2)
param = Vector{Vector{Float64}}(undef, 2)
var = Matrix{Float32}(undef, 2, 2)
param = Vector{Vector{Float32}}(undef, 2)
# var set 1
var[1,1] = 3.0

View File

@ -3,8 +3,8 @@ using .ExpressionProcessing
using .Transpiler
expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float64}(undef, 2,2)
parameters = Vector{Vector{Float64}}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
parameters = Vector{Vector{Float32}}(undef, 2)
# Resulting value should be 15/7 (approx. 2.142857) for the first expression
expressions[1] = :(1 + 3 * 5 / 7 - 1)
@ -13,8 +13,8 @@ variables[1,1] = 2.0
variables[2,1] = 3.0
variables[1,2] = 0.0
variables[2,2] = 5.0
parameters[1] = Vector{Float64}(undef, 1)
parameters[2] = Vector{Float64}(undef, 2)
parameters[1] = Vector{Float32}(undef, 1)
parameters[2] = Vector{Float32}(undef, 2)
parameters[1][1] = 5.0
parameters[2][1] = 5.0
parameters[2][2] = 0.0