Compare commits: e33be8f59e ... initial-im

11 commits:
1dc0c1898d
ad175abac0
690ee33db1
effd477558
9df78ca72e
561b37160b
eaee21ca75
baa37ea183
db02e9f90f
f4f39ec47c
942adb8612
@@ -1,13 +1,19 @@
 name = "ExpressionExecutorCuda"
 uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe"
-authors = ["Daniel Wiplinger"]
+authors = ["Daniel Roth"]
 version = "1.0.0-DEV"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
+LinearAlgebra = "1.11.0"
+Printf = "1.11.0"
+Random = "1.11.0"
 julia = "1.6.7"
 
 [extras]
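The added dependencies are Julia standard libraries. A hypothetical way to record them from the package directory would be the Pkg session below; the [compat] bounds still have to be set separately (by hand or with Pkg.compat):

using Pkg
Pkg.activate(".")
Pkg.add(["LinearAlgebra", "Printf", "Random"])   # standard libraries; records the [deps] UUIDs shown above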
package/src/Code.jl (new file, 207 lines)
@@ -0,0 +1,207 @@
using Printf

@enum Opcode::UInt8 begin
    opc_stop = 1 # must start with 1 here TODO: remove stop
    opc_add
    opc_sub
    opc_mul
    opc_div
    opc_inv
    opc_log
    opc_log10
    opc_exp
    opc_pow
    opc_powconst
    opc_powabs
    opc_neg
    opc_abs
    opc_sign
    opc_sin
    opc_asin
    opc_tan
    opc_tanh
    opc_cos
    opc_cosh
    opc_constant
    opc_param
    opc_variable
end

const terminal_opcodes = [opc_stop, opc_constant, opc_param, opc_variable]
const unary_opcodes = [opc_log, opc_log10, opc_exp, opc_abs, opc_sign, opc_sin, opc_cos, opc_cosh, opc_asin, opc_tan, opc_tanh, opc_powconst, opc_neg, opc_inv]
const binary_opcodes = [opc_add, opc_sub, opc_mul, opc_div, opc_pow, opc_powabs]

function opcode(sy::Symbol)::Opcode
    if sy == :+ return opc_add
    elseif sy == :- return opc_sub
    elseif sy == :* return opc_mul
    elseif sy == :/ return opc_div
    elseif sy == :inv return opc_inv
    elseif sy == :log return opc_log
    elseif sy == :log10 return opc_log10
    elseif sy == :exp return opc_exp
    elseif sy == :^ return opc_powabs # TODO: this is temporary to enforce that all powers are evaluated as pow(abs(...)) for parameter optimization
    elseif sy == :powabs return opc_powabs # TODO: this is temporary to enforce that all powers are evaluated as pow(abs(...)) for parameter optimization
    elseif sy == :abs return opc_abs
    elseif sy == :sign return opc_sign
    elseif sy == :sin return opc_sin
    elseif sy == :asin return opc_asin
    elseif sy == :cos return opc_cos
    elseif sy == :cosh return opc_cosh
    elseif sy == :tan return opc_tan
    elseif sy == :tanh return opc_tanh
    else error("no opcode for symbol $sy")
    end
end

function degree(opc::Opcode)::Integer
    if opc in terminal_opcodes return 0
    elseif opc in unary_opcodes return 1
    elseif opc in binary_opcodes return 2
    else error("unknown degree of opcode $opc")
    end
end


# code is a Vector{Instruction} which is a linear representation of a directed acyclic graph of expressions.
# The code can be evaluated from left to right.
struct Instruction{T}
    opcode::Opcode
    arg1idx::UInt32 # index of first argument. 0 for terminals
    arg2idx::UInt32 # index of second argument. 0 for functions with a single argument
    idx::UInt32 # for variables and parameters
    val::T # for constants
end


function Base.show(io::IO, instr::Instruction)
    Printf.format(io, Printf.format"%15s %3d %3d %3d %f", instr.opcode, instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
end

create_const_instruction(val::T) where {T} = Instruction{T}(opc_constant, UInt32(0), UInt32(0), UInt32(0), val)
create_var_instruction(::Type{T}, varidx) where {T} = Instruction{T}(opc_variable, UInt32(0), UInt32(0), UInt32(varidx), zero(T))
create_param_instruction(::Type{T}, paramidx; val::T = zero(T)) where {T} = Instruction{T}(opc_param, UInt32(0), UInt32(0), UInt32(paramidx), val)


function convert_expr_to_code(::Type{T}, expr::Expr)::Vector{Instruction{T}} where {T}
    code = Vector{Instruction{T}}()

    Base.remove_linenums!(expr)
    paramTup = expr.args[1]
    xSy = paramTup.args[1]
    pSy = paramTup.args[2]
    body = expr.args[2]

    cache = Dict{Any,Int32}() # for de-duplication of expressions. If an expression is in the cache simply return the index of the existing code

    convert_expr_to_code!(code, cache, body, xSy, pSy)

    # for debugging
    # for tup in sort(cache; byvalue=true)
    #     println(tup)
    # end
    return code
end

# uses cache (hashcons) to de-duplicate subexpressions in the tree.
function convert_expr_to_code!(code::Vector{Instruction{T}}, cache, val::TV, xSy, pSy)::UInt32 where {T,TV}
    if haskey(cache, val) return cache[val] end

    push!(code, create_const_instruction(T(val)))
    cache[val] = length(code)
    return length(code)
end

function convert_expr_to_code!(code::Vector{Instruction{T}}, cache, expr::Expr, xSy, pSy)::UInt32 where {T}
    # predicate to check if an expression is abs(...)
    is_abs(a) = a isa Expr && a.head == :call && a.args[1] == :abs

    if haskey(cache, expr) return cache[expr] end

    sy = expr.head
    if sy == :call
        func = expr.args[1]
        arg1idx::UInt32 = 0
        arg2idx::UInt32 = 0
        # unary functions
        if length(expr.args) == 2
            arg1idx = convert_expr_to_code!(code, cache, expr.args[2], xSy, pSy)
            if (func == :-)
                # - with one argument => negate
                push!(code, Instruction{T}(opc_neg, arg1idx, UInt32(0), UInt32(0), zero(T)))
            elseif (func == :sqrt)
                push!(code, Instruction{T}(opc_powconst, arg1idx, UInt32(0), UInt32(0), T(0.5)))
            else
                push!(code, Instruction{T}(opcode(func), arg1idx, UInt32(0), UInt32(0), zero(T)))
            end
        elseif length(expr.args) == 3
            arg1idx = convert_expr_to_code!(code, cache, expr.args[2], xSy, pSy)
            if func == :^ && expr.args[3] isa Number && round(expr.args[3]) == expr.args[3] # is integer
                # special case for constant powers
                push!(code, Instruction{T}(opc_powconst, arg1idx, UInt32(0), UInt32(0), T(expr.args[3])))
            elseif func == :^ && is_abs(expr.args[2])
                # fuse pow(abs(x), y) --> powabs(x,y)
                absexpr = expr.args[2]
                x = absexpr.args[2]
                arg1idx = convert_expr_to_code!(code, cache, x, xSy, pSy) # because of hashconsing this will return the index within the code for abs(x) generated above
                arg2idx = convert_expr_to_code!(code, cache, expr.args[3], xSy, pSy)
                push!(code, Instruction{T}(opc_powabs, arg1idx, arg2idx, UInt32(0), zero(T)))
            else
                arg2idx = convert_expr_to_code!(code, cache, expr.args[3], xSy, pSy)
                push!(code, Instruction{T}(opcode(func), arg1idx, arg2idx, UInt32(0), zero(T)))
            end
        else
            # dump(expr)
            error("only unary and binary functions are supported ($func is not supported)")
        end
    elseif sy == :ref
        arrSy = expr.args[1]
        idx = expr.args[2]
        if arrSy == xSy
            push!(code, create_var_instruction(T, idx))
        elseif arrSy == pSy
            push!(code, create_param_instruction(T, idx))
        else
            dump(expr)
            throw(UndefVarError("unknown symbol"))
        end
    else
        error("Unsupported symbol $sy")
    end

    cache[expr] = length(code)
    return length(code)
end


function Base.show(io::IO, code::AbstractArray{Instruction{T}}) where {T}
    sym = Dict(
        opc_stop => ".",
        opc_add => "+",
        opc_sub => "-",
        opc_neg => "neg",
        opc_mul => "*",
        opc_div => "/",
        opc_inv => "inv",
        opc_pow => "^",
        opc_powabs => "abs^",
        opc_powconst => "^c",
        opc_log => "log",
        opc_log10 => "l10",
        opc_exp => "exp",
        opc_abs => "abs",
        opc_sign => "sgn",
        opc_sin => "sin",
        opc_cos => "cos",
        opc_variable => "var",
        opc_constant => "con",
        opc_param => "par",
    )

    for i in eachindex(code)
        instr = code[i]
        Printf.format(io, Printf.format"%4d %4s %3d %3d %3d %f", i, sym[instr.opcode], instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
        println(io)
        # printfmtln(io, "{1:>4d} {2:>4s} {3:>3d} {4:>3d} {5:>3d} {6:>}", i, sym[instr.opcode], instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
    end
end
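To illustrate the linear instruction code described in the comments of Code.jl, here is a small hand-built tape. This sketch is not part of the changeset and only assumes Code.jl has been included into the current scope (the module file below wraps it in a CpuInterpreter submodule):

# Hand-built tape for x1 + p1; index fields refer to positions in the tape itself.
tape = [
    create_var_instruction(Float32, 1),                                    # 1: x1
    create_param_instruction(Float32, 1; val = 2.0f0),                     # 2: p1 (with a value)
    Instruction{Float32}(opc_add, UInt32(1), UInt32(2), UInt32(0), 0.0f0), # 3: tape[1] + tape[2]
]
show(stdout, tape)   # prints one row per instruction via the Base.show overload above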
package/src/CpuInterpreter.jl (new file, 172 lines)
@@ -0,0 +1,172 @@
using Random

struct InterpreterBuffers{T}
    resultcache::Matrix{T} # for forward eval
    diffcache::Matrix{T} # for reverse AD
    jaccache::Matrix{T} # for Jacobian
    tmp::Vector{T} # a temporary space for each of the vector operations

    function InterpreterBuffers{T}(codelen, num_param, batchsize) where {T<:AbstractFloat}
        buf = Matrix{T}(undef, batchsize, codelen)
        rev_buf = Matrix{T}(undef, batchsize, codelen)
        jac_buf = Matrix{T}(undef, batchsize, num_param)
        tmp = Vector{T}(undef, batchsize)

        new(buf, rev_buf, jac_buf, tmp)
    end
end

mutable struct Interpreter{T}
    const code::Vector{Instruction{T}}
    const buffers::InterpreterBuffers{T}
    const batchsize::UInt32
    pc::Int32

    function Interpreter{T}(expr::Expr, num_param; batchsize = 1024) where {T<:AbstractFloat}
        code = convert_expr_to_code(T, expr)
        # println(code)
        buffers = InterpreterBuffers{T}(length(code), num_param, batchsize)
        new(code, buffers, batchsize, 1)
    end
end

peek_instruction(interpreter) = interpreter.code[interpreter.pc]



# batch size 1024 was fast in benchmark
interpret!(result::AbstractVector{T}, expr::Expr, x::AbstractMatrix{T}, p; batchsize=1024) where {T} = interpret!(result, Interpreter{T}(expr, length(p); batchsize), x, p)

# for Float evaluation use the preallocated buffer
function interpret!(result::AbstractVector{T}, interpreter::Interpreter{T}, x::AbstractMatrix{T}, p::AbstractArray{T}) where {T}
    interpret_withbuf!(result, interpreter, interpreter.buffers.resultcache, interpreter.buffers.tmp, x, p)
end

function interpret_withbuf!(result::AbstractVector{T}, interpreter::Interpreter{T}, batchresult, tmp, x::AbstractMatrix{T}, p::AbstractArray{TD}) where {T,TD}
    allrows = axes(x, 1)
    @assert length(result) == length(allrows)


    # all batches
    start = first(allrows)
    while start + interpreter.batchsize < last(allrows)
        batchrows = start:(start + interpreter.batchsize - 1)
        interpret_batch!(interpreter, batchresult, tmp, x, p, batchrows)
        copy!((@view result[batchrows]), (@view batchresult[:, end]))
        start += interpreter.batchsize
    end


    # process remaining rows
    remrows = start:last(allrows)
    if length(remrows) > 0
        interpret_batch!(interpreter, batchresult, tmp, x, p, remrows)
        copy!((@view result[remrows]), (@view batchresult[1:length(remrows), end]))
        # res += sum(view(batchresult, 1:length(remrows), lastcolidx))
    end
    # res
    result
end

function interpret_batch!(interpreter,
    batchresult, tmp,
    x, p, rows)
    # forward pass
    interpret_fwd!(interpreter, batchresult, tmp, x, p, rows)

    nothing
end

function interpret_fwd!(interpreter, batchresult, tmp, x, p, rows)
    interpreter.pc = 1
    while interpreter.pc <= length(interpreter.code)
        step!(interpreter, batchresult, tmp, x, p, rows)
    end
end


function step!(interpreter, batchresult, tmp, x, p, range)
    instr = interpreter.code[interpreter.pc]
    opc = instr.opcode
    res = view(batchresult, :, interpreter.pc)

    if degree(opc) == 0
        if opc == opc_variable
            copyto!(res, view(x, range, instr.idx))
        elseif opc == opc_param
            fill!(res, p[instr.idx])
        elseif opc == opc_constant
            fill!(res, instr.val)
        end
    elseif degree(opc) == 1
        arg = view(batchresult, :, instr.arg1idx)
        # is converted to a switch automatically by LLVM
        if opc == opc_log vec_log!(res, arg, tmp)
        elseif opc == opc_log10 vec_log10!(res, arg, tmp)
        elseif opc == opc_exp vec_exp!(res, arg, tmp)
        elseif opc == opc_abs vec_abs!(res, arg, tmp)
        elseif opc == opc_neg vec_neg!(res, arg, tmp)
        elseif opc == opc_inv vec_inv!(res, arg, tmp)
        elseif opc == opc_sign vec_sign!(res, arg, tmp)
        elseif opc == opc_powconst vec_powconst!(res, arg, instr.val, tmp);
        elseif opc == opc_sin vec_sin!(res, arg, tmp)
        elseif opc == opc_cos vec_cos!(res, arg, tmp)
        elseif opc == opc_cosh vec_cosh!(res, arg, tmp)
        elseif opc == opc_asin vec_asin!(res, arg, tmp)
        elseif opc == opc_tan vec_tan!(res, arg, tmp)
        elseif opc == opc_tanh vec_tanh!(res, arg, tmp)

        else throw(DomainError("Unsupported opcode $opc"))
        end
    elseif degree(opc) == 2
        left = view(batchresult, :, instr.arg1idx)
        right = view(batchresult, :, instr.arg2idx)

        if opc == opc_add vec_add!(res, left, right, tmp)
        elseif opc == opc_sub vec_sub!(res, left, right, tmp)
        elseif opc == opc_mul vec_mul!(res, left, right, tmp)
        elseif opc == opc_div vec_div!(res, left, right, tmp)
        elseif opc == opc_pow vec_pow!(res, left, right, tmp)
        elseif opc == opc_powabs vec_powabs!(res, left, right, tmp)
        else throw(DomainError("Unsupported opcode $opc"))
        end
        # if any(isnan, res)
        #     throw(DomainError("got NaN for $opc $(interpreter.pc) $left $right"))
        # end
    end

    interpreter.pc += 1

    return nothing
end


for unaryfunc in (:exp, :abs, :sin, :cos, :cosh, :asin, :tan, :tanh, :sinh)
    funcsy = Symbol("vec_$(unaryfunc)!")
    @eval function $funcsy(res::AbstractVector{T}, arg::AbstractVector{T}, ::AbstractVector{T}) where T<:Real
        @simd for i in eachindex(res)
            @inbounds res[i] = Base.$unaryfunc(arg[i])
        end
    end
end


function vec_add!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] + right[i] end end
function vec_sub!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] - right[i] end end
function vec_mul!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] * right[i] end end
function vec_div!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] / right[i] end end
function vec_pow!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] ^ right[i] end end

# TODO: special case scalar power
function vec_powconst!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::TC, ::AbstractVector{TE}) where {TE<:Real,TC<:Real} @simd for i in eachindex(res) @inbounds res[i] = left[i] ^ right end end
function vec_powabs!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = abs(left[i]) ^ right[i] end end

function vec_neg!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = -arg[i] end end
function vec_inv!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = inv(arg[i]) end end
function vec_sign!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = sign(arg[i]) end end

# handle log and exp specially to use NaN instead of DomainError
function vec_log!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = arg[i] < zero(TE) ? TE(NaN) : log(arg[i]) end end
function vec_log10!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = arg[i] < zero(TE) ? TE(NaN) : log10(arg[i]) end end
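A minimal, illustrative check of the vec_* kernels above; this is not part of the changeset and assumes Code.jl and CpuInterpreter.jl have been included into the current scope:

res = zeros(Float32, 4)
tmp = zeros(Float32, 4)   # scratch argument required by every vec_* kernel

vec_add!(res, Float32[1, 2, 3, 4], Float32[10, 20, 30, 40], tmp)
# res == Float32[11, 22, 33, 44]

vec_log!(res, Float32[-1, 1, 2, 10], tmp)
# res[1] is NaN instead of raising a DomainError, as the comment above notes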
@@ -1,29 +1,75 @@
 module ExpressionExecutorCuda
+include("Utils.jl")
 include("ExpressionProcessing.jl")
 include("Interpreter.jl")
+include("Transpiler.jl")
+
-export interpret_gpu
+module CpuInterpreter
+include("Code.jl")
+include("CpuInterpreter.jl")
+end
+
+export interpret_gpu,interpret_cpu
 export evaluate_gpu
 export test
 
 # Some assertions:
 # Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
+# Matrix X is column major
 # each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
 # This assertion is made, because in julia, the first index doesn't have to be 1
 #
 
 # Evaluate Expressions on the GPU
-function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
-    exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
+function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+    @assert axes(exprs) == axes(p)
+    ncols = size(X, 2)
+
+    results = Matrix{Float32}(undef, ncols, length(exprs))
+
+    for i in 1:repetitions # Simulate parameter tuning
+        results = Interpreter.interpret(exprs, X, p)
+    end
+
+    return results
 end
 
 # Convert Expressions to PTX Code and execute that instead
-function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
-    # Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
+function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+    @assert axes(exprs) == axes(p)
+    ncols = size(X, 2)
+
+    results = Matrix{Float32}(undef, ncols, length(exprs))
+
+    for i in 1:repetitions # Simulate parameter tuning
+        results = Transpiler.evaluate(exprs, X, p)
+    end
+
+    return results
 end
 
-end
+# Evaluate Expressions on the CPU
+function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
+    @assert axes(exprs) == axes(p)
+    nrows = size(X, 1)
+
+    # each column of the matrix has the result for an expr
+    res = Matrix{Float32}(undef, nrows, length(exprs))
+
+    for i in eachindex(exprs)
+        # The interpreter holds the postfix code and buffers for evaluation. It is costly to create
+        interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
+
+        # If an expression has to be evaluated multiple times (e.g. for different parameters),
+        # it is worthwhile to reuse the interpreter to reduce the number of allocations
+        for rep in 1:repetitions
+            CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
+        end
+    end
+
+    res
+end
 
 # Flow
@@ -35,3 +81,5 @@ end
 # The following can be done on the CPU
 # convert expression to postfix notation (mandatory)
 # optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)
+
+end
@@ -71,6 +71,10 @@ function get_operator(op::Symbol)::Operator
         return EXP
     elseif op == :sqrt
         return SQRT
+    elseif op == :powabs
+        return POWER # TODO: Fix this
+    else
+        throw("Operator unknown")
     end
 end
 
@@ -2,6 +2,7 @@ module Interpreter
 using CUDA
 using StaticArrays
 using ..ExpressionProcessing
+using ..Utils
 
 export interpret
 
@@ -11,19 +12,25 @@ export interpret
 - variables::Matrix{Float32} : The variables to use. Each column is mapped to the variables x1..xn
 - parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
 "
-function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+
+    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
+    for i in eachindex(expressions)
+        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
+    end
+
     variableCols = size(variables, 2) # number of variable sets to use for each expression
     cudaVars = CuArray(variables)
-    cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
-    cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
+    cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
+    cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
     # put into seperate cuArray, as this is static and would be inefficient to send seperatly to every kernel
-    cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
+    cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
 
     # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
-    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
+    cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
 
     # Start kernel for each expression to ensure that no warp is working on different expressions
-    for i in eachindex(expressions)
+    for i in eachindex(exprs)
         kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
         config = launch_configuration(kernel.fun)
         threads = min(variableCols, config.threads)
@@ -38,19 +45,23 @@ end
 #TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
-    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x
-    stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
+    varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
+    # stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
+    variableCols = length(variables) / stepsize[3]
+
+    if varSetIndex > variableCols
+        return
+    end
 
     firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
     lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
    firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
-    variableCols = length(variables) / stepsize[3]
 
     operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
     operationStackTop = 0 # stores index of the last defined/valid value
 
-    for varSetIndex in index:stride
-        firstVariableIndex = ((varSetIndex - 1) * stepsize[3]) # Exclusive
+    # for varSetIndex in index:stride
+    firstVariableIndex = ((varSetIndex-1) * stepsize[3]) # Exclusive
 
     for i in firstExprIndex:lastExprIndex
         if expressions[i].Type == EMPTY
@@ -62,7 +73,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
             if val > 0
                 operationStack[operationStackTop] = variables[firstVariableIndex + val]
             else
-                val = -val
+                val = abs(val)
                 operationStack[operationStackTop] = parameters[firstParamIndex + val]
             end
         elseif expressions[i].Type == FLOAT32
@@ -103,49 +114,9 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
         # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
         resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
         results[resultIndex] = operationStack[operationStackTop]
-    end
+    # end
 
     return
 end
-
-
-"Retrieves the number of entries for the largest inner vector"
-function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
-    maxLength = 0
-    @inbounds for i in eachindex(vec)
-        if length(vec[i]) > maxLength
-            maxLength = length(vec[i])
-        end
-    end
-
-    return maxLength
-end
-
-"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
-function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
-    dataCols = get_max_inner_length(data)
-    dataRows = length(data)
-    dataMat = convert_to_matrix(data, invalidElement)
-    cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions
-    copyto!(cudaArr, dataMat)
-
-    return cudaArr
-end
-
-"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.
-
-All entries that cannot be filled have ```invalidElement``` as their value
-"
-function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
-    vecCols = get_max_inner_length(vec)
-    vecRows = length(vec)
-    vecMat = fill(invalidElement, vecCols, vecRows)
-
-    for i in eachindex(vec)
-        vecMat[:,i] = copyto!(vecMat[:,i], vec[i])
-    end
-
-    return vecMat
-end
-
 end
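A worked example (with made-up numbers) of the flattened result index used by interpret_expression above:

variableCols = 1000                  # number of variable sets = columns of `variables`
exprIndex, varSetIndex = 3, 42       # both 1-based, as in the kernel
resultIndex = (exprIndex - 1) * variableCols + varSetIndex
# resultIndex == 2042; in the column-major cudaResults of size (variableCols, #expressions)
# this is exactly cudaResults[42, 3], i.e. row = variable set, column = expression.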
@@ -1,55 +1,95 @@
 module Transpiler
 using CUDA
 using ..ExpressionProcessing
+using ..Utils
+
+# Number of threads per block/SM + max number of registers
 # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
+# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
+# One SM can have 64*1024 32-bit registers at max
+# One thread can at max use 255 registers
+# Meaning one has access to at most 32 registers in the worst case. Using 64 bit values this number gets halfed (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
+
+# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
+
+# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
+# This states, that using fewer registers allows more threads to reside on a single SM which improves performance.
+# So I could use more registers at the expense for performance. Depending on how this would simplify my algorithm, I might do this and leave more optimisation to future work
+
+# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
+# With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case
+
+#
+# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
+# Note: Maybe make an additional function that transpiles and executed the code. This would then be the function the user calls
+#
+
+const BYTES = sizeof(Float32)
 const Operand = Union{Float32, String} # Operand is either fixed value or register
+cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
 
-function evaluate(expression::ExpressionProcessing.PostfixType, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
-    # TODO: think of how to do this. Probably get all expressions. Transpile them in parallel and then execute the generatd code.
-    cudaVars = CuArray(variables)
-
-    #kernel = transpile(expression, )
-    # execute kernel.
+function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
+    varRows = size(variables, 1)
+    variableCols = size(variables, 2)
+    kernels = Vector{CuFunction}(undef, length(expressions))
+
+    # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
+    # Threads.@threads for i in eachindex(expressions)
+    #     # TODO: Use cache
+    #     kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters))
+
+    #     linker = CuLink()
+    #     add_data!(linker, "ExpressionProcessing", kernel)
+
+    #     image = complete(linker)
+
+    #     mod = CuModule(image)
+    #     kernels[i] = CuFunction(mod, "ExpressionProcessing")
+    # end
+
+    for i in eachindex(expressions)
+        if haskey(cache, expressions[i])
+            kernels[i] = cache[expressions[i]]
+            continue
+        end
+
+        formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
+        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
+
+        linker = CuLink()
+        add_data!(linker, "ExpressionProcessing", kernel)
+
+        image = complete(linker)
+
+        mod = CuModule(image)
+        kernels[i] = CuFunction(mod, "ExpressionProcessing")
+        cache[expressions[i]] = kernels[i]
+    end
+
+    cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
+    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
+
+    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
+    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
+
+    # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
+    for i in eachindex(kernels)
+        config = launch_configuration(kernels[i])
+        threads = min(variableCols, config.threads)
+        blocks = cld(variableCols, threads)
+
+        cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+    end
+
+    return cudaResults
 end
 
 # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
 # seekstart(buf1); write(buf2, buf1)
-function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String
+"
+- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
+- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent oon the longest parameter set
+- param ```expressionIndex```: The 0-based index of the expression
+"
+function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
+    nrOfVariableSets::Integer, expressionIndex::Integer)::String
     exitJumpLocationMarker = "\$L__BB0_2"
     ptxBuffer = IOBuffer()
+    regManager = Utils.RegisterManager(Dict(), Dict())
 
     # TODO: Suboptimal solution
-    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Int32, Float32, Float32]) # nrOfVarSets, Vars, Params
-    guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, "%parameter0") # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
+    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
+    guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
 
     println(ptxBuffer, get_cuda_header())
     println(ptxBuffer, signature)
     println(ptxBuffer, "{")
 
-    calc_code = generate_calculation_code(expression, "%parameter1", varSetSize, "%parameter2", paramSetSize, threadIdReg)
-    println(ptxBuffer, get_register_definitions())
+    calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
+        threadId64Reg, expressionIndex, nrOfVariableSets, regManager)
+    println(ptxBuffer, Utils.get_register_definitions(regManager))
     println(ptxBuffer, paramLoading)
     println(ptxBuffer, guardClause)
     println(ptxBuffer, calc_code)
@@ -59,20 +99,23 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
 	println(ptxBuffer, "}")
 
 	generatedCode = String(take!(ptxBuffer))
-	println(generatedCode)
 	return generatedCode
 end
 
 # TODO: Make version, target and address_size configurable; also see what address_size means exactly
 function get_cuda_header()::String
 	return "
-.version 7.1
+.version 8.5
 .target sm_61
-.address_size 32
+.address_size 64
 "
 end
 
-function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::Tuple{String, String}
+"
+param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result
+"
+function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, regManager::Utils.RegisterManager)::Tuple{String, String}
+
 	signatureBuffer = IOBuffer()
 	paramLoadingBuffer = IOBuffer()
 	print(signatureBuffer, ".visible .entry ")
@@ -80,11 +123,11 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
 	println(signatureBuffer, "(")
 
 	for i in eachindex(parameters)
-		print(signatureBuffer, " .param .u32", " ", "param_", i)
+		print(signatureBuffer, " .param .u64", " ", "param_", i)
 
-		parametersReg = get_next_free_register("r")
-		println(paramLoadingBuffer, "ld.param.u32 $parametersReg, [param_$i];")
-		println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersReg;")
+		parametersLocation = Utils.get_next_free_register(regManager, "rd")
+		println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
+		println(paramLoadingBuffer, "cvta.to.global.u64 $(Utils.get_next_free_register(regManager, "parameter")), $parametersLocation;")
 		if i != lastindex(parameters)
 			println(signatureBuffer, ",")
 		end
@ -99,36 +142,45 @@ Constructs the PTX code used for handling the case where too many threads are st
|
|||||||
|
|
||||||
- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
|
- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
|
||||||
"
|
"
|
||||||
function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)::Tuple{String, String}
|
function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::Tuple{String, String}
|
||||||
guardBuffer = IOBuffer()
|
guardBuffer = IOBuffer()
|
||||||
|
|
||||||
threadIds = get_next_free_register("r")
|
threadIds = Utils.get_next_free_register(regManager, "r")
|
||||||
threadsPerCTA = get_next_free_register("r")
|
threadsPerCTA = Utils.get_next_free_register(regManager, "r")
|
||||||
currentThreadId = get_next_free_register("r")
|
currentThreadId = Utils.get_next_free_register(regManager, "r")
|
||||||
|
|
||||||
# load data into above defined registers
|
|
||||||
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
|
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
|
||||||
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
|
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
|
||||||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||||
|
|
||||||
globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
|
globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
|
||||||
breakCondition = get_next_free_register("p")
|
breakCondition = Utils.get_next_free_register(regManager, "p")
|
||||||
nrOfVarSets = get_next_free_register("i")
|
|
||||||
println(guardBuffer, "ld.global.u32 $nrOfVarSets, [$nrOfVarSetsRegister];")
|
|
||||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||||
println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||||
|
|
||||||
# branch to end if breakCondition is true
|
# branch to end if breakCondition is true
|
||||||
print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||||
|
|
||||||
return (String(take!(guardBuffer)), globalThreadId)
|
# Convert threadIdReg to a 64 bit register. Not 64 bit from the start, as this would take up more registers. Performance tests can be performed to determin if it is faster doing this, or making everything 64-bit from the start
|
||||||
|
threadId64Reg = Utils.get_next_free_register(regManager, "rd")
|
||||||
|
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
|
||||||
|
|
||||||
|
return (String(take!(guardBuffer)), threadId64Reg)
|
||||||
end
|
end
|
||||||
|
|
||||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer,
|
"
|
||||||
parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String
|
- param ```parametersSetSize```: Size of the largest parameter set
|
||||||
|
"
|
||||||
|
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
|
||||||
|
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
|
||||||
|
threadId64Reg::String, expressionIndex::Integer, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::String
|
||||||
|
|
||||||
codeBuffer = IOBuffer()
|
codeBuffer = IOBuffer()
|
||||||
operands = Vector{Operand}()
|
operands = Vector{Operand}()
|
||||||
|
|
||||||
|
exprId64Reg = Utils.get_next_free_register(regManager, "rd")
|
||||||
|
println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")
|
||||||
|
|
||||||
for i in eachindex(expression)
|
for i in eachindex(expression)
|
||||||
token = expression[i]
|
token = expression[i]
|
||||||
|
|
||||||
@ -144,47 +196,57 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
|
|||||||
else
|
else
|
||||||
left = pop!(operands)
|
left = pop!(operands)
|
||||||
end
|
end
|
||||||
operation, resultRegister = get_operation(operator, left, right)
|
operation, resultRegister = get_operation(operator, regManager, left, right)
|
||||||
|
|
||||||
println(codeBuffer, operation)
|
println(codeBuffer, operation)
|
||||||
push!(operands, resultRegister)
|
push!(operands, resultRegister)
|
||||||
elseif token.Type == INDEX
|
elseif token.Type == INDEX
|
||||||
if token.Value > 0 # varaibles
|
if token.Value > 0 # varaibles
|
||||||
var, first_access = get_register_for_name("x$(token.Value)")
|
var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
|
||||||
if first_access
|
if first_access
|
||||||
println(codeBuffer, load_into_register(var, variablesReg, token.Value, threadIdReg, variablesSetSize))
|
println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
|
||||||
end
|
end
|
||||||
push!(operands, var)
|
push!(operands, var)
|
||||||
else
|
else
|
||||||
absVal = abs(token.Value)
|
absVal = abs(token.Value)
|
||||||
param, first_access = get_register_for_name("p$absVal")
|
param, first_access = Utils.get_register_for_name(regManager, "p$absVal")
|
||||||
if first_access
|
if first_access
|
||||||
println(codeBuffer, load_into_register(param, parametersReg, absVal, threadIdReg, parametersSetSize))
|
println(codeBuffer, load_into_register(param, parametersLocation, absVal, exprId64Reg, parametersSetSize, regManager))
|
||||||
end
|
end
|
||||||
push!(operands, param)
|
push!(operands, param)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
tempReg = Utils.get_next_free_register(regManager, "rd")
|
||||||
|
# reg = pop!(operands)
|
||||||
|
# tmp = "abs.f32 $(reg), 16.0;"
|
||||||
|
# push!(operands, reg)
|
||||||
|
println(codeBuffer, "
|
||||||
|
add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
|
||||||
|
mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
|
||||||
|
st.global.f32 [$tempReg], $(pop!(operands));
|
||||||
|
")
|
||||||
|
|
||||||
return String(take!(codeBuffer))
|
return String(take!(codeBuffer))
|
||||||
end
|
end
|
||||||
|
|
||||||
"
|
"
|
||||||
|
Loads a value from a location into the given register. It is assumed that the location refers to a column-major matrix
|
||||||
|
|
||||||
- param ```register```: The register where the loaded value will be stored
|
- param ```register```: The register where the loaded value will be stored
|
||||||
- param ```loadLocation```: The location from where to load the value
|
- param ```loadLocation```: The location from where to load the value
|
||||||
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set
|
- param ```valueIndex```: 1-based index of the value in the variable set/parameter set
|
||||||
- param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
|
- param ```setIndexReg64```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
|
||||||
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```
|
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
|
||||||
"
|
"
|
||||||
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String
|
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg64::String, setSize::Integer, regManager::Utils.RegisterManager)::String
|
||||||
# loadLocation + startIndex + valueIndex * bytes (4 in our case)
|
tempReg = Utils.get_next_free_register(regManager, "rd")
|
||||||
# startIndex: setIndex * setSize
|
|
||||||
tempReg = get_next_free_register("i")
|
# "mad" calculates the offset and "add" applies the offset. Classical pointer arithmetic for accessing values of an array like in C
|
||||||
# we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes
|
|
||||||
return "
|
return "
|
||||||
mul.lo.u32 $tempReg, $setIndexReg, $setSize;
|
mad.lo.u64 $tempReg, $setIndexReg64, $(setSize*BYTES), $((valueIndex - 1) * BYTES);
|
||||||
add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex));
|
add.u64 $tempReg, $loadLocation, $tempReg;
|
||||||
add.u32 $tempReg, $loadLocation, $tempReg;
|
|
||||||
ld.global.f32 $register, [$tempReg];"
|
ld.global.f32 $register, [$tempReg];"
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -200,8 +262,8 @@ function type_to_ptx_type(type::DataType)::String
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
function get_operation(operator::Operator, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
|
function get_operation(operator::Operator, regManager::Utils.RegisterManager, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
|
||||||
resultRegister = get_next_free_register("f")
|
resultRegister = Utils.get_next_free_register(regManager, "f")
|
||||||
resultCode = ""
|
resultCode = ""
|
||||||
|
|
||||||
if is_binary_operator(operator) && isnothing(right)
|
if is_binary_operator(operator) && isnothing(right)
|
||||||
@ -219,6 +281,7 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
|
|||||||
elseif operator == POWER
|
elseif operator == POWER
|
||||||
# x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
|
# x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
|
||||||
resultCode = "
|
resultCode = "
|
||||||
|
// x^y:
|
||||||
lg2.approx.f32 $resultRegister, $left;
|
lg2.approx.f32 $resultRegister, $left;
|
||||||
mul.f32 $resultRegister, $right, $resultRegister;
|
mul.f32 $resultRegister, $right, $resultRegister;
|
||||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||||
@@ -227,11 +290,13 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
     elseif operator == LOG
         # log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
         resultCode = "
+        // log(x):
         lg2.approx.f32 $resultRegister, $left;
         mul.f32 $resultRegister, $resultRegister, 0.693147182;"
     elseif operator == EXP
         # e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
         resultCode = "
+        // e^x:
         mul.f32 $resultRegister, $left, 1.44269502;
         ex2.approx.f32 $resultRegister, $resultRegister;"
     elseif operator == SQRT
@@ -242,68 +307,5 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
     return (resultCode, resultRegister)
 end
 
-let registers = Dict() # stores the count of the register already used.
-    global get_next_free_register
-    global get_register_definitions
-
-    # By convention these names correspond to the following types:
-    # - p -> pred
-    # - f -> float32
-    # - r -> 32 bit
-    # - var -> float32 (used for variables and params)
-    function get_next_free_register(name::String)::String
-        if haskey(registers, name)
-            registers[name] += 1
-        else
-            registers[name] = 1
-        end
-
-        return string("%", name, registers[name] - 1)
-    end
-
-    function get_register_definitions()::String
-        registersBuffer = IOBuffer()
-
-        for definition in registers
-            regType = ""
-            if definition.first == "p"
-                regType = ".pred"
-            elseif definition.first == "f"
-                regType = ".f32"
-            elseif definition.first == "var"
-                regType = ".f32"
-            elseif definition.first == "param"
-                regType = ".f32"
-            elseif definition.first == "r"
-                regType = ".b32"
-            elseif definition.first == "parameter"
-                regType = ".u32"
-            elseif definition.first == "i"
-                regType = ".u32"
-            else
-                throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
-            end
-            println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
-        end
-
-        return String(take!(registersBuffer))
-    end
-end
-
-let symtable = Dict()
-    global get_register_for_name
-
-    "Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
-    function get_register_for_name(varName::String)
-        if haskey(symtable, varName)
-            return (symtable[varName], false)
-        else
-            reg = get_next_free_register("var")
-            symtable[varName] = reg
-            return (reg, true)
-        end
-    end
-end
 
 end
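The lg2/ex2 sequences emitted above rely on standard base-2 identities; the constants are 0.693147182 ≈ ln 2 and 1.44269502 ≈ 1/ln 2. A quick, illustrative Julia check (not part of the diff):

# Illustrative only (not part of the diff): the identities behind the emitted PTX.
x, y = 3.7f0, 2.3f0
@assert isapprox(2f0^(y * log2(x)), x^y; rtol = 1f-5)            # POWER: x^y == 2^(y*log2(x))
@assert isapprox(log2(x) * 0.693147182f0, log(x); rtol = 1f-5)   # LOG:   log(x) == log2(x)*ln(2)
@assert isapprox(2f0^(x * 1.44269502f0), exp(x); rtol = 1f-5)    # EXP:   e^x == 2^(x/ln(2))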
package/src/Utils.jl (new file, 88 lines)
@@ -0,0 +1,88 @@
module Utils

using CUDA

"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.

All entries that cannot be filled have ```invalidElement``` as their value.
"
function convert_to_matrix(vecs::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    maxLength = get_max_inner_length(vecs)

    # Pad the shorter vectors with the invalidElement
    paddedVecs = [vcat(vec, fill(invalidElement, maxLength - length(vec))) for vec in vecs]
    vecMat = hcat(paddedVecs...)

    return vecMat
end

"Retrieves the number of entries for the largest inner vector"
function get_max_inner_length(vecs::Vector{Vector{T}})::Int where T
    return maximum(length.(vecs))
end

"Returns a CuArray filled with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    dataMat = convert_to_matrix(data, invalidElement)
    cudaArr = CuArray(dataMat)

    return cudaArr
end

struct RegisterManager
    registers::Dict
    symtable::Dict
end

function get_next_free_register(manager::RegisterManager, name::String)::String
    if haskey(manager.registers, name)
        manager.registers[name] += 1
    else
        manager.registers[name] = 1
    end

    return string("%", name, manager.registers[name] - 1)
end

function get_register_definitions(manager::RegisterManager)::String
    registersBuffer = IOBuffer()

    for definition in manager.registers
        regType = ""
        if definition.first == "p"
            regType = ".pred"
        elseif definition.first == "f"
            regType = ".f32"
        elseif definition.first == "var"
            regType = ".f32"
        elseif definition.first == "param"
            regType = ".f32"
        elseif definition.first == "r"
            regType = ".b32"
        elseif definition.first == "rd"
            regType = ".b64"
        elseif definition.first == "parameter"
            regType = ".b64"
        elseif definition.first == "i"
            regType = ".b64"
        else
            throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
        end
        println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
    end

    return String(take!(registersBuffer))
end

"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
function get_register_for_name(manager::RegisterManager, varName::String)
    if haskey(manager.symtable, varName)
        return (manager.symtable[varName], false)
    else
        reg = get_next_free_register(manager, "var")
        manager.symtable[varName] = reg
        return (reg, true)
    end
end

end
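A brief usage sketch of the new module (illustrative only; the RegisterManager(Dict(), Dict()) construction is an assumption, since this diff never shows how the manager is instantiated):

# Illustrative only: padding ragged vectors and allocating PTX registers.
params = [Float32[1.0, 2.0], Float32[3.0]]
mat = Utils.convert_to_matrix(params, NaN32)      # 2x2 Matrix{Float32}; mat[2, 2] is NaN32

mgr = Utils.RegisterManager(Dict(), Dict())       # assumed construction with empty Dicts
r0  = Utils.get_next_free_register(mgr, "f")      # "%f0"
r1  = Utils.get_next_free_register(mgr, "f")      # "%f1"
print(Utils.get_register_definitions(mgr))        # ".reg .f32 %f<2>;"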
package/test/CpuInterpreterTests.jl (new file, 47 lines)
@@ -0,0 +1,47 @@
using LinearAlgebra
using BenchmarkTools

function test_cpu_interpreter(nrows; parallel = false)
    exprs = [
        # CPU interpreter requires an anonymous function and array refs
        :(p[1] * x[1] + p[2]), # 5 op
        :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
        :(log(abs(x[1]))), # 3 op
        :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]), p[3])) # 13 op
    ] # 30 op
    exprs = map(e -> Expr(:->, :(x,p), e), exprs)
    X = randn(Float32, nrows, 10)
    p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr

    # warmup
    interpret_cpu(exprs, X, p)
    expr_reps = 100 # for each expr
    reps = 100

    if parallel
        # t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
        @btime parallel(exprs, X, p, expr_reps, reps)
        println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    else
        # t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
        @btime single(exprs, X, p, expr_reps, reps)
        println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    end
    true
end

function parallel(exprs, X, p, expr_reps, reps)
    fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
end

function single(exprs, X, p, expr_reps, reps)
    for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
end


# LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops

@test test_cpu_interpreter(1000)
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
@test test_cpu_interpreter(10000)
@test test_cpu_interpreter(10000, parallel=true)
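For reference, the Expr(:->, :(x,p), e) mapping above wraps each expression body into a two-argument anonymous function; an illustrative sketch (not part of the test file) of what one wrapped expression evaluates to:

# Illustrative only: what the Expr(:->, :(x,p), e) wrapping produces.
e  = :(p[1] * x[1] + p[2])
fn = eval(Expr(:->, :(x, p), e))   # equivalent to (x, p) -> p[1]*x[1] + p[2]
@assert Base.invokelatest(fn, Float32[2.0], Float32[3.0, 1.0]) == 7.0f0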
@@ -1,6 +1,7 @@
 using CUDA
 using .ExpressionProcessing
 using .Interpreter
+using .Utils
 
 expressions = Vector{Expr}(undef, 2)
 variables = Matrix{Float32}(undef, 2,2)
@@ -20,8 +21,8 @@ parameters[2][1] = 5.0
 parameters[2][2] = 0.0
 
 function testHelper(expression::Expr, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}, expectedResult)
-    postfix = Vector([expr_to_postfix(expression)])
-    result = Interpreter.interpret(postfix, variables, parameters)
+    exprs = Vector([expression])
+    result = Interpreter.interpret(exprs, variables, parameters)
 
     expectedResult32 = convert(Float32, expectedResult)
     @test isequal(result[1,1], expectedResult32)
@@ -35,7 +36,7 @@ end
 reference[2,2] = 0.0
 # reference = Matrix([5.0, NaN],
 #                    [5.0, 0.0])
-result = Interpreter.convert_to_matrix(parameters, NaN32)
+result = Utils.convert_to_matrix(parameters, NaN32)
 
 @test isequal(result, reference)
 end
@@ -126,8 +127,8 @@ end
 expr1 = :((x1 + 5) * p1 - 3 / abs(x2) + (2^4) - log(8))
 expr2 = :(1 + 5 * x1 - 10^2 + (p1 - p2) / 9 + exp(x2))
 
-postfix = Vector([expr_to_postfix(expr1), expr_to_postfix(expr2)])
-result = Interpreter.interpret(postfix, var, param)
+exprs = Vector([expr1, expr2])
+result = Interpreter.interpret(exprs, var, param)
 
 # var set 1
 @test isapprox(result[1,1], 37.32, atol=0.01) # expr1
package/test/PerformanceTests.jl (new file, 146 lines)
@@ -0,0 +1,146 @@
using LinearAlgebra
using BenchmarkTools

using .Transpiler
using .Interpreter

const BENCHMARKS_RESULTS_PATH = "./results"
# University setup at 10.20.1.7 if needed
exprsCPU = [
    # CPU interpreter requires an anonymous function and array refs
    :(p[1] * x[1] + p[2]), # 5 op
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
    :(log(abs(x[1]))), # 3 op
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]), p[3])) # 13 op
] # 30 op
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)

exprsGPU = [
    # GPU evaluators use the x1/p1 naming scheme instead of array refs
    :(p1 * x1 + p2), # 5 op
    :((((x1 + x2) + x3) + x4) + x5), # 9 op
    :(log(abs(x1))), # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1), p3)) # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # 100 parameter optimisation steps basically


@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

    # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)
end

@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end


suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

X_small = randn(Float32, varsets_small, 5)
suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
X_medium = randn(Float32, varsets_medium, 5)
suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
X_large = randn(Float32, varsets_large, 5)
suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)

X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# tune!(suite)
# BenchmarkTools.save("params.json", params(suite))

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=180)

# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/initial_results.json", results)
# initial_results = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/initial_results.json")

medianCPU = median(results["CPU"])
minimumCPU = minimum(results["CPU"])
stdCPU = std(results["CPU"])

medianInterpreter = median(results["GPUI"])
minimumInterpreter = minimum(results["GPUI"])
stdInterpreter = std(results["GPUI"])

medianTranspiler = median(results["GPUT"])
minimumTranspiler = minimum(results["GPUT"])
stdTranspiler = std(results["GPUT"])

cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

cpuVsGPUI_minimum = judge(minimumInterpreter, minimumCPU) # is interpreter better than cpu?
cpuVsGPUT_minimum = judge(minimumTranspiler, minimumCPU) # is transpiler better than cpu?
gpuiVsGPUT_minimum = judge(minimumTranspiler, minimumInterpreter) # is transpiler better than interpreter?

cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?


println("Is the interpreter better than the CPU implementation:")
println(cpuVsGPUI_median)
println(cpuVsGPUI_minimum)
println(cpuVsGPUI_std)

println("Is the transpiler better than the CPU implementation:")
println(cpuVsGPUT_median)
println(cpuVsGPUT_minimum)
println(cpuVsGPUT_std)

println("Is the transpiler better than the interpreter:")
println(gpuiVsGPUT_median)
println(gpuiVsGPUT_minimum)
println(gpuiVsGPUT_std)
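For context on the commented-out tune!/save lines above, the usual BenchmarkTools workflow is to tune once, persist the parameters, and reuse them on later runs (illustrative sketch, not part of the test file):

# Illustrative only: one-off tuning run that produces the params.json reused above.
using BenchmarkTools
tune!(suite)                                        # derive per-benchmark evals/samples
BenchmarkTools.save("params.json", params(suite))   # persist them next to the tests

# Later runs then skip tuning and just load the stored parameters:
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals)
results = run(suite, verbose = true)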
@@ -1,4 +1,8 @@
 [deps]
+BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -2,42 +2,65 @@ using CUDA
 using .ExpressionProcessing
 using .Transpiler
 
-expressions = Vector{Expr}(undef, 2)
-variables = Matrix{Float32}(undef, 2,2)
-parameters = Vector{Vector{Float32}}(undef, 2)
+expressions = Vector{Expr}(undef, 3)
+variables = Matrix{Float32}(undef, 5, 4)
+parameters = Vector{Vector{Float32}}(undef, 3)
 
+# Resulting value should be 1.14... for the first expression
 expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4))
-expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
+expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3)
+expressions[3] = :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5))
 
 variables[1,1] = 2.0
 variables[2,1] = 3.0
-variables[1,2] = 0.0
+variables[3,1] = 0.0
+variables[4,1] = 1.0
+variables[5,1] = 0.0
+
+variables[1,2] = 2.0
 variables[2,2] = 5.0
+variables[3,2] = 3.0
+variables[4,2] = 0.0
+variables[5,2] = 0.0
+
+variables[1,3] = 6.0
+variables[2,3] = 2.0
+variables[3,3] = 2.0
+variables[4,3] = 4.0
+variables[5,3] = 2.0
+
+variables[1,4] = 1.0
+variables[2,4] = 2.0
+variables[3,4] = 3.0
+variables[4,4] = 4.0
+variables[5,4] = 5.0
+
-parameters[1] = Vector{Float32}(undef, 1)
+parameters[1] = Vector{Float32}(undef, 0)
 parameters[2] = Vector{Float32}(undef, 2)
-parameters[1][1] = 5.0
+parameters[3] = Vector{Float32}(undef, 1)
 parameters[2][1] = 5.0
 parameters[2][2] = 0.0
+parameters[3][1] = 16.0
 
-@testset "Test TMP transpiler" begin
-    postfixExpr = expr_to_postfix(expressions[1])
-    postfixExprs = Vector([postfixExpr])
-    push!(postfixExprs, expr_to_postfix(expressions[2]))
-    push!(postfixExprs, expr_to_postfix(:(5^3 + x1)))
-
-    # generatedCode = Transpiler.transpile(postfixExpr)
-    generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP
-    # CUDA.@sync interpret(postfixExprs, variables, parameters)
-
-    # This is just here for testing. This will be called inside the execute method in the Transpiler module
-    linker = CuLink()
-    add_data!(linker, "ExpressionProcessing", generatedCode)
-
-    image = complete(linker)
-
-    mod = CuModule(image)
-    func = CuFunction(mod, "ExpressionProcessing")
+@testset "Test transpiler evaluation" begin
+    results = Transpiler.evaluate(expressions, variables, parameters)
+
+    # dump(expressions[3]; maxdepth=10)
+    # Expr 1:
+    @test isapprox(results[1,1], 1.14286)
+    @test isapprox(results[2,1], 1.14286)
+    @test isapprox(results[3,1], 1.14286)
+    @test isapprox(results[4,1], 1.14286)
+    # Expr 2:
+    @test isapprox(results[1,2], 16.0)
+    @test isapprox(results[2,2], 25.0)
+    @test isapprox(results[3,2], 54.0)
+    @test isapprox(results[4,2], 14.0)
+
+    # Expr 3:
+    @test isapprox(results[1,3], -0.07580)
+    @test isapprox(results[2,3], 0.55452)
+    @test isapprox(results[3,3], 12.19446)
+    @test isapprox(results[4,3], -67.41316)
 end
 
-#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
+# TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
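The expected values asserted above can be reproduced on the CPU; an illustrative check (not part of the test file) for variable set 4, i.e. x = (1, 2, 3, 4, 5):

# Illustrative only: recomputing the expected results for variable set 4.
x = Float32[1.0, 2.0, 3.0, 4.0, 5.0]
p2 = Float32[5.0, 0.0]; p3 = Float32[16.0]

@assert isapprox(1 + 3 * 5 / 7 - sqrt(4), 1.14286; atol = 1e-5)                                    # expressions[1]
@assert isapprox(5 + x[1] + 1 * x[2] + p2[1] + p2[2] + x[1]^x[3], 14.0)                            # expressions[2]
@assert isapprox(log(x[1]) / x[2] * sqrt(p3[1]) + x[3]^x[4] - exp(x[5]), -67.41316; atol = 1e-4)   # expressions[3]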
package/test/initial_results.json (new file, 1 line)
File diff suppressed because one or more lines are too long
package/test/params.json (new file, 1 line)
@@ -0,0 +1 @@
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
@@ -2,12 +2,22 @@ using ExpressionExecutorCuda
 using Test
 
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
+include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
 include(joinpath(baseFolder, "src", "Interpreter.jl"))
 include(joinpath(baseFolder, "src", "Transpiler.jl"))
 
-@testset "ExpressionExecutorCuda.jl" begin
-    include("ExpressionProcessingTests.jl")
-    include("InterpreterTests.jl")
-    include("TranspilerTests.jl")
+@testset "Functionality tests" begin
+    # include("ExpressionProcessingTests.jl")
+    # include("InterpreterTests.jl")
+    # include("TranspilerTests.jl")
 end
 
+# @testset "CPU Interpreter" begin
+#     include("CpuInterpreterTests.jl")
+# end
+
+@testset "Performance tests" begin
+    include("PerformanceTests.jl")
+end
@@ -9,5 +9,12 @@ Probably reference the performance evaluation papers for Julia and CUDA.jl
 \section{Interpreter}
 Talk about how the interpreter has been developed.
+
+\subsection{Performance tuning}
+Document the process of performance tuning.
+
 \section{Transpiler}
 Talk about how the transpiler has been developed.
+
+\subsection{Performance tuning}
+Document the process of performance tuning.
@@ -41,7 +41,7 @@ In order to answer the research questions, this thesis is divided into the follo
 \item[Chapter 4: Implementation] \mbox{} \\
 This chapter explains the implementation of the GPU interpreter and transpiler. The details of the implementation with the used technologies are covered, such as the interpretation process and the transpilation of the expressions into Parallel Thread Execution (PTX) code.
 \item[Chapter 5: Evaluation] \mbox{} \\
-The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons are outlined. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
+The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other, and the form of the expressions used for the comparisons is outlined. The comparison will include not only the time taken for the pure evaluation but also the overhead, such as PTX code generation. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
 \item[Chapter 6: Conclusion] \mbox{} \\
 In the final chapter, the entire work is summarised. A brief overview of the implementation as well as the evaluation results will be provided. Additionally, an outlook of possible future research is given.
 \end{description}
BIN thesis/main.pdf (binary file not shown)