transpiler: generates valid PTX and evaluates expressions correctly
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
9df78ca72e
commit
effd477558
|
@ -9,7 +9,7 @@ const BYTES = sizeof(Float32)
|
||||||
const Operand = Union{Float32, String} # Operand is either fixed value or register
|
const Operand = Union{Float32, String} # Operand is either fixed value or register
|
||||||
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
|
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
|
||||||
|
|
||||||
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
|
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
|
||||||
varRows = size(variables, 1)
|
varRows = size(variables, 1)
|
||||||
variableCols = size(variables, 2)
|
variableCols = size(variables, 2)
|
||||||
kernels = Vector{CuFunction}(undef, length(expressions))
|
kernels = Vector{CuFunction}(undef, length(expressions))
|
||||||
|
@ -35,10 +35,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
||||||
end
|
end
|
||||||
|
|
||||||
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
|
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
|
||||||
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i)
|
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
|
||||||
|
|
||||||
println(kernel)
|
|
||||||
|
|
||||||
linker = CuLink()
|
linker = CuLink()
|
||||||
add_data!(linker, "ExpressionProcessing", kernel)
|
add_data!(linker, "ExpressionProcessing", kernel)
|
||||||
|
|
||||||
|
@ -49,17 +47,11 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
||||||
cache[expressions[i]] = kernels[i]
|
cache[expressions[i]] = kernels[i]
|
||||||
end
|
end
|
||||||
|
|
||||||
cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info)
|
cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
|
||||||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info)
|
cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
|
||||||
|
|
||||||
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
||||||
cudaResults = CuArray{Float32}(undef, variableCols * length(expressions))
|
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
|
||||||
# cudaResults = CUDA.zeros(variableCols * length(expressions))
|
|
||||||
# ptr = CuPtr{Float32}(C_NULL)
|
|
||||||
# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
|
|
||||||
# values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
|
|
||||||
# CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10)
|
|
||||||
# copyto!(cudaResults, z)
|
|
||||||
|
|
||||||
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
|
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
|
||||||
for i in eachindex(kernels)
|
for i in eachindex(kernels)
|
||||||
|
@ -67,15 +59,10 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
||||||
threads = min(variableCols, config.threads)
|
threads = min(variableCols, config.threads)
|
||||||
blocks = cld(variableCols, threads)
|
blocks = cld(variableCols, threads)
|
||||||
|
|
||||||
# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||||
cudacall(kernels[i], (CuPtr{Float32},), cudaResults; threads=threads, blocks=blocks)
|
|
||||||
# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
|
||||||
# launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
println(Array(cudaResults))
|
return cudaResults
|
||||||
# cudaResults = nothing
|
|
||||||
# CUDA.cuMemFree(ptr)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
|
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
|
||||||
|
@ -83,29 +70,26 @@ end
|
||||||
"
|
"
|
||||||
- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
|
- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
|
||||||
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent oon the longest parameter set
|
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent on the longest parameter set
|
||||||
- param ```expressionIndex```: The 1-based index of the expression
|
- param ```expressionIndex```: The 0-based index of the expression
|
||||||
"
|
"
|
||||||
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
|
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
|
||||||
nrOfVariableSets::Integer, expressionIndex::Integer)::String
|
nrOfVariableSets::Integer, expressionIndex::Integer)::String
|
||||||
exitJumpLocationMarker = "\$L__BB0_2"
|
exitJumpLocationMarker = "\$L__BB0_2"
|
||||||
ptxBuffer = IOBuffer()
|
ptxBuffer = IOBuffer()
|
||||||
|
regManager = Utils.RegisterManager(Dict(), Dict())
|
||||||
# TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation
|
|
||||||
reset_registers()
|
|
||||||
reset_symtable()
|
|
||||||
|
|
||||||
# TODO: Suboptimal solution
|
# TODO: Suboptimal solution
|
||||||
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results
|
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
|
||||||
guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets)
|
guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
|
||||||
|
|
||||||
println(ptxBuffer, get_cuda_header())
|
println(ptxBuffer, get_cuda_header())
|
||||||
println(ptxBuffer, signature)
|
println(ptxBuffer, signature)
|
||||||
println(ptxBuffer, "{")
|
println(ptxBuffer, "{")
|
||||||
|
|
||||||
|
|
||||||
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0",
|
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
|
||||||
threadIdReg, expressionIndex, nrOfVariableSets)
|
threadId64Reg, expressionIndex, nrOfVariableSets, regManager)
|
||||||
println(ptxBuffer, get_register_definitions())
|
println(ptxBuffer, Utils.get_register_definitions(regManager))
|
||||||
println(ptxBuffer, paramLoading)
|
println(ptxBuffer, paramLoading)
|
||||||
println(ptxBuffer, guardClause)
|
println(ptxBuffer, guardClause)
|
||||||
println(ptxBuffer, calc_code)
|
println(ptxBuffer, calc_code)
|
||||||
|
@ -123,14 +107,15 @@ function get_cuda_header()::String
|
||||||
return "
|
return "
|
||||||
.version 8.5
|
.version 8.5
|
||||||
.target sm_61
|
.target sm_61
|
||||||
.address_size 64
|
.address_size 32
|
||||||
"
|
"
|
||||||
end
|
end
|
||||||
|
|
||||||
"
|
"
|
||||||
param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result
|
param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result
|
||||||
"
|
"
|
||||||
function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::Tuple{String, String}
|
function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, regManager::Utils.RegisterManager)::Tuple{String, String}
|
||||||
|
|
||||||
signatureBuffer = IOBuffer()
|
signatureBuffer = IOBuffer()
|
||||||
paramLoadingBuffer = IOBuffer()
|
paramLoadingBuffer = IOBuffer()
|
||||||
print(signatureBuffer, ".visible .entry ")
|
print(signatureBuffer, ".visible .entry ")
|
||||||
|
@ -140,9 +125,9 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
|
||||||
for i in eachindex(parameters)
|
for i in eachindex(parameters)
|
||||||
print(signatureBuffer, " .param .u64", " ", "param_", i)
|
print(signatureBuffer, " .param .u64", " ", "param_", i)
|
||||||
|
|
||||||
parametersLocation = get_next_free_register("i")
|
parametersLocation = Utils.get_next_free_register(regManager, "rd")
|
||||||
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
|
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
|
||||||
println(paramLoadingBuffer, "cvta.to.global.u64 $(get_next_free_register("parameter")), $parametersLocation;")
|
println(paramLoadingBuffer, "cvta.to.global.u64 $(Utils.get_next_free_register(regManager, "parameter")), $parametersLocation;")
|
||||||
if i != lastindex(parameters)
|
if i != lastindex(parameters)
|
||||||
println(signatureBuffer, ",")
|
println(signatureBuffer, ",")
|
||||||
end
|
end
|
||||||
|
@ -157,27 +142,30 @@ Constructs the PTX code used for handling the case where too many threads are st
|
||||||
|
|
||||||
- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
|
- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
|
||||||
"
|
"
|
||||||
function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::Tuple{String, String}
    # Emits the PTX prologue that computes the global thread index and lets every
    # thread without a variable set to work on jump to `exitJumpLocation`.
    # Returns the generated PTX and the name of the 64-bit register that holds the
    # global thread index.
    guardBuffer = IOBuffer()

    # The allocation order determines the generated register names, so keep it stable.
    threadsPerBlock = Utils.get_next_free_register(regManager, "r")
    blockIndex = Utils.get_next_free_register(regManager, "r")
    threadInBlock = Utils.get_next_free_register(regManager, "r")

    # load the launch geometry into the registers defined above
    println(guardBuffer, "mov.u32 $threadsPerBlock, %ntid.x;")
    println(guardBuffer, "mov.u32 $blockIndex, %ctaid.x;")
    println(guardBuffer, "mov.u32 $threadInBlock, %tid.x;")

    globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
    breakCondition = Utils.get_next_free_register(regManager, "p")

    # globalThreadId = threadsPerBlock * blockIndex + threadInBlock
    println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadsPerBlock, $blockIndex, $threadInBlock;")

    # The index starts at 0, so the valid indices are 0..nrOfVarSets-1. The guard
    # therefore has to be "index >= nrOfVarSets"; with ">" the thread whose index
    # equals nrOfVarSets would read and write one set past the end.
    println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;")

    # branch to end if breakCondition is true
    println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")

    # Widen the thread index to 64 bit for the address arithmetic. It is not 64 bit from
    # the start, as this would take up more registers. Performance tests can determine
    # whether this is faster than making everything 64 bit from the start.
    threadId64Reg = Utils.get_next_free_register(regManager, "rd")
    print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")

    return (String(take!(guardBuffer)), threadId64Reg)
end
|
||||||
|
|
||||||
"
|
"
|
||||||
|
@ -185,13 +173,14 @@ end
|
||||||
"
|
"
|
||||||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
|
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
|
||||||
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
|
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
|
||||||
threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
|
threadId64Reg::String, expressionIndex::Integer, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::String
|
||||||
|
|
||||||
# return "st.global.f32 [$resultsLocation], 10.0;"
|
|
||||||
|
|
||||||
codeBuffer = IOBuffer()
|
codeBuffer = IOBuffer()
|
||||||
operands = Vector{Operand}()
|
operands = Vector{Operand}()
|
||||||
|
|
||||||
|
exprId64Reg = Utils.get_next_free_register(regManager, "rd")
|
||||||
|
println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")
|
||||||
|
|
||||||
for i in eachindex(expression)
|
for i in eachindex(expression)
|
||||||
token = expression[i]
|
token = expression[i]
|
||||||
|
|
||||||
|
@ -207,38 +196,37 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
|
||||||
else
|
else
|
||||||
left = pop!(operands)
|
left = pop!(operands)
|
||||||
end
|
end
|
||||||
operation, resultRegister = get_operation(operator, left, right)
|
operation, resultRegister = get_operation(operator, regManager, left, right)
|
||||||
|
|
||||||
println(codeBuffer, operation)
|
println(codeBuffer, operation)
|
||||||
push!(operands, resultRegister)
|
push!(operands, resultRegister)
|
||||||
elseif token.Type == INDEX
|
elseif token.Type == INDEX
|
||||||
if token.Value > 0 # varaibles
|
if token.Value > 0 # variables
|
||||||
var, first_access = get_register_for_name("x$(token.Value)")
|
var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
|
||||||
if first_access
|
if first_access
|
||||||
println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadIdReg, variablesSetSize))
|
println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
|
||||||
end
|
end
|
||||||
push!(operands, var)
|
push!(operands, var)
|
||||||
else
|
else
|
||||||
absVal = abs(token.Value)
|
absVal = abs(token.Value)
|
||||||
param, first_access = get_register_for_name("p$absVal")
|
param, first_access = Utils.get_register_for_name(regManager, "p$absVal")
|
||||||
if first_access
|
if first_access
|
||||||
println(codeBuffer, load_into_register(param, parametersLocation, absVal, threadIdReg, parametersSetSize))
|
println(codeBuffer, load_into_register(param, parametersLocation, absVal, exprId64Reg, parametersSetSize, regManager))
|
||||||
end
|
end
|
||||||
push!(operands, param)
|
push!(operands, param)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case)
|
tempReg = Utils.get_next_free_register(regManager, "rd")
|
||||||
# resultsLocation[resultIndex] = "";
|
# reg = pop!(operands)
|
||||||
# tempReg = get_next_free_register("i")
|
# tmp = "abs.f32 $(reg), 16.0;"
|
||||||
# println(codeBuffer, "
|
# push!(operands, reg)
|
||||||
# add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
|
println(codeBuffer, "
|
||||||
# mul.lo.u32 $tempReg, $tempReg, $BYTES;
|
add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
|
||||||
# add.u32 $tempReg, $resultsLocation, $tempReg;
|
mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
|
||||||
# st.global.f32 [$tempReg], $(pop!(operands));
|
st.global.f32 [$tempReg], $(pop!(operands));
|
||||||
# ")
|
")
|
||||||
println(codeBuffer, "st.global.f32 [$resultsLocation], 10.0;")
|
|
||||||
|
|
||||||
return String(take!(codeBuffer))
|
return String(take!(codeBuffer))
|
||||||
end
|
end
|
||||||
|
@ -248,18 +236,17 @@ Loads a value from a location into the given register. It is assumed that the lo
|
||||||
|
|
||||||
- param ```register```: The register where the loaded value will be stored
|
- param ```register```: The register where the loaded value will be stored
|
||||||
- param ```loadLocation```: The location from where to load the value
|
- param ```loadLocation```: The location from where to load the value
|
||||||
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set
|
- param ```valueIndex```: 1-based index of the value in the variable set/parameter set
|
||||||
- param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
|
- param ```setIndexReg64```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
|
||||||
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
|
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
|
||||||
"
|
"
|
||||||
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg64::String, setSize::Integer, regManager::Utils.RegisterManager)::String
    # Scratch register that ends up holding the address of the value to load.
    addressReg = Utils.get_next_free_register(regManager, "rd")

    setStride = setSize * BYTES            # distance in bytes between two consecutive sets
    valueOffset = (valueIndex - 1) * BYTES # valueIndex is 1-based, the byte offset is 0-based

    # "mad" calculates the offset and "add" applies it to the base address.
    # Classical pointer arithmetic for accessing values of an array like in C.
    instructions = (
        "mad.lo.u64 $addressReg, $setIndexReg64, $setStride, $valueOffset;",
        "add.u64 $addressReg, $loadLocation, $addressReg;",
        "ld.global.f32 $register, [$addressReg];",
    )

    return "\n" * join(instructions, "\n")
end
|
||||||
|
|
||||||
|
@ -275,8 +262,8 @@ function type_to_ptx_type(type::DataType)::String
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
function get_operation(operator::Operator, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
|
function get_operation(operator::Operator, regManager::Utils.RegisterManager, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
|
||||||
resultRegister = get_next_free_register("f")
|
resultRegister = Utils.get_next_free_register(regManager, "f")
|
||||||
resultCode = ""
|
resultCode = ""
|
||||||
|
|
||||||
if is_binary_operator(operator) && isnothing(right)
|
if is_binary_operator(operator) && isnothing(right)
|
||||||
|
@ -320,79 +307,5 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
|
||||||
return (resultCode, resultRegister)
|
return (resultCode, resultRegister)
|
||||||
end
|
end
|
||||||
|
|
||||||
let registers = Dict() # stores the count of the register already used.
|
|
||||||
global get_next_free_register
|
|
||||||
global get_register_definitions
|
|
||||||
global reset_registers
|
|
||||||
|
|
||||||
function reset_registers()
|
|
||||||
registers = Dict()
|
|
||||||
end
|
|
||||||
|
|
||||||
|
|
||||||
# By convention these names correspond to the following types:
|
|
||||||
# - p -> pred
|
|
||||||
# - f -> float32
|
|
||||||
# - r -> 32 bit
|
|
||||||
# - var -> float32 (used for variables and params)
|
|
||||||
function get_next_free_register(name::String)::String
|
|
||||||
if haskey(registers, name)
|
|
||||||
registers[name] += 1
|
|
||||||
else
|
|
||||||
registers[name] = 1
|
|
||||||
end
|
|
||||||
|
|
||||||
return string("%", name, registers[name] - 1)
|
|
||||||
end
|
|
||||||
|
|
||||||
function get_register_definitions()::String
|
|
||||||
registersBuffer = IOBuffer()
|
|
||||||
|
|
||||||
for definition in registers
|
|
||||||
regType = ""
|
|
||||||
if definition.first == "p"
|
|
||||||
regType = ".pred"
|
|
||||||
elseif definition.first == "f"
|
|
||||||
regType = ".f32"
|
|
||||||
elseif definition.first == "var"
|
|
||||||
regType = ".f32"
|
|
||||||
elseif definition.first == "param"
|
|
||||||
regType = ".f32"
|
|
||||||
elseif definition.first == "r"
|
|
||||||
regType = ".b32"
|
|
||||||
elseif definition.first == "parameter"
|
|
||||||
regType = ".b64"
|
|
||||||
elseif definition.first == "i"
|
|
||||||
regType = ".b64"
|
|
||||||
else
|
|
||||||
throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
|
|
||||||
end
|
|
||||||
println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
|
|
||||||
end
|
|
||||||
|
|
||||||
return String(take!(registersBuffer))
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
let symtable = Dict()
|
|
||||||
global get_register_for_name
|
|
||||||
global reset_symtable
|
|
||||||
|
|
||||||
function reset_symtable()
|
|
||||||
symtable = Dict()
|
|
||||||
end
|
|
||||||
|
|
||||||
"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
|
|
||||||
function get_register_for_name(varName::String)
|
|
||||||
if haskey(symtable, varName)
|
|
||||||
return (symtable[varName], false)
|
|
||||||
else
|
|
||||||
reg = get_next_free_register("var")
|
|
||||||
symtable[varName] = reg
|
|
||||||
return (reg, true)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -6,39 +6,83 @@ using CUDA
|
||||||
|
|
||||||
All entries that cannot be filled have ```invalidElement``` as their value
|
All entries that cannot be filled have ```invalidElement``` as their value
|
||||||
"
|
"
|
||||||
function convert_to_matrix(vecs::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    # Every inner vector becomes one column. The number of rows is the length of the
    # longest inner vector; `init = 0` makes an empty `vecs` yield a 0x0 matrix.
    maxLength = maximum(length, vecs; init = 0)

    # Start with a matrix consisting only of the padding value and overwrite the leading
    # entries of each column. This avoids a padded copy per inner vector and avoids
    # splatting a collection of arbitrary length into `hcat`.
    vecMat = fill(invalidElement, maxLength, length(vecs))
    for (col, vec) in enumerate(vecs)
        copyto!(view(vecMat, :, col), vec)
    end

    return vecMat
end
|
||||||
|
|
||||||
"Retrieves the number of entries for the largest inner vector"
|
"Retrieves the number of entries for the largest inner vector"
|
||||||
function get_max_inner_length(vecs::Vector{Vector{T}})::Int where T
    # `init = 0` keeps the result defined when `vecs` is empty: without it `maximum`
    # throws on an empty collection. Passing `length` as the mapping function avoids
    # allocating a temporary array of lengths.
    return maximum(length, vecs; init = 0)
end
|
||||||
|
|
||||||
"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
|
"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
|
||||||
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    # Pad the inner vectors into one matrix on the host first, then hand that matrix
    # to the CuArray constructor.
    return CuArray(convert_to_matrix(data, invalidElement))
end
|
||||||
|
|
||||||
|
"""
Bookkeeping for the registers handed out while generating one kernel.

- field ```registers```: Maps a register name prefix (e.g. "f" or "rd") to the number of registers with that prefix that have been handed out so far
- field ```symtable```: Maps the name of a variable/parameter to the register that was assigned to it

Both fields are concretely typed, so accesses are type-stable. Dictionaries of another
type (e.g. a plain ```Dict()```) are converted on construction, which means the manager
then works on its own copy.
"""
struct RegisterManager
    registers::Dict{String, Int}
    symtable::Dict{String, String}
end

"Creates a ```RegisterManager``` that has not handed out any register yet."
RegisterManager() = RegisterManager(Dict{String, Int}(), Dict{String, String}())
|
||||||
|
|
||||||
|
function get_next_free_register(manager::RegisterManager, name::String)::String
    # Number of registers with this prefix handed out so far. As the numbering starts
    # at 0, this is also the index of the register handed out now.
    handedOut = get(manager.registers, name, 0)
    manager.registers[name] = handedOut + 1

    return string("%", name, handedOut)
end
|
||||||
|
|
||||||
|
# Maps a register name prefix to the PTX type used in its ".reg" declaration.
# Throws an ArgumentError for prefixes that have no known type.
function _ptx_register_type(name)::String
    if name == "p"
        return ".pred"
    elseif name in ("f", "var", "param")
        return ".f32"
    elseif name == "r"
        return ".b32"
    elseif name in ("rd", "parameter", "i")
        return ".b64"
    end

    throw(ArgumentError("Unknown register name used. Name '$(name)' cannot be mapped to a PTX type."))
end

# Returns the ".reg" declarations for all registers handed out by `manager`.
# One line of the form ".reg <type> %<prefix><<count>>;" is written per register
# prefix, where <count> is the number of registers handed out for that prefix.
# Throws an ArgumentError if a prefix cannot be mapped to a PTX type.
function get_register_definitions(manager::RegisterManager)::String
    registersBuffer = IOBuffer()

    # Iterate the prefixes in sorted order: the iteration order of a Dict is not
    # specified, and the generated PTX should be identical between runs.
    for name in sort!(collect(keys(manager.registers)))
        regType = _ptx_register_type(name)
        println(registersBuffer, ".reg $regType %$(name)<$(manager.registers[name])>;")
    end

    return String(take!(registersBuffer))
end
|
||||||
|
|
||||||
|
"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
|
||||||
|
function get_register_for_name(manager::RegisterManager, varName::String)
    # A name that is not in the symbol table yet is seen for the first time and gets
    # a fresh "var" register assigned.
    firstAccess = !haskey(manager.symtable, varName)
    if firstAccess
        manager.symtable[varName] = get_next_free_register(manager, "var")
    end

    return (manager.symtable[varName], firstAccess)
end
|
||||||
|
|
||||||
end
|
end
|
|
@ -1,6 +1,8 @@
|
||||||
using .Transpiler
|
using .Transpiler
|
||||||
using .Interpreter
|
using .Interpreter
|
||||||
|
|
||||||
|
# University setup at 10.20.1.7 if needed
|
||||||
|
|
||||||
@testset "CPU performance" begin
|
@testset "CPU performance" begin
|
||||||
function test_cpu_interpreter(nrows; parallel = false)
|
function test_cpu_interpreter(nrows; parallel = false)
|
||||||
exprs = [
|
exprs = [
|
||||||
|
@ -39,9 +41,23 @@ LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
|
||||||
end
|
end
|
||||||
|
|
||||||
@testset "Interpreter Performance" begin
|
@testset "Interpreter Performance" begin
|
||||||
|
# Put data in shared memory:
|
||||||
|
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
||||||
|
|
||||||
|
# Make array const:
|
||||||
|
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
|
||||||
|
|
||||||
|
# Memory management like in C++ might help with performance improvements
|
||||||
|
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
|
||||||
end
|
end
|
||||||
|
|
||||||
@testset "Transpiler Performance" begin
|
@testset "Transpiler Performance" begin
|
||||||
|
# Put data in shared memory:
|
||||||
|
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
||||||
|
|
||||||
|
# Make array const:
|
||||||
|
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
|
||||||
|
|
||||||
|
# Memory management like in C++ might help with performance improvements
|
||||||
|
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
|
||||||
end
|
end
|
|
@ -2,138 +2,65 @@ using CUDA
|
||||||
using .ExpressionProcessing
|
using .ExpressionProcessing
|
||||||
using .Transpiler
|
using .Transpiler
|
||||||
|
|
||||||
# Three expressions: one consisting only of constants, two using variables and parameters.
expressions = Expr[
    :(1 + 3 * 5 / 7 - sqrt(4)),
    :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3),
    :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5)),
]

# Each column is one variable set, row i holds the value of x_i.
variables = Float32[
    2.0 2.0 6.0 1.0
    3.0 5.0 2.0 2.0
    0.0 3.0 2.0 3.0
    1.0 0.0 4.0 4.0
    0.0 0.0 2.0 5.0
]

# One parameter vector per expression; the first expression uses no parameters.
parameters = Vector{Float32}[
    Float32[],
    Float32[5.0, 0.0],
    Float32[16.0],
]
|
||||||
|
|
||||||
@testset "Test TMP transpiler" begin
|
|
||||||
postfixExpr = expr_to_postfix(expressions[1])
|
|
||||||
postfixExprs = Vector([postfixExpr])
|
|
||||||
push!(postfixExprs, expr_to_postfix(expressions[2]))
|
|
||||||
push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
|
|
||||||
|
|
||||||
# generatedCode = Transpiler.transpile(postfixExpr)
|
|
||||||
# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
|
|
||||||
# println(generatedCode)
|
|
||||||
# CUDA.@sync interpret(postfixExprs, variables, parameters)
|
|
||||||
|
|
||||||
# This is just here for testing. This will be called inside the execute method in the Transpiler module
|
|
||||||
# linker = CuLink()
|
|
||||||
# add_data!(linker, "ExpressionProcessing", generatedCode)
|
|
||||||
|
|
||||||
# image = complete(linker)
|
|
||||||
|
|
||||||
# mod = CuModule(image)
|
|
||||||
# func = CuFunction(mod, "ExpressionProcessing")
|
|
||||||
end
|
|
||||||
|
|
||||||
@testset "Test transpiler evaluation" begin
|
@testset "Test transpiler evaluation" begin
|
||||||
# postfixExprs = Vector{Expr}()
|
results = Transpiler.evaluate(expressions, variables, parameters)
|
||||||
# push!(postfixExprs, expressions[1])
|
|
||||||
# push!(postfixExprs, expressions[2])
|
|
||||||
|
|
||||||
expr = Vector{Expr}()
|
# dump(expressions[3]; maxdepth=10)
|
||||||
push!(expr, expressions[1])
|
# Expr 1:
|
||||||
@time Transpiler.evaluate(expr, variables, parameters)
|
@test isapprox(results[1,1], 1.14286)
|
||||||
|
@test isapprox(results[2,1], 1.14286)
|
||||||
|
@test isapprox(results[3,1], 1.14286)
|
||||||
|
@test isapprox(results[4,1], 1.14286)
|
||||||
|
# Expr 2:
|
||||||
|
@test isapprox(results[1,2], 16.0)
|
||||||
|
@test isapprox(results[2,2], 25.0)
|
||||||
|
@test isapprox(results[3,2], 54.0)
|
||||||
|
@test isapprox(results[4,2], 14.0)
|
||||||
|
|
||||||
|
# Expr 3:
|
||||||
|
@test isapprox(results[1,3], -0.07580)
|
||||||
|
@test isapprox(results[2,3], 0.55452)
|
||||||
|
@test isapprox(results[3,3], 12.19446)
|
||||||
|
@test isapprox(results[4,3], -67.41316)
|
||||||
end
|
end
|
||||||
|
|
||||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
|
# TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
|
||||||
|
|
||||||
function test_kernel(results)
|
|
||||||
@inbounds results[1] = 10f0
|
|
||||||
|
|
||||||
return nothing
|
|
||||||
end
|
|
||||||
|
|
||||||
@testset "TEMP" begin
|
|
||||||
return
|
|
||||||
results = CuArray{Float32}(undef, 2)
|
|
||||||
# @device_code_ptx @cuda test_kernel(results)
|
|
||||||
|
|
||||||
|
|
||||||
# println(CUDA.code_ptx(kernel.fun, ))
|
|
||||||
# return
|
|
||||||
|
|
||||||
ptx = "
|
|
||||||
.version 8.5
|
|
||||||
.target sm_61
|
|
||||||
.address_size 64
|
|
||||||
|
|
||||||
.visible .entry ExpressionProcessing(
|
|
||||||
.param .u64 param_1)
|
|
||||||
{
|
|
||||||
.reg .b64 %parameter<1>;
|
|
||||||
.reg .b64 %i<1>;
|
|
||||||
//.reg .b64 %rd<6>;
|
|
||||||
|
|
||||||
ld.param.u64 %i0, [param_1];
|
|
||||||
cvta.to.global.u64 %parameter0, %i0;
|
|
||||||
|
|
||||||
st.global.f32 [%parameter0], 10.0;
|
|
||||||
ret;
|
|
||||||
}"
|
|
||||||
|
|
||||||
ptx = ".version 8.5
|
|
||||||
.target sm_61
|
|
||||||
.address_size 64
|
|
||||||
|
|
||||||
.visible .entry ExpressionProcessing(
|
|
||||||
.param .u64 param_1)
|
|
||||||
{
|
|
||||||
.reg .b64 %parameter<1>;
|
|
||||||
.reg .b32 %r<4>;
|
|
||||||
.reg .pred %p<1>;
|
|
||||||
.reg .b64 %i<1>;
|
|
||||||
|
|
||||||
ld.param.u64 %i0, [param_1];
|
|
||||||
cvta.to.global.u64 %parameter0, %i0;
|
|
||||||
|
|
||||||
mov.u32 %r0, %ntid.x;
|
|
||||||
mov.u32 %r1, %ctaid.x;
|
|
||||||
mov.u32 %r2, %tid.x;
|
|
||||||
mad.lo.s32 %r3, %r0, %r1, %r2;
|
|
||||||
setp.gt.s32 %p0, %r3, 2;
|
|
||||||
@%p0 bra \$L__BB0_2;
|
|
||||||
st.global.f32 [%parameter0], 10.0;
|
|
||||||
\$L__BB0_2: ret;
|
|
||||||
}"
|
|
||||||
|
|
||||||
linker = CuLink()
|
|
||||||
add_data!(linker, "ExpressionProcessing", ptx)
|
|
||||||
|
|
||||||
image = complete(linker)
|
|
||||||
|
|
||||||
mod = CuModule(image)
|
|
||||||
func = CuFunction(mod, "ExpressionProcessing")
|
|
||||||
|
|
||||||
variableCols = 2
|
|
||||||
cudaResults = CuArray{Float32}(undef, 1)
|
|
||||||
# cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32))
|
|
||||||
# cudaResults = CUDA.fill(0f0, variableCols * length(expressions))
|
|
||||||
# cudaResults = cu(zeros(Float32, variableCols * length(expressions)))
|
|
||||||
|
|
||||||
config = launch_configuration(func)
|
|
||||||
threads = min(variableCols, config.threads)
|
|
||||||
blocks = cld(variableCols, threads)
|
|
||||||
|
|
||||||
cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1)
|
|
||||||
# launch(func, cudaResults; threads=threads, blocks=blocks)
|
|
||||||
|
|
||||||
println(Array(cudaResults))
|
|
||||||
end
|
|
||||||
|
|
||||||
# TODO: University setup at 10.20.1.7
|
|
|
@ -7,7 +7,7 @@ include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
|
||||||
include(joinpath(baseFolder, "src", "Interpreter.jl"))
|
include(joinpath(baseFolder, "src", "Interpreter.jl"))
|
||||||
include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
||||||
|
|
||||||
@testset "ExpressionExecutorCuda.jl" begin
|
@testset "Functionality tests" begin
|
||||||
# include("ExpressionProcessingTests.jl")
|
# include("ExpressionProcessingTests.jl")
|
||||||
# include("InterpreterTests.jl")
|
# include("InterpreterTests.jl")
|
||||||
include("TranspilerTests.jl")
|
include("TranspilerTests.jl")
|
||||||
|
@ -19,16 +19,5 @@ end
|
||||||
# end
|
# end
|
||||||
|
|
||||||
@testset "Performance tests" begin
|
@testset "Performance tests" begin
|
||||||
# TODO: make performance tests
|
# include("PerformanceTests.jl")
|
||||||
|
|
||||||
# Put data in shared memory:
|
|
||||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
|
||||||
|
|
||||||
# Make array const:
|
|
||||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
|
|
||||||
|
|
||||||
# Memory management like in C++ might help with performance improvements
|
|
||||||
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
|
|
||||||
|
|
||||||
|
|
||||||
end
|
end
|
Loading…
Reference in New Issue
Block a user