transpiler: generates valid PTX and evaluates expressions correctly
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
Daniel 2025-03-28 19:32:48 +01:00
parent 9df78ca72e
commit effd477558
5 changed files with 195 additions and 306 deletions

View File

@ -9,7 +9,7 @@ const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register const Operand = Union{Float32, String} # Operand is either fixed value or register
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}) function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
varRows = size(variables, 1) varRows = size(variables, 1)
variableCols = size(variables, 2) variableCols = size(variables, 2)
kernels = Vector{CuFunction}(undef, length(expressions)) kernels = Vector{CuFunction}(undef, length(expressions))
@ -35,9 +35,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
end end
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i]) formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i) kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
println(kernel)
linker = CuLink() linker = CuLink()
add_data!(linker, "ExpressionProcessing", kernel) add_data!(linker, "ExpressionProcessing", kernel)
@ -49,17 +47,11 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
cache[expressions[i]] = kernels[i] cache[expressions[i]] = kernels[i]
end end
cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info) cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info) cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
cudaResults = CuArray{Float32}(undef, variableCols * length(expressions)) cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
# cudaResults = CUDA.zeros(variableCols * length(expressions))
# ptr = CuPtr{Float32}(C_NULL)
# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
# values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
# CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10)
# copyto!(cudaResults, z)
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance) # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
for i in eachindex(kernels) for i in eachindex(kernels)
@ -67,15 +59,10 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
threads = min(variableCols, config.threads) threads = min(variableCols, config.threads)
blocks = cld(variableCols, threads) blocks = cld(variableCols, threads)
# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
cudacall(kernels[i], (CuPtr{Float32},), cudaResults; threads=threads, blocks=blocks)
# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
# launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
end end
println(Array(cudaResults)) return cudaResults
# cudaResults = nothing
# CUDA.cuMemFree(ptr)
end end
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
@ -83,29 +70,26 @@ end
" "
- param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix) - param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix)
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent on the longest parameter set - param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent on the longest parameter set
- param ```expressionIndex```: The 1-based index of the expression - param ```expressionIndex```: The 0-based index of the expression
" "
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer, function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
nrOfVariableSets::Integer, expressionIndex::Integer)::String nrOfVariableSets::Integer, expressionIndex::Integer)::String
exitJumpLocationMarker = "\$L__BB0_2" exitJumpLocationMarker = "\$L__BB0_2"
ptxBuffer = IOBuffer() ptxBuffer = IOBuffer()
regManager = Utils.RegisterManager(Dict(), Dict())
# TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation
reset_registers()
reset_symtable()
# TODO: Suboptimal solution # TODO: Suboptimal solution
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
println(ptxBuffer, get_cuda_header()) println(ptxBuffer, get_cuda_header())
println(ptxBuffer, signature) println(ptxBuffer, signature)
println(ptxBuffer, "{") println(ptxBuffer, "{")
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0", calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
threadIdReg, expressionIndex, nrOfVariableSets) threadId64Reg, expressionIndex, nrOfVariableSets, regManager)
println(ptxBuffer, get_register_definitions()) println(ptxBuffer, Utils.get_register_definitions(regManager))
println(ptxBuffer, paramLoading) println(ptxBuffer, paramLoading)
println(ptxBuffer, guardClause) println(ptxBuffer, guardClause)
println(ptxBuffer, calc_code) println(ptxBuffer, calc_code)
@ -123,14 +107,15 @@ function get_cuda_header()::String
return " return "
.version 8.5 .version 8.5
.target sm_61 .target sm_61
.address_size 64 .address_size 32
" "
end end
" "
param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result
" "
function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::Tuple{String, String} function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, regManager::Utils.RegisterManager)::Tuple{String, String}
signatureBuffer = IOBuffer() signatureBuffer = IOBuffer()
paramLoadingBuffer = IOBuffer() paramLoadingBuffer = IOBuffer()
print(signatureBuffer, ".visible .entry ") print(signatureBuffer, ".visible .entry ")
@ -140,9 +125,9 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
for i in eachindex(parameters) for i in eachindex(parameters)
print(signatureBuffer, " .param .u64", " ", "param_", i) print(signatureBuffer, " .param .u64", " ", "param_", i)
parametersLocation = get_next_free_register("i") parametersLocation = Utils.get_next_free_register(regManager, "rd")
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];") println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
println(paramLoadingBuffer, "cvta.to.global.u64 $(get_next_free_register("parameter")), $parametersLocation;") println(paramLoadingBuffer, "cvta.to.global.u64 $(Utils.get_next_free_register(regManager, "parameter")), $parametersLocation;")
if i != lastindex(parameters) if i != lastindex(parameters)
println(signatureBuffer, ",") println(signatureBuffer, ",")
end end
@ -157,27 +142,30 @@ Constructs the PTX code used for handling the case where too many threads are st
- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel - param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
" "
function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple{String, String} function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::Tuple{String, String}
guardBuffer = IOBuffer() guardBuffer = IOBuffer()
threadIds = get_next_free_register("r") threadIds = Utils.get_next_free_register(regManager, "r")
threadsPerCTA = get_next_free_register("r") threadsPerCTA = Utils.get_next_free_register(regManager, "r")
currentThreadId = get_next_free_register("r") currentThreadId = Utils.get_next_free_register(regManager, "r")
# load data into above defined registers
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;") println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;") println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;") println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
breakCondition = get_next_free_register("p") breakCondition = Utils.get_next_free_register(regManager, "p")
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;") println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
# branch to end if breakCondition is true # branch to end if breakCondition is true
print(guardBuffer, "@$breakCondition bra $exitJumpLocation;") println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
return (String(take!(guardBuffer)), globalThreadId) # Convert threadIdReg to a 64 bit register. Not 64 bit from the start, as this would take up more registers. Performance tests can be performed to determin if it is faster doing this, or making everything 64-bit from the start
threadId64Reg = Utils.get_next_free_register(regManager, "rd")
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
return (String(take!(guardBuffer)), threadId64Reg)
end end
" "
@ -185,13 +173,14 @@ end
" "
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer, function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String threadId64Reg::String, expressionIndex::Integer, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::String
# return "st.global.f32 [$resultsLocation], 10.0;"
codeBuffer = IOBuffer() codeBuffer = IOBuffer()
operands = Vector{Operand}() operands = Vector{Operand}()
exprId64Reg = Utils.get_next_free_register(regManager, "rd")
println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")
for i in eachindex(expression) for i in eachindex(expression)
token = expression[i] token = expression[i]
@ -207,38 +196,37 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
else else
left = pop!(operands) left = pop!(operands)
end end
operation, resultRegister = get_operation(operator, left, right) operation, resultRegister = get_operation(operator, regManager, left, right)
println(codeBuffer, operation) println(codeBuffer, operation)
push!(operands, resultRegister) push!(operands, resultRegister)
elseif token.Type == INDEX elseif token.Type == INDEX
if token.Value > 0 # varaibles if token.Value > 0 # varaibles
var, first_access = get_register_for_name("x$(token.Value)") var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
if first_access if first_access
println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadIdReg, variablesSetSize)) println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
end end
push!(operands, var) push!(operands, var)
else else
absVal = abs(token.Value) absVal = abs(token.Value)
param, first_access = get_register_for_name("p$absVal") param, first_access = Utils.get_register_for_name(regManager, "p$absVal")
if first_access if first_access
println(codeBuffer, load_into_register(param, parametersLocation, absVal, threadIdReg, parametersSetSize)) println(codeBuffer, load_into_register(param, parametersLocation, absVal, exprId64Reg, parametersSetSize, regManager))
end end
push!(operands, param) push!(operands, param)
end end
end end
end end
# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case) tempReg = Utils.get_next_free_register(regManager, "rd")
# resultsLocation[resultIndex] = ""; # reg = pop!(operands)
# tempReg = get_next_free_register("i") # tmp = "abs.f32 $(reg), 16.0;"
# println(codeBuffer, " # push!(operands, reg)
# add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg; println(codeBuffer, "
# mul.lo.u32 $tempReg, $tempReg, $BYTES; add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
# add.u32 $tempReg, $resultsLocation, $tempReg; mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
# st.global.f32 [$tempReg], $(pop!(operands)); st.global.f32 [$tempReg], $(pop!(operands));
# ") ")
println(codeBuffer, "st.global.f32 [$resultsLocation], 10.0;")
return String(take!(codeBuffer)) return String(take!(codeBuffer))
end end
@ -248,18 +236,17 @@ Loads a value from a location into the given register. It is assumed that the lo
- param ```register```: The register where the loaded value will be stored - param ```register```: The register where the loaded value will be stored
- param ```loadLocation```: The location from where to load the value - param ```loadLocation```: The location from where to load the value
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set - param ```valueIndex```: 1-based index of the value in the variable set/parameter set
- param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId - param ```setIndexReg64```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set)) - param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
" "
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg64::String, setSize::Integer, regManager::Utils.RegisterManager)::String
# loadLocation + startIndex + valueIndex * bytes (4 in our case) tempReg = Utils.get_next_free_register(regManager, "rd")
# startIndex: setIndex * setSize
tempReg = get_next_free_register("i") # "mad" calculates the offset and "add" applies the offset. Classical pointer arithmetic for accessing values of an array like in C
return " return "
mul.lo.u32 $tempReg, $setIndexReg, $setSize; mad.lo.u64 $tempReg, $setIndexReg64, $(setSize*BYTES), $((valueIndex - 1) * BYTES);
add.u32 $tempReg, $tempReg, $(valueIndex*BYTES); add.u64 $tempReg, $loadLocation, $tempReg;
add.u32 $tempReg, $loadLocation, $tempReg;
ld.global.f32 $register, [$tempReg];" ld.global.f32 $register, [$tempReg];"
end end
@ -275,8 +262,8 @@ function type_to_ptx_type(type::DataType)::String
end end
end end
function get_operation(operator::Operator, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String} function get_operation(operator::Operator, regManager::Utils.RegisterManager, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
resultRegister = get_next_free_register("f") resultRegister = Utils.get_next_free_register(regManager, "f")
resultCode = "" resultCode = ""
if is_binary_operator(operator) && isnothing(right) if is_binary_operator(operator) && isnothing(right)
@ -320,79 +307,5 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
return (resultCode, resultRegister) return (resultCode, resultRegister)
end end
# Register bookkeeping shared by the three closures below.
# `registers` maps a register name prefix (e.g. "r", "f") to the number of registers handed out for it so far.
let registers = Dict{String, Int}()
    global get_next_free_register
    global get_register_definitions
    global reset_registers

    # Forgets every register handed out so far, so that numbering starts at 0 again.
    function reset_registers()
        # Empty the dictionary in place instead of rebinding `registers`: reassigning a variable that is
        # captured by closures forces Julia to box it, which makes every access in the closures type-unstable.
        return empty!(registers)
    end

    # By convention these names correspond to the following types:
    # - p -> pred
    # - f -> float32
    # - r -> 32 bit
    # - var -> float32 (used for variables and params)
    #
    # Returns the next unused register for the prefix `name`, formatted as "%<name><index>" with a 0-based index.
    function get_next_free_register(name::String)::String
        index = get(registers, name, 0) # number handed out so far == 0-based index of the next free one
        registers[name] = index + 1
        return string("%", name, index)
    end

    # Returns one ".reg <type> %<name><<count>>;" line per register prefix that has been used.
    # Throws an `ArgumentError` if a prefix was used that has no known PTX type.
    function get_register_definitions()::String
        ptxTypes = Dict(
            "p" => ".pred",
            "f" => ".f32",
            "var" => ".f32",
            "param" => ".f32",
            "r" => ".b32",
            "parameter" => ".b64",
            "i" => ".b64",
        )

        registersBuffer = IOBuffer()
        for (name, count) in registers
            regType = get(ptxTypes, name, nothing)
            if isnothing(regType)
                throw(ArgumentError("Unknown register name used. Name '$(name)' cannot be mapped to a PTX type."))
            end
            println(registersBuffer, ".reg $regType %$(name)<$(count)>;")
        end

        return String(take!(registersBuffer))
    end
end
# Symbol table shared by the two closures below.
# `symtable` maps the name of a variable/parameter to the register that was assigned to it.
let symtable = Dict{String, String}()
    global get_register_for_name
    global reset_symtable

    # Forgets all name -> register assignments.
    function reset_symtable()
        # Empty the dictionary in place instead of rebinding `symtable`: reassigning a variable that is
        # captured by closures forces Julia to box it, which makes every access in the closures type-unstable.
        return empty!(symtable)
    end

    "Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
    function get_register_for_name(varName::String)
        # Known name: hand back the register that was assigned on first use
        haskey(symtable, varName) && return (symtable[varName], false)

        # First use: reserve a fresh "var" register and remember it for later lookups
        reg = get_next_free_register("var")
        symtable[varName] = reg
        return (reg, true)
    end
end
end end

View File

@ -6,39 +6,83 @@ using CUDA
All entries that cannot be filled have ```invalidElement``` as their value All entries that cannot be filled have ```invalidElement``` as their value
" "
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T function convert_to_matrix(vecs::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
vecCols = get_max_inner_length(vec) maxLength = get_max_inner_length(vecs)
vecRows = length(vec)
vecMat = fill(invalidElement, vecCols, vecRows)
for i in eachindex(vec) # Pad the shorter vectors with the invalidElement
vecMat[:,i] = copyto!(vecMat[:,i], vec[i]) paddedVecs = [vcat(vec, fill(invalidElement, maxLength - length(vec))) for vec in vecs]
end vecMat = hcat(paddedVecs...)
return vecMat return vecMat
end end
"Retrieves the number of entries for the largest inner vector" "Retrieves the number of entries for the largest inner vector"
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T function get_max_inner_length(vecs::Vector{Vector{T}})::Int where T
maxLength = 0 return maximum(length.(vecs))
@inbounds for i in eachindex(vec)
if length(vec[i]) > maxLength
maxLength = length(vec[i])
end
end
return maxLength
end end
"Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```" "Returns a CuArray filed with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
dataCols = Utils.get_max_inner_length(data) dataMat = convert_to_matrix(data, invalidElement)
dataRows = length(data) cudaArr = CuArray(dataMat)
dataMat = Utils.convert_to_matrix(data, invalidElement)
cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions
copyto!(cudaArr, dataMat)
return cudaArr return cudaArr
end end
"""
    RegisterManager(registers, symtable)

Bookkeeping for the registers used while generating one kernel.

- `registers`: maps a register name prefix (e.g. `"r"`, `"f"`, `"rd"`) to the number of registers that have
  been handed out for this prefix so far
- `symtable`: maps the name of a variable/parameter to the register that was assigned to it

The fields are concretely typed so that accesses are type-stable. Empty untyped dictionaries are still
accepted (`RegisterManager(Dict(), Dict())`), because the default constructor converts its arguments to the
field types. Note that such a conversion stores a converted copy and not the dictionary that was passed in.
"""
struct RegisterManager
    registers::Dict{String, Int}
    symtable::Dict{String, String}
end
"Returns the next unused register for the prefix ```name``` in the form ```%<name><index>``` (0-based index) and records it in ```manager```."
function get_next_free_register(manager::RegisterManager, name::String)::String
    # The number of registers handed out so far is also the 0-based index of the next free one
    nextIndex = get(manager.registers, name, 0)
    manager.registers[name] = nextIndex + 1

    return string("%", name, nextIndex)
end
"Returns one ```.reg <type> %<name><<count>>;``` line for every register prefix stored in ```manager```. Throws an ArgumentError for a prefix without a known PTX type."
function get_register_definitions(manager::RegisterManager)::String
    # Register name prefix -> PTX register type
    ptxTypes = Dict(
        "p" => ".pred",
        "f" => ".f32",
        "var" => ".f32",
        "param" => ".f32",
        "r" => ".b32",
        "rd" => ".b64",
        "parameter" => ".b64",
        "i" => ".b64",
    )

    definitions = IOBuffer()
    for (prefix, count) in manager.registers
        ptxType = get(ptxTypes, prefix, nothing)
        if isnothing(ptxType)
            throw(ArgumentError("Unknown register name used. Name '$(prefix)' cannot be mapped to a PTX type."))
        end

        println(definitions, ".reg $ptxType %$(prefix)<$(count)>;")
    end

    return String(take!(definitions))
end
"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
function get_register_for_name(manager::RegisterManager, varName::String)
    # Already known: reuse the register that was assigned on first use
    haskey(manager.symtable, varName) && return (manager.symtable[varName], false)

    # First use: reserve a fresh "var" register and remember it under this name
    freshRegister = get_next_free_register(manager, "var")
    manager.symtable[varName] = freshRegister
    return (freshRegister, true)
end
end end

View File

@ -1,6 +1,8 @@
using .Transpiler using .Transpiler
using .Interpreter using .Interpreter
# University setup at 10.20.1.7 if needed
@testset "CPU performance" begin @testset "CPU performance" begin
function test_cpu_interpreter(nrows; parallel = false) function test_cpu_interpreter(nrows; parallel = false)
exprs = [ exprs = [
@ -39,9 +41,23 @@ LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
end end
@testset "Interpreter Performance" begin @testset "Interpreter Performance" begin
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
# Make array const:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
# Memory management like in C++ might help with performance improvements
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end end
@testset "Transpiler Performance" begin @testset "Transpiler Performance" begin
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
# Make array const:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
# Memory management like in C++ might help with performance improvements
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end end

View File

@ -2,138 +2,65 @@ using CUDA
using .ExpressionProcessing using .ExpressionProcessing
using .Transpiler using .Transpiler
expressions = Vector{Expr}(undef, 2) expressions = Vector{Expr}(undef, 3)
variables = Matrix{Float32}(undef, 2,2) variables = Matrix{Float32}(undef, 5, 4)
parameters = Vector{Vector{Float32}}(undef, 2) parameters = Vector{Vector{Float32}}(undef, 3)
# Resulting value should be 1.14... for the first expression
expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4)) expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4))
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2) expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3)
expressions[3] = :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5))
variables[1,1] = 2.0 variables[1,1] = 2.0
variables[2,1] = 3.0 variables[2,1] = 3.0
variables[1,2] = 0.0 variables[3,1] = 0.0
variables[4,1] = 1.0
variables[5,1] = 0.0
variables[1,2] = 2.0
variables[2,2] = 5.0 variables[2,2] = 5.0
parameters[1] = Vector{Float32}(undef, 1) variables[3,2] = 3.0
variables[4,2] = 0.0
variables[5,2] = 0.0
variables[1,3] = 6.0
variables[2,3] = 2.0
variables[3,3] = 2.0
variables[4,3] = 4.0
variables[5,3] = 2.0
variables[1,4] = 1.0
variables[2,4] = 2.0
variables[3,4] = 3.0
variables[4,4] = 4.0
variables[5,4] = 5.0
parameters[1] = Vector{Float32}(undef, 0)
parameters[2] = Vector{Float32}(undef, 2) parameters[2] = Vector{Float32}(undef, 2)
parameters[1][1] = 5.0 parameters[3] = Vector{Float32}(undef, 1)
parameters[2][1] = 5.0 parameters[2][1] = 5.0
parameters[2][2] = 0.0 parameters[2][2] = 0.0
parameters[3][1] = 16.0
@testset "Test TMP transpiler" begin
postfixExpr = expr_to_postfix(expressions[1])
postfixExprs = Vector([postfixExpr])
push!(postfixExprs, expr_to_postfix(expressions[2]))
push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
# generatedCode = Transpiler.transpile(postfixExpr)
# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
# println(generatedCode)
# CUDA.@sync interpret(postfixExprs, variables, parameters)
# This is just here for testing. This will be called inside the execute method in the Transpiler module
# linker = CuLink()
# add_data!(linker, "ExpressionProcessing", generatedCode)
# image = complete(linker)
# mod = CuModule(image)
# func = CuFunction(mod, "ExpressionProcessing")
end
@testset "Test transpiler evaluation" begin @testset "Test transpiler evaluation" begin
# postfixExprs = Vector{Expr}() results = Transpiler.evaluate(expressions, variables, parameters)
# push!(postfixExprs, expressions[1])
# push!(postfixExprs, expressions[2])
expr = Vector{Expr}() # dump(expressions[3]; maxdepth=10)
push!(expr, expressions[1]) # Expr 1:
@time Transpiler.evaluate(expr, variables, parameters) @test isapprox(results[1,1], 1.14286)
@test isapprox(results[2,1], 1.14286)
@test isapprox(results[3,1], 1.14286)
@test isapprox(results[4,1], 1.14286)
#Expr 2:
@test isapprox(results[1,2], 16.0)
@test isapprox(results[2,2], 25.0)
@test isapprox(results[3,2], 54.0)
@test isapprox(results[4,2], 14.0)
#Expr3:
@test isapprox(results[1,3], -0.07580)
@test isapprox(results[2,3], 0.55452)
@test isapprox(results[3,3], 12.19446)
@test isapprox(results[4,3], -67.41316)
end end
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)" # TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
# Minimal kernel body: stores the Float32 value 10 in the first element of `results` and returns nothing.
function test_kernel(results)
    # Bounds checking is disabled for this single store, exactly as in the indexing-assignment form
    @inbounds setindex!(results, 10.0f0, 1)
    return nothing
end
# Scratch test: links hand-written PTX for a kernel named "ExpressionProcessing", loads it as a CuFunction
# and launches it with a single result buffer as its only argument.
@testset "TEMP" begin
# NOTE(review): this early `return` is presumably meant to skip everything below — confirm
return
results = CuArray{Float32}(undef, 2)
# @device_code_ptx @cuda test_kernel(results)
# println(CUDA.code_ptx(kernel.fun, ))
# return
# First PTX variant: loads the pointer passed as param_1, converts it to a global address and stores 10.0 there.
# This string is overwritten by the second assignment to `ptx` below before it is ever linked.
ptx = "
.version 8.5
.target sm_61
.address_size 64
.visible .entry ExpressionProcessing(
.param .u64 param_1)
{
.reg .b64 %parameter<1>;
.reg .b64 %i<1>;
//.reg .b64 %rd<6>;
ld.param.u64 %i0, [param_1];
cvta.to.global.u64 %parameter0, %i0;
st.global.f32 [%parameter0], 10.0;
ret;
}"
# Second PTX variant (the one that is actually linked): same store as above, but guarded.
# %r3 = %ntid.x * %ctaid.x + %tid.x, and the store is skipped by branching to the `ret` label when %r3 > 2.
ptx = ".version 8.5
.target sm_61
.address_size 64
.visible .entry ExpressionProcessing(
.param .u64 param_1)
{
.reg .b64 %parameter<1>;
.reg .b32 %r<4>;
.reg .pred %p<1>;
.reg .b64 %i<1>;
ld.param.u64 %i0, [param_1];
cvta.to.global.u64 %parameter0, %i0;
mov.u32 %r0, %ntid.x;
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r0, %r1, %r2;
setp.gt.s32 %p0, %r3, 2;
@%p0 bra \$L__BB0_2;
st.global.f32 [%parameter0], 10.0;
\$L__BB0_2: ret;
}"
# Link the PTX text into an image, load it as a module and look up the kernel by its entry name
linker = CuLink()
add_data!(linker, "ExpressionProcessing", ptx)
image = complete(linker)
mod = CuModule(image)
func = CuFunction(mod, "ExpressionProcessing")
variableCols = 2
# Result buffer with a single Float32 element; the kernel above only ever writes to the address it is given
cudaResults = CuArray{Float32}(undef, 1)
# cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32))
# cudaResults = CUDA.fill(0f0, variableCols * length(expressions))
# cudaResults = cu(zeros(Float32, variableCols * length(expressions)))
# NOTE(review): `threads` and `blocks` are computed here but the cudacall below passes the
# hard-coded values threads=4 and blocks=1 instead — confirm which launch size is intended
config = launch_configuration(func)
threads = min(variableCols, config.threads)
blocks = cld(variableCols, threads)
cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1)
# launch(func, cudaResults; threads=threads, blocks=blocks)
# Copy the result back to the host and print it for manual inspection
println(Array(cudaResults))
end
# TODO: University setup at 10.20.1.7

View File

@ -7,7 +7,7 @@ include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
include(joinpath(baseFolder, "src", "Interpreter.jl")) include(joinpath(baseFolder, "src", "Interpreter.jl"))
include(joinpath(baseFolder, "src", "Transpiler.jl")) include(joinpath(baseFolder, "src", "Transpiler.jl"))
@testset "ExpressionExecutorCuda.jl" begin @testset "Functionality tests" begin
# include("ExpressionProcessingTests.jl") # include("ExpressionProcessingTests.jl")
# include("InterpreterTests.jl") # include("InterpreterTests.jl")
include("TranspilerTests.jl") include("TranspilerTests.jl")
@ -19,16 +19,5 @@ end
# end # end
@testset "Performance tests" begin @testset "Performance tests" begin
# TODO: make performance tests # include("PerformanceTests.jl")
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
# Make array const:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
# Memory management like in C++ might help with performance improvements
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end end