transpiler: trying to fix problem with writing to global memory; not yet fixed
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
Daniel 2025-03-27 18:00:31 +01:00
parent eaee21ca75
commit 561b37160b
2 changed files with 108 additions and 43 deletions

View File

@ -3,23 +3,9 @@ using CUDA
using ..ExpressionProcessing
using ..Utils
# Number of threads per block/SM + max number of registers
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
# One SM can have 64*1024 32-bit registers at max
# One thread can at max use 255 registers
# Meaning one thread has access to at most 32 registers in the worst case (64*1024 registers / 2048 threads). Using 64-bit values, this number gets halved (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
# This states, that using fewer registers allows more threads to reside on a single SM which improves performance.
# So I could use more registers at the expense of performance. Depending on how much this would simplify my algorithm, I might do this and leave further optimisation to future work
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
# With 128 registers per thread, one could have 16 warps on one SM ((128 * 32 = 4096 registers per warp) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
@ -41,6 +27,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# mod = CuModule(image)
# kernels[i] = CuFunction(mod, "ExpressionProcessing")
# end
for i in eachindex(expressions)
if haskey(cache, expressions[i])
kernels[i] = cache[expressions[i]]
@ -50,6 +37,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i)
println(kernel)
linker = CuLink()
add_data!(linker, "ExpressionProcessing", kernel)
@ -65,6 +54,12 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
# cudaResults = CUDA.zeros(variableCols * length(expressions))
# ptr = CuPtr{Float32}(C_NULL)
# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
# values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
# CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10)
# copyto!(cudaResults, z)
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
for i in eachindex(kernels)
@ -72,9 +67,14 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
threads = min(variableCols, config.threads)
blocks = cld(variableCols, threads)
cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
break
# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
end
println(Array(cudaResults))
# cudaResults = nothing
# CUDA.cuMemFree(ptr)
end
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
@ -89,16 +89,20 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
exitJumpLocationMarker = "\$L__BB0_2"
ptxBuffer = IOBuffer()
# TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation
reset_registers()
reset_symtable()
# TODO: Suboptimal solution
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params
guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results
guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets)
println(ptxBuffer, get_cuda_header())
println(ptxBuffer, signature)
println(ptxBuffer, "{")
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0",
threadIdReg, expressionIndex, nrOfVariableSets)
println(ptxBuffer, get_register_definitions())
println(ptxBuffer, paramLoading)
@ -116,7 +120,7 @@ end
# TODO: Make version, target and address_size configurable; also see what address_size means exactly
function get_cuda_header()::String
return "
.version 8.0
.version 7.1
.target sm_61
.address_size 32
"
@ -135,7 +139,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
for i in eachindex(parameters)
print(signatureBuffer, " .param .u32", " ", "param_", i)
parametersLocation = get_next_free_register("r")
parametersLocation = get_next_free_register("i")
println(paramLoadingBuffer, "ld.param.u32 $parametersLocation, [param_$i];")
println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersLocation;")
if i != lastindex(parameters)
@ -165,12 +169,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
breakCondition = get_next_free_register("p")
# breakCondition = get_next_free_register("p")
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
# println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
# branch to end if breakCondition is true
print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
# print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
return (String(take!(guardBuffer)), globalThreadId)
end
@ -181,6 +185,9 @@ end
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
return "st.global.f32 [$resultsLocation], 10.0;"
codeBuffer = IOBuffer()
operands = Vector{Operand}()
@ -223,15 +230,15 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case)
# resultsLocation[resultIndex] = "";
tempReg = get_next_free_register("i")
println(codeBuffer, "
add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
mul.lo.u32 $tempReg, $tempReg, $(sizeof(expressionIndex));
add.u32 $tempReg, $resultsLocation, $tempReg;
st.global.f32 [$tempReg], $(pop!(operands));
")
# tempReg = get_next_free_register("i")
# println(codeBuffer, "
# add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
# mul.lo.u32 $tempReg, $tempReg, $BYTES;
# add.u32 $tempReg, $resultsLocation, $tempReg;
# st.global.f32 [$tempReg], $(pop!(operands));
# ")
println(codeBuffer, "st.global.f32 [$resultsLocation], 10.0;")
println(operands)
return String(take!(codeBuffer))
end
@ -248,10 +255,9 @@ function load_into_register(register::String, loadLocation::String, valueIndex::
# loadLocation + startIndex + valueIndex * bytes (4 in our case)
# startIndex: setIndex * setSize
tempReg = get_next_free_register("i")
# we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes
return "
mul.lo.u32 $tempReg, $setIndexReg, $setSize;
add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex));
add.u32 $tempReg, $tempReg, $(valueIndex*BYTES);
add.u32 $tempReg, $loadLocation, $tempReg;
ld.global.f32 $register, [$tempReg];"
end
@ -316,6 +322,12 @@ end
let registers = Dict() # stores the count of the register already used.
global get_next_free_register
global get_register_definitions
global reset_registers
# Forget all register counts recorded so far, so that the next transpilation starts
# numbering its registers from scratch.
#
# The dictionary is emptied in place instead of being rebound to a fresh `Dict()`:
# assigning to a captured variable from inside a closure makes Julia box that variable,
# which slows down every other closure that shares it.
#
# Returns the (now empty) register dictionary.
function reset_registers()
    return empty!(registers)
end
# By convention these names correspond to the following types:
# - p -> pred
@ -363,6 +375,11 @@ end
let symtable = Dict()
global get_register_for_name
global reset_symtable
# Drop every entry of the symbol table, so that the next transpilation does not see
# names registered by a previous one.
#
# The dictionary is emptied in place instead of being rebound to a fresh `Dict()`:
# assigning to a captured variable from inside a closure makes Julia box that variable,
# which slows down every other closure that shares it.
#
# Returns the (now empty) symbol table.
function reset_symtable()
    return empty!(symtable)
end
"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
function get_register_for_name(varName::String)

View File

@ -27,18 +27,18 @@ parameters[2][2] = 0.0
push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
# generatedCode = Transpiler.transpile(postfixExpr)
generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
println(generatedCode)
# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
# println(generatedCode)
# CUDA.@sync interpret(postfixExprs, variables, parameters)
# This is just here for testing. This will be called inside the execute method in the Transpiler module
linker = CuLink()
add_data!(linker, "ExpressionProcessing", generatedCode)
# linker = CuLink()
# add_data!(linker, "ExpressionProcessing", generatedCode)
image = complete(linker)
# image = complete(linker)
mod = CuModule(image)
func = CuFunction(mod, "ExpressionProcessing")
# mod = CuModule(image)
# func = CuFunction(mod, "ExpressionProcessing")
end
@testset "Test transpiler evaluation" begin
@ -46,7 +46,55 @@ end
# push!(postfixExprs, expressions[1])
# push!(postfixExprs, expressions[2])
@time Transpiler.evaluate(expressions, variables, parameters)
expr = Vector{Expr}()
push!(expr, expressions[1])
# @time Transpiler.evaluate(expr, variables, parameters)
end
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
# Minimal reproduction of the "write to global memory" problem: a hand-written PTX kernel
# that receives one device pointer and stores the constant 10.0 into its first element.
@testset "TEMP" begin
    # The module is declared with `.address_size 64`, and the kernel is launched with a
    # `CuPtr{Float32}` argument. The pointer parameter, the registers holding the address
    # and the `cvta` conversion therefore all use `.u64`. Loading the parameter as `.u32`
    # would drop the upper half of the address, so the store would miss the buffer.
    # The constant is first moved into an f32 register (0f41200000 is the exact
    # single-precision bit pattern of 10.0), so the store only uses register operands.
    ptx = "
.version 7.1
.target sm_61
.address_size 64
.visible .entry ExpressionProcessing(
.param .u64 param_1)
{
.reg .u64 %parameter<1>;
.reg .u64 %i<1>;
.reg .f32 %f<1>;
ld.param.u64 %i0, [param_1];
cvta.to.global.u64 %parameter0, %i0;
mov.f32 %f0, 0f41200000;
st.global.f32 [%parameter0], %f0;
ret;
}"

    # Link the PTX and fetch the kernel handle, exactly as the transpiler does.
    linker = CuLink()
    add_data!(linker, "ExpressionProcessing", ptx)
    image = complete(linker)
    mod = CuModule(image)
    func = CuFunction(mod, "ExpressionProcessing")

    # Start from a zeroed buffer so that leftover device memory cannot make the check
    # pass by accident.
    cudaResults = CUDA.zeros(Float32, 1)

    # A single thread is sufficient: the kernel writes exactly one element.
    cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1)

    result = Array(cudaResults)
    println(result)
    @test result == Float32[10.0]
end
# TODO: University setup at 10.20.1.7