transpiler: trying to fix problem with writing to global memory; not yet fixed
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
eaee21ca75
commit
561b37160b
|
@ -3,23 +3,9 @@ using CUDA
|
|||
using ..ExpressionProcessing
|
||||
using ..Utils
|
||||
|
||||
# Number of threads per block/SM + max number of registers
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
|
||||
# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
|
||||
# One SM can have 64*1024 32-bit registers at max
|
||||
# One thread can at max use 255 registers
|
||||
# Meaning one has access to at most 32 registers in the worst case. Using 64 bit values this number gets halved (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
|
||||
|
||||
# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
|
||||
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
|
||||
# This states that using fewer registers allows more threads to reside on a single SM, which improves performance.
|
||||
# So I could use more registers at the expense of performance. Depending on how much this would simplify my algorithm, I might do this and leave further optimisation to future work
|
||||
|
||||
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
|
||||
# With 128 registers per thread, one could have 16 warps on one SM ((128 * 32 = 4096 registers per warp) * 16 == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
|
||||
|
||||
|
||||
const BYTES = sizeof(Float32)
|
||||
const Operand = Union{Float32, String} # Operand is either fixed value or register
|
||||
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
|
||||
|
||||
|
@ -41,6 +27,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
|||
# mod = CuModule(image)
|
||||
# kernels[i] = CuFunction(mod, "ExpressionProcessing")
|
||||
# end
|
||||
|
||||
for i in eachindex(expressions)
|
||||
if haskey(cache, expressions[i])
|
||||
kernels[i] = cache[expressions[i]]
|
||||
|
@ -50,6 +37,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
|||
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
|
||||
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i)
|
||||
|
||||
println(kernel)
|
||||
|
||||
linker = CuLink()
|
||||
add_data!(linker, "ExpressionProcessing", kernel)
|
||||
|
||||
|
@ -65,6 +54,12 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
|||
|
||||
# each expression produces one result per variable set (nr. of columns of the variables), and there are n expressions
|
||||
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
|
||||
# cudaResults = CUDA.zeros(variableCols * length(expressions))
|
||||
# ptr = CuPtr{Float32}(C_NULL)
|
||||
# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
|
||||
# values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
|
||||
# CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10)
|
||||
# copyto!(cudaResults, z)
|
||||
|
||||
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
|
||||
for i in eachindex(kernels)
|
||||
|
@ -72,9 +67,14 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
|||
threads = min(variableCols, config.threads)
|
||||
blocks = cld(variableCols, threads)
|
||||
|
||||
cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
break
|
||||
# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
|
||||
end
|
||||
|
||||
println(Array(cudaResults))
|
||||
# cudaResults = nothing
|
||||
# CUDA.cuMemFree(ptr)
|
||||
end
|
||||
|
||||
# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
|
||||
|
@ -89,16 +89,20 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
|
|||
exitJumpLocationMarker = "\$L__BB0_2"
|
||||
ptxBuffer = IOBuffer()
|
||||
|
||||
# TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation
|
||||
reset_registers()
|
||||
reset_symtable()
|
||||
|
||||
# TODO: Suboptimal solution
|
||||
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params
|
||||
guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
|
||||
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results
|
||||
guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets)
|
||||
|
||||
println(ptxBuffer, get_cuda_header())
|
||||
println(ptxBuffer, signature)
|
||||
println(ptxBuffer, "{")
|
||||
|
||||
|
||||
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
|
||||
calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0",
|
||||
threadIdReg, expressionIndex, nrOfVariableSets)
|
||||
println(ptxBuffer, get_register_definitions())
|
||||
println(ptxBuffer, paramLoading)
|
||||
|
@ -116,7 +120,7 @@ end
|
|||
# TODO: Make version, target and address_size configurable; also see what address_size means exactly
|
||||
function get_cuda_header()::String
|
||||
return "
|
||||
.version 8.0
|
||||
.version 7.1
|
||||
.target sm_61
|
||||
.address_size 32
|
||||
"
|
||||
|
@ -135,7 +139,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
|
|||
for i in eachindex(parameters)
|
||||
print(signatureBuffer, " .param .u32", " ", "param_", i)
|
||||
|
||||
parametersLocation = get_next_free_register("r")
|
||||
parametersLocation = get_next_free_register("i")
|
||||
println(paramLoadingBuffer, "ld.param.u32 $parametersLocation, [param_$i];")
|
||||
println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersLocation;")
|
||||
if i != lastindex(parameters)
|
||||
|
@ -165,12 +169,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple
|
|||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||
|
||||
globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
|
||||
breakCondition = get_next_free_register("p")
|
||||
# breakCondition = get_next_free_register("p")
|
||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||
println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
# println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
|
||||
# branch to end if breakCondition is true
|
||||
print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
# print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
|
||||
return (String(take!(guardBuffer)), globalThreadId)
|
||||
end
|
||||
|
@ -181,6 +185,9 @@ end
|
|||
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
|
||||
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
|
||||
threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
|
||||
|
||||
return "st.global.f32 [$resultsLocation], 10.0;"
|
||||
|
||||
codeBuffer = IOBuffer()
|
||||
operands = Vector{Operand}()
|
||||
|
||||
|
@ -223,15 +230,15 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
|
|||
|
||||
# resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case)
|
||||
# resultsLocation[resultIndex] = "";
|
||||
tempReg = get_next_free_register("i")
|
||||
println(codeBuffer, "
|
||||
add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
|
||||
mul.lo.u32 $tempReg, $tempReg, $(sizeof(expressionIndex));
|
||||
add.u32 $tempReg, $resultsLocation, $tempReg;
|
||||
st.global.f32 [$tempReg], $(pop!(operands));
|
||||
")
|
||||
# tempReg = get_next_free_register("i")
|
||||
# println(codeBuffer, "
|
||||
# add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg;
|
||||
# mul.lo.u32 $tempReg, $tempReg, $BYTES;
|
||||
# add.u32 $tempReg, $resultsLocation, $tempReg;
|
||||
# st.global.f32 [$tempReg], $(pop!(operands));
|
||||
# ")
|
||||
println(codeBuffer, "st.global.f32 [$resultsLocation], 10.0;")
|
||||
|
||||
println(operands)
|
||||
return String(take!(codeBuffer))
|
||||
end
|
||||
|
||||
|
@ -248,10 +255,9 @@ function load_into_register(register::String, loadLocation::String, valueIndex::
|
|||
# loadLocation + startIndex + valueIndex * bytes (4 in our case)
|
||||
# startIndex: setIndex * setSize
|
||||
tempReg = get_next_free_register("i")
|
||||
# we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes
|
||||
return "
|
||||
mul.lo.u32 $tempReg, $setIndexReg, $setSize;
|
||||
add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex));
|
||||
add.u32 $tempReg, $tempReg, $(valueIndex*BYTES);
|
||||
add.u32 $tempReg, $loadLocation, $tempReg;
|
||||
ld.global.f32 $register, [$tempReg];"
|
||||
end
|
||||
|
@ -316,6 +322,12 @@ end
|
|||
let registers = Dict() # stores the count of the register already used.
|
||||
global get_next_free_register
|
||||
global get_register_definitions
|
||||
global reset_registers
|
||||
|
||||
function reset_registers()
|
||||
registers = Dict()
|
||||
end
|
||||
|
||||
|
||||
# By convention these names correspond to the following types:
|
||||
# - p -> pred
|
||||
|
@ -363,6 +375,11 @@ end
|
|||
|
||||
let symtable = Dict()
|
||||
global get_register_for_name
|
||||
global reset_symtable
|
||||
|
||||
function reset_symtable()
|
||||
symtable = Dict()
|
||||
end
|
||||
|
||||
"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
|
||||
function get_register_for_name(varName::String)
|
||||
|
|
|
@ -27,18 +27,18 @@ parameters[2][2] = 0.0
|
|||
push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
|
||||
|
||||
# generatedCode = Transpiler.transpile(postfixExpr)
|
||||
generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
|
||||
println(generatedCode)
|
||||
# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
|
||||
# println(generatedCode)
|
||||
# CUDA.@sync interpret(postfixExprs, variables, parameters)
|
||||
|
||||
# This is just here for testing. This will be called inside the execute method in the Transpiler module
|
||||
linker = CuLink()
|
||||
add_data!(linker, "ExpressionProcessing", generatedCode)
|
||||
# linker = CuLink()
|
||||
# add_data!(linker, "ExpressionProcessing", generatedCode)
|
||||
|
||||
image = complete(linker)
|
||||
# image = complete(linker)
|
||||
|
||||
mod = CuModule(image)
|
||||
func = CuFunction(mod, "ExpressionProcessing")
|
||||
# mod = CuModule(image)
|
||||
# func = CuFunction(mod, "ExpressionProcessing")
|
||||
end
|
||||
|
||||
@testset "Test transpiler evaluation" begin
|
||||
|
@ -46,7 +46,55 @@ end
|
|||
# push!(postfixExprs, expressions[1])
|
||||
# push!(postfixExprs, expressions[2])
|
||||
|
||||
@time Transpiler.evaluate(expressions, variables, parameters)
|
||||
expr = Vector{Expr}()
|
||||
push!(expr, expressions[1])
|
||||
# @time Transpiler.evaluate(expr, variables, parameters)
|
||||
end
|
||||
|
||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
|
||||
|
||||
@testset "TEMP" begin
|
||||
ptx = "
|
||||
.version 7.1
|
||||
.target sm_61
|
||||
.address_size 64
|
||||
|
||||
.visible .entry ExpressionProcessing(
|
||||
.param .u32 param_1)
|
||||
{
|
||||
.reg .u32 %parameter<1>;
|
||||
.reg .u32 %i<1>;
|
||||
|
||||
ld.param.u32 %i0, [param_1];
|
||||
cvta.to.global.u32 %parameter0, %i0;
|
||||
|
||||
st.global.f32 [%parameter0], 10.0;
|
||||
ret;
|
||||
}"
|
||||
|
||||
linker = CuLink()
|
||||
add_data!(linker, "ExpressionProcessing", ptx)
|
||||
|
||||
image = complete(linker)
|
||||
|
||||
mod = CuModule(image)
|
||||
func = CuFunction(mod, "ExpressionProcessing")
|
||||
|
||||
variableCols = 2
|
||||
cudaResults = CuArray{Float32}(undef, 1)
|
||||
# cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32))
|
||||
# cudaResults = CUDA.fill(0f0, variableCols * length(expressions))
|
||||
# cudaResults = cu(zeros(Float32, variableCols * length(expressions)))
|
||||
|
||||
config = launch_configuration(func)
|
||||
threads = min(variableCols, config.threads)
|
||||
blocks = cld(variableCols, threads)
|
||||
|
||||
cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1)
|
||||
# launch(func, cudaResults; threads=threads, blocks=blocks)
|
||||
|
||||
println(Array(cudaResults))
|
||||
end
|
||||
|
||||
|
||||
# TODO: University setup at 10.20.1.7
|
Loading…
Reference in New Issue
Block a user