diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 6381e14..d7c113e 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -3,23 +3,9 @@ using CUDA using ..ExpressionProcessing using ..Utils -# Number of threads per block/SM + max number of registers # https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications -# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM) -# One SM can have 64*1024 32-bit registers at max -# One thread can at max use 255 registers -# Meaning one has access to at most 32 registers in the worst case. Using 64 bit values this number gets halfed (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section)) - -# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread - -# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level -# This states, that using fewer registers allows more threads to reside on a single SM which improves performance. -# So I could use more registers at the expense for performance. Depending on how this would simplify my algorithm, I might do this and leave more optimisation to future work - -# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment -# With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case - +const BYTES = sizeof(Float32) const Operand = Union{Float32, String} # Operand is either fixed value or register cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed @@ -41,6 +27,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet # mod = CuModule(image) # kernels[i] = CuFunction(mod, "ExpressionProcessing") # end + for i in eachindex(expressions) if haskey(cache, expressions[i]) kernels[i] = cache[expressions[i]] @@ -50,6 +37,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i]) kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i) + println(kernel) + linker = CuLink() add_data!(linker, "ExpressionProcessing", kernel) @@ -65,6 +54,12 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions cudaResults = CuArray{Float32}(undef, variableCols, length(expressions)) + # cudaResults = CUDA.zeros(variableCols * length(expressions)) + # ptr = CuPtr{Float32}(C_NULL) + # CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10) + # values = Float32[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] + # CUDA.cuMemcpyHtoD(ptr, values, sizeof(Float32) * 10) + # copyto!(cudaResults, z) # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance) for i in eachindex(kernels) @@ -72,9 +67,14 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet threads = min(variableCols, config.threads) blocks = cld(variableCols, threads) - cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) - break + # cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + # launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + launch(kernels[i], cudaResults; threads=threads, blocks=blocks) end + + println(Array(cudaResults)) + # cudaResults = nothing + # CUDA.cuMemFree(ptr) end # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string @@ -89,16 +89,20 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int exitJumpLocationMarker = "\$L__BB0_2" ptxBuffer = IOBuffer() + # TODO: Temp fix. Make these types and create new instances for every call to this function. Otherwise we cannot parallelise the transpilation + reset_registers() + reset_symtable() + # TODO: Suboptimal solution - signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params - guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0 + signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32]) # Vars, Params, Results + guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) println(ptxBuffer, get_cuda_header()) println(ptxBuffer, signature) println(ptxBuffer, "{") - calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2", + calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter0", threadIdReg, expressionIndex, nrOfVariableSets) println(ptxBuffer, get_register_definitions()) println(ptxBuffer, paramLoading) @@ -116,7 +120,7 @@ end # TODO: Make version, target and address_size configurable; also see what address_size means exactly function get_cuda_header()::String return " -.version 8.0 +.version 7.1 .target sm_61 .address_size 32 " @@ -135,7 +139,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}): for i in eachindex(parameters) print(signatureBuffer, " .param .u32", " ", "param_", i) - parametersLocation = get_next_free_register("r") + parametersLocation = get_next_free_register("i") println(paramLoadingBuffer, "ld.param.u32 $parametersLocation, [param_$i];") println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersLocation;") if i != lastindex(parameters) @@ -165,12 +169,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;") globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set - breakCondition = get_next_free_register("p") + # breakCondition = get_next_free_register("p") println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;") - println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets + # println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets # branch to end if breakCondition is true - print(guardBuffer, "@$breakCondition bra $exitJumpLocation;") + # print(guardBuffer, "@$breakCondition bra $exitJumpLocation;") return (String(take!(guardBuffer)), globalThreadId) end @@ -181,6 +185,9 @@ end function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer, parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String + + return "st.global.f32 [$resultsLocation], 10.0;" + codeBuffer = IOBuffer() operands = Vector{Operand}() @@ -223,15 +230,15 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType, # resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case) # resultsLocation[resultIndex] = ""; - tempReg = get_next_free_register("i") - println(codeBuffer, " - add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg; - mul.lo.u32 $tempReg, $tempReg, $(sizeof(expressionIndex)); - add.u32 $tempReg, $resultsLocation, $tempReg; - st.global.f32 [$tempReg], $(pop!(operands)); - ") + # tempReg = get_next_free_register("i") + # println(codeBuffer, " + # add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg; + # mul.lo.u32 $tempReg, $tempReg, $BYTES; + # add.u32 $tempReg, $resultsLocation, $tempReg; + # st.global.f32 [$tempReg], $(pop!(operands)); + # ") + println(codeBuffer, "st.global.f32 [$resultsLocation], 10.0;") - println(operands) return String(take!(codeBuffer)) end @@ -248,10 +255,9 @@ function load_into_register(register::String, loadLocation::String, valueIndex:: # loadLocation + startIndex + valueIndex * bytes (4 in our case) # startIndex: setIndex * setSize tempReg = get_next_free_register("i") - # we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes return " mul.lo.u32 $tempReg, $setIndexReg, $setSize; - add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex)); + add.u32 $tempReg, $tempReg, $(valueIndex*BYTES); add.u32 $tempReg, $loadLocation, $tempReg; ld.global.f32 $register, [$tempReg];" end @@ -316,6 +322,12 @@ end let registers = Dict() # stores the count of the register already used. global get_next_free_register global get_register_definitions + global reset_registers + + function reset_registers() + registers = Dict() + end + # By convention these names correspond to the following types: # - p -> pred @@ -363,6 +375,11 @@ end let symtable = Dict() global get_register_for_name + global reset_symtable + + function reset_symtable() + symtable = Dict() + end "Returns the register for this variable/parameter and true if it is used for the first time and false otherwise." function get_register_for_name(varName::String) diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl index 4025b91..769d85b 100644 --- a/package/test/TranspilerTests.jl +++ b/package/test/TranspilerTests.jl @@ -27,18 +27,18 @@ parameters[2][2] = 0.0 push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1))) # generatedCode = Transpiler.transpile(postfixExpr) - generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP - println(generatedCode) + # generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP + # println(generatedCode) # CUDA.@sync interpret(postfixExprs, variables, parameters) # This is just here for testing. This will be called inside the execute method in the Transpiler module - linker = CuLink() - add_data!(linker, "ExpressionProcessing", generatedCode) + # linker = CuLink() + # add_data!(linker, "ExpressionProcessing", generatedCode) - image = complete(linker) + # image = complete(linker) - mod = CuModule(image) - func = CuFunction(mod, "ExpressionProcessing") + # mod = CuModule(image) + # func = CuFunction(mod, "ExpressionProcessing") end @testset "Test transpiler evaluation" begin @@ -46,7 +46,55 @@ end # push!(postfixExprs, expressions[1]) # push!(postfixExprs, expressions[2]) - @time Transpiler.evaluate(expressions, variables, parameters) + expr = Vector{Expr}() + push!(expr, expressions[1]) + # @time Transpiler.evaluate(expr, variables, parameters) end #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)" + +@testset "TEMP" begin + ptx = " + .version 7.1 + .target sm_61 + .address_size 64 + + .visible .entry ExpressionProcessing( + .param .u32 param_1) + { + .reg .u32 %parameter<1>; + .reg .u32 %i<1>; + + ld.param.u32 %i0, [param_1]; + cvta.to.global.u32 %parameter0, %i0; + + st.global.f32 [%parameter0], 10.0; + ret; + }" + + linker = CuLink() + add_data!(linker, "ExpressionProcessing", ptx) + + image = complete(linker) + + mod = CuModule(image) + func = CuFunction(mod, "ExpressionProcessing") + + variableCols = 2 + cudaResults = CuArray{Float32}(undef, 1) + # cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32)) + # cudaResults = CUDA.fill(0f0, variableCols * length(expressions)) + # cudaResults = cu(zeros(Float32, variableCols * length(expressions))) + + config = launch_configuration(func) + threads = min(variableCols, config.threads) + blocks = cld(variableCols, threads) + + cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1) + # launch(func, cudaResults; threads=threads, blocks=blocks) + + println(Array(cudaResults)) +end + + +# TODO: University setup at 10.20.1.7 \ No newline at end of file