From eaee21ca75473ce728a076f15a8953d07e656540 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 27 Mar 2025 09:55:29 +0100 Subject: [PATCH] transpiler: results are now written in results array; preparation for performance testing --- package/src/Transpiler.jl | 77 ++++++++++++++++++++++----------- package/src/Utils.jl | 2 + package/test/TranspilerTests.jl | 14 +++--- 3 files changed, 61 insertions(+), 32 deletions(-) diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index cd97400..6381e14 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -19,19 +19,18 @@ using ..Utils # Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment # With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case -# -# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing -# Note: Maybe make an additional function that transpiles and executed the code. This would then be the function the user calls -# const Operand = Union{Float32, String} # Operand is either fixed value or register +cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed -function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}) +function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}) varRows = size(variables, 1) + variableCols = size(variables, 2) kernels = Vector{CuFunction}(undef, length(expressions)) # Test this parallel version again when doing performance tests.
With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds # Threads.@threads for i in eachindex(expressions) + # TODO: Use cache # kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters)) # linker = CuLink() @@ -43,15 +42,22 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variabl # kernels[i] = CuFunction(mod, "ExpressionProcessing") # end for i in eachindex(expressions) - kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters)) + if haskey(cache, expressions[i]) + kernels[i] = cache[expressions[i]] + continue + end + formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i]) + kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i) + linker = CuLink() add_data!(linker, "ExpressionProcessing", kernel) - + image = complete(linker) - + mod = CuModule(image) kernels[i] = CuFunction(mod, "ExpressionProcessing") + cache[expressions[i]] = kernels[i] end cudaVars = CuArray(variables) # maybe put in shared memory (see runtests.jl for more info) @@ -61,13 +67,13 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, variabl cudaResults = CuArray{Float32}(undef, variableCols, length(expressions)) # execute each kernel (also try doing this with Threads.@threads. 
Since we can have multiple grids, this might improve performance) - variableCols = size(variables, 2) for i in eachindex(kernels) config = launch_configuration(kernels[i]) threads = min(variableCols, config.threads) blocks = cld(variableCols, threads) - cudacall(kernels[i], Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + cudacall(kernels[i], Tuple{CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}}, cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + break end end @@ -76,21 +82,24 @@ end " - param ```varSetSize```: The size of a variable set. Equal to number of rows of variable matrix (in a column major matrix) - param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the nr of rows is dependent oon the longest parameter set +- param ```expressionIndex```: The 1-based index of the expression " -function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String +function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer, + nrOfVariableSets::Integer, expressionIndex::Integer)::String exitJumpLocationMarker = "\$L__BB0_2" ptxBuffer = IOBuffer() # TODO: Suboptimal solution - signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Int32, Float32, Float32]) # nrOfVarSets, Vars, Params - guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, "%parameter0") # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0 + signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32]) # nrOfVarSets, Vars, Params + guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets) # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0 println(ptxBuffer, 
get_cuda_header()) println(ptxBuffer, signature) println(ptxBuffer, "{") - calc_code = generate_calculation_code(expression, "%parameter1", varSetSize, "%parameter2", paramSetSize, threadIdReg) + calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2", + threadIdReg, expressionIndex, nrOfVariableSets) println(ptxBuffer, get_register_definitions()) println(ptxBuffer, paramLoading) println(ptxBuffer, guardClause) @@ -107,12 +116,15 @@ end # TODO: Make version, target and address_size configurable; also see what address_size means exactly function get_cuda_header()::String return " -.version 7.1 +.version 8.0 .target sm_61 .address_size 32 " end +" +param ```parameters```: [1] = nr of var sets; [2] = variables; [3] = parameters; [4] = result +" function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::Tuple{String, String} signatureBuffer = IOBuffer() paramLoadingBuffer = IOBuffer() @@ -123,9 +135,9 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}): for i in eachindex(parameters) print(signatureBuffer, " .param .u32", " ", "param_", i) - parametersReg = get_next_free_register("r") - println(paramLoadingBuffer, "ld.param.u32 $parametersReg, [param_$i];") - println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersReg;") + parametersLocation = get_next_free_register("r") + println(paramLoadingBuffer, "ld.param.u32 $parametersLocation, [param_$i];") + println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersLocation;") if i != lastindex(parameters) println(signatureBuffer, ",") end @@ -140,7 +152,7 @@ Constructs the PTX code used for handling the case where too many threads are st - param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel " -function get_guard_clause(exitJumpLocation::String, 
nrOfVarSetsRegister::String)::Tuple{String, String} +function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple{String, String} guardBuffer = IOBuffer() threadIds = get_next_free_register("r") @@ -154,8 +166,6 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String) globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set breakCondition = get_next_free_register("p") - nrOfVarSets = get_next_free_register("i") - println(guardBuffer, "ld.global.u32 $nrOfVarSets, [$nrOfVarSetsRegister];") println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;") println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets @@ -168,8 +178,9 @@ end " - param ```parametersSetSize```: Size of the largest parameter set " -function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer, - parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String +function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer, + parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, + threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String codeBuffer = IOBuffer() operands = Vector{Operand}() @@ -196,24 +207,37 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType, if token.Value > 0 # varaibles var, first_access = get_register_for_name("x$(token.Value)") if first_access - println(codeBuffer, load_into_register(var, variablesReg, token.Value, threadIdReg, variablesSetSize)) + println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadIdReg, variablesSetSize)) end push!(operands, var) else absVal = abs(token.Value) param, first_access = 
get_register_for_name("p$absVal") if first_access - println(codeBuffer, load_into_register(param, parametersReg, absVal, threadIdReg, parametersSetSize)) + println(codeBuffer, load_into_register(param, parametersLocation, absVal, threadIdReg, parametersSetSize)) end push!(operands, param) end end end + # resultIndex = ((expressionIndex - 1) * nrOfVarSets + threadIdReg) * bytes (4 in our case) + # resultsLocation[resultIndex] = ""; + tempReg = get_next_free_register("i") + println(codeBuffer, " + add.u32 $tempReg, $((expressionIndex-1)*nrOfVarSets), $threadIdReg; + mul.lo.u32 $tempReg, $tempReg, $(sizeof(expressionIndex)); + add.u32 $tempReg, $resultsLocation, $tempReg; + st.global.f32 [$tempReg], $(pop!(operands)); + ") + + println(operands) return String(take!(codeBuffer)) end " +Loads a value from a location into the given register. It is assumed that the location refers to a column-major matrix + - param ```register```: The register where the loaded value will be stored - param ```loadLocation```: The location from where to load the value - param ```valueIndex```: 0-based index of the value in the variable set/parameter set @@ -263,6 +287,7 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand, elseif operator == POWER # x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)" resultCode = " + // x^y: lg2.approx.f32 $resultRegister, $left; mul.f32 $resultRegister, $right, $resultRegister; ex2.approx.f32 $resultRegister, $resultRegister;" @@ -271,11 +296,13 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand, elseif operator == LOG # log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)" resultCode = " + // log(x): lg2.approx.f32 $resultRegister, $left; mul.f32 $resultRegister, $resultRegister, 0.693147182;" elseif operator == EXP # e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)" resultCode = " + // e^x: mul.f32 $resultRegister, $left, 1.44269502; ex2.approx.f32 $resultRegister, 
$resultRegister;" elseif operator == SQRT diff --git a/package/src/Utils.jl b/package/src/Utils.jl index 595bd21..e1480a9 100644 --- a/package/src/Utils.jl +++ b/package/src/Utils.jl @@ -1,5 +1,7 @@ module Utils +using CUDA + "Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length. All entries that cannot be filled have ```invalidElement``` as their value diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl index 7234e2d..4025b91 100644 --- a/package/test/TranspilerTests.jl +++ b/package/test/TranspilerTests.jl @@ -24,11 +24,11 @@ parameters[2][2] = 0.0 postfixExpr = expr_to_postfix(expressions[1]) postfixExprs = Vector([postfixExpr]) push!(postfixExprs, expr_to_postfix(expressions[2])) - push!(postfixExprs, expr_to_postfix(:(5^3 + x1))) + push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1))) # generatedCode = Transpiler.transpile(postfixExpr) - generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP - # println(generatedCode) + generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP + println(generatedCode) # CUDA.@sync interpret(postfixExprs, variables, parameters) # This is just here for testing. This will be called inside the execute method in the Transpiler module @@ -42,11 +42,11 @@ parameters[2][2] = 0.0 end @testset "Test transpiler evaluation" begin - postfixExprs = Vector{ExpressionProcessing.PostfixType}() - push!(postfixExprs, expr_to_postfix(expressions[1])) - push!(postfixExprs, expr_to_postfix(expressions[2])) + # postfixExprs = Vector{Expr}() + # push!(postfixExprs, expressions[1]) + # push!(postfixExprs, expressions[2]) - @time Transpiler.evaluate(postfixExprs, variables, parameters) + @time Transpiler.evaluate(expressions, variables, parameters) end #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"