transpiler: generates valid PTX and evaluates expressions correctly
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
@ -2,138 +2,65 @@ using CUDA
|
||||
using .ExpressionProcessing
|
||||
using .Transpiler
|
||||
|
||||
# Shared fixtures for the transpiler tests.
#
# `expressions[1]` is constant and should evaluate to ≈ 1.14286
# (1 + 3 * 5 / 7 - sqrt(4)); the other two reference variables (x1..x5) and
# parameters (p1, p2).
expressions = Expr[
    :(1 + 3 * 5 / 7 - sqrt(4)),
    :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3),
    :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5)),
]

# One column per variable set, one row per variable (row i holds x_i).
# Written as a literal so no element is ever read from uninitialised memory.
variables = Float32[
    2.0  2.0  6.0  1.0
    3.0  5.0  2.0  2.0
    0.0  3.0  2.0  3.0
    1.0  0.0  4.0  4.0
    0.0  0.0  2.0  5.0
]

# One parameter vector per expression: `parameters[k]` holds p1, p2, ... for
# `expressions[k]`. The first expression uses no parameters, so its vector is
# empty and must not be indexed.
parameters = Vector{Float32}[
    Float32[],
    Float32[5.0, 0.0],
    Float32[16.0],
]
|
||||
|
||||
|
||||
@testset "Test TMP transpiler" begin
    # Convert two of the shared fixture expressions plus one ad-hoc expression
    # to postfix form.
    postfixExprs = [
        expr_to_postfix(expressions[1]),
        expr_to_postfix(expressions[2]),
        expr_to_postfix(:(5^3 + x1 - p1)),
    ]

    # NOTE(review): this only verifies that the conversion runs for every input;
    # the generated PTX itself is not checked here.
    @test length(postfixExprs) == 3

    # Scratch code kept for reference while the transpiler API is in flux:
    # generatedCode = Transpiler.transpile(postfixExpr)
    # generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
    # println(generatedCode)
    # CUDA.@sync interpret(postfixExprs, variables, parameters)

    # This is just here for testing. This will be called inside the execute
    # method in the Transpiler module:
    # linker = CuLink()
    # add_data!(linker, "ExpressionProcessing", generatedCode)
    # image = complete(linker)
    # mod = CuModule(image)
    # func = CuFunction(mod, "ExpressionProcessing")
end
|
||||
# Presumably p1 of `expressions[3]` (which uses sqrt(p1)); the expected values in
# "Test transpiler evaluation" are consistent with p1 == 16.
parameters[3][1] = 16.0
|
||||
|
||||
@testset "Test transpiler evaluation" begin
    results = Transpiler.evaluate(expressions, variables, parameters)

    # Timing diagnostic only: evaluate the first expression on its own. The
    # returned value is intentionally discarded.
    @time Transpiler.evaluate(Expr[expressions[1]], variables, parameters)

    # expected[i, j] is the value of expressions[j] for variable set i
    # (column i of `variables`), matching how `results` is indexed below.
    expected = [
        1.14286  16.0   -0.07580
        1.14286  25.0    0.55452
        1.14286  54.0   12.19446
        1.14286  14.0  -67.41316
    ]

    @testset "Expr $exprIdx" for exprIdx in axes(expected, 2)
        for setIdx in axes(expected, 1)
            # Scalar comparison with isapprox's default tolerances.
            @test isapprox(results[setIdx, exprIdx], expected[setIdx, exprIdx])
        end
    end
end
|
||||
|
||||
# TODO: benchmark the transpiler's PTX generation when it ends with "return String(take!(buffer))" vs. "return take!(buffer)"
|
||||
|
||||
"""
    test_kernel(results)

Write `10f0` into the first element of `results` and return `nothing`.

The store is wrapped in `@inbounds`, so no bounds check is performed; `results`
must therefore contain at least one element.
"""
function test_kernel(results)
    @inbounds results[1] = 10f0
    return nothing
end
|
||||
|
||||
@testset "TEMP" begin
    # NOTE(review): this leading `return` disables everything below; the rest of
    # the testset is scratch code for launching hand-written PTX. Remove the
    # `return` to run it (needs a CUDA-capable device).
    return

    # Hand-written PTX for an entry point named `ExpressionProcessing` taking one
    # 64-bit pointer parameter. Each thread computes ntid.x * ctaid.x + tid.x;
    # threads whose index is greater than 2 branch straight to `ret`, all others
    # store 10.0 to the address passed in `param_1`.
    ptx = """
    .version 8.5
    .target sm_61
    .address_size 64

    .visible .entry ExpressionProcessing(
        .param .u64 param_1)
    {
        .reg .b64 %parameter<1>;
        .reg .b32 %r<4>;
        .reg .pred %p<1>;
        .reg .b64 %i<1>;

        ld.param.u64 %i0, [param_1];
        cvta.to.global.u64 %parameter0, %i0;

        mov.u32 %r0, %ntid.x;
        mov.u32 %r1, %ctaid.x;
        mov.u32 %r2, %tid.x;
        mad.lo.s32 %r3, %r0, %r1, %r2;
        setp.gt.s32 %p0, %r3, 2;
        @%p0 bra \$L__BB0_2;
        st.global.f32 [%parameter0], 10.0;
    \$L__BB0_2: ret;
    }"""

    # Link the PTX into an image and look up the entry point by name.
    linker = CuLink()
    add_data!(linker, "ExpressionProcessing", ptx)
    image = complete(linker)
    func = CuFunction(CuModule(image), "ExpressionProcessing")

    # Single-element output buffer; every thread that passes the guard writes to
    # this same element.
    cudaResults = CuArray{Float32}(undef, 1)
    cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1)

    println(Array(cudaResults))
end
|
||||
|
||||
# TODO: University setup at 10.20.1.7
|
||||
# TODO: benchmark the transpiler's PTX generation when it ends with "return String(take!(buffer))" vs. "return take!(buffer)"
|
Reference in New Issue
Block a user