using CUDA
using .ExpressionProcessing
using .Transpiler

# Fixtures shared by the testsets below.
# The first expression contains only constants; its resulting value should be
# 1.14... (1 + 3 * 5 / 7 - sqrt(4)). The second one refers to x1, x2, p1 and p2.
expressions = Expr[
    :(1 + 3 * 5 / 7 - sqrt(4)),
    :(5 + x1 + 1 * x2 + p1 + p2),
]

# 2x2 matrix of variable values, written as a matrix literal.
variables = Float32[2.0 0.0; 3.0 5.0]

# Two parameter vectors of length 1 and 2 (presumably one per expression --
# confirm against Transpiler.evaluate).
parameters = Vector{Float32}[
    Float32[5.0],
    Float32[5.0, 0.0],
]

@testset "Test TMP transpiler" begin
    # Convert the test expressions to postfix form. The element type of the vector
    # is determined by the first conversion; the others are pushed onto it.
    postfixExprs = [expr_to_postfix(expressions[1])]
    for expression in (expressions[2], :(5^3 + x1 - p1))
        push!(postfixExprs, expr_to_postfix(expression))
    end

    # Currently disabled experiments:
    # generatedCode = Transpiler.transpile(postfixExprs[1])
    # generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
    # println(generatedCode)
    # CUDA.@sync interpret(postfixExprs, variables, parameters)

    # This is just here for testing. This will be called inside the execute method
    # in the Transpiler module.
    # linker = CuLink()
    # add_data!(linker, "ExpressionProcessing", generatedCode)
    # image = complete(linker)
    # mod = CuModule(image)
    # func = CuFunction(mod, "ExpressionProcessing")
end

@testset "Test transpiler evaluation" begin
    # Alternative input that evaluates both fixture expressions:
    # postfixExprs = Vector{Expr}()
    # push!(postfixExprs, expressions[1])
    # push!(postfixExprs, expressions[2])

    # Only the first expression is evaluated; the call is timed, not asserted on.
    singleExpr = Expr[expressions[1]]
    @time Transpiler.evaluate(singleExpr, variables, parameters)
end

# TODO: test performance of transpiler PTX generation when doing
#       "return String(take!(buffer))" vs "return take!(buffer)"

"""
    test_kernel(results)

Write `10f0` into `results[1]` (without bounds checking) and return `nothing`.

Only referenced from the commented-out `@cuda` launch in the "TEMP" testset.
"""
function test_kernel(results)
    @inbounds results[1] = 10.0f0
    return nothing
end

@testset "TEMP" begin
    # Early exit: everything below this line is disabled scratch code.
    return

    results = CuArray{Float32}(undef, 2)
    # @device_code_ptx @cuda test_kernel(results)
    # println(CUDA.code_ptx(kernel.fun, ))
    # return

    # Minimal kernel: stores 10.0 at the address passed in param_1.
    # This string is overwritten by the assignment that follows.
    ptx = "
    .version 8.5
    .target sm_61
    .address_size 64

    .visible .entry ExpressionProcessing(
        .param .u64 param_1)
    {
        .reg .b64 %parameter<1>;
        .reg .b64 %i<1>;
        //.reg .b64 %rd<6>;

        ld.param.u64 %i0, [param_1];
        cvta.to.global.u64 %parameter0, %i0;

        st.global.f32 [%parameter0], 10.0;
        ret;
    }"

    # Same store, but guarded: %r3 = %ntid.x * %ctaid.x + %tid.x is computed and the
    # store is skipped (branch to the label) when %r3 > 2.
    ptx = ".version 8.5
    .target sm_61
    .address_size 64

    .visible .entry ExpressionProcessing(
        .param .u64 param_1)
    {
        .reg .b64 %parameter<1>;
        .reg .b32 %r<4>;
        .reg .pred %p<1>;
        .reg .b64 %i<1>;

        ld.param.u64 %i0, [param_1];
        cvta.to.global.u64 %parameter0, %i0;

        mov.u32 %r0, %ntid.x;
        mov.u32 %r1, %ctaid.x;
        mov.u32 %r2, %tid.x;
        mad.lo.s32 %r3, %r0, %r1, %r2;

        setp.gt.s32 %p0, %r3, 2;
        @%p0 bra \$L__BB0_2;
        st.global.f32 [%parameter0], 10.0;
    \$L__BB0_2: ret;
    }"

    # Link the PTX text and look up the kernel entry point by name.
    linker = CuLink()
    add_data!(linker, "ExpressionProcessing", ptx)
    image = complete(linker)
    cuModule = CuModule(image)
    entryPoint = CuFunction(cuModule, "ExpressionProcessing")

    variableCols = 2
    cudaResults = CuArray{Float32}(undef, 1)
    # Alternative result buffers that were tried:
    # cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32))
    # cudaResults = CUDA.fill(0f0, variableCols * length(expressions))
    # cudaResults = cu(zeros(Float32, variableCols * length(expressions)))

    # NOTE: `threads` and `blocks` are computed here, but the launch below uses the
    # fixed values threads=4, blocks=1.
    config = launch_configuration(entryPoint)
    threads = min(variableCols, config.threads)
    blocks = cld(variableCols, threads)

    cudacall(entryPoint, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1)
    # launch(entryPoint, cudaResults; threads=threads, blocks=blocks)

    println(Array(cudaResults))
end

# TODO: University setup at 10.20.1.7