transpiler: generates valid PTX and evaluates expressions correctly

2025-03-28 19:32:48 +01:00
parent 9df78ca72e
commit effd477558
5 changed files with 195 additions and 306 deletions
--- a/package/test/TranspilerTests.jl
+++ b/package/test/TranspilerTests.jl
@@ -2,138 +2,65 @@ using CUDA
 using .ExpressionProcessing
 using .Transpiler

-expressions = Vector{Expr}(undef, 2)
-variables = Matrix{Float32}(undef, 2,2)
-parameters = Vector{Vector{Float32}}(undef, 2)
+expressions = Vector{Expr}(undef, 3)
+variables = Matrix{Float32}(undef, 5, 4)
+parameters = Vector{Vector{Float32}}(undef, 3)

-# Resulting value should be 1.14... for the first expression
 expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4))
-expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
+expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3)
+expressions[3] = :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5))
+
 variables[1,1] = 2.0
 variables[2,1] = 3.0
-variables[1,2] = 0.0
+variables[3,1] = 0.0
+variables[4,1] = 1.0
+variables[5,1] = 0.0
+
+variables[1,2] = 2.0
 variables[2,2] = 5.0
-parameters[1] = Vector{Float32}(undef, 1)
+variables[3,2] = 3.0
+variables[4,2] = 0.0 
+variables[5,2] = 0.0
+
+variables[1,3] = 6.0
+variables[2,3] = 2.0
+variables[3,3] = 2.0
+variables[4,3] = 4.0
+variables[5,3] = 2.0
+
+variables[1,4] = 1.0
+variables[2,4] = 2.0
+variables[3,4] = 3.0
+variables[4,4] = 4.0
+variables[5,4] = 5.0
+
+parameters[1] = Vector{Float32}(undef, 0)
 parameters[2] = Vector{Float32}(undef, 2)
-parameters[1][1] = 5.0
+parameters[3] = Vector{Float32}(undef, 1)
 parameters[2][1] = 5.0
 parameters[2][2] = 0.0
-
-
-@testset "Test TMP transpiler" begin
-	postfixExpr = expr_to_postfix(expressions[1])
-	postfixExprs = Vector([postfixExpr])
-	push!(postfixExprs, expr_to_postfix(expressions[2]))
-	push!(postfixExprs, expr_to_postfix(:(5^3 + x1 - p1)))
-
-	# generatedCode = Transpiler.transpile(postfixExpr)
-	# generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3, 2, 3) # TEMP
-	# println(generatedCode)
-	# CUDA.@sync interpret(postfixExprs, variables, parameters)
-
-	# This is just here for testing. This will be called inside the execute method in the Transpiler module
-	# linker = CuLink()
-	# add_data!(linker, "ExpressionProcessing", generatedCode)
-
-	# image = complete(linker)
-
-	# mod = CuModule(image)
-	# func = CuFunction(mod, "ExpressionProcessing")
-end
+parameters[3][1] = 16.0

 @testset "Test transpiler evaluation" begin
-	# postfixExprs = Vector{Expr}()
-	# push!(postfixExprs, expressions[1])
-	# push!(postfixExprs, expressions[2])
+	results = Transpiler.evaluate(expressions, variables, parameters)

-	expr = Vector{Expr}()
-	push!(expr, expressions[1])
-	@time Transpiler.evaluate(expr, variables, parameters)
+	# dump(expressions[3]; maxdepth=10)
+	# Expr 1 (uses no variables or parameters, so every variable set yields the same value):
+	@test isapprox(results[1,1], 1.14286)
+	@test isapprox(results[2,1], 1.14286)
+	@test isapprox(results[3,1], 1.14286)
+	@test isapprox(results[4,1], 1.14286)
+	# Expr 2:
+	@test isapprox(results[1,2], 16.0)
+	@test isapprox(results[2,2], 25.0)
+	@test isapprox(results[3,2], 54.0)
+	@test isapprox(results[4,2], 14.0)
+
+	# Expr 3:
+	@test isapprox(results[1,3],  -0.07580)
+	@test isapprox(results[2,3],   0.55452)
+	@test isapprox(results[3,3],  12.19446)
+	@test isapprox(results[4,3], -67.41316)
 end

-#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
-
-function test_kernel(results)
-	@inbounds results[1] = 10f0
-
-	return nothing
-end
-
-@testset "TEMP" begin
-return
-	results = CuArray{Float32}(undef, 2)
-	# @device_code_ptx @cuda test_kernel(results)
-
-
-	# println(CUDA.code_ptx(kernel.fun, ))
-	# return
-
-	ptx = "
-	.version 8.5
-	.target sm_61
-	.address_size 64
-
-	.visible .entry ExpressionProcessing(
-	.param .u64 param_1)
-	{
-		.reg .b64   %parameter<1>;
-		.reg .b64   %i<1>;
-		//.reg .b64 %rd<6>;
-
-		ld.param.u64   %i0, [param_1];
-		cvta.to.global.u64   %parameter0, %i0;
-
-		st.global.f32  [%parameter0], 10.0;
-		ret;
-	}"
-
-	ptx = ".version 8.5
-.target sm_61
-.address_size 64
-
-.visible .entry ExpressionProcessing(
-  .param .u64 param_1)
-{
-.reg .b64   %parameter<1>;
-.reg .b32   %r<4>;
-.reg .pred   %p<1>;
-.reg .b64   %i<1>;
-
-ld.param.u64   %i0, [param_1];
-cvta.to.global.u64   %parameter0, %i0;
-
-mov.u32    %r0, %ntid.x;
-mov.u32    %r1, %ctaid.x;
-mov.u32    %r2, %tid.x;
-mad.lo.s32     %r3, %r0, %r1, %r2;
-setp.gt.s32    %p0, %r3, 2;
-@%p0 bra    \$L__BB0_2;
-st.global.f32  [%parameter0], 10.0;
-\$L__BB0_2: ret;
-}"
-
-	linker = CuLink()
-	add_data!(linker, "ExpressionProcessing", ptx)
-	
-	image = complete(linker)
-	
-	mod = CuModule(image)
-	func = CuFunction(mod, "ExpressionProcessing")
-
-	variableCols = 2
-	cudaResults = CuArray{Float32}(undef, 1)
-	# cd = CUDA.alloc(CUDA.DeviceMemory, (variableCols * length(expressions)) * sizeof(Float32))
-	# cudaResults = CUDA.fill(0f0, variableCols * length(expressions))
-	# cudaResults = cu(zeros(Float32, variableCols * length(expressions)))
-
-	config = launch_configuration(func)
-	threads = min(variableCols, config.threads)
-	blocks = cld(variableCols, threads)
-
-	cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1)
-	# launch(func, cudaResults; threads=threads, blocks=blocks)
-
-	println(Array(cudaResults))
-end
-
-# TODO: University setup at 10.20.1.7
+# TODO: benchmark the transpiler's PTX generation when it ends with "return String(take!(buffer))" vs. "return take!(buffer)"