continued analysing the given PTX file and made a plan for how to approach the transpiler part

Committed 2024-09-22 09:32:39 +02:00
parent 44722bfc65
commit d60cba7e4d
4 changed files with 151 additions and 1 deletion

package/src/Transpiler.jl (new file, 76 lines)

@@ -0,0 +1,76 @@
using CUDA

# Builds a PTX vector kernel whose arithmetic instruction is spliced in via `op`,
# links and loads it through CUDA.jl, and times one launch on N-element vectors.
# culoadtest(N, rand(["add.f32", "sub.f32", "mul.f32", "div.approx.f32"]))
function culoadtest(N::Int32, op="add.f32")
    vadd_code = ".version 7.1
.target sm_52
.address_size 64

// .globl VecAdd_kernel
.visible .entry VecAdd_kernel(
    .param .u64 VecAdd_kernel_param_0,
    .param .u64 VecAdd_kernel_param_1,
    .param .u64 VecAdd_kernel_param_2,
    .param .u32 VecAdd_kernel_param_3
)
{
    .reg .pred %p<2>;
    .reg .f32 %f<4>;
    .reg .b32 %r<6>;
    .reg .b64 %rd<11>;

    ld.param.u64 %rd1, [VecAdd_kernel_param_0];
    ld.param.u64 %rd2, [VecAdd_kernel_param_1];
    ld.param.u64 %rd3, [VecAdd_kernel_param_2];
    ld.param.u32 %r2, [VecAdd_kernel_param_3];

    // global thread index: blockDim.x * blockIdx.x + threadIdx.x
    mov.u32 %r3, %ntid.x;
    mov.u32 %r4, %ctaid.x;
    mov.u32 %r5, %tid.x;
    mad.lo.s32 %r1, %r3, %r4, %r5;

    // bounds check: skip the body once the index passes N
    setp.ge.s32 %p1, %r1, %r2;
    @%p1 bra \$L__BB0_2;

    cvta.to.global.u64 %rd4, %rd1;
    mul.wide.s32 %rd5, %r1, 4;
    add.s64 %rd6, %rd4, %rd5;
    cvta.to.global.u64 %rd7, %rd2;
    add.s64 %rd8, %rd7, %rd5;
    ld.global.f32 %f1, [%rd8];
    ld.global.f32 %f2, [%rd6];
    " * op * " %f3, %f2, %f1;
    cvta.to.global.u64 %rd9, %rd3;
    add.s64 %rd10, %rd9, %rd5;
    st.global.f32 [%rd10], %f3;

\$L__BB0_2:
    ret;
}"
    # Link the PTX source into a binary image and load the kernel from it.
    linker = CuLink()
    add_data!(linker, "VecAdd_kernel", vadd_code)
    image = complete(linker)
    mod = CuModule(image)
    func = CuFunction(mod, "VecAdd_kernel")
    # Device input/output vectors.
    d_a = CUDA.fill(1.0f0, N)
    d_b = CUDA.fill(2.0f0, N)
    d_c = CUDA.fill(0.0f0, N)
    # Grid/block configuration: one thread per element.
    threadsPerBlock = 256
    blocksPerGrid = (N + threadsPerBlock - 1) ÷ threadsPerBlock

    @time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint},
                              d_a, d_b, d_c, N;
                              threads=threadsPerBlock, blocks=blocksPerGrid)
end
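
# A minimal usage sketch, not part of the committed file: it loops over the op
# strings from the comment at the top of the file; the vector length 2^20 is an
# arbitrary value chosen here for illustration.
#
# using CUDA
#
# # Run the spliced-PTX kernel once per supported instruction.
# # N must be an Int32 because of culoadtest's signature.
# for op in ["add.f32", "sub.f32", "mul.f32", "div.approx.f32"]
#     println("timing VecAdd_kernel with ", op)
#     culoadtest(Int32(2^20), op)
# end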