continued understanding given PTX file and made plan on how to approach the transpiler part
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
This commit is contained in:
parent
44722bfc65
commit
d60cba7e4d
3
.github/workflows/CI.yml
vendored
3
.github/workflows/CI.yml
vendored
|
@ -32,7 +32,8 @@ jobs:
|
|||
- x64
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: cd /package
|
||||
- name: Go to package
|
||||
run: cd ./package
|
||||
- uses: julia-actions/setup-julia@v2
|
||||
with:
|
||||
version: ${{ matrix.version }}
|
||||
|
|
2
.github/workflows/CompatHelper.yml
vendored
2
.github/workflows/CompatHelper.yml
vendored
|
@ -7,6 +7,8 @@ jobs:
|
|||
CompatHelper:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
      - name: Go to package folder
|
||||
run: cd ./package
|
||||
- name: Pkg.add("CompatHelper")
|
||||
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
|
||||
- name: CompatHelper.main()
|
||||
|
|
71
PTX_understanding.md
Normal file
71
PTX_understanding.md
Normal file
|
@ -0,0 +1,71 @@
|
|||
All Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#instructions
|
||||
|
||||
```
|
||||
.version 7.1
|
||||
|
||||
.target sm_52
|
||||
.address_size 64
|
||||
|
||||
// .globl VecAdd_kernel
|
||||
|
||||
.visible .entry VecAdd_kernel(
|
||||
.param .u64 VecAdd_kernel_param_0,
|
||||
.param .u64 VecAdd_kernel_param_1,
|
||||
.param .u64 VecAdd_kernel_param_2,
|
||||
.param .u32 VecAdd_kernel_param_3
|
||||
)
|
||||
|
||||
{
|
||||
    .reg .pred %p<2>; -> predicate registers: declares p0 - p1 (only p1 is used; needed for branching)
|
||||
    .reg .f32 %f<4>; -> float registers: declares f0 - f3 (f1 - f3 are used)
|
||||
    .reg .b32 %r<6>; -> 32 bits registers: declares r0 - r5 (r1 - r5 are used; bits are actual raw bits without a type)
|
||||
    .reg .b64 %rd<11>; -> 64 bits registers: declares rd0 - rd10 (rd1 - rd10 are used)
|
||||
|
||||
ld.param.u64 %rd1, [VecAdd_kernel_param_0]; -> rd1 = Data1
|
||||
ld.param.u64 %rd2, [VecAdd_kernel_param_1]; -> rd2 = Data2
|
||||
ld.param.u64 %rd3, [VecAdd_kernel_param_2]; -> rd3 = Result
|
||||
ld.param.u32 %r2, [VecAdd_kernel_param_3]; -> r2 = N
|
||||
|
||||
mov.u32 %r3, %ntid.x;
|
||||
mov.u32 %r4, %ctaid.x;
|
||||
mov.u32 %r5, %tid.x;
|
||||
|
||||
    mad.lo.s32 %r1, %r3, %r4, %r5; -> r1 = lowest32Bits(r3 * r4) + r5 (".lo" keeps the low 32 bits of the 64-bit product) -> r1 = ntid.x * ctaid.x + tid.x = global thread index
|
||||
|
||||
setp.ge.s32 %p1, %r1, %r2; -> p1 = r1 >= r2 (setp would assign !p1 to second register if one was given)
|
||||
|
||||
    (guard clause for the case when we start more threads than needed)
|
||||
    @%p1 bra \$L__BB0_2; -> if(p1) then {branch to \$L__BB0_2, i.e. skip the computation and return} else {continue with the next instruction}
|
||||
|
||||
    cvta.to.global.u64 %rd4, %rd1; -> convert the generic address in rd1 (Data1) to an address in the global state space and write it to rd4
|
||||
|
||||
mul.wide.s32 %rd5, %r1, 4; -> rd5 = r1 * 4
|
||||
add.s64 %rd6, %rd4, %rd5; -> rd6 = rd4 + rd5
|
||||
cvta.to.global.u64 %rd7, %rd2; -> same as above cvta
|
||||
add.s64 %rd8, %rd7, %rd5; -> rd8 = rd7 + rd5
|
||||
|
||||
    ld.global.f32 %f1, [%rd8]; -> f1 = f32 value loaded from global memory at address rd8 (the square brackets dereference the address)
|
||||
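    ld.global.f32 %f2, [%rd6]; -> f2 = f32 value loaded from global memory at address rd6
|
||||
    <op> %f3, %f2, %f1; -> custom binary operator: f3 = f2 <op> f1 (<op> is the instruction string inserted by the transpiler, e.g. add.f32)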
|
||||
    cvta.to.global.u64 %rd9, %rd3; -> convert the generic address in rd3 (Result) to an address in the global state space and write it to rd9 (no data is loaded)
|
||||
|
||||
    (Nothing is aggregated here: rd9 is the base address of Result and rd5 is the byte offset of this thread's element, so the sum is the address of that element)
|
||||
add.s64 %rd10, %rd9, %rd5; -> rd10 = rd9 + rd5
|
||||
    st.global.f32 [%rd10], %f3; -> store f3 to global memory at address rd10 (each thread writes only its own element of Result, so no previous result is overwritten)
|
||||
|
||||
\$L__BB0_2:
|
||||
ret;
|
||||
}
|
||||
```
|
||||
|
||||
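The above calculates, for each thread with index i = r3 * r4 + r5 (= ntid.x * ctaid.x + tid.x) and i < N, the expression: Result[i] = Data1[i] CUSTOM_OPERATOR Data2[i]. The terms "base address + i * 4" are only the byte addresses of the i-th f32 element in each array, not part of the computed value.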
|
||||
|
||||
|
||||
# Plan
|
||||
|
||||
1. Generate PTX that only works with constant values and one expression
|
||||
1. Add support for loading variables and parameters (get vars/params as parameters -> Result still only one number)
|
||||
1. Add support for loading variables as matrix (params still only one value -> Result now a vector)
|
||||
1. Add support for loading parameters as "sparse" matrix (Not much should change)
|
||||
1. Add support for multiple expressions (Result is now a matrix)
|
76
package/src/Transpiler.jl
Normal file
76
package/src/Transpiler.jl
Normal file
|
@ -0,0 +1,76 @@
|
|||
|
||||
# culoadtest(N, rand(["add.f32", "sub.f32", "mul.f32", "div.approx.f32"]))
|
||||
function culoadtest(N::Int32, op = "add.f32")
|
||||
|
||||
vadd_code = ".version 7.1
|
||||
|
||||
.target sm_52
|
||||
.address_size 64
|
||||
|
||||
// .globl VecAdd_kernel
|
||||
|
||||
.visible .entry VecAdd_kernel(
|
||||
.param .u64 VecAdd_kernel_param_0,
|
||||
.param .u64 VecAdd_kernel_param_1,
|
||||
.param .u64 VecAdd_kernel_param_2,
|
||||
.param .u32 VecAdd_kernel_param_3
|
||||
)
|
||||
|
||||
{
|
||||
.reg .pred %p<2>;
|
||||
.reg .f32 %f<4>;
|
||||
.reg .b32 %r<6>;
|
||||
.reg .b64 %rd<11>;
|
||||
|
||||
ld.param.u64 %rd1, [VecAdd_kernel_param_0];
|
||||
ld.param.u64 %rd2, [VecAdd_kernel_param_1];
|
||||
ld.param.u64 %rd3, [VecAdd_kernel_param_2];
|
||||
ld.param.u32 %r2, [VecAdd_kernel_param_3];
|
||||
|
||||
mov.u32 %r3, %ntid.x;
|
||||
mov.u32 %r4, %ctaid.x;
|
||||
mov.u32 %r5, %tid.x;
|
||||
|
||||
mad.lo.s32 %r1, %r3, %r4, %r5;
|
||||
|
||||
setp.ge.s32 %p1, %r1, %r2;
|
||||
|
||||
@%p1 bra \$L__BB0_2;
|
||||
|
||||
cvta.to.global.u64 %rd4, %rd1;
|
||||
|
||||
mul.wide.s32 %rd5, %r1, 4;
|
||||
add.s64 %rd6, %rd4, %rd5;
|
||||
cvta.to.global.u64 %rd7, %rd2;
|
||||
add.s64 %rd8, %rd7, %rd5;
|
||||
|
||||
ld.global.f32 %f1, [%rd8];
|
||||
ld.global.f32 %f2, [%rd6];" *
|
||||
op *
|
||||
" %f3, %f2, %f1;
|
||||
cvta.to.global.u64 %rd9, %rd3;
|
||||
add.s64 %rd10, %rd9, %rd5;
|
||||
st.global.f32 [%rd10], %f3;
|
||||
|
||||
\$L__BB0_2:
|
||||
ret;
|
||||
}"
|
||||
|
||||
linker = CuLink()
|
||||
add_data!(linker, "VecAdd_kernel", vadd_code)
|
||||
|
||||
image = complete(linker)
|
||||
|
||||
mod = CuModule(image)
|
||||
func = CuFunction(mod, "VecAdd_kernel")
|
||||
|
||||
d_a = CUDA.fill(1.0f0, N)
|
||||
d_b = CUDA.fill(2.0f0, N)
|
||||
d_c = CUDA.fill(0.0f0, N)
|
||||
|
||||
# Grid/Block configuration
|
||||
threadsPerBlock = 256;
|
||||
blocksPerGrid = (N + threadsPerBlock - 1) ÷ threadsPerBlock;
|
||||
|
||||
@time CUDA.@sync cudacall(func, Tuple{CuPtr{Cfloat},CuPtr{Cfloat},CuPtr{Cfloat},Cint}, d_a, d_b, d_c, N; threads=threadsPerBlock, blocks=blocksPerGrid)
|
||||
end
|
Loading…
Reference in New Issue
Block a user