diff --git a/package/Project.toml b/package/Project.toml index 714f623..7509c9d 100644 --- a/package/Project.toml +++ b/package/Project.toml @@ -5,11 +5,13 @@ version = "1.0.0-DEV" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] +LinearAlgebra = "1.11.0" Printf = "1.11.0" Random = "1.11.0" julia = "1.6.7" diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index d7c113e..b56d6fe 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -53,7 +53,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info) # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions - cudaResults = CuArray{Float32}(undef, variableCols, length(expressions)) + cudaResults = CuArray{Float32}(undef, variableCols * length(expressions)) # cudaResults = CUDA.zeros(variableCols * length(expressions)) # ptr = CuPtr{Float32}(C_NULL) # CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10) @@ -68,8 +68,9 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet blocks = cld(variableCols, threads) # cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + cudacall(kernels[i], (CuPtr{Float32},), cudaResults; threads=threads, blocks=blocks) # launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) - launch(kernels[i], cudaResults; threads=threads, blocks=blocks) + # launch(kernels[i], cudaResults; threads=threads, blocks=blocks) end println(Array(cudaResults)) @@ -120,9 +121,9 @@ end # TODO: Make version, target and address_size configurable; also see what address_size means exactly function get_cuda_header()::String return " -.version 7.1 +.version 8.5 .target sm_61 -.address_size 32 +.address_size 64 " end @@ -137,11 +138,11 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}): println(signatureBuffer, "(") for i in eachindex(parameters) - print(signatureBuffer, " .param .u32", " ", "param_", i) + print(signatureBuffer, " .param .u64", " ", "param_", i) parametersLocation = get_next_free_register("i") - println(paramLoadingBuffer, "ld.param.u32 $parametersLocation, [param_$i];") - println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersLocation;") + println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];") + println(paramLoadingBuffer, "cvta.to.global.u64 $(get_next_free_register("parameter")), $parametersLocation;") if i != lastindex(parameters) println(signatureBuffer, ",") end @@ -169,12 +170,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;") globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set - # breakCondition = get_next_free_register("p") + breakCondition = get_next_free_register("p") println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;") - # println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets + println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets # branch to end if breakCondition is true - # print(guardBuffer, "@$breakCondition bra $exitJumpLocation;") + print(guardBuffer, "@$breakCondition bra $exitJumpLocation;") return (String(take!(guardBuffer)), globalThreadId) end @@ -186,7 +187,7 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType, parametersLocation::String, parametersSetSize::Integer, resultsLocation::String, threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String - return "st.global.f32 [$resultsLocation], 10.0;" + # return "st.global.f32 [$resultsLocation], 10.0;" codeBuffer = IOBuffer() operands = Vector{Operand}() @@ -360,9 +361,9 @@ let registers = Dict() # stores the count of the register already used. elseif definition.first == "r" regType = ".b32" elseif definition.first == "parameter" - regType = ".u32" + regType = ".b64" elseif definition.first == "i" - regType = ".u32" + regType = ".b64" else throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type.")) end diff --git a/package/test/CpuInterpreterTests.jl b/package/test/CpuInterpreterTests.jl index 356a4a6..02b7be7 100644 --- a/package/test/CpuInterpreterTests.jl +++ b/package/test/CpuInterpreterTests.jl @@ -1,4 +1,5 @@ using LinearAlgebra +using BenchmarkTools function test_cpu_interpreter(nrows; parallel = false) exprs = [ @@ -18,16 +19,27 @@ function test_cpu_interpreter(nrows; parallel = false) reps= 100 if parallel - t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps]) - println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") + # t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps]) + @btime parallel(exprs, X, p, expr_reps, reps) + println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") else - t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end - println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") + # t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end + @btime single(exprs, X, p, expr_reps, reps) + println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") end true end -LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops +function parallel(exprs, X, p, expr_reps, reps) + fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps]) +end + +function single(exprs, X, p, expr_reps, reps) + for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end +end + + +# LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops @test test_cpu_interpreter(1000) @test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl new file mode 100644 index 0000000..ada8454 --- /dev/null +++ b/package/test/PerformanceTests.jl @@ -0,0 +1,47 @@ +using .Transpiler +using .Interpreter + +@testset "CPU performance" begin + function test_cpu_interpreter(nrows; parallel = false) + exprs = [ + # CPU interpreter requires an anonymous function and array ref s + :(p[1] * x[1] + p[2]), # 5 op + :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op + :(log(abs(x[1]))), # 3 op + :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op + ] # 30 op + exprs = map(e -> Expr(:->, :(x,p), e), exprs) + X = randn(Float32, nrows, 10) + p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr + + # warmup + interpret_cpu(exprs, X, p) + expr_reps = 100 # for each expr + reps= 100 + + if parallel + t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps]) + println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") + else + t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end + println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))") + end + true + end + +LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops + +@test test_cpu_interpreter(1000) +@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads +@test test_cpu_interpreter(10000) +@test test_cpu_interpreter(10000, parallel=true) + +end + +@testset "Interpreter Performance" begin + +end + +@testset "Transpiler Performance" begin + +end \ No newline at end of file diff --git a/package/test/Project.toml b/package/test/Project.toml index 9479a70..ec911b4 100644 --- a/package/test/Project.toml +++ b/package/test/Project.toml @@ -1,4 +1,6 @@ [deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/package/test/TranspilerTests.jl b/package/test/TranspilerTests.jl index 769d85b..5a8bcea 100644 --- a/package/test/TranspilerTests.jl +++ b/package/test/TranspilerTests.jl @@ -48,30 +48,70 @@ end expr = Vector{Expr}() push!(expr, expressions[1]) - # @time Transpiler.evaluate(expr, variables, parameters) + @time Transpiler.evaluate(expr, variables, parameters) end #TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)" +function test_kernel(results) + @inbounds results[1] = 10f0 + + return nothing +end + @testset "TEMP" begin +return + results = CuArray{Float32}(undef, 2) + # @device_code_ptx @cuda test_kernel(results) + + + # println(CUDA.code_ptx(kernel.fun, )) + # return + ptx = " - .version 7.1 + .version 8.5 .target sm_61 .address_size 64 .visible .entry ExpressionProcessing( - .param .u32 param_1) + .param .u64 param_1) { - .reg .u32 %parameter<1>; - .reg .u32 %i<1>; + .reg .b64 %parameter<1>; + .reg .b64 %i<1>; + //.reg .b64 %rd<6>; - ld.param.u32 %i0, [param_1]; - cvta.to.global.u32 %parameter0, %i0; + ld.param.u64 %i0, [param_1]; + cvta.to.global.u64 %parameter0, %i0; st.global.f32 [%parameter0], 10.0; ret; }" + ptx = ".version 8.5 +.target sm_61 +.address_size 64 + +.visible .entry ExpressionProcessing( + .param .u64 param_1) +{ +.reg .b64 %parameter<1>; +.reg .b32 %r<4>; +.reg .pred %p<1>; +.reg .b64 %i<1>; + +ld.param.u64 %i0, [param_1]; +cvta.to.global.u64 %parameter0, %i0; + +mov.u32 %r0, %ntid.x; +mov.u32 %r1, %ctaid.x; +mov.u32 %r2, %tid.x; +mad.lo.s32 %r3, %r0, %r1, %r2; +setp.gt.s32 %p0, %r3, 2; +@%p0 bra \$L__BB0_2; +st.global.f32 [%parameter0], 10.0; +\$L__BB0_2: ret; +}" + linker = CuLink() add_data!(linker, "ExpressionProcessing", ptx) @@ -90,11 +130,10 @@ end threads = min(variableCols, config.threads) blocks = cld(variableCols, threads) - cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1) + cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1) # launch(func, cudaResults; threads=threads, blocks=blocks) println(Array(cudaResults)) end - # TODO: University setup at 10.20.1.7 \ No newline at end of file diff --git a/package/test/runtests.jl b/package/test/runtests.jl index 1fb88ef..86851c6 100644 --- a/package/test/runtests.jl +++ b/package/test/runtests.jl @@ -14,9 +14,9 @@ include(joinpath(baseFolder, "src", "Transpiler.jl")) end -#@testset "CPU Interpreter" begin -# include("CpuInterpreterTests.jl") -#end +# @testset "CPU Interpreter" begin +# include("CpuInterpreterTests.jl") +# end @testset "Performance tests" begin # TODO: make performance tests