transpiler: invalid memory access error finally fixed
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run
This commit is contained in:
parent
561b37160b
commit
9df78ca72e
|
@ -5,11 +5,13 @@ version = "1.0.0-DEV"
|
|||
|
||||
[deps]
|
||||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
|
||||
|
||||
[compat]
|
||||
LinearAlgebra = "1.11.0"
|
||||
Printf = "1.11.0"
|
||||
Random = "1.11.0"
|
||||
julia = "1.6.7"
|
||||
|
|
|
@ -53,7 +53,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
|||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see runtests.jl for more info)
|
||||
|
||||
# each expression produces one result per variable set (i.e. per column of the variables), and there are n expressions
|
||||
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
|
||||
cudaResults = CuArray{Float32}(undef, variableCols * length(expressions))
|
||||
# cudaResults = CUDA.zeros(variableCols * length(expressions))
|
||||
# ptr = CuPtr{Float32}(C_NULL)
|
||||
# CUDA.cuMemAlloc(ptr, sizeof(Float32) * 10)
|
||||
|
@ -68,8 +68,9 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
|||
blocks = cld(variableCols, threads)
|
||||
|
||||
# cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
cudacall(kernels[i], (CuPtr{Float32},), cudaResults; threads=threads, blocks=blocks)
|
||||
# launch(kernels[i], cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
|
||||
# launch(kernels[i], cudaResults; threads=threads, blocks=blocks)
|
||||
end
|
||||
|
||||
println(Array(cudaResults))
|
||||
|
@ -120,9 +121,9 @@ end
|
|||
# TODO: Make version, target and address_size configurable; also see what address_size means exactly
|
||||
function get_cuda_header()::String
|
||||
return "
|
||||
.version 7.1
|
||||
.version 8.5
|
||||
.target sm_61
|
||||
.address_size 32
|
||||
.address_size 64
|
||||
"
|
||||
end
|
||||
|
||||
|
@ -137,11 +138,11 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
|
|||
println(signatureBuffer, "(")
|
||||
|
||||
for i in eachindex(parameters)
|
||||
print(signatureBuffer, " .param .u32", " ", "param_", i)
|
||||
print(signatureBuffer, " .param .u64", " ", "param_", i)
|
||||
|
||||
parametersLocation = get_next_free_register("i")
|
||||
println(paramLoadingBuffer, "ld.param.u32 $parametersLocation, [param_$i];")
|
||||
println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersLocation;")
|
||||
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
|
||||
println(paramLoadingBuffer, "cvta.to.global.u64 $(get_next_free_register("parameter")), $parametersLocation;")
|
||||
if i != lastindex(parameters)
|
||||
println(signatureBuffer, ",")
|
||||
end
|
||||
|
@ -169,12 +170,12 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer)::Tuple
|
|||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||
|
||||
globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
|
||||
# breakCondition = get_next_free_register("p")
|
||||
breakCondition = get_next_free_register("p")
|
||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||
# println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
|
||||
# branch to end if breakCondition is true
|
||||
# print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
|
||||
return (String(take!(guardBuffer)), globalThreadId)
|
||||
end
|
||||
|
@ -186,7 +187,7 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
|
|||
parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
|
||||
threadIdReg::String, expressionIndex::Integer, nrOfVarSets::Integer)::String
|
||||
|
||||
return "st.global.f32 [$resultsLocation], 10.0;"
|
||||
# return "st.global.f32 [$resultsLocation], 10.0;"
|
||||
|
||||
codeBuffer = IOBuffer()
|
||||
operands = Vector{Operand}()
|
||||
|
@ -360,9 +361,9 @@ let registers = Dict() # stores the count of the register already used.
|
|||
elseif definition.first == "r"
|
||||
regType = ".b32"
|
||||
elseif definition.first == "parameter"
|
||||
regType = ".u32"
|
||||
regType = ".b64"
|
||||
elseif definition.first == "i"
|
||||
regType = ".u32"
|
||||
regType = ".b64"
|
||||
else
|
||||
throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
|
||||
end
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
using LinearAlgebra
|
||||
using BenchmarkTools
|
||||
|
||||
function test_cpu_interpreter(nrows; parallel = false)
|
||||
exprs = [
|
||||
|
@ -18,16 +19,27 @@ function test_cpu_interpreter(nrows; parallel = false)
|
|||
reps= 100
|
||||
|
||||
if parallel
|
||||
t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
# t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||
@btime parallel(exprs, X, p, expr_reps, reps)
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
else
|
||||
t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
# t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
|
||||
@btime single(exprs, X, p, expr_reps, reps)
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
end
|
||||
true
|
||||
end
|
||||
|
||||
LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
|
||||
function parallel(exprs, X, p, expr_reps, reps)
|
||||
fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||
end
|
||||
|
||||
function single(exprs, X, p, expr_reps, reps)
|
||||
for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
|
||||
end
|
||||
|
||||
|
||||
# LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
|
||||
|
||||
@test test_cpu_interpreter(1000)
|
||||
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
|
||||
|
|
47
package/test/PerformanceTests.jl
Normal file
47
package/test/PerformanceTests.jl
Normal file
|
@ -0,0 +1,47 @@
|
|||
using .Transpiler
|
||||
using .Interpreter
|
||||
|
||||
@testset "CPU performance" begin
|
||||
function test_cpu_interpreter(nrows; parallel = false)
|
||||
exprs = [
|
||||
# CPU interpreter requires an anonymous function and array refs
|
||||
:(p[1] * x[1] + p[2]), # 5 op
|
||||
:((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
|
||||
:(log(abs(x[1]))), # 3 op
|
||||
:(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
|
||||
] # 30 op
|
||||
exprs = map(e -> Expr(:->, :(x,p), e), exprs)
|
||||
X = randn(Float32, nrows, 10)
|
||||
p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr
|
||||
|
||||
# warmup
|
||||
interpret_cpu(exprs, X, p)
|
||||
expr_reps = 100 # for each expr
|
||||
reps= 100
|
||||
|
||||
if parallel
|
||||
t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
else
|
||||
t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
end
|
||||
true
|
||||
end
|
||||
|
||||
LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
|
||||
|
||||
@test test_cpu_interpreter(1000)
|
||||
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
|
||||
@test test_cpu_interpreter(10000)
|
||||
@test test_cpu_interpreter(10000, parallel=true)
|
||||
|
||||
end
|
||||
|
||||
@testset "Interpreter Performance" begin
|
||||
|
||||
end
|
||||
|
||||
@testset "Transpiler Performance" begin
|
||||
|
||||
end
|
|
@ -1,4 +1,6 @@
|
|||
[deps]
|
||||
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
|
||||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
|
||||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
|
||||
|
|
|
@ -48,30 +48,70 @@ end
|
|||
|
||||
expr = Vector{Expr}()
|
||||
push!(expr, expressions[1])
|
||||
# @time Transpiler.evaluate(expr, variables, parameters)
|
||||
@time Transpiler.evaluate(expr, variables, parameters)
|
||||
end
|
||||
|
||||
#TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
|
||||
|
||||
function test_kernel(results)
|
||||
@inbounds results[1] = 10f0
|
||||
|
||||
return nothing
|
||||
end
|
||||
|
||||
@testset "TEMP" begin
|
||||
return
|
||||
results = CuArray{Float32}(undef, 2)
|
||||
# @device_code_ptx @cuda test_kernel(results)
|
||||
|
||||
|
||||
# println(CUDA.code_ptx(kernel.fun, ))
|
||||
# return
|
||||
|
||||
ptx = "
|
||||
.version 7.1
|
||||
.version 8.5
|
||||
.target sm_61
|
||||
.address_size 64
|
||||
|
||||
.visible .entry ExpressionProcessing(
|
||||
.param .u32 param_1)
|
||||
.param .u64 param_1)
|
||||
{
|
||||
.reg .u32 %parameter<1>;
|
||||
.reg .u32 %i<1>;
|
||||
.reg .b64 %parameter<1>;
|
||||
.reg .b64 %i<1>;
|
||||
//.reg .b64 %rd<6>;
|
||||
|
||||
ld.param.u32 %i0, [param_1];
|
||||
cvta.to.global.u32 %parameter0, %i0;
|
||||
ld.param.u64 %i0, [param_1];
|
||||
cvta.to.global.u64 %parameter0, %i0;
|
||||
|
||||
st.global.f32 [%parameter0], 10.0;
|
||||
ret;
|
||||
}"
|
||||
|
||||
ptx = ".version 8.5
|
||||
.target sm_61
|
||||
.address_size 64
|
||||
|
||||
.visible .entry ExpressionProcessing(
|
||||
.param .u64 param_1)
|
||||
{
|
||||
.reg .b64 %parameter<1>;
|
||||
.reg .b32 %r<4>;
|
||||
.reg .pred %p<1>;
|
||||
.reg .b64 %i<1>;
|
||||
|
||||
ld.param.u64 %i0, [param_1];
|
||||
cvta.to.global.u64 %parameter0, %i0;
|
||||
|
||||
mov.u32 %r0, %ntid.x;
|
||||
mov.u32 %r1, %ctaid.x;
|
||||
mov.u32 %r2, %tid.x;
|
||||
mad.lo.s32 %r3, %r0, %r1, %r2;
|
||||
setp.gt.s32 %p0, %r3, 2;
|
||||
@%p0 bra \$L__BB0_2;
|
||||
st.global.f32 [%parameter0], 10.0;
|
||||
\$L__BB0_2: ret;
|
||||
}"
|
||||
|
||||
linker = CuLink()
|
||||
add_data!(linker, "ExpressionProcessing", ptx)
|
||||
|
||||
|
@ -90,11 +130,10 @@ end
|
|||
threads = min(variableCols, config.threads)
|
||||
blocks = cld(variableCols, threads)
|
||||
|
||||
cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=1, blocks=1)
|
||||
cudacall(func, Tuple{CuPtr{Float32}}, cudaResults; threads=4, blocks=1)
|
||||
# launch(func, cudaResults; threads=threads, blocks=blocks)
|
||||
|
||||
println(Array(cudaResults))
|
||||
end
|
||||
|
||||
|
||||
# TODO: University setup at 10.20.1.7
|
|
@ -14,9 +14,9 @@ include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
|||
end
|
||||
|
||||
|
||||
#@testset "CPU Interpreter" begin
|
||||
# include("CpuInterpreterTests.jl")
|
||||
#end
|
||||
# @testset "CPU Interpreter" begin
|
||||
# include("CpuInterpreterTests.jl")
|
||||
# end
|
||||
|
||||
@testset "Performance tests" begin
|
||||
# TODO: make performance tests
|
||||
|
|
Loading…
Reference in New Issue
Block a user