benchmarking: fixed bugs; took initial_benchmark
parent ad175abac0 · commit 1dc0c1898d
@@ -45,19 +45,23 @@ end
 #TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
-    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x
-    stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
+    varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
+    # stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
+    variableCols = length(variables) / stepsize[3]
+
+    if varSetIndex > variableCols
+        return
+    end
 
     firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
     lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
     firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
-    variableCols = length(variables) / stepsize[3]
 
     operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
     operationStackTop = 0 # stores index of the last defined/valid value
 
-    for varSetIndex in index:stride
-        firstVariableIndex = ((varSetIndex - 1) * stepsize[3]) # Exclusive
+    # for varSetIndex in index:stride
+    firstVariableIndex = ((varSetIndex-1) * stepsize[3]) # Exclusive
 
     for i in firstExprIndex:lastExprIndex
         if expressions[i].Type == EMPTY
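
The hunk above replaces the stride setup with a direct one-thread-per-variable-set index plus an explicit bounds check. A minimal sketch of that pattern with CUDA.jl, using an illustrative kernel and array names that are not part of this package:

using CUDA

# Hypothetical kernel: each thread squares one element, mirroring the
# "global index + early return" pattern adopted in the commit.
function square_kernel!(out, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x # 1-based global thread index
    if i > length(x)
        return # threads beyond the end of the data do nothing
    end
    @inbounds out[i] = x[i] * x[i]
    return
end

x = CUDA.rand(Float32, 1_000)
out = CUDA.zeros(Float32, length(x))
threads = 256
blocks = cld(length(x), threads) # enough blocks to cover every element
@cuda threads=threads blocks=blocks square_kernel!(out, x)

For reference, a grid-stride loop would be written as for j in i:stride:length(x) (note the step); the removed loop iterated the plain range index:stride, so launching one guarded thread per variable set sidesteps that as well.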
@@ -69,7 +73,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
             if val > 0
                 operationStack[operationStackTop] = variables[firstVariableIndex + val]
             else
-                val = -val
+                val = abs(val)
                 operationStack[operationStackTop] = parameters[firstParamIndex + val]
             end
         elseif expressions[i].Type == FLOAT32
@@ -110,7 +114,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
         # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
         resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
         results[resultIndex] = operationStack[operationStackTop]
-    end
+    # end
 
     return
 end
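
Inside the kernel, each expression is evaluated against a fixed-size operand stack (operationStack, indexed by operationStackTop). A small CPU-side sketch of the same stack discipline on a postfix token list; the token representation here is made up and is not the package's ExpressionElement:

# Minimal postfix (reverse Polish) evaluator with a fixed-size stack,
# illustrating the push/pop discipline the kernel uses.
const MAX_STACK = 25

function eval_postfix(tokens::Vector{Any})
    stack = Vector{Float32}(undef, MAX_STACK)
    top = 0 # index of the last defined/valid value
    for t in tokens
        if t isa Number
            top += 1
            stack[top] = Float32(t) # operand: push onto the stack
        elseif t === :+
            stack[top - 1] = stack[top - 1] + stack[top] # operator: pop two, push one
            top -= 1
        elseif t === :*
            stack[top - 1] = stack[top - 1] * stack[top]
            top -= 1
        end
    end
    return stack[top] # final result is the top of the stack
end

eval_postfix(Any[2, 3, :+, 4, :*]) # (2 + 3) * 4 == 20.0f0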
@@ -4,6 +4,7 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter
 
+const BENCHMARKS_RESULTS_PATH = "./results"
 # University setup at 10.20.1.7 if needed
 exprsCPU = [
     # CPU interpreter requires an anonymous function and array refs
@@ -24,10 +25,9 @@ exprsGPU = [
 
 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-nrows = 1000
-X = randn(Float32, nrows, 5)
 
 expr_reps = 100 # 100 parameter optimisation steps basically
 
 
 @testset "CPU performance" begin
     # warmup
     # interpret_cpu(exprsCPU, X, p)
@@ -42,8 +42,6 @@ expr_reps = 100 # 100 parameter optimisation steps basically
 
 end
 
-ncols = 1000
-X_GPU = randn(Float32, 5, ncols)
 @testset "Interpreter Performance" begin
     # Put data in shared memory:
     # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
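
The comment above flags CUDA shared memory as a follow-up. A minimal sketch, assuming CUDA.jl's CuStaticSharedArray for a statically sized per-block buffer; the kernel, sizes, and array names are illustrative and not this package's code:

using CUDA

# Each block stages its slice of the input in shared memory before using it.
# The 256 here must match the threads-per-block used at launch.
function staged_copy!(out, x)
    tid = threadIdx().x
    i = (blockIdx().x - 1) * blockDim().x + tid
    shared = CuStaticSharedArray(Float32, 256) # per-block shared buffer
    if i <= length(x)
        shared[tid] = x[i]
    end
    sync_threads() # make the staged values visible to the whole block
    if i <= length(x)
        out[i] = shared[tid] * 2f0
    end
    return
end

x = CUDA.rand(Float32, 1_000)
out = CUDA.zeros(Float32, length(x))
@cuda threads=256 blocks=cld(length(x), 256) staged_copy!(out, x)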
@@ -95,21 +93,54 @@ X_large_GPU = randn(Float32, 5, varsets_large)
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 
-tune!(suite)
-
-BenchmarkTools.save("params.json", params(suite))
-# loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
-
-# results = run(suite, verbose=true, seconds=180)
-# results2 = run(suite, verbose=true, seconds=180)
-
-# medianCPU = median(results["CPU"])
-# medianInterpreter = median(results["GPUI"])
-# medianTranspiler = median(results["GPUT"])
-
-# jud = judge(medianCPU, medianCPU2; time_tolerance=0.001)
-# println(jud)
-
-# judge(medianCPU, medianInterpreter; time_tolerance=0.001)
-# judge(medianCPU, medianTranspiler; time_tolerance=0.001)
-# judge(medianInterpreter, medianTranspiler; time_tolerance=0.001)
+# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
+
+# tune!(suite)
+# BenchmarkTools.save("params.json", params(suite))
+
+loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
+
+results = run(suite, verbose=true, seconds=180)
+
+# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/initial_results.json", results)
+# initial_results = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/initial_results.json")
+
+medianCPU = median(results["CPU"])
+minimumCPU = minimum(results["CPU"])
+stdCPU = std(results["CPU"])
+
+medianInterpreter = median(results["GPUI"])
+minimumInterpreter = minimum(results["GPUI"])
+stdInterpreter = std(results["GPUI"])
+
+medianTranspiler = median(results["GPUT"])
+minimumTranspiler = minimum(results["GPUT"])
+stdTranspiler = std(results["GPUT"])
+
+cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
+cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
+gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?
+
+cpuVsGPUI_minimum = judge(minimumInterpreter, minimumCPU) # is interpreter better than cpu?
+cpuVsGPUT_minimum = judge(minimumTranspiler, minimumCPU) # is transpiler better than cpu?
+gpuiVsGPUT_minimum = judge(minimumTranspiler, minimumInterpreter) # is transpiler better than interpreter?
+
+cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
+cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
+gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?
+
+println("Is the interpreter better than the CPU implementation:")
+println(cpuVsGPUI_median)
+println(cpuVsGPUI_minimum)
+println(cpuVsGPUI_std)
+
+println("Is the transpiler better than the CPU implementation:")
+println(cpuVsGPUT_median)
+println(cpuVsGPUT_minimum)
+println(cpuVsGPUT_std)
+
+println("Is the transpiler better than the interpreter:")
+println(gpuiVsGPUT_median)
+println(gpuiVsGPUT_minimum)
+println(gpuiVsGPUT_std)
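
The new flow above tunes the suite once, caches the tuning parameters in params.json, reloads them with loadparams! on later runs so results stay comparable, and then compares estimates with judge. A condensed, self-contained sketch of that BenchmarkTools.jl workflow, with made-up benchmark names ("sum", "prod") standing in for the CPU/GPUI/GPUT groups:

using BenchmarkTools
using Statistics

suite = BenchmarkGroup()
suite["sum"] = @benchmarkable sum($(rand(Float32, 1000)))
suite["prod"] = @benchmarkable prod($(rand(Float32, 1000)))

if isfile("params.json")
    # Reuse the cached tuning so separate runs are measured under the same parameters.
    loadparams!(suite, BenchmarkTools.load("params.json")[1], :evals, :samples)
else
    tune!(suite)
    BenchmarkTools.save("params.json", params(suite))
end

results = run(suite, verbose=true)

# judge(target, baseline) reports :improvement / :invariant / :regression of the
# target estimate relative to the baseline, which is how the script asks
# "is the interpreter/transpiler better than the CPU implementation?".
comparison = judge(median(results["sum"]), median(results["prod"]))
println(comparison)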
package/test/initial_results.json (new file, 1 line) — file diff suppressed because one or more lines are too long
@@ -1 +1 @@
-[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"normal varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]
+[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]