using LinearAlgebra
using BenchmarkTools
using DelimitedFiles
using GZip
using CUDA

using .Transpiler
using .Interpreter
using .ExpressionProcessing

include("parser.jl") # to parse expressions from a file

const BENCHMARKS_RESULTS_PATH = "./results-fh-new"

# The number of expressions can get very large (into the millions).
# Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum.

data, varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
X = convert(Matrix{Float32}, data)
X_t = permutedims(X) # transposed layout for the GPU kernels

exprs = Expr[]
parameters = Vector{Vector{Float32}}()
varnames = ["x$i" for i in 1:10]   # overwrites the CSV header names; parse_infix expects generic x1..x10 / p1..p20 names
paramnames = ["p$i" for i in 1:20]

# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
    for line in eachline(io)
        expr, p = parse_infix(line, varnames, paramnames)
        push!(exprs, expr)
        push!(parameters, randn(Float32, length(p)))
    end
end

expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, not X)

# TODO: Tips for tuning (a hedged shared-memory sketch follows below):
# Put data in shared memory:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
# Make arrays const:
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
# Memory management like in C++ might help with performance improvements:
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
# Nsight Systems and Nsight Compute are installable via WSL; the Compute UI can even be used inside WSL.
# Add /usr/local/cuda/bin to PATH in .bashrc to get access to ncu and nsys (run the tests on the FH PCs).
# University setup at 10.20.1.7 and 10.20.1.13
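# Hedged sketch of the first two tuning tips above; NOT part of the benchmark and never
# launched here. `shared_staging_kernel!` is a hypothetical name (the real kernels live in
# the Interpreter/Transpiler modules), the arithmetic is a placeholder, and
# CuStaticSharedArray assumes a recent CUDA.jl (older versions used @cuStaticSharedMem).
function shared_staging_kernel!(results, X_t)
    X_ro = CUDA.Const(X_t)                      # mark the input as read-only (ldg loads)
    tile = CuStaticSharedArray(Float32, 256)    # per-block shared memory; size must be a compile-time constant
    tid = threadIdx().x
    i = (blockIdx().x - 1) * blockDim().x + tid
    n = min(256, size(X_t, 2))
    if tid <= n
        tile[tid] = X_ro[1, tid]                # stage a slice of the first data row
    end
    sync_threads()                              # make the staged tile visible to the whole block
    if i <= length(results)
        results[i] = tile[mod1(i, n)] * 2.0f0   # placeholder computation on the staged data
    end
    return nothing
end
# A launch would look roughly like this (hypothetical device buffers):
#   @cuda threads=256 blocks=cld(length(d_results), 256) shared_staging_kernel!(d_results, d_X_t)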
compareWithCPU = false

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

if compareWithCPU
    suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps)
    suite["CPU"]["nikuradse_1_parallel"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
end

# NOTE: the GPU interpreter benchmark is currently disabled; enable it to populate
# results["GPUI"], which the comparisons below expect.
# cacheInterpreter = Dict{Expr, PostfixType}()
# suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)

# cacheTranspilerFront = Dict{Expr, PostfixType}()
# cacheTranspilerRes = Dict{Expr, CuFunction}()
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)

# One-off tuning step: the throw below aborts the script once the tuned parameters have
# been saved; comment these three lines out afterwards so the benchmark run below is reached.
tune!(suite)
BenchmarkTools.save("params.json", params(suite))
throw("finished tuning")

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=3600) # 1 hour because of the CPU; let's see if more is needed

if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is the interpreter better than the CPU?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is the transpiler better than the CPU?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is the transpiler better than the interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is the interpreter better than the CPU?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is the transpiler better than the CPU?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is the transpiler better than the interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/0_initial.json", results)
else
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is the interpreter better than the old run?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old)

    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is the transpiler better than the old run?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old)

    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end
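# Hedged follow-up, not part of the original workflow: the judgements above are
# BenchmarkGroups of TrialJudgement objects, so regressions/improvements can also be
# extracted programmatically instead of reading the printed groups. The guard mirrors
# the branch structure above; `oldVsGPUT_median` only exists when compareWithCPU is false.
if !compareWithCPU
    println()
    println("Transpiler entries that regressed vs. the old run:")
    println(BenchmarkTools.regressions(oldVsGPUT_median))
    println("Transpiler entries that improved vs. the old run:")
    println(BenchmarkTools.improvements(oldVsGPUT_median))

    # Persisting the new results (placeholder file name) would mirror the save in the
    # compareWithCPU branch and give the next run something to judge against:
    # BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/4-next-change.json", results)
end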