diff --git a/package/test/benchmarks/3/gpu_transpiler_yet_to_be_done.json b/package/test/benchmarks/3/gpu_transpiler_yet_to_be_done.json index 0409a3d..193b321 100644 --- a/package/test/benchmarks/3/gpu_transpiler_yet_to_be_done.json +++ b/package/test/benchmarks/3/gpu_transpiler_yet_to_be_done.json @@ -42,8 +42,8 @@ } ], "times": [ - --3.7202049569362e13, - --3.7400159760069e13 + 3.7202049569362e13, + 3.7400159760069e13 ] } ] diff --git a/package/test/benchmarks/4/gpu_transpiler_yet_to_be_done.json b/package/test/benchmarks/4/gpu_transpiler_yet_to_be_done.json new file mode 100644 index 0000000..a7d8588 --- /dev/null +++ b/package/test/benchmarks/4/gpu_transpiler_yet_to_be_done.json @@ -0,0 +1,196 @@ +[ + { + "Julia": "1.11.5", + "BenchmarkTools": { + "major": 1, + "minor": 6, + "patch": 0, + "prerelease": [], + "build": [] + } + }, + [ + [ + "BenchmarkGroup", + { + "data": { + "GPUT": [ + "BenchmarkGroup", + { + "data": { + "nikuradse_1": [ + "Trial", + { + "allocs": 1534112879, + "gctimes": [ + 3.398826747854e12, + 2.618070795579e12 + ], + "memory": 51380857328968, + "params": [ + "Parameters", + { + "gctrial": true, + "time_tolerance": 0.05, + "evals_set": false, + "samples": 50, + "evals": 1, + "gcsample": false, + "seconds": 43200.0, + "overhead": 0.0, + "memory_tolerance": 0.01 + } + ], + "times": [ + --3.7202049569362e13, + --3.7400159760069e13 + ] + } + ] + }, + "tags": [ + "GPUTranspiler" + ] + } + ], + "GPUI": [ + "BenchmarkGroup", + { + "data": { + "nikuradse_1": [ + "Trial", + { + "allocs": 32241320, + "gctimes": [ + 3.76843873e8, + 3.87520681e8, + 3.53674001e8, + 3.67061252e8, + 3.741527e8, + 3.69293996e8, + 3.63305802e8, + 3.61913634e8, + 3.51818682e8, + 3.48188601e8, + 3.62864887e8, + 3.47736729e8, + 3.50237523e8, + 3.53595403e8, + 3.51245475e8, + 3.57725399e8, + 3.48667085e8, + 3.5174771e8, + 3.50159541e8, + 3.57487652e8, + 3.61893033e8, + 3.67797485e8, + 3.44948035e8, + 3.50222654e8, + 3.36037781e8, + 3.50770955e8, + 3.48655148e8, + 3.46508038e8, 
+ 3.48958873e8, + 4.49202169e8, + 3.53247995e8, + 3.71504213e8, + 3.5431637e8, + 3.59468716e8, + 3.46016454e8, + 3.69149583e8, + 3.65486404e8, + 4.45340687e8, + 4.37909167e8, + 3.3690913e8, + 3.50482929e8, + 3.49559472e8, + 3.38465639e8, + 3.44654417e8, + 3.49173998e8, + 3.50582847e8, + 3.55724581e8, + 3.4921611e8, + 3.55360179e8, + 3.48805235e8 + ], + "memory": 45874227656, + "params": [ + "Parameters", + { + "gctrial": true, + "time_tolerance": 0.05, + "evals_set": false, + "samples": 50, + "evals": 1, + "gcsample": false, + "seconds": 43200.0, + "overhead": 0.0, + "memory_tolerance": 0.01 + } + ], + "times": [ + 3.07178374e10, + 3.0668015775e10, + 3.0731090373e10, + 3.0442775184e10, + 3.0456642482e10, + 3.0082122734e10, + 3.0126331654e10, + 3.0751723908e10, + 3.1179628532e10, + 3.0065663574e10, + 3.0464515622e10, + 3.0393855038e10, + 3.1635622751e10, + 3.0447222014e10, + 2.973601985e10, + 3.0033623194e10, + 3.0580015719e10, + 3.1400733412e10, + 3.0272328646e10, + 3.0223853837e10, + 2.9915814997e10, + 3.0818324531e10, + 3.0179331592e10, + 3.0293039282e10, + 3.0017377964e10, + 3.0087189496e10, + 3.0582174914e10, + 2.996325235e10, + 3.0134649182e10, + 3.1042223141e10, + 3.0007740363e10, + 3.0437426607e10, + 3.0810836436e10, + 3.1234163757e10, + 3.0221879009e10, + 3.0338940936e10, + 3.1233683944e10, + 3.1019897889e10, + 3.1380379599e10, + 2.9821214171e10, + 3.0882968215e10, + 3.0159994975e10, + 3.0309932542e10, + 2.9969275606e10, + 3.0447151474e10, + 3.0342592912e10, + 3.024330255e10, + 3.0258060029e10, + 3.0095601739e10, + 3.0209601692e10 + ] + } + ] + }, + "tags": [ + "GPUInterpreter" + ] + } + ] + }, + "tags": [] + } + ] + ] +] \ No newline at end of file diff --git a/package/test/benchmarks/4/gpui_blocksize_192.json b/package/test/benchmarks/4/gpui_blocksize_192.json new file mode 100644 index 0000000..a9c75e3 --- /dev/null +++ b/package/test/benchmarks/4/gpui_blocksize_192.json @@ -0,0 +1,196 @@ +[ + { + "Julia": "1.11.5", + "BenchmarkTools": { + "major": 1, + 
"minor": 6, + "patch": 0, + "prerelease": [], + "build": [] + } + }, + [ + [ + "BenchmarkGroup", + { + "data": { + "GPUT": [ + "BenchmarkGroup", + { + "data": { + "nikuradse_1": [ + "Trial", + { + "allocs": 1534112879, + "gctimes": [ + 3.398826747854e12, + 2.618070795579e12 + ], + "memory": 51380857328968, + "params": [ + "Parameters", + { + "gctrial": true, + "time_tolerance": 0.05, + "evals_set": false, + "samples": 50, + "evals": 1, + "gcsample": false, + "seconds": 43200.0, + "overhead": 0.0, + "memory_tolerance": 0.01 + } + ], + "times": [ + --3.7202049569362e13, + --3.7400159760069e13 + ] + } + ] + }, + "tags": [ + "GPUTranspiler" + ] + } + ], + "GPUI": [ + "BenchmarkGroup", + { + "data": { + "nikuradse_1": [ + "Trial", + { + "allocs": 32241307, + "gctimes": [ + 2.99988451e8, + 3.18541335e8, + 3.40658917e8, + 3.20735576e8, + 3.17668135e8, + 3.11634185e8, + 3.55400831e8, + 3.25257947e8, + 3.25941878e8, + 3.31627658e8, + 3.2513644e8, + 5.34886621e8, + 4.30305899e8, + 4.75073379e8, + 5.41262095e8, + 5.14748243e8, + 4.91966069e8, + 4.55043676e8, + 4.70840046e8, + 5.50526217e8, + 4.31207494e8, + 4.76072811e8, + 5.04324319e8, + 5.72218216e8, + 4.11391335e8, + 4.73366047e8, + 5.12748251e8, + 4.58269866e8, + 3.87267173e8, + 5.38187011e8, + 4.56822334e8, + 4.24688896e8, + 5.94190171e8, + 5.28701852e8, + 5.15021748e8, + 6.10057318e8, + 4.74982584e8, + 4.33478296e8, + 4.33664662e8, + 4.22168618e8, + 4.16528265e8, + 4.15685104e8, + 4.23277232e8, + 3.74337751e8, + 4.25875703e8, + 5.42365157e8, + 4.94701466e8, + 4.83233782e8, + 4.24986417e8, + 4.8780606e8 + ], + "memory": 45874227384, + "params": [ + "Parameters", + { + "gctrial": true, + "time_tolerance": 0.05, + "evals_set": false, + "samples": 50, + "evals": 1, + "gcsample": false, + "seconds": 43200.0, + "overhead": 0.0, + "memory_tolerance": 0.01 + } + ], + "times": [ + 3.055626804e10, + 3.0413771477e10, + 3.0058609633e10, + 3.007921294e10, + 3.0178903964e10, + 3.0243374529e10, + 3.0043488197e10, + 2.9849309299e10, + 
3.0134058306e10, + 3.0627343705e10, + 3.0130179115e10, + 4.8987140933e10, + 1.0029494223e11, + 9.991837876e10, + 1.01083284461e11, + 1.00013926981e11, + 1.00050439359e11, + 1.00453826906e11, + 1.00398291414e11, + 1.0026599822e11, + 1.00645806674e11, + 9.9875971997e10, + 9.9612950384e10, + 1.00253673473e11, + 9.9643175894e10, + 1.0027620915e11, + 9.9714066248e10, + 1.00141668213e11, + 1.00269405678e11, + 1.00149909912e11, + 1.00645303739e11, + 9.9693734213e10, + 1.01986856167e11, + 1.00367529986e11, + 9.986664487e10, + 1.01112512248e11, + 9.9866828996e10, + 9.887153973e10, + 9.9119068947e10, + 9.9161506987e10, + 9.8659948079e10, + 9.9016722639e10, + 9.9226347837e10, + 9.9361219392e10, + 9.9532328849e10, + 9.9181660704e10, + 9.9525871099e10, + 9.877397928e10, + 9.8880425186e10, + 9.9195828801e10 + ] + } + ] + }, + "tags": [ + "GPUInterpreter" + ] + } + ] + }, + "tags": [] + } + ] + ] +] \ No newline at end of file diff --git a/thesis/chapters/evaluation.tex b/thesis/chapters/evaluation.tex index b1c57f1..82f76ab 100644 --- a/thesis/chapters/evaluation.tex +++ b/thesis/chapters/evaluation.tex @@ -71,8 +71,10 @@ This section presents the results of the benchmarks described above. First the r \subsection{Interpreter} % Results only for Interpreter (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section) -In this section, the results for the interpreter are presented in detail. ... +In this section, the results for the GPU-based interpreter are presented in detail. Following the benchmark results, the process of tuning the interpreter is described as well as how to adapt the tuning for the different benchmarks. This part not only contains the tuning of the GPU, but also performance improvements done on the CPU side. + \subsubsection{Benchmark 1} +The first benchmark consisted of $250\,000$ expressions and $362$ variable sets with $100$ parameter optimisation steps. 
Because each expression needs to be evaluated with each variable set for each parameter optimisation step, a total of $9.05\,\textit{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all executions is $466.3$ seconds with a standard deviation of $14.2$ seconds. \begin{figure} \centering \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png} @@ -80,14 +82,23 @@ In this section, the results for the interpreter are presented in detail. ... \label{fig:gpu_i_benchmark_1} \end{figure} +% talk about kernel configuration (along the lines of: results achieved with block size of X) etc. Also include that CPU and GPU utilisation was 100% the entire time. If this is too short, just add it to the above paragraph and make the 4 benchmark sections relatively short, as the most interesting information is in the performance tuning and comparison sections anyway + \subsubsection{Benchmark 2} \subsubsection{Benchmark 3} +std of 750.1 ms +\begin{figure} + \centering + \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark3.png} + \caption{The results of the GPU-based interpreter for benchmark 3} + \label{fig:gpu_i_benchmark_3} +\end{figure} \subsubsection{Benchmark 4} \subsubsection{Performance Tuning} % either subsubSection or change the title to "Performance Tuning Interpreter" -Document the process of performance tuning +Document the process of performance tuning (mostly GPU, but also talk about CPU. 
Especially the rearranging of data transfer and non-usage of a cache) Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded diff --git a/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png b/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png index 610ac6a..fe59dbf 100644 Binary files a/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png and b/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png differ diff --git a/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png b/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png new file mode 100644 index 0000000..8a75d84 Binary files /dev/null and b/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png differ diff --git a/thesis/main.pdf b/thesis/main.pdf index 5e6def8..1d3024a 100644 Binary files a/thesis/main.pdf and b/thesis/main.pdf differ