benchmarking: added benchmark4 interpreter results; extended evaluation section
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
2025-05-24 13:17:15 +02:00
parent 2bbdef6837
commit 5f44e4d122
7 changed files with 407 additions and 4 deletions

View File

@ -42,8 +42,8 @@
} }
], ],
"times": [ "times": [
--3.7202049569362e13, 3.7202049569362e13,
--3.7400159760069e13 3.7400159760069e13
] ]
} }
] ]

View File

@ -0,0 +1,196 @@
[
{
"Julia": "1.11.5",
"BenchmarkTools": {
"major": 1,
"minor": 6,
"patch": 0,
"prerelease": [],
"build": []
}
},
[
[
"BenchmarkGroup",
{
"data": {
"GPUT": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 1534112879,
"gctimes": [
3.398826747854e12,
2.618070795579e12
],
"memory": 51380857328968,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
3.7202049569362e13,
3.7400159760069e13
]
}
]
},
"tags": [
"GPUTranspiler"
]
}
],
"GPUI": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 32241320,
"gctimes": [
3.76843873e8,
3.87520681e8,
3.53674001e8,
3.67061252e8,
3.741527e8,
3.69293996e8,
3.63305802e8,
3.61913634e8,
3.51818682e8,
3.48188601e8,
3.62864887e8,
3.47736729e8,
3.50237523e8,
3.53595403e8,
3.51245475e8,
3.57725399e8,
3.48667085e8,
3.5174771e8,
3.50159541e8,
3.57487652e8,
3.61893033e8,
3.67797485e8,
3.44948035e8,
3.50222654e8,
3.36037781e8,
3.50770955e8,
3.48655148e8,
3.46508038e8,
3.48958873e8,
4.49202169e8,
3.53247995e8,
3.71504213e8,
3.5431637e8,
3.59468716e8,
3.46016454e8,
3.69149583e8,
3.65486404e8,
4.45340687e8,
4.37909167e8,
3.3690913e8,
3.50482929e8,
3.49559472e8,
3.38465639e8,
3.44654417e8,
3.49173998e8,
3.50582847e8,
3.55724581e8,
3.4921611e8,
3.55360179e8,
3.48805235e8
],
"memory": 45874227656,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
3.07178374e10,
3.0668015775e10,
3.0731090373e10,
3.0442775184e10,
3.0456642482e10,
3.0082122734e10,
3.0126331654e10,
3.0751723908e10,
3.1179628532e10,
3.0065663574e10,
3.0464515622e10,
3.0393855038e10,
3.1635622751e10,
3.0447222014e10,
2.973601985e10,
3.0033623194e10,
3.0580015719e10,
3.1400733412e10,
3.0272328646e10,
3.0223853837e10,
2.9915814997e10,
3.0818324531e10,
3.0179331592e10,
3.0293039282e10,
3.0017377964e10,
3.0087189496e10,
3.0582174914e10,
2.996325235e10,
3.0134649182e10,
3.1042223141e10,
3.0007740363e10,
3.0437426607e10,
3.0810836436e10,
3.1234163757e10,
3.0221879009e10,
3.0338940936e10,
3.1233683944e10,
3.1019897889e10,
3.1380379599e10,
2.9821214171e10,
3.0882968215e10,
3.0159994975e10,
3.0309932542e10,
2.9969275606e10,
3.0447151474e10,
3.0342592912e10,
3.024330255e10,
3.0258060029e10,
3.0095601739e10,
3.0209601692e10
]
}
]
},
"tags": [
"GPUInterpreter"
]
}
]
},
"tags": []
}
]
]
]

View File

@ -0,0 +1,196 @@
[
{
"Julia": "1.11.5",
"BenchmarkTools": {
"major": 1,
"minor": 6,
"patch": 0,
"prerelease": [],
"build": []
}
},
[
[
"BenchmarkGroup",
{
"data": {
"GPUT": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 1534112879,
"gctimes": [
3.398826747854e12,
2.618070795579e12
],
"memory": 51380857328968,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
3.7202049569362e13,
3.7400159760069e13
]
}
]
},
"tags": [
"GPUTranspiler"
]
}
],
"GPUI": [
"BenchmarkGroup",
{
"data": {
"nikuradse_1": [
"Trial",
{
"allocs": 32241307,
"gctimes": [
2.99988451e8,
3.18541335e8,
3.40658917e8,
3.20735576e8,
3.17668135e8,
3.11634185e8,
3.55400831e8,
3.25257947e8,
3.25941878e8,
3.31627658e8,
3.2513644e8,
5.34886621e8,
4.30305899e8,
4.75073379e8,
5.41262095e8,
5.14748243e8,
4.91966069e8,
4.55043676e8,
4.70840046e8,
5.50526217e8,
4.31207494e8,
4.76072811e8,
5.04324319e8,
5.72218216e8,
4.11391335e8,
4.73366047e8,
5.12748251e8,
4.58269866e8,
3.87267173e8,
5.38187011e8,
4.56822334e8,
4.24688896e8,
5.94190171e8,
5.28701852e8,
5.15021748e8,
6.10057318e8,
4.74982584e8,
4.33478296e8,
4.33664662e8,
4.22168618e8,
4.16528265e8,
4.15685104e8,
4.23277232e8,
3.74337751e8,
4.25875703e8,
5.42365157e8,
4.94701466e8,
4.83233782e8,
4.24986417e8,
4.8780606e8
],
"memory": 45874227384,
"params": [
"Parameters",
{
"gctrial": true,
"time_tolerance": 0.05,
"evals_set": false,
"samples": 50,
"evals": 1,
"gcsample": false,
"seconds": 43200.0,
"overhead": 0.0,
"memory_tolerance": 0.01
}
],
"times": [
3.055626804e10,
3.0413771477e10,
3.0058609633e10,
3.007921294e10,
3.0178903964e10,
3.0243374529e10,
3.0043488197e10,
2.9849309299e10,
3.0134058306e10,
3.0627343705e10,
3.0130179115e10,
4.8987140933e10,
1.0029494223e11,
9.991837876e10,
1.01083284461e11,
1.00013926981e11,
1.00050439359e11,
1.00453826906e11,
1.00398291414e11,
1.0026599822e11,
1.00645806674e11,
9.9875971997e10,
9.9612950384e10,
1.00253673473e11,
9.9643175894e10,
1.0027620915e11,
9.9714066248e10,
1.00141668213e11,
1.00269405678e11,
1.00149909912e11,
1.00645303739e11,
9.9693734213e10,
1.01986856167e11,
1.00367529986e11,
9.986664487e10,
1.01112512248e11,
9.9866828996e10,
9.887153973e10,
9.9119068947e10,
9.9161506987e10,
9.8659948079e10,
9.9016722639e10,
9.9226347837e10,
9.9361219392e10,
9.9532328849e10,
9.9181660704e10,
9.9525871099e10,
9.877397928e10,
9.8880425186e10,
9.9195828801e10
]
}
]
},
"tags": [
"GPUInterpreter"
]
}
]
},
"tags": []
}
]
]
]

View File

@ -71,8 +71,10 @@ This section presents the results of the benchmarks described above. First the r
\subsection{Interpreter} \subsection{Interpreter}
% Results only for Interpreter (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section) % Results only for Interpreter (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section)
In this section, the results for the interpreter are presented in detail. ... In this section, the results for the GPU-based interpreter are presented in detail. Following the benchmark results, the process of tuning the interpreter is described, as well as how to adapt the tuning to the different benchmarks. This part covers not only the tuning of the GPU, but also the performance improvements made on the CPU side.
\subsubsection{Benchmark 1} \subsubsection{Benchmark 1}
The first benchmark consisted of $250\,000$ expressions and $362$ variable sets with $100$ parameter optimisation steps. Because each expression needs to be evaluated with each variable set for each parameter optimisation step, a total of $9.05\,\textit{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all executions is $466.3$ seconds with a standard deviation of $14.2$ seconds.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png} \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png}
@ -80,14 +82,23 @@ In this section, the results for the interpreter are presented in detail. ...
\label{fig:gpu_i_benchmark_1} \label{fig:gpu_i_benchmark_1}
\end{figure} \end{figure}
% talk about kernel configuration (along the lines of: results achieved with block size of X) etc. Also include that CPU and GPU utilisation was 100% the entire time. If this is too short, just add it to the above paragraph and make the 4 benchmark sections relatively short, as the most interesting information is in the performance tuning and comparison sections anyway
\subsubsection{Benchmark 2} \subsubsection{Benchmark 2}
\subsubsection{Benchmark 3} \subsubsection{Benchmark 3}
std of 750.1 ms
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark3.png}
\caption{The results of the GPU-based interpreter for benchmark 3}
\label{fig:gpu_i_benchmark_3}
\end{figure}
\subsubsection{Benchmark 4} \subsubsection{Benchmark 4}
\subsubsection{Performance Tuning} % either subsubSection or change the title to "Performance Tuning Interpreter" \subsubsection{Performance Tuning} % either subsubSection or change the title to "Performance Tuning Interpreter"
Document the process of performance tuning Document the process of performance tuning (mostly GPU, but also talk about CPU, especially the rearranging of data transfer and the non-usage of a cache)
Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.