benchmarking: finished taking evaluation results; evaluation: continued writing
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled

This commit is contained in:
2025-05-25 13:27:18 +02:00
parent 14b2e23d9a
commit 99a222341d
14 changed files with 262 additions and 57 deletions

View File

@ -1 +0,0 @@
The CPU did not finish. Once it started using the page-file, it just aborted. Therefore, there was too little RAM for this test.

View File

@ -0,0 +1 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[2.7425249e7,7.412701e7,3.6607941e7,7.2060594e7,3.4529434e7,7.6634167e7,3.2012321e7,7.3820784e7,3.3949954e7,7.8478248e7,4.0126379e7,7.8064709e7,3.7594681e7,7.7171913e7,3.2345052e7,7.4243448e7,3.4353198e7,7.6815947e7,3.3275476e7,7.6196381e7,3.5836579e7,7.9893164e7,3.426444e7,7.8096102e7,3.5667171e7,7.8791806e7,3.4285798e7,8.0897821e7,3.6955997e7,7.3759746e7,3.3773137e7,7.328944e7,3.4533305e7,7.4964616e7,3.4649633e7,7.4867313e7,3.6125153e7,7.7465251e7,3.4405076e7,8.0242334e7,3.2479474e7,7.5060436e7,3.272518e7,7.2772772e7,3.5399275e7,7.4715997e7,3.5420495e7,7.68539e7,3.5243677e7,7.4565513e7],"memory":660409808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[3.43605916e9,3.571094971e9,3.581169167e9,3.617386266e9,3.640177061e9,3.666140865e9,3.59312133e9,3.642660429e9,3.649362226e9,3.639535812e9,3.615756984e9,4.084913467e9,3.658895286e9,3.697572649e9,3.639317733e9,3.626500969e9,3.730074621e9,3.834972951e9,3.77581077e9,3.810886128e9,3.821828959e9,3.810445379e9,3.74010373e9,4.100990879e9,3.805819398e9,3.883427787e9,3.759697669e9,3.826958891e9,3.806828201e9,3.737459795e9,3.82547766e9,3.875865222e9,3.778686866e9,3.772500863e9,3.695058761e9,3.839603577e9,3.758997268e9,3.78092914e9,3.722981644e9,3.81821317e9,3.755600545e9,3.849403637e9,3.807069344e9,3.731021781e9,3.775985336e9,3.809009579e9,3.844778579e9,3.795816016e9,3.761476812e9,3.856067238e9]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]

View File

@ -15,18 +15,66 @@
{ {
"data": { "data": {
"GPUT": [ "GPUT": [
"BenchmarkGroup", "BenchmarkGroup",
{ {
"data": { "data": {
"nikuradse_1": [ "nikuradse_1": [
"Trial", "Trial",
{ {
"allocs": 1534112879, "allocs": 27539530,
"gctimes": [ "gctimes": [
3.398826747854e12, 7.21326673e8,
2.618070795579e12 7.48889043e8,
8.00904516e8,
7.37378345e8,
7.24851528e8,
7.35546499e8,
6.86027619e8,
7.3845303e8,
7.79203625e8,
7.52721538e8,
7.60364838e8,
7.59372464e8,
7.46489405e8,
8.077102e8,
7.62237779e8,
7.80462131e8,
8.24630083e8,
8.30753044e8,
7.73842108e8,
8.42642472e8,
7.94451496e8,
8.35754001e8,
7.8590998e8,
7.96294466e8,
8.69176891e8,
8.10771728e8,
7.95383527e8,
8.17274343e8,
7.57214285e8,
8.67359312e8,
7.88826755e8,
7.73170589e8,
7.4383235e8,
7.35437044e8,
7.29270175e8,
7.30839033e8,
7.78530806e8,
7.84806598e8,
7.86753701e8,
7.70199148e8,
7.99968565e8,
7.31105205e8,
7.94627452e8,
7.52205262e8,
7.44255972e8,
7.92573816e8,
7.75143609e8,
7.50085445e8,
7.42457424e8,
7.35277689e8
], ],
"memory": 51380857328968, "memory": 23891072456,
"params": [ "params": [
"Parameters", "Parameters",
{ {
@ -42,17 +90,65 @@
} }
], ],
"times": [ "times": [
3.7202049569362e13, 1.9649655533e10,
3.7400159760069e13 1.8655222625e10,
2.044920046e10,
2.0006253124e10,
1.9225532614e10,
1.8425637493e10,
1.8009993618e10,
1.8566547913e10,
2.0298324918e10,
1.9375435774e10,
2.0259600918e10,
1.9689447935e10,
2.0440165546e10,
2.1198185981e10,
2.1529941031e10,
1.9621765309e10,
2.0096583579e10,
1.9353443691e10,
2.2395139743e10,
2.2147177349e10,
2.2065235354e10,
1.9008133225e10,
2.226108083e10,
2.2085219053e10,
2.0505924388e10,
1.951018691e10,
2.1750413636e10,
2.2142496895e10,
2.1011968434e10,
1.9815838525e10,
1.9442578236e10,
1.9848841235e10,
1.8999443547e10,
1.8850250259e10,
1.9418255558e10,
2.0859989717e10,
1.9155040161e10,
1.9639739596e10,
1.939165026e10,
1.9236817418e10,
1.9837660656e10,
1.8577069226e10,
1.9406743348e10,
1.9497777664e10,
1.911300801e10,
1.875399388e10,
2.0604575964e10,
1.8009223946e10,
1.9248258647e10,
1.9877171946e10
] ]
} }
] ]
}, },
"tags": [ "tags": [
"GPUTranspiler" "GPUTranspiler"
] ]
} }
], ],
"GPUI": [ "GPUI": [
"BenchmarkGroup", "BenchmarkGroup",
{ {

View File

@ -1 +1 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[2.7425249e7,7.412701e7,3.6607941e7,7.2060594e7,3.4529434e7,7.6634167e7,3.2012321e7,7.3820784e7,3.3949954e7,7.8478248e7,4.0126379e7,7.8064709e7,3.7594681e7,7.7171913e7,3.2345052e7,7.4243448e7,3.4353198e7,7.6815947e7,3.3275476e7,7.6196381e7,3.5836579e7,7.9893164e7,3.426444e7,7.8096102e7,3.5667171e7,7.8791806e7,3.4285798e7,8.0897821e7,3.6955997e7,7.3759746e7,3.3773137e7,7.328944e7,3.4533305e7,7.4964616e7,3.4649633e7,7.4867313e7,3.6125153e7,7.7465251e7,3.4405076e7,8.0242334e7,3.2479474e7,7.5060436e7,3.272518e7,7.2772772e7,3.5399275e7,7.4715997e7,3.5420495e7,7.68539e7,3.5243677e7,7.4565513e7],"memory":660409808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[3.43605916e9,3.571094971e9,3.581169167e9,3.617386266e9,3.640177061e9,3.666140865e9,3.59312133e9,3.642660429e9,3.649362226e9,3.639535812e9,3.615756984e9,4.084913467e9,3.658895286e9,3.697572649e9,3.639317733e9,3.626500969e9,3.730074621e9,3.834972951e9,3.77581077e9,3.810886128e9,3.821828959e9,3.810445379e9,3.74010373e9,4.100990879e9,3.805819398e9,3.883427787e9,3.759697669e9,3.826958891e9,3.806828201e9,3.737459795e9,3.82547766e9,3.875865222e9,3.778686866e9,3.772500863e9,3.695058761e9,3.839603577e9,3.758997268e9,3.78092914e9,3.722981644e9,3.81821317e9,3.755600545e9,3.849403637e9,3.807069344e9,3.731021781e9,3.775985336e9,3.809009579e9,3.844778579e9,3.795816016e9,3.761476812e9,3.856067238e9]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]] 
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[5.9810994e7,6.4737628e7,6.6129743e7,7.1565506e7,6.7670658e7,5.9069063e7,6.4987093e7,6.9015313e7,6.1821987e7,6.232688e7,6.9091327e7,6.0481241e7,6.7083905e7,7.1568874e7,6.4126918e7,6.656359e7,6.0971912e7,6.6050458e7,6.4490748e7,6.3792946e7,6.387013e7,6.2149826e7,6.5226883e7,5.8717054e7,7.0228008e7,6.4004441e7,6.179879e7,6.5803149e7,6.7778738e7,6.7530531e7,6.3874846e7,6.5191925e7,6.3458451e7,6.2795489e7,6.1214158e7,6.3242098e7,6.0904665e7,6.1067523e7,6.4187211e7,5.9758454e7,6.3188528e7,6.2703208e7,6.3179623e7,6.6383934e7,6.4153586e7,6.1124868e7,6.1729561e7,6.4057874e7,6.2238357e7,6.7185346e7],"memory":1092256904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.7279143479e10,4.7935284564e10,4.7889470924e10,4.8090224769e10,4.9154174752e10,4.8234789351e10,4.8104334398e10,4.8399709732e10,4.7838902153e10,4.7903811218e10,4.8003267026e10,4.7752267096e10,5.0271042e10,5.2827474041e10,5.1150949627e10,5.3890859024e10,5.0334709301e10,4.8277583693e10,4.8826727675e10,4.8699721506e10,4.8785042007e10,4.8277983231e10,4.814134015e10,4.8104634658e10,4.9149760213e10,4.8564998255e10,4.8300117448e10,4.8526373086e10,4.8889779772e10,4.8001705803e10,4.7925610954e10,4.8209726338e10,4.8102811977e10,4.8159213161e10,4.816676277e10,4.8356507356e10,4.8464023297e10,4.8347214632e10,4.8467268775e10,4.8034119608e10,4.88565184e10,4.8690518925e10,4.8235448799e10,4.8346337545e10,4.8627889423e10,4.784124779e10,4.8095176162e10,4.869052391e10,4.7806194068e10,4.8009508502e10]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]

View File

@ -15,18 +15,66 @@
{ {
"data": { "data": {
"GPUT": [ "GPUT": [
"BenchmarkGroup", "BenchmarkGroup",
{ {
"data": { "data": {
"nikuradse_1": [ "nikuradse_1": [
"Trial", "Trial",
{ {
"allocs": 1534112879, "allocs": 27549909,
"gctimes": [ "gctimes": [
3.398826747854e12, 1.796502723e9,
2.618070795579e12 7.72059865e8,
3.94563446e8,
4.27997326e8,
4.06964911e8,
4.08277194e8,
4.02770711e8,
4.11141922e8,
4.07309952e8,
4.12815766e8,
4.13257433e8,
4.11708235e8,
4.06349416e8,
4.14353433e8,
4.05742826e8,
4.09829039e8,
4.02646084e8,
4.01623866e8,
4.11190055e8,
4.11476122e8,
4.07361638e8,
4.07028467e8,
4.11106781e8,
4.26360821e8,
4.07521363e8,
4.07228793e8,
4.09025385e8,
4.21241253e8,
4.1859973e8,
4.2067553e8,
4.00959317e8,
4.16666312e8,
4.10104406e8,
4.18910797e8,
4.05213147e8,
4.16627063e8,
4.1920481e8,
4.54088613e8,
4.39532553e8,
4.13238829e8,
4.14822338e8,
4.11867383e8,
4.15005572e8,
4.11339915e8,
4.1448983e8,
4.17699043e8,
4.16447232e8,
4.1597287e8,
4.14369912e8,
4.19276762e8
], ],
"memory": 51380857328968, "memory": 67507887480,
"params": [ "params": [
"Parameters", "Parameters",
{ {
@ -42,17 +90,65 @@
} }
], ],
"times": [ "times": [
3.7202049569362e13, 3.9931587632e10,
3.7400159760069e13 3.8962332239e10,
2.6658724209e10,
2.769671872e10,
2.6617417291e10,
2.6695278116e10,
2.6389594847e10,
2.6500758348e10,
2.6314618692e10,
2.6869478695e10,
2.6596999781e10,
2.6195296634e10,
2.6321536967e10,
2.676203466e10,
2.6810603797e10,
2.6754603343e10,
2.6616260783e10,
2.7015249577e10,
2.621089281e10,
2.565195064e10,
2.4093609228e10,
2.6872052438e10,
2.6312874968e10,
2.6567674382e10,
2.6188371615e10,
2.6627277961e10,
2.6351801318e10,
2.6764821332e10,
2.658020325e10,
2.6845009549e10,
2.6127450384e10,
2.6523726565e10,
2.6221363227e10,
2.542875719e10,
2.6885440863e10,
2.7207730806e10,
2.770831496e10,
2.7896929881e10,
2.7711770473e10,
2.6842628626e10,
2.4898863927e10,
2.6687932301e10,
2.6503076469e10,
2.655039632e10,
2.708347459e10,
2.5440628322e10,
2.6279933326e10,
2.7371915793e10,
2.6695784917e10,
2.7225562291e10
] ]
} }
] ]
}, },
"tags": [ "tags": [
"GPUTranspiler" "GPUTranspiler"
] ]
} }
], ],
"GPUI": [ "GPUI": [
"BenchmarkGroup", "BenchmarkGroup",
{ {

View File

@ -1 +0,0 @@
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[5.9810994e7,6.4737628e7,6.6129743e7,7.1565506e7,6.7670658e7,5.9069063e7,6.4987093e7,6.9015313e7,6.1821987e7,6.232688e7,6.9091327e7,6.0481241e7,6.7083905e7,7.1568874e7,6.4126918e7,6.656359e7,6.0971912e7,6.6050458e7,6.4490748e7,6.3792946e7,6.387013e7,6.2149826e7,6.5226883e7,5.8717054e7,7.0228008e7,6.4004441e7,6.179879e7,6.5803149e7,6.7778738e7,6.7530531e7,6.3874846e7,6.5191925e7,6.3458451e7,6.2795489e7,6.1214158e7,6.3242098e7,6.0904665e7,6.1067523e7,6.4187211e7,5.9758454e7,6.3188528e7,6.2703208e7,6.3179623e7,6.6383934e7,6.4153586e7,6.1124868e7,6.1729561e7,6.4057874e7,6.2238357e7,6.7185346e7],"memory":1092256904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.7279143479e10,4.7935284564e10,4.7889470924e10,4.8090224769e10,4.9154174752e10,4.8234789351e10,4.8104334398e10,4.8399709732e10,4.7838902153e10,4.7903811218e10,4.8003267026e10,4.7752267096e10,5.0271042e10,5.2827474041e10,5.1150949627e10,5.3890859024e10,5.0334709301e10,4.8277583693e10,4.8826727675e10,4.8699721506e10,4.8785042007e10,4.8277983231e10,4.814134015e10,4.8104634658e10,4.9149760213e10,4.8564998255e10,4.8300117448e10,4.8526373086e10,4.8889779772e10,4.8001705803e10,4.7925610954e10,4.8209726338e10,4.8102811977e10,4.8159213161e10,4.816676277e10,4.8356507356e10,4.8464023297e10,4.8347214632e10,4.8467268775e10,4.8034119608e10,4.88565184e10,4.8690518925e10,4.8235448799e10,4.8346337545e10,4.8627889423e10,4.784124779e10,4.8095176162e10,4.869052391e10,4.7806194068e10,4.8009508502e10]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]

View File

@ -20,7 +20,7 @@ Although the GPU plays a crucial role, work is also carried out on the CPU. The
\subsubsection{System Memory} \subsubsection{System Memory}
In addition to the hardware configuration of the GPU and CPU, system memory (RAM) also plays a crucial role. Although RAM does not directly contribute to the overall performance, it can have a noticeable indirect impact due to its role in caching and general data storage. Insufficient RAM forces the operating system to use the page file, which is stored on a considerably slower SSD. This leads to slower data access, thereby reducing the overall performance of the application. In addition to the hardware configuration of the GPU and CPU, system memory (RAM) also plays a crucial role. Although RAM does not directly contribute to the overall performance, it can have a noticeable indirect impact due to its role in caching and general data storage. Insufficient RAM forces the operating system to use the page file, which is stored on a considerably slower SSD. This leads to slower data access, thereby reducing the overall performance of the application.
As seen in the list below, only 16 GB of RAM were available during the benchmarking process. This amount is insufficient to utilise caching to the extent outlined in Chapter \ref{cha:implementation}. Additional RAM was not available, meaning caching had to be disabled, which will be further explained in Section \ref{sec:results}. As seen in the list below, only 16 GB of RAM were available during the benchmarking process. This amount is insufficient to utilise caching to the extent outlined in Chapter \ref{cha:implementation}. Additional RAM was not available, meaning caching had to be disabled for all benchmarks as further explained in Section \ref{sec:results}.
\subsubsection{Hardware} \subsubsection{Hardware}
With the requirements explained above in mind, the following hardware is used to perform the benchmarks for the CPU-based evaluator, which was used as the baseline, as well as for the GPU-based evaluators: With the requirements explained above in mind, the following hardware is used to perform the benchmarks for the CPU-based evaluator, which was used as the baseline, as well as for the GPU-based evaluators:
@ -45,18 +45,15 @@ Typically, newer versions of these components include, among other things, perfo
\subsection{Performance Evaluation Process} \subsection{Performance Evaluation Process}
With the hardware and software configuration established, the process of benchmarking the implementations can be described. This process is designed to simulate the load and scenario in which these evaluators will be used. The Nikuradse dataset \parencite{nikuradse_laws_1950} has been chosen as the data source. The dataset models the laws of flow in rough pipes and provides $362$ variable sets, each set containing two variables. This dataset has first been used by \textcite{guimera_bayesian_2020} to benchmark a symbolic regression algorithm. With the hardware and software configuration established, the process of benchmarking the implementations can be described. This process is designed to simulate the load and scenario in which these evaluators will be used. The Nikuradse dataset \parencite{nikuradse_laws_1950} has been chosen as the data source. The dataset models the laws of flow in rough pipes and provides $362$ variable sets, each set containing two variables. This dataset has first been used by \textcite{guimera_bayesian_2020} to benchmark a symbolic regression algorithm.
Since only the evaluators are benchmarked, the expressions to be evaluated must already exist. These expressions are generated for the Nikuradse dataset using the exhaustive symbolic regression algorithm proposed by \textcite{bartlett_exhaustive_2024}. This ensures that the expressions are representative of what needs to be evaluated in a real-world application. In total, four benchmarks will be conducted, each having a different goal, which will be further explained in the following paragraphs. Since only the evaluators are benchmarked, the expressions to be evaluated must already exist. These expressions are generated for the Nikuradse dataset using the exhaustive symbolic regression algorithm proposed by \textcite{bartlett_exhaustive_2024}. This ensures that the expressions are representative of what needs to be evaluated in a real-world application. In total, three benchmarks will be conducted, each having a different goal, which will be further explained in the following paragraphs.
The first benchmark involves a very large set of roughly $250\,000$ expressions. This means that all $250\,000$ expressions are evaluated in a single generation when using GP. In a typical generation, significantly fewer expressions would be evaluated. However, this benchmark is designed to show how the evaluators can handle large volumes of data. The first benchmark involves a very large set of roughly $250\,000$ expressions with $362$ variable sets. This means that when using GP all $250\,000$ expressions would be evaluated in a single generation. In a typical generation, significantly fewer expressions would be evaluated. However, this benchmark is designed to show how the evaluators can handle very large volumes of data. Because of memory constraints, it was not possible to conduct an additional benchmark with a higher number of variable sets.
TODO:::: Remove this benchmark, as it just uses too much RAM Both the second and third benchmarks are conducted to demonstrate how the evaluators will perform in more realistic scenarios. For the second benchmark the number of expressions has been reduced to roughly $10\,000$, and the number of variable sets is again $362$. The number of expressions is much more representative of a typical scenario, while the number of variable sets is very low. To determine if the GPU evaluators are also a feasible alternative, this benchmark is conducted nonetheless.
A second benchmark, with slight modifications to the first, is also conducted. Given that GPUs are very good at executing work in parallel, the number of variable sets is increased in this benchmark. Therefore, the second benchmark consists of the same $250\,000$ expressions, but the number of variable sets has been increased by a factor of 30 to a total of roughly $10\,000$. This benchmark aims to demonstrate how the GPU is best used for a larger number of variable sets. A higher number of variable sets is also more representative of the scenarios the evaluators will be employed.
The third benchmark is conducted to demonstrate how the evaluators will perform in more realistic scenarios. For this benchmark the number of expressions has been reduced to roughly $10\,000$, and the number of variable sets is again $362$. The purpose of this benchmark is to demonstrate how the evaluators are likely perform in a typical scenario. Finally, the third benchmark will be conducted. Similar to the second benchmark, this benchmark evaluates the same $10\,000$ expressions but now with 30 times more variable sets, which equates to roughly $10\,000$. This benchmark mimics the scenario where the evaluators will most likely be used. While the others simulate different conditions to determine if and where the GPU evaluators can be used efficiently, this benchmark is more focused on determining if the GPU evaluators are suitable for the specific scenario they would be used in.
Finally, a fourth benchmark will be conducted. Similar to the second and third benchmarks, this benchmark evaluates the same $10\,000$ expressions with the same $10\,000$ variable sets. This benchmark mimics the scenario where the evaluators will most likely be used. While the others simulate different conditions to determine if and where the GPU evaluators can be used efficiently, this benchmark is more focused on determining if the GPU evaluators are suitable for the specific scenario they would be used in. All three benchmarks also simulate a parameter optimisation step, as this is the scenario in which these evaluators will be used. For parameter optimisation, $100$ steps are used, meaning that all expressions will be evaluated $100$ times. During the benchmark, this process is simulated by re-transmitting the parameters instead of generating new ones. Generating new parameters is not part of the evaluators and is therefore not implemented. However, because the parameters are re-transmitted every time, the overhead of sending the data is taken into account. This overhead is part of the evaluators and is an additional burden that the CPU implementation does not have, making it important to be measured.
All four benchmarks also simulate a parameter optimisation step, as this is the scenario in which these evaluators will be used in. For parameter optimisation, $100$ steps are used, meaning that all expressions will be evaluated $100$ times. During the benchmark, this process is simulated by re-transmitting the parameters instead of generating new ones. Generating new parameters is not part of the evaluators and is therefore not implemented. However, because the parameters are re-transmitted every time, the overhead of sending the data is taken into account. This overhead is part of the evaluators and is an additional burden that the CPU implementation does not have, making important to be measured.
\subsubsection{Measuring Performance} \subsubsection{Measuring Performance}
The performance measurements are taken, using the BenchmarkTools.jl\footnote{\url{https://juliaci.github.io/BenchmarkTools.jl/stable/}} package. It is the standard for benchmarking applications in Julia, which makes it an obvious choice for measuring the performance of the evaluators. The performance measurements are taken, using the BenchmarkTools.jl\footnote{\url{https://juliaci.github.io/BenchmarkTools.jl/stable/}} package. It is the standard for benchmarking applications in Julia, which makes it an obvious choice for measuring the performance of the evaluators.
@ -75,7 +72,7 @@ This section presents the results of the benchmarks described above. First the r
In this section, the results for the GPU-based interpreter are presented in detail. Following the benchmark results, the process of tuning the interpreter is described as well as how to adapt the tuning for the different benchmarks. This part not only contains the tuning of the GPU, but also performance improvements done on the CPU side. In this section, the results for the GPU-based interpreter are presented in detail. Following the benchmark results, the process of tuning the interpreter is described as well as how to adapt the tuning for the different benchmarks. This part not only contains the tuning of the GPU, but also performance improvements done on the CPU side.
\subsubsection{Benchmark 1} \subsubsection{Benchmark 1}
The first benchmark consisted of $250\,000$ expressions and $362$ variable sets with $100$ parameter optimisation steps. Because each expression needs to be evaluated with each variable set for each parameter optimisation step, a total of $9.05\,\textit{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all executions is $466.3$ seconds with a standard deviation of $14.2$ seconds. The first benchmark consisted of $250\,000$ expressions and $362$ variable sets with $100$ parameter optimisation steps. Because each expression needs to be evaluated with each variable set for each parameter optimisation step, a total of $9.05\,\textit{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all samples is $466.3$ seconds with a standard deviation of $14.2$ seconds.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png} \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png}
@ -83,13 +80,19 @@ The first benchmark consisted of $250\,000$ expressions and $362$ variable sets
\label{fig:gpu_i_benchmark_1} \label{fig:gpu_i_benchmark_1}
\end{figure} \end{figure}
% talk about kernel configuration (along the lines of: results achieved with block size of X) etc. Also include that CPU and GPU utilisation was 100% the entire time. If this is too short, just add it to the above paragraph and make the 4 benchmark sections relatively short, as the most interesting information is in the performance tuning and comparison sections anyway For the kernel configuration, a block size of $128$ threads has been used. As will be explained below, this has been found to be the configuration that results in the most performance. During the benchmark, the utilisation of both the CPU and GPU was roughly $100\%$.
\subsubsection{Benchmark 2} \subsubsection{Benchmark 2}
TODO: Remove this benchmark, none of the implementations had enough RAM available With $10\,000$ expressions, $362$ variable sets and $100$ parameter optimisation steps, the total number of evaluations per sample was $362\,\textit{million}$. The median across all samples is $21.3$ seconds with a standard deviation of $0.75$ seconds. Compared to benchmark 1, there were $25$ times fewer evaluations which also resulted in a reduction of the median and standard deviation of roughly $25$ times. Since the number of variable sets did not change, the block size for this benchmark remained at $128$ threads. Again the utilisation of the CPU and GPU during the benchmark was roughly $100\%$.
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark2.png}
\caption{The results of the GPU-based interpreter for benchmark 2}
\label{fig:gpu_i_benchmark_2}
\end{figure}
\subsubsection{Benchmark 3} \subsubsection{Benchmark 3}
std of 750.1 ms The third benchmark used the same $10\,000$ expressions and $100$ parameter optimisation steps. However, now there are 30 times more variable sets that need to be used for evaluation. This means that the total number of evaluations per sample is now $10.86\,\textit{billion}$. Compared to benchmark 1, an additional $1.8\,\textit{billion}$ evaluations were performed. However, as seen in Figure \ref{fig:gpu_i_benchmark_3}, the execution time was significantly faster. With a median of $30.3$ seconds and a standard deviation of $0.45$ seconds, this benchmark was only marginally slower than benchmark 2. This also indicates that the GPU evaluators are much more suited for scenarios where there is a high number of variable sets.
\begin{figure} \begin{figure}
\centering \centering
\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark3.png} \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark3.png}
@ -97,7 +100,7 @@ std of 750.1 ms
\label{fig:gpu_i_benchmark_3} \label{fig:gpu_i_benchmark_3}
\end{figure} \end{figure}
\subsubsection{Benchmark 4} Although the number of variable sets has been increased by 30 times, the block size remained at 128 threads. Unlike the previous benchmarks, the hardware utilisation was different. Now only the GPU was utilised to 100\% while the CPU utilisation started at 100\% and slowly dropped to 80\%. The GPU needs to perform 30 times more evaluations, meaning it takes longer for one kernel dispatch to be finished. At the same time, the CPU tries to dispatch the kernel at the same rate as before. Because only a certain amount of kernels can be dispatched at once, the CPU needs to wait for the GPU to finish a kernel before another one can be dispatched again. Therefore, in this scenario, the evaluator runs into a GPU bottleneck, and using a GPU with more performance would consequently improve the runtime. In the previous benchmarks, both the CPU and GPU would need to be upgraded to achieve better performance.
blocksize 128: 84.84 blocks fast (prolly because less wasted threads) blocksize 128: 84.84 blocks fast (prolly because less wasted threads)
blocksize 192: 56.56 blocks very slow blocksize 192: 56.56 blocks very slow
@ -121,14 +124,30 @@ Results only for Transpiler (also contains final kernel configuration and probab
\subsubsection{Benchmark 1} \subsubsection{Benchmark 1}
\subsubsection{Benchmark 2} \subsubsection{Benchmark 2}
TODO: Remove this benchmark kernels can now be compiled at the same time as they are generated (should drastically improve performance)
std: 1.16 seconds
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/gpu-transpiler-final-performance-benchmark2.png}
\caption{The results of the transpiler for benchmark 2}
\label{fig:gpu_t_benchmark_2}
\end{figure}
CPU: 100\%
GPU: very short bursts to 100\% then down to 0\% with a very high frequency (therefore GPU pretty much utilised to 50\% during a sample)
\subsubsection{Benchmark 3} \subsubsection{Benchmark 3}
kernels can now be compiled at the same time as they are generated (should drastically improve performance) Even larger var sets would be perfect. 10k is rather small and the GPU still has barely any work to do
std: 2.64 seconds
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/gpu-transpiler-final-performance-benchmark3.png}
\caption{The results of the transpiler for benchmark 3}
\label{fig:gpu_t_benchmark_3}
\end{figure}
\subsubsection{Benchmark 4} CPU: 100\% during frontend + transpilation + compilation, then goes hovers at 80\% (same reason than interpreter bench 3 most likely)
GPU: During compilation at 20\% -> evaluation: between 50 and 100 but fewer spikes to 100; probably very small kernels, therefore a lot of scheduling on GPU, resulting in less utilisation but too many dispatches so CPU slows down (maybe do another quick performance tuning session to see if different block size can improve this behaviour)
Even larger var sets would be perfect. 10k is rather small and the GPU barely has any work to do
\subsection{Performance Tuning} \subsection{Performance Tuning}
Document the process of performance tuning Document the process of performance tuning
@ -143,15 +162,10 @@ CPU at 100\% GPU at around 30\%. Heavily CPU bottlenecked. Mainly due to PTX com
\subsection{Comparison} \subsection{Comparison}
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
talk about that compute portion is just too little. Only more complex expressions with higher var set count benefit well (make one or two performance evaluations, with 10 larger expressions and at least 1k var sets and present that here as point for that statement) more var sets == better performance for GPU; more expressions == more performance for CPU evaluator
\subsubsection{Benchmark 1} \subsubsection{Benchmark 1}
\subsubsection{Benchmark 2} \subsubsection{Benchmark 2}
TODO: Remove this benchmark
CPU Did not finish due to RAM constraints
\subsubsection{Benchmark 3} \subsubsection{Benchmark 3}
\subsubsection{Benchmark 4}

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.