diff --git a/other/input-explanation.drawio b/other/input-explanation.drawio index a4ff345..b49fda6 100644 --- a/other/input-explanation.drawio +++ b/other/input-explanation.drawio @@ -1,11 +1,11 @@ - + - + - + @@ -59,7 +59,7 @@ - + @@ -95,442 +95,442 @@ - + - - + + - - + + - - + + - - + + - + - - + + - - + + - - + + - - + + - + - - + + - - + + - - + + - + - - + + - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + - - + + - - + + - - + + - - + + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + - - + + - + - + @@ -578,13 +578,13 @@ - + - + - - + + diff --git a/other/interpreter_sequence_diagram.drawio b/other/interpreter_sequence_diagram.drawio index bc61ec8..fb64e95 100644 --- a/other/interpreter_sequence_diagram.drawio +++ b/other/interpreter_sequence_diagram.drawio @@ -1,172 +1,172 @@ - + - + - - + + - - + + - - + + - - + + - - + + - + - - + + - + - - + + - - + + - + - - + + - - + + - + - + - + - + - + - - + + - + - - + + - + - - + + - + - - + + - + - - + + - - + + - + - - + + - - + + - - + + - + - - + + - + - - + + - - + + - - + + - + - - + + - - + + - + - + - + - + - + - - + + - + - - + + - + - - + + - - + + - + - - + + - + diff --git a/other/transpiler_sequence_diagram.drawio b/other/transpiler_sequence_diagram.drawio index f5c9656..dd05836 100644 --- a/other/transpiler_sequence_diagram.drawio +++ b/other/transpiler_sequence_diagram.drawio @@ -1,178 +1,178 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/package/test/benchmarks/3/gpu.json b/package/test/benchmarks/3/gpu.json index 
a550815..3af3f30 100644 --- a/package/test/benchmarks/3/gpu.json +++ b/package/test/benchmarks/3/gpu.json @@ -1,292 +1,292 @@ [ - { - "Julia": "1.11.5", - "BenchmarkTools": { - "major": 1, - "minor": 6, - "patch": 0, - "prerelease": [], - "build": [] - } - }, - [ - [ - "BenchmarkGroup", - { - "data": { - "GPUT": [ - "BenchmarkGroup", - { - "data": { - "nikuradse_1": [ - "Trial", - { - "allocs": 27549794, - "gctimes": [ - 7.34988931e8, - 5.41494997e8, - 4.54013175e8, - 4.35208291e8, - 4.3231789e8, - 4.55546184e8, - 4.23418621e8, - 4.50430938e8, - 4.57438035e8, - 4.40032177e8, - 4.44249114e8, - 4.59505029e8, - 4.68161721e8, - 4.78667113e8, - 4.41616067e8, - 4.5551461e8, - 4.75652448e8, - 4.72338385e8, - 4.47779781e8, - 4.52755333e8, - 4.76158081e8, - 4.48737222e8, - 4.55761564e8, - 4.39574521e8, - 4.86435134e8, - 4.43170348e8, - 4.33731271e8, - 4.61921334e8, - 4.37434039e8, - 4.59409079e8, - 4.36341634e8, - 4.71427401e8, - 4.31984388e8, - 4.59200269e8, - 4.52769327e8, - 4.44261215e8, - 4.61363275e8, - 4.61565013e8, - 4.48557831e8, - 4.85488793e8, - 4.4128917e8, - 4.7205662e8, - 4.55980625e8, - 4.49702326e8, - 4.57778953e8, - 4.52225066e8, - 4.53744762e8, - 4.61079024e8, - 4.47186032e8, - 4.51833021e8 - ], - "memory": 67507887608, - "params": [ - "Parameters", - { - "gctrial": true, - "time_tolerance": 0.05, - "evals_set": false, - "samples": 50, - "evals": 1, - "gcsample": false, - "seconds": 43200.0, - "overhead": 0.0, - "memory_tolerance": 0.01 - } - ], - "times": [ - 2.4717936323e10, - 2.4983074984e10, - 2.3139017877e10, - 2.4848874137e10, - 2.5056845586e10, - 2.547690064e10, - 2.4976535335e10, - 2.5575731567e10, - 2.5140349264e10, - 2.5896177615e10, - 2.5501376819e10, - 2.5327110754e10, - 2.5409913851e10, - 2.6295037648e10, - 2.4355540157e10, - 2.4657706641e10, - 2.5952612569e10, - 2.5854856758e10, - 2.5568112399e10, - 2.5490261014e10, - 2.5160759326e10, - 2.6260268676e10, - 2.5242980231e10, - 2.5638644329e10, - 2.3768975772e10, - 2.5146122285e10, - 
2.5682055949e10, - 2.5237286107e10, - 2.5496022078e10, - 2.5568661702e10, - 2.4330249484e10, - 2.5685686423e10, - 2.5250886166e10, - 2.5401607442e10, - 2.5564544027e10, - 2.5868746223e10, - 2.5977606065e10, - 2.5405825803e10, - 2.4619705069e10, - 2.4325894725e10, - 2.566709978e10, - 2.5400372207e10, - 2.5148598725e10, - 2.5256329818e10, - 2.5236091538e10, - 2.602685786e10, - 2.5430861304e10, - 2.5972127622e10, - 2.3654688411e10, - 2.605084424e10 - ] - } - ] - }, - "tags": [ - "GPUTranspiler" - ] - } - ], - "GPUI": [ - "BenchmarkGroup", - { - "data": { - "nikuradse_1": [ - "Trial", - { - "allocs": 32243751, - "gctimes": [ - 5.41994011e8, - 5.74350603e8, - 4.90525664e8, - 5.92143868e8, - 6.56922572e8, - 6.38722256e8, - 5.51324211e8, - 5.94380581e8, - 5.65880356e8, - 5.30293176e8, - 6.75544373e8, - 5.91556404e8, - 5.30953191e8, - 5.73477234e8, - 5.07802986e8, - 6.71908957e8, - 4.58611495e8, - 5.34383897e8, - 4.35307473e8, - 4.25796027e8, - 4.26650755e8, - 5.43969839e8, - 4.62966279e8, - 5.62772957e8, - 5.61112059e8, - 5.21608844e8, - 4.29687492e8, - 5.3098919e8, - 4.18511386e8, - 5.51285144e8, - 6.36456452e8, - 5.80375968e8, - 4.90520531e8, - 5.72019977e8, - 5.16803925e8, - 5.31636535e8, - 4.88470453e8, - 4.57291468e8, - 4.63585061e8, - 6.75995209e8, - 4.47446015e8, - 4.21505932e8, - 4.63417339e8, - 6.17901021e8, - 5.04952063e8, - 6.3799233e8, - 4.34554313e8, - 6.24205134e8, - 6.1699824e8, - 5.4327705e8 - ], - "memory": 45874268472, - "params": [ - "Parameters", - { - "gctrial": true, - "time_tolerance": 0.05, - "evals_set": false, - "samples": 50, - "evals": 1, - "gcsample": false, - "seconds": 43200.0, - "overhead": 0.0, - "memory_tolerance": 0.01 - } - ], - "times": [ - 3.07178374e10, - 3.0668015775e10, - 3.0731090373e10, - 3.0442775184e10, - 3.0456642482e10, - 3.0082122734e10, - 3.0126331654e10, - 3.0751723908e10, - 3.1179628532e10, - 3.0065663574e10, - 3.0464515622e10, - 3.0393855038e10, - 3.1635622751e10, - 3.0447222014e10, - 2.973601985e10, - 3.0033623194e10, - 
3.0580015719e10, - 3.1400733412e10, - 3.0272328646e10, - 3.0223853837e10, - 2.9915814997e10, - 3.0818324531e10, - 3.0179331592e10, - 3.0293039282e10, - 3.0017377964e10, - 3.0087189496e10, - 3.0582174914e10, - 2.996325235e10, - 3.0134649182e10, - 3.1042223141e10, - 3.0007740363e10, - 3.0437426607e10, - 3.0810836436e10, - 3.1234163757e10, - 3.0221879009e10, - 3.0338940936e10, - 3.1233683944e10, - 3.1019897889e10, - 3.1380379599e10, - 2.9821214171e10, - 3.0882968215e10, - 3.0159994975e10, - 3.0309932542e10, - 2.9969275606e10, - 3.0447151474e10, - 3.0342592912e10, - 3.024330255e10, - 3.0258060029e10, - 3.0095601739e10, - 3.0209601692e10 - ] - } - ] - }, - "tags": [ - "GPUInterpreter" - ] - } - ] - }, - "tags": [] - } - ] - ] + { + "Julia": "1.11.5", + "BenchmarkTools": { + "major": 1, + "minor": 6, + "patch": 0, + "prerelease": [], + "build": [] + } + }, + [ + [ + "BenchmarkGroup", + { + "data": { + "GPUT": [ + "BenchmarkGroup", + { + "data": { + "nikuradse_1": [ + "Trial", + { + "allocs": 27549794, + "gctimes": [ + 7.34988931e8, + 5.41494997e8, + 4.54013175e8, + 4.35208291e8, + 4.3231789e8, + 4.55546184e8, + 4.23418621e8, + 4.50430938e8, + 4.57438035e8, + 4.40032177e8, + 4.44249114e8, + 4.59505029e8, + 4.68161721e8, + 4.78667113e8, + 4.41616067e8, + 4.5551461e8, + 4.75652448e8, + 4.72338385e8, + 4.47779781e8, + 4.52755333e8, + 4.76158081e8, + 4.48737222e8, + 4.55761564e8, + 4.39574521e8, + 4.86435134e8, + 4.43170348e8, + 4.33731271e8, + 4.61921334e8, + 4.37434039e8, + 4.59409079e8, + 4.36341634e8, + 4.71427401e8, + 4.31984388e8, + 4.59200269e8, + 4.52769327e8, + 4.44261215e8, + 4.61363275e8, + 4.61565013e8, + 4.48557831e8, + 4.85488793e8, + 4.4128917e8, + 4.7205662e8, + 4.55980625e8, + 4.49702326e8, + 4.57778953e8, + 4.52225066e8, + 4.53744762e8, + 4.61079024e8, + 4.47186032e8, + 4.51833021e8 + ], + "memory": 67507887608, + "params": [ + "Parameters", + { + "gctrial": true, + "time_tolerance": 0.05, + "evals_set": false, + "samples": 50, + "evals": 1, + "gcsample": 
false, + "seconds": 43200.0, + "overhead": 0.0, + "memory_tolerance": 0.01 + } + ], + "times": [ + 2.4717936323e10, + 2.4983074984e10, + 2.3139017877e10, + 2.4848874137e10, + 2.5056845586e10, + 2.547690064e10, + 2.4976535335e10, + 2.5575731567e10, + 2.5140349264e10, + 2.5896177615e10, + 2.5501376819e10, + 2.5327110754e10, + 2.5409913851e10, + 2.6295037648e10, + 2.4355540157e10, + 2.4657706641e10, + 2.5952612569e10, + 2.5854856758e10, + 2.5568112399e10, + 2.5490261014e10, + 2.5160759326e10, + 2.6260268676e10, + 2.5242980231e10, + 2.5638644329e10, + 2.3768975772e10, + 2.5146122285e10, + 2.5682055949e10, + 2.5237286107e10, + 2.5496022078e10, + 2.5568661702e10, + 2.4330249484e10, + 2.5685686423e10, + 2.5250886166e10, + 2.5401607442e10, + 2.5564544027e10, + 2.5868746223e10, + 2.5977606065e10, + 2.5405825803e10, + 2.4619705069e10, + 2.4325894725e10, + 2.566709978e10, + 2.5400372207e10, + 2.5148598725e10, + 2.5256329818e10, + 2.5236091538e10, + 2.602685786e10, + 2.5430861304e10, + 2.5972127622e10, + 2.3654688411e10, + 2.605084424e10 + ] + } + ] + }, + "tags": [ + "GPUTranspiler" + ] + } + ], + "GPUI": [ + "BenchmarkGroup", + { + "data": { + "nikuradse_1": [ + "Trial", + { + "allocs": 32243751, + "gctimes": [ + 5.41994011e8, + 5.74350603e8, + 4.90525664e8, + 5.92143868e8, + 6.56922572e8, + 6.38722256e8, + 5.51324211e8, + 5.94380581e8, + 5.65880356e8, + 5.30293176e8, + 6.75544373e8, + 5.91556404e8, + 5.30953191e8, + 5.73477234e8, + 5.07802986e8, + 6.71908957e8, + 4.58611495e8, + 5.34383897e8, + 4.35307473e8, + 4.25796027e8, + 4.26650755e8, + 5.43969839e8, + 4.62966279e8, + 5.62772957e8, + 5.61112059e8, + 5.21608844e8, + 4.29687492e8, + 5.3098919e8, + 4.18511386e8, + 5.51285144e8, + 6.36456452e8, + 5.80375968e8, + 4.90520531e8, + 5.72019977e8, + 5.16803925e8, + 5.31636535e8, + 4.88470453e8, + 4.57291468e8, + 4.63585061e8, + 6.75995209e8, + 4.47446015e8, + 4.21505932e8, + 4.63417339e8, + 6.17901021e8, + 5.04952063e8, + 6.3799233e8, + 4.34554313e8, + 6.24205134e8, + 
6.1699824e8, + 5.4327705e8 + ], + "memory": 45874268472, + "params": [ + "Parameters", + { + "gctrial": true, + "time_tolerance": 0.05, + "evals_set": false, + "samples": 50, + "evals": 1, + "gcsample": false, + "seconds": 43200.0, + "overhead": 0.0, + "memory_tolerance": 0.01 + } + ], + "times": [ + 3.07178374e10, + 3.0668015775e10, + 3.0731090373e10, + 3.0442775184e10, + 3.0456642482e10, + 3.0082122734e10, + 3.0126331654e10, + 3.0751723908e10, + 3.1179628532e10, + 3.0065663574e10, + 3.0464515622e10, + 3.0393855038e10, + 3.1635622751e10, + 3.0447222014e10, + 2.973601985e10, + 3.0033623194e10, + 3.0580015719e10, + 3.1400733412e10, + 3.0272328646e10, + 3.0223853837e10, + 2.9915814997e10, + 3.0818324531e10, + 3.0179331592e10, + 3.0293039282e10, + 3.0017377964e10, + 3.0087189496e10, + 3.0582174914e10, + 2.996325235e10, + 3.0134649182e10, + 3.1042223141e10, + 3.0007740363e10, + 3.0437426607e10, + 3.0810836436e10, + 3.1234163757e10, + 3.0221879009e10, + 3.0338940936e10, + 3.1233683944e10, + 3.1019897889e10, + 3.1380379599e10, + 2.9821214171e10, + 3.0882968215e10, + 3.0159994975e10, + 3.0309932542e10, + 2.9969275606e10, + 3.0447151474e10, + 3.0342592912e10, + 3.024330255e10, + 3.0258060029e10, + 3.0095601739e10, + 3.0209601692e10 + ] + } + ] + }, + "tags": [ + "GPUInterpreter" + ] + } + ] + }, + "tags": [] + } + ] + ] ] \ No newline at end of file diff --git a/thesis/chapters/conceptdesign.tex b/thesis/chapters/conceptdesign.tex index 14a8e23..2b9f755 100644 --- a/thesis/chapters/conceptdesign.tex +++ b/thesis/chapters/conceptdesign.tex @@ -9,10 +9,10 @@ The main goal of both prototypes or evaluators is to provide a speed-up compared \begin{itemize} \item Multiple expressions as input. \item All input expressions have the same number of variables ($x_n$), but can have a different number of parameters ($p_n$). 
- 	\item The variables are parametrised using a matrix of the form $k \times N$, where $k$ is the number of variables in the expressions and $N$ is the number of different parametrisations for the variables. This matrix is the same for all expressions. + \item The variables are parametrised using a matrix of the form $k \times N$, where $k$ is the number of variables in the expressions and $N$ is the number of data points. This matrix is the same for all expressions. \item The parameters are parametrised using a vector of vectors. Each vector $v_i$ corresponds to an expression $e_i$. - \item The following operations must be supported: $x + y$, $x - y$, $x * y$, $x / y$, $x ^ y$, $|x|$, $\log(x)$, $e^x$, $1 / x$ and $\sqrt{x}$. Note that $x$ and $y$ can either stand for a constant, a variable, a parameter, or another operation. - \item The results of the evaluations are returned in a matrix of the form $k \times N$. In this case, $k$ is equal to the $N$ of the variable matrix and $N$ is equal to the number of input expressions. + \item The following operations must be supported: $x + y$, $x - y$, $x * y$, $x / y$, $x ^ y$, $|x|$, $\log(x)$, $e^x$, $1 / x$ and $\sqrt{x}$. Note that $x$ and $y$ can either stand for a constant, a variable, a parameter, or another expression. + \item The results of the evaluations are returned in a matrix of the form $N \times N_e$, where $N$ is the number of data points (the $N$ of the variable matrix) and $N_e$ is the number of input expressions. \end{itemize} \begin{figure} @@ -25,19 +25,19 @@ The main goal of both prototypes or evaluators is to provide a speed-up compared With this, the required capabilities are outlined. However, for a better understanding, the input and output data need to be explained further. The first input contains the expressions that need to be evaluated. These can be of any length and can contain constant values, variables and parameters, all of which are linked together with the supported operators. 
In the simplified example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ to $e_6$. -Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example the values in row three are used to parameterise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using each set of variables. In the provided example, there are three variable sets, each containing the values for four variables $x_1$ to $x_4$. +Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example, the values in row three are used to parameterise the variable $x_3$. Each column holds the variable values of one data point. Each expression must be evaluated using each data point. In the provided example, there are three data points, each containing the values for four variables $x_1$ to $x_4$. -After all expressions have been evaluated using all variable sets, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one variable set. The row indicates the variable set and the column indicates the expression. +After all expressions have been evaluated using all data points, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one data point. The row indicates the data point and the column indicates the expression. -The prototypes developed in this thesis, are part of a GP algorithm for symbolic regression. 
This means that the expressions that are evaluated, represent parts of the search space of all expressions being made up of any combination of allowed operators, the set of input variables, a set of parameters and constants. This means that the size of the search space grows exponentially. Exploring this search space by simply generating expressions, evaluating them once and then generating the next set of expressions leaves much of the search space unexplored. To combat this, parameters are introduced. These allow the algorithm to perform some kind of intensification. To enable this, the prototypes must support not only variables, but also parameters. +The prototypes developed in this thesis, are part of a GP algorithm for symbolic regression. This means that the expressions that are evaluated, represent parts of the search space of all expressions being made up of any combination of allowed operators, the set of input variables, a set of parameters and constants. This means that the size of the search space grows exponentially. Exploring this search space by simply generating expressions, evaluating them once and then generating the next set of expressions leaves much of their potential hidden. To assist in finding better fitting expressions, parameters are introduced. This allows the algorithm to fit the expressions to the data. To enable this improved search, the prototypes must support not only variables, but also parameters. The parameters themselves are unique to each expression, meaning they have a one-to-one mapping to an expression. Furthermore, as can be seen in Figure \ref{fig:input_output_explanation}, each expression can have a different number of parameters, or even no parameters at all. However, with no parameters, it wouldn't be possible to perform parameter optimisation. This is in contrast to variables, where each expression must have the same number of variables. 
Because parameters are unique to each expression and can vary in size, they are not structured as a matrix, but as a vector of vectors. An important thing to consider, is the volume and volatility of the data itself. The example shown in Figure \ref{fig:input_output_explanation} has been drastically simplified. It is expected, that there are hundreds of expressions evaluate per GP generation. Each of these expressions may contain between ten and 50 tokens. A token is equivalent to either a variable, a parameter, a constant value or an operator. -It can be assumed that typically the number of variables per expression is around ten. However, the number of variable sets can increase drastically. It can be considered that $1\,000$ variable sets is the lower limit. On the other hand, $100\,000$ can be considered as the upper limit. Considering that one variable takes up 4 bytes of memory and 10 variables are needed per expression, at least $4 * 10 * 1\,000 = 40\,000$ bytes and at most $4 * 10 * 100\,000 = 400\,000$ bytes need to be transferred to the GPU for the variables. +It can be assumed that typically the number of variables per expression is around ten. However, the number of data points can increase drastically. It can be considered that $1\,000$ data points is the lower limit. On the other hand, $100\,000$ can be considered as the upper limit. Considering that one variable takes up 4 bytes of memory and 10 variables are needed per expression, at least $4 * 10 * 1\,000 = 40\,000$ bytes and at most $4 * 10 * 100\,000 = 4\,000\,000$ bytes need to be transferred to the GPU for the variables. -These variables do not change during the runtime of the symbolic regression algorithm. As a result the data only needs to be sent to the GPU once. This means that the impact of this data transfer is minimal. On the other hand, the data for the parameters is much more volatile. 
As explained above, they are used for parameter optimisation and therefore vary from evaluation to evaluation and need to be sent to the GPU very frequently. The amount of data that needs to be sent depends on the number of expressions as well as on the number of parameters per expression. Considering $10\,000$ expressions that need to be evaluated and an average of two parameters per expression each requiring 4 bytes of memory, a total of $10\,000 * 2 * 4 = 80\,000$ bytes need to be transferred to the GPU on each parameter optimisation step. +These variables do not change during the runtime of the symbolic regression algorithm. As a result the data only needs to be sent to the GPU once. This means that the impact of this data transfer is minimal. On the other hand, the data for the parameters is much more volatile. As explained above, they are used for parameter optimisation and therefore vary from evaluation to evaluation and need to be sent to the GPU very frequently. The amount of data that needs to be sent depends on the number of expressions as well as on the number of parameters per expression. Considering $10\,000$ expressions that need to be evaluated and an average of two parameters per expression each requiring 4 bytes of memory, a total of $10\,000 * 2 * 4 = 80\,000$ bytes need to be transferred to the GPU on each parameter optimisation step. This is comparatively low, as the GPU is connected via PCI Express with version six allowing transfer rates of up to $256$ GB per second \parencite{pci-sig_pci_2025}. However, the amount of data is not of concern but rather the number of data transfers to the GPU, as every transfer has some overhead and waiting time associated with it. 
\section{Architecture} \label{sec:architecture} @@ -50,7 +50,7 @@ Based on the requirements and data structure above, the architecture of both pro \label{fig:kernel_architecture} \end{figure} -A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still advisable to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expressions themselves. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression, is the generation itself. 
Especially for smaller variable sets, it is possible, that the time it takes to transpile an expression and compile the kernel into machine code is greater than the time it takes to evaluate it. However, for larger variable sets this should not be a concern. +A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still advisable to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expressions themselves. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression, is the generation itself. 
Especially for a small number of data points, it is possible that the time it takes to transpile an expression and compile the kernel into machine code is greater than the time it takes to evaluate it. However, for a larger number of data points this should not be a concern, especially in parameter optimisation scenarios, where the kernel is re-used on each parameter optimisation step. % % TODO: Probably include a diagram that shows how the evaluators are integrated in the symbolic regression algorithm (assuming its a GP variant), to show the bigger picture @@ -58,7 +58,7 @@ A design decision that has been made for both prototypes is to split the evaluat \subsection{Pre-Processing} \label{sec:pre-processing} -The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator. +The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. 
If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator. The conversion into the intermediate representation transforms the expressions from infix notation into postfix notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix notation, it also contains information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look like depicted in Figure \ref{fig:pre-processing_results} after the pre-processing step. @@ -86,7 +86,7 @@ The interpreter consists of two parts. The CPU side is the part of the program, Once all the necessary data is present on the GPU, the interpreter kernel can be dispatched. As previously mentioned, the kernel is dispatched for each expression to minimise thread divergence. In fact, dispatching the same kernel multiple times for each expression ensures that there will not occur any thread divergence, as will be explained later. -After the GPU has finished evaluating all expressions with all variable sets, the result is stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements. +After the GPU has finished evaluating all expressions with all data points, the result is stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements. 
Evaluating the expressions is relatively straight forward. Because the expressions are in postfix notation, the actual interpreter just needs to iterate over all the tokens and perform the appropriate tasks. If the interpreter encounters a binary operator, it simply needs to read the previous two values and perform the operation specified by the operator. For unary operators, only the previous value needs to be read. As already mentioned, expressions in postfix notation implicitly contain the operator precedence, therefore no look-ahead or other strategies need to be used to ensure correct evaluation. This also means that each token is visited exactly once and no unnecessary or overhead work needs to be done. The Algorithm \ref{alg:eval_interpreter} shows how the interpreter works. Note that this is a simplified version, that only works with additions, multiplications, constants and variables. @@ -124,7 +124,7 @@ Handling constants, variables and parameters is very simple. Constants simply ne When an operator token is encountered, the handling becomes more complex. The value of the token indicates the type of operation to be applied. For binary operators, the top two values on the stack need to be used as input to the operator. For unary operators, only the top value of the stack needs to be used as an input. Once the result has been computed, it must be stored at the top of the stack to be used as an input for the next operation or the result for this expression. -At the end of the algorithm, the stack contains one last entry. This entry is the value computed by the expression with the designated variable set and parameters. In order to send this value back to the CPU, it must be stored in the result matrix. The last statement performs this action. It again has been simplified to omit the index calculation of the expression and variable set needed to store the result at the correct location. +At the end of the algorithm, the stack contains one last entry. 
This entry is the value computed by the expression with the designated data point and parameters. In order to send this value back to the CPU, it must be stored in the result matrix. The last statement performs this action. It again has been simplified to omit the index calculation of the expression and data point needed to store the result at the correct location. The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for each expression that needs to be evaluated, to prevent thread divergence. Thread divergence can only occur on data-dependent branches. In this case, the while loop and every if and else-if statement contains a data-dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators, variables or parameters in the same order, and would therefore cause each thread to take a different path. However, one expression always has the same constants, operators, variables and parameter in the same locations, meaning that all threads will take the same path. This also means that although the interpreter contains many data-dependent branches, these branches only depend on the expression itself. Because of this, all threads will follow the same path and will therefore never diverge from one another. diff --git a/thesis/chapters/conclusion.tex b/thesis/chapters/conclusion.tex index 23715f9..7d5b1ce 100644 --- a/thesis/chapters/conclusion.tex +++ b/thesis/chapters/conclusion.tex @@ -1,12 +1,9 @@ \chapter[Conclusion]{Conclusion and Future Work} \label{cha:conclusion} -% When trying to model a system consisting of some inputs with an observed output, a computer can be used -A typical system consists of a set of inputs with an observed output. 
For example when trying to model the flow in rough pipes as done by \textcite{nikuradse_laws_1950} where the length, the diameter and the roughness of the pipes are the input. In this scenario the flow through the pipe is the output and a mathematical model is needed to describe the correlation between the inputs and outputs. Finding such a model or formula can be done by utilising a computer and symbolic regression. Symbolic regression typically is implemented using genetic programming. During the runtime thousands or even hundreds of thousands of formulas or expressions are generated which need to be evaluated to determine if they describe the observed system with sufficient accuracy. This process can take several hours to days to find a suitable formula on a single machine utilising the CPU only. Therefore, this thesis deals with the question of how the evaluation of the expressions generated at runtime can be sped up to minimise execution times. +Research has been conducted on how to best approach the evaluation of dynamically generated expressions for symbolic regression. The GPU has been chosen to improve the performance as a cheap and powerful tool especially compared to compute clusters. Numerous instances exist where utilising the GPU led to drastic performance improvements in many fields of research. -Research has been conducted on how to best approach this problem statement. The GPU has been chosen to improve the performance as a cheap and powerful tool especially compared to compute clusters. Numerous instances exist were utilising the GPU lead to drastic performance improvements in many fields of research. - -Two GPU evaluators were implemented which should determine if the GPU is more suitable for evaluating expressions generated at runtime as compared to the CPU.
The two implementations are as follows: +Two GPU evaluators were implemented which are used to determine if the GPU is more suitable for evaluating expressions generated at runtime as compared to the CPU. The two implementations are as follows: \begin{description} \item[GPU Interpreter] \mbox{} \\ @@ -15,21 +12,29 @@ Two GPU evaluators were implemented which should determine if the GPU is more su A transpiler that takes the expressions and transpiles them into PTX code. Each expression is represented in its own unique kernel. The kernels are simpler than the one GPU interpreter kernel, but more effort is needed to generate them. \end{description} -In total three benchmarks were conducted to determine if and under which circumstances the GPU is a more suitable choice for evaluating the expressions. The current CPU implementation is the baseline against which the GPU evaluators are evaluated. To answer the research questions the benchmarks are structured as follows: +In total three benchmarks were conducted to determine if and under which circumstances the GPU is a more suitable choice for evaluating the expressions. A CPU-based implementation is the baseline against which the GPU evaluators are evaluated. To answer the research questions the benchmarks are structured as follows: \begin{enumerate} - \item Roughly $250\,000$ expressions with $362$ variable sets have been evaluated. The goal of this benchmark was determining how the evaluators can handle large volumes of expressions. - \item Roughly $10\,000$ expressions with $362$ variable sets have been evaluated. This benchmark should demonstrate how a change in the number of expressions impacts the performance, especially compared with each other. - \item Roughly $10\,000$ expressions and roughly $10\,000$ variable sets have been evaluated. By increasing the number of variable sets a more realistic use-case is modelled with this benchmark. 
Additionally, by using more variable sets the strengths of the GPU should get more exploited. + \item Roughly $250\,000$ expressions with $362$ data points have been evaluated. The goal of this benchmark was determining how the evaluators can handle large volumes of expressions. + \item Roughly $10\,000$ expressions with $362$ data points have been evaluated. This benchmark should demonstrate how a change in the number of expressions impacts the performance, especially compared with each other. + \item Roughly $10\,000$ expressions and roughly $10\,000$ data points have been evaluated. By increasing the number of data points a more realistic use-case is modelled with this benchmark. Additionally, by using more data points the strengths of the GPU should get more exploited. \end{enumerate} -After conducting the first and second benchmarks it was clear, that the CPU is the better choice in these scenarios. The first benchmark in particular demonstrated how the high RAM usage of the GPU transpiler lead to it not finishing this benchmark. Reducing the number of expressions demonstrated that the GPU transpiler can perform better than the GPU interpreter, however, in relation to the CPU implementation, no real change was observed between the first and second benchmark. However, in the third benchmark, both GPU evaluators managed to outperform the CPU, with the GPU transpiler performing the best. +After conducting the first and second benchmarks it was clear, that the CPU is the better choice in these scenarios. The CPU was faster by roughly four times when compared to the GPU interpreter and the GPU transpiler did not finish this benchmark at all. -To address the research questions, this thesis demonstrates that evaluating expressions generated at runtime can be more efficient on the GPU under specific conditions. Utilizing the GPU becomes feasible when dealing with a high number of variable sets, typically in the thousands and above. 
For scenarios with fewer variable sets, the CPU remains the better choice. Additionally, in scenarios where RAM is abundant, the GPU transpiler is the optimal choice. If too little RAM is available and the number of variable sets is sufficiently large, the GPU interpreter should be chosen, as it outperforms both the GPU transpiler and the CPU in such cases. +The first benchmark in particular demonstrated how the high RAM usage of this GPU transpiler implementation led to it not finishing this benchmark. Storing $250\,000$ compiled kernels uses a lot of RAM; however, compiling the PTX kernels just in time before they are executed is not a feasible alternative to reduce RAM usage. Since the PTX kernels need to be compiled into machine code before they can be executed, one alternative would be to use batch processing as a compromise between compiling ahead of time and just in time. Since it is not expected that these evaluators need to evaluate hundreds of thousands of expressions, the non-trivial process of rewriting the implementation to support batch processing has not been done. + +Reducing the number of expressions demonstrated that the GPU transpiler can perform better than the GPU interpreter by roughly ten percent. However, in relation to the CPU implementation, no real change was observed between the first and second benchmark with the CPU being faster by roughly five times. + +In the third benchmark, both GPU evaluators managed to outperform the CPU, with the GPU transpiler performing the best. The GPU interpreter was faster by roughly $1.6$ times and the GPU transpiler was faster by roughly $2$ times compared to the CPU interpreter. Furthermore, the GPU transpiler managed to outperform the GPU interpreter by roughly $1.2$ times. + +To address the research questions, this thesis demonstrates that evaluating expressions generated at runtime can be more efficient on the GPU under specific conditions.
Utilising the GPU becomes feasible when dealing with a high number of data points, typically in the thousands and above. For scenarios with fewer data points, the CPU remains the better choice. Additionally, in scenarios where RAM is abundant, the implementation of the GPU transpiler discussed in this thesis is the optimal choice. If too little RAM is available and the number of data points is sufficiently large, the GPU interpreter should be chosen, as it outperforms both the GPU transpiler and the CPU in such cases. \section{Future Work} -This thesis demonstrated how the GPU can be used to accelerate the evaluation of expressions and therefore the symbolic regression algorithm as a whole. However, the boundaries at which it is more feasible to utilise the GPU are very coarse-grained. Therefore, conducting more research into how the number of expressions and variable sets impact performance is needed. Furthermore, only one dataset with only two variables per variable set was used. Varying the number of variables per set and their impact on performance could also be interesting. The impact of the parameters was omitted from this thesis entirely. Further research on how the number of parameters impact the performance is of interest. Since parameters need to be transferred to the GPU frequently, having too many parameters could impact the GPU more negatively than the CPU. +This thesis demonstrated how the GPU can be used to accelerate the evaluation of expressions and therefore the symbolic regression algorithm as a whole. However, the boundaries at which it is more feasible to utilise the GPU need to be further refined. Therefore, conducting more research into how the number of expressions and data points impact performance is needed. Furthermore, only one dataset with only two variables per data point was used. Varying the number of variables per data point and their impact on performance could also be interesting.
The impact of the parameters was omitted from this thesis entirely. Further research on how the number of parameters impact the performance is of interest. Since parameters need to be transferred to the GPU frequently, having too many parameters could impact the GPU more negatively than the CPU. Alternatively, performing the entire parameter optimisation step on the GPU and not just the evaluation might also result in better performance, as the number of data transfers is drastically reduced. -The current implementation also has flaws that can be improved in future work. Currently, no shared memory is utilised, meaning the threads need to always retrieve the data from global memory. This is a slow operation and efficiently utilising shared memory should further improve the performance of both GPU evaluators. +The current implementation also has flaws that can be improved in future work. Currently, no shared memory is utilised, meaning the threads need to always retrieve the data from global memory. This is a slow operation and efficiently utilising shared memory should further improve the performance of both GPU evaluators. -Additionally, neither of the implementations supports special GPU instructions. Especially the Fused Multiply-Add (FMA) instruction is of interest. Given that multiplying two values and adding a third is a common operation, this special instruction allows these operations to be performed in a single clock cycle. The frontend can be extended to detect and convert sub-expressions of this form into a special ternary opcode, enabling the backend to generate more efficient code. If the effort of detecting these sub-expressions is outweighed by the performance improvement needs to be determined in a future work. +Furthermore, as seen with the GPU transpiler and the first benchmark, reducing RAM usage is of essence for very large problems with hundreds of thousands of expressions or very RAM limited environments. 
Therefore, future work needs to be done to rewrite the transpiler to support batch processing and conduct benchmarks with this new implementation. This will answer the question if batch processing allows the GPU transpiler to outperform the CPU and GPU interpreters in these scenarios. Additionally, it is of interest if the batch processing transpiler manages to achieve the same or better performance in the other scenarios explored in this thesis. + +Lastly, neither of the implementations supports special GPU instructions. Especially the Fused Multiply-Add (FMA) instruction is of interest. Given that multiplying two values and adding a third is a common operation, this special instruction allows these operations to be performed in a single clock cycle. The frontend can be extended to detect and convert sub-expressions of this form into a special ternary opcode, enabling the backend to generate more efficient code. If the effort of detecting these sub-expressions is outweighed by the performance improvement needs to be determined in a future work. diff --git a/thesis/chapters/evaluation.tex b/thesis/chapters/evaluation.tex index a3b2ea9..28dd1ed 100644 --- a/thesis/chapters/evaluation.tex +++ b/thesis/chapters/evaluation.tex @@ -12,7 +12,7 @@ The hardware configuration is the most important aspect of the benchmark environ \subsubsection{GPU} The GPU plays a crucial role, as different microarchitectures typically operate differently and therefore require different performance tuning. Although the evaluators can generally operate on any Nvidia GPU with a compute capability of at least 6.1, they are tuned for the Ampere microarchitecture which has a compute capability of 8.6. Despite the evaluators being tuned for this microarchitecture, more recent microarchitectures can be used as well. However, additional tuning is required to ensure that the evaluators can utilise the hardware to its fullest potential. -Tuning must also be done on a per-problem basis. 
In particular, the number of variable sets impact how well the hardware is utilised. Therefore, it is crucial to determine which configuration yields the best performance. Section \ref{sec:results} outlines steps to tune the configuration for a specific problem. +Tuning must also be done on a per-problem basis. In particular, the number of data points impact how well the hardware is utilised. Therefore, it is crucial to determine which configuration yields the best performance. Section \ref{sec:results} outlines steps to tune the configuration for a specific problem. \subsubsection{CPU} Although the GPU plays a crucial role, work is also carried out on the CPU. The interpreter primarily utilises the CPU for the frontend and data transfer, making it more GPU-bound as most of the work is performed on the GPU. However, the transpiler additionally relies on the CPU to perform the transpilation step. This step involves generating a kernel for each expression and sending these kernels to the driver for compilation, a process also handled by the CPU. By contrast, the interpreter only required one kernel which needs to be converted into PTX and compiled by the driver only once. Consequently, the transpiler is significantly more CPU-bound and variations in the CPU used have a much greater impact. Therefore, using a more powerful CPU benefits the transpiler more than the interpreter. @@ -43,15 +43,15 @@ Typically, newer versions of these components include, among other things, perfo \subsection{Performance Evaluation Process} -Now that the hardware and software configurations have been established, the benchmarking process can be defined. This process is designed to simulate the load and scenario in which these evaluators will be used. The Nikuradse dataset \parencite{nikuradse_laws_1950} has been chosen as the data source. The dataset models the laws of flow in rough pipes and provides $362$ variable sets, each set containing two variables. 
This dataset has first been used by \textcite{guimera_bayesian_2020} to benchmark a symbolic regression algorithm. +Now that the hardware and software configurations have been established, the benchmarking process can be defined. This process is designed to simulate the load and scenario in which these evaluators will be used. The Nikuradse dataset \parencite{nikuradse_laws_1950} has been chosen as the data source. The dataset models the laws of flow in rough pipes and provides $362$ data points, each data point containing two variables. This dataset was first used by \textcite{guimera_bayesian_2020} to benchmark a symbolic regression algorithm. Since only the evaluators are benchmarked, the expressions to be evaluated must already exist. These expressions are generated for the Nikuradse dataset using the exhaustive symbolic regression algorithm proposed by \textcite{bartlett_exhaustive_2024}. This ensures that the expressions are representative of what needs to be evaluated in a real-world application. In total, three benchmarks will be conducted, each having a different goal, which will be further explained in the following paragraphs. -The first benchmark involves a very large set of roughly $250\,000$ expressions with $362$ variable sets. This means that when using GP all $250\,000$ expressions would be evaluated in a single generation. In a typical generation, significantly fewer expressions would be evaluated. However, this benchmark is designed to show how the evaluators can handle very large volumes of data. Because of memory constraints, it was not possible to conduct an additional benchmark with a higher number of variable sets. +The first benchmark involves a very large set of roughly $250\,000$ expressions with $362$ data points. This means that when using GP all $250\,000$ expressions would be evaluated in a single generation. In a typical generation, significantly fewer expressions would be evaluated.
However, this benchmark is designed to show how the evaluators can handle very large volumes of data. Because of memory constraints, it was not possible to conduct an additional benchmark with a higher number of data points. -Both the second and third benchmarks are conducted to demonstrate how the evaluators will perform in more realistic scenarios. For the second benchmark the number of expressions has been reduced to roughly $10\,000$, and the number of variable sets is again $362$. The number of expressions is much more representative to a typical scenario, while the number of variable sets is still low. To determine if the GPU evaluators are a feasible alternative in scenarios with a realistic number of expressions but comparably few variable sets, this benchmark is conducted nonetheless. +Both the second and third benchmarks are conducted to demonstrate how the evaluators will perform in more realistic scenarios. For the second benchmark the number of expressions has been reduced to roughly $10\,000$, and the number of data points is again $362$. The number of expressions is much more representative to a typical scenario, while the number of data points is still low. To determine if the GPU evaluators are a feasible alternative in scenarios with a realistic number of expressions but comparably few data points, this benchmark is conducted nonetheless. -Finally, a third benchmark will be conducted. Similar to the second benchmark, this benchmark evaluates the same roughly $10\,000$ expressions but now with $30$ times more variable sets, which equates to roughly $10\,000$. This benchmark mimics the scenario where the evaluators will most likely be used. While the others simulate different conditions to determine if and where the GPU evaluators can be used efficiently, this benchmark is more focused on determining if the GPU evaluators are suitable for the specific scenario they are likely going to be used in. +Finally, a third benchmark will be conducted. 
Similar to the second benchmark, this benchmark evaluates the same roughly $10\,000$ expressions but now with $30$ times more data points, which equates to roughly $10\,000$. This benchmark mimics the scenario where the evaluators will most likely be used. While the others simulate different conditions to determine if and where the GPU evaluators can be used efficiently, this benchmark is more focused on determining if the GPU evaluators are suitable for the specific scenario they are likely going to be used in. All three benchmarks also simulate a parameter optimisation step, as this is the intended use-case for these evaluators. For parameter optimisation, $100$ steps are used, meaning that all expressions are evaluated $100$ times. During the benchmark, this process is simulated by re-transmitting the parameters instead of generating new ones. Generating new parameters is not part of the evaluators and is therefore not implemented. However, because the parameters are re-transmitted each time, the overhead of sending the data is taken into account. This overhead is part of the evaluators and represents an additional burden that the CPU implementation does not have, making it important to be measured. @@ -62,13 +62,13 @@ It offers extensive support for measuring and comparing results of different imp \section{Results} \label{sec:results} -This section presents the results of the benchmarks described above. First the results for the GPU-based interpreter will be presented alongside the performance tuning process. This is followed by the results of the transpiler as well as the performance tuning process. Finally, both GPU-based evaluators will be compared with each other to determine which of them performs the best. Additionally, these evaluators will be compared against the CPU-based interpreter to answer the research questions of this thesis. +This section presents the results of the benchmarks described above. 
First the results for the GPU-based interpreter and GPU transpiler alongside the performance tuning process will be presented in isolation. Finally, both GPU-based evaluators will be compared with each other to determine which of them performs the best. Additionally, these evaluators will be compared against the CPU-based interpreter to answer the research questions of this thesis. \subsection{Interpreter} In this section, the results for the GPU-based interpreter are presented in detail. Following the benchmark results, the process of tuning the interpreter is described as well as how to adapt the tuning for the different benchmarks. This part not only contains the tuning of the GPU, but also performance improvements done on the CPU side. \subsubsection{Benchmark 1} -The first benchmark consists of $250\,000$ expressions and $362$ variable sets with $100$ parameter optimisation steps. Because each expression needs to be evaluated with each variable set for each parameter optimisation step, a total of $250\,000 * 362 * 100 \approx 9.05\,\textit{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all samples is $466.3$ seconds with a standard deviation of $14.2$ seconds. +The first benchmark consists of $250\,000$ expressions and $362$ data points with $100$ parameter optimisation steps. Because each expression needs to be evaluated with each data point for each parameter optimisation step, a total of $250\,000 * 362 * 100 \approx 9.05\,\text{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all samples is $466.3$ seconds with a standard deviation of $14.2$ seconds. 
\begin{figure} \centering \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png} @@ -79,7 +79,7 @@ The first benchmark consists of $250\,000$ expressions and $362$ variable sets w For the kernel configuration, a block size of $128$ threads has been used. As will be explained below, this has been found to be the configuration that results in the most performance. During the benchmark, the utilisation of both the CPU and GPU was roughly $100\%$. \subsubsection{Benchmark 2} -With $10\,000$ expressions, $362$ variable sets and $100$ parameter optimisation steps, the total number of evaluations per sample was $362\,\textit{million}$. The median across all samples is $21.3$ seconds with a standard deviation of $0.75$ seconds. Compared to the first benchmark, there were $25$ times fewer evaluations which also resulted in a reduction of the median and standard deviation of roughly $25$ times. This indicates a roughly linear correlation between the number of expressions and the runtime. Since the number of variable sets did not change, the block size for this benchmark remained at $128$ threads. Again the utilisation of the CPU and GPU during the benchmark was roughly $100\%$. +With $10\,000$ expressions, $362$ data points and $100$ parameter optimisation steps, the total number of evaluations per sample was $362\,\text{million}$. The median across all samples is $21.3$ seconds with a standard deviation of $0.75$ seconds. Compared to the first benchmark, there were $25$ times fewer evaluations which also resulted in a reduction of the median and standard deviation of roughly $25$ times. This indicates a roughly linear correlation between the number of expressions and the runtime. Since the number of data points did not change, the block size for this benchmark remained at $128$ threads. Again the utilisation of the CPU and GPU during the benchmark was roughly $100\%$. 
\begin{figure} \centering \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark2.png} @@ -88,7 +88,7 @@ With $10\,000$ expressions, $362$ variable sets and $100$ parameter optimisation \end{figure} \subsubsection{Benchmark 3} -The third benchmark used the same $10\,000$ expressions and $100$ parameter optimisation steps. However, now there are $30$ times more variable sets that need to be used for evaluation. This means, that the total number of evaluations per sample is now $10.86\,\textit{billion}$. Compared to the first benchmark, an additional $1.8\,\textit{billion}$ evaluations were performed. However, as seen in Figure \ref{fig:gpu_i_benchmark_3}, the execution time was significantly faster. With a median of $30.3$ seconds and a standard deviation of $0.45$ seconds, this benchmark was only marginally slower than the second benchmark. This also indicates, that the GPU evaluators are much more suited for scenarios, where there is a high number of variable sets. +The third benchmark used the same $10\,000$ expressions and $100$ parameter optimisation steps. However, now there are $30$ times more data points that need to be used for evaluation. This means, that the total number of evaluations per sample is now $10.86\,\text{billion}$. Compared to the first benchmark, an additional $1.8\,\text{billion}$ evaluations were performed. However, as seen in Figure \ref{fig:gpu_i_benchmark_3}, the execution time was significantly faster. With a median of $30.3$ seconds and a standard deviation of $0.45$ seconds, this benchmark was only marginally slower than the second benchmark. This also indicates, that the GPU evaluators are much more suited for scenarios, where there is a high number of data points. 
\begin{figure} \centering \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark3.png} @@ -96,7 +96,7 @@ The third benchmark used the same $10\,000$ expressions and $100$ parameter opti \label{fig:gpu_i_benchmark_3} \end{figure} -Although the number of variable sets has been increased by $30$ times, the block size remained at $128$ threads. Unlike the previous benchmarks, the hardware utilisation was different. Now only the GPU was utilised to 100\% while the CPU utilisation started at 100\% and slowly dropped to 80\%. The GPU needs to perform $30$ times more evaluations per expression, meaning it takes longer for one kernel dispatch to be finished. At the same time, the CPU tries to dispatch the kernel at the same rate as before. Because only a certain number of kernels can be dispatched at once, the CPU needs to wait for the GPU to finish a kernel before another one can be dispatched. Therefore, in this scenario, the evaluator runs into a GPU-bottleneck and using a more performant GPU would consequently improve the runtime. In the previous benchmarks, both the CPU and GPU would need to be upgraded, to achieve better performance. +Although the number of data points has been increased by $30$ times, the block size remained at $128$ threads. Unlike the previous benchmarks, the hardware utilisation was different. Now only the GPU was utilised to 100\% while the CPU utilisation started at 100\% and slowly dropped to 80\%. The GPU needs to perform $30$ times more evaluations per expression, meaning it takes longer for one kernel dispatch to be finished. At the same time, the CPU tries to dispatch the kernel at the same rate as before. Because only a certain number of kernels can be dispatched at once, the CPU needs to wait for the GPU to finish a kernel before another one can be dispatched. Therefore, in this scenario, the evaluator runs into a GPU-bottleneck and using a more performant GPU would consequently improve the runtime. 
In the previous benchmarks, both the CPU and GPU would need to be upgraded, to achieve better performance. \subsection{Performance Tuning Interpreter} @@ -105,15 +105,15 @@ Optimising and tuning the interpreter is crucial to achieve good performance. Es With this implementation, the initial performance measurements have been conducted for the first benchmark which served as the baseline for further performance optimisations. However, as already mentioned, during this benchmark, memory limitations where encountered, as too much RAM was being used. Therefore, the caching had to be disabled. Because the evaluator is multithreaded, this change resulted in significantly better performance. As the cache introduced critical sections where race conditions could occur, locking mechanisms were required. While locking ensures that no race conditions occur, it also means that parts of an otherwise entirely parallel implementation are now serialised, reducing the effect of parallelisation. -Without a cache and utilising all 12 threads, the frontend achieved very good performance. Processing $250\,000$ expressions takes roughly $88.5$ milliseconds. On the other hand, using a cache, resulted in the frontend running for $6.9$ \textit{seconds}. This equates to a speed-up of roughly 78 times when using no cache. Additionally, when looking at the benchmark results above, the time it takes to execute the frontend is negligible, meaning further optimising the frontend would not significantly improve the overall runtime. +Without a cache and utilising all 12 threads, the frontend achieved very good performance. Processing $250\,000$ expressions takes roughly $88.5$ milliseconds. On the other hand, using a cache resulted in the frontend running for $6.9$ \textit{seconds}. This equates to a speed-up of roughly 78 times when using no cache.
Additionally, when looking at the benchmark results above, the time it takes to execute the frontend is negligible, meaning further optimising the frontend would not significantly improve the overall runtime. -During the tuning process $362$ variable sets have been used, which is the number of variable sets used by benchmark one and two. Before conducting benchmark three, additional performance tuning has been performed to ensure that this benchmark also utilises the hardware as much as possible. +During the tuning process $362$ data points have been used, which is the number of data points used by benchmark one and two. Before conducting benchmark three, additional performance tuning has been performed to ensure that this benchmark also utilises the hardware as much as possible. \subsubsection{Optimisation 1} After caching has been disabled, the first performance improvement was to drastically reduce the number of calls to the frontend and the number of data transfers to the GPU. Because the expressions and variables never change during the parameter optimisation process, processing the expression and transmitting the data to the GPU on each step wastes resources. Therefore, the expressions are sent to the frontend once before the parameter optimisation process. Afterwards, the processed expressions as well as the variables are transferred to the GPU exactly once for this execution of the interpreter. -Figure \ref{fig:gpu_i_optimisation_1} shows how this optimisation improved the overall performance as demonstrated with benchmark one. However, it can also be seen that the range the individual samples fall within is much greater now. While in all cases, this optimisation improved the performance, in some cases the difference between the initial and the optimised version is very low with roughly a two-second improvement. +Figure \ref{fig:gpu_i_optimisation_1} shows how this optimisation improved the overall performance as demonstrated with benchmark one. 
However, it can also be seen that the range the individual samples fall within is much greater now. While in all cases, this optimisation improved the performance, in some cases the difference between the initial and the optimised version is very low with roughly a two-second improvement. On median the performance improvement was roughly five percent. \begin{figure} \centering @@ -128,7 +128,7 @@ The second optimisation was concerned with tuning the kernel configuration. Usin Since the evaluator is designed to execute many kernel dispatches in parallel, it was important to reduce the kernel runtime. Reducing the runtime per kernel has a knock-on effect, as the following kernel dispatches can begin execution sooner reducing the overall runtime. -After the evaluator tuning has been concluded, it was found that a block size of $128$ yielded the best results. With this kernel configuration, another performance measurement has been conducted with the results shown in Figure \ref{fig:gpu_i_optimisation_2} using benchmark one. As can be seen, the overall runtime again was noticeably faster. However, the standard deviation also drastically increased, with the duration from the fastest to the slowest sample differing by roughly 60 seconds. +After the evaluator tuning has been concluded, it was found that a block size of $128$ yielded the best results. With this kernel configuration, another performance measurement has been conducted with the results shown in Figure \ref{fig:gpu_i_optimisation_2} using benchmark one. As can be seen, the overall runtime again was noticeably faster, albeit with an improvement of roughly six percent. However, the standard deviation also drastically increased, with the duration from the fastest to the slowest sample differing by roughly 60 seconds. \begin{figure} \centering @@ -139,7 +139,7 @@ After the evaluator tuning has been concluded, it was found that a block size of The found block size of $128$ might seem strange. 
However, it makes sense, as in total at least $362$ threads need to be started to evaluate one expression. If one block contains $128$ threads a total of $362 / 128 \approx 3$ blocks need to be started, totalling $384$ threads. As a result, only $384 - 362 = 22$ threads are excess threads. When choosing a block size of $121$ three blocks could be started, totalling one excess thread. However, there is no performance difference between a block size of $121$ and $128$. Since all threads are executed inside a warp, which consists of exactly $32$ threads, a block size that is not divisible by $32$ has no benefit and only hides the true amount of excess threads started. -Benchmark three had a total of $10\,860$ variable sets, meaning at least this number of threads must be started. To ensure optimal hardware utilisation, the evaluator had to undergo another tuning process. As seen above, it is beneficial to start as little excess threads as possible. By utilising NSight Compute, a performance measurement with a block size of $128$ was used as the initial configuration. This already performed well as again very little excess threads are started. In total $10\,860 / 128 \approx 84.84$ blocks are needed, which must be round up to $85$ blocks with the last block being filled by roughly $84\%$ which equates to $20$ excess threads being started. +Benchmark three had a total of $10\,860$ data points, meaning at least this number of threads must be started. To ensure optimal hardware utilisation, the evaluator had to undergo another tuning process. As seen above, it is beneficial to start as little excess threads as possible. By utilising NSight Compute, a performance measurement with a block size of $128$ was used as the initial configuration. This already performed well as again very little excess threads are started. 
In total $10\,860 / 128 \approx 84.84$ blocks are needed, which must be rounded up to $85$ blocks with the last block being filled by roughly $84\%$ which equates to $20$ excess threads being started. This was repeated for two more configurations. Once for a block size of $160$ and once for $192$. With a block size of $160$, the total number of blocks was reduced to $68$, which again resulted in $20$ excess threads being started. The hypothesis behind increasing the block size was that using fewer blocks would result in better utilisation and therefore better performance. The same idea was also behind choosing a block size of $192$. However, while this only required $57$ blocks, the number of excess threads increased to $84$. @@ -159,7 +159,7 @@ The first optimisation was to reduce the stack size of the interpreter from 25 t During the parameter optimisation step a lot of memory operations were performed. These are required as for each step new memory on the GPU must be allocated for both the parameters and the meta information. The documentation of CUDA.jl\footnote{\url{https://cuda.juliagpu.org/stable/usage/memory/\#Avoiding-GC-pressure}} mentioned that this can lead to higher garbage-collector (GC) pressure, increasing the time spent garbage-collecting. To reduce this, CUDA.jl provides the \verb|CUDA.unsafe_free!(::CuArray)| function. This frees the memory on the GPU without requiring to run the Julia GC and therefore spending less resources on garbage-collecting and more on evaluating the expressions. -With these two changes the overall runtime has been improved as can be seen in Figure \ref{fig:gpu_i_optimisation_3}. Moreover, the standard deviation was also reduced which was the main goal of this optimisation. +With these two changes the overall runtime has been improved by two percent as can be seen in Figure \ref{fig:gpu_i_optimisation_3}. Moreover, the standard deviation was also reduced which was the main goal of this optimisation. 
\begin{figure} \centering @@ -194,7 +194,7 @@ During the benchmark it was observed that the CPU maintained a utilisation of 10 \subsubsection{Benchmark 3} -This benchmark increased the amount of variable sets by $30$ times and therefore also increases the total number of evaluations by $30$ times. As observed in the second benchmark, the GPU was underutilised and thus had more resources available for evaluating the expressions. As shown in Figure \ref{fig:gpu_t_benchmark_3} the available resources were better utilised. Although the number of evaluations increased by a factor of $30$, the median execution time only increased by approximately six seconds, or $1.3$ times, from $19.6$ to $25.4$. The standard deviation also decreased from $1.16$ seconds to $0.65$ seconds. +This benchmark increased the amount of data points by $30$ times and therefore also increases the total number of evaluations by $30$ times. As observed in the second benchmark, the GPU was underutilised and thus had more resources available for evaluating the expressions. As shown in Figure \ref{fig:gpu_t_benchmark_3} the available resources were better utilised. Although the number of evaluations increased by a factor of $30$, the median execution time only increased by approximately six seconds, or $1.3$ times, from $19.6$ to $25.4$. The standard deviation also decreased from $1.16$ seconds to $0.65$ seconds. \begin{figure} \centering \includegraphics[width=.9\textwidth]{results/gpu-transpiler-final-performance-benchmark3.png} @@ -202,7 +202,7 @@ This benchmark increased the amount of variable sets by $30$ times and therefore \label{fig:gpu_t_benchmark_3} \end{figure} -Given the change in the number of variable sets, additional performance tests with different block sizes were conducted. During this process it was found, that changing the block size from $128$ to $160$ threads resulted in the best performance. 
This is in contrast to the GPU interpreter where changing the block size to $160$ resulted in degraded performance. +Given the change in the number of data points, additional performance tests with different block sizes were conducted. During this process it was found, that changing the block size from $128$ to $160$ threads resulted in the best performance. This is in contrast to the GPU interpreter where changing the block size to $160$ resulted in degraded performance. While conducting this benchmark, the CPU utilisation began at 100\% during the frontend step as well as the transpilation and compilation steps. However, similar to the third benchmark of the GPU interpreter, the CPU utilisation dropped to 80\% during the evaluation phase. This is very likely due to the same reason that the kernels are dispatched too quickly in succession, filling up the number of allowed resident grids on the GPU. @@ -218,7 +218,7 @@ As already mentioned in Section \ref{sec:tuning_interpreter}, using a cache in c Caching has also been used for the transpilation step. The reason for this was to reduce the runtime during the parameter optimisation step. While this reduced the overhead of transpilation, the overhead of searching the cache if the expression has already been transpiled still existed. Because of the already mentioned RAM constraints this cache has been disabled and a better solution has been implemented in the first and second optimisation steps. -Most data of the tuning process has been gathered with the number of expressions and variable sets of the first benchmark, as this was the worst performing scenario. Therefore, it would show best where potential for performance improvements was. Before any optimisations were applied a single sample of the first benchmark took roughly 15 hours. However, it needs to be noted that only two samples were taken due to the duration of one sample. 
+Most data of the tuning process has been gathered with the number of expressions and data points of the first benchmark, as this was the worst performing scenario. Therefore, it would show best where potential for performance improvements was. Before any optimisations were applied a single sample of the first benchmark took roughly 15 hours. However, it needs to be noted that only two samples were taken due to the duration of one sample. \subsubsection{Optimisation 1} % 1.) Done before parameter optimisation loop: Frontend, transmitting Variables (improved runtime) @@ -230,15 +230,15 @@ With this optimisation step the number of calls to the transpiler and compiler h It also must be noted, that compiling the PTX kernels and storing the result before the parameter optimisation step led to an out-of-memory error for the first benchmark. In order to get any results, this step had to be reverted for this benchmark. If much more RAM were available, the runtime would have been significantly better. -These optimisations lead to a runtime of one sample of roughly ten hours for the first benchmark. Therefore, a substantial improvement of roughly four hours per sample was achieved. When $10\,000$ expressions are transpiled it takes on average $0.05$ seconds over ten samples. Comparing this to the time spent compiling the resulting $10\,000$ kernels it takes on average $3.2$ seconds over ten samples. This suggests that performing the compilation before the parameter optimisation step would yield drastically better results in the first benchmark. +Nonetheless, these optimisations led to a runtime of one sample of roughly ten hours for the first benchmark. Therefore, a substantial improvement of roughly four hours or 40\% per sample was achieved. When $10\,000$ expressions are transpiled it takes on average $0.05$ seconds over ten samples. Comparing this to the time spent compiling the resulting $10\,000$ kernels it takes on average $3.2$ seconds over ten samples. 
This suggests that performing the compilation before the parameter optimisation step would yield drastically better results in the first benchmark. \subsubsection{Optimisation 3} % 3.) benchmark3 std noticeably improved with blocksize 160 (around 70\% better) (also includes call to unsafe_free) % here I can show chart of comparing the two blocksizes % unsafe_free in benchmark one reduced std. but could also be run to run variance. at least no negative effects -The third optimisation step was more focused on improving the performance for the third benchmark as it has a higher number of variable sets than the first and second one. However, as with the interpreter, the function \verb|CUDA.unsafe_free!(::CuArray)| has been used to reduce the standard deviation for all benchmarks. +The third optimisation step was more focused on improving the performance for the third benchmark as it has a higher number of data points than the first and second one. However, as with the interpreter, the function \verb|CUDA.unsafe_free!(::CuArray)| has been used to reduce the standard deviation for all benchmarks. -Since the number of variable sets has changed in the third benchmark, it is important to re-do the performance tuning. This was done by measuring the kernel performance using NSight Compute. As with the interpreter, block sizes of $128$ and $160$ threads have been compared with each other. A block size of $192$ threads has been omitted here since the number of excess threads is very high. In the case of the interpreter the performance of this configuration was the worst out of the three configurations, and it was assumed it will be similar in this scenario. +Since the number of data points has changed in the third benchmark, it is important to re-do the performance tuning. This was done by measuring the kernel performance using NSight Compute. As with the interpreter, block sizes of $128$ and $160$ threads have been compared with each other. 
A block size of $192$ threads has been omitted here since the number of excess threads is very high. In the case of the interpreter the performance of this configuration was the worst out of the three configurations, and it was assumed it will be similar in this scenario. However, since the number of excess threads for $128$ and $160$ threads per block is the same, the latter using fewer blocks might lead to performance improvements in the case of the transpiler. As seen in Figure \ref{fig:gpu_t_128_160} this assumption was true and using a block size of $160$ threads resulted in better performance for the third benchmark. This is in contrast to the interpreter, where this configuration performed much more poorly. \begin{figure} @@ -259,11 +259,11 @@ The goal of the first benchmark was to determine how the evaluators are able to \begin{figure} \centering \includegraphics[width=.9\textwidth]{results/cpu_gpui_gput_bench1.png} - \caption{The results of the comparison of all three implementations for the first benchmark. Note that the transpiler is absent because it did not finish this benchmark.} + \caption{The results of the comparison of the CPU and GPU based interpreter for the first benchmark. Note that the transpiler is absent because it did not finish this benchmark.} \label{fig:cpu_gpui_gput_benchmark_1} \end{figure} -Figure \ref{fig:cpu_gpui_gput_benchmark_1} shows the results of the first benchmark for the CPU and GPU interpreter. It can be seen that the GPU interpreter takes roughly four times as long on median than the CPU interpreter. Additionally, the standard deviation is much larger on the GPU interpreter. This shows that the CPU heavily benefits from scenarios where a lot of expressions need to be evaluated with very few variable sets. Therefore, it is not advisable to use the GPU to increase the performance in such scenarios. +Figure \ref{fig:cpu_gpui_gput_benchmark_1} shows the results of the first benchmark for the CPU and GPU interpreter. 
It can be seen that the GPU interpreter takes roughly four times as long on median as the CPU interpreter. Additionally, the standard deviation is much larger on the GPU interpreter. This shows that the CPU heavily benefits from scenarios where a lot of expressions need to be evaluated with very few data points. Therefore, it is not advisable to use the GPU to increase the performance in such scenarios. \subsubsection{Benchmark 2} The first benchmark has shown that with a large number of expressions the GPU is not a suitable alternative to the CPU. To further prove this statement, a second benchmark with much fewer expressions was conducted. Now instead of $250\,000$ expressions, only $10\,000$ are evaluated. This reduction also meant that the transpiler can now be included in the comparison as it does not face any RAM limitations any more. @@ -280,7 +280,7 @@ Reducing the number of expressions did not benefit the GPU evaluators at all in On the other side, it can also be seen that the GPU transpiler tends to perform better than the GPU interpreter. While in the worst case both implementations are roughly equal, the GPU transpiler on median performs better. Additionally, the GPU transpiler can also outperform the GPU interpreter in the best case. \subsubsection{Benchmark 3} -As found by the previous two benchmarks, varying the number of expressions only has a slight impact on the performance of the GPU in relation to the performance of the CPU. However, instead of varying the number of expressions, the number of variable sets can also be changed. For this benchmark, instead of $362$ variable sets, a total of $10\,860$ variable sets were used, which translates to an increase by $30$ times. It needs to be noted, that it was only possible to evaluate the performance with roughly $10\,000$ expressions with this number of variable sets. 
When using the same roughly $250\,000$ expressions of the first benchmark and the increase number of variable sets, none of the implementations managed to complete the benchmark, as there was too little RAM available. +As found by the previous two benchmarks, varying the number of expressions only has a slight impact on the performance of the GPU in relation to the performance of the CPU. However, instead of varying the number of expressions, the number of data points can also be changed. For this benchmark, instead of $362$ data points, a total of $10\,860$ data points were used, which translates to an increase in the number of data points by a factor of $30$. It needs to be noted, that it was only possible to evaluate the performance with roughly $10\,000$ expressions with this number of data points. When using the same roughly $250\,000$ expressions of the first benchmark and the increased number of data points, none of the implementations managed to complete the benchmark, as there was too little RAM available. \begin{figure} \centering @@ -289,9 +289,9 @@ As found by the previous two benchmarks, varying the number of expressions only \label{fig:cpu_gpui_gput_benchmark_3} \end{figure} -Increasing the number of variable sets greatly benefited both GPU evaluators as seen in Figure \ref{fig:cpu_gpui_gput_benchmark_3}. With this change, the CPU interpreter noticeably fell behind the GPU evaluators. Compared to the GPU transpiler, the CPU interpreter took roughly twice as long on median. The GPU transpiler continued its trend of performing better than the GPU interpreter. Furthermore, the standard deviation of all three evaluators is also very similar. +Increasing the number of data points greatly benefited both GPU evaluators as seen in Figure \ref{fig:cpu_gpui_gput_benchmark_3}. With this change, the CPU interpreter noticeably fell behind the GPU evaluators. Compared to the GPU transpiler, the CPU interpreter took roughly twice as long on median. 
The GPU transpiler continued its trend of performing better than the GPU interpreter. Furthermore, the standard deviation of all three evaluators is also very similar. -From this benchmark it can be concluded that the GPU heavily benefits from a larger number of variable sets. If the number of variable sets is increased even further, the difference in performance between the GPU and CPU should be even more pronounced. +From this benchmark it can be concluded that the GPU heavily benefits from a larger number of data points. If the number of data points is increased even further, the difference in performance between the GPU and CPU should be even more pronounced. -While the GPU is very limited in terms of concurrent kernel dispatches that can be evaluated, the number of threads and blocks can virtually be infinitely large. This means that a higher degree of parallelism is achievable with a higher number of variable sets. Increasing the number of expressions on the other hand does not influence the degree of parallelism to this extent. This is the reason no performance benefit was found by only decreasing the number of expressions with the same number of variable sets. +While the GPU is very limited in terms of concurrent kernel dispatches that can be evaluated, the number of threads and blocks can virtually be infinitely large. This means that a higher degree of parallelism is achievable with a higher number of data points. Increasing the number of expressions on the other hand does not influence the degree of parallelism to this extent. This is the reason no performance benefit was found by only decreasing the number of expressions with the same number of data points. 
diff --git a/thesis/chapters/implementation.tex b/thesis/chapters/implementation.tex index 79bc80c..01fe51b 100644 --- a/thesis/chapters/implementation.tex +++ b/thesis/chapters/implementation.tex @@ -75,7 +75,7 @@ It should be noted however, that Julia stores the tree as a list of arrays to al \label{fig:expr-ast} \end{figure} -\subsubsection{Parsing} +\subsubsection{Conversion into the Intermediate Representation} To convert the AST of an expression into the intermediate representation, a top-down traversal of the tree is required. The steps for this are as follows: \begin{enumerate} @@ -88,13 +88,13 @@ To convert the AST of an expression into the intermediate representation, a top- \item Return the generated postfix expression/intermediate representation. \end{enumerate} -The validation of the expression is performed throughout the parsing process. Validating that only correct operators are used is performed in step 1. To be able to convert the operator to its corresponding opcode, it must be validated that an opcode exists for it, and therefore whether it is valid or not. Similarly, converting the tokens into an expression element object ensures that only variables and parameters in the correct format are present in the expression. This is handled in step 2. +The validation of the expression is performed throughout the conversion process. Validating that only correct operators are used is performed in step 1. To be able to convert the operator to its corresponding opcode, it must be validated that an opcode exists for it, and therefore whether it is valid or not. Similarly, converting the tokens into an expression element object ensures that only variables and parameters in the correct format are present in the expression. This is handled in step 2. As explained above, a node of a binary operator can have $n$ children. In these cases, additional handling is required to ensure correct conversion. This handling is summarised in step 4. 
Essentially, the operator must be added after the first two elements, and for each subsequent element, the operator must also be added. The expression $1+2+3+4$ is converted to the AST $+\,1\,2\,3\,4$ and without step 4 the postfix expression would be $1\,2\,3\,4\,+$. If the operator is added after the first two elements and then after each subsequent element, the correct postfix expression $1\,2\,+\,3\,+\,4\,+$ will be generated. -Each subtree of the AST is its own separate AST, which can be converted to postfix notation in the same way the whole AST can be converted. This means that the algorithm only needs to be able to handle leave nodes, and when it encounters a subtree, it recursively calls itself to parse the remaining AST. Step 5 indicates this recursive behaviour. +Each subtree of the AST is its own separate AST, which can be converted to postfix notation in the same way the whole AST can be converted. This means that the algorithm only needs to be able to handle leaf nodes, and when it encounters a subtree, it recursively calls itself to convert the remaining AST. Step 5 indicates this recursive behaviour. -While the same expression usually occurs only once, sub-expressions can occur multiple times. In the example in Figure \ref{fig:expr-ast}, the whole expression $1 + x_1 \, \log(p_1)$ is unlikely to be generated more than once by the symbolic regression algorithm. However, the sub-expression $\log(p_1)$ is much more likely to be generated multiple times. This means that the generation of the intermediate representation for this subtree only needs to be done once and can be reused later. Therefore, a cache can be used to store the intermediate representation for this sub-expression and access it again later to eliminate the parsing overhead. +While the same expression usually occurs only once, sub-expressions can occur multiple times. 
In the example in Figure \ref{fig:expr-ast}, the whole expression $1 + x_1 \, \log(p_1)$ is unlikely to be generated more than once by the symbolic regression algorithm. However, the sub-expression $\log(p_1)$ is much more likely to be generated multiple times. This means that the generation of the intermediate representation for this subtree only needs to be done once and can be reused later. Therefore, a cache can be used to store the intermediate representation for this sub-expression and access it again later to eliminate the conversion overhead. \section{Interpreter} The implementation of the interpreter is divided into two main components, the CPU-based control logic and the GPU-based interpreter as outlined in the Concept and Design chapter. This section aims to describe the technical details of these components. First the CPU-based control logic will be discussed. This component handles the communication with the GPU and is the entry point which is called by the symbolic regression algorithm. Following this, the GPU-based interpreter will be explored, highlighting the specifics of developing an interpreter on the GPU. @@ -142,7 +142,7 @@ Similar to the parameters, the expressions are also stored as a vector of vector Once the conversion into matrix form has been performed, the expressions are transferred to the GPU. Just like with the variables, the expressions remain the same over the course of the parameter optimisation part. Which is the reason they are transferred to the GPU before the interpreter is called, reducing the number of unnecessary data transfers. -Only raw data can be sent to the GPU, which means that meta information about the data layout is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to inform the kernel about the dimensions of the expressions, variables and parameters. 
Otherwise, the kernel does not know at which memory location the second variable set is stored for example, as it does not know how large a single set is. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data begins. However, the GPU has no knowledge of this and therefore the meta information must be transferred separately to ensure that the data is accessed correctly. +Only raw data can be sent to the GPU, which means that meta information about the data layout is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to inform the kernel about the dimensions of the expressions, variables and parameters. Otherwise, the kernel does not know at which memory location the second data point is stored for example, as it does not know how large a single set is. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data begins. However, the GPU has no knowledge of this and therefore the meta information must be transferred separately to ensure that the data is accessed correctly. \begin{figure} \centering @@ -155,7 +155,7 @@ In addition to the already described data that needs to be sent, one more step i \subsubsection{Kernel Dispatch} -Once all the data is present on the GPU, the CPU can dispatch the kernel for each expression. This dispatch requires parameters that specify the number of threads and their organisation into thread blocks. In total, one thread is required for each variable set and therefore the grouping into thread blocks is the primary variable. 
Taking into account the constraints explained in Section \ref{sec:occupancy}, this grouping needs to be tuned for optimal performance. The specific values alongside the methodology for determining these values will be explained in Chapter \ref{cha:evaluation}. +Once all the data is present on the GPU, the CPU can dispatch the kernel for each expression. This dispatch requires parameters that specify the number of threads and their organisation into thread blocks. In total, one thread is required for each data point and therefore the grouping into thread blocks is the primary variable. Taking into account the constraints explained in Section \ref{sec:occupancy}, this grouping needs to be tuned for optimal performance. The specific values alongside the methodology for determining these values will be explained in Chapter \ref{cha:evaluation}. In addition, the dispatch parameters also include the pointers to the location of the data allocated and transferred above, as well as the index of the expression to be interpreted. Since all expressions and parameters are sent to the GPU at once, this index ensures that the kernel knows where in memory to find the expression it needs to interpret and which parameter set it needs to use. After the kernel has finished, the result matrix needs to be read from the GPU and passed back to the symbolic regression algorithm. @@ -163,9 +163,9 @@ Crucially, dispatching a kernel is an asynchronous operation, which means that t \subsection{GPU Side} \label{sec:interpreter-gpu-side} -With the GPU's global memory containing all the necessary data and the kernel being dispatched, the interpretation process can begin. Before interpreting an expression, the global thread ID must be calculated. This step is crucial because each variable set is assigned to a unique thread. Therefore, the global thread ID determines which variable set should be used for the current interpretation instance. 
+With the GPU's global memory containing all the necessary data and the kernel being dispatched, the interpretation process can begin. Before interpreting an expression, the global thread ID must be calculated. This step is crucial because each data point is assigned to a unique thread. Therefore, the global thread ID determines which data point should be used for the current interpretation instance. -Moreover, the global thread ID ensures that excess threads do not perform any work. As otherwise these threads would try to access a variable set that does not exist and therefore would lead to an illegal memory access. This is necessary because the number of required threads often does not align perfectly with the number of threads per block multiplied by the number of blocks. If for example $1031$ threads are required, then at least two thread blocks are needed, as one thread block can hold at most $1024$ threads. Because $1031$ is a prime number, it can not be divided by any practical number of thread blocks. If two thread blocks are allocated, each holding $1024$ threads, a total of $2048$ threads is started. Therefore, the excess $2048 - 1031 = 1017$ threads must be prevented from executing. By using the global thread ID and the number of available variable sets, these excess threads can be easily identified and terminated early in the kernel execution. +Moreover, the global thread ID ensures that excess threads do not perform any work. As otherwise these threads would try to access a data point that does not exist and therefore would lead to an illegal memory access. This is necessary because the number of required threads often does not align perfectly with the number of threads per block multiplied by the number of blocks. If for example $1031$ threads are required, then at least two thread blocks are needed, as one thread block can hold at most $1024$ threads. Because $1031$ is a prime number, it can not be divided by any practical number of thread blocks. 
If two thread blocks are allocated, each holding $1024$ threads, a total of $2048$ threads is started. Therefore, the excess $2048 - 1031 = 1017$ threads must be prevented from executing. By using the global thread ID and the number of available data points, these excess threads can be easily identified and terminated early in the kernel execution. Afterwards the stack for the interpretation can be created. It is possible to dynamically allocate memory on the GPU, which enables a similar programming model as on the CPU. \textcite{winter_are_2021} have compared many dynamic memory managers and found, that the performance impact of them is rather small. However, if it is easily possible to use static allocations, it still offers better performance. In the case of this thesis, it is easily possible which is the reason why the stack has been chosen to have a static size. Because it is known that expressions do not exceed 50 tokens, including the operators, the stack size has been set to ten, which should be more than enough to hold the values and partial results, even in the worst case. It is very unlikely that ten values must be stored before a binary operator is encountered that reduces the number of values on the stack. Therefore, a stack size of ten should be sufficient, however it is possible to increase the stack size if needed. @@ -185,7 +185,7 @@ Evaluating the expression is happening if the current token is an operator. The Support for ternary operators could also be easily added. An example of a ternary operator that would help improve performance would be the GPU supported Fused Multiply-Add (FMA) operator. While this operator does not exist in Julia, the frontend can generate it when it encounters a sub-expression of the form $x * y + z$. Since this expression performs the multiplication and addition in a single clock cycle instead of two, it would be a feasible optimisation. 
However, detecting such sub-expressions is complicated, which is why it is not supported in the current implementation. -Once the interpreter loop has finished, the result of the evaluation must be stored in the result matrix. By using the index of the current expression, as well as the index of the current variable set (the global thread ID) it is possible to calculate the index where the result must be stored. The last value on the stack is the result, which is stored in the result matrix at the calculated location. +Once the interpreter loop has finished, the result of the evaluation must be stored in the result matrix. By using the index of the current expression, as well as the index of the current data point (the global thread ID) it is possible to calculate the index where the result must be stored. The last value on the stack is the result, which is stored in the result matrix at the calculated location. \section{Transpiler} Unlike the interpreter, the transpiler primarily operates on the CPU, with only a minor GPU-based component. This is because the transpiler must generate entire PTX kernels from Julia expressions, rather than simply executing a pre-written kernel like the interpreter. Similar to the interpreter, the CPU side of the transpiler manages communication with both the GPU and the symbolic regression algorithm. This section provides a detailed overview of the transpiler's functionality. @@ -303,7 +303,7 @@ End: ret; \end{PTXCode} -It needs to be noted, that the register \verb|%r2| is not needed. Since the transpiler already knows the number of variable sets, it would be wasteful to transmit this information to the kernel. Instead, the transpiler inserts the number directly as a constant to save resources. +It needs to be noted, that the register \verb|%r2| is not needed. Since the transpiler already knows the number of data points, it would be wasteful to transmit this information to the kernel.
Instead, the transpiler inserts the number directly as a constant to save resources. \subsubsection{Main Loop} The main loop of the transpiler, which generates the kernel for evaluating a single expression, is analogous to the interpreter's main loop. Since the transpiler uses the same intermediate representation as the interpreter, both loops behave similarly. The transpiler loop also uses a stack to store the values and intermediate results. However, the transpiler does not require the special opcode \textit{stop} which was necessary in the interpreter to handle expressions padded to fit into a matrix. The transpiler only needs to process a single expression, which is stored in an unpadded vector of known length. This means that all tokens within the vector are valid and therefore do not require this opcode. @@ -375,12 +375,12 @@ On the GPU, the transpiled kernels are executed. Given that these kernels are re Note that Program \ref{code:ptx_kernel} has been slightly simplified to omit the mandatory directives and the register allocation. From line five to line ten, the addresses stored in the parameters are converted from parameter state space into global state space so that they reference the correct portion of the GPU's memory. It needs to be noted, that this kernel uses 64-bit addresses, which is the reason why some 64-bit instructions are used throughout the kernel. However, the evaluation of the expression itself is performed entirely using the faster 32-bit instructions. -Lines 12 through 17 are responsible for calculating the global thread ID and ensuring that excessive threads are terminated early. Note that in line 16, if the global thread ID stored in register \verb|%r3| is greater than one, it must terminate early. This is because only one variable set needs to be evaluated in this example. +Lines 12 through 17 are responsible for calculating the global thread ID and ensuring that excessive threads are terminated early. 
Note that in line 16, if the global thread ID stored in register \verb|%r3| is greater than one, it must terminate early. This is because only one data point needs to be evaluated in this example. The PTX code from line 22 to line 28 is the actual evaluation of the expression, with line 28 performing the calculation $x_1 + p_1$. All other lines are responsible for loading the values from global memory. The instructions in lines 22, 23, 25 and 26 are responsible for calculating the offset in bytes to the memory location where the value is stored with respect to the location of the first element. -The constants $4$ and $0$ are introduced for performance reasons. The number $4$ is the size of a variable set in bytes. Since one variable set in this case stores only a single FP32 value, each variable set has a size of four bytes. Similarly, the number $0$ represents the index of the value within the variable set. More precisely, this is the offset in bytes from the index to the variable set, which is zero for the first element, four for the second, and so on. These two constants are calculated during the transpilation process to minimise the amount of data to be transferred to the GPU. +The constants $4$ and $0$ are introduced for performance reasons. The number $4$ is the size of a data point in bytes. Since one data point in this case stores only a single FP32 value, each data point has a size of four bytes. Similarly, the number $0$ represents the index of the value within the data point. More precisely, this is the offset in bytes from the index to the data point, which is zero for the first element, four for the second, and so on. These two constants are calculated during the transpilation process to minimise the amount of data to be transferred to the GPU. -Storing the result in the result matrix is performed from line 31 to 33. The location where the value is to be stored is calculated in lines 31 and 32. 
Line 31 calculates the index inside the result matrix according to the current variable set stored in register \verb|%rd3|. The constant $0$ is the product of the index of the expression being evaluated and the number of variable sets, and represents the column of the result matrix. Converting this index into bytes and adding it as an offset to the first element of the result matrix gives the correct memory location to store the result at. +Storing the result in the result matrix is performed from line 31 to 33. The location where the value is to be stored is calculated in lines 31 and 32. Line 31 calculates the index inside the result matrix according to the current data point stored in register \verb|%rd3|. The constant $0$ is the product of the index of the expression being evaluated and the number of data points, and represents the column of the result matrix. Converting this index into bytes and adding it as an offset to the first element of the result matrix gives the correct memory location to store the result at. This kernel consists mostly of overhead code, as only lines 22 through 33 contribute to calculating the result of the expression with the designated variable and parameter set. However, for larger expressions, the percentage of overhead code shrinks drastically. \ No newline at end of file diff --git a/thesis/chapters/relwork.tex b/thesis/chapters/relwork.tex index b6d107c..244fe10 100644 --- a/thesis/chapters/relwork.tex +++ b/thesis/chapters/relwork.tex @@ -21,14 +21,16 @@ An implementation for an equation learner in the physics domain is proposed by \ % A survey conducted by \textcite{dabhi_survey_2012} shows how overfitting is not desirable and why more generalisable solutions are preferred. - -To generate an equation, first the operators need to be defined that make up the equation. It is also possible to define a maximum length for an expression as proposed by \textcite{koza_genetic_1994}. 
Expressions also consist of constants as well as variables which represent the inputs. Assuming that a given problem has two variables and one parameter, the equation learner could generate an expression as seen in Equation \ref{eq:example} where $x_n$ are the variables, $p_1$ is the parameter and $O$ is the output which should correspond to the observed output for the given variables. +\subsection{Genetic Programming} +To generate equations, first the operators which are allowed to be used during generation need to be defined. It is also possible to define a maximum length for an expression as proposed by \textcite{koza_genetic_1994}. Expressions also consist of variables which represent the inputs as well as constants. Assuming that a given problem has two variables and one parameter, GP could generate an expression as seen in Equation \ref{eq:example} where $x_n$ are the variables, $p_1$ is the parameter and $O$ is the output which should correspond to the observed output for the given variables. \begin{equation} \label{eq:example} O = 5 - \text{abs}(x_1) + x_2 \, \sqrt{p_1} / 10 \end{equation} -A typical equation learner generates multiple expressions at once. If for example the equation learner generates $300$ expressions per GP generation, each of these expressions needs to be evaluated at least once to determine how well they can produce the desired output. Each expression lies in a different part of the search space and with only the variables, it would not easily be possible to explore the surrounding search space. To perform for example local search in this area, the parameter $p_1$ can be used. This local search phase helps to find the local or even global optimum. For example $50$ local search steps can be used, meaning that each expression needs to be evaluated $50$ times with the same variables, but different parameters. As a result, one GP generation consequently requires a total $300 * 50 = 15\,000$ evaluations of the expressions. 
However, typically more than one GP generation is needed to find a good local optimum. While the exact number of generations is problem specific, for this example a total of $100$ generations can be assumed. Each generation again generates $300$ expressions and needs to perform $50$ local search steps. This results in a total of $300 * 50 * 100 = 1\,500\,000$ evaluations which need to be performed during the entire runtime of the GP algorithm. These values have been taken from the equation learner for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their equation learner converged after 54 generations, resulting in $300 * 50 * 54 \approx 800\,000$ evaluations. Depending on the complexity of the generated expressions, performing all of these evaluations takes up a lot of the runtime. Their results took over two days to compute on an eight core desktop CPU. While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems were in the range of a few seconds and up to a day. Especially the problems that took several hours to days to finish show, that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine, if using GPUs can yield noticeable better performance. +A typical GP generation generates multiple expressions at once. If for example a single generation consists of $300$ solution candidates or expressions, each of these expressions needs to be evaluated at least once to determine how well they can produce the desired output. + +Each expression is part of a search space of all possible expressions consisting of the defined operators, variables and constants up to a defined maximum length. With the help of GP, this search space is explored, however, the generated expressions might not perfectly fit the data. 
To further refine the generated expressions, the concept of parameter optimisation can be used as described by \textcite{kommenda_local_2018}. Parameter optimisation is a kind of local search where parameters $p$ are introduced in the generated equations. In Equation \ref{eq:example} the parameter $p_1$ will be modified over a number of iterations. This modification should assist in finding a local or even the global optimum by better fitting the expressions to the data. For example $50$ local search steps can be used, meaning that each expression needs to be evaluated $50$ times with the same variables, but different parameters. As a result, one GP generation consequently requires a total of $300 * 50 = 15\,000$ evaluations of the expressions. However, typically more than one GP generation is needed to find a good solution. While the exact number of generations is problem specific, for this example a total of $100$ generations can be assumed. Each generation again generates $300$ expressions and needs to perform $50$ local search steps. This results in a total of $300 * 50 * 100 = 1\,500\,000$ evaluations which need to be performed during the entire runtime of the GP algorithm. These values have been taken from the GP algorithm for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their GP algorithm converged after $54$ generations, resulting in $300 * 50 * 54 \approx 800\,000$ evaluations. This calculation omits the number of data points, which is the main contributor towards the total runtime. Each generated expression needs to be evaluated once for every data point, as each data point is used for parametrising the variables, which drastically increases the number of evaluations. They used a total of $11\,000$ data points, resulting in a total of $800\,000 * 11\,000 = 8.8 \text{ billion}$ evaluations. Their results took over two days to compute on an eight core desktop CPU.
While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems were in the range of a few seconds and up to a day. Especially the problems that took several hours to days to finish show that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine if using GPUs can yield noticeably better performance. \section[GPGPU]{General Purpose Computation on Graphics Processing Units} \label{sec:gpgpu} @@ -40,16 +42,10 @@ If not specified otherwise, the following section and its subsections use the in Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022} which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. -Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation.
- -Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. - -Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. - -However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, so it is important to consider if it is worth using GPUs for specific tasks. +Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, so it is important to consider if it is worth using GPUs for specific tasks. \subsection{Programming GPUs} -The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. To demonstrate how a modern CPU works \textcite{knuth_mmix_1999} introduced the MMIX architecture. It is a 64-bit CPU architecture containing many concepts and design decisions to compete with other CPUs on the market at that time. He provides the information in great detail and demonstrates the complexity of CPU architectures. 
Current CPUs are even more complex, and often contain features like sophisticated branch prediction among other things to achieve higher and higher performance. This makes a CPU perfect for handling complex control flows on a single program thread and even multiple threads simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a single CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU designates much more transistors towards floating-point computations. This, however, results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}. +The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. Current CPUs are complex, and often contain features such as sophisticated branch prediction among other things to achieve higher and higher performance. This makes a CPU perfect for handling complex control flows on a single program thread and even multiple threads simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. 
For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a single CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU designates much more transistors towards floating-point computations. This, however, results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}. \begin{figure} \centering @@ -62,7 +58,7 @@ Despite these drawbacks, the sheer number of cores, makes a GPU a valid choice w \subsubsection{Thread Hierarchy and Tuning} \label{sec:thread_hierarchy} -The thousands of cores on a GPU, as well as the threads created by the developer, are grouped together in several categories. This is the so-called thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm. +The thousands of cores on a GPU, as well as the threads created by the developer, are grouped together in several categories. This is the so-called thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. To develop a well performing algorithm, it is necessary to know how this grouping works. 
Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm. At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a hardware unit responsible for scheduling and executing threads and also contains the registers used by these threads. An SM is always executing a group of 32 threads simultaneously, and this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped in a block, with one block typically containing a maximum of $1024$ threads, although it is often configured to contain fewer. Therefore, if more than $1024$ threads are required, more blocks must be created. Blocks can also be grouped into thread block clusters which is optional, but can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called a kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that the blocks can be scheduled independently, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block, and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they share memory. @@ -79,7 +75,7 @@ Once a kernel is dispatched, all threads start at the same point in a program.
H \begin{figure} \centering - \includegraphics[width=.8\textwidth]{thread_divergence.png} + \includegraphics[width=.4\textwidth]{thread_divergence.png} \caption{Thread T2 wants to execute instruction B while T1 and T3 want to execute instruction A. Therefore T2 will be an inactive thread this cycle and active once T1 and T3 are finished. This means that now the divergent threads are serialised.} \label{fig:thread_divergence} \end{figure} @@ -87,7 +83,9 @@ Once a kernel is dispatched, all threads start at the same point in a program. H Modern GPUs implement what is known as the Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can design fast, correct and accurate programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads after data-dependent divergence has occurred. SIMT can also help with increasing the occupancy of the GPU. Occupancy and its importance to performance is discussed in detail in Section \ref{sec:occupancy}. -A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. 
On data-dependent divergence it is possible for threads to end early, leaving a warp with only partial active threads. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation is proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers. +A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. 
While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only partial active threads. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. + +Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation is proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers. \subsubsection{Memory Model} \label{sec:memory_model} @@ -200,7 +198,7 @@ Compilers are a necessary tool for many developers. If a developer wants to run \begin{figure} \centering \includegraphics[width=.9\textwidth]{compiler_architecture.png} - \caption{A simplified overview of how the architecture of a compiler looks, using Flex and Bison.} + \caption{A simplified overview of the architecture of a compiler.} \label{fig:compiler_layout} \end{figure} @@ -208,8 +206,8 @@ Compilers are a necessary tool for many developers. 
If a developer wants to run \subsection{Interpreters} % What are interpreters; how they work; should mostly contain/reference gpu interpreters -Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but also Java, or more precise Java-Bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters can not only be used for interpreting programming languages. It is also possible for them to be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful are rule-based simulations. \textcite{koster_massively_2020} has shown how they implemented a GPU interpreter for such simulations. In addition with other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. 
While publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown, interpreted languages often trail behind in terms of performance compared to compiled languages, interpreters per se are not slow. And while they come with performance overhead as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks. +Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but Java, or more precisely Java-Bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters can not only be used for interpreting programming languages. It is also possible for them to be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind of interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems.
Another field where interpreters can be useful is rule-based simulations. \textcite{koster_massively_2020} has shown how they implemented a GPU interpreter for such simulations. Together with other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. While publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown that interpreted languages often trail behind in terms of performance compared to compiled languages, interpreters per se are not slow. And while they come with performance overhead as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks. \subsection{Transpilers} % talk about what transpilers are and how to implement them. If possible also gpu specific transpilation. -With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another and therefore the major difference between these use-cases is the backend. A popular transpiler example is the TypeScript transpiler, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples for transpilers are the C2Rust transpiler \parencite{ling_rust_2022} that transpiles C code into Rust code as well as the PyJL transpiler \parencite{marcelino_transpiling_2022} which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example for a transpiler has been developed by \textcite{intel_mcs86_1978} where they built a transpiler for transforming assembly code for their 8080 CPU to assembly code for their 8086 CPU.
Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. There also exists a transpiler that transforms CUDA code into highly parallel CPU code. \textcite{moses_high-performance_2023} described this transpiler, and they found that the generated code performs noticeably better than doing this transformation by hand. When designing complex processors and accelerators, Register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. The compared their results with a CPU implementation running on 80 CPUs, where they found that the transpiled CUDA version was 40 times faster. Using transpilers for software backend and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in a multi-programming-language environment that share some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use. +With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another and therefore the major difference between these use-cases is the backend. A popular transpiler example is the TypeScript transpiler, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. 
Other examples for transpilers are the C2Rust transpiler \parencite{ling_rust_2022} that transpiles C code into Rust code as well as the PyJL transpiler \parencite{marcelino_transpiling_2022} which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example for a transpiler has been developed by \textcite{intel_mcs86_1978} where they built a transpiler for transforming assembly code for their 8080 CPU to assembly code for their 8086 CPU. Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. \textcite{moses_high-performance_2023} describe a transpiler that can transform CUDA code into highly parallel CPU code, and they found that the generated code performs noticeably better than doing this transformation by hand. When designing complex processors and accelerators, register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. They compared their results with a CPU implementation running on 80 CPUs, where they found that the transpiled CUDA version was 40 times faster. Using transpilers for software backend and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in a multi-programming-language environment that shares some business logic.
In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use. diff --git a/thesis/front/abstract.tex b/thesis/front/abstract.tex index e71d7f0..9139066 100644 --- a/thesis/front/abstract.tex +++ b/thesis/front/abstract.tex @@ -1,11 +1,11 @@ \chapter{Abstract} -The objective of symbolic regression is to identify an expression that accurately models a system based on a set of inputs. For instance, one might determine the flow through pipes using inputs such as roughness, diameter, and length by conducting experiments with varying input configurations and observing the resulting flow and derive an expression from the experiments. This methodology, exemplified by \textcite{nikuradse_laws_1950}, can be applied to any system through symbolic regression. To find the best-fitting expression, millions of candidate expressions are generated, each requiring evaluation against every input configuration to assess how well they fit to the system. Consequently, millions of evaluations must be performed, a process that is computationally intensive and time-consuming. Thus, optimizing the evaluation phase of symbolic regression is crucial for discovering expressions that describe large and complex systems within a feasible timeframe. +The objective of symbolic regression is to identify an expression that accurately models a system based on a set of inputs. For instance, one might determine the flow through pipes using inputs such as roughness, diameter, and length by conducting experiments with varying input configurations and observing the resulting flow and derive an expression from the experiments. This methodology, exemplified by \textcite{nikuradse_laws_1950}, can be applied to any system through symbolic regression. 
To find the best-fitting expression, millions of candidate expressions are generated, each requiring evaluation against every data point to assess how well they fit to the system. Consequently, millions of evaluations must be performed, a process that is computationally intensive and time-consuming. Thus, optimizing the evaluation phase of symbolic regression is crucial for discovering expressions that describe large and complex systems within a feasible timeframe. % Applications such as weather simulation \parencite{michalakes_gpu_2008}, simulation of static and rotating black holes \parencite{hissbach_overview_2022, verbraeck_interactive_2021}, and structural analysis \parencite{georgescu_gpu_2013} significantly benefit from optimized algorithms that leverage the graphics processing unit (GPU). -This thesis presents the design and implementation of two evaluators that utilize the GPU to evaluate expressions generated at runtime by the symbolic regression algorithm. Performance benchmarks are conducted to compare the efficiency of the GPU evaluators against the current CPU evaluator. +This thesis presents the design and implementation of two evaluators that utilize the GPU to evaluate expressions generated at runtime by the symbolic regression algorithm. Performance benchmarks are conducted to compare the efficiency of the GPU evaluators against a CPU evaluator. -The benchmark results indicate that the GPU can serve as a viable alternative to the CPU in certain scenarios. The determining factor for choosing between GPU and CPU evaluation is the number of input configurations. In a scenario with $10\,000$ expressions and $10\,000$ input configurations, the GPU outperformed the CPU by a significant margin. +The benchmark results indicate that the GPU can serve as a viable alternative to the CPU in certain scenarios. The determining factor for choosing between GPU and CPU evaluation is the number of data points. 
In a scenario with $10\,000$ expressions and $10\,000$ data points, the GPU outperformed the CPU by a factor between $1.6$ and $2$. -This master thesis is associated with the FFG COMET project ProMetHeus (\#904919). The developed software is used and further developed for modelling in the ProMetHeus project. +This master thesis is associated with the FFG COMET project ProMetHeus (\#904919). The developed software is used and further developed for symbolic regression in the ProMetHeus project. diff --git a/thesis/front/kurzfassung.tex b/thesis/front/kurzfassung.tex index 43510d1..d658d66 100644 --- a/thesis/front/kurzfassung.tex +++ b/thesis/front/kurzfassung.tex @@ -1,12 +1,12 @@ \chapter{Kurzfassung} \begin{german} -Das Ziel der symbolischen Regression ist es, einen Ausdruck zu finden, der ein System basierend auf einer Reihe von Variablen modelliert. Beispielsweise kann man den Durchfluss durch Rohre unter Verwendung von Variablen wie Rauheit, Durchmesser und Länge bestimmen, indem Experimente mit verschiedenen Werten für die Variablen durchgeführt werden. Für jedes Experiment wird der Durchfluss gemessen, wodurch man eine allgemeine Formel ableiten kann, welche die Beziehung der Variablen mit dem Durchfluss beschreibt. Diese Methodik, veranschaulicht durch die Arbeit von \textcite{nikuradse_laws_1950}, kann auf unterschiedliche Systeme mithilfe von symbolischer Regression angewendet werden. Um einen Ausdruck zu finden, welcher das System am besten beschreibt, werden Millionen von Kandidatenausdrücken generiert. Diese müssen, unter Verwendung der Variablenkonfiguration aller Experimente ausgewertet werden, um ihre Passgenauigkeit zum System zu beurteilen. Folglich müssen Millionen von Auswertungen durchgeführt werden, ein Prozess, der rechenintensiv und zeitaufwendig ist. Daher ist die Optimierung der Auswertungsphase der symbolischen Regression entscheidend. 
So wird es ermöglicht Ausdrücke in einem angemessenen Zeitrahmen zu finden, welche große und komplexe Systeme beschreiben. +Das Ziel der symbolischen Regression ist es, einen Ausdruck zu finden, der ein System basierend auf einer Reihe von Variablen modelliert. Beispielsweise kann man den Durchfluss durch Rohre unter Verwendung von Variablen wie Rauheit, Durchmesser und Länge bestimmen, indem Experimente mit verschiedenen Werten für die Variablen durchgeführt werden. Für jedes Experiment wird der Durchfluss gemessen, wodurch man eine allgemeine Formel ableiten kann, welche die Beziehung der Variablen mit dem Durchfluss beschreibt. Diese Methodik, veranschaulicht durch die Arbeit von \textcite{nikuradse_laws_1950}, kann auf unterschiedliche Systeme mithilfe von symbolischer Regression angewendet werden. Um einen Ausdruck zu finden, welcher das System am besten beschreibt, werden Millionen von Kandidatenausdrücken generiert. Diese müssen, unter Verwendung der Daten aller Experimente ausgewertet werden, um ihre Passgenauigkeit zum System zu beurteilen. Folglich müssen Millionen von Auswertungen durchgeführt werden, ein Prozess, der rechenintensiv und zeitaufwendig ist. Daher ist die Optimierung der Auswertungsphase der symbolischen Regression entscheidend. So wird es ermöglicht Ausdrücke in einem angemessenen Zeitrahmen zu finden, welche große und komplexe Systeme beschreiben. Diese Arbeit präsentiert das Design und die Implementierung von zwei Evaluatoren, die die Grafikkarte (GPU) nutzen, um Ausdrücke zu bewerten, die zur Laufzeit der symbolischen Regression generiert werden. Leistungsbenchmarks werden durchgeführt, um die Performanz der GPU-Evaluatoren mit dem aktuellen CPU-Evaluator zu vergleichen. -Die Benchmark-Ergebnisse zeigen, dass die GPU in bestimmten Szenarien als eine tragfähige Alternative zur CPU dienen kann. 
Der entscheidende Faktor für die Wahl zwischen GPU- und CPU-Auswertung ist die Anzahl der Experimente und folglich die Menge an Variablenkonfigurationen. In einer Konfiguration mit $10\,000$ Ausdrücken und $10\,000$ Variablenkonfigurationen übertraf die GPU die CPU um ein bedeutendes Maß. +Die Benchmark-Ergebnisse zeigen, dass die GPU in bestimmten Szenarien eine geeignete Alternative zur CPU darstellt. Der entscheidende Faktor für die Wahl zwischen GPU- und CPU-Auswertung ist die Anzahl der Experimente und folglich die Anzahl der Datenpunkte. In einer Konfiguration mit $10\,000$ Ausdrücken und $10\,000$ Datenpunkten übertraf die GPU die CPU um ein bedeutendes Maß. -Diese Masterarbeit steht im Zusammenhang mit dem FFG COMET Projekt ProMetHeus (\#904919). Die entwickelte Software wird für die Modellierung im ProMetHeus Projekt verwendet und weiterentwickelt. +Diese Masterarbeit ist Teil des FFG COMET Projekts ProMetHeus (\#904919). Die entwickelte Software wird für die symbolische Regression im ProMetHeus Projekt verwendet und weiterentwickelt.
\end{german} \ No newline at end of file diff --git a/thesis/images/input_output_explanation.png b/thesis/images/input_output_explanation.png index 414595b..d0f8c62 100644 Binary files a/thesis/images/input_output_explanation.png and b/thesis/images/input_output_explanation.png differ diff --git a/thesis/images/interpreter_sequence_diagram.png b/thesis/images/interpreter_sequence_diagram.png index e7347b0..d5814cd 100644 Binary files a/thesis/images/interpreter_sequence_diagram.png and b/thesis/images/interpreter_sequence_diagram.png differ diff --git a/thesis/images/results/cpu_gpui_gput_bench1.png b/thesis/images/results/cpu_gpui_gput_bench1.png index ad27310..7b08d39 100644 Binary files a/thesis/images/results/cpu_gpui_gput_bench1.png and b/thesis/images/results/cpu_gpui_gput_bench1.png differ diff --git a/thesis/images/results/cpu_gpui_gput_bench2.png b/thesis/images/results/cpu_gpui_gput_bench2.png index 1423c17..29dc5e3 100644 Binary files a/thesis/images/results/cpu_gpui_gput_bench2.png and b/thesis/images/results/cpu_gpui_gput_bench2.png differ diff --git a/thesis/images/results/cpu_gpui_gput_bench3.png b/thesis/images/results/cpu_gpui_gput_bench3.png index 0749fad..238d1a8 100644 Binary files a/thesis/images/results/cpu_gpui_gput_bench3.png and b/thesis/images/results/cpu_gpui_gput_bench3.png differ diff --git a/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png b/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png index fe59dbf..e6554ca 100644 Binary files a/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png and b/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png differ diff --git a/thesis/images/results/gpu-interpreter-final-performance-benchmark2.png b/thesis/images/results/gpu-interpreter-final-performance-benchmark2.png index 8a75d84..66ab87f 100644 Binary files a/thesis/images/results/gpu-interpreter-final-performance-benchmark2.png and 
b/thesis/images/results/gpu-interpreter-final-performance-benchmark2.png differ diff --git a/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png b/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png index 2f47116..1519b33 100644 Binary files a/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png and b/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png differ diff --git a/thesis/images/results/gpu-transpiler-final-performance-benchmark2.png b/thesis/images/results/gpu-transpiler-final-performance-benchmark2.png index d1cb77f..da123bb 100644 Binary files a/thesis/images/results/gpu-transpiler-final-performance-benchmark2.png and b/thesis/images/results/gpu-transpiler-final-performance-benchmark2.png differ diff --git a/thesis/images/results/gpu-transpiler-final-performance-benchmark3.png b/thesis/images/results/gpu-transpiler-final-performance-benchmark3.png index d4d11a2..bfe76e9 100644 Binary files a/thesis/images/results/gpu-transpiler-final-performance-benchmark3.png and b/thesis/images/results/gpu-transpiler-final-performance-benchmark3.png differ diff --git a/thesis/images/results/interpreter-comparison-128-160-192.png b/thesis/images/results/interpreter-comparison-128-160-192.png index 8cc8542..5a52bd4 100644 Binary files a/thesis/images/results/interpreter-comparison-128-160-192.png and b/thesis/images/results/interpreter-comparison-128-160-192.png differ diff --git a/thesis/images/results/interpreter-comparison-initial-optim1.png b/thesis/images/results/interpreter-comparison-initial-optim1.png index 79515d5..127d7f1 100644 Binary files a/thesis/images/results/interpreter-comparison-initial-optim1.png and b/thesis/images/results/interpreter-comparison-initial-optim1.png differ diff --git a/thesis/images/results/interpreter-comparison-optim1-optim2.png b/thesis/images/results/interpreter-comparison-optim1-optim2.png index 94abf9c..4d77c65 100644 Binary files 
a/thesis/images/results/interpreter-comparison-optim1-optim2.png and b/thesis/images/results/interpreter-comparison-optim1-optim2.png differ diff --git a/thesis/images/results/interpreter-comparison-optim2-optim3.png b/thesis/images/results/interpreter-comparison-optim2-optim3.png index 986f78b..6c8ab1b 100644 Binary files a/thesis/images/results/interpreter-comparison-optim2-optim3.png and b/thesis/images/results/interpreter-comparison-optim2-optim3.png differ diff --git a/thesis/images/results/transpiler-comparison-128-160.png b/thesis/images/results/transpiler-comparison-128-160.png index 2d03735..b09b119 100644 Binary files a/thesis/images/results/transpiler-comparison-128-160.png and b/thesis/images/results/transpiler-comparison-128-160.png differ diff --git a/thesis/images/transpiler_sequence_diagram.png b/thesis/images/transpiler_sequence_diagram.png index 7decfd3..b5a89fa 100644 Binary files a/thesis/images/transpiler_sequence_diagram.png and b/thesis/images/transpiler_sequence_diagram.png differ diff --git a/thesis/main.pdf b/thesis/main.pdf index cf310ae..25a08eb 100644 Binary files a/thesis/main.pdf and b/thesis/main.pdf differ diff --git a/thesis/main.tex b/thesis/main.tex index f21ae4d..1f6bcb0 100644 --- a/thesis/main.tex +++ b/thesis/main.tex @@ -49,7 +49,9 @@ \frontmatter % Front part (roman page numbers) %%%----------------------------------------------------------------------------- -\maketitle +\includepdf[pages=1]{title_page.pdf} +\includepdf[pages=2, pagecommand={\thispagestyle{plain}}]{title_page.pdf} + \tableofcontents \include{front/abstract} diff --git a/thesis/references.bib b/thesis/references.bib index 35caaeb..5ff47d1 100644 --- a/thesis/references.bib +++ b/thesis/references.bib @@ -1279,3 +1279,23 @@ date = {2020-01-31}, file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\HLG9FD4H\\Guimerà et al. 
- 2020 - A Bayesian machine scientist to aid in the solution of challenging scientific problems.pdf:application/pdf}, } + +@book{kommenda_local_2018, + title = {Local Optimization and Complexity Control for Symbolic Regression / eingereicht von Michael Kommenda}, + url = {http://epub.jku.at/obvulihs/2581907}, + abstract = {Hochschulschriften. Local Optimization and Complexity Control for Symbolic Regression / eingereicht von Michael Kommenda. Linz, 2018}, + author = {Kommenda, Michael}, + urldate = {2025-06-28}, + date = {2018}, + langid = {english}, + file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\9LXXSHJ8\\Kommenda - 2018 - Local Optimization and Complexity Control for Symbolic Regression eingereicht von Michael Kommenda.pdf:application/pdf}, +} + +@online{pci-sig_pci_2025, + title = {{PCI} Express 6.0 Specification {\textbar} {PCI}-{SIG}}, + url = {https://pcisig.com/pci-express-6.0-specification}, + author = {{PCI-SIG}}, + urldate = {2025-06-28}, + date = {2025}, + file = {PCI Express 6.0 Specification | PCI-SIG:C\:\\Users\\danwi\\Zotero\\storage\\MSYN4ZIU\\pci-express-6.html:text/html}, +} diff --git a/thesis/title_page.pdf b/thesis/title_page.pdf new file mode 100644 index 0000000..dec62c8 Binary files /dev/null and b/thesis/title_page.pdf differ