Compare commits

..

No commits in common. "main" and "2-performance-inbounds" have entirely different histories.

25 changed files with 73 additions and 214 deletions

View File

@ -1,6 +1,6 @@
MIT License
Copyright (c) 2024 Daniel Roth
Copyright (c) 2024 Daniel Wiplinger
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
results = Matrix{Float32}(undef, ncols, length(exprs))
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
for i in 1:repetitions # Simulate parameter tuning
results = Interpreter.interpret(exprs, X, p)
end
@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
results = Matrix{Float32}(undef, ncols, length(exprs))
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
for i in 1:repetitions # Simulate parameter tuning
results = Transpiler.evaluate(exprs, X, p)
end

View File

@ -31,9 +31,9 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
# Start kernel for each expression to ensure that no warp is working on different expressions
@inbounds for i in eachindex(exprs)
kernel = @cuda launch=false fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
# config = launch_configuration(kernel.fun)
threads = min(variableCols, 128)
threads = min(variableCols, 256)
blocks = cld(variableCols, threads)
kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
@ -52,8 +52,9 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
return
end
# firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
# lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
@inbounds firstParamIndex = ((exprIndex - 1) * stepsize[1]) # Exclusive
# TODO: Use @cuDynamicSharedMem/@cuStaticSharedMem for variables and or parameters
operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
operationStackTop = 0 # stores index of the last defined/valid value
@ -103,7 +104,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
end
else
operationStack[operationStackTop] = NaN32
operationStack[operationStackTop] = NaN
break
end
end

View File

@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
for kernel in kernels
# config = launch_configuration(kernels[i])
threads = min(variableCols, 96)
threads = min(variableCols, 256)
blocks = cld(variableCols, threads)
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)

View File

@ -4,11 +4,7 @@ using BenchmarkTools
using .Transpiler
using .Interpreter
const BENCHMARKS_RESULTS_PATH = "./results-fh"
# TODO: Expressions can get much much bigger (into millions) (will be provided by Mr. Kronberger)
# TODO: Variable-Sets: 1000 can be considered the minimum; 100.000 can be considered the maximum (will be provided by Mr. Kronberger)
const BENCHMARKS_RESULTS_PATH = "./results"
exprsCPU = [
# CPU interpreter requires an anonymous function and array ref s
:(p[1] * x[1] + p[2]), # 5 op
@ -28,7 +24,7 @@ exprsGPU = [
# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
expr_reps = 100 # 100 parameter optimisation steps basically
@testset "CPU performance" begin
@ -73,7 +69,7 @@ end
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re do the tests and perform them on FH PCs)
# University setup at 10.20.1.7 if needed
compareWithCPU = true
compareWithCPU = false
suite = BenchmarkGroup()
@ -93,15 +89,15 @@ if compareWithCPU
suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end
X_small_GPU = randn(Float32, 5, varsets_small) # column-major
X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
X_large_GPU = randn(Float32, 5, varsets_large) # column-major
X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
@ -147,10 +143,9 @@ if compareWithCPU
println(gpuiVsGPUT_median)
println(gpuiVsGPUT_std)
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
else
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
medianGPUI_old = median(resultsOld["GPUI"])
stdGPUI_old = std(resultsOld["GPUI"])

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,8 +1,6 @@
using ExpressionExecutorCuda
using Test
using BenchmarkTools
const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
include(joinpath(baseFolder, "src", "Utils.jl"))
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
@ -21,6 +19,6 @@ end
# end
@testset "Performance tests" begin
# include("PerformanceTuning.jl")
include("PerformanceTests.jl")
include("PerformanceTuning.jl")
# include("PerformanceTests.jl")
end

View File

@ -1,5 +1,3 @@
RE-READ to ensure that concepts why this is done to improve performance and why this should be the "locally best" implementation (most should be in implementation though)
\chapter{Concept and Design}
\label{cha:conceptdesign}
% introduction to what needs to be done. also clarify terms "Host" and "Device" here
@ -27,23 +25,15 @@ The main goal of both prototypes or evaluators is to provide a speed-up compared
\end{figure}
With this, the required capabilities are outlined. However, for a better understanding, the input and output data need to be explained further. The first input contains the expressions that need to be evaluated. These can be of any length and can contain constant values, variables and parameters, all of which are linked together with the supported operations. In the simplified example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ to $e_6$.
With this, the required capabilities are outlined. However, the input and output data need to further be explained for a better understanding. The first input contains the expressions that need to be evaluated. These can have any length and can contain constant values, variables and parameters and all of these are linked together with the supported operations. In the example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ through $e_6$. Next is the variable matrix. One entry in this matrix, corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example the values in row three, are used to parametrise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using every variable set. In the provided example, there are three variable sets, each holding the values for four variables $x_1$ through $x_4$. After all expressions are evaluated using all variable sets the results of these evaluations must be stored in the results matrix. Each entry in this matrix holds the resulting value of the evaluation of one expression parametrised with one variable set. The row indicates the variable set while the column indicates the expression.
Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example the values in row three, are used to parameterise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using each set of variable. In the provided example, there are three variable sets, each containing the values for four variables $x_1$ to $x_4$. After all expressions have been evaluated using all variable sets, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one variable set. The row indicates the variable set and the column indicates the expression.
The prototypes developed in this thesis, are part of a GP algorithm for symbolic regression. This means that the expressions that are evaluated, represent parts of the search space of all expressions being made up of any combination of allowed operators, the set of input variables, a set of parameters and constants. This means that the size of the search space grows exponentially. Exploring this search space by simply generating expressions, evaluating them once and then generating the next set of expressions leaves much of the search space unexplored. To combat this, parameters are introduced. These allow the algorithm to perform some kind of local search. To enable this, the prototypes must support not only variables, but also parameters.
The parameters themselves are unique to each expression, meaning they have a one-to-one mapping to an expression. Furthermore, as can be seen in Figure \ref{fig:input_output_explanation}, each expression can have a different number of parameters, or even no parameters at all. However, with no parameters, it wouldn't be possible to perform parameter optimisation. This is in contrast to variables, where each expression must have the same number of variables. Because parameters are unique to each expression and can vary in size, they are not structured as a matrix, but as a vector of vectors.
An important thing to consider, is the volume and volatility of the data itself. The example used above has been drastically simplified. It is expected, that there are hundreds of expressions evaluate per GP generation. Each of these expressions may contain between ten and 50 tokens. A token is equivalent to either a variable, a parameter, a constant value or an operator.
Usually, the number of variables per expression is around ten. However, the number of variable sets can increase drastically. It can be considered, that $1\,000$ variable sets is the lower limit. On the other hand, $100\,000$ can be considered as the upper limit. Considering that one variable takes up 4 bytes of space and 10 variables are needed per expression, at least $4 * 10 * 1\,000 = 40\,000$ bytes and at most $4 * 10 * 100\,000 = 400\,000$ bytes need to be transferred to the GPU for the variables.
These variables do not change during the runtime of the symbolic regression algorithm. As a result the data only needs to be sent to the GPU once. This means that the impact of this data transfer is minimal. On the other hand, the data for the parameters is much more volatile. As explained above, they are used for parameter optimisation and therefore vary from evaluation to evaluation and need to be sent to the GPU very frequently. However, the amount of data that needs to be sent is also much smaller. TODO: ONCE I GET THE DATA SEE HOW MANY BYTES PARAMETERS TAKE ON AVERAGE
This is the minimal functionality needed to evaluate expressions with variables generated by a symbolic regression algorithm. In the case of parameter optimisation, it is useful to have a different type of variable, called parameter. For parameter optimisation it is important that for the given variable sets, the best fitting parameters need to be found. To achieve this, the evaluator is called multiple times with different parameters, but the same variables. The results are then evaluated for their fitness by the caller. In this case, the parameters do not change within one call. Parameters could therefore be treated as constant values of the expressions, and no separate input for them would be needed. However, providing the possibility to have the parameters as an input, makes the process of parameter optimisation easier. Unlike variables, not all expressions need to have the same number of parameters. Therefore, they are structured as a vector of vectors and not a matrix. The example in Figure \ref{fig:input_output_explanation} shows how the parameters are structured. For example one expression has zero parameters, while another has six parameters $p_1$ through $p_6$. It needs to be mentioned that just like the number of variables, the number of parameters per expression is not limited. It is also possible to completely omit the parameters if they are not needed. Because these evaluators will primarily be used in parameter optimisation use-cases, allowing parameters as an input is required.
% \subsection{Non-Goals}
% Probably a good idea. Probably move this to "introduction"
\section{Architecture}
Based on the requirements and data structure above, the architecture of both prototypes can be designed. While the requirements only specify the input and output, the components and workflow also need to be specified. This section aims at giving an architectural overview of both prototypes, alongside their design decisions.
Based on the requirements above, the architecture of both prototypes can be designed. While the requirements only specify the input and output, the components and workflow also need to be specified. This section aims at giving an architectural overview of both prototypes, alongside their design decisions.
\begin{figure}
\centering
@ -52,12 +42,10 @@ Based on the requirements and data structure above, the architecture of both pro
\label{fig:kernel_architecture}
\end{figure}
A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU.
A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, has the possibility to improve the performance. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself.
\subsection{Pre-Processing}
The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator.
The conversion into the intermediate representation transforms the expressions from infix-notation into postfix notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix notation, it also contains information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look like depicted in figure \ref{fig:pre-processing_results} after the pre-processing step.
The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Additionally, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the front-end of a compiler as described in Section \ref{sec:compilers}. The conversion into the intermediate representation transforms the expressions from infix-notation into postfix-notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix-notation, it also contains the information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look like depicted in figure \ref{fig:pre-processing_results} after the pre-processing step.
\begin{figure}
\centering
@ -66,9 +54,7 @@ The conversion into the intermediate representation transforms the expressions f
\label{fig:pre-processing_results}
\end{figure}
It would have also been possible to perform the pre-processing step on the GPU. However, pre-processing only one expression can not easily be split into multiple threads, which means one GPU thread would need to process one expression. As described in Section \ref{sec:gpgpu} a single GPU thread is slower than a single CPU thread and as a result means the processing will also be slower. Furthermore, it wouldn't make sense to process all expressions in a single kernel. This would lead to a lot of thread divergence, which essentially means processing one expression after the other. The SIMT programming model might help with parallelising at least some parts of the processing work. However, the generated expressions can differ a lot from each other and restricting them to be similar and therefore SIMT friendly, would likely reduce the overall quality of the symbolic regression algorithm. Therefore, it does not make sense to perform the processing step on the GPU.
The already mentioned concept of processing one expression per thread can also be used on the CPU, which is better designed for this type of work. Concepts such as caching processed expressions, or caching parts of the processed expressions can also be employed on the CPU to speed up pre-processing. This would not be possible on the GPU, because a GPU can not save state between two kernel dispatches. This is a typical example of code that is better run on the CPU and shows how the CPU and GPU need to work together and exploit their respective strengths to achieve the best performance.
It would have also been possible to perform the pre-processing step on the GPU. However, pre-processing only one expression can not easily be split into multiple threads, which means one GPU thread would need to process one expression. As described in Section \ref{sec:gpgpu} a single GPU thread is slower than a single CPU thread and as a result means the processing will also be slower. Furthermore, it wouldn't make sense to process all expressions in a single kernel. This would lead to a lot of thread divergence, which essentially means processing one expression after the other. The SIMT programming model might help with parallelising at least some parts of the processing work. However, the generated expressions can differ a lot from each other and restricting them to be similar and therefore SIMT friendly, would likely reduce the overall quality of the symbolic regression algorithm. Therefore, it does not make sense to perform the processing step on the GPU. This is a typical example of code that is better run on the CPU, also because the parallelisation possibilities of one thread per expression can be applied to the CPU as well. Concepts like caching processed expressions, or caching parts of the processed expressions can also be employed on the CPU. This would not be possible on the GPU, because a GPU can not save state between two kernel dispatches.
\subsection{Interpreter}
@ -79,16 +65,12 @@ The already mentioned concept of processing one expression per thread can also b
\label{fig:component_diagram_interpreter}
\end{figure}
The interpreter consists of two parts. The CPU side is the part of the program, that interacts with both the GPU and the caller. An overview of the components and the workflow of the interpreter is shown in Figure \ref{fig:component_diagram_interpreter}. Once the interpreter has received the expressions, they are pre-processed. This ensures that the expressions are valid, and that they are transformed into the intermediate representation needed to evaluate them. The result of this pre-processing step is then sent to the GPU, which performs the actual interpretation of the expressions. In addition to the expressions, the data for the variables and parameters must also be sent to the GPU.
The interpreter consists of two parts. The CPU side is the part of the program, that interacts with both the GPU and the caller. An overview on the components and the workflow of the interpreter can be seen in Figure \ref{fig:component_diagram_interpreter}. Once the interpreter receives the expressions, they are pre-processed. This ensures the expressions are valid, and that they are transformed into the intermediate representation needed for evaluating them. The results of this pre-processing are then sent to the GPU, which performs the actual interpretation of the expressions. Alongside the expressions, the data for the variables and parameters also needs to be sent to the GPU. Once all the data resides on the GPU, the interpreter kernel can be dispatched. It needs to be noted, that for each of the expressions, a separate kernel will be dispatched. As already described, this decision has been made to reduce thread divergence and therefore increase performance. In fact, dispatching the same kernel multiple times with different expressions, means, there will not occur any thread divergence as explained later. Once the GPU has finished evaluating all expressions with all variable sets, the result will be stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements.
Once all the data is present on the GPU, the interpreter kernel can be dispatched. As already described, the kernel will be dispatched for each expression to reduce thread divergence. In fact, dispatching the same kernel multiple times with different expressions, means, there will not occur any thread divergence which will be explained later.
After the GPU has finished evaluating all expressions with all variable sets, the result is stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements.
Evaluating the expressions is relatively straight forward. Because the expressions are in postfix notation, the actual interpreter just needs to iterate over all the tokens and perform the appropriate tasks. If the interpreter encounters a binary operator, it simply needs to read the previous two values and perform the operation specified by the operator. For unary operators, only the previous value needs to be read. As already mentioned, expressions in postfix notation implicitly contain the operator precedence, therefore no look-ahead or other strategies need to be used to ensure correct evaluation. This also means that each token is visited exactly once and no unnecessary or overhead work needs to be done. The Algorithm \ref{alg:eval_interpreter} shows how the interpreter works. Note that this is a simplified version, that only works with additions, multiplications, constants and variables.
Evaluating the expressions is relatively straight forward. Due to the expressions being in postfix-notation, the actual interpreter must only iterate over all tokens once and perform the appropriate tasks. If the interpreter encounters a binary operator, it must simply read the previous two values and perform the operation specified by the operator. For unary operators, only the previous value must be read. As already mentioned, expressions in postfix-notation implicitly contain the operator precedence, therefore no look-ahead or other strategies need to be used to ensure correct evaluation. The Algorithm \ref{alg:eval_interpreter} shows how the interpreter works. Note that this is a simplified version, that only works with additions, multiplications, constant values and variables.
\begin{algorithm}
\caption{Interpreting an equation in postfix notation}\label{alg:eval_interpreter}
\caption{Interpreting an equation in postfix-notation}\label{alg:eval_interpreter}
\begin{algorithmic}[1]
\Procedure{Evaluate}{\textit{expr}: PostfixExpression}
\State $\textit{stack} \gets []$
@ -112,18 +94,14 @@ Evaluating the expressions is relatively straight forward. Because the expressio
\EndIf
\EndWhile
\State StoreResult($\text{Pop}(\textit{stack})$)
\Return $\text{Pop}(\textit{stack})$
\EndProcedure
\end{algorithmic}
\end{algorithm}
The handling of constants and variables is very simple. Constants only need to be stored on the stack for later use. Variables also only need to be stored on the stack. However, their value must first be loaded from the variable matrix according to the token value of the variable. Since the entire variable matrix is sent to the GPU, the index of the variable set is also needed to load the variable value. However, for the sake of simplicity, it has been omitted from the algorithm.
If a new operator is needed, it must simply be added as another else-if block inside the operator branch. New token types like variables or parameters, can also be added by adding a new outer else-if block that checks for these token types. However, the pre-processing step also needs to be extended with these new operators and token types. Otherwise, the expression will never reach the evaluation step as they would be seen as invalid. It is also possible to add unary operators like $\log()$. In this case only one value would be read from the stack, the operation would be applied, and the result would be written back to the stack.
When an operator token is encountered, the handling becomes more complex. The value of the token indicates the type of operation to be applied. For binary operators, the top two values on the stack need to be used as input to the operator. For unary operators, only the top value of the stack needs to be used as an input. Once the result has been computed, it must be stored at the top of the stack to be used as an input for the next operation.
At the end of the algorithm, the stack contains one last entry. This entry is the value computed by the expression with the designated variable set and parameters. In order to send this value back to the CPU, it must be stored in the result matrix. The last statement performs this action. It again has been simplified to omit the index of the expression and variable set needed to store the result at the correct location.
The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for each expression that needs to be evaluated, to prevent thread divergence. Thread divergence can only occur on data-dependent branches. In this case, the while loop and every if and else-if statement contains a data-dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators or variables in the same order, and would therefore cause each thread to take a different path. However, one expression always has the same constants, operators and variables in the same locations, meaning that all threads will take the same path. This also means that although the interpreter contains many data-dependent branches, these branches only depend on the expression itself. Because of this, all threads will follow the same path and will therefore never diverge from one another as long as they are executing the same expression.
The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for every expression that needs to be evaluated, to eliminate thread divergence. Thread divergence can only happen on data dependent branches. In this case, the while loop and every if and else-if statement contains a data dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators and variables in the same order and would therefore lead to each thread, taking different paths. However, one expression, always has the same constants, operators and variables in the same locations, meaning all threads will take the same paths. This also means that despite the interpreter containing many data dependent branches, these branches only depend on the expression itself. Because of this, all threads will take the same paths and therefore will never diverge from one another if they execute the same expression.
\subsection{Transpiler}
@ -134,16 +112,12 @@ The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This
\label{fig:component_diagram_transpiler}
\end{figure}
Similar to the interpreter, the transpiler also consists of a part that runs on the CPU and a part that runs on the GPU. Looking at the components and workflow of the transpiler, as shown in Figure \ref{fig:component_diagram_transpiler}, it is almost identical to the interpreter. However, the key difference between the two, is the additional code generation, or transpilation step. Apart from that, the transpiler also needs the same pre-processing step and also the GPU to evaluate the expressions. However, the GPU evaluator generated by the transpiler works very differently to the GPU evaluator for the interpreter. The difference between these evaluators will be explained later.
Similar to the interpreter, the transpiler also consists of a part that runs on the CPU and a part that runs on the GPU. When looking at the components and workflow of the transpiler, as shown in Figure \ref{fig:component_diagram_transpiler}, it is almost identical to the interpreter. However, the key difference between the two, is the additional code generation, or transpilation step. Apart from that, the transpiler also needs the same pre-processing step and also the GPU to evaluate the expressions. However, the GPU evaluator generated by the transpiler works differently to the GPU evaluator for the interpreter. The difference between these evaluators will be explained later.
Before the expressions can be transpiled into PTX code, they have to be pre-processed. As already described, this step ensures the validity of the expressions and transforms them into the intermediate representation described above. As with the interpreter, this also simplifies the code generation step. By transforming the expressions into postfix notation, the code generation follows a similar pattern to the interpretation already described.
Algorithm \ref{alg:transpile} shows how the transpiler takes an expression, transpiles it and then returns the finished code. It can be seen that the while loop is largely the same as the while loop of the interpreter. The main difference is in the operator branches, because now code needs to be generated instead of the result of computing the expression. Therefore, the branches themselves call their designated code generation function, such as $\textit{GetAddition}$. This function returns the PTX code responsible for the addition. However, this function must return more than just the code that performs the addition. When executed, this addition also returns a value which will be needed as an input by other operators. Therefore, not only the code fragment must be returned, but also the reference to the result.
This reference can then be put on the stack for later use, just as the interpreter stores the value for later use. The code fragment must also be added to the already generated code so that it can be returned to the caller. As with the interpreter, there is a final value on the stack when the loop has finished. Once the code has been executed, this value is the reference to the result of the expression. This value then needs to be stored in the result matrix, so that it can be retrieved by the CPU after all expressions have been executed on the GPU. Therefore, a final code fragment must be generated to handle the storage of this value in the result matrix. This fragment must then be added to the code already generated, and the transpilation process is complete.
Before the expressions can be transpiled into PTX code, they need to be pre-processed. As already described, this step ensures the validity of the expressions and transforms them into the intermediate representation described above. As with the interpreter, this also simplifies the code generation step at the cost of some performance because the validity has to be ensured, and the intermediate representation needs to be generated. However, in this case the benefit of having a simple code generation step was more important than performance. By transforming the expressions into postfix-notation, the code generation follows a similar pattern to the interpretation already described. Algorithm \ref{alg:transpile} shows how the transpiler takes an expression, transpiles it and then returns the finished code. It can be seen that the while loop is the same as the while loop of the interpreter. The main difference is in the operator branches. Because now code needs to be generated, the branches themselves call their designated code generation function, such as $\textit{GetAddition}$. However, this function can not only return the code that performs the addition for example. When executed, this addition also returns a value which will be needed as an input by other operators. Therefore, not only the code fragment must be returned, but also the reference to the result. This reference can then be put on the stack for later use the same as the interpreter stores the value for later use. The code fragment must also be added to the already generated code so that it can be returned to the caller. As with the interpreter, there is a final value on the stack when the loop has finished. Once the code is executed, this value is the reference to the result of the expression. This value then needs to be stored in the results matrix, so that it can be retrieved by the CPU after all expressions have been executed on the GPU. Therefore, one last code fragment must be generated to handle the storage of this value in the results matrix. This fragment must then be added to the code already generated, and the transpilation process is completed.
\begin{algorithm}
\caption{Transpiling an equation in postfix notation}\label{alg:transpile}
\caption{Transpiling an equation in postfix-notation}\label{alg:transpile}
\begin{algorithmic}[1]
\Procedure{Transpile}{\textit{expr}: PostfixExpression}: String
\State $\textit{stack} \gets []$
@ -182,10 +156,10 @@ This reference can then be put on the stack for later use, just as the interpret
\end{algorithmic}
\end{algorithm}
The code generated by the transpiler is the kernel for the transpiled expressions. This means that a new kernel must be generated for each expression that needs to be evaluated. This is in contrast to the interpreter, which has one kernel and dispatches it once for each expression. However, generating one kernel per expression results in a much simpler kernel. This allows the kernel to focus on evaluating the postfix expression from left to right. There is no overhead work such as branching or managing a stack. However, this overhead is now shifted to the transpilation step on the CPU which can be seen in Algorithm \ref{alg:transpile}. There is also a noticeable overhead in that a kernel has to be generated for each expression. In cases like parameter optimisation, many of the expressions will be transpiled multiple times as the transpiler is called multiple times with the same expressions.
The code generated by the transpiler is the kernel for the transpiled expressions. This means that a new kernel must be generated for each expression that needs to be evaluated. This is in contrast to the interpreter, which has one kernel and dispatches it once for each expression. However, generating one kernel per expression results in a much simpler kernel. This allows the kernel to focus on evaluating the postfix expression from left to right. No overhead work, like branching or managing a stack is needed. However, this overhead is now offloaded to the transpilation step on the CPU as can be seen in Algorithm \ref{alg:transpile}. There is also a noticeable overhead in that a kernel has to be generated for each expression. In cases like parameter optimisation, many of the expressions will be transpiled multiple times as the transpiler is called multiple times with the same expressions.
Both the transpiler and the interpreter have their respective advantages and disadvantages. While the interpreter puts less load on the CPU, the GPU has to perform more work. Much of this work involves branching or managing a stack, and therefore involves many instructions that are not used to evaluate the expression itself. However, this overhead can be mitigated by the fact, that all this work is performed in parallel rather than sequentially.
Both the transpiler and the interpreter have their respective advantages and disadvantages. While the interpreter puts less load on the CPU, the GPU has to perform more work. Much of this work is branching or managing a stack and therefore involves many instructions that are not used to evaluate the expression itself. However, this overhead can be mitigated by the fact, that all of this overhead is performed in parallel and not sequentially.
On the other hand, the transpiler performs more work on the CPU. The kernels are much simpler, and most of the instructions are used to evaluate the expressions themselves. Furthermore, as explained in Section \ref{sec:ptx}, any program running on the GPU, must be transpiled into PTX code before the driver can compile it into machine code. Therefore, the kernel written for the interpreter, must also be transpiled into PTX. This overhead is in addition to the branch instruction overhead. The self-written transpiler removes this intermediate step by transpiling directly into PTX. In addition, the generated code is tailored to evaluate expressions and does not need to generate generic PTX code, which can reduce transpilation time.
On the other hand, the transpiler performs more work on the CPU. The kernels are much simpler, and most of the instructions are used to evaluate the expressions themselves. Furthermore, as explained in Section \ref{sec:ptx}, any program running on the GPU, must be transpiled into PTX code before the driver can compile it into machine code. Therefore, the kernel written for the interpreter, must also be transpiled into PTX. This overhead is in addition to the branch instruction overhead. The self-written transpiler removes this intermediate step by transpiling directly to PTX. In addition, the generated code is tailored to evaluate expressions and does not need to generate generic PTX code, which can reduce transpilation time.
Unlike the GPU, the CPU can manage state across multiple kernel dispatches. Concepts such as caches can be employed by the transpiler to reduce the overhead on the CPU. In cases such as parameter optimisation, where expressions remain the same across multiple calls, the resulting PTX code can be cached. As a result, the same expression doesn't need to be transpiled multiple times which drastically reducing the transpilation time. This is an important optimisation as this can improve the overall performance of the transpiler.
Unlike the GPU, the CPU can manage state across multiple calls. Concepts such as caches can be employed by the transpiler to reduce the overhead on the CPU. In cases such as parameter optimisation, where expressions remain the same over multiple calls, the resulting PTX code can be cached. As a result the same expression doesn't need to be transpiled multiple times, drastically reducing the transpilation time and therefore improving the overall performance of the transpiler.

View File

@ -2,11 +2,8 @@
\label{cha:conclusion}
Summarise the results
talk again how a typical input is often not complex enough (basically repeat that statement from comparison section in evaluation)
\section{Future Work}
talk about what can be improved
Transpiler: transpile expression directly from Julia AST -> would save time because no intermediate representation needs to be created (looses step and gains performance, but also makes transpiler itself more complex)
CPU Interpreter: Probably more worth to dive into parallelising cpu interpreter itself (not really future work, as you wouldn't write a paper about that)
Transpiler: transpile expression directly from Julia AST -> would save time because no intermediate representation needs to be created (looses step and gains performance, but also makes transpiler itself more complex)

View File

@ -1,14 +1,9 @@
\chapter{Evaluation}
\label{cha:evaluation}
The aim of this thesis is to determine whether at least one of the GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation. First, the environment in which the performance tests are performed is explained. Then the individual results for the GPU interpreter and the transpiler are presented. In addition, this part also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to the CPU evaluator in order to answer the research questions of this thesis.
\section{Test environment}
Explain the hardware used, as well as the actual data (how many expressions, variables etc.)
three scenarios -> few, normal and many variable sets;; expr repetitions to simulate parameter optimisation
Benchmarktools.jl -> 1000 samples per scenario
\section{Results}
talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and a CPU interpreter)
@ -21,9 +16,6 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking
1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> noticeable improvement in 2 out of 3
3.) Tuned blocksize with NSight compute -> slight improvement
4.) used int32 everywhere to reduce register usage -> significant performance drop (probably because a lot more waiting time "latency hiding not working basically", or more type conversions happening on GPU? look at generated PTX code and use that as an argument to describe why it is slower)
5.) reverted previous; used fastmath instead -> imporvement (large var set is now faster than on transpiler)
\subsection{Transpiler}
Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section
@ -34,11 +26,6 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking
1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> small improvement only on CPU side code
3.) Tuned blocksize with NSight compute -> slight improvement
4.) Only changed things on interpreter side
5.) Only changed things on interpreter side
\subsection{Comparison}
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
talk about that compute portion is just too little. Only more complex expressions with higher var set count benefit well (make one or two performance evaluations, with 10 larger expressions and at least 1k var sets and present that here as point for that statement)
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter

View File

@ -3,8 +3,6 @@
somewhere in here explain why one kernel per expression and not one kernel for all expressions
Go into the details why this implementation is tuned towards performance and should be the optimum at that
\section{Technologies}
Short section; CUDA, PTX, Julia, CUDA.jl

View File

@ -1,7 +1,7 @@
\chapter{Introduction}
\label{cha:Introduction}
This chapter provides an entry point for this thesis. First the motivation of exploring this topic is presented. In addition, the research questions of this thesis are outlined. Lastly the methodology on how to answer these questions will be explained. This master thesis is associated with the FFG COMET project ProMetHeus (\#904919). The developed software is used and further developed for modelling in the ProMetHeus project.
This chapter provides an entry point for this thesis. First the motivation of exploring this topic is presented. In addition, the research questions of this thesis are outlined. Lastly the methodology on how to answer these questions will be explained.
\section{Background and Motivation}
%
@ -9,9 +9,11 @@ This chapter provides an entry point for this thesis. First the motivation of ex
%
Optimisation and acceleration of program code is a crucial part in many fields. For example video games need optimisation to lower the minimum hardware requirements which allows more people to run the game, increasing sales. Another example where optimisation is important are computer simulations. For those, optimisation is even more crucial, as this allows the scientists to run more detailed simulations or get the simulation results faster. Equation learning or symbolic regression is another field that can heavily benefit from optimisation. One part of equation learning, is to evaluate the expressions generated by a search algorithm which can make up a significant portion of the runtime. This thesis is concerned with optimising the evaluation part to increase the overall performance of equation learning algorithms.
The following expression $5 - \text{abs}(x_1) \, \sqrt{p_1} / 10 + 2^{x_2}$ which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$ is one example that can be generated by the equation learning algorithm, Usually an equation learning algorithm generates multiple of such expressions per iteration. Out of these expressions all possibly relevant ones have to be evaluated. Additionally, multiple different values need to be inserted for all variables and parameters, drastically increasing the amount of evaluations that need to be performed.
The following expression $5 - \text{abs}(x_1) * \text{sqrt}(x_2) / 10 + 2 \char`^ x_3$ which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$ is one example that can be generated by the equation learning algorithm, Usually an equation learning algorithm generates multiple of such expressions per iteration. Out of these expressions all possibly relevant ones have to be evaluated. Additionally, multiple different values need to be inserted for all variables and parameters, drastically increasing the amount of evaluations that need to be performed.
In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option as compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration also can be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}. These solutions were all developed using CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}}. However, it is also possible to develop assembly like code for GPUs using Parallel Thread Execution (PTX)\footnote{\url{https://docs.nvidia.com/cuda/parallel-thread-execution/}} to gain more control.
In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option as compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration also can be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}.
% TODO: Incorporate PTX somehow
\section{Research Question}

View File

@ -3,53 +3,29 @@
The goal of this chapter is to provide an overview of equation learning or symbolic regression to establish common knowledge of the topic and problem this thesis is trying to solve. First the field of equation learning is explored which helps to contextualise the topic of this thesis. The main part of this chapter is split into two sub-parts. The first part is exploring research that has been done in the field of general purpose computations on the GPU (GPGPU) as well as the fundamentals of it. Focus lies on exploring how graphics processing units (GPUs) are used to achieve substantial speed-ups and when and where they can be effectively employed. The second part describes the basics of how interpreters and compilers are built and how they can be adapted to the workflow of programming GPUs. When discussing GPU programming concepts, the terminology used is that of Nvidia and may differ from that used for AMD GPUs.
\section{Equation learning}
Equation learning is a field of research that can be used for understanding and discovering equations from a set of data from various fields like mathematics and physics. Data is usually much more abundant while models often are elusive which is demonstrated by \textcite{guillemot_climate_2022} where they explain how validating the models against large amounts of data is a big part in creating such models. Because of this effort, generating equations with a computer can more easily lead to discovering equations that describe the observed data. In one instance \textcite{werner_informed_2021} described that they want to find an expression to predict the power loss of an electric machine based on known input values. They used four inputs, direct and quadratic current as well as temperature and motor speed, and they have an observed output which is the power loss. With the help of an equation learner, they were able to generate useful results.
Equation learning is a field of research that can be used for understanding and discovering equations from a set of data from various fields like mathematics and physics. Data is usually much more abundant while models often are elusive which is demonstrated by \textcite{guillemot_climate_2022} where they explain how validating the models against large amounts of data is a big part in creating such models. Because of this effort, generating equations with a computer can more easily lead to discovering equations that describe the observed data. \textcite{brunton_discovering_2016} describe an algorithm that leverages equation learning to discover equations for physical systems. A more literal interpretation of equation learning is demonstrated by \textcite{pfahler_semantic_2020}. They use machine learning to learn the form of equations. Their aim was to simplify the discovery of relevant publications by the equations they use and not by technical terms, as they may differ by the field of research. However, this kind of equation learning is not relevant for this thesis.
A more literal interpretation of equation learning is demonstrated by \textcite{pfahler_semantic_2020}. They use machine learning to learn the form of equations to simplify the discovery of relevant publications. Instead of searching for keywords which might differ from one field of research to another, they allow searching by the equations the publications use. This helps as the form of equations stay the same over different fields and are therefore not subject to specific terminology. However, this form of equation learning is not relevant for this thesis.
Symbolic regression is a subset of equation learning, that specialises more towards discovering mathematical equations. A lot of research is done in this field. Using genetic programming (GP) for different problems, including symbolic regression, was first described by \textcite{koza_genetic_1994}. He described that finding a computer program to solve a problem for a given input and output, can be done by traversing the search space of all solutions. This fits well for the goal of symbolic regression, where a mathematical expression needs to be found to describe a problem with specific inputs and outputs. Later, \textcite{koza_human-competitive_2010} provided an overview of results that were generated with the help of GP and were competitive with human solutions, showing how symbolic regression is a useful tool. In their book Symbolic Regression, \textcite{kronberger_symbolic_2024} show how symbolic regression can be applied for real world scenarios. They also describe symbolic regression in great detail, while being tailored towards beginners and experts alike.
Symbolic regression is a subset of equation learning, that specialises more towards discovering mathematical equations. A lot of research is done in this field. Using the evolutionary algorithm genetic programming (GP) for different problems, including symbolic regression, was first popularised by \textcite{koza_genetic_1994}. He described that finding a computer program to solve a problem for a given input and output, can be done by traversing the search space of relevant solutions. This fits well for the goal of symbolic regression, where a mathematical expression needs to be found to describe a problem with specific inputs and outputs. Later, \textcite{koza_human-competitive_2010} provided an overview of results that were generated with the help of GP and were competitive with human solutions, showing how symbolic regression is a useful tool. In their book Symbolic Regression, \textcite{kronberger_symbolic_2024} show how symbolic regression can be applied for real world scenarios. One of these scenarios is finding simpler but still accurate models for hydrodynamic simulations to speed up the design process of ship hulls. Another one is finding an expression to find the remaining capacity of a Lithium-ion battery by measuring its voltage. In total, they described ten scenarios from different domains to show the capabilities of symbolic regression.
\textcite{keijzer_scaled_2004} and \textcite{korns_accuracy_2011} presented ways of improving the quality of symbolic regression algorithms, making symbolic regression more feasible for problem-solving. \textcite{bartlett_exhaustive_2024} describe an exhaustive approach for symbolic regression which can find the true optimum for perfectly optimised parameters while retaining simple and interpretable results. Alternatives to GP for symbolic regression also exist with one proposed by \textcite{jin_bayesian_2020}. Their approach increased the quality of the results noticeably compared to GP alternatives. Another alternative to heuristics like GP is the usage of neural networks. One such alternative has been introduced by \textcite{martius_extrapolation_2016} where they used a neural network for their equation learner with mixed results. Later, an extension has been provided by \textcite{sahoo_learning_2018}. They introduced the division operator, which led to much better results. Further improvements have been described by \textcite{werner_informed_2021} with their informed equation learner. By incorporating domain expert knowledge they could limit the search space and find better solutions for particular domains. One drawback of these three implementations is the fact that their neural networks are fixed. An equation learner which can change the network at runtime and therefore evolve over time is proposed by \textcite{dong_evolving_2024}. Their approach further improved the results of neural network equation learners. In their work, \textcite{lemos_rediscovering_2022} also used a neural network for symbolic regression. They were able to find an equivalent to Newton's law of gravitation and rediscovered Newton's second and third law only with trajectory data of bodies of our solar system. Although these laws were already known, this research has shown how neural networks and machine learning in general have great potential. An implementation for an equation learner in the physics domain is proposed by \textcite{sun_symbolic_2023}. Their algorithm was specifically designed for nonlinear dynamics often occurring in physical systems. When compared to other implementations their equation learner was able to create better results but has the main drawback of high computational cost. As seen by these publications, increasing the quality of generated equations and also increasing the speed of finding these equations is a central part in symbolic regression and equation learning in general.
\textcite{keijzer_scaled_2004}, \textcite{gustafson_improving_2005}, \textcite{korns_accuracy_2011}, \textcite{korns_extremely_2015}, \textcite{bruneton_enhancing_2025} and many more presented ways of improving the quality of symbolic regression algorithms, making symbolic regression more feasible for problem-solving. \textcite{bartlett_exhaustive_2024} describe an exhaustive approach for symbolic regression which can find the true optimum for perfectly optimised parameters while retaining simple and interpretable results.
Alternatives to GP for symbolic regression also exist with for example Bayesian Symbolic Regression as proposed by \textcite{jin_bayesian_2020}. Their approach increased the quality of the results noticeably compared to GP alternatives by for example incorporating prior knowledge. In order to avoid overfitting, \textcite{bomarito_bayesian_2022} have proposed a way of using Bayesian model selection to combat overfitting and reduce the complexity of the generated expressions. This also helps with making the expressions more generalisable and therefore be applicable to unseen inputs.
Another alternative to meta-heuristics like GP is the usage of neural networks. One such alternative has been introduced by \textcite{martius_extrapolation_2016} where they used a neural network for their equation learner with mixed results. Later, an extension has been provided by \textcite{sahoo_learning_2018}. They introduced the division operator, which led to much better results. Further improvements have been described by \textcite{werner_informed_2021} with their informed equation learner. By incorporating domain expert knowledge they could limit the search space and find better solutions for particular domains. One drawback of these three implementations is the fact that their neural networks are fixed. An equation learner which can change the network at runtime and therefore evolve over time is proposed by \textcite{dong_evolving_2024}. Their approach further improved the results of neural network equation learners. In their work, \textcite{lemos_rediscovering_2022} also used a neural network for symbolic regression. They were able to find an equivalent to Newton's law of gravitation and rediscovered Newton's second and third law only with trajectory data of bodies of our solar system. Although these laws were already known, this research has shown how neural networks and machine learning in general have great potential.
An implementation for an equation learner in the physics domain is proposed by \textcite{brunton_discovering_2016}. Their algorithm was specifically designed for nonlinear dynamics often occurring in physical systems. An improvement to this approach was introduced by \textcite{sun_symbolic_2023} where they used Monte Carlo tree search. When compared to other implementations their equation learner was able to create better results but has the main drawback of high computational cost.
% As seen by these publications, increasing the quality of generated equations and also increasing the speed of finding these equations is a central part in symbolic regression and equation learning in general.
% A survey conducted by \textcite{dabhi_survey_2012} shows how overfitting is not desirable and why more generalisable solutions are preferred.
To generate an equation, first the operators need to be defined that make up the equation. It is also possible to define a maximum length for an expression as proposed by \textcite{koza_genetic_1994}. Expressions also consist of constants as well as variables which represent the inputs. Assuming that a given problem has two variables and one parameter, the equation learner could generate an expression as seen in Equation \ref{eq:example} where $x_n$ are the variables, $p_1$ is the parameter and $O$ is the output which should correspond to the observed output for the given variables.
As described earlier, the goal of equation learning is to find an expression that fits a given set of data. The data usually consists of a set of inputs that have been applied to the unknown expression and the output after the input has been applied. An example for such data is described by \textcite{werner_informed_2021}. In one instance they want to find the power loss formula for an electric machine. They used four inputs, direct and quadratic current as well as temperature and motor speed, and they have an observed output which is the power loss. Now for an arbitrary problem with different input and outputs, the equation learner tries to find an expression that fits this data \parencite{koza_genetic_1994}. Fitting in this context means that when the input is applied to the expression, the result will be the same as the observed output. In order to avoid overfitting \textcite{bomarito_bayesian_2022} have proposed a way of using Bayesian model selection to combat overfitting and reduce the complexity of the generated expressions. This also helps with making the expressions more generalisable and therefore be applicable to unseen inputs. A survey conducted by \textcite{dabhi_survey_2012} shows how overfitting is not desirable and why more generalisable solutions are preferred. To generate an equation, first the operators need to be defined that make up the equation. It is also possible to define a maximum length for an expression as proposed by \textcite{bartlett_exhaustive_2024}. Expressions also consist of constants as well as variables which represent the inputs. Assuming that a given problem has three variables, the equation learner could generate an expression as seen in \ref{eq:example} where $x_n$ are the variables and $O$ is the output which should correspond to the observed output for the given variables.
\begin{equation} \label{eq:example}
O = 5 - \text{abs}(x_1) + x_2 \, \sqrt{p_1} / 10
O = 5 - \text{abs}(x_1) * \text{sqrt}(x_2) / 10 + 2 \char`^ x_3
\end{equation}
A typical equation learner generates multiple expressions at once. If for example the equation learner generates $300$ expressions per GP generation, each of these expressions needs to be evaluated at least once to determine how well they can produce the desired output. Each expression lies in a different part of the search space and with only the variables, it would not easily be possible to explore the surrounding search space. To perform for example local search in this area, the parameter $p_1$ can be used. This local search phase helps to find the local or even global optimum. For example $50$ local search steps can be used, meaning that each expression needs to be evaluated $50$ times with the same variables, but different parameters. As a result, one GP generation consequently requires a total $300 * 50 = 15\,000$ evaluations of the expressions. However, typically more than one GP generation is needed to find a good local optimum. While the exact number of generations is problem specific, for this example a total of $100$ generations can be assumed. Each generation again generates $300$ expressions and needs to perform $50$ local search steps. This results in a total of $300 * 50 * 100 = 1\,500\,000$ evaluations which need to be performed during the entire runtime of the GP algorithm. These values have been taken from the equation learner for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their equation learner converged after 54 generations, resulting in $300 * 50 * 54 \approx 800\,000$ evaluations. Depending on the complexity of the generated expressions, performing all of these evaluations takes up a lot of the runtime. Their results took over two days to compute on an eight core desktop CPU. While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems were in the range of a few seconds and up to a day. Especially the problems that took several hours to days to finish show, that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine, if using GPUs can yield noticeable better performance.
A typical equation learner generates multiple expressions at once. If the equation learner generates $300$ expressions and each expression needs to be evaluated $50$ times to get the best parametrisation for each of these expressions, the total number of evaluations is $300 * 50 = 15\,000$. However, it is likely that multiple runs or generations in the context of GP need to be performed. The number of generations is dependent to the problem, but assuming a maximum of $100$ generations, the total number of evaluations is equal to $300 * 50 * 100 = 1\,500\,000$. These values have been taken from the equation learner for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their equation learner converged after 54 generations, resulting in evaluating $800\,000$ expressions. Depending on the complexity of the generated expressions, performing all of these evaluations takes up a lot of the runtime. Their results took over two days on an eight core desktop CPU. While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems were in the range of a few seconds and up to a day. Especially the problems that took several hours to days to finish show, that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine, if using Graphics cards can yield noticeable better performance or not, which is the goal of this thesis.
\section[GPGPU]{General Purpose Computation on Graphics Processing Units}
\label{sec:gpgpu}
Graphics cards (GPUs) are commonly used to increase the performance of many different applications. Originally they were designed to improve performance and visual quality in games. \textcite{dokken_gpu_2005} first described the usage of GPUs for general purpose programming (GPGPU). They have shown how the graphics pipeline can be used for GPGPU programming. Because this approach also requires the programmer to understand the graphics terminology, this was not a great solution. Therefore, Nvidia released CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}} in 2007 with the goal of allowing developers to program GPUs independent of the graphics pipeline and terminology. A study of the programmability of GPUs with CUDA and the resulting performance has been conducted by \textcite{huang_gpu_2008}. They found that GPGPU programming has potential, even for non-embarassingly parallel problems.
Graphics cards (GPUs) are commonly used to increase the performance of many different applications. Originally they were designed to improve performance and visual quality in games. \textcite{dokken_gpu_2005} first described the usage of GPUs for general purpose programming (GPGPU). They have shown how the graphics pipeline can be used for GPGPU programming. Because this approach also requires the programmer to understand the graphics terminology, this was not a great solution. Therefore, Nvidia released CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}} in 2007 with the goal of allowing developers to program GPUs independent of the graphics pipeline and terminology. A study of the programmability of GPUs with CUDA and the resulting performance has been conducted by \textcite{huang_gpu_2008}. They found that GPGPU programming has potential, even for non-embarassingly parallel problems. Research is also done in making the low level CUDA development simpler. \textcite{han_hicuda_2011} have described a directive-based language to make development simpler and less error-prone, while retaining the performance of handwritten code. To drastically simplify CUDA development, \textcite{besard_effective_2019} showed that it is possible to develop with CUDA in the high level programming language Julia\footnote{\url{https://julialang.org/}} with similar performance to CUDA written in C. In a subsequent study \textcite{lin_comparing_2021} found, that high performance computing (HPC) on the CPU and GPU in Julia performs similar to HPC development in C. This means that Julia can be a viable alternative to Fortran, C and C++ in the HPC field. Additional Julia has the benefit of developer comfort since it is a high level language with modern features such as a garbage-collector. \textcite{besard_rapid_2019} have also shown how the combination of Julia and CUDA help in rapidly developing HPC software. While this thesis in general revolves around CUDA, there also exist alternatives by AMD called ROCm\footnote{\url{https://www.amd.com/de/products/software/rocm.html}} and a vendor independent alternative called OpenCL\footnote{\url{https://www.khronos.org/opencl/}}. If not specified otherwise, the following section and its subsections use the information presented by \textcite{nvidia_cuda_2025} in their CUDA programming guide.
Research is also done in making the low level CUDA development simpler. \textcite{han_hicuda_2011} have described a directive-based language to make development simpler and less error-prone, while retaining the performance of handwritten code. To drastically simplify CUDA development, \textcite{besard_effective_2019} showed that it is possible to develop with CUDA in the high level programming language Julia\footnote{\url{https://julialang.org/}} with similar performance to CUDA written in C. In a subsequent study \textcite{lin_comparing_2021} found, that high performance computing (HPC) on the CPU and GPU in Julia performs similar to HPC development in C. This means that Julia can be a viable alternative to Fortran, C and C++ in the HPC field. Additional Julia has the benefit of developer comfort since it is a high level language with modern features such as a garbage-collector. \textcite{besard_rapid_2019} have also shown how the combination of Julia and CUDA help in rapidly developing HPC software. While this thesis in general revolves around CUDA, there also exist alternatives by AMD called ROCm\footnote{\url{https://www.amd.com/de/products/software/rocm.html}} and a vendor independent alternative called OpenCL\footnote{\url{https://www.khronos.org/opencl/}}.
If not specified otherwise, the following section and its subsections use the information presented by \textcite{nvidia_cuda_2025} in their CUDA programming guide. While in the early days of GPGPU programming a lot of research has been done to assess if this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs have been used early to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 2 for the most compute intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations.
Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022} which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost.
Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation.
Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}.
Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}.
However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, so it is important to consider if it is worth using GPUs for specific tasks.
While in the early days of GPGPU programming a lot of research has been done to assess if this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs have been used early to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 2 for the most compute intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022} which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, so it is important to consider if it is worth using GPUs for specific tasks.
\subsection{Programming GPUs}
The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. To demonstrate how a modern CPU works \textcite{knuth_mmix_1999} introduced the MMIX architecture. It is a 64-bit CPU architecture containing many concepts and design decisions to compete with other CPUs on the market at that time. He provides the information in great detail and demonstrates the complexity of CPU architectures. Current CPUs are even more complex, and often contain features like sophisticated branch prediction among other things to achieve higher and higher performance. This makes a CPU perfect for handling complex control flows on a single program thread and even multiple threads simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a single CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU designates much more transistors towards floating-point computations. This, however, results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.
The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. To demonstrate the complexity of a simple one core 8-bit CPU \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of one CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flows on a single program strand and on modern CPUs even multiple strands simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count a single GPU core has to be much simpler than one CPU core. As described by \textcite{nvidia_cuda_2025} a GPU designates much more transistors towards floating-point computations. This results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.
\begin{figure}
\centering
@ -62,7 +38,7 @@ Despite these drawbacks, the sheer number of cores, makes a GPU a valid choice w
\subsubsection{Thread Hierarchy and Tuning}
\label{sec:thread_hierarchy}
The thousands of cores on a GPU, as well as the threads created by the developer, are grouped together in several categories. This is the so-called thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm.
The thousands of cores on a GPU, also called threads, are grouped together in several categories. This is the Thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm.
At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a hardware unit responsible for scheduling and executing threads and also contains the registers used by these threads. An SM is always executing a group of 32 threads simultaneously, and this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped in a block, with one block typically containing a maximum of $1024$ threads but is often configured to be less. Therefore, if more than $1024$ threads are required, more blocks must be created. Blocks can also be grouped into thread block clusters which is optional, but can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that the blocks can be scheduled independently, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block, and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they shared memory.
@ -75,7 +51,7 @@ At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a
A piece of code that is executed on a GPU is written as a kernel which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads and blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in Section \ref{sec:memory_model}. Considering the case where an image needs to be blurred, it not only simplifies the development if threads are arranged in a 2D grid, it also helps with optimising memory access. As the threads in a block, need to access a lot of the same data, this data can be loaded in the shared memory of the block. This allows the data to be accessed much quicker compared to when threads are allocated in only one dimension. With one dimensional blocks it is possible that threads assigned to nearby pixels, are part of a different block, leading to a lot of duplicate data transfer. The size in each dimension of a block can be almost arbitrary within the maximum allowed number of threads. However, blocks that are too large might lead to other problems which are described in more detail in Section \ref{sec:occupancy}.
Once a kernel is dispatched, all threads start at the same point in a program. However, because a thread may encounter instructions, such as branches, where it can take a different path to the other threads, or in other words diverge, each thread has a unique instruction pointer. This allows threads to work independently, even if they are part of the same warp. However, because of the SIMD architecture, all threads in a warp must execute the same instructions and if threads start to diverge, the SM must pause threads with different instructions and execute them later. Figure \ref{fig:thread_divergence} shows how such divergences can impact performance. The situation described in the figure also shows, that the thread could re-converge after the divergence. On older hardware this does not happen and results in T2 being executed after T1 and T3 have finished. In situations where there is a lot of data dependent thread divergence, most of the benefits of using a GPU are likely to be lost. Threads not executing the same instruction is strictly speaking against the SIMD principle, but can happen in reality, due to data dependent branching. Consequently, this leads to poor resource utilisation, which in turn leads to poor performance. Another way in which threads can be paused (inactive threads) is the fact that sometimes, the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads but only the threads with work are executed.
All threads in a warp start at the same point in a program, but with their own instruction address, allowing them to work independently. Because of the SIMD architecture, all threads in a warp must execute the same instructions and if threads start diverging, the SM must pause threads with different instructions and execute them later. Figure \ref{fig:thread_divergence} shows how such divergences can impact performance. The situation described by the figure also shows, that after the divergence the thread could re-converge. On older hardware this does not happen and leads to T2 being executed after T1 and T3 are finished. In situations where a lot of data dependent thread divergence happens, most of the benefits of using a GPU likely have vanished. Threads not executing the same instruction is strictly speaking against the SIMD principle but can happen in reality, due to data dependent branching. Consequently, this leads to bad resource utilisation, which in turn leads to worse performance. Another possibility of threads being paused (inactive threads) is the fact that sometimes, the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads but only the threads with work are executed.
\begin{figure}
\centering
@ -85,9 +61,7 @@ Once a kernel is dispatched, all threads start at the same point in a program. H
\end{figure}
Modern GPUs implement what is known as the Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can design fast, correct and accurate programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads after data-dependent divergence has occurred. SIMT can also help with increasing the occupancy of the GPU. Occupancy and its importance to performance is discussed in detail in Section \ref{sec:occupancy}.
A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only partial active threads. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation is proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers.
Modern GPUs implement the so called Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can develop fast and correct programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads after data dependent divergence occurred. A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only partial active threads. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation is proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers.
\subsubsection{Memory Model}
\label{sec:memory_model}
@ -104,7 +78,7 @@ On a GPU there are two parts that contribute to the performance of an algorithm.
\label{fig:gpu_memory_layout}
\end{figure}
On a GPU there are multiple levels and kinds of memory available. All these levels and kinds have different purposes they are optimised for. This means that it is important to know what they are and how they can be best used for specific tasks. On the lowest level, threads have registers and local memory available. Registers are the fastest way to access memory, but they are also the least abundant memory with up to a maximum of 255 32-Bit registers per thread on Nvidia GPUs and 256 on AMD GPUs \parencite{amd_hardware_2025}. However, using all registers of a thread can lead to other problems which are described in more detail in Section \ref{sec:occupancy}. In contrast to registers, local memory is significantly slower. This is due to the fact, that local memory is actually stored in global memory and therefore has the same limitations as explained later. This means that it is important to try and avoid local memory as much as possible. Local memory is usually only used when a thread is using too many registers. The compiler will then spill the remaining data into local memory and load it into registers once needed, slowing down the application drastically.
On a GPU there are multiple levels and kinds of memory available. All these levels and kinds have different purposes they are optimised for. This means that it is important to know what they are and how they can be best used for specific tasks. On the lowest level threads have registers and local memory available. Registers is the fastest way to access memory but is also the least abundant memory with up to a maximum of 255 32-Bit registers per thread on Nvidia GPUs and 256 on AMD GPUs \parencite{amd_hardware_2025}. However, using all registers of a thread can lead to other problems which are described in more detail in Section \ref{sec:occupancy}. On the other side, the thread local memory is significantly slower than registers. This is due to the fact, that local memory is actually stored in global memory and therefore has the same limitations which are explained later. This means it is important to try and avoid local memory as much as possible. Local memory is usually only used when a thread uses too many registers. The compiler will then spill the remaining data into local memory and loads it into registers once needed, drastically slowing down the application.
Shared memory is the next tier of memory on a GPU. Unlike local memory and registers, shared memory is shared between all threads inside a block. The amount of shared memory is depending on the GPU architecture but for Nvidia it hovers at around 100 Kilobyte (KB) per block. While this memory is slower than registers, its primary use-case is communicating and sharing data between threads in a block. If all threads in a block access a lot of overlapping data this data can be loaded from global memory into faster shared memory once. It can then be accessed multiple times, further increasing performance. Loading data into shared memory and accessing that data has to be done manually. Because shared memory is part of the unified data cache, it can either be used as a cache or for manual use, meaning a developer can allocate more shared memory towards caching if needed. Another feature of shared memory are the so-called memory banks. Shared memory is always split into 32 equally sized memory modules also called memory banks. All available memory addresses lie in one of these banks. This means if two threads access two memory addresses which lie in different banks, the access can be performed simultaneously, increasing the throughput.
@ -141,13 +115,11 @@ Occupancy describes the utilisation of a GPU. A high occupancy means, that there
\label{tab:compute_capabilities}
\end{table}
When starting a kernel, the most important configuration is the number of threads and thread blocks that need to be started. This is important, as this has other effects on occupancy as well. In Table \ref{tab:compute_capabilities} the most notable limitations are presented that can affect occupancy. These limitations need to be considered when choosing a kernel configuration. It is important to note, that depending on the GPU and problem, the occupancy tuning might differ, and the same approach might perform well on one GPU but perform poorly on another GPU. Therefore, the things discussed here are only guidelines.
Tools like Nvidia Nsight Compute\footnote{\url{https://developer.nvidia.com/nsight-compute}} and Nsight Systems\footnote{\url{https://developer.nvidia.com/nsight-systems}} are essential for performance tuning. Nsight compute also contains an occupancy calculator which takes a kernel and computes how the configuration performs in terms of occupancy and also lets the developer try out different configurations \parencite{nvidia_nsight_2025}.
When starting a kernel, the most important configuration is the number of threads and thread blocks that need to be started. This is important, as this has other effects on occupancy as well. In table \ref{tab:compute_capabilities} the most notable limitations are presented that can affect occupancy. These limitations need to be considered when choosing a kernel configuration. It is important to note, that depending on the GPU and problem, the occupancy tuning might differ, and the same approach might perform well on one GPU but perform poorly on another GPU. Therefore, the things discussed here are only guidelines. Tools like Nvidia Nsight Compute\footnote{\url{https://developer.nvidia.com/nsight-compute}} and Nsight Systems\footnote{\url{https://developer.nvidia.com/nsight-systems}} are essential for performance tuning. Nsight compute also contains an occupancy calculator which takes a kernel and computes how the configuration performs in terms of occupancy and also lets the developer try out different configurations \parencite{nvidia_nsight_2025}.
In general, it is important to have as many warps as possible ready for execution. While this means that a lot of warps could be executed but are not, this is actually desired. A key feature of GPUs is so-called latency hiding, meaning that while a warp waits for data to be retrieved for example, another warp ready for execution can now be run. With low occupancy, and therefore little to no warps waiting for execution, latency hiding does not work, as now the hardware is idle. As a result, the runtime increases which also explains why high occupancy is not guaranteed to result in performance improvements while low occupancy can and often will increase the runtime.
As seen in Table \ref{tab:compute_capabilities}, there exist different limitations that can impact occupancy. The number of warps per SM is important, as this means this is the degree of parallelism achievable per SM. If due to other limitations, the number of warps per SM is below the maximum, there is idle hardware. One such limitation is the number of registers per block and SM. In the case of compute capability 8.9, one SM can handle $32 * 48 = 1\,536$ threads. This leaves $64\,000 / 1\,536 \approx 41$ registers per thread, which is lower than the theoretical maximum of $255$ registers per thread. Typically, one register is mapped to one variable in the kernel code, meaning a developer can use up to 41 variables in their code. However, if the variable needs 64 bits to store its value, the register usage doubles, as all registers on a GPU are 32-bit. On a GPU with compute capability 10.x a developer can use up to $64\,000 / 2\,048 \approx 31$ registers. Of course a developer can use more registers, but this results in less occupancy. However, depending on the algorithm using more registers might be more beneficial to performance than the lower occupancy, in which case occupancy is not as important. If a developer needs more than $255$ registers for their variables the additional variables will spill into local memory which is, as described in Section \ref{sec:memory_model}, not desirable.
As seen in table \ref{tab:compute_capabilities}, there exist different limitations that can impact occupancy. The number of warps per SM is important, as this means this is the degree of parallelism achievable per SM. If due to other limitations, the number of warps per SM is below the maximum, there is idle hardware. One such limitation is the number of registers per block and SM. In the case of compute capability 8.9, one SM can handle $32 * 48 = 1\,536$ threads. This leaves $64\,000 / 1\,536 \approx 41$ registers per thread, which is lower than the theoretical maximum of $255$ registers per thread. Typically, one register is mapped to one variable in the kernel code, meaning a developer can use up to 41 variables in their code. However, if the variable needs 64 bits to store its value, the register usage doubles, as all registers on a GPU are 32-bit. On a GPU with compute capability 10.x a developer can use up to $64\,000 / 2\,048 \approx 31$ registers. Of course a developer can use more registers, but this results in less occupancy. However, depending on the algorithm using more registers might be more beneficial to performance than the lower occupancy, in which case occupancy is not as important. If a developer needs more than $255$ registers for their variables the additional variables will spill into local memory which is, as described in Section \ref{sec:memory_model}, not desirable.
Additionally, shared memory consumption can also impact the occupancy. If for example a block needs all the available shared memory, which is almost the same as the amount of shared memory per SM, this SM can only serve this block. On compute capability 10.x, this would mean that occupancy would be at maximum $50\%$ as a block can have up to $1\,024$ threads while an SM supports up to $2\,048$ threads. Again, in such cases it needs to be determined, if the performance gain of using this much shared memory is worth the lower occupancy.
@ -160,11 +132,11 @@ While in most cases a GPU can be programmed in a higher level language like C++
PTX defines a virtual machine with an own instruction set architecture (ISA) and is designed for data-parallel processing on a GPU. It is an abstraction of the underlying hardware instruction set, allowing PTX code to be portable across Nvidia GPUs. In order for PTX code to be usable for the GPU, the driver is responsible for compiling the code to the hardware instruction set of the GPU it is run on. A developer typically writes a kernel in CUDA using C++, for example, and the Nvidia compiler generates the PTX code for that kernel. This PTX code is then compiled by the driver once it is executed. The concepts for programming the GPU with PTX and CUDA are the same, apart from the terminology which is slightly different. For consistency, the CUDA terminology will continue to be used.
Syntactically, PTX is similar to assembler style code. Every PTX code must have a \verb|.version| directive which indicates the PTX version and an optional \verb|.target| directive which indicates the compute capability. If the program works in 64 bit addresses, the optional \verb|.address_size| directive can be used to indicate that, which simplifies the code for such applications. After these directives, the actual code is written. As each PTX code needs an entry point (the kernel) the \verb|.entry| directive indicates the name of the kernel and the parameters needed. It is also possible to write helper functions with the \verb|.func| directive. Inside the kernel or a helper function, normal PTX code can be written. Because PTX is very low level, it assumes an underlying register machine, therefore a developer needs to think about register management. This includes loading data from global or shared memory into registers if needed. Code for manipulating data like addition and subtraction generally follow the structure \verb|operation.datatype| followed by up to four parameters for that operation. For adding two FP32 values together and storing them in the register \%n, the code looks like the following:
Syntactically PTX resembles Assembly style code. Every PTX code must have a \verb|.version| directive which indicates the PTX version and an optional \verb|.target| directive which indicates the compute capability. If the program works in 64 bit addresses, the optional \verb|.address_size| directive can be used to indicate that, which simplifies the code for such applications. After these directives, the actual code is written. As each PTX code needs an entry point (the kernel) the \verb|.entry| directive indicates the name of the kernel and the parameters needed. It is also possible to write helper functions with the \verb|.func| directive. Inside the kernel or a helper function, normal PTX code can be written. Because PTX is very low level, it assumes an underlying register machine, therefore a developer needs to think about register management. This includes loading data from global or shared memory into registers if needed. Code for manipulating data like addition and subtraction generally follow the structure \verb|operation.datatype| followed by up to four parameters for that operation. For adding two FP32 values together and storing them in the register \%n, the code looks like the following:
\begin{GenericCode}[numbers=none]
add.f32 \%n, 0.1, 0.2;
\end{GenericCode}
Loops in the classical sense do not exist in PTX. Instead, a developer needs to define jump targets for the beginning and end of the loop. The Program in \ref{code:ptx_loop} shows how a function with simple loop can be implemented. The loop counts down to zero from the passed parameter $N$ which is loaded into the register \%n in line 6. If the value in the register \%n reached zero the loop branches at line 9 to the jump target at line 12 and the loop has finished. All other used directives and further information on writing PTX code can be taken from the PTX documentation \parencite{nvidia_parallel_2025}.
Loops in the classical sense do not exist in PTX. Alternatively a developer needs to define jump targets for the beginning and end of the loop. The Program in \ref{code:ptx_loop} shows how a function with simple loop can be implemented. The loop counts down to zero from the passed parameter $N$ which is loaded into the register \%n in line 6. If the value in the register \%n reached zero the loop branches at line 9 to the jump target at line 12 and the loop has finished. All other used directives and further information on writing PTX code can be taken from the PTX documentation \parencite{nvidia_parallel_2025}.
\begin{program}
\begin{GenericCode}
@ -190,10 +162,9 @@ Done:
\label{sec:compilers}
Compilers are a necessary tool for many developers. If a developer wants to run their program it is very likely they need one. As best described by \textcite{aho_compilers_2006} in their dragon book, a compiler takes code written by a human in some source language and translates it into a destination language readable by a computer. This section briefly explores what compilers are and research done in this old field of computer science. Furthermore, the topics of transpilers and interpreters are explored, as their use-cases are very similar.
\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts, the analyser, also called frontend, and the synthesiser also called backend. The frontend is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. Generating code in the target language, from the intermediate representation is the job of the backend. This target code can be assembly or anything else that is needed for a specific use-case. This intermediate representation also makes it simple to swap out frontends or backends. The Gnu Compiler Collection \textcite{gcc_gcc_2025} takes advantage of using different frontends to provide support for many languages including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile code for virtual machines instead. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the low level virtual machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is the ability for one program to be run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are compiled into their respective bytecode. This bytecode can then be interpreted or compiled to physical machine code and then be run. According to the JVM specification \textcite{lindholm_java_2025} the Java bytecode is interpreted and also compiled with a just-in-time (JIT) compiler to increase the performance of code blocks that are often executed. On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023} the CLR always compiles the bytecode to physical machine code using a JIT compiler before it is executed.
\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts, the analyser, also called frontend, and the synthesiser also called backend. The front end is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. Generating code in the target language, from the intermediate representation is the job of the backend. This target code can be assembly or anything else that is needed for a specific use-case. This intermediate representation also makes it simple to swap out frontends or backends. The Gnu Compiler Collection \textcite{gcc_gcc_2025} takes advantage of using different frontends to provide support for many languages including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile code for virtual machines instead. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the low level virtual machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is the ability for one program to be run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are compiled into their respective bytecode. This bytecode can then be interpreted or compiled to physical machine code and then be run. According to the JVM specification \textcite{lindholm_java_2025} the Java bytecode is interpreted and also compiled with a just-in-time (JIT) compiler to increase the performance of code blocks that are often executed. On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023} the CLR always compiles the bytecode to physical machine code using a JIT compiler before it is executed.
% Is not relevant to thesis!!!!!
% A grammar describes how a language is structured. It not only describes the structure of natural language, but it can also be used to describe the structure of a programming language. \textcite{chomsky_certain_1959} found that grammars can be grouped into four levels, with regular and context-free grammars being the most relevant for programming languages. A regular grammar is of the structure $A = a\,|\,a\,B$ which is called a rule. The symbols $A$ and $B$ are non-terminal symbols and $a$ is a terminal symbol. A non-terminal symbol stands for another rule with the same structure and must only occur after a terminal symbol. Terminal symbols are fixed symbols or a value that can be found in the input stream, like literals in programming languages. Context-free grammars are more complex and are of the structure $A = \beta$. In this context $\beta$ stands for any combination of terminal and non-terminal symbols. Therefore, a rule like $A = a\,| a\,B\,a$ is allowed with this grammar level. This shows that with context-free grammars enclosing structures are possible. To write grammars for programming languages, other properties are also important to efficiently validate or parse some input to be defined by this grammar. However, these are not discussed here, but are described by \textcite{aho_compilers_2006}. They also described that generating a parser out of a grammar can be automated. This automation can be performed by parser generators like Yacc \parencite{johnson_yacc_1975} as described in their book. More modern alternatives are Bison\footnote{\url{https://www.gnu.org/software/bison/}} or Antlr\footnote{\url{https://www.antlr.org/}}. Before the parser can validate the input stream, a scanner is needed as described by \textcite{cooper_engineering_2022}. The scanner reads every character of the input stream and is responsible for removing white-spaces and ensures only valid characters and words are present. Flex \footnote{\url{https://github.com/westes/flex}} is a tool that allows generating a scanner and is often used in combination with Bison. A simplified version of the compiler architecture using Flex and Bison is depicted in Figure \ref{fig:compiler_layout}. It shows how source code is taken and transformed into the intermediate representation by the frontend, and how it is converted into executable machine code by the backend.
A grammar describes how a language is structured. It not only describes the structure of natural language, but it can also be used to describe the structure of a programming language. \textcite{chomsky_certain_1959} found that grammars can be grouped into four levels, with regular and context-free grammars being the most relevant for programming languages. A regular grammar is of the structure $A = a\,|\,a\,B$ which is called a rule. The symbols $A$ and $B$ are non-terminal symbols and $a$ is a terminal symbol. A non-terminal symbol stands for another rule with the same structure and must only occur after a terminal symbol. Terminal symbols are fixed symbols or a value that can be found in the input stream, like literals in programming languages. Context-free grammars are more complex and are of the structure $A = \beta$. In this context $\beta$ stands for any combination of terminal and non-terminal symbols. Therefore, a rule like $A = a\,| a\,B\,a$ is allowed with this grammar level. This shows that with context-free grammars enclosing structures are possible. To write grammars for programming languages, other properties are also important to efficiently validate or parse some input to be defined by this grammar. However, these are not discussed here, but are described by \textcite{aho_compilers_2006}. They also described that generating a parser out of a grammar can be automated. This automation can be performed by parser generators like Yacc \parencite{johnson_yacc_1975} as described in their book. More modern alternatives are Bison\footnote{\url{https://www.gnu.org/software/bison/}} or Antlr\footnote{\url{https://www.antlr.org/}}. Before the parser can validate the input stream, a scanner is needed as described by \textcite{cooper_engineering_2022}. The scanner reads every character of the input stream and is responsible for removing white-spaces and ensures only valid characters and words are present. Flex \footnote{\url{https://github.com/westes/flex}} is a tool that allows generating a scanner and is often used in combination with Bison. A simplified version of the compiler architecture using Flex and Bison is depicted in Figure \ref{fig:compiler_layout}. It shows how source code is taken and transformed into the intermediate representation by the frontend, and how it is converted into executable machine code by the backend.
\begin{figure}
\centering
@ -204,10 +175,12 @@ Compilers are a necessary tool for many developers. If a developer wants to run
% More references to JIT: https://dl.acm.org/doi/abs/10.1145/857076.857077
\subsection{Interpreters}
% What are interpreters; how they work; should mostly contain/reference gpu interpreters
Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but also Java, or more precise Java-Bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters can not only be used for interpreting programming languages. It is also possible for them to be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful are rule-based simulations. \textcite{koster_massively_2020} has shown how they implemented a GPU interpreter for such simulations. In addition with other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. While publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown, interpreted languages often trail behind in terms of performance compared to compiled languages, interpreters per se are not slow. And while they come with performance overhead as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks.
\subsection{Transpilers}
% talk about what transpilers are and how to implement them. If possible also gpu specific transpilation.
With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another and therefore the major difference between these use-cases is the backend. A popular transpiler example is the TypeScript transpiler, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples for transpilers are the C2Rust transpiler \parencite{ling_rust_2022} that transpiles C code into Rust code as well as the PyJL transpiler \parencite{marcelino_transpiling_2022} which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example for a transpiler has been developed by \textcite{intel_mcs86_1978} where they built a transpiler for transforming assembly code for their 8080 CPU to assembly code for their 8086 CPU. Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. There also exists a transpiler that transforms CUDA code into highly parallel CPU code. \textcite{moses_high-performance_2023} described this transpiler, and they found that the generated code performs noticeably better than doing this transformation by hand. When designing complex processors and accelerators, Register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. The compared their results with a CPU implementation running on 80 CPUs, where they found that the transpiled CUDA version was 40 times faster. Using transpilers for software backend and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in a multi-programming-language environment that share some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use.
With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another and therefore the major difference between these use-cases is the backend. A popular transpiler example is TypeScript, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples for transpilers are the C2Rust transpiler \parencite{ling_rust_2022} that transpiles C code into Rust code as well as the PyJL transpiler \parencite{marcelino_transpiling_2022} which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example for a transpiler has been developed by \textcite{intel_mcs86_1978} where they built a transpiler for transforming assembly code for their 8080 CPU to assembly code for their 8086 CPU. Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. There also exists a transpiler that transforms CUDA code into highly parallel CPU code. \textcite{moses_high-performance_2023} described this transpiler, and they found that the generated code performs noticeably better than doing this transformation by hand. When designing complex processors and accelerators, Register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. The compared their results with a CPU implementation running on 80 CPUs, where they found that the transpiled CUDA version was 40 times faster. Using transpilers for software backend and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in a multi-programming-language environment that share some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use.
\subsection{Interpreters}
% What are interpreters; how they work; should mostly contain/reference gpu interpreters
Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but also Java, or more precise Java-Bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters can not only be used for interpreting programming languages. It is also possible for them to be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful are rule-based simulations. \textcite{koster_massively_2020} has shown how they implemented a GPU interpreter for such simulations. In addition with other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. While publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown, interpreted languages often trail behind in terms of performance compared to compiled languages, interpreters per se are not slow. And while they come with performance overhead as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks.

Binary file not shown.

View File

@ -1164,61 +1164,3 @@
author = {Palacios, Jonathan and Triska, Josh},
date = {2011},
}
@inproceedings{gustafson_improving_2005,
title = {On improving genetic programming for symbolic regression},
volume = {1},
url = {https://ieeexplore.ieee.org/abstract/document/1554780},
doi = {10.1109/CEC.2005.1554780},
abstract = {This paper reports an improvement to genetic programming ({GP}) search for the symbolic regression domain, based on an analysis of dissimilarity and mating. {GP} search is generally difficult to characterise for this domain, preventing well motivated algorithmic improvements. We first examine the ability of various solutions to contribute to the search process. Further analysis highlights the numerous solutions produced during search with no change to solution quality. A simple algorithmic enhancement is made that reduces these events and produces a statistically significant improvement in solution quality. We conclude by verifying the generalisability of these results on several other regression instances},
eventtitle = {2005 {IEEE} Congress on Evolutionary Computation},
pages = {912--919 Vol.1},
booktitle = {2005 {IEEE} Congress on Evolutionary Computation},
author = {Gustafson, S. and Burke, E.K. and Krasnogor, N.},
date = {2005-09},
keywords = {Computer science, Concrete, Diversity methods, Evolutionary computation, Genetic programming, Problem-solving},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\28ZEEUYG\\Gustafson et al. - 2005 - On improving genetic programming for symbolic regression.pdf:application/pdf},
}
@incollection{korns_extremely_2015,
location = {Cham},
title = {Extremely Accurate Symbolic Regression for Large Feature Problems},
isbn = {978-3-319-16030-6},
url = {https://doi.org/10.1007/978-3-319-16030-6_7},
abstract = {grammarnonlinear regressiongeneralized linear models ({GLM})basis functionmaximum binary {treeRegression} Query Language ({RQL})islandelitistconstraintextreme accuracystepwise regressionheuristicridge {regressionpolynomialAsKorns} Michael F.symbolic regression ({SR}) has advanced into the early stages of commercial exploitation, the poor accuracy of {SR}, still plaguing even the most advanced commercial packages, has become an issue for early adopters. Users expect to have the correct formula returned, especially in cases with zero noise and only one basis function with minimally complex grammar depth.},
pages = {109--131},
booktitle = {Genetic Programming Theory and Practice {XII}},
publisher = {Springer International Publishing},
author = {Korns, Michael F.},
editor = {Riolo, Rick and Worzel, William P. and Kotanchek, Mark},
date = {2015},
langid = {english},
}
@misc{bruneton_enhancing_2025,
title = {Enhancing Symbolic Regression with Quality-Diversity and Physics-Inspired Constraints},
url = {https://doi.org/10.48550/arXiv.2503.19043},
doi = {10.48550/arXiv.2503.19043},
abstract = {This paper presents {QDSR}, an advanced symbolic Regression ({SR}) system that integrates genetic programming ({GP}), a quality-diversity ({QD}) algorithm, and a dimensional analysis ({DA}) engine. Our method focuses on exact symbolic recovery of known expressions from datasets, with a particular emphasis on the Feynman-{AI} benchmark. On this widely used collection of 117 physics equations, {QDSR} achieves an exact recovery rate of 91.6{\textasciitilde}\${\textbackslash}\%\$, surpassing all previous {SR} methods by over 20 percentage points. Our method also exhibits strong robustness to noise. Beyond {QD} and {DA}, this high success rate results from a profitable trade-off between vocabulary expressiveness and search space size: we show that significantly expanding the vocabulary with precomputed meaningful variables (e.g., dimensionless combinations and well-chosen scalar products) often reduces equation complexity, ultimately leading to better performance. Ablation studies will also show that {QD} alone already outperforms the state-of-the-art. This suggests that a simple integration of {QD}, by projecting individuals onto a {QD} grid, can significantly boost performance in existing algorithms, without requiring major system overhauls.},
number = {{arXiv}:2503.19043},
publisher = {{arXiv}},
author = {Bruneton, J.-P.},
date = {2025-03-24},
keywords = {Computer Science - Neural and Evolutionary Computing, Computer Science - Symbolic Computation, Physics - Data Analysis, Statistics and Probability},
file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\9U346ZEV\\Bruneton - 2025 - Enhancing Symbolic Regression with Quality-Diversity and Physics-Inspired Constraints.pdf:application/pdf},
}
@incollection{knuth_mmix_1999,
location = {Berlin, Heidelberg},
title = {{MMIX}},
isbn = {978-3-540-46611-6},
url = {https://doi.org/10.1007/3-540-46611-8_2},
abstract = {Thirty-eight years have passed since the {MIX} computer was designed, and computer architecture has been converging during those years towards a rather different style of machine. Therefore it is time to replace {MIX} with a new computer that contains even less saturated fat than its predecessor.},
pages = {2--61},
booktitle = {{MMIXware}: A {RISC} Computer for the Third Millennium},
publisher = {Springer},
author = {Knuth, Donald E.},
editor = {Knuth, Donald E.},
date = {1999},
langid = {english},
}