concept and design: continued with architecture
parent d8f5454e9c
commit 8afc3a5e3b
75
other/component_diagram_interpreter.drawio
Normal file
@@ -0,0 +1,75 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.4">
  <diagram name="Page-1" id="R-oAYELteez0U9UgfQ2t">
    <mxGraphModel dx="1723" dy="956" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="GDUa8-GdCzSgoxu7vCdt-4" target="GDUa8-GdCzSgoxu7vCdt-12">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-4" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="500" y="280" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-8" value="Interpreter" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;width=90;height=40;" vertex="1" parent="1">
          <mxGeometry x="440" y="160" width="440" height="480" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-9" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=#000000;" vertex="1" parent="1">
          <mxGeometry x="270" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="GDUa8-GdCzSgoxu7vCdt-10" target="GDUa8-GdCzSgoxu7vCdt-4">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-15" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;&lt;b&gt;Input:&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Expressions&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Variable-Sets&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Parameters&lt;/font&gt;&lt;/div&gt;" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="GDUa8-GdCzSgoxu7vCdt-13">
          <mxGeometry x="-0.4633" relative="1" as="geometry">
            <mxPoint x="-33" as="offset" />
          </mxGeometry>
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-10" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=none;" vertex="1" parent="1">
          <mxGeometry x="270" y="280" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-11" value="Host" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
          <mxGeometry x="460" y="220" width="400" height="140" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-18" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="GDUa8-GdCzSgoxu7vCdt-12" target="GDUa8-GdCzSgoxu7vCdt-17">
          <mxGeometry relative="1" as="geometry">
            <Array as="points">
              <mxPoint x="770" y="356" />
              <mxPoint x="770" y="356" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-19" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;&lt;b&gt;Input:&lt;br&gt;&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Processed Expressions&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Variable-Sets&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Parameters&lt;/font&gt;&lt;/div&gt;" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="GDUa8-GdCzSgoxu7vCdt-18">
          <mxGeometry x="0.1565" y="-2" relative="1" as="geometry">
            <mxPoint x="-48" y="-26" as="offset" />
          </mxGeometry>
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-12" value="Dispatch Kernel" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="710" y="280" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-16" value="Device" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
          <mxGeometry x="680" y="456" width="180" height="139" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-25" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="GDUa8-GdCzSgoxu7vCdt-17" target="GDUa8-GdCzSgoxu7vCdt-21">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-17" value="Evaluation" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="710" y="520" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-20" value="Host" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
          <mxGeometry x="460" y="456" width="170" height="139" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-22" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="GDUa8-GdCzSgoxu7vCdt-21" target="GDUa8-GdCzSgoxu7vCdt-9">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-26" value="&lt;div&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;&lt;b&gt;Output:&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Evaluation-Results&lt;/font&gt;&lt;/div&gt;" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="GDUa8-GdCzSgoxu7vCdt-22">
          <mxGeometry x="0.4108" y="-1" relative="1" as="geometry">
            <mxPoint x="13" y="1" as="offset" />
          </mxGeometry>
        </mxCell>
        <mxCell id="GDUa8-GdCzSgoxu7vCdt-21" value="Retrieve Results" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="485" y="520" width="120" height="40" as="geometry" />
        </mxCell>
      </root>
    </mxGraphModel>
  </diagram>
</mxfile>
84
other/component_diagram_transpiler.drawio
Normal file
@@ -0,0 +1,84 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.5">
  <diagram name="Page-1" id="KFoKKVRmhU8qG_-FEeqA">
    <mxGraphModel dx="985" dy="546" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-21" target="tQMPqDGkYp4bv8unJ6VJ-11">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-2" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="480" y="280" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-3" value="Transpiler" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;width=90;height=40;" vertex="1" parent="1">
          <mxGeometry x="440" y="160" width="480" height="480" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-4" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=#000000;" vertex="1" parent="1">
          <mxGeometry x="270" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-7" target="tQMPqDGkYp4bv8unJ6VJ-2">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-6" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;&lt;b&gt;Input:&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Expressions&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Variable-Sets&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Parameters&lt;/font&gt;&lt;/div&gt;" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="tQMPqDGkYp4bv8unJ6VJ-5">
          <mxGeometry x="-0.4633" relative="1" as="geometry">
            <mxPoint x="-16" as="offset" />
          </mxGeometry>
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-7" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=none;" vertex="1" parent="1">
          <mxGeometry x="270" y="280" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-8" value="Host" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
          <mxGeometry x="460" y="220" width="440" height="140" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-9" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-11" target="tQMPqDGkYp4bv8unJ6VJ-14">
          <mxGeometry relative="1" as="geometry">
            <Array as="points">
              <mxPoint x="820" y="420" />
              <mxPoint x="820" y="420" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-10" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;&lt;b&gt;Input:&lt;br&gt;&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Processed Expressions&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Variable-Sets&lt;/font&gt;&lt;/div&gt;&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Parameters&lt;/font&gt;&lt;/div&gt;" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="tQMPqDGkYp4bv8unJ6VJ-9">
          <mxGeometry x="0.1565" y="-2" relative="1" as="geometry">
            <mxPoint x="-48" y="-25" as="offset" />
          </mxGeometry>
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-11" value="Dispatch Kernel" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="760" y="280" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-12" value="Device" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
          <mxGeometry x="720" y="456" width="180" height="134" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-14" target="tQMPqDGkYp4bv8unJ6VJ-18">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-14" value="Evaluation" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="760" y="520" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-15" value="Host" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
          <mxGeometry x="460" y="456" width="180" height="134" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-18" target="tQMPqDGkYp4bv8unJ6VJ-4">
          <mxGeometry relative="1" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-17" value="&lt;div&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;&lt;b&gt;Output:&lt;/b&gt;&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 12px;&quot;&gt;Evaluation-Results&lt;/font&gt;&lt;/div&gt;" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="tQMPqDGkYp4bv8unJ6VJ-16">
          <mxGeometry x="0.4108" y="-1" relative="1" as="geometry">
            <mxPoint x="13" y="1" as="offset" />
          </mxGeometry>
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-18" value="Retrieve Results" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="485" y="520" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-22" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-2" target="tQMPqDGkYp4bv8unJ6VJ-21">
          <mxGeometry relative="1" as="geometry">
            <mxPoint x="600" y="300" as="sourcePoint" />
            <mxPoint x="760" y="300" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="tQMPqDGkYp4bv8unJ6VJ-21" value="Code-Generation" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
          <mxGeometry x="620" y="280" width="120" height="40" as="geometry" />
        </mxCell>
      </root>
    </mxGraphModel>
  </diagram>
</mxfile>
@@ -31,36 +31,104 @@ With these requirements, one possible expression that must be able to be evaluat
With this, the capabilities are outlined. However, the input and output data need further explanation for a better understanding. The first input is the set of expressions to be evaluated. These can have any length and can contain constant values, variables and parameters, all linked together by the supported operations. In the example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ through $e_6$. Next is the variable matrix. One entry in this matrix corresponds to one variable in every expression, with the row indicating which variable it holds the value for. Each column holds a different set of variables. In the provided example, there are three variable sets, each holding the values for four variables $x_1$ through $x_4$. All expressions are evaluated using all variable sets, and the results of these evaluations are stored in the results matrix. Each entry in this matrix holds the result of evaluating one expression with one variable set. The row indicates the variable set while the column indicates the expression.

%%
%% TODO: Explain parameter optimisation a bit better/longer. Right now the understanding of parameters is not great with this.
%%

This is the minimal functionality needed to evaluate expressions with variables generated by a symbolic regression algorithm. For parameter optimisation it is useful to have an additional kind of variable, called a parameter. During parameter optimisation, the best fitting parameters for the given variable sets must be found. To achieve this, the evaluator is called multiple times with different parameters but the same variables, and the caller assesses the fitness of the results. The parameters do not change within one call. They could therefore be treated as constant values of the expressions, and no separate input for them would be needed. However, accepting the parameters as an input makes the process of parameter optimisation easier, which is why the prototype evaluators need to support parameters as inputs. Not all expressions have the same number of parameters. Therefore, the parameters are structured as a vector of vectors and not as a matrix. The example in Figure \ref{fig:input_output_explanation} shows how the parameters are structured. For example, one expression has zero parameters, while another has six parameters $p_1$ through $p_6$. Just like the number of variables, the number of parameters per expression is not limited. It is also possible to omit the parameters entirely if they are not needed.

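To make the data layout concrete, the following Julia sketch builds a small variable matrix, a vector-of-vectors parameter structure and the corresponding results matrix. All names and values here are illustrative, not taken from the prototypes:

```julia
# Rows of `variables` are the variables x1..x4, columns are the variable sets.
variables = [1.0 2.0 3.0;
             4.0 5.0 6.0;
             7.0 8.0 9.0;
             0.5 1.5 2.5]                 # 4 variables × 3 variable sets

# Parameters as a vector of vectors, since expressions differ in arity:
parameters = [Float64[], [0.1, 0.2]]      # expression 1: none, expression 2: two

# Two toy expressions standing in for e1..e6 of the figure:
e1(x, p) = x[1] + 2.0 * x[2]
e2(x, p) = p[1] * x[3] + p[2]
expressions = [e1, e2]

# Results matrix: row = variable set, column = expression.
results = [expressions[j](variables[:, i], parameters[j])
           for i in 1:size(variables, 2), j in 1:length(expressions)]
```

Note how the results matrix has one row per variable set and one column per expression, matching the layout described above.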
% \subsection{Non-Goals}
% Probably a good idea. Probably move this to "introduction"
\section{Architecture}

\section{Interpreter}
as introduction to this section talk about what "interpreter" means in this context. so "gpu parses expr and calculates"
Based on the requirements above, the architecture of both prototypes can be designed. While the requirements only specify the input and output, the components and workflow also need to be specified. This section gives an architectural overview of both prototypes alongside their design decisions.

\subsection{Architecture}
talk about the coarse grained architecture on how the interpreter will work. (.5 to 1 page probably)
A design decision made for both prototypes is to split the evaluation of each expression into a separate kernel dispatch. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. In this case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel dispatch. GPUs can have multiple resident grids, with modern GPUs supporting up to 128 concurrent grids \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which therefore allows up to 128 kernels to run concurrently. Dispatching a separate kernel for each expression thus has the potential to improve performance.

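A minimal CUDA.jl sketch of this dispatch strategy might look as follows. The kernel body is only a placeholder, and the use of one Julia task per expression (each task launching on its own CUDA stream, so the resulting grids can be resident concurrently) is an assumption about how the concurrency would be obtained; running it requires a CUDA-capable GPU:

```julia
using CUDA

# Placeholder kernel: one thread per variable set, one dispatch per expression.
function eval_expr_kernel!(results, variables, expr_idx)
    set = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if set <= size(variables, 2)
        results[set, expr_idx] = variables[1, set]  # real interpreter goes here
    end
    return nothing
end

# One kernel dispatch per expression; separate tasks use separate streams.
function dispatch_per_expression!(results, variables, nexprs)
    nsets = size(variables, 2)
    @sync for j in 1:nexprs
        @async @cuda threads=256 blocks=cld(nsets, 256) eval_expr_kernel!(results, variables, j)
    end
end
```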
\subsection{Host}
talk about the steps taken to prepare for GPU interpretation
%% Maybe add overview Diagram
%% Shows -> Caller calls Evaluator -> Evaluator dispatches kernel -> kernel evaluates -> Evaluator returns evaluation result
%% Probably the same as the interpreter and transpiler diagram. If so, dont add it

\subsection{Device}
talk about how the actual interpreter will be implemented
\subsection{Pre-Processing}
The first step in both prototypes is the pre-processing step. It is needed because it simplifies working with the expressions in the later steps. Similar to a compiler front-end as described in Section \ref{sec:compilers}, it takes an expression and transforms it into an intermediate representation. One of the responsibilities of the pre-processor is to verify that only allowed operators are present in the given expressions. Furthermore, it transforms the expressions into postfix notation, which allows the later parts to evaluate the expressions more easily. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to process the expressions token by token from left to right, without needing to worry about the correct order of operations. A token is either an operator, a constant value, a variable or a parameter.

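As a sketch of what this transformation could look like on the host, the following Julia function converts a tokenised infix expression to postfix notation using the shunting-yard algorithm. It is a minimal illustration restricted to the binary operators + and *, not the prototypes' actual implementation:

```julia
# Operator precedences; higher binds tighter.
const PRECEDENCE = Dict("+" => 1, "*" => 2)

function to_postfix(tokens::Vector{String})
    output = String[]
    operators = String[]
    for tok in tokens
        if haskey(PRECEDENCE, tok)
            # Pop operators of higher or equal precedence before pushing `tok`;
            # this is what encodes the operator precedence implicitly.
            while !isempty(operators) && PRECEDENCE[last(operators)] >= PRECEDENCE[tok]
                push!(output, pop!(operators))
            end
            push!(operators, tok)
        else
            push!(output, tok)   # constant, variable or parameter token
        end
    end
    append!(output, reverse!(operators))
    return output
end
```

For example, `to_postfix(["x1", "+", "2", "*", "p1"])` yields the postfix sequence in which the multiplication is evaluated before the addition.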
It would also have been possible to perform the pre-processing step on the GPU. However, pre-processing a single expression cannot easily be split across multiple threads, which means one GPU thread would need to process one expression. As described in Section \ref{sec:gpgpu}, a single GPU thread is slower than a single CPU thread. Furthermore, it would not make sense to process all expressions in a single kernel, as this would lead to a lot of thread divergence, essentially processing one expression after the other. The SIMT programming model might help with parallelising at least some parts of the processing work. However, the generated expressions can differ greatly from each other, and restricting them to be similar and therefore SIMT-friendly would reduce the overall quality of the symbolic regression algorithm. Therefore, it does not make sense to perform the processing step on the GPU. This is a typical example of code that is better run on the CPU, also because the parallelisation strategy of one thread per expression can be applied on the CPU as well. Concepts like caching processed expressions, or parts of them, can also be employed on the CPU. This would not be possible on the GPU, because it cannot save state between two kernel dispatches.

\subsection{Interpreter}

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{component_diagram_interpreter.png}
\caption{This diagram depicts the coarse-grained workflow of the interpreter. It shows how the parts interact with each other and with the system it will operate in.}
\label{fig:component_diagram_interpreter}
\end{figure}

The interpreter consists of two parts. The CPU side, or host side, is the part of the program that interacts with both the GPU, or device, and the caller. An overview of the components and the workflow of the interpreter can be seen in Figure \ref{fig:component_diagram_interpreter}. Before the GPU can start evaluating the expressions, a pre-processing step is necessary. This step is crucial, as it transforms the expressions into a format that greatly simplifies the evaluation part. Before the expressions can be evaluated, however, all data needs to be sent to the GPU; this includes the processed expressions as well as the data for the variables and parameters. After the kernel has finished evaluating all expressions, the CPU reads the results from the GPU and returns them to the caller.

Because of the already mentioned pre-processing step, the evaluation process is relatively straightforward. Algorithm \ref{alg:eval_interpreter} demonstrates how an expression in postfix notation can be evaluated. It shows a simplified version that only supports addition, multiplication and constant values. This is the part of the interpreter prototype that actually interprets the expressions and runs on the GPU.

\begin{algorithm}
\caption{Interpreting an expression in postfix-notation}\label{alg:eval_interpreter}
\begin{algorithmic}[1]
\Procedure{Evaluate}{\textit{expr}: PostfixExpression}
\State $\textit{stack} \gets ()$

\While{HasTokenLeft(\textit{expr})}
\State $\textit{token} \gets \text{GetNextToken}(\textit{expr})$
\If{$\textit{token.Kind} = \text{Constant}$}
\State Push($\textit{stack}$, $\textit{token.Value}$)
\ElsIf{$\textit{token.Kind} = \text{Operator}$}
\If{$\textit{token.Value} = \text{Addition}$}
\State $\textit{right} \gets \text{Pop}(\textit{stack})$
\State $\textit{left} \gets \text{Pop}(\textit{stack})$
\State Push($\textit{stack}$, $\textit{left} + \textit{right}$)
\ElsIf{$\textit{token.Value} = \text{Multiplication}$}
\State $\textit{right} \gets \text{Pop}(\textit{stack})$
\State $\textit{left} \gets \text{Pop}(\textit{stack})$
\State Push($\textit{stack}$, $\textit{left} * \textit{right}$)
\EndIf
\EndIf
\EndWhile

\State \Return $\text{Pop}(\textit{stack})$
\EndProcedure
\end{algorithmic}
\end{algorithm}

If a new operator is needed, it must simply be added as another else-if block inside the operator branch. New token kinds, such as variables or parameters, can be added with a new outer else-if block that checks for them. However, the pre-processing step also needs to be extended with these new operators and token kinds, otherwise the expressions will never reach the evaluation step. It is also possible to add unary operators such as the logarithm. In this case, only one value would be read from the stack, the operation would be applied, and the result would be pushed back onto the stack.

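A host-language sketch of this extended evaluator is shown below. The token encoding is purely illustrative (numbers as constants, `(:x, i)` and `(:p, i)` tuples referencing variables and parameters, symbols as operators) and is not the encoding used by the prototypes:

```julia
# Evaluate a postfix token sequence for one variable set `x` and parameters `p`.
function evaluate(postfix::Vector{Any}, x::Vector{Float64}, p::Vector{Float64})
    stack = Float64[]
    for token in postfix
        if token isa Number                # constant value
            push!(stack, Float64(token))
        elseif token === :+                # binary operator: pops two values
            right = pop!(stack); left = pop!(stack)
            push!(stack, left + right)
        elseif token === :*
            right = pop!(stack); left = pop!(stack)
            push!(stack, left * right)
        elseif token === :log              # unary operator: pops a single value
            push!(stack, log(pop!(stack)))
        elseif token isa Tuple{Symbol,Int} # (:x, i) or (:p, i)
            kind, idx = token
            push!(stack, kind === :x ? x[idx] : p[idx])
        else
            error("unknown token: $token")
        end
    end
    return pop!(stack)
end
```

For example, the postfix sequence for $x_1 + 2 \cdot p_1$ is `Any[(:x, 1), 2.0, (:p, 1), :*, :+]`.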
\section{Transpiler}
as introduction to this section talk about what "transpiler" means in this context. so "cpu takes expressions and generates ptx for gpu execution"

\subsection{Transpiler}

\subsection{Architecture}
talk about the coarse grained architecture on how the transpiler will work. (.5 to 1 page probably)
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{component_diagram_transpiler.png}
\caption{This diagram depicts the coarse-grained workflow of the transpiler. It shows how the parts interact with each other and with the system it will operate in.}
\label{fig:component_diagram_transpiler}
\end{figure}

\subsection{Host}
talk about how the transpiler is implemented

\subsection{Device}
talk about what the GPU does. short section since the gpu does not do much
% \section{Interpreter}
% % as introduction to this section talk about what "interpreter" means in this context. so "gpu parses expr and calculates"
% In this section, the GPU-based expression interpreter is described. It includes the design decisions made for the architecture of the interpreter. It also describes what is done on the CPU side or host side and what is performed on the GPU side or device side.

% \subsection{Architecture}
% talk about the coarse grained architecture on how the interpreter will work. (.5 to 1 page probably)
% Include decisions made like "one kernel per expression"

% \subsection{Host}
% talk about the steps taken to prepare for GPU interpretation

% \subsection{Device}
% talk about how the actual interpreter will be implemented


% \section{Transpiler}
% as introduction to this section talk about what "transpiler" means in this context. so "cpu takes expressions and generates ptx for gpu execution"

% Transpiler used, to reduce overhead of the generic PTX transpiler of CUDA, as we can build a more specialised transpiler and hopefully generate faster code that way. (this sentence was written as a reminder and not to be used as is)

% \subsection{Architecture}
% talk about the coarse grained architecture on how the transpiler will work. (.5 to 1 page probably)

% \subsection{Host}
% talk about how the transpiler is implemented

% \subsection{Device}
% talk about what the GPU does. short section since the gpu does not do much
@@ -1,6 +1,8 @@
\chapter{Implementation}
\label{cha:implementation}

somewhere in here explain why one kernel per expression and not one kernel for all expressions

\section{Technologies}
Short section; CUDA, PTX, Julia, CUDA.jl

@@ -25,7 +25,7 @@ Graphics cards (GPUs) are commonly used to increase the performance of many diff
While in the early days of GPGPU programming a lot of research has been done to assess if this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs were used early on to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 2 for the most compute intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains, they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022}, which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration, as shown by \textcite{han_packetshader_2010}, where a significantly higher throughput than with a CPU-only implementation was achieved. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs, as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it also needs to be noted that GPUs are not always better performing than CPUs, as illustrated by \textcite{lee_debunking_2010}, so it is important to consider whether it is worth using GPUs for specific tasks.

\subsection{Programming GPUs}
|
||||
The development process on a GPU differs substantially from that on a CPU. A CPU has tens or hundreds of complex cores, with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} offering $192$ cores and twice as many threads. To demonstrate the complexity of even a simple single-core 8-bit CPU, \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of one CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic units as well as logic gates, sophisticated branch prediction and much more. This makes a CPU ideal for handling complex control flow in a single program strand and, on modern CPUs, even multiple strands simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. A GPU, on the other hand, contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU dedicates many more transistors to floating-point computation, which results in less efficient integer arithmetic and control flow handling. There is also less cache available per core, and clock speeds are usually much lower than those of a CPU. An overview of the differences between a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.
\begin{figure}
\centering
\label{fig:cpu_vs_gpu}
\end{figure}
Despite these drawbacks, the sheer number of cores makes a GPU a valid choice when trying to improve the performance of an algorithm. Because of this high core count, GPUs are best suited for data-parallel scenarios. This is due to the SIMD architecture of these cards. SIMD stands for Single-Instruction Multiple-Data and means that a single stream of instructions is executed on a huge number of data streams. \textcite{franchetti_efficient_2005} and \textcite{tian_compiling_2012} describe ways of using SIMD instructions on the CPU. Their approaches achieve noticeable speed-ups of 3.3 and 4.7 respectively by using SIMD instructions instead of serial computations. Extending this to GPUs, which are specifically built for SIMD/data-parallel calculations, shows why they are so powerful despite having less complex and slower cores than a CPU. It is also important to note that a GPU always needs a CPU, as the CPU is responsible for sending the data to the GPU and starting the GPU program. In GPGPU programming, the CPU is usually called the host, while the GPU is called the device.
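To make this distinction concrete, the following Python sketch contrasts a serial loop with a data-parallel formulation using NumPy. This is only a CPU-side analogy for the SIMD idea, not GPU code: the array expression dispatches one instruction over many data elements at once.

```python
import numpy as np

# One million data elements to process with f(x) = 2x + 1.
data = np.arange(1_000_000, dtype=np.float64)

# Serial style: one instruction stream touches one element at a time.
serial_result = np.empty_like(data)
for i in range(len(data)):
    serial_result[i] = data[i] * 2.0 + 1.0

# SIMD/data-parallel style: the same instruction is applied to all
# elements at once; NumPy lowers this to vectorised machine code.
parallel_result = data * 2.0 + 1.0

assert np.array_equal(serial_result, parallel_result)
```

Both variants compute the same result; the data-parallel form simply states *what* to do with every element and leaves the element-wise scheduling to the hardware, which is exactly the execution model a GPU scales up to thousands of cores.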
\subsubsection{Thread Hierarchy and Tuning}
\label{sec:thread_hierarchy}
The thousands of cores on a GPU, also called threads, are grouped together in several categories; this is the thread hierarchy of GPUs. The developer can influence this grouping to a degree, which allows them to tune their algorithm for optimal performance. To develop a well-performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also depends on the GPU used, which means it is important to test many different configurations to achieve the best possible result. This section explores the thread hierarchy and how it can be tuned to fit an algorithm.
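Such a tuning process can be sketched as a simple sweep over candidate launch configurations. The Python fragment below is purely illustrative: \texttt{run\_kernel} is a hypothetical stand-in for a real GPU dispatch and only burns a little CPU time here, and the candidate block sizes are the usual powers of two up to the common limit of 1024 threads per block.

```python
import time

def run_kernel(block_size):
    # Hypothetical stand-in for launching and timing a GPU kernel
    # with the given block size; here it only does dummy CPU work.
    t0 = time.perf_counter()
    sum(range(10_000))
    return time.perf_counter() - t0

# Candidate block sizes: powers of two up to the typical 1024 limit.
candidates = [32, 64, 128, 256, 512, 1024]
timings = {bs: run_kernel(bs) for bs in candidates}
best = min(timings, key=timings.get)
print(f"best block size for this run: {best}")
```

In practice, such a sweep has to be repeated per algorithm and per GPU model, since the optimal configuration depends on both.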
At the lowest level of a GPU sits the Streaming Multiprocessor (SM), a hardware unit responsible for scheduling and executing threads, which also contains the registers used by these threads. An SM always executes a group of 32 threads simultaneously; this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped into blocks, with one block typically containing a maximum of $1024$ threads but often configured to hold fewer. Therefore, if more than $1024$ threads are required, more blocks must be created. Blocks can optionally be grouped into thread block clusters, which can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called a kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that blocks can be scheduled independently of one another, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they share memory.
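The grouping described above can be made tangible with a small calculation. The Python sketch below mirrors how a kernel launch covers $n$ work items with blocks, and how each thread derives its global index via the common \texttt{blockIdx * blockDim + threadIdx} scheme; the names follow CUDA terminology, but the code itself is only an illustration, not GPU code.

```python
import math

WARP_SIZE = 32          # threads executed simultaneously by an SM
MAX_BLOCK_SIZE = 1024   # typical upper limit of threads per block

def launch_config(n_items, block_size=256):
    """Return (number of blocks, threads per block) covering n_items."""
    assert block_size <= MAX_BLOCK_SIZE
    blocks = math.ceil(n_items / block_size)
    return blocks, block_size

def global_thread_id(block_idx, block_dim, thread_idx):
    # Same formula a CUDA thread uses: blockIdx.x * blockDim.x + threadIdx.x
    return block_idx * block_dim + thread_idx

blocks, block_dim = launch_config(10_000)
print(blocks)                                      # 40 blocks of 256 threads
print(global_thread_id(39, block_dim, 255))        # 10239, the last thread
# Threads with an id >= 10000 must simply return without doing work.
print(block_dim // WARP_SIZE)                      # each block runs as 8 warps
```

Because $10\,000$ is not a multiple of the block size, the launch overshoots; the usual convention is an early-exit guard in the kernel for surplus threads.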
|
||||
|
@ -157,6 +158,7 @@ Done:
|
|||
\end{program}
\section{Compilers}
\label{sec:compilers}
Compilers are a necessary tool for many developers: if a developer wants to run their program, it is very likely they need one. As best described by \textcite{aho_compilers_2006} in their dragon book, a compiler takes code written by a human in some source language and translates it into a destination language readable by a computer. This section briefly explores what compilers are and the research done in this long-established field of computer science. Furthermore, the topics of transpilers and interpreters are explored, as their use-cases are very similar.
\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts: the analyser, also called the frontend, and the synthesiser, also called the backend. The frontend is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. Generating code in the target language from the intermediate representation is the job of the backend. This target code can be assembly or anything else needed for a specific use-case. The intermediate representation also makes it simple to swap out frontends or backends. The GNU Compiler Collection \parencite{gcc_gcc_2025} takes advantage of different frontends to provide support for many languages, including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile code for virtual machines instead. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the low level virtual machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is that one program can run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are compiled into their respective bytecode. This bytecode can then be interpreted, or compiled to physical machine code, and then be run. According to the JVM specification \parencite{lindholm_java_2025}, the Java bytecode is both interpreted and compiled with a just-in-time (JIT) compiler to increase the performance of code blocks that are executed often.
On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023}, the CLR always compiles the bytecode to physical machine code using a JIT compiler before it is executed.
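The frontend/backend split around an AST can be illustrated in miniature. The Python sketch below is not drawn from any real compiler; all names (\texttt{tokenize}, \texttt{parse}, \texttt{evaluate}, \texttt{emit}) are hypothetical. A tiny frontend parses arithmetic expressions into an AST, and two interchangeable backends consume the same tree: one interprets it directly, the other emits instructions for an imaginary stack machine.

```python
import re

# --- Frontend: tokenize and parse "1 + 2 * 3" into an AST -------------
def tokenize(src):
    return re.findall(r"\d+|[+*()]", src)

def parse(tokens):
    # Grammar: expr := term ('+' term)* ; term := factor ('*' factor)*
    def factor(i):
        if tokens[i] == "(":
            node, i = expr(i + 1)
            return node, i + 1          # skip the closing ')'
        return ("num", int(tokens[i])), i + 1
    def term(i):
        node, i = factor(i)
        while i < len(tokens) and tokens[i] == "*":
            rhs, i = factor(i + 1)
            node = ("*", node, rhs)
        return node, i
    def expr(i):
        node, i = term(i)
        while i < len(tokens) and tokens[i] == "+":
            rhs, i = term(i + 1)
            node = ("+", node, rhs)
        return node, i
    return expr(0)[0]

# --- Backend 1: evaluate the AST directly (an interpreter) ------------
def evaluate(node):
    if node[0] == "num":
        return node[1]
    op, lhs, rhs = node
    if op == "+":
        return evaluate(lhs) + evaluate(rhs)
    return evaluate(lhs) * evaluate(rhs)

# --- Backend 2: emit instructions for a toy stack machine -------------
def emit(node, out):
    if node[0] == "num":
        out.append(f"PUSH {node[1]}")
    else:
        op, lhs, rhs = node
        emit(lhs, out)
        emit(rhs, out)
        out.append("ADD" if op == "+" else "MUL")
    return out

ast = parse(tokenize("1 + 2 * 3"))
print(evaluate(ast))     # 7
print(emit(ast, []))     # ['PUSH 1', 'PUSH 2', 'PUSH 3', 'MUL', 'ADD']
```

Both backends work from the identical intermediate representation, which is exactly what makes swapping frontends or backends cheap; the two backends also mirror the interpreter/compiler distinction discussed above.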