benchmarking: added results for transpiler

2025-05-20 18:55:15 +02:00
50 changed files with 640 additions and 268603 deletions
--- a/other/input-explanation.drawio
+++ b/other/input-explanation.drawio
@ -1,11 +1,11 @@
-<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0" version="27.1.6">
+<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.1.1">
  <diagram name="Page-1" id="gpsZjoig8lt5hVv5Hzwz">
-    <mxGraphModel dx="1425" dy="791" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
+    <mxGraphModel dx="830" dy="457" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-13" value="" style="group" parent="1" vertex="1" connectable="0">
-          <mxGeometry x="340" y="200" width="240" height="40" as="geometry" />
+          <mxGeometry x="520" y="360" width="240" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-1" value="" style="rounded=0;whiteSpace=wrap;html=1;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
          <mxGeometry width="240" height="40" as="geometry" />
@ -59,7 +59,7 @@
          <mxGeometry x="200" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-14" value="" style="group" parent="1" vertex="1" connectable="0">
-          <mxGeometry x="700" y="200" width="240" height="40" as="geometry" />
+          <mxGeometry x="880" y="360" width="240" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-7" value="" style="rounded=0;whiteSpace=wrap;html=1;" parent="9Xn2HrUYLFHSwPnNgvM3-14" vertex="1">
          <mxGeometry width="240" height="40" as="geometry" />
@ -95,442 +95,442 @@
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-37" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
-          <mxGeometry x="740" y="360" width="200" height="40" as="geometry" />
+          <mxGeometry x="920" y="520" width="200" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-39" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="820" y="360" as="sourcePoint" />
-            <mxPoint x="860" y="360" as="targetPoint" />
+            <mxPoint x="1000" y="520" as="sourcePoint" />
+            <mxPoint x="1040" y="520" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-40" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="820" y="400" as="sourcePoint" />
-            <mxPoint x="860" y="400" as="targetPoint" />
+            <mxPoint x="1000" y="560" as="sourcePoint" />
+            <mxPoint x="1040" y="560" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-41" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="820" y="440" as="sourcePoint" />
-            <mxPoint x="860" y="440" as="targetPoint" />
+            <mxPoint x="1000" y="600" as="sourcePoint" />
+            <mxPoint x="1040" y="600" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-57" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="820" y="320" as="sourcePoint" />
-            <mxPoint x="860" y="320" as="targetPoint" />
+            <mxPoint x="1000" y="480" as="sourcePoint" />
+            <mxPoint x="1040" y="480" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-23" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
-          <mxGeometry x="800" y="380" width="240" height="40" as="geometry" />
+          <mxGeometry x="980" y="540" width="240" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-25" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="900" y="360" as="sourcePoint" />
-            <mxPoint x="940" y="360" as="targetPoint" />
+            <mxPoint x="1080" y="520" as="sourcePoint" />
+            <mxPoint x="1120" y="520" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-26" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="900" y="400" as="sourcePoint" />
-            <mxPoint x="940" y="400" as="targetPoint" />
+            <mxPoint x="1080" y="560" as="sourcePoint" />
+            <mxPoint x="1120" y="560" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-27" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="900" y="440" as="sourcePoint" />
-            <mxPoint x="940" y="440" as="targetPoint" />
+            <mxPoint x="1080" y="600" as="sourcePoint" />
+            <mxPoint x="1120" y="600" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-28" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="900" y="480" as="sourcePoint" />
-            <mxPoint x="940" y="480" as="targetPoint" />
+            <mxPoint x="1080" y="640" as="sourcePoint" />
+            <mxPoint x="1120" y="640" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-16" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
-          <mxGeometry x="640" y="340" width="160" height="40" as="geometry" />
+          <mxGeometry x="820" y="500" width="160" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-18" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="700" y="360" as="sourcePoint" />
-            <mxPoint x="740" y="360" as="targetPoint" />
+            <mxPoint x="880" y="520" as="sourcePoint" />
+            <mxPoint x="920" y="520" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-19" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="700" y="400" as="sourcePoint" />
-            <mxPoint x="740" y="400" as="targetPoint" />
+            <mxPoint x="880" y="560" as="sourcePoint" />
+            <mxPoint x="920" y="560" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-20" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="700" y="440" as="sourcePoint" />
-            <mxPoint x="740" y="440" as="targetPoint" />
+            <mxPoint x="880" y="600" as="sourcePoint" />
+            <mxPoint x="920" y="600" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-30" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
-          <mxGeometry x="760" y="300" width="80" height="40" as="geometry" />
+          <mxGeometry x="940" y="460" width="80" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-32" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="780" y="360" as="sourcePoint" />
-            <mxPoint x="820" y="360" as="targetPoint" />
+            <mxPoint x="960" y="520" as="sourcePoint" />
+            <mxPoint x="1000" y="520" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-44" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
-          <mxGeometry x="780" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="960" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-51" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
-          <mxGeometry x="700" y="320" width="120" height="40" as="geometry" />
+          <mxGeometry x="880" y="480" width="120" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-53" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="740" y="360" as="sourcePoint" />
-            <mxPoint x="780" y="360" as="targetPoint" />
+            <mxPoint x="920" y="520" as="sourcePoint" />
+            <mxPoint x="960" y="520" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-58" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="700" y="320" as="sourcePoint" />
-            <mxPoint x="740" y="320" as="targetPoint" />
+            <mxPoint x="880" y="480" as="sourcePoint" />
+            <mxPoint x="920" y="480" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-59" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="900" y="320" as="sourcePoint" />
-            <mxPoint x="940" y="320" as="targetPoint" />
+            <mxPoint x="1080" y="480" as="sourcePoint" />
+            <mxPoint x="1120" y="480" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-60" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="780" y="320" as="sourcePoint" />
-            <mxPoint x="820" y="320" as="targetPoint" />
+            <mxPoint x="960" y="480" as="sourcePoint" />
+            <mxPoint x="1000" y="480" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-62" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="740" y="320" as="sourcePoint" />
-            <mxPoint x="780" y="320" as="targetPoint" />
+            <mxPoint x="920" y="480" as="sourcePoint" />
+            <mxPoint x="960" y="480" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-64" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="719.83" y="240" as="sourcePoint" />
-            <mxPoint x="719.83" y="280" as="targetPoint" />
+            <mxPoint x="899.83" y="400" as="sourcePoint" />
+            <mxPoint x="899.83" y="440" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-65" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="759.6700000000001" y="240" as="sourcePoint" />
-            <mxPoint x="759.6700000000001" y="280" as="targetPoint" />
+            <mxPoint x="939.6700000000001" y="400" as="sourcePoint" />
+            <mxPoint x="939.6700000000001" y="440" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-66" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="799.83" y="240" as="sourcePoint" />
-            <mxPoint x="799.83" y="280" as="targetPoint" />
+            <mxPoint x="979.83" y="400" as="sourcePoint" />
+            <mxPoint x="979.83" y="440" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-67" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="839.6700000000001" y="240" as="sourcePoint" />
-            <mxPoint x="839.6700000000001" y="280" as="targetPoint" />
+            <mxPoint x="1019.6700000000001" y="400" as="sourcePoint" />
+            <mxPoint x="1019.6700000000001" y="440" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-68" value="" style="endArrow=baseDash;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;endFill=0;endSize=18;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="879.8300000000002" y="240" as="sourcePoint" />
-            <mxPoint x="879.8300000000002" y="280" as="targetPoint" />
+            <mxPoint x="1059.8300000000002" y="400" as="sourcePoint" />
+            <mxPoint x="1059.8300000000002" y="440" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-69" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="919.8300000000002" y="240" as="sourcePoint" />
-            <mxPoint x="919.8300000000002" y="280" as="targetPoint" />
+            <mxPoint x="1099.8300000000002" y="400" as="sourcePoint" />
+            <mxPoint x="1099.8300000000002" y="440" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-70" value="Parameters" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="630" y="210" width="70" height="20" as="geometry" />
+          <mxGeometry x="810" y="370" width="70" height="20" as="geometry" />
        </mxCell>
-        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-71" value="&lt;div&gt;Variable Matrix&lt;/div&gt;&lt;div&gt;k = 4&lt;/div&gt;&lt;div&gt;N = 3&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="280" y="380" width="90" height="40" as="geometry" />
+        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-71" value="Variable Matrix" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
+          <mxGeometry x="470" y="540" width="70" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-79" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="700" y="280" width="40" height="40" as="geometry" />
+          <mxGeometry x="880" y="440" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-80" value="p2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="700" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="880" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-81" value="p3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="700" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="880" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-82" value="p4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="700" y="400" width="40" height="40" as="geometry" />
+          <mxGeometry x="880" y="560" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-83" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="900" y="280" width="40" height="40" as="geometry" />
+          <mxGeometry x="1080" y="440" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-84" value="p2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="900" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="1080" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-85" value="p3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="900" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="1080" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-86" value="p4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="900" y="400" width="40" height="40" as="geometry" />
+          <mxGeometry x="1080" y="560" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-87" value="&lt;div&gt;p5&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="900" y="440" width="40" height="40" as="geometry" />
+          <mxGeometry x="1080" y="600" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-88" value="p6" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="900" y="480" width="40" height="40" as="geometry" />
+          <mxGeometry x="1080" y="640" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-89" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="780" y="280" width="40" height="40" as="geometry" />
+          <mxGeometry x="960" y="440" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-90" value="p2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="780" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="960" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-91" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="820" y="280" width="40" height="40" as="geometry" />
+          <mxGeometry x="1000" y="440" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-92" value="p2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="820" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="1000" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-93" value="p3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="820" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="1000" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-94" value="p4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="820" y="400" width="40" height="40" as="geometry" />
+          <mxGeometry x="1000" y="560" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-95" value="&lt;div&gt;p5&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="820" y="440" width="40" height="40" as="geometry" />
+          <mxGeometry x="1000" y="600" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-96" value="p3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="780" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="960" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-97" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="740" y="280" width="40" height="40" as="geometry" />
+          <mxGeometry x="920" y="440" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-98" value="p2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="740" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="920" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-99" value="p3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="740" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="920" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-100" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="360" y="161" as="sourcePoint" />
-            <mxPoint x="720" y="200" as="targetPoint" />
+            <mxPoint x="540" y="321" as="sourcePoint" />
+            <mxPoint x="900" y="360" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="360" y="180" />
-              <mxPoint x="720" y="180" />
+              <mxPoint x="540" y="340" />
+              <mxPoint x="900" y="340" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-101" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="400" y="200" as="sourcePoint" />
-            <mxPoint x="760" y="200" as="targetPoint" />
+            <mxPoint x="580" y="360" as="sourcePoint" />
+            <mxPoint x="940" y="360" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="400" y="170" />
-              <mxPoint x="760" y="170" />
+              <mxPoint x="580" y="330" />
+              <mxPoint x="940" y="330" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-102" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="440" y="200" as="sourcePoint" />
-            <mxPoint x="800" y="200" as="targetPoint" />
+            <mxPoint x="620" y="360" as="sourcePoint" />
+            <mxPoint x="980" y="360" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="440" y="160" />
-              <mxPoint x="800" y="160" />
+              <mxPoint x="620" y="320" />
+              <mxPoint x="980" y="320" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-103" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="480" y="200" as="sourcePoint" />
-            <mxPoint x="840" y="200" as="targetPoint" />
+            <mxPoint x="660" y="360" as="sourcePoint" />
+            <mxPoint x="1020" y="360" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="480" y="150" />
-              <mxPoint x="840" y="150" />
+              <mxPoint x="660" y="310" />
+              <mxPoint x="1020" y="310" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-104" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="520" y="200" as="sourcePoint" />
-            <mxPoint x="880" y="200" as="targetPoint" />
+            <mxPoint x="700" y="360" as="sourcePoint" />
+            <mxPoint x="1060" y="360" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="520" y="140" />
-              <mxPoint x="880" y="140" />
+              <mxPoint x="700" y="300" />
+              <mxPoint x="1060" y="300" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-105" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="560" y="200" as="sourcePoint" />
-            <mxPoint x="920" y="200" as="targetPoint" />
+            <mxPoint x="740" y="360" as="sourcePoint" />
+            <mxPoint x="1100" y="360" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="560" y="130" />
-              <mxPoint x="920" y="130" />
+              <mxPoint x="740" y="290" />
+              <mxPoint x="1100" y="290" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-107" value="" style="rounded=0;whiteSpace=wrap;html=1;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="380" y="320" width="120" height="160" as="geometry" />
+          <mxGeometry x="560" y="480" width="120" height="160" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-109" value="" style="endArrow=none;html=1;rounded=0;exitX=0;exitY=0;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-117" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="540" y="360" as="sourcePoint" />
-            <mxPoint x="380" y="360" as="targetPoint" />
+            <mxPoint x="720" y="520" as="sourcePoint" />
+            <mxPoint x="560" y="520" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-110" value="" style="endArrow=none;html=1;rounded=0;exitX=1;exitY=1;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-123" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="460" y="439.30999999999995" as="sourcePoint" />
-            <mxPoint x="460" y="320" as="targetPoint" />
+            <mxPoint x="640" y="599.31" as="sourcePoint" />
+            <mxPoint x="640" y="480" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-111" value="" style="endArrow=none;html=1;rounded=0;exitX=1;exitY=1;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-119" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="419.52" y="440" as="sourcePoint" />
-            <mxPoint x="419.52" y="320.69000000000005" as="targetPoint" />
+            <mxPoint x="599.52" y="600" as="sourcePoint" />
+            <mxPoint x="599.52" y="480.69000000000005" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-114" value="" style="endArrow=none;html=1;rounded=0;exitX=1;exitY=0;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-117" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="540" y="399.76" as="sourcePoint" />
-            <mxPoint x="380" y="399.76" as="targetPoint" />
+            <mxPoint x="720" y="559.76" as="sourcePoint" />
+            <mxPoint x="560" y="559.76" as="targetPoint" />
          </mxGeometry>
        </mxCell>
-        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-115" value="&lt;div&gt;Expressions&lt;/div&gt;&lt;div&gt;N&lt;sub&gt;e&lt;/sub&gt; = 6&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="270" y="210" width="70" height="20" as="geometry" />
+        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-115" value="Expressions" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
+          <mxGeometry x="450" y="370" width="70" height="20" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-116" value="x1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="460" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="640" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-117" value="x2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="460" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="640" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-118" value="x3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="460" y="400" width="40" height="40" as="geometry" />
+          <mxGeometry x="640" y="560" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-12" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-119" edge="1">
          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="540" y="580" as="targetPoint" />
+            <mxPoint x="720" y="740" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="440" y="580" />
+              <mxPoint x="620" y="740" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-119" value="x4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="420" y="440" width="40" height="40" as="geometry" />
+          <mxGeometry x="600" y="600" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-120" value="x1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="420" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="600" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-121" value="x2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="420" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="600" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-122" value="x3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="420" y="400" width="40" height="40" as="geometry" />
+          <mxGeometry x="600" y="560" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-123" edge="1">
          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="540" y="620" as="targetPoint" />
+            <mxPoint x="720" y="780" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="480" y="620" />
+              <mxPoint x="660" y="780" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-123" value="x4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="460" y="440" width="40" height="40" as="geometry" />
+          <mxGeometry x="640" y="600" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-124" value="x1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="380" y="320" width="40" height="40" as="geometry" />
+          <mxGeometry x="560" y="480" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-125" value="x2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="380" y="360" width="40" height="40" as="geometry" />
+          <mxGeometry x="560" y="520" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-126" value="x3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="380" y="400" width="40" height="40" as="geometry" />
+          <mxGeometry x="560" y="560" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-11" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-127" edge="1">
          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="540" y="540" as="targetPoint" />
+            <mxPoint x="720" y="700" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="400" y="540" />
+              <mxPoint x="580" y="700" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9Xn2HrUYLFHSwPnNgvM3-127" value="x4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
-          <mxGeometry x="380" y="440" width="40" height="40" as="geometry" />
+          <mxGeometry x="560" y="600" width="40" height="40" as="geometry" />
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-1" value="" style="endArrow=none;html=1;rounded=0;exitX=1;exitY=0;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-118" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="540" y="440" as="sourcePoint" />
-            <mxPoint x="380" y="440" as="targetPoint" />
+            <mxPoint x="720" y="600" as="sourcePoint" />
+            <mxPoint x="560" y="600" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-2" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="360" y="240" as="sourcePoint" />
-            <mxPoint x="400" y="320" as="targetPoint" />
+            <mxPoint x="540" y="400" as="sourcePoint" />
+            <mxPoint x="580" y="480" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="360" y="280" />
-              <mxPoint x="400" y="280" />
+              <mxPoint x="540" y="440" />
+              <mxPoint x="580" y="440" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-3" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="400" y="240" as="sourcePoint" />
-            <mxPoint x="440" y="320" as="targetPoint" />
+            <mxPoint x="580" y="400" as="sourcePoint" />
+            <mxPoint x="620" y="480" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="400" y="280" />
-              <mxPoint x="440" y="280" />
+              <mxPoint x="580" y="440" />
+              <mxPoint x="620" y="440" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-4" value="" style="endArrow=classic;html=1;rounded=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="440" y="240" as="sourcePoint" />
-            <mxPoint x="480" y="320" as="targetPoint" />
+            <mxPoint x="620" y="400" as="sourcePoint" />
+            <mxPoint x="660" y="480" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="440" y="280" />
-              <mxPoint x="480" y="280" />
+              <mxPoint x="620" y="440" />
+              <mxPoint x="660" y="440" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-5" value="" style="endArrow=none;html=1;rounded=0;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="480" y="310" as="sourcePoint" />
-            <mxPoint x="480" y="240" as="targetPoint" />
+            <mxPoint x="660" y="470" as="sourcePoint" />
+            <mxPoint x="660" y="400" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-9" value="" style="endArrow=none;html=1;rounded=0;entryX=0.75;entryY=1;entryDx=0;entryDy=0;edgeStyle=orthogonalEdgeStyle;" parent="1" target="9Xn2HrUYLFHSwPnNgvM3-1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="480" y="280" as="sourcePoint" />
-            <mxPoint x="530" y="243.5" as="targetPoint" />
+            <mxPoint x="660" y="440" as="sourcePoint" />
+            <mxPoint x="710" y="403.5" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="520" y="280" />
+              <mxPoint x="700" y="440" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="J6L5rIkjYbQ64Ew3ZQAR-10" value="" style="endArrow=none;html=1;rounded=0;entryX=0.75;entryY=1;entryDx=0;entryDy=0;edgeStyle=orthogonalEdgeStyle;" parent="1" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="520" y="280" as="sourcePoint" />
-            <mxPoint x="560" y="240" as="targetPoint" />
+            <mxPoint x="700" y="440" as="sourcePoint" />
+            <mxPoint x="740" y="400" as="targetPoint" />
            <Array as="points">
-              <mxPoint x="560" y="280" />
+              <mxPoint x="740" y="440" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-9" value="" style="group" parent="1" vertex="1" connectable="0">
-          <mxGeometry x="541" y="520" width="240" height="120" as="geometry" />
+          <mxGeometry x="721" y="680" width="240" height="120" as="geometry" />
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-1" value="" style="rounded=0;whiteSpace=wrap;html=1;" parent="9og6d5YY-6gPx96OlZrF-9" vertex="1">
          <mxGeometry width="240" height="120" as="geometry" />
@ -578,13 +578,13 @@
          </mxGeometry>
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-10" value="&lt;div&gt;Results&lt;/div&gt;&lt;div&gt;Matrix&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="541" y="470" width="70" height="40" as="geometry" />
+          <mxGeometry x="721" y="630" width="70" height="40" as="geometry" />
        </mxCell>
        <mxCell id="9og6d5YY-6gPx96OlZrF-16" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;labelPosition=left;verticalLabelPosition=middle;align=right;verticalAlign=middle;rotation=-90;" parent="1" vertex="1">
-          <mxGeometry x="652" y="541" width="20" height="240" as="geometry" />
+          <mxGeometry x="832" y="701" width="20" height="240" as="geometry" />
        </mxCell>
-        <mxCell id="9og6d5YY-6gPx96OlZrF-17" value="Expression 1 through Expression 6" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="547" y="672" width="230" height="30" as="geometry" />
+        <mxCell id="9og6d5YY-6gPx96OlZrF-17" value="Expression 1 through Expression n" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
+          <mxGeometry x="727" y="832" width="230" height="30" as="geometry" />
        </mxCell>
      </root>
    </mxGraphModel>
--- a/other/interpreter_sequence_diagram.drawio
+++ b/other/interpreter_sequence_diagram.drawio
@ -1,172 +1,169 @@
-<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0" version="27.1.6">
+<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0" version="26.2.14">
  <diagram name="Page-1" id="6PRo98IcIigsbWnrE1av">
-    <mxGraphModel dx="2066" dy="1147" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
+    <mxGraphModel dx="1181" dy="655" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
-        <mxCell id="Eoor0cZiH70DILZYRCY1-1" value="Expression Evaluator" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="360" y="100" width="100" height="40" as="geometry" />
+        <mxCell id="gfXG8frgiKgzaB5gouxS-22" value="Interpreter" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="260" y="60" width="100" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-2" value="CPU-Side" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="800" y="100" width="100" height="40" as="geometry" />
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-1" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="500" y="60" width="90" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-3" value="Frontend" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="540" y="100" width="90" height="40" as="geometry" />
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-2" value="GPU" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
+          <mxGeometry x="640" y="60" width="90" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-4" value="GPU-Side" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="670" y="100" width="80" height="40" as="geometry" />
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-3" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
+          <mxGeometry x="305" y="100" width="10" height="420" as="geometry" />
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-5" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="405" y="140" width="10" height="560" as="geometry" />
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-7" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
+          <mxGeometry x="540" y="170" width="10" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-6" value="" style="html=1;verticalAlign=bottom;startArrow=circle;startFill=1;endArrow=open;startSize=6;endSize=8;curved=0;rounded=0;" parent="1" target="Eoor0cZiH70DILZYRCY1-5" edge="1">
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-8" value="expr_to_postfix(expr): ExpressionElement[]" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" target="hKyrbmUfddmyC9NB2b_t-7" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="420" y="185" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-9" value="" style="html=1;verticalAlign=bottom;endArrow=open;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;dashed=1;" parent="1" source="hKyrbmUfddmyC9NB2b_t-7" target="hKyrbmUfddmyC9NB2b_t-3" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="420" y="255" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-16" value="intermediate_representation" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="hKyrbmUfddmyC9NB2b_t-9" vertex="1" connectable="0">
+          <mxGeometry x="-0.008" y="-1" relative="1" as="geometry">
+            <mxPoint y="-9" as="offset" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-10" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-7" target="hKyrbmUfddmyC9NB2b_t-1" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="550" y="150" as="sourcePoint" />
+            <mxPoint x="780" y="260" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-11" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" target="hKyrbmUfddmyC9NB2b_t-7" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="545" y="520" as="sourcePoint" />
+            <mxPoint x="539.76" y="260" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-12" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
+          <mxGeometry x="170" y="150" width="420" height="80" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-14" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each expression]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
+          <mxGeometry x="170" y="180" width="90" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-17" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
+          <mxGeometry x="170" y="370" width="560" height="60" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-18" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each intermediate_representation]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
+          <mxGeometry x="172" y="403" width="120" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-20" value="kernel(intermediate_representation, variables, parameters)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="320" y="403" as="sourcePoint" />
+            <mxPoint x="685" y="403" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-23" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-34" target="hKyrbmUfddmyC9NB2b_t-2" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="610" y="250" as="sourcePoint" />
+            <mxPoint x="660" y="200" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-26" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
+          <mxGeometry x="680" y="460" width="10" height="30" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-27" value="read_results()" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" target="hKyrbmUfddmyC9NB2b_t-26" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="305" y="444" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-28" value="resultMatrix" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-26" target="hKyrbmUfddmyC9NB2b_t-3" edge="1">
+          <mxGeometry x="0.0012" relative="1" as="geometry">
+            <mxPoint x="305" y="494.0000000000001" as="targetPoint" />
+            <mxPoint as="offset" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-30" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" target="hKyrbmUfddmyC9NB2b_t-26" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="685" y="520" as="sourcePoint" />
+            <mxPoint x="710" y="390" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-35" value="send_data(variables)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" target="hKyrbmUfddmyC9NB2b_t-34" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="720" y="225" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-36" value="" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-34" target="hKyrbmUfddmyC9NB2b_t-3" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="720" y="255" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-37" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-38" target="hKyrbmUfddmyC9NB2b_t-34" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="700" y="349" as="sourcePoint" />
+            <mxPoint x="700" y="120" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-34" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
+          <mxGeometry x="680" y="250" width="10" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-39" value="send_data(parameters)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" target="hKyrbmUfddmyC9NB2b_t-38" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="750" y="288" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-40" value="" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-38" target="hKyrbmUfddmyC9NB2b_t-3" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="750" y="358" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-38" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
+          <mxGeometry x="680" y="290" width="10" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-42" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
+          <mxGeometry x="680" y="330" width="10" height="21" as="geometry" />
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-43" value="send_data(intermediate_representations)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" target="hKyrbmUfddmyC9NB2b_t-42" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="820" y="325" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-44" value="" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="hKyrbmUfddmyC9NB2b_t-42" target="hKyrbmUfddmyC9NB2b_t-3" edge="1">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="820" y="395" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-45" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-42" target="hKyrbmUfddmyC9NB2b_t-38" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="710" y="310" as="sourcePoint" />
+            <mxPoint x="710" y="290" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-46" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-42" target="hKyrbmUfddmyC9NB2b_t-26" edge="1">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="710" y="363" as="sourcePoint" />
+            <mxPoint x="710" y="330" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-49" value="&lt;div&gt;interpret(expressions,&lt;/div&gt;&lt;div&gt;variables, parameters)&lt;/div&gt;" style="html=1;verticalAlign=bottom;startArrow=circle;startFill=1;endArrow=open;startSize=6;endSize=8;curved=0;rounded=0;" parent="1" target="hKyrbmUfddmyC9NB2b_t-3" edge="1">
          <mxGeometry x="0.1057" width="80" relative="1" as="geometry">
-            <mxPoint x="300" y="180" as="sourcePoint" />
-            <mxPoint x="385" y="170.0000000000001" as="targetPoint" />
+            <mxPoint x="172" y="130" as="sourcePoint" />
+            <mxPoint x="295" y="130" as="targetPoint" />
            <mxPoint as="offset" />
          </mxGeometry>
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-7" value="&lt;div&gt;interpret(exprs,&amp;nbsp;&lt;/div&gt;&lt;div&gt;vars, params)&lt;/div&gt;" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="Eoor0cZiH70DILZYRCY1-6" vertex="1" connectable="0">
-          <mxGeometry x="-0.0676" y="1" relative="1" as="geometry">
-            <mxPoint x="2" y="-19" as="offset" />
-          </mxGeometry>
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-50" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=default;" parent="1" vertex="1">
+          <mxGeometry x="180" y="500" width="20" height="20" as="geometry" />
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-8" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
-          <mxGeometry x="300" y="200" width="330" height="90" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-9" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each expression]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="300" y="230" width="90" height="20" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-13" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-14" target="Eoor0cZiH70DILZYRCY1-3" edge="1">
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-51" value="" style="endArrow=open;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;dashed=1;endFill=0;" parent="1" source="hKyrbmUfddmyC9NB2b_t-3" target="hKyrbmUfddmyC9NB2b_t-50" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="565" y="190" as="sourcePoint" />
-            <mxPoint x="550" y="300" as="targetPoint" />
+            <mxPoint x="230" y="640" as="sourcePoint" />
+            <mxPoint x="280" y="590" as="targetPoint" />
          </mxGeometry>
        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-14" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="580" y="220" width="10" height="40" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-15" value="expr_to_postfix()" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="Eoor0cZiH70DILZYRCY1-5" target="Eoor0cZiH70DILZYRCY1-14" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="490" y="225" as="sourcePoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-16" value="intermediate_representation" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="Eoor0cZiH70DILZYRCY1-14" target="Eoor0cZiH70DILZYRCY1-5" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="490" y="295" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-17" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-24" target="Eoor0cZiH70DILZYRCY1-2" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="850" y="340" as="sourcePoint" />
-            <mxPoint x="849.6600000000001" y="200" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-18" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-34" target="Eoor0cZiH70DILZYRCY1-4" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="710" y="560" as="sourcePoint" />
-            <mxPoint x="740.3399999999999" y="160" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-20" value="send_data(intermediate_representations)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-5" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="650" y="315" as="sourcePoint" />
-            <mxPoint x="710" y="315" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-22" value="send_data(variables)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="415" y="349.31" as="sourcePoint" />
-            <mxPoint x="710" y="349" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-23" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-14" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="595" y="230" as="sourcePoint" />
-            <mxPoint x="585" y="700" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-24" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="845" y="400" width="10" height="240" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-25" value="interpret(nr_expressions, parameters)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="415" y="400.16" as="sourcePoint" />
-            <mxPoint x="845" y="400.16" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-26" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
-          <mxGeometry x="300" y="370" width="600" height="290" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-27" value="&lt;div&gt;&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each&amp;nbsp;&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 9px;&quot;&gt;optimisation step]&lt;/font&gt;&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="300" y="400" width="90" height="30" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-28" value="send_data(parameters)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-24" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="480" y="690" as="sourcePoint" />
-            <mxPoint x="710" y="440" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-29" value="send_data(meta)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-24" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="865" y="520" as="sourcePoint" />
-            <mxPoint x="710" y="470" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-30" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
-          <mxGeometry x="610" y="490" width="270" height="60" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-32" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each expression]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="610" y="520" width="90" height="20" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-33" value="dispatch kernel" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-24" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="845" y="519.31" as="sourcePoint" />
-            <mxPoint x="710" y="519.31" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-34" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="705" y="570" width="10" height="30" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-35" value="read_results()" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=1;entryY=0;entryDx=0;entryDy=5;" parent="1" source="Eoor0cZiH70DILZYRCY1-24" target="Eoor0cZiH70DILZYRCY1-34" edge="1">
-          <mxGeometry x="-0.0027" relative="1" as="geometry">
-            <mxPoint x="870" y="725" as="sourcePoint" />
-            <mxPoint as="offset" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-36" value="result_matrix" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=1;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="Eoor0cZiH70DILZYRCY1-34" target="Eoor0cZiH70DILZYRCY1-24" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="870" y="755" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-37" value="result_matrix" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="415" y="630" as="targetPoint" />
-            <mxPoint x="845" y="630" as="sourcePoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-38" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-34" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="720" y="580" as="sourcePoint" />
-            <mxPoint x="710" y="700" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-39" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="Eoor0cZiH70DILZYRCY1-24" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="820" y="690" as="sourcePoint" />
-            <mxPoint x="850" y="700" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-40" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=default;" parent="1" vertex="1">
-          <mxGeometry x="300" y="680" width="20" height="20" as="geometry" />
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-41" value="" style="endArrow=open;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;dashed=1;endFill=0;" parent="1" target="Eoor0cZiH70DILZYRCY1-40" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="405" y="690" as="sourcePoint" />
-            <mxPoint x="380" y="770" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="Eoor0cZiH70DILZYRCY1-42" value="result_matrix" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="Eoor0cZiH70DILZYRCY1-41" vertex="1" connectable="0">
+        <mxCell id="hKyrbmUfddmyC9NB2b_t-52" value="resultMatrix" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="hKyrbmUfddmyC9NB2b_t-51" vertex="1" connectable="0">
          <mxGeometry x="0.1271" relative="1" as="geometry">
            <mxPoint x="8" y="-10" as="offset" />
          </mxGeometry>
--- a/other/transpiler_sequence_diagram.drawio
+++ b/other/transpiler_sequence_diagram.drawio
@ -1,181 +1,165 @@
-<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0" version="27.1.6">
+<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:138.0) Gecko/20100101 Firefox/138.0" version="26.2.14">
  <diagram name="Page-1" id="dN1vCd9jYV9B4u8MPVmJ">
-    <mxGraphModel dx="2066" dy="1147" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
+    <mxGraphModel dx="1181" dy="655" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-1" value="Expression Evaluator" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="400" y="60" width="100" height="40" as="geometry" />
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-1" value="Transpiler" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
+          <mxGeometry x="260" y="60" width="100" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-2" value="CPU-Side" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="840" y="60" width="100" height="40" as="geometry" />
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-2" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
+          <mxGeometry x="500" y="60" width="90" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-3" value="Frontend" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="580" y="60" width="90" height="40" as="geometry" />
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-3" value="GPU" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
+          <mxGeometry x="640" y="60" width="90" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-4" value="GPU-Side" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
-          <mxGeometry x="710" y="60" width="80" height="40" as="geometry" />
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-4" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" vertex="1" parent="1">
+          <mxGeometry x="305" y="100" width="10" height="420" as="geometry" />
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-5" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="445" y="100" width="10" height="490" as="geometry" />
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-5" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" vertex="1" parent="1">
+          <mxGeometry x="540" y="170" width="10" height="40" as="geometry" />
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-6" value="" style="html=1;verticalAlign=bottom;startArrow=circle;startFill=1;endArrow=open;startSize=6;endSize=8;curved=0;rounded=0;" parent="1" target="NwwB8n2c3V3IefT4sgtS-5" edge="1">
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-6" value="expr_to_postfix(expr): ExpressionElement[]" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-5">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="420" y="185" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-7" value="" style="html=1;verticalAlign=bottom;endArrow=open;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;dashed=1;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-5" target="gMhPBGUGI9FZGhFn2pCe-4">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="420" y="255" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-8" value="intermediate_representation" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="gMhPBGUGI9FZGhFn2pCe-7">
+          <mxGeometry x="-0.008" y="-1" relative="1" as="geometry">
+            <mxPoint y="-9" as="offset" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-9" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-5" target="gMhPBGUGI9FZGhFn2pCe-2">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="550" y="150" as="sourcePoint" />
+            <mxPoint x="780" y="260" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-11" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
+          <mxGeometry x="170" y="150" width="420" height="140" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-12" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each expression]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
+          <mxGeometry x="170" y="180" width="90" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-13" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" vertex="1" parent="1">
+          <mxGeometry x="170" y="370" width="560" height="60" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-14" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each kernel]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
+          <mxGeometry x="172" y="403" width="68" height="17" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-15" value="kernel(variables, parameters)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4">
+          <mxGeometry x="0.0008" relative="1" as="geometry">
+            <mxPoint x="320" y="403" as="sourcePoint" />
+            <mxPoint x="685" y="403" as="targetPoint" />
+            <mxPoint as="offset" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-16" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-24" target="gMhPBGUGI9FZGhFn2pCe-3">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="610" y="250" as="sourcePoint" />
+            <mxPoint x="660" y="200" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-17" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" vertex="1" parent="1">
+          <mxGeometry x="680" y="460" width="10" height="30" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-18" value="read_results()" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-17">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="305" y="444" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-19" value="resultMatrix" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-17" target="gMhPBGUGI9FZGhFn2pCe-4">
+          <mxGeometry x="0.0012" relative="1" as="geometry">
+            <mxPoint x="305" y="494.0000000000001" as="targetPoint" />
+            <mxPoint as="offset" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-20" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" edge="1" parent="1" target="gMhPBGUGI9FZGhFn2pCe-17">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="685" y="520" as="sourcePoint" />
+            <mxPoint x="710" y="390" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-21" value="send_data(variables)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-24">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="720" y="225" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-22" value="" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-24" target="gMhPBGUGI9FZGhFn2pCe-4">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="720" y="255" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-23" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-27" target="gMhPBGUGI9FZGhFn2pCe-24">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="700" y="349" as="sourcePoint" />
+            <mxPoint x="700" y="120" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-24" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" vertex="1" parent="1">
+          <mxGeometry x="680" y="310" width="10" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-25" value="send_data(parameters)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-27">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="750" y="288" as="sourcePoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-26" value="" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-27" target="gMhPBGUGI9FZGhFn2pCe-4">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="750" y="358" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-27" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" vertex="1" parent="1">
+          <mxGeometry x="680" y="340" width="10" height="20" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-31" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-17" target="gMhPBGUGI9FZGhFn2pCe-27">
+          <mxGeometry width="50" height="50" relative="1" as="geometry">
+            <mxPoint x="685" y="330" as="sourcePoint" />
+            <mxPoint x="710" y="290" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-33" value="&lt;div&gt;transpile(expressions,&lt;/div&gt;&lt;div&gt;variables, parameters)&lt;/div&gt;" style="html=1;verticalAlign=bottom;startArrow=circle;startFill=1;endArrow=open;startSize=6;endSize=8;curved=0;rounded=0;" edge="1" parent="1" target="gMhPBGUGI9FZGhFn2pCe-4">
          <mxGeometry x="0.1057" width="80" relative="1" as="geometry">
-            <mxPoint x="340" y="140" as="sourcePoint" />
-            <mxPoint x="425" y="130.0000000000001" as="targetPoint" />
+            <mxPoint x="172" y="130" as="sourcePoint" />
+            <mxPoint x="295" y="130" as="targetPoint" />
            <mxPoint as="offset" />
          </mxGeometry>
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-7" value="&lt;div&gt;evaluate(exprs,&amp;nbsp;&lt;/div&gt;&lt;div&gt;vars, params)&lt;/div&gt;" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="NwwB8n2c3V3IefT4sgtS-6" vertex="1" connectable="0">
-          <mxGeometry x="-0.0676" y="1" relative="1" as="geometry">
-            <mxPoint x="2" y="-19" as="offset" />
-          </mxGeometry>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-34" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=default;" vertex="1" parent="1">
+          <mxGeometry x="180" y="500" width="20" height="20" as="geometry" />
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-8" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
-          <mxGeometry x="340" y="160" width="601" height="120" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-9" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each expression]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="340" y="190" width="90" height="20" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-10" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-11" target="NwwB8n2c3V3IefT4sgtS-3" edge="1">
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-35" value="" style="endArrow=open;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;dashed=1;endFill=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-34">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="605" y="150" as="sourcePoint" />
-            <mxPoint x="590" y="260" as="targetPoint" />
+            <mxPoint x="230" y="640" as="sourcePoint" />
+            <mxPoint x="280" y="590" as="targetPoint" />
          </mxGeometry>
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-11" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="620" y="180" width="10" height="30" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-12" value="expr_to_postfix()" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="NwwB8n2c3V3IefT4sgtS-5" target="NwwB8n2c3V3IefT4sgtS-11" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="530" y="185" as="sourcePoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-13" value="intermediate_representation" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="NwwB8n2c3V3IefT4sgtS-11" target="NwwB8n2c3V3IefT4sgtS-5" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="530" y="255" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-14" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-37" target="NwwB8n2c3V3IefT4sgtS-2" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="891" y="140" as="sourcePoint" />
-            <mxPoint x="889.6599999999999" y="160" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-15" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-28" target="NwwB8n2c3V3IefT4sgtS-4" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="750" y="520" as="sourcePoint" />
-            <mxPoint x="780.3399999999999" y="120" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-17" value="send_data(variables)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="455" y="309.30999999999995" as="sourcePoint" />
-            <mxPoint x="750" y="309" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-18" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-11" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="635" y="190" as="sourcePoint" />
-            <mxPoint x="625" y="590" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-19" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="885" y="360" width="10" height="180" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-20" value="evaluate(kernels, parameters)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="455" y="360.1600000000001" as="sourcePoint" />
-            <mxPoint x="885" y="360.1600000000001" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-21" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
-          <mxGeometry x="340" y="330" width="600" height="230" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-22" value="&lt;div&gt;&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each&amp;nbsp;&lt;/font&gt;&lt;/div&gt;&lt;div&gt;&lt;font style=&quot;font-size: 9px;&quot;&gt;optimisation step]&lt;/font&gt;&lt;/div&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="340" y="360" width="90" height="30" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-23" value="send_data(parameters)" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-19" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="520" y="650" as="sourcePoint" />
-            <mxPoint x="750" y="400" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-25" value="loop" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
-          <mxGeometry x="650" y="410" width="270" height="60" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-26" value="&lt;font style=&quot;font-size: 9px;&quot;&gt;[for each kernel]&lt;/font&gt;" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
-          <mxGeometry x="650" y="440" width="90" height="20" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-27" value="dispatch kernel" style="html=1;verticalAlign=bottom;endArrow=open;curved=0;rounded=0;endFill=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="885" y="439.30999999999995" as="sourcePoint" />
-            <mxPoint x="750" y="439.30999999999995" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-28" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="745" y="490" width="10" height="30" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-29" value="read_results()" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=1;entryY=0;entryDx=0;entryDy=5;" parent="1" source="NwwB8n2c3V3IefT4sgtS-19" target="NwwB8n2c3V3IefT4sgtS-28" edge="1">
-          <mxGeometry x="-0.0027" relative="1" as="geometry">
-            <mxPoint x="910" y="685" as="sourcePoint" />
-            <mxPoint as="offset" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-30" value="result_matrix" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=1;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="NwwB8n2c3V3IefT4sgtS-28" target="NwwB8n2c3V3IefT4sgtS-19" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="910" y="715" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-31" value="result_matrix" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;" parent="1" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="455" y="540" as="targetPoint" />
-            <mxPoint x="885" y="540" as="sourcePoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-32" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-28" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="760" y="540" as="sourcePoint" />
-            <mxPoint x="750" y="590" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-33" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-19" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="860" y="650" as="sourcePoint" />
-            <mxPoint x="890" y="590" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-34" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=default;" parent="1" vertex="1">
-          <mxGeometry x="340" y="570" width="20" height="20" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-35" value="" style="endArrow=open;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;dashed=1;endFill=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-5" target="NwwB8n2c3V3IefT4sgtS-34" edge="1">
-          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="445" y="650" as="sourcePoint" />
-            <mxPoint x="420" y="730" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-36" value="result_matrix" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="NwwB8n2c3V3IefT4sgtS-35" vertex="1" connectable="0">
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-36" value="resultMatrix" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="gMhPBGUGI9FZGhFn2pCe-35">
          <mxGeometry x="0.1271" relative="1" as="geometry">
            <mxPoint x="8" y="-10" as="offset" />
          </mxGeometry>
        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-37" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" parent="1" vertex="1">
-          <mxGeometry x="885" y="230" width="10" height="40" as="geometry" />
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-38" value="transpile(intermediate_representation)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" parent="1" source="NwwB8n2c3V3IefT4sgtS-5" target="NwwB8n2c3V3IefT4sgtS-37" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="961" y="205" as="sourcePoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-39" value="kernel" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" parent="1" source="NwwB8n2c3V3IefT4sgtS-37" target="NwwB8n2c3V3IefT4sgtS-5" edge="1">
-          <mxGeometry relative="1" as="geometry">
-            <mxPoint x="961" y="275" as="targetPoint" />
-          </mxGeometry>
-        </mxCell>
-        <mxCell id="NwwB8n2c3V3IefT4sgtS-40" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" parent="1" source="NwwB8n2c3V3IefT4sgtS-37" target="NwwB8n2c3V3IefT4sgtS-19" edge="1">
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-38" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" edge="1" parent="1" target="gMhPBGUGI9FZGhFn2pCe-5">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
-            <mxPoint x="900" y="240" as="sourcePoint" />
-            <mxPoint x="900" y="110" as="targetPoint" />
+            <mxPoint x="545" y="520" as="sourcePoint" />
+            <mxPoint x="545" y="280" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-53" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={&quot;curved&quot;:0,&quot;rounded&quot;:0};" vertex="1" parent="1">
+          <mxGeometry x="310" y="243" width="10" height="40" as="geometry" />
+        </mxCell>
+        <mxCell id="gMhPBGUGI9FZGhFn2pCe-54" value="transpile(intermediate_representation): Kernel" style="html=1;align=left;spacingLeft=2;endArrow=block;rounded=0;edgeStyle=orthogonalEdgeStyle;curved=0;rounded=0;" edge="1" target="gMhPBGUGI9FZGhFn2pCe-53" parent="1">
+          <mxGeometry x="-0.005" relative="1" as="geometry">
+            <mxPoint x="315" y="223" as="sourcePoint" />
+            <Array as="points">
+              <mxPoint x="345" y="253" />
+            </Array>
+            <mxPoint as="offset" />
          </mxGeometry>
        </mxCell>
      </root>
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@ -16,7 +16,7 @@ export interpret_gpu,interpret_cpu
 export evaluate_gpu

 # Some assertions:
-# Variables and parameters start their indexing with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
+# Variables and parameters start their naming with "1" meaning the first variable/parameter has to be "x1/p1" and not "x0/p0"
 # Matrix X is column major
 # each index i in exprs has to have the matching values in the column i in Matrix X so that X[:,i] contains the values for expr[i]. The same goes for p
 #     This assertion is made, because in julia, the first index doesn't have to be 1
@ -56,19 +56,18 @@ function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{V

 	largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into matrix. Will be nr. of rows in parameter matrix

-	compiledKernels = Vector{CuFunction}(undef, length(expressions)) 
+	ptxKernels = Vector{String}(undef, length(expressions)) 
 	kernelName = "evaluate_gpu"
 	@inbounds Threads.@threads for i in eachindex(expressions)
 		ex = ExpressionProcessing.expr_to_postfix(expressions[i])
-		ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
-		compiledKernels[i] = Transpiler.compile_kernel(ptxKernel, kernelName)
+		ptxKernels[i] = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
 	end

 	results = Matrix{Float32}(undef, numVariableSets, length(expressions))
 	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
 		# evaluate
 		# results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p)
-		results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p, kernelName)
+		results = Transpiler.evaluate(ptxKernels, variables, numVariableSets, p, kernelName)
 	end

 	return results
@ -110,4 +109,14 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
 	res
 end

+# Flow
+# input: Vector expr    == expressions; contains e.g. 4 expressions
+#        Matrix X       == |expr| columns, n rows. n == number of variables x1..xn; n is the same for all expressions --- WRONG
+#        Matrix X       == k columns, n rows. k == number of variables in the expressions (every expression must have the same number of variables); n == number of different values for xk where k is the column
+#        VectorVector p == vector size |expr| containing vector size m. m == number of parameters per expression. p can be different for each expression
+# 
+# The following can be done on the CPU
+#     convert expression to postfix notation (mandatory)
+#     optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)
+
 end
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -24,21 +24,17 @@ function interpret(cudaExprs, numExprs::Integer, exprsInnerLength::Integer,
 	cudaResults = CuArray{Float32}(undef, variableColumns, numExprs)

 	# Start kernel for each expression to ensure that no warp is working on different expressions
-	numThreads = min(variableColumns, 128)
-	numBlocks = cld(variableColumns, numThreads)
+	@inbounds Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
+		numThreads = min(variableColumns, 121)
+		numBlocks = cld(variableColumns, numThreads)

-	Threads.@threads for i in 1:numExprs # multithreaded to speedup dispatching (seems to have improved performance)
 		@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
 	end

-	# Reduce GC pressure https://cuda.juliagpu.org/stable/usage/memory/#Avoiding-GC-pressure
-	CUDA.unsafe_free!(cudaParams)
-	CUDA.unsafe_free!(cudaStepsize)
-
 	return cudaResults
 end

-const MAX_STACK_SIZE = 10 # The depth of the stack to store the values and intermediate results
+const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
 	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
 	@inbounds variableCols = length(variables) / stepsize[3] # number of variable sets
@ -96,6 +92,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
 			elseif opcode == SQRT
 				operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
 			elseif opcode == INV
+				# operationStack[operationStackTop] = 1f0 / operationStack[operationStackTop]
 				operationStack[operationStackTop] = inv(operationStack[operationStackTop])
 			end
 		else
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -19,42 +19,41 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVar
 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float32}(undef, variableColumns, length(expressions))

-	threads = min(variableColumns, 160)
+	threads = min(variableColumns, 256)
 	blocks = cld(variableColumns, threads)
 	
 	kernelName = "evaluate_gpu"
 	@inbounds Threads.@threads for i in eachindex(expressions)
 		kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
-		compiledKernel = compile_kernel(kernel, kernelName)
+		compiledKernel = CompileKernel(kernel, kernelName)
 		cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 	end

-	CUDA.unsafe_free!(cudaParams)
-
 	return cudaResults
 end

 "
 A simplified version of the evaluate function. It takes a list of already transpiled kernels to be executed. This should yield better performance, where the same expressions should be evaluated multiple times i.e. for parameter optimisation.
 "
-function evaluate(kernels::Vector{CuFunction}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}}, kernelName::String)::Matrix{Float32}
+function evaluate(kernels::Vector{String}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}}, kernelName::String)::Matrix{Float32}

 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float32}(undef, nrOfVariableSets, length(kernels))

-	threads = min(nrOfVariableSets, 160)
+	threads = min(nrOfVariableSets, 256)
 	blocks = cld(nrOfVariableSets, threads)
 	
 	@inbounds Threads.@threads for i in eachindex(kernels)
-		cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+		compiledKernel = CompileKernel(kernels[i], kernelName)
+		cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
 	end

 	return cudaResults
 end

-function compile_kernel(ptxKernel::String, kernelName::String)::CuFunction
+function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction
 	linker = CuLink()
 	add_data!(linker, kernelName, ptxKernel)
 		
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@ -57,8 +57,11 @@ suite = BenchmarkGroup()
 suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
 suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])

+# cacheInterpreter = Dict{Expr, PostfixType}()
 suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)

+# cacheTranspilerFront = Dict{Expr, PostfixType}()
+# cacheTranspilerRes = Dict{Expr, CuFunction}()
 suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)

 # tune!(suite)
@ -70,8 +73,6 @@ results = run(suite, verbose=true, seconds=43200) # 12 hour timeout
 resultsCPU = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/cpu.json")[1]

 if compareWithCPU
-	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/3-interpreter-smaller-stack-less-threadblock-allocations.json", results)
-
 	medianCPU = median(resultsCPU["CPU"])
 	stdCPU = std(resultsCPU["CPU"])
 	
@ -104,6 +105,7 @@ if compareWithCPU
 	println(gpuiVsGPUT_median)
 	println(gpuiVsGPUT_std)
 	
+	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/1-fronted-and-data-transfer-to-ExpressionExecutor.json", results)
 else
 	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
 	# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
--- a/package/test/PerformanceTuning.jl
+++ b/package/test/PerformanceTuning.jl
@ -29,10 +29,10 @@ expr_reps = 1


@testset "Interpreter Tuning" begin
-    CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
+    # CUDA.@profile interpret_gpu(exprs, X, parameters; repetitions=expr_reps)
 end


@testset "Transpiler Tuning" begin
-    # CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
+    CUDA.@profile evaluate_gpu(exprs, X, parameters; repetitions=expr_reps)
 end
--- a/package/test/benchmarks/1/cpu.json
+++ b/package/test/benchmarks/1/cpu.json
@ -1 +0,0 @@
-[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":36814947,"gctimes":[1.082739415e9,9.35589349e8,8.95739997e8,8.82797331e8,8.44175578e8,8.27278981e8,8.24664534e8,8.41590342e8,8.23430705e8,8.26304622e8,8.7328356e8,8.48151374e8,8.20769383e8,8.36210366e8,8.25357919e8,8.18247354e8,8.05126298e8,8.10738655e8,8.14534413e8,8.05974078e8,8.08104945e8,8.07549224e8,8.11047079e8,8.36937224e8,8.19217772e8,8.03258649e8,8.00177357e8,8.05390572e8,7.81551092e8,7.84470283e8,7.84717493e8,7.87670826e8,7.91518273e8,7.95865535e8,7.9488509e8,7.85908564e8,7.96303832e8,7.83015419e8,7.98406799e8,7.95693404e8,7.89571842e8,7.87009536e8,7.92931167e8,8.0354065e8,8.01147304e8,7.90650725e8,7.91114336e8,8.14447424e8,8.09202389e8,8.0150787e8],"memory":19327142456,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[1.11960461697e11,1.12658407743e11,1.11797123654e11,1.14086430365e11,1.12540701243e11,1.13057199848e11,1.12421343743e11,1.12335917668e11,1.11873753956e11,1.12087309285e11,1.15372551368e11,1.12857587668e11,1.12212954999e11,1.12352839748e11,1.12799090735e11,1.12712852105e11,1.11910175268e11,1.12890418194e11,1.12536406676e11,1.12333546234e11,1.12414119618e11,1.12632975657e11,1.12274854817e11,1.13642350405e11,1.13191424262e11,1.12623305956e11,1.12519637206e11,1.12733882055e11,1.13175515626e11,1.12499258654e11,1.12175542007e11,1.14221603568e11,1.12620900601e11,1.12996891317e11,1.12370260538e11,1.12760626809e11,1.13153933145e11,1.12762108936e11,1.12758858333e11,1.13381876923e11,1.12152161607e11,1.12831962905e11,1.12135760011e11,1.14343808852e11,1.12720432473e11,1.13061653545e11,1.12414150523e11,1.13142168741e11,1.12805546557e11,1.13053409368e11]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]
--- a/package/test/benchmarks/1/gpu.json
+++ b/package/test/benchmarks/1/gpu.json
@ -1,196 +0,0 @@
-[
-	{
-		"Julia": "1.11.5",
-		"BenchmarkTools": {
-			"major": 1,
-			"minor": 6,
-			"patch": 0,
-			"prerelease": [],
-			"build": []
-		}
-	},
-	[
-		[
-			"BenchmarkGroup",
-			{
-				"data": {
-					"GPUT": [
-						"BenchmarkGroup",
-						{
-							"data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 1534112879,
-                                        "gctimes": [
-                                            3.398826747854e12,
-                                            2.618070795579e12
-                                        ],
-                                        "memory": 51380857328968,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            3.7202049569362e13,
-                                            3.7400159760069e13
-                                        ]
-                                    }
-                                ]
-                            },
-							"tags": [
-								"GPUTranspiler"
-							]
-						}
-					],
-					"GPUI": [
-						"BenchmarkGroup",
-						{
-							"data": {
-								"nikuradse_1": [
-									"Trial",
-									{
-										"allocs": 768766234,
-										"gctimes": [
-											9.039427718e9,
-											9.064446832e9,
-											9.800666936e9,
-											1.0827322595e10,
-											8.183176119e9,
-											1.0336680452e10,
-											1.2123016536e10,
-											1.1144637536e10,
-											1.1608950879e10,
-											8.957069847e9,
-											1.6269942403e10,
-											1.4918376698e10,
-											1.4251938232e10,
-											1.2206537223e10,
-											9.651032299e9,
-											8.903295497e9,
-											1.0685161605e10,
-											1.3667059513e10,
-											9.290015888e9,
-											9.461223008e9,
-											8.563242328e9,
-											9.004616808e9,
-											1.1567444604e10,
-											1.4886979643e10,
-											1.1748297074e10,
-											1.0925963713e10,
-											1.1739338325e10,
-											1.2370751697e10,
-											9.841839527e9,
-											1.0294011249e10,
-											1.0448009806e10,
-											1.0032240935e10,
-											1.0339378214e10,
-											1.0181439573e10,
-											1.0002432745e10,
-											1.024672632e10,
-											1.0288169821e10,
-											9.9328892e9,
-											9.691621257e9,
-											1.0178716919e10,
-											9.874193006e9,
-											1.0230965657e10,
-											9.986166398e9,
-											1.0348837109e10,
-											9.905019212e9,
-											1.0229049781e10,
-											1.0217382544e10,
-											9.984211393e9,
-											1.085035782e10,
-											9.611515998e9
-										],
-										"memory": 54082704216,
-										"params": [
-											"Parameters",
-											{
-												"gctrial": true,
-												"time_tolerance": 0.05,
-												"evals_set": false,
-												"samples": 50,
-												"evals": 1,
-												"gcsample": false,
-												"seconds": 43200.0,
-												"overhead": 0.0,
-												"memory_tolerance": 0.01
-											}
-										],
-										"times": [
-											4.59689343211e11,
-											4.58928721202e11,
-											4.58790866806e11,
-											4.57001612541e11,
-											4.18694344791e11,
-											4.51768004064e11,
-											4.72273439611e11,
-											4.71801815498e11,
-											4.7151631773e11,
-											4.59466410568e11,
-											4.78707344875e11,
-											4.57553935546e11,
-											4.83383119184e11,
-											4.93000010286e11,
-											4.73094508424e11,
-											4.61605014711e11,
-											4.5350924569e11,
-											4.60262826899e11,
-											4.89557260771e11,
-											4.9072202667e11,
-											5.01206569571e11,
-											4.98388682969e11,
-											4.97754134578e11,
-											4.77393384636e11,
-											4.86333985432e11,
-											4.95544193592e11,
-											4.61734198363e11,
-											4.65888337953e11,
-											4.62496887686e11,
-											4.6684460331e11,
-											4.67632785813e11,
-											4.6746114379e11,
-											4.66166811424e11,
-											4.66344731528e11,
-											4.67420138865e11,
-											4.6812935133e11,
-											4.67987294196e11,
-											4.67112396022e11,
-											4.65770084163e11,
-											4.673875228e11,
-											4.63872430175e11,
-											4.62557110467e11,
-											4.64486258696e11,
-											4.67577200165e11,
-											4.65189110368e11,
-											4.64529885356e11,
-											4.61770978471e11,
-											4.63199044468e11,
-											4.61538097167e11,
-											4.61694021731e11
-										]
-									}
-								]
-							},
-							"tags": [
-								"GPUInterpreter"
-							]
-						}
-					]
-				},
-				"tags": []
-			}
-		]
-	]
-]
--- a/package/test/benchmarks/2/cpu.json
+++ b/package/test/benchmarks/2/cpu.json
@ -1 +0,0 @@
-[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[2.7425249e7,7.412701e7,3.6607941e7,7.2060594e7,3.4529434e7,7.6634167e7,3.2012321e7,7.3820784e7,3.3949954e7,7.8478248e7,4.0126379e7,7.8064709e7,3.7594681e7,7.7171913e7,3.2345052e7,7.4243448e7,3.4353198e7,7.6815947e7,3.3275476e7,7.6196381e7,3.5836579e7,7.9893164e7,3.426444e7,7.8096102e7,3.5667171e7,7.8791806e7,3.4285798e7,8.0897821e7,3.6955997e7,7.3759746e7,3.3773137e7,7.328944e7,3.4533305e7,7.4964616e7,3.4649633e7,7.4867313e7,3.6125153e7,7.7465251e7,3.4405076e7,8.0242334e7,3.2479474e7,7.5060436e7,3.272518e7,7.2772772e7,3.5399275e7,7.4715997e7,3.5420495e7,7.68539e7,3.5243677e7,7.4565513e7],"memory":660409808,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[3.43605916e9,3.571094971e9,3.581169167e9,3.617386266e9,3.640177061e9,3.666140865e9,3.59312133e9,3.642660429e9,3.649362226e9,3.639535812e9,3.615756984e9,4.084913467e9,3.658895286e9,3.697572649e9,3.639317733e9,3.626500969e9,3.730074621e9,3.834972951e9,3.77581077e9,3.810886128e9,3.821828959e9,3.810445379e9,3.74010373e9,4.100990879e9,3.805819398e9,3.883427787e9,3.759697669e9,3.826958891e9,3.806828201e9,3.737459795e9,3.82547766e9,3.875865222e9,3.778686866e9,3.772500863e9,3.695058761e9,3.839603577e9,3.758997268e9,3.78092914e9,3.722981644e9,3.81821317e9,3.755600545e9,3.849403637e9,3.807069344e9,3.731021781e9,3.775985336e9,3.809009579e9,3.844778579e9,3.795816016e9,3.761476812e9,3.856067238e9]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]
--- a/package/test/benchmarks/2/gpu.json
+++ b/package/test/benchmarks/2/gpu.json
@ -1,292 +0,0 @@
-[
-	{
-		"Julia": "1.11.5",
-		"BenchmarkTools": {
-			"major": 1,
-			"minor": 6,
-			"patch": 0,
-			"prerelease": [],
-			"build": []
-		}
-	},
-	[
-		[
-			"BenchmarkGroup",
-			{
-				"data": {
-					"GPUT": [
-                        "BenchmarkGroup",
-                        {
-                            "data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 27539530,
-                                        "gctimes": [
-                                            7.21326673e8,
-                                            7.48889043e8,
-                                            8.00904516e8,
-                                            7.37378345e8,
-                                            7.24851528e8,
-                                            7.35546499e8,
-                                            6.86027619e8,
-                                            7.3845303e8,
-                                            7.79203625e8,
-                                            7.52721538e8,
-                                            7.60364838e8,
-                                            7.59372464e8,
-                                            7.46489405e8,
-                                            8.077102e8,
-                                            7.62237779e8,
-                                            7.80462131e8,
-                                            8.24630083e8,
-                                            8.30753044e8,
-                                            7.73842108e8,
-                                            8.42642472e8,
-                                            7.94451496e8,
-                                            8.35754001e8,
-                                            7.8590998e8,
-                                            7.96294466e8,
-                                            8.69176891e8,
-                                            8.10771728e8,
-                                            7.95383527e8,
-                                            8.17274343e8,
-                                            7.57214285e8,
-                                            8.67359312e8,
-                                            7.88826755e8,
-                                            7.73170589e8,
-                                            7.4383235e8,
-                                            7.35437044e8,
-                                            7.29270175e8,
-                                            7.30839033e8,
-                                            7.78530806e8,
-                                            7.84806598e8,
-                                            7.86753701e8,
-                                            7.70199148e8,
-                                            7.99968565e8,
-                                            7.31105205e8,
-                                            7.94627452e8,
-                                            7.52205262e8,
-                                            7.44255972e8,
-                                            7.92573816e8,
-                                            7.75143609e8,
-                                            7.50085445e8,
-                                            7.42457424e8,
-                                            7.35277689e8
-                                        ],
-                                        "memory": 23891072456,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            1.9649655533e10,
-                                            1.8655222625e10,
-                                            2.044920046e10,
-                                            2.0006253124e10,
-                                            1.9225532614e10,
-                                            1.8425637493e10,
-                                            1.8009993618e10,
-                                            1.8566547913e10,
-                                            2.0298324918e10,
-                                            1.9375435774e10,
-                                            2.0259600918e10,
-                                            1.9689447935e10,
-                                            2.0440165546e10,
-                                            2.1198185981e10,
-                                            2.1529941031e10,
-                                            1.9621765309e10,
-                                            2.0096583579e10,
-                                            1.9353443691e10,
-                                            2.2395139743e10,
-                                            2.2147177349e10,
-                                            2.2065235354e10,
-                                            1.9008133225e10,
-                                            2.226108083e10,
-                                            2.2085219053e10,
-                                            2.0505924388e10,
-                                            1.951018691e10,
-                                            2.1750413636e10,
-                                            2.2142496895e10,
-                                            2.1011968434e10,
-                                            1.9815838525e10,
-                                            1.9442578236e10,
-                                            1.9848841235e10,
-                                            1.8999443547e10,
-                                            1.8850250259e10,
-                                            1.9418255558e10,
-                                            2.0859989717e10,
-                                            1.9155040161e10,
-                                            1.9639739596e10,
-                                            1.939165026e10,
-                                            1.9236817418e10,
-                                            1.9837660656e10,
-                                            1.8577069226e10,
-                                            1.9406743348e10,
-                                            1.9497777664e10,
-                                            1.911300801e10,
-                                            1.875399388e10,
-                                            2.0604575964e10,
-                                            1.8009223946e10,
-                                            1.9248258647e10,
-                                            1.9877171946e10
-                                        ]
-                                    }
-                                ]
-                            },
-                            "tags": [
-                                "GPUTranspiler"
-                            ]
-                        }
-                    ],
-					"GPUI": [
-                        "BenchmarkGroup",
-                        {
-                            "data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 32238807,
-                                        "gctimes": [
-                                            1.56829321e8,
-                                            2.14105993e8,
-                                            1.58515177e8,
-                                            1.63696235e8,
-                                            2.34640809e8,
-                                            1.73666066e8,
-                                            1.56664373e8,
-                                            2.26899015e8,
-                                            1.63204006e8,
-                                            1.58502341e8,
-                                            2.47932191e8,
-                                            1.78750726e8,
-                                            1.67031609e8,
-                                            2.15658378e8,
-                                            1.60582033e8,
-                                            1.72095901e8,
-                                            2.21813348e8,
-                                            1.74449684e8,
-                                            1.84254384e8,
-                                            2.22992914e8,
-                                            1.76230333e8,
-                                            1.74281367e8,
-                                            2.28631335e8,
-                                            1.61470635e8,
-                                            1.68059409e8,
-                                            2.12305109e8,
-                                            1.75816447e8,
-                                            1.69632479e8,
-                                            2.1658411e8,
-                                            1.74270008e8,
-                                            1.59168408e8,
-                                            2.07166015e8,
-                                            1.65286551e8,
-                                            1.64998613e8,
-                                            2.18075822e8,
-                                            1.56942033e8,
-                                            1.70028149e8,
-                                            2.16634107e8,
-                                            1.4930586e8,
-                                            1.51200189e8,
-                                            2.01026428e8,
-                                            1.59543849e8,
-                                            1.50339886e8,
-                                            2.28309026e8,
-                                            1.98278555e8,
-                                            1.82240598e8,
-                                            2.19269356e8,
-                                            2.03655465e8,
-                                            1.9340733e8,
-                                            2.11109946e8
-                                        ],
-                                        "memory": 2257633856,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            2.0826857964e10,
-                                            2.1827080714e10,
-                                            2.1835551871e10,
-                                            2.2276713278e10,
-                                            2.1050209243e10,
-                                            2.1087741277e10,
-                                            2.140495689e10,
-                                            2.1674895344e10,
-                                            2.129635694e10,
-                                            2.1222094469e10,
-                                            2.108505901e10,
-                                            2.1660111564e10,
-                                            2.1389281024e10,
-                                            2.1611482236e10,
-                                            2.1466940439e10,
-                                            2.1287361922e10,
-                                            2.1364654963e10,
-                                            2.1199132878e10,
-                                            2.1326807938e10,
-                                            2.1447609392e10,
-                                            2.2037961513e10,
-                                            2.142405292e10,
-                                            2.1286263351e10,
-                                            2.1990491197e10,
-                                            2.1482849605e10,
-                                            2.1164038182e10,
-                                            2.079518548e10,
-                                            2.126873883e10,
-                                            2.1411215523e10,
-                                            2.1295914023e10,
-                                            2.1190139946e10,
-                                            2.0902865278e10,
-                                            2.1491691028e10,
-                                            2.1334090493e10,
-                                            2.0610763454e10,
-                                            2.0818827168e10,
-                                            2.2233566715e10,
-                                            2.0669388741e10,
-                                            1.9453333387e10,
-                                            1.9431037635e10,
-                                            1.9578726548e10,
-                                            1.9741413294e10,
-                                            1.9700711839e10,
-                                            2.0273015536e10,
-                                            1.9950670205e10,
-                                            1.9633260654e10,
-                                            1.9837290753e10,
-                                            2.0314635685e10,
-                                            2.039860576e10,
-                                            1.9990320184e10
-                                        ]
-                                    }
-                                ]
-                            },
-                            "tags": [
-                                "GPUInterpreter"
-                            ]
-                        }
-                    ]
-				},
-				"tags": []
-			}
-		]
-	]
-]
--- a/package/test/benchmarks/3/cpu.json
+++ b/package/test/benchmarks/3/cpu.json
@ -1 +0,0 @@
-[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[5.9810994e7,6.4737628e7,6.6129743e7,7.1565506e7,6.7670658e7,5.9069063e7,6.4987093e7,6.9015313e7,6.1821987e7,6.232688e7,6.9091327e7,6.0481241e7,6.7083905e7,7.1568874e7,6.4126918e7,6.656359e7,6.0971912e7,6.6050458e7,6.4490748e7,6.3792946e7,6.387013e7,6.2149826e7,6.5226883e7,5.8717054e7,7.0228008e7,6.4004441e7,6.179879e7,6.5803149e7,6.7778738e7,6.7530531e7,6.3874846e7,6.5191925e7,6.3458451e7,6.2795489e7,6.1214158e7,6.3242098e7,6.0904665e7,6.1067523e7,6.4187211e7,5.9758454e7,6.3188528e7,6.2703208e7,6.3179623e7,6.6383934e7,6.4153586e7,6.1124868e7,6.1729561e7,6.4057874e7,6.2238357e7,6.7185346e7],"memory":1092256904,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[4.7279143479e10,4.7935284564e10,4.7889470924e10,4.8090224769e10,4.9154174752e10,4.8234789351e10,4.8104334398e10,4.8399709732e10,4.7838902153e10,4.7903811218e10,4.8003267026e10,4.7752267096e10,5.0271042e10,5.2827474041e10,5.1150949627e10,5.3890859024e10,5.0334709301e10,4.8277583693e10,4.8826727675e10,4.8699721506e10,4.8785042007e10,4.8277983231e10,4.814134015e10,4.8104634658e10,4.9149760213e10,4.8564998255e10,4.8300117448e10,4.8526373086e10,4.8889779772e10,4.8001705803e10,4.7925610954e10,4.8209726338e10,4.8102811977e10,4.8159213161e10,4.816676277e10,4.8356507356e10,4.8464023297e10,4.8347214632e10,4.8467268775e10,4.8034119608e10,4.88565184e10,4.8690518925e10,4.8235448799e10,4.8346337545e10,4.8627889423e10,4.784124779e10,4.8095176162e10,4.869052391e10,4.7806194068e10,4.8009508502e10]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]
--- a/package/test/benchmarks/3/cpu_benchmark4_3620_var_sets.json
+++ b/package/test/benchmarks/3/cpu_benchmark4_3620_var_sets.json
@ -1 +0,0 @@
-[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":1070928,"gctimes":[7.0737156e7,7.612373e7,7.0319845e7,7.4023793e7,6.9487953e7,7.13138e7,6.7919623e7,6.9790257e7,6.8884079e7,7.721969e7,7.1838152e7,7.1771265e7,7.4864423e7,7.117193e7,7.3785755e7,7.2381003e7,7.1546295e7,7.2286584e7,6.6459785e7,6.8890027e7,7.1509224e7,6.8358627e7,6.727323e7,7.4713054e7,6.8894051e7,7.4541532e7,6.9580992e7,7.2921699e7,6.9328135e7,7.0329552e7,7.5108206e7,7.0075534e7,7.0016025e7,6.7855751e7,7.1709092e7,6.9724248e7,7.0715959e7,7.0777192e7,6.8408832e7,6.9834693e7,6.8831404e7,6.5682229e7,6.8860642e7,7.0180588e7,7.5755869e7,6.7473844e7,7.511939e7,6.9218764e7,6.958201e7,7.1617631e7],"memory":794432648,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[1.6299104854e10,1.6167206829e10,1.6494236203e10,1.6947550778e10,1.7034780856e10,1.7021231819e10,1.6932198004e10,1.6876549898e10,1.7079182771e10,1.7187766638e10,1.7102805204e10,1.7039326747e10,1.691889137e10,1.7042592496e10,1.6973888675e10,1.6948724634e10,1.7391081318e10,1.7129440584e10,1.6927534357e10,1.6966924724e10,1.7041711187e10,1.7245124816e10,1.6984027892e10,1.7099565673e10,1.6911699936e10,1.7139973297e10,1.6953329392e10,1.7090684047e10,1.6975681606e10,1.6964116346e10,1.7015161121e10,1.6879788824e10,1.6915966142e10,1.6996507916e10,1.7192698411e10,1.6972989288e10,1.7019713067e10,1.7042324004e10,1.7166773762e10,1.7270761002e10,1.6941653675e10,1.6944168651e10,1.7034265554e10,1.7108598363e10,1.7005234284e10,1.6827084022e10,1.7000687966e10,1.7102413049e10,1.7094021963e10,1.7128389177e10]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]
--- a/package/test/benchmarks/3/gpu.json
+++ b/package/test/benchmarks/3/gpu.json
@ -1,292 +0,0 @@
-[
-	{
-		"Julia": "1.11.5",
-		"BenchmarkTools": {
-			"major": 1,
-			"minor": 6,
-			"patch": 0,
-			"prerelease": [],
-			"build": []
-		}
-	},
-	[
-		[
-			"BenchmarkGroup",
-			{
-				"data": {
-					"GPUT": [
-						"BenchmarkGroup",
-						{
-							"data": {
-								"nikuradse_1": [
-									"Trial",
-									{
-										"allocs": 27549794,
-										"gctimes": [
-											7.34988931e8,
-											5.41494997e8,
-											4.54013175e8,
-											4.35208291e8,
-											4.3231789e8,
-											4.55546184e8,
-											4.23418621e8,
-											4.50430938e8,
-											4.57438035e8,
-											4.40032177e8,
-											4.44249114e8,
-											4.59505029e8,
-											4.68161721e8,
-											4.78667113e8,
-											4.41616067e8,
-											4.5551461e8,
-											4.75652448e8,
-											4.72338385e8,
-											4.47779781e8,
-											4.52755333e8,
-											4.76158081e8,
-											4.48737222e8,
-											4.55761564e8,
-											4.39574521e8,
-											4.86435134e8,
-											4.43170348e8,
-											4.33731271e8,
-											4.61921334e8,
-											4.37434039e8,
-											4.59409079e8,
-											4.36341634e8,
-											4.71427401e8,
-											4.31984388e8,
-											4.59200269e8,
-											4.52769327e8,
-											4.44261215e8,
-											4.61363275e8,
-											4.61565013e8,
-											4.48557831e8,
-											4.85488793e8,
-											4.4128917e8,
-											4.7205662e8,
-											4.55980625e8,
-											4.49702326e8,
-											4.57778953e8,
-											4.52225066e8,
-											4.53744762e8,
-											4.61079024e8,
-											4.47186032e8,
-											4.51833021e8
-										],
-										"memory": 67507887608,
-										"params": [
-											"Parameters",
-											{
-												"gctrial": true,
-												"time_tolerance": 0.05,
-												"evals_set": false,
-												"samples": 50,
-												"evals": 1,
-												"gcsample": false,
-												"seconds": 43200.0,
-												"overhead": 0.0,
-												"memory_tolerance": 0.01
-											}
-										],
-										"times": [
-											2.4717936323e10,
-											2.4983074984e10,
-											2.3139017877e10,
-											2.4848874137e10,
-											2.5056845586e10,
-											2.547690064e10,
-											2.4976535335e10,
-											2.5575731567e10,
-											2.5140349264e10,
-											2.5896177615e10,
-											2.5501376819e10,
-											2.5327110754e10,
-											2.5409913851e10,
-											2.6295037648e10,
-											2.4355540157e10,
-											2.4657706641e10,
-											2.5952612569e10,
-											2.5854856758e10,
-											2.5568112399e10,
-											2.5490261014e10,
-											2.5160759326e10,
-											2.6260268676e10,
-											2.5242980231e10,
-											2.5638644329e10,
-											2.3768975772e10,
-											2.5146122285e10,
-											2.5682055949e10,
-											2.5237286107e10,
-											2.5496022078e10,
-											2.5568661702e10,
-											2.4330249484e10,
-											2.5685686423e10,
-											2.5250886166e10,
-											2.5401607442e10,
-											2.5564544027e10,
-											2.5868746223e10,
-											2.5977606065e10,
-											2.5405825803e10,
-											2.4619705069e10,
-											2.4325894725e10,
-											2.566709978e10,
-											2.5400372207e10,
-											2.5148598725e10,
-											2.5256329818e10,
-											2.5236091538e10,
-											2.602685786e10,
-											2.5430861304e10,
-											2.5972127622e10,
-											2.3654688411e10,
-											2.605084424e10
-										]
-									}
-								]
-							},
-							"tags": [
-								"GPUTranspiler"
-							]
-						}
-					],
-					"GPUI": [
-						"BenchmarkGroup",
-						{
-							"data": {
-								"nikuradse_1": [
-									"Trial",
-									{
-										"allocs": 32243751,
-										"gctimes": [
-											5.41994011e8,
-											5.74350603e8,
-											4.90525664e8,
-											5.92143868e8,
-											6.56922572e8,
-											6.38722256e8,
-											5.51324211e8,
-											5.94380581e8,
-											5.65880356e8,
-											5.30293176e8,
-											6.75544373e8,
-											5.91556404e8,
-											5.30953191e8,
-											5.73477234e8,
-											5.07802986e8,
-											6.71908957e8,
-											4.58611495e8,
-											5.34383897e8,
-											4.35307473e8,
-											4.25796027e8,
-											4.26650755e8,
-											5.43969839e8,
-											4.62966279e8,
-											5.62772957e8,
-											5.61112059e8,
-											5.21608844e8,
-											4.29687492e8,
-											5.3098919e8,
-											4.18511386e8,
-											5.51285144e8,
-											6.36456452e8,
-											5.80375968e8,
-											4.90520531e8,
-											5.72019977e8,
-											5.16803925e8,
-											5.31636535e8,
-											4.88470453e8,
-											4.57291468e8,
-											4.63585061e8,
-											6.75995209e8,
-											4.47446015e8,
-											4.21505932e8,
-											4.63417339e8,
-											6.17901021e8,
-											5.04952063e8,
-											6.3799233e8,
-											4.34554313e8,
-											6.24205134e8,
-											6.1699824e8,
-											5.4327705e8
-										],
-										"memory": 45874268472,
-										"params": [
-											"Parameters",
-											{
-												"gctrial": true,
-												"time_tolerance": 0.05,
-												"evals_set": false,
-												"samples": 50,
-												"evals": 1,
-												"gcsample": false,
-												"seconds": 43200.0,
-												"overhead": 0.0,
-												"memory_tolerance": 0.01
-											}
-										],
-										"times": [
-											3.07178374e10,
-											3.0668015775e10,
-											3.0731090373e10,
-											3.0442775184e10,
-											3.0456642482e10,
-											3.0082122734e10,
-											3.0126331654e10,
-											3.0751723908e10,
-											3.1179628532e10,
-											3.0065663574e10,
-											3.0464515622e10,
-											3.0393855038e10,
-											3.1635622751e10,
-											3.0447222014e10,
-											2.973601985e10,
-											3.0033623194e10,
-											3.0580015719e10,
-											3.1400733412e10,
-											3.0272328646e10,
-											3.0223853837e10,
-											2.9915814997e10,
-											3.0818324531e10,
-											3.0179331592e10,
-											3.0293039282e10,
-											3.0017377964e10,
-											3.0087189496e10,
-											3.0582174914e10,
-											2.996325235e10,
-											3.0134649182e10,
-											3.1042223141e10,
-											3.0007740363e10,
-											3.0437426607e10,
-											3.0810836436e10,
-											3.1234163757e10,
-											3.0221879009e10,
-											3.0338940936e10,
-											3.1233683944e10,
-											3.1019897889e10,
-											3.1380379599e10,
-											2.9821214171e10,
-											3.0882968215e10,
-											3.0159994975e10,
-											3.0309932542e10,
-											2.9969275606e10,
-											3.0447151474e10,
-											3.0342592912e10,
-											3.024330255e10,
-											3.0258060029e10,
-											3.0095601739e10,
-											3.0209601692e10
-										]
-									}
-								]
-							},
-							"tags": [
-								"GPUInterpreter"
-							]
-						}
-					]
-				},
-				"tags": []
-			}
-		]
-	]
-]
--- a/package/test/benchmarks/3/gpu_i_160_t_128_blocksize__worse-than-switching-blocksizes.json
+++ b/package/test/benchmarks/3/gpu_i_160_t_128_blocksize__worse-than-switching-blocksizes.json
@ -1,292 +0,0 @@
-[
-	{
-		"Julia": "1.11.5",
-		"BenchmarkTools": {
-			"major": 1,
-			"minor": 6,
-			"patch": 0,
-			"prerelease": [],
-			"build": []
-		}
-	},
-	[
-		[
-			"BenchmarkGroup",
-			{
-				"data": {
-					"GPUT": [
-                        "BenchmarkGroup",
-                        {
-                            "data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 27549909,
-                                        "gctimes": [
-                                            1.796502723e9,
-                                            7.72059865e8,
-                                            3.94563446e8,
-                                            4.27997326e8,
-                                            4.06964911e8,
-                                            4.08277194e8,
-                                            4.02770711e8,
-                                            4.11141922e8,
-                                            4.07309952e8,
-                                            4.12815766e8,
-                                            4.13257433e8,
-                                            4.11708235e8,
-                                            4.06349416e8,
-                                            4.14353433e8,
-                                            4.05742826e8,
-                                            4.09829039e8,
-                                            4.02646084e8,
-                                            4.01623866e8,
-                                            4.11190055e8,
-                                            4.11476122e8,
-                                            4.07361638e8,
-                                            4.07028467e8,
-                                            4.11106781e8,
-                                            4.26360821e8,
-                                            4.07521363e8,
-                                            4.07228793e8,
-                                            4.09025385e8,
-                                            4.21241253e8,
-                                            4.1859973e8,
-                                            4.2067553e8,
-                                            4.00959317e8,
-                                            4.16666312e8,
-                                            4.10104406e8,
-                                            4.18910797e8,
-                                            4.05213147e8,
-                                            4.16627063e8,
-                                            4.1920481e8,
-                                            4.54088613e8,
-                                            4.39532553e8,
-                                            4.13238829e8,
-                                            4.14822338e8,
-                                            4.11867383e8,
-                                            4.15005572e8,
-                                            4.11339915e8,
-                                            4.1448983e8,
-                                            4.17699043e8,
-                                            4.16447232e8,
-                                            4.1597287e8,
-                                            4.14369912e8,
-                                            4.19276762e8
-                                        ],
-                                        "memory": 67507887480,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            3.9931587632e10,
-                                            3.8962332239e10,
-                                            2.6658724209e10,
-                                            2.769671872e10,
-                                            2.6617417291e10,
-                                            2.6695278116e10,
-                                            2.6389594847e10,
-                                            2.6500758348e10,
-                                            2.6314618692e10,
-                                            2.6869478695e10,
-                                            2.6596999781e10,
-                                            2.6195296634e10,
-                                            2.6321536967e10,
-                                            2.676203466e10,
-                                            2.6810603797e10,
-                                            2.6754603343e10,
-                                            2.6616260783e10,
-                                            2.7015249577e10,
-                                            2.621089281e10,
-                                            2.565195064e10,
-                                            2.4093609228e10,
-                                            2.6872052438e10,
-                                            2.6312874968e10,
-                                            2.6567674382e10,
-                                            2.6188371615e10,
-                                            2.6627277961e10,
-                                            2.6351801318e10,
-                                            2.6764821332e10,
-                                            2.658020325e10,
-                                            2.6845009549e10,
-                                            2.6127450384e10,
-                                            2.6523726565e10,
-                                            2.6221363227e10,
-                                            2.542875719e10,
-                                            2.6885440863e10,
-                                            2.7207730806e10,
-                                            2.770831496e10,
-                                            2.7896929881e10,
-                                            2.7711770473e10,
-                                            2.6842628626e10,
-                                            2.4898863927e10,
-                                            2.6687932301e10,
-                                            2.6503076469e10,
-                                            2.655039632e10,
-                                            2.708347459e10,
-                                            2.5440628322e10,
-                                            2.6279933326e10,
-                                            2.7371915793e10,
-                                            2.6695784917e10,
-                                            2.7225562291e10
-                                        ]
-                                    }
-                                ]
-                            },
-                            "tags": [
-                                "GPUTranspiler"
-                            ]
-                        }
-                    ],
-					"GPUI": [
-                        "BenchmarkGroup",
-                        {
-                            "data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 32241320,
-                                        "gctimes": [
-                                            3.76843873e8,
-                                            3.87520681e8,
-                                            3.53674001e8,
-                                            3.67061252e8,
-                                            3.741527e8,
-                                            3.69293996e8,
-                                            3.63305802e8,
-                                            3.61913634e8,
-                                            3.51818682e8,
-                                            3.48188601e8,
-                                            3.62864887e8,
-                                            3.47736729e8,
-                                            3.50237523e8,
-                                            3.53595403e8,
-                                            3.51245475e8,
-                                            3.57725399e8,
-                                            3.48667085e8,
-                                            3.5174771e8,
-                                            3.50159541e8,
-                                            3.57487652e8,
-                                            3.61893033e8,
-                                            3.67797485e8,
-                                            3.44948035e8,
-                                            3.50222654e8,
-                                            3.36037781e8,
-                                            3.50770955e8,
-                                            3.48655148e8,
-                                            3.46508038e8,
-                                            3.48958873e8,
-                                            4.49202169e8,
-                                            3.53247995e8,
-                                            3.71504213e8,
-                                            3.5431637e8,
-                                            3.59468716e8,
-                                            3.46016454e8,
-                                            3.69149583e8,
-                                            3.65486404e8,
-                                            4.45340687e8,
-                                            4.37909167e8,
-                                            3.3690913e8,
-                                            3.50482929e8,
-                                            3.49559472e8,
-                                            3.38465639e8,
-                                            3.44654417e8,
-                                            3.49173998e8,
-                                            3.50582847e8,
-                                            3.55724581e8,
-                                            3.4921611e8,
-                                            3.55360179e8,
-                                            3.48805235e8
-                                        ],
-                                        "memory": 45874227656,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            7.3943918395e10,
-                                            7.4070804594e10,
-                                            7.3896520127e10,
-                                            7.4134852923e10,
-                                            7.4229052084e10,
-                                            7.4064320483e10,
-                                            7.3463069111e10,
-                                            7.3918826132e10,
-                                            7.3667157657e10,
-                                            7.3970536289e10,
-                                            7.4355207783e10,
-                                            7.3727364718e10,
-                                            7.384731378e10,
-                                            7.4005447387e10,
-                                            7.4051183283e10,
-                                            7.3985867593e10,
-                                            7.3531459498e10,
-                                            7.3479080625e10,
-                                            7.5207069603e10,
-                                            7.4365038661e10,
-                                            7.3929205754e10,
-                                            7.4276829344e10,
-                                            7.4038629545e10,
-                                            7.4778589402e10,
-                                            7.4428735243e10,
-                                            7.3981806593e10,
-                                            7.3927279144e10,
-                                            7.3861975856e10,
-                                            7.3529711339e10,
-                                            7.3747593328e10,
-                                            7.4109278095e10,
-                                            7.421203285e10,
-                                            7.3915105894e10,
-                                            7.3744032137e10,
-                                            7.4102811619e10,
-                                            7.4106619627e10,
-                                            7.3922721844e10,
-                                            7.4218465669e10,
-                                            7.4356041135e10,
-                                            7.4323162031e10,
-                                            7.3943656925e10,
-                                            7.4352507972e10,
-                                            7.4394224103e10,
-                                            7.4250996553e10,
-                                            7.3976550142e10,
-                                            7.4218926316e10,
-                                            7.4574530318e10,
-                                            7.4235191697e10,
-                                            7.4346408894e10,
-                                            7.474792626e10
-                                        ]
-                                    }
-                                ]
-                            },
-                            "tags": [
-                                "GPUInterpreter"
-                            ]
-                        }
-                    ]
-				},
-				"tags": []
-			}
-		]
-	]
-]
--- a/package/test/benchmarks/3/gpui_blocksize_192_much_worse_than_128.json
+++ b/package/test/benchmarks/3/gpui_blocksize_192_much_worse_than_128.json
@ -1,196 +0,0 @@
-[
-	{
-		"Julia": "1.11.5",
-		"BenchmarkTools": {
-			"major": 1,
-			"minor": 6,
-			"patch": 0,
-			"prerelease": [],
-			"build": []
-		}
-	},
-	[
-		[
-			"BenchmarkGroup",
-			{
-				"data": {
-					"GPUT": [
-						"BenchmarkGroup",
-						{
-							"data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 1534112879,
-                                        "gctimes": [
-                                            3.398826747854e12,
-                                            2.618070795579e12
-                                        ],
-                                        "memory": 51380857328968,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            3.7202049569362e13,
-                                            3.7400159760069e13
-                                        ]
-                                    }
-                                ]
-                            },
-							"tags": [
-								"GPUTranspiler"
-							]
-						}
-					],
-					"GPUI": [
-                        "BenchmarkGroup",
-                        {
-                            "data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 32241307,
-                                        "gctimes": [
-                                            2.99988451e8,
-                                            3.18541335e8,
-                                            3.40658917e8,
-                                            3.20735576e8,
-                                            3.17668135e8,
-                                            3.11634185e8,
-                                            3.55400831e8,
-                                            3.25257947e8,
-                                            3.25941878e8,
-                                            3.31627658e8,
-                                            3.2513644e8,
-                                            5.34886621e8,
-                                            4.30305899e8,
-                                            4.75073379e8,
-                                            5.41262095e8,
-                                            5.14748243e8,
-                                            4.91966069e8,
-                                            4.55043676e8,
-                                            4.70840046e8,
-                                            5.50526217e8,
-                                            4.31207494e8,
-                                            4.76072811e8,
-                                            5.04324319e8,
-                                            5.72218216e8,
-                                            4.11391335e8,
-                                            4.73366047e8,
-                                            5.12748251e8,
-                                            4.58269866e8,
-                                            3.87267173e8,
-                                            5.38187011e8,
-                                            4.56822334e8,
-                                            4.24688896e8,
-                                            5.94190171e8,
-                                            5.28701852e8,
-                                            5.15021748e8,
-                                            6.10057318e8,
-                                            4.74982584e8,
-                                            4.33478296e8,
-                                            4.33664662e8,
-                                            4.22168618e8,
-                                            4.16528265e8,
-                                            4.15685104e8,
-                                            4.23277232e8,
-                                            3.74337751e8,
-                                            4.25875703e8,
-                                            5.42365157e8,
-                                            4.94701466e8,
-                                            4.83233782e8,
-                                            4.24986417e8,
-                                            4.8780606e8
-                                        ],
-                                        "memory": 45874227384,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            3.055626804e10,
-                                            3.0413771477e10,
-                                            3.0058609633e10,
-                                            3.007921294e10,
-                                            3.0178903964e10,
-                                            3.0243374529e10,
-                                            3.0043488197e10,
-                                            2.9849309299e10,
-                                            3.0134058306e10,
-                                            3.0627343705e10,
-                                            3.0130179115e10,
-                                            4.8987140933e10,
-                                            1.0029494223e11,
-                                            9.991837876e10,
-                                            1.01083284461e11,
-                                            1.00013926981e11,
-                                            1.00050439359e11,
-                                            1.00453826906e11,
-                                            1.00398291414e11,
-                                            1.0026599822e11,
-                                            1.00645806674e11,
-                                            9.9875971997e10,
-                                            9.9612950384e10,
-                                            1.00253673473e11,
-                                            9.9643175894e10,
-                                            1.0027620915e11,
-                                            9.9714066248e10,
-                                            1.00141668213e11,
-                                            1.00269405678e11,
-                                            1.00149909912e11,
-                                            1.00645303739e11,
-                                            9.9693734213e10,
-                                            1.01986856167e11,
-                                            1.00367529986e11,
-                                            9.986664487e10,
-                                            1.01112512248e11,
-                                            9.9866828996e10,
-                                            9.887153973e10,
-                                            9.9119068947e10,
-                                            9.9161506987e10,
-                                            9.8659948079e10,
-                                            9.9016722639e10,
-                                            9.9226347837e10,
-                                            9.9361219392e10,
-                                            9.9532328849e10,
-                                            9.9181660704e10,
-                                            9.9525871099e10,
-                                            9.877397928e10,
-                                            9.8880425186e10,
-                                            9.9195828801e10
-                                        ]
-                                    }
-                                ]
-                            },
-                            "tags": [
-                                "GPUInterpreter"
-                            ]
-                        }
-                    ]
-				},
-				"tags": []
-			}
-		]
-	]
-]
--- a/package/test/data/esr_nvar2_len10.txt.gz_5.txt/esr_nvar2_len10.txt.gz_5.txt
+++ b/package/test/data/esr_nvar2_len10.txt.gz_5.txt/esr_nvar2_len10.txt.gz_5.txt
@ -1,494 +0,0 @@
-inv(inv(x1) - x1)
-inv(inv(x2) - x2)
-inv(inv(x1) - x2)
-inv(inv(x2) - x1)
-inv(inv(x1) + p1)
-inv(inv(x2) + p1)
-inv(inv(x1) + x1)
-inv(inv(x2) + x2)
-inv(inv(x1) + x2)
-inv(inv(x2) + x1)
-inv(x1 - inv(x1))
-inv(x2 - inv(x2))
-inv(x1 - inv(x2))
-inv(x2 - inv(x1))
-abs(x1) ^ -(inv(x1))
-abs(x2) ^ -(inv(x2))
-abs(x1) ^ -(inv(x2))
-abs(x2) ^ -(inv(x1))
-inv(p1 - inv(x1))
-inv(p1 - inv(x2))
-inv(x1) - inv(x2)
-inv(x2) - inv(x1)
-2 / x1
-2 / x2
-inv(x1) + inv(x2)
-inv(x2) + inv(x1)
-x1
-x2
-x1 * -2 + x2
-x2 * -2 + x1
-x1 / (x1 + x1)
-x2 / (x2 + x2)
-x1 / (x2 + x2)
-x2 / (x1 + x1)
-abs(x1) ^ (x1 + x1)
-abs(x2) ^ (x2 + x2)
-abs(x1) ^ (x2 + x2)
-abs(x2) ^ (x1 + x1)
-x1 * -2 + p1
-x2 * -2 + p1
-p1 / (x1 + x1)
-p1 / (x2 + x2)
-x1 / (x1 + x2)
-x2 / (x1 + x2)
-abs(x1) ^ (x1 + x2)
-abs(x2) ^ (x1 + x2)
-p1 - (x1 + x2)
-p1 / (x1 + x2)
-abs(p1) ^ (x1 + x2)
-x1 - sqr(x1)
-x2 - sqr(x2)
-x1 - sqr(x2)
-x2 - sqr(x1)
-x1 / sqr(x1)
-x2 / sqr(x2)
-x1 / sqr(x2)
-x2 / sqr(x1)
-abs(x1) ^ sqr(x1)
-abs(x2) ^ sqr(x2)
-abs(x1) ^ sqr(x2)
-abs(x2) ^ sqr(x1)
-p1 - sqr(x1)
-p1 - sqr(x2)
-p1 / sqr(x1)
-p1 / sqr(x2)
-abs(p1) ^ sqr(x1)
-abs(p1) ^ sqr(x2)
-x1 - x1 * x2
-x2 - x1 * x2
-x1 / (x1 * x2)
-x2 / (x1 * x2)
-abs(x1) ^ (x1 * x2)
-abs(x2) ^ (x1 * x2)
-p1 - x1 * x2
-p1 / (x1 * x2)
-abs(p1) ^ (x1 * x2)
-(x1 - x2) + p1
-(x2 - x1) + p1
-x1 / (x1 + p1)
-x2 / (x2 + p1)
-x1 / (x2 + p1)
-x2 / (x1 + p1)
-abs(x1) ^ (x1 + p1)
-abs(x2) ^ (x2 + p1)
-abs(x1) ^ (x2 + p1)
-abs(x2) ^ (x1 + p1)
-p1 / (x1 + p2)
-p1 / (x2 + p2)
-abs(p1) ^ (x1 + p2)
-abs(p1) ^ (x2 + p2)
-x1 * p1 + x2
-x2 * p1 + x1
-(x1 / x1) * p1
-(x2 / x2) * p1
-(x1 / x2) * p1
-(x2 / x1) * p1
-abs(x1) ^ (x1 * p1)
-abs(x2) ^ (x2 * p1)
-abs(x1) ^ (x2 * p1)
-abs(x2) ^ (x1 * p1)
-x1 * p1 + p2
-x2 * p1 + p2
-x1 * 1//0
-x2 * 1//0
-abs(x1) ^ 0
-abs(x2) ^ 0
-1
-x1 / (x1 - x2)
-x2 / (x2 - x1)
-abs(x1) ^ (x1 - x2)
-abs(x2) ^ (x2 - x1)
-p1 / (x1 - x2)
-p1 / (x2 - x1)
-abs(p1) ^ (x1 - x2)
-abs(p1) ^ (x2 - x1)
-x1 - x1 / x1
-x2 - x2 / x2
-x1 - x2 / x2
-x2 - x1 / x1
-sqr(x1) / x1
-sqr(x2) / x2
-(x1 / x1) * x2
-(x2 / x2) * x1
-abs(x1) ^ (x1 / x1)
-abs(x2) ^ (x2 / x2)
-abs(x1) ^ (x2 / x2)
-abs(x2) ^ (x1 / x1)
-p1 - x1 / x1
-p1 - x2 / x2
-abs(p1) ^ (x1 / x1)
-abs(p1) ^ (x2 / x2)
-x1 - x1 / x2
-x2 - x2 / x1
-(x1 / x2) * x2
-(x2 / x1) * x1
-abs(x1) ^ (x1 / x2)
-abs(x2) ^ (x2 / x1)
-p1 - x1 / x2
-p1 - x2 / x1
-abs(p1) ^ (x1 / x2)
-abs(p1) ^ (x2 / x1)
-x1 - abs(x1) ^ x1
-x2 - abs(x2) ^ x2
-x1 - abs(x2) ^ x2
-x2 - abs(x1) ^ x1
-x1 / abs(x1) ^ x1
-x2 / abs(x2) ^ x2
-x1 / abs(x2) ^ x2
-x2 / abs(x1) ^ x1
-abs(x1) ^ (abs(x1) ^ x1)
-abs(x2) ^ (abs(x2) ^ x2)
-abs(x1) ^ (abs(x2) ^ x2)
-abs(x2) ^ (abs(x1) ^ x1)
-p1 - abs(x1) ^ x1
-p1 - abs(x2) ^ x2
-p1 / abs(x1) ^ x1
-p1 / abs(x2) ^ x2
-abs(p1) ^ (abs(x1) ^ x1)
-abs(p1) ^ (abs(x2) ^ x2)
-x1 - abs(x1) ^ x2
-x2 - abs(x2) ^ x1
-x1 / abs(x1) ^ x2
-x2 / abs(x2) ^ x1
-abs(x1) ^ (abs(x1) ^ x2)
-abs(x2) ^ (abs(x2) ^ x1)
-p1 - abs(x1) ^ x2
-p1 - abs(x2) ^ x1
-p1 / abs(x1) ^ x2
-p1 / abs(x2) ^ x1
-abs(p1) ^ (abs(x1) ^ x2)
-abs(p1) ^ (abs(x2) ^ x1)
-x1 + x1 + p1
-x2 + x2 + p1
-x1 + x2 + p1
-x1 / (p1 - x1)
-x2 / (p1 - x2)
-x1 / (p1 - x2)
-x2 / (p1 - x1)
-abs(x1) ^ (p1 - x1)
-abs(x2) ^ (p1 - x2)
-abs(x1) ^ (p1 - x2)
-abs(x2) ^ (p1 - x1)
-p1 / (p2 - x1)
-p1 / (p2 - x2)
-abs(p1) ^ (p2 - x1)
-abs(p1) ^ (p2 - x2)
-p1 / x1 + x1
-p1 / x2 + x2
-p1 / x1 + x2
-p1 / x2 + x1
-sqr(x1) * p1
-sqr(x2) * p1
-x1 * x2 * p1
-abs(x1) ^ (p1 / x1)
-abs(x2) ^ (p1 / x2)
-abs(x1) ^ (p1 / x2)
-abs(x2) ^ (p1 / x1)
-p1 / x1 + p2
-p1 / x2 + p2
-x1 - abs(p1) ^ x1
-x2 - abs(p1) ^ x2
-x1 - abs(p1) ^ x2
-x2 - abs(p1) ^ x1
-abs(p1) ^ x1 * x1
-abs(p1) ^ x2 * x2
-abs(p1) ^ x1 * x2
-abs(p1) ^ x2 * x1
-abs(x1) ^ (abs(p1) ^ x1)
-abs(x2) ^ (abs(p1) ^ x2)
-abs(x1) ^ (abs(p1) ^ x2)
-abs(x2) ^ (abs(p1) ^ x1)
-p1 - abs(p2) ^ x1
-p1 - abs(p2) ^ x2
-abs(p1) ^ x1 * p2
-abs(p1) ^ x2 * p2
-abs(p1) ^ (abs(p2) ^ x1)
-abs(p1) ^ (abs(p2) ^ x2)
-x1 - abs(x1) ^ p1
-x2 - abs(x2) ^ p1
-x1 - abs(x2) ^ p1
-x2 - abs(x1) ^ p1
-abs(x1) ^ p1 * x1
-abs(x2) ^ p1 * x2
-abs(x1) ^ p1 * x2
-abs(x2) ^ p1 * x1
-abs(x1) ^ (abs(x1) ^ p1)
-abs(x2) ^ (abs(x2) ^ p1)
-abs(x1) ^ (abs(x2) ^ p1)
-abs(x2) ^ (abs(x1) ^ p1)
-p1 - abs(x1) ^ p2
-p1 - abs(x2) ^ p2
-abs(x1) ^ p1 * p2
-abs(x2) ^ p1 * p2
-abs(p1) ^ (abs(x1) ^ p2)
-abs(p1) ^ (abs(x2) ^ p2)
-x1 - abs(p1)
-x2 - abs(p1)
-abs(p1) * x1
-abs(p1) * x2
-abs(x1) ^ abs(p1)
-abs(x2) ^ abs(p1)
-x1 * 3
-x2 * 3
-x1 + x1 + x2
-x1 + x2 + x2
-sqr(x1) * 2
-sqr(x2) * 2
-x1 * x2 * 2
-x1 + x2 + x2
-x1 + x1 + x2
-(x1 + x2) * x1
-(x1 + x2) * x2
-(x1 + x2) * x2
-(x1 + x2) * x1
-sqr(x1) + x1
-sqr(x2) + x2
-sqr(x1) + x2
-sqr(x2) + x1
-sqr(x1) * x1
-sqr(x2) * x2
-sqr(x1) * x2
-sqr(x2) * x1
-x1 * x2 + x1
-x1 * x2 + x2
-x1 * x2 + x2
-x1 * x2 + x1
-(x1 + p1) * x1
-(x2 + p1) * x2
-(x1 + p1) * x2
-(x2 + p1) * x1
-(x1 - x2) + x1
-(x2 - x1) + x2
-(x1 - x2) * x1
-(x2 - x1) * x2
-(x1 - x2) * x2
-(x2 - x1) * x1
-x1 / x1 + x1
-x2 / x2 + x2
-x1 / x1 + x2
-x2 / x2 + x1
-x1 / x2 + x1
-x2 / x1 + x2
-x1 / x2 + x2
-x2 / x1 + x1
-sqr(x1) / x2
-sqr(x2) / x1
-abs(x1) ^ x1 + x1
-abs(x2) ^ x2 + x2
-abs(x1) ^ x1 + x2
-abs(x2) ^ x2 + x1
-abs(x1) ^ x1 * x1
-abs(x2) ^ x2 * x2
-abs(x1) ^ x1 * x2
-abs(x2) ^ x2 * x1
-abs(x1) ^ x2 + x1
-abs(x2) ^ x1 + x2
-abs(x1) ^ x2 + x2
-abs(x2) ^ x1 + x1
-abs(x1) ^ x2 * x1
-abs(x2) ^ x1 * x2
-abs(x1) ^ x2 * x2
-abs(x2) ^ x1 * x1
-(p1 - x1) * x1
-(p1 - x2) * x2
-(p1 - x1) * x2
-(p1 - x2) * x1
-abs(p1) ^ x1 + x1
-abs(p1) ^ x2 + x2
-abs(p1) ^ x1 + x2
-abs(p1) ^ x2 + x1
-abs(x1) ^ p1 + x1
-abs(x2) ^ p1 + x2
-abs(x1) ^ p1 + x2
-abs(x2) ^ p1 + x1
-abs(p1) + x1
-abs(p1) + x2
-(x1 + x2) * p1
-sqr(x1) + p1
-sqr(x2) + p1
-x1 * x2 + p1
-(x1 - x2) * p1
-(x2 - x1) * p1
-x1 / x1 + p1
-x2 / x2 + p1
-x1 / x2 + p1
-x2 / x1 + p1
-abs(x1) ^ x1 + p1
-abs(x2) ^ x2 + p1
-abs(x1) ^ x1 * p1
-abs(x2) ^ x2 * p1
-abs(x1) ^ x2 + p1
-abs(x2) ^ x1 + p1
-abs(x1) ^ x2 * p1
-abs(x2) ^ x1 * p1
-abs(p1) ^ x1 + p2
-abs(p1) ^ x2 + p2
-abs(x1) ^ p1 + p2
-abs(x2) ^ p1 + p2
-(x1 / x1) * 2
-(x2 / x2) * 2
-(x1 / x2) * 2
-(x2 / x1) * 2
-(abs(x1) * 2) ^ x1
-(abs(x2) * 2) ^ x2
-(abs(x1) * 2) ^ x2
-(abs(x2) * 2) ^ x1
-(x1 + x2) / x1
-(x1 + x2) / x2
-(x1 + x2) / x2
-(x1 + x2) / x1
-abs(x1 + x2) ^ x1
-abs(x1 + x2) ^ x2
-abs(x1 + x2) ^ x2
-abs(x1 + x2) ^ x1
-sqr(x1) - x1
-sqr(x2) - x2
-sqr(x1) - x2
-sqr(x2) - x1
-sqr(x1) ^ x1
-sqr(x2) ^ x2
-sqr(x1) ^ x2
-sqr(x2) ^ x1
-x1 * x2 - x1
-x1 * x2 - x2
-x1 * x2 - x2
-x1 * x2 - x1
-abs(x1 * x2) ^ x1
-abs(x1 * x2) ^ x2
-abs(x1 * x2) ^ x2
-abs(x1 * x2) ^ x1
-(x1 + p1) / x1
-(x2 + p1) / x2
-(x1 + p1) / x2
-(x2 + p1) / x1
-abs(x1 + p1) ^ x1
-abs(x2 + p1) ^ x2
-abs(x1 + p1) ^ x2
-abs(x2 + p1) ^ x1
-x1 * p1 - x2
-x2 * p1 - x1
-abs(x1 * p1) ^ x1
-abs(x2 * p1) ^ x2
-abs(x1 * p1) ^ x2
-abs(x2 * p1) ^ x1
-0 ^ x1
-0 ^ x2
-(x1 - x2) / x1
-(x2 - x1) / x2
-(x1 - x2) / x2
-(x2 - x1) / x1
-abs(x1 - x2) ^ x1
-abs(x2 - x1) ^ x2
-abs(x1 - x2) ^ x2
-abs(x2 - x1) ^ x1
-x1 / x1 - x1
-x2 / x2 - x2
-x1 / x1 - x2
-x2 / x2 - x1
-abs(x1 / x1) ^ x1
-abs(x2 / x2) ^ x2
-abs(x1 / x1) ^ x2
-abs(x2 / x2) ^ x1
-x1 / x2 - x1
-x2 / x1 - x2
-x1 / x2 - x2
-x2 / x1 - x1
-abs(x1 / x2) ^ x1
-abs(x2 / x1) ^ x2
-abs(x1 / x2) ^ x2
-abs(x2 / x1) ^ x1
-abs(x1) ^ x1 - x1
-abs(x2) ^ x2 - x2
-abs(x1) ^ x1 - x2
-abs(x2) ^ x2 - x1
-abs(x1) ^ x1 / x1
-abs(x2) ^ x2 / x2
-abs(x1) ^ x1 / x2
-abs(x2) ^ x2 / x1
-(abs(x1) ^ x1) ^ x1
-(abs(x2) ^ x2) ^ x2
-(abs(x1) ^ x1) ^ x2
-(abs(x2) ^ x2) ^ x1
-abs(x1) ^ x2 - x1
-abs(x2) ^ x1 - x2
-abs(x1) ^ x2 - x2
-abs(x2) ^ x1 - x1
-abs(x1) ^ x2 / x1
-abs(x2) ^ x1 / x2
-abs(x1) ^ x2 / x2
-abs(x2) ^ x1 / x1
-(abs(x1) ^ x2) ^ x1
-(abs(x2) ^ x1) ^ x2
-(abs(x1) ^ x2) ^ x2
-(abs(x2) ^ x1) ^ x1
-(p1 - x1) / x1
-(p1 - x2) / x2
-(p1 - x1) / x2
-(p1 - x2) / x1
-p1 / x1 - x1
-p1 / x2 - x2
-p1 / x1 - x2
-p1 / x2 - x1
-abs(p1 / x1) ^ x1
-abs(p1 / x2) ^ x2
-abs(p1 / x1) ^ x2
-abs(p1 / x2) ^ x1
-abs(p1) ^ x1 - x1
-abs(p1) ^ x2 - x2
-abs(p1) ^ x1 - x2
-abs(p1) ^ x2 - x1
-abs(p1) ^ x1 / x1
-abs(p1) ^ x2 / x2
-abs(p1) ^ x1 / x2
-abs(p1) ^ x2 / x1
-abs(x1) ^ p1 - x1
-abs(x2) ^ p1 - x2
-abs(x1) ^ p1 - x2
-abs(x2) ^ p1 - x1
-abs(x1) ^ p1 / x1
-abs(x2) ^ p1 / x2
-abs(x1) ^ p1 / x2
-abs(x2) ^ p1 / x1
-(abs(x1) ^ p1) ^ x1
-(abs(x2) ^ p1) ^ x2
-(abs(x1) ^ p1) ^ x2
-(abs(x2) ^ p1) ^ x1
-abs(p1) - x1
-abs(p1) - x2
-abs(p1) / x1
-abs(p1) / x2
-(abs(x1) * 2) ^ p1
-(abs(x2) * 2) ^ p1
-abs(x1 + x2) ^ p1
-sqr(x1) ^ p1
-sqr(x2) ^ p1
-abs(x1 * x2) ^ p1
-abs(x1 + p1) ^ p2
-abs(x2 + p1) ^ p2
-abs(x1 * p1) ^ p2
-abs(x2 * p1) ^ p2
-abs(x1 - x2) ^ p1
-abs(x2 - x1) ^ p1
-abs(x1 / x1) ^ p1
-abs(x2 / x2) ^ p1
-abs(x1 / x2) ^ p1
-abs(x2 / x1) ^ p1
-(abs(x1) ^ x1) ^ p1
-(abs(x2) ^ x2) ^ p1
-(abs(x1) ^ x2) ^ p1
-(abs(x2) ^ x1) ^ p1
-abs(p1 / x1) ^ p2
-abs(p1 / x2) ^ p2
-(abs(x1) ^ p1) ^ p2
-(abs(x2) ^ p1) ^ p2
--- a/package/test/data/esr_nvar2_len10.txt.gz_7.txt/esr_nvar2_len10.txt.gz_7.txt
+++ b/package/test/data/esr_nvar2_len10.txt.gz_7.txt/esr_nvar2_len10.txt.gz_7.txt
--- a/package/test/data/esr_nvar2_len10.txt.gz_9.txt/esr_nvar2_len10.txt.gz_9.txt
+++ b/package/test/data/esr_nvar2_len10.txt.gz_9.txt/esr_nvar2_len10.txt.gz_9.txt
--- a/package/test/data/nikuradse_2.csv
+++ b/package/test/data/nikuradse_2.csv
--- a/package/test/results-fh-new/3-interpreter-smaller-stack-less-var-allocations-both-unsafe_free.json
+++ b/package/test/results-fh-new/3-interpreter-smaller-stack-less-var-allocations-both-unsafe_free.json
@ -1,196 +0,0 @@
-[
-	{
-		"Julia": "1.11.5",
-		"BenchmarkTools": {
-			"major": 1,
-			"minor": 6,
-			"patch": 0,
-			"prerelease": [],
-			"build": []
-		}
-	},
-	[
-		[
-			"BenchmarkGroup",
-			{
-				"data": {
-					"GPUT": [
-						"BenchmarkGroup",
-						{
-							"data": {
-                                "nikuradse_1": [
-                                    "Trial",
-                                    {
-                                        "allocs": 1534112879,
-                                        "gctimes": [
-                                            3.398826747854e12,
-                                            2.618070795579e12
-                                        ],
-                                        "memory": 51380857328968,
-                                        "params": [
-                                            "Parameters",
-                                            {
-                                                "gctrial": true,
-                                                "time_tolerance": 0.05,
-                                                "evals_set": false,
-                                                "samples": 50,
-                                                "evals": 1,
-                                                "gcsample": false,
-                                                "seconds": 43200.0,
-                                                "overhead": 0.0,
-                                                "memory_tolerance": 0.01
-                                            }
-                                        ],
-                                        "times": [
-                                            3.7202049569362e13,
-                                            3.7400159760069e13
-                                        ]
-                                    }
-                                ]
-                            },
-							"tags": [
-								"GPUTranspiler"
-							]
-						}
-					],
-					"GPUI": [
-						"BenchmarkGroup",
-						{
-							"data": {
-								"nikuradse_1": [
-									"Trial",
-									{
-										"allocs": 768766234,
-										"gctimes": [
-											9.039427718e9,
-											9.064446832e9,
-											9.800666936e9,
-											1.0827322595e10,
-											8.183176119e9,
-											1.0336680452e10,
-											1.2123016536e10,
-											1.1144637536e10,
-											1.1608950879e10,
-											8.957069847e9,
-											1.6269942403e10,
-											1.4918376698e10,
-											1.4251938232e10,
-											1.2206537223e10,
-											9.651032299e9,
-											8.903295497e9,
-											1.0685161605e10,
-											1.3667059513e10,
-											9.290015888e9,
-											9.461223008e9,
-											8.563242328e9,
-											9.004616808e9,
-											1.1567444604e10,
-											1.4886979643e10,
-											1.1748297074e10,
-											1.0925963713e10,
-											1.1739338325e10,
-											1.2370751697e10,
-											9.841839527e9,
-											1.0294011249e10,
-											1.0448009806e10,
-											1.0032240935e10,
-											1.0339378214e10,
-											1.0181439573e10,
-											1.0002432745e10,
-											1.024672632e10,
-											1.0288169821e10,
-											9.9328892e9,
-											9.691621257e9,
-											1.0178716919e10,
-											9.874193006e9,
-											1.0230965657e10,
-											9.986166398e9,
-											1.0348837109e10,
-											9.905019212e9,
-											1.0229049781e10,
-											1.0217382544e10,
-											9.984211393e9,
-											1.085035782e10,
-											9.611515998e9
-										],
-										"memory": 54082704216,
-										"params": [
-											"Parameters",
-											{
-												"gctrial": true,
-												"time_tolerance": 0.05,
-												"evals_set": false,
-												"samples": 50,
-												"evals": 1,
-												"gcsample": false,
-												"seconds": 43200.0,
-												"overhead": 0.0,
-												"memory_tolerance": 0.01
-											}
-										],
-										"times": [
-											4.59689343211e11,
-											4.58928721202e11,
-											4.58790866806e11,
-											4.57001612541e11,
-											4.18694344791e11,
-											4.51768004064e11,
-											4.72273439611e11,
-											4.71801815498e11,
-											4.7151631773e11,
-											4.59466410568e11,
-											4.78707344875e11,
-											4.57553935546e11,
-											4.83383119184e11,
-											4.93000010286e11,
-											4.73094508424e11,
-											4.61605014711e11,
-											4.5350924569e11,
-											4.60262826899e11,
-											4.89557260771e11,
-											4.9072202667e11,
-											5.01206569571e11,
-											4.98388682969e11,
-											4.97754134578e11,
-											4.77393384636e11,
-											4.86333985432e11,
-											4.95544193592e11,
-											4.61734198363e11,
-											4.65888337953e11,
-											4.62496887686e11,
-											4.6684460331e11,
-											4.67632785813e11,
-											4.6746114379e11,
-											4.66166811424e11,
-											4.66344731528e11,
-											4.67420138865e11,
-											4.6812935133e11,
-											4.67987294196e11,
-											4.67112396022e11,
-											4.65770084163e11,
-											4.673875228e11,
-											4.63872430175e11,
-											4.62557110467e11,
-											4.64486258696e11,
-											4.67577200165e11,
-											4.65189110368e11,
-											4.64529885356e11,
-											4.61770978471e11,
-											4.63199044468e11,
-											4.61538097167e11,
-											4.61694021731e11
-										]
-									}
-								]
-							},
-							"tags": [
-								"GPUInterpreter"
-							]
-						}
-					]
-				},
-				"tags": []
-			}
-		]
-	]
-]
--- a/thesis/chapters/conceptdesign.tex
+++ b/thesis/chapters/conceptdesign.tex
@ -1,7 +1,9 @@
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here
-To be able to determine whether evaluating mathematical expressions on the GPU is better suited than on the CPU, two prototypes need to be implemented. More specifically, a prototype for interpreting these expressions on the GPU, as well as a prototype that transpiles expressions into PTX code that can be executed by the GPU. The goal of this chapter, is to describe how these two prototypes can be implemented conceptually. First the requirements for the prototypes as well as the data they operate on are explained. This is followed by the design of the interpreter and the transpiler. The CPU interpreter will not be described, as it already exists.
+To be able to determine whether evaluating mathematical expressions on the GPU is better suited than on the CPU, two prototypes need to be implemented. More specifically, one prototype interprets these expressions on the GPU, while the other transpiles expressions into code that can be executed by the GPU. The goal of this chapter is to describe how these two prototypes can be implemented conceptually. First, the requirements for the prototypes as well as the data they operate on are explained. This is followed by the design of the interpreter and the transpiler. The CPU interpreter will not be described, as it already exists.
+
+% TODO: maybe describe CPU interpreter too? We will see

 \section[Requirements]{Requirements and Data}
 The main goal of both prototypes or evaluators is to provide a speed-up compared to the CPU interpreter already in use. However, it is also important to determine which evaluator provides the most speed-up. This also means that if one of the evaluators is faster, it is intended to replace the CPU interpreter. Consequently, they must have similar capabilities and meet the following requirements:
@ -9,10 +11,10 @@ The main goal of both prototypes or evaluators is to provide a speed-up compared
 \begin{itemize}
 	\item Multiple expressions as input.
 	\item All input expressions have the same number of variables ($x_n$), but can have a different number of parameters ($p_n$).
-	\item The variables are parametrised using a matrix of the form $k \times N$, where $k$ is the number of variables in the expressions and $N$ is the number of data points. This matrix is the same for all expressions. 
+	\item The variables are parametrised using a matrix of the form $k \times N$, where $k$ is the number of variables in the expressions and $N$ is the number of different parametrisations for the variables. This matrix is the same for all expressions. 
 	\item The parameters are parametrised using a vector of vectors. Each vector $v_i$ corresponds to an expression $e_i$.
-	\item The following operations must be supported: $x + y$, $x - y$, $x * y$, $x / y$, $x ^ y$, $|x|$, $\log(x)$, $e^x$, $1 / x$ and $\sqrt{x}$. Note that $x$ and $y$ can either stand for a constant, a variable, a parameter, or another expression.
-	\item The results of the evaluations are returned in a matrix of the form $k \times N_e$. In this case, $k$ is equal to the $N$ of the variable matrix and $N_e$ is equal to the number of input expressions.
+	\item The following operations must be supported: $x + y$, $x - y$, $x * y$, $x / y$, $x ^ y$, $|x|$, $\log(x)$, $e^x$ and $\sqrt{x}$. Note that $x$ and $y$ can either stand for a value, a variable, or another operation.
+	\item The results of the evaluations are returned in a matrix of the form $k \times N_e$. In this case, $k$ is equal to the $N$ of the variable matrix and $N_e$ is equal to the number of input expressions.
 \end{itemize}

 \begin{figure}
@ -23,21 +25,21 @@ The main goal of both prototypes or evaluators is to provide a speed-up compared
 \end{figure}


-With this, the required capabilities are outlined. However, for a better understanding, the input and output data need to be explained further. The first input contains the expressions that need to be evaluated. These can be of any length and can contain constant values, variables and parameters, all of which are linked together with the supported operators. In the simplified example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ to $e_6$. 
+With this, the required capabilities are outlined. However, for a better understanding, the input and output data need to be explained further. The first input contains the expressions that need to be evaluated. These can be of any length and can contain constant values, variables and parameters, all of which are linked together with the supported operations. In the simplified example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ to $e_6$. 

-Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example the values in row three are used to parameterise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using each set of variables. In the provided example, there are three data points, each containing the values for four variables $x_1$ to $x_4$. 
+Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example, the values in row three are used to parameterise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using each set of variables. In the provided example, there are three variable sets, each containing the values for four variables $x_1$ to $x_4$. 

-After all expressions have been evaluated using all data points, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one data point. The row indicates the data point and the column indicates the expression.
+After all expressions have been evaluated using all variable sets, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one variable set. The row indicates the variable set and the column indicates the expression.

-The prototypes developed in this thesis, are part of a GP algorithm for symbolic regression. This means that the expressions that are evaluated, represent parts of the search space of all expressions being made up of any combination of allowed operators, the set of input variables, a set of parameters and constants. This means that the size of the search space grows exponentially. Exploring this search space by simply generating expressions, evaluating them once and then generating the next set of expressions leaves much of their potential hidden. To assist in finding better fitting expressions, parameters are introduced. This allows the algorithm to fit the expressions to the data. To enable this improved search, the prototypes must support not only variables, but also parameters.
+The prototypes developed in this thesis are part of a GP algorithm for symbolic regression. This means that the expressions that are evaluated represent parts of the search space of all expressions that are made up of any combination of allowed operators, the set of input variables, a set of parameters and constants. This means that the size of the search space grows exponentially. Exploring this search space by simply generating expressions, evaluating them once and then generating the next set of expressions leaves much of the search space unexplored. To combat this, parameters are introduced. These allow the algorithm to perform some kind of local search. To enable this, the prototypes must support not only variables, but also parameters.

 The parameters themselves are unique to each expression, meaning they have a one-to-one mapping to an expression. Furthermore, as can be seen in Figure \ref{fig:input_output_explanation}, each expression can have a different number of parameters, or even no parameters at all. However, with no parameters, it wouldn't be possible to perform parameter optimisation. This is in contrast to variables, where each expression must have the same number of variables. Because parameters are unique to each expression and can vary in size, they are not structured as a matrix, but as a vector of vectors.

-An important thing to consider, is the volume and volatility of the data itself. The example shown in Figure \ref{fig:input_output_explanation} has been drastically simplified. It is expected, that there are hundreds of expressions evaluate per GP generation. Each of these expressions may contain between ten and 50 tokens. A token is equivalent to either a variable, a parameter, a constant value or an operator.
+An important thing to consider is the volume and volatility of the data itself. The example used above has been drastically simplified. It is expected that there are hundreds of expressions to evaluate per GP generation. Each of these expressions may contain between 10 and 50 tokens. A token is equivalent to either a variable, a parameter, a constant value or an operator.

-It can be assumed that typically the number of variables per expression is around ten. However, the number of data points can increase drastically. It can be considered that $1\,000$ data points is the lower limit. On the other hand, $100\,000$ can be considered as the upper limit. Considering that one variable takes up 4 bytes of memory and 10 variables are needed per expression, at least $4 * 10 * 1\,000 = 40\,000$ bytes and at most $4 * 10 * 100\,000 = 400\,000$ bytes need to be transferred to the GPU for the variables. Therefore this 
+Usually, the number of variables per expression is around ten. However, the number of variable sets can increase drastically. A count of $1\,000$ variable sets can be considered the lower limit. On the other hand, $100\,000$ can be considered the upper limit. Considering that one variable takes up 4 bytes of space and 10 variables are needed per expression, at least $4 * 10 * 1\,000 = 40\,000$ bytes and at most $4 * 10 * 100\,000 = 400\,000$ bytes need to be transferred to the GPU for the variables.

-These variables do not change during the runtime of the symbolic regression algorithm. As a result the data only needs to be sent to the GPU once. This means that the impact of this data transfer is minimal. On the other hand, the data for the parameters is much more volatile. As explained above, they are used for parameter optimisation and therefore vary from evaluation to evaluation and need to be sent to the GPU very frequently. The amount of data that needs to be sent depends on the number of expressions as well as on the number of parameters per expression. Considering $10\,000$ expressions that need to be evaluated and an average of two parameters per expression each requiring 4 bytes of memory, a total of $10\,000 * 2 * 4 = 80\,000$ bytes need to be transferred to the GPU on each parameter optimisation step. This is comparatively low, as the GPU is connected via PCI Express with version six allowing transfer rates of up to $256$ GB per second \parencite{pci-sig_pci_2025}. However, the amount of data is not of concern but rather the number of data transfers to the GPU, as every transfer has some overhead and waiting time associated with it.
+These variables do not change during the runtime of the symbolic regression algorithm. As a result, the data only needs to be sent to the GPU once. This means that the impact of this data transfer is minimal. On the other hand, the data for the parameters is much more volatile. As explained above, they are used for parameter optimisation and therefore vary from evaluation to evaluation and need to be sent to the GPU very frequently. However, the amount of data that needs to be sent is also much smaller. % TODO: once the data is available, determine how many bytes the parameters take on average

 \section{Architecture}
 \label{sec:architecture}
@ -50,17 +52,13 @@ Based on the requirements and data structure above, the architecture of both pro
 	\label{fig:kernel_architecture}
 \end{figure}

-A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still advisable to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expressions themselves. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression, is the generation itself. Especially for smaller data points, it is possible, that the time it takes to transpile an expression and compile the kernel into machine code is greater than the time it takes to evaluate it. 
However, for larger data points this should not be a concern, especially in parameter optimisation scenarios, where the kernel is re-used on each parameter optimisation step.
-
-%
-% TODO: Probably include a diagram that shows how the evaluators are integrated in the symbolic regression algorithm (assuming its a GP variant), to show the bigger picture
-%
+A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression is the generation itself. Especially for a small number of variable sets, it is possible that the time it takes to transpile an expression is greater than the time it takes to evaluate it. However, for a larger number of variable sets this should not be a concern.

 \subsection{Pre-Processing}
 \label{sec:pre-processing}
-The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator.
+The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator.

-The conversion into the intermediate representation transforms the expressions from infix notation into postfix notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix notation, it also contains information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look like depicted in Figure \ref{fig:pre-processing_results} after the pre-processing step.
+The conversion into the intermediate representation transforms the expressions from infix notation into postfix notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix notation, it also contains information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. After the pre-processing step, a simple expression like $x + 2$ would look as depicted in Figure \ref{fig:pre-processing_results}.

 \begin{figure}
 	\centering
@ -84,9 +82,9 @@ The already mentioned concept of processing one expression per thread can also b

 The interpreter consists of two parts. The CPU side is the part of the program, that interacts with both the GPU and the caller. An overview of the components and the workflow of the interpreter is shown in Figure \ref{fig:component_diagram_interpreter}. Once the interpreter has received the expressions, they are pre-processed. This ensures that the expressions are valid, and that they are transformed into the intermediate representation needed to evaluate them. The result of this pre-processing step is then sent to the GPU, which performs the actual interpretation of the expressions. In addition to the expressions, the data for the variables and parameters must also be sent to the GPU. 

-Once all the necessary data is present on the GPU, the interpreter kernel can be dispatched. As previously mentioned, the kernel is dispatched for each expression to minimise thread divergence. In fact, dispatching the same kernel multiple times for each expression ensures that there will not occur any thread divergence, as will be explained later. 
+Once all the data is present on the GPU, the interpreter kernel can be dispatched. As already described, the kernel will be dispatched for each expression to reduce thread divergence. In fact, dispatching the same kernel multiple times with different expressions means that no thread divergence will occur, as will be explained later. 

-After the GPU has finished evaluating all expressions with all data points, the result is stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements.
+After the GPU has finished evaluating all expressions with all variable sets, the result is stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements.

 Evaluating the expressions is relatively straightforward. Because the expressions are in postfix notation, the actual interpreter just needs to iterate over all the tokens and perform the appropriate tasks. If the interpreter encounters a binary operator, it simply needs to read the previous two values and perform the operation specified by the operator. For unary operators, only the previous value needs to be read. As already mentioned, expressions in postfix notation implicitly contain the operator precedence, therefore no look-ahead or other strategies need to be used to ensure correct evaluation. This also means that each token is visited exactly once and no unnecessary or overhead work needs to be done. Algorithm \ref{alg:eval_interpreter} shows how the interpreter works. Note that this is a simplified version that only works with additions, multiplications, constants and variables.

@ -120,13 +118,13 @@ Evaluating the expressions is relatively straight forward. Because the expressio
 	\end{algorithmic}
 \end{algorithm}

-Handling constants, variables and parameters is very simple. Constants simply need to be stored on the stack for later use. Variables and parameters also only need to be stored on the stack. However, their value must first be loaded from the variable or parameter matrix according to the token value. Since the entire matrices are sent to the GPU, the index of the variable or parameter set is also needed to load the correct value. However, for simplicity, this has been omitted from the algorithm.
+The handling of constants and variables is very simple. Constants only need to be stored on the stack for later use. Variables also only need to be stored on the stack. However, their value must first be loaded from the variable matrix according to the token value of the variable. Since the entire variable matrix is sent to the GPU, the index of the variable set is also needed to load the variable value. However, for the sake of simplicity, it has been omitted from the algorithm.

-When an operator token is encountered, the handling becomes more complex. The value of the token indicates the type of operation to be applied. For binary operators, the top two values on the stack need to be used as input to the operator. For unary operators, only the top value of the stack needs to be used as an input. Once the result has been computed, it must be stored at the top of the stack to be used as an input for the next operation or the result for this expression. 
+When an operator token is encountered, the handling becomes more complex. The value of the token indicates the type of operation to be applied. For binary operators, the top two values on the stack need to be used as input to the operator. For unary operators, only the top value of the stack needs to be used as an input. Once the result has been computed, it must be stored at the top of the stack to be used as an input for the next operation. 

-At the end of the algorithm, the stack contains one last entry. This entry is the value computed by the expression with the designated data point and parameters. In order to send this value back to the CPU, it must be stored in the result matrix. The last statement performs this action. It again has been simplified to omit the index calculation of the expression and data point needed to store the result at the correct location.
+At the end of the algorithm, the stack contains one last entry. This entry is the value computed by the expression with the designated variable set and parameters. In order to send this value back to the CPU, it must be stored in the result matrix. The last statement performs this action. It again has been simplified to omit the index of the expression and variable set needed to store the result at the correct location.

-The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for each expression that needs to be evaluated, to prevent thread divergence. Thread divergence can only occur on data-dependent branches. In this case, the while loop and every if and else-if statement contains a data-dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators, variables or parameters in the same order, and would therefore cause each thread to take a different path. However, one expression always has the same constants, operators, variables and parameter in the same locations, meaning that all threads will take the same path. This also means that although the interpreter contains many data-dependent branches, these branches only depend on the expression itself. Because of this, all threads will follow the same path and will therefore never diverge from one another.
+The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for each expression that needs to be evaluated, to prevent thread divergence. Thread divergence can only occur on data-dependent branches. In this case, the while loop and every if and else-if statement contains a data-dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators or variables in the same order, and would therefore cause each thread to take a different path. However, one expression always has the same constants, operators and variables in the same locations, meaning that all threads will take the same path. This also means that although the interpreter contains many data-dependent branches, these branches only depend on the expression itself. Because of this, all threads will follow the same path and will therefore never diverge from one another as long as they are executing the same expression.

 \subsection{Transpiler}

@ -137,13 +135,13 @@ The Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This
 	\label{fig:component_diagram_transpiler}
 \end{figure}

-Similar to the interpreter, the transpiler also consists of a part that runs on the CPU and a part that runs on the GPU. Looking at the components and workflow of the transpiler, as shown in Figure \ref{fig:component_diagram_transpiler}, it is almost identical to the interpreter. However, the key difference between the two, is the additional code generation, or transpilation step. Apart from that, the transpiler also needs the same pre-processing step and also the GPU to evaluate the expressions. However, the kernels generated by the transpiler work very differently to the kernel for the interpreter. The difference between these evaluators will be explained later.
+Similar to the interpreter, the transpiler also consists of a part that runs on the CPU and a part that runs on the GPU. Looking at the components and workflow of the transpiler, as shown in Figure \ref{fig:component_diagram_transpiler}, it is almost identical to the interpreter. However, the key difference between the two, is the additional code generation, or transpilation step. Apart from that, the transpiler also needs the same pre-processing step and also the GPU to evaluate the expressions. However, the GPU evaluator generated by the transpiler works very differently to the GPU evaluator for the interpreter. The difference between these evaluators will be explained later.

 Before the expressions can be transpiled into PTX code, they have to be pre-processed. As already described, this step ensures the validity of the expressions and transforms them into the intermediate representation described above. As with the interpreter, this also simplifies the code generation step. By transforming the expressions into postfix notation, the code generation follows a similar pattern to the interpretation already described. 

-Algorithm \ref{alg:transpile} shows how the transpiler takes an expression, transpiles it and then returns the finished code. It can be seen that the while loop is largely the same as the while loop of the interpreter. The main difference is in the operator branches, because now code needs to be generated instead of computing the result of the expression. Therefore, the branches themselves call their designated code generation function, such as $\textit{GetAddition}$. This function returns the PTX code responsible for the addition. However, this function must return more than just the code that performs the addition. When executed, this addition also returns a value which will be needed as an input by other operators. Therefore, not only the code fragment must be returned, but also the reference to the result.
+Algorithm \ref{alg:transpile} shows how the transpiler takes an expression, transpiles it and then returns the finished code. It can be seen that the while loop is largely the same as the while loop of the interpreter. The main difference is in the operator branches, because now code needs to be generated instead of computing the result of the expression. Therefore, the branches themselves call their designated code generation function, such as $\textit{GetAddition}$. This function returns the PTX code responsible for the addition. However, this function must return more than just the code that performs the addition. When executed, this addition also returns a value which will be needed as an input by other operators. Therefore, not only the code fragment must be returned, but also the reference to the result.

-This reference can then be put on the stack for later use, the same way the interpreter stores the value for later use. The code fragment must also be added to the already generated code so that it can be returned to the caller. As with the interpreter, there is a final value on the stack when the loop has finished. Once the code has been executed, this value is the reference to the result of the expression. This value then needs to be stored in the result matrix, so that it can be retrieved by the CPU after all expressions have been executed. Therefore, a final code fragment must be generated to handle the storage of this value in the result matrix. This fragment must then be added to the code already generated, and the transpilation process is complete.
+This reference can then be put on the stack for later use, just as the interpreter stores the value for later use. The code fragment must also be added to the already generated code so that it can be returned to the caller. As with the interpreter, there is a final value on the stack when the loop has finished. Once the code has been executed, this value is the reference to the result of the expression. This value then needs to be stored in the result matrix, so that it can be retrieved by the CPU after all expressions have been executed on the GPU. Therefore, a final code fragment must be generated to handle the storage of this value in the result matrix. This fragment must then be added to the code already generated, and the transpilation process is complete.

 \begin{algorithm}
 	\caption{Transpiling an equation in postfix notation}\label{alg:transpile}
@ -185,8 +183,10 @@ This reference can then be put on the stack for later use, the same way the inte
 	\end{algorithmic}
 \end{algorithm}

-The code generated by the transpiler is the kernel for the transpiled expressions. This means that a new kernel must be generated for each expression that needs to be evaluated. This is in contrast to the interpreter, which has one kernel and dispatches it once for each expression. Generating one kernel per expression results in a much simpler kernel, which allows the kernel to focus on evaluating the postfix expression. There is no overhead work such as branching or managing a stack. However, this overhead is now shifted to the transpilation step on the CPU which can be seen in Algorithm \ref{alg:transpile}. There is also a noticeable overhead in that a kernel has to be generated for each expression. In cases like parameter optimisation, many of the expressions would be transpiled multiple times as the transpiler is called multiple times with the same expressions.
+The code generated by the transpiler is the kernel for the transpiled expressions. This means that a new kernel must be generated for each expression that needs to be evaluated. This is in contrast to the interpreter, which has one kernel and dispatches it once for each expression. However, generating one kernel per expression results in a much simpler kernel. This allows the kernel to focus on evaluating the postfix expression from left to right. There is no overhead work such as branching or managing a stack. Instead, this overhead is shifted to the transpilation step on the CPU, which can be seen in Algorithm \ref{alg:transpile}. There is also a noticeable overhead in that a kernel has to be generated for each expression. In cases like parameter optimisation, many of the expressions will be transpiled multiple times as the transpiler is called multiple times with the same expressions.

 Both the transpiler and the interpreter have their respective advantages and disadvantages. While the interpreter puts less load on the CPU, the GPU has to perform more work. Much of this work involves branching or managing a stack, and therefore involves many instructions that are not used to evaluate the expression itself. However, this overhead can be mitigated by the fact, that all this work is performed in parallel rather than sequentially.

-On the other hand, the transpiler performs more work on the CPU. The kernels are much simpler, and most of the instructions are used to evaluate the expressions themselves. Furthermore, as explained in Section \ref{sec:ptx}, any program running on the GPU, must be transpiled into PTX code before the driver can compile it into machine code. Therefore, the kernel written for the interpreter, must also be transpiled into PTX and then be compiled. However, this needs to be done only once, while for the transpiler this needs to be done for each expression. Since the generated code is tailored to evaluate expressions and not to generate generic code, this means the kernels are simpler and can be transpiled and compiled faster. The overhead of transpiling and compiling the expressions is further mitigated by re-using the compiled kernels during the parameter optimisation step.
+On the other hand, the transpiler performs more work on the CPU. The kernels are much simpler, and most of the instructions are used to evaluate the expressions themselves. Furthermore, as explained in Section \ref{sec:ptx}, any program running on the GPU must be transpiled into PTX code before the driver can compile it into machine code. Therefore, the kernel written for the interpreter must also be transpiled into PTX. This overhead is in addition to the branch instruction overhead. The self-written transpiler removes this intermediate step by transpiling directly into PTX. In addition, the generated code is tailored to evaluate expressions and does not need to generate generic PTX code, which can reduce transpilation time.
+
+Unlike the GPU, the CPU can manage state across multiple kernel dispatches. Concepts such as caches can be employed by the transpiler to reduce the overhead on the CPU. In cases such as parameter optimisation, where expressions remain the same across multiple calls, the resulting PTX code can be cached. As a result, the same expression does not need to be transpiled multiple times, which drastically reduces the transpilation time. This is an important optimisation, as it can improve the overall performance of the transpiler.
--- a/thesis/chapters/conclusion.tex
+++ b/thesis/chapters/conclusion.tex
@ -1,40 +1,18 @@
 \chapter[Conclusion]{Conclusion and Future Work}
 \label{cha:conclusion}

-Research has been conducted on how to best approach the evaluation of dynamically generated expressions for symbolic regression. The GPU has been chosen to improve the performance as a cheap and powerful tool especially compared to compute clusters. Numerous instances exist were utilising the GPU lead to drastic performance improvements in many fields of research.
-
-Two GPU evaluators were implemented which are used to determine if the GPU is more suitable for evaluating expressions generated at runtime as compared to the CPU. The two implementations are as follows:
-
-\begin{description}
-	\item[GPU Interpreter] \mbox{} \\
-		A stack based interpreter that evaluates the expressions. The frontend converts these expressions into postfix notation to ensure the implementation can be as simple as possible. It consists of one kernel that is used to evaluate all expressions separately.
-	\item[GPU Transpiler] \mbox{} \\
-		A transpiler that takes the expressions and transpiles them into PTX code. Each expression is represented in its own unique kernel. The kernels are simpler than the one GPU interpreter kernel, but more effort is needed to generate them.
-\end{description}
-
-In total three benchmarks were conducted to determine if and under which circumstances the GPU is a more suitable choice for evaluating the expressions. A CPU-based implementation is the baseline against which the GPU evaluators are evaluated. To answer the research questions the benchmarks are structured as follows:
-\begin{enumerate}
-	\item Roughly $250\,000$ expressions with $362$ data points have been evaluated. The goal of this benchmark was determining how the evaluators can handle large volumes of expressions.
-	\item Roughly $10\,000$ expressions with $362$ data points have been evaluated. This benchmark should demonstrate how a change in the number of expressions impacts the performance, especially compared with each other.
-	\item Roughly $10\,000$ expressions and roughly $10\,000$ data points have been evaluated. By increasing the number of data points a more realistic use-case is modelled with this benchmark. Additionally, by using more data points the strengths of the GPU should get more exploited.
-\end{enumerate}
-
-After conducting the first and second benchmarks it was clear, that the CPU is the better choice in these scenarios. The CPU was faster by roughly four times when compared to the GPU interpreter and the GPU transpiler did not finish this benchmark at all.
-
-The first benchmark in particular demonstrated how the high RAM usage of this GPU transpiler implementation lead to it not finishing this benchmark. Storing $250\,000$ compiled kernels uses a lot of RAM, however, compiling the PTX kernels just in time before they are executed is not a feasible alternative to reduce RAM usage. Since the PTX kernels need to be compiled into machine code before they can be executed, one alternative would be to use batch processing as a compromise between compiling ahead of time and just in time. Since it is not expected that these evaluators need to evaluate hundreds of thousands of expressions, the non-trivial process of rewriting the implementation to support batch processing has not been done.
-
-Reducing the number of expressions demonstrated that the GPU transpiler can perform better than the GPU interpreter by roughly ten percent. However, in relation to the CPU implementation, no real change was observed between the first and second benchmark with the CPU being faster by roughly five times. 
-
-In the third benchmark, both GPU evaluators managed to outperform the CPU, with the GPU transpiler performing the best. The GPU interpreter was faster by roughly $1.6$ times and the GPU transpiler was faster by roughly $2$ times compared to the CPU interpreter. Furthermore, the GPU transpiler managed to outperform the GPU interpreter by roughly $1.2$ times.
-
-To address the research questions, this thesis demonstrates that evaluating expressions generated at runtime can be more efficient on the GPU under specific conditions. Utilizing the GPU becomes feasible when dealing with a high number of data points, typically in the thousands and above. For scenarios with fewer data points, the CPU remains the better choice. Additionally, in scenarios where RAM is abundant, the implementation of the GPU transpiler discussed in this thesis is the optimal choice. If too little RAM is available and the number of data points is sufficiently large, the GPU interpreter should be chosen, as it outperforms both the GPU transpiler and the CPU in such cases.
+Summarise the results
+talk again how a typical input is often not complex enough (basically repeat that statement from comparison section in evaluation)

 \section{Future Work}
-This thesis demonstrated how the GPU can be used to accelerate the evaluation of expressions and therefore the symbolic regression algorithm as a whole. However, the boundaries at which it is more feasible to utilise the GPU needs to be further refined. Therefore, conducting more research into how the number of expressions and data points impact performance is needed. Furthermore, only one dataset with only two variables per data point was used. Varying the number of variables per data point and their impact on performance could also be interesting. The impact of the parameters was omitted from this thesis entirely. Further research on how the number of parameters impact the performance is of interest. Since parameters need to be transferred to the GPU frequently, having too many parameters could impact the GPU more negatively than the CPU. Alternatively, performing the entire parameter optimisation step on the GPU and not just the evaluation might also result in better performance, as the number of data transfers is drastically reduced. 
+talk about what can be improved

-The current implementation also has flaws that can be improved in future work. Currently, no shared memory is utilised, meaning the threads need to always retrieve the data from global memory. This is a slow operation and efficiently utilising shared memory should further improve the performance of both GPU evaluators.
+Frontend:
+1.) extend frontend to support ternary operators (basically if the frontend sees a multiplication and an addition it should collapse them to an FMA instruction)

-Furthermore, as seen with the GPU transpiler and the first benchmark, reducing RAM usage is of essence for very large problems with hundreds of thousands of expressions or very RAM limited environments. Therefore, future work needs to be done to rewrite the transpiler to support batch processing and conduct benchmarks with this new implementation. This will answer the question if batch processing allows the GPU transpiler to outperform the CPU and GPU interpreters in these scenarios. Additionally, it is of interest if the batch processing transpiler manages to achieve the same or better performance in the other scenarios explored in this thesis.
+Transpiler: 
+1.) transpile expression directly from Julia AST -> would save time because no intermediate representation needs to be created (loses a step and gains performance, but also makes transpiler itself more complex; since expressions do not need to be sent to the GPU, the IR theoretically isn't needed)
+2.) Better register management strategy might be helpful -> look into register pressure etc.

-Lastly, neither of the implementations supports special GPU instructions. Especially the Fused Multiply-Add (FMA) instruction is of interest. Given that multiplying two values and adding a third is a common operation, this special instruction allows these operations to be performed in a single clock cycle. The frontend can be extended to detect and convert sub-expressions of this form into a special ternary opcode, enabling the backend to generate more efficient code. If the effort of detecting these sub-expressions is outweighed by the performance improvement needs to be determined in a future work.

+CPU Interpreter: Probably more worth to dive into parallelising cpu interpreter itself (not really future work, as you wouldn't write a paper about that)
--- a/thesis/chapters/evaluation.tex
+++ b/thesis/chapters/evaluation.tex
@ -1,29 +1,27 @@
 \chapter{Evaluation}
 \label{cha:evaluation}

-This thesis aims to determine whether one of the two GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation process. First, the environment in which the performance benchmarks are conducted is explained. Next the individual results for the GPU interpreter and transpiler are presented individually alongside the performance tuning process to achieve these results. Finally, the results of the GPU evaluators are compared to those of the CPU evaluator to answer the research questions of this thesis.
+This thesis aims to determine whether one of the two GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation process. First, the environment in which the performance benchmarks are conducted is explained. Next, the results for the GPU interpreter and transpiler are presented individually. That section also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to those of the CPU evaluator to answer the research questions of this thesis.

 \section{Benchmark Environment}
 In this section, the benchmark environment used to evaluate the performance is outlined. To ensure the validity and reliability of the results, it is necessary to specify the details of the environment. This includes a description of the hardware and software configuration as well as the performance evaluation process. With this, the variance between the results is minimised, which allows for better reproducibility and comparability between the implementations.

 \subsection{Hardware Configuration}
-The hardware configuration is the most important aspect of the benchmark environment. The capabilities of both the CPU and GPU can have a significant impact on the resulting performance. The following sections outline the importance of the individual components as well as the hardware used for the benchmarks and the performance tuning.
+The hardware configuration is the most important aspect of the benchmark environment. The capabilities of both the CPU and GPU can have a significant impact on the resulting performance. The following sections outline the importance of the individual components as well as the hardware used for the benchmarks.

 \subsubsection{GPU}
-The GPU plays a crucial role, as different microarchitectures typically operate differently and therefore require different performance tuning. Although the evaluators can generally operate on any Nvidia GPU with a compute capability of at least 6.1, they are tuned for the Ampere microarchitecture which has a compute capability of 8.6. Despite the evaluators being tuned for this microarchitecture, more recent microarchitectures can be used as well. However, additional tuning is required to ensure that the evaluators can utilise the hardware to its fullest potential.
-
-Tuning must also be done on a per-problem basis. In particular, the number of data points impact how well the hardware is utilised. Therefore, it is crucial to determine which configuration yields the best performance. Section \ref{sec:results} outlines steps to tune the configuration for a specific problem.
+The GPU is especially important, as different microarchitectures typically require different optimisations. While the evaluators can generally run on any Nvidia GPU with a compute capability of at least 6.1, they are tuned for the Ampere microarchitecture with a compute capability of 8.6. Despite the evaluators being tuned for this microarchitecture, more modern ones can be used as well. However, additional tuning is required to ensure the evaluators can utilise the hardware to its fullest potential.

 \subsubsection{CPU}
-Although the GPU plays a crucial role, work is also carried out on the CPU. The interpreter primarily utilises the CPU for the frontend and data transfer, making it more GPU-bound as most of the work is performed on the GPU. However, the transpiler additionally relies on the CPU to perform the transpilation step. This step involves generating a kernel for each expression and sending these kernels to the driver for compilation, a process also handled by the CPU. By contrast, the interpreter only required one kernel which needs to be converted into PTX and compiled by the driver only once. Consequently, the transpiler is significantly more CPU-bound and variations in the CPU used have a much greater impact. Therefore, using a more powerful CPU benefits the transpiler more than the interpreter.
+Although the GPU plays a crucial role, work is also carried out on the CPU. The interpreter mainly uses the CPU for data transfer and the pre-processing step and is therefore more GPU-bound. However, the transpiler additionally needs the CPU to perform the transpilation step. This step produces a kernel for each expression and also involves sending these kernels to the driver for compilation, a process which is also performed by the CPU. By contrast, the interpreter only has one kernel that needs to be converted into PTX and compiled by the driver only once. Consequently, the transpiler is much more CPU-bound and variations in the CPU used have a much greater impact. Therefore, using a more powerful CPU benefits the transpiler more than the interpreter.

 \subsubsection{System Memory}
-In addition to the hardware configuration of the GPU and CPU, system memory (RAM) also plays a crucial role. Although RAM does not directly contribute to the overall performance, it can have a noticeable indirect impact due to its role in caching and general data storage. Insufficient RAM forces the operating system to use the page file, which is stored on a considerably slower SSD. This leads to slower data access, thereby reducing the overall performance of the application.
+In addition to the hardware configuration of the GPU and CPU, system memory (RAM) also plays a crucial role. While RAM does not directly contribute to the overall performance, it can have a noticeable indirect impact due to its role in caching. Insufficient RAM forces the operating system to use the page file, which is stored on a much slower SSD. This results in slower cache access, thereby reducing the overall performance of the application.

-As seen in the list below, only 16 GB of RAM were available during the benchmarking process. This amount is insufficient to utilise caching to the extent outlined in Chapter \ref{cha:implementation}. Additional RAM was not available, meaning caching had to be disabled for all benchmarks as further explained in Section \ref{sec:results}.
+As seen in the list below, only 16 GB of RAM were available during the benchmarking process. This amount is insufficient to utilise caching to the extent outlined in Chapter \ref{cha:implementation}. More RAM was not available, which means some caching had to be disabled. This is further explained in Section \ref{sec:results}.

 \subsubsection{Hardware}
-With the requirements explained above in mind, the following hardware is used to perform the benchmarks for the CPU-based evaluator, as well as for the GPU-based evaluators:
+With the requirements explained above in mind, the following hardware is used to perform the benchmarks for the CPU-based evaluator, which was used as the baseline, as well as for the GPU-based evaluators:
 \begin{itemize}
 	\item Intel i5 12500
 	\item Nvidia RTX 3060 Ti
@ -32,281 +30,55 @@ With the requirements explained above in mind, the following hardware is used to


 \subsection{Software Configuration}
-Apart from the hardware, the performance of the evaluators can also be significantly affected by the software. Primarily these three software components or libraries are involved in the performance of the evaluators:
+Apart from the hardware, the performance of the evaluators can also be significantly affected by the software. Primarily these three software components are involved in the performance:
 \begin{itemize}
 	\item GPU Driver
 	\item Julia
 	\item CUDA.jl
 \end{itemize}

-Typically, newer versions of these components include, among other things, performance improvements. This is why it is important to specify the version which is used for benchmarking. The GPU driver has version \emph{561.17}, Julia has version \emph{1.11.5}, and CUDA.jl has version \emph{5.8.1}. As with the hardware configuration, this ensures that the results are reproducible and comparable to each other.
+Typically, newer versions of these components include performance improvements, among other things. This is why it is important to specify the version which is used for benchmarking. The GPU driver uses version \emph{561.17}, Julia uses version \emph{1.11.5}, and CUDA.jl uses version \emph{5.8.1}. As with the hardware configuration, this ensures that the results are reproducible and comparable to each other.


-\subsection{Performance Evaluation Process}
-Now that the hardware and software configurations have been established, the benchmarking process can be defined. This process is designed to simulate the load and scenario in which these evaluators will be used. The Nikuradse dataset \parencite{nikuradse_laws_1950} has been chosen as the data source. The dataset models the laws of flow in rough pipes and provides $362$ data points, each set containing two variables. This dataset has first been used by \textcite{guimera_bayesian_2020} to benchmark a symbolic regression algorithm.
+\subsection{Performance evaluation process}
+% explain the actual data
+% Nikuradse dataset (flowrate through rough pipes (fact check that again))
+%    250k expressions; ~300 variable sets; 100 parameter optimisation steps (simulated)
+% using Benchmarktools.jl as a tried and tested benchmark suite
+% 50 samples to eliminate any run-to-run variance

-Since only the evaluators are benchmarked, the expressions to be evaluated must already exist. These expressions are generated for the Nikuradse dataset using the exhaustive symbolic regression algorithm proposed by \textcite{bartlett_exhaustive_2024}. This ensures that the expressions are representative of what needs to be evaluated in a real-world application. In total, three benchmarks will be conducted, each having a different goal, which will be further explained in the following paragraphs.
-
-The first benchmark involves a very large set of roughly $250\,000$ expressions with $362$ data points. This means that when using GP all $250\,000$ expressions would be evaluated in a single generation. In a typical generation, significantly fewer expressions would be evaluated. However, this benchmark is designed to show how the evaluators can handle very large volumes of data. Because of memory constraints, it was not possible to conduct an additional benchmark with a higher number of data points.
-
-Both the second and third benchmarks are conducted to demonstrate how the evaluators will perform in more realistic scenarios. For the second benchmark the number of expressions has been reduced to roughly $10\,000$, and the number of data points is again $362$. The number of expressions is much more representative of a typical scenario, while the number of data points is still low. To determine if the GPU evaluators are a feasible alternative in scenarios with a realistic number of expressions but comparably few data points, this benchmark is conducted nonetheless.
-
-Finally, a third benchmark will be conducted. Similar to the second benchmark, this benchmark evaluates the same roughly $10\,000$ expressions but now with $30$ times more data points, which equates to roughly $10\,000$. This benchmark mimics the scenario where the evaluators will most likely be used. While the others simulate different conditions to determine if and where the GPU evaluators can be used efficiently, this benchmark is more focused on determining if the GPU evaluators are suitable for the specific scenario they are likely going to be used in.
-
-All three benchmarks also simulate a parameter optimisation step, as this is the intended use-case for these evaluators. For parameter optimisation, $100$ steps are used, meaning that all expressions are evaluated $100$ times. During the benchmark, this process is simulated by re-transmitting the parameters instead of generating new ones. Generating new parameters is not part of the evaluators and is therefore not implemented. However, because the parameters are re-transmitted each time, the overhead of sending the data is taken into account. This overhead is part of the evaluators and represents an additional burden that the CPU implementation does not have, making it important to be measured.
-
-\subsubsection{Measuring Performance}
-The performance measurements are taken using the BenchmarkTools.jl\footnote{\url{https://juliaci.github.io/BenchmarkTools.jl/stable/}} package. It is the standard for benchmarking applications in Julia, which makes it an obvious choice for measuring the performance of the evaluators.
-
-It offers extensive support for measuring and comparing results of different implementations and versions of the same implementation. Benchmark groups make it possible to categorise the different implementations, take performance measurements and compare them. When taking performance measurements, it also supports setting a timeout and, most importantly, setting the number of samples to be taken. This is especially important, as it ensures stable results by combating run-to-run variance. For this thesis, a sample size of $50$ has been used. This means that each of the previously mentioned benchmarks is executed $50$ times.
-
-\subsubsection{Theoretical Maximum Performance}
-To get an idea of how much performance would in theory be achievable, a rough optimistic estimation can be done. On average over all roughly $250\,000$ expressions of the first benchmark, a single expression has five operators. This translates to five floating point operations or FLOPS. Since some operators such as $x^y$ require three instructions, it is assumed that one of the five operators is such an operator. As a result $x^y$ needs three FLOPS which in total means a single expression on average requires seven FLOPS to be evaluated.
-
-Furthermore, expressions consist of variables and parameters, which need to be loaded from memory. It is assumed that per expression one parameter exists. Since the Nikuradse dataset is used, it is known that each expression contains exactly two variables. Loading a value from memory consists of three instructions. Therefore, it is assumed loading a value requires three FLOPS. This brings the total number of FLOPS per expression to $16$.
-
-The used GPU has a theoretical performance of $16.2$ Tera-FLOPS (TFLOPS) per second\footnote{\url{https://www.techpowerup.com/gpu-specs/geforce-rtx-3060-ti.c3681}}. Since the GPU has $4\,864$ cores, a single core has a theoretical performance of $16.2 / 4\,864 \approx 0.0033$ TFLOPS or $3.3$ GFLOPS per second. This means that a single core can perform $3.3$ billion 32-bit floating point operations per second. In turn, this means that a single core can evaluate approximately $208$ million expressions per second. As a result, a single core would be able to evaluate all expressions of the first benchmark in less than a second, assuming the data is instantly accessible, and no more FLOPS are required to evaluate an expression than already accounted for.
-
-This calculation, however, is a very rough estimate. It does not take into account the time spent waiting for data to arrive nor does it take into account the time it takes to schedule the threads on the actual cores and other overhead work or waiting times. Especially the time spent waiting for data to arrive is important, as all data is present in global memory, which is the slowest form of memory on a GPU. While loading memory is a three instruction operation, it is very likely that the resulting machine code contains more instructions and therefore more FLOPS. Furthermore, both implementations contain many overhead instructions which are not accounted for in the above estimate. The interpreter loop, for example, contains many instructions that are not directly contributing to evaluating the expressions such as branching and jumping instructions. Additionally, not all FLOPS operate on FP32 values. Some operate on FP64 values, and FP64 instructions are about 64 times slower than FP32 instructions on this GPU.
-
-As seen in the results below, the benchmarks clearly show that the waiting time cannot be neglected in the performance estimation. Furthermore, the CPU side has been omitted fully in the estimation. However, a significant part of the runtime is on the CPU, especially for the transpiler. Providing an estimation that incorporates both the waiting time and overhead FLOPS is an involved process which is out of scope of this thesis. Furthermore, no performance measurements of the runtime of a single kernel have been taken. While this would be interesting to get an idea of how much performance is lost compared to an ideal and optimistic scenario, it would have taken too much time to perform this analysis.

 \section{Results}
 \label{sec:results}
-This section presents the results of the benchmarks described above. First the results for the GPU-based interpreter and GPU transpiler alongside the performance tuning process will be presented in isolation. Finally, both GPU-based evaluators will be compared with each other to determine which of them performs the best. Additionally, these evaluators will be compared against the CPU-based interpreter to answer the research questions of this thesis.
+talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and the CPU interpreter)
+
+BECAUSE OF RAM CONSTRAINTS, CACHING IS NOT USED TO THE FULL EXTENT, IN CONTRAST TO HOW IT IS EXPLAINED IN THE IMPLEMENTATION CHAPTER. I hope I can cache the frontend. If only the finished kernels can not be cached, move this explanation to the transpiler section below and update the reference in subsubsection "System Memory"

 \subsection{Interpreter}
-In this section, the results for the GPU-based interpreter are presented in detail. Following the benchmark results, the process of tuning the interpreter is described as well as how to adapt the tuning for the different benchmarks. This part not only contains the tuning of the GPU, but also performance improvements done on the CPU side.
+Results only for Interpreter (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section)
+\subsection{Performance Tuning}
+Document the process of performance tuning

-\subsubsection{Benchmark 1}
-The first benchmark consists of $250\,000$ expressions and $362$ data points with $100$ parameter optimisation steps. Because each expression needs to be evaluated with each data point for each parameter optimisation step, a total of $250\,000 * 362 * 100 \approx 9.05\,\text{billion}$ evaluations have been performed per sample. In Figure \ref{fig:gpu_i_benchmark_1} the result over all $50$ samples is presented. The median value across all samples is $466.3$ seconds with a standard deviation of $14.2$ seconds.
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark1.png}
-	\caption{The results of the GPU-based interpreter for benchmark 1}
-	\label{fig:gpu_i_benchmark_1}
-\end{figure}
+Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded

-For the kernel configuration, a block size of $128$ threads has been used. As will be explained below, this has been found to be the configuration that results in the most performance. During the benchmark, the utilisation of both the CPU and GPU was roughly $100\%$. 
-
-\subsubsection{Benchmark 2}
-With $10\,000$ expressions, $362$ data points and $100$ parameter optimisation steps, the total number of evaluations per sample was $362\,\text{million}$. The median across all samples is $21.3$ seconds with a standard deviation of $0.75$ seconds. Compared to the first benchmark, there were $25$ times fewer evaluations which also resulted in a reduction of the median and standard deviation of roughly $25$ times. This indicates a roughly linear correlation between the number of expressions and the runtime. Since the number of data points did not change, the block size for this benchmark remained at $128$ threads. Again the utilisation of the CPU and GPU during the benchmark was roughly $100\%$.
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark2.png}
-	\caption{The results of the GPU-based interpreter for benchmark 2}
-	\label{fig:gpu_i_benchmark_2}
-\end{figure}
-
-\subsubsection{Benchmark 3}
-The third benchmark used the same $10\,000$ expressions and $100$ parameter optimisation steps. However, now there are $30$ times more data points that need to be used for evaluation. This means that the total number of evaluations per sample is now $10.86\,\text{billion}$. Compared to the first benchmark, an additional $1.8\,\text{billion}$ evaluations were performed. However, as seen in Figure \ref{fig:gpu_i_benchmark_3}, the execution time was significantly shorter. With a median of $30.3$ seconds and a standard deviation of $0.45$ seconds, this benchmark was only marginally slower than the second benchmark. This also indicates that the GPU evaluators are much better suited to scenarios with a high number of data points.
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark3.png}
-	\caption{The results of the GPU-based interpreter for benchmark 3}
-	\label{fig:gpu_i_benchmark_3}
-\end{figure}
-
-Although the number of data points has been increased by $30$ times, the block size remained at $128$ threads. Unlike the previous benchmarks, the hardware utilisation was different. Now only the GPU was utilised to 100\% while the CPU utilisation started at 100\% and slowly dropped to 80\%. The GPU needs to perform $30$ times more evaluations per expression, meaning it takes longer for one kernel dispatch to be finished. At the same time, the CPU tries to dispatch the kernel at the same rate as before. Because only a certain number of kernels can be dispatched at once, the CPU needs to wait for the GPU to finish a kernel before another one can be dispatched. Therefore, in this scenario, the evaluator runs into a GPU-bottleneck and using a more performant GPU would consequently improve the runtime. In the previous benchmarks, both the CPU and GPU would need to be upgraded, to achieve better performance.
-
-
-\subsection{Performance Tuning Interpreter}
-\label{sec:tuning_interpreter}
-Optimising and tuning the interpreter is crucial to achieve good performance. This is especially true for tuning the kernel, as a wrongly configured kernel can drastically degrade performance. Before any performance tuning and optimisation has been performed, the kernel was configured with a block size of $256$ threads since it is a good initial configuration as recommended by \textcite{nvidia_cuda_2025-1}. Additionally, on the CPU, the frontend was executed for each expression before every kernel dispatch, even in parameter optimisation scenarios, where the expressions did not change from one dispatch to the other. Moreover, the variables have also been transmitted to the GPU before every dispatch. However, executing the frontend, as well as dispatching the kernel was multithreaded, utilising all 12 threads of the CPU, and a cache for the frontend was utilised.
-
-With this implementation, the initial performance measurements have been conducted for the first benchmark which served as the baseline for further performance optimisations. However, as already mentioned, during this benchmark, memory limitations were encountered, as too much RAM was being used. Therefore, the caching had to be disabled. Because the evaluator is multithreaded, this change resulted in significantly better performance. As the cache introduced critical sections where race conditions could occur, locking mechanisms were required. While locking ensures that no race conditions occur, it also means that parts of an otherwise entirely parallel implementation are now serialised, reducing the effect of parallelisation.
-
-Without a cache and utilising all 12 threads, the frontend achieved very good performance. Processing $250\,000$ expressions takes roughly $88.5$ milliseconds. On the other hand, using a cache, resulted in the frontend running for $6.9$ \text{seconds}. This equates to a speed-up of roughly 78 times when using no cache. Additionally, when looking at the benchmark results above, the time it takes to execute the frontend is negligible, meaning further optimising the frontend would not significantly improve the overall runtime.
-
-During the tuning process $362$ data points have been used, which is the number of data points used by benchmark one and two. Before conducting benchmark three, additional performance tuning has been performed to ensure that this benchmark also utilises the hardware as much as possible.
-
-\subsubsection{Optimisation 1}
-
-After caching has been disabled, the first performance improvement was to drastically reduce the number of calls to the frontend and the number of data transfers to the GPU. Because the expressions and variables never change during the parameter optimisation process, processing the expression and transmitting the data to the GPU on each step wastes resources. Therefore, the expressions are sent to the frontend once before the parameter optimisation process. Afterwards, the processed expressions as well as the variables are transferred to the GPU exactly once for this execution of the interpreter.
-
-Figure \ref{fig:gpu_i_optimisation_1} shows how this optimisation improved the overall performance as demonstrated with benchmark one. However, it can also be seen that the range the individual samples fall within is much greater now. While in all cases, this optimisation improved the performance, in some cases the difference between the initial and the optimised version is very low with roughly a two-second improvement. On median the performance improvement was roughly five percent.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/interpreter-comparison-initial-optim1.png}
-	\caption{Comparison of the initial implementation with the first optimisation applied on benchmark one. Note that while the results of the optimisation fall within a much wider range, all samples performed better than the initial implementation.}
-	\label{fig:gpu_i_optimisation_1}
-\end{figure}
-
-\subsubsection{Optimisation 2}
-
-The second optimisation was concerned with tuning the kernel configuration. Using NSight Compute\footnote{\url{https://developer.nvidia.com/nsight-compute}} it was possible to profile the kernel with different configurations. During the profiling a lot of metrics have been gathered that allowed to deeply analyse the kernel executions, with the application recommending different aspects that had potential for performance improvements.
-
-Since the evaluator is designed to execute many kernel dispatches in parallel, it was important to reduce the kernel runtime. Reducing the runtime per kernel has a knock-on effect, as the following kernel dispatches can begin execution sooner reducing the overall runtime.
-
-After the evaluator tuning has been concluded, it was found that a block size of $128$ yielded the best results. With this kernel configuration, another performance measurement has been conducted with the results shown in Figure \ref{fig:gpu_i_optimisation_2} using benchmark one. As can be seen, the overall runtime again was noticeably faster, albeit with an improvement of only roughly six percent. However, the standard deviation also drastically increased, with the duration from the fastest to the slowest sample differing by roughly 60 seconds.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/interpreter-comparison-optim1-optim2.png}
-	\caption{Comparison of the first optimisation with the second applied on benchmark one.}
-	\label{fig:gpu_i_optimisation_2}
-\end{figure}
-
-The found block size of $128$ might seem strange. However, it makes sense, as in total at least $362$ threads need to be started to evaluate one expression. If one block contains $128$ threads a total of $362 / 128 \approx 3$ blocks need to be started, totalling $384$ threads. As a result, only $384 - 362 = 22$  threads are excess threads. When choosing a block size of $121$ three blocks could be started, totalling one excess thread. However, there is no performance difference between a block size of $121$ and $128$. Since all threads are executed inside a warp, which consists of exactly $32$ threads, a block size that is not divisible by $32$ has no benefit and only hides the true amount of excess threads started.
-
-Benchmark three had a total of $10\,860$ data points, meaning at least this number of threads must be started. To ensure optimal hardware utilisation, the evaluator had to undergo another tuning process. As seen above, it is beneficial to start as few excess threads as possible. By utilising NSight Compute, a performance measurement with a block size of $128$ was used as the initial configuration. This already performed well as again very few excess threads are started. In total $10\,860 / 128 \approx 84.84$ blocks are needed, which must be rounded up to $85$ blocks with the last block being filled by roughly $84\%$ which equates to $20$ excess threads being started.
-
-This was repeated for two more configurations, once for a block size of $160$ and once for $192$. With a block size of $160$, the total number of blocks was reduced to $68$, which again resulted in $20$ excess threads being started. The hypothesis behind increasing the block size was that using fewer blocks would result in better utilisation and therefore better performance. The same idea was also behind choosing a block size of $192$. However, while this only required $57$ blocks, the number of excess threads increased to $84$.
-
-Using NSight Compute it was found that a block size of $160$ was the best performing, followed by the block size of $192$, and the worst performing configuration was with a block size of $128$. However, this is not representative of how these configurations performed during the benchmarks. As seen in Figure \ref{fig:gpu_i_128-160-192}, using a block size of $128$ led to significantly better performance than the other configurations. While a block size of $160$ led to worse results, it needs to be noted that it also improved the standard deviation by 25\% when compared to the results with a block size of $128$. These results also demonstrate that it is important to not only use NSight Compute but also conduct performance tests with real data to ensure the best possible configuration is chosen.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/interpreter-comparison-128-160-192.png}
-	\caption{Comparison of the execution times of benchmark three with a block size of 128, 160 and 192.}
-	\label{fig:gpu_i_128-160-192}
-\end{figure}
-
-\subsubsection{Optimisation 3}
-As seen in Figure \ref{fig:gpu_i_optimisation_2}, while the performance overall improved, the standard deviation also significantly increased. With the third optimisation the goal was to reduce the standard deviation. In order to achieve this, some minor optimisations were applied.
-
-The first optimisation was to reduce the stack size of the interpreter from 25 to 10. As the stack is stored in local memory, it is beneficial to minimise the data transfer and allocation of memory. This change, however, means that the stack might not be sufficient for larger expressions. Because with a stack size of 10 no problems were found during testing, it was assumed to be sufficient. In cases where this isn't sufficient, the stack size can be increased.
-
-During the parameter optimisation step a lot of memory operations were performed. These are required as for each step new memory on the GPU must be allocated for both the parameters and the meta information. The documentation of CUDA.jl\footnote{\url{https://cuda.juliagpu.org/stable/usage/memory/\#Avoiding-GC-pressure}} mentioned that this can lead to higher garbage-collector (GC) pressure, increasing the time spent garbage-collecting. To reduce this, CUDA.jl provides the \verb|CUDA.unsafe_free!(::CuArray)| function. This frees the memory on the GPU without requiring the Julia GC to run, so that fewer resources are spent on garbage-collecting and more on evaluating the expressions.
-
-With these two changes the overall runtime has been improved by two percent as can be seen in Figure \ref{fig:gpu_i_optimisation_3}. Moreover, the standard deviation was also reduced which was the main goal of this optimisation.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/interpreter-comparison-optim2-optim3.png}
-	\caption{Comparison of the second optimisation with the third applied on benchmark one.}
-	\label{fig:gpu_i_optimisation_3}
-\end{figure}
+1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
+2.) tuned blocksize to have as little wasted threads as possible (new blocksize 121 -> 3-blocks -> 363 threads but 362 threads needed per expression)


 \subsection{Transpiler}
-In this section the results for the transpiler are presented in detail. First the results for all three benchmarks are shown. The benchmarks are the same as already explained in the previous sections. After the results, an overview of the steps taken to optimise the transpiler execution times is given. 
-
-\subsubsection{Benchmark 1}
-\label{sec:gput_bench1}
-This benchmark led to very poor results for the transpiler. While the best performing kernel configuration of $128$ threads per block was used, the above-mentioned RAM constraints meant that this benchmark performed poorly. After roughly $20$ hours of execution only two samples have been taken, at which point it was decided to not finish this benchmark and treat it as failed.
-
-As described in Chapter \ref{cha:implementation} the expressions are transpiled into PTX code and then immediately compiled into machine code by the GPU driver before the compiled kernels are sent to the parameter optimisation step. This order of operations makes sense as the expressions remain the same during this process and otherwise would result in performing a lot of unnecessary transpilations and compilations.
-
-However, only 16 GB of RAM were available with about half of that being used by the operating system. This meant that about eight GB of RAM were available to store $250\,000$ compiled kernels alongside other required data, for example the variable matrix. As a result, this was not enough memory and the benchmark failed. To combat this, the step of compiling the kernels was moved into the parameter optimisation process, as this would free the memory taken up by the compiled kernel after it has been executed. As seen above, the performance consequently suffered dramatically, which shows that for these scenarios much more memory is required for the transpiler to work properly.
+Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section)


-\subsubsection{Benchmark 2}
-By reducing the number of expressions from $250\,000$ to roughly $10\,000$ the RAM constraint that hindered the first benchmark is not a concern any more. This can also be seen in Figure \ref{fig:gpu_t_benchmark_2} where the benchmark could be completed in a much more reasonable time. The median of this benchmark was $19.6$ seconds with a standard deviation of $1.16$ seconds. Again for this benchmark a block size of $128$ threads has been chosen.
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/gpu-transpiler-final-performance-benchmark2.png}
-	\caption{The results of the transpiler for benchmark 2.}
-	\label{fig:gpu_t_benchmark_2}
-\end{figure}

-During the benchmark it was observed that the CPU maintained a utilisation of 100\%. However, crucially, the GPU rapidly oscillated between 0\% and 100\% utilisation. This pattern suggests that while the kernels can fully utilise the GPU, they complete the evaluations almost immediately. Consequently, although the evaluation is performed very quickly, the time spent evaluating is smaller than the time spent preparing the expressions for evaluation. To better leverage the GPU, more evaluations should be performed. This would increase the GPU's share of total execution time and therefore increase the efficiency drastically.
+\subsection{Performance Tuning}
+Document the process of performance tuning

+Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded

-\subsubsection{Benchmark 3}
-This benchmark increased the amount of data points by $30$ times and therefore also increases the total number of evaluations by $30$ times. As observed in the second benchmark, the GPU was underutilised and thus had more resources available for evaluating the expressions. As shown in Figure \ref{fig:gpu_t_benchmark_3} the available resources were better utilised. Although the number of evaluations increased by a factor of $30$, the median execution time only increased by approximately six seconds, or $1.3$ times, from $19.6$ to $25.4$ seconds. The standard deviation also decreased from $1.16$ seconds to $0.65$ seconds.
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/gpu-transpiler-final-performance-benchmark3.png}
-	\caption{The results of the transpiler for benchmark 3.}
-	\label{fig:gpu_t_benchmark_3}
-\end{figure}
-
-Given the change in the number of data points, additional performance tests with different block sizes were conducted. During this process it was found that changing the block size from $128$ to $160$ threads resulted in the best performance. This is in contrast to the GPU interpreter where changing the block size to $160$ resulted in degraded performance.
-
-While conducting this benchmark, the CPU utilisation began at 100\% during the frontend step as well as the transpilation and compilation steps. However, similar to the third benchmark of the GPU interpreter, the CPU utilisation dropped to 80\% during the evaluation phase. This is very likely due to the same reason that the kernels are dispatched too quickly in succession, filling up the number of allowed resident grids on the GPU.
-
-However, GPU utilisation also increased drastically. During the second benchmark, rapid oscillation was observed. With this benchmark the utilisation remained much more stable with the utilisation hovering around 60\% to 70\% most of the time. It should also be noted that there appeared frequent spikes to 100\% and slightly less frequent drops to 20\% utilisation. Overall the GPU utilisation was much higher compared to the second benchmark, which explains why the execution time only increased slightly despite the drastic increase in the number of evaluations.
-
-\subsection{Performance Tuning Transpiler}
-% Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded
-This section describes how the transpiler has been tuned to achieve good performance. Steps taken to improve the performance of the CPU-side of the transpiler are presented. Additionally, steps taken to improve the performance of the kernels are also shown.
-
-Before any optimisations were applied, the block size was set to $256$ threads. The frontend as well as the transpilation and compilation were performed during the parameter optimisation step before the expression needed to be evaluated. Additionally, the variables have also been sent to the GPU on every parameter optimisation step. Multithreading has been used for the frontend, transpilation, compilation and kernel dispatch. Caching has also been used for the frontend and for the transpilation process in an effort to reduce the runtime.
-
-As already mentioned in Section \ref{sec:tuning_interpreter}, using a cache in combination with multithreading for the frontend drastically slowed down the execution, which is the reason it has been disabled before conducting any benchmarks. 
-
-Caching has also been used for the transpilation step. The reason for this was to reduce the runtime during the parameter optimisation step. While this reduced the overhead of transpilation, the overhead of searching the cache if the expression has already been transpiled still existed. Because of the already mentioned RAM constraints this cache has been disabled and a better solution has been implemented in the first and second optimisation steps.
-
-Most data of the tuning process has been gathered with the number of expressions and data points of the first benchmark, as this was the worst performing scenario. Therefore, it would show best where potential for performance improvements was. Before any optimisations were applied a single sample of the first benchmark took roughly 15 hours. However, it needs to be noted that only two samples were taken due to the duration of one sample.
-
-\subsubsection{Optimisation 1}
-% 1.) Done before parameter optimisation loop: Frontend, transmitting Variables (improved runtime)
-Since all caching has been disabled, a better solution for reducing the number of calls to the frontend was needed. For this, the calls to the frontend were moved outside the parameter optimisation step and storing the result for later use. Furthermore, transmitting the variables to the GPU has also been performed before the parameter optimisation is started, further reducing the number and volume of data transfer to the GPU. These two optimisations were able to reduce the runtime of one sample to roughly 14 hours and are equivalent to the first optimisation step of the GPU interpreter.
-
-\subsubsection{Optimisation 2}
-% 2.) All expressions to execute are transpiled first (before they were transpiled for every execution, even in parameter optimisation scenarios). Compilation is done every time in benchmark 1, because too little RAM was available (compilation takes the most time). 
-With this optimisation step the number of calls to the transpiler and compiler have been drastically reduced. Both steps are now performed at the same time the frontend is called. The compiled kernels are then stored and only need to be executed during the parameter optimisation step. This meant that a cache was not needed any more. Because each time a new set of expressions needs to be evaluated, it is extremely unlikely that the same expression needs to be evaluated more than once. Consequently, the benefit of reducing the RAM consumption far outweighs the potential time savings of using a cache. Moreover, removing the cache also reduced the overhead of accessing it on every parameter optimisation step, further improving performance.
-
-It also must be noted, that compiling the PTX kernels and storing the result before the parameter optimisation step lead to an out of memory error for the first benchmark. In order to get any results, this step had to be reverted for this benchmark. If much more RAM were available, the runtime would have been significantly better.
-
-Nonetheless, these optimisations lead to a runtime of one sample of roughly ten hours for the first benchmark. Therefore, a substantial improvement of roughly four hours or 40\% per sample was achieved. When $10\,000$ expressions are transpiled it takes on average $0.05$ seconds over ten samples. Comparing this to the time spent compiling the resulting $10\,000$ kernels it takes on average $3.2$ seconds over ten samples. This suggests that performing the compilation before the parameter optimisation step would yield drastically better results in the first benchmark.
-
-\subsubsection{Optimisation 3}
-% 3.) benchmark3 std noticeably improved with blocksize 160 (around 70\% better) (also includes call to unsafe_free)
-% here I can show chart of comparing the two blocksizes
-% unsafe_free in benchmark one reduced std. but could also be run to run variance. at least no negative effects
-The third optimisation step was more focused on improving the performance for the third benchmark as it has a higher number of data points than the first and second one. However, as with the interpreter, the function \verb|CUDA.unsafe_free!(::CuArray)| has been used to reduce the standard deviation for all benchmarks.
-
-Since the number of data points has changed in the third benchmark, it is important to re-do the performance tuning. This was done by measuring the kernel performance using NSight Compute. As with the interpreter, block sizes of $128$ and $160$ threads have been compared with each other. A block size of $192$ threads has been omitted here since the number of excess threads is very high. In the case of the interpreter the performance of this configuration was the worst out of the three configurations, and it was assumed it will be similar in this scenario.
-
-However, since the number of excess threads for $128$ and $160$ threads per block is the same, the latter using fewer blocks might lead to performance improvements in the case of the transpiler. As seen in Figure \ref{fig:gpu_t_128_160} this assumption was true and using a block size of $160$ threads resulted in better performance for the third benchmark. This is in contrast to the interpreter, where this configuration performed much more poorly.
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/transpiler-comparison-128-160.png}
-	\caption{Runtime comparison of the third benchmark with block sizes of 128 and 160 threads.}
-	\label{fig:gpu_t_128_160}
-\end{figure}
+1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime)
+2.) All expressions to execute are transpiled first (before they were transpiled for every execution, even in parameter optimisation scenarios). Compilation is still done every time, because too little RAM was available (compilation takes the most time, so this is only a minor boost)

 \subsection{Comparison}
-% Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
-% more var sets == better performance for GPU; more expressions == more performance for CPU evaluator
-With the individual results of the GPU interpreter and transpiler presented, it is possible to compare them with the existing CPU interpreter. This section aims at outlining and comparing the performance of all three implementations across all three benchmarks to understand their strengths and weaknesses. Through this analysis the scenarios will be identified where it is best to leverage the GPU but also when using the CPU interpreter is the better choice, ultimately answering the research questions of this thesis.
+Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter

-\subsubsection{Benchmark 1}
-The goal of the first benchmark was to determine how the evaluators are able to handle large amounts of expressions. While this benchmark is not representative of a typical scenario, it allows for demonstrating the impact the number of expressions has on the execution time. As already explained in Section \ref{sec:gput_bench1} the transpiler failed to finish this benchmark due to RAM limitations. This required a slightly modified implementation to obtain results for at least two samples, each taking roughly ten hours to complete, which is the reason it has been omitted from this comparison.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/cpu_gpui_gput_bench1.png}
-	\caption{The results of the comparison of the CPU and GPU based interpreter for the first benchmark. Note that the transpiler is absent because it did not finish this benchmark.}
-	\label{fig:cpu_gpui_gput_benchmark_1}
-\end{figure}
-
-Figure \ref{fig:cpu_gpui_gput_benchmark_1} shows the results of the first benchmark for the CPU and GPU interpreter. It can be seen that the GPU interpreter takes roughly four times as long on median than the CPU interpreter. Additionally, the standard deviation is much larger on the GPU interpreter. This shows that the CPU heavily benefits from scenarios where a lot of expressions need to be evaluated with very few data points. Therefore, it is not advisable to use the GPU to increase the performance in such scenarios. 
-
-\subsubsection{Benchmark 2}
-Since the first benchmark has shown that with a large number of expressions the GPU is not a suitable alternative to the CPU. To further proof this statement a second benchmark with much fewer expressions was conducted. Now instead of $250\,000$ expressions, only $10\,000$ are evaluated. This reduction also meant that the transpiler can now be included in the comparison as it does not face any RAM limitations any more.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/cpu_gpui_gput_bench2.png}
-	\caption{The results of the comparison of all three implementations for the second benchmark.}
-	\label{fig:cpu_gpui_gput_benchmark_2}
-\end{figure}
-
-Reducing the number of expressions did not benefit the GPU evaluators at all in relation to the CPU interpreter. This can be seen in Figure \ref{fig:cpu_gpui_gput_benchmark_2}. Furthermore, now the GPU evaluators are both roughly five times slower than the CPU interpreter instead of the previous performance reduction of roughly four times. Again the standard deviation is also much higher on both GPU evaluators when compared to the CPU interpreter. This means that a lower number of expressions does not necessarily mean that the GPU can outperform the CPU. Thus disproving the above statement that only a large number of expressions results in the GPU performing poorly.
-
-On the other side, it can also be seen that the GPU transpiler tends to perform better than the GPU interpreter. While in the worst case both implementations are roughly equal, the GPU transpiler on median performs better. Additionally, the GPU transpiler can also outperform the GPU interpreter in the best case.
-
-\subsubsection{Benchmark 3}
-As found by the previous two benchmarks, varying the number of expressions only has a slight impact on the performance of the GPU in relation to the performance of the CPU. However, instead of varying the number of expressions, the number of data points can also be changed. For this benchmark, instead of $362$ data points, a total of $10\,860$ data points were used, which translates to an increase in performance by $30$ times. It needs to be noted, that it was only possible to evaluate the performance with roughly $10\,000$ expressions with this number of data points. When using the same roughly $250\,000$ expressions of the first benchmark and the increased number of data points, none of the implementations managed to complete the benchmark, as there was too little RAM available.
-
-\begin{figure}
-	\centering
-	\includegraphics[width=.9\textwidth]{results/cpu_gpui_gput_bench3.png}
-	\caption{The results of the comparison of all three implementations for the third benchmark.}
-	\label{fig:cpu_gpui_gput_benchmark_3}
-\end{figure}
-
-Increasing the number of data points greatly benefited both GPU evaluators as seen in Figure \ref{fig:cpu_gpui_gput_benchmark_3}. With this change, the CPU interpreter noticeably fell behind the GPU evaluators. Compared to the GPU transpiler, the CPU interpreter took roughly twice as long on median. The GPU transpiler continued its trend of performing better than the GPU interpreter. Furthermore, the standard deviation of all three evaluators is also very similar.
-
-From this benchmark it can be concluded that the GPU heavily benefits from a larger number of data points. If the number of data points is increased even further, the difference in performance between the GPU and CPU should be even more pronounced. 
-
-While the GPU is very limited in terms of concurrent kernel dispatches that can be evaluated, the number of threads and blocks can virtually be infinitely large. This means that a higher degree of parallelism is achievable with a higher number of data points. Increasing the number of expressions on the other hand does not influence the degree of parallelism to this extent. This is the reason no performance benefit was found by only decreasing the number of expressions with the same number of data points.
-
-\subsection{Discussion}
-A similar problem statement of this thesis has already been explored by \textcite{weinberger_vektoroperationen_2018}. In his thesis he explored how utilising vector operations can be used in evaluating expression trees generated with GP. He used OpenCL to, on the one hand, vectorise a CPU implementation, and on the other hand utilise the GPU. Utilising the GPU using CUDA to evaluate expressions generated at runtime has also been the focus of this thesis. However, the goal of this thesis was to compare two GPU implementations with each other and a CPU implementation specifically for the use in symbolic regression utilising parameter optimisation. 
-
-In his thesis, Weinberger found that the GPU was able to outperform the CPU in all instances. Especially with larger datasets the advantage of the GPU was clearly visible. This trend was also confirmed in this thesis, specifically when comparing the second and third benchmarks. However, in this thesis, the CPU implementation was able to outperform the GPU clearly in two out of three benchmarks. This difference might be caused by the sophisticated usage of vectorisation in the CPU implementation which used for comparison. Overall this thesis was able to confirm the findings of Weinberger. Additionally, implementations are demonstrated that support the evaluation of expressions generated at runtime on the GPU that allow the usage of parameter optimisation which was not possible with Weinberger's implementation.
+talk about that compute portion is just too little. Only more complex expressions with higher var set count benefit well (make one or two performance evaluations, with 10 larger expressions and at least 1k var sets and present that here as point for that statement)
--- a/thesis/chapters/implementation.tex
+++ b/thesis/chapters/implementation.tex
@ -1,36 +1,40 @@
 \chapter{Implementation}
 \label{cha:implementation}

-This chapter focuses on the implementation phase of the thesis, building upon the concepts and designs previously discussed. It begins with an overview of the technologies employed for both the CPU and GPU parts of the prototypes. This is followed by a description of the pre-processing or frontend phase. The chapter concludes with a detailed overview of the core components, the interpreter and the transpiler.
+This chapter focuses on the implementation phase of the project, building upon the concepts and designs previously discussed. It begins with an overview of the technologies employed for both the CPU and GPU parts of the application. This is followed by a description of the pre-processing or frontend phase. The chapter concludes with a detailed overview of the core components, the interpreter and the transpiler.
+
+% Go into the details why this implementation is tuned towards performance and should be the optimum at that

 \section{Technologies}
 This section describes the technologies used for both the CPU side of the prototypes and the GPU side. The rationale behind these choices, including consideration of their performance implications, is presented. In addition, the hardware limitations imposed by the choice of GPU technology are outlined.

 \subsection{CPU side}
-Both prototypes were implemented using the Julia programming language. It was chosen mainly, because the current symbolic regression algorithm is also implemented in Julia. Being a high-level programming language, with modern features such as a garbage-collector (GC), support for meta-programming and dynamic typing, it also offers great convenience to the developer. 
+Both prototypes were implemented using the Julia programming language. It was chosen mainly, because the current symbolic regression algorithm is also implemented in Julia. Being a high-level programming language, with modern features such as a garbage collector, support for meta-programming and dynamic typing, it also offers great convenience to the developer. 

 More interestingly however, is the high performance that can be achieved with this language. It is possible to achieve high performance despite the supported modern features, which are often deemed to be harmful to performance. \textcite{bezanson_julia_2017} have shown how Julia can provide C-like performance while supporting the developer with modern quality of life features. The ability of Julia to be used in high performance computing scenarios and to be competitive with C has been demonstrated by \textcite{lin_comparing_2021}. This shows how Julia is a good and valid choice for scenarios where developer comfort and C-like performance are needed.

 \subsection{GPU side}
 In addition to a programming language for the CPU, a method for programming the GPU is also required. For this purpose, the CUDA API was chosen. While CUDA offers robust capabilities, it is important to note that it is exclusively compatible with Nvidia GPUs. An alternative would have been OpenCL, which provides broader compatibility by supporting GPUs from Nvidia, AMD and Intel. However, considering Nvidia's significant market share and the widespread adoption of CUDA in the industry, the decision was made to use CUDA.

-A typical CUDA program is primarily written C++ and Nvidia also provides their CUDA compiler nvcc\footnote{\url{https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/}} for C and C++ and their official CUDA programming guide \parencite{nvidia_cuda_2025} also uses C++ for code examples. It is also possible to call C++ code from within Julia. This would allow for writing the kernel and interaction with the GPU in C++, leveraging the knowledge built up in the industry over several years.
+A typical CUDA program is primarily written in C++. Nvidia also provides their CUDA compiler nvcc\footnote{\url{https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/}} for C and C++, and their official CUDA programming guide \parencite{nvidia_cuda_2025} also uses C++ for code examples. It is also possible to call C++ code from within Julia. This would allow for writing the kernel and interacting with the GPU in C++, leveraging the knowledge built up over several years.

 \subsubsection{CUDA and Julia}
-Instead of writing the kernel in C++ and calling it from Julia, a much simpler and effective alternative is available. The Julia package CUDA.jl\footnote{\url{https://cuda.juliagpu.org/}} enables a developer to write a kernel in Julia similar to how a kernel is written in C++ with CUDA. One drawback of using CUDA.jl however, is the fact that it is much newer compared to CUDA and therefore does not have years of testing and bug fixing in its history, which might be a concern for some applications. Apart from writing kernels with CUDA.jl, it also offers a method for interacting with the driver to compile PTX code into machine code. This is a must-have feature as otherwise, it wouldn't have been possible to fully develop the transpiler in Julia.
+Instead of writing the kernel in C++ and calling it from Julia, a much simpler and effective alternative can be used. The Julia package CUDA.jl\footnote{\url{https://cuda.juliagpu.org/}} enables a developer to write a kernel in Julia similar to how a kernel is written in C++ with CUDA. One drawback of using CUDA.jl however, is the fact that it is much newer compared to CUDA and therefore does not have years of testing and bug fixing in its history, which might be a concern for some applications. Apart from writing kernels with CUDA.jl, it also offers a method for interacting with the driver, to compile PTX code into machine code. This is a must-have feature as otherwise, it wouldn't have been possible to fully develop the transpiler in Julia.

-Additionally, the JuliaGPU initiative\footnote{\url{https://juliagpu.org/}} offers a collection of additional packages to enable GPU development for AMD, Intel and Apple and not just for Nvidia. However, CUDA.jl is also the most mature of the available implementations, which is another reason why CUDA has been chosen instead of for example OpenCL. 
+Additionally, the JuliaGPU initiative\footnote{\url{https://juliagpu.org/}} offers a collection of additional packages to enable GPU development for AMD, Intel and Apple and not just for Nvidia. However, CUDA.jl is also the most mature of the available implementations, which is also a reason why CUDA has been chosen instead of for example OpenCL. 

-Again, the question arises as to whether the performance of CUDA.jl is sufficient for it to be used as an alternative to C++ and CUDA. Studies by \textcite{besard_rapid_2019, lin_comparing_2021, faingnaert_flexible_2022} have demonstrated, that CUDA.jl provides sufficient performance. They found that, in some cases, CUDA.jl performed better than the same algorithm implemented in C and C++, and that it is on par otherwise. These results provide the confidence, that Julia alongside CUDA.jl is a good choice for leveraging the performance of GPUs to speed up expression evaluation.
+Again, the question arises whether the performance of CUDA.jl is sufficient to be used as an alternative to C++ and CUDA. Performance studies by \textcite{besard_rapid_2019}, \textcite{lin_comparing_2021} and \textcite{faingnaert_flexible_2022} have demonstrated that CUDA.jl provides sufficient performance. They found that in some cases CUDA.jl was able to perform better than the same algorithm implemented in C and C++. This provides the confidence that Julia alongside CUDA.jl is a good choice for leveraging the performance of GPUs to speed up expression evaluation.

 \section{Pre-Processing}
+% Talk about why this needs to be done and how it is done (the why is basically: simplifies evaluation/transpilation process; the how is in ExpressionProcessing.jl (the why is probably not needed because it is explained in concept and design))
 The pre-processing or frontend step is very important. As already explained in Chapter \ref{cha:conceptdesign}, it is responsible for ensuring that the given expressions are valid and that they are transformed into an intermediate representation. This section aims to explain how the intermediate representation is implemented, as well as how it is generated from a mathematical expression.

 \subsection{Intermediate Representation}
 \label{sec:ir}
-The intermediate representation is mainly designed to be lightweight and easily transferrable to the GPU. Since the interpreter runs on the GPU, this was a very important consideration. Because the transpilation process is done on the CPU, and is therefore very flexible in terms of the intermediate representation, the focus lied mainly on being efficient for the interpreter.
+% Talk about how it looks and why it was chosen to look like this
+The intermediate representation is mainly designed to be lightweight and easily transferrable to the GPU. Since the interpreter runs on the GPU, this was a very important consideration. Because the transpilation process is done on the CPU, and is therefore very flexible in terms of the intermediate representation, the focus was mainly on being efficient for the interpreter.

-The intermediate representation cannot take any form. While it has already been defined that expressions are converted to postfix notation, there are several ways to store the data. The first logical choice is to create an array where each entry represents a token. On the CPU it would be possible to define each entry as a pointer to the token object. Each of these objects could be of a different type, for example one object that holds a constant value while another object holds an operator. In addition, each of these objects could contain its own logic about what to do when it is encountered during the evaluation process. However, on the GPU, this is not possible, as an array entry must hold a value and not a pointer to another memory location. Furthermore, even if it were possible, it would not be a feasible solution. As explained in Section \ref{sec:memory_model}, when loading data from global memory, larger chunks are retrieved at once. If the data is scattered across the GPU's global memory, a lot of unwanted data will be transferred. This can be seen in Figure \ref{fig:excessive-memory-transfer}, where if the data is stored sequentially, far fewer data operations and far less data in general needs to be transferred.
+The intermediate representation cannot take any form. While it has already been defined that expressions are converted to postfix notation, there are several ways to store the data. The first logical choice is to create an array where each entry represents a token. On the CPU it would be possible to define each entry as a pointer to the token object. Each of these objects could be of a different type, for example one object that holds a constant value while another object holds an operator. In addition, each of these objects could contain its own logic about what to do when it is encountered during the evaluation process. However, on the GPU, this is not possible, as an array entry must hold a value and not a pointer to another memory location. Furthermore, even if it were possible, it would be a bad idea. As explained in Section \ref{sec:memory_model}, when loading data from global memory, larger chunks are retrieved at once. If the data is scattered across the GPU's global memory, a lot of unwanted data will be transferred. This can be seen in Figure \ref{fig:excessive-memory-transfer}, where if the data is stored sequentially, far fewer data operations and far less data in general needs to be transferred.

 \begin{figure}
 	\centering
@ -39,21 +43,21 @@ The intermediate representation cannot take any form. While it has already been
 	\label{fig:excessive-memory-transfer}
 \end{figure}

-Due to this, and the fact that the GPU does not allow pointers, an alternative approach is required. Rather than storing pointers to objects of different types in an array, it is possible to store objects of a single type. As described in Section \ref{sec:pre-processing}, the objects thus contain the type of the stored value and the value itself. The four types of values that need to be stored in this object differ significantly in terms of the value they represent. The following paragraphs explain how these values can be stored in objects of a single type.
+Because of this and because the GPU does not allow pointers, another solution is required. Instead of storing pointers to objects of different types in an array, it is possible to store objects of a single type that carry meta information. Each object thus contains the type of the stored value and the value itself, as described in Section \ref{sec:pre-processing}. The four types that need to be stored in this object differ significantly in the value they represent.

 Variables and parameters are very simple to store. Because they represent indices to the variable matrix or the parameter vector, this (integer) index can be stored as is in the value property of the object. The type can then be used to determine whether it is an index to a variable or a parameter access.

-Constants are also very simple, as they represent a single 32-bit floating point value. However, due to the variables and parameters, the value property is already defined as an integer and not as a floating point number. Unlike in dynamically typed languages such as Python, where every number is a floating point number, in Julia they these have different types and therefore cannot be stored in the same property. Creating a second property for constants only is not feasible, as this would introduce four bytes per object that need to be sent to the GPU, which most of the time does not contain a defined value. 
+Constants are also very simple, as they represent a single 32-bit floating point value. However, because of the variables and parameters, the value property is already defined as an integer and not as a floating point number. Unlike in languages like Python, where every number is a floating point number, in Julia integers and floating point numbers are different types and therefore cannot be stored in the same property. Creating a second property for constants only is not feasible, as this would introduce 4 bytes per object that need to be sent to the GPU, which most of the time do not contain a defined value.

-To avoid sending unnecessary bytes, Julia provides a mechanism called \verb|reinterpret| that can be used. This allows the bits of a variable of one type, to be treated as the bits of a different type. For example, the bits used to represent a floating point number are then interpreted as an integer and can be stored in the same property. On the GPU, the same concept can be applied to reinterpret the integer value as a floating point value for further calculations. This is also the reason why the original type of the value needs to be stored alongside the value in order for the stored value to be interpreted and the expressions to be evaluated correctly.
+To avoid sending unnecessary bytes, a mechanism provided by Julia called reinterpret can be used. This allows the bits of a variable of one type to be treated as the bits of another type. The bits used to represent a floating point number are then interpreted as an integer and can be stored in the same property. On the GPU, the same concept can be applied to reinterpret the integer value as a floating point value for further calculations. This is also the reason why the original type of the value needs to be stored alongside the value in order for the stored value to be interpreted and the expressions to be evaluated correctly.

-Operators are very different from variables, parameters and constants. Because they represent an operation rather than a value, a different way of storing them is required. An operator can be uniquely mapped to a number to identify the operation. For example, if the addition operator is mapped to the integer $1$. Consequently, when the evaluator encounters an object of type operator and a value of $1$, it can determine the corresponding operation to perform. This can be done for all operators which means it is possible to store them in the same object structure. The type must be specified to be an operator and the value can be stored without needing to reinterpret it. The mapping of an operator to a value is commonly referred to as an operation code, or opcode, ensuring that each operator is uniquely identifiable.
+Operators are very different from variables, parameters and constants. Because they represent an operation rather than a value, a different way of storing them is required. An operator can be mapped to a number to identify the operation. For example, if the addition operator is mapped to the integer $1$, then when the evaluator encounters an object of type operator and a value of $1$, it will know which operation to perform. This can be done for all operators, which means it is possible to store them in the same object with the same property, and only the type needs to be specified. The mapping of an operator to a value is often called an operation code, or opcode, and each operator is represented as one opcode.

-With this, the intermediate representation is defined. Figure \ref{fig:pre-processing-result-impl} shows how a simple expression would look after the pre-processing step. Note that the bit representation of the value $2.5$ has been reinterpreted as an integer, resulting in the seemingly random value. 
+With this, the intermediate representation is defined. Figure \ref{fig:pre-processing-result-impl} shows how a simple expression would look after the pre-processing step. Note that the value $2.5$ has been reinterpreted as an integer, resulting in the seemingly random value.
 \begin{figure}
 	\centering
 	\includegraphics[width=.9\textwidth]{pre-processing_result_impl.png}
-	\caption{The expression $x_1 + 2.5$ after it has been converted to the intermediate representation. Note that the constant value $2.5$ stores a seemingly random value due to the bits being reinterpreted as an integer.}
+	\caption{The expression $x_1 + 2.5$ after it has been converted to the intermediate representation. Note that the constant value $2.5$ stores a seemingly random value due to it being reinterpreted as an integer.}
 	\label{fig:pre-processing-result-impl}
 \end{figure}

@ -62,9 +66,9 @@ With this, the intermediate representation is defined. Figure \ref{fig:pre-proce
 Now that the intermediate representation has been defined, the processing step can be implemented. This section describes the structure of the expressions and how they are processed. It also explains the process of parsing the expressions to ensure their validity and converting them into the intermediate representation.

 \subsubsection{Expressions} 
-With the pre-processing step, the first modern feature of Julia has been used. As already mentioned, Julia provides extensive support for meta-programming, which is important for this step. Julia represents its own code as a data structure, which allows a developer to manipulate the code at runtime. The code is stored in the so-called \verb|Expr| object as an Abstract Syntax Tree (AST), which is the most minimal tree representation of a given expression. As a result, mathematical expressions can also be represented as such an \verb|Expr| object instead of a simple string. This is a major benefit, because these expressions can then be easily manipulated by the symbolic regression algorithm. Because of this, the pre-processing step requires the expressions to be provided as an \verb|Expr| object instead of a string.
+With the pre-processing step, the first modern feature of Julia has been used. As already mentioned, Julia provides extensive support for meta-programming, which is important for this step. Julia represents its own code as a data structure, which allows a developer to manipulate the code at runtime. The code is stored in the so-called Expr object as an Abstract Syntax Tree (AST), which is the most minimal tree representation of a given expression. As a result, mathematical expressions can also be represented as such an Expr object instead of a simple string. This is a major benefit, because these expressions can then be easily manipulated by the symbolic regression algorithm. This is the main reason why the pre-processing step requires the expressions to be provided as an Expr object instead of a string.

-Another major benefit of the expressions being stored in the \verb|Expr| object and therefore as an AST, is the included operator precedence. Because it is a tree where the leaves are the constants, variables or parameters (also called terminal symbols) and the nodes are the operators, the correct result will be calculated when evaluating the tree from bottom to top. As can be seen in Figure \ref{fig:expr-ast}, the expression $1 + x_1 \, \log(p_1)$, when parsed as an AST, contains the correct operator precedence. First the bottom most subtree $\log(p_1)$ must be evaluated before the multiplication, and after that, the addition can be evaluated.
+Another major benefit of the expressions being stored in the Expr object and therefore as an AST, is the included operator precedence. Because it is a tree where the leaves are the constants, variables or parameters (also called terminal symbols) and the nodes are the operators, the correct result will be calculated when evaluating the tree from bottom to top. As can be seen in Figure \ref{fig:expr-ast}, the expression $1 + x_1 \, \log(p_1)$, when parsed as an AST, contains the correct operator precedence. First the bottom most subtree $\log(p_1)$ must be evaluated before the multiplication, and after that, the addition can be evaluated.

 It should be noted however, that Julia stores the tree as a list of arrays to allow a node to have as many children as necessary. For example the expression $1+2+\dots+n$ contains only additions, which is a commutative operation, meaning that the order of operations is irrelevant. The AST for this expression would contain the operator at the first position in the array and the values at the following positions. This ensures that the AST is as minimal as possible.

@ -75,7 +79,7 @@ It should be noted however, that Julia stores the tree as a list of arrays to al
 	\label{fig:expr-ast}
 \end{figure}

-\subsubsection{Conversion into the Intermediate Representation}
+\subsubsection{Parsing}
 To convert the AST of an expression into the intermediate representation, a top-down traversal of the tree is required. The steps for this are as follows:

 \begin{enumerate}
@ -88,16 +92,18 @@ To convert the AST of an expression into the intermediate representation, a top-
 	\item Return the generated postfix expression/intermediate representation.
 \end{enumerate}

-The validation of the expression is performed throughout the conversion process. Validating that only correct operators are used is performed in step 1. To be able to convert the operator to its corresponding opcode, it must be validated that an opcode exists for it, and therefore whether it is valid or not. Similarly, converting the tokens into an expression element object ensures that only variables and parameters in the correct format are present in the expression. This is handled in step 2.
+The validation of the expression is performed throughout the parsing process. Validating that only correct operators are used is performed in step 1. To be able to convert the operator to its corresponding opcode, it must be validated that an opcode exists for it, and therefore whether it is valid or not. Similarly, converting the tokens into an expression element object ensures that only valid variables and parameters are present in the expression. This is handled in step 2.

-As explained above, a node of a binary operator can have $n$ children. In these cases, additional handling is required to ensure correct conversion. This handling is summarised in step 4. Essentially, the operator must be added after the first two elements, for each subsequent element, the operator must also be added. The expression $1+2+3+4$ is converted to the AST $+\,1\,2\,3\,4$ and without step 4 the postfix expression would be $1\,2\,3\,4\,+$. If the operator is added after the first two elements and then after each subsequent element, the correct postfix expression $1\,2\,+\,3\,+\,4\,+$ will be generated.
+As explained above, a node of a binary operator can have $n$ children. In these cases, additional handling is required to ensure correct conversion. This handling is summarised in step 4. Essentially, the operator must be added after the first two elements, and for each subsequent element, the operator must also be added. The expression $1+2+3+4$ is converted to the AST $+\,1\,2\,3\,4$ and without step 4 the postfix expression would be $1\,2\,3\,4\,+$. If the operator is added after the first two elements and then after each subsequent element, the correct postfix expression $1\,2\,+\,3\,+\,4\,+$ will be generated.

-Each subtree of the AST is its own separate AST, which can be converted to postfix notation in the same way the whole AST can be converted. This means that the algorithm only needs to be able to handle leave nodes, and when it encounters a subtree, it recursively calls itself to convert the remaining AST. Step 5 indicates this recursive behaviour. 
+Each subtree of the AST is its own separate AST, which can be converted to postfix notation in the same way the whole AST can be converted. This means that the algorithm only needs to be able to handle leaf nodes, and when it encounters a subtree, it recursively calls itself to parse the remaining AST. Step 5 indicates this recursive behaviour. 

-While the same expression usually occurs only once, sub-expressions can occur multiple times. In the example in Figure \ref{fig:expr-ast}, the whole expression $1 + x_1 \, \log(p_1)$ is unlikely to be generated more than once by the symbolic regression algorithm. However, the sub-expression $\log(p_1)$ is much more likely to be generated multiple times. This means that the generation of the intermediate representation for this subtree only needs to be done once and can be reused later. Therefore, a cache can be used to store the intermediate representation for this sub-expression and access it again later to eliminate the conversion overhead.
+While the same expression usually occurs only once, sub-expressions can occur multiple times. In the example in Figure \ref{fig:expr-ast}, the whole expression $1 + x_1 \, \log(p_1)$ is unlikely to be generated more than once by the symbolic regression algorithm. However, the sub-expression $\log(p_1)$ is much more likely to be generated multiple times. This means that the generation of the intermediate representation for this subtree only needs to be done once and can be reused later. Therefore, a cache can be used to store the intermediate representation for this sub-expression and access it again later to eliminate the parsing overhead.
+
+Caching can be applied to both individual sub-expressions as well as the entire expression. While it is unlikely for the whole expression to recur frequently, either as a whole or as part of a larger expression, implementing a cache will not degrade performance and will, in fact, enhance it if repetitions do occur. In the context of parameter optimisation, where the evaluators are employed, expressions will recur, making full-expression caching advantageous. The primary drawback of caching is the increased use of RAM. However, given that RAM is plentiful in modern systems, this should not pose a significant issue.

 \section{Interpreter}
-The implementation of the interpreter is divided into two main components, the CPU-based control logic and the GPU-based interpreter as outlined in the Concept and Design chapter. This section aims to describe the technical details of these components. First the CPU-based control logic will be discussed. This component handles the communication with the GPU and is the entry point which is called by the symbolic regression algorithm. Following this, the GPU-based interpreter will be explored, highlighting the specifics of developing an interpreter on the GPU.
+The implementation is divided into two main components, the CPU-based control logic and the GPU-based interpreter as outlined in the Concept and Design chapter. This section aims to describe the technical details of these components. First the CPU-based control logic will be discussed. This component handles the communication with the GPU and is the entry point which is called by the symbolic regression algorithm. Following this, the GPU-based interpreter will be explored, highlighting the specifics of developing an interpreter on the GPU.

 An overview of how these components interact with each other is outlined in Figure \ref{fig:interpreter-sequence}. The parts of this figure are explained in detail in the following sections.

@ -109,14 +115,16 @@ An overview of how these components interact with each other is outlined in Figu
 \end{figure}

 \subsection{CPU Side}
-The interpreter is given all the expressions it needs to interpret as an input. Additionally, it needs the variable matrix as well as the parameters for each expression. All expressions are passed to the interpreter as an array of \verb|Expr| objects, as they are needed for the pre-processing step or the frontend. The first loop as shown in Figure \ref{fig:interpreter-sequence}, is responsible for sending the expressions to the frontend to be converted into the intermediate representation. After this step, the expressions are in the correct format to be sent to the GPU and the interpretation process can continue.
+The interpreter is given all the expressions it needs to interpret as an input. Additionally, it needs the variable matrix as well as the parameters for each expression. All expressions are passed to the interpreter as an array of Expr objects, as they are needed for the pre-processing step or the frontend. The first loop as shown in Figure \ref{fig:interpreter-sequence}, is responsible for sending the expressions to the frontend to be converted into the intermediate representation. After this step, the expressions are in the correct format to be sent to the GPU and the interpretation process can continue.

 \subsubsection{Data Transfer}
-Before the GPU can start with the interpretation, the data needs to be present on it. Because the variables are already in matrix form, transferring the data is fairly straightforward. Memory must be allocated in the global memory of the GPU and then be copied from RAM into the allocated memory. Allocating memory and transferring the data to the GPU is handled implicitly by the \verb|CuArray| type provided by CUDA.jl.
+Before the GPU can start with the interpretation, the data needs to be sent to the GPU. Because the variables are already in matrix form, transferring the data is fairly straightforward. Memory must be allocated in the global memory of the GPU and then be copied from RAM into the allocated memory. Allocating memory and transferring the data to the GPU is handled implicitly by the CuArray type provided by CUDA.jl.

-To optimise the interpreter for parameter optimisation workloads, this step is performed before it is called. Although, the diagram includes this transmission for completeness, it is important to note that the variables never change, as they represent the observed inputs of the system that is being modelled by the symbolic regression algorithm. As a symbolic regression algorithm is usually implemented with GP, there are many generations that need to be evaluated. Therefore, re-transmitting the variables for each generation is inefficient. By transmitting the variables once before the symbolic regression algorithm begins, additional performance gains are very likely. However, this approach would require modifying the symbolic regression algorithm, which is the reason this optimisation has not been applied. Nonetheless, if needed it is still possible to modify the implementation at a later stage with minimal effort.
+To optimise the interpreter for parameter optimisation workloads, this step is actually performed before the interpreter is called. Although the diagram includes this transmission for completeness, it is important to note that the variables never change, as they represent the observed inputs of the system that is being modelled by the symbolic regression algorithm. Therefore, re-transmitting the variables for each step of the parameter optimisation process would be inefficient. By transmitting the variables once and reusing them throughout the parameter optimisation, significant time can be saved. 

-Once the variables are transmitted, the parameters must also be transferred to the GPU. Unlike the variables, the parameters are stored as a vector of vectors. In order to transmit the parameters efficiently, they also need to be put in a matrix form. The matrix needs to be of the form $k \times N$, where $k$ is equal to the length of the longest inner vector and $N$ is equal to the length of the outer vector. This ensures that all values can be stored in the matrix. It also means that if the inner vectors are of different lengths, some extra unnecessary values will be transmitted, but the overall benefit of treating them as a matrix outweighs this drawback. The Program \ref{code:julia_vec-to-mat} shows how this conversion can be implemented. Note that it is required to provide an invalid element. This ensures defined behaviour and helps with finding errors in the code. After the parameters have been brought into matrix form, they can be transferred to the GPU the same way the variables are transferred.
+Furthermore, transferring the data to the GPU before the symbolic regression algorithm begins, could save even more time. However, this approach would require modification to the symbolic regression algorithm. Therefore, the decision has been made to neglect this optimisation. Nonetheless, it is still possible to modify the implementation at a later stage with minimal effort, if needed.
+
+Once the variables are transmitted, the parameters also must be transferred to the GPU. Unlike the variables, the parameters are stored as a vector of vectors. In order to transmit the parameters efficiently, they also need to be put in a matrix form. The matrix needs to be of the form $k \times N$, where $k$ is equal to the length of the longest inner vector and $N$ is equal to the length of the outer vector. This ensures that all values can be stored in the matrix. It also means that if the inner vectors are of different lengths, some extra unnecessary values will be transmitted, but the overall benefit of treating them as a matrix outweighs this drawback. The Program \ref{code:julia_vec-to-mat} shows how this conversion can be implemented. Note that it is required to provide an invalid element. This ensures defined behaviour and helps with finding errors in the code. After the parameters have been brought into matrix form, they can be transferred to the GPU the same way the variables are transferred.

 \begin{program}
 	\begin{GenericCode}
@ -138,11 +146,11 @@ end
 	\label{code:julia_vec-to-mat}
 \end{program}

-Similar to the parameters, the expressions are also stored as a vector of vectors. The outer vector contains each expression, while the inner vectors hold the expressions in their intermediate representation. Therefore, this vector of vectors also needs to be brought into matrix form following the same concept as the parameters. To simplify development, the special opcode \textit{stop} has been introduced, which is used for the \verb|invalidElement| in Program \ref{code:julia_vec-to-mat}. As seen in Section \ref{sec:interpreter-gpu-side}, this element is used to determine if the end of an expression has been reached during the interpretation process. This removes the need for additional data to be sent which stores the length of each expression to determine if the entire expression has been interpreted or not. Therefore, a lot of overhead can be reduced.
+Similar to the parameters, the expressions are also stored as a vector of vectors. The outer vector contains each expression, while the inner vectors hold the expressions in their intermediate representation. Therefore, this vector of vectors also needs to be brought into matrix form the same way the parameters are brought into matrix form. To simplify development, the special opcode \textit{stop} has been introduced, which is used for the invalidElement in Program \ref{code:julia_vec-to-mat}. As seen in Section \ref{sec:interpreter-gpu-side}, this element is used to determine if the end of an expression has been reached during the interpretation process. This removes the need for additional data to be sent which stores the length of each expression to determine if the entire expression has been interpreted or not. Therefore, a lot of overhead can be reduced.

-Once the conversion into matrix form has been performed, the expressions are transferred to the GPU. Just like with the variables, the expressions remain the same over the course of the parameter optimisation part. Which is the reason they are transferred to the GPU before the interpreter is called, reducing the number of unnecessary data transfers.
+Once the conversion into matrix form has been performed, the expressions are transferred to the GPU. Just like with the variables, the expressions remain the same over the course of the parameter optimisation part. Therefore, they are transferred to the GPU before the interpreter is called, to reduce the amount of unnecessary data transfer.

-Only raw data can be sent to the GPU, which means that meta information about the data layout is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to inform the kernel about the dimensions of the expressions, variables and parameters. Otherwise, the kernel does not know at which memory location the second data point is stored for example, as it does not know how large a single set is. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data begins. However, the GPU has no knowledge of this and therefore the meta information must be transferred separately to ensure that the data is accessed correctly.
+In addition to the already described data that needs to be sent, two more steps are required that have not been included in the Sequence Diagram \ref{fig:interpreter-sequence}. The first one is the allocation of global memory for the result matrix. Without this, the kernel would not know where to store the interpretation results and the CPU would not know from which memory location to read the results. Therefore, enough global memory needs to be allocated beforehand so that the results can be stored and retrieved after all kernel executions have finished. 

 \begin{figure}
 	\centering
@ -151,30 +159,31 @@ Only raw data can be sent to the GPU, which means that meta information about th
 	\label{fig:memory-layout-data}
 \end{figure}

-In addition to the already described data that needs to be sent, one more step is required that has not been included in the Sequence Diagram \ref{fig:interpreter-sequence}. Global memory must be allocated, that allows the results of the evaluation to be stored. Without this, the kernel would not know where to store the interpretation results and the CPU would not know from which memory location to read the results from. Therefore, enough global memory needs to be allocated beforehand so that the results can be stored and retrieved after all kernel executions have finished. 
-
+Only raw data can be sent to the GPU, which means that information about the data is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to let the kernel know the dimensions of the expressions, variables and parameters. Otherwise, the kernel does not know at which memory location the second variable set is stored, as it does not know how large a single set is for example. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data begins. However, the GPU has no knowledge of this and therefore the additional information must be transferred to ensure that the data is accessed correctly.

 \subsubsection{Kernel Dispatch}
-Once all the data is present on the GPU, the CPU can dispatch the kernel for each expression. This dispatch requires parameters that specify the number of threads and their organisation into thread blocks. In total, one thread is required for each data point and therefore the grouping into thread blocks is the primary variable. Taking into account the constraints explained in Section \ref{sec:occupancy}, this grouping needs to be tuned for optimal performance. The specific values alongside the methodology for determining these values will be explained in Chapter \ref{cha:evaluation}.
+Once all the data is present on the GPU, the CPU can dispatch the kernel for each expression. This dispatch requires parameters that specify the number of threads and their organisation into thread blocks. In total, one thread is required for each variable set and therefore the grouping into thread blocks is the primary variable. Taking into account the constraints explained in Section \ref{sec:occupancy}, this grouping needs to be tuned for optimal performance. The specific values alongside the methodology for determining these values will be explained in Chapter \ref{cha:evaluation}.

 In addition, the dispatch parameters also include the pointers to the location of the data allocated and transferred above, as well as the index of the expression to be interpreted. Since all expressions and parameters are sent to the GPU at once, this index ensures that the kernel knows where in memory to find the expression it needs to interpret and which parameter set it needs to use. After the kernel has finished, the result matrix needs to be read from the GPU and passed back to the symbolic regression algorithm.

-Crucially, dispatching a kernel is an asynchronous operation, which means that the CPU does not wait for the kernel to finish before continuing. This allows the CPU to dispatch all kernels at once, rather than one at a time. As explained in Section \ref{sec:architecture}, a GPU can have multiple resident grids, meaning that the dispatched kernels can run concurrently, reducing evaluation time. Only once the result matrix is read from the GPU does the CPU have to wait for all kernels to finish execution.
+Crucially, dispatching a kernel is an asynchronous operation, which means that the CPU does not wait for the kernel to finish before continuing. This allows the CPU to dispatch all kernels at once, rather than one at a time. As explained in Section \ref{sec:architecture}, a GPU can have multiple resident grids, meaning that the dispatched kernels can run concurrently, drastically reducing evaluation times. Only once the result matrix is read from the GPU does the CPU have to wait for all kernels to finish execution.

 \subsection{GPU Side}
 \label{sec:interpreter-gpu-side}
-With the GPU's global memory containing all the necessary data and the kernel being dispatched, the interpretation process can begin. Before interpreting an expression, the global thread ID must be calculated. This step is crucial because each data point is assigned to a unique thread. Therefore, the global thread ID determines which data point should be used for the current interpretation instance.
+% Memory access (currently global memory only)
+% no dynamic memory allocation like on CPU (stack needs to have fixed size; also stack is stored in local memory)
+With the GPU's global memory now containing all the necessary data and the kernel being dispatched, the interpretation process can begin. Before interpreting an expression, the global thread ID must be calculated. This step is crucial because each variable set is assigned to a unique thread. Therefore, the global thread ID determines which variable set should be used for the current interpretation instance.

-Moreover, the global thread ID ensures that excess threads do not perform any work. As otherwise these threads would try to access a data point that does not exist and therefore would lead to an illegal memory access. This is necessary because the number of required threads often does not align perfectly with the number of threads per block multiplied by the number of blocks. If for example $1031$ threads are required, then at least two thread blocks are needed, as one thread block can hold at most $1024$ threads. Because $1031$ is a prime number, it can not be divided by any practical number of thread blocks. If two thread blocks are allocated, each holding $1024$ threads, a total of $2048$ threads is started. Therefore, the excess $2048 - 1031 = 1017$ threads must be prevented from executing. By using the global thread ID and the number of available data points, these excess threads can be easily identified and terminated early in the kernel execution.
+Moreover, the global thread ID ensures that excess threads do not perform any work. Otherwise, these threads would try to access a variable set that does not exist, which would lead to an illegal memory access. This is necessary because the number of required threads often does not align perfectly with the number of threads per block multiplied by the number of blocks. If for example $1031$ threads are required, then at least two thread blocks are needed, as one thread block can hold at most $1024$ threads. Because $1031$ is a prime number, it cannot be divided by any practical number of thread blocks. If two thread blocks are allocated, each holding $1024$ threads, a total of $2048$ threads is started. Therefore, the excess $2048 - 1031 = 1017$ threads must be prevented from executing. By using the global thread ID and the number of available variable sets, these excess threads can be easily identified and terminated early in the kernel execution.

-Afterwards the stack for the interpretation can be created. It is possible to dynamically allocate memory on the GPU, which enables a similar programming model as on the CPU. \textcite{winter_are_2021} have compared many dynamic memory managers and found, that the performance impact of them is rather small. However, if it is easily possible to use static allocations, it still offers better performance. In the case of this thesis, it is easily possible which is the reason why the stack has been chosen to have a static size. Because it is known that expressions do not exceed 50 tokens, including the operators, the stack size has been set to ten, which should be more than enough to hold the values and partial results, even in the worst case. It is very unlikely that ten values must be stored before a binary operator is encountered that reduces the number of values on the stack. Therefore, a stack size of ten should be sufficient, however it is possible to increase the stack size if needed.
+Afterwards the stack for the interpretation can be created. It is possible to dynamically allocate memory on the GPU, which enables a similar programming model as on the CPU. \textcite{winter_are_2021} have even compared many dynamic memory managers and found, that the performance impact of them is rather small. However, if it is easily possible to use static allocations, it still offers better performance. In the case of this thesis, it is easily possible which is the reason why the stack has been chosen to have a static size. Because it is known that expressions do not exceed 50 tokens, including the operators, the stack size has been set to 25, which should be more than enough to hold the values and partial results, even in the worst case.

 \subsubsection{Main Loop}
 Once everything is initialised, the main interpreter loop starts interpreting the expression. Because of the intermediate representation, the loop simply iterates through the expression from left to right. On each iteration the type of the current token is checked, to decide which operation to perform. 

-If the current token type matches the \textit{stop} opcode, the interpreter knows that it is finished. This simplicity is the reason why this opcode was introduced, as mentioned above.
+If the current token type matches the \textit{stop} opcode, the interpreter knows that it is finished. This simplicity is the reason why this opcode was introduced, as explained above.

-More interestingly is the case, where the current token corresponds to an index to either the variable matrix, or the parameter matrix. In this case, the token's value is important. To access one of these matrices, the correct starting index of the set must first be calculated. As previously explained, information about the dimensions of the data is lost during transfer. At this stage, the kernel only knows the index of the first element of either matrix, which set to use for this evaluation, and the index of the value within the current set. However, the boundaries of these sets are unknown. Therefore, the additionally transferred data about the dimensions is used in this step to calculate the index of the first element in each set. With this calculated index and the index stored in the token, the correct value can be loaded by adding the token value to the index of the first element of the set. After the value has been loaded, it is pushed to the top of the stack for later use.
+More interesting is the case where the current token corresponds to an index to either the variable matrix, or the parameter matrix. In this case, the token's value is important. To access one of these matrices, the correct starting index of the set must first be calculated. As previously explained, information about the dimensions of the data is lost during transfer. At this stage, the kernel only knows the index of the first element of either matrix, which set to use for this evaluation, and the index of the value within the current set. However, the boundaries of these sets are unknown. Therefore, the additionally transferred data about the dimensions is used in this step to calculate the index of the first element in each set. With this calculated index and the index stored in the token, the correct value can be loaded. After the value has been loaded, it is pushed to the top of the stack for later use.

 % MAYBE:
 % Algorithm that shows how this calculation works
@ -185,7 +194,7 @@ Evaluating the expression is happening if the current token is an operator. The

 Support for ternary operators could also be easily added. An example of a ternary operator that would help improve performance would be the GPU-supported Fused Multiply-Add (FMA) operator. While this operator does not exist in Julia, the frontend can generate it when it encounters a sub-expression of the form $x * y + z$. Since this operator performs the multiplication and addition in a single clock cycle instead of two, it would be a feasible optimisation. However, detecting such sub-expressions is complicated, which is why it is not supported in the current implementation.

-Once the interpreter loop has finished, the result of the evaluation must be stored in the result matrix. By using the index of the current expression, as well as the index of the current data point (the global thread ID) it is possible to calculate the index where the result must be stored. The last value on the stack is the result, which is stored in the result matrix at the calculated location.
+Once the interpreter loop has finished, the result of the evaluation must be stored in the result matrix. By using the index of the current expression, as well as the index of the current variable set (the global thread ID) it is possible to calculate the index where the result must be stored. The last value on the stack is the result, which is stored in the result matrix at the calculated location.

 \section{Transpiler}
 Unlike the interpreter, the transpiler primarily operates on the CPU, with only a minor GPU-based component. This is because the transpiler must generate entire PTX kernels from Julia expressions, rather than simply executing a pre-written kernel like the interpreter. Similar to the interpreter, the CPU side of the transpiler manages communication with both the GPU and the symbolic regression algorithm. This section provides a detailed overview of the transpiler's functionality.
@ -200,15 +209,15 @@ An overview of how the transpiler interacts with the frontend and GPU is outline
 \end{figure}

 \subsection{CPU Side}
-After the transpiler has received the expressions to be transpiled, it first sends them to the frontend for processing. Once an expression has been processed, it is sent to the transpiler backend which is explained in more detail Section \ref{sec:transpiler-backend}. The backend is responsible for generating the kernels. When finished, each expression is transpiled into its own kernel written in PTX code. 
+After the transpiler has received the expressions to be transpiled, it first sends them to the frontend for processing. Once they have been processed, the expressions are sent to the transpiler backend, which is explained in more detail in Section \ref{sec:transpiler-backend}. The backend is responsible for generating the kernels. The output of the backend consists of the kernels, written as PTX code, for all expressions. 

 \subsubsection{Data Transfer}
-Data is sent to the GPU in the same way it is sent in the interpreter. The variables are sent as they are, while the parameters are again brought into matrix form. Memory must also be allocated for the result matrix. Unlike the interpreter however, only the variables and parameters need to be sent to the GPU. The variables are again sent before the parameter optimisation step to reduce the number of data transfers.
+Data is sent to the GPU in the same way as it is sent by the interpreter. The variables are sent as they are, while the parameters are again brought into matrix form. Memory must also be allocated for the result matrix. Unlike the interpreter however, this is the only data that needs to be sent to the GPU for the transpiler. 

-Because each expression is represented by its own kernel, there is no need to transfer the expressions themselves. Moreover, there is also no need to send information about the layout of the variables and parameters to the GPU. The reason for this is explained in the transpiler backend section below.
+Because each expression has its own kernel, there is no need to transfer the expressions themselves. Moreover, there is also no need to send information about the layout of the variables and parameters to the GPU. The reason for this is explained in the transpiler backend section below.

 \subsubsection{Kernel Dispatch}
-Once all the data is present on the GPU, the transpiled kernels can be dispatched. Dispatching the transpiled kernels is more involved than dispatching the interpreter kernel. Program \ref{code:julia_dispatch-comparison} shows the difference between dispatching the interpreter kernel and the transpiled kernels. An important note, is that the transpiled kernels must be manually compiled into machine code. To achieve this, CUDA.jl provides functionality to instruct the driver to compile the PTX code. The same process of creating PTX code and compiling it must also be done for the interpreter kernel, however, this is done by CUDA.jl automatically when calling the @cuda macro in line 6.
+Once all the data is present on the GPU, the transpiled kernels can be dispatched. Dispatching the transpiled kernels is more involved than dispatching the interpreter kernel. Program \ref{code:julia_dispatch-comparison} shows the difference between dispatching the interpreter kernel and the transpiled kernels. An important note is that the transpiled kernels must be manually compiled into machine code. To achieve this, CUDA.jl provides functionality to instruct the driver to compile the PTX code. The same process of creating PTX code and compiling it must also be done for the interpreter kernel; however, this is done by CUDA.jl automatically when calling the @cuda macro in line 6.

 \begin{program}
 	\begin{JuliaCode}
@ -241,9 +250,7 @@ end	\end{JuliaCode}
 	\label{code:julia_dispatch-comparison}
 \end{program}

-Similar to the interpreter, the frontend and backend are executed before the parameter optimisation step to improve the runtime. Each kernel is compiled into machine code after it has been generated to ensure, as little work as possible needs to be done during the parameter optimisation loop. However, as will be explained in Chapter \ref{cha:evaluation}, storing the compiled kernels is very memory intensive. This means that if many expressions need to be evaluated at once, a lot of memory is required.
-
-After all kernels have been dispatched, the CPU waits for the kernels to complete their execution. Once the kernels have finished, the result matrix is read from global memory into system memory. The results can then be returned to the symbolic regression algorithm. 
+After all kernels have been dispatched, the CPU waits for the kernels to complete their execution. When the kernels have finished, the result matrix is read from global memory into system memory. The results can then be returned to the symbolic regression algorithm. 

 \subsection{Transpiler Backend} 
 \label{sec:transpiler-backend}
@ -261,7 +268,7 @@ PTX assumes a register machine, which means that a developer has to work with a
 \subsubsection{Register Management}
 Register management is a crucial part of the transpiler as it is important to balance register usage with occupancy and performance. \textcite{aho_compilers_2006, cooper_engineering_2022} describe techniques for efficient register management, especially for machines with few registers and register usage by convention on the CPU. On the GPU however, there are many more registers available, all of which can be used as needed without restrictions.

-To allow for maximum occupancy and avoid spilling registers into local memory, the transpiler tries to reuse as many registers as possible. Furthermore, allocating and using a register in PTX is very similar to using variables in high level code, as they represent virtual registers. Therefore, much of the complexity of managing registers is handled by the PTX compiler of the driver. 
+To allow for maximum occupancy and avoid spilling registers into local memory, the transpiler tries to reuse as many registers as possible. Furthermore, allocating and using a register in PTX is very similar to using variables in code, as they represent virtual registers. Therefore, much of the complexity of managing registers is handled by the PTX compiler of the driver. 

 Because much of the complexity of managing registers is hidden by the compiler, or does not apply in this scenario, register management is implemented very simply. If a register is needed at any point in the transpilation process, it can be requested from the register manager. A register must be given a name and the manager uses this name to determine the type of this register. For example, if the name of the register is \verb|f|, it is assumed to be an FP32 register. Several naming conventions exist to ensure that the register is of the correct data type. The manager then returns the identifying name of the register, which is used to access it. The identifying name consists of the name given as an input followed by a zero-based number that is incremented by one for each successive call.

@ -303,11 +310,13 @@ End:
 	ret;
 \end{PTXCode}

-It needs to be noted, that the register \verb|%r2| is not needed. Since the transpiler already knows the number of data points, it would be wasteful to transmit this information to the kernel. Instead, the transpiler inserts the number directly as a constant to save resources.
+It should be noted that the register \verb|%r2| is not needed. Since the transpiler already knows the number of variable sets, it would be wasteful to transmit this information to the kernel. Instead, the transpiler inserts the number directly as a constant to save resources.

 \subsubsection{Main Loop}
 The main loop of the transpiler, which generates the kernel for evaluating a single expression, is analogous to the interpreter's main loop. Since the transpiler uses the same intermediate representation as the interpreter, both loops behave similarly. The transpiler loop also uses a stack to store the values and intermediate results. However, the transpiler does not require the special opcode \textit{stop} which was necessary in the interpreter to handle expressions padded to fit into a matrix. The transpiler only needs to process a single expression, which is stored in an unpadded vector of known length. This means that all tokens within the vector are valid and therefore do not require this opcode.

+% MAYBE : activity diagram for this loop (also add to interpreter main loop section (would maybe fit better in concept and design so basically move the algorithms of C&D here and add activity diagram to C&D ))
+
 When the loop encounters a token that represents an index to either the variable or the parameter matrix, the transpiler needs to generate code to load these values. In the general case, this works in exactly the same way as the interpreter, calculating the index and accessing the matrices at that location. 

 However, the first time a variable or parameter is accessed, it must be loaded from global memory. Although registers already exist that hold a pointer to the address of the matrices in global memory, the data is still not accessible. To make it accessible, the index to the value must first be calculated in the same way as it is calculated in the interpreter. Afterwards the value must be loaded into a register with the instruction \verb|ld.global.f32  %reg1, [%reg2]|. Using the first register of the instruction, the data can be accessed. For example, if the variable $x_1$ is accessed several times, all subsequent calls only need to reference this register and do not need to load the data from global memory again.
@ -375,12 +384,12 @@ On the GPU, the transpiled kernels are executed. Given that these kernels are re

 Note that Program \ref{code:ptx_kernel} has been slightly simplified to omit the mandatory directives and the register allocation. From line five to line ten, the addresses stored in the parameters are converted from parameter state space into global state space so that they reference the correct portion of the GPU's memory. It needs to be noted, that this kernel uses 64-bit addresses, which is the reason why some 64-bit instructions are used throughout the kernel. However, the evaluation of the expression itself is performed entirely using the faster 32-bit instructions.

-Lines 12 through 17 are responsible for calculating the global thread ID and ensuring that excessive threads are terminated early. Note that in line 16, if the global thread ID stored in register \verb|%r3| is greater than one, it must terminate early. This is because only one data point needs to be evaluated in this example.
+Lines 12 through 17 are responsible for calculating the global thread ID and ensuring that excessive threads are terminated early. Note that in line 16, if the global thread ID stored in register \verb|%r3| is greater than one, it must terminate early. This is because only one variable set needs to be evaluated in this example.

 The PTX code from line 22 to line 28 is the actual evaluation of the expression, with line 28 performing the calculation $x_1 + p_1$. All other lines are responsible for loading the values from global memory. The instructions in lines 22, 23, 25 and 26 are responsible for calculating the offset in bytes to the memory location where the value is stored with respect to the location of the first element. 

-The constants $4$ and $0$ are introduced for performance reasons. The number $4$ is the size of a data point in bytes. Since one data point in this case stores only a single FP32 value, each data point has a size of four bytes. Similarly, the number $0$ represents the index of the value within the data point. More precisely, this is the offset in bytes from the index to the data point, which is zero for the first element, four for the second, and so on. These two constants are calculated during the transpilation process to minimise the amount of data to be transferred to the GPU. 
+The constants $4$ and $0$ are introduced for performance reasons. The number $4$ is the size of a variable set in bytes. Since one variable set in this case stores only a single FP32 value, each variable set has a size of four bytes. Similarly, the number $0$ represents the index of the value within the variable set. More precisely, this is the offset in bytes from the index to the variable set, which is zero for the first element, four for the second, and so on. These two constants are calculated during the transpilation process to minimise the amount of data to be transferred to the GPU. 

-Storing the result in the result matrix is performed from line 31 to 33. The location where the value is to be stored is calculated in lines 31 and 32. Line 31 calculates the index inside the result matrix according to the current data point stored in register \verb|%rd3|. The constant $0$ is the product of the index of the expression being evaluated and the number of data points, and represents the column of the result matrix. Converting this index into bytes and adding it as an offset to the first element of the result matrix gives the correct memory location to store the result at.
+Storing the result in the result matrix is performed from line 31 to 33. The location where the value is to be stored is calculated in lines 31 and 32. Line 31 calculates the index inside the result matrix according to the current variable set stored in register \verb|%rd3|. The constant $0$ is the product of the index of the expression being evaluated and the number of variable sets, and represents the column of the result matrix. Converting this index into bytes and adding it as an offset to the first element of the result matrix gives the correct memory location to store the result at.

 This kernel consists mostly of overhead code, as only lines 22 through 33 contribute to calculating the result of the expression with the designated variable and parameter set. However, for larger expressions, the percentage of overhead code shrinks drastically.
--- a/thesis/chapters/introduction.tex
+++ b/thesis/chapters/introduction.tex
@ -1,21 +1,21 @@
 \chapter{Introduction}
 \label{cha:Introduction}

-This chapter provides an entry point for this thesis. First, the motivation of exploring this topic is presented. In addition, the research questions of this thesis are outlined. Finally, the structure of this thesis is described, explaining how each part contributes to answering the research questions. 
+This chapter provides an entry point for this thesis. First, the motivation for exploring this topic is presented. In addition, the research questions of this thesis are outlined. Lastly, the methodology used to answer these questions is explained. This master's thesis is associated with the FFG COMET project ProMetHeus (\#904919). The developed software is used and further developed for modelling in the ProMetHeus project.

 \section{Background and Motivation}
 %
 % Not totally happy with this yet
 %
-Optimisation and acceleration of program code is a crucial part in many fields. For example video games need optimisation to lower the minimum hardware requirements which allows more people to run the game, increasing sales. Another example where optimisation is important are computer simulations. For those, optimisation is even more crucial, as this allows the scientists to run more detailed simulations or get the simulation results faster. Equation learning or symbolic regression is another field that can heavily benefit from optimisation. One part of equation learning, is to evaluate the expressions generated by a search algorithm, which can make up a significant portion of the runtime. This thesis is concerned with optimising the evaluation part to increase the overall performance of equation learning algorithms.
+Optimisation and acceleration of program code is a crucial part in many fields. For example, video games need optimisation to lower the minimum hardware requirements, which allows more people to run the game, increasing sales. Another example where optimisation is important is computer simulations. For those, optimisation is even more crucial, as this allows the scientists to run more detailed simulations or get the simulation results faster. Equation learning or symbolic regression is another field that can heavily benefit from optimisation. One part of equation learning is to evaluate the expressions generated by a search algorithm, which can make up a significant portion of the runtime. This thesis is concerned with optimising the evaluation part to increase the overall performance of equation learning algorithms.

-The following expression $5 - \text{abs}(x_1) \, \sqrt{p_1} / 10 + 2^{x_2}$, which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$, is one example that can be generated by the equation learning algorithm, Usually an equation learning algorithm generates hundreds or even thousands of such expressions per iteration, all of which have to be evaluated. Additionally, multiple different values must be entered for all variables and parameters, drastically increasing the amount of evaluations that need to be performed.
+The following expression $5 - \text{abs}(x_1) \, \sqrt{p_1} / 10 + 2^{x_2}$, which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$, is one example that can be generated by the equation learning algorithm. Usually, an equation learning algorithm generates multiple such expressions per iteration. Out of these expressions, all possibly relevant ones have to be evaluated. Additionally, multiple different values need to be inserted for all variables and parameters, drastically increasing the number of evaluations that need to be performed.

-In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough, and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option as compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration also can be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}. These solutions were all developed using CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}}. However, it is also possible to develop assembly like code for GPUs using Parallel Thread Execution (PTX)\footnote{\url{https://docs.nvidia.com/cuda/parallel-thread-execution/}} to gain more control.
+In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option as compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration also can be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}. These solutions were all developed using CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}}. However, it is also possible to develop assembly like code for GPUs using Parallel Thread Execution (PTX)\footnote{\url{https://docs.nvidia.com/cuda/parallel-thread-execution/}} to gain more control.


 \section{Research Question}
-Given the successful implementation of GPU acceleration, the aim of this thesis is to improve the performance of evaluating mathematical equations, generated at runtime for symbolic regression using GPUs. Therefore, the following research questions are formulated:
+With these successful implementations of GPU acceleration, this thesis also attempts to improve the performance of evaluating mathematical equations, generated at runtime for symbolic regression using GPUs. Therefore, the following research questions are formulated:

 \begin{itemize}
 	\item How can simple arithmetic expressions that are generated at runtime be efficiently evaluated on GPUs?
@ -23,7 +23,7 @@ Given the successful implementation of GPU acceleration, the aim of this thesis
 	\item Under which circumstances is the interpretation of the expressions on the GPU or the translation to the intermediate language Parallel Thread Execution (PTX) more efficient?
 \end{itemize}

-Answering the first question is necessary to ensure the approach of this thesis is feasible. If it is feasible, it is important to determine if evaluating the expressions on the GPU improves the performance over a parallelised CPU evaluator. To answer if the GPU evaluator is faster than the CPU evaluator, the last research question is important. As there are two major ways of implementing an evaluator on the GPU, both need to be implemented and evaluated to finally state if evaluating expressions on the GPU is faster and if so, which type of implementation results in the best performance under which circumstances.
+Answering the first question is necessary to ensure the approach of this thesis is actually feasible. If it is feasible, it is important to determine whether evaluating the expressions on the GPU actually improves the performance over a parallelised CPU evaluator. To answer whether the GPU evaluator is faster than the CPU evaluator, the last research question is important. As there are two major ways of implementing an evaluator on the GPU, both need to be implemented and evaluated to finally state whether evaluating expressions on the GPU is faster and, if so, which type of implementation results in the best performance.


 \section{Thesis Structure}
--- a/thesis/chapters/relwork.tex
+++ b/thesis/chapters/relwork.tex
@ -21,18 +21,14 @@ An implementation for an equation learner in the physics domain is proposed by \

 % A survey conducted by \textcite{dabhi_survey_2012} shows how overfitting is not desirable and why more generalisable solutions are preferred. 

-\subsection{Genetic Programming}
-To generate equations, first the operators which are allowed to be used during generation need to be defined. It is also possible to define a maximum length for an expression as proposed by \textcite{koza_genetic_1994}. Expressions also consist of variables which represent the inputs as well as constants. Assuming that a given problem has two variables and one parameter, GP could generate an expression as seen in Equation \ref{eq:example} where $x_n$ are the variables, $p_1$ is the parameter and $O$ is the output which should correspond to the observed output for the given variables.
+
+To generate an equation, first the operators need to be defined that make up the equation. It is also possible to define a maximum length for an expression as proposed by \textcite{koza_genetic_1994}. Expressions also consist of constants as well as variables which represent the inputs. Assuming that a given problem has two variables and one parameter, the equation learner could generate an expression as seen in Equation \ref{eq:example} where $x_n$ are the variables, $p_1$ is the parameter and $O$ is the output which should correspond to the observed output for the given variables.

 \begin{equation} \label{eq:example}
 	O = 5 - \text{abs}(x_1) + x_2 \, \sqrt{p_1} / 10
 \end{equation}

-A typical GP generation generates multiple expressions at once. If for example a single generation consists of $300$ solution candidates or expressions, each of these expressions needs to be evaluated at least once to determine how well they can produce the desired output. 
-
-Each expression is part of a search space of all possible expressions consisting of the defined operators, variables and constants up to a defined maximum length. With the help of GP, this search space is explored, however, the generated expressions might not perfectly fit the data. To further refine the generated expressions, the concept of parameter optimisation can be used as described by \textcite{kommenda_local_2018}. Parameter optimisation is a kind of local search where parameters $p$ are introduced in the generated equations. In Equation \ref{eq:example} the parameter $p_1$ will be modified over some amount of iterations. This modification should assist in finding a local or even the global optimum by better fitting the expressions to the data. For example $50$ local search steps can be used, meaning that each expression needs to be evaluated $50$ times with the same variables, but different parameters. As a result, one GP generation consequently requires a total $300 * 50 = 15\,000$ evaluations of the expressions. However, typically more than one GP generation is needed to find a good solution. While the exact number of generations is problem specific, for this example a total of $100$ generations can be assumed. Each generation again generates $300$ expressions and needs to perform $50$ local search steps. This results in a total of $300 * 50 * 100 = 1\,500\,000$ evaluations which need to be performed during the entire runtime of the GP algorithm. These values have been taken from the GP algorithm for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their GP algorithm converged after $54$ generations, resulting in $300 * 50 * 54 \approx 800\,000$ evaluations. This calculation omits the number of data points, which are the main contributor towards the total runtime. 
As for each generated expression, each data point needs to be used for parametrising the variables, drastically increasing the number of evaluations. They used a total of $11\,000$ data points, resulting in a total of $800\,000 * 11\,000 = 8.8 \text{billion}$ evaluations. Their results took over two days to compute on an eight core desktop CPU. While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems were in the range of a few seconds and up to a day. Especially the problems that took several hours to days to finish show, that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine, if using GPUs can yield noticeable better performance.
-
-In his master's thesis \textcite{weinberger_vektoroperationen_2018} explored the possibility of utilising vector operations in the field of GP. He mainly focused on vectorising the evaluation on the CPU and by utilising the GPU to evaluate the expression trees generated by a GP algorithm. By utilising OpenCL and an AMD GPU he achieved a speed-up of two when utilising vectorisation on the CPU and a speed-up of 116 when utilising the GPU. This shows that the GPU also has great potential in the more specific case of symbolic regression with the above described parameter optimisation.
+A typical equation learner generates multiple expressions at once. If for example the equation learner generates $300$ expressions per GP generation, each of these expressions needs to be evaluated at least once to determine how well they can produce the desired output. Each expression lies in a different part of the search space and with only the variables, it would not easily be possible to explore the surrounding search space. To perform for example local search in this area, the parameter $p_1$ can be used. This local search phase helps to find the local or even global optimum. For example $50$ local search steps can be used, meaning that each expression needs to be evaluated $50$ times with the same variables, but different parameters. As a result, one GP generation consequently requires a total $300 * 50 = 15\,000$ evaluations of the expressions. However, typically more than one GP generation is needed to find a good local optimum. While the exact number of generations is problem specific, for this example a total of $100$ generations can be assumed. Each generation again generates $300$ expressions and needs to perform $50$ local search steps. This results in a total of $300 * 50 * 100 = 1\,500\,000$ evaluations which need to be performed during the entire runtime of the GP algorithm. These values have been taken from the equation learner for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their equation learner converged after 54 generations, resulting in $300 * 50 * 54 \approx 800\,000$ evaluations. Depending on the complexity of the generated expressions, performing all of these evaluations takes up a lot of the runtime. Their results took over two days to compute on an eight core desktop CPU. While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems were in the range of a few seconds and up to a day. 
Especially the problems that took several hours to days to finish show that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine whether using GPUs can yield noticeably better performance.

 \section[GPGPU]{General Purpose Computation on Graphics Processing Units}
 \label{sec:gpgpu}
@ -44,10 +40,16 @@ If not specified otherwise, the following section and its subsections use the in

 Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022} which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. 

-Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, so it is important to consider if it is worth using GPUs for specific tasks.
+Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU-only implementation. 
+
+Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}.
+
+Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. 
+
+However, it also needs to be noted that GPUs do not always perform better than CPUs, as illustrated by \textcite{lee_debunking_2010}, so it is important to consider whether it is worth using GPUs for specific tasks.

 \subsection{Programming GPUs}
-The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. Current CPUs are complex, and often contain features such as sophisticated branch prediction among other things to achieve higher and higher performance. This makes a CPU perfect for handling complex control flows on a single program thread and even multiple threads simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a single CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU designates much more transistors towards floating-point computations. This, however, results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.
+The development process on a GPU is vastly different from that on a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. To demonstrate how a modern CPU works, \textcite{knuth_mmix_1999} introduced the MMIX architecture. It is a 64-bit CPU architecture containing many concepts and design decisions to compete with other CPUs on the market at that time. He provides the information in great detail and demonstrates the complexity of CPU architectures. Current CPUs are even more complex, and often contain features like sophisticated branch prediction among other things to achieve higher and higher performance. This makes a CPU perfect for handling complex control flows on a single program thread and even multiple threads simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a single CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU designates many more transistors towards floating-point computations. This, however, results in less efficient integer arithmetic and control flow handling. There is also less cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences between a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.

 \begin{figure}
 	\centering
@ -60,7 +62,7 @@ Despite these drawbacks, the sheer number of cores, makes a GPU a valid choice w

 \subsubsection{Thread Hierarchy and Tuning}
 \label{sec:thread_hierarchy}
-The thousands of cores on a GPU, as well as the threads created by the developer, are grouped together in several categories. This is the so-called thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. To develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm.
+The thousands of cores on a GPU, as well as the threads created by the developer, are grouped together in several categories. This is the so-called thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm.

 At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a hardware unit responsible for scheduling and executing threads and also contains the registers used by these threads. An SM is always executing a group of 32 threads simultaneously, and this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped in a block, with one block typically containing a maximum of $1024$ threads, although blocks are often configured to contain fewer. Therefore, if more than $1024$ threads are required, more blocks must be created. Blocks can also be grouped into thread block clusters which is optional, but can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that the blocks can be scheduled independently, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block, and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they share memory. 

@ -77,7 +79,7 @@ Once a kernel is dispatched, all threads start at the same point in a program. H

 \begin{figure}
 	\centering
-	\includegraphics[width=.4\textwidth]{thread_divergence.png}
+	\includegraphics[width=.8\textwidth]{thread_divergence.png}
 	\caption{Thread T2 wants to execute instruction B while T1 and T3 want to execute instruction A. Therefore T2 will be an inactive thread this cycle and active once T1 and T3 are finished. This means that now the divergent threads are serialised.}
 	\label{fig:thread_divergence}
 \end{figure}
@ -85,9 +87,7 @@ Once a kernel is dispatched, all threads start at the same point in a program. H

 Modern GPUs implement what is known as the Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can design fast, correct and accurate programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads after data-dependent divergence has occurred. SIMT can also help with increasing the occupancy of the GPU. Occupancy and its importance to performance is discussed in detail in Section \ref{sec:occupancy}.

-A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only partial active threads. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. 
-
-Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation is proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers.
+A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT-aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only some of its threads active. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation were proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. 
Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers.

 \subsubsection{Memory Model}
 \label{sec:memory_model}
@ -200,7 +200,7 @@ Compilers are a necessary tool for many developers. If a developer wants to run
 \begin{figure}
 	\centering
 	\includegraphics[width=.9\textwidth]{compiler_architecture.png}
-	\caption{A simplified overview of the architecture of a compiler.}
+	\caption{A simplified overview of how the architecture of a compiler looks, using Flex and Bison.}
 	\label{fig:compiler_layout}
 \end{figure}

@ -208,8 +208,8 @@ Compilers are a necessary tool for many developers. If a developer wants to run

 \subsection{Interpreters}
 % What are interpreters; how they work; should mostly contain/reference gpu interpreters
-Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but also Java, or more precise Java-Bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters can not only be used for interpreting programming languages. It is also possible for them to be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind of interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful are rule-based simulations. \textcite{koster_massively_2020} has shown how they implemented a GPU interpreter for such simulations. In addition with other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. 
While publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown, interpreted languages often trail behind in terms of performance compared to compiled languages, interpreters per se are not slow. And while they come with performance overhead as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks.
+Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but Java, or more precisely Java-Bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters can not only be used for interpreting programming languages. It is also possible for them to be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind of interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful is rule-based simulations. \textcite{koster_massively_2020} has shown how they implemented a GPU interpreter for such simulations. In addition to other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. 
While publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown that interpreted languages often trail behind compiled languages in terms of performance, interpreters per se are not slow. And while they come with performance overhead as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks.

 \subsection{Transpilers}
 % talk about what transpilers are and how to implement them. If possible also gpu specific transpilation.
-With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another and therefore the major difference between these use-cases is the backend. A popular transpiler example is the TypeScript transpiler, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples for transpilers are the C2Rust transpiler \parencite{ling_rust_2022} that transpiles C code into Rust code as well as the PyJL transpiler \parencite{marcelino_transpiling_2022} which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example for a transpiler has been developed by \textcite{intel_mcs86_1978} where they built a transpiler for transforming assembly code for their 8080 CPU to assembly code for their 8086 CPU. Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. \textcite{moses_high-performance_2023} describe a transpiler, that can transform CUDA code into highly parallel CPU code, where they found that it performs noticeably better than doing this transformation by hand. When designing complex processors and accelerators, register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. The compared their results with a CPU implementation running on 80 CPUs, where they found that the transpiled CUDA version was 40 times faster. 
Using transpilers for software backend and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in a multi-programming-language environment that share some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use.
+With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another and therefore the major difference between these use-cases is the backend. A popular transpiler example is the TypeScript transpiler, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples of transpilers are the C2Rust transpiler \parencite{ling_rust_2022} that transpiles C code into Rust code as well as the PyJL transpiler \parencite{marcelino_transpiling_2022} which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example of a transpiler has been developed by \textcite{intel_mcs86_1978} where they built a transpiler for transforming assembly code for their 8080 CPU to assembly code for their 8086 CPU. Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. There also exists a transpiler that transforms CUDA code into highly parallel CPU code. \textcite{moses_high-performance_2023} described this transpiler, and they found that the generated code performs noticeably better than doing this transformation by hand. When designing complex processors and accelerators, register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. They compared their results with a CPU implementation running on 80 CPUs, where they found that the transpiled CUDA version was 40 times faster. 
Using transpilers for software backend and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in a multi-programming-language environment that shares some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use.
--- a/thesis/front/abstract.tex
+++ b/thesis/front/abstract.tex
@ -1,11 +1,5 @@
 \chapter{Abstract}

-The objective of symbolic regression is to identify an expression that accurately models a system based on a set of inputs. For instance, one might determine the flow through pipes using inputs such as roughness, diameter, and length by conducting experiments with varying input configurations and observing the resulting flow and derive an expression from the experiments. This methodology, exemplified by \textcite{nikuradse_laws_1950}, can be applied to any system through symbolic regression. To find the best-fitting expression, millions of candidate expressions are generated, each requiring evaluation against every data point to assess how well they fit to the system. Consequently, millions of evaluations must be performed, a process that is computationally intensive and time-consuming. Thus, optimizing the evaluation phase of symbolic regression is crucial for discovering expressions that describe large and complex systems within a feasible timeframe.

-% Applications such as weather simulation \parencite{michalakes_gpu_2008}, simulation of static and rotating black holes \parencite{hissbach_overview_2022, verbraeck_interactive_2021}, and structural analysis \parencite{georgescu_gpu_2013} significantly benefit from optimized algorithms that leverage the graphics processing unit (GPU).
+This should be a 1-page (maximum) summary of your work in English.

-This thesis presents the design and implementation of two evaluators that utilize the GPU to evaluate expressions generated at runtime by the symbolic regression algorithm. Performance benchmarks are conducted to compare the efficiency of the GPU evaluators against a CPU evaluator.
-
-The benchmark results indicate that the GPU can serve as a viable alternative to the CPU in certain scenarios. The determining factor for choosing between GPU and CPU evaluation is the number of data points. In a scenario with $10\,000$ expressions and $10\,000$ data points, the GPU outperformed the CPU by a factor between $1.6$ and $2$.
-
-This master thesis is associated with the FFG COMET project ProMetHeus (\#904919). The developed software is used and further developed for symbolic regression in the ProMetHeus project.
--- a/thesis/front/kurzfassung.tex
+++ b/thesis/front/kurzfassung.tex
@ -1,12 +1,7 @@
 \chapter{Kurzfassung}

 \begin{german}
-Das Ziel der symbolischen Regression ist es, einen Ausdruck zu finden, der ein System basierend auf einer Reihe von Variablen modelliert. Beispielsweise kann man den Durchfluss durch Rohre unter Verwendung von Variablen wie Rauheit, Durchmesser und Länge bestimmen, indem Experimente mit verschiedenen Werten für die Variablen durchgeführt werden. Für jedes Experiment wird der Durchfluss gemessen, wodurch man eine allgemeine Formel ableiten kann, welche die Beziehung der Variablen mit dem Durchfluss beschreibt. Diese Methodik, veranschaulicht durch die Arbeit von \textcite{nikuradse_laws_1950}, kann auf unterschiedliche Systeme mithilfe von symbolischer Regression angewendet werden. Um einen Ausdruck zu finden, welcher das System am besten beschreibt, werden Millionen von Kandidatenausdrücken generiert. Diese müssen, unter Verwendung der Daten aller Experimente ausgewertet werden, um ihre Passgenauigkeit zum System zu beurteilen. Folglich müssen Millionen von Auswertungen durchgeführt werden, ein Prozess, der rechenintensiv und zeitaufwendig ist. Daher ist die Optimierung der Auswertungsphase der symbolischen Regression entscheidend. So wird es ermöglicht Ausdrücke in einem angemessenen Zeitrahmen zu finden, welche große und komplexe Systeme beschreiben.
-
-Diese Arbeit präsentiert das Design und die Implementierung von zwei Evaluatoren, die die Grafikkarte (GPU) nutzen, um Ausdrücke zu bewerten, die zur Laufzeit der symbolischen Regression generiert werden. Leistungsbenchmarks werden durchgeführt, um die Performanz der GPU-Evaluatoren mit dem aktuellen CPU-Evaluator zu vergleichen.
-
-Die Benchmark-Ergebnisse zeigen, dass die GPU in bestimmten Szenarien eine geeignete Alternative zur CPU darstellt. Der entscheidende Faktor für die Wahl zwischen GPU- und CPU-Auswertung ist die Anzahl der Experimente und folglich die Anzahl der Datenpunkte. In einer Konfiguration mit $10\,000$ Ausdrücken und $10\,000$ Variablenkonfigurationen übertraf die GPU die CPU um ein bedeutendes Maß.
-
-Diese Masterarbeit ist Teil des FFG COMET Projekt ProMetHeus (\#904919). Die entwickelte Software wird für die symbolische Regression im ProMetHeus Projekt verwendet und weiterentwickelt.
-
+An dieser Stelle steht eine Zusammenfassung der Arbeit, Umfang
+max.\ 1 Seite. 
+...
 \end{german}
--- a/thesis/images/input_output_explanation.png
+++ b/thesis/images/input_output_explanation.png
--- a/thesis/images/interpreter_sequence_diagram.png
+++ b/thesis/images/interpreter_sequence_diagram.png
--- a/thesis/images/results/cpu_gpui_gput_bench1.png
+++ b/thesis/images/results/cpu_gpui_gput_bench1.png
--- a/thesis/images/results/cpu_gpui_gput_bench2.png
+++ b/thesis/images/results/cpu_gpui_gput_bench2.png
--- a/thesis/images/results/cpu_gpui_gput_bench3.png
+++ b/thesis/images/results/cpu_gpui_gput_bench3.png
--- a/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png
+++ b/thesis/images/results/gpu-interpreter-final-performance-benchmark1.png
--- a/thesis/images/results/gpu-interpreter-final-performance-benchmark2.png
+++ b/thesis/images/results/gpu-interpreter-final-performance-benchmark2.png
--- a/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png
+++ b/thesis/images/results/gpu-interpreter-final-performance-benchmark3.png
--- a/thesis/images/results/gpu-transpiler-final-performance-benchmark2.png
+++ b/thesis/images/results/gpu-transpiler-final-performance-benchmark2.png
--- a/thesis/images/results/gpu-transpiler-final-performance-benchmark3.png
+++ b/thesis/images/results/gpu-transpiler-final-performance-benchmark3.png
--- a/thesis/images/results/interpreter-comparison-128-160-192.png
+++ b/thesis/images/results/interpreter-comparison-128-160-192.png
--- a/thesis/images/results/interpreter-comparison-initial-optim1.png
+++ b/thesis/images/results/interpreter-comparison-initial-optim1.png
--- a/thesis/images/results/interpreter-comparison-optim1-optim2.png
+++ b/thesis/images/results/interpreter-comparison-optim1-optim2.png
--- a/thesis/images/results/interpreter-comparison-optim2-optim3.png
+++ b/thesis/images/results/interpreter-comparison-optim2-optim3.png
--- a/thesis/images/results/transpiler-comparison-128-160.png
+++ b/thesis/images/results/transpiler-comparison-128-160.png
--- a/thesis/images/transpiler_sequence_diagram.png
+++ b/thesis/images/transpiler_sequence_diagram.png
--- a/thesis/main.pdf
+++ b/thesis/main.pdf
--- a/thesis/main.tex
+++ b/thesis/main.tex
@ -49,9 +49,7 @@
 \frontmatter                                   % Front part (roman page numbers)
 %%%-----------------------------------------------------------------------------

-\includepdf[pages=1]{title_page.pdf}
-\includepdf[pages=2, pagecommand={\thispagestyle{plain}}]{title_page.pdf}
-
+\maketitle
 \tableofcontents

 \include{front/abstract}
--- a/thesis/references.bib
+++ b/thesis/references.bib
@ -400,7 +400,6 @@
 	author = {Winter, Martin and Parger, Mathias and Mlakar, Daniel and Steinberger, Markus},
 	urldate = {2025-02-27},
 	date = {2021-02-17},
-	file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\UURX5BER\\Winter et al. - 2021 - Are dynamic memory managers on GPUs slow a survey and benchmarks.pdf:application/pdf},
 }

@article{bartlett_exhaustive_2024,
@ -1254,56 +1253,6 @@
 	author = {Faingnaert, Thomas and Besard, Tim and De Sutter, Bjorn},
 	urldate = {2025-04-20},
 	date = {2022-09},
-	keywords = {Graphics processing units, Kernel, Programming, Instruction sets, Codes, graphics processors, high-level programming languages, Libraries, Matrix multiplication, Productivity},
+	keywords = {Codes, Graphics processing units, graphics processors, high-level programming languages, Instruction sets, Kernel, Libraries, Matrix multiplication, Productivity, Programming},
 	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\QCJ6LSF3\\Faingnaert et al. - 2022 - Flexible Performant GEMM Kernels on GPUs.pdf:application/pdf},
 }
-
-@report{nikuradse_laws_1950,
-	title = {Laws of Flow in Rough Pipes},
-	url = {https://digital.library.unt.edu/ark:/67531/metadc63009/},
-	author = {Nikuradse, J.},
-	date = {1950-11},
-}
-
-@article{guimera_bayesian_2020,
-	title = {A Bayesian machine scientist to aid in the solution of challenging scientific problems},
-	volume = {6},
-	url = {https://www.science.org/doi/10.1126/sciadv.aav6971},
-	doi = {10.1126/sciadv.aav6971},
-	abstract = {Closed-form, interpretable mathematical models have been instrumental for advancing our understanding of the world; with the data revolution, we may now be in a position to uncover new such models for many systems from physics to the social sciences. However, to deal with increasing amounts of data, we need “machine scientists” that are able to extract these models automatically from data. Here, we introduce a Bayesian machine scientist, which establishes the plausibility of models using explicit approximations to the exact marginal posterior over models and establishes its prior expectations about models by learning from a large empirical corpus of mathematical expressions. It explores the space of models using Markov chain Monte Carlo. We show that this approach uncovers accurate models for synthetic and real data and provides out-of-sample predictions that are more accurate than those of existing approaches and of other nonparametric methods.},
-	pages = {eaav6971},
-	number = {5},
-	journaltitle = {Science Advances},
-	author = {Guimerà, Roger and Reichardt, Ignasi and Aguilar-Mogas, Antoni and Massucci, Francesco A. and Miranda, Manuel and Pallarès, Jordi and Sales-Pardo, Marta},
-	urldate = {2025-05-21},
-	date = {2020-01-31},
-	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\HLG9FD4H\\Guimerà et al. - 2020 - A Bayesian machine scientist to aid in the solution of challenging scientific problems.pdf:application/pdf},
-}
-
-@book{kommenda_local_2018,
-	title = {Local Optimization and Complexity Control for Symbolic Regression / eingereicht von Michael Kommenda},
-	url = {http://epub.jku.at/obvulihs/2581907},
-	abstract = {Hochschulschriften. Local Optimization and Complexity Control for Symbolic Regression / eingereicht von Michael Kommenda. Linz, 2018},
-	author = {Kommenda, Michael},
-	urldate = {2025-06-28},
-	date = {2018},
-	langid = {english},
-	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\9LXXSHJ8\\Kommenda - 2018 - Local Optimization and Complexity Control for Symbolic Regression  eingereicht von Michael Kommenda.pdf:application/pdf},
-}
-
-@online{pci-sig_pci_2025,
-	title = {{PCI} Express 6.0 Specification {\textbar} {PCI}-{SIG}},
-	url = {https://pcisig.com/pci-express-6.0-specification},
-	author = {{PCI-SIG}},
-	urldate = {2025-06-28},
-	date = {2025},
-	file = {PCI Express 6.0 Specification | PCI-SIG:C\:\\Users\\danwi\\Zotero\\storage\\MSYN4ZIU\\pci-express-6.html:text/html},
-}
-
-@thesis{weinberger_vektoroperationen_2018,
-	title = {Vektoroperationen in der genetischen Programmierung},
-	institution = {University of Applied Sciences Upper Austria},
-	type = {phdthesis},
-	author = {Weinberger, Patrick},
-	date = {2018},
-}
--- a/thesis/title_page.pdf
+++ b/thesis/title_page.pdf
				`@ -1 +0,0 @@`
				[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":36814947,"gctimes":[1.082739415e9,9.35589349e8,8.95739997e8,8.82797331e8,8.44175578e8,8.27278981e8,8.24664534e8,8.41590342e8,8.23430705e8,8.26304622e8,8.7328356e8,8.48151374e8,8.20769383e8,8.36210366e8,8.25357919e8,8.18247354e8,8.05126298e8,8.10738655e8,8.14534413e8,8.05974078e8,8.08104945e8,8.07549224e8,8.11047079e8,8.36937224e8,8.19217772e8,8.03258649e8,8.00177357e8,8.05390572e8,7.81551092e8,7.84470283e8,7.84717493e8,7.87670826e8,7.91518273e8,7.95865535e8,7.9488509e8,7.85908564e8,7.96303832e8,7.83015419e8,7.98406799e8,7.95693404e8,7.89571842e8,7.87009536e8,7.92931167e8,8.0354065e8,8.01147304e8,7.90650725e8,7.91114336e8,8.14447424e8,8.09202389e8,8.0150787e8],"memory":19327142456,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[1.11960461697e11,1.12658407743e11,1.11797123654e11,1.14086430365e11,1.12540701243e11,1.13057199848e11,1.12421343743e11,1.12335917668e11,1.11873753956e11,1.12087309285e11,1.15372551368e11,1.12857587668e11,1.12212954999e11,1.12352839748e11,1.12799090735e11,1.12712852105e11,1.11910175268e11,1.12890418194e11,1.12536406676e11,1.12333546234e11,1.12414119618e11,1.12632975657e11,1.12274854817e11,1.13642350405e11,1.13191424262e11,1.12623305956e11,1.12519637206e11,1.12733882055e11,1.13175515626e11,1.12499258654e11,1.12175542007e11,1.14221603568e11,1.12620900601e11,1.12996891317e11,1.12370260538e11,1.12760626809e11,1.13153933145e11,1.12762108936e11,1.12758858333e11,1.13381876923e11,1.12152161607e11,1.12831962905e11,1.12135760011e11,1.14343808852e11,1.12720432473e11,1.13061653545e11,1.12414150523e11,1.13142168741e11,1.12805546557e11,1.13053409368e11]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]