Compare commits
27 Commits
e33be8f59e...4-interpre
278a493595
af3b72f196
4c60331288
101ccef67b
c6e2ce47aa
9746db56c0
5a9760d221
c68e0d04a0
258d33c338
20fcbab4ca
9e1094ac43
2a8de064a6
8afc3a5e3b
d8f5454e9c
2b9c394f1b
d9c83caad9
1dc0c1898d
ad175abac0
690ee33db1
effd477558
9df78ca72e
561b37160b
eaee21ca75
baa37ea183
db02e9f90f
f4f39ec47c
942adb8612
(Image diff: size 152 KiB before, 154 KiB after)
75
other/component_diagram_interpreter.drawio
Normal file
@ -0,0 +1,75 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.6">
<diagram name="Page-1" id="R-oAYELteez0U9UgfQ2t">
<mxGraphModel dx="2068" dy="1147" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="GDUa8-GdCzSgoxu7vCdt-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-4" target="GDUa8-GdCzSgoxu7vCdt-12" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-4" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="500" y="280" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-8" value="Interpreter" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;width=90;height=40;" parent="1" vertex="1">
<mxGeometry x="440" y="160" width="440" height="480" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-9" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=#000000;" parent="1" vertex="1">
<mxGeometry x="270" y="520" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-10" target="GDUa8-GdCzSgoxu7vCdt-4" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-15" value="<div align="left"><font style="font-size: 12px;"><b>Input:</b></font></div><div align="left"><font style="font-size: 12px;">Expressions</font></div><div align="left"><font style="font-size: 12px;">Variable-Sets</font></div><div align="left"><font style="font-size: 12px;">Parameters</font></div>" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" parent="GDUa8-GdCzSgoxu7vCdt-13" vertex="1" connectable="0">
<mxGeometry x="-0.4633" relative="1" as="geometry">
<mxPoint x="-33" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-10" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="270" y="280" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-11" value="CPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
<mxGeometry x="460" y="220" width="400" height="140" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-18" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-12" target="GDUa8-GdCzSgoxu7vCdt-17" edge="1">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="770" y="356" />
<mxPoint x="770" y="356" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-19" value="<div align="left"><font style="font-size: 12px;"><b>Input:<br></b></font></div><div align="left"><font style="font-size: 12px;">Processed Expressions</font></div><div align="left"><font style="font-size: 12px;">Variable-Sets</font></div><div align="left"><font style="font-size: 12px;">Parameters</font></div>" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" parent="GDUa8-GdCzSgoxu7vCdt-18" vertex="1" connectable="0">
<mxGeometry x="0.1565" y="-2" relative="1" as="geometry">
<mxPoint x="-48" y="-26" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-12" value="Dispatch Kernel" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="710" y="280" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-16" value="GPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
<mxGeometry x="680" y="456" width="180" height="139" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-25" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-17" target="GDUa8-GdCzSgoxu7vCdt-21" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-17" value="Evaluation" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="710" y="520" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-20" value="CPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
<mxGeometry x="460" y="456" width="170" height="139" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-22" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-21" target="GDUa8-GdCzSgoxu7vCdt-9" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-26" value="<div><font style="font-size: 12px;"><b>Output:</b></font></div><div><font style="font-size: 12px;">Evaluation-Results</font></div>" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="GDUa8-GdCzSgoxu7vCdt-22" vertex="1" connectable="0">
<mxGeometry x="0.4108" y="-1" relative="1" as="geometry">
<mxPoint x="13" y="1" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="GDUa8-GdCzSgoxu7vCdt-21" value="Retrieve Results" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="485" y="520" width="120" height="40" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
84
other/component_diagram_transpiler.drawio
Normal file
@ -0,0 +1,84 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.6">
<diagram name="Page-1" id="KFoKKVRmhU8qG_-FEeqA">
<mxGraphModel dx="2068" dy="1147" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-21" target="tQMPqDGkYp4bv8unJ6VJ-11" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-2" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="480" y="280" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-3" value="Transpiler" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;width=90;height=40;" parent="1" vertex="1">
<mxGeometry x="440" y="160" width="480" height="480" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-4" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=#000000;" parent="1" vertex="1">
<mxGeometry x="270" y="520" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-7" target="tQMPqDGkYp4bv8unJ6VJ-2" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-6" value="<div align="left"><font style="font-size: 12px;"><b>Input:</b></font></div><div align="left"><font style="font-size: 12px;">Expressions</font></div><div align="left"><font style="font-size: 12px;">Variable-Sets</font></div><div align="left"><font style="font-size: 12px;">Parameters</font></div>" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" parent="tQMPqDGkYp4bv8unJ6VJ-5" vertex="1" connectable="0">
<mxGeometry x="-0.4633" relative="1" as="geometry">
<mxPoint x="-16" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-7" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="270" y="280" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-8" value="CPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
<mxGeometry x="460" y="220" width="440" height="140" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-9" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-11" target="tQMPqDGkYp4bv8unJ6VJ-14" edge="1">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="820" y="420" />
<mxPoint x="820" y="420" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-10" value="<div align="left"><font style="font-size: 12px;"><b>Input:<br></b></font></div><div align="left"><font style="font-size: 12px;">Processed Expressions</font></div><div align="left"><font style="font-size: 12px;">Variable-Sets</font></div><div align="left"><font style="font-size: 12px;">Parameters</font></div>" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" parent="tQMPqDGkYp4bv8unJ6VJ-9" vertex="1" connectable="0">
<mxGeometry x="0.1565" y="-2" relative="1" as="geometry">
<mxPoint x="-48" y="-25" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-11" value="Dispatch Kernel" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="760" y="280" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-12" value="GPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
<mxGeometry x="720" y="456" width="180" height="134" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-14" target="tQMPqDGkYp4bv8unJ6VJ-18" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-14" value="Evaluation" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="760" y="520" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-15" value="CPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
<mxGeometry x="460" y="456" width="180" height="134" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-18" target="tQMPqDGkYp4bv8unJ6VJ-4" edge="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-17" value="<div><font style="font-size: 12px;"><b>Output:</b></font></div><div><font style="font-size: 12px;">Evaluation-Results</font></div>" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="tQMPqDGkYp4bv8unJ6VJ-16" vertex="1" connectable="0">
<mxGeometry x="0.4108" y="-1" relative="1" as="geometry">
<mxPoint x="13" y="1" as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-18" value="Retrieve Results" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="485" y="520" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-22" value="" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="tQMPqDGkYp4bv8unJ6VJ-2" target="tQMPqDGkYp4bv8unJ6VJ-21" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="600" y="300" as="sourcePoint" />
<mxPoint x="760" y="300" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="tQMPqDGkYp4bv8unJ6VJ-21" value="Code-Generation" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="620" y="280" width="120" height="40" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
@ -1,6 +1,6 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:129.0) Gecko/20100101 Firefox/129.0" version="24.7.6">
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.1.1">
<diagram name="Page-1" id="gpsZjoig8lt5hVv5Hzwz">
<mxGraphModel dx="989" dy="539" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<mxGraphModel dx="830" dy="457" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
@ -40,22 +40,22 @@
<mxPoint x="200" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-18" value="e1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" vertex="1" parent="9Xn2HrUYLFHSwPnNgvM3-13">
<mxCell id="9og6d5YY-6gPx96OlZrF-18" value="e1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
<mxGeometry width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-19" value="e2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" vertex="1" parent="9Xn2HrUYLFHSwPnNgvM3-13">
<mxCell id="9og6d5YY-6gPx96OlZrF-19" value="e2" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
<mxGeometry x="40" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-20" value="e3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" vertex="1" parent="9Xn2HrUYLFHSwPnNgvM3-13">
<mxCell id="9og6d5YY-6gPx96OlZrF-20" value="e3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
<mxGeometry x="80" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-21" value="e4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" vertex="1" parent="9Xn2HrUYLFHSwPnNgvM3-13">
<mxCell id="9og6d5YY-6gPx96OlZrF-21" value="e4" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
<mxGeometry x="120" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-22" value="e5" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" vertex="1" parent="9Xn2HrUYLFHSwPnNgvM3-13">
<mxCell id="9og6d5YY-6gPx96OlZrF-22" value="e5" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
<mxGeometry x="160" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-23" value="e6" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" vertex="1" parent="9Xn2HrUYLFHSwPnNgvM3-13">
<mxCell id="9og6d5YY-6gPx96OlZrF-23" value="e6" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="9Xn2HrUYLFHSwPnNgvM3-13" vertex="1">
<mxGeometry x="200" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-14" value="" style="group" parent="1" vertex="1" connectable="0">
@ -179,7 +179,7 @@
</mxGeometry>
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-44" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
<mxGeometry x="1040" y="440" width="40" height="40" as="geometry" />
<mxGeometry x="960" y="520" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-51" value="" style="rounded=0;whiteSpace=wrap;html=1;rotation=90;" parent="1" vertex="1">
<mxGeometry x="880" y="480" width="120" height="40" as="geometry" />
@ -208,12 +208,6 @@
<mxPoint x="1000" y="480" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-61" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="1040" y="480" as="sourcePoint" />
<mxPoint x="1080" y="480" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-62" value="" style="endArrow=none;html=1;rounded=0;exitX=0.167;exitY=1;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.167;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="920" y="480" as="sourcePoint" />
@ -244,7 +238,7 @@
<mxPoint x="1019.6700000000001" y="440" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-68" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" edge="1">
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-68" value="" style="endArrow=baseDash;html=1;rounded=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;endFill=0;endSize=18;" parent="1" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="1059.8300000000002" y="400" as="sourcePoint" />
<mxPoint x="1059.8300000000002" y="440" as="targetPoint" />
@ -313,8 +307,8 @@
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-95" value="<div>p5</div>" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="1000" y="600" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-96" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="1040" y="440" width="40" height="40" as="geometry" />
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-96" value="p3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="960" y="520" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-97" value="p1" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="920" y="440" width="40" height="40" as="geometry" />
@ -424,7 +418,7 @@
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-118" value="x3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
<mxGeometry x="640" y="560" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-12" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" edge="1" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-119">
<mxCell id="9og6d5YY-6gPx96OlZrF-12" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-119" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="720" y="740" as="targetPoint" />
<Array as="points">
@ -444,7 +438,7 @@
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-122" value="x3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
<mxGeometry x="600" y="560" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" edge="1" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-123">
<mxCell id="9og6d5YY-6gPx96OlZrF-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-123" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="720" y="780" as="targetPoint" />
<Array as="points">
@ -464,7 +458,7 @@
<mxCell id="9Xn2HrUYLFHSwPnNgvM3-126" value="x3" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;direction=south;" parent="1" vertex="1">
<mxGeometry x="560" y="560" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-11" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" edge="1" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-127">
<mxCell id="9og6d5YY-6gPx96OlZrF-11" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" parent="1" source="9Xn2HrUYLFHSwPnNgvM3-127" edge="1">
<mxGeometry relative="1" as="geometry">
<mxPoint x="720" y="700" as="targetPoint" />
<Array as="points">
@ -535,61 +529,61 @@
</Array>
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-9" value="" style="group" vertex="1" connectable="0" parent="1">
<mxCell id="9og6d5YY-6gPx96OlZrF-9" value="" style="group" parent="1" vertex="1" connectable="0">
<mxGeometry x="721" y="680" width="240" height="120" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-1" value="" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-1" value="" style="rounded=0;whiteSpace=wrap;html=1;" parent="9og6d5YY-6gPx96OlZrF-9" vertex="1">
<mxGeometry width="240" height="120" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-2" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-2" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="200" y="120" as="sourcePoint" />
<mxPoint x="200" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-3" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-3" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint y="40" as="sourcePoint" />
<mxPoint x="240" y="40" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-4" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-4" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint y="80" as="sourcePoint" />
<mxPoint x="240" y="80" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-5" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-5" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="40" y="120" as="sourcePoint" />
<mxPoint x="40" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-6" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-6" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="80" y="120" as="sourcePoint" />
<mxPoint x="80" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-7" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-7" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="119.65999999999997" y="120" as="sourcePoint" />
<mxPoint x="119.65999999999997" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-8" value="" style="endArrow=none;html=1;rounded=0;" edge="1" parent="9og6d5YY-6gPx96OlZrF-9">
<mxCell id="9og6d5YY-6gPx96OlZrF-8" value="" style="endArrow=none;html=1;rounded=0;" parent="9og6d5YY-6gPx96OlZrF-9" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="160" y="120" as="sourcePoint" />
<mxPoint x="160" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-10" value="<div>Results</div><div>Matrix</div>" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
<mxCell id="9og6d5YY-6gPx96OlZrF-10" value="<div>Results</div><div>Matrix</div>" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="721" y="630" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-16" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;labelPosition=left;verticalLabelPosition=middle;align=right;verticalAlign=middle;rotation=-90;" vertex="1" parent="1">
<mxCell id="9og6d5YY-6gPx96OlZrF-16" value="" style="shape=curlyBracket;whiteSpace=wrap;html=1;rounded=1;labelPosition=left;verticalLabelPosition=middle;align=right;verticalAlign=middle;rotation=-90;" parent="1" vertex="1">
<mxGeometry x="832" y="701" width="20" height="240" as="geometry" />
</mxCell>
<mxCell id="9og6d5YY-6gPx96OlZrF-17" value="Expression 1 through Expression n" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="1">
<mxCell id="9og6d5YY-6gPx96OlZrF-17" value="Expression 1 through Expression n" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="727" y="832" width="230" height="30" as="geometry" />
</mxCell>
</root>
112
other/kernel_architecture.drawio
Normal file
@ -0,0 +1,112 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.6">
<diagram name="Page-1" id="ZW0hAwE0V4rwrlzxzp_e">
<mxGraphModel dx="1426" dy="791" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="EzEPb8_loPXt5I1V_28y-3" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-1" target="EzEPb8_loPXt5I1V_28y-2">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-1" value="Interpreter" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="250" y="120" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-7" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-2" target="EzEPb8_loPXt5I1V_28y-4">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-8" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-2" target="EzEPb8_loPXt5I1V_28y-5">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-9" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-2" target="EzEPb8_loPXt5I1V_28y-6">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-2" value="Kernel" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="250" y="200" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-4" target="EzEPb8_loPXt5I1V_28y-11">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-4" value="Dispatch" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="180" y="280" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-15" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-5" target="EzEPb8_loPXt5I1V_28y-12">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-5" value="Dispatch" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="270" y="280" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-16" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-6" target="EzEPb8_loPXt5I1V_28y-13">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-6" value="Dispatch" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="360" y="280" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-11" value="Evaluate" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="180" y="360" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-12" value="Evaluate" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="270" y="360" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-13" value="<div>Evaluate</div>" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="360" y="360" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-36" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-18" target="EzEPb8_loPXt5I1V_28y-32">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-37" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-18" target="EzEPb8_loPXt5I1V_28y-33">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-38" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-18" target="EzEPb8_loPXt5I1V_28y-34">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-18" value="Transpiler" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="550" y="120" width="120" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-23" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" source="EzEPb8_loPXt5I1V_28y-24" target="EzEPb8_loPXt5I1V_28y-29" parent="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-24" value="Dispatch" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="480" y="280" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-25" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" source="EzEPb8_loPXt5I1V_28y-26" target="EzEPb8_loPXt5I1V_28y-30" parent="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-26" value="Dispatch" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="570" y="280" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-27" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" source="EzEPb8_loPXt5I1V_28y-28" target="EzEPb8_loPXt5I1V_28y-31" parent="1">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-28" value="Dispatch" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="660" y="280" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-29" value="Evaluate" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="480" y="360" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-30" value="Evaluate" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="570" y="360" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-31" value="<div>Evaluate</div>" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="660" y="360" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-39" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-32" target="EzEPb8_loPXt5I1V_28y-24">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-32" value="Kernel" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="480" y="200" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-40" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-33" target="EzEPb8_loPXt5I1V_28y-26">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-33" value="Kernel" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="570" y="200" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-41" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" edge="1" parent="1" source="EzEPb8_loPXt5I1V_28y-34" target="EzEPb8_loPXt5I1V_28y-28">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="EzEPb8_loPXt5I1V_28y-34" value="Kernel" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="660" y="200" width="80" height="40" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
40
other/pre-processing_result.drawio
Normal file
@ -0,0 +1,40 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.5">
<diagram name="Page-1" id="93wPJxm0qDUx-9UJ1EZK">
<mxGraphModel dx="1182" dy="655" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="399UxkHvPDb8lwnND9dC-1" value="X<sub>1</sub>" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="265" y="240" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-2" value="2" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="355" y="240" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-3" value="+" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="445" y="240" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-5" value="<div>Type: Variable</div><div>Value: 1</div>" style="rounded=0;whiteSpace=wrap;html=1;align=left;" vertex="1" parent="1">
<mxGeometry x="240" y="280" width="90" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-6" value="<div>Type: Constant</div><div>Value: 2</div>" style="rounded=0;whiteSpace=wrap;html=1;align=left;" vertex="1" parent="1">
<mxGeometry x="330" y="280" width="90" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-9" value="<div>Type: Operator</div><div>Value: Addition</div>" style="rounded=0;whiteSpace=wrap;html=1;align=left;" vertex="1" parent="1">
<mxGeometry x="420" y="280" width="90" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-10" value="X<sub>1</sub>" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="80" y="280" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="399UxkHvPDb8lwnND9dC-11" target="399UxkHvPDb8lwnND9dC-5">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-11" value="2" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="160" y="280" width="40" height="40" as="geometry" />
</mxCell>
<mxCell id="399UxkHvPDb8lwnND9dC-12" value="+" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="120" y="280" width="40" height="40" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
@ -1,13 +1,19 @@
name = "ExpressionExecutorCuda"
uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe"
authors = ["Daniel Wiplinger"]
authors = ["Daniel Roth"]
version = "1.0.0-DEV"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
LinearAlgebra = "1.11.0"
Printf = "1.11.0"
Random = "1.11.0"
julia = "1.6.7"

[extras]
207
package/src/Code.jl
Normal file
@ -0,0 +1,207 @@
using Printf

@enum Opcode::UInt8 begin
    opc_stop = 1 # must start with 1 here TODO: remove stop
    opc_add
    opc_sub
    opc_mul
    opc_div
    opc_inv
    opc_log
    opc_log10
    opc_exp
    opc_pow
    opc_powconst
    opc_powabs
    opc_neg
    opc_abs
    opc_sign
    opc_sin
    opc_asin
    opc_tan
    opc_tanh
    opc_cos
    opc_cosh
    opc_constant
    opc_param
    opc_variable
end

const terminal_opcodes = [opc_stop, opc_constant, opc_param, opc_variable]
const unary_opcodes = [opc_log, opc_log10, opc_exp, opc_abs, opc_sign, opc_sin, opc_cos, opc_cosh, opc_asin, opc_tan, opc_tanh, opc_powconst, opc_neg, opc_inv]
const binary_opcodes = [opc_add, opc_sub, opc_mul, opc_div, opc_pow, opc_powabs]

function opcode(sy::Symbol)::Opcode
    if sy == :+ return opc_add
    elseif sy == :- return opc_sub
    elseif sy == :* return opc_mul
    elseif sy == :/ return opc_div
    elseif sy == :inv return opc_inv
    elseif sy == :log return opc_log
    elseif sy == :log10 return opc_log10
    elseif sy == :exp return opc_exp
    elseif sy == :^ return opc_powabs # TODO: this is temporary to enforce that all powers are evaluated as pow(abs(...)) for parameter optimization
    elseif sy == :powabs return opc_powabs # TODO: this is temporary to enforce that all powers are evaluated as pow(abs(...)) for parameter optimization
    elseif sy == :abs return opc_abs
    elseif sy == :sign return opc_sign
    elseif sy == :sin return opc_sin
    elseif sy == :asin return opc_asin
    elseif sy == :cos return opc_cos
    elseif sy == :cosh return opc_cosh
    elseif sy == :tan return opc_tan
    elseif sy == :tanh return opc_tanh
    else error("no opcode for symbol $sy")
    end
end

function degree(opc::Opcode)::Integer
    if opc in terminal_opcodes return 0
    elseif opc in unary_opcodes return 1
    elseif opc in binary_opcodes return 2
    else error("unknown degree of opcode $opc")
    end
end


# code is a Vector{Instruction} which is a linear representation of a directed acyclic graph of expressions.
# The code can be evaluated from left to right.
struct Instruction{T}
    opcode::Opcode
    arg1idx::UInt32 # index of first argument. 0 for terminals
    arg2idx::UInt32 # index of second argument. 0 for functions with a single argument
    idx::UInt32 # for variables and parameters
    val::T # for constants
end
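
# For illustration: the expression x[1] * p[1] + 2.0 converts to linear code
# like the following, where arguments refer to earlier entries by index:
#   1: opc_variable  idx=1             (x[1])
#   2: opc_param     idx=1             (p[1])
#   3: opc_mul       arg1=1, arg2=2    (x[1] * p[1])
#   4: opc_constant  val=2.0
#   5: opc_add       arg1=3, arg2=4    (x[1] * p[1] + 2.0)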

function Base.show(io::IO, instr::Instruction)
    Printf.format(io, Printf.format"%15s %3d %3d %3d %f", instr.opcode, instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
end

create_const_instruction(val::T) where {T} = Instruction{T}(opc_constant, UInt32(0), UInt32(0), UInt32(0), val)
create_var_instruction(::Type{T}, varidx) where {T} = Instruction{T}(opc_variable, UInt32(0), UInt32(0), UInt32(varidx), zero(T))
create_param_instruction(::Type{T}, paramidx; val::T = zero(T)) where {T} = Instruction{T}(opc_param, UInt32(0), UInt32(0), UInt32(paramidx), val)


function convert_expr_to_code(::Type{T}, expr::Expr)::Vector{Instruction{T}} where {T}
    code = Vector{Instruction{T}}()

    Base.remove_linenums!(expr)
    paramTup = expr.args[1]
    xSy = paramTup.args[1]
    pSy = paramTup.args[2]
    body = expr.args[2]

    cache = Dict{Any,Int32}() # for de-duplication of expressions. If an expression is in the cache simply return the index of the existing code

    convert_expr_to_code!(code, cache, body, xSy, pSy)

    # for debugging
    # for tup in sort(cache; byvalue=true)
    #     println(tup)
    # end
    return code
end
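
# Usage sketch (hedged): convert_expr_to_code expects expr.args[1] to be the
# (x, p) parameter tuple and expr.args[2] the body. An Expr built directly has
# this shape; note that a quoted lambda :((x, p) -> ...) wraps its body in a
# block, which convert_expr_to_code! does not handle.
#   ex = Expr(:->, :((x, p)), :(x[1] * p[1] + 2.0))
#   code = convert_expr_to_code(Float64, ex)  # Vector{Instruction{Float64}}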

# uses cache (hashcons) to de-duplicate subexpressions in the tree.
function convert_expr_to_code!(code::Vector{Instruction{T}}, cache, val::TV, xSy, pSy)::UInt32 where {T,TV}
    if haskey(cache, val) return cache[val] end

    push!(code, create_const_instruction(T(val)))
    cache[val] = length(code)
    return length(code)
end

function convert_expr_to_code!(code::Vector{Instruction{T}}, cache, expr::Expr, xSy, pSy)::UInt32 where {T}
    # predicate to check if an expression is abs(...)
    is_abs(a) = a isa Expr && a.head == :call && a.args[1] == :abs

    if haskey(cache, expr) return cache[expr] end

    sy = expr.head
    if sy == :call
        func = expr.args[1]
        arg1idx::UInt32 = 0
        arg2idx::UInt32 = 0
        # unary functions
        if length(expr.args) == 2
            arg1idx = convert_expr_to_code!(code, cache, expr.args[2], xSy, pSy)
            if (func == :-)
                # - with one argument => negate
                push!(code, Instruction{T}(opc_neg, arg1idx, UInt32(0), UInt32(0), zero(T)))
            elseif (func == :sqrt)
                push!(code, Instruction{T}(opc_powconst, arg1idx, UInt32(0), UInt32(0), T(0.5)))
            else
                push!(code, Instruction{T}(opcode(func), arg1idx, UInt32(0), UInt32(0), zero(T)))
            end
        elseif length(expr.args) == 3
            arg1idx = convert_expr_to_code!(code, cache, expr.args[2], xSy, pSy)
            if func == :^ && expr.args[3] isa Number && round(expr.args[3]) == expr.args[3] # is integer
                # special case for constant powers
                push!(code, Instruction{T}(opc_powconst, arg1idx, UInt32(0), UInt32(0), T(expr.args[3])))
            elseif func == :^ && is_abs(expr.args[2])
                # fuse pow(abs(x), y) --> powabs(x,y)
                absexpr = expr.args[2]
                x = absexpr.args[2]
                arg1idx = convert_expr_to_code!(code, cache, x, xSy, pSy) # because of hashconsing this will return the index within the code for abs(x) generated above
                arg2idx = convert_expr_to_code!(code, cache, expr.args[3], xSy, pSy)
                push!(code, Instruction{T}(opc_powabs, arg1idx, arg2idx, UInt32(0), zero(T)))
            else
                arg2idx = convert_expr_to_code!(code, cache, expr.args[3], xSy, pSy)
                push!(code, Instruction{T}(opcode(func), arg1idx, arg2idx, UInt32(0), zero(T)))
            end
        else
            # dump(expr)
            error("only unary and binary functions are supported ($func is not supported)")
        end
    elseif sy == :ref
        arrSy = expr.args[1]
        idx = expr.args[2]
        if arrSy == xSy
            push!(code, create_var_instruction(T, idx))
        elseif arrSy == pSy
            push!(code, create_param_instruction(T, idx))
        else
            dump(expr)
            error("unknown symbol $arrSy")
        end
    else
        error("Unsupported symbol $sy")
    end

    cache[expr] = length(code)
    return length(code)
end
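
# De-duplication sketch: for (x[1] + p[1]) * (x[1] + p[1]) the second operand
# hits the cache (Expr hashes structurally), so the shared subexpression is
# emitted once and reused:
#   1: var x[1]   2: par p[1]   3: add(1, 2)   4: mul(3, 3)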

function Base.show(io::IO, code::AbstractArray{Instruction{T}}) where {T}
    sym = Dict(
        opc_stop => ".",
        opc_add => "+",
        opc_sub => "-",
        opc_neg => "neg",
        opc_mul => "*",
        opc_div => "/",
        opc_inv => "inv",
        opc_pow => "^",
        opc_powabs => "abs^",
        opc_powconst => "^c",
        opc_log => "log",
        opc_log10 => "l10",
        opc_exp => "exp",
        opc_abs => "abs",
        opc_sign => "sgn",
        opc_sin => "sin",
        opc_asin => "asin",
        opc_tan => "tan",
        opc_tanh => "tanh",
        opc_cos => "cos",
        opc_cosh => "cosh",
        opc_variable => "var",
        opc_constant => "con",
        opc_param => "par",
    )

    for i in eachindex(code)
        instr = code[i]
        Printf.format(io, Printf.format"%4d %4s %3d %3d %3d %f", i, sym[instr.opcode], instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
        println(io)
        # printfmtln(io, "{1:>4d} {2:>4s} {3:>3d} {4:>3d} {5:>3d} {6:>}", i, sym[instr.opcode], instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
    end
end
172
package/src/CpuInterpreter.jl
Normal file
@ -0,0 +1,172 @@
using Random

struct InterpreterBuffers{T}
    resultcache::Matrix{T} # for forward eval
    diffcache::Matrix{T} # for reverse AD
    jaccache::Matrix{T} # for Jacobian
    tmp::Vector{T} # a temporary space for each of the vector operations

    function InterpreterBuffers{T}(codelen, num_param, batchsize) where {T<:AbstractFloat}
        buf = Matrix{T}(undef, batchsize, codelen)
        rev_buf = Matrix{T}(undef, batchsize, codelen)
        jac_buf = Matrix{T}(undef, batchsize, num_param)
        tmp = Vector{T}(undef, batchsize)

        new(buf, rev_buf, jac_buf, tmp)
    end
end

mutable struct Interpreter{T}
    const code::Vector{Instruction{T}}
    const buffers::InterpreterBuffers{T}
    const batchsize::UInt32
    pc::Int32

    function Interpreter{T}(expr::Expr, num_param; batchsize = 1024) where {T<:AbstractFloat}
        code = convert_expr_to_code(T, expr)
        # println(code)
        buffers = InterpreterBuffers{T}(length(code), num_param, batchsize)
        new(code, buffers, batchsize, 1)
    end
end

peek_instruction(interpreter) = interpreter.code[interpreter.pc]


# batch size 1024 was fast in benchmark
interpret!(result::AbstractVector{T}, expr::Expr, x::AbstractMatrix{T}, p; batchsize=1024) where {T} = interpret!(result, Interpreter{T}(expr, length(p); batchsize), x, p)

# for Float evaluation use the preallocated buffer
function interpret!(result::AbstractVector{T}, interpreter::Interpreter{T}, x::AbstractMatrix{T}, p::AbstractArray{T}) where {T}
    interpret_withbuf!(result, interpreter, interpreter.buffers.resultcache, interpreter.buffers.tmp, x, p)
end

function interpret_withbuf!(result::AbstractVector{T}, interpreter::Interpreter{T}, batchresult, tmp, x::AbstractMatrix{T}, p::AbstractArray{TD}) where {T,TD}
    allrows = axes(x, 1)
    @assert length(result) == length(allrows)

    # all batches
    start = first(allrows)
    while start + interpreter.batchsize < last(allrows)
        batchrows = start:(start + interpreter.batchsize - 1)
        interpret_batch!(interpreter, batchresult, tmp, x, p, batchrows)
        copy!((@view result[batchrows]), (@view batchresult[:, end]))
        start += interpreter.batchsize
    end

    # process remaining rows
    remrows = start:last(allrows)
    if length(remrows) > 0
        interpret_batch!(interpreter, batchresult, tmp, x, p, remrows)
        copy!((@view result[remrows]), (@view batchresult[1:length(remrows), end]))
        # res += sum(view(batchresult, 1:length(remrows), lastcolidx))
    end
    # res
    result
end
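
# Worked example of the batch split above: for 2500 rows and batchsize = 1024
# the while loop handles rows 1:1024 and 1025:2048 as full batches, and the
# remainder pass evaluates rows 2049:2500, copying only the first 452 rows of
# batchresult into result.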
|
||||
|
||||
function interpret_batch!(interpreter,
|
||||
batchresult, tmp,
|
||||
x, p, rows)
|
||||
# forward pass
|
||||
interpret_fwd!(interpreter, batchresult, tmp, x, p, rows)
|
||||
|
||||
nothing
|
||||
end
|
||||
|
||||
function interpret_fwd!(interpreter, batchresult, tmp, x, p, rows)
|
||||
interpreter.pc = 1
|
||||
while interpreter.pc <= length(interpreter.code)
|
||||
step!(interpreter, batchresult, tmp, x, p, rows)
|
||||
end
|
||||
end

function step!(interpreter, batchresult, tmp, x, p, range)
    instr = interpreter.code[interpreter.pc]
    opc = instr.opcode
    res = view(batchresult, :, interpreter.pc)

    if degree(opc) == 0
        if opc == opc_variable
            copyto!(res, view(x, range, instr.idx))
        elseif opc == opc_param
            fill!(res, p[instr.idx])
        elseif opc == opc_constant
            fill!(res, instr.val)
        end
    elseif degree(opc) == 1
        arg = view(batchresult, :, instr.arg1idx)
        # is converted to a switch automatically by LLVM
        if opc == opc_log vec_log!(res, arg, tmp)
        elseif opc == opc_log10 vec_log10!(res, arg, tmp)
        elseif opc == opc_exp vec_exp!(res, arg, tmp)
        elseif opc == opc_abs vec_abs!(res, arg, tmp)
        elseif opc == opc_neg vec_neg!(res, arg, tmp)
        elseif opc == opc_inv vec_inv!(res, arg, tmp)
        elseif opc == opc_sign vec_sign!(res, arg, tmp)
        elseif opc == opc_powconst vec_powconst!(res, arg, instr.val, tmp)
        elseif opc == opc_sin vec_sin!(res, arg, tmp)
        elseif opc == opc_cos vec_cos!(res, arg, tmp)
        elseif opc == opc_cosh vec_cosh!(res, arg, tmp)
        elseif opc == opc_asin vec_asin!(res, arg, tmp)
        elseif opc == opc_tan vec_tan!(res, arg, tmp)
        elseif opc == opc_tanh vec_tanh!(res, arg, tmp)
        else throw(DomainError("Unsupported opcode $opc"))
        end
    elseif degree(opc) == 2
        left = view(batchresult, :, instr.arg1idx)
        right = view(batchresult, :, instr.arg2idx)

        if opc == opc_add vec_add!(res, left, right, tmp)
        elseif opc == opc_sub vec_sub!(res, left, right, tmp)
        elseif opc == opc_mul vec_mul!(res, left, right, tmp)
        elseif opc == opc_div vec_div!(res, left, right, tmp)
        elseif opc == opc_pow vec_pow!(res, left, right, tmp)
        elseif opc == opc_powabs vec_powabs!(res, left, right, tmp)
        else throw(DomainError("Unsupported opcode $opc"))
        end
        # if any(isnan, res)
        #     throw(DomainError("got NaN for $opc $(interpreter.pc) $left $right"))
        # end
    end

    interpreter.pc += 1
    return nothing
end

for unaryfunc in (:exp, :abs, :sin, :cos, :cosh, :asin, :tan, :tanh, :sinh)
    funcsy = Symbol("vec_$(unaryfunc)!")
    @eval function $funcsy(res::AbstractVector{T}, arg::AbstractVector{T}, ::AbstractVector{T}) where T<:Real
        @simd for i in eachindex(res)
            @inbounds res[i] = Base.$unaryfunc(arg[i])
        end
    end
end
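
# For illustration (editor's note): each iteration of the @eval loop above
# generates one specialized method; for :sin it is equivalent to writing
#     function vec_sin!(res, arg, ::AbstractVector)
#         @simd for i in eachindex(res)
#             @inbounds res[i] = Base.sin(arg[i])
#         end
#     end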

function vec_add!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] + right[i] end end
function vec_sub!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] - right[i] end end
function vec_mul!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] * right[i] end end
function vec_div!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] / right[i] end end
function vec_pow!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] ^ right[i] end end

# TODO: special case scalar power
function vec_powconst!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::TC, ::AbstractVector{TE}) where {TE<:Real,TC<:Real} @simd for i in eachindex(res) @inbounds res[i] = left[i] ^ right end end
function vec_powabs!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = abs(left[i]) ^ right[i] end end

function vec_neg!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = -arg[i] end end
function vec_inv!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = inv(arg[i]) end end
function vec_sign!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = sign(arg[i]) end end

# handle log and log10 specially to return NaN for negative arguments instead of throwing a DomainError
function vec_log!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = arg[i] < zero(TE) ? TE(NaN) : log(arg[i]) end end
function vec_log10!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = arg[i] < zero(TE) ? TE(NaN) : log10(arg[i]) end end
@@ -1,30 +1,76 @@
module ExpressionExecutorCuda
include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")
include("Transpiler.jl")

export interpret_gpu
module CpuInterpreter
include("Code.jl")
include("CpuInterpreter.jl")
end

export interpret_gpu,interpret_cpu
export evaluate_gpu
export test

# Some assertions:
# Variables and parameters are numbered starting at "1", meaning the first variable/parameter has to be "x1"/"p1" and not "x0"/"p0"
# Matrix X is column major
# each index i in exprs has to have its matching values in column i of Matrix X, so that X[:,i] contains the values for exprs[i]. The same goes for p
# These assertions are made because in Julia the first index does not have to be 1
#
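
# Editor's sketch of inputs that satisfy the assertions above (hypothetical sizes,
# mirroring test/PerformanceTests.jl):
#   exprs = [:(p1 * x1 + p2)]                    # names start at x1/p1, not x0/p0
#   X     = randn(Float32, 5, 100)               # column major, Float32
#   p     = [randn(Float32, 10) for _ in exprs]  # p[i] belongs to exprs[i]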

# Evaluate Expressions on the GPU
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    ncols = size(X, 2)

    results = Matrix{Float32}(undef, ncols, length(exprs))

    for i in 1:repetitions # Simulate parameter tuning
        results = Interpreter.interpret(exprs, X, p)
    end

    return results
end

# Convert Expressions to PTX Code and execute that instead
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    ncols = size(X, 2)

    results = Matrix{Float32}(undef, ncols, length(exprs))

    for i in 1:repetitions # Simulate parameter tuning
        results = Transpiler.evaluate(exprs, X, p)
    end

    return results
end


# Evaluate Expressions on the CPU
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    nrows = size(X, 1)

    # each column of the matrix has the result for an expr
    res = Matrix{Float32}(undef, nrows, length(exprs))

    for i in eachindex(exprs)
        # The interpreter holds the postfix code and buffers for evaluation. It is costly to create
        interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))

        # If an expression has to be evaluated multiple times (e.g. for different parameters),
        # it is worthwhile to reuse the interpreter to reduce the number of allocations
        for rep in 1:repetitions
            CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
        end
    end

    res
end
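
# Editor's sketch of a CPU-path call (mirrors test/CpuInterpreterTests.jl):
#   exprs = map(e -> Expr(:->, :(x,p), e), [:(p[1] * x[1] + p[2])])
#   X = randn(Float32, 100, 10)        # 100 rows, 10 variables
#   p = [randn(Float32, 10)]           # one parameter vector per expression
#   res = interpret_cpu(exprs, X, p; repetitions=10)   # 100x1 result matrix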

# Flow
# input: Vector expr == expressions contains eg. 4 expressions
@@ -35,3 +81,5 @@ end
# The following can be done on the CPU
# convert expression to postfix notation (mandatory)
# optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)

end
@@ -22,9 +22,9 @@ NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
"
function expr_to_postfix(expr::Expr)::PostfixType
    postfix = PostfixType()
    operator = get_operator(expr.args[1])
    @inbounds operator = get_operator(expr.args[1])

    for j in 2:length(expr.args)
    @inbounds for j in 2:length(expr.args)
        arg = expr.args[j]

        if typeof(arg) === Expr
@@ -71,6 +71,10 @@ function get_operator(op::Symbol)::Operator
        return EXP
    elseif op == :sqrt
        return SQRT
    elseif op == :powabs
        return POWER # TODO: Fix this
    else
        throw("Operator unknown")
    end
end
@@ -1,7 +1,9 @@
module Interpreter
using CUDA
using CUDA: i32
using StaticArrays
using ..ExpressionProcessing
using ..Utils

export interpret

@@ -11,22 +13,28 @@ export interpret
- variables::Matrix{Float32} : The variables to use. Each column is mapped to the variables x1..xn
- parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
"
function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}

    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
    @inbounds for i in eachindex(expressions)
        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
    end

    variableCols = size(variables, 2) # number of variable sets to use for each expression
    cudaVars = CuArray(variables)
    cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
    cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
    cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
    # put into a separate CuArray, as this is static and would be inefficient to send separately to every kernel
    cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
    cudaStepsize::CuArray{Int32} = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of parameters per expression; number of variables per expression

    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
    cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))

    # Start kernel for each expression to ensure that no warp is working on different expressions
    for i in eachindex(expressions)
        kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
        config = launch_configuration(kernel.fun)
        threads = min(variableCols, config.threads)
    @inbounds for i in eachindex(exprs)
        kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, convert(Int32, i))
        # config = launch_configuration(kernel.fun)
        threads = min(variableCols, 128)
        blocks = cld(variableCols, threads)

        kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
@@ -37,54 +45,56 @@ end

#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x
    stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int32}, exprIndex::Int32)
    varSetIndex = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
    @inbounds variableCols = length(variables) / stepsize[2]

    firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
    lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
    firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
    variableCols = length(variables) / stepsize[3]
    if varSetIndex > variableCols
        return
    end

    # firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
    # lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
    @inbounds firstParamIndex = ((exprIndex - 1i32) * stepsize[1]) # Exclusive

    operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
    operationStackTop = 0 # stores index of the last defined/valid value
    operationStackTop = 0i32 # stores index of the last defined/valid value

    for varSetIndex in index:stride
        firstVariableIndex = ((varSetIndex - 1) * stepsize[3]) # Exclusive
    @inbounds firstVariableIndex = ((varSetIndex - 1i32) * stepsize[2]) # Exclusive

    for i in firstExprIndex:lastExprIndex
        if expressions[i].Type == EMPTY
    @inbounds for expr in expressions
        if expr.Type == EMPTY
            break
        elseif expressions[i].Type == INDEX
            val = expressions[i].Value
            operationStackTop += 1
        elseif expr.Type == INDEX
            val = expr.Value
            operationStackTop += 1i32

            if val > 0
                operationStack[operationStackTop] = variables[firstVariableIndex + val]
            else
                val = -val
                val = abs(val)
                operationStack[operationStackTop] = parameters[firstParamIndex + val]
            end
        elseif expressions[i].Type == FLOAT32
            operationStackTop += 1
            operationStack[operationStackTop] = reinterpret(Float32, expressions[i].Value)
        elseif expressions[i].Type == OPERATOR
            type = reinterpret(Operator, expressions[i].Value)
        elseif expr.Type == FLOAT32
            operationStackTop += 1i32
            operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
        elseif expr.Type == OPERATOR
            type = reinterpret(Operator, expr.Value)
            if type == ADD
                operationStackTop -= 1
                operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1]
                operationStackTop -= 1i32
                operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1i32]
            elseif type == SUBTRACT
                operationStackTop -= 1
                operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1]
                operationStackTop -= 1i32
                operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1i32]
            elseif type == MULTIPLY
                operationStackTop -= 1
                operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1]
                operationStackTop -= 1i32
                operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1i32]
            elseif type == DIVIDE
                operationStackTop -= 1
                operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1]
                operationStackTop -= 1i32
                operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1i32]
            elseif type == POWER
                operationStackTop -= 1
                operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1]
                operationStackTop -= 1i32
                operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1i32]
            elseif type == ABS
                operationStack[operationStackTop] = abs(operationStack[operationStackTop])
            elseif type == LOG
@@ -95,57 +105,17 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
                operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
            end
        else
            operationStack[operationStackTop] = NaN
            operationStack[operationStackTop] = NaN32
            break
        end
    end

    # "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
    # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
    resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
    results[resultIndex] = operationStack[operationStackTop]
    end
    resultIndex = convert(Int, (exprIndex - 1i32) * variableCols + varSetIndex) # Inclusive
    @inbounds results[resultIndex] = operationStack[operationStackTop]

    return
end
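
# Editor's sketch (host-side, not part of the diff) of the postfix stack discipline
# the kernel implements: operand tokens push one slot, binary operator tokens
# consume the top two slots and push the combined result.
function eval_postfix_sketch(tokens)
    stack = Float32[]
    for t in tokens
        if t isa Float32
            push!(stack, t)           # INDEX/FLOAT32 tokens resolve to a value
        else
            b = pop!(stack); a = pop!(stack)
            push!(stack, t(a, b))     # binary OPERATOR tokens combine two values
        end
    end
    return stack[end]
end
# eval_postfix_sketch(Any[2f0, 3f0, +, 4f0, *]) == 20f0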

"Retrieves the number of entries for the largest inner vector"
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
    maxLength = 0
    @inbounds for i in eachindex(vec)
        if length(vec[i]) > maxLength
            maxLength = length(vec[i])
        end
    end

    return maxLength
end

"Returns a CuArray filled with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    dataCols = get_max_inner_length(data)
    dataRows = length(data)
    dataMat = convert_to_matrix(data, invalidElement)
    cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions
    copyto!(cudaArr, dataMat)

    return cudaArr
end

"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.

All entries that cannot be filled have ```invalidElement``` as their value
"
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    vecCols = get_max_inner_length(vec)
    vecRows = length(vec)
    vecMat = fill(invalidElement, vecCols, vecRows)

    for i in eachindex(vec)
        vecMat[:,i] = copyto!(vecMat[:,i], vec[i])
    end

    return vecMat
end

end
@@ -1,55 +1,112 @@
module Transpiler
using CUDA
using ..ExpressionProcessing
using ..Utils

# Number of threads per block/SM + max number of registers
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
# One SM can have 64*1024 32-bit registers at max
# One thread can at max use 255 registers
# Meaning one has access to at most 32 registers in the worst case. Using 64-bit values, this number gets halved (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))

# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread

# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
# This states that using fewer registers allows more threads to reside on a single SM, which improves performance.
# So I could use more registers at the expense of performance. Depending on how much this would simplify my algorithm, I might do this and leave more optimisation to future work

# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
# With 128 registers per thread, one SM can hold 16 warps (16 warps * 32 threads * 128 registers == 64*1024 == max number of registers per SM). This means 512 threads per SM in the worst case
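
# Editor's check of the register-budget arithmetic above:
#   regs_per_sm = 64 * 1024; regs_per_thread = 128
#   threads_per_sm = regs_per_sm / regs_per_thread   # = 512
#   warps_per_sm = threads_per_sm / 32               # = 16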

#
# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
#

const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed

function evaluate(expression::ExpressionProcessing.PostfixType, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})
    # TODO: think of how to do this. Probably get all expressions. Transpile them in parallel and then execute the generated code.
    cudaVars = CuArray(variables)
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
    varRows = size(variables, 1)
    variableCols = size(variables, 2)
    kernels = Vector{CuFunction}(undef, length(expressions))

    #kernel = transpile(expression, )
    # execute kernel.
    # TODO: test this again with multiple threads. The first time I tried, I was using only one thread
    # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
    # Threads.@threads for i in eachindex(expressions)
    #     cacheLock = ReentrantLock()
    #     cacheHit = false
    #     lock(cacheLock) do
    #         if haskey(cache, expressions[i])
    #             kernels[i] = cache[expressions[i]]
    #             cacheHit = true
    #         end
    #     end
    #
    #     if cacheHit
    #         continue
    #     end
    #
    #     formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
    #
    #     kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
    #
    #     linker = CuLink()
    #     add_data!(linker, "ExpressionProcessing", kernel)
    #
    #     image = complete(linker)
    #
    #     mod = CuModule(image)
    #     kernels[i] = CuFunction(mod, "ExpressionProcessing")
    #
    #     @lock cacheLock cache[expressions[i]] = kernels[i]
    # end

    @inbounds for i in eachindex(expressions)
        if haskey(cache, expressions[i])
            kernels[i] = cache[expressions[i]]
            continue
        end

        formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing

        linker = CuLink()
        add_data!(linker, "ExpressionProcessing", kernel)

        image = complete(linker)

        mod = CuModule(image)
        kernels[i] = CuFunction(mod, "ExpressionProcessing")
        cache[expressions[i]] = kernels[i]
    end

    cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))

    # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
    for kernel in kernels
        # config = launch_configuration(kernels[i])
        threads = min(variableCols, 96)
        blocks = cld(variableCols, threads)

        cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
    end

    return cudaResults
end

# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
# seekstart(buf1); write(buf2, buf1)
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer)::String
"
- param ```varSetSize```: The size of a variable set. Equal to the number of rows of the variable matrix (in a column major matrix)
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column major matrix, the number of rows depends on the longest parameter set
- param ```expressionIndex```: The 0-based index of the expression
"
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
    nrOfVariableSets::Integer, expressionIndex::Integer)::String
    exitJumpLocationMarker = "\$L__BB0_2"
    ptxBuffer = IOBuffer()
    regManager = Utils.RegisterManager(Dict(), Dict())

    # TODO: Suboptimal solution
    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Int32, Float32, Float32]) # nrOfVarSets, Vars, Params
    guardClause, threadIdReg = get_guard_clause(exitJumpLocationMarker, "%parameter0") # parameter0 because first entry holds the number of variable sets and that is always stored in %parameter0
    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
    guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)

    println(ptxBuffer, get_cuda_header())
    println(ptxBuffer, signature)
    println(ptxBuffer, "{")

    calc_code = generate_calculation_code(expression, "%parameter1", varSetSize, "%parameter2", paramSetSize, threadIdReg)
    println(ptxBuffer, get_register_definitions())
    calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
        threadId64Reg, expressionIndex, nrOfVariableSets, regManager)
    println(ptxBuffer, Utils.get_register_definitions(regManager))
    println(ptxBuffer, paramLoading)
    println(ptxBuffer, guardClause)
    println(ptxBuffer, calc_code)
@@ -59,20 +116,23 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer
    println(ptxBuffer, "}")

    generatedCode = String(take!(ptxBuffer))
    println(generatedCode)
    return generatedCode
end

# TODO: Make version, target and address_size configurable; also see what address_size means exactly
function get_cuda_header()::String
    return "
.version 7.1
.version 8.5
.target sm_61
.address_size 32
.address_size 64
"
end

function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::Tuple{String, String}
"
param ```parameters```: [1] = variables; [2] = parameters; [3] = results
"
function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, regManager::Utils.RegisterManager)::Tuple{String, String}

    signatureBuffer = IOBuffer()
    paramLoadingBuffer = IOBuffer()
    print(signatureBuffer, ".visible .entry ")
@@ -80,11 +140,11 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}):
    println(signatureBuffer, "(")

    for i in eachindex(parameters)
        print(signatureBuffer, " .param .u32", " ", "param_", i)
        print(signatureBuffer, " .param .u64", " ", "param_", i)

        parametersReg = get_next_free_register("r")
        println(paramLoadingBuffer, "ld.param.u32 $parametersReg, [param_$i];")
        println(paramLoadingBuffer, "cvta.to.global.u32 $(get_next_free_register("parameter")), $parametersReg;")
        parametersLocation = Utils.get_next_free_register(regManager, "rd")
        println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
        println(paramLoadingBuffer, "cvta.to.global.u64 $(Utils.get_next_free_register(regManager, "parameter")), $parametersLocation;")
        if i != lastindex(parameters)
            println(signatureBuffer, ",")
        end

@@ -99,38 +159,46 @@ Constructs the PTX code used for handling the case where too many threads are started

- param ```nrOfVarSetsRegister```: The register which holds the total amount of variable sets for the kernel
"
function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)::Tuple{String, String}
function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::Tuple{String, String}
    guardBuffer = IOBuffer()

    threadIds = get_next_free_register("r")
    threadsPerCTA = get_next_free_register("r")
    currentThreadId = get_next_free_register("r")
    threadIds = Utils.get_next_free_register(regManager, "r")
    threadsPerCTA = Utils.get_next_free_register(regManager, "r")
    currentThreadId = Utils.get_next_free_register(regManager, "r")

    # load data into above defined registers
    println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
    println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
    println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")

    globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
    breakCondition = get_next_free_register("p")
    nrOfVarSets = get_next_free_register("i")
    println(guardBuffer, "ld.global.u32 $nrOfVarSets, [$nrOfVarSetsRegister];")
    globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
    breakCondition = Utils.get_next_free_register(regManager, "p")
    println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
    println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index >= nrOfVariableSets
    println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets

    # branch to end if breakCondition is true
    print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
    println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")

    return (String(take!(guardBuffer)), globalThreadId)
    # Convert threadIdReg to a 64-bit register. Not 64 bit from the start, as this would take up more registers. Performance tests can be performed to determine if it is faster doing this, or making everything 64-bit from the start
    threadId64Reg = Utils.get_next_free_register(regManager, "rd")
    print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")

    return (String(take!(guardBuffer)), threadId64Reg)
end

function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesReg::String, variablesSetSize::Integer,
    parametersReg::String, parametersSetSize::Integer, threadIdReg::String)::String
"
- param ```parametersSetSize```: Size of the largest parameter set
"
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
    parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
    threadId64Reg::String, expressionIndex::Integer, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::String

    codeBuffer = IOBuffer()
    operands = Vector{Operand}()

    for i in eachindex(expression)
        token = expression[i]
    exprId64Reg = Utils.get_next_free_register(regManager, "rd")
    println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")

    for token in expression

        if token.Type == FLOAT32
            push!(operands, reinterpret(Float32, token.Value))
@@ -144,47 +212,57 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
        else
            left = pop!(operands)
        end
        operation, resultRegister = get_operation(operator, left, right)
        operation, resultRegister = get_operation(operator, regManager, left, right)

        println(codeBuffer, operation)
        push!(operands, resultRegister)
    elseif token.Type == INDEX
        if token.Value > 0 # variables
            var, first_access = get_register_for_name("x$(token.Value)")
            var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
            if first_access
                println(codeBuffer, load_into_register(var, variablesReg, token.Value, threadIdReg, variablesSetSize))
                println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
            end
            push!(operands, var)
        else
            absVal = abs(token.Value)
            param, first_access = get_register_for_name("p$absVal")
            param, first_access = Utils.get_register_for_name(regManager, "p$absVal")
            if first_access
                println(codeBuffer, load_into_register(param, parametersReg, absVal, threadIdReg, parametersSetSize))
                println(codeBuffer, load_into_register(param, parametersLocation, absVal, exprId64Reg, parametersSetSize, regManager))
            end
            push!(operands, param)
        end
    end
    end

    tempReg = Utils.get_next_free_register(regManager, "rd")
    # reg = pop!(operands)
    # tmp = "abs.f32 $(reg), 16.0;"
    # push!(operands, reg)
    println(codeBuffer, "
        add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
        mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
        st.global.f32 [$tempReg], $(pop!(operands));
    ")

    return String(take!(codeBuffer))
end

"
Loads a value from a location into the given register. It is assumed that the location refers to a column-major matrix

- param ```register```: The register where the loaded value will be stored
- param ```loadLocation```: The location from where to load the value
- param ```valueIndex```: 0-based index of the value in the variable set/parameter set
- param ```setIndexReg```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```
- param ```valueIndex```: 1-based index of the value in the variable set/parameter set
- param ```setIndexReg64```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
"
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg::String, setSize::Integer)::String
    # loadLocation + startIndex + valueIndex * bytes (4 in our case)
    # startIndex: setIndex * setSize
    tempReg = get_next_free_register("i")
    # we are using "sizeof(valueIndex)" because it has to use the same amount of bytes as the actual stored values, even though it could use more bytes
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg64::String, setSize::Integer, regManager::Utils.RegisterManager)::String
    tempReg = Utils.get_next_free_register(regManager, "rd")

    # "mad" calculates the offset and "add" applies the offset. Classical pointer arithmetic for accessing values of an array like in C
    return "
        mul.lo.u32 $tempReg, $setIndexReg, $setSize;
        add.u32 $tempReg, $tempReg, $(valueIndex*sizeof(valueIndex));
        add.u32 $tempReg, $loadLocation, $tempReg;
        mad.lo.u64 $tempReg, $setIndexReg64, $(setSize*BYTES), $((valueIndex - 1) * BYTES);
        add.u64 $tempReg, $loadLocation, $tempReg;
        ld.global.f32 $register, [$tempReg];"
end
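
# Editor's check of the offset arithmetic emitted by the new mad/add pair above
# (column-major layout, Float32 elements, 0-based set index, 1-based value index):
element_offset_sketch(setIndex0, setSize, valueIndex1) =
    setIndex0 * setSize * sizeof(Float32) + (valueIndex1 - 1) * sizeof(Float32)
# element_offset_sketch(3, 5, 2) == 64, i.e. 64 bytes past the base pointer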

@@ -200,8 +278,8 @@ function type_to_ptx_type(type::DataType)::String
    end
end

function get_operation(operator::Operator, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
    resultRegister = get_next_free_register("f")
function get_operation(operator::Operator, regManager::Utils.RegisterManager, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
    resultRegister = Utils.get_next_free_register(regManager, "f")
    resultCode = ""

    if is_binary_operator(operator) && isnothing(right)
@@ -219,6 +297,7 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
    elseif operator == POWER
        # x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
        resultCode = "
            // x^y:
            lg2.approx.f32 $resultRegister, $left;
            mul.f32 $resultRegister, $right, $resultRegister;
            ex2.approx.f32 $resultRegister, $resultRegister;"
@@ -227,11 +306,13 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
    elseif operator == LOG
        # log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
        resultCode = "
            // log(x):
            lg2.approx.f32 $resultRegister, $left;
            mul.f32 $resultRegister, $resultRegister, 0.693147182;"
    elseif operator == EXP
        # e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
        resultCode = "
            // e^x:
            mul.f32 $resultRegister, $left, 1.44269502;
            ex2.approx.f32 $resultRegister, $resultRegister;"
    elseif operator == SQRT
@@ -242,68 +323,5 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
    return (resultCode, resultRegister)
end
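
# Editor's sanity check (host-side sketch) of the identities used for POWER, LOG and EXP above:
let x = 3.0f0, y = 2.5f0
    @assert isapprox(x^y, 2^(y * log2(x)); rtol=1e-5)             # x^y == 2^(y*log2(x))
    @assert isapprox(log(x), log2(x) * 0.693147182f0; rtol=1e-5)  # log(x) == log2(x)*ln(2)
    @assert isapprox(exp(y), 2^(y * 1.44269502f0); rtol=1e-5)     # e^x == 2^(x/ln(2)), 1.44269502 == 1/ln(2)
end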

let registers = Dict() # stores the count of the registers already used.
    global get_next_free_register
    global get_register_definitions

    # By convention these names correspond to the following types:
    # - p -> pred
    # - f -> float32
    # - r -> 32 bit
    # - var -> float32 (used for variables and params)
    function get_next_free_register(name::String)::String
        if haskey(registers, name)
            registers[name] += 1
        else
            registers[name] = 1
        end

        return string("%", name, registers[name] - 1)
    end

    function get_register_definitions()::String
        registersBuffer = IOBuffer()

        for definition in registers
            regType = ""
            if definition.first == "p"
                regType = ".pred"
            elseif definition.first == "f"
                regType = ".f32"
            elseif definition.first == "var"
                regType = ".f32"
            elseif definition.first == "param"
                regType = ".f32"
            elseif definition.first == "r"
                regType = ".b32"
            elseif definition.first == "parameter"
                regType = ".u32"
            elseif definition.first == "i"
                regType = ".u32"
            else
                throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
            end
            println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
        end

        return String(take!(registersBuffer))
    end
end

let symtable = Dict()
    global get_register_for_name

    "Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
    function get_register_for_name(varName::String)
        if haskey(symtable, varName)
            return (symtable[varName], false)
        else
            reg = get_next_free_register("var")
            symtable[varName] = reg
            return (reg, true)
        end
    end
end

end

package/src/Utils.jl (new file, 88 lines)
@@ -0,0 +1,88 @@
module Utils

using CUDA

"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.

All entries that cannot be filled have ```invalidElement``` as their value
"
function convert_to_matrix(vecs::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    maxLength = get_max_inner_length(vecs)

    # Pad the shorter vectors with the invalidElement
    paddedVecs = [vcat(vec, fill(invalidElement, maxLength - length(vec))) for vec in vecs]
    vecMat = hcat(paddedVecs...)

    return vecMat
end
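
# Editor's example of the padding behaviour: each inner vector becomes one column,
# shorter ones are padded with invalidElement.
# isequal(convert_to_matrix([[1.0f0, 2.0f0], [3.0f0]], NaN32), Float32[1.0 3.0; 2.0 NaN])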

"Retrieves the number of entries for the largest inner vector"
function get_max_inner_length(vecs::Vector{Vector{T}})::Int where T
    return maximum(length.(vecs))
end

"Returns a CuArray filled with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    dataMat = convert_to_matrix(data, invalidElement)
    cudaArr = CuArray(dataMat)

    return cudaArr
end

struct RegisterManager
    registers::Dict
    symtable::Dict
end

function get_next_free_register(manager::RegisterManager, name::String)::String
    if haskey(manager.registers, name)
        manager.registers[name] += 1
    else
        manager.registers[name] = 1
    end

    return string("%", name, manager.registers[name] - 1)
end
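
# Editor's example: registers are handed out per name prefix with 0-based suffixes.
let rm = RegisterManager(Dict(), Dict())
    @assert get_next_free_register(rm, "f") == "%f0"
    @assert get_next_free_register(rm, "f") == "%f1"
    @assert get_next_free_register(rm, "rd") == "%rd0"
end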

function get_register_definitions(manager::RegisterManager)::String
    registersBuffer = IOBuffer()

    for definition in manager.registers
        regType = ""
        if definition.first == "p"
            regType = ".pred"
        elseif definition.first == "f"
            regType = ".f32"
        elseif definition.first == "var"
            regType = ".f32"
        elseif definition.first == "param"
            regType = ".f32"
        elseif definition.first == "r"
            regType = ".b32"
        elseif definition.first == "rd"
            regType = ".b64"
        elseif definition.first == "parameter"
            regType = ".b64"
        elseif definition.first == "i"
            regType = ".b64"
        else
            throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
        end
        println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
    end

    return String(take!(registersBuffer))
end

"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
function get_register_for_name(manager::RegisterManager, varName::String)
    if haskey(manager.symtable, varName)
        return (manager.symtable[varName], false)
    else
        reg = get_next_free_register(manager, "var")
        manager.symtable[varName] = reg
        return (reg, true)
    end
end

end

package/test/CpuInterpreterTests.jl (new file, 47 lines)
@@ -0,0 +1,47 @@
using LinearAlgebra
using BenchmarkTools

function test_cpu_interpreter(nrows; parallel = false)
    exprs = [
        # CPU interpreter requires an anonymous function and array refs
        :(p[1] * x[1] + p[2]),                              # 5 op
        :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]),          # 9 op
        :(log(abs(x[1]))),                                  # 3 op
        :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3]))  # 13 op
    ] # 30 op
    exprs = map(e -> Expr(:->, :(x,p), e), exprs)
    X = randn(Float32, nrows, 10)
    p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr

    # warmup
    interpret_cpu(exprs, X, p)
    expr_reps = 100 # for each expr
    reps = 100

    if parallel
        # t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
        t_sec = @belapsed parallel($exprs, $X, $p, $expr_reps, $reps) # minimum time in seconds, used for the GFLOPS estimate below
        println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    else
        # t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
        t_sec = @belapsed single($exprs, $X, $p, $expr_reps, $reps) # minimum time in seconds, used for the GFLOPS estimate below
        println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    end
    true
end
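
# The GFLOPS estimate above (editor's note): the four expressions contain ~30 scalar
# operations in total, so flops ~= 30 * reps * expr_reps * nrows.
gflops_sketch(nrows, reps, expr_reps, t_sec) = 30 * reps * expr_reps * nrows / 1e9 / t_sec
# gflops_sketch(1000, 100, 100, 1.0) == 0.3   # 0.3 GFLOPS if the run took one second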

function parallel(exprs, X, p, expr_reps, reps)
    fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
end

function single(exprs, X, p, expr_reps, reps)
    for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
end


# LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops

@test test_cpu_interpreter(1000)
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
@test test_cpu_interpreter(10000)
@test test_cpu_interpreter(10000, parallel=true)
@@ -1,6 +1,7 @@
using CUDA
using .ExpressionProcessing
using .Interpreter
using .Utils

expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
@@ -20,8 +21,8 @@ parameters[2][1] = 5.0
parameters[2][2] = 0.0

function testHelper(expression::Expr, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}, expectedResult)
    postfix = Vector([expr_to_postfix(expression)])
    result = Interpreter.interpret(postfix, variables, parameters)
    exprs = Vector([expression])
    result = Interpreter.interpret(exprs, variables, parameters)

    expectedResult32 = convert(Float32, expectedResult)
    @test isequal(result[1,1], expectedResult32)
@@ -35,7 +36,7 @@ end
    reference[2,2] = 0.0
    # reference = Matrix([5.0, NaN],
    #                    [5.0, 0.0])
    result = Interpreter.convert_to_matrix(parameters, NaN32)
    result = Utils.convert_to_matrix(parameters, NaN32)

    @test isequal(result, reference)
end
@@ -126,8 +127,8 @@ end
    expr1 = :((x1 + 5) * p1 - 3 / abs(x2) + (2^4) - log(8))
    expr2 = :(1 + 5 * x1 - 10^2 + (p1 - p2) / 9 + exp(x2))

    postfix = Vector([expr_to_postfix(expr1), expr_to_postfix(expr2)])
    result = Interpreter.interpret(postfix, var, param)
    exprs = Vector([expr1, expr2])
    result = Interpreter.interpret(exprs, var, param)

    # var set 1
    @test isapprox(result[1,1], 37.32, atol=0.01) # expr1

package/test/PerformanceTests.jl (new file, 179 lines)
@@ -0,0 +1,179 @@
using LinearAlgebra
using BenchmarkTools

using .Transpiler
using .Interpreter

const BENCHMARKS_RESULTS_PATH = "./results-fh"
exprsCPU = [
    # CPU interpreter requires an anonymous function and array refs
    :(p[1] * x[1] + p[2]),                              # 5 op
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]),          # 9 op
    :(log(abs(x[1]))),                                  # 3 op
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3]))  # 13 op
] # 30 op
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)

exprsGPU = [
    # GPU evaluators use plain variable/parameter symbols (x1, p1, ...)
    :(p1 * x1 + p2),                          # 5 op
    :((((x1 + x2) + x3) + x4) + x5),          # 9 op
    :(log(abs(x1))),                          # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1),p3))  # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # 100 parameter optimisation steps basically


@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

    # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)
end

@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

# After these tests have been redone, use Nsight Compute/Systems as described here:
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
# Systems and Compute are installable via WSL. The Compute UI can even be used inside WSL
# Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs)
# University setup at 10.20.1.7 if needed

compareWithCPU = true


suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

if compareWithCPU
    X_small = randn(Float32, varsets_small, 5)
    suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
    X_medium = randn(Float32, varsets_medium, 5)
    suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
    X_large = randn(Float32, varsets_large, 5)
    suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end

X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# tune!(suite)
# BenchmarkTools.save("params.json", params(suite))

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
results = run(suite, verbose=true, seconds=180)

if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

    BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/4-interpreter_using_int32.json", results)
else
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is interpreter better than old?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old) # is interpreter better than old?

    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is transpiler better than old?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old) # is transpiler better than old?


    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end

package/test/PerformanceTuning.jl (new file, 30 lines)
@@ -0,0 +1,30 @@
using CUDA

using .Transpiler
using .Interpreter

varsets_medium = 1000
X = randn(Float32, 5, varsets_medium)

exprsGPU = [
    # GPU evaluators use plain variable/parameter symbols (x1, p1, ...)
    :(p1 * x1 + p2),                          # 5 op
    :((((x1 + x2) + x3) + x4) + x5),          # 9 op
    :(log(abs(x1))),                          # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1),p3))  # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr
expr_reps = 1



@testset "Interpreter Tuning" begin
    CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps)
end


@testset "Transpiler Tuning" begin
    # CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
end
@@ -1,4 +1,8 @@
[deps]
BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@ -2,42 +2,65 @@ using CUDA
using .ExpressionProcessing
using .Transpiler

expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
parameters = Vector{Vector{Float32}}(undef, 2)
expressions = Vector{Expr}(undef, 3)
variables = Matrix{Float32}(undef, 5, 4)
parameters = Vector{Vector{Float32}}(undef, 3)

# Resulting value should be 1.14... for the first expression
expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4))
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3)
expressions[3] = :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5))

variables[1,1] = 2.0
variables[2,1] = 3.0
variables[1,2] = 0.0
variables[3,1] = 0.0
variables[4,1] = 1.0
variables[5,1] = 0.0

variables[1,2] = 2.0
variables[2,2] = 5.0
parameters[1] = Vector{Float32}(undef, 1)
variables[3,2] = 3.0
variables[4,2] = 0.0
variables[5,2] = 0.0

variables[1,3] = 6.0
variables[2,3] = 2.0
variables[3,3] = 2.0
variables[4,3] = 4.0
variables[5,3] = 2.0

variables[1,4] = 1.0
variables[2,4] = 2.0
variables[3,4] = 3.0
variables[4,4] = 4.0
variables[5,4] = 5.0

parameters[1] = Vector{Float32}(undef, 0)
parameters[2] = Vector{Float32}(undef, 2)
parameters[1][1] = 5.0
parameters[3] = Vector{Float32}(undef, 1)
parameters[2][1] = 5.0
parameters[2][2] = 0.0
parameters[3][1] = 16.0

@testset "Test transpiler evaluation" begin
results = Transpiler.evaluate(expressions, variables, parameters)

@testset "Test TMP transpiler" begin
postfixExpr = expr_to_postfix(expressions[1])
postfixExprs = Vector([postfixExpr])
push!(postfixExprs, expr_to_postfix(expressions[2]))
push!(postfixExprs, expr_to_postfix(:(5^3 + x1)))
# dump(expressions[3]; maxdepth=10)
# Expr 1:
@test isapprox(results[1,1], 1.14286)
@test isapprox(results[2,1], 1.14286)
@test isapprox(results[3,1], 1.14286)
@test isapprox(results[4,1], 1.14286)
# Expr 2:
@test isapprox(results[1,2], 16.0)
@test isapprox(results[2,2], 25.0)
@test isapprox(results[3,2], 54.0)
@test isapprox(results[4,2], 14.0)

# generatedCode = Transpiler.transpile(postfixExpr)
generatedCode = Transpiler.transpile(postfixExprs[3], 2, 3) # TEMP
# CUDA.@sync interpret(postfixExprs, variables, parameters)

# This is just here for testing. This will be called inside the execute method in the Transpiler module
linker = CuLink()
add_data!(linker, "ExpressionProcessing", generatedCode)

image = complete(linker)

mod = CuModule(image)
func = CuFunction(mod, "ExpressionProcessing")
# Expr 3:
@test isapprox(results[1,3], -0.07580)
@test isapprox(results[2,3], 0.55452)
@test isapprox(results[3,3], 12.19446)
@test isapprox(results[4,3], -67.41316)
end

# TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
1
package/test/params.json
Normal file
@ -0,0 +1 @@
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
1
package/test/results-fh/0-initial_results.json
Normal file
1
package/test/results-fh/1-256_blocksize.json
Normal file
1
package/test/results-fh/2-using_inbounds.json
Normal file
1
package/test/results-fh/3-tuned-blocksize_I128_T96.json
Normal file
1
package/test/results-fh/4-interpreter_using_int32.json
Normal file
1
package/test/results/0-initial_results.json
Normal file
1
package/test/results/1-256_blocksize.json
Normal file
1
package/test/results/2-using_inbounds.json
Normal file
1
package/test/results/4-interpreter_using_int32.json
Normal file
@ -2,12 +2,23 @@ using ExpressionExecutorCuda
using Test

const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
include(joinpath(baseFolder, "src", "Utils.jl"))
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
include(joinpath(baseFolder, "src", "Interpreter.jl"))
include(joinpath(baseFolder, "src", "Transpiler.jl"))

@testset "ExpressionExecutorCuda.jl" begin
include("ExpressionProcessingTests.jl")
include("InterpreterTests.jl")
include("TranspilerTests.jl")
@testset "Functionality tests" begin
# include("ExpressionProcessingTests.jl")
# include("InterpreterTests.jl")
# include("TranspilerTests.jl")
end

# @testset "CPU Interpreter" begin
#     include("CpuInterpreterTests.jl")
# end

@testset "Performance tests" begin
# include("PerformanceTuning.jl")
include("PerformanceTests.jl")
end
@ -1,33 +1,165 @@
\chapter{Concept and Design}
\label{cha:conceptdesign}
introduction to what needs to be done. also clarify terms "Host" and "Device" here
% introduction to what needs to be done. also clarify terms "Host" and "Device" here
To be able to determine whether evaluating mathematical expressions on the GPU is better suited than evaluating them on the CPU, a prototype needs to be implemented. More specifically, one prototype will interpret these expressions on the GPU, while a second will transpile them into code that can be executed by the GPU. The goal of this chapter is to describe how these two prototypes can be implemented conceptually. First, the requirements for the prototypes as well as the data they operate on are explained. This is followed by the design of the interpreter and the transpiler. The CPU interpreter will not be described, as it already exists.

% TODO: maybe describe CPU interpreter too? We will see

\section[Requirements]{Requirements and Data}
short section.
Multiple expressions; vars for all expressions; params unique to expression; operators that need to be supported
The main goal of both prototypes, or evaluators, is to provide a speed-up compared to the CPU interpreter already in use. However, it is also important to determine which evaluator provides the greater speed-up. This also means that if one of the evaluators is faster, it is intended to replace the CPU interpreter. Therefore, they must have similar capabilities and meet the following requirements (a concrete sketch of the data shapes follows the list):

\begin{itemize}
\item Multiple expressions as input.
\item All input expressions have the same number of variables ($x_n$), but can have a different number of parameters ($p_n$).
\item The variables are parametrised using a matrix of the form $k \times N$, where $k$ is the number of variables in the expressions and $N$ is the number of different parametrisations for the variables. This matrix is the same for all expressions.
\item The parameters are parametrised using a vector of vectors. Each vector $v_i$ corresponds to an expression $e_i$.
\item The following operations must be supported: $x + y$, $x - y$, $x * y$, $x / y$, $x ^ y$, $|x|$, $\log(x)$, $e^x$ and $\sqrt{x}$. Note that $x$ and $y$ can either stand for a value, a variable, or another operation.
\item The results of the evaluations are returned in a matrix of the form $k \times N$. In this case, $k$ is equal to the $N$ of the variable matrix and $N$ is equal to the number of input expressions.
\end{itemize}
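A minimal sketch of these shapes in Julia; all names are illustrative and not part of the package API:

\begin{verbatim}
exprs = [:(x1 + p1), :((x1 + x2) * x2)]  # two expressions over the same variables
X = rand(Float32, 2, 5)                  # k = 2 variables, N = 5 variable sets
ps = [Float32[0.5], Float32[]]           # one parameter vector per expression
# expected result matrix: 5 rows (variable sets) x 2 columns (expressions)
\end{verbatim}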
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{input_output_explanation.png}
\caption{This diagram shows what the input and output look like and how they interact with each other.}
\label{fig:input_output_explanation}
\end{figure}


\section{Interpreter}
as introduction to this section talk about what "interpreter" means in this context. so "gpu parses expr and calculates"
With this, the required capabilities are outlined. However, the input and output data need to be further explained for a better understanding. The first input contains the expressions that need to be evaluated. These can have any length and can contain constant values, variables and parameters, all of which are linked together with the supported operations. In the example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ through $e_6$. Next is the variable matrix. One entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example, the values in row three are used to parametrise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using every variable set. In the provided example, there are three variable sets, each holding the values for four variables $x_1$ through $x_4$. After all expressions have been evaluated using all variable sets, the results of these evaluations must be stored in the results matrix. Each entry in this matrix holds the resulting value of the evaluation of one expression parametrised with one variable set. The row indicates the variable set while the column indicates the expression.

\subsection{Architecture}
talk about the coarse grained architecture on how the interpreter will work. (.5 to 1 page probably)
This is the minimal functionality needed to evaluate expressions with variables generated by a symbolic regression algorithm. In the case of parameter optimisation, it is useful to have a different type of variable, called parameter. For parameter optimisation, the best fitting parameters for the given variable sets need to be found. To achieve this, the evaluator is called multiple times with different parameters, but the same variables. The results are then evaluated for their fitness by the caller. In this case, the parameters do not change within one call. Parameters could therefore be treated as constant values of the expressions, and no separate input for them would be needed. However, providing the possibility to have the parameters as an input makes the process of parameter optimisation easier. Unlike variables, not all expressions need to have the same number of parameters. Therefore, they are structured as a vector of vectors and not a matrix. The example in Figure \ref{fig:input_output_explanation} shows how the parameters are structured. For example, one expression has zero parameters, while another has six parameters $p_1$ through $p_6$. It needs to be mentioned that, just like the number of variables, the number of parameters per expression is not limited. It is also possible to completely omit the parameters if they are not needed. Because these evaluators will primarily be used in parameter optimisation use-cases, allowing parameters as an input is required.

\subsection{Host}
talk about the steps taken to prepare for GPU interpretation
% \subsection{Non-Goals}
% Probably a good idea. Probably move this to "introduction"
\section{Architecture}

\subsection{Device}
talk about how the actual interpreter will be implemented
Based on the requirements above, the architecture of both prototypes can be designed. While the requirements only specify the input and output, the components and workflow also need to be specified. This section aims at giving an architectural overview of both prototypes, alongside their design decisions.

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{kernel_architecture.png}
\caption{The interpreter has one kernel that is dispatched multiple times, while the transpiler has multiple kernels that are dispatched once. This helps to eliminate thread divergence.}
\label{fig:kernel_architecture}
\end{figure}

\section{Transpiler}
as introduction to this section talk about what "transpiler" means in this context. so "cpu takes expressions and generates ptx for gpu execution"
A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch, as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which therefore allows up to 128 kernels to run concurrently. Dispatching a kernel for each expression therefore has the potential to improve performance. In the case of the interpreter, having only one kernel that is dispatched once per expression also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many small kernels instead of one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself.

\subsection{Architecture}
talk about the coarse grained architecture on how the transpiler will work. (.5 to 1 page probably)
\subsection{Pre-Processing}
The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Additionally, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the front-end of a compiler as described in Section \ref{sec:compilers}. The conversion into the intermediate representation transforms the expressions from infix-notation into postfix-notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from containing the expression in postfix-notation, the intermediate representation also contains the information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look as depicted in Figure \ref{fig:pre-processing_results} after the pre-processing step.
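For illustration, a hedged sketch of such a token stream in Julia; the encoding is an assumption, and the actual types in ExpressionProcessing.jl may differ:

\begin{verbatim}
# hypothetical token encoding: (type, value) pairs
const CONSTANT, VARIABLE, PARAMETER, OPERATOR = 1, 2, 3, 4
const ADD = 1                          # hypothetical operator code

# the infix expression x1 + 2 becomes this postfix token stream:
tokens = [(VARIABLE, 1.0f0), (CONSTANT, 2.0f0), (OPERATOR, Float32(ADD))]
\end{verbatim}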
\subsection{Host}
talk about how the transpiler is implemented
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{pre-processing_result.png}
\caption{This diagram shows how an expression is transformed in the pre-processing step.}
\label{fig:pre-processing_results}
\end{figure}

\subsection{Device}
talk about what the GPU does. short section since the gpu does not do much
It would have also been possible to perform the pre-processing step on the GPU. However, pre-processing a single expression cannot easily be split across multiple threads, which means one GPU thread would need to process one expression. As described in Section \ref{sec:gpgpu}, a single GPU thread is slower than a single CPU thread, which means the processing would also be slower. Furthermore, it would not make sense to process all expressions in a single kernel. This would lead to a lot of thread divergence, which essentially means processing one expression after the other. The SIMT programming model might help with parallelising at least some parts of the processing work. However, the generated expressions can differ a lot from each other, and restricting them to be similar and therefore SIMT friendly would likely reduce the overall quality of the symbolic regression algorithm. Therefore, it does not make sense to perform the pre-processing step on the GPU. This is a typical example of code that is better run on the CPU, also because the parallelisation possibility of one thread per expression can be applied to the CPU as well. Concepts like caching processed expressions, or caching parts of the processed expressions, can also be employed on the CPU. This would not be possible on the GPU, because a GPU cannot save state between two kernel dispatches.

\subsection{Interpreter}

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{component_diagram_interpreter.png}
\caption{This diagram depicts the coarse-grained workflow of the interpreter. It shows how the parts interact with each other and with the system it will operate in.}
\label{fig:component_diagram_interpreter}
\end{figure}

The interpreter consists of two parts. The CPU side is the part of the program that interacts with both the GPU and the caller. An overview of the components and the workflow of the interpreter can be seen in Figure \ref{fig:component_diagram_interpreter}. Once the interpreter receives the expressions, they are pre-processed. This ensures the expressions are valid, and that they are transformed into the intermediate representation needed for evaluating them. The results of this pre-processing are then sent to the GPU, which performs the actual interpretation of the expressions. Alongside the expressions, the data for the variables and parameters also needs to be sent to the GPU. Once all the data resides on the GPU, the interpreter kernel can be dispatched. It needs to be noted that a separate kernel dispatch is performed for each of the expressions. As already described, this decision has been made to reduce thread divergence and therefore increase performance. In fact, dispatching the same kernel multiple times with different expressions means that no thread divergence will occur, as explained later. Once the GPU has finished evaluating all expressions with all variable sets, the result is stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements.

Evaluating the expressions is relatively straightforward. Because the expressions are in postfix-notation, the actual interpreter only has to iterate over all tokens once and perform the appropriate task for each. If the interpreter encounters a binary operator, it simply reads the previous two values and performs the operation specified by the operator. For unary operators, only the previous value must be read. As already mentioned, expressions in postfix-notation implicitly contain the operator precedence, therefore no look-ahead or other strategies need to be used to ensure correct evaluation. Algorithm \ref{alg:eval_interpreter} shows how the interpreter works. Note that this is a simplified version that only works with additions, multiplications, constant values and variables.

\begin{algorithm}
\caption{Interpreting an equation in postfix-notation}\label{alg:eval_interpreter}
\begin{algorithmic}[1]
\Procedure{Evaluate}{\textit{expr}: PostfixExpression}
\State $\textit{stack} \gets []$

\While{HasTokenLeft(\textit{expr})}
\State $\textit{token} \gets \text{GetNextToken}(\textit{expr})$
\If{$\textit{token.Type} = \text{Constant}$}
\State Push($\textit{stack}$, $\textit{token.Value}$)
\ElsIf{$\textit{token.Type} = \text{Variable}$}
\State Push($\textit{stack}$, GetVariable($\textit{token.Value}$))
\ElsIf{$\textit{token.Type} = \text{Operator}$}
\If{$\textit{token.Value} = \text{Addition}$}
\State $\textit{right} \gets \text{Pop}(\textit{stack})$
\State $\textit{left} \gets \text{Pop}(\textit{stack})$
\State Push($\textit{stack}$, $\textit{left} + \textit{right}$)
\ElsIf{$\textit{token.Value} = \text{Multiplication}$}
\State $\textit{right} \gets \text{Pop}(\textit{stack})$
\State $\textit{left} \gets \text{Pop}(\textit{stack})$
\State Push($\textit{stack}$, $\textit{left} * \textit{right}$)
\EndIf
\EndIf
\EndWhile

\Return $\text{Pop}(\textit{stack})$
\EndProcedure
\end{algorithmic}
\end{algorithm}

If a new operator is needed, it must simply be added as another else-if block inside the operator branch. New token types like variables or parameters can also be added with a new outer else-if block that checks for these token types. However, the pre-processing step also needs to be extended with these new operators and token types. Otherwise, the expressions will never reach the evaluation step, as they would be seen as invalid. It is also possible to add unary operators like $\log()$. In this case, only one value would be read from the stack, the operation would be applied, and the result would be written back to the stack.
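A hedged Julia sketch of this evaluation loop, reusing the token encoding from the pre-processing sketch above and adding the unary $\log$ case just described; MUL and LOG are further hypothetical operator codes:

\begin{verbatim}
const MUL, LOG = 2, 3

function evaluate(tokens::Vector{Tuple{Int, Float32}}, vars::Vector{Float32})
    stack = Float32[]
    for (type, value) in tokens
        if type == CONSTANT
            push!(stack, value)
        elseif type == VARIABLE
            push!(stack, vars[Int(value)])
        elseif type == OPERATOR
            if value == ADD
                right = pop!(stack); left = pop!(stack)
                push!(stack, left + right)
            elseif value == MUL
                right = pop!(stack); left = pop!(stack)
                push!(stack, left * right)
            elseif value == LOG            # unary: pop only one value
                push!(stack, log(pop!(stack)))
            end
        end
    end
    return pop!(stack)                     # final stack value = result
end

# x1 + 2 with x1 = 3 evaluates to 5.0f0:
evaluate([(VARIABLE, 1.0f0), (CONSTANT, 2.0f0), (OPERATOR, 1.0f0)], Float32[3.0])
\end{verbatim}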
Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for every expression that needs to be evaluated, to eliminate thread divergence. Thread divergence can only happen on data dependent branches. In this case, the while loop and every if and else-if statement contain a data dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators and variables in the same order, which would lead to each thread taking different paths. However, one expression always has the same constants, operators and variables in the same locations, meaning all threads will take the same paths. This also means that despite the interpreter containing many data dependent branches, these branches only depend on the expression itself. Because of this, all threads will take the same paths and therefore never diverge from one another as long as they execute the same expression.

\subsection{Transpiler}

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{component_diagram_transpiler.png}
\caption{This diagram depicts the coarse-grained workflow of the transpiler. It shows how the parts interact with each other and with the system it will operate in.}
\label{fig:component_diagram_transpiler}
\end{figure}

Similar to the interpreter, the transpiler also consists of a part that runs on the CPU and a part that runs on the GPU. When looking at the components and workflow of the transpiler, as shown in Figure \ref{fig:component_diagram_transpiler}, it is almost identical to the interpreter. However, the key difference between the two is the additional code generation, or transpilation, step. Apart from that, the transpiler also requires the same pre-processing step and uses the GPU to evaluate the expressions. However, the GPU evaluator generated by the transpiler works differently to the GPU evaluator of the interpreter. The difference between these evaluators will be explained later.

Before the expressions can be transpiled into PTX code, they need to be pre-processed. As already described, this step ensures the validity of the expressions and transforms them into the intermediate representation described above. As with the interpreter, this also simplifies the code generation step at the cost of some performance, because the validity has to be ensured and the intermediate representation needs to be generated. However, in this case the benefit of having a simple code generation step was more important than performance. By transforming the expressions into postfix-notation, the code generation follows a similar pattern to the interpretation already described. Algorithm \ref{alg:transpile} shows how the transpiler takes an expression, transpiles it and then returns the finished code. It can be seen that the while loop is the same as the while loop of the interpreter. The main difference lies in the operator branches. Because code now needs to be generated, the branches call their designated code generation function, such as $\textit{GetAddition}$. However, this function cannot simply return the code that performs the addition. When executed, this addition also returns a value which will be needed as an input by other operators. Therefore, not only the code fragment but also a reference to the result must be returned. This reference can then be pushed onto the stack for later use, just as the interpreter stores the actual value for later use. The code fragment must also be appended to the already generated code so that it can be returned to the caller. As with the interpreter, there is a final value on the stack when the loop has finished. Once the code is executed, this value is the reference to the result of the expression. This value then needs to be stored in the results matrix, so that it can be retrieved by the CPU after all expressions have been executed on the GPU. Therefore, one last code fragment must be generated to handle the storage of this value in the results matrix. This fragment must then be appended to the code already generated, and the transpilation process is completed.

\begin{algorithm}
\caption{Transpiling an equation in postfix-notation}\label{alg:transpile}
\begin{algorithmic}[1]
\Procedure{Transpile}{\textit{expr}: PostfixExpression}: String
\State $\textit{stack} \gets []$
\State $\textit{code} \gets$ ""

\While{HasTokenLeft(\textit{expr})}
\State $\textit{token} \gets \text{GetNextToken}(\textit{expr})$
\If{$\textit{token.Type} = \text{Constant}$}
\State Push($\textit{stack}$, $\textit{token.Value}$)
\ElsIf{$\textit{token.Type} = \text{Variable}$}
\State ($\textit{codeFragment}, \textit{referenceToValue}$) $\gets$ GetVariable($\textit{token.Value}$)
\State Push($\textit{stack}$, $\textit{referenceToValue}$)
\State Append($\textit{code}$, $\textit{codeFragment}$)
\ElsIf{$\textit{token.Type} = \text{Operator}$}
\If{$\textit{token.Value} = \text{Addition}$}
\State $\textit{right} \gets \text{Pop}(\textit{stack})$
\State $\textit{left} \gets \text{Pop}(\textit{stack})$
\State $(\textit{referenceToValue}, \textit{codeFragment}) \gets \text{GetAddition}(\textit{left}, \textit{right})$
\State Push($\textit{stack}$, $\textit{referenceToValue}$)
\State Append($\textit{code}$, $\textit{codeFragment}$)
\ElsIf{$\textit{token.Value} = \text{Multiplication}$}
\State $\textit{right} \gets \text{Pop}(\textit{stack})$
\State $\textit{left} \gets \text{Pop}(\textit{stack})$
\State $(\textit{referenceToValue}, \textit{codeFragment}) \gets \text{GetMultiplication}(\textit{left}, \textit{right})$
\State Push($\textit{stack}$, $\textit{referenceToValue}$)
\State Append($\textit{code}$, $\textit{codeFragment}$)
\EndIf
\EndIf
\EndWhile

\State $\textit{codeFragment} \gets$ GenerateResultStoring($\text{Pop}(\textit{stack})$)
\State Append($\textit{code}$, $\textit{codeFragment}$)

\Return $\textit{code}$
\EndProcedure
\end{algorithmic}
\end{algorithm}
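As an illustration of what such a code generation function could look like, a hedged Julia sketch of $\textit{GetAddition}$ emitting a PTX fragment; the register naming scheme and the function name are assumptions for this sketch:

\begin{verbatim}
# returns (reference to the result, PTX code fragment) for left + right
function get_addition(left::String, right::String, regcounter::Ref{Int})
    result = "%f$(regcounter[] += 1)"          # claim the next free .f32 register
    fragment = "add.f32 $result, $left, $right;\n"
    return (result, fragment)
end

regs = Ref(2)                     # %f1 and %f2 already in use
get_addition("%f1", "%f2", regs)  # -> ("%f3", "add.f32 %f3, %f1, %f2;\n")
\end{verbatim}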
The code generated by the transpiler is the kernel for the transpiled expressions. This means that a new kernel must be generated for each expression that needs to be evaluated. This is in contrast to the interpreter, which has one kernel that is dispatched once per expression. However, generating one kernel per expression results in a much simpler kernel, which can focus on evaluating the postfix expression from left to right. No overhead work, like branching or managing a stack, is needed. However, this overhead is now offloaded to the transpilation step on the CPU, as can be seen in Algorithm \ref{alg:transpile}. There is also a noticeable overhead in that a kernel has to be generated for each expression. In cases like parameter optimisation, many of the expressions will be transpiled multiple times, as the transpiler is called multiple times with the same expressions.

Both the transpiler and the interpreter have their respective advantages and disadvantages. While the interpreter puts less load on the CPU, the GPU has to perform more work. Much of this work is branching or managing a stack, and therefore involves many instructions that are not used to evaluate the expression itself. However, this overhead is mitigated by the fact that it is performed in parallel and not sequentially.

On the other hand, the transpiler performs more work on the CPU. The kernels are much simpler, and most of the instructions are used to evaluate the expressions themselves. Furthermore, as explained in Section \ref{sec:ptx}, any program running on the GPU must be transpiled into PTX code before the driver can compile it into machine code. Therefore, the kernel written for the interpreter must also be transpiled into PTX. This overhead comes in addition to the branch instruction overhead. The self-written transpiler removes this intermediate step by transpiling directly to PTX. In addition, the generated code is tailored to evaluating expressions and does not need to be generic PTX code, which can reduce transpilation time.

Unlike the GPU, the CPU can manage state across multiple calls. Concepts such as caches can therefore be employed by the transpiler to reduce the overhead on the CPU. In cases such as parameter optimisation, where the expressions remain the same over multiple calls, the resulting PTX code can be cached. As a result, the same expression does not need to be transpiled multiple times, drastically reducing the transpilation time and therefore improving the overall performance of the transpiler.
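A minimal sketch of such a cache; transpile_one stands in for the real transpilation entry point, which takes additional arguments in the actual package:

\begin{verbatim}
const ptx_cache = Dict{Expr, String}()

transpile_one(ex::Expr) = "// PTX for $ex"    # stub for illustration

# transpiles on the first call; afterwards the cached PTX is returned
transpile_cached(ex::Expr) = get!(() -> transpile_one(ex), ptx_cache, ex)

transpile_cached(:(x1 + p1))   # transpiles and stores
transpile_cached(:(x1 + p1))   # cache hit, no re-transpilation
\end{verbatim}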
@ -5,3 +5,5 @@ Summarise the results
\section{Future Work}
talk about what can be improved

Transpiler: transpile the expression directly from the Julia AST -> would save time because no intermediate representation needs to be created (loses a step and gains performance, but also makes the transpiler itself more complex)
@ -1,14 +1,40 @@
\chapter{Evaluation}
\label{cha:evaluation}

The aim of this thesis is to determine whether at least one of the GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation. First, the environment in which the performance tests are performed is explained. Then the individual results for the GPU interpreter and the transpiler are presented. In addition, this part also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to those of the CPU evaluator in order to answer the research questions of this thesis.

\section{Test environment}
Explain the hardware used, as well as the actual data (how many expressions, variables etc.)

three scenarios -> few, normal and many variable sets; expr repetitions to simulate parameter optimisation
BenchmarkTools.jl -> 1000 samples per scenario
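A hedged sketch of how such a suite could be assembled with BenchmarkTools.jl; exprs, X_small and p stand in for the real test inputs, and interpret_gpu is the interpreter entry point used in the test scripts above:

\begin{verbatim}
using BenchmarkTools, Statistics

suite = BenchmarkGroup()
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprs, X_small, p)

tune!(suite)
results = run(suite; samples=1000, verbose=true)
median(results["GPUI"])   # median estimates, as used for judge() above
\end{verbatim}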
\section{Results}
talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and a CPU interpreter)

\subsection{Interpreter}
Results only for Interpreter
Results only for Interpreter (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section)
\subsection{Performance tuning}
Document the process of performance tuning

Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in kernel)

1.) Block size reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> noticeable improvement in 2 out of 3
3.) Tuned block size with Nsight Compute -> slight improvement
4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably because of a lot more waiting time, or more type conversions happening on the GPU? Would need to look at the PTX)

\subsection{Transpiler}
Results only for Transpiler
Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section)
\subsection{Performance tuning}
Document the process of performance tuning

Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled

1.) Block size reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> small improvement only on CPU-side code
3.) Tuned block size with Nsight Compute -> slight improvement
4.) Only changed things on the interpreter side

\subsection{Comparison}
Comparison of Interpreter and Transpiler as well as comparing the two with the CPU interpreter
@ -1,13 +1,34 @@
\chapter{Implementation}
\label{cha:implementation}

somewhere in here explain why one kernel per expression and not one kernel for all expressions

\section{Technologies}
Short section; CUDA, PTX, Julia, CUDA.jl

Probably reference the performance evaluation papers for Julia and CUDA.jl

\section{Expression Processing}
Talk about why this needs to be done and how it is done (the why is basically: it simplifies the evaluation/transpilation process; the how is in ExpressionProcessing.jl)

\section{Interpreter}
Talk about how the interpreter has been developed.

UML activity diagram

main loop; kernel transpiled by CUDA.jl into PTX and then executed

Memory access (currently global memory only)
no dynamic memory allocation like on the CPU (the stack needs to have a fixed size)
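A hedged sketch of what a fixed-size evaluation stack inside a CUDA.jl kernel could look like; MVector comes from StaticArrays (already a test dependency), and MAX_STACK_SIZE is an assumed bound on expression depth:

\begin{verbatim}
using CUDA, StaticArrays

const MAX_STACK_SIZE = 25            # assumed upper bound

function kernel_with_stack!(results)
    # fixed-size, register-friendly stack: no dynamic allocation on the device
    stack = MVector{MAX_STACK_SIZE, Float32}(undef)
    top = 0
    stack[top += 1] = 1.0f0          # push a constant
    stack[top += 1] = 2.0f0          # push another constant
    v = stack[top - 1] + stack[top]  # consume the two topmost values
    top -= 2
    stack[top += 1] = v              # push the result back
    results[threadIdx().x] = stack[top]
    return nothing
end

results = CUDA.zeros(Float32, 32)
@cuda threads=32 kernel_with_stack!(results)
\end{verbatim}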
\section{Transpiler}
Talk about how the transpiler has been developed
Talk about how the transpiler has been developed (probably the largest section, because it just has more interesting parts)

UML activity diagram

Front-End and Back-End
Caching of back-end results

PTX code generated and compiled using CUDA.jl (so basically the driver) and then executed

Memory access (global memory and register management, especially register management)
@ -11,15 +11,13 @@ Optimisation and acceleration of program code is a crucial part in many fields.
The following expression $5 - \text{abs}(x_1) * \text{sqrt}(x_2) / 10 + 2 \char`^ x_3$, which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$, is one example that can be generated by the equation learning algorithm. Usually, an equation learning algorithm generates multiple such expressions per iteration. Out of these expressions, all possibly relevant ones have to be evaluated. Additionally, multiple different values need to be inserted for all variables and parameters, drastically increasing the number of evaluations that need to be performed.

In his Blog \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option as compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration also can be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}.

%The free lunch theorem as described by \textcite{adam_no_2019} states that to gain additional performance, a developer cannot just hope for future hardware to be faster, especially on a single core.
In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive, as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration can also be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}.

% TODO: Incorporate PTX somehow

\section{Research Question}
With these successful implementations of GPU acceleration, this thesis also attempts to improve the performance of evaluating mathematical equations using GPUs. Therefore, the following research questions are formulated:
With these successful implementations of GPU acceleration, this thesis also attempts to improve the performance of evaluating mathematical equations, generated at runtime for symbolic regression, using GPUs. Therefore, the following research questions are formulated:

\begin{itemize}
\item How can simple arithmetic expressions that are generated at runtime be efficiently evaluated on GPUs?
@ -41,7 +39,7 @@ In order to answer the research questions, this thesis is divided into the follo
\item[Chapter 4: Implementation] \mbox{} \\
This chapter explains the implementation of the GPU interpreter and transpiler. The details of the implementation and the technologies used are covered, such as the interpretation process and the transpilation of the expressions into Parallel Thread Execution (PTX) code.
\item[Chapter 5: Evaluation] \mbox{} \\
The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other and the form of the expressions used for the comparisons are outlined. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other, and the form of the expressions used for the comparisons is outlined. The comparison will not only include the time taken for the pure evaluation, but also the overhead, such as PTX code generation. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
\item[Chapter 6: Conclusion] \mbox{} \\
In the final chapter, the entire work is summarised. A brief overview of the implementation as well as the evaluation results will be provided. Additionally, an outlook of possible future research is given.
\end{description}
@ -25,7 +25,7 @@ Graphics cards (GPUs) are commonly used to increase the performance of many diff
While in the early days of GPGPU programming a lot of research has been done to assess if this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs have been used early on to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 2 for the most compute intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains, they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022}, which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration, as shown by \textcite{han_packetshader_2010}, where a significantly higher throughput than with a CPU-only implementation was achieved. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs, as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it also needs to be noted that GPUs are not always better performing than CPUs, as illustrated by \textcite{lee_debunking_2010}, so it is important to consider whether it is worth using GPUs for specific tasks.

\subsection{Programming GPUs}
The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having a staggering $192$ cores and twice as many threads. To demonstrate the complexity of a simple one core 8-bit CPU \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of one CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flows on a single program strand and on modern CPUs even multiple strands simultaneously \parencite{palacios_comparison_2011}. However, as seen in section \ref{sec:gpgpu}, this often isn't enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count a single GPU core has to be much simpler than one CPU core. As described by \textcite{nvidia_cuda_2025} a GPU designates much more transistors towards floating-point computations. This results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in figure \ref{fig:cpu_vs_gpu}.
The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores, with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. To demonstrate the complexity of a simple one core 8-bit CPU, \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of one CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flows on a single program strand and, on modern CPUs, even multiple strands simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than one CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU dedicates many more transistors towards floating-point computations. This results in less efficient integer arithmetic and control flow handling. There is also less cache available per core, and clock speeds are usually also much lower than those on a CPU. An overview of the differences between a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.

\begin{figure}
\centering
@ -34,9 +34,10 @@ The development process on a GPU is vastly different from a CPU. A CPU has tens
\label{fig:cpu_vs_gpu}
\end{figure}

Despite these drawbacks, the sheer number of cores, makes a GPU a valid choice when considering improving the performance of an algorithm. Because of the high number of cores, GPUs are best suited for data parallel scenarios. This is due to the SIMD architecture of these cards. SIMD stands for Sinlge-Instruction Multiple-Data and states that there is a single stream of instructions that is executed on a huge number of data streams. \textcite{franchetti_efficient_2005} and \textcite{tian_compiling_2012} describe ways of using SIMD instructions on the CPU. Their approaches lead to noticeable speed-ups of 3.3 and 4.7 respectively by using SIMD instructions instead of serial computations. Extending this to GPUs which are specifically built for SIMD/data parallel calculations shows why they are so powerful despite having less complex and slower cores than a CPU.
Despite these drawbacks, the sheer number of cores makes a GPU a valid choice when considering improving the performance of an algorithm. Because of the high number of cores, GPUs are best suited for data parallel scenarios. This is due to the SIMD architecture of these cards. SIMD stands for Single-Instruction Multiple-Data and states that there is a single stream of instructions that is executed on a huge number of data streams. \textcite{franchetti_efficient_2005} and \textcite{tian_compiling_2012} describe ways of using SIMD instructions on the CPU. Their approaches lead to noticeable speed-ups of 3.3 and 4.7 respectively by using SIMD instructions instead of serial computations. Extending this to GPUs, which are specifically built for SIMD/data parallel calculations, shows why they are so powerful despite having less complex and slower cores than a CPU. It is also important to note that a GPU also always needs a CPU, as the CPU is responsible for sending the data to the GPU and starting the GPU program. In GPGPU programming, the CPU is usually called the host, while the GPU is usually called the device.

\subsubsection{Thread Hierarchy and Tuning}
\label{sec:thread_hierarchy}
The thousands of cores on a GPU, also called threads, are grouped together in several categories. This is the thread hierarchy of GPUs. The developer can influence this grouping to a degree, which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also depends on the GPU used, which means it is important to test many different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm.

At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a hardware unit responsible for scheduling and executing threads and also contains the registers used by these threads. An SM always executes a group of 32 threads simultaneously, and this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped in a block, with one block typically containing a maximum of $1024$ threads but often configured to contain less. Therefore, if more than $1024$ threads are required, more blocks must be created. Blocks can also be grouped into thread block clusters, which is optional, but can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that the blocks can be scheduled independently, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block, and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they share memory.
@ -48,7 +49,7 @@ At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a
\label{fig:thread_hierarchy}
\end{figure}

A piece of code that is executed on a GPU is written as a kernel which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads and blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in section \ref{sec:memory_model}. Considering the case where an image needs to be blurred, it not only simplifies the development if threads are arranged in a 2D grid, it also helps with optimising memory access. As the threads in a block, need to access a lot of the same data, this data can be loaded in the shared memory of the block. This allows the data to be accessed much quicker compared to when threads are allocated in only one dimension. With one dimensional blocks it is possible that threads assigned to nearby pixels, are part of a different block, leading to a lot of duplicate data transfer. The size in each dimension of a block can be almost arbitrary within the maximum allowed number of threads. However, blocks that are too large might lead to other problems which are described in more detail in section \ref{sec:occupancy}.
A piece of code that is executed on a GPU is written as a kernel, which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads, blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in Section \ref{sec:memory_model}. Considering the case where an image needs to be blurred, arranging threads in a 2D grid not only simplifies the development, it also helps with optimising memory access. As the threads in a block need to access a lot of the same data, this data can be loaded into the shared memory of the block. This allows the data to be accessed much quicker compared to when threads are allocated in only one dimension. With one dimensional blocks, it is possible that threads assigned to nearby pixels are part of different blocks, leading to a lot of duplicate data transfer. The size in each dimension of a block can be almost arbitrary within the maximum allowed number of threads. However, blocks that are too large might lead to other problems, which are described in more detail in Section \ref{sec:occupancy}.
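To make this configuration concrete, a hedged CUDA.jl sketch of launching a kernel with a 2D block layout for the blurring example; the kernel body is a placeholder that only copies pixels:

\begin{verbatim}
using CUDA

function blur_kernel!(out, img)
    x = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    y = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    if x <= size(img, 1) && y <= size(img, 2)
        @inbounds out[x, y] = img[x, y]   # a real blur would average neighbours
    end
    return nothing
end

w, h = 1920, 1080
img_in  = CUDA.rand(Float32, w, h)
img_out = CUDA.zeros(Float32, w, h)

# 16x16 = 256 threads per block, arranged in 2D to match the image
@cuda threads=(16, 16) blocks=(cld(w, 16), cld(h, 16)) blur_kernel!(img_out, img_in)
\end{verbatim}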
All threads in a warp start at the same point in a program, but with their own instruction address, allowing them to work independently. Because of the SIMD architecture, all threads in a warp must execute the same instructions and if threads start diverging, the SM must pause threads with different instructions and execute them later. Figure \ref{fig:thread_divergence} shows how such divergences can impact performance. The situation described by the figure also shows, that after the divergence the thread could re-converge. On older hardware this does not happen and leads to T2 being executed after T1 and T3 are finished. In situations where a lot of data dependent thread divergence happens, most of the benefits of using a GPU likely have vanished. Threads not executing the same instruction is strictly speaking against the SIMD principle but can happen in reality, due to data dependent branching. Consequently, this leads to bad resource utilisation, which in turn leads to worse performance. Another possibility of threads being paused (inactive threads) is the fact that sometimes, the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads but only the threads with work are executed.
@ -68,7 +69,7 @@ Modern GPUs implement the so called Single-Instruction Multiple-Thread (SIMT) ar
% - Memory allocation (with the one paper diving into dynamic allocations)
% - Memory transfer (with streams potentially)
On a GPU there are two parts that contribute to the performance of an algorithm. The first, already covered, is the compute portion of the GPU: if threads are serialised or run inefficiently, nothing else can make the algorithm execute faster. However, algorithms run on a GPU usually require huge amounts of data to be processed, as they are designed for exactly that purpose. The purpose of this section is to explain how the memory model of the GPU works and how it can influence the performance of an algorithm. Figure \ref{fig:gpu_memory_layout} depicts the memory layout and the kinds of memory available; the different parts are explained in this section.
\begin{figure}
\centering
@ -77,7 +78,7 @@ On a GPU there are two parts that contribute to the performance of an algorithm.
\label{fig:gpu_memory_layout}
\end{figure}
On a GPU there are multiple levels and kinds of memory available, each optimised for a different purpose. This means it is important to know what they are and how they can best be used for specific tasks. On the lowest level, threads have registers and local memory available. Registers are the fastest memory to access, but they are also the least abundant, with a maximum of 255 32-bit registers per thread on Nvidia GPUs and 256 on AMD GPUs \parencite{amd_hardware_2025}. However, using all registers of a thread can lead to other problems, which are described in more detail in Section \ref{sec:occupancy}. Thread-local memory, on the other hand, is significantly slower than registers. This is due to the fact that local memory is actually stored in global memory and therefore has the same limitations, which are explained later. This means it is important to avoid local memory as much as possible. Local memory is usually only used when a thread uses too many registers. The compiler will then spill the remaining data into local memory and load it into registers once needed, drastically slowing down the application.
Shared memory is the next tier of memory on a GPU. Unlike local memory and registers, shared memory is shared between all threads inside a block. The amount of shared memory depends on the GPU architecture, but for Nvidia it hovers at around 100 kilobytes (KB) per block. While this memory is slower than registers, its primary use-case is communicating and sharing data between threads in a block. If all threads in a block access a lot of overlapping data, this data can be loaded from global memory into the faster shared memory once. It can then be accessed multiple times, further increasing performance. Loading data into shared memory and accessing that data has to be done manually. Because shared memory is part of the unified data cache, it can either be used as a cache or managed manually, meaning a developer can allocate more shared memory towards caching if needed. Another feature of shared memory is the so-called memory banks. Shared memory is always split into 32 equally sized memory modules, also called memory banks. All available memory addresses lie in one of these banks. This means that if two threads access two memory addresses which lie in different banks, the accesses can be performed simultaneously, increasing the throughput.
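The manual staging described above could look like the following hypothetical Julia sketch: each block copies its slice of the input into shared memory once, synchronises, and afterwards every thread of the block can reuse the on-chip copy:
\begin{GenericCode}[numbers=none]
# Hypothetical block-wise sum using shared memory; launched with 256 threads
# per block so the static allocation matches the block size.
function block_sum!(out, x)
    tile = CuStaticSharedArray(Float32, 256) # shared by all threads of a block
    tid = threadIdx().x
    i = (blockIdx().x - 1) * blockDim().x + tid
    tile[tid] = i <= length(x) ? x[i] : 0.0f0
    sync_threads()       # the tile is now complete for the whole block
    if tid == 1          # any thread could reuse tile[] cheaply from here on;
        s = 0.0f0        # for brevity only thread 1 consumes it
        for j in 1:blockDim().x
            s += tile[j]
        end
        out[blockIdx().x] = s
    end
    return
end
\end{GenericCode}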
@ -118,13 +119,14 @@ When starting a kernel, the most important configuration is the number of thread
In general, it is important to have as many warps as possible ready for execution. While this means that many warps could be executed but are not, this is actually desired. A key feature of GPUs is so-called latency hiding, meaning that while one warp waits, for example for data to be retrieved, another warp that is ready can be run. With low occupancy, and therefore few to no warps waiting for execution, latency hiding does not work, as the hardware is now idle. As a result, the runtime increases, which also explains why high occupancy is not guaranteed to improve performance, while low occupancy can and often will increase the runtime.
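CUDA.jl offers an occupancy API that suggests a launch configuration for a compiled kernel, which can serve as a starting point before manual tuning. A sketch of its use, with a hypothetical axpy! kernel:
\begin{GenericCode}[numbers=none]
using CUDA

function axpy!(y, a, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(y)
        @inbounds y[i] += a * x[i]
    end
    return
end

x = CUDA.rand(Float32, 1_000_000)
y = CUDA.rand(Float32, 1_000_000)
kernel = @cuda launch=false axpy!(y, 2.0f0, x) # compile without launching
config = launch_configuration(kernel.fun)  # occupancy-based suggestion
threads = min(length(y), config.threads)
blocks = cld(length(y), threads)
kernel(y, 2.0f0, x; threads, blocks)
\end{GenericCode}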
As seen in Table \ref{tab:compute_capabilities}, there exist different limitations that can impact occupancy. The number of warps per SM is important, as it determines the degree of parallelism achievable per SM. If, due to other limitations, the number of warps per SM is below the maximum, hardware sits idle. One such limitation is the number of registers per block and SM. In the case of compute capability 8.9, one SM can handle $32 \cdot 48 = 1\,536$ threads. This leaves $64\,000 / 1\,536 \approx 41$ registers per thread, which is lower than the theoretical maximum of $255$ registers per thread. Typically, one register is mapped to one variable in the kernel code, meaning a developer can use up to 41 variables in their code. However, if a variable needs 64 bits to store its value, the register usage doubles, as all registers on a GPU are 32-bit. On a GPU with compute capability 10.x a developer can use up to $64\,000 / 2\,048 \approx 31$ registers. Of course a developer can use more registers, but this results in lower occupancy. However, depending on the algorithm, using more registers might benefit performance more than higher occupancy would, in which case occupancy is not as important. If a developer needs more than $255$ registers for their variables, the additional variables will spill into local memory, which is, as described in Section \ref{sec:memory_model}, not desirable.
Additionally, shared memory consumption can also impact occupancy. If, for example, a block needs all the available shared memory, which is almost the same as the amount of shared memory per SM, that SM can only serve this one block. On compute capability 10.x, this would mean that occupancy is at most $50\%$, as a block can have up to $1\,024$ threads while an SM supports up to $2\,048$ threads. Again, in such cases it needs to be determined whether the performance gain of using this much shared memory is worth the lower occupancy.
Balancing these limitations, and therefore occupancy and performance, often requires a lot of trial and error with the help of the aforementioned tools. In cases where occupancy is already high and the number of warps ready for execution is also high, other areas for performance improvements need to be explored. Algorithmic optimisation is always a good idea. Some performance improvements can be achieved by altering the computations to use different parts of the GPU. One such optimisation is using FP32 operations wherever possible. Another well-suited optimisation is to rewrite the algorithm to use as many Fused Multiply-Add (FMA) instructions as possible. FMA is a special floating point instruction that multiplies two values and adds a third, all in a single clock cycle \parencite{nvidia_cuda_2025-1}. However, the result might deviate slightly from performing these two operations separately, which means that in accuracy-sensitive scenarios this instruction should be avoided. If the compiler detects a floating point operation with the FMA structure, it will automatically be compiled to an FMA instruction. To prevent this, in C++ the developer can call the intrinsics \_\_fadd\_rn and \_\_fmul\_rn for addition and multiplication respectively.
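The effect of the single rounding can be illustrated with a small, constructed Julia example, where the fma function performs the fused operation:
\begin{GenericCode}[numbers=none]
a = 1.0f0 / 3.0f0
b = 3.0f0
c = -1.0f0
separate = a * b + c    # product is rounded, then the sum is rounded again
fused    = fma(a, b, c) # multiply and add with a single rounding at the end
# separate == 0.0f0, while fused is a small non-zero value that preserves
# the rounding error of a; accuracy-sensitive code must choose deliberately.
\end{GenericCode}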
\subsection[PTX]{Parallel Thread Execution}
\label{sec:ptx}
% https://docs.nvidia.com/cuda/parallel-thread-execution/
While in most cases a GPU can be programmed in a higher-level language like C++ or even Julia\footnote{\url{https://juliagpu.org/}}, it is also possible to program GPUs with the low-level language Parallel Thread Execution (PTX) developed by Nvidia. A brief overview of what PTX is and how it can be used to program GPUs is given in this section. Information in this section is taken from the PTX documentation \parencite{nvidia_parallel_2025} unless stated otherwise.
@ -134,7 +136,7 @@ Syntactically PTX resembles Assembly style code. Every PTX code must have a \ver
\begin{GenericCode}[numbers=none]
add.f32 \%n, 0.1, 0.2;
\end{GenericCode}
Loops in the classical sense do not exist in PTX. Instead, a developer needs to define jump targets for the beginning and end of the loop. Program \ref{code:ptx_loop} shows how a function with a simple loop can be implemented. The loop counts down to zero from the passed parameter $N$, which is loaded into the register \%n in line 6. If the value in the register \%n reaches zero, the loop branches at line 9 to the jump target at line 12 and the loop is finished. All other directives used, as well as further information on writing PTX code, can be found in the PTX documentation \parencite{nvidia_parallel_2025}.
\begin{program}
\begin{GenericCode}
@ -157,11 +159,12 @@ Done:
\end{program}
\section{Compilers}
\label{sec:compilers}
Compilers are a necessary tool for many developers: if a developer wants to run their program, it is very likely they need one. As best described by \textcite{aho_compilers_2006} in their dragon book, a compiler takes code written by a human in some source language and translates it into a destination language readable by a computer. This section briefly explores what compilers are and the research done in this long-established field of computer science. Furthermore, the topics of transpilers and interpreters are explored, as their use-cases are very similar.
\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts: the analyser, also called the frontend, and the synthesiser, also called the backend. The frontend is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. Generating code in the target language from the intermediate representation is the job of the backend. This target code can be assembly or anything else that is needed for a specific use-case. The intermediate representation also makes it simple to swap out frontends or backends. The GNU Compiler Collection \parencite{gcc_gcc_2025} takes advantage of different frontends to provide support for many languages including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile code for virtual machines instead. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the Low Level Virtual Machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is the ability for one program to be run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are compiled into their respective bytecode. This bytecode can then be interpreted or compiled to physical machine code and then be run. According to the JVM specification \parencite{lindholm_java_2025}, Java bytecode is interpreted and also compiled with a just-in-time (JIT) compiler to increase the performance of code blocks that are executed often. On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023}, the CLR always compiles the bytecode to physical machine code using a JIT compiler before it is executed.
A grammar describes how a language is structured. It not only describes the structure of natural language, but it can also be used to describe the structure of a programming language. \textcite{chomsky_certain_1959} found that grammars can be grouped into four levels, with regular and context-free grammars being the most relevant for programming languages. A regular grammar is of the structure $A = a\,|\,a\,B$, which is called a rule. The symbols $A$ and $B$ are non-terminal symbols and $a$ is a terminal symbol. A non-terminal symbol stands for another rule with the same structure and must only occur after a terminal symbol. Terminal symbols are fixed symbols or a value that can be found in the input stream, like literals in programming languages. Context-free grammars are more complex and are of the structure $A = \beta$. In this context, $\beta$ stands for any combination of terminal and non-terminal symbols. Therefore, a rule like $A = a\,|\,a\,B\,a$ is allowed with this grammar level. This shows that enclosing structures are possible with context-free grammars. To write grammars for programming languages, other properties are also important in order to efficiently validate or parse input defined by the grammar. However, these are not discussed here, but are described by \textcite{aho_compilers_2006}. They also describe how generating a parser from a grammar can be automated. This automation can be performed by parser generators like Yacc \parencite{johnson_yacc_1975} as described in their book. More modern alternatives are Bison\footnote{\url{https://www.gnu.org/software/bison/}} or Antlr\footnote{\url{https://www.antlr.org/}}. Before the parser can validate the input stream, a scanner is needed, as described by \textcite{cooper_engineering_2022}. The scanner reads every character of the input stream, removes white-spaces and ensures only valid characters and words are present. Flex\footnote{\url{https://github.com/westes/flex}} is a tool for generating such scanners and is often used in combination with Bison. A simplified version of the compiler architecture using Flex and Bison is depicted in Figure \ref{fig:compiler_layout}. It shows how source code is taken and transformed into the intermediate representation by the frontend, and how it is converted into executable machine code by the backend.
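As a small constructed example (not taken from the cited sources), digit strings can be described by a regular grammar, while nested parentheses need the context-free form:
\[
N = d \,|\, d\,N \qquad\qquad P = (\,) \,|\, (\,P\,)
\]
The rule for $N$ matches any non-empty string of digits $d$ and stays within the regular structure $A = a\,|\,a\,B$, whereas $P$ can only be expressed at the context-free level, because the non-terminal $P$ appears enclosed between two terminal symbols.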
\begin{figure}
\centering
BIN
thesis/images/component_diagram_interpreter.png
Normal file
After Width: | Height: | Size: 88 KiB |
BIN
thesis/images/component_diagram_transpiler.png
Normal file
After Width: | Height: | Size: 91 KiB |
BIN
thesis/images/input_output_explanation.png
Normal file
After Width: | Height: | Size: 154 KiB |
BIN
thesis/images/kernel_architecture.png
Normal file
After Width: | Height: | Size: 54 KiB |
BIN
thesis/images/pre-processing_result.png
Normal file
After Width: | Height: | Size: 19 KiB |
BIN
thesis/main.pdf
@ -31,7 +31,7 @@
% Title page entries
%%%-----------------------------------------------------------------------------
\title{Interpreter and Transpiler for Simple Expressions on Nvidia GPUs using Julia}
\author{Daniel Roth}
\programname{Software Engineering}