Compare commits: 742a544e1a ... 0-initial- (28 commits)
Commits (SHA1 only; author and date columns are empty in this capture):

6bcc9000b1
88ee8d20bd
47dcc29b33
d7e18f183d
3d80ae95e4
5b31fbb270
2ba1fef5ba
6d3c3164cf
7121329a17
327e4ebf1b
2c8a9cd2d8
aaa3f2c7c0
ae03ebdf1d
6b7205e026
b69a3efe96
18d89e27ca
e8e457eae9
c4187a131e
101b13e7e7
e571fa5bd6
ad2eab2e0a
b40a06af3f
210831146a
90a4194283
0d888edc52
4fe9040a6f
bc49b33149
293c5f13a4
other/excessive_memory_transfer.drawio (new file, 190 lines)
@@ -0,0 +1,190 @@
[draw.io XML for a diagram contrasting two data layouts. Two identical 400x120 grids of cells are shown. In the upper grid, captioned "Array of Elements:", three adjacent cells labelled "Elem" are highlighted by a single green box. In the lower grid, captioned "Array of Pointers:", the "Elem" cells sit in three separate locations, each highlighted by its own green box, illustrating the scattered accesses (and hence excessive memory transfers) caused by pointer indirection.]
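In practice the difference between the two layouts is how many separate transfers (or an extra gather step) are needed to move the data to the GPU. The following is a hypothetical host-side illustration using NumPy; the names are invented for this sketch and are not taken from the repository.

```python
import numpy as np

# Array of elements: one contiguous block, movable to the device in a single copy.
array_of_elements = np.arange(12, dtype=np.float32)

# "Array of pointers": separately allocated blocks reached through references;
# copying them needs one transfer per block or a gather into a staging buffer first.
array_of_pointers = [np.array([float(i)], dtype=np.float32) for i in range(12)]

print(1, "transfer for the contiguous layout,",
      len(array_of_pointers), "for the pointer layout")
```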
other/expr_ast.drawio (new file, 58 lines)
@@ -0,0 +1,58 @@
[draw.io XML for an expression-tree diagram. The expression "1 + x1 * log(p1)" is decomposed into three nodes: Node 1 holds "+" with operands "1" and a reference to Node 2, Node 2 holds "*" with operands "x1" and a reference to Node 3, and Node 3 holds "log" applied to "p1".]
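To make the node structure concrete, here is a hypothetical Python sketch; the node layout is inferred from the diagram and not taken from the repository. It stores 1 + x1 * log(p1) as three nodes, each holding an operator and operands that are constants, variables or references to other nodes, and evaluates them recursively.

```python
import math

# Hypothetical node encoding inferred from the diagram: each node is
# (operator, operand_a, operand_b); operands are ("const", v), ("var", name),
# ("node", index), or None for unary operators.
nodes = [
    ("+",   ("const", 1.0), ("node", 1)),   # Node 1: 1 + <Node 2>
    ("*",   ("var", "x1"),  ("node", 2)),   # Node 2: x1 * <Node 3>
    ("log", ("var", "p1"),  None),          # Node 3: log(p1)
]

def eval_operand(operand, env):
    kind, value = operand
    if kind == "const":
        return value
    if kind == "var":
        return env[value]
    return eval_node(value, env)             # kind == "node"

def eval_node(index, env):
    operator, a, b = nodes[index]
    if operator == "+":
        return eval_operand(a, env) + eval_operand(b, env)
    if operator == "*":
        return eval_operand(a, env) * eval_operand(b, env)
    if operator == "log":
        return math.log(eval_operand(a, env))
    raise ValueError(f"unknown operator {operator}")

# 1 + x1 * log(p1) with x1 = 2 and p1 = e gives 1 + 2 * 1 = 3.
assert abs(eval_node(0, {"x1": 2.0, "p1": math.e}) - 3.0) < 1e-9
```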
other/interpreter_sequence_diagram.drawio (new file, 174 lines)
@@ -0,0 +1,174 @@
[draw.io XML for a UML sequence diagram of the interpreter. Lifelines: Interpreter, Pre-Processing and GPU. The interpreter is entered via interpret(expressions, variables, parameters). In a loop "for each expression" it calls expr_to_postfix(expr): ExpressionElement[] on Pre-Processing and receives the intermediate_representation. It then issues send_data(variables), send_data(parameters) and send_data(intermediate_representations) to the GPU. In a loop "for each intermediate_representation" it launches kernel(intermediate_representation, variables, parameters) on the GPU, then calls read_results() and finally returns the resultMatrix.]
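The sequence fixes only the order of calls between the host and the GPU. A rough, hypothetical sketch of that control flow follows; only the call names expr_to_postfix, send_data, kernel and read_results come from the diagram, and every implementation below is an invented stub.

```python
def expr_to_postfix(expr):
    """Placeholder for the pre-processing step; here it returns the expression unchanged."""
    return expr

class FakeGPU:
    """Stand-in for the GPU lifeline; it records calls instead of launching kernels."""
    def __init__(self):
        self.calls = []

    def send_data(self, data):
        self.calls.append(("send_data", data))

    def kernel(self, intermediate_representation, variables, parameters):
        self.calls.append(("kernel", intermediate_representation))

    def read_results(self):
        # Stand-in for the result matrix: one row per launched kernel.
        return [call[1] for call in self.calls if call[0] == "kernel"]

def interpret(expressions, variables, parameters, gpu):
    # Loop "for each expression": build the postfix intermediate representation.
    intermediate_representations = [expr_to_postfix(e) for e in expressions]

    # Transfer inputs to the GPU (three send_data calls, as in the diagram).
    gpu.send_data(variables)
    gpu.send_data(parameters)
    gpu.send_data(intermediate_representations)

    # Loop "for each intermediate_representation": one kernel launch each.
    for ir in intermediate_representations:
        gpu.kernel(ir, variables, parameters)

    # read_results() returns the resultMatrix.
    return gpu.read_results()

print(interpret(["x1 + 2.5"], [[1.0]], [[0.5]], FakeGPU()))
```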
other/memory_layout_data.drawio (new file, 187 lines)
@@ -0,0 +1,187 @@
[draw.io XML for a memory-layout diagram. One grid contains three rows of cells: a row labelled "expr elem", a row labelled "var" and a row labelled "param". Thicker vertical lines subdivide each row into groups, suggesting that expression elements, variables and parameters are each stored as one flat, contiguous array divided into per-expression and per-data-point groups.]
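A flat layout like the one in this diagram can be emulated on the host side as follows; this is a hypothetical illustration only, with invented names and group sizes.

```python
import numpy as np

# Hypothetical illustration of a flat layout: all variable sets are packed into one
# contiguous buffer and addressed by offset.
variable_sets = [[1.0, 2.0, 3.0],   # variables of data point 0
                 [4.0, 5.0, 6.0],   # variables of data point 1
                 [7.0, 8.0, 9.0]]   # variables of data point 2

vars_per_point = len(variable_sets[0])
flat = np.asarray(variable_sets, dtype=np.float32).reshape(-1)  # one contiguous array

def variable(data_point: int, index: int) -> np.float32:
    """Read variable `index` of `data_point` from the flat buffer."""
    return flat[data_point * vars_per_point + index]

assert variable(1, 2) == np.float32(6.0)
```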
@@ -1,37 +1,37 @@
[draw.io XML diff for an existing diagram; the file name is not shown in this capture. The diagram shows an expression consisting of "X1", a constant and "+" in postfix order, each element annotated ("Type: Variable, Value: 1", "Type: Constant, Value: ...", "Type: Operator, Value: Addition"), next to the same expression in infix order. The change replaces the constant 2 with 2.5 in both the cells and the annotation, bumps the draw.io version from 26.2.5 to 26.2.13, updates the canvas offsets (dx/dy), and reorders some XML attributes without changing their values.]
other/pre-processing_result_impl.drawio (new file, 40 lines)
@@ -0,0 +1,40 @@
[draw.io XML for the implementation view of the pre-processing result. The expression "X1 + 2.5" is shown in infix form next to its postfix form "X1 2.5 +". Each postfix element is annotated with its stored representation: "Type: Variable, Value: 1", "Type: Constant, Value: 1075838976" and "Type: Operator, Value: 1".]
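The annotated value 1075838976 is exactly the 32-bit IEEE-754 bit pattern of 2.5 (0x40200000), which suggests that constants are stored bit-reinterpreted as integers so that every element can share the same integer value field. A small, hypothetical illustration follows; the helper names are not from the repository.

```python
import struct

def float_to_bits(value: float) -> int:
    """Reinterpret a 32-bit float as an unsigned integer (hypothetical helper name)."""
    return struct.unpack("<I", struct.pack("<f", value))[0]

def bits_to_float(bits: int) -> float:
    """Inverse reinterpretation, as a kernel would have to do before using the constant."""
    return struct.unpack("<f", struct.pack("<I", bits))[0]

assert float_to_bits(2.5) == 1075838976   # matches the "Value: 1075838976" annotation
assert bits_to_float(1075838976) == 2.5
```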
other/transpiler_sequence_diagram.drawio (new file, 168 lines)
@@ -0,0 +1,168 @@
[draw.io XML for a UML sequence diagram of the transpiler (the capture ends partway through this file). Lifelines: Transpiler, Pre-Processing and GPU. As in the interpreter diagram, a loop "for each expression" calls expr_to_postfix(expr): ExpressionElement[] on Pre-Processing and receives the intermediate_representation. Variables are sent to the GPU with send_data(variables), and in a loop "for each kernel" the transpiler launches kernel(variables, parameters); the intermediate representation is no longer a kernel argument, presumably because each expression has already been transpiled into its own kernel. read_results() then returns the resultMatrix.]
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="720" y="255" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-23" value="" style="endArrow=none;dashed=1;html=1;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-27" target="gMhPBGUGI9FZGhFn2pCe-24">
|
||||
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||
<mxPoint x="700" y="349" as="sourcePoint" />
|
||||
<mxPoint x="700" y="120" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-24" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={"curved":0,"rounded":0};" vertex="1" parent="1">
|
||||
<mxGeometry x="680" y="310" width="10" height="20" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-25" value="send_data(parameters)" style="html=1;verticalAlign=bottom;endArrow=block;curved=0;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-27">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="750" y="288" as="sourcePoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-26" value="" style="html=1;verticalAlign=bottom;endArrow=open;dashed=1;endSize=8;curved=0;rounded=0;exitX=0;exitY=1;exitDx=0;exitDy=-5;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-27" target="gMhPBGUGI9FZGhFn2pCe-4">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="750" y="358" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-27" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={"curved":0,"rounded":0};" vertex="1" parent="1">
|
||||
<mxGeometry x="680" y="340" width="10" height="20" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-31" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-17" target="gMhPBGUGI9FZGhFn2pCe-27">
|
||||
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||
<mxPoint x="685" y="330" as="sourcePoint" />
|
||||
<mxPoint x="710" y="290" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-33" value="<div>transpile(expressions,</div><div>variables, parameters)</div>" style="html=1;verticalAlign=bottom;startArrow=circle;startFill=1;endArrow=open;startSize=6;endSize=8;curved=0;rounded=0;" edge="1" parent="1" target="gMhPBGUGI9FZGhFn2pCe-4">
|
||||
<mxGeometry x="0.1057" width="80" relative="1" as="geometry">
|
||||
<mxPoint x="172" y="130" as="sourcePoint" />
|
||||
<mxPoint x="295" y="130" as="targetPoint" />
|
||||
<mxPoint as="offset" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-34" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=default;" vertex="1" parent="1">
|
||||
<mxGeometry x="180" y="500" width="20" height="20" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-35" value="" style="endArrow=open;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;dashed=1;endFill=0;" edge="1" parent="1" source="gMhPBGUGI9FZGhFn2pCe-4" target="gMhPBGUGI9FZGhFn2pCe-34">
|
||||
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||
<mxPoint x="230" y="640" as="sourcePoint" />
|
||||
<mxPoint x="280" y="590" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-36" value="resultMatrix" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="gMhPBGUGI9FZGhFn2pCe-35">
|
||||
<mxGeometry x="0.1271" relative="1" as="geometry">
|
||||
<mxPoint x="8" y="-10" as="offset" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-38" value="" style="endArrow=none;dashed=1;html=1;rounded=0;" edge="1" parent="1" target="gMhPBGUGI9FZGhFn2pCe-5">
|
||||
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||
<mxPoint x="545" y="520" as="sourcePoint" />
|
||||
<mxPoint x="545" y="280" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-53" value="" style="html=1;points=[[0,0,0,0,5],[0,1,0,0,-5],[1,0,0,0,5],[1,1,0,0,-5]];perimeter=orthogonalPerimeter;outlineConnect=0;targetShapes=umlLifeline;portConstraint=eastwest;newEdgeStyle={"curved":0,"rounded":0};" vertex="1" parent="1">
|
||||
<mxGeometry x="310" y="243" width="10" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="gMhPBGUGI9FZGhFn2pCe-54" value="transpile(intermediate_representation): Kernel" style="html=1;align=left;spacingLeft=2;endArrow=block;rounded=0;edgeStyle=orthogonalEdgeStyle;curved=0;rounded=0;" edge="1" target="gMhPBGUGI9FZGhFn2pCe-53" parent="1">
|
||||
<mxGeometry x="-0.005" relative="1" as="geometry">
|
||||
<mxPoint x="315" y="223" as="sourcePoint" />
|
||||
<Array as="points">
|
||||
<mxPoint x="345" y="253" />
|
||||
</Array>
|
||||
<mxPoint as="offset" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
</mxfile>
|
@ -5,12 +5,16 @@ version = "1.0.0-DEV"
|
||||
|
||||
[deps]
|
||||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
|
||||
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
|
||||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
|
||||
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
|
||||
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
|
||||
|
||||
[compat]
|
||||
DelimitedFiles = "1.9.1"
|
||||
GZip = "0.6.2"
|
||||
LinearAlgebra = "1.11.0"
|
||||
Printf = "1.11.0"
|
||||
Random = "1.11.0"
|
||||
|
@ -9,9 +9,10 @@ include("Code.jl")
|
||||
include("CpuInterpreter.jl")
|
||||
end
|
||||
|
||||
using ..ExpressionProcessing
|
||||
|
||||
export interpret_gpu,interpret_cpu
|
||||
export evaluate_gpu
|
||||
export test
|
||||
|
||||
# Some assertions:
|
||||
# Variables and parameters are numbered starting at "1", meaning the first variable/parameter has to be "x1"/"p1" and not "x0"/"p0"
|
||||
@ -26,8 +27,12 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
|
||||
ncols = size(X, 2)
|
||||
|
||||
results = Matrix{Float32}(undef, ncols, length(exprs))
|
||||
# TODO: create CuArray for variables here already, as they never change
|
||||
# could/should be done even before calling this, but I guess it would be diminishing returns
|
||||
# TODO: test how this would impact performance, if it gets faster, adapt implementation section
|
||||
# TODO: create CuArray for expressions here already. They also do not change over the course of parameter optimisation, and therefore a lot of unnecessary calls to expr_to_postfix can be saved (even though a cache is used, this should still be faster)
|
||||
|
||||
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
|
||||
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
|
||||
results = Interpreter.interpret(exprs, X, p)
|
||||
end
|
||||
|
||||
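The TODO above suggests uploading the variables once, outside the repetition loop; a minimal sketch of that idea (illustrative only — it assumes a hypothetical interpret variant accepting an already-uploaded CuArray, which the current signature does not provide):
# Sketch of the TODO above (assumed variant, not the current API):
cudaVars = CuArray(X)                                    # X never changes during parameter tuning
for i in 1:repetitions
    results = Interpreter.interpret(exprs, cudaVars, p)  # hypothetical signature taking a CuArray
end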
@ -40,8 +45,12 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
|
||||
ncols = size(X, 2)
|
||||
|
||||
results = Matrix{Float32}(undef, ncols, length(exprs))
|
||||
# TODO: create CuArray for variables here already, as they never change
|
||||
# could/should be done even before calling this, but I guess it would be diminishing returns
|
||||
# TODO: test how this would impact performance, if it gets faster, adapt implementation section
|
||||
# TODO: create CuArray for expressions here already. They also do not change over the course of parameter optimisation, and therefore a lot of unnecessary calls to expr_to_postfix can be saved (even though a cache is used, this should still be faster)
|
||||
|
||||
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
|
||||
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl)
|
||||
results = Transpiler.evaluate(exprs, X, p)
|
||||
end
|
||||
|
||||
@ -50,21 +59,34 @@ end
|
||||
|
||||
|
||||
# Evaluate Expressions on the CPU
|
||||
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
|
||||
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1, parallel=false)::Matrix{Float32}
|
||||
@assert axes(exprs) == axes(p)
|
||||
nrows = size(X, 1)
|
||||
|
||||
# each column of the matrix has the result for an expr
|
||||
res = Matrix{Float32}(undef, nrows, length(exprs))
|
||||
|
||||
for i in eachindex(exprs)
|
||||
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
||||
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
||||
if parallel
|
||||
Threads.@threads for i in eachindex(exprs)
|
||||
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
||||
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
||||
|
||||
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
||||
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
||||
for rep in 1:repetitions
|
||||
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
||||
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
||||
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
||||
for rep in 1:repetitions
|
||||
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
||||
end
|
||||
end
|
||||
else
|
||||
for i in eachindex(exprs)
|
||||
# The interpreter holds the postfix code and buffers for evaluation. It is costly to create
|
||||
interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))
|
||||
|
||||
# If an expression has to be evaluated multiple times (e.g. for different parameters),
|
||||
# it is worthwhile to reuse the interpreter to reduce the number of allocations
|
||||
for rep in 1:repetitions
|
||||
CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
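A hedged usage example of the new parallel keyword (expressions in the CPU form, i.e. (x, p) -> ... with array refs; requires starting Julia with several threads, e.g. julia -t 6):
res = interpret_cpu(exprs, X, p; repetitions=100, parallel=true)  # one column of res per expression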
|
@ -2,26 +2,63 @@ module ExpressionProcessing
|
||||
|
||||
export expr_to_postfix, is_binary_operator
|
||||
export PostfixType
|
||||
export Operator, ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER, ABS, LOG, EXP, SQRT
|
||||
export ElementType, EMPTY, FLOAT32, OPERATOR, INDEX
|
||||
export Operator, ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER, ABS, LOG, EXP, SQRT, INV
|
||||
export ElementType, EMPTY, FLOAT32, OPERATOR, VARIABLE, PARAMETER
|
||||
export ExpressionElement
|
||||
|
||||
@enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9
|
||||
@enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 INDEX=3
|
||||
@enum Operator ADD=1 SUBTRACT=2 MULTIPLY=3 DIVIDE=4 POWER=5 ABS=6 LOG=7 EXP=8 SQRT=9 INV=10
|
||||
@enum ElementType EMPTY=0 FLOAT32=1 OPERATOR=2 VARIABLE=3 PARAMETER=4
|
||||
|
||||
const binary_operators = [ADD, SUBTRACT, MULTIPLY, DIVIDE, POWER]
|
||||
const unary_operators = [ABS, LOG, EXP, SQRT]
|
||||
|
||||
struct ExpressionElement
|
||||
Type::ElementType
|
||||
Value::Int32 # Reinterpret the stored value to type "ElementType" when using it
|
||||
Value::UInt32 # Reinterpret the stored value according to the element's Type when using it
|
||||
end
|
||||
|
||||
const PostfixType = Vector{ExpressionElement}
|
||||
|
||||
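Since Value is a raw UInt32, every payload is packed via reinterpret; an illustrative sketch (not part of the diff):
lit = ExpressionElement(FLOAT32, reinterpret(UInt32, 1.5f0))  # literal 1.5
var = ExpressionElement(VARIABLE, UInt32(2))                  # x2
op  = ExpressionElement(OPERATOR, reinterpret(UInt32, ADD))   # '+'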
"
|
||||
Converts a julia expression to its postfix notation.
|
||||
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision
|
||||
NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precision.
|
||||
NOTE: This function is not thread safe; in particular, the cache access is not thread safe
|
||||
"
|
||||
function expr_to_postfix(expr::Expr)::PostfixType
|
||||
function expr_to_postfix(expression::Expr)::PostfixType
|
||||
expr = expression
|
||||
if expression.head === :->
|
||||
# if typeof(expression.args[2]) == Float64
|
||||
# println()
|
||||
# println("Expression: $expression")
|
||||
# println("Expr: $expr")
|
||||
# println()
|
||||
# dump(expression; maxdepth=10)
|
||||
# end
|
||||
# if the expression has the form (x, p) -> (...), the statements below extract the inner expression to evaluate
|
||||
if typeof(expression.args[2]) == Float64
|
||||
return [convert_to_ExpressionElement(expression.args[2])]
|
||||
elseif expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped
|
||||
expr = expression.args[2].args[2]
|
||||
else # ... if they are generated with the parser, this node is not present and therefore doesn't need to be skipped
|
||||
expr = expression.args[2]
|
||||
end
|
||||
end
|
||||
|
||||
# if haskey(cache, expr)
|
||||
# return cache[expr]
|
||||
# end
|
||||
|
||||
postfix = PostfixType()
|
||||
|
||||
# Special handling in the case where the expression is an array access
|
||||
# This can happen if the token is a variable/parameter of the form x[n]/p[n]
|
||||
if expr.head == :ref
|
||||
exprElement = convert_to_ExpressionElement(expr.args[1], expr.args[2]) # we assume that an array access never contains an expression, as this would not make much sense in this case
|
||||
push!(postfix, exprElement)
|
||||
# cache[expr] = postfix
|
||||
return postfix
|
||||
end
|
||||
|
||||
@inbounds operator = get_operator(expr.args[1])
|
||||
|
||||
@inbounds for j in 2:length(expr.args)
|
||||
@ -29,16 +66,16 @@ function expr_to_postfix(expr::Expr)::PostfixType
|
||||
|
||||
if typeof(arg) === Expr
|
||||
append!(postfix, expr_to_postfix(arg))
|
||||
elseif typeof(arg) === Symbol # variables/parameters
|
||||
# maybe TODO: replace the parameters with their respective values, as this might make the expr evaluation faster
|
||||
exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
|
||||
elseif typeof(arg) === Symbol # variables/parameters of the form xn/pn
|
||||
exprElement = convert_to_ExpressionElement(arg)
|
||||
push!(postfix, exprElement)
|
||||
else
|
||||
exprElement = convert_to_ExpressionElement(convert(Float32, arg))
|
||||
push!(postfix, exprElement)
|
||||
end
|
||||
|
||||
# only add operator if at least 2 values are added. For the case where another expression is added first, we check if we are at the first iteration or not ( j != 2)
|
||||
# only add operator if at least 2 values are added. Needed because e.g. multiple consecutive additions are one subtree with one operator, but multiple operators need to be added to the postfix notation.
|
||||
# For the case where another expression has already been added to the final postfix notation, we check if we are at the first iteration or not ( j != 2)
|
||||
if length(postfix) >= 2 && j != 2
|
||||
exprElement = convert_to_ExpressionElement(operator)
|
||||
push!(postfix, exprElement)
|
||||
@ -46,9 +83,11 @@ function expr_to_postfix(expr::Expr)::PostfixType
|
||||
end
|
||||
|
||||
# For the case where this expression has an operator that only takes a single value, like "abs(x)"
|
||||
if length(postfix) == 1
|
||||
if operator in unary_operators
|
||||
push!(postfix, convert_to_ExpressionElement(operator))
|
||||
end
|
||||
|
||||
# cache[expr] = postfix
|
||||
return postfix
|
||||
end
|
||||
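For orientation, a hedged example of the conversion performed by expr_to_postfix (some revisions also take a cache Dict as a second argument):
postfix = expr_to_postfix(:(x1 + 2.0 * p1))
# roughly: [VARIABLE x1, FLOAT32 2.0, PARAMETER p1, MULTIPLY, ADD]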
|
||||
@ -63,6 +102,8 @@ function get_operator(op::Symbol)::Operator
|
||||
return DIVIDE
|
||||
elseif op == :^
|
||||
return POWER
|
||||
elseif op == :powabs
|
||||
return POWER # TODO: Fix this
|
||||
elseif op == :abs
|
||||
return ABS
|
||||
elseif op == :log
|
||||
@ -71,45 +112,47 @@ function get_operator(op::Symbol)::Operator
|
||||
return EXP
|
||||
elseif op == :sqrt
|
||||
return SQRT
|
||||
elseif op == :powabs
|
||||
return POWER # TODO: Fix this
|
||||
elseif op == :inv
|
||||
return INV
|
||||
else
|
||||
throw("Operator unknown")
|
||||
throw("Operator unknown. Operator was $op")
|
||||
end
|
||||
end
|
||||
|
||||
"Extracts the number from a variable/parameter and returns it. If the symbol is a parameter ```pn```, the resulting value will be negativ.
|
||||
"parses a symbol to be either a variable or a parameter and returns the corresponding Expressionelement"
|
||||
function convert_to_ExpressionElement(element::Symbol)::ExpressionElement
|
||||
varStr = String(element)
|
||||
index = parse(UInt32, SubString(varStr, 2))
|
||||
|
||||
```x0 and p0``` are not allowed."
|
||||
function convert_var_to_int(var::Symbol)::Int32
|
||||
varStr = String(var)
|
||||
number = parse(Int32, SubString(varStr, 2))
|
||||
|
||||
if varStr[1] == 'p'
|
||||
number = -number
|
||||
if varStr[1] == 'x'
|
||||
return ExpressionElement(VARIABLE, index)
|
||||
elseif varStr[1] == 'p'
|
||||
return ExpressionElement(PARAMETER, index)
|
||||
else
|
||||
throw("Cannot parse symbol to be either a variable or a parameter. Symbol was '$varStr'")
|
||||
end
|
||||
end
|
||||
"parses a symbol to be either a variable or a parameter and returns the corresponding Expressionelement"
|
||||
function convert_to_ExpressionElement(element::Symbol, index::Integer)::ExpressionElement
|
||||
if element == :x
|
||||
return ExpressionElement(VARIABLE, convert(UInt32, index))
|
||||
elseif element == :p
|
||||
return ExpressionElement(PARAMETER, convert(UInt32, index))
|
||||
else
|
||||
throw("Cannot parse symbol to be either a variable or a parameter. Symbol was '$varStr'")
|
||||
end
|
||||
|
||||
return number
|
||||
end
|
||||
|
||||
function convert_to_ExpressionElement(element::Int32)::ExpressionElement
|
||||
value = reinterpret(Int32, element)
|
||||
return ExpressionElement(INDEX, value)
|
||||
end
|
||||
function convert_to_ExpressionElement(element::Int64)::ExpressionElement
|
||||
value = reinterpret(Int32, convert(Int32, element))
|
||||
return ExpressionElement(INDEX, value)
|
||||
end
|
||||
function convert_to_ExpressionElement(element::Float32)::ExpressionElement
|
||||
value = reinterpret(Int32, element)
|
||||
value = reinterpret(UInt32, element)
|
||||
return ExpressionElement(FLOAT32, value)
|
||||
end
|
||||
function convert_to_ExpressionElement(element::Float64)::ExpressionElement
|
||||
value = reinterpret(Int32, convert(Float32, element))
|
||||
value = reinterpret(UInt32, convert(Float32, element))
|
||||
return ExpressionElement(FLOAT32, value)
|
||||
end
|
||||
function convert_to_ExpressionElement(element::Operator)::ExpressionElement
|
||||
value = reinterpret(Int32, element)
|
||||
value = reinterpret(UInt32, element)
|
||||
return ExpressionElement(OPERATOR, value)
|
||||
end
|
||||
|
||||
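Hedged examples of the two symbol-parsing paths added above:
convert_to_ExpressionElement(:x3)    # -> ExpressionElement(VARIABLE, 3)
convert_to_ExpressionElement(:p, 2)  # -> ExpressionElement(PARAMETER, 2), i.e. p[2]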
|
@ -11,9 +11,9 @@ export interpret
|
||||
- expressions::Vector{ExpressionProcessing.PostfixType} : The expressions to execute in postfix form
|
||||
- variables::Matrix{Float32} : The variables to use. Each column is mapped to the variables x1..xn
|
||||
- parameters::Vector{Vector{Float32}} : The parameters to use. Each Vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
|
||||
- kwparam ```frontendCache```: The cache that stores the (partial) results of the frontend
|
||||
"
|
||||
function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
|
||||
|
||||
exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
|
||||
@inbounds for i in eachindex(expressions)
|
||||
exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
|
||||
@ -22,85 +22,84 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
|
||||
variableCols = size(variables, 2) # number of variable sets to use for each expression
|
||||
cudaVars = CuArray(variables)
|
||||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
|
||||
cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
|
||||
# put into separate cuArray, as this is static and would be inefficient to send separately to every kernel
|
||||
cudaStepsize = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
|
||||
cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression;
|
||||
# put into separate cuArray, as this is static and would be inefficient to send separately to each kernel
|
||||
cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
|
||||
|
||||
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
||||
cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
|
||||
|
||||
# Start kernel for each expression to ensure that no warp is working on different expressions
|
||||
@inbounds for i in eachindex(exprs)
|
||||
kernel = @cuda launch=false fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
|
||||
# config = launch_configuration(kernel.fun)
|
||||
threads = min(variableCols, 128)
|
||||
blocks = cld(variableCols, threads)
|
||||
@inbounds Threads.@threads for i in eachindex(exprs)
|
||||
numThreads = min(variableCols, 256)
|
||||
numBlocks = cld(variableCols, numThreads)
|
||||
|
||||
kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
|
||||
@cuda threads=numThreads blocks=numBlocks fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
|
||||
end
|
||||
|
||||
return cudaResults
|
||||
end
|
||||
|
||||
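A quick worked example of the launch configuration above (numbers chosen for illustration):
variableCols = 1000
numThreads   = min(variableCols, 256)   # 256
numBlocks    = cld(variableCols, 256)   # 4 -> 1024 threads; the guard clause in the kernel discards the extra 24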
#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
|
||||
const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
|
||||
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
|
||||
varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
|
||||
@inbounds variableCols = length(variables) / stepsize[2]
|
||||
@inbounds variableCols = length(variables) / stepsize[3] # number of variable sets
|
||||
|
||||
if varSetIndex > variableCols
|
||||
return
|
||||
end
|
||||
|
||||
@inbounds firstParamIndex = ((exprIndex - 1) * stepsize[1]) # Exclusive
|
||||
@inbounds firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
|
||||
@inbounds lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
|
||||
@inbounds firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
|
||||
# TODO: Use @cuDynamicSharedMem/@cuStaticSharedMem for variables and or parameters
|
||||
|
||||
operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
|
||||
operationStackTop = 0 # stores index of the last defined/valid value
|
||||
|
||||
@inbounds firstVariableIndex = ((varSetIndex-1) * stepsize[2]) # Exclusive
|
||||
@inbounds firstVariableIndex = ((varSetIndex-1) * stepsize[3]) # Exclusive
|
||||
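# Illustrative index arithmetic (assumed values): with stepsize = [20, 5, 3]
# (max expr length, max #parameters, #variables per set), expression 2 spans
# expressions[21:40], its parameters start after offset 5, and variable set 4
# starts after offset (4-1)*3 = 9 in the flattened variables array.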
|
||||
@inbounds for expr in expressions
|
||||
if expr.Type == EMPTY
|
||||
@inbounds for i in firstExprIndex:lastExprIndex
|
||||
token = expressions[i]
|
||||
if token.Type == EMPTY
|
||||
break
|
||||
elseif expr.Type == INDEX
|
||||
val = expr.Value
|
||||
elseif token.Type == VARIABLE
|
||||
operationStackTop += 1
|
||||
|
||||
if val > 0
|
||||
operationStack[operationStackTop] = variables[firstVariableIndex + val]
|
||||
else
|
||||
val = abs(val)
|
||||
operationStack[operationStackTop] = parameters[firstParamIndex + val]
|
||||
end
|
||||
elseif expr.Type == FLOAT32
|
||||
operationStack[operationStackTop] = variables[firstVariableIndex + token.Value]
|
||||
elseif token.Type == PARAMETER
|
||||
operationStackTop += 1
|
||||
operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
|
||||
elseif expr.Type == OPERATOR
|
||||
type = reinterpret(Operator, expr.Value)
|
||||
if type == ADD
|
||||
operationStack[operationStackTop] = parameters[firstParamIndex + token.Value]
|
||||
elseif token.Type == FLOAT32
|
||||
operationStackTop += 1
|
||||
operationStack[operationStackTop] = reinterpret(Float32, token.Value)
|
||||
elseif token.Type == OPERATOR
|
||||
opcode = reinterpret(Operator, token.Value)
|
||||
if opcode == ADD
|
||||
operationStackTop -= 1
|
||||
operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1]
|
||||
elseif type == SUBTRACT
|
||||
elseif opcode == SUBTRACT
|
||||
operationStackTop -= 1
|
||||
operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1]
|
||||
elseif type == MULTIPLY
|
||||
elseif opcode == MULTIPLY
|
||||
operationStackTop -= 1
|
||||
operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1]
|
||||
elseif type == DIVIDE
|
||||
elseif opcode == DIVIDE
|
||||
operationStackTop -= 1
|
||||
operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1]
|
||||
elseif type == POWER
|
||||
elseif opcode == POWER
|
||||
operationStackTop -= 1
|
||||
operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1]
|
||||
elseif type == ABS
|
||||
elseif opcode == ABS
|
||||
operationStack[operationStackTop] = abs(operationStack[operationStackTop])
|
||||
elseif type == LOG
|
||||
elseif opcode == LOG
|
||||
operationStack[operationStackTop] = log(operationStack[operationStackTop])
|
||||
elseif type == EXP
|
||||
elseif opcode == EXP
|
||||
operationStack[operationStackTop] = exp(operationStack[operationStackTop])
|
||||
elseif type == SQRT
|
||||
elseif opcode == SQRT
|
||||
operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
|
||||
elseif opcode == INV
|
||||
# operationStack[operationStackTop] = 1f0 / operationStack[operationStackTop]
|
||||
operationStack[operationStackTop] = inv(operationStack[operationStackTop])
|
||||
end
|
||||
else
|
||||
operationStack[operationStackTop] = NaN32
|
||||
|
@ -7,12 +7,15 @@ using ..Utils
|
||||
|
||||
const BYTES = sizeof(Float32)
|
||||
const Operand = Union{Float32, String} # Operand is either fixed value or register
|
||||
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed
|
||||
|
||||
"
|
||||
- kwparam ```frontendCache```: The cache that stores the (partial) results of the frontend, to speedup the pre-processing
|
||||
- kwparam ```frontendCache```: The cache that stores the result of the transpilation. Useful for parameter optimisation, as the same expression gets executed multiple times
|
||||
"
|
||||
function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
|
||||
varRows = size(variables, 1)
|
||||
variableCols = size(variables, 2)
|
||||
kernels = Vector{CuFunction}(undef, length(expressions))
|
||||
# kernels = Vector{CuFunction}(undef, length(expressions))
|
||||
|
||||
# TODO: test this again with multiple threads. The first time I tried, I was using only one thread
|
||||
# Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
|
||||
@ -20,8 +23,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
||||
# cacheLock = ReentrantLock()
|
||||
# cacheHit = false
|
||||
# lock(cacheLock) do
|
||||
# if haskey(cache, expressions[i])
|
||||
# kernels[i] = cache[expressions[i]]
|
||||
# if haskey(transpilerCache, expressions[i])
|
||||
# kernels[i] = transpilerCache[expressions[i]]
|
||||
# cacheHit = true
|
||||
# end
|
||||
# end
|
||||
@ -42,43 +45,43 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
|
||||
# mod = CuModule(image)
|
||||
# kernels[i] = CuFunction(mod, "ExpressionProcessing")
|
||||
|
||||
# @lock cacheLock cache[expressions[i]] = kernels[i]
|
||||
# @lock cacheLock transpilerCache[expressions[i]] = kernels[i]
|
||||
# end
|
||||
|
||||
@inbounds for i in eachindex(expressions)
|
||||
if haskey(cache, expressions[i])
|
||||
kernels[i] = cache[expressions[i]]
|
||||
continue
|
||||
end
|
||||
|
||||
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
|
||||
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
|
||||
|
||||
linker = CuLink()
|
||||
add_data!(linker, "ExpressionProcessing", kernel)
|
||||
|
||||
image = complete(linker)
|
||||
|
||||
mod = CuModule(image)
|
||||
kernels[i] = CuFunction(mod, "ExpressionProcessing")
|
||||
cache[expressions[i]] = kernels[i]
|
||||
end
|
||||
|
||||
cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
|
||||
cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)
|
||||
|
||||
# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
|
||||
cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
|
||||
|
||||
# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
|
||||
for kernel in kernels
|
||||
# config = launch_configuration(kernels[i])
|
||||
threads = min(variableCols, 96)
|
||||
blocks = cld(variableCols, threads)
|
||||
threads = min(variableCols, 256)
|
||||
blocks = cld(variableCols, threads)
|
||||
|
||||
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
kernelName = "evaluate_gpu"
|
||||
# TODO: Implement batching as a middle ground between "transpile everything, then run everything" and "transpile one, run one", even though cudacall is async
|
||||
@inbounds Threads.@threads for i in eachindex(expressions)
|
||||
# if haskey(resultCache, expressions[i])
|
||||
# kernels[i] = resultCache[expressions[i]]
|
||||
# continue
|
||||
# end
|
||||
|
||||
formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
|
||||
kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing
|
||||
|
||||
linker = CuLink()
|
||||
add_data!(linker, kernelName, kernel)
|
||||
|
||||
image = complete(linker)
|
||||
mod = CuModule(image)
|
||||
compiledKernel = CuFunction(mod, kernelName)
|
||||
|
||||
cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
end
|
||||
|
||||
# for kernel in kernels
|
||||
# cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
|
||||
# end
|
||||
|
||||
return cudaResults
|
||||
end
|
||||
|
||||
@ -90,13 +93,13 @@ end
|
||||
- param ```expressionIndex```: The 0-based index of the expression
|
||||
"
|
||||
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
|
||||
nrOfVariableSets::Integer, expressionIndex::Integer)::String
|
||||
exitJumpLocationMarker = "\$L__BB0_2"
|
||||
nrOfVariableSets::Integer, expressionIndex::Integer, kernelName::String)::String
|
||||
exitJumpLocationMarker = "L__BB0_2"
|
||||
ptxBuffer = IOBuffer()
|
||||
regManager = Utils.RegisterManager(Dict(), Dict())
|
||||
|
||||
# TODO: Suboptimal solution
|
||||
signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
|
||||
# TODO: Suboptimal solution. get_kernel_signature should also return the name of the registers used for the parameters, so further below, we do not have to hard-code them
|
||||
signature, paramLoading = get_kernel_signature(kernelName, [Float32, Float32, Float32], regManager) # Vars, Params, Results
|
||||
guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)
|
||||
|
||||
println(ptxBuffer, get_cuda_header())
|
||||
@ -119,7 +122,7 @@ function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Int
|
||||
return generatedCode
|
||||
end
|
||||
|
||||
# TODO: Make version, target and address_size configurable; also see what address_size means exactly
|
||||
# TODO: Make version, target and address_size configurable
|
||||
function get_cuda_header()::String
|
||||
return "
|
||||
.version 8.5
|
||||
@ -140,7 +143,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType},
|
||||
println(signatureBuffer, "(")
|
||||
|
||||
for i in eachindex(parameters)
|
||||
print(signatureBuffer, " .param .u64", " ", "param_", i)
|
||||
print(signatureBuffer, " .param .u64 param_", i)
|
||||
|
||||
parametersLocation = Utils.get_next_free_register(regManager, "rd")
|
||||
println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
|
||||
@ -166,21 +169,21 @@ function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regMan
|
||||
threadsPerCTA = Utils.get_next_free_register(regManager, "r")
|
||||
currentThreadId = Utils.get_next_free_register(regManager, "r")
|
||||
|
||||
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
|
||||
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
|
||||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||
println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
|
||||
println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
|
||||
println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")
|
||||
|
||||
globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
|
||||
breakCondition = Utils.get_next_free_register(regManager, "p")
|
||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
|
||||
println(guardBuffer, "setp.gt.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index > nrOfVariableSets
|
||||
|
||||
# branch to end if breakCondition is true
|
||||
println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
|
||||
|
||||
# Convert threadIdReg to a 64-bit register. Not 64-bit from the start, as this would take up more registers. Performance tests can be performed to determine if it is faster doing this, or making everything 64-bit from the start
|
||||
threadId64Reg = Utils.get_next_free_register(regManager, "rd")
|
||||
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
|
||||
print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")
|
||||
|
||||
return (String(take!(guardBuffer)), threadId64Reg)
|
||||
end
|
||||
@ -201,7 +204,12 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
|
||||
for token in expression
|
||||
|
||||
if token.Type == FLOAT32
|
||||
push!(operands, reinterpret(Float32, token.Value))
|
||||
value = reinterpret(Float32, token.Value)
|
||||
if isfinite(value)
|
||||
push!(operands, value)
|
||||
else
|
||||
push!(operands, "0f" * string(token.Value, base = 16)) # otherwise, values like "Inf" would be written as "Inf" and therefore not understandable to the PTX compiler
|
||||
end
|
||||
elseif token.Type == OPERATOR
|
||||
operator = reinterpret(Operator, token.Value)
|
||||
|
||||
@ -216,28 +224,24 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
|
||||
|
||||
println(codeBuffer, operation)
|
||||
push!(operands, resultRegister)
|
||||
elseif token.Type == INDEX
|
||||
if token.Value > 0 # variables
|
||||
var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
|
||||
if first_access
|
||||
println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
|
||||
end
|
||||
push!(operands, var)
|
||||
else
|
||||
absVal = abs(token.Value)
|
||||
param, first_access = Utils.get_register_for_name(regManager, "p$absVal")
|
||||
if first_access
|
||||
println(codeBuffer, load_into_register(param, parametersLocation, absVal, exprId64Reg, parametersSetSize, regManager))
|
||||
end
|
||||
push!(operands, param)
|
||||
elseif token.Type == VARIABLE
|
||||
var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
|
||||
if first_access
|
||||
println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
|
||||
end
|
||||
push!(operands, var)
|
||||
elseif token.Type == PARAMETER
|
||||
param, first_access = Utils.get_register_for_name(regManager, "p$(token.Value)")
|
||||
if first_access
|
||||
println(codeBuffer, load_into_register(param, parametersLocation, token.Value, exprId64Reg, parametersSetSize, regManager))
|
||||
end
|
||||
push!(operands, param)
|
||||
else
|
||||
throw("Token unkown. Token was '$(token)'")
|
||||
end
|
||||
end
|
||||
|
||||
tempReg = Utils.get_next_free_register(regManager, "rd")
|
||||
# reg = pop!(operands)
|
||||
# tmp = "abs.f32 $(reg), 16.0;"
|
||||
# push!(operands, reg)
|
||||
println(codeBuffer, "
|
||||
add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
|
||||
mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
|
||||
@ -264,6 +268,7 @@ function load_into_register(register::String, loadLocation::String, valueIndex::
|
||||
mad.lo.u64 $tempReg, $setIndexReg64, $(setSize*BYTES), $((valueIndex - 1) * BYTES);
|
||||
add.u64 $tempReg, $loadLocation, $tempReg;
|
||||
ld.global.f32 $register, [$tempReg];"
|
||||
#TODO: This is not the most efficient way. The index of the set should be calculated only once if possible, not multiple times as is done here
|
||||
end
|
||||
|
||||
function type_to_ptx_type(type::DataType)::String
|
||||
@ -283,40 +288,42 @@ function get_operation(operator::Operator, regManager::Utils.RegisterManager, le
|
||||
resultCode = ""
|
||||
|
||||
if is_binary_operator(operator) && isnothing(right)
|
||||
throw(ArgumentError("Given operator '$operator' is a binary operator. However only one operator has been given."))
|
||||
throw(ArgumentError("Given operator '$operator' is a binary operator. However only one operand has been given."))
|
||||
end
|
||||
|
||||
if operator == ADD
|
||||
resultCode = "add.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "add.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == SUBTRACT
|
||||
resultCode = "sub.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "sub.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == MULTIPLY
|
||||
resultCode = "mul.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "mul.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == DIVIDE
|
||||
resultCode = "div.approx.f32 $resultRegister, $left, $right;"
|
||||
resultCode = "div.approx.f32 $resultRegister, $left, $right;"
|
||||
elseif operator == POWER
|
||||
# x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
|
||||
resultCode = "
|
||||
// x^y:
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $right, $resultRegister;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $right, $resultRegister;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
elseif operator == ABS
|
||||
resultCode = "abs.f32 $resultRegister, $left;"
|
||||
resultCode = "abs.f32 $resultRegister, $left;"
|
||||
elseif operator == LOG
|
||||
# log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
|
||||
resultCode = "
|
||||
// log(x):
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $resultRegister, 0.693147182;"
|
||||
lg2.approx.f32 $resultRegister, $left;
|
||||
mul.f32 $resultRegister, $resultRegister, 0.693147182;"
|
||||
elseif operator == EXP
|
||||
# e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
|
||||
resultCode = "
|
||||
// e^x:
|
||||
mul.f32 $resultRegister, $left, 1.44269502;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
mul.f32 $resultRegister, $left, 1.44269502;
|
||||
ex2.approx.f32 $resultRegister, $resultRegister;"
|
||||
elseif operator == SQRT
|
||||
resultCode = "sqrt.approx.f32 $resultRegister, $left;"
|
||||
resultCode = "sqrt.approx.f32 $resultRegister, $left;"
|
||||
elseif operator == INV
|
||||
resultCode = "rcp.approx.f32 $resultRegister, $left;"
|
||||
else
|
||||
throw(ArgumentError("Operator conversion to ptx not implemented for '$operator'"))
|
||||
end
|
||||
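The approximate PTX sequences above rely on standard identities; a quick sanity check in Julia (illustrative, not part of the diff):
x, y = 3.0f0, 2.5f0
2f0^(y * log2(x)) ≈ x^y          # x^y    = 2^(y*log2(x))
log2(x) * 0.693147f0 ≈ log(x)    # log(x) = log2(x)*ln(2)
2f0^(x * 1.44269502f0) ≈ exp(x)  # e^x    = 2^(x*log2(e))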
|
@ -79,7 +79,7 @@ function get_register_for_name(manager::RegisterManager, varName::String)
|
||||
if haskey(manager.symtable, varName)
|
||||
return (manager.symtable[varName], false)
|
||||
else
|
||||
reg = get_next_free_register(manager, "var")
|
||||
reg = get_next_free_register(manager, "f")
|
||||
manager.symtable[varName] = reg
|
||||
return (reg, true)
|
||||
end
|
||||
|
@ -1,5 +1,9 @@
|
||||
using LinearAlgebra
|
||||
using BenchmarkTools
|
||||
using DelimitedFiles
|
||||
using GZip
|
||||
|
||||
include("parser.jl") # to parse expressions from a file
|
||||
|
||||
function test_cpu_interpreter(nrows; parallel = false)
|
||||
exprs = [
|
||||
@ -20,12 +24,12 @@ function test_cpu_interpreter(nrows; parallel = false)
|
||||
|
||||
if parallel
|
||||
# t_sec = @elapsed fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||
@btime parallel(exprs, X, p, expr_reps, reps)
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
@btime parallel($exprs, $X, $p, $expr_reps, $reps)
|
||||
# println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
else
|
||||
# t_sec = @elapsed for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
|
||||
@btime single(exprs, X, p, expr_reps, reps)
|
||||
println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
@btime single($exprs, $X, $p, $expr_reps, $reps)
|
||||
# println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
|
||||
end
|
||||
true
|
||||
end
|
||||
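The switch to interpolated arguments ($exprs, $X, ...) above makes BenchmarkTools time the call itself rather than untyped global-variable access; the same pattern applied directly (a hedged sketch):
using BenchmarkTools
@btime interpret_cpu($exprs, $X, $p; repetitions=$expr_reps)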
@ -41,7 +45,67 @@ end
|
||||
|
||||
# LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops
|
||||
|
||||
@test test_cpu_interpreter(1000)
|
||||
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
|
||||
@test test_cpu_interpreter(10000)
|
||||
@test test_cpu_interpreter(10000, parallel=true)
|
||||
# @test test_cpu_interpreter(1000)
|
||||
# @test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
|
||||
# @test test_cpu_interpreter(10000)
|
||||
# @test test_cpu_interpreter(10000, parallel=true)
|
||||
|
||||
|
||||
function test_cpu_interpreter_nikuradse()
|
||||
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
|
||||
X = convert(Matrix{Float32}, data)
|
||||
|
||||
exprs = Expr[]
|
||||
parameters = Vector{Vector{Float32}}()
|
||||
varnames = ["x$i" for i in 1:10]
|
||||
paramnames = ["p$i" for i in 1:20]
|
||||
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
|
||||
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
|
||||
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
|
||||
i = 0
|
||||
for line in eachline(io)
|
||||
expr, p = parse_infix(line, varnames, paramnames)
|
||||
|
||||
push!(exprs, expr)
|
||||
push!(parameters, randn(Float32, length(p)))
|
||||
|
||||
i += 1
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
interpret_cpu(exprs, X, parameters) # TODO: sufficient to do up to 10 repetitions per expression,
|
||||
end
|
||||
|
||||
|
||||
# @test test_cpu_interpreter_nikuradse()
|
||||
|
||||
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
|
||||
X = convert(Matrix{Float32}, data)
|
||||
X_t = permutedims(X) # for gpu
|
||||
|
||||
exprs = Expr[]
|
||||
parameters = Vector{Vector{Float32}}()
|
||||
varnames = ["x$i" for i in 1:10]
|
||||
paramnames = ["p$i" for i in 1:20]
|
||||
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
|
||||
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exprs
|
||||
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
|
||||
for line in eachline(io)
|
||||
expr, p = parse_infix(line, varnames, paramnames)
|
||||
|
||||
push!(exprs, expr)
|
||||
push!(parameters, randn(Float32, length(p)))
|
||||
end
|
||||
end
|
||||
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
|
||||
|
||||
suite = BenchmarkGroup()
|
||||
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
|
||||
|
||||
suite["CPU"]["nikuradse_1"] = @benchmarkable interpret_cpu(exprs, X, parameters; repetitions=expr_reps, parallel=true)
|
||||
|
||||
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
|
||||
|
||||
results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
|
||||
BenchmarkTools.save("./results-fh-new/cpu.json", results)
|
||||
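The params.json loaded above is typically produced once by tuning the suite and saving its parameters (assumed workflow; the file in the repo may have been created differently):
tune!(suite)
BenchmarkTools.save("params.json", params(suite))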
|
@ -1,34 +1,30 @@
|
||||
using .ExpressionProcessing
|
||||
|
||||
expressions = Vector{Expr}(undef, 1)
|
||||
variables = Matrix{Float32}(undef, 1,2)
|
||||
parameters = Vector{Vector{Float32}}(undef, 1)
|
||||
expressions = Vector{Expr}(undef, 2)
|
||||
|
||||
# Resulting value should be 10
|
||||
expressions[1] = :(x1 + 1 * x2 + p1)
|
||||
variables[1,1] = 2
|
||||
variables[1,2] = 3
|
||||
parameters[1] = Vector{Float32}(undef, 1)
|
||||
parameters[1][1] = 5
|
||||
expressions[2] = :(x[1] + 1 * x[2] + p[1])
|
||||
|
||||
@testset "Test conversion expression element" begin
|
||||
reference1 = ExpressionElement(FLOAT32, reinterpret(Int32, 1f0))
|
||||
reference2 = ExpressionElement(INDEX, reinterpret(Int32, Int32(1)))
|
||||
reference2 = ExpressionElement(VARIABLE, Int32(1))
|
||||
reference3 = ExpressionElement(OPERATOR, reinterpret(Int32, ADD))
|
||||
|
||||
@test isequal(reference1, ExpressionProcessing.convert_to_ExpressionElement(1.0))
|
||||
@test isequal(reference2, ExpressionProcessing.convert_to_ExpressionElement(1))
|
||||
@test isequal(reference2, ExpressionProcessing.convert_to_ExpressionElement(:x1))
|
||||
@test isequal(reference3, ExpressionProcessing.convert_to_ExpressionElement(ADD))
|
||||
end
|
||||
|
||||
@testset "Test conversion to postfix" begin
|
||||
reference = PostfixType()
|
||||
|
||||
append!(reference, [ExpressionProcessing.convert_to_ExpressionElement(1), ExpressionProcessing.convert_to_ExpressionElement(1.0), ExpressionProcessing.convert_to_ExpressionElement(2), ExpressionProcessing.convert_to_ExpressionElement(MULTIPLY),
|
||||
ExpressionProcessing.convert_to_ExpressionElement(ADD), ExpressionProcessing.convert_to_ExpressionElement(-1), ExpressionProcessing.convert_to_ExpressionElement(ADD)])
|
||||
postfix = expr_to_postfix(expressions[1])
|
||||
append!(reference, [ExpressionProcessing.convert_to_ExpressionElement(:x1), ExpressionProcessing.convert_to_ExpressionElement(1.0), ExpressionProcessing.convert_to_ExpressionElement(:x2), ExpressionProcessing.convert_to_ExpressionElement(MULTIPLY),
|
||||
ExpressionProcessing.convert_to_ExpressionElement(ADD), ExpressionProcessing.convert_to_ExpressionElement(:p1), ExpressionProcessing.convert_to_ExpressionElement(ADD)])
|
||||
postfixVarsAsSymbol = expr_to_postfix(expressions[1], Dict{Expr, PostfixType}())
|
||||
postfixVarsAsArray = expr_to_postfix(expressions[2], Dict{Expr, PostfixType}())
|
||||
|
||||
@test isequal(reference, postfix)
|
||||
@test isequal(reference, postfixVarsAsSymbol)
|
||||
@test isequal(reference, postfixVarsAsArray)
|
||||
|
||||
# TODO: Do more complex expressions because these have led to errors in the past
|
||||
end
|
@ -132,8 +132,8 @@ end
|
||||
|
||||
# var set 1
|
||||
@test isapprox(result[1,1], 37.32, atol=0.01) # expr1
|
||||
@test isapprox(result[1,2], 64.74, atol=0.01) # expr2
|
||||
@test isapprox(result[1,2], 64.75, atol=0.01) # expr2
|
||||
# var set 2
|
||||
@test isapprox(result[2,1], 37.32, atol=0.01) # expr1
|
||||
@test isapprox(result[2,2], -83.65, atol=0.01) # expr2
|
||||
@test isapprox(result[2,2], -83.66, atol=0.01) # expr2
|
||||
end
|
||||
|
@ -1,51 +1,41 @@
|
||||
using LinearAlgebra
|
||||
using BenchmarkTools
|
||||
using DelimitedFiles
|
||||
using GZip
|
||||
using CUDA
|
||||
|
||||
using .Transpiler
|
||||
using .Interpreter
|
||||
using .ExpressionProcessing
|
||||
|
||||
const BENCHMARKS_RESULTS_PATH = "./results-fh"
|
||||
include("parser.jl") # to parse expressions from a file
|
||||
|
||||
# TODO: Expressions can get much much bigger (into millions) (will be provided by Mr. Kronberger)
|
||||
# TODO: Variable-Sets: 1000 can be considered the minimum; 100.000 can be considered the maximum (will be provided by Mr. Kronberger)
|
||||
const BENCHMARKS_RESULTS_PATH = "./results-fh-new"
|
||||
|
||||
exprsCPU = [
|
||||
# CPU interpreter requires an anonymous function and array refs
|
||||
:(p[1] * x[1] + p[2]), # 5 op
|
||||
:((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
|
||||
:(log(abs(x[1]))), # 3 op
|
||||
:(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
|
||||
] # 30 op
|
||||
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)
|
||||
# Number of expressions can get really big (into millions)
|
||||
# Variable-Sets: 1000 can be considered the minimum; 100.000 can be considered the maximum
|
||||
|
||||
exprsGPU = [
|
||||
# GPU interpreter/transpiler expect plain expressions with symbols x1/p1 (no anonymous function, no array refs)
|
||||
:(p1 * x1 + p2), # 5 op
|
||||
:((((x1 + x2) + x3) + x4) + x5), # 9 op
|
||||
:(log(abs(x1))), # 3 op
|
||||
:(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
|
||||
] # 30 op
|
||||
data,varnames = readdlm("data/nikuradse_1.csv", ',', header=true);
|
||||
X = convert(Matrix{Float32}, data)
|
||||
X_t = permutedims(X) # for gpu
|
||||
|
||||
# p is the same for CPU and GPU
|
||||
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
|
||||
exprs = Expr[]
|
||||
parameters = Vector{Vector{Float32}}()
|
||||
varnames = ["x$i" for i in 1:10]
|
||||
paramnames = ["p$i" for i in 1:20]
|
||||
# data/esr_nvar2_len10.txt.gz_9.txt.gz has ~250_000 exprs
|
||||
# data/esr_nvar2_len10.txt.gz_10.txt.gz has ~800_000 exrps
|
||||
GZip.open("data/esr_nvar2_len10.txt.gz_9.txt.gz") do io
|
||||
for line in eachline(io)
|
||||
expr, p = parse_infix(line, varnames, paramnames)
|
||||
|
||||
push!(exprs, expr)
|
||||
push!(parameters, randn(Float32, length(p)))
|
||||
end
|
||||
end
|
||||
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
|
||||
|
||||
|
||||
@testset "CPU performance" begin
|
||||
# warmup
|
||||
# interpret_cpu(exprsCPU, X, p)
|
||||
|
||||
# @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
|
||||
# @btime test_cpu_interpreter(1000)
|
||||
# @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])
|
||||
|
||||
# test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
|
||||
# @btime test_cpu_interpreter(10000)
|
||||
# @btime test_cpu_interpreter(10000, parallel=true)
|
||||
|
||||
end
|
||||
|
||||
@testset "Interpreter Performance" begin
|
||||
# TODO: Tipps for tuning:
|
||||
# Put data in shared memory:
|
||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
||||
|
||||
@ -54,69 +44,36 @@ end
|
||||
|
||||
# Memory management like in C++ might help with performance improvements
|
||||
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
|
||||
end
|
||||
|
||||
@testset "Transpiler Performance" begin
|
||||
# Put data in shared memory:
|
||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory
|
||||
|
||||
# Make array const:
|
||||
# https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays
|
||||
|
||||
# Memory management like in C++ might help with performance improvements
|
||||
# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
|
||||
end
|
||||
|
||||
# After these tests have been redone, use Nsight Compute/Systems as described here:
|
||||
#https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
|
||||
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
|
||||
# Systems and Compute installable via WSL. Compute UI can even be used inside wsl
|
||||
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending how well this works with my 1080 do it on my machine, otherwise re do the tests and perform them on FH PCs)
|
||||
# University setup at 10.20.1.7 if needed
|
||||
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (do the tests on FH PCs)
|
||||
# University setup at 10.20.1.7 and 10.20.1.13
|
||||
|
||||
compareWithCPU = true
|
||||
|
||||
|
||||
suite = BenchmarkGroup()
|
||||
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
|
||||
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
|
||||
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
|
||||
varsets_small = 100
|
||||
varsets_medium = 1000
|
||||
varsets_large = 10000
|
||||
|
||||
if compareWithCPU
|
||||
X_small = randn(Float32, varsets_small, 5)
|
||||
suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
|
||||
X_medium = randn(Float32, varsets_medium, 5)
|
||||
suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
|
||||
X_large = randn(Float32, varsets_large, 5)
|
||||
suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
|
||||
end
|
||||
# cacheInterpreter = Dict{Expr, PostfixType}()
|
||||
suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||
|
||||
X_small_GPU = randn(Float32, 5, varsets_small) # column-major
|
||||
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
|
||||
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
|
||||
|
||||
X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
|
||||
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
|
||||
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
|
||||
|
||||
X_large_GPU = randn(Float32, 5, varsets_large) # column-major
|
||||
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
|
||||
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
|
||||
|
||||
# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
|
||||
# cacheTranspilerFront = Dict{Expr, PostfixType}()
|
||||
# cacheTranspilerRes = Dict{Expr, CuFunction}()
|
||||
suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps)
|
||||
|
||||
# tune!(suite)
|
||||
# BenchmarkTools.save("params.json", params(suite))
|
||||
|
||||
loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)
|
||||
|
||||
results = run(suite, verbose=true, seconds=180)
|
||||
results = run(suite, verbose=true, seconds=28800) # 8 hour timeout
|
||||
resultsCPU = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/cpu.json")[1]
|
||||
|
||||
if compareWithCPU
|
||||
medianCPU = median(results["CPU"])
|
||||
stdCPU = std(results["CPU"])
|
||||
medianCPU = median(resultsCPU["CPU"])
|
||||
stdCPU = std(resultsCPU["CPU"])
|
||||
|
||||
medianInterpreter = median(results["GPUI"])
|
||||
stdInterpreter = std(results["GPUI"])
|
||||
@ -147,7 +104,7 @@ if compareWithCPU
|
||||
println(gpuiVsGPUT_median)
|
||||
println(gpuiVsGPUT_std)
|
||||
|
||||
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
|
||||
BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/0-initial.json", results)
|
||||
else
|
||||
resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
||||
# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]
|
||||
|
@ -3,7 +3,7 @@ using CUDA
|
||||
using .Transpiler
|
||||
using .Interpreter
|
||||
|
||||
varsets_medium = 1000
|
||||
varsets_medium = 10000
|
||||
X = randn(Float32, 5, varsets_medium)
|
||||
|
||||
exprsGPU = [
|
||||
|
@ -2,6 +2,8 @@
|
||||
BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
|
||||
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
|
||||
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
|
||||
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
|
||||
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
|
||||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
|
||||
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
|
||||
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
|
||||
|
@ -6,7 +6,7 @@ expressions = Vector{Expr}(undef, 3)
|
||||
variables = Matrix{Float32}(undef, 5, 4)
|
||||
parameters = Vector{Vector{Float32}}(undef, 3)
|
||||
|
||||
expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4))
|
||||
expressions[1] = :(1 + 3 * 5 / 7 - sqrt(log(4)))
|
||||
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3)
|
||||
expressions[3] = :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5))
|
||||
|
||||
@ -41,15 +41,26 @@ parameters[2][1] = 5.0
|
||||
parameters[2][2] = 0.0
|
||||
parameters[3][1] = 16.0
|
||||
|
||||
@testset "TEMP" begin
|
||||
return
|
||||
exprs = [:(x1 + p1)]
|
||||
vars = Matrix{Float32}(undef, 1, 1)
|
||||
params = Vector{Vector{Float32}}(undef, 1)
|
||||
|
||||
vars[1, 1] = 1
|
||||
params[1] = [1]
|
||||
Transpiler.evaluate(exprs, vars, params)
|
||||
end
|
||||
|
||||
@testset "Test transpiler evaluation" begin
|
||||
results = Transpiler.evaluate(expressions, variables, parameters)
|
||||
|
||||
# dump(expressions[3]; maxdepth=10)
|
||||
# Expr 1:
|
||||
@test isapprox(results[1,1], 1.14286)
|
||||
@test isapprox(results[2,1], 1.14286)
|
||||
@test isapprox(results[3,1], 1.14286)
|
||||
@test isapprox(results[4,1], 1.14286)
|
||||
@test isapprox(results[1,1], 1.96545)
|
||||
@test isapprox(results[2,1], 1.96545)
|
||||
@test isapprox(results[3,1], 1.96545)
|
||||
@test isapprox(results[4,1], 1.96545)
|
||||
#Expr 2:
|
||||
@test isapprox(results[1,2], 16.0)
|
||||
@test isapprox(results[2,2], 25.0)
|
||||
|
BIN
package/test/data/esr_nvar2_len10.txt.gz_1.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_10.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_2.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_3.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_4.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_5.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_6.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_7.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_8.txt.gz
Normal file
BIN
package/test/data/esr_nvar2_len10.txt.gz_9.txt.gz
Normal file
363
package/test/data/nikuradse_1.csv
Normal file
@ -0,0 +1,363 @@
|
||||
r_k,log_Re,target
|
||||
507,4.114,0.456
|
||||
507,4.23,0.438
|
||||
507,4.322,0.417
|
||||
507,4.362,0.407
|
||||
507,4.362,0.403
|
||||
507,4.462,0.381
|
||||
507,4.491,0.38
|
||||
507,4.532,0.366
|
||||
507,4.568,0.365
|
||||
507,4.591,0.356
|
||||
507,4.623,0.347
|
||||
507,4.672,0.333
|
||||
507,4.69,0.324
|
||||
507,4.716,0.32
|
||||
507,4.763,0.307
|
||||
507,4.806,0.303
|
||||
507,4.851,0.292
|
||||
507,4.898,0.286
|
||||
507,4.94,0.278
|
||||
507,4.973,0.274
|
||||
507,5.009,0.274
|
||||
507,5.025,0.272
|
||||
507,5.049,0.27
|
||||
507,5.1,0.262
|
||||
507,5.143,0.26
|
||||
507,5.199,0.255
|
||||
507,5.236,0.253
|
||||
507,5.27,0.255
|
||||
507,5.281,0.253
|
||||
507,5.303,0.25
|
||||
507,5.326,0.252
|
||||
507,5.377,0.255
|
||||
507,5.43,0.253
|
||||
507,5.493,0.258
|
||||
507,5.534,0.26
|
||||
507,5.574,0.262
|
||||
507,5.608,0.29
|
||||
507,5.63,0.272
|
||||
507,5.668,0.272
|
||||
507,5.709,0.272
|
||||
507,5.756,0.278
|
||||
507,5.792,0.279
|
||||
507,5.833,0.283
|
||||
507,5.94,0.286
|
||||
507,5.965,0.288
|
||||
507,5.929,0.289
|
||||
507,5.954,0.288
|
||||
507,5.987,0.286
|
||||
252,4.21,0.4506
|
||||
252,4.279,0.4349
|
||||
252,4.465,0.3808
|
||||
252,4.507,0.3636
|
||||
252,4.549,0.3579
|
||||
252,4.597,0.3562
|
||||
252,4.644,0.3434
|
||||
252,4.778,0.3257
|
||||
252,4.82,0.3282
|
||||
252,4.916,0.3222
|
||||
252,4.987,0.3197
|
||||
252,5.057,0.321
|
||||
252,5.1,0.3228
|
||||
252,5.173,0.3197
|
||||
252,5.21,0.3276
|
||||
252,5.283,0.3322
|
||||
252,5.366,0.3416
|
||||
252,5.494,0.3504
|
||||
252,5.58,0.3562
|
||||
252,5.623,0.3602
|
||||
252,5.702,0.3636
|
||||
252,4.708,0.3371
|
||||
252,5.305,0.3328
|
||||
252,5.544,0.3562
|
||||
252,5.787,0.3661
|
||||
252,4.748,0.3335
|
||||
252,4.869,0.3228
|
||||
252,4.954,0.321
|
||||
252,5.134,0.321
|
||||
252,5.255,0.3294
|
||||
252,5.415,0.3434
|
||||
252,5.58,0.3551
|
||||
252,5.748,0.3608
|
||||
252,5.845,0.3666
|
||||
252,5.881,0.3688
|
||||
252,5.924,0.3727
|
||||
252,5.967,0.3705
|
||||
252,5.991,0.3716
|
||||
60,3.653,0.593
|
||||
60,3.7,0.577
|
||||
60,3.74,0.571
|
||||
60,3.785,0.56
|
||||
60,3.851,0.544
|
||||
60,3.869,0.531
|
||||
60,3.909,0.512
|
||||
60,3.949,0.512
|
||||
60,3.996,0.507
|
||||
60,4.057,0.494
|
||||
60,4.09,0.49
|
||||
60,4.161,0.494
|
||||
60,4.236,0.487
|
||||
60,4.29,0.487
|
||||
60,4.391,0.481
|
||||
60,4.412,0.489
|
||||
60,4.512,0.49
|
||||
60,4.54,0.487
|
||||
60,4.553,0.498
|
||||
60,4.58,0.493
|
||||
60,4.609,0.507
|
||||
60,4.654,0.504
|
||||
60,4.665,0.507
|
||||
60,4.699,0.509
|
||||
60,4.74,0.517
|
||||
60,4.769,0.52
|
||||
60,4.813,0.528
|
||||
60,4.849,0.526
|
||||
60,4.93,0.543
|
||||
60,4.954,0.534
|
||||
60,5.034,0.543
|
||||
60,5.155,0.543
|
||||
60,5.083,0.545
|
||||
60,5.185,0.55
|
||||
60,5.231,0.537
|
||||
60,4.875,0.535
|
||||
60,4.924,0.534
|
||||
60,4.954,0.542
|
||||
60,5.052,0.535
|
||||
60,5.033,0.54
|
||||
60,5.13,0.545
|
||||
60,5.17,0.55
|
||||
60,5.196,0.547
|
||||
60,5.23,0.568
|
||||
60,5.258,0.551
|
||||
60,5.283,0.555
|
||||
60,5.312,0.551
|
||||
60,5.35,0.555
|
||||
60,5.408,0.55
|
||||
60,5.47,0.555
|
||||
60,5.497,0.543
|
||||
60,5.515,0.551
|
||||
60,5.549,0.55
|
||||
60,5.554,0.558
|
||||
60,5.575,0.551
|
||||
60,5.6,0.55
|
||||
60,5.621,0.56
|
||||
60,5.625,0.543
|
||||
60,5.641,0.543
|
||||
60,5.655,0.55
|
||||
60,5.659,0.551
|
||||
60,5.668,0.56
|
||||
60,5.691,0.553
|
||||
60,5.714,0.551
|
||||
60,5.748,0.558
|
||||
60,5.757,0.55
|
||||
60,5.789,0.551
|
||||
60,5.836,0.547
|
||||
60,5.865,0.555
|
||||
60,5.914,0.553
|
||||
60,5.916,0.55
|
||||
60,5.945,0.551
|
||||
60,5.962,0.555
|
||||
15,3.77,0.696
|
||||
15,3.82,0.699
|
||||
15,3.855,0.707
|
||||
15,3.905,0.712
|
||||
15,3.955,0.717
|
||||
15,4,0.73
|
||||
15,4.041,0.734
|
||||
15,4.076,0.736
|
||||
15,4.079,0.744
|
||||
15,4.114,0.751
|
||||
15,4.133,0.74
|
||||
15,4.179,0.744
|
||||
15,4.196,0.754
|
||||
15,4.27,0.76
|
||||
15,4.29,0.756
|
||||
15,4.314,0.769
|
||||
15,4.34,0.763
|
||||
15,4.366,0.778
|
||||
15,4.386,0.772
|
||||
15,4.41,0.772
|
||||
15,4.425,0.782
|
||||
15,4.466,0.785
|
||||
15,4.52,0.78
|
||||
15,4.59,0.781
|
||||
15,4.63,0.777
|
||||
15,4.725,0.78
|
||||
15,4.811,0.781
|
||||
15,4.865,0.777
|
||||
15,4.885,0.776
|
||||
15,4.965,0.779
|
||||
15,5,0.781
|
||||
15,5.042,0.78
|
||||
15,5.098,0.781
|
||||
15,5.155,0.778
|
||||
15,5.179,0.781
|
||||
15,5.285,0.779
|
||||
15,4.44,0.775
|
||||
15,4.5,0.777
|
||||
15,4.54,0.778
|
||||
15,4.596,0.78
|
||||
15,4.685,0.781
|
||||
15,4.722,0.777
|
||||
15,4.845,0.775
|
||||
15,4.869,0.778
|
||||
15,4.929,0.78
|
||||
15,4.949,0.779
|
||||
15,5.002,0.777
|
||||
15,5.005,0.775
|
||||
15,5.097,0.778
|
||||
15,5.139,0.783
|
||||
15,5.156,0.784
|
||||
15,5.22,0.777
|
||||
15,5.236,0.78
|
||||
15,5.31,0.778
|
||||
15,5.36,0.775
|
||||
15,5.41,0.78
|
||||
15,5.446,0.78
|
||||
15,5.455,0.777
|
||||
15,5.515,0.781
|
||||
15,5.567,0.778
|
||||
15,5.613,0.78
|
||||
15,5.69,0.784
|
||||
15,5.834,0.781
|
||||
15,5.882,0.777
|
||||
15,5.959,0.778
|
||||
15,6.008,0.78
|
||||
15,5.793,0.78
|
||||
15,5.857,0.777
|
||||
15,5.93,0.778
|
||||
15,5.987,0.78
|
||||
126,3.63,0.594
|
||||
126,3.675,0.588
|
||||
126,3.715,0.576
|
||||
126,3.76,0.566
|
||||
126,3.81,0.552
|
||||
126,3.833,0.564
|
||||
126,3.895,0.532
|
||||
126,3.925,0.515
|
||||
126,3.95,0.503
|
||||
126,3.965,0.498
|
||||
126,4.015,0.491
|
||||
126,4.111,0.471
|
||||
126,4.196,0.451
|
||||
126,4.265,0.435
|
||||
126,4.33,0.424
|
||||
126,4.386,0.415
|
||||
126,4.425,0.412
|
||||
126,4.47,0.4
|
||||
126,4.496,0.396
|
||||
126,4.511,0.4
|
||||
126,4.55,0.393
|
||||
126,4.62,0.392
|
||||
126,4.697,0.391
|
||||
126,4.76,0.4
|
||||
126,4.82,0.403
|
||||
126,4.91,0.408
|
||||
126,4.985,0.414
|
||||
126,5.057,0.422
|
||||
126,5.121,0.424
|
||||
126,5.164,0.43
|
||||
126,5.591,0.45
|
||||
126,5.616,0.453
|
||||
126,5.655,0.447
|
||||
126,5.675,0.45
|
||||
126,5.708,0.445
|
||||
126,5.736,0.452
|
||||
126,5.756,0.445
|
||||
126,5.775,0.445
|
||||
126,5.798,0.45
|
||||
126,5.831,0.45
|
||||
126,5.835,0.446
|
||||
126,5.874,0.45
|
||||
126,5.894,0.447
|
||||
126,5.935,0.45
|
||||
126,5.961,0.444
|
||||
126,5.97,0.449
|
||||
126,5.987,0.447
|
||||
126,4.95,0.43
|
||||
126,5.049,0.432
|
||||
126,5.021,0.415
|
||||
126,5.1,0.422
|
||||
126,5.13,0.422
|
||||
126,5.179,0.43
|
||||
126,5.196,0.43
|
||||
126,5.225,0.435
|
||||
126,5.225,0.43
|
||||
126,5.25,0.436
|
||||
126,5.274,0.438
|
||||
126,5.29,0.438
|
||||
126,5.31,0.436
|
||||
126,5.33,0.439
|
||||
126,5.35,0.439
|
||||
126,5.366,0.444
|
||||
126,5.393,0.444
|
||||
126,5.423,0.446
|
||||
126,5.432,0.447
|
||||
126,5.455,0.45
|
||||
126,5.476,0.452
|
||||
126,5.501,0.447
|
||||
126,5.525,0.447
|
||||
126,5.56,0.45
|
||||
30.6,3.672,0.592
|
||||
30.6,3.708,0.59
|
||||
30.6,3.748,0.592
|
||||
30.6,3.763,0.597
|
||||
30.6,3.785,0.583
|
||||
30.6,3.826,0.585
|
||||
30.6,3.869,0.596
|
||||
30.6,3.881,0.578
|
||||
30.6,3.929,0.578
|
||||
30.6,3.935,0.583
|
||||
30.6,3.978,0.578
|
||||
30.6,4.009,0.585
|
||||
30.6,4.049,0.583
|
||||
30.6,4.079,0.592
|
||||
30.6,4.124,0.59
|
||||
30.6,4.13,0.599
|
||||
30.6,4.19,0.599
|
||||
30.6,4.27,0.609
|
||||
30.6,4.29,0.618
|
||||
30.6,4.309,0.612
|
||||
30.6,4.584,0.639
|
||||
30.6,4.653,0.644
|
||||
30.6,4.799,0.647
|
||||
30.6,4.9,0.656
|
||||
30.6,4.965,0.656
|
||||
30.6,5.029,0.652
|
||||
30.6,5.068,0.65
|
||||
30.6,5.134,0.65
|
||||
30.6,5.176,0.65
|
||||
30.6,4.425,0.637
|
||||
30.6,4.44,0.63
|
||||
30.6,4.56,0.637
|
||||
30.6,4.636,0.647
|
||||
30.6,4.74,0.654
|
||||
30.6,4.83,0.654
|
||||
30.6,4.855,0.661
|
||||
30.6,4.99,0.657
|
||||
30.6,5.1,0.652
|
||||
30.6,5.24,0.657
|
||||
30.6,5.275,0.657
|
||||
30.6,5.323,0.647
|
||||
30.6,5.473,0.657
|
||||
30.6,5.655,0.652
|
||||
30.6,4.934,0.656
|
||||
30.6,5.068,0.657
|
||||
30.6,5.17,0.659
|
||||
30.6,5.223,0.656
|
||||
30.6,5.255,0.652
|
||||
30.6,5.342,0.657
|
||||
30.6,5.344,0.657
|
||||
30.6,5.394,0.659
|
||||
30.6,5.428,0.659
|
||||
30.6,5.444,0.661
|
||||
30.6,5.516,0.657
|
||||
30.6,5.541,0.659
|
||||
30.6,5.559,0.657
|
||||
30.6,5.776,0.659
|
||||
30.6,5.81,0.659
|
||||
30.6,5.863,0.657
|
||||
30.6,5.916,0.659
|
||||
30.6,5.962,0.65
|
||||
30.6,6,0.659
|
|
@ -1 +1,94 @@
|
||||
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
|
||||
[
|
||||
{
|
||||
"Julia": "1.11.5",
|
||||
"BenchmarkTools": {
|
||||
"major": 1,
|
||||
"minor": 6,
|
||||
"patch": 0,
|
||||
"prerelease": [],
|
||||
"build": []
|
||||
}
|
||||
},
|
||||
[
|
||||
[
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"CPU": [
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"nikuradse_1": [
|
||||
"Parameters",
|
||||
{
|
||||
"gctrial": true,
|
||||
"time_tolerance": 0.05,
|
||||
"evals_set": false,
|
||||
"samples": 50,
|
||||
"evals": 1,
|
||||
"gcsample": false,
|
||||
"seconds": 5.0,
|
||||
"overhead": 0.0,
|
||||
"memory_tolerance": 0.01
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": [
|
||||
"CPUInterpreter"
|
||||
]
|
||||
}
|
||||
],
|
||||
"GPUT": [
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"nikuradse_1": [
|
||||
"Parameters",
|
||||
{
|
||||
"gctrial": true,
|
||||
"time_tolerance": 0.05,
|
||||
"evals_set": false,
|
||||
"samples": 50,
|
||||
"evals": 1,
|
||||
"gcsample": false,
|
||||
"seconds": 5.0,
|
||||
"overhead": 0.0,
|
||||
"memory_tolerance": 0.01
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": [
|
||||
"GPUTranspiler"
|
||||
]
|
||||
}
|
||||
],
|
||||
"GPUI": [
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"nikuradse_1": [
|
||||
"Parameters",
|
||||
{
|
||||
"gctrial": true,
|
||||
"time_tolerance": 0.05,
|
||||
"evals_set": false,
|
||||
"samples": 50,
|
||||
"evals": 1,
|
||||
"gcsample": false,
|
||||
"seconds": 5.0,
|
||||
"overhead": 0.0,
|
||||
"memory_tolerance": 0.01
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": [
|
||||
"GPUInterpreter"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": []
|
||||
}
|
||||
]
|
||||
]
|
||||
]
|
294
package/test/parser.jl
Normal file
@ -0,0 +1,294 @@
|
||||
## Parser for (ESR) expressions in infix format
|
||||
|
||||
mutable struct Parser
|
||||
const str::AbstractString # string to be parsed
|
||||
pos::Int64 # current position in string
|
||||
sy::Union{AbstractString,Nothing} # current lookahead symbol
|
||||
const pSy::Symbol
|
||||
const xSy::Symbol
|
||||
const varnames::Vector{<:AbstractString}
|
||||
const paramnames::Vector{<:AbstractString}
|
||||
const coeff::Vector{Float64}
|
||||
const numbers_as_parameters::Bool
|
||||
const integers_as_constants::Bool # TODO rename and implement as rationals_as_constants
|
||||
|
||||
# The kwparam numbers_as_parameters allows including coefficient values directly in the expression; these values are then parsed as parameters
|
||||
# In this mode the suffix 'f' marks constants. E.g. 3 * x ^ 2f would create the parameterized expression a0*x^2 with 2 as a constant value.
|
||||
function Parser(str::AbstractString, varnames::Vector{<:AbstractString}, paramnames::Vector{<:AbstractString}; numbers_as_parameters=false, integers_as_constants=false)
|
||||
if numbers_as_parameters && length(paramnames) > 0
|
||||
error("the parser does not support paramnames when numbers_as_parameters=true")
|
||||
end
|
||||
if !numbers_as_parameters && integers_as_constants
|
||||
error("Set numbers_as_parameters=true to parse integers_as_constants")
|
||||
end
|
||||
|
||||
p = new(lowercase(str), 1, nothing, :p, :x, varnames, paramnames, Vector{Float64}(), numbers_as_parameters, integers_as_constants)
|
||||
next_symbol!(p)
|
||||
return p;
|
||||
end
|
||||
end
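# Usage sketch (illustrative, not part of the original file): the two parsing modes
# of this Parser can be exercised as follows.
#
#   # Default mode: named parameters (p1, ...) become references into the coefficient vector.
#   expr, coeff = parse_infix("p1 * x1 + 2", ["x1"], ["p1"])
#   # expr is equivalent to (x, p) -> p[1] * x[1] + 2.0 and coeff == [0.0]
#
#   # numbers_as_parameters mode: literal numbers become parameters, a trailing 'f' marks constants.
#   expr2, coeff2 = parse_infix("3 * x1 ^ 2f", ["x1"], String[]; numbers_as_parameters=true)
#   # expr2 is equivalent to (x, p) -> p[1] * x[1] ^ 2.0 and coeff2 == [3.0]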
|
||||
|
||||
# recursive descent parser
|
||||
# scanner is also defined in this file
|
||||
|
||||
# LL(1) grammar:
|
||||
# G(Expr):
|
||||
# Expr = Term { ('+' | '-') Term }
|
||||
# Term = Fact { ('*' | '/') Fact }
|
||||
# Fact = { '+' | '-' }
|
||||
# (ident | number | parameter
|
||||
# | '(' Expr ')'
|
||||
# | ident ParamList // function call
|
||||
# ) [ ('**' | '^') Fact ]
|
||||
# ParamList = '(' Expr { ',' Expr } ')'
|
||||
|
||||
|
||||
|
||||
# scanner
|
||||
|
||||
|
||||
function parse_infix(exprStr::AbstractString, varnames::Vector{<:AbstractString}, paramnames::Vector{<:AbstractString};
|
||||
numbers_as_parameters = false, integers_as_constants = false)::Tuple{Expr, Vector{Float64}}
|
||||
parser = Parser(exprStr, varnames, paramnames;
|
||||
numbers_as_parameters = numbers_as_parameters, integers_as_constants = integers_as_constants)
|
||||
body = parse_expr!(parser)
|
||||
expr = Expr(:->, Expr(:tuple, :x, :p), body) # :((x,p) -> $body)
|
||||
(expr, parser.coeff)
|
||||
end
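# Illustrative only: the returned Expr is an anonymous function definition, so it can be
# turned into a callable with eval (the names and values below are made up for the example).
#
#   expr, coeff = parse_infix("p1 * x1 + sqr(x2)", ["x1", "x2"], ["p1"])
#   f = eval(expr)                      # f(x, p)
#   f([2.0f0, 3.0f0], [0.5f0])          # 0.5 * 2.0 + 3.0^2 == 10.0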
|
||||
|
||||
function parse_expr!(p::Parser)
|
||||
t1 = parse_term!(p)
|
||||
while p.sy == "+" || p.sy == "-"
|
||||
if p.sy == "+"
|
||||
next_symbol!(p)
|
||||
t2 = parse_term!(p)
|
||||
t1 = :($t1 + $t2) # add_simpl(t1, t2)
|
||||
else
|
||||
next_symbol!(p)
|
||||
t2 = parse_term!(p)
|
||||
t1 = :($t1 - $t2) # sub_simpl(t1, t2)
|
||||
end
|
||||
end
|
||||
return t1
|
||||
end
|
||||
|
||||
function parse_term!(p::Parser)
|
||||
f1 = parse_factor!(p)
|
||||
while p.sy == "*" || p.sy == "/"
|
||||
if p.sy == "*"
|
||||
next_symbol!(p)
|
||||
f2 = parse_factor!(p)
|
||||
f1 = :($f1 * $f2) # mul_simpl(f1, f2)
|
||||
else
|
||||
next_symbol!(p)
|
||||
f2 = parse_factor!(p)
|
||||
f1 = :($f1 / $f2) # div_simpl(f1, f2)
|
||||
end
|
||||
end
|
||||
return f1
|
||||
end
|
||||
|
||||
# Fact = { '+' | '-' }
|
||||
# (constant | parameter
|
||||
# | '(' Expr ')'
|
||||
# | ident [ ParamList ] variable or function call
|
||||
# ) [ ('**' | '^') Fact ]
|
||||
# ParamList = '(' Expr { ',' Expr } ')'
|
||||
|
||||
function parse_factor!(p::Parser)
|
||||
sign = 1.0
|
||||
|
||||
while p.sy == "+" || p.sy == "-"
|
||||
if p.sy == "-"
|
||||
sign = sign * -1.0
|
||||
end
|
||||
next_symbol!(p)
|
||||
end
|
||||
|
||||
factor = 1.0
|
||||
|
||||
if isident(p.sy)
|
||||
ident = p.sy
|
||||
next_symbol!(p)
|
||||
if p.sy == "("
|
||||
parameters = parse_paramlist!(p)
|
||||
|
||||
if ident == "sqr"
|
||||
# convert sqr(x) call to x**2 (so that we don't have to update the interpreters)
|
||||
factor = Expr(:call, func_symbol("pow"), parameters..., 2.0)
|
||||
else
|
||||
factor = Expr(:call, func_symbol(ident), parameters...)
|
||||
end
|
||||
else
|
||||
idx = findfirst(p -> p==ident, p.varnames)
|
||||
if !isnothing(idx)
|
||||
factor = Expr(:ref, p.xSy, idx)
|
||||
elseif !p.numbers_as_parameters # only if paramnames are given
|
||||
idx = findfirst(p -> p==ident, p.paramnames)
|
||||
|
||||
# replace parameter variables with access to coefficient vector (initialized to zero)
|
||||
if !isnothing(idx)
|
||||
factor = Expr(:ref, p.pSy, idx)
|
||||
push!(p.coeff, 0.0)
|
||||
else
|
||||
error("undefined symbol $ident")
|
||||
end
|
||||
else
|
||||
error("undefined variable $ident")
|
||||
end
|
||||
end
|
||||
|
||||
elseif isnumber(p.sy)
|
||||
if p.numbers_as_parameters
|
||||
numStr = p.sy
|
||||
val = parse(Float64, numStr)
|
||||
next_symbol!(p)
|
||||
if p.sy == "f"
|
||||
# constant
|
||||
factor = sign * val # numbers are parsed without sign (if we parsed a sign above then we can include this in the constant here)
|
||||
sign = 1.0
|
||||
next_symbol!(p)
|
||||
elseif p.integers_as_constants && isinteger(val)
|
||||
# integers are parsed as constants
|
||||
factor = sign * val # numbers are parsed without sign (if we parsed a sign above then we can include this in the constant here)
|
||||
sign = 1.0
|
||||
else
|
||||
# parameter
|
||||
factor = new_param!(p, sign * val)
|
||||
sign = 1.0
|
||||
end
|
||||
else
|
||||
# otherwise all numbers are parsed as constants
|
||||
numStr = p.sy
|
||||
next_symbol!(p)
|
||||
|
||||
if p.sy == "//"
|
||||
num = parse(Int64, numStr)
|
||||
next_symbol!(p)
|
||||
denom = parse(Int64, p.sy)
|
||||
val = num // denom
|
||||
next_symbol!(p)
|
||||
else
|
||||
val = parse(Float64, numStr)
|
||||
end
|
||||
|
||||
factor = sign * val
|
||||
sign = 1.0
|
||||
end
|
||||
|
||||
elseif p.sy == "("
|
||||
next_symbol!(p)
|
||||
factor = parse_expr!(p)
|
||||
expect_and_next!(p, ")")
|
||||
|
||||
else
|
||||
error("cannot parse expression")
|
||||
end
|
||||
|
||||
if p.sy == "**" || p.sy == "^"
|
||||
next_symbol!(p)
|
||||
exponent = parse_factor!(p)
|
||||
factor = :($factor ^ $exponent) # pow_simpl(factor, exponent)
|
||||
end
|
||||
|
||||
if sign == -1
|
||||
:(-$factor)
|
||||
else
|
||||
factor
|
||||
end
|
||||
end
|
||||
|
||||
function parse_paramlist!(p::Parser)::Vector
|
||||
parameters = Vector()
|
||||
expect_and_next!(p, "(")
|
||||
push!(parameters, parse_expr!(p))
|
||||
while p.sy == ","
|
||||
next_symbol!(p)
|
||||
push!(parameters, parse_expr!(p))
|
||||
end
|
||||
expect_and_next!(p, ")")
|
||||
return parameters
|
||||
end
|
||||
|
||||
function expect_and_next!(p::Parser, expectedSy::AbstractString)
|
||||
if p.sy != expectedSy
|
||||
error("expected: $(expectedSy) at column $(p.pos)")
|
||||
else
|
||||
next_symbol!(p)
|
||||
end
|
||||
end
|
||||
|
||||
function new_param!(p::Parser, val::Float64)::Expr
|
||||
push!(p.coeff, val)
|
||||
return Expr(:ref, p.pSy, length(p.coeff))
|
||||
end
|
||||
|
||||
|
||||
function isident(s::AbstractString)::Bool
|
||||
return s != "nan" && s != "inf" && !isnothing(match(r"^[_a-zA-Z][_a-zA-Z0-9]*$", s))
|
||||
end
|
||||
|
||||
function isnumber(s::AbstractString)::Bool
|
||||
return !isnothing(tryparse(Float64, s))
|
||||
end
|
||||
|
||||
function variable_index(p::Parser, str::AbstractString)
|
||||
return findfirst(s->s==str, p.varnames)
|
||||
end
|
||||
|
||||
function func_symbol(id::AbstractString)
|
||||
if id == "pow"
|
||||
return :^;
|
||||
else
|
||||
return Symbol(id)
|
||||
end
|
||||
end
|
||||
|
||||
function next_symbol!(p::Parser)
|
||||
s = p.str
|
||||
pos = p.pos
|
||||
# skip whitespace
|
||||
while pos <= length(s) && isspace(s[pos])
|
||||
pos += 1
|
||||
end
|
||||
|
||||
if pos > length(s)
|
||||
p.sy = nothing
|
||||
p.pos = pos
|
||||
return
|
||||
end
|
||||
|
||||
if isdigit(s[pos]) # numbers
|
||||
m = match(r"(\d+([.]\d*)?([eE][+-]?\d+)?|[.]\d+([eE][+-]?\d+)?)", s, pos) # match floating point number
|
||||
pos += length(m[1]) # get the whole match
|
||||
p.sy = m[1]
|
||||
elseif isletter(s[pos]) # identifiers
|
||||
idStr = string(s[pos])
|
||||
pos += 1
|
||||
while pos <= length(s) && (isdigit(s[pos]) || isletter(s[pos]) || s[pos] == '_')
|
||||
idStr = idStr * s[pos]
|
||||
pos += 1
|
||||
end
|
||||
p.sy = idStr
|
||||
elseif s[pos] == '*'
|
||||
pos += 1
|
||||
p.sy = "*"
|
||||
if pos <= length(s) && s[pos] == '*'
|
||||
p.sy = "**"
|
||||
pos += 1
|
||||
end
|
||||
elseif s[pos] == '/'
|
||||
pos += 1
|
||||
p.sy = "/"
|
||||
if pos <= length(s) && s[pos] == '/'
|
||||
p.sy = "//"
|
||||
pos += 1
|
||||
end
|
||||
else
|
||||
p.sy = string(s[pos]) # single character symbol
|
||||
pos += 1
|
||||
end
|
||||
|
||||
p.pos = pos
|
||||
# println((p.sy, pos)) # for debugging
|
||||
end
|
194
package/test/results-fh-new/0-initial.json
Normal file
@ -0,0 +1,194 @@
|
||||
[
|
||||
{
|
||||
"Julia": "1.11.5",
|
||||
"BenchmarkTools": {
|
||||
"major": 1,
|
||||
"minor": 6,
|
||||
"patch": 0,
|
||||
"prerelease": [],
|
||||
"build": []
|
||||
}
|
||||
},
|
||||
[
|
||||
[
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"GPUT": [
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"nikuradse_1": [
|
||||
"Trial",
|
||||
{
|
||||
"allocs": 10537236713,
|
||||
"gctimes": [
|
||||
6.422630609021e12
|
||||
],
|
||||
"memory": 99746249534032,
|
||||
"params": [
|
||||
"Parameters",
|
||||
{
|
||||
"gctrial": true,
|
||||
"time_tolerance": 0.05,
|
||||
"evals_set": false,
|
||||
"samples": 50,
|
||||
"evals": 1,
|
||||
"gcsample": false,
|
||||
"seconds": 28800.0,
|
||||
"overhead": 0.0,
|
||||
"memory_tolerance": 0.01
|
||||
}
|
||||
],
|
||||
"times": [
|
||||
5.4294504010681e13
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": [
|
||||
"GPUTranspiler"
|
||||
]
|
||||
}
|
||||
],
|
||||
"GPUI": [
|
||||
"BenchmarkGroup",
|
||||
{
|
||||
"data": {
|
||||
"nikuradse_1": [
|
||||
"Trial",
|
||||
{
|
||||
"allocs": 1825331206,
|
||||
"gctimes": [
|
||||
1.8938185191e10,
|
||||
1.7792800779e10,
|
||||
1.8160529276e10,
|
||||
1.7946505031e10,
|
||||
1.77973843e10,
|
||||
1.7616008261e10,
|
||||
1.7620413248e10,
|
||||
1.768910028e10,
|
||||
1.772636066e10,
|
||||
1.7706216778e10,
|
||||
1.8173891003e10,
|
||||
1.7667273912e10,
|
||||
1.7526904901e10,
|
||||
1.749445276e10,
|
||||
1.7567194654e10,
|
||||
1.7649119926e10,
|
||||
1.7639951452e10,
|
||||
1.7533807088e10,
|
||||
1.7517726514e10,
|
||||
1.7626783198e10,
|
||||
1.7511788769e10,
|
||||
1.7492068732e10,
|
||||
1.7553945009e10,
|
||||
1.7478083952e10,
|
||||
1.7437663283e10,
|
||||
1.7472329594e10,
|
||||
1.7519969261e10,
|
||||
1.7519953931e10,
|
||||
1.7526082936e10,
|
||||
1.751558218e10,
|
||||
1.7402059945e10,
|
||||
1.7250338348e10,
|
||||
1.7250474046e10,
|
||||
1.7291033872e10,
|
||||
1.7551432788e10,
|
||||
1.7850397239e10,
|
||||
1.7847877387e10,
|
||||
1.7447038841e10,
|
||||
1.754309134e10,
|
||||
1.7566433958e10,
|
||||
1.7503437877e10,
|
||||
1.7647987775e10,
|
||||
1.7401002748e10,
|
||||
1.7385713445e10,
|
||||
1.7385171642e10,
|
||||
1.7348026466e10,
|
||||
1.7438744763e10,
|
||||
1.7309013112e10,
|
||||
1.7577725655e10,
|
||||
1.7432755306e10
|
||||
],
|
||||
"memory": 115414870368,
|
||||
"params": [
|
||||
"Parameters",
|
||||
{
|
||||
"gctrial": true,
|
||||
"time_tolerance": 0.05,
|
||||
"evals_set": false,
|
||||
"samples": 50,
|
||||
"evals": 1,
|
||||
"gcsample": false,
|
||||
"seconds": 28800.0,
|
||||
"overhead": 0.0,
|
||||
"memory_tolerance": 0.01
|
||||
}
|
||||
],
|
||||
"times": [
|
||||
5.31951749725e11,
|
||||
5.31404501757e11,
|
||||
5.33657147801e11,
|
||||
5.31489160462e11,
|
||||
5.30386250505e11,
|
||||
5.30026023598e11,
|
||||
5.29887080071e11,
|
||||
5.34175638749e11,
|
||||
5.32476620162e11,
|
||||
5.32276123554e11,
|
||||
5.43002738488e11,
|
||||
5.30251592144e11,
|
||||
5.30190125835e11,
|
||||
5.28451973319e11,
|
||||
5.30828202555e11,
|
||||
5.29236820908e11,
|
||||
5.3205118374e11,
|
||||
5.30259980405e11,
|
||||
5.29369982343e11,
|
||||
5.29968522607e11,
|
||||
5.29094509442e11,
|
||||
5.3023736481e11,
|
||||
5.3026832017e11,
|
||||
5.30138026522e11,
|
||||
5.30291814111e11,
|
||||
5.28886430445e11,
|
||||
5.30786719418e11,
|
||||
5.31872294453e11,
|
||||
5.29735616869e11,
|
||||
5.32322531477e11,
|
||||
5.32945923244e11,
|
||||
5.28063077052e11,
|
||||
5.26379810748e11,
|
||||
5.2904720469e11,
|
||||
5.33989526381e11,
|
||||
5.37245240551e11,
|
||||
5.37790009675e11,
|
||||
5.30206196299e11,
|
||||
5.30276314709e11,
|
||||
5.30385782035e11,
|
||||
5.29114269928e11,
|
||||
5.31785585619e11,
|
||||
5.28768646361e11,
|
||||
5.27012226469e11,
|
||||
5.26681637262e11,
|
||||
5.28646301524e11,
|
||||
5.27917175176e11,
|
||||
5.28633753225e11,
|
||||
5.29807712794e11,
|
||||
5.27063144055e11
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": [
|
||||
"GPUInterpreter"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"tags": []
|
||||
}
|
||||
]
|
||||
]
|
||||
]
|
1
package/test/results-fh-new/cpu.json
Normal file
@ -0,0 +1 @@
|
||||
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"nikuradse_1":["Trial",{"allocs":36814947,"gctimes":[1.082739415e9,9.35589349e8,8.95739997e8,8.82797331e8,8.44175578e8,8.27278981e8,8.24664534e8,8.41590342e8,8.23430705e8,8.26304622e8,8.7328356e8,8.48151374e8,8.20769383e8,8.36210366e8,8.25357919e8,8.18247354e8,8.05126298e8,8.10738655e8,8.14534413e8,8.05974078e8,8.08104945e8,8.07549224e8,8.11047079e8,8.36937224e8,8.19217772e8,8.03258649e8,8.00177357e8,8.05390572e8,7.81551092e8,7.84470283e8,7.84717493e8,7.87670826e8,7.91518273e8,7.95865535e8,7.9488509e8,7.85908564e8,7.96303832e8,7.83015419e8,7.98406799e8,7.95693404e8,7.89571842e8,7.87009536e8,7.92931167e8,8.0354065e8,8.01147304e8,7.90650725e8,7.91114336e8,8.14447424e8,8.09202389e8,8.0150787e8],"memory":19327142456,"params":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":50,"evals":1,"gcsample":false,"seconds":28800.0,"overhead":0.0,"memory_tolerance":0.01}],"times":[1.11960461697e11,1.12658407743e11,1.11797123654e11,1.14086430365e11,1.12540701243e11,1.13057199848e11,1.12421343743e11,1.12335917668e11,1.11873753956e11,1.12087309285e11,1.15372551368e11,1.12857587668e11,1.12212954999e11,1.12352839748e11,1.12799090735e11,1.12712852105e11,1.11910175268e11,1.12890418194e11,1.12536406676e11,1.12333546234e11,1.12414119618e11,1.12632975657e11,1.12274854817e11,1.13642350405e11,1.13191424262e11,1.12623305956e11,1.12519637206e11,1.12733882055e11,1.13175515626e11,1.12499258654e11,1.12175542007e11,1.14221603568e11,1.12620900601e11,1.12996891317e11,1.12370260538e11,1.12760626809e11,1.13153933145e11,1.12762108936e11,1.12758858333e11,1.13381876923e11,1.12152161607e11,1.12831962905e11,1.12135760011e11,1.14343808852e11,1.12720432473e11,1.13061653545e11,1.12414150523e11,1.13142168741e11,1.12805546557e11,1.13053409368e11]}]},"tags":["CPUInterpreter"]}]},"tags":[]}]]]
|
@ -16,9 +16,9 @@ include(joinpath(baseFolder, "src", "Transpiler.jl"))
|
||||
end
|
||||
|
||||
|
||||
# @testset "CPU Interpreter" begin
|
||||
# include("CpuInterpreterTests.jl")
|
||||
# end
|
||||
@testset "CPU Interpreter" begin
|
||||
# include("CpuInterpreterTests.jl")
|
||||
end
|
||||
|
||||
@testset "Performance tests" begin
|
||||
# include("PerformanceTuning.jl")
|
||||
|
@ -1,5 +1,3 @@
|
||||
RE-READ to ensure that concepts why this is done to improve performance and why this should be the "locally best" implementation (most should be in implementation though)
|
||||
|
||||
\chapter{Concept and Design}
|
||||
\label{cha:conceptdesign}
|
||||
% introduction to what needs to be done. also clarify terms "Host" and "Device" here
|
||||
@ -29,7 +27,9 @@ The main goal of both prototypes or evaluators is to provide a speed-up compared
|
||||
|
||||
With this, the required capabilities are outlined. However, for a better understanding, the input and output data need to be explained further. The first input contains the expressions that need to be evaluated. These can be of any length and can contain constant values, variables and parameters, all of which are linked together with the supported operations. In the simplified example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ to $e_6$.
|
||||
|
||||
Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example the values in row three, are used to parameterise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using each set of variable. In the provided example, there are three variable sets, each containing the values for four variables $x_1$ to $x_4$. After all expressions have been evaluated using all variable sets, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one variable set. The row indicates the variable set and the column indicates the expression.
|
||||
Next is the variable matrix. An entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example, the values in row three are used to parameterise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using each set of variables. In the provided example, there are three variable sets, each containing the values for four variables $x_1$ to $x_4$.
|
||||
|
||||
After all expressions have been evaluated using all variable sets, the results of these evaluations must be stored in the result matrix. Each entry in this matrix holds the result of the evaluation of one expression parameterised with one variable set. The row indicates the variable set and the column indicates the expression.
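To make the dimensions concrete, the example above corresponds to the following shapes (an illustrative Julia sketch, not code from the prototypes):
\begin{verbatim}
# 4 variables, 3 variable sets and 6 expressions: one column of X per variable set,
# one entry of the result matrix per (variable set, expression) pair.
nvars, nsets, nexprs = 4, 3, 6
X = rand(Float32, nvars, nsets)
results = Matrix{Float32}(undef, nsets, nexprs)
\end{verbatim}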
|
||||
|
||||
The prototypes developed in this thesis are part of a GP algorithm for symbolic regression. This means that the evaluated expressions represent parts of the search space of all expressions that can be built from any combination of allowed operators, the set of input variables, a set of parameters and constants. Consequently, the size of the search space grows exponentially. Exploring this search space by simply generating expressions, evaluating them once and then generating the next set of expressions leaves much of the search space unexplored. To combat this, parameters are introduced, which allow the algorithm to perform some kind of local search. To enable this, the prototypes must support not only variables, but also parameters.
|
||||
|
||||
@ -42,7 +42,7 @@ Usually, the number of variables per expression is around ten. However, the numb
|
||||
These variables do not change during the runtime of the symbolic regression algorithm. As a result, the data only needs to be sent to the GPU once, which means that the impact of this data transfer is minimal. On the other hand, the data for the parameters is much more volatile. As explained above, they are used for parameter optimisation and therefore vary from evaluation to evaluation and need to be sent to the GPU very frequently. However, the amount of data that needs to be sent is also much smaller. TODO: ONCE I GET THE DATA SEE HOW MANY BYTES PARAMETERS TAKE ON AVERAGE
|
||||
|
||||
\section{Architecture}
|
||||
|
||||
\label{sec:architecture}
|
||||
Based on the requirements and data structure above, the architecture of both prototypes can be designed. While the requirements only specify the input and output, the components and workflow also need to be specified. This section aims at giving an architectural overview of both prototypes, alongside their design decisions.
|
||||
|
||||
\begin{figure}
|
||||
@ -52,9 +52,10 @@ Based on the requirements and data structure above, the architecture of both pro
|
||||
\label{fig:kernel_architecture}
|
||||
\end{figure}
|
||||
|
||||
A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU.
|
||||
A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which therefore allows up to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression is the generation itself. Especially for smaller variable sets, it is possible that the time it takes to transpile an expression is greater than the time it takes to evaluate it. However, for larger variable sets this should not be a concern.
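To illustrate this dispatch pattern, the following CUDA.jl sketch launches one (deliberately trivial) kernel per expression. It is illustrative only and uses placeholder names rather than the prototypes' actual kernels; each launch fills one column of the result matrix, where a real evaluator would interpret or run transpiled code instead.
\begin{verbatim}
using CUDA

# Placeholder kernel: stands in for evaluating one expression over all variable sets.
function fill_column!(out, value)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = value
    end
    return nothing
end

nvarsets = 1_000
nexprs   = 4
results  = CUDA.zeros(Float32, nvarsets, nexprs)
for k in 1:nexprs                      # one kernel dispatch per "expression"
    col     = view(results, :, k)      # contiguous column view
    threads = 256
    blocks  = cld(nvarsets, threads)
    @cuda threads=threads blocks=blocks fill_column!(col, Float32(k))
end
CUDA.synchronize()
\end{verbatim}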
|
||||
|
||||
\subsection{Pre-Processing}
|
||||
\label{sec:pre-processing}
|
||||
The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator.
|
||||
|
||||
The conversion into the intermediate representation transforms the expressions from infix notation into postfix notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix notation, it also contains information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look as depicted in Figure \ref{fig:pre-processing_results} after the pre-processing step.
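As a brief worked example beyond the figure: the expression $x_1 + 2.5 \cdot p_1$ becomes the postfix token sequence $x_1 \;\; 2.5 \;\; p_1 \;\; \cdot \;\; +$, which an evaluator processes from left to right with a small stack: push $x_1$, push $2.5$, push $p_1$, replace the top two entries with their product, and finally replace the remaining two entries with their sum.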
|
||||
|
@ -7,6 +7,12 @@ talk again how a typical input is often not complex enough (basically repeat tha
|
||||
\section{Future Work}
|
||||
talk about what can be improved
|
||||
|
||||
Transpiler: transpile expression directly from Julia AST -> would save time because no intermediate representation needs to be created (looses step and gains performance, but also makes transpiler itself more complex)
|
||||
Frontend:
|
||||
1.) extend frontend to support ternary operators (basically if the frontend sees a multiplication and an addition it should collapse them to an FMA instruction)
|
||||
|
||||
Transpiler:
|
||||
1.) transpile expression directly from Julia AST -> would save time because no intermediate representation needs to be created (loses a step and gains performance, but also makes the transpiler itself more complex; since expressions do not need to be sent to the GPU, the IR theoretically isn't needed)
|
||||
2.) Better register management strategy might be helpful -> look into register pressure etc.
|
||||
|
||||
|
||||
CPU Interpreter: Probably more worthwhile to dive into parallelising the CPU interpreter itself (not really future work, as you wouldn't write a paper about that); see the sketch below
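A minimal sketch of such a parallelisation with standard Julia threading (assuming interpret_cpu accepts any subset of the expressions and returns one column per expression; this is not an implemented feature of the package):
\begin{verbatim}
using Base.Threads

# Split the expressions across threads and evaluate each chunk with the CPU interpreter.
chunks  = collect(Iterators.partition(eachindex(exprs), cld(length(exprs), nthreads())))
tasks   = [Threads.@spawn interpret_cpu(exprs[c], X, parameters[c]) for c in chunks]
results = reduce(hcat, fetch.(tasks))
\end{verbatim}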
|
@ -1,36 +1,395 @@
|
||||
\chapter{Implementation}
|
||||
\label{cha:implementation}
|
||||
|
||||
somewhere in here explain why one kernel per expression and not one kernel for all expressions
|
||||
This chapter focuses on the implementation phase of the project, building upon the concepts and designs previously discussed. It begins with an overview of the technologies employed for both the CPU and GPU parts of the application. This is followed by a description of the pre-processing or frontend phase. The chapter concludes with a detailed overview of the core components, the interpreter and the transpiler.
|
||||
|
||||
Go into the details why this implementation is tuned towards performance and should be the optimum at that
|
||||
% Go into the details why this implementation is tuned towards performance and should be the optimum at that
|
||||
|
||||
\section{Technologies}
|
||||
Short section; CUDA, PTX, Julia, CUDA.jl
|
||||
This section describes the technologies used for both the CPU side of the prototypes and the GPU side. The rationale behind these choices, including consideration of their performance implications, is presented. In addition, the hardware limitations imposed by the choice of GPU technology are outlined.
|
||||
|
||||
Probably reference the performance evaluation papers for Julia and CUDA.jl
|
||||
\subsection{CPU side}
|
||||
Both prototypes were implemented using the Julia programming language. It was chosen mainly because the current symbolic regression algorithm is also implemented in Julia. Being a high-level programming language with modern features such as a garbage collector, support for meta-programming and dynamic typing, it also offers great convenience to the developer.
|
||||
|
||||
\section{Expression Processing}
|
||||
Talk about why this needs to be done and how it is done (the why is basically: simplifies evaluation/transpilation process; the how is in ExpressionProcessing.jl)
|
||||
More interesting, however, is the high performance that can be achieved with this language. It is possible to achieve high performance despite the supported modern features, which are often deemed to be harmful to performance. \textcite{bezanson_julia_2017} have shown how Julia can provide C-like performance while supporting the developer with modern quality of life features. The ability of Julia to be used in high performance computing scenarios and to be competitive with C has been demonstrated by \textcite{lin_comparing_2021}. This shows how Julia is a good and valid choice for scenarios where developer comfort and C-like performance are needed.
|
||||
|
||||
\subsection{GPU side}
|
||||
In addition to a programming language for the CPU, a method for programming the GPU is also required. For this purpose, the CUDA API was chosen. While CUDA offers robust capabilities, it is important to note that it is exclusively compatible with Nvidia GPUs. An alternative would have been OpenCL, which provides broader compatibility by supporting GPUs from Nvidia, AMD and Intel. However, considering Nvidia's significant market share and the widespread adoption of CUDA in the industry, the decision was made to use CUDA.
|
||||
|
||||
A typical CUDA program is primarily written in C++, and Nvidia also provides their CUDA compiler nvcc\footnote{\url{https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/}} for C and C++; their official CUDA programming guide \parencite{nvidia_cuda_2025} also uses C++ for code examples. It is also possible to call C++ code from within Julia. This would allow for writing the kernel and interacting with the GPU in C++, leveraging the knowledge built up over several years.
|
||||
|
||||
\subsubsection{CUDA and Julia}
|
||||
Instead of writing the kernel in C++ and calling it from Julia, a much simpler and effective alternative can be used. The Julia package CUDA.jl\footnote{\url{https://cuda.juliagpu.org/}} enables a developer to write a kernel in Julia similar to how a kernel is written in C++ with CUDA. One drawback of using CUDA.jl, however, is that it is much newer than CUDA and therefore does not have years of testing and bug fixing in its history, which might be a concern for some applications. Apart from writing kernels with CUDA.jl, it also offers a method for interacting with the driver to compile PTX code into machine code. This is a must-have feature, as otherwise it would not have been possible to fully develop the transpiler in Julia.
|
||||
|
||||
Additionally, the JuliaGPU initiative\footnote{\url{https://juliagpu.org/}} offers a collection of additional packages to enable GPU development for AMD, Intel and Apple and not just for Nvidia. However, CUDA.jl is also the most mature of the available implementations, which is also a reason why CUDA has been chosen instead of for example OpenCL.
|
||||
|
||||
Again, the question arises if the performance of CUDA.jl is sufficient to be used as an alternative to C++ and CUDA. Performance studies by \textcite{besard_rapid_2019}, \textcite{lin_comparing_2021} and \textcite{faingnaert_flexible_2022} have demonstrated that CUDA.jl provides sufficient performance. They found that in some cases CUDA.jl was able to perform better than the same algorithm implemented in C and C++. This provides the confidence that Julia alongside CUDA.jl is a good choice for leveraging the performance of GPUs to speed-up expression evaluation.
|
||||
|
||||
\section{Pre-Processing}
|
||||
% Talk about why this needs to be done and how it is done (the why is basically: simplifies evaluation/transpilation process; the how is in ExpressionProcessing.jl (the why is probably not needed because it is explained in concept and design))
|
||||
The pre-processing or frontend step is very important. As already explained in Chapter \ref{cha:conceptdesign}, it is responsible for ensuring that the given expressions are valid and that they are transformed into an intermediate representation. This section aims to explain how the intermediate representation is implemented, as well as how it is generated from a mathematical expression.
|
||||
|
||||
\subsection{Intermediate Representation}
|
||||
\label{sec:ir}
|
||||
% Talk about how it looks and why it was chosen to look like this
|
||||
The intermediate representation is mainly designed to be lightweight and easily transferrable to the GPU. Since the interpreter runs on the GPU, this was a very important consideration. Because the transpilation process is done on the CPU, and is therefore very flexible in terms of the intermediate representation, the focus was mainly on being efficient for the interpreter.
|
||||
|
||||
The intermediate representation cannot take any form. While it has already been defined that expressions are converted to postfix notation, there are several ways to store the data. The first logical choice is to create an array where each entry represents a token. On the CPU it would be possible to define each entry as a pointer to the token object. Each of these objects could be of a different type, for example one object that holds a constant value while another object holds an operator. In addition, each of these objects could contain its own logic about what to do when it is encountered during the evaluation process. However, on the GPU, this is not possible, as an array entry must hold a value and not a pointer to another memory location. Furthermore, even if it were possible, it would be a bad idea. As explained in Section \ref{sec:memory_model}, when loading data from global memory, larger chunks are retrieved at once. If the data is scattered across the GPU's global memory, a lot of unwanted data will be transferred. This can be seen in Figure \ref{fig:excessive-memory-transfer}: if the data is stored sequentially, far fewer load operations are required and far less data needs to be transferred.
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=.9\textwidth]{excessive_memory_transfer.png}
|
||||
\caption{Loading data from global memory on the GPU always loads 32, 64 or 128 bytes (see Section \ref{sec:memory_model}). If pointers were supported and data were scattered around global memory, many more data load operations would be required. Additionally, much more unwanted data would be loaded.}
|
||||
\label{fig:excessive-memory-transfer}
|
||||
\end{figure}
|
||||
|
||||
Because of this and because the GPU does not allow pointers, another solution is required. Instead of storing pointers to objects of different types in an array, it is possible to store one object with meta information. The object thus contains the type of the stored value, and the value itself, as described in Section \ref{sec:pre-processing}. The four types that need to be stored in this object differ significantly in the value they represent.
|
||||
|
||||
Variables and parameters are very simple to store. Because they represent indices to the variable matrix or the parameter vector, this (integer) index can be stored as is in the value property of the object. The type can then be used to determine whether it is an index to a variable or a parameter access.
|
||||
|
||||
Constants are also very simple, as they represent a single 32-bit floating point value. However, because of the variables and parameters, the value property is already defined as an integer and not as a floating point number. Unlike languages like Python, where every number is a floating point number, in Julia they are different types and therefore cannot be stored in the same property. Creating a second property for constants only is not feasible, as this would introduce four additional bytes per object that need to be sent to the GPU but would, most of the time, not contain a defined value.
|
||||
|
||||
To avoid sending unnecessary bytes, a mechanism provided by Julia called reinterpret can be used. This allows the bits of a variable of one type to be treated as the bits of another type. The bits used to represent a floating point number are then interpreted as an integer and can be stored in the same property. On the GPU, the same concept can be applied to reinterpret the integer value as a floating point value for further calculations. This is also the reason why the original type of the value needs to be stored alongside the value, so that the stored value can be interpreted correctly and the expressions can be evaluated correctly.
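The following sketch illustrates this mechanism in plain Julia; the type and field names are placeholders and not the package's actual API.
\begin{verbatim}
# Hypothetical token object: one Int32 value slot shared by all token types.
@enum TokenType VARIABLE=1 PARAMETER=2 CONSTANT=3 OPERATOR=4

struct Token
    type::TokenType
    value::Int32   # variable/parameter index, opcode, or the bits of a Float32 constant
end

const_token = Token(CONSTANT, reinterpret(Int32, 2.5f0))  # store the bits of 2.5f0
recovered   = reinterpret(Float32, const_token.value)     # == 2.5f0, done again on the GPU
\end{verbatim}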
|
||||
|
||||
Operators are very different from variables, parameters and constants. Because they represent an operation rather than a value, a different way of storing them is required. An operator can be mapped to a number that identifies the operation. For example, if the addition operator is mapped to the integer $1$, then when the evaluator encounters an object of type operator with a value of $1$, it knows which operation to perform. This can be done for all operators, which means they can be stored in the same object using the same value property; only the type needs to be set accordingly. The mapping of an operator to a value is commonly called an operation code, or opcode, and each operator is represented by one opcode.
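Such a mapping could, for example, be expressed as a Julia enumeration. The names and numbering below are purely illustrative and not taken from the actual implementation:
\begin{JuliaCode}
# One possible opcode mapping; the concrete values are an assumption.
@enum Opcode::Int32 begin
    ADD      = 1
    SUBTRACT = 2
    MULTIPLY = 3
    DIVIDE   = 4
    LOG      = 5
end
\end{JuliaCode}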
|
||||
|
||||
With this, the intermediate representation is defined. Figure \ref{fig:pre-processing-result-impl} shows how a simple expression looks after the pre-processing step. Note that the value $2.5$ has been reinterpreted as an integer, resulting in the seemingly random value.
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=.9\textwidth]{pre-processing_result_impl.png}
|
||||
\caption{The expression $x_1 + 2.5$ after it has been converted to the intermediate representation. Note that the constant value $2.5$ stores a seemingly random value due to it being reinterpreted as an integer.}
|
||||
\label{fig:pre-processing-result-impl}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\subsection{Processing}
|
||||
Now that the intermediate representation has been defined, the processing step can be implemented. This section describes the structure of the expressions and how they are processed. It also explains the process of parsing the expressions to ensure their validity and converting them into the intermediate representation.
|
||||
|
||||
\subsubsection{Expressions}
|
||||
With the pre-processing step, the first modern feature of Julia has been used. As already mentioned, Julia provides extensive support for meta-programming, which is important for this step. Julia represents its own code as a data structure, which allows a developer to manipulate the code at runtime. The code is stored in the so-called Expr object as an Abstract Syntax Tree (AST), which is the most minimal tree representation of a given expression. As a result, mathematical expressions can also be represented as such an Expr object instead of a simple string, which is a major benefit because these expressions can then be easily manipulated by the symbolic regression algorithm. This is the main reason why the pre-processing step requires the expressions to be provided as an Expr object instead of a string.
|
||||
|
||||
Another major benefit of the expressions being stored in the Expr object, and therefore as an AST, is the included operator precedence. Because it is a tree where the leaves are the constants, variables or parameters (also called terminal symbols) and the inner nodes are the operators, the correct result is calculated when evaluating the tree from bottom to top. As can be seen in Figure \ref{fig:expr-ast}, the expression $1 + x_1 \, \log(p_1)$, when parsed as an AST, contains the correct operator precedence: first the bottom-most subtree $\log(p_1)$ must be evaluated, then the multiplication, and only after that the addition.
|
||||
|
||||
It should be noted, however, that Julia stores the children of a node in an array to allow a node to have as many children as necessary. For example, the expression $1+2+\dots+n$ contains only additions, which is a commutative operation, meaning that the order of operations is irrelevant. The AST for this expression contains the operator at the first position in the array and the values at the following positions. This ensures that the AST is as minimal as possible.
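This flattening can be observed directly in the Julia REPL; the following output is shown for illustration only:
\begin{GenericCode}[numbers=none]
julia> dump(:(1 + 2 + 3 + 4))
Expr
  head: Symbol call
  args: Array{Any}((5,))
    1: Symbol +
    2: Int64 1
    3: Int64 2
    4: Int64 3
    5: Int64 4
\end{GenericCode}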
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=.45\textwidth]{expr_ast.png}
|
||||
\caption{The AST for the expression $1 + x_1 \, \log(p_1)$ as generated by Julia. Some additional details Julia includes in its AST have been omitted as they are not relevant.}
|
||||
\label{fig:expr-ast}
|
||||
\end{figure}
|
||||
|
||||
\subsubsection{Parsing}
|
||||
To convert the AST of an expression into the intermediate representation, a top-down traversal of the tree is required. The steps for this are as follows:
|
||||
|
||||
\begin{enumerate}
|
||||
\item Extract the operator and convert it to its opcode for later use.
|
||||
\item Convert all constants, variables and parameters and operators to the object (expression element) described in Section \ref{sec:ir}.
|
||||
\item Append the expression elements to the postfix expression.
|
||||
\item If the operator is a binary operator and there are more than two expression elements, append the operator after the first two elements and then after each subsequent element.
|
||||
\item If a subtree exists, apply all previous steps and append it to the existing postfix expression.
|
||||
\item Append the operator to the postfix expression.
|
||||
\item Return the generated postfix expression/intermediate representation.
|
||||
\end{enumerate}
|
||||
|
||||
The validation of the expression is performed throughout the parsing process. Checking that only supported operators are used happens in step 1: to convert an operator to its corresponding opcode, an opcode must exist for that operator, which in turn validates it. Similarly, converting the tokens into expression element objects ensures that only valid variables and parameters are present in the expression. This is handled in step 2.
|
||||
|
||||
As explained above, a node of a binary operator can have $n$ children. In these cases, additional handling is required to ensure correct conversion. This handling is summarised in step 4. Essentially, the operator must be added after the first two elements, and for each subsequent element, the operator must also be added. The expression $1+2+3+4$ is converted to the AST $+\,1\,2\,3\,4$ and without step 4 the postfix expression would be $1\,2\,3\,4\,+$. If the operator is added after the first two elements and then after each subsequent element, the correct postfix expression $1\,2\,+\,3\,+\,4\,+$ will be generated.
|
||||
|
||||
Each subtree of the AST is itself a separate AST, which can be converted to postfix notation in the same way as the whole AST. This means that the algorithm only needs to handle leaf nodes directly, and when it encounters a subtree, it recursively calls itself to parse the remaining AST. Step 5 reflects this recursive behaviour.
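To summarise these steps, a simplified Julia sketch of the conversion is given below. It assumes that every inner node is a \textit{call} expression and that the helpers \textit{opcode\_for} and \textit{to\_element}, the type tag \textit{OPERATOR} and the \textit{ExpressionElement} constructor exist in this form; all names are illustrative and not taken from the actual implementation:
\begin{JuliaCode}
function to_postfix(expr::Expr, postfix = ExpressionElement[])
    opcode = opcode_for(expr.args[1])          # step 1: operator -> opcode (also validates it)
    operands = expr.args[2:end]
    for (i, arg) in enumerate(operands)
        if arg isa Expr
            to_postfix(arg, postfix)           # step 5: recurse into the subtree
        else
            push!(postfix, to_element(arg))    # steps 2 and 3: constants, variables, parameters
        end
        # step 4: for n-ary nodes, append the operator after the first two
        # operands and after every further operand
        i >= 2 && push!(postfix, ExpressionElement(OPERATOR, opcode))
    end
    # step 6: unary operators have exactly one operand and still need their opcode
    length(operands) == 1 && push!(postfix, ExpressionElement(OPERATOR, opcode))
    return postfix                             # step 7
end
\end{JuliaCode}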
|
||||
|
||||
While the same expression usually occurs only once, sub-expressions can occur multiple times. In the example in Figure \ref{fig:expr-ast}, the whole expression $1 + x_1 \, \log(p_1)$ is unlikely to be generated more than once by the symbolic regression algorithm. However, the sub-expression $\log(p_1)$ is much more likely to be generated multiple times. This means that the generation of the intermediate representation for this subtree only needs to be done once and can be reused later. Therefore, a cache can be used to store the intermediate representation for this sub-expression and access it again later to eliminate the parsing overhead.
|
||||
|
||||
Caching can be applied to individual sub-expressions as well as to the entire expression. While it is unlikely for the whole expression to recur frequently, either on its own or as part of a larger expression, implementing a cache will not degrade performance and will, in fact, enhance it if repetitions do occur. In the context of parameter optimisation, where the evaluators are employed, expressions will recur, making full-expression caching advantageous. The primary drawback of caching is the increased use of RAM. However, given that RAM is plentiful in modern systems, this should not pose a significant issue.
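A minimal sketch of such a cache, building on the hypothetical \textit{to\_postfix} function from above, could look as follows:
\begin{JuliaCode}
# Cache keyed by the expression object; Expr supports hashing and equality.
const parseCache = Dict{Expr, Vector{ExpressionElement}}()

function to_postfix_cached(expr::Expr)
    return get!(parseCache, expr) do
        to_postfix(expr)   # only parsed on a cache miss
    end
end
\end{JuliaCode}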
|
||||
|
||||
\section{Interpreter}
|
||||
% Talk about how the interpreter has been developed.
|
||||
The implementation is divided into two main components, the CPU-based control logic and the GPU-based interpreter as outlined in the Concept and Design chapter. This section aims to describe the technical details of these components. First the CPU-based control logic will be discussed. This component handles the communication with the GPU and is the entry point which is called by the symbolic regression algorithm. Following this, the GPU-based interpreter will be explored, highlighting the specifics of developing an interpreter on the GPU.
|
||||
|
||||
% UML sequence diagram
|
||||
An overview of how these components interact with each other is outlined in Figure \ref{fig:interpreter-sequence}. The parts of this figure are explained in detail in the following sections.
|
||||
|
||||
% main loop; kernel transpiled by CUDA.jl into PTX and then executed
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=.95\textwidth]{interpreter_sequence_diagram.png}
|
||||
\caption{The sequence diagram of the interpreter.}
|
||||
\label{fig:interpreter-sequence}
|
||||
\end{figure}
|
||||
|
||||
% Memory access (currently global memory only)
|
||||
% no dynamic memory allocation like on CPU (stack needs to have fixed size)
|
||||
\subsection{CPU Side}
|
||||
The interpreter is given all the expressions it needs to interpret as an input. Additionally, it needs the variable matrix as well as the parameters for each expression. All expressions are passed to the interpreter as an array of Expr objects, as this is the format required by the pre-processing step, or frontend. The first loop, as shown in Figure \ref{fig:interpreter-sequence}, is responsible for sending the expressions to the frontend to be converted into the intermediate representation. After this step, the expressions are in the correct format to be sent to the GPU and the interpretation process can continue.
|
||||
|
||||
\subsubsection{Data Transfer}
|
||||
Before the GPU can start with the interpretation, the data needs to be sent to the GPU. Because the variables are already in matrix form, transferring the data is fairly straightforward. Memory must be allocated in the global memory of the GPU and then be copied from RAM into the allocated memory. Allocating memory and transferring the data to the GPU is handled implicitly by the CuArray type provided by CUDA.jl.
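Assuming the variables are already stored in a Matrix{Float32}, a single call is sufficient; the variable names below are only illustrative:
\begin{JuliaCode}
using CUDA

cudaVars = CuArray(variables)  # allocates GPU global memory and copies the data
\end{JuliaCode}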
|
||||
|
||||
To optimise the interpreter for parameter optimisation workloads, this step is actually performed before the interpreter is called. Although the diagram includes this transmission for completeness, it is important to note that the variables never change, as they represent the observed inputs of the system that is being modelled by the symbolic regression algorithm. Therefore, re-transmitting the variables for each step of the parameter optimisation process would be inefficient. By transmitting the variables once and reusing them throughout the parameter optimisation, significant time can be saved.
|
||||
|
||||
Furthermore, transferring the data to the GPU before the symbolic regression algorithm begins could save even more time. However, this approach would require modifications to the symbolic regression algorithm. Therefore, the decision has been made to neglect this optimisation. Nonetheless, it is still possible to modify the implementation at a later stage with minimal effort, if needed.
|
||||
|
||||
Once the variables are transmitted, the parameters must also be transferred to the GPU. Unlike the variables, the parameters are stored as a vector of vectors. In order to transmit the parameters efficiently, they also need to be brought into matrix form. The matrix needs to be of the form $k \times N$, where $k$ is equal to the length of the longest inner vector and $N$ is equal to the length of the outer vector. This ensures that all values can be stored in the matrix. It also means that if the inner vectors are of different lengths, some extra unnecessary values will be transmitted, but the overall benefit of treating them as a matrix outweighs this drawback. Program \ref{code:julia_vec-to-mat} shows how this conversion can be implemented. Note that it is required to provide an invalid element. This ensures defined behaviour and helps with finding errors in the code. After the parameters have been brought into matrix form, they can be transferred to the GPU the same way the variables are transferred.
|
||||
|
||||
\begin{program}
|
||||
\begin{GenericCode}
|
||||
function convert_to_matrix(vecs::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
|
||||
maxLength = get_max_inner_length(vecs)
|
||||
|
||||
# Pad the shorter vectors with the invalidElement to make all equal length
|
||||
paddedVecs = [vcat(vec, fill(invalidElement, maxLength - length(vec))) for vec in vecs]
|
||||
vecMat = hcat(paddedVecs...) # transform vector of vectors into column-major matrix
|
||||
|
||||
return vecMat
|
||||
end
|
||||
|
||||
function get_max_inner_length(vecs::Vector{Vector{T}})::Int where T
|
||||
return maximum(length.(vecs))
|
||||
end
|
||||
\end{GenericCode}
|
||||
\caption{A Julia program fragment depicting the conversion from a vector of vectors into a matrix of the form $k \times N$. }
|
||||
\label{code:julia_vec-to-mat}
|
||||
\end{program}
|
||||
|
||||
Similar to the parameters, the expressions are also stored as a vector of vectors. The outer vector contains each expression, while the inner vectors hold the expressions in their intermediate representation. Therefore, this vector of vectors also needs to be brought into matrix form in the same way as the parameters. To simplify development, the special opcode \textit{stop} has been introduced, which is used as the invalidElement in Program \ref{code:julia_vec-to-mat}. As will be seen in Section \ref{sec:interpreter-gpu-side}, this element is used to determine whether the end of an expression has been reached during the interpretation process. This removes the need to additionally send the length of each expression to determine whether the entire expression has been interpreted, which reduces a lot of overhead.
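A possible usage of the conversion from Program \ref{code:julia_vec-to-mat} for this purpose is sketched below; the construction of the \textit{stop} element is an assumption and only serves as an illustration:
\begin{JuliaCode}
# Pad all expressions with the stop element and transfer them to the GPU.
stopElement = ExpressionElement(STOP, 0)  # hypothetical constructor: type stop, unused value
exprMatrix  = convert_to_matrix(intermediateExprs, stopElement)
cudaExprs   = CuArray(exprMatrix)
\end{JuliaCode}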
|
||||
|
||||
Once the conversion into matrix form has been performed, the expressions are transferred to the GPU. Just like with the variables, the expressions remain the same over the course of the parameter optimisation part. Therefore, they are transferred to the GPU before the interpreter is called, to reduce the amount of unnecessary data transfer.
|
||||
|
||||
In addition to the already described data that needs to be sent, two more steps are required that have not been included in the sequence diagram in Figure \ref{fig:interpreter-sequence}. The first one is the allocation of global memory for the result matrix. Without this, the kernel would not know where to store the interpretation results and the CPU would not know from which memory location to read the results. Therefore, enough global memory needs to be allocated beforehand so that the results can be stored and retrieved after all kernel executions have finished.
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=.9\textwidth]{memory_layout_data.png}
|
||||
\caption{The expressions, variables and parameters as they are stored in the GPU's global memory. Note that while on the CPU they are stored as matrices, on the GPU they are only three arrays of data. The thick lines represent where a new column, and therefore a new set of data, begins.}
|
||||
\label{fig:memory-layout-data}
|
||||
\end{figure}
|
||||
|
||||
Only raw data can be sent to the GPU, which means that information about the data is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to let the kernel know the dimensions of the expressions, variables and parameters. Otherwise, the kernel does not know at which memory location the second variable set is stored, as it does not know, for example, how large a single set is. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data, begins. However, the GPU has no knowledge of this and therefore the additional information must be transferred to ensure that the data is accessed correctly.
|
||||
|
||||
\subsubsection{Kernel Dispatch}
|
||||
Once all the data is present on the GPU, the CPU can dispatch the kernel for each expression. This dispatch requires parameters that specify the number of threads and their organisation into thread blocks. In total, one thread is required for each variable set, and therefore the grouping into thread blocks is the primary tunable parameter. Taking into account the constraints explained in Section \ref{sec:occupancy}, this grouping needs to be tuned for optimal performance. The specific values alongside the methodology for determining these values will be explained in Chapter \ref{cha:evaluation}.
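As an illustration, the dispatch parameters could be computed as follows; the block size of 256 is only a placeholder for the tuned value:
\begin{JuliaCode}
threadsPerBlock = 256                                   # tuning parameter
numBlocks = cld(numberOfVariableSets, threadsPerBlock)  # ceiling division
\end{JuliaCode}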
|
||||
|
||||
In addition, the dispatch parameters also include the pointers to the location of the data allocated and transferred above, as well as the index of the expression to be interpreted. Since all expressions and parameters are sent to the GPU at once, this index ensures that the kernel knows where in memory to find the expression it needs to interpret and which parameter set it needs to use. After the kernel has finished, the result matrix needs to be read from the GPU and passed back to the symbolic regression algorithm.
|
||||
|
||||
Crucially, dispatching a kernel is an asynchronous operation, which means that the CPU does not wait for the kernel to finish before continuing. This allows the CPU to dispatch all kernels at once, rather than one at a time. As explained in Section \ref{sec:architecture}, a GPU can have multiple resident grids, meaning that the dispatched kernels can run concurrently, drastically reducing evaluation times. Only once the result matrix is read from the GPU does the CPU have to wait for all kernels to finish execution.
|
||||
|
||||
\subsection{GPU Side}
|
||||
\label{sec:interpreter-gpu-side}
|
||||
% Memory access (currently global memory only)
|
||||
% no dynamic memory allocation like on CPU (stack needs to have fixed size; also stack is stored in local memory)
|
||||
With the GPU's global memory now containing all the necessary data and the kernel being dispatched, the interpretation process can begin. Before interpreting an expression, the global thread ID must be calculated. This step is crucial because each variable set is assigned to a unique thread. Therefore, the global thread ID determines which variable set should be used for the current interpretation instance.
|
||||
|
||||
Moreover, the global thread ID ensures that excess threads do not perform any work, as otherwise these threads would try to access a variable set that does not exist, leading to an illegal memory access. This is necessary because the number of required threads often does not align perfectly with the number of threads per block multiplied by the number of blocks. If for example $1031$ threads are required, then at least two thread blocks are needed, as one thread block can hold at most $1024$ threads. Because $1031$ is a prime number, it cannot be divided evenly by any practical number of thread blocks. If two thread blocks are allocated, each holding $1024$ threads, a total of $2048$ threads are started. Therefore, the excess $2048 - 1031 = 1017$ threads must be prevented from executing. By using the global thread ID and the number of available variable sets, these excess threads can be easily identified and terminated early in the kernel execution.
|
||||
|
||||
Afterwards the stack for the interpretation can be created. It is possible to dynamically allocate memory on the GPU, which enables a programming model similar to that of the CPU. \textcite{winter_are_2021} have even compared many dynamic memory managers and found that their performance impact is rather small. However, if static allocation can easily be used, it still offers better performance. In the case of this thesis, static allocation is easily possible, which is why the stack has been chosen to have a static size. Because it is known that expressions do not exceed 50 tokens, including the operators, the stack size has been set to 25, which should be more than enough to hold the values and partial results, even in the worst case.
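One way to obtain such a statically sized stack inside a CUDA.jl kernel is a mutable static vector, for example from the StaticArrays.jl package. This is only a sketch of the idea, not the actual implementation:
\begin{JuliaCode}
using StaticArrays

# Inside the kernel: a fixed-size buffer of 25 Float32 values and a top-of-stack index.
stack = MVector{25, Float32}(undef)
top   = 0
\end{JuliaCode}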
|
||||
|
||||
\subsubsection{Main Loop}
|
||||
Once everything is initialised, the main interpreter loop starts interpreting the expression. Because of the intermediate representation, the loop simply iterates through the expression from left to right. On each iteration the type of the current token is checked to decide which operation to perform.
|
||||
|
||||
If the current token type matches the \textit{stop} opcode, the interpreter knows that it is finished. This simplicity is the reason why this opcode was introduced, as explained above.
|
||||
|
||||
More interesting is the case where the current token corresponds to an index into either the variable matrix or the parameter matrix. In this case, the token's value is important. To access one of these matrices, the correct starting index of the set must first be calculated. As previously explained, information about the dimensions of the data is lost during transfer. At this stage, the kernel only knows the index of the first element of either matrix, which set to use for this evaluation, and the index of the value within the current set. However, the boundaries of these sets are unknown. Therefore, the additionally transferred data about the dimensions is used in this step to calculate the index of the first element of each set. With this calculated index and the index stored in the token, the correct value can be loaded. After the value has been loaded, it is pushed to the top of the stack for later use.
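A sketch of this index calculation for a variable access is shown below; the names are illustrative and the actual implementation may differ:
\begin{JuliaCode}
# varSetSize is part of the separately transferred dimension information,
# threadId is the global thread ID, i.e. the current variable set (1-based).
firstIndex = (threadId - 1) * varSetSize
value = variables[firstIndex + token.value]   # token.value: index within the set
top += 1
stack[top] = value
\end{JuliaCode}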
|
||||
|
||||
% MAYBE:
|
||||
% Algorithm that shows how this calculation works
|
||||
|
||||
Constants work very similarly in that the token value is read and added to the top of the stack. However, the constants have been reinterpreted from floating-point values to integers for easy transfer to the GPU. This operation must be reversed before adding the value to the stack as otherwise the wrong values would be used for evaluation.
|
||||
|
||||
The actual evaluation happens when the current token is an operator. The token's value, which serves as the opcode, determines the operation that needs to be performed. If the opcode represents a unary operator, only the top value of the stack needs to be popped for the operation. The operation is then executed on this value and the result is pushed back onto the stack. On the other hand, if the opcode represents a binary operator, the top two values of the stack are popped. These are then used for the operation, and the result is subsequently pushed back onto the stack.
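Continuing the sketch from above, the operator handling could look as follows; the opcode names are the illustrative ones introduced earlier:
\begin{JuliaCode}
if opcode == ADD                     # binary operator: pop two values, push the result
    right = stack[top]; top -= 1
    left  = stack[top]
    stack[top] = left + right
elseif opcode == LOG                 # unary operator: operate on the top of the stack
    stack[top] = log(stack[top])
end
\end{JuliaCode}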
|
||||
|
||||
Support for ternary operators could also be added easily. An example of a ternary operator that would help improve performance is the GPU-supported Fused Multiply-Add (FMA) operator. While this operator does not exist in Julia, the frontend can generate it when it encounters a sub-expression of the form $x * y + z$. Since this performs the multiplication and addition in a single instruction instead of two, it would be a worthwhile optimisation. However, detecting such sub-expressions is complicated, which is why it is not supported in the current implementation.
|
||||
|
||||
Once the interpreter loop has finished, the result of the evaluation must be stored in the result matrix. By using the index of the current expression, as well as the index of the current variable set (the global thread ID) it is possible to calculate the index where the result must be stored. The last value on the stack is the result, which is stored in the result matrix at the calculated location.
|
||||
|
||||
\section{Transpiler}
|
||||
% Talk about how the transpiler has been developed (probably largest section, because it just has more interesting parts)
|
||||
Unlike the interpreter, the transpiler primarily operates on the CPU, with only a minor GPU-based component. This is because the transpiler must generate entire PTX kernels from Julia expressions, rather than simply executing a pre-written kernel like the interpreter. Similar to the interpreter, the CPU side of the transpiler manages communication with both the GPU and the symbolic regression algorithm. This section provides a detailed overview of the transpiler's functionality.
|
||||
|
||||
% UML sequence diagram
|
||||
An overview of how the transpiler interacts with the frontend and GPU is outlined in Figure \ref{fig:transpiler-sequence}. The parts of this figure are explained in detail in the following sections.
|
||||
|
||||
% Front-End and Back-End
|
||||
% Caching of back-end results
|
||||
\begin{figure}
|
||||
\centering
|
||||
\includegraphics[width=.95\textwidth]{transpiler_sequence_diagram.png}
|
||||
\caption{The sequence diagram of the transpiler.}
|
||||
\label{fig:transpiler-sequence}
|
||||
\end{figure}
|
||||
|
||||
% PTX code generated and compiled using CUDA.jl (so basically the driver) and then executed
|
||||
\subsection{CPU Side}
|
||||
After the transpiler has received the expressions to be transpiled, it first sends them to the frontend for processing. Once they have been processed, the expressions are sent to the transpiler backend, which is explained in more detail in Section \ref{sec:transpiler-backend}. The backend is responsible for generating the kernels. The output of the backend is the PTX code of one kernel per expression.
|
||||
|
||||
% Memory access (global memory and register management, especially register management)
|
||||
\subsubsection{Data Transfer}
|
||||
Data is sent to the GPU in the same way as it is sent by the interpreter. The variables are sent as they are, while the parameters are again brought into matrix form. Memory must also be allocated for the result matrix. Unlike the interpreter however, this is the only data that needs to be sent to the GPU for the transpiler.
|
||||
|
||||
Because each expression has its own kernel, there is no need to transfer the expressions themselves. Moreover, there is also no need to send information about the layout of the variables and parameters to the GPU. The reason for this is explained in the transpiler backend section below.
|
||||
|
||||
\subsubsection{Kernel Dispatch}
|
||||
Once all the data is present on the GPU, the transpiled kernels can be dispatched. Dispatching the transpiled kernels is more involved than dispatching the interpreter kernel. Program \ref{code:julia_dispatch-comparison} shows the difference between dispatching the interpreter kernel and the transpiled kernels. An important note is that the transpiled kernels must be manually compiled into machine code. To achieve this, CUDA.jl provides functionality to instruct the driver to compile the PTX code. The same process of creating PTX code and compiling it must also be done for the interpreter kernel; however, this is done automatically by CUDA.jl when calling the @cuda macro in line 6.
|
||||
|
||||
\begin{program}
|
||||
\begin{JuliaCode}
|
||||
# Dispatching the interpreter kernel
|
||||
for i in eachindex(exprs)
|
||||
numThreads = ...
|
||||
numBlocks = ...
|
||||
|
||||
@cuda threads=numThreads blocks=numBlocks fastmath=true interpret(cudaExprs, cudaVars, cudaParams, cudaResults, cudaAdditional)
|
||||
end
|
||||
|
||||
# Dispatching the transpiled kernels
|
||||
for kernelPTX in kernelsPTX
|
||||
# Create linker object, add the code and compile it
|
||||
linker = CuLink()
|
||||
add_data!(linker, "KernelName", kernelPTX)
|
||||
image = complete(linker)
|
||||
|
||||
# Get callable function from compiled result
|
||||
mod = CuModule(image)
|
||||
kernel = CuFunction(mod, "KernelName")
|
||||
|
||||
numThreads = ...
|
||||
numBlocks = ...
|
||||
|
||||
# Dispatching the kernel
|
||||
cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=numThreads, blocks=numBlocks)
|
||||
end \end{JuliaCode}
|
||||
\caption{A Julia program fragment showing how the transpiled kernels need to be dispatched as compared to the interpreter kernel.}
|
||||
\label{code:julia_dispatch-comparison}
|
||||
\end{program}
|
||||
|
||||
After all kernels have been dispatched, the CPU waits for the kernels to complete their execution. When the kernels have finished, the result matrix is read from global memory into system memory. The results can then be returned to the symbolic regression algorithm.
|
||||
|
||||
\subsection{Transpiler Backend}
|
||||
\label{sec:transpiler-backend}
|
||||
The transpiler backend is responsible for creating a kernel from an expression in its intermediate representation. Transpiling an expression is divided into several parts, which are as follows:
|
||||
|
||||
\begin{itemize}
|
||||
\item Register management
|
||||
\item Generating the header and kernel entry point
|
||||
\item Ensuring that only the requested number of threads performs work
|
||||
\item Generating the code for evaluating the expression and storing the result
|
||||
\end{itemize}
|
||||
|
||||
PTX assumes a register machine, which means that a developer has to work with a limited number of registers. This also means that the transpiler has to define a strategy for managing these registers. The second and third parts are rather simple and can be considered as overhead code. Finally, the last part is the main part of the generated kernel. It contains the code to load variables and parameters, evaluate the expression and store the result in the result matrix. All parts are explained in the following sections.
|
||||
|
||||
\subsubsection{Register Management}
|
||||
Register management is a crucial part of the transpiler as it is important to balance register usage with occupancy and performance. \textcite{aho_compilers_2006, cooper_engineering_2022} describe techniques for efficient register management, especially for machines with few registers and register usage by convention on the CPU. On the GPU however, there are many more registers available, all of which can be used as needed without restrictions.
|
||||
|
||||
To allow for maximum occupancy and avoid spilling registers into local memory, the transpiler tries to reuse as many registers as possible. Furthermore, allocating and using a register in PTX is very similar to using variables in code, as they represent virtual registers. Therefore, much of the complexity of managing registers is handled by the PTX compiler of the driver.
|
||||
|
||||
Because much of the complexity of managing registers is hidden by the compiler, or does not apply in this scenario, the register management is implemented in a very simple way. If a register is needed at any point in the transpilation process, it can be requested from the register manager. A register must be given a name, and the manager uses this name to determine the type of the register. For example, if the name of the register is \verb|f|, it is assumed to be an FP32 register. Several naming conventions exist to ensure that the register is of the correct data type. The manager then returns the identifying name of the register, which is used to access it. The identifying name consists of the name given as input and a zero-based number that is incremented by one for each successive call.
|
||||
|
||||
PTX requires that the registers are defined before they are used. Therefore, after the transpiler has finished generating the code, the registers must be defined at the top of the kernel. As the manager has kept track of the registers used, it can generate the code to allocate and define the registers. If the kernel only uses five FP32 registers, the manager would generate the code \verb|.reg .f32 %f<5>;|. This will allocate and define the registers \verb|%f0| through \verb|%f4|.
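A simplified sketch of such a register manager is given below. The names are illustrative, and the helper \textit{datatype\_for}, which resolves the data type from the naming convention, is assumed to exist:
\begin{JuliaCode}
mutable struct RegisterManager
    counters::Dict{String, Int}   # register name prefix -> number of registers used
end
RegisterManager() = RegisterManager(Dict{String, Int}())

function request_register!(manager::RegisterManager, name::String)::String
    index = get(manager.counters, name, 0)
    manager.counters[name] = index + 1
    return "%$(name)$(index)"     # e.g. "%f0", "%f1", ...
end

function register_definitions(manager::RegisterManager)::String
    # e.g. ".reg .f32 %f<5>;" if five FP32 registers were requested
    return join((".reg .$(datatype_for(name)) %$(name)<$(count)>;"
                 for (name, count) in manager.counters), "\n")
end
\end{JuliaCode}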
|
||||
|
||||
\subsubsection{Header and Entry Point}
|
||||
Each PTX program must begin with certain directives in order to compile and use that program correctly. The first directive must be the \verb|.version| directive. It indicates which PTX version the code was written for, to ensure that it is compiled with the correct tools in the correct version. Following the \verb|.version| directive is the \verb|.target| directive, which specifies the target hardware architecture.
|
||||
|
||||
Once these directives have been added to the generated code, the entry point to the kernel can be generated. It contains the name of the kernel, as well as all parameters that are passed to it, such as the pointers to the variable, parameter and result matrix. The kernel name is important as it is required by the CPU to dispatch it.
|
||||
|
||||
When the entry point is generated, the PTX code for loading the parameters into the kernel is also generated. This removes the need to iterate over the kernel parameters a second time. Loading the parameters into the kernel is necessary because it is not possible to address these values directly. \textcite{nvidia_parallel_2025} states that addresses in the parameter state space can only be accessed using the \verb|ld.param| instruction. Furthermore, since all three matrices are stored in global memory, the parameter address must be converted from parameter state space to global state space using the \verb|cvta.to.global.datatype| instruction.
|
||||
|
||||
\subsubsection{Guard Clause}
|
||||
As explained in Section \ref{sec:interpreter-gpu-side}, the guard clause ensures that any excess threads do not participate in the evaluation. The following code shows what this guard clause looks like when the kernel is written with Julia and CUDA.jl:
|
||||
\begin{JuliaCode}
|
||||
function my_kernel(nrOfVarSets::Int32)
|
||||
threadId = (blockIdx().x - 1) * blockDim().x + threadIdx().x
|
||||
if threadId > nrOfVarSets
|
||||
return
|
||||
end
|
||||
# remaining kernel
|
||||
end
|
||||
\end{JuliaCode}
|
||||
|
||||
This can be translated into the following PTX code fragment:
|
||||
|
||||
\begin{PTXCode}
|
||||
mov.u32 %r3, %ntid.x; // r3 = blockDim().x
|
||||
mov.u32 %r4, %ctaid.x; // r4 = blockIdx().x - 1
|
||||
mov.u32 %r5, %tid.x; // r5 = threadIdx().x
|
||||
|
||||
mad.lo.s32 %r1, %r3, %r4, %r5; //r1 = r3 * r4 + r5
|
||||
setp.ge.s32 %p1, %r1, %r2; // p1 = r1 >= r2 (r2 = nrOfVarSets)
|
||||
@%p1 bra End;
|
||||
|
||||
// remaining Kernel
|
||||
|
||||
End:
|
||||
ret;
|
||||
\end{PTXCode}
|
||||
|
||||
It needs to be noted that the register \verb|%r2| is not actually needed. Since the transpiler already knows the number of variable sets, it would be wasteful to transmit this information to the kernel. Instead, the transpiler inserts the number directly as a constant to save resources.
|
||||
|
||||
\subsubsection{Main Loop}
|
||||
The main loop of the transpiler, which generates the kernel for evaluating a single expression, is analogous to the interpreter's main loop. Since the transpiler uses the same intermediate representation as the interpreter, both loops behave similarly. The transpiler loop also uses a stack to store the values and intermediate results. However, the transpiler does not require the special opcode \textit{stop} which was necessary in the interpreter to handle expressions padded to fit into a matrix. The transpiler only needs to process a single expression, which is stored in an unpadded vector of known length. This means that all tokens within the vector are valid and therefore do not require this opcode.
|
||||
|
||||
% MAYBE : activity diagram for this loop (also add to interpreter main loop section (would maybe fit better in concept and design so basically move the algorithms of C&D here and add activity diagram to C&D ))
|
||||
|
||||
When the loop encounters a token that represents an index to either the variable or the parameter matrix, the transpiler needs to generate code to load these values. In the general case, this works in exactly the same way as the interpreter, calculating the index and accessing the matrices at that location.
|
||||
|
||||
However, the first time a variable or parameter is accessed, it must be loaded from global memory. Although registers already exist that hold the addresses of the matrices in global memory, the data itself is not yet accessible. To make it accessible, the index of the value must first be calculated in the same way as it is calculated in the interpreter. Afterwards the value must be loaded into a register with the instruction \verb|ld.global.f32 %reg1, [%reg2]|. Through the first register of the instruction, the data can then be accessed. For example, if the variable $x_1$ is accessed several times, all subsequent accesses only need to reference this register and do not need to load the data from global memory again.
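This reuse can be realised with a simple lookup table during transpilation. The following sketch builds on the hypothetical register manager from above; the address calculation is omitted and the placeholder \textit{\%address} stands for the previously computed address register:
\begin{JuliaCode}
function variable_register!(code::Vector{String}, loaded::Dict{Int, String},
                            manager::RegisterManager, variableIndex::Int)
    return get!(loaded, variableIndex) do
        register = request_register!(manager, "var")
        # emit the load only on the first access; later accesses reuse the register
        push!(code, "ld.global.f32 $register, [%address];")
        register
    end
end
\end{JuliaCode}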
|
||||
|
||||
In the case where the current token represents an operation, the code for this operation needs to be generated. Many operators have direct equivalents on the GPU. For example, addition has the \verb|add.f32 %reg1, %reg2, %reg3;| instruction. The division and square root operations also have equivalent instructions, but these only support approximate calculations. Although the accuracy can be controlled with different options, the fastest option \verb|.approx| has been selected. While a slightly slower but more accurate option \verb|.full| exists, it is not fully IEEE 754 compliant and has therefore not been used.
|
||||
|
||||
However, not all supported operators have a single instruction GPU equivalent. For example, the $x^y$ operation does not have an equivalent and must be generated differently. Compiling a kernel containing this operation using the Nvidia compiler and the \textit{-\,-use\_fast\_math} compiler flag will generate the following code:
|
||||
\begin{PTXCode}[numbers=none]
|
||||
lg2.approx.f32 %reg1, %reg2;
|
||||
mul.f32 %reg4, %reg3, %reg1;
|
||||
ex2.approx.f32 %reg5, %reg4;
|
||||
\end{PTXCode}
|
||||
These three instructions exploit the identity $x^y = 2^{y \log_2(x)}$. While this compiler flag trades accuracy for performance, the more accurate version of this operation contains about 100 instructions instead of the three above. Therefore, the more performant version was chosen to be generated by the transpiler. Similarly, the operations $\log(x)$ and $e^x$ have no equivalent instruction and are therefore generated using the same principle.
|
||||
|
||||
The final register of the generated code stores the result of the operation once it has been executed. As with the interpreter, this result is either the final value or an input to another operation. Therefore, this register must be stored on the stack for later use.
|
||||
|
||||
Once the main loop has finished, the last element on the stack holds the register with the result of the evaluation. The value of this register must be stored in the result matrix. As the result matrix is stored in global memory, the code for storing the data is similar to the code responsible for loading the data from global memory. First, the location where the result is to be stored must be calculated. Storing the result at this location is performed with the instruction \verb|st.global.f32 [%reg1], %reg2;|.
|
||||
|
||||
\subsection{GPU Side}
|
||||
On the GPU, the transpiled kernels are executed. Given that these kernels are relatively simple, containing minimal branching and overhead, the GPU does not need to perform a lot of operations. As illustrated in Program \ref{code:ptx_kernel}, the kernel for the expression $x_1 + p_1$ is quite straightforward. It involves only two load operations, the addition and the storing of the result in the result matrix. Essentially, the kernel mirrors the expression directly, with the already explained added overhead.
|
||||
|
||||
\begin{program}
|
||||
\begin{PTXCode}
|
||||
.visible .entry Evaluator(
|
||||
.param .u64 param_1, .param .u64 param_2, .param .u64 param_3)
|
||||
{
|
||||
// Make parameters stored in global memory accessible
|
||||
ld.param.u64 %rd0, [param_1];
|
||||
cvta.to.global.u64 %parameter0, %rd0;
|
||||
ld.param.u64 %rd1, [param_2];
|
||||
cvta.to.global.u64 %parameter1, %rd1;
|
||||
ld.param.u64 %rd2, [param_3];
|
||||
cvta.to.global.u64 %parameter2, %rd2;
|
||||
|
||||
mov.u32 %r0, %ntid.x;
|
||||
mov.u32 %r1, %ctaid.x;
|
||||
mov.u32 %r2, %tid.x;
|
||||
mad.lo.s32 %r3, %r0, %r1, %r2;
|
||||
setp.gt.s32 %p0, %r3, 1;
|
||||
@%p0 bra L__BB0_2; // Jump to end of kernel if too many threads are started
|
||||
cvt.u64.u32 %rd3, %r3;
|
||||
mov.u64 %rd4, 0;
|
||||
|
||||
// Load variable and parameter from global memory and add them together
|
||||
mad.lo.u64 %rd5, %rd3, 4, 0;
|
||||
add.u64 %rd5, %parameter0, %rd5;
|
||||
ld.global.f32 %var0, [%rd5];
|
||||
mad.lo.u64 %rd6, %rd4, 4, 0;
|
||||
add.u64 %rd6, %parameter1, %rd6;
|
||||
ld.global.f32 %var1, [%rd6];
|
||||
add.f32 %f0, %var0, %var1;
|
||||
|
||||
// Store the result in the result matrix
|
||||
add.u64 %rd7, 0, %rd3;
|
||||
mad.lo.u64 %rd7, %rd7, 4, %parameter2;
|
||||
st.global.f32 [%rd7], %f0;
|
||||
|
||||
L__BB0_2: ret;
|
||||
}\end{PTXCode}
|
||||
\caption{The slightly simplified PTX kernel for the expression $x_1 + p_1$. For simplicity, the allocation of registers and the required directives \texttt{.version} and \texttt{.target} have been removed.}
|
||||
\label{code:ptx_kernel}
|
||||
\end{program}
|
||||
|
||||
%\verb|.version| and \verb|.target|
|
||||
|
||||
Note that Program \ref{code:ptx_kernel} has been slightly simplified to omit the mandatory directives and the register allocation. From line five to line ten, the addresses stored in the parameters are converted from parameter state space into global state space so that they reference the correct portion of the GPU's memory. It needs to be noted that this kernel uses 64-bit addresses, which is the reason why some 64-bit instructions are used throughout the kernel. However, the evaluation of the expression itself is performed entirely using the faster 32-bit instructions.
|
||||
|
||||
Lines 12 through 17 are responsible for calculating the global thread ID and ensuring that excessive threads are terminated early. Note that in line 16, if the global thread ID stored in register \verb|%r3| is greater than one, it must terminate early. This is because only one variable set needs to be evaluated in this example.
|
||||
|
||||
The PTX code from line 22 to line 28 is the actual evaluation of the expression, with line 28 performing the calculation $x_1 + p_1$. All other lines are responsible for loading the values from global memory. The instructions in lines 22, 23, 25 and 26 are responsible for calculating the offset in bytes to the memory location where the value is stored with respect to the location of the first element.
|
||||
|
||||
The constants $4$ and $0$ are introduced for performance reasons. The number $4$ is the size of a variable set in bytes. Since one variable set in this case stores only a single FP32 value, each variable set has a size of four bytes. Similarly, the number $0$ represents the index of the value within the variable set. More precisely, this is the offset in bytes from the index to the variable set, which is zero for the first element, four for the second, and so on. These two constants are calculated during the transpilation process to minimise the amount of data to be transferred to the GPU.
|
||||
|
||||
Storing the result in the result matrix is performed from line 31 to 33. The location where the value is to be stored is calculated in lines 31 and 32. Line 31 calculates the index inside the result matrix according to the current variable set stored in register \verb|%rd3|. The constant $0$ is the product of the index of the expression being evaluated and the number of variable sets, and represents the column of the result matrix. Converting this index into bytes and adding it as an offset to the first element of the result matrix gives the correct memory location to store the result at.
|
||||
|
||||
This kernel consists mostly of overhead code, as only lines 22 through 33 contribute to calculating the result of the expression with the designated variable and parameter set. However, for larger expressions, the percentage of overhead code shrinks drastically.
|
@ -160,28 +160,30 @@ While in most cases a GPU can be programmed in a higher level language like C++
|
||||
|
||||
PTX defines a virtual machine with an own instruction set architecture (ISA) and is designed for data-parallel processing on a GPU. It is an abstraction of the underlying hardware instruction set, allowing PTX code to be portable across Nvidia GPUs. In order for PTX code to be usable for the GPU, the driver is responsible for compiling the code to the hardware instruction set of the GPU it is run on. A developer typically writes a kernel in CUDA using C++, for example, and the Nvidia compiler generates the PTX code for that kernel. This PTX code is then compiled by the driver once it is executed. The concepts for programming the GPU with PTX and CUDA are the same, apart from the terminology which is slightly different. For consistency, the CUDA terminology will continue to be used.
|
||||
|
||||
Syntactically, PTX is similar to assembler style code. Every PTX code must have a \verb|.version| directive which indicates the PTX version and an optional \verb|.target| directive which indicates the compute capability. If the program works in 64 bit addresses, the optional \verb|.address_size| directive can be used to indicate that, which simplifies the code for such applications. After these directives, the actual code is written. As each PTX code needs an entry point (the kernel) the \verb|.entry| directive indicates the name of the kernel and the parameters needed. It is also possible to write helper functions with the \verb|.func| directive. Inside the kernel or a helper function, normal PTX code can be written. Because PTX is very low level, it assumes an underlying register machine, therefore a developer needs to think about register management. This includes loading data from global or shared memory into registers if needed. Code for manipulating data like addition and subtraction generally follow the structure \verb|operation.datatype| followed by up to four parameters for that operation. For adding two FP32 values together and storing them in the register \%n, the code looks like the following:
|
||||
Syntactically, PTX is similar to assembler style code. Every PTX code must have a \verb|.version| directive which indicates the PTX version and is immediately followed by the \verb|.target| directive which indicates the compute capability. If the program needs 64-bit addresses instead of the default 32-bit addresses, the optional \verb|.address_size| directive can be used to indicate this. Using 64-bit addresses enables the developer to access more than 4 GB of memory but also increases register usage, as a 64-bit address must be stored in two registers.
|
||||
|
||||
After these directives, the actual code is written. As each PTX code needs an entry point (the kernel) the \verb|.entry| directive indicates the name of the kernel and the parameters needed. It is also possible to write helper functions with the \verb|.func| directive. Inside the kernel or a helper function, normal PTX code can be written. Because PTX is very low level, it assumes an underlying register machine, therefore a developer needs to think about register management. This includes loading data from global or shared memory into registers if needed. Code for manipulating data like addition and subtraction generally follow the structure \verb|operation.datatype| followed by up to four parameters for that operation. For adding two FP32 values together and storing them in the register \%n, the code looks like the following:
|
||||
\begin{GenericCode}[numbers=none]
|
||||
add.f32 \%n, 0.1, 0.2;
|
||||
add.f32 %n, 0.1, 0.2;
|
||||
\end{GenericCode}
|
||||
Loops in the classical sense do not exist in PTX. Instead, a developer needs to define jump targets for the beginning and end of the loop. Program \ref{code:ptx_loop} shows how a function with a simple loop can be implemented. The loop counts down to zero from the passed parameter $N$, which is loaded into the register \%n in line 6. If the value in the register \%n reaches zero, the loop branches at line 9 to the jump target at line 12 and the loop has finished. All other used directives and further information on writing PTX code can be taken from the PTX documentation \parencite{nvidia_parallel_2025}.
|
||||
|
||||
\begin{program}
|
||||
\begin{GenericCode}
|
||||
\begin{PTXCode}
|
||||
.func loop(.param .u32 N)
|
||||
{
|
||||
.reg .u32 \%n;
|
||||
.reg .pred \%p;
|
||||
.reg .u32 %n;
|
||||
.reg .pred %p;
|
||||
|
||||
ld.param.u32 \%n, [N];
|
||||
ld.param.u32 %n, [N];
|
||||
Loop:
|
||||
setp.eq.u32 \%p, \%n, 0;
|
||||
@\%p bra Done;
|
||||
sub.u32 \%n, \%n, 1;
|
||||
setp.eq.u32 %p, %n, 0;
|
||||
@%p bra Done;
|
||||
sub.u32 %n, %n, 1;
|
||||
bra Loop;
|
||||
Done:
|
||||
}
|
||||
\end{GenericCode}
|
||||
\end{PTXCode}
|
||||
\caption{A PTX program fragment depicting how loops can be implemented.}
|
||||
\label{code:ptx_loop}
|
||||
\end{program}
|
||||
|
@ -135,6 +135,50 @@ keepspaces=true,%
|
||||
#1}}%
|
||||
{}
|
||||
|
||||
% Language Definition and Code Environment for Julia
|
||||
\lstdefinelanguage{Julia}{
|
||||
keywords={if, for, continue, break, end, else, true, false, @cuda, return, function},
|
||||
keywordstyle=\color{blue},
|
||||
sensitive=true,
|
||||
morestring=[b]",
|
||||
morestring=[d]',
|
||||
morecomment=[l]{\#},
|
||||
commentstyle=\color{gray},
|
||||
stringstyle=\color{brown}
|
||||
}
|
||||
|
||||
|
||||
\lstnewenvironment{JuliaCode}[1][]
|
||||
{\lstset{%
|
||||
language=Julia,
|
||||
escapeinside={/+}{+/}, % makes "/+" and "+/" available for Latex escapes (labels etc.)
|
||||
#1}}%
|
||||
{}
|
||||
|
||||
% Language Definition and Code Environment for PTX
|
||||
\lstdefinelanguage{PTX}{
|
||||
alsoletter={.},
|
||||
morekeywords={mov.u32, mad.lo.s32, setp.ge.s32, bra, mov.u64,
|
||||
mad.lo.u64, add.u64, ld.global.f32, add.f32, st.global.f32,
|
||||
cvta.to.global.u64, ld.param.u64, setp.gt.s32, cvt.u64.u32,
|
||||
ret, .func, .entry, .visible, .param, .u64},
|
||||
keywordstyle=\color{blue},
|
||||
sensitive=true,
|
||||
morestring=[b]",
|
||||
morestring=[d]',
|
||||
morecomment=[l]{//},
|
||||
commentstyle=\color{gray},
|
||||
stringstyle=\color{brown}
|
||||
}
|
||||
|
||||
|
||||
\lstnewenvironment{PTXCode}[1][]
|
||||
{\lstset{%
|
||||
language=PTX,
|
||||
escapeinside={/+}{+/}, % makes "/+" and "+/" available for Latex escapes (labels etc.)
|
||||
#1}}%
|
||||
{}
|
||||
|
||||
|
||||
% Code Environment for Generic Code
|
||||
\lstnewenvironment{GenericCode}[1][]
|
||||
|
BIN
thesis/images/excessive_memory_transfer.png
Normal file
After Width: | Height: | Size: 58 KiB |
BIN
thesis/images/expr_ast.png
Normal file
After Width: | Height: | Size: 27 KiB |
BIN
thesis/images/interpreter_sequence_diagram.png
Normal file
After Width: | Height: | Size: 106 KiB |
BIN
thesis/images/memory_layout_data.png
Normal file
After Width: | Height: | Size: 41 KiB |
Before Width: | Height: | Size: 19 KiB After Width: | Height: | Size: 20 KiB |
BIN
thesis/images/pre-processing_result_impl.png
Normal file
After Width: | Height: | Size: 21 KiB |
BIN
thesis/images/transpiler_sequence_diagram.png
Normal file
After Width: | Height: | Size: 104 KiB |
BIN
thesis/main.pdf
@ -1176,7 +1176,7 @@
|
||||
booktitle = {2005 {IEEE} Congress on Evolutionary Computation},
|
||||
author = {Gustafson, S. and Burke, E.K. and Krasnogor, N.},
|
||||
date = {2005-09},
|
||||
keywords = {Computer science, Concrete, Diversity methods, Evolutionary computation, Genetic programming, Problem-solving},
|
||||
keywords = {Evolutionary computation, Computer science, Concrete, Diversity methods, Genetic programming, Problem-solving},
|
||||
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\28ZEEUYG\\Gustafson et al. - 2005 - On improving genetic programming for symbolic regression.pdf:application/pdf},
|
||||
}
|
||||
|
||||
@ -1204,7 +1204,7 @@
|
||||
publisher = {{arXiv}},
|
||||
author = {Bruneton, J.-P.},
|
||||
date = {2025-03-24},
|
||||
keywords = {Computer Science - Neural and Evolutionary Computing, Computer Science - Symbolic Computation, Physics - Data Analysis, Statistics and Probability},
|
||||
keywords = {Computer Science - Symbolic Computation, Computer Science - Neural and Evolutionary Computing, Physics - Data Analysis, Statistics and Probability},
|
||||
file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\9U346ZEV\\Bruneton - 2025 - Enhancing Symbolic Regression with Quality-Diversity and Physics-Inspired Constraints.pdf:application/pdf},
|
||||
}
|
||||
|
||||
@ -1222,3 +1222,37 @@
|
||||
date = {1999},
|
||||
langid = {english},
|
||||
}
|
||||
|
||||
@article{bezanson_julia_2017,
|
||||
title = {Julia: A Fresh Approach to Numerical Computing},
|
||||
volume = {59},
|
||||
issn = {0036-1445},
|
||||
url = {https://epubs.siam.org/doi/10.1137/141000671},
|
||||
doi = {10.1137/141000671},
|
||||
shorttitle = {Julia},
|
||||
abstract = {This is the third in a series of papers on aspects of modern computing environments that are relevant to statistical data analysis. In this paper, we discuss programming environments. In particular, we argue that integrated programming environments (for example, Lisp and Smalltalk environments) are more appropriate as a base for data analysis than conventional operating systems (for example, Unix).},
|
||||
pages = {65--98},
|
||||
number = {1},
|
||||
journaltitle = {{SIAM} Review},
|
||||
shortjournal = {{SIAM} Rev.},
|
||||
author = {Bezanson, Jeff and Edelman, Alan and Karpinski, Stefan and Shah, Viral B.},
|
||||
date = {2017-01},
|
||||
file = {Submitted Version:C\:\\Users\\danwi\\Zotero\\storage\\9R4QSU35\\Bezanson et al. - 2017 - Julia A Fresh Approach to Numerical Computing.pdf:application/pdf},
|
||||
}
|
||||
|
||||
@article{faingnaert_flexible_2022,
|
||||
title = {Flexible Performant {GEMM} Kernels on {GPUs}},
|
||||
volume = {33},
|
||||
issn = {1558-2183},
|
||||
url = {https://ieeexplore.ieee.org/document/9655458},
|
||||
doi = {10.1109/TPDS.2021.3136457},
|
||||
abstract = {General Matrix Multiplication or {GEMM} kernels take centre place in high performance computing and machine learning. Recent {NVIDIA} {GPUs} include {GEMM} accelerators, such as {NVIDIA}’s Tensor Cores. Their exploitation is hampered by the two-language problem: it requires either low-level programming which implies low programmer productivity or using libraries that only offer a limited set of components. Because rephrasing algorithms in terms of established components often introduces overhead, the libraries’ lack of flexibility limits the freedom to explore new algorithms. Researchers using {GEMMs} can hence not enjoy programming productivity, high performance, and research flexibility at once. In this paper we solve this problem. We present three sets of abstractions and interfaces to program {GEMMs} within the scientific Julia programming language. The interfaces and abstractions are co-designed for researchers’ needs and Julia’s features to achieve sufficient separation of concerns and flexibility to easily extend basic {GEMMs} in many different ways without paying a performance price. Comparing our {GEMMs} to state-of-the-art libraries {cuBLAS} and {CUTLASS}, we demonstrate that our performance is in the same ballpark of the libraries, and in some cases even exceeds it, without having to write a single line of code in {CUDA} C++ or assembly, and without facing flexibility limitations.},
|
||||
pages = {2230--2248},
|
||||
number = {9},
|
||||
journaltitle = {{IEEE} Transactions on Parallel and Distributed Systems},
|
||||
author = {Faingnaert, Thomas and Besard, Tim and De Sutter, Bjorn},
|
||||
urldate = {2025-04-20},
|
||||
date = {2022-09},
|
||||
keywords = {Codes, Graphics processing units, graphics processors, high-level programming languages, Instruction sets, Kernel, Libraries, Matrix multiplication, Productivity, Programming},
|
||||
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\QCJ6LSF3\\Faingnaert et al. - 2022 - Flexible Performant GEMM Kernels on GPUs.pdf:application/pdf},
|
||||
}
|
||||
|