Compare commits
40 Commits
9702fe2343...1-performa
Commit SHAs: d9c83caad9, 1dc0c1898d, ad175abac0, 690ee33db1, effd477558, 9df78ca72e, 561b37160b, eaee21ca75, baa37ea183, db02e9f90f, e33be8f59e, 3c289f13d1, a718a3572e, db3ea32b66, d514b07434, de5493ca3e, 84fdf5c9ca, f3446a2b11, ed9d8766be, fddfa23b4f, 4e48686b62, b683f3ae96, 203e157f11, 34d98f9997, 28ef6b121e, 99ed6a1cca, 52b5407b5c, 433e69fff5, f4f39ec47c, 942adb8612, 8bad911585, 250da02353, 4afc15a737, 7598c51df8, b2774322a1, 85464083c3, 219c0bb14e, f7926c3438, 094f8c9499, a97b804530
@@ -1,2 +0,0 @@
https://www.markussteinberger.net/papers/DynMemory.pdf
- Shows the performance impact of dynamically allocating memory for different allocators (including the CUDA-internal one I am using). Might be a topic for "Future Work": in the future, one could look into another allocator to gain more performance.
71 other/compiler_architecture.drawio Normal file
@@ -0,0 +1,71 @@
[draw.io diagram source: compiler architecture. A "Frontend" swimlane (Source Code -> Scanner -> Parser) and a "Backend" swimlane (Optimiser -> Code Generator -> Machine code), connected through an "Intermediate representation" node.]
226 other/gpu_memory_layout.drawio Normal file
@@ -0,0 +1,226 @@
[draw.io diagram source: GPU memory layout. A GPU containing two Blocks, each with Shared memory, per-thread Registers, and Threads; per-thread Local memory; and GPU-wide Global, Texture/Surface, and Constant memory.]
@@ -1,13 +1,19 @@
name = "ExpressionExecutorCuda"
uuid = "5b8ee377-1e19-4ba5-a85c-78c7d1694bfe"
authors = ["Daniel Wiplinger"]
authors = ["Daniel Roth"]
version = "1.0.0-DEV"

[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
LinearAlgebra = "1.11.0"
Printf = "1.11.0"
Random = "1.11.0"
julia = "1.6.7"

[extras]
207 package/src/Code.jl Normal file
@@ -0,0 +1,207 @@
using Printf

@enum Opcode::UInt8 begin
    opc_stop = 1 # must start with 1 here TODO: remove stop
    opc_add
    opc_sub
    opc_mul
    opc_div
    opc_inv
    opc_log
    opc_log10
    opc_exp
    opc_pow
    opc_powconst
    opc_powabs
    opc_neg
    opc_abs
    opc_sign
    opc_sin
    opc_asin
    opc_tan
    opc_tanh
    opc_cos
    opc_cosh
    opc_constant
    opc_param
    opc_variable
end

const terminal_opcodes = [opc_stop, opc_constant, opc_param, opc_variable]
const unary_opcodes = [opc_log, opc_log10, opc_exp, opc_abs, opc_sign, opc_sin, opc_cos, opc_cosh, opc_asin, opc_tan, opc_tanh, opc_powconst, opc_neg, opc_inv]
const binary_opcodes = [opc_add, opc_sub, opc_mul, opc_div, opc_pow, opc_powabs]

function opcode(sy::Symbol)::Opcode
    if sy == :+ return opc_add
    elseif sy == :- return opc_sub
    elseif sy == :* return opc_mul
    elseif sy == :/ return opc_div
    elseif sy == :inv return opc_inv
    elseif sy == :log return opc_log
    elseif sy == :log10 return opc_log10
    elseif sy == :exp return opc_exp
    elseif sy == :^ return opc_powabs # TODO: this is temporary to enforce that all powers are evaluated as pow(abs(...)) for parameter optimization
    elseif sy == :powabs return opc_powabs # TODO: this is temporary to enforce that all powers are evaluated as pow(abs(...)) for parameter optimization
    elseif sy == :abs return opc_abs
    elseif sy == :sign return opc_sign
    elseif sy == :sin return opc_sin
    elseif sy == :asin return opc_asin
    elseif sy == :cos return opc_cos
    elseif sy == :cosh return opc_cosh
    elseif sy == :tan return opc_tan
    elseif sy == :tanh return opc_tanh
    else error("no opcode for symbol $sy")
    end
end

function degree(opc::Opcode)::Integer
    if opc in terminal_opcodes return 0
    elseif opc in unary_opcodes return 1
    elseif opc in binary_opcodes return 2
    else error("unknown degree of opcode $opc")
    end
end


# code is a Vector{Instruction} which is a linear representation of a directed acyclic graph of expressions.
# The code can be evaluated from left to right.
struct Instruction{T}
    opcode::Opcode
    arg1idx::UInt32 # index of first argument. 0 for terminals
    arg2idx::UInt32 # index of second argument. 0 for functions with a single argument
    idx::UInt32 # for variables and parameters
    val::T # for constants
end


function Base.show(io::IO, instr::Instruction)
    Printf.format(io, Printf.format"%15s %3d %3d %3d %f", instr.opcode, instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
end

create_const_instruction(val::T) where {T} = Instruction{T}(opc_constant, UInt32(0), UInt32(0), UInt32(0), val)
create_var_instruction(::Type{T}, varidx) where {T} = Instruction{T}(opc_variable, UInt32(0), UInt32(0), UInt32(varidx), zero(T))
create_param_instruction(::Type{T}, paramidx; val::T = zero(T)) where {T} = Instruction{T}(opc_param, UInt32(0), UInt32(0), UInt32(paramidx), val)


function convert_expr_to_code(::Type{T}, expr::Expr)::Vector{Instruction{T}} where {T}
    code = Vector{Instruction{T}}()

    Base.remove_linenums!(expr)
    paramTup = expr.args[1]
    xSy = paramTup.args[1]
    pSy = paramTup.args[2]
    body = expr.args[2]

    cache = Dict{Any,Int32}() # for de-duplication of expressions. If an expression is in the cache, simply return the index of the existing code

    convert_expr_to_code!(code, cache, body, xSy, pSy)

    # for debugging
    # for tup in sort(cache; byvalue=true)
    #     println(tup)
    # end
    return code
end

# uses cache (hashcons) to de-duplicate subexpressions in the tree.
function convert_expr_to_code!(code::Vector{Instruction{T}}, cache, val::TV, xSy, pSy)::UInt32 where {T,TV}
    if haskey(cache, val) return cache[val] end

    push!(code, create_const_instruction(T(val)))
    cache[val] = length(code)
    return length(code)
end

function convert_expr_to_code!(code::Vector{Instruction{T}}, cache, expr::Expr, xSy, pSy)::UInt32 where {T}
    # predicate to check if an expression is abs(...)
    is_abs(a) = a isa Expr && a.head == :call && a.args[1] == :abs

    if haskey(cache, expr) return cache[expr] end

    sy = expr.head
    if sy == :call
        func = expr.args[1]
        arg1idx::UInt32 = 0
        arg2idx::UInt32 = 0
        # unary functions
        if length(expr.args) == 2
            arg1idx = convert_expr_to_code!(code, cache, expr.args[2], xSy, pSy)
            if (func == :-)
                # - with one argument => negate
                push!(code, Instruction{T}(opc_neg, arg1idx, UInt32(0), UInt32(0), zero(T)))
            elseif (func == :sqrt)
                push!(code, Instruction{T}(opc_powconst, arg1idx, UInt32(0), UInt32(0), T(0.5)))
            else
                push!(code, Instruction{T}(opcode(func), arg1idx, UInt32(0), UInt32(0), zero(T)))
            end
        elseif length(expr.args) == 3
            arg1idx = convert_expr_to_code!(code, cache, expr.args[2], xSy, pSy)
            if func == :^ && expr.args[3] isa Number && round(expr.args[3]) == expr.args[3] # is integer
                # special case for constant powers
                push!(code, Instruction{T}(opc_powconst, arg1idx, UInt32(0), UInt32(0), T(expr.args[3])))
            elseif func == :^ && is_abs(expr.args[2])
                # fuse pow(abs(x), y) --> powabs(x, y)
                absexpr = expr.args[2]
                x = absexpr.args[2]
                arg1idx = convert_expr_to_code!(code, cache, x, xSy, pSy) # because of hashconsing this will return the index within the code for abs(x) generated above
                arg2idx = convert_expr_to_code!(code, cache, expr.args[3], xSy, pSy)
                push!(code, Instruction{T}(opc_powabs, arg1idx, arg2idx, UInt32(0), zero(T)))
            else
                arg2idx = convert_expr_to_code!(code, cache, expr.args[3], xSy, pSy)
                push!(code, Instruction{T}(opcode(func), arg1idx, arg2idx, UInt32(0), zero(T)))
            end
        else
            # dump(expr)
            error("only unary and binary functions are supported ($func is not supported)")
        end
    elseif sy == :ref
        arrSy = expr.args[1]
        idx = expr.args[2]
        if arrSy == xSy
            push!(code, create_var_instruction(T, idx))
        elseif arrSy == pSy
            push!(code, create_param_instruction(T, idx))
        else
            dump(expr)
            error("unknown symbol $arrSy")
        end
    else
        error("Unsupported symbol $sy")
    end

    cache[expr] = length(code)
    return length(code)
end


function Base.show(io::IO, code::AbstractArray{Instruction{T}}) where {T}
    sym = Dict(
        opc_stop => ".",
        opc_add => "+",
        opc_sub => "-",
        opc_neg => "neg",
        opc_mul => "*",
        opc_div => "/",
        opc_inv => "inv",
        opc_pow => "^",
        opc_powabs => "abs^",
        opc_powconst => "^c",
        opc_log => "log",
        opc_log10 => "l10",
        opc_exp => "exp",
        opc_abs => "abs",
        opc_sign => "sgn",
        opc_sin => "sin",
        opc_cos => "cos",
        opc_variable => "var",
        opc_constant => "con",
        opc_param => "par",
    )

    for i in eachindex(code)
        instr = code[i]
        Printf.format(io, Printf.format"%4d %4s %3d %3d %3d %f", i, sym[instr.opcode], instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
        println(io)
        # printfmtln(io, "{1:>4d} {2:>4s} {3:>3d} {4:>3d} {5:>3d} {6:>}", i, sym[instr.opcode], instr.arg1idx, instr.arg2idx, instr.idx, instr.val)
    end
end
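To illustrate the linear representation, here is a hand-built sketch of the kind of program `convert_expr_to_code` produces for x[1] * p[1] + 3.0 (only names defined in this file are used; the exact instruction ordering is an assumption):

# Sketch: x[1] * p[1] + 3.0 as a linear instruction sequence.
# Argument indices refer to earlier positions in `code` (1-based).
code = [
    create_var_instruction(Float32, 1),                                    # 1: x1
    create_param_instruction(Float32, 1),                                  # 2: p1
    Instruction{Float32}(opc_mul, UInt32(1), UInt32(2), UInt32(0), 0.0f0), # 3: (1) * (2)
    create_const_instruction(3.0f0),                                       # 4: constant 3.0
    Instruction{Float32}(opc_add, UInt32(3), UInt32(4), UInt32(0), 0.0f0), # 5: (3) + (4)
]
show(stdout, code)

Because `convert_expr_to_code!` caches subexpressions (hashconsing), a repeated subterm such as x[1] would be emitted once and referenced by index from every use.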
172 package/src/CpuInterpreter.jl Normal file
@@ -0,0 +1,172 @@
using Random

struct InterpreterBuffers{T}
    resultcache::Matrix{T} # for forward eval
    diffcache::Matrix{T} # for reverse AD
    jaccache::Matrix{T} # for Jacobian
    tmp::Vector{T} # a temporary space for each of the vector operations

    function InterpreterBuffers{T}(codelen, num_param, batchsize) where {T<:AbstractFloat}
        buf = Matrix{T}(undef, batchsize, codelen)
        rev_buf = Matrix{T}(undef, batchsize, codelen)
        jac_buf = Matrix{T}(undef, batchsize, num_param)
        tmp = Vector{T}(undef, batchsize)

        new(buf, rev_buf, jac_buf, tmp)
    end
end

mutable struct Interpreter{T}
    const code::Vector{Instruction{T}}
    const buffers::InterpreterBuffers{T}
    const batchsize::UInt32
    pc::Int32

    function Interpreter{T}(expr::Expr, num_param; batchsize = 1024) where {T<:AbstractFloat}
        code = convert_expr_to_code(T, expr)
        # println(code)
        buffers = InterpreterBuffers{T}(length(code), num_param, batchsize)
        new(code, buffers, batchsize, 1)
    end
end

peek_instruction(interpreter) = interpreter.code[interpreter.pc]


# batch size 1024 was fast in benchmark
interpret!(result::AbstractVector{T}, expr::Expr, x::AbstractMatrix{T}, p; batchsize=1024) where {T} = interpret!(result, Interpreter{T}(expr, length(p); batchsize), x, p)

# for Float evaluation use the preallocated buffer
function interpret!(result::AbstractVector{T}, interpreter::Interpreter{T}, x::AbstractMatrix{T}, p::AbstractArray{T}) where {T}
    interpret_withbuf!(result, interpreter, interpreter.buffers.resultcache, interpreter.buffers.tmp, x, p)
end

function interpret_withbuf!(result::AbstractVector{T}, interpreter::Interpreter{T}, batchresult, tmp, x::AbstractMatrix{T}, p::AbstractArray{TD}) where {T,TD}
    allrows = axes(x, 1)
    @assert length(result) == length(allrows)

    # all batches
    start = first(allrows)
    while start + interpreter.batchsize < last(allrows)
        batchrows = start:(start + interpreter.batchsize - 1)
        interpret_batch!(interpreter, batchresult, tmp, x, p, batchrows)
        copy!((@view result[batchrows]), (@view batchresult[:, end]))
        start += interpreter.batchsize
    end

    # process remaining rows
    remrows = start:last(allrows)
    if length(remrows) > 0
        interpret_batch!(interpreter, batchresult, tmp, x, p, remrows)
        copy!((@view result[remrows]), (@view batchresult[1:length(remrows), end]))
        # res += sum(view(batchresult, 1:length(remrows), lastcolidx))
    end
    # res
    result
end

function interpret_batch!(interpreter,
    batchresult, tmp,
    x, p, rows)
    # forward pass
    interpret_fwd!(interpreter, batchresult, tmp, x, p, rows)

    nothing
end

function interpret_fwd!(interpreter, batchresult, tmp, x, p, rows)
    interpreter.pc = 1
    while interpreter.pc <= length(interpreter.code)
        step!(interpreter, batchresult, tmp, x, p, rows)
    end
end


function step!(interpreter, batchresult, tmp, x, p, range)
    instr = interpreter.code[interpreter.pc]
    opc = instr.opcode
    res = view(batchresult, :, interpreter.pc)

    if degree(opc) == 0
        if opc == opc_variable
            copyto!(res, view(x, range, instr.idx))
        elseif opc == opc_param
            fill!(res, p[instr.idx])
        elseif opc == opc_constant
            fill!(res, instr.val)
        end
    elseif degree(opc) == 1
        arg = view(batchresult, :, instr.arg1idx)
        # is converted to a switch automatically by LLVM
        if opc == opc_log vec_log!(res, arg, tmp)
        elseif opc == opc_log10 vec_log10!(res, arg, tmp)
        elseif opc == opc_exp vec_exp!(res, arg, tmp)
        elseif opc == opc_abs vec_abs!(res, arg, tmp)
        elseif opc == opc_neg vec_neg!(res, arg, tmp)
        elseif opc == opc_inv vec_inv!(res, arg, tmp)
        elseif opc == opc_sign vec_sign!(res, arg, tmp)
        elseif opc == opc_powconst vec_powconst!(res, arg, instr.val, tmp)
        elseif opc == opc_sin vec_sin!(res, arg, tmp)
        elseif opc == opc_cos vec_cos!(res, arg, tmp)
        elseif opc == opc_cosh vec_cosh!(res, arg, tmp)
        elseif opc == opc_asin vec_asin!(res, arg, tmp)
        elseif opc == opc_tan vec_tan!(res, arg, tmp)
        elseif opc == opc_tanh vec_tanh!(res, arg, tmp)

        else throw(DomainError("Unsupported opcode $opc"))
        end
    elseif degree(opc) == 2
        left = view(batchresult, :, instr.arg1idx)
        right = view(batchresult, :, instr.arg2idx)

        if opc == opc_add vec_add!(res, left, right, tmp)
        elseif opc == opc_sub vec_sub!(res, left, right, tmp)
        elseif opc == opc_mul vec_mul!(res, left, right, tmp)
        elseif opc == opc_div vec_div!(res, left, right, tmp)
        elseif opc == opc_pow vec_pow!(res, left, right, tmp)
        elseif opc == opc_powabs vec_powabs!(res, left, right, tmp)
        else throw(DomainError("Unsupported opcode $opc"))
        end
        # if any(isnan, res)
        #     throw(DomainError("got NaN for $opc $(interpreter.pc) $left $right"))
        # end
    end

    interpreter.pc += 1

    return nothing
end


for unaryfunc in (:exp, :abs, :sin, :cos, :cosh, :asin, :tan, :tanh, :sinh)
    funcsy = Symbol("vec_$(unaryfunc)!")
    @eval function $funcsy(res::AbstractVector{T}, arg::AbstractVector{T}, ::AbstractVector{T}) where T<:Real
        @simd for i in eachindex(res)
            @inbounds res[i] = Base.$unaryfunc(arg[i])
        end
    end
end


function vec_add!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] + right[i] end end
function vec_sub!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] - right[i] end end
function vec_mul!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] * right[i] end end
function vec_div!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] / right[i] end end
function vec_pow!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = left[i] ^ right[i] end end

# TODO: special case scalar power
function vec_powconst!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::TC, ::AbstractVector{TE}) where {TE<:Real,TC<:Real} @simd for i in eachindex(res) @inbounds res[i] = left[i] ^ right end end
function vec_powabs!(res::AbstractVector{TE}, left::AbstractVector{TE}, right::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = abs(left[i]) ^ right[i] end end

function vec_neg!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = -arg[i] end end
function vec_inv!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = inv(arg[i]) end end
function vec_sign!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = sign(arg[i]) end end

# handle log and exp specially to use NaN instead of DomainError
function vec_log!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = arg[i] < zero(TE) ? TE(NaN) : log(arg[i]) end end
function vec_log10!(res::AbstractVector{TE}, arg::AbstractVector{TE}, ::AbstractVector{TE}) where TE<:Real @simd for i in eachindex(res) @inbounds res[i] = arg[i] < zero(TE) ? TE(NaN) : log10(arg[i]) end end
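A minimal usage sketch for the interpreter above (the expression shape is an assumption taken from `convert_expr_to_code` in Code.jl, which reads args[1] as the (x, p) symbol tuple and args[2] as the body):

# Hypothetical usage of Interpreter/interpret! (names from this file and Code.jl).
expr = Expr(:->, Expr(:tuple, :x, :p), :(x[1] * p[1] + 3.0))
interpreter = Interpreter{Float32}(expr, 1)       # 1 parameter, default batchsize 1024
X = rand(Float32, 10, 1)                          # 10 data points (rows), 1 variable
result = Vector{Float32}(undef, 10)
interpret!(result, interpreter, X, Float32[2.0])  # result ≈ 2 .* X[:, 1] .+ 3

Reusing the interpreter across calls avoids reallocating the batch buffers, which is the point of InterpreterBuffers.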
@@ -1,30 +1,76 @@
module ExpressionExecutorCuda
include("Utils.jl")
include("ExpressionProcessing.jl")
include("Interpreter.jl")
include("Transpiler.jl")

export interpret_gpu
module CpuInterpreter
include("Code.jl")
include("CpuInterpreter.jl")
end

export interpret_gpu, interpret_cpu
export evaluate_gpu
export test

# Some assertions:
# Variables and parameters are numbered starting at "1", meaning the first variable/parameter has to be "x1"/"p1" and not "x0"/"p0"
# Matrix X is column-major
# Each index i in exprs has to have the matching values in column i of Matrix X, so that X[:,i] contains the values for exprs[i]. The same goes for p
# These assertions are made because in Julia the first index does not have to be 1
#

# Evaluate Expressions on the GPU
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
    exprsPostfix = ExpressionProcessing.expr_to_postfix(exprs[1])
function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    ncols = size(X, 2)

    results = Matrix{Float32}(undef, ncols, length(exprs))

    for i in 1:repetitions # Simulate parameter tuning
        results = Interpreter.interpret(exprs, X, p)
    end

    return results
end

# Convert Expressions to PTX Code and execute that instead
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}})::Matrix{Float32}
# Look into this to maybe speed up PTX generation: https://cuda.juliagpu.org/stable/tutorials/introduction/#Parallelization-on-the-CPU
function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    ncols = size(X, 2)

    results = Matrix{Float32}(undef, ncols, length(exprs))

    for i in 1:repetitions # Simulate parameter tuning
        results = Transpiler.evaluate(exprs, X, p)
    end

    return results
end


# Evaluate Expressions on the CPU
function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32}
    @assert axes(exprs) == axes(p)
    nrows = size(X, 1)

    # each column of the matrix holds the result for one expr
    res = Matrix{Float32}(undef, nrows, length(exprs))

    for i in eachindex(exprs)
        # The interpreter holds the postfix code and buffers for evaluation. It is costly to create.
        interpreter = CpuInterpreter.Interpreter{Float32}(exprs[i], length(p[i]))

        # If an expression has to be evaluated multiple times (e.g. for different parameters),
        # it is worthwhile to reuse the interpreter to reduce the number of allocations
        for rep in 1:repetitions
            CpuInterpreter.interpret!((@view res[:,i]), interpreter, X, p[i])
        end
    end

    res
end


# Flow
# input: Vector expr == expressions, contains e.g. 4 expressions
@@ -35,3 +81,5 @@ end
# The following can be done on the CPU
# convert expression to postfix notation (mandatory)
# optional: replace every parameter with the correct value (should only improve performance if data transfer is the bottleneck)

end
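A hedged end-to-end sketch of the exported CPU entry point (the expression shape is assumed as in CpuInterpreter; interpret_cpu reads data points from the rows of X):

# Hypothetical call of the exported API; sizes and values are made up.
exprs = [Expr(:->, Expr(:tuple, :x, :p), :(x[1] + p[1] * x[2]))]
X = rand(Float32, 100, 2)                        # 100 data points, variables x1 and x2
p = [Float32[0.5]]                               # one parameter vector per expression
res = interpret_cpu(exprs, X, p; repetitions=3)  # 100×1 result matrix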
@@ -30,6 +30,7 @@ function expr_to_postfix(expr::Expr)::PostfixType
        if typeof(arg) === Expr
            append!(postfix, expr_to_postfix(arg))
        elseif typeof(arg) === Symbol # variables/parameters
            # maybe TODO: replace the parameters with their respective values, as this might make the expr evaluation faster
            exprElement = convert_to_ExpressionElement(convert_var_to_int(arg))
            push!(postfix, exprElement)
        else
@@ -70,6 +71,10 @@ function get_operator(op::Symbol)::Operator
        return EXP
    elseif op == :sqrt
        return SQRT
    elseif op == :powabs
        return POWER # TODO: Fix this
    else
        throw("Operator unknown")
    end
end

@@ -132,54 +137,4 @@ function is_binary_operator(operator::Operator)::Bool
    end
end

#
# Everything below is currently not needed. Left here for potential future use
#

const SymbolTable32 = Dict{Tuple{Expr, Symbol},Float32}

"Replaces all the variables and parameters of the given expression with their corresponding value stored in the symtable
# Arguments
- `symtable::SymbolTable32`: Contains the values of all variables for each expression
- `originalExpr::Expr`: Contains a deep copy of the original expression. It is used to link the expression and variables to their corresponding value stored in the symtable
"
function replace_variables!(ex::Expr, symtable::SymbolTable32, originalExpr::Expr)
    for i in 1:length(ex.args)
        arg = ex.args[i]
        if typeof(arg) === Expr
            replace_variables!(arg, symtable, originalExpr)
        elseif haskey(symtable, (originalExpr, arg)) # We found a variable/parameter and can replace it with the actual value
            ex.args[i] = symtable[(originalExpr, arg)]
        end
    end
end

# TODO: Completely rewrite this function because I misunderstood it. Not every column is linked to an expression. Therefore all other functions need to be reworked as well. Probably can't replace the variables in Julia anymore, look into this. (see ExpressionExecutorCuda.jl for more info)
# Before rewriting, proceed with just creating a postfix notation and sending the variable matrix as well as the parameter "matrix" to the GPU to perform first calculations
function construct_symtable(expressions::Vector{Expr}, mat::Matrix{Float32}, params::Vector{Vector{Float32}})::SymbolTable32
    symtable = SymbolTable32()

    for i in eachindex(expressions)
        expr = expressions[i]
        values = mat[i,:]
        parameters = params[i]

        fill_symtable!(expr, symtable, values, "x")
        fill_symtable!(expr, symtable, parameters, "p")
    end

    return symtable
end

function fill_symtable!(expr::Expr, symtable::SymbolTable32, values::Vector{Float32}, symbolPrefix::String)
    varIndex = 1
    for j in eachindex(values)
        val = values[j]
        sym = Symbol(symbolPrefix, varIndex)

        symtable[expr, sym] = val
        varIndex += 1
    end
end

end
@@ -2,6 +2,7 @@ module Interpreter
using CUDA
using StaticArrays
using ..ExpressionProcessing
using ..Utils

export interpret

@@ -11,22 +12,28 @@ export interpret
- variables::Matrix{Float32} : The variables to use. Each column holds one set of values for the variables x1..xn
- parameters::Vector{Vector{Float32}} : The parameters to use. Each vector contains the values for the parameters p1..pn. The number of parameters can be different for every expression
"
function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
    variableCols = size(variables, 2) # number of sets of variables to use for each expression
function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}

    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
    for i in eachindex(expressions)
        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
    end

    variableCols = size(variables, 2) # number of variable sets to use for each expression
    cudaVars = CuArray(variables)
    cudaParams = create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
    cudaExprs = create_cuda_array(expressions, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
    cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
    # put into separate CuArray, as this is static and would be inefficient to send separately to every kernel
    cudaStepsize = CuArray([get_max_inner_length(expressions), get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression
    cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max num of parameters per expression; number of variables per expression

    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
    cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))

    # Start kernel for each expression to ensure that no warp is working on different expressions
    for i in eachindex(expressions)
    for i in eachindex(exprs)
        kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
        config = launch_configuration(kernel.fun)
        threads = min(variableCols, config.threads)
        # config = launch_configuration(kernel.fun)
        threads = min(variableCols, 256)
        blocks = cld(variableCols, threads)

        kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
@@ -36,20 +43,22 @@ function interpret(expressions::Vector{ExpressionProcessing.PostfixType}, variab
end

#TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
const MAX_STACK_SIZE = 25 # The max number of values the expression can have, i.e. constant values, variables and parameters
const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x
    stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
    varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
    variableCols = length(variables) / stepsize[3]

    if varSetIndex > variableCols
        return
    end

    firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
    lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
    firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
    variableCols = length(variables) / stepsize[3]

    operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
    operationStackTop = 0 # stores index of the last defined/valid value

    for varSetIndex in index:stride
    firstVariableIndex = ((varSetIndex - 1) * stepsize[3]) # Exclusive

    for i in firstExprIndex:lastExprIndex
@@ -62,7 +71,7 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
        if val > 0
            operationStack[operationStackTop] = variables[firstVariableIndex + val]
        else
            val = -val
            val = abs(val)
            operationStack[operationStackTop] = parameters[firstParamIndex + val]
        end
    elseif expressions[i].Type == FLOAT32
@@ -103,71 +112,8 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
    # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
    resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
    results[resultIndex] = operationStack[operationStackTop]
    end

    return
end


"Retrieves the number of entries for the largest inner vector"
function get_max_inner_length(vec::Vector{Vector{T}})::Int where T
    maxLength = 0
    @inbounds for i in eachindex(vec)
        if length(vec[i]) > maxLength
            maxLength = length(vec[i])
        end
    end

    return maxLength
end

"Returns a CuArray filled with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    dataCols = get_max_inner_length(data)
    dataRows = length(data)
    dataMat = convert_to_matrix(data, invalidElement)
    cudaArr = CuArray{T}(undef, dataCols, dataRows) # length(parameters) == number of expressions
    copyto!(cudaArr, dataMat)

    return cudaArr
end

"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.

All entries that cannot be filled have ```invalidElement``` as their value
"
function convert_to_matrix(vec::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    vecCols = get_max_inner_length(vec)
    vecRows = length(vec)
    vecMat = fill(invalidElement, vecCols, vecRows)

    for i in eachindex(vec)
        vecMat[:,i] = copyto!(vecMat[:,i], vec[i])
    end

    return vecMat
end



# Kernel
function InterpretExplicit!(op::Operator, x, y)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x

    if op == ADD
        # @cuprintln("Performing Addition") # Will only be displayed when the GPU is synchronized
        for i = index:stride:length(y)
            @inbounds y[i] += x[i]
        end
        return
    elseif op == SUBTRACT
        # @cuprintln("Performing Subtraction") # Will only be displayed when the GPU is synchronized
        for i = index:stride:length(y)
            @inbounds y[i] -= x[i]
        end
        return
    end
end

end
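The kernel walks the postfix code with an explicit value stack. A simplified CPU analogue of that stack discipline (an illustrative sketch, not the kernel itself; plain Ints, Float32s and Symbols stand in for ExpressionElement):

# Simplified CPU analogue of interpret_expression's stack evaluation.
# Tokens: Int = variable index, Float32 = constant, Symbol = binary operator.
function eval_postfix(tokens, x::Vector{Float32})
    stack = Float32[]
    for t in tokens
        if t isa Float32                  # constant: push value
            push!(stack, t)
        elseif t isa Int                  # variable: push x[t]
            push!(stack, x[t])
        else                              # operator: pop two operands, push result
            b = pop!(stack); a = pop!(stack)
            t === :+ ? push!(stack, a + b) :
            t === :* ? push!(stack, a * b) :
            error("unsupported operator $t")
        end
    end
    return stack[end]                     # the final result, as in the kernel
end

eval_postfix(Any[1, 2, :*, 3.0f0, :+], Float32[2.0, 4.0])  # x1 * x2 + 3.0 == 11.0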
@ -1,47 +1,96 @@
|
||||
module Transpiler
|
||||
using CUDA
|
||||
using ..ExpressionProcessing
|
||||
using ..Utils
|
||||
|
||||
# Number of threads per block/SM + max number of registers
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications
|
||||
# Need to assume a max of 2048 threads per Streaming Multiprocessor (SM)
|
||||
# One SM can have 64*1024 32-bit registers at max
|
||||
# One thread can at max use 255 registers
|
||||
# Meaning one has access to at most 32 registers in the worst case. Using 64 bit values this number gets halfed (see: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level (almost at the end of the linked section))
|
||||
|
||||
# Maybe helpful for future performance tuning: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#maximum-number-of-registers-per-thread
|
||||
|
||||
# https://docs.nvidia.com/cuda/cuda-c-programming-guide/#multiprocessor-level
|
||||
# This states, that using fewer registers allows more threads to reside on a single SM which improves performance.
|
||||
# So I could use more registers at the expense for performance. Depending on how this would simplify my algorithm, I might do this and leave more optimisation to future work
|
||||
|
||||
# Since the generated expressions should have between 10 and 50 symbols, I think allowing a max. of 128 32-bit registers should make for an easy algorithm. If during testing the result is slow, maybe try reducing the number of registers and perform more intelligent allocation/assignment
|
||||
# With 128 Registers, one could have 32 Warps on one SM ((128 * 16 = 2048) * 32 == 64*1024 == max number of registers per SM) This means 512 Threads per SM in the worst case
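# Sketch of the above occupancy arithmetic (illustration only, not part of the transpiler), assuming the limits quoted above:
#   regsPerSM     = 64 * 1024
#   regsPerThread = 128
#   threadsPerSM  = regsPerSM ÷ regsPerThread   # 512 threads in the worst case
#   warpsPerSM    = threadsPerSM ÷ 32           # 16 warps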

#
# Make a "function execute(...)" that takes the data and the transpiled code. Pass the data to the kernel and start executing
# Note: Maybe make an additional function that transpiles and executes the code. This would then be the function the user calls
#

const BYTES = sizeof(Float32)
const Operand = Union{Float32, String} # Operand is either fixed value or register
cache = Dict{Expr, CuFunction}() # needed if multiple runs with the same expr but different parameters are performed

function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
    varRows = size(variables, 1)
    variableCols = size(variables, 2)
    kernels = Vector{CuFunction}(undef, length(expressions))

    # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
    # Threads.@threads for i in eachindex(expressions)
    #     # TODO: Use cache
    #     kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters))

    #     linker = CuLink()
    #     add_data!(linker, "ExpressionProcessing", kernel)

    #     image = complete(linker)

    #     mod = CuModule(image)
    #     kernels[i] = CuFunction(mod, "ExpressionProcessing")
    # end

    for i in eachindex(expressions)
        if haskey(cache, expressions[i])
            kernels[i] = cache[expressions[i]]
            continue
        end

        formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
        kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing

        linker = CuLink()
        add_data!(linker, "ExpressionProcessing", kernel)

        image = complete(linker)

        mod = CuModule(image)
        kernels[i] = CuFunction(mod, "ExpressionProcessing")
        cache[expressions[i]] = kernels[i]
    end

    cudaVars = CuArray(variables) # maybe put in shared memory (see PerformanceTests.jl for more info)
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info)

    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))

    # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
    for i in eachindex(kernels)
        # config = launch_configuration(kernels[i])
        threads = min(variableCols, 256)
        blocks = cld(variableCols, threads)

        cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
    end

    return cudaResults
end
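
# Usage sketch (hypothetical values, not part of the module): one expression, one variable, two variable sets.
# Matrices are column-major with one variable set per column, so the result has one row per variable set.
#   exprs  = [:(x1 + p1)]
#   vars   = Matrix{Float32}([1.0 2.0])  # 1 variable x 2 variable sets
#   params = [[10.0f0]]
#   evaluate(exprs, vars, params)        # 2x1 result: 11.0 and 12.0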

# To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string
function transpile(expression::ExpressionProcessing.PostfixType)::String
# seekstart(buf1); write(buf2, buf1)
"
- param ```varSetSize```: The size of a variable set. Equal to the number of rows of the variable matrix (in a column-major matrix)
- param ```paramSetSize```: The size of the longest parameter set. As it has to be stored in a column-major matrix, the nr of rows is dependent on the longest parameter set
- param ```expressionIndex```: The 0-based index of the expression
"
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
    nrOfVariableSets::Integer, expressionIndex::Integer)::String
    exitJumpLocationMarker = "\$L__BB0_2"
    ptxBuffer = IOBuffer()
    regManager = Utils.RegisterManager(Dict(), Dict())

    # TODO: Suboptimal solution
    signature, paramLoading = get_kernel_signature("ExpressionProcessing", [Float32, Float32, Float32], regManager) # Vars, Params, Results
    guardClause, threadId64Reg = get_guard_clause(exitJumpLocationMarker, nrOfVariableSets, regManager)

    println(ptxBuffer, get_cuda_header())
    println(ptxBuffer, get_kernel_signature("ExpressionProcessing", [Int32, Float32]))
    println(ptxBuffer, signature)
    println(ptxBuffer, "{")

    # TODO: Parameter loading

    # TODO: once parameters are loaded, the second parameter for the guard clause can be set
    temp = get_next_free_register("r")
    guardClause = get_guard_clause(exitJumpLocationMarker, temp) # since we need to know how many registers we used, we cannot yet write the guard clause to the ptxBuffer

    calc_code = generate_calculation_code(expression)
    println(ptxBuffer, get_register_definitions())
    calc_code = generate_calculation_code(expression, "%parameter0", varSetSize, "%parameter1", paramSetSize, "%parameter2",
        threadId64Reg, expressionIndex, nrOfVariableSets, regManager)
    println(ptxBuffer, Utils.get_register_definitions(regManager))
    println(ptxBuffer, paramLoading)
    println(ptxBuffer, guardClause)
    println(ptxBuffer, calc_code)

@@ -50,37 +99,42 @@ function transpile(expression::ExpressionProcessing.PostfixType)::String
    println(ptxBuffer, "}")

    generatedCode = String(take!(ptxBuffer))
    println(generatedCode)
    return generatedCode
end

# TODO: Make version, target and address_size configurable; also see what address_size means exactly
function get_cuda_header()::String
    return "
.version 7.1
.target sm_52
.version 8.5
.target sm_61
.address_size 64
"
end

function get_kernel_signature(kernelName::String, parameters::Vector{DataType})::String
"
param ```parameters```: [1] = variables; [2] = parameters; [3] = results
"
function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, regManager::Utils.RegisterManager)::Tuple{String, String}

    signatureBuffer = IOBuffer()
    paramLoadingBuffer = IOBuffer()
    print(signatureBuffer, ".visible .entry ")
    print(signatureBuffer, kernelName)
    println(signatureBuffer, "(")


    for i in eachindex(parameters)
        type = type_to_ptx_type(parameters[i])
        print(signatureBuffer,
            " .param ", type, " ", kernelName, "_param_", i)
        print(signatureBuffer, " .param .u64", " ", "param_", i)

        parametersLocation = Utils.get_next_free_register(regManager, "rd")
        println(paramLoadingBuffer, "ld.param.u64 $parametersLocation, [param_$i];")
        println(paramLoadingBuffer, "cvta.to.global.u64 $(Utils.get_next_free_register(regManager, "parameter")), $parametersLocation;")
        if i != lastindex(parameters)
            println(signatureBuffer, ",")
        end
    end

    print(signatureBuffer, ")")
    return String(take!(signatureBuffer))
    return (String(take!(signatureBuffer)), String(take!(paramLoadingBuffer)))
end
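
# For the three Float32 buffers (variables, parameters, results) this emits roughly (sketch):
#   .visible .entry ExpressionProcessing(
#    .param .u64 param_1,
#    .param .u64 param_2,
#    .param .u64 param_3)
# plus the matching "ld.param.u64"/"cvta.to.global.u64" pairs in the returned parameter-loading string.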

"
@@ -88,35 +142,45 @@ Constructs the PTX code used for handling the case where too many threads are st

- param ```nrOfVarSets```: The total amount of variable sets for the kernel
"
function get_guard_clause(exitJumpLocation::String, nrOfVarSetsRegister::String)::String
function get_guard_clause(exitJumpLocation::String, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::Tuple{String, String}
    guardBuffer = IOBuffer()

    threadIds = get_next_free_register("r")
    threadsPerCTA = get_next_free_register("r")
    currentThreadId = get_next_free_register("r")
    threadIds = Utils.get_next_free_register(regManager, "r")
    threadsPerCTA = Utils.get_next_free_register(regManager, "r")
    currentThreadId = Utils.get_next_free_register(regManager, "r")

    # load data into above defined registers
    println(guardBuffer, "mov.u32 $threadIds, %ntid.x;")
    println(guardBuffer, "mov.u32 $threadsPerCTA, %ctaid.x;")
    println(guardBuffer, "mov.u32 $currentThreadId, %tid.x;")

    globalThreadId = get_next_free_register("r") # basically the index of the thread in the variable set
    breakCondition = get_next_free_register("p")
    globalThreadId = Utils.get_next_free_register(regManager, "r") # basically the index of the thread in the variable set
    breakCondition = Utils.get_next_free_register(regManager, "p")
    println(guardBuffer, "mad.lo.s32 $globalThreadId, $threadIds, $threadsPerCTA, $currentThreadId;")
    println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSetsRegister;") # guard clause = index >= nrOfVariableSets
    println(guardBuffer, "setp.ge.s32 $breakCondition, $globalThreadId, $nrOfVarSets;") # guard clause = index >= nrOfVariableSets (the thread id is 0-based, so "gt" would let one extra thread through)

    # branch to end if breakCondition is true
    print(guardBuffer, "@$breakCondition bra $exitJumpLocation;")
    println(guardBuffer, "@$breakCondition bra $exitJumpLocation;")

    return String(take!(guardBuffer))
    # Convert threadIdReg to a 64 bit register. Not 64 bit from the start, as this would take up more registers. Performance tests can be performed to determine if it is faster doing this, or making everything 64-bit from the start
    threadId64Reg = Utils.get_next_free_register(regManager, "rd")
    print(guardBuffer, "cvt.u64.u32 $threadId64Reg, $globalThreadId;")

    return (String(take!(guardBuffer)), threadId64Reg)
end
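
# Sketch of the emitted guard for nrOfVarSets == 100 and a fresh RegisterManager:
#   mov.u32 %r0, %ntid.x;
#   mov.u32 %r1, %ctaid.x;
#   mov.u32 %r2, %tid.x;
#   mad.lo.s32 %r3, %r0, %r1, %r2;
#   setp.ge.s32 %p0, %r3, 100;
#   @%p0 bra $L__BB0_2;
#   cvt.u64.u32 %rd0, %r3;
# The returned "%rd0" is the 64-bit thread id used for the later address calculations.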

# Current assumption: Expression only made out of constant values
function generate_calculation_code(expression::ExpressionProcessing.PostfixType)::String
"
- param ```parametersSetSize```: Size of the largest parameter set
"
function generate_calculation_code(expression::ExpressionProcessing.PostfixType, variablesLocation::String, variablesSetSize::Integer,
    parametersLocation::String, parametersSetSize::Integer, resultsLocation::String,
    threadId64Reg::String, expressionIndex::Integer, nrOfVarSets::Integer, regManager::Utils.RegisterManager)::String

    codeBuffer = IOBuffer()
    operands = Vector{Operand}()

    println(expression)
    exprId64Reg = Utils.get_next_free_register(regManager, "rd")
    println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")

    for i in eachindex(expression)
        token = expression[i]

@@ -132,21 +196,65 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType)
            else
                left = pop!(operands)
            end
            operation, resultRegister = get_operation(operator, left, right)
            operation, resultRegister = get_operation(operator, regManager, left, right)

            println(codeBuffer, operation)
            push!(operands, resultRegister)
        elseif token.Type == INDEX
            # TODO
            if token.Value > 0 # variables
                var, first_access = Utils.get_register_for_name(regManager, "x$(token.Value)")
                if first_access
                    println(codeBuffer, load_into_register(var, variablesLocation, token.Value, threadId64Reg, variablesSetSize, regManager))
                end
                push!(operands, var)
            else
                absVal = abs(token.Value)
                param, first_access = Utils.get_register_for_name(regManager, "p$absVal")
                if first_access
                    println(codeBuffer, load_into_register(param, parametersLocation, absVal, exprId64Reg, parametersSetSize, regManager))
                end
                push!(operands, param)
            end
        end
    end

    tempReg = Utils.get_next_free_register(regManager, "rd")
    # reg = pop!(operands)
    # tmp = "abs.f32 $(reg), 16.0;"
    # push!(operands, reg)
    println(codeBuffer, "
        add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
        mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
        st.global.f32 [$tempReg], $(pop!(operands));
    ")

    return String(take!(codeBuffer))
end

"
Loads a value from a location into the given register. It is assumed that the location refers to a column-major matrix

- param ```register```: The register where the loaded value will be stored
- param ```loadLocation```: The location from where to load the value
- param ```valueIndex```: 1-based index of the value in the variable set/parameter set
- param ```setIndexReg64```: 0-based index of the set. Needed to calculate the actual index from the ```valueIndex```. Is equal to the global threadId
- param ```setSize```: The size of one set. Needed to calculate the actual index from the ```valueIndex```. Total number of elements in the set (length(set))
"
function load_into_register(register::String, loadLocation::String, valueIndex::Integer, setIndexReg64::String, setSize::Integer, regManager::Utils.RegisterManager)::String
    tempReg = Utils.get_next_free_register(regManager, "rd")

    # "mad" calculates the offset and "add" applies the offset. Classical pointer arithmetic for accessing values of an array like in C
    return "
        mad.lo.u64 $tempReg, $setIndexReg64, $(setSize*BYTES), $((valueIndex - 1) * BYTES);
        add.u64 $tempReg, $loadLocation, $tempReg;
        ld.global.f32 $register, [$tempReg];"
end
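
# Example (hypothetical values): loading the 2nd value of a 5-element set, with "%rd1" holding the
# 64-bit set index and "%parameter0" being the base address, yields roughly:
#   mad.lo.u64 %rd2, %rd1, 20, 4;   # 5 elements * 4 bytes per set; (2 - 1) * 4 bytes offset
#   add.u64 %rd2, %parameter0, %rd2;
#   ld.global.f32 %var0, [%rd2];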

function type_to_ptx_type(type::DataType)::String
    if type == Int64
        return ".s64"
    elseif type == Int32
        return ".s32"
    elseif type == Float32
        return ".f32"
    else
@@ -154,8 +262,8 @@ function type_to_ptx_type(type::DataType)::String
    end
end

function get_operation(operator::Operator, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
    resultRegister = get_next_free_register("f")
function get_operation(operator::Operator, regManager::Utils.RegisterManager, left::Operand, right::Union{Operand, Nothing} = nothing)::Tuple{String, String}
    resultRegister = Utils.get_next_free_register(regManager, "f")
    resultCode = ""

    if is_binary_operator(operator) && isnothing(right)
@@ -171,66 +279,33 @@ function get_operation(operator::Operator, left::Operand, right::Union{Operand,
    elseif operator == DIVIDE
        resultCode = "div.approx.f32 $resultRegister, $left, $right;"
    elseif operator == POWER
        resultCode = " $resultRegister, $left;" # TODO
        # x^y == 2^(y*log2(x)) as generated by nvcc for "pow(x, y)"
        resultCode = "
        // x^y:
        lg2.approx.f32 $resultRegister, $left;
        mul.f32 $resultRegister, $right, $resultRegister;
        ex2.approx.f32 $resultRegister, $resultRegister;"
    elseif operator == ABS
        resultCode = "abs.f32 $resultRegister, $left;"
    elseif operator == LOG
        resultCode = "lg2.approx.f32 $resultRegister, $left;"
        # log(x) == log2(x) * ln(2) as generated by nvcc for "log(x)"
        resultCode = "
        // log(x):
        lg2.approx.f32 $resultRegister, $left;
        mul.f32 $resultRegister, $resultRegister, 0.693147182;"
    elseif operator == EXP
        resultCode = " $resultRegister, $left;" # TODO
        # e^x == 2^(x/ln(2)) as generated by nvcc for "exp(x)"
        resultCode = "
        // e^x:
        mul.f32 $resultRegister, $left, 1.44269502;
        ex2.approx.f32 $resultRegister, $resultRegister;"
    elseif operator == SQRT
        resultCode = "sqrt.approx.f32 $resultRegister, $left;"
    else
        throw(ArgumentError("Operator conversion to ptx not implemented for '$operator'"))
    end

    return (resultCode, resultRegister)
end
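
# Example: with "%f0" and "%f1" already taken, get_operation(POWER, regManager, "%f0", "%f1")
# returns the register "%f2" together with PTX along the lines of:
#   lg2.approx.f32 %f2, %f0;
#   mul.f32 %f2, %f1, %f2;
#   ex2.approx.f32 %f2, %f2;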

let registers = Dict() # stores the count of the registers already used.
    global get_next_free_register
    global get_register_definitions

    # By convention these names correspond to the following types:
    # - p -> pred
    # - f -> float32
    # - r -> 32 bit
    # - var -> float32
    # - param -> float32 !! although, they might get inserted as fixed number and not be sent to gpu?
    function get_next_free_register(name::String)::String
        if haskey(registers, name)
            registers[name] += 1
        else
            registers[name] = 1
        end

        return string("%", name, registers[name] - 1)
    end

    function get_register_definitions()::String
        registersBuffer = IOBuffer()

        for definition in registers
            regType = ""
            if definition.first == "p"
                regType = ".pred"
            elseif definition.first == "f"
                regType = ".f32"
            elseif definition.first == "var"
                regType = ".f32"
            elseif definition.first == "param"
                regType = ".f32"
            elseif definition.first == "r"
                regType = ".b32"
            else
                throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
            end
            println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
        end

        return String(take!(registersBuffer))
    end
end

end
88  package/src/Utils.jl  Normal file
@@ -0,0 +1,88 @@
module Utils

using CUDA

"Converts a vector of vectors into a matrix. The inner vectors do not need to have the same length.

All entries that cannot be filled have ```invalidElement``` as their value
"
function convert_to_matrix(vecs::Vector{Vector{T}}, invalidElement::T)::Matrix{T} where T
    maxLength = get_max_inner_length(vecs)

    # Pad the shorter vectors with the invalidElement
    paddedVecs = [vcat(vec, fill(invalidElement, maxLength - length(vec))) for vec in vecs]
    vecMat = hcat(paddedVecs...)

    return vecMat
end
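
# Example: ragged input is padded per column with the invalid element:
#   isequal(convert_to_matrix([[1.0f0, 2.0f0], [3.0f0]], NaN32), Float32[1.0 3.0; 2.0 NaN])  # true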

"Retrieves the number of entries for the largest inner vector"
function get_max_inner_length(vecs::Vector{Vector{T}})::Int where T
    return maximum(length.(vecs))
end

"Returns a CuArray filled with the data provided. The inner vectors do not have to have the same length. All missing elements will be the value ```invalidElement```"
function create_cuda_array(data::Vector{Vector{T}}, invalidElement::T)::CuArray{T} where T
    dataMat = convert_to_matrix(data, invalidElement)
    cudaArr = CuArray(dataMat)

    return cudaArr
end
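
# Example: uploads a ragged parameter list as a 2x2 CuArray with one parameter set per column,
# the shorter set padded with NaN32:
#   create_cuda_array([[1.0f0], [2.0f0, 3.0f0]], NaN32)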

struct RegisterManager
    registers::Dict
    symtable::Dict
end

function get_next_free_register(manager::RegisterManager, name::String)::String
    if haskey(manager.registers, name)
        manager.registers[name] += 1
    else
        manager.registers[name] = 1
    end

    return string("%", name, manager.registers[name] - 1)
end
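
# Example: the counter is kept per register name, so each name gets its own numbering:
#   mgr = RegisterManager(Dict(), Dict())
#   get_next_free_register(mgr, "rd")  # -> "%rd0"
#   get_next_free_register(mgr, "rd")  # -> "%rd1"
#   get_next_free_register(mgr, "f")   # -> "%f0"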

function get_register_definitions(manager::RegisterManager)::String
    registersBuffer = IOBuffer()

    for definition in manager.registers
        regType = ""
        if definition.first == "p"
            regType = ".pred"
        elseif definition.first == "f"
            regType = ".f32"
        elseif definition.first == "var"
            regType = ".f32"
        elseif definition.first == "param"
            regType = ".f32"
        elseif definition.first == "r"
            regType = ".b32"
        elseif definition.first == "rd"
            regType = ".b64"
        elseif definition.first == "parameter"
            regType = ".b64"
        elseif definition.first == "i"
            regType = ".b64"
        else
            throw(ArgumentError("Unknown register name used. Name '$(definition.first)' cannot be mapped to a PTX type."))
        end
        println(registersBuffer, ".reg $regType %$(definition.first)<$(definition.second)>;")
    end

    return String(take!(registersBuffer))
end

"Returns the register for this variable/parameter and true if it is used for the first time and false otherwise."
function get_register_for_name(manager::RegisterManager, varName::String)
    if haskey(manager.symtable, varName)
        return (manager.symtable[varName], false)
    else
        reg = get_next_free_register(manager, "var")
        manager.symtable[varName] = reg
        return (reg, true)
    end
end
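
# Example: the first access allocates a "var" register, later accesses reuse it:
#   get_register_for_name(mgr, "x1")  # -> ("%var0", true)
#   get_register_for_name(mgr, "x1")  # -> ("%var0", false)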

end
47  package/test/CpuInterpreterTests.jl  Normal file
@@ -0,0 +1,47 @@
using LinearAlgebra
using BenchmarkTools

function test_cpu_interpreter(nrows; parallel = false)
    exprs = [
        # CPU interpreter requires an anonymous function and array refs
        :(p[1] * x[1] + p[2]), # 5 op
        :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
        :(log(abs(x[1]))), # 3 op
        :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
    ] # 30 op
    exprs = map(e -> Expr(:->, :(x,p), e), exprs)
    X = randn(Float32, nrows, 10)
    p = [randn(Float32, 10) for _ in 1:length(exprs)] # generate 10 random parameter values for each expr

    # warmup
    interpret_cpu(exprs, X, p)
    expr_reps = 100 # for each expr
    reps = 100

    if parallel
        # time the parallel helper so t_sec is defined for the GFLOPS estimate below
        t_sec = @elapsed run_parallel(exprs, X, p, expr_reps, reps)
        println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS ($(Threads.nthreads()) threads) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    else
        t_sec = @elapsed run_single(exprs, X, p, expr_reps, reps)
        println("~ $(round(30 * reps * expr_reps * nrows / 1e9 / t_sec, digits=2)) GFLOPS (single-core) ($(round(LinearAlgebra.peakflops(1000, eltype=Float32, ntrials=1) / 1e9, digits=2)) GFLOPS (peak, single-core))")
    end
    true
end

# named run_parallel so it does not get shadowed by the "parallel" keyword argument above
function run_parallel(exprs, X, p, expr_reps, reps)
    fetch.([Threads.@spawn interpret_cpu(exprs, X, p; repetitions=expr_reps) for i in 1:reps])
end

function run_single(exprs, X, p, expr_reps, reps)
    for i in 1:reps interpret_cpu(exprs, X, p; repetitions=expr_reps) end
end


# LinearAlgebra.BLAS.set_num_threads(1) # only use a single thread for peakflops

@test test_cpu_interpreter(1000)
@test test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
@test test_cpu_interpreter(10000)
@test test_cpu_interpreter(10000, parallel=true)
@@ -1,6 +1,7 @@
using CUDA
using .ExpressionProcessing
using .Interpreter
using .Utils

expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
@@ -20,8 +21,8 @@ parameters[2][1] = 5.0
parameters[2][2] = 0.0

function testHelper(expression::Expr, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}}, expectedResult)
    postfix = Vector([expr_to_postfix(expression)])
    result = Interpreter.interpret(postfix, variables, parameters)
    exprs = Vector([expression])
    result = Interpreter.interpret(exprs, variables, parameters)

    expectedResult32 = convert(Float32, expectedResult)
    @test isequal(result[1,1], expectedResult32)
@@ -35,7 +36,7 @@ end
    reference[2,2] = 0.0
    # reference = Matrix([5.0, NaN],
    #                    [5.0, 0.0])
    result = Interpreter.convert_to_matrix(parameters, NaN32)
    result = Utils.convert_to_matrix(parameters, NaN32)

    @test isequal(result, reference)
end
@@ -126,8 +127,8 @@ end
    expr1 = :((x1 + 5) * p1 - 3 / abs(x2) + (2^4) - log(8))
    expr2 = :(1 + 5 * x1 - 10^2 + (p1 - p2) / 9 + exp(x2))

    postfix = Vector([expr_to_postfix(expr1), expr_to_postfix(expr2)])
    result = Interpreter.interpret(postfix, var, param)
    exprs = Vector([expr1, expr2])
    result = Interpreter.interpret(exprs, var, param)

    # var set 1
    @test isapprox(result[1,1], 37.32, atol=0.01) # expr1
177  package/test/PerformanceTests.jl  Normal file
@@ -0,0 +1,177 @@
using LinearAlgebra
using BenchmarkTools

using .Transpiler
using .Interpreter

const BENCHMARKS_RESULTS_PATH = "./results"
# University setup at 10.20.1.7 if needed
exprsCPU = [
    # CPU interpreter requires an anonymous function and array refs
    :(p[1] * x[1] + p[2]), # 5 op
    :((((x[1] + x[2]) + x[3]) + x[4]) + x[5]), # 9 op
    :(log(abs(x[1]))), # 3 op
    :(powabs(p[2] - powabs(p[1] + x[1], 1/x[1]),p[3])) # 13 op
] # 30 op
exprsCPU = map(e -> Expr(:->, :(x,p), e), exprsCPU)

exprsGPU = [
    # GPU evaluators expect variable names (x1, x2, ...) instead of array refs
    :(p1 * x1 + p2), # 5 op
    :((((x1 + x2) + x3) + x4) + x5), # 9 op
    :(log(abs(x1))), # 3 op
    :(powabs(p2 - powabs(p1 + x1, 1/x1),p3)) # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # simulates 100 parameter optimisation steps


@testset "CPU performance" begin
    # warmup
    # interpret_cpu(exprsCPU, X, p)

    # @btime interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) # repetitions simulates parameter optimisation
    # @btime test_cpu_interpreter(1000)
    # @btime fetch.([Threads.@spawn interpret_cpu(exprsCPU, X, p; repetitions=expr_reps) for i in 1:reps])

    # test_cpu_interpreter(1000, parallel=true) # start julia -t 6 for six threads
    # @btime test_cpu_interpreter(10000)
    # @btime test_cpu_interpreter(10000, parallel=true)

end

@testset "Interpreter Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

@testset "Transpiler Performance" begin
    # Put data in shared memory:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Shared-memory

    # Make array const:
    # https://cuda.juliagpu.org/v2.6/api/kernel/#Device-arrays

    # Memory management like in C++ might help with performance improvements
    # https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

compareWithCPU = true


suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup(["CPUInterpreter"])
suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

if compareWithCPU
    X_small = randn(Float32, varsets_small, 5)
    suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
    X_medium = randn(Float32, varsets_medium, 5)
    suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
    X_large = randn(Float32, varsets_large, 5)
    suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end

X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

# tune!(suite)
# BenchmarkTools.save("params.json", params(suite))

loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gctrial, :time_tolerance, :evals_set, :gcsample, :seconds, :overhead, :memory_tolerance)

results = run(suite, verbose=true, seconds=180)

BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/256_blocksize.json", results)



if compareWithCPU
    medianCPU = median(results["CPU"])
    stdCPU = std(results["CPU"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
    cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
    gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

    cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
    cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
    gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?

    println()
    println("Is the interpreter better than the CPU implementation:")
    println(cpuVsGPUI_median)
    println(cpuVsGPUI_std)

    println()
    println("Is the transpiler better than the CPU implementation:")
    println(cpuVsGPUT_median)
    println(cpuVsGPUT_std)

    println()
    println("Is the transpiler better than the interpreter:")
    println(gpuiVsGPUT_median)
    println(gpuiVsGPUT_std)

else
    resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/initial_results.json")[1]

    medianGPUI_old = median(resultsOld["GPUI"])
    stdGPUI_old = std(resultsOld["GPUI"])

    medianGPUT_old = median(resultsOld["GPUT"])
    stdGPUT_old = std(resultsOld["GPUT"])

    medianInterpreter = median(results["GPUI"])
    stdInterpreter = std(results["GPUI"])

    medianTranspiler = median(results["GPUT"])
    stdTranspiler = std(results["GPUT"])

    oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is interpreter better than old?
    oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old) # is interpreter better than old?

    oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is transpiler better than old?
    oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old) # is transpiler better than old?


    println()
    println("Is the interpreter better than the old implementation:")
    println(oldVsGPUI_median)
    println(oldVsGPUI_std)

    println()
    println("Is the transpiler better than the old implementation:")
    println(oldVsGPUT_median)
    println(oldVsGPUT_std)
end
@@ -1,4 +1,8 @@
[deps]
BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -2,40 +2,65 @@ using CUDA
using .ExpressionProcessing
using .Transpiler

expressions = Vector{Expr}(undef, 2)
variables = Matrix{Float32}(undef, 2,2)
parameters = Vector{Vector{Float32}}(undef, 2)
expressions = Vector{Expr}(undef, 3)
variables = Matrix{Float32}(undef, 5, 4)
parameters = Vector{Vector{Float32}}(undef, 3)

# Resulting value should be 1.14... for the first expression
expressions[1] = :(1 + 3 * 5 / 7 - sqrt(4))
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2)
expressions[2] = :(5 + x1 + 1 * x2 + p1 + p2 + x1^x3)
expressions[3] = :(log(x1) / x2 * sqrt(p1) + x3^x4 - exp(x5))

variables[1,1] = 2.0
variables[2,1] = 3.0
variables[1,2] = 0.0
variables[3,1] = 0.0
variables[4,1] = 1.0
variables[5,1] = 0.0

variables[1,2] = 2.0
variables[2,2] = 5.0
parameters[1] = Vector{Float32}(undef, 1)
variables[3,2] = 3.0
variables[4,2] = 0.0
variables[5,2] = 0.0

variables[1,3] = 6.0
variables[2,3] = 2.0
variables[3,3] = 2.0
variables[4,3] = 4.0
variables[5,3] = 2.0

variables[1,4] = 1.0
variables[2,4] = 2.0
variables[3,4] = 3.0
variables[4,4] = 4.0
variables[5,4] = 5.0

parameters[1] = Vector{Float32}(undef, 0)
parameters[2] = Vector{Float32}(undef, 2)
parameters[1][1] = 5.0
parameters[3] = Vector{Float32}(undef, 1)
parameters[2][1] = 5.0
parameters[2][2] = 0.0
parameters[3][1] = 16.0

@testset "Test transpiler evaluation" begin
    results = Transpiler.evaluate(expressions, variables, parameters)

@testset "Test TMP transpiler" begin
    postfixExpr = expr_to_postfix(expressions[1])
    postfixExprs = Vector([postfixExpr])
    push!(postfixExprs, expr_to_postfix(expressions[2]))
    # dump(expressions[3]; maxdepth=10)
    # Expr 1:
    @test isapprox(results[1,1], 1.14286)
    @test isapprox(results[2,1], 1.14286)
    @test isapprox(results[3,1], 1.14286)
    @test isapprox(results[4,1], 1.14286)
    # Expr 2:
    @test isapprox(results[1,2], 16.0)
    @test isapprox(results[2,2], 25.0)
    @test isapprox(results[3,2], 54.0)
    @test isapprox(results[4,2], 14.0)

    generatedCode = Transpiler.transpile(postfixExpr)
    # CUDA.@sync interpret(postfixExprs, variables, parameters)

    # This is just here for testing. This will be called inside the execute method in the Transpiler module
    linker = CuLink()
    add_data!(linker, "ExpressionProcessing", generatedCode)

    image = complete(linker)

    mod = CuModule(image)
    func = CuFunction(mod, "ExpressionProcessing")
    # Expr 3:
    @test isapprox(results[1,3], -0.07580)
    @test isapprox(results[2,3], 0.55452)
    @test isapprox(results[3,3], 12.19446)
    @test isapprox(results[4,3], -67.41316)
end

# TODO: test performance of transpiler PTX generation when doing "return String(take!(buffer))" vs "return take!(buffer)"
1  package/test/params.json  Normal file
@@ -0,0 +1 @@
[{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]]
1  package/test/results/256_blocksize.json  Normal file
File diff suppressed because one or more lines are too long
1  package/test/results/initial_results.json  Normal file
File diff suppressed because one or more lines are too long
@@ -2,12 +2,22 @@ using ExpressionExecutorCuda
using Test

const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
include(joinpath(baseFolder, "src", "Utils.jl"))
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
include(joinpath(baseFolder, "src", "Interpreter.jl"))
include(joinpath(baseFolder, "src", "Transpiler.jl"))

@testset "ExpressionExecutorCuda.jl" begin
    include("ExpressionProcessingTests.jl")
    include("InterpreterTests.jl")
    include("TranspilerTests.jl")
@testset "Functionality tests" begin
    # include("ExpressionProcessingTests.jl")
    # include("InterpreterTests.jl")
    # include("TranspilerTests.jl")
end


# @testset "CPU Interpreter" begin
#     include("CpuInterpreterTests.jl")
# end

@testset "Performance tests" begin
    include("PerformanceTests.jl")
end
2  thesis/.vscode/ltex.dictionary.en-GB.txt  vendored
@@ -1,2 +1,4 @@
CUDA
GPGPU
SIMT
Single-Instruction
1  thesis/.vscode/ltex.disabledRules.en-GB.txt  vendored  Normal file
@@ -0,0 +1 @@
OXFORD_SPELLING_Z_NOT_S
3  thesis/.vscode/ltex.hiddenFalsePositives.en-GB.txt  vendored  Normal file
@@ -0,0 +1,3 @@
{"rule":"OXFORD_SPELLING_Z_NOT_S","sentence":"^\\QOptimisation of software\\E$"}
|
||||
{"rule":"TO_TOO","sentence":"^\\QThey introduced the division operator, which led to much better results.\\E$"}
|
||||
{"rule":"COLLECTIVE_NOUN_VERB_AGREEMENT_VBP","sentence":"^\\QIn cases where occupancy is already high and the amount of warps ready for execution is also high, other areas for performance improvements need to be explored.\\E$"}
|
@@ -9,5 +9,12 @@ Probably reference the performance evaluation papers for Julia and CUDA.jl
\section{Interpreter}
Talk about how the interpreter has been developed.

\subsection{Performance tuning}
Document the process of performance tuning.


\section{Transpiler}
Talk about how the transpiler has been developed.

\subsection{Performance tuning}
Document the process of performance tuning.
@@ -1,10 +1,49 @@
\chapter{Introduction}
\label{cha:Introduction}

Introduction into topic and why someone would want to evaluate expressions on the GPU. Also include "Non-Goals"
This chapter provides an entry point for this thesis. First, the motivation for exploring this topic is presented. In addition, the research questions of this thesis are outlined. Lastly, the methodology for answering these questions is explained.

\section[Research Question]{Research Question and Methodology}
What are the research questions and how they will be answered
\section{Background and Motivation}
%
% Not totally happy with this yet
%
Optimisation and acceleration of program code is a crucial part of many fields. For example, video games need optimisation to lower the minimum hardware requirements, which allows more people to run the game, increasing sales. Another example where optimisation is important is computer simulations. For those, optimisation is even more crucial, as this allows scientists to run more detailed simulations or to get the simulation results faster. Equation learning or symbolic regression is another field that can heavily benefit from optimisation. One part of equation learning is to evaluate the expressions generated by a search algorithm, which can make up a significant portion of the runtime. This thesis is concerned with optimising the evaluation part to increase the overall performance of equation learning algorithms.

\section{Overview}
Will give an overview of the chapters and what to expect
The following expression $5 - \text{abs}(x_1) * \text{sqrt}(x_2) / 10 + 2 \char`^ x_3$, which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$, is one example that can be generated by an equation learning algorithm. Usually an equation learning algorithm generates multiple such expressions per iteration. Out of these expressions, all possibly relevant ones have to be evaluated. Additionally, multiple different values need to be inserted for all variables and parameters, drastically increasing the number of evaluations that need to be performed.

In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that the program magically runs faster on the next generation of CPUs. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive, as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration can also be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}.


%The free lunch theorem as described by \textcite{adam_no_2019} states that to gain additional performance, a developer cannot just hope for future hardware to be faster, especially on a single core.



\section{Research Question}
With these successful implementations of GPU acceleration, this thesis also attempts to improve the performance of evaluating mathematical equations using GPUs. Therefore, the following research questions are formulated:

\begin{itemize}
    \item How can simple arithmetic expressions that are generated at runtime be efficiently evaluated on GPUs?
    \item Under what circumstances is the evaluation of simple arithmetic expressions faster on a GPU than on a CPU?
    \item Under which circumstances is the interpretation of the expressions on the GPU or the translation to the intermediate language Parallel Thread Execution (PTX) more efficient?
\end{itemize}

Answering the first question is necessary to ensure the approach of this thesis is actually feasible. If it is feasible, it is important to evaluate whether evaluating the expressions on the GPU actually improves the performance over a parallelised CPU evaluator. To answer if the GPU evaluator is faster than the CPU evaluator, the last research question is important. As there are two major ways of implementing an evaluator on the GPU, they need to be implemented and evaluated to finally state if evaluating expressions on the GPU is faster and, if so, which type of implementation results in the best performance.


\section{Thesis Structure}
In order to answer the research questions, this thesis is divided into the following chapters:

\begin{description}
    \item[Chapter 2: Fundamentals and Related Work] \mbox{} \\
    In this chapter, the topic of this thesis is explored. It covers the fundamentals of equation learning and how this thesis fits into this field of research. In addition, the fundamentals of General Purpose GPU computing and how interpreters and transpilers work are explained. Previous research already done within this topic is also explored.
    \item[Chapter 3: Concept and Design] \mbox{} \\
    Within this chapter, the concepts of implementing the GPU interpreter and transpiler are explained. How these two prototypes can be implemented disregarding concrete technologies is part of this chapter.
    \item[Chapter 4: Implementation] \mbox{} \\
    This chapter explains the implementation of the GPU interpreter and transpiler. The details of the implementation with the used technologies are covered, such as the interpretation process and the transpilation of the expressions into Parallel Thread Execution (PTX) code.
    \item[Chapter 5: Evaluation] \mbox{} \\
    The software and hardware requirements and the evaluation environment are introduced in this chapter. All three evaluators will be compared against each other, and the form of the expressions used for the comparisons is outlined. The comparison will not only include the time taken for the pure evaluation, but it will also include the overhead, like PTX code generation. Finally, the results of the comparison of the GPU and CPU evaluators are presented to show which of these yields the best performance.
    \item[Chapter 6: Conclusion] \mbox{} \\
    In the final chapter, the entire work is summarised. A brief overview of the implementation as well as the evaluation results will be provided. Additionally, an outlook on possible future research is given.
\end{description}

With this structure, the process of creating and evaluating a basic interpreter on the GPU as well as a transpiler for creating PTX code is outlined. Research is done to ensure the implementations are relevant and not outdated. Finally, the evaluation results will answer the research questions and determine if expressions generated at runtime can be evaluated more efficiently on the GPU than on the CPU.

@ -1,19 +1,183 @@
|
||||
\chapter{Fundamentals and Related Work}
|
||||
\label{cha:relwork}
|
||||
The goal of this chapter is to provide an overview of equation learning or symbolic regression to establish common knowledge of the topic and problem this thesis is trying to solve. First the field of equation learning is explored which helps to contextualise the topic of this thesis. The main part of this chapter is split into two sub-parts. The first part is exploring research that has been done in the field of general purpose computations on the GPU (GPGPU) as well as the fundamentals of it. Focus lies on exploring how graphics processing units (GPUs) are used to achieve substantial speed-ups and when and where they can be effectively employed. The second part describes the basics of how interpreters and compilers are built and how they can be adapted to the workflow of programming GPUs. When discussing GPU programming concepts, the terminology used is that of Nvidia and may differ from that used for AMD GPUs.
|
||||
|
||||
\section{Equation learning}
|
||||
Section describing what equation learning is and why it is relevant for the thesis
|
||||
Equation learning is a field of research that can be used for understanding and discovering equations from a set of data from various fields like mathematics and physics. Data is usually much more abundant while models often are elusive which is demonstrated by \textcite{guillemot_climate_2022} where they explain how validating the models against large amounts of data is a big part in creating such models. Because of this effort, generating equations with a computer can more easily lead to discovering equations that describe the observed data. \textcite{brunton_discovering_2016} describe an algorithm that leverages equation learning to discover equations for physical systems. A more literal interpretation of equation learning is demonstrated by \textcite{pfahler_semantic_2020}. They use machine learning to learn the form of equations. Their aim was to simplify the discovery of relevant publications by the equations they use and not by technical terms, as they may differ by the field of research. However, this kind of equation learning is not relevant for this thesis.
|
||||
|
||||
Symbolic regression is a subset of equation learning, that specialises more towards discovering mathematical equations. A lot of research is done in this field. Using genetic programming (GP) for different problems, including symbolic regression, was first described by \textcite{koza_genetic_1994}. He described that finding a computer program to solve a problem for a given input and output, can be done by traversing the search space of all solutions. This fits well for the goal of symbolic regression, where a mathematical expression needs to be found to describe a problem with specific inputs and outputs. Later, \textcite{koza_human-competitive_2010} provided an overview of results that were generated with the help of GP and were competitive with human solutions, showing how symbolic regression is a useful tool. In their book Symbolic Regression, \textcite{kronberger_symbolic_2024} show how symbolic regression can be applied for real world scenarios. They also describe symbolic regression in great detail, while being tailored towards beginners and experts alike.
|
||||
|
||||
\textcite{keijzer_scaled_2004} and \textcite{korns_accuracy_2011} presented ways of improving the quality of symbolic regression algorithms, making symbolic regression more feasible for problem-solving. \textcite{bartlett_exhaustive_2024} describe an exhaustive approach for symbolic regression which can find the true optimum for perfectly optimised parameters while retaining simple and interpretable results. Alternatives to GP for symbolic regression also exist, such as the approach proposed by \textcite{jin_bayesian_2020}, which noticeably increased the quality of the results compared to GP alternatives. Another alternative to heuristics like GP is the use of neural networks. One such alternative was introduced by \textcite{martius_extrapolation_2016}, who used a neural network for their equation learner with mixed results. Later, an extension was provided by \textcite{sahoo_learning_2018}, who introduced the division operator, which led to much better results. Further improvements were described by \textcite{werner_informed_2021} with their informed equation learner. By incorporating domain expert knowledge, they could limit the search space and find better solutions for particular domains. One drawback of these three implementations is the fact that their neural networks are fixed. An equation learner which can change the network at runtime and therefore evolve over time was proposed by \textcite{dong_evolving_2024}. Their approach further improved the results of neural network equation learners. In their work, \textcite{lemos_rediscovering_2022} also used a neural network for symbolic regression. They were able to find an equivalent to Newton's law of gravitation and rediscovered Newton's second and third law using only trajectory data of bodies of our solar system. Although these laws were already known, this research has shown the great potential of neural networks and machine learning in general. An implementation of an equation learner in the physics domain was proposed by \textcite{sun_symbolic_2023}. Their algorithm was specifically designed for the nonlinear dynamics often occurring in physical systems. Compared to other implementations, their equation learner was able to produce better results, but has the main drawback of high computational cost. As these publications show, increasing the quality of generated equations and increasing the speed of finding them are central concerns in symbolic regression and equation learning in general.

As described earlier, the goal of equation learning is to find an expression that fits a given set of data. The data usually consists of a set of inputs that have been applied to the unknown expression and the resulting outputs. An example of such data is given by \textcite{werner_informed_2021}. In one instance they want to find the power loss formula for an electric machine. They used four inputs, direct and quadratic current as well as temperature and motor speed, together with one observed output, the power loss. For an arbitrary problem with different inputs and outputs, the equation learner then tries to find an expression that fits this data \parencite{koza_genetic_1994}. Fitting in this context means that when the input is applied to the expression, the result will be the same as the observed output. \textcite{bomarito_bayesian_2022} proposed a way of using Bayesian model selection to combat overfitting and reduce the complexity of the generated expressions. This also helps with making the expressions more generalisable and therefore applicable to unseen inputs. A survey conducted by \textcite{dabhi_survey_2012} shows why overfitting is not desirable and why more generalisable solutions are preferred. To generate an equation, first the operators that make up the equation need to be defined. It is also possible to define a maximum length for an expression, as proposed by \textcite{bartlett_exhaustive_2024}. Expressions also consist of constants as well as variables which represent the inputs. Assuming that a given problem has three variables, the equation learner could generate an expression as seen in Equation \ref{eq:example}, where $x_n$ are the variables and $O$ is the output which should correspond to the observed output for the given variables.
\begin{equation} \label{eq:example}
O = 5 - \text{abs}(x_1) \cdot \text{sqrt}(x_2) / 10 + 2^{x_3}
\end{equation}
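
To make the notion of fitting concrete, consider the following minimal Julia sketch (the helper \verb|candidate| and the toy data are illustrative only), which evaluates the expression from Equation \ref{eq:example} on a small dataset and measures the fit with a squared-error loss:
\begin{GenericCode}[numbers=none]
# Candidate expression from the equation above (illustrative).
candidate(x) = 5 - abs(x[1]) * sqrt(x[2]) / 10 + 2^x[3]

# Each entry: three input variables and the observed output O.
data = [(x = (1.0, 4.0, 2.0), O = 8.8),
        (x = (2.0, 9.0, 1.0), O = 6.4)]

# Squared-error loss; 0.0 means the expression fits the data perfectly.
loss = sum((candidate(d.x) - d.O)^2 for d in data)
\end{GenericCode}
A lower loss means a better fit; an equation learner repeats such evaluations for every generated expression and every parametrisation.
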
A typical equation learner generates multiple expressions at once. If the equation learner generates $300$ expressions and each expression needs to be evaluated $50$ times to get the best parametrisation for each of these expressions, the total number of evaluations is $300 \cdot 50 = 15\,000$. However, it is likely that multiple runs, or generations in the context of GP, need to be performed. The number of generations depends on the problem, but assuming a maximum of $100$ generations, the total number of evaluations is equal to $300 \cdot 50 \cdot 100 = 1\,500\,000$. These values are taken from the equation learner for predicting discharge voltage curves of batteries as described by \textcite{kronberger_symbolic_2024}. Their equation learner converged after 54 generations, which amounts to roughly $800\,000$ evaluations. Depending on the complexity of the generated expressions, performing all of these evaluations takes up a lot of the runtime. Their results took over two days on an eight-core desktop CPU. While they did not provide runtime information for all problems they tested, the voltage curve prediction was the slowest. The other problems ranged from a few seconds up to a day. The problems that took several hours to days in particular show that there is still room for performance improvements. While a better CPU with more cores can be used, it is interesting to determine whether using graphics cards can yield noticeably better performance, which is the goal of this thesis.

\section[GPGPU]{General Purpose Computation on Graphics Processing Units}
\label{sec:gpgpu}
% Describe what GPGPU is and how it differs from classical programming. Talk about the architecture (SIMD) and some scientific papers on how GPUs are used to accelerate tasks.
Graphics cards (GPUs) are commonly used to increase the performance of many different applications. Originally they were designed to improve performance and visual quality in games. \textcite{dokken_gpu_2005} first described the usage of GPUs for general purpose programming (GPGPU). They have shown how the graphics pipeline can be used for GPGPU programming. Because this approach requires the programmer to understand graphics terminology, it was not an ideal solution. Therefore, Nvidia released CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}} in 2007 with the goal of allowing developers to program GPUs independently of the graphics pipeline and terminology. A study of the programmability of GPUs with CUDA and the resulting performance has been conducted by \textcite{huang_gpu_2008}. They found that GPGPU programming has potential, even for non-embarrassingly parallel problems. Research has also been done on making low-level CUDA development simpler. \textcite{han_hicuda_2011} described a directive-based language to make development simpler and less error-prone, while retaining the performance of handwritten code. To drastically simplify CUDA development, \textcite{besard_effective_2019} showed that it is possible to develop with CUDA in the high-level programming language Julia\footnote{\url{https://julialang.org/}} with similar performance to CUDA written in C. In a subsequent study, \textcite{lin_comparing_2021} found that high performance computing (HPC) on the CPU and GPU in Julia performs similarly to HPC code written in C. This means that Julia can be a viable alternative to Fortran, C and C++ in the HPC field. Additionally, Julia has the benefit of developer comfort, since it is a high-level language with modern features such as a garbage collector. \textcite{besard_rapid_2019} have also shown how the combination of Julia and CUDA helps in rapidly developing HPC software. While this thesis in general revolves around CUDA, there also exist alternatives by AMD called ROCm\footnote{\url{https://www.amd.com/de/products/software/rocm.html}} and a vendor-independent alternative called OpenCL\footnote{\url{https://www.khronos.org/opencl/}}. If not specified otherwise, the following section and its subsections use the information presented by \textcite{nvidia_cuda_2025} in their CUDA programming guide.

While in the early days of GPGPU programming a lot of research was done to assess whether this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs were used early on to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 2 to 5 for the most compute-intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains, they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are also important for science and education, helping to better understand our universe. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022}, which is especially helpful for educational scenarios. While both approaches do not reach the accuracy of detailed simulations on supercomputers, they show what even a single GPU can achieve at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU-only implementation. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs, as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for deep Q-learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it needs to be noted that GPUs do not always outperform CPUs, as illustrated by \textcite{lee_debunking_2010}, so it is important to consider whether a specific task is worth running on a GPU.

\subsection{Programming GPUs}
The development process on a GPU is vastly different from that on a CPU. A CPU has tens or hundreds of complex cores, with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having a staggering $192$ cores and twice as many threads. To demonstrate the complexity of even a simple single-core 8-bit CPU, \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of a CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flows on a single thread of execution and, on modern CPUs, even on multiple threads simultaneously \parencite{palacios_comparison_2011}. However, as seen in section \ref{sec:gpgpu}, this is often not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than one CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU devotes far more transistors to floating-point computation. This results in less efficient integer arithmetic and control flow handling. There is also less cache available per core, and clock speeds are usually much lower than those on a CPU. An overview of the differences between a CPU and a GPU architecture can be seen in figure \ref{fig:cpu_vs_gpu}.

\begin{figure}
	\centering
	\includegraphics[width=1\textwidth]{nvidia_cpu_vs_gpu.png}
	\caption{Overview of the architecture of a CPU (left) and a GPU (right). Note the higher number of simpler and smaller cores on the GPU \parencite{nvidia_cuda_2025}.}
	\label{fig:cpu_vs_gpu}
\end{figure}

Despite these drawbacks, the sheer number of cores makes a GPU a valid choice for improving the performance of an algorithm. Because of the high number of cores, GPUs are best suited for data-parallel scenarios. This is due to the SIMD architecture of these cards. SIMD stands for Single-Instruction Multiple-Data and describes an architecture in which a single stream of instructions is executed on a huge number of data streams. \textcite{franchetti_efficient_2005} and \textcite{tian_compiling_2012} describe ways of using SIMD instructions on the CPU. Their approaches led to noticeable speed-ups of 3.3 and 4.7 respectively by using SIMD instructions instead of serial computations. Extending this to GPUs, which are specifically built for SIMD/data-parallel calculations, shows why they are so powerful despite having less complex and slower cores than a CPU.

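
As an illustration, the following CUDA.jl sketch (a minimal example assuming a CUDA-capable GPU; the kernel name \verb|axpy_kernel!| is illustrative) launches one thread per array element, all executing the same instruction stream on different data:
\begin{GenericCode}[numbers=none]
using CUDA

# Every thread executes the same instructions on its own element:
# a textbook SIMD/data-parallel workload (y = a*x + y).
function axpy_kernel!(y, a, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(y)  # guard: the launch is rounded up to whole blocks
        @inbounds y[i] = a * x[i] + y[i]
    end
    return nothing
end

n = 1_000_000
x = CUDA.fill(1.0f0, n); y = CUDA.fill(2.0f0, n)
@cuda threads=256 blocks=cld(n, 256) axpy_kernel!(y, 2.0f0, x)
\end{GenericCode}
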
\subsubsection{Thread Hierarchy and Tuning}
The thousands of cores on a GPU, or more precisely the threads they execute, are grouped together in several categories. This is the thread hierarchy of GPUs. The developer can influence this grouping to a degree, which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also depends on the GPU used, which means it is important to test many different configurations to achieve the best possible result. This section explores the thread hierarchy and how it can be tuned to fit an algorithm.

At its core, a GPU consists of Streaming Multiprocessors (SMs), hardware units responsible for scheduling and executing threads, which also contain the registers used by these threads. An SM always executes a group of 32 threads simultaneously, and this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped into blocks, with one block containing a maximum of $1\,024$ threads, although blocks are often configured to be smaller. Therefore, if more than $1\,024$ threads are required, more blocks must be created. Blocks can optionally be grouped into thread block clusters, which can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called a kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that blocks can be scheduled independently, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block, and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they share memory.

\begin{figure}
	\centering
	\includegraphics[width=.8\textwidth]{thread_hierarchy.png}
	\caption{An overview of the thread hierarchy with blocks being split into multiple warps and their shared memory \parencite{amd_hip_2025}.}
	\label{fig:thread_hierarchy}
\end{figure}

A piece of code that is executed on a GPU is written as a kernel, which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads, blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in section \ref{sec:memory_model}. Consider the case where an image needs to be blurred: arranging threads in a 2D grid not only simplifies development, it also helps with optimising memory access. As the threads in a block need to access a lot of the same data, this data can be loaded into the shared memory of the block. This allows the data to be accessed much more quickly compared to when threads are allocated in only one dimension. With one-dimensional blocks it is possible that threads assigned to nearby pixels are part of different blocks, leading to a lot of duplicate data transfers. The size in each dimension of a block can be almost arbitrary within the maximum allowed number of threads. However, blocks that are too large might lead to other problems, which are described in more detail in section \ref{sec:occupancy}.

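
A sketch of such a two-dimensional launch configuration with CUDA.jl might look as follows (the blur computation itself is elided; the kernel name and image dimensions are illustrative):
\begin{GenericCode}[numbers=none]
using CUDA

function blur_kernel!(out, img)
    x = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    y = (blockIdx().y - 1) * blockDim().y + threadIdx().y
    if x <= size(img, 1) && y <= size(img, 2)
        @inbounds out[x, y] = img[x, y]  # a real blur would average neighbours
    end
    return nothing
end

img = CUDA.rand(Float32, 1920, 1080)
out = similar(img)
# 16x16 = 256 threads per block, arranged so that neighbouring pixels
# land in the same block; enough blocks to cover the whole image.
@cuda threads=(16, 16) blocks=(cld(1920, 16), cld(1080, 16)) blur_kernel!(out, img)
\end{GenericCode}
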
All threads in a warp start at the same point in a program, but with their own instruction address, allowing them to work independently. Because of the SIMD architecture, all threads in a warp must execute the same instructions, and if threads start diverging, the SM must pause threads with different instructions and execute them later. Figure \ref{fig:thread_divergence} shows how such divergences can impact performance. The situation depicted in the figure also shows that after the divergence the threads could re-converge. On older hardware this does not happen and leads to T2 being executed after T1 and T3 are finished. In situations where a lot of data-dependent thread divergence happens, most of the benefits of using a GPU have likely vanished. Strictly speaking, threads not executing the same instruction goes against the SIMD principle, but it can happen in reality due to data-dependent branching. Consequently, this leads to bad resource utilisation, which in turn leads to worse performance. Another reason for threads being paused (inactive threads) is that the number of threads started is sometimes not divisible by 32. In such cases, the last warp still contains 32 threads, but only the threads with work are executed.

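
The following sketch shows how such data-dependent branching looks in practice (CUDA.jl; the kernel is illustrative). Threads of the same warp whose elements fall on different sides of the threshold force the SM to execute both paths one after the other:
\begin{GenericCode}[numbers=none]
using CUDA

function divergent_kernel!(y, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(x)
        if x[i] > 0.5f0                   # data-dependent branch
            @inbounds y[i] = sqrt(x[i])   # path A: some threads of a warp
        else
            @inbounds y[i] = x[i] * x[i]  # path B: the rest are paused
        end
    end
    return nothing
end
\end{GenericCode}
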
\begin{figure}
	\centering
	\includegraphics[width=.8\textwidth]{thread_divergence.png}
	\caption{Thread T2 wants to execute instruction B while T1 and T3 want to execute instruction A. Therefore, T2 is inactive during this cycle and becomes active once T1 and T3 are finished. The divergent threads are thus serialised.}
	\label{fig:thread_divergence}
\end{figure}

Modern GPUs implement the so-called Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can develop fast and correct programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads after data-dependent divergence has occurred. A stack-less re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} as an alternative to the default stack-based re-convergence algorithm. Their algorithm was able to achieve higher performance than the default one. Another approach for increasing occupancy using the SIMT architecture was proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT-aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. With data-dependent divergence it is possible for threads to finish early, leaving a warp with only some threads active. This means the inactive threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on a SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation were proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers.

\subsubsection{Memory Model}
\label{sec:memory_model}
% If more is needed talk about the following:
% - Memory allocation (with the one paper diving into dynamic allocations)
% - Memory transfer (with streams potentially)

Two parts of a GPU contribute to the performance of an algorithm. The part already looked at is the compute portion, which is essential because if threads are serialised or run inefficiently, nothing can make the algorithm execute faster. However, algorithms run on a GPU usually require huge amounts of data to be processed, as they are designed for exactly that purpose. The purpose of this section is to explain how the memory model of the GPU works and how it can influence the performance of an algorithm. Figure \ref{fig:gpu_memory_layout} depicts the memory layout and the kinds of memory available. The different parts will be explained in this section.

\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{gpu_memory_layout.png}
	\caption{The layout of the memory in the GPU. The connections between the memory regions can be seen as well as the different kinds of memory available.}
	\label{fig:gpu_memory_layout}
\end{figure}

On a GPU there are multiple levels and kinds of memory available, each optimised for different purposes. This means that it is important to know what they are and how each can best be used for specific tasks. On the lowest level, threads have registers and local memory available. Registers are the fastest memory to access, but also the least abundant, with a maximum of 255 32-bit registers per thread on Nvidia GPUs and 256 on AMD GPUs \parencite{amd_hardware_2025}. However, using all registers of a thread can lead to other problems, which are described in more detail in section \ref{sec:occupancy}. Thread-local memory, on the other hand, is significantly slower than registers. This is because local memory is actually stored in global memory and therefore has the same limitations, which are explained later. It is therefore important to avoid local memory as much as possible. Local memory is usually only used when a thread uses too many registers. The compiler will then spill the remaining data into local memory and load it back into registers when needed, drastically slowing down the application.

Shared memory is the next tier of memory on a GPU. Unlike local memory and registers, shared memory is shared between all threads inside a block. The amount of shared memory depends on the GPU architecture, but for Nvidia it hovers at around 100 kilobytes (KB) per block. While this memory is slower than registers, its primary use-case is communicating and sharing data between threads in a block. If all threads in a block access a lot of overlapping data, this data can be loaded from global memory into the faster shared memory once. It can then be accessed multiple times, further increasing performance. Loading data into shared memory and accessing that data has to be done manually. Because shared memory is part of the unified data cache, it can either be used as a cache or for manual use, meaning a developer can allocate more shared memory towards caching if needed. Another feature of shared memory are the so-called memory banks. Shared memory is always split into 32 equally sized memory modules, also called memory banks. Every memory address lies in exactly one of these banks, meaning that if two threads access two addresses in different banks, the accesses can be performed simultaneously, increasing throughput.

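
A sketch of this pattern with CUDA.jl (the tile size and kernel name are illustrative; \verb|CuStaticSharedArray| allocates block-local shared memory):
\begin{GenericCode}[numbers=none]
using CUDA

function tile_kernel!(out, x)
    tile = CuStaticSharedArray(Float32, 256)  # shared by the whole block
    t = threadIdx().x
    i = (blockIdx().x - 1) * blockDim().x + t
    if i <= length(x)
        @inbounds tile[t] = x[i]  # one global-memory load per element
    end
    sync_threads()                # wait until the tile is fully loaded
    # ... all threads of the block can now read tile[] repeatedly at
    # shared-memory speed instead of re-reading global memory ...
    return nothing
end
\end{GenericCode}
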
The most abundant and slowest memory is global memory, which resides in device memory. A key constraint of device memory, and therefore global memory, is that it can only be accessed in 32, 64 or 128 byte transactions. This means that if a thread wants to read 8 bytes from global memory, the 24 bytes following the requested 8 bytes are transferred as well. As a result, the effective throughput in this example is only a fourth of the theoretical maximum. Therefore, it is important to follow optimal access patterns. These optimal patterns are architecture-dependent and are described in the corresponding sections of the CUDA programming guide.

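
The difference between a good and a bad access pattern can be illustrated as follows (a sketch; both kernels are hypothetical and assume that exactly \verb|length(y)| threads are launched). In the first kernel, the 32 threads of a warp read 32 consecutive values, which the hardware can serve with a few wide transactions; in the second, each access lands in its own chunk and most of the transferred bytes are wasted:
\begin{GenericCode}[numbers=none]
using CUDA

function coalesced!(y, x)  # neighbouring threads -> neighbouring addresses
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds y[i] = x[i]
    return nothing
end

function strided!(y, x, stride)  # each access in a separate memory chunk
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    @inbounds y[i] = x[(i - 1) * stride + 1]
    return nothing
end
\end{GenericCode}
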
A small portion of device memory is allocated to constant memory. Constant memory is accessible by all threads and, as the name implies, cannot be written to by threads. It can be initialised by the CPU when starting a kernel, if needed. As constant memory has a separate cache, it can be used to speed up access to constant and frequently accessed data.

Other special kinds of memory are texture and surface memory. According to \textcite{amd_hip_2025}, texture memory is read-only memory, while surface memory can also be written to, which is the only difference between these two kinds of memory. Nvidia does not explicitly state this, but since texture accesses are only performed via caches, it is implied that texture memory is read-only on Nvidia GPUs as well. As the name implies, this kind of memory is optimised for accessing textures. This means that threads of the same warp accessing data that is spatially close together will see increased performance. As already mentioned, surface memory works the same way, with the difference that it can be written to. It is therefore well suited for manipulating two- or three-dimensional data.

\subsubsection{Occupancy}
\label{sec:occupancy}
% Describe occupancy, why it is important and what can impact it. Maybe add a simplified version of this table: \url{https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability} to explain the bounds and effects on occupancy

Occupancy describes the utilisation of a GPU. High occupancy means that warps are executing, or in other words, that the cores are occupied with work. This is important, as low occupancy means that the GPU is waiting for work to be scheduled and is therefore idle. As a result, it is desirable to achieve high occupancy in order to increase the performance of an algorithm. It needs to be noted that occupancy is not the only lever for improving performance, as it is possible for the GPU to have high occupancy while performing a lot of unnecessary or redundant work, or while utilising slower compute resources. An example of the latter would be an algorithm that uses 64-bit floating point (FP64) numbers where 32-bit floating point (FP32) numbers would have sufficient accuracy. Because GPUs tend to have fewer FP64 than FP32 compute resources, performing FP64 operations takes longer. Nonetheless, low occupancy will very likely result in performance degradation, while high occupancy will either improve performance or at least do no harm. Ways of achieving high occupancy are outlined in this section, as most other performance problems can be solved algorithmically.

\begin{table}
	\centering
	\begin{tabular}{l|cc}
		Compute Capability & 8.9 & 10.x \\
		\hline
		Max. number of threads per block & \multicolumn{2}{c}{1\,024} \\
		Warp size & \multicolumn{2}{c}{32 threads} \\
		Max. number of warps per SM & 48 & 64 \\
		Max. number of blocks per SM & 24 & 32 \\
		Max. number of threads per SM & 1\,536 & 2\,048 \\
		Number of 32-bit registers per SM & \multicolumn{2}{c}{65\,536} \\
		Max. number of 32-bit registers per block & \multicolumn{2}{c}{65\,536} \\
		Max. number of 32-bit registers per thread & \multicolumn{2}{c}{255} \\
		Max. amount of shared memory per SM & 100 Kilobytes & 228 Kilobytes \\
		Max. amount of shared memory per block & 99 Kilobytes & 227 Kilobytes
	\end{tabular}
	\caption{A simplified version of the technical specifications for the Compute Capabilities 8.9 and 10.x \parencite{nvidia_cuda_2025}. These correspond to the Nvidia Ada Lovelace and Blackwell microarchitectures.}
	\label{tab:compute_capabilities}
\end{table}

When starting a kernel, the most important configuration is the number of threads and thread blocks that need to be started, as this also affects occupancy. Table \ref{tab:compute_capabilities} presents the most notable limitations that can affect occupancy. These limitations need to be considered when choosing a kernel configuration. It is important to note that, depending on the GPU and the problem, occupancy tuning might differ, and the same approach might perform well on one GPU but poorly on another. The points discussed here are therefore only guidelines. Tools like Nvidia Nsight Compute\footnote{\url{https://developer.nvidia.com/nsight-compute}} and Nsight Systems\footnote{\url{https://developer.nvidia.com/nsight-systems}} are essential for performance tuning. Nsight Compute also contains an occupancy calculator, which takes a kernel and computes how its configuration performs in terms of occupancy, and also lets the developer try out different configurations \parencite{nvidia_nsight_2025}.

In general, it is important to have as many warps as possible ready for execution. While this means that many warps could be executed but are not, this is actually desirable. A key feature of GPUs is so-called latency hiding, meaning that while one warp waits, for example for data to be retrieved, another warp that is ready for execution can be run. With low occupancy, and therefore few to no warps waiting for execution, latency hiding does not work and the hardware sits idle. As a result, the runtime increases, which also explains why high occupancy is not guaranteed to improve performance, while low occupancy can and often will increase the runtime.

As seen in table \ref{tab:compute_capabilities}, there exist different limitations that can impact occupancy. The number of warps per SM is important, as it determines the degree of parallelism achievable per SM. If, due to other limitations, the number of warps per SM is below the maximum, hardware sits idle. One such limitation is the number of registers per block and per SM. In the case of compute capability 8.9, one SM can handle $32 \cdot 48 = 1\,536$ threads. This leaves $65\,536 / 1\,536 \approx 42$ registers per thread, which is lower than the theoretical maximum of $255$ registers per thread. Typically, one register is mapped to one variable in the kernel code, meaning a developer can use up to 42 variables in their code. However, if a variable needs 64 bits to store its value, the register usage doubles, as all registers on a GPU are 32-bit. On a GPU with compute capability 10.x, a developer can use up to $65\,536 / 2\,048 = 32$ registers per thread. Of course a developer can use more registers, but this results in lower occupancy. However, depending on the algorithm, using more registers might benefit performance more than the higher occupancy would, in which case occupancy is not as important. If a developer needs more than $255$ registers for their variables, the additional variables will spill into local memory, which is, as described in section \ref{sec:memory_model}, not desirable.

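
These limits can be turned into a quick back-of-the-envelope estimate. The following toy Julia helper (hypothetical; it uses the compute capability 8.9 numbers from table \ref{tab:compute_capabilities} and ignores the per-warp allocation granularity of real hardware) shows how the register budget caps the number of resident threads:
\begin{GenericCode}[numbers=none]
# How many threads can be resident on one SM, given only the register
# budget? (Compute capability 8.9 numbers; toy model.)
function threads_by_registers(regs_per_thread;
                              regs_per_sm = 65_536, max_threads = 1_536)
    min(regs_per_sm ÷ regs_per_thread, max_threads)
end

threads_by_registers(42)   # -> 1536, full occupancy is possible
threads_by_registers(128)  # -> 512, only a third of the threads fit
\end{GenericCode}
Real occupancy calculators, such as the one in Nsight Compute, perform this kind of computation across all limits simultaneously.
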
Shared memory consumption can also impact occupancy. If, for example, a block needs all the available shared memory per block, which is almost the entire shared memory of an SM, that SM can only serve this one block. On compute capability 10.x, this would mean that occupancy would be at most $50\%$, as a block can have up to $1\,024$ threads while an SM supports up to $2\,048$ threads. Again, in such cases it needs to be determined whether the performance gain from using this much shared memory is worth the lower occupancy.

Balancing these limitations, and therefore occupancy and performance, often requires a lot of trial and error with the help of the aforementioned tools. In cases where occupancy is already high and the number of warps ready for execution is also high, other areas for performance improvements need to be explored. Algorithmic optimisation is always a good idea. Some performance improvements can also be achieved by altering the computations to use different parts of the GPU. One such optimisation is using FP32 operations wherever possible. Another well suited optimisation is to rewrite the algorithm to use as many Fused Multiply-Add (FMA) instructions as possible. FMA is a special floating-point instruction that multiplies two values and adds a third, all in a single clock cycle \parencite{nvidia_cuda_2025-1}. However, the result might deviate slightly from performing the two operations separately, which means this instruction should be avoided in accuracy-sensitive scenarios. If the compiler detects a floating-point operation with the FMA structure, it will automatically be compiled to an FMA instruction. To prevent this, in C++ the developer can call the intrinsics \_\_fadd\_rn and \_\_fmul\_rn for addition and multiplication respectively.

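
In Julia the same trade-off can be made explicit (a minimal sketch): \verb|fma| forces the fused instruction with a single rounding step, and the result may differ slightly from the two separately rounded operations:
\begin{GenericCode}[numbers=none]
a, b, c = 1.0f0 / 3.0f0, 3.0f0, -1.0f0

separate = a * b + c     # two instructions, two rounding steps
fused    = fma(a, b, c)  # one fused instruction, one rounding step

separate == fused        # may be false: the results can differ slightly
\end{GenericCode}
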
\subsection[PTX]{Parallel Thread Execution}
% Describe what PTX is to get a common ground for the implementation chapter. Probably a short section.
% https://docs.nvidia.com/cuda/parallel-thread-execution/
While in most cases a GPU can be programmed in a higher-level language like C++ or even Julia\footnote{\url{https://juliagpu.org/}}, it is also possible to program GPUs with the low-level language Parallel Thread Execution (PTX) developed by Nvidia. A brief overview of what PTX is and how it can be used to program GPUs is given in this section. Information in this section is taken from the PTX documentation \parencite{nvidia_parallel_2025} if not stated otherwise.

PTX defines a virtual machine with its own instruction set architecture (ISA) and is designed for data-parallel processing on a GPU. It is an abstraction of the underlying hardware instruction set, allowing PTX code to be portable across Nvidia GPUs. In order for PTX code to be usable on the GPU, the driver is responsible for compiling it to the hardware instruction set of the GPU it is run on. A developer typically writes a kernel in CUDA using C++, for example, and the Nvidia compiler generates the PTX code for that kernel. This PTX code is then compiled by the driver when the kernel is executed. The concepts for programming the GPU with PTX and CUDA are the same, apart from the terminology, which is slightly different. For consistency, the CUDA terminology will continue to be used.

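
With CUDA.jl this compilation pipeline can be inspected directly: the reflection macro \verb|@device_code_ptx| prints the PTX generated for a kernel launch (a minimal sketch, assuming a CUDA-capable GPU; the kernel \verb|increment!| is illustrative):
\begin{GenericCode}[numbers=none]
using CUDA

increment!(x) = (@inbounds x[threadIdx().x] += 1.0f0; nothing)

x = CUDA.zeros(Float32, 32)
# Prints the PTX that the Julia GPU compiler emits for this kernel;
# the driver later compiles this PTX to the GPU's native instructions.
@device_code_ptx @cuda threads=32 increment!(x)
\end{GenericCode}
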
Syntactically, PTX resembles assembly-style code. Every PTX program must have a \verb|.version| directive which indicates the PTX version, and an optional \verb|.target| directive which indicates the compute capability. If the program uses 64-bit addresses, the optional \verb|.address_size| directive can be used to indicate this, which simplifies the code for such applications. After these directives, the actual code is written. As each PTX program needs an entry point (the kernel), the \verb|.entry| directive indicates the name of the kernel and the parameters it needs. It is also possible to write helper functions with the \verb|.func| directive. Inside the kernel or a helper function, normal PTX code can be written. Because PTX is very low level, it assumes an underlying register machine; therefore, a developer needs to think about register management. This includes loading data from global or shared memory into registers if needed. Instructions for manipulating data, like addition and subtraction, generally follow the structure \verb|operation.datatype|, followed by up to four operands. For adding two FP32 values and storing the result in the register \%n, the code looks like the following:
\begin{GenericCode}[numbers=none]
add.f32 \%n, 0.1, 0.2;
\end{GenericCode}
Loops in the classical sense do not exist in PTX. Instead, a developer needs to define jump targets for the beginning and end of the loop. The code in \ref{code:ptx_loop} shows how a function with a simple loop can be implemented. The loop counts down to zero from the passed parameter $N$, which is loaded into the register \%n in line 6. Once the value in the register \%n reaches zero, the code branches at line 9 to the jump target at line 12 and the loop is finished. All other directives, as well as further information on writing PTX code, can be found in the PTX documentation \parencite{nvidia_parallel_2025}.

\begin{program}
\begin{GenericCode}
.func loop(.param .u32 N)
{
	.reg .u32 \%n;
	.reg .pred \%p;

	ld.param.u32 \%n, [N];
Loop:
	setp.eq.u32 \%p, \%n, 0;
	@\%p bra Done;
	sub.u32 \%n, \%n, 1;
	bra Loop;
Done:
}
\end{GenericCode}
\caption{A PTX program fragment depicting how loops can be implemented.}
\label{code:ptx_loop}
\end{program}

\section{Compilers}
Compilers are a necessary tool for many developers. If a developer wants to run their program, it is very likely they need one. As best described by \textcite{aho_compilers_2006} in their dragon book, a compiler takes code written by a human in some source language and translates it into a destination language readable by a computer. This section briefly explores what compilers are and the research done in this long-established field of computer science. Furthermore, the topics of transpilers and interpreters are explored, as their use-cases are very similar.

\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts: the analyser, also called the frontend, and the synthesiser, also called the backend. The frontend is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. Generating code in the target language from the intermediate representation is the job of the backend. This target code can be assembly or anything else that is needed for a specific use-case. The intermediate representation also makes it simple to swap out frontends or backends. The GNU Compiler Collection \parencite{gcc_gcc_2025} takes advantage of different frontends to provide support for many languages, including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile for virtual machines. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the Low Level Virtual Machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is the ability for one program to be run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are compiled into their respective bytecode. This bytecode can then be interpreted or compiled to physical machine code and then run. According to the JVM specification \parencite{lindholm_java_2025}, Java bytecode is interpreted and also compiled with a just-in-time (JIT) compiler to increase the performance of frequently executed code blocks. On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023}, the CLR always compiles the bytecode to physical machine code using a JIT compiler before it is executed.

A grammar describes how a language is structured. It not only describes the structure of natural language, but it can also be used to describe the structure of a programming language. \textcite{chomsky_certain_1959} found that grammars can be grouped into four levels, with regular and context-free grammars being the most relevant for programming languages. A regular grammar is of the structure $A = a\,|\,a\,B$, which is called a rule. The symbols $A$ and $B$ are non-terminal symbols and $a$ is a terminal symbol. A non-terminal symbol stands for another rule with the same structure and must only occur after a terminal symbol. Terminal symbols are fixed symbols or values that can be found in the input stream, like literals in programming languages. Context-free grammars are more complex and are of the structure $A = \beta$. In this context, $\beta$ stands for any combination of terminal and non-terminal symbols. Therefore, a rule like $A = a\,|\,a\,B\,a$ is allowed at this grammar level, which shows that context-free grammars permit enclosing structures. When writing grammars for programming languages, other properties are also important in order to efficiently validate or parse input against the grammar. However, these are not discussed here, but are described by \textcite{aho_compilers_2006}. They also describe how generating a parser from a grammar can be automated. This automation can be performed by parser generators like Yacc \parencite{johnson_yacc_1975}, as described in their book. More modern alternatives are Bison\footnote{\url{https://www.gnu.org/software/bison/}} or Antlr\footnote{\url{https://www.antlr.org/}}. Before the parser can validate the input stream, a scanner is needed, as described by \textcite{cooper_engineering_2022}. The scanner reads every character of the input stream and is responsible for removing white-space and ensuring that only valid characters and words are present. Flex\footnote{\url{https://github.com/westes/flex}} is a tool for generating scanners and is often used in combination with Bison. A simplified version of the compiler architecture using Flex and Bison is depicted in figure \ref{fig:compiler_layout}. It shows how source code is taken and transformed into the intermediate representation by the frontend, and how it is converted into executable machine code by the backend.

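
To make these concepts concrete, the following minimal Julia sketch combines a scanner and a recursive-descent parser for the context-free rule $E = n\,|\,n\,\text{'+'}\,E$, where $n$ is an integer literal (all names and the token scheme are illustrative only):
\begin{GenericCode}[numbers=none]
# Scanner: split the input stream into tokens, dropping white-space.
tokenize(src) = split(src)

# Parser for the rule E = n | n '+' E, returning the evaluated result.
function parse_expr(tokens, pos = 1)
    value = parse(Int, tokens[pos])             # terminal: integer literal
    if pos + 1 <= length(tokens) && tokens[pos + 1] == "+"
        rhs, pos = parse_expr(tokens, pos + 2)  # non-terminal: recurse on E
        return value + rhs, pos
    end
    return value, pos
end

parse_expr(tokenize("1 + 2 + 39"))  # -> (42, 5)
\end{GenericCode}
Parser generators like Bison automate exactly this kind of construction from a grammar description.
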
\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{compiler_architecture.png}
	\caption{A simplified overview of the architecture of a compiler using Flex and Bison.}
	\label{fig:compiler_layout}
\end{figure}

% More references to JIT: https://dl.acm.org/doi/abs/10.1145/857076.857077
\section{GPU Interpretation}
% Different sources on how to do interpretation on the GPU (and maybe interpretation in general too?)

\subsection{Transpilers}
% Talk about what transpilers are and how to implement them. If possible also GPU-specific transpilation. Also talk about compilation and register management.

With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another. The major difference between these two use-cases lies in the backend: instead of machine code, a transpiler's backend generates source code in another programming language. A popular transpiler example is TypeScript, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples of transpilers are the C2Rust transpiler \parencite{ling_rust_2022}, which transpiles C code into Rust code, as well as the PyJL transpiler \parencite{marcelino_transpiling_2022}, which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example of a transpiler was developed by \textcite{intel_mcs86_1978}, transforming assembly code for the 8080 CPU into assembly code for the 8086 CPU. Transpilers can also be used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. There also exists a transpiler that transforms CUDA code into highly parallel CPU code. \textcite{moses_high-performance_2023} described this transpiler, and they found that the generated code performs noticeably better than code transformed by hand. When designing complex processors and accelerators, register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study, \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. They compared their results with a CPU implementation running on 80 CPUs and found that the transpiled CUDA version was 40 times faster. Using transpilers for software backends and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for use in multi-programming-language environments that share some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use.

\subsection{Interpreters}
% What are interpreters; how they work; should mostly contain/reference GPU interpreters

Interpreters are a different kind of program for executing source code. Rather than compiling the code and executing the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but Java, or more precisely Java bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters are not only used for programming languages; they can also be used in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work, \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently, and with the help of communication via shared memory, they were able to evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute MIMD programs on a SIMD GPU. However, as noted by the authors, any kind of interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful is rule-based simulations. \textcite{koster_massively_2020} have shown how they implemented a GPU interpreter for such simulations. Together with other novel performance improvements for running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. As publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown, interpreted languages often trail behind compiled languages in terms of performance, but interpreters per se are not slow. And while they come with performance overhead, as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast, easy and powerful alternative for certain tasks.

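
A tree-walking interpreter for simple arithmetic expressions, the kind of component this thesis revolves around, can be sketched in a few lines of Julia (the node layout and all names are illustrative only):
\begin{GenericCode}[numbers=none]
abstract type Node end
struct Const <: Node; value::Float64; end
struct Var   <: Node; index::Int;     end   # refers to inputs[index]
struct Op    <: Node; f::Function; args::Vector{Node}; end

interpret(n::Const, inputs) = n.value
interpret(n::Var,   inputs) = inputs[n.index]
interpret(n::Op,    inputs) = n.f((interpret(a, inputs) for a in n.args)...)

# x1 + 2 * x2 evaluated at (x1, x2) = (1, 3)  ->  7.0
expr = Op(+, [Var(1), Op(*, [Const(2.0), Var(2)])])
interpret(expr, (1.0, 3.0))
\end{GenericCode}
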
@ -14,7 +14,7 @@
\RequirePackage{xifthen}

%\usepackage[style=numeric-comp,backend=biber,bibencoding=auto]{biblatex}
-\usepackage[style=\@bibstyle,backend=biber]{biblatex}
+\usepackage[style=\@bibstyle,backend=biber,uniquelist=false]{biblatex}
\ExecuteBibliographyOptions{
bibencoding=auto,
bibwarn=true,
BIN thesis/images/compiler_architecture.png (new file; 48 KiB)
BIN thesis/images/gpu_memory_layout.png (new file; 112 KiB)
BIN thesis/images/nvidia_cpu_vs_gpu.png (new file; 24 KiB)
BIN thesis/images/thread_divergence.png (new file; 8.9 KiB)
BIN thesis/images/thread_hierarchy.png (new file; 18 KiB)
thesis/images/thread_hierarchy.svg (new file; 477 KiB; diff suppressed because one or more lines are too long)
BIN thesis/main.pdf (binary file changed)
@ -32,7 +32,7 @@
%%%-----------------------------------------------------------------------------

\title{Interpreter and Transpiler for simple expressions on Nvidia GPUs using Julia}
-\author{Daniel Wiplinger}
+\author{Daniel Roth}
\programname{Software Engineering}

%\programtype{Fachhochschul-Bachelorstudiengang} % select/edit
File diff suppressed because it is too large