Compare commits
No commits in common. "main" and "initial-implementation" have entirely different histories.
LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Daniel Roth
+Copyright (c) 2024 Daniel Wiplinger
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

(binary image changed: 154 KiB before, 152 KiB after)
@ -1,75 +0,0 @@
|
|||
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:137.0) Gecko/20100101 Firefox/137.0" version="26.2.6">
|
||||
<diagram name="Page-1" id="R-oAYELteez0U9UgfQ2t">
|
||||
<mxGraphModel dx="2068" dy="1147" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
|
||||
<root>
|
||||
<mxCell id="0" />
|
||||
<mxCell id="1" parent="0" />
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-14" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-4" target="GDUa8-GdCzSgoxu7vCdt-12" edge="1">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-4" value="Pre-Processing" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||
<mxGeometry x="500" y="280" width="120" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-8" value="Interpreter" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;width=90;height=40;" parent="1" vertex="1">
|
||||
<mxGeometry x="440" y="160" width="440" height="480" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-9" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=#000000;" parent="1" vertex="1">
|
||||
<mxGeometry x="270" y="520" width="40" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-13" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-10" target="GDUa8-GdCzSgoxu7vCdt-4" edge="1">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-15" value="<div align="left"><font style="font-size: 12px;"><b>Input:</b></font></div><div align="left"><font style="font-size: 12px;">Expressions</font></div><div align="left"><font style="font-size: 12px;">Variable-Sets</font></div><div align="left"><font style="font-size: 12px;">Parameters</font></div>" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" parent="GDUa8-GdCzSgoxu7vCdt-13" vertex="1" connectable="0">
|
||||
<mxGeometry x="-0.4633" relative="1" as="geometry">
|
||||
<mxPoint x="-33" as="offset" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-10" value="" style="ellipse;html=1;shape=endState;fillColor=#000000;strokeColor=none;" parent="1" vertex="1">
|
||||
<mxGeometry x="270" y="280" width="40" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-11" value="CPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
|
||||
<mxGeometry x="460" y="220" width="400" height="140" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-18" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0.5;exitY=1;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-12" target="GDUa8-GdCzSgoxu7vCdt-17" edge="1">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<Array as="points">
|
||||
<mxPoint x="770" y="356" />
|
||||
<mxPoint x="770" y="356" />
|
||||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-19" value="<div align="left"><font style="font-size: 12px;"><b>Input:<br></b></font></div><div align="left"><font style="font-size: 12px;">Processed Expressions</font></div><div align="left"><font style="font-size: 12px;">Variable-Sets</font></div><div align="left"><font style="font-size: 12px;">Parameters</font></div>" style="edgeLabel;html=1;align=left;verticalAlign=middle;resizable=0;points=[];" parent="GDUa8-GdCzSgoxu7vCdt-18" vertex="1" connectable="0">
|
||||
<mxGeometry x="0.1565" y="-2" relative="1" as="geometry">
|
||||
<mxPoint x="-48" y="-26" as="offset" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-12" value="Dispatch Kernel" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||
<mxGeometry x="710" y="280" width="120" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-16" value="GPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
|
||||
<mxGeometry x="680" y="456" width="180" height="139" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-25" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-17" target="GDUa8-GdCzSgoxu7vCdt-21" edge="1">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-17" value="Evaluation" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||
<mxGeometry x="710" y="520" width="120" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-20" value="CPU" style="shape=umlFrame;whiteSpace=wrap;html=1;pointerEvents=0;" parent="1" vertex="1">
|
||||
<mxGeometry x="460" y="456" width="170" height="139" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-22" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" parent="1" source="GDUa8-GdCzSgoxu7vCdt-21" target="GDUa8-GdCzSgoxu7vCdt-9" edge="1">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-26" value="<div><font style="font-size: 12px;"><b>Output:</b></font></div><div><font style="font-size: 12px;">Evaluation-Results</font></div>" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" parent="GDUa8-GdCzSgoxu7vCdt-22" vertex="1" connectable="0">
|
||||
<mxGeometry x="0.4108" y="-1" relative="1" as="geometry">
|
||||
<mxPoint x="13" y="1" as="offset" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="GDUa8-GdCzSgoxu7vCdt-21" value="Retrieve Results" style="rounded=0;whiteSpace=wrap;html=1;" parent="1" vertex="1">
|
||||
<mxGeometry x="485" y="520" width="120" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
</mxfile>
|
|
@@ -1,84 +0,0 @@ (draw.io file present on main only)
(Deleted diagram: the transpiler pipeline. Same structure as the interpreter diagram, but inside a "Transpiler" frame and with an extra Code-Generation step between Pre-Processing and Dispatch Kernel; the input and output edge labels are identical.)
@@ -1,6 +1,6 @@ (first of several hunks in a modified draw.io file)
(Data-layout diagram with expression cells e1–e6, variable cells x1–x3, parameter cells p1–p5, a "Results Matrix" grid, and a curly bracket labeled "Expression 1 through Expression n". Differences between the branches: the mxfile metadata line changes (main: Firefox 137.0, version 26.1.1; initial-implementation: Firefox 129.0, version 24.7.6), as do the mxGraphModel view offsets (dx/dy 830/457 vs 989/539). One square cell and its label move from x=960, y=520 to x=1040, y=440, the label renamed from "p3" on main to "p1" on initial-implementation; initial-implementation adds a separator line from (1040, 480) to (1080, 480); and one connector's arrowhead changes from endArrow=baseDash (endFill=0, endSize=18) on main to endArrow=classic. The remaining hunks only swap the order of the vertex/parent attributes on otherwise unchanged cells.)
@@ -1,112 +0,0 @@ (draw.io file present on main only)
(Deleted diagram: kernel-dispatch comparison. Left side: Interpreter → one Kernel → three parallel Dispatch → Evaluate chains. Right side: Transpiler → three separate Kernels, each followed by its own Dispatch → Evaluate chain.)
@@ -1,40 +0,0 @@ (draw.io file present on main only)
(Deleted diagram: token encoding of the expression "X1 + 2". The three tokens are shown annotated as "Type: Variable, Value: 1", "Type: Constant, Value: 2" and "Type: Operator, Value: Addition".)
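The deleted diagram above showed how each element of an expression is stored as a typed token. A minimal Julia sketch of such an encoding (the enum and field names here are illustrative assumptions, not the repository's actual ExpressionElement definition):

# Illustrative token encoding; names are assumptions.
@enum TokenKind VARIABLE CONSTANT OPERATOR

struct Token
    Type::TokenKind
    Value::Int32   # variable index, operator id, or reinterpreted Float32 bits
end

# "X1 + 2" in postfix: variable 1, constant 2.0f0, addition (operator id 1 assumed)
tokens = [Token(VARIABLE, 1), Token(CONSTANT, reinterpret(Int32, 2.0f0)), Token(OPERATOR, 1)]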
@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
 
-    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
+    for i in 1:repetitions # Simulate parameter tuning
        results = Interpreter.interpret(exprs, X, p)
    end
@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
 
-    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
+    for i in 1:repetitions # Simulate parameter tuning
        results = Transpiler.evaluate(exprs, X, p)
    end
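Both hunks above trim the same comment; the loop they annotate models local-search parameter optimisation, where the variable matrix X stays fixed across repetitions while the parameters p change slightly each step, which is why the evaluations must run sequentially. A sketch of that access pattern (the perturbation rule is invented for illustration):

# Why the repetitions are sequential: X never changes, p does.
function tune(evaluate::Function, exprs, X::Matrix{Float32}, p::Vector{Vector{Float32}}; steps = 100)
    results = nothing
    for _ in 1:steps
        results = evaluate(exprs, X, p)  # e.g. Interpreter.interpret or Transpiler.evaluate
        p = [pi .+ 0.01f0 .* randn(Float32, length(pi)) for pi in p]  # small local-search move
    end
    return results
end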
@@ -22,9 +22,9 @@ NOTE: All 64-Bit values will be converted to 32-Bit. Be aware of the lost precis
 "
 function expr_to_postfix(expr::Expr)::PostfixType
    postfix = PostfixType()
-   @inbounds operator = get_operator(expr.args[1])
+   operator = get_operator(expr.args[1])
 
-   @inbounds for j in 2:length(expr.args)
+   for j in 2:length(expr.args)
        arg = expr.args[j]
 
        if typeof(arg) === Expr
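In expr_to_postfix, expr.args[1] holds the operator and args[2:end] the operands, with nested Expr nodes handled recursively. A toy version showing just the traversal order (plain vectors instead of the repository's PostfixType):

# Toy postfix conversion; illustrative only.
to_postfix(x) = Any[x]                        # Symbol or Number leaf
function to_postfix(expr::Expr)
    out = Any[]
    for j in 2:length(expr.args)              # operands first
        append!(out, to_postfix(expr.args[j]))
    end
    push!(out, expr.args[1])                  # operator last
    return out
end

to_postfix(:(x1 * 2 + p1))                    # Any[:x1, 2, :*, :p1, :+]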
@@ -15,7 +15,7 @@ export interpret
 function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parameters::Vector{Vector{Float32}})::Matrix{Float32}
 
    exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions))
-   @inbounds for i in eachindex(expressions)
+   for i in eachindex(expressions)
        exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i])
    end
 
@@ -24,16 +24,16 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
    cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
    cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
    # put into seperate cuArray, as this is static and would be inefficient to send seperatly to every kernel
-   cudaStepsize = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
+   cudaStepsize = CuArray([Utils.get_max_inner_length(exprs), Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
 
    # each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
    cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))
 
    # Start kernel for each expression to ensure that no warp is working on different expressions
-   @inbounds for i in eachindex(exprs)
-       kernel = @cuda launch=false fastmath=true interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
-       # config = launch_configuration(kernel.fun)
-       threads = min(variableCols, 128)
+   for i in eachindex(exprs)
+       kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
+       config = launch_configuration(kernel.fun)
+       threads = min(variableCols, config.threads)
        blocks = cld(variableCols, threads)
 
        kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
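The "+" side above replaces main's hard-coded 128-thread block (and fastmath=true) with CUDA.jl's occupancy API. The idiom, in a self-contained runnable form (gpu_fill! is a made-up kernel for demonstration):

using CUDA

function gpu_fill!(out)  # placeholder kernel: writes 1.0f0 into every slot
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = 1.0f0
    end
    return
end

out = CUDA.zeros(Float32, 10_000)
kernel = @cuda launch=false gpu_fill!(out)  # compile without launching
config = launch_configuration(kernel.fun)   # driver-suggested threads per block
threads = min(length(out), config.threads)
blocks = cld(length(out), threads)
kernel(out; threads, blocks)                # launch with the chosen shape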
@@ -46,73 +46,75 @@
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
 function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
    varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
-   @inbounds variableCols = length(variables) / stepsize[2]
+   # stride = gridDim().x * blockDim().x # nctaid.x * ntid.x
+   variableCols = length(variables) / stepsize[3]
 
    if varSetIndex > variableCols
        return
    end
 
-   # firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
-   # lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
-   @inbounds firstParamIndex = ((exprIndex - 1) * stepsize[1]) # Exclusive
+   firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
+   lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
+   firstParamIndex = ((exprIndex - 1) * stepsize[2]) # Exclusive
 
    operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
    operationStackTop = 0 # stores index of the last defined/valid value
 
-   @inbounds firstVariableIndex = ((varSetIndex-1) * stepsize[2]) # Exclusive
-
-   @inbounds for expr in expressions
-       if expr.Type == EMPTY
-           break
-       elseif expr.Type == INDEX
-           val = expr.Value
-           operationStackTop += 1
-
-           if val > 0
-               operationStack[operationStackTop] = variables[firstVariableIndex + val]
-           else
-               val = abs(val)
-               operationStack[operationStackTop] = parameters[firstParamIndex + val]
-           end
-       elseif expr.Type == FLOAT32
-           operationStackTop += 1
-           operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
-       elseif expr.Type == OPERATOR
-           type = reinterpret(Operator, expr.Value)
-           if type == ADD
-               operationStackTop -= 1
-               operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1]
-           elseif type == SUBTRACT
-               operationStackTop -= 1
-               operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1]
-           elseif type == MULTIPLY
-               operationStackTop -= 1
-               operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1]
-           elseif type == DIVIDE
-               operationStackTop -= 1
-               operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1]
-           elseif type == POWER
-               operationStackTop -= 1
-               operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1]
-           elseif type == ABS
-               operationStack[operationStackTop] = abs(operationStack[operationStackTop])
-           elseif type == LOG
-               operationStack[operationStackTop] = log(operationStack[operationStackTop])
-           elseif type == EXP
-               operationStack[operationStackTop] = exp(operationStack[operationStackTop])
-           elseif type == SQRT
-               operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
-           end
-       else
-           operationStack[operationStackTop] = NaN32
-           break
-       end
-   end
+   # for varSetIndex in index:stride
+   firstVariableIndex = ((varSetIndex-1) * stepsize[3]) # Exclusive
+
+   for i in firstExprIndex:lastExprIndex
+       if expressions[i].Type == EMPTY
+           break
+       elseif expressions[i].Type == INDEX
+           val = expressions[i].Value
+           operationStackTop += 1
+
+           if val > 0
+               operationStack[operationStackTop] = variables[firstVariableIndex + val]
+           else
+               val = abs(val)
+               operationStack[operationStackTop] = parameters[firstParamIndex + val]
+           end
+       elseif expressions[i].Type == FLOAT32
+           operationStackTop += 1
+           operationStack[operationStackTop] = reinterpret(Float32, expressions[i].Value)
+       elseif expressions[i].Type == OPERATOR
+           type = reinterpret(Operator, expressions[i].Value)
+           if type == ADD
+               operationStackTop -= 1
+               operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1]
+           elseif type == SUBTRACT
+               operationStackTop -= 1
+               operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1]
+           elseif type == MULTIPLY
+               operationStackTop -= 1
+               operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1]
+           elseif type == DIVIDE
+               operationStackTop -= 1
+               operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1]
+           elseif type == POWER
+               operationStackTop -= 1
+               operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1]
+           elseif type == ABS
+               operationStack[operationStackTop] = abs(operationStack[operationStackTop])
+           elseif type == LOG
+               operationStack[operationStackTop] = log(operationStack[operationStackTop])
+           elseif type == EXP
+               operationStack[operationStackTop] = exp(operationStack[operationStackTop])
+           elseif type == SQRT
+               operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
+           end
+       else
+           operationStack[operationStackTop] = NaN
+           break
+       end
+   end
 
-   # "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
-   # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
-   resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
-   @inbounds results[resultIndex] = operationStack[operationStackTop]
+   # "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
+   # "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
+   resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
+   results[resultIndex] = operationStack[operationStackTop]
+   # end
 
    return
 end
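On both sides, the kernel body is a stack machine over postfix tokens: operands are pushed, binary operators replace the top two slots with their result, unary operators (ABS, LOG, EXP, SQRT) rewrite the top slot, and the final top of stack is the result for this thread's variable set. A host-side reference in plain Julia, using hypothetical NamedTuple tokens and binary operators only, to make the control flow easy to check:

# Host-side sketch of the kernel's stack evaluation (illustrative token format).
function eval_postfix(tokens, vars::Vector{Float32}, params::Vector{Float32})::Float32
    stack = Float32[]
    for t in tokens
        if t.kind === :index                 # positive: variable, negative: parameter
            push!(stack, t.val > 0 ? vars[t.val] : params[abs(t.val)])
        elseif t.kind === :const
            push!(stack, t.val)
        else                                 # :op, binary operators only in this sketch
            b = pop!(stack); a = pop!(stack)
            push!(stack, t.val === :+ ? a + b :
                         t.val === :- ? a - b :
                         t.val === :* ? a * b :
                         t.val === :/ ? a / b : a ^ b)
        end
    end
    return last(stack)
end

# x1 + 2 with x1 = 3.0f0 gives 5.0f0
eval_postfix([(kind = :index, val = 1), (kind = :const, val = 2.0f0), (kind = :op, val = :+)],
             Float32[3.0f0], Float32[])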
@@ -14,25 +14,10 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
    variableCols = size(variables, 2)
    kernels = Vector{CuFunction}(undef, length(expressions))
 
-   # TODO: test this again with multiple threads. The first time I tried, I was using only one thread
-   # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds
-   # Threads.@threads for i in eachindex(expressions)
-   #   cacheLock = ReentrantLock()
-   #   cacheHit = false
-   #   lock(cacheLock) do
-   #       if haskey(cache, expressions[i])
-   #           kernels[i] = cache[expressions[i]]
-   #           cacheHit = true
-   #       end
-   #   end
-
-   #   if cacheHit
-   #       continue
-   #   end
-
-   #   formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i])
-
-   #   kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
+   # TODO: Use cache
+   # kernel = transpile(expressions[i], varRows, Utils.get_max_inner_length(parameters))
 
    # linker = CuLink()
    # add_data!(linker, "ExpressionProcessing", kernel)
@@ -41,11 +26,9 @@
 
    # mod = CuModule(image)
    # kernels[i] = CuFunction(mod, "ExpressionProcessing")
 
-   #   @lock cacheLock cache[expressions[i]] = kernels[i]
    # end
 
-   @inbounds for i in eachindex(expressions)
+   for i in eachindex(expressions)
        if haskey(cache, expressions[i])
            kernels[i] = cache[expressions[i]]
            continue
@@ -71,12 +54,12 @@
    cudaResults = CuArray{Float32}(undef, variableCols, length(expressions))
 
    # execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
-   for kernel in kernels
-       # config = launch_configuration(kernels[i])
-       threads = min(variableCols, 96)
+   for i in eachindex(kernels)
+       config = launch_configuration(kernels[i])
+       threads = min(variableCols, config.threads)
        blocks = cld(variableCols, threads)
 
-       cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
+       cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
    end
 
    return cudaResults
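The transpiler path compiles each expression to its own PTX kernel and keeps a per-expression cache (the haskey lookup survives on both sides); the commented-out CuLink/add_data!/CuModule/CuFunction lines are the CUDA.jl driver-API loading sequence. The whole pattern, sketched with the transpile step left abstract:

# Compile each unique Expr to PTX once, then reuse the CuFunction.
using CUDA

const kernel_cache = Dict{Expr,CuFunction}()

function cached_kernel(expr::Expr, transpile_to_ptx::Function)::CuFunction
    get!(kernel_cache, expr) do
        ptx = transpile_to_ptx(expr)                    # assumed to return PTX source text
        linker = CuLink()
        add_data!(linker, "ExpressionProcessing", ptx)  # entry name as in the diff
        image = complete(linker)
        CuFunction(CuModule(image), "ExpressionProcessing")
    end
end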
@@ -198,7 +181,8 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
    exprId64Reg = Utils.get_next_free_register(regManager, "rd")
    println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")
 
-   for token in expression
+   for i in eachindex(expression)
+       token = expression[i]
 
        if token.Type == FLOAT32
            push!(operands, reinterpret(Float32, token.Value))
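generate_calculation_code emits PTX as plain text: a register manager hands out fresh registers and instructions are printed into a buffer. The mechanism in miniature (the register name is hard-coded here in place of a Utils.get_next_free_register call):

codeBuffer = IOBuffer()
exprId64Reg = "%rd1"      # a register-manager call would allocate this name
expressionIndex = 0       # 0-based, since PTX indexing is 0-based
println(codeBuffer, "mov.u64 $exprId64Reg, $expressionIndex;")
String(take!(codeBuffer)) # "mov.u64 %rd1, 0;\n"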
@@ -5,10 +5,7 @@ using .Transpiler
 using .Interpreter
 
 const BENCHMARKS_RESULTS_PATH = "./results"
 
-# TODO: Expressions can get much much bigger (into millions) (will be provided by Mr. Kronberger)
-# TODO: Variable-Sets: 1000 can be considered the minimum; 100.000 can be considered the maximum (will be provided by Mr. Kronberger)
-
+# University setup at 10.20.1.7 if needed
 exprsCPU = [
    # CPU interpreter requires an anonymous function and array refs
    :(p[1] * x[1] + p[2]), # 5 op
@ -28,7 +25,7 @@ exprsGPU = [
|
|||
|
||||
# p is the same for CPU and GPU
|
||||
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
|
||||
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
|
||||
expr_reps = 100 # 100 parameter optimisation steps basically
|
||||
|
||||
|
||||
@testset "CPU performance" begin
|
||||
|
@ -67,13 +64,6 @@ end
	# https://cuda.juliagpu.org/v2.6/lib/driver/#Memory-Management
end

# After these tests have been redone, use Nsight Compute/Systems as described here:
# https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems
# Systems and Compute are installable via WSL. The Compute UI can even be used inside WSL
# Add /usr/local/cuda/bin in .bashrc to PATH to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs)
# University setup at 10.20.1.7 if needed

compareWithCPU = true


suite = BenchmarkGroup()
@ -84,24 +74,22 @@ varsets_small = 100
varsets_medium = 1000
varsets_large = 10000

if compareWithCPU
	X_small = randn(Float32, varsets_small, 5)
	suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
	X_medium = randn(Float32, varsets_medium, 5)
	suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
	X_large = randn(Float32, varsets_large, 5)
	suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end
X_small = randn(Float32, varsets_small, 5)
suite["CPU"]["small varset"] = @benchmarkable interpret_cpu(exprsCPU, X_small, p; repetitions=expr_reps)
X_medium = randn(Float32, varsets_medium, 5)
suite["CPU"]["medium varset"] = @benchmarkable interpret_cpu(exprsCPU, X_medium, p; repetitions=expr_reps)
X_large = randn(Float32, varsets_large, 5)
suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)

X_small_GPU = randn(Float32, 5, varsets_small) # column-major
X_small_GPU = randn(Float32, 5, varsets_small)
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
X_medium_GPU = randn(Float32, 5, varsets_medium)
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

X_large_GPU = randn(Float32, 5, varsets_large) # column-major
X_large_GPU = randn(Float32, 5, varsets_large)
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
@ -114,71 +102,45 @@ loadparams!(suite, BenchmarkTools.load("params.json")[1], :samples, :evals, :gct

results = run(suite, verbose=true, seconds=180)

if compareWithCPU
	medianCPU = median(results["CPU"])
	stdCPU = std(results["CPU"])

	medianInterpreter = median(results["GPUI"])
	stdInterpreter = std(results["GPUI"])

	medianTranspiler = median(results["GPUT"])
	stdTranspiler = std(results["GPUT"])

	cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
	cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
	gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

	cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
	cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
	gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?

	println()
	println("Is the interpreter better than the CPU implementation:")
	println(cpuVsGPUI_median)
	println(cpuVsGPUI_std)

	println()
	println("Is the transpiler better than the CPU implementation:")
	println(cpuVsGPUT_median)
	println(cpuVsGPUT_std)

	println()
	println("Is the transpiler better than the interpreter:")
	println(gpuiVsGPUT_median)
	println(gpuiVsGPUT_std)

	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/5-interpreter_using_fastmath.json", results)
else
	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
	# resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/3-tuned-blocksize_I128_T96.json")[1]

	medianGPUI_old = median(resultsOld["GPUI"])
	stdGPUI_old = std(resultsOld["GPUI"])

	medianGPUT_old = median(resultsOld["GPUT"])
	stdGPUT_old = std(resultsOld["GPUT"])

	medianInterpreter = median(results["GPUI"])
	stdInterpreter = std(results["GPUI"])

	medianTranspiler = median(results["GPUT"])
	stdTranspiler = std(results["GPUT"])

	oldVsGPUI_median = judge(medianInterpreter, medianGPUI_old) # is interpreter better than old?
	oldVsGPUI_std = judge(stdInterpreter, stdGPUI_old) # is interpreter better than old?

	oldVsGPUT_median = judge(medianTranspiler, medianGPUT_old) # is transpiler better than old?
	oldVsGPUT_std = judge(stdTranspiler, stdGPUT_old) # is transpiler better than old?


	println()
	println("Is the interpreter better than the old implementation:")
	println(oldVsGPUI_median)
	println(oldVsGPUI_std)

	println()
	println("Is the transpiler better than the old implementation:")
	println(oldVsGPUT_median)
	println(oldVsGPUT_std)
end
# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/initial_results.json", results)
# initial_results = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/initial_results.json")

medianCPU = median(results["CPU"])
minimumCPU = minimum(results["CPU"])
stdCPU = std(results["CPU"])

medianInterpreter = median(results["GPUI"])
minimumInterpreter = minimum(results["GPUI"])
stdInterpreter = std(results["GPUI"])

medianTranspiler = median(results["GPUT"])
minimumTranspiler = minimum(results["GPUT"])
stdTranspiler = std(results["GPUT"])

cpuVsGPUI_median = judge(medianInterpreter, medianCPU) # is interpreter better than cpu?
cpuVsGPUT_median = judge(medianTranspiler, medianCPU) # is transpiler better than cpu?
gpuiVsGPUT_median = judge(medianTranspiler, medianInterpreter) # is transpiler better than interpreter?

cpuVsGPUI_minimum = judge(minimumInterpreter, minimumCPU) # is interpreter better than cpu?
cpuVsGPUT_minimum = judge(minimumTranspiler, minimumCPU) # is transpiler better than cpu?
gpuiVsGPUT_minimum = judge(minimumTranspiler, minimumInterpreter) # is transpiler better than interpreter?

cpuVsGPUI_std = judge(stdInterpreter, stdCPU) # is interpreter better than cpu?
cpuVsGPUT_std = judge(stdTranspiler, stdCPU) # is transpiler better than cpu?
gpuiVsGPUT_std = judge(stdTranspiler, stdInterpreter) # is transpiler better than interpreter?


println("Is the interpreter better than the CPU implementation:")
println(cpuVsGPUI_median)
println(cpuVsGPUI_minimum)
println(cpuVsGPUI_std)

println("Is the transpiler better than the CPU implementation:")
println(cpuVsGPUT_median)
println(cpuVsGPUT_minimum)
println(cpuVsGPUT_std)

println("Is the transpiler better than the interpreter:")
println(gpuiVsGPUT_median)
println(gpuiVsGPUT_minimum)
println(gpuiVsGPUT_std)
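
As a side note on the comparison logic above: BenchmarkTools' judge compares two trial estimates and classifies the change as improvement, regression, or invariant. A minimal, self-contained sketch; the benchmarked expressions are arbitrary placeholders:

using BenchmarkTools
old = median(run(@benchmarkable sum(rand(1000))))
new = median(run(@benchmarkable sum(rand(100))))
judge(new, old)   # reports e.g. "improvement" with the time ratio of new vs old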
@ -1,30 +0,0 @@
using CUDA

using .Transpiler
using .Interpreter

varsets_medium = 1000
X = randn(Float32, 5, varsets_medium)

exprsGPU = [
	# CPU interpreter requires an anonymous function and array refs
	:(p1 * x1 + p2), # 5 op
	:((((x1 + x2) + x3) + x4) + x5), # 9 op
	:(log(abs(x1))), # 3 op
	:(powabs(p2 - powabs(p1 + x1, 1/x1), p3)) # 13 op
] # 30 op

# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsGPU)] # generate 10 random parameter values for each expr
expr_reps = 1



@testset "Interpreter Tuning" begin
	CUDA.@profile interpret_gpu(exprsGPU, X, p; repetitions=expr_reps)
end


@testset "Transpiler Tuning" begin
	CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
end
@ -1,8 +1,6 @@
using ExpressionExecutorCuda
using Test

using BenchmarkTools

const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
include(joinpath(baseFolder, "src", "Utils.jl"))
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))

@ -21,6 +19,5 @@ end
# end

@testset "Performance tests" begin
	# include("PerformanceTuning.jl")
	# include("PerformanceTests.jl")
	include("PerformanceTests.jl")
end
@ -1,167 +1,33 @@
RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of this should be in the implementation chapter though)

\chapter{Concept and Design}
\label{cha:conceptdesign}
% introduction to what needs to be done. also clarify terms "Host" and "Device" here
To be able to determine whether evaluating mathematical expressions on the GPU is better suited than on the CPU, a prototype needs to be implemented. More specifically, a prototype for interpreting these expressions on the GPU, as well as a prototype that transpiles expressions into code that can be executed by the GPU. The goal of this chapter is to describe how these two prototypes can be implemented conceptually. First, the requirements for the prototypes as well as the data they operate on are explained. This is followed by the design of the interpreter and the transpiler. The CPU interpreter will not be described, as it already exists.

% TODO: maybe describe CPU interpreter too? We will see
introduction to what needs to be done. also clarify terms "Host" and "Device" here

\section[Requirements]{Requirements and Data}
The main goal of both prototypes or evaluators is to provide a speed-up compared to the CPU interpreter already in use. However, it is also important to determine which evaluator provides the most speed-up. This also means that if one of the evaluators is faster, it is intended to replace the CPU interpreter. Therefore, they must have similar capabilities and meet the following requirements:

\begin{itemize}
	\item Multiple expressions as input.
	\item All input expressions have the same number of variables ($x_n$), but can have a different number of parameters ($p_n$).
	\item The variables are parametrised using a matrix of the form $k \times N$, where $k$ is the number of variables in the expressions and $N$ is the number of different parametrisations for the variables. This matrix is the same for all expressions.
	\item The parameters are parametrised using a vector of vectors. Each vector $v_i$ corresponds to an expression $e_i$.
	\item The following operations must be supported: $x + y$, $x - y$, $x * y$, $x / y$, $x ^ y$, $|x|$, $\log(x)$, $e^x$ and $\sqrt{x}$. Note that $x$ and $y$ can either stand for a value, a variable, or another operation.
	\item The results of the evaluations are returned in a matrix of the form $k \times N$. In this case, $k$ is equal to the $N$ of the variable matrix and $N$ is equal to the number of input expressions.
\end{itemize}

\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{input_output_explanation.png}
	\caption{This diagram shows what the input and output look like and how they interact with each other.}
	\label{fig:input_output_explanation}
\end{figure}
short section.
Multiple expressions; vars for all expressions; params unique to expression; operators that need to be supported


With this, the required capabilities are outlined. However, the input and output data need to be explained further for a better understanding. The first input contains the expressions that need to be evaluated. These can have any length and can contain constant values, variables and parameters, all of which are linked together with the supported operations. In the example shown in Figure \ref{fig:input_output_explanation}, there are six expressions $e_1$ through $e_6$. Next is the variable matrix. One entry in this matrix corresponds to one variable in every expression. The row indicates which variable it holds the value for. For example, the values in row three are used to parametrise the variable $x_3$. Each column holds a different set of variables. Each expression must be evaluated using every variable set. In the provided example, there are three variable sets, each holding the values for four variables $x_1$ through $x_4$. After all expressions are evaluated using all variable sets, the results of these evaluations must be stored in the results matrix. Each entry in this matrix holds the resulting value of the evaluation of one expression parametrised with one variable set. The row indicates the variable set while the column indicates the expression.
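
To make the shapes concrete, a small worked example (invented here, not one of the thesis figures): with $k = 2$ variables, $N = 3$ variable sets and the two expressions $e_1 = x_1 + x_2$ and $e_2 = x_1 * x_2$, the inputs and outputs would be
\[
X = \begin{pmatrix} 1 & 2 & 3 \\ 4 & 5 & 6 \end{pmatrix}, \qquad
R = \begin{pmatrix} 5 & 4 \\ 7 & 10 \\ 9 & 18 \end{pmatrix},
\]
where $X$ is the $2 \times 3$ variable matrix (one column per variable set) and $R$ is the $3 \times 2$ results matrix (row = variable set, column = expression).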
\section{Interpreter}
as introduction to this section talk about what "interpreter" means in this context. so "gpu parses expr and calculates"

This is the minimal functionality needed to evaluate expressions with variables generated by a symbolic regression algorithm. In the case of parameter optimisation, it is useful to have a different type of variable, called parameter. For parameter optimisation, the best-fitting parameters for the given variable sets need to be found. To achieve this, the evaluator is called multiple times with different parameters, but the same variables. The results are then evaluated for their fitness by the caller. In this case, the parameters do not change within one call. Parameters could therefore be treated as constant values of the expressions, and no separate input for them would be needed. However, providing the possibility to have the parameters as an input makes the process of parameter optimisation easier. Unlike variables, not all expressions need to have the same number of parameters. Therefore, they are structured as a vector of vectors and not a matrix. The example in Figure \ref{fig:input_output_explanation} shows how the parameters are structured. For example, one expression has zero parameters, while another has six parameters $p_1$ through $p_6$. It needs to be mentioned that, just like the number of variables, the number of parameters per expression is not limited. It is also possible to completely omit the parameters if they are not needed. Because these evaluators will primarily be used in parameter optimisation use-cases, allowing parameters as an input is required.
\subsection{Architecture}
talk about the coarse grained architecture on how the interpreter will work. (.5 to 1 page probably)

% \subsection{Non-Goals}
% Probably a good idea. Probably move this to "introduction"
\section{Architecture}
\subsection{Host}
talk about the steps taken to prepare for GPU interpretation

Based on the requirements above, the architecture of both prototypes can be designed. While the requirements only specify the input and output, the components and workflow also need to be specified. This section aims at giving an architectural overview of both prototypes, alongside their design decisions.
\subsection{Device}
talk about how the actual interpreter will be implemented

\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{kernel_architecture.png}
	\caption{The interpreter has one kernel that is dispatched multiple times, while the transpiler has multiple kernels that are dispatched once. This helps to eliminate thread divergence.}
	\label{fig:kernel_architecture}
\end{figure}

A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch, as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, which therefore allows up to 128 kernels to run concurrently. Therefore, dispatching a kernel for each expression has the potential to improve performance. In the case of the interpreter, having only one kernel that can be dispatched for each expression also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels instead of one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself.
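
A minimal CUDA.jl sketch of this one-dispatch-per-expression idea, launching each pre-compiled kernel on its own stream so that several grids can be resident at once; the names and launch sizes are illustrative, not the project's actual code:

using CUDA

for kernel in kernels                    # one compiled CuFunction per expression
    stream = CuStream()                  # independent stream per dispatch
    cudacall(kernel, (CuPtr{Float32},), cudaResults;
             threads=96, blocks=4, stream=stream)
end
synchronize()                            # wait until all dispatches have finished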
\section{Transpiler}
as introduction to this section talk about what "transpiler" means in this context. so "cpu takes expressions and generates ptx for gpu execution"

\subsection{Pre-Processing}
The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Additionally, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the front-end of a compiler as described in Section \ref{sec:compilers}. The conversion into the intermediate representation transforms the expressions from infix-notation into postfix-notation. This further allows the later parts to more easily evaluate the expressions. One of the major benefits of this notation is the implicit operator precedence. It allows the evaluators to evaluate the expressions token by token from left to right, without needing to worry about the correct order of operations. One token represents either an operator, a constant value, a variable or a parameter. Apart from the intermediate representation containing the expression in postfix-notation, it also contains information about the types of the tokens themselves. This is all that is needed for the interpretation and transpilation steps. A simple expression like $x + 2$ would look as depicted in Figure \ref{fig:pre-processing_results} after the pre-processing step.
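
A minimal Julia sketch of such an intermediate representation; the token type here is hypothetical and only illustrates the idea (the project's actual representation lives in ExpressionProcessing.jl):

@enum TokenKind VARIABLE CONSTANT OPERATOR

struct Token
    Kind::TokenKind
    Value::Any      # variable index, Float32 constant, or operator symbol
end

# the infix expression x1 + 2 in postfix form: x1, 2, +
postfix = [Token(VARIABLE, 1), Token(CONSTANT, 2.0f0), Token(OPERATOR, :+)]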
\subsection{Architecture}
talk about the coarse grained architecture on how the transpiler will work. (.5 to 1 page probably)

\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{pre-processing_result.png}
	\caption{This diagram shows how an expression is transformed in the pre-processing step.}
	\label{fig:pre-processing_results}
\end{figure}
\subsection{Host}
talk about how the transpiler is implemented

It would have also been possible to perform the pre-processing step on the GPU. However, pre-processing a single expression cannot easily be split into multiple threads, which means one GPU thread would need to process one expression. As described in Section \ref{sec:gpgpu}, a single GPU thread is slower than a single CPU thread, which means the processing would also be slower. Furthermore, it would not make sense to process all expressions in a single kernel. This would lead to a lot of thread divergence, which essentially means processing one expression after the other. The SIMT programming model might help with parallelising at least some parts of the processing work. However, the generated expressions can differ a lot from each other, and restricting them to be similar and therefore SIMT friendly would likely reduce the overall quality of the symbolic regression algorithm. Therefore, it does not make sense to perform the processing step on the GPU. This is a typical example of code that is better run on the CPU, also because the parallelisation possibility of one thread per expression can be applied to the CPU as well. Concepts like caching processed expressions, or caching parts of the processed expressions, can also be employed on the CPU. This would not be possible on the GPU, because a GPU cannot save state between two kernel dispatches.

\subsection{Interpreter}

\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{component_diagram_interpreter.png}
	\caption{This diagram depicts the coarse-grained workflow of the interpreter. It shows how the parts interact with each other and with the system it will operate in.}
	\label{fig:component_diagram_interpreter}
\end{figure}

The interpreter consists of two parts. The CPU side is the part of the program that interacts with both the GPU and the caller. An overview of the components and the workflow of the interpreter can be seen in Figure \ref{fig:component_diagram_interpreter}. Once the interpreter receives the expressions, they are pre-processed. This ensures the expressions are valid, and that they are transformed into the intermediate representation needed for evaluating them. The results of this pre-processing are then sent to the GPU, which performs the actual interpretation of the expressions. Alongside the expressions, the data for the variables and parameters also needs to be sent to the GPU. Once all the data resides on the GPU, the interpreter kernel can be dispatched. It needs to be noted that for each of the expressions, a separate kernel will be dispatched. As already described, this decision has been made to reduce thread divergence and therefore increase performance. In fact, dispatching the same kernel multiple times with different expressions means no thread divergence will occur, as explained later. Once the GPU has finished evaluating all expressions with all variable sets, the result will be stored in a matrix on the GPU. The CPU then retrieves the results and returns them to the caller in the format specified by the requirements.

Evaluating the expressions is relatively straightforward. Due to the expressions being in postfix-notation, the actual interpreter only has to iterate over all tokens once and perform the appropriate tasks. If the interpreter encounters a binary operator, it simply reads the previous two values and performs the operation specified by the operator. For unary operators, only the previous value must be read. As already mentioned, expressions in postfix-notation implicitly contain the operator precedence, therefore no look-ahead or other strategies need to be used to ensure correct evaluation. Algorithm \ref{alg:eval_interpreter} shows how the interpreter works. Note that this is a simplified version that only works with additions, multiplications, constant values and variables.

\begin{algorithm}
	\caption{Interpreting an equation in postfix-notation}\label{alg:eval_interpreter}
	\begin{algorithmic}[1]
		\Procedure{Evaluate}{\textit{expr}: PostfixExpression}
			\State $\textit{stack} \gets []$

			\While{HasTokenLeft(\textit{expr})}
				\State $\textit{token} \gets \text{GetNextToken}(\textit{expr})$
				\If{$\textit{token.Type} = \text{Constant}$}
					\State Push($\textit{stack}$, $\textit{token.Value}$)
				\ElsIf{$\textit{token.Type} = \text{Variable}$}
					\State Push($\textit{stack}$, GetVariable($\textit{token.Value}$))
				\ElsIf{$\textit{token.Type} = \text{Operator}$}
					\If{$\textit{token.Value} = \text{Addition}$}
						\State $\textit{right} \gets \text{Pop}(\textit{stack})$
						\State $\textit{left} \gets \text{Pop}(\textit{stack})$
						\State Push($\textit{stack}$, $\textit{left} + \textit{right}$)
					\ElsIf{$\textit{token.Value} = \text{Multiplication}$}
						\State $\textit{right} \gets \text{Pop}(\textit{stack})$
						\State $\textit{left} \gets \text{Pop}(\textit{stack})$
						\State Push($\textit{stack}$, $\textit{left} * \textit{right}$)
					\EndIf
				\EndIf
			\EndWhile

			\Return $\text{Pop}(\textit{stack})$
		\EndProcedure
	\end{algorithmic}
\end{algorithm}
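
For reference, a Julia sketch of this stack-based evaluation, reusing the hypothetical Token type from the pre-processing sketch above (simplified to addition and multiplication, exactly like Algorithm \ref{alg:eval_interpreter}):

function evaluate(postfix::Vector{Token}, vars::Vector{Float32})
    stack = Float32[]
    for token in postfix
        if token.Kind == CONSTANT
            push!(stack, token.Value)
        elseif token.Kind == VARIABLE
            push!(stack, vars[token.Value])      # Value is the variable index
        elseif token.Kind == OPERATOR
            right = pop!(stack)
            left = pop!(stack)
            token.Value == :+ && push!(stack, left + right)
            token.Value == :* && push!(stack, left * right)
        end
    end
    return pop!(stack)                           # final value is the result
end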

If a new operator is needed, it simply has to be added as another else-if block inside the operator branch. New token types like variables or parameters can also be added by adding a new outer else-if block that checks for these token types. However, the pre-processing step also needs to be extended with these new operators and token types. Otherwise, the expressions will never reach the evaluation step, as they would be seen as invalid. It is also possible to add unary operators like $\log()$. In this case, only one value would be read from the stack, the operation would be applied, and the result would be written back to the stack.

Algorithm \ref{alg:eval_interpreter} in this case resembles the kernel. This kernel will be dispatched for every expression that needs to be evaluated, to eliminate thread divergence. Thread divergence can only happen on data dependent branches. In this case, the while loop and every if and else-if statement contains a data dependent branch. Depending on the expression passed to the kernel, the while loop may run longer than for another expression. Similarly, not all expressions have the same constants, operators and variables in the same order, which would lead to each thread taking different paths. However, one expression always has the same constants, operators and variables in the same locations, meaning all threads will take the same paths. This also means that despite the interpreter containing many data dependent branches, these branches only depend on the expression itself. Because of this, all threads will take the same paths and therefore will never diverge from one another if they execute the same expression.

\subsection{Transpiler}

\begin{figure}
	\centering
	\includegraphics[width=.9\textwidth]{component_diagram_transpiler.png}
	\caption{This diagram depicts the coarse-grained workflow of the transpiler. It shows how the parts interact with each other and with the system it will operate in.}
	\label{fig:component_diagram_transpiler}
\end{figure}

Similar to the interpreter, the transpiler also consists of a part that runs on the CPU and a part that runs on the GPU. When looking at the components and workflow of the transpiler, as shown in Figure \ref{fig:component_diagram_transpiler}, it is almost identical to the interpreter. However, the key difference between the two is the additional code generation, or transpilation, step. Apart from that, the transpiler also needs the same pre-processing step and also the GPU to evaluate the expressions. However, the GPU evaluator generated by the transpiler works differently from the GPU evaluator of the interpreter. The difference between these evaluators will be explained later.

Before the expressions can be transpiled into PTX code, they need to be pre-processed. As already described, this step ensures the validity of the expressions and transforms them into the intermediate representation described above. As with the interpreter, this also simplifies the code generation step at the cost of some performance, because the validity has to be ensured and the intermediate representation needs to be generated. However, in this case the benefit of having a simple code generation step was more important than performance. By transforming the expressions into postfix-notation, the code generation follows a similar pattern to the interpretation already described. Algorithm \ref{alg:transpile} shows how the transpiler takes an expression, transpiles it and then returns the finished code. It can be seen that the while loop is the same as the while loop of the interpreter. The main difference is in the operator branches. Because now code needs to be generated, the branches themselves call their designated code generation function, such as $\textit{GetAddition}$. However, this function cannot return only the code that performs, for example, the addition. When executed, this addition also returns a value which will be needed as an input by other operators. Therefore, not only the code fragment must be returned, but also the reference to the result. This reference can then be put on the stack for later use, the same way the interpreter stores the value for later use. The code fragment must also be appended to the already generated code so that it can be returned to the caller. As with the interpreter, there is a final value on the stack when the loop has finished. Once the code is executed, this value is the reference to the result of the expression. This value then needs to be stored in the results matrix, so that it can be retrieved by the CPU after all expressions have been executed on the GPU. Therefore, one last code fragment must be generated to handle the storage of this value in the results matrix. This fragment must then be added to the code already generated, and the transpilation process is completed.

\begin{algorithm}
	\caption{Transpiling an equation in postfix-notation}\label{alg:transpile}
	\begin{algorithmic}[1]
		\Procedure{Transpile}{\textit{expr}: PostfixExpression}: String
			\State $\textit{stack} \gets []$
			\State $\textit{code} \gets$ ""

			\While{HasTokenLeft(\textit{expr})}
				\State $\textit{token} \gets \text{GetNextToken}(\textit{expr})$
				\If{$\textit{token.Type} = \text{Constant}$}
					\State Push($\textit{stack}$, $\textit{token.Value}$)
				\ElsIf{$\textit{token.Type} = \text{Variable}$}
					\State ($\textit{codeFragment}, \textit{referenceToValue}$) $\gets$ GetVariable($\textit{token.Value}$)
					\State Push($\textit{stack}$, $\textit{referenceToValue}$)
					\State Append($\textit{code}$, $\textit{codeFragment}$)
				\ElsIf{$\textit{token.Type} = \text{Operator}$}
					\If{$\textit{token.Value} = \text{Addition}$}
						\State $\textit{right} \gets \text{Pop}(\textit{stack})$
						\State $\textit{left} \gets \text{Pop}(\textit{stack})$
						\State $(\textit{referenceToValue}, \textit{codeFragment}) \gets \text{GetAddition}(\textit{left}, \textit{right})$
						\State Push($\textit{stack}$, $\textit{referenceToValue}$)
						\State Append($\textit{code}$, $\textit{codeFragment}$)
					\ElsIf{$\textit{token.Value} = \text{Multiplication}$}
						\State $\textit{right} \gets \text{Pop}(\textit{stack})$
						\State $\textit{left} \gets \text{Pop}(\textit{stack})$
						\State $(\textit{referenceToValue}, \textit{codeFragment}) \gets \text{GetMultiplication}(\textit{left}, \textit{right})$
						\State Push($\textit{stack}$, $\textit{referenceToValue}$)
						\State Append($\textit{code}$, $\textit{codeFragment}$)
					\EndIf
				\EndIf
			\EndWhile

			\State $\textit{codeFragment} \gets$ GenerateResultStoring($\text{Pop}(\textit{stack})$)
			\State Append($\textit{code}$, $\textit{codeFragment}$)

			\Return $\textit{code}$
		\EndProcedure
	\end{algorithmic}
\end{algorithm}

The code generated by the transpiler is the kernel for the transpiled expressions. This means that a new kernel must be generated for each expression that needs to be evaluated. This is in contrast to the interpreter, which has one kernel and dispatches it once for each expression. However, generating one kernel per expression results in a much simpler kernel. This allows the kernel to focus on evaluating the postfix expression from left to right. No overhead work, like branching or managing a stack, is needed. However, this overhead is now offloaded to the transpilation step on the CPU, as can be seen in Algorithm \ref{alg:transpile}. There is also a noticeable overhead in that a kernel has to be generated for each expression. In cases like parameter optimisation, many of the expressions will be transpiled multiple times, as the transpiler is called multiple times with the same expressions.

Both the transpiler and the interpreter have their respective advantages and disadvantages. While the interpreter puts less load on the CPU, the GPU has to perform more work. Much of this work is branching or managing a stack, and therefore involves many instructions that are not used to evaluate the expression itself. However, this overhead can be mitigated by the fact that all of this overhead is performed in parallel and not sequentially.

On the other hand, the transpiler performs more work on the CPU. The kernels are much simpler, and most of the instructions are used to evaluate the expressions themselves. Furthermore, as explained in Section \ref{sec:ptx}, any program running on the GPU must be transpiled into PTX code before the driver can compile it into machine code. Therefore, the kernel written for the interpreter must also be transpiled into PTX. This overhead is in addition to the branch instruction overhead. The self-written transpiler removes this intermediate step by transpiling directly to PTX. In addition, the generated code is tailored to evaluating expressions and does not need to generate generic PTX code, which can reduce transpilation time.

Unlike the GPU, the CPU can manage state across multiple calls. Concepts such as caches can be employed by the transpiler to reduce the overhead on the CPU. In cases such as parameter optimisation, where expressions remain the same over multiple calls, the resulting PTX code can be cached. As a result, the same expression does not need to be transpiled multiple times, drastically reducing the transpilation time and therefore improving the overall performance of the transpiler.
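
A minimal Julia sketch of such a transpilation cache; the function names are illustrative, not the project's actual API:

const ptx_cache = Dict{Expr,String}()

function transpile_cached(expr::Expr)::String
    # get! only invokes the closure on a cache miss, so repeated calls with
    # the same expression reuse the already generated PTX code
    get!(ptx_cache, expr) do
        transpile(expr)   # placeholder for the real transpilation entry point
    end
end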
\subsection{Device}
talk about what the GPU does. short section since the gpu does not do much
@ -2,11 +2,6 @@
\label{cha:conclusion}

Summarise the results
talk again about how a typical input is often not complex enough (basically repeat that statement from the comparison section in the evaluation)

\section{Future Work}
talk about what can be improved

Transpiler: transpile expressions directly from the Julia AST -> would save time because no intermediate representation needs to be created (loses a step and gains performance, but also makes the transpiler itself more complex)

CPU Interpreter: Probably more worthwhile to dive into parallelising the CPU interpreter itself (not really future work, as you wouldn't write a paper about that)
talk about what can be improved
@ -1,44 +1,14 @@
\chapter{Evaluation}
\label{cha:evaluation}

The aim of this thesis is to determine whether at least one of the GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation. First, the environment in which the performance tests are performed is explained. Then the individual results for the GPU interpreter and the transpiler are presented. In addition, this part also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to the CPU evaluator in order to answer the research questions of this thesis.

\section{Test environment}
Explain the hardware used, as well as the actual data (how many expressions, variables etc.)

three scenarios -> few, normal and many variable sets; expr repetitions to simulate parameter optimisation
BenchmarkTools.jl -> 1000 samples per scenario

\section{Results}
talk about what we will see now (results only for the interpreter, then the transpiler, and then both compared with each other and with the CPU interpreter)

\subsection{Interpreter}
Results only for the interpreter (also contains the final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
\subsection{Performance tuning}
Document the process of performance tuning

Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in the kernel)

1.) Block size reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> noticeable improvement in 2 out of 3
3.) Tuned block size with Nsight Compute -> slight improvement
4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably because of a lot more waiting time, "latency hiding not working basically", or more type conversions happening on the GPU? Look at the generated PTX code and use that as an argument to describe why it is slower)
5.) Reverted the previous change; used fastmath instead -> improvement (large var set is now faster than on the transpiler)
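
For illustration, a minimal sketch of the two kernel-side annotations referenced in tuning steps 2 and 5; the kernel body is a made-up stand-in, not the interpreter's actual code:

using CUDA

function tuned_kernel!(results, vars)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(results)
        # @inbounds skips array bounds checks; @fastmath allows faster,
        # less strict floating-point intrinsics on the device
        @inbounds @fastmath results[i] = sqrt(abs(vars[i])) + 2.0f0
    end
    return
end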
Results only for the interpreter
\subsection{Transpiler}
Results only for the transpiler (also contains the final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)
\subsection{Performance tuning}
Document the process of performance tuning

Initial: CPU-side single-threaded; up to 1024 threads per block; bounds-checking enabled

1.) Block size reduced to a maximum of 256 -> moderate improvement in medium and large
2.) Using @inbounds -> small improvement only in the CPU-side code
3.) Tuned block size with Nsight Compute -> slight improvement
4.) Only changed things on the interpreter side
5.) Only changed things on the interpreter side

Results only for the transpiler
\subsection{Comparison}
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter

talk about the compute portion being just too small. Only more complex expressions with a higher var set count benefit well (make one or two performance evaluations with 10 larger expressions and at least 1k var sets and present that here as support for this statement)
Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
@ -1,36 +1,20 @@
\chapter{Implementation}
\label{cha:implementation}

somewhere in here explain why there is one kernel per expression and not one kernel for all expressions

Go into the details of why this implementation is tuned towards performance and should be the optimum at that

\section{Technologies}
Short section; CUDA, PTX, Julia, CUDA.jl

Probably reference the performance evaluation papers for Julia and CUDA.jl

\section{Expression Processing}
Talk about why this needs to be done and how it is done (the why is basically: it simplifies the evaluation/transpilation process; the how is in ExpressionProcessing.jl)

\section{Interpreter}
Talk about how the interpreter has been developed.

UML flow diagram
\subsection{Performance tuning}
Document the process of performance tuning

main loop; kernel transpiled by CUDA.jl into PTX and then executed

Memory access (currently global memory only)
no dynamic memory allocation like on the CPU (the stack needs to have a fixed size)

\section{Transpiler}
Talk about how the transpiler has been developed (probably the largest section, because it just has more interesting parts)
Talk about how the transpiler has been developed

UML flow diagram

Front-End and Back-End
Caching of back-end results

PTX code generated and compiled using CUDA.jl (so basically the driver) and then executed

Memory access (global memory and register management, especially register management)
\subsection{Performance tuning}
Document the process of performance tuning
@ -11,13 +11,15 @@ Optimisation and acceleration of program code is a crucial part in many fields.

The following expression $5 - \text{abs}(x_1) * \text{sqrt}(x_2) / 10 + 2 \char`^ x_3$, which contains simple mathematical operations as well as variables $x_n$ and parameters $p_n$, is one example that can be generated by the equation learning algorithm. Usually an equation learning algorithm generates multiple such expressions per iteration. Out of these expressions, all possibly relevant ones have to be evaluated. Additionally, multiple different values need to be inserted for all variables and parameters, drastically increasing the number of evaluations that need to be performed.
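
In Julia, the language used in this project, such an expression can be represented directly as an Expr object; a minimal illustration (not a quote from the thesis):

expr = :(5 - abs(x1) * sqrt(x2) / 10 + 2^x3)
dump(expr)   # shows the abstract syntax tree Julia builds for the expression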

In his blog, \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive, as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration can also be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}.
In his Blog \textcite{sutter_free_2004} described how the free lunch is over in terms of the ever-increasing performance of hardware like the CPU. He states that to gain additional performance, developers need to start developing software for multiple cores and not just hope that on the next generation of CPUs the program magically runs faster. While this approach means more development overhead, a much greater speed-up can be achieved. However, in some cases the speed-up achieved by this is still not large enough and another approach is needed. One of these approaches is the utilisation of Graphics Processing Units (GPUs) as an easy and affordable option as compared to compute clusters. Especially when talking about performance per dollar, GPUs are very inexpensive as found by \textcite{brodtkorb_graphics_2013}. \textcite{michalakes_gpu_2008} have shown a noticeable speed-up when using GPUs for weather simulation. In addition to computer simulations, GPU acceleration also can be found in other places such as networking \parencite{han_packetshader_2010} or structural analysis of buildings \parencite{georgescu_gpu_2013}.


%The free lunch theorem as described by \textcite{adam_no_2019} states that to gain additional performance, a developer cannot just hope for future hardware to be faster, especially on a single core.

% TODO: Incorporate PTX somehow


\section{Research Question}
With these successful implementations of GPU acceleration, this thesis also attempts to improve the performance of evaluating mathematical equations, generated at runtime for symbolic regression, using GPUs. Therefore, the following research questions are formulated:
With these successful implementations of GPU acceleration, this thesis also attempts to improve the performance of evaluating mathematical equations using GPUs. Therefore, the following research questions are formulated:

\begin{itemize}
	\item How can simple arithmetic expressions that are generated at runtime be efficiently evaluated on GPUs?
@ -25,7 +25,7 @@ Graphics cards (GPUs) are commonly used to increase the performance of many diff
|
|||
While in the early days of GPGPU programming a lot of research has been done to assess if this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs have been used early to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 2 for the most compute intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022} which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, so it is important to consider if it is worth using GPUs for specific tasks.
|
||||
|
||||
\subsection{Programming GPUs}
|
||||
The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having $192$ cores and twice as many threads. To demonstrate the complexity of a simple one core 8-bit CPU \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of one CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flows on a single program strand and on modern CPUs even multiple strands simultaneously \parencite{palacios_comparison_2011}. However, as seen in Section \ref{sec:gpgpu}, this often is not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count a single GPU core has to be much simpler than one CPU core. As described by \textcite{nvidia_cuda_2025} a GPU designates much more transistors towards floating-point computations. This results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in Figure \ref{fig:cpu_vs_gpu}.
|
||||
The development process on a GPU is vastly different from a CPU. A CPU has tens or hundreds of complex cores with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having a staggering $192$ cores and twice as many threads. To demonstrate the complexity of a simple one core 8-bit CPU \textcite{schuurman_step-by-step_2013} has written a development guide. He describes the different parts of one CPU core and how they interact. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flows on a single program strand and on modern CPUs even multiple strands simultaneously \parencite{palacios_comparison_2011}. However, as seen in section \ref{sec:gpgpu}, this often isn't enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21\,760$ CUDA cores. To achieve this enormous core count a single GPU core has to be much simpler than one CPU core. As described by \textcite{nvidia_cuda_2025} a GPU designates much more transistors towards floating-point computations. This results in less efficient integer arithmetic and control flow handling. There is also less Cache available per core and clock speeds are usually also much lower than those on a CPU. An overview of the differences of a CPU and a GPU architecture can be seen in figure \ref{fig:cpu_vs_gpu}.
|
||||
|
||||
\begin{figure}
|
||||
\centering
|
||||
|
@ -34,10 +34,9 @@ The development process on a GPU is vastly different from a CPU. A CPU has tens
|
|||
\label{fig:cpu_vs_gpu}
|
||||
\end{figure}
|
||||
|
||||
Despite these drawbacks, the sheer number of cores, makes a GPU a valid choice when considering improving the performance of an algorithm. Because of the high number of cores, GPUs are best suited for data parallel scenarios. This is due to the SIMD architecture of these cards. SIMD stands for Sinlge-Instruction Multiple-Data and states that there is a single stream of instructions that is executed on a huge number of data streams. \textcite{franchetti_efficient_2005} and \textcite{tian_compiling_2012} describe ways of using SIMD instructions on the CPU. Their approaches lead to noticeable speed-ups of 3.3 and 4.7 respectively by using SIMD instructions instead of serial computations. Extending this to GPUs which are specifically built for SIMD/data parallel calculations shows why they are so powerful despite having less complex and slower cores than a CPU. It is also important to note, that a GPU also always needs a CPU, as the CPU is responsible for sending the data to the GPU and starting the GPU program. In GPGPU programming, the CPU is usually called the host, while the GPU is usually called the device.
|
||||
Despite these drawbacks, the sheer number of cores, makes a GPU a valid choice when considering improving the performance of an algorithm. Because of the high number of cores, GPUs are best suited for data parallel scenarios. This is due to the SIMD architecture of these cards. SIMD stands for Sinlge-Instruction Multiple-Data and states that there is a single stream of instructions that is executed on a huge number of data streams. \textcite{franchetti_efficient_2005} and \textcite{tian_compiling_2012} describe ways of using SIMD instructions on the CPU. Their approaches lead to noticeable speed-ups of 3.3 and 4.7 respectively by using SIMD instructions instead of serial computations. Extending this to GPUs which are specifically built for SIMD/data parallel calculations shows why they are so powerful despite having less complex and slower cores than a CPU.
|
||||
|
||||
\subsubsection{Thread Hierarchy and Tuning}
|
||||
\label{sec:thread_hierarchy}
|
||||
The thousands of cores on a GPU, also called threads, are grouped together in several categories. This is the Thread hierarchy of GPUs. The developer can influence this grouping to a degree which allows them to tune their algorithm for optimal performance. In order to develop a well performing algorithm, it is necessary to know how this grouping works. Tuning the grouping is unique to each algorithm and also dependent on the GPU used, which means it is important to test a lot of different configurations to achieve the best possible result. This section aims at exploring the thread hierarchy and how it can be tuned to fit an algorithm.
|
||||
|
||||
At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a hardware unit responsible for scheduling and executing threads and also contains the registers used by these threads. An SM is always executing a group of 32 threads simultaneously, and this group is called a warp. The number of threads that can be started is virtually unlimited. However, threads must be grouped in a block, with one block typically containing a maximum of $1024$ threads but is often configured to be less. Therefore, if more than $1024$ threads are required, more blocks must be created. Blocks can also be grouped into thread block clusters which is optional, but can be useful in certain scenarios. All thread blocks or thread block clusters are part of a grid, which manifests as a dispatch of the code run on the GPU, also called kernel \parencite{amd_hip_2025}. All threads in one block have access to some shared memory, which can be used for L1 caching or communication between threads. It is important that the blocks can be scheduled independently, with no dependencies between them. This allows the scheduler to schedule blocks and threads as efficiently as possible. All threads within a warp are guaranteed to be part of the same block, and are therefore executed simultaneously and can access the same memory addresses. Figure \ref{fig:thread_hierarchy} depicts how threads in a block are grouped into warps for execution and how they shared memory.
|
||||
|
@ -49,7 +48,7 @@ At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a
|
|||
\label{fig:thread_hierarchy}
|
||||
\end{figure}
|
||||
|
||||
A piece of code that is executed on a GPU is written as a kernel which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads, blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in Section \ref{sec:memory_model}. Considering the case where an image needs to be blurred, arranging threads in a 2D grid not only simplifies development, it also helps with optimising memory access. As the threads in a block need to access a lot of the same data, this data can be loaded into the shared memory of the block. This allows the data to be accessed much quicker compared to when threads are allocated in only one dimension. With one-dimensional blocks it is possible that threads assigned to nearby pixels are part of different blocks, leading to a lot of duplicate data transfer. The size in each dimension of a block can be almost arbitrary within the maximum allowed number of threads. However, blocks that are too large might lead to other problems which are described in more detail in Section \ref{sec:occupancy}.
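For the blurring example, a hedged sketch of such a 2D launch configuration could look as follows (\verb|blurKernel|, \verb|width| and \verb|height| are hypothetical names):

\begin{GenericCode}[numbers=none]
// 16x16 = 256 threads per block, arranged as a 2D tile of the image.
dim3 block(16, 16);
dim3 grid((width + block.x - 1) / block.x,
          (height + block.y - 1) / block.y);
blurKernel<<<grid, block>>>(input, output, width, height);

// Inside the kernel, the 2D indices map directly to pixel coordinates:
//   int x = blockIdx.x * blockDim.x + threadIdx.x;
//   int y = blockIdx.y * blockDim.y + threadIdx.y;
\end{GenericCode}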
All threads in a warp start at the same point in a program, but with their own instruction address, allowing them to work independently. Threads not executing the same instruction is strictly speaking against the SIMD principle, but can happen in reality due to data-dependent branching. Because of the SIMD architecture, all threads in a warp must execute the same instructions, and if threads start diverging, the SM must pause threads with different instructions and execute them later. Figure \ref{fig:thread_divergence} shows how such divergences can impact performance. The situation described by the figure also shows that after the divergence the threads could re-converge. On older hardware this does not happen, leading to T2 being executed only after T1 and T3 are finished. Consequently, divergence leads to bad resource utilisation, which in turn leads to worse performance. In situations where a lot of data-dependent thread divergence happens, most of the benefits of using a GPU have likely vanished. Threads can also be paused (inactive threads) when the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads, but only the threads with work are executed.
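The following hypothetical kernel illustrates such a data-dependent branch: if the input values of the threads in one warp differ in sign, the two paths are serialised by the SM:

\begin{GenericCode}[numbers=none]
__global__ void divergent(const float *in, float *out, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    // Data-dependent branch: threads of the same warp may take
    // different paths here, which are then executed one after another.
    if (in[idx] >= 0.0f)
        out[idx] = sqrtf(in[idx]);
    else
        out[idx] = 0.0f;
}
\end{GenericCode}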
% - Memory allocation (with the one paper diving into dynamic allocations)
% - Memory transfer (with streams potentially)
On a GPU there are two parts that contribute to the performance of an algorithm. The first, which has already been looked at, is the compute portion of the GPU. This is necessary because if threads are serialised or run inefficiently, there is nothing that can make the algorithm execute faster. However, algorithms run on a GPU usually require huge amounts of data to be processed, as they are designed for exactly that purpose. The purpose of this section is to explain how the memory model of the GPU works and how it can influence the performance of an algorithm. In Figure \ref{fig:gpu_memory_layout} the memory layout and the kinds of memory available are depicted. The different parts will be explained in this section.
\begin{figure}
\centering
\label{fig:gpu_memory_layout}
\end{figure}
On a GPU there are multiple levels and kinds of memory available, each optimised for different purposes. This means that it is important to know what they are and how they can best be used for specific tasks. On the lowest level, threads have registers and local memory available. Registers are the fastest way to access memory, but they are also the least abundant, with a maximum of 255 32-bit registers per thread on Nvidia GPUs and 256 on AMD GPUs \parencite{amd_hardware_2025}. However, using all registers of a thread can lead to other problems which are described in more detail in Section \ref{sec:occupancy}. On the other hand, thread-local memory is significantly slower than registers. This is due to the fact that local memory is actually stored in global memory and therefore has the same limitations, which are explained later. This means it is important to avoid local memory as much as possible. Local memory is usually only used when a thread uses too many registers. The compiler will then spill the remaining data into local memory and load it into registers once needed, drastically slowing down the application.
Shared memory is the next tier of memory on a GPU. Unlike local memory and registers, shared memory is shared between all threads inside a block. The amount of shared memory depends on the GPU architecture, but for Nvidia it hovers at around 100 kilobytes (KB) per block. While this memory is slower than registers, its primary use-case is communicating and sharing data between threads in a block. If all threads in a block access a lot of overlapping data, this data can be loaded from global memory into the faster shared memory once. It can then be accessed multiple times, further increasing performance. Loading data into shared memory and accessing that data has to be done manually. Because shared memory is part of the unified data cache, it can either be used as a cache or managed manually, meaning a developer can allocate more shared memory towards caching if needed. Another feature of shared memory is its organisation into so-called memory banks. Shared memory is always split into 32 equally sized memory modules, also called memory banks. All available memory addresses lie in one of these banks. This means that if two threads access two memory addresses which lie in different banks, the accesses can be performed simultaneously, increasing the throughput.
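As a brief illustration (a hypothetical kernel, assuming a block size of 256 threads), the following sketch loads one tile of data into shared memory once, after which each thread reads its neighbour's value without a second global memory access:

\begin{GenericCode}[numbers=none]
__global__ void neighbourSum(const float *in, float *out, int n)
{
    // One tile per block, loaded from slow global memory exactly once.
    __shared__ float tile[256];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = (idx < n) ? in[idx] : 0.0f;
    __syncthreads(); // all loads must finish before neighbours are read

    // The neighbouring value now comes from fast shared memory.
    float right = (threadIdx.x < blockDim.x - 1) ? tile[threadIdx.x + 1] : 0.0f;
    if (idx < n)
        out[idx] = tile[threadIdx.x] + right;
}
\end{GenericCode}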
In general, it is important to have as many warps as possible ready for execution. While this means that many warps could be executed but are not, this is actually desired. A key feature of GPUs is so-called latency hiding: while one warp waits, for example, for data to be retrieved, another warp that is ready for execution can be run instead. With low occupancy, and therefore few to no warps waiting for execution, latency hiding does not work, as the hardware is now idle. As a result, the runtime increases, which also explains why high occupancy is not guaranteed to result in performance improvements, while low occupancy can and often will increase the runtime.
As seen in Table \ref{tab:compute_capabilities}, there exist different limitations that can impact occupancy. The number of warps per SM is important, as it determines the degree of parallelism achievable per SM. If, due to other limitations, the number of warps per SM is below the maximum, there is idle hardware. One such limitation is the number of registers per block and SM. In the case of compute capability 8.9, one SM can handle $32 \cdot 48 = 1\,536$ threads. This leaves $64\,000 / 1\,536 \approx 41$ registers per thread, which is lower than the theoretical maximum of $255$ registers per thread. Typically, one register is mapped to one variable in the kernel code, meaning a developer can use up to 41 variables in their code. However, if a variable needs 64 bits to store its value, the register usage doubles, as all registers on a GPU are 32-bit. On a GPU with compute capability 10.x a developer can use up to $64\,000 / 2\,048 \approx 31$ registers. Of course a developer can use more registers, but this results in lower occupancy. However, depending on the algorithm, using more registers might benefit performance more than the lower occupancy harms it, in which case occupancy is not as important. If a developer needs more than $255$ registers for their variables, the additional variables will spill into local memory which is, as described in Section \ref{sec:memory_model}, not desirable.
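In CUDA C++ this trade-off can be influenced with the \verb|__launch_bounds__| qualifier, as the following hypothetical sketch shows (the concrete numbers are illustrative only):

\begin{GenericCode}[numbers=none]
// Promise at most 256 threads per block and request at least 6 resident
// blocks per SM. The compiler then caps the registers used per thread,
// spilling to local memory if the kernel would need more.
__global__ void __launch_bounds__(256, 6)
tunedKernel(float *data, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        data[idx] = data[idx] * data[idx];
}
\end{GenericCode}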
Additionally, shared memory consumption can also impact occupancy. If, for example, a block needs all the available shared memory, which is almost the same as the amount of shared memory per SM, this SM can only serve this one block. On compute capability 10.x, this would mean that occupancy is at most $50\%$, as a block can have up to $1\,024$ threads while an SM supports up to $2\,048$ threads. Again, in such cases it needs to be determined if the performance gain of using this much shared memory is worth the lower occupancy.
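Such limits do not have to be derived by hand; the CUDA runtime can estimate them, as in this sketch (the shared memory size is hypothetical, and the \verb|tunedKernel| from the sketch above is reused):

\begin{GenericCode}[numbers=none]
int blocksPerSM = 0;
int blockSize = 256;
size_t sharedBytesPerBlock = 32 * 1024; // dynamic shared memory per block

// Ask the runtime how many such blocks can be resident on one SM.
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    &blocksPerSM, tunedKernel, blockSize, sharedBytesPerBlock);

// blocksPerSM * blockSize resident threads, compared with the maximum
// number of threads per SM, yields the occupancy of this configuration.
\end{GenericCode}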
Balancing these limitations, and therefore occupancy and performance, often requires a lot of trial and error with the help of the aforementioned tools. In cases where occupancy is already high and the number of warps ready for execution is also high, other areas for performance improvements need to be explored. Algorithmic optimisation is always a good idea. Some performance improvements can be achieved by altering the computations to use different parts of the GPU. One such optimisation is using FP32 operations wherever possible. Another well-suited optimisation is to rewrite the algorithm to use as many Fused Multiply-Add (FMA) instructions as possible. FMA is a special floating point instruction that multiplies two values and adds a third, all in a single clock cycle \parencite{nvidia_cuda_2025-1}. However, the result might deviate slightly compared to performing these two operations separately, which means that in accuracy-sensitive scenarios this instruction should be avoided. If the compiler detects a floating point operation with the FMA structure, it will automatically be compiled to an FMA instruction. To prevent this, in C++ the developer can call the intrinsics \_\_fadd\_rn and \_\_fmul\_rn for addition and multiplication respectively.
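Both variants, the contracted and the explicitly rounded one, are shown in the following hypothetical sketch:

\begin{GenericCode}[numbers=none]
__global__ void fmaDemo(float *out, float a, float b, float c)
{
    // Usually contracted by the compiler into a single FMA instruction,
    // which rounds only once after multiply and add:
    out[0] = a * b + c;

    // Explicitly rounded after each operation; prevents FMA contraction
    // at the cost of an additional instruction:
    out[1] = __fadd_rn(__fmul_rn(a, b), c);
}
\end{GenericCode}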
\subsection[PTX]{Parallel Thread Execution}
\label{sec:ptx}
% https://docs.nvidia.com/cuda/parallel-thread-execution/
While in most cases a GPU can be programmed in a higher-level language like C++ or even Julia\footnote{\url{https://juliagpu.org/}}, it is also possible to program GPUs with the low-level language Parallel Thread Execution (PTX) developed by Nvidia. This section gives a brief overview of what PTX is and how it can be used to program GPUs. Information in this section is taken from the PTX documentation \parencite{nvidia_parallel_2025} unless stated otherwise.
\begin{GenericCode}[numbers=none]
add.f32 \%n, 0.1, 0.2;
\end{GenericCode}
Loops in the classical sense do not exist in PTX. Instead, a developer needs to define jump targets for the beginning and end of the loop. The program in \ref{code:ptx_loop} shows how a function with a simple loop can be implemented. The loop counts down to zero from the passed parameter $N$, which is loaded into the register \%n in line 6. If the value in the register \%n reaches zero, the loop branches at line 9 to the jump target at line 12 and the loop is finished. All other directives used, and further information on writing PTX code, can be found in the PTX documentation \parencite{nvidia_parallel_2025}.
\begin{program}
\begin{GenericCode}
\end{program}
\section{Compilers}
\label{sec:compilers}
Compilers are a necessary tool for many developers. If a developer wants to run their program, it is very likely they need one. As best described by \textcite{aho_compilers_2006} in their dragon book, a compiler takes code written by a human in some source language and translates it into a destination language readable by a computer. This section briefly explores what compilers are and the research done in this long-established field of computer science. Furthermore, the topics of transpilers and interpreters are explored, as their use-cases are very similar.
\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts: the analyser, also called the frontend, and the synthesiser, also called the backend. The frontend is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. Generating code in the target language from the intermediate representation is the job of the backend. This target code can be assembly or anything else that is needed for a specific use-case. The intermediate representation also makes it simple to swap out frontends or backends. The GNU Compiler Collection \parencite{gcc_gcc_2025} takes advantage of different frontends to provide support for many languages, including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile code for virtual machines instead. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the Low Level Virtual Machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is the ability for one program to be run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are compiled into their respective bytecode. This bytecode can then be interpreted or compiled to physical machine code and then be run. According to the JVM specification \parencite{lindholm_java_2025}, the Java bytecode is interpreted and also compiled with a just-in-time (JIT) compiler to increase the performance of code blocks that are executed often. On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023}, the CLR always compiles the bytecode to physical machine code using a JIT compiler before it is executed.
A grammar describes how a language is structured. It not only describes the structure of natural language, but can also be used to describe the structure of a programming language. \textcite{chomsky_certain_1959} found that grammars can be grouped into four levels, with regular and context-free grammars being the most relevant for programming languages. A regular grammar consists of rules of the structure $A = a\,|\,a\,B$. The symbols $A$ and $B$ are non-terminal symbols and $a$ is a terminal symbol. A non-terminal symbol stands for another rule with the same structure and must only occur after a terminal symbol. Terminal symbols are fixed symbols or values that can be found in the input stream, like literals in programming languages. Context-free grammars are more complex and consist of rules of the structure $A = \beta$, where $\beta$ stands for any combination of terminal and non-terminal symbols. Therefore, a rule like $A = a\,|\,a\,B\,a$ is allowed at this grammar level, which shows that context-free grammars make enclosing structures possible. To write grammars for programming languages, other properties are also important for efficiently validating or parsing input defined by such a grammar. These are not discussed here, but are described by \textcite{aho_compilers_2006}. They also describe that generating a parser from a grammar can be automated. This automation can be performed by parser generators like Yacc \parencite{johnson_yacc_1975} as described in their book. More modern alternatives are Bison\footnote{\url{https://www.gnu.org/software/bison/}} or Antlr\footnote{\url{https://www.antlr.org/}}. Before the parser can validate the input stream, a scanner is needed, as described by \textcite{cooper_engineering_2022}. The scanner reads every character of the input stream, removes white-spaces and ensures that only valid characters and words are present. Flex\footnote{\url{https://github.com/westes/flex}} is a tool for generating a scanner and is often used in combination with Bison. A simplified version of the compiler architecture using Flex and Bison is depicted in Figure \ref{fig:compiler_layout}. It shows how source code is taken and transformed into the intermediate representation by the frontend, and how it is converted into executable machine code by the backend.
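As an illustrative example (constructed here, not taken from the cited literature), a context-free grammar for simple arithmetic expressions could consist of the rules $E = T \,|\, E + T$, $T = F \,|\, T * F$ and $F = a \,|\, (\,E\,)$. Here $E$, $T$ and $F$ are non-terminal symbols, while $a$, the operators and the parentheses are terminal symbols. The last rule uses the enclosing structure that regular grammars cannot express, which is what allows nested, parenthesised expressions to be described.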
\begin{figure}
\centering
% Title page entries
%%%-----------------------------------------------------------------------------
\title{Interpreter and Transpiler for Simple Expressions on Nvidia GPUs using Julia}
\author{Daniel Roth}
\programname{Software Engineering}