relwork: continuation of programming gpus
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled

This commit is contained in:
Daniel 2025-03-15 14:33:33 +01:00
parent f3446a2b11
commit 84fdf5c9ca
6 changed files with 287 additions and 15 deletions

View File

@ -0,0 +1,226 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0" version="25.0.3">
<diagram name="Page-1" id="jQ_StKdA93h0BzUtdCFj">
<mxGraphModel dx="1430" dy="1615" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="pr25IfJEznjW-GiSo1Zk-103" value="" style="swimlane;startSize=0;" vertex="1" parent="1">
<mxGeometry x="380" y="-640" width="640" height="540" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-155" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#cdeb8b;strokeColor=#36393d;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="30" y="320" width="580" height="80" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-104" value="GPU" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="60" height="30" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-105" value="" style="swimlane;startSize=0;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="20" y="30" width="280" height="260" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-106" value="Block" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry width="60" height="30" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-107" value="Shared memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#fad7ac;strokeColor=#b46504;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry x="20" y="40" width="240" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-108" value="Registers" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#fad7ac;strokeColor=#b46504;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry x="20" y="120" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-109" value="Registers" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#fad7ac;strokeColor=#b46504;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry x="190" y="120" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-110" value="Thread" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f5f5f5;fontColor=#333333;strokeColor=#666666;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry x="20" y="200" width="110" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-111" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="59.83000000000001" y="200" as="sourcePoint" />
<mxPoint x="59.83000000000001" y="160" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-112" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="115" y="200" as="sourcePoint" />
<mxPoint x="115" y="80" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-113" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="165" y="200" as="sourcePoint" />
<mxPoint x="165" y="80" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-114" value="Thread" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f5f5f5;fontColor=#333333;strokeColor=#666666;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry x="150" y="200" width="110" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-115" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="219.83000000000004" y="200" as="sourcePoint" />
<mxPoint x="219.83000000000004" y="160" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-116" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-105">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="165" y="310" as="sourcePoint" />
<mxPoint x="165" y="240" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-117" value="" style="swimlane;startSize=0;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="340" y="30" width="280" height="260" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-118" value="Block" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry width="60" height="30" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-119" value="Shared memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#fad7ac;strokeColor=#b46504;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry x="20" y="40" width="240" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-120" value="Registers" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#fad7ac;strokeColor=#b46504;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry x="20" y="120" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-121" value="Registers" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#fad7ac;strokeColor=#b46504;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry x="190" y="120" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-122" value="Thread" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f5f5f5;fontColor=#333333;strokeColor=#666666;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry x="20" y="200" width="110" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-123" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="59.83000000000001" y="200" as="sourcePoint" />
<mxPoint x="59.83000000000001" y="160" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-124" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="115" y="200" as="sourcePoint" />
<mxPoint x="115" y="80" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-125" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="165" y="200" as="sourcePoint" />
<mxPoint x="165" y="80" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-126" value="Thread" style="rounded=1;whiteSpace=wrap;html=1;fillColor=#f5f5f5;fontColor=#333333;strokeColor=#666666;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry x="150" y="200" width="110" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-127" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-117">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="219.83000000000004" y="200" as="sourcePoint" />
<mxPoint x="219.83000000000004" y="160" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-149" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;endSize=6;startSize=6;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="60" y="320" as="sourcePoint" />
<mxPoint x="60" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-150" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;endSize=6;startSize=6;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="210" y="320" as="sourcePoint" />
<mxPoint x="210" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-151" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;endSize=6;startSize=6;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="430" y="320" as="sourcePoint" />
<mxPoint x="430" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-152" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;endSize=6;startSize=6;jumpSize=6;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="580" y="320" as="sourcePoint" />
<mxPoint x="580" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-146" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="80" y="340" as="sourcePoint" />
<mxPoint x="80" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-148" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="455" y="340" as="sourcePoint" />
<mxPoint x="455" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-147" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#fad7ac;strokeColor=#b46504;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="560" y="340" as="sourcePoint" />
<mxPoint x="560" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-156" value="Global memory" style="text;html=1;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="275" y="320" width="90" height="30" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-157" value="Texture/Surface memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#cdeb8b;strokeColor=#36393d;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="30" y="420" width="580" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-158" value="Constant memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#cdeb8b;strokeColor=#36393d;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="30" y="480" width="580" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-159" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="520" y="480" as="sourcePoint" />
<mxPoint x="520" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-160" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="500" y="420" as="sourcePoint" />
<mxPoint x="500" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-161" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="370" y="420" as="sourcePoint" />
<mxPoint x="370" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-162" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="270" y="420" as="sourcePoint" />
<mxPoint x="270" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-163" value="" style="endArrow=classic;startArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="140" y="420" as="sourcePoint" />
<mxPoint x="140" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-164" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="120" y="480" as="sourcePoint" />
<mxPoint x="120" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-165" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="250" y="480" as="sourcePoint" />
<mxPoint x="250" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-166" value="" style="endArrow=classic;html=1;rounded=0;fillColor=#d5e8d4;strokeColor=#3A4F2D;" edge="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="390" y="480" as="sourcePoint" />
<mxPoint x="390" y="270" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-167" value="Local memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="530" y="340" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-168" value="Local memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="400" y="340" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-169" value="Local memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="170" y="340" width="70" height="40" as="geometry" />
</mxCell>
<mxCell id="pr25IfJEznjW-GiSo1Zk-170" value="Local memory" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#ffe6cc;strokeColor=#d79b00;" vertex="1" parent="pr25IfJEznjW-GiSo1Zk-103">
<mxGeometry x="40" y="340" width="70" height="40" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

View File

@ -1,3 +1,4 @@
CUDA
GPGPU
SIMT
Single-Instruction

View File

@ -20,7 +20,7 @@ A typical equation learner generates multiple expressions at once. If the equati
\section[GPGPU]{General Purpose Computation on Graphics Processing Units}
\label{sec:gpgpu}
Graphics cards (GPUs) are commonly used to increase the performance of many different applications. Originally they were designed to improve performance and visual quality in games. \textcite{dokken_gpu_2005} first described the usage of GPUs for general purpose programming (GPGPU). They have shown how the graphics pipeline can be used for GPGPU programming. Because this approach also requires the programmer to understand the graphics terminology, this was not a great solution. Therefore, Nvidia released CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}} in 2007 with the goal of allowing developers to program GPUs independent of the graphics pipeline and terminology. A study of the programmability of GPUs with CUDA and the resulting performance has been conducted by \textcite{huang_gpu_2008}. They found that GPGPU programming has potential, even for non-embarassingly parallel problems. Research is also done in making the low level CUDA development simpler. \textcite{han_hicuda_2011} have described a directive-based language to make development simpler and less error-prone, while retaining the performance of handwritten code. To drastically simplify CUDA development, \textcite{besard_effective_2019} showed that it is possible to develop with CUDA in the high level programming language Julia\footnote{\url{https://julialang.org/}} with similar performance to CUDA written in C. In a subsequent study \textcite{lin_comparing_2021} found that high performance computing (HPC) on the CPU and GPU in Julia performs similar to HPC development in C. This means that Julia can be a viable alternative to Fortran, C and C++ in the HPC field and has the additional benefit of developer comfort since it is a high level language with modern features such as garbage-collectors. \textcite{besard_rapid_2019} have also shown how the combination of Julia and CUDA help in rapidly developing HPC software. 
While this thesis in general revolves around CUDA, there also exist alternatives by AMD called ROCm\footnote{\url{https://www.amd.com/de/products/software/rocm.html}} and a vendor independent alternative called OpenCL\footnote{\url{https://www.khronos.org/opencl/}}.
Graphics cards (GPUs) are commonly used to increase the performance of many different applications. Originally they were designed to improve performance and visual quality in games. \textcite{dokken_gpu_2005} first described the usage of GPUs for general purpose programming (GPGPU). They have shown how the graphics pipeline can be used for GPGPU programming. Because this approach also requires the programmer to understand the graphics terminology, this was not a great solution. Therefore, Nvidia released CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}} in 2007 with the goal of allowing developers to program GPUs independent of the graphics pipeline and terminology. A study of the programmability of GPUs with CUDA and the resulting performance has been conducted by \textcite{huang_gpu_2008}. They found that GPGPU programming has potential, even for non-embarrassingly parallel problems. Research is also done in making the low level CUDA development simpler. \textcite{han_hicuda_2011} have described a directive-based language to make development simpler and less error-prone, while retaining the performance of handwritten code. To drastically simplify CUDA development, \textcite{besard_effective_2019} showed that it is possible to develop with CUDA in the high level programming language Julia\footnote{\url{https://julialang.org/}} with similar performance to CUDA written in C. In a subsequent study \textcite{lin_comparing_2021} found that high performance computing (HPC) on the CPU and GPU in Julia performs similarly to HPC development in C. This means that Julia can be a viable alternative to Fortran, C and C++ in the HPC field and has the additional benefit of developer comfort since it is a high level language with modern features such as garbage collectors. \textcite{besard_rapid_2019} have also shown how the combination of Julia and CUDA help in rapidly developing HPC software. 
While this thesis in general revolves around CUDA, there also exist alternatives by AMD called ROCm\footnote{\url{https://www.amd.com/de/products/software/rocm.html}} and a vendor independent alternative called OpenCL\footnote{\url{https://www.khronos.org/opencl/}}. If not specified otherwise, the following section and its subsections use the information presented by \textcite{nvidia_cuda_2024} in their CUDA programming guide.
While in the early days of GPGPU programming a lot of research has been done to assess if this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs were used early on to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 20 for the most compute-intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022} which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU only implementation. 
Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. However, it also needs to be noted, that GPUs are not always better performing than CPUs as illustrated by \textcite{lee_debunking_2010}, but they still can lead to performance improvements nonetheless.
@ -30,7 +30,7 @@ The development process on a GPU is vastly different from a CPU. A CPU has tens
\begin{figure}
\centering
\includegraphics[width=1\textwidth]{nvidia_cpu_vs_gpu.png}
\caption{Overview of the architecture of a CPU (left) and a GPU (right). Note the higher number of simpler cores on the GPU \parencite{nvidia_cuda_2024}.}
\caption{Overview of the architecture of a CPU (left) and a GPU (right). Note the higher number of simpler and smaller cores on the GPU \parencite{nvidia_cuda_2024}.}
\label{fig:cpu_vs_gpu}
\end{figure}
@ -48,7 +48,7 @@ At the lowest level of a GPU exists a Streaming Multiprocessor (SM), which is a
\label{fig:thread_hierarchy}
\end{figure}
A piece of code that is executed on a GPU is written as a kernel which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads and blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in section \ref{sec:memory_model}. Considering the case where an image needs to be blurred, it not only simplifies the development if threads are arranged in a 2D grid, it also helps with optimising memory access. As the threads in a block, need to access a lot of the same data, this data can be loaded in the shared memory of the block. This allows the data to be accessed much quicker compared to when threads are allocated in only one dimension. With one dimensional blocks it is possible that threads assigned to nearby pixels, are part of a different block, leading to a lot of duplicate data transfer.
A piece of code that is executed on a GPU is written as a kernel which can be configured. The most important configuration is how threads are grouped into blocks. The GPU allows the kernel to allocate threads, blocks and block clusters in up to three dimensions. This is often useful because of the already mentioned shared memory, which will be explained in more detail in section \ref{sec:memory_model}. Considering the case where an image needs to be blurred, it not only simplifies the development if threads are arranged in a 2D grid, it also helps with optimising memory access. As the threads in a block need to access a lot of the same data, this data can be loaded in the shared memory of the block. This allows the data to be accessed much quicker compared to when threads are allocated in only one dimension. With one dimensional blocks it is possible that threads assigned to nearby pixels are part of a different block, leading to a lot of duplicate data transfer. Although the size in each dimension of the blocks can be almost arbitrary, blocks that are too large might lead to other problems which are described in more detail in section \ref{sec:occupancy}.
All threads in a warp start at the same point in a program, but they have their own instruction address, allowing them to work independently. Because of the SIMD architecture, all threads in a warp must execute the same instructions and if threads start diverging, the SM must pause threads with different instructions and execute them later. Figure \ref{fig:thread_divergence} shows how such divergences can impact performance. The situation described by the figure also shows that the divergent thread does not reconverge after the branch, which leads to T2 being executed only after T1 and T3 are finished. In situations where a lot of data dependent thread divergence happens, most of the benefits of using a GPU have vanished.
@ -59,7 +59,7 @@ All threads in a warp start at the same point in a program, they have their own
\label{fig:thread_divergence}
\end{figure}
Threads not executing the same instruction is against the SIMD principle but can happen in reality, due to data dependent branching. Consequently, this leads to bad resource utilisation, which in turn leads to worse performance. Another possibility of threads being paused (inactive threads) is the fact that sometimes, the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads but only the threads with work are executed \parencite{nvidia_cuda_2024}.
Threads not executing the same instruction is against the SIMD principle but can happen in reality, due to data dependent branching. Consequently, this leads to bad resource utilisation, which in turn leads to worse performance. Another possibility of threads being paused (inactive threads) is the fact that sometimes, the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads but only the threads with work are executed.
Modern GPUs implement the so-called Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can develop fast and correct programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads once data dependent divergence occurred. A re-convergence algorithm was proposed by \textcite{collange_stack-less_2011} where they have shown that these approaches help with hardware occupation, resulting in improved performance as threads are now no longer fully serialised. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they reconverge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only partial active threads. This means the deactivated threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations.
@ -68,13 +68,44 @@ Modern GPUs implement the so called Single-Instruction Multiple-Thread (SIMT) ar
\subsubsection{Memory Model}
\label{sec:memory_model}
On a GPU there are two parts that contribute to the performance of an algorithm. The one already looked at is the compute-portion of the GPU. This is necessary because if threads are serialised or run inefficiently, there is nothing that can make the algorithm execute faster. However, algorithms run on a GPU usually require huge amounts of data to be processed, as they are designed for exactly that purpose. The purpose of this section is to explain how the memory model of the GPU works and how it can influence the performance of an algorithm.
% If more is needed talk about the following:
% - Memory allocation (with the one paper diving into dynamic allocations)
% - Memory transfer (with streams potentially)
Talk about memory model and memory allocation (with the one paper diving into dynamic allocations)
Memory transfer (with streams potentially)
On a GPU there are two parts that contribute to the performance of an algorithm. The one already looked at is the compute-portion of the GPU. This is necessary because if threads are serialised or run inefficiently, there is nothing that can make the algorithm execute faster. However, algorithms run on a GPU usually require huge amounts of data to be processed, as they are designed for exactly that purpose. The purpose of this section is to explain how the memory model of the GPU works and how it can influence the performance of an algorithm. In figure \ref{fig:gpu_memory_layout} the memory layout and the kinds of memory available are depicted. The different parts will be explained in this section.
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{gpu_memory_layout.png}
\caption{The layout of the memory in the GPU. The connections between the memory regions can be seen as well as the different kinds of memory available.}
\label{fig:gpu_memory_layout}
\end{figure}
On a GPU there are multiple levels and kinds of memory available. All these levels and kinds have different purposes they are optimised for. This means that it is important to know what they are and how they can be best used for specific tasks. On the lowest level, threads have registers and local memory available. Registers are the fastest memory to access, but they are also the least abundant, with a maximum of 255 32-Bit registers per thread on Nvidia and 256 on AMD \parencite{amd_hardware_2025}. However, using all registers of a thread can lead to other problems, which are described in more detail in section \ref{sec:occupancy}. On the other side, thread-local memory is significantly slower than registers. This is due to the fact that local memory is actually stored in global memory and therefore has the same limitations, which are explained later. This means it is important to avoid local memory as much as possible. Local memory is usually only used when a thread uses too many registers. The compiler will then spill the remaining data into local memory and load it into registers once needed, drastically slowing down the application.
Shared memory is the next tier of memory on a GPU. Unlike local memory and registers, shared memory is shared between all threads inside a block. The amount of shared memory depends on the GPU architecture, but for Nvidia it hovers at around 100 Kilobyte (KB) per block. While this memory is slower than registers, its primary use-case is communicating and sharing data between threads in a block. It is advantageous when all threads in a block access a lot of overlapping data, as the data can then be loaded from global memory into the faster shared memory once and accessed multiple times, further increasing performance. Loading data into shared memory and accessing that data has to be done manually. Because shared memory is part of the unified data cache, it can either be used as a cache or for manual use, meaning a developer can allocate more shared memory towards caching if needed. Another feature of shared memory is the so-called memory banks. Shared memory is always split into 32 equally sized memory modules, also called memory banks. All available memory addresses lie in one of these banks. This means that if two threads access two different memory addresses which lie in different banks, the access can be performed simultaneously, increasing the throughput.
The most abundant and slowest memory is the global memory, which resides in device memory. A key constraint of device memory, and therefore global memory, is that it is accessed in either 32, 64 or 128 byte chunks. This means that if a thread wants to access 8 bytes from global memory, the 24 bytes after the requested 8 bytes are transferred alongside them. As a result, the throughput is only a fourth of the theoretical maximum. Therefore, it is important to follow optimal access patterns. What these optimal patterns are is architecture-dependent, and they are described in the corresponding sections of the CUDA programming guide.
A small portion of device memory is allocated to constant memory. Constant memory is accessible by all threads and, as the name implies, cannot be written to by threads. It can be initialised by the CPU when starting a kernel if needed. As constant memory has a separate cache, it can be used to speed up data access for constant and frequently accessed data.
Another special kind of memory is the texture and surface memory. According to \textcite{amd_hip_2025}, texture memory is read-only memory, while surface memory can also be written to, which is the only difference between these two kinds of memory. Nvidia does not explicitly state this behaviour, but due to the fact that accessing textures is only performed via caches, it is implied that on Nvidia GPUs, texture memory is also read-only. As the name implies, this kind of memory is optimised for accessing textures. This means that threads of the same warp accessing data which is spatially close together will achieve increased performance. As already mentioned, surface memory works the same way, with the difference that it can be written to. It is therefore well suited for manipulating two- or three-dimensional data.
\subsubsection{Occupancy}
\label{sec:occupancy}
% Describe occupancy, why it is important and what can impact it. Maybe add a simplified version of this table: \url{https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications-technical-specifications-per-compute-capability} to explain the bounds and effects on occupancy
Occupancy describes the utilisation of a GPU. A high occupancy means that the compute-resources of the GPU are utilised, or in other words occupied with work. This is important, as a high occupancy means that the GPU is performing work, in contrast to low occupancy, where the GPU is waiting for work to be scheduled. As a result, it is important to achieve high occupancy in order to increase the performance of an algorithm. It needs to be noted that occupancy is not the only option for improving performance, as it is possible for the GPU to have a high occupancy while performing a lot of unnecessary work or utilising compute-resources that are slower. An example for the latter would be developing an algorithm that uses 64-bit floating point (FP64) numbers while 32-bit floating point (FP32) numbers would have sufficient accuracy. Because GPUs tend to have fewer FP64 compute-resources than FP32 compute-resources, performing FP64 operations will take longer. However, despite these drawbacks, having high occupancy is still an important metric, and ways of achieving high occupancy will be outlined in this section.
\subsection[PTX]{Parallel Thread Execution}
Describe what PTX is to get a common ground for the implementation chapter. Probably a short section
% Describe what PTX is to get a common ground for the implementation chapter. Probably a short section
% https://docs.nvidia.com/cuda/parallel-thread-execution/
While in most cases a GPU is programmed in a higher level language like C++ or even Julia\footnote{\url{https://juliagpu.org/}}, it is also possible to program GPUs with the low level language Parallel Thread Execution (PTX) developed by Nvidia. A brief overview of what PTX is and how it can be used to program GPUs is given in this section. Information in this section is taken from the PTX documentation \parencite{nvidia_parallel_2025} if not stated otherwise.
% PTX is IL and every CUDA program is compiled to PTX; Driver compiles PTX to machine code
% Quick overview of how PTX instructions are structured and I think thats it for this section.
\section{Compilers}

Binary file not shown.

After

Width:  |  Height:  |  Size: 112 KiB

Binary file not shown.

View File

@ -334,7 +334,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
author = {Dong, Junlan and Zhong, Jinghui and Liu, Wei-Li and Zhang, Jun},
urldate = {2025-02-26},
date = {2024},
note = {Conference Name: {IEEE} Transactions on Evolutionary Computation},
keywords = {Optimization, Adaptation models, Complexity theory, Equation Learner, Evolutionary computation, Evolving equation learner, Mathematical models, Neural networks, Progressive Evolutionary Structure Search, Training},
file = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\8PQADTZP\\metrics.html:text/html},
}
@ -585,10 +584,10 @@ Publisher: Multidisciplinary Digital Publishing Institute},
@online{amd_hip_2025,
title = {{HIP} programming model — {HIP} 6.3.42134 Documentation},
url = {https://rocm.docs.amd.com/projects/HIP/en/latest/understand/programming_model.html#programming-model-simt},
url = {https://rocm.docs.amd.com/projects/HIP/en/latest/understand/programming_model.html},
author = {{AMD}},
urldate = {2025-03-09},
date = {2025-03},
date = {2025-02},
file = {HIP programming model — HIP 6.3.42134 Documentation:C\:\\Users\\danwi\\Zotero\\storage\\6KRNU6PG\\programming_model.html:text/html},
}
@ -713,14 +712,13 @@ Publisher: Multidisciplinary Digital Publishing Institute},
edition = {1},
title = {Climate Models},
isbn = {978-1-009-08209-9 978-1-316-51427-6},
url = {https://www.cambridge.org/core/product/identifier/9781009082099%23CN-bp-14/type/book_part},
url = {http://dx.doi.org/10.1017/9781009082099.018},
pages = {126--136},
booktitle = {A Critical Assessment of the Intergovernmental Panel on Climate Change},
publisher = {Cambridge University Press},
author = {Guillemot, Hélène},
bookauthor = {Hulme, Mike},
editor = {De Pryck, Kari},
urldate = {2025-03-14},
date = {2022-12-31},
file = {Full Text:C\:\\Users\\danwi\\Zotero\\storage\\MUKXXCV9\\Guillemot - 2022 - Climate Models.pdf:application/pdf},
}
@ -777,7 +775,7 @@ Publisher: Multidisciplinary Digital Publishing Institute},
author = {Sun, Fangzheng and Liu, Yang and Wang, Jian-Xun and Sun, Hao},
urldate = {2025-03-14},
date = {2023-02-02},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Symbolic Computation, Nonlinear Sciences - Chaotic Dynamics, Physics - Computational Physics},
keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Symbolic Computation, Nonlinear Sciences - Chaotic Dynamics, Physics - Computational Physics},
file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\YBXYH5D6\\Sun et al. - 2023 - Symbolic Physics Learner Discovering governing equations via Monte Carlo tree search.pdf:application/pdf;Snapshot:C\:\\Users\\danwi\\Zotero\\storage\\D9SDYVT3\\2205.html:text/html},
}
@ -811,6 +809,22 @@ Publisher: Multidisciplinary Digital Publishing Institute},
author = {Lemos, Pablo and Jeffrey, Niall and Cranmer, Miles and Ho, Shirley and Battaglia, Peter},
urldate = {2025-03-14},
date = {2022-02-04},
keywords = {Astrophysics - Earth and Planetary Astrophysics, Astrophysics - Instrumentation and Methods for Astrophysics, Computer Science - Machine Learning},
keywords = {Computer Science - Machine Learning, Astrophysics - Earth and Planetary Astrophysics, Astrophysics - Instrumentation and Methods for Astrophysics},
file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\9YPFHHRY\\Lemos et al. - 2022 - Rediscovering orbital mechanics with machine learning.pdf:application/pdf;Snapshot:C\:\\Users\\danwi\\Zotero\\storage\\YIFHYWCY\\2202.html:text/html},
}
@online{amd_hardware_2025,
title = {Hardware features — {HIP} 6.3.42134 Documentation},
url = {https://rocm.docs.amd.com/projects/HIP/en/latest/reference/hardware_features.html},
author = {{AMD}},
urldate = {2025-03-15},
date = {2025-02},
}
@online{nvidia_parallel_2025,
title = {Parallel Thread Execution {ISA} Version 8.7},
url = {https://docs.nvidia.com/cuda/parallel-thread-execution/},
author = {{Nvidia}},
urldate = {2025-03-15},
date = {2025-03},
}