relwork: finished relwork

Daniel 2025-03-21 14:35:55 +01:00
parent db3ea32b66
commit a718a3572e
6 changed files with 323 additions and 31 deletions

@ -0,0 +1,81 @@
<mxfile host="app.diagrams.net" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:136.0) Gecko/20100101 Firefox/136.0" version="26.1.1">
<diagram name="Page-1" id="CwRLx42RAcgxm35m21Lx">
<mxGraphModel dx="1430" dy="1615" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="1169" pageHeight="827" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-24" value="" style="swimlane;startSize=0;" vertex="1" parent="1">
<mxGeometry x="680" y="-180" width="440" height="120" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-31" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="wWUOW6dZojJ5Lo5lHCWY-24" source="wWUOW6dZojJ5Lo5lHCWY-29" target="wWUOW6dZojJ5Lo5lHCWY-30">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-32" value="Bison" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="wWUOW6dZojJ5Lo5lHCWY-31">
<mxGeometry x="-0.0202" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-29" value="Scanner" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-24">
<mxGeometry x="180" y="50" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-30" value="Parser" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-24">
<mxGeometry x="340" y="50" width="80" height="40" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-27" value="Frontend" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-24">
<mxGeometry x="185" width="70" height="30" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-33" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="wWUOW6dZojJ5Lo5lHCWY-24" source="wWUOW6dZojJ5Lo5lHCWY-26" target="wWUOW6dZojJ5Lo5lHCWY-29">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-34" value="Flex" style="edgeLabel;html=1;align=center;verticalAlign=middle;resizable=0;points=[];" vertex="1" connectable="0" parent="wWUOW6dZojJ5Lo5lHCWY-33">
<mxGeometry x="0.0455" relative="1" as="geometry">
<mxPoint as="offset" />
</mxGeometry>
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-26" value="Source Code" style="rounded=1;whiteSpace=wrap;html=1;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-24">
<mxGeometry x="20" y="50" width="80" height="41" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-35" value="" style="swimlane;startSize=0;" vertex="1" parent="1">
<mxGeometry x="680" y="20" width="440" height="120" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-36" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="wWUOW6dZojJ5Lo5lHCWY-35" source="wWUOW6dZojJ5Lo5lHCWY-38" target="wWUOW6dZojJ5Lo5lHCWY-39">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-38" value="Code Generator" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-35">
<mxGeometry x="170" y="50" width="100" height="40" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-39" value="Machine code" style="rounded=1;whiteSpace=wrap;html=1;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-35">
<mxGeometry x="330" y="50" width="90" height="40" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-40" value="Backend" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-35">
<mxGeometry x="185" width="70" height="30" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-41" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="wWUOW6dZojJ5Lo5lHCWY-35" source="wWUOW6dZojJ5Lo5lHCWY-43" target="wWUOW6dZojJ5Lo5lHCWY-38">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-43" value="Optimiser" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="wWUOW6dZojJ5Lo5lHCWY-35">
<mxGeometry x="20" y="50" width="90" height="41" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-46" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="wWUOW6dZojJ5Lo5lHCWY-44" target="wWUOW6dZojJ5Lo5lHCWY-43">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="660" y="-20" />
<mxPoint x="660" y="91" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-44" value="Intermediate representation" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
<mxGeometry x="820" y="-40" width="160" height="40" as="geometry" />
</mxCell>
<mxCell id="wWUOW6dZojJ5Lo5lHCWY-45" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;" edge="1" parent="1" source="wWUOW6dZojJ5Lo5lHCWY-30" target="wWUOW6dZojJ5Lo5lHCWY-44">
<mxGeometry relative="1" as="geometry">
<Array as="points">
<mxPoint x="1140" y="-110" />
<mxPoint x="1140" y="-20" />
</Array>
</mxGeometry>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

@ -1,2 +1,3 @@
{"rule":"OXFORD_SPELLING_Z_NOT_S","sentence":"^\\QOptimisation of software\\E$"}
{"rule":"TO_TOO","sentence":"^\\QThey introduced the division operator, which led to much better results.\\E$"}
{"rule":"COLLECTIVE_NOUN_VERB_AGREEMENT_VBP","sentence":"^\\QIn cases where occupancy is already high and the amount of warps ready for execution is also high, other areas for performance improvements need to be explored.\\E$"}

@ -22,7 +22,7 @@ A typical equation learner generates multiple expressions at once. If the equati
\label{sec:gpgpu}
Graphics cards (GPUs) are commonly used to increase the performance of many different applications. Originally they were designed to improve performance and visual quality in games. \textcite{dokken_gpu_2005} first described the usage of GPUs for general purpose programming (GPGPU). They have shown how the graphics pipeline can be used for GPGPU programming. Because this approach also requires the programmer to understand the graphics terminology, it was not a great solution. Therefore, Nvidia released CUDA\footnote{\url{https://developer.nvidia.com/cuda-toolkit}} in 2007 with the goal of allowing developers to program GPUs independently of the graphics pipeline and terminology. A study of the programmability of GPUs with CUDA and the resulting performance has been conducted by \textcite{huang_gpu_2008}. They found that GPGPU programming has potential, even for non-embarrassingly parallel problems. Research has also been done on making low-level CUDA development simpler. \textcite{han_hicuda_2011} have described a directive-based language to make development simpler and less error-prone, while retaining the performance of handwritten code. To drastically simplify CUDA development, \textcite{besard_effective_2019} showed that it is possible to develop with CUDA in the high-level programming language Julia\footnote{\url{https://julialang.org/}} with similar performance to CUDA written in C. In a subsequent study, \textcite{lin_comparing_2021} found that high performance computing (HPC) on the CPU and GPU in Julia performs similarly to HPC development in C. This means that Julia can be a viable alternative to Fortran, C and C++ in the HPC field, with the additional benefit of developer comfort, since it is a high-level language with modern features such as garbage collection. \textcite{besard_rapid_2019} have also shown how the combination of Julia and CUDA helps in rapidly developing HPC software. While this thesis in general revolves around CUDA, there also exist alternatives: ROCm\footnote{\url{https://www.amd.com/de/products/software/rocm.html}} by AMD and the vendor-independent OpenCL\footnote{\url{https://www.khronos.org/opencl/}}. If not specified otherwise, the following section and its subsections use the information presented by \textcite{nvidia_cuda_2025} in their CUDA programming guide.
While in the early days of GPGPU programming a lot of research was done to assess whether this approach is feasible, it now seems obvious to use GPUs to accelerate algorithms. GPUs were used early on to speed up weather simulation models. \textcite{michalakes_gpu_2008} proposed a method for simulating weather with the Weather Research and Forecast (WRF) model on a GPU. With their approach, they reached a speed-up of 5 to 20 for the most compute-intensive task, with little GPU optimisation effort. They also found that the GPU usage was low, meaning there are resources and potential for more detailed simulations. Generally, simulations are great candidates for using GPUs, as they can benefit heavily from a high degree of parallelism and data throughput. \textcite{koster_high-performance_2020} have developed a way of using adaptive time steps on the GPU to considerably improve the performance of numerical and discrete simulations. In addition to the performance gains, they were able to retain the precision and constraint correctness of the simulation. Black hole simulations are crucial for science and education for a better understanding of our world. \textcite{verbraeck_interactive_2021} have shown that simulating complex Kerr (rotating) black holes can be done on consumer hardware in a few seconds. Schwarzschild black hole simulations can be performed in real-time with GPUs as described by \textcite{hissbach_overview_2022}, which is especially helpful for educational scenarios. While both approaches do not have the same accuracy as detailed simulations on supercomputers, they show how a single GPU can yield similar accuracy at a fraction of the cost. Software network routing can also heavily benefit from GPU acceleration as shown by \textcite{han_packetshader_2010}, where they achieved a significantly higher throughput than with a CPU-only implementation. Finite element structural analysis is an essential tool for many branches of engineering and can also heavily benefit from the usage of GPUs as demonstrated by \textcite{georgescu_gpu_2013}. Generating test data for DeepQ learning can also significantly benefit from using the GPU \parencite{koster_macsq_2022}. However, it must also be noted that GPUs do not always outperform CPUs, as illustrated by \textcite{lee_debunking_2010}, but they can still lead to performance improvements nonetheless.
\subsection{Programming GPUs}
The development process on a GPU is vastly different from that on a CPU. A CPU has tens or hundreds of complex cores, with the AMD Epyc 9965\footnote{\url{https://www.amd.com/en/products/processors/server/epyc/9005-series/amd-epyc-9965.html}} having a staggering $192$ of those complex cores and twice as many threads. A guide for a simple one-core 8-bit CPU has been published by \textcite{schuurman_step-by-step_2013}. He describes the different and complex parts of a CPU core. Modern CPUs are even more complex, with dedicated fast integer and floating-point arithmetic gates as well as logic gates, sophisticated branch prediction and much more. This makes a CPU perfect for handling complex control flow on a single program thread, and on modern CPUs even on multiple threads simultaneously. However, as seen in section \ref{sec:gpgpu}, this is often not enough. On the other hand, a GPU contains thousands or even tens of thousands of cores. For example, the GeForce RTX 5090\footnote{\url{https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/}} contains a total of $21760$ CUDA cores. To achieve this enormous core count, a single GPU core has to be much simpler than a single CPU core. As described by \textcite{nvidia_cuda_2025}, a GPU dedicates many more transistors to floating-point computations. This results in less efficient integer arithmetic and control flow handling. There is also less cache available per core, and clock speeds are usually much lower than those on a CPU. An overview of the differences between a CPU and a GPU architecture can be seen in figure \ref{fig:cpu_vs_gpu}.
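As a minimal illustration of this programming model, the following sketch launches a vector-addition kernel with CUDA.jl \parencite{besard_effective_2019}; the kernel name, array sizes and launch configuration are arbitrary choices for this example.
\begin{GenericCode}[numbers=none]
using CUDA

# Each thread computes one element; the global index is derived from
# the block index, the block size and the thread index in the block.
function vadd_kernel!(c, a, b)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(c)  # guard: more threads may be started than needed
        @inbounds c[i] = a[i] + b[i]
    end
    return nothing
end

a = CUDA.rand(Float32, 1024)
b = CUDA.rand(Float32, 1024)
c = CUDA.zeros(Float32, 1024)

# Four blocks of 256 threads each cover all 1024 elements.
@cuda threads=256 blocks=4 vadd_kernel!(c, a, b)
\end{GenericCode}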
@ -61,10 +61,7 @@ All threads in a warp start at the same point in a program, they have their own
Threads not executing the same instruction violates the SIMD principle, but can happen in reality due to data-dependent branching. Consequently, this leads to bad resource utilisation, which in turn leads to worse performance. Threads can also be paused (inactive threads) when the number of threads started is not divisible by 32. In such cases, the last warp still contains 32 threads, but only the threads with work are executed.
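The following CUDA.jl sketch (with an arbitrarily chosen, data-dependent predicate) illustrates such divergence: threads of the same warp whose values differ in sign take different branches, and the warp executes both paths one after the other while the respective other threads idle.
\begin{GenericCode}[numbers=none]
using CUDA

function divergent_kernel!(out, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(x)
        if x[i] > 0            # data-dependent branch
            @inbounds out[i] = sqrt(x[i])    # some threads of a warp
        else
            @inbounds out[i] = -x[i] * x[i]  # the remaining threads
        end
    end
    return nothing
end

x = CUDA.randn(Float32, 1024)  # positive and negative values mixed
out = CUDA.zeros(Float32, 1024)
@cuda threads=256 blocks=4 divergent_kernel!(out, x)
\end{GenericCode}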
Modern GPUs implement the so-called Single-Instruction Multiple-Thread (SIMT) architecture. In many cases a developer does not need to know the details of SIMT and can develop fast and correct programs with just the SIMD architecture in mind. However, leveraging the power of SIMT can yield substantial performance gains by re-converging threads once data-dependent divergence has occurred. A re-convergence algorithm was proposed by \textcite{collange_stack-less_2011}, where they have shown that these approaches help with hardware occupation, resulting in improved performance as threads are no longer fully serialised. Another approach for increasing occupancy using the SIMT architecture is proposed by \textcite{fung_thread_2011}. They introduced a technique for compacting thread blocks by moving divergent threads to new warps until they re-converge. This approach resulted in a noticeable speed-up between 17\% and 22\%. Another example where a SIMT-aware algorithm can perform better was proposed by \textcite{koster_massively_2020}. While they did not implement techniques for thread re-convergence, they implemented a thread compaction algorithm. On data-dependent divergence it is possible for threads to end early, leaving a warp with only partially active threads. This means the deactivated threads are still occupied and cannot be used for other work. Their thread compaction tackles this problem by moving active threads into a new thread block, releasing the inactive threads to perform other work. With this they were able to gain a speed-up of roughly 4 times compared to previous implementations. Adapting Multiple-Instruction Multiple-Data (MIMD) programs with synchronisation to run on a SIMT architecture can be a difficult task, especially if the underlying architecture is not well understood. A static analysis tool and a transformer specifically designed to help avoid deadlocks with MIMD synchronisation is proposed by \textcite{eltantawy_mimd_2016}. In addition, they proposed a hardware re-convergence mechanism that supports MIMD synchronisation. A survey by \textcite{khairy_survey_2019} explores different aspects of improving GPGPU performance architecturally. Specifically, they have compiled a list of different publications discussing algorithms for thread re-convergence, thread compaction and much more. Their main goal was to give a broad overview of many ways to improve the performance of GPGPU programming to help other developers.
\subsubsection{Memory Model}
\label{sec:memory_model}
@ -129,14 +126,11 @@ Additionally, shared memory consumption can also impact the occupancy. If for ex
Balancing these limitations, and therefore the occupancy and performance, often requires a lot of trial and error with the help of the aforementioned tools. In cases where occupancy is already high and the number of warps ready for execution is also high, other areas for performance improvements need to be explored. Algorithmic optimisation is always a good idea. Some performance improvements can be achieved by altering the computations to use different parts of the GPU. One such optimisation is using FP32 operations wherever possible. Another well-suited optimisation is to rewrite the algorithm to use as many Fused Multiply-Add (FMA) instructions as possible. FMA is a special floating-point instruction that multiplies two values and adds a third, all in a single clock cycle \parencite{nvidia_cuda_2025-1}. However, the result might slightly deviate from performing these two operations separately, which means that in accuracy-sensitive scenarios this instruction should be avoided. If the compiler detects a floating-point operation with the FMA structure, it will automatically be compiled to an FMA instruction. To prevent this, in C++ the developer can call the intrinsics \_\_fadd\_rn and \_\_fmul\_rn for addition and multiplication respectively.
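The effect of fusing can be reproduced on the CPU with Julia's built-in fma function; this minimal sketch (values chosen arbitrarily) shows that the fused result differs from the separately rounded multiplication and addition.
\begin{GenericCode}[numbers=none]
x = Float32(1) / 3    # not exactly representable, so products round
y = Float32(10) / 3
z = -(x * y)          # the rounded product, negated

separate = x * y + z     # multiply rounds, then add rounds: exactly 0.0
fused    = fma(x, y, z)  # one rounding: the rounding error of x * y

println(separate)     # 0.0
println(fused)        # a small non-zero value
\end{GenericCode}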
\subsection[PTX]{Parallel Thread Execution}
% Describe what PTX is to get a common ground for the implementation chapter. Probably a short section
% https://docs.nvidia.com/cuda/parallel-thread-execution/
While in most cases a GPU is programmed in a higher-level language like C++ or even Julia\footnote{\url{https://juliagpu.org/}}, it is also possible to program GPUs with the low-level language Parallel Thread Execution (PTX) developed by Nvidia. A brief overview of what PTX is and how it can be used to program GPUs is given in this section. Information in this section is taken from the PTX documentation \parencite{nvidia_parallel_2025} if not stated otherwise.
PTX defines a virtual machine with its own instruction set architecture (ISA) and is designed for data-parallel processing on a GPU. It is an abstraction of the underlying hardware instruction set, allowing PTX code to be portable across Nvidia GPUs. In order for PTX code to be usable on the GPU, the compiler is responsible for compiling the code to the hardware instruction set of the GPU it is run on. A developer typically writes a kernel in CUDA using C++, for example, and the Nvidia compiler generates the PTX code for that kernel. The concepts for programming the GPU with PTX and CUDA are the same, apart from the terminology, which is slightly different. For consistency, the CUDA terminology will continue to be used.
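To inspect the PTX that is generated for a kernel, CUDA.jl offers reflection macros; the following sketch (kernel and values chosen purely for illustration) prints the PTX code of a simple kernel. For C++ kernels, compiling with nvcc and its -ptx flag produces comparable output.
\begin{GenericCode}[numbers=none]
using CUDA

function axpy_kernel!(y, a, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(y)
        @inbounds y[i] = a * x[i] + y[i]  # a candidate for an FMA
    end
    return nothing
end

x = CUDA.rand(Float32, 256)
y = CUDA.rand(Float32, 256)

# Prints the PTX generated for this kernel launch.
@device_code_ptx @cuda threads=256 axpy_kernel!(y, 2f0, x)
\end{GenericCode}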
% Quick overview of how PTX instructions are structured and I think thats it for this section.
% structured: Begin with .version, then optional .target then code I think. "Code" can be explained in more detail.
Syntactically, PTX resembles assembly-style code. Every PTX program must contain a .version directive, which indicates the PTX version, and an optional .target directive, which indicates the compute capability. If the program works with 64-bit addresses, the optional .address\_size directive can be used to indicate that, which simplifies the code for such applications. After these directives, the actual code is written. As each PTX program needs an entry point (the kernel), the .entry directive indicates the name of the kernel and the parameters needed. It is also possible to write helper functions with the .func directive. Inside the kernel or a helper function, normal PTX code can be written. Because PTX is very low level, it assumes an underlying register machine, therefore a developer needs to think about register management. This includes loading data from global or shared memory into registers if needed. Instructions for manipulating data, like addition and subtraction, generally follow the structure operation.datatype, followed by three operands for that operation. For adding two FP32 values together and storing them in the register \%n, the code looks like the following:
\begin{GenericCode}[numbers=none]
add.f32 \%n, 0.1, 0.2;
@ -164,21 +158,27 @@ Done:
\end{program}
\section{Compilers}
Compilers are a necessary tool for many developers. If a developer wants to run their program, it is very likely they need one. As best described by \textcite{aho_compilers_2006} in their dragon book, a compiler takes code written by a human in some source language and translates it into a destination language readable by a computer. This section briefly explores what compilers are and the research done in this long-established field of computer science. Furthermore, the topics of transpilers and interpreters are explored, as their use-cases are very similar.
%brief overview about compilers (just setting the stage for the subsections basically). Talk about register management and these things.
\textcite{aho_compilers_2006} and \textcite{cooper_engineering_2022} describe how a compiler can be developed, with the latter focusing on more modern approaches. They describe how a compiler consists of two parts, the analyser, also called the frontend, and the synthesiser, also called the backend. The frontend is responsible for ensuring syntactic and semantic correctness and converts the source code into an intermediate representation, an abstract syntax tree (AST), for the backend. The backend is then responsible for generating target code from the intermediate representation. This target code can be assembly or anything else that is needed for a specific use-case. This intermediate representation also makes it simple to swap out frontends or backends. The GNU Compiler Collection \parencite{gcc_gcc_2025} takes advantage of using different frontends to provide support for many languages, including C, C++, Ada and more. Instead of compiling source code for specific machines directly, many languages compile for virtual machines instead. Notable examples are the Java Virtual Machine (JVM) \parencite{lindholm_java_2025} and the Low Level Virtual Machine (LLVM) \parencite{lattner_llvm_2004}. Such virtual machines provide a bytecode which can be used as a target language for compilers. A huge benefit of such virtual machines is the ability for one program to be run on all physical machines the virtual machine exists for, without the developer needing to change that program \parencite{lindholm_java_2025}. Programs written for virtual machines are usually compiled to a bytecode. This bytecode can then be interpreted or compiled to physical machine code and then run. According to the JVM specification \parencite{lindholm_java_2025}, the Java bytecode is interpreted and also compiled with a just-in-time (JIT) compiler to increase the performance of code blocks that are often executed. On the other hand, the common language runtime (CLR)\footnote{\url{https://learn.microsoft.com/en-us/dotnet/standard/clr}}, the virtual machine for languages like C\#, never interprets the generated bytecode. As described by \textcite{microsoft_overview_2023}, the CLR always compiles the bytecode to physical machine code using a JIT.
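To make this intermediate representation tangible, the following sketch uses Julia's built-in parser to display the AST of a small expression; frontends for other languages produce analogous trees.
\begin{GenericCode}[numbers=none]
# Parse a source string into Julia's AST type Expr.
expr = Meta.parse("a + b * c")

# The tree reflects operator precedence: the multiplication forms a
# subtree of the addition (output abridged).
dump(expr)
# Expr
#   head: Symbol call
#   args: Array{Any}((3,))
#     1: Symbol +
#     2: Symbol a
#     3: Expr
#       head: Symbol call
#       args: Array{Any}((3,))
#         1: Symbol *
#         2: Symbol b
#         3: Symbol c
\end{GenericCode}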
A grammar describes how a language is structured. It not only describes the structure of natural language, but it can also be used to describe the structure of a programming language. \textcite{chomsky_certain_1959} found that languages can be grouped into four levels, with regular and context-free grammars being the most relevant for programming languages. A regular grammar is of the structure $A = a\,|\,a\,B$, which is called a rule. The symbols $A$ and $B$ are non-terminal symbols and $a$ is a terminal symbol. A non-terminal symbol stands for another rule that follows a terminal symbol. Terminal symbols are fixed symbols or a value that can be found in the input stream, like literals in programming languages. Context-free grammars are more complex and are of the structure $A = \beta$. In this context, $\beta$ stands for any combination of terminal and non-terminal symbols. Therefore, a rule like $A = a\,|\,a\,B\,a$ is allowed with this grammar level. This shows that hierarchical structures are possible with context-free grammars. To write grammars for programming languages, other properties are also important for efficiently validating or parsing input defined by the grammar. However, these are not discussed here, but are described by \textcite{aho_compilers_2006}. They also describe that generating a parser out of a grammar can be automated. This automation can be performed by parser generators like Yacc \parencite{johnson_yacc_1975}, as described in their book. More modern alternatives are Bison\footnote{\url{https://www.gnu.org/software/bison/}} or Antlr\footnote{\url{https://www.antlr.org/}}. Before the parser can validate the input stream, a scanner is needed, as described by \textcite{cooper_engineering_2022}. The scanner reads every character of the input stream and is responsible for removing white-space and ensuring only valid characters and words are present. Flex\footnote{\url{https://github.com/westes/flex}} is a tool for generating a scanner and is often used in combination with Bison. A simplified version of the compiler architecture using Flex and Bison is depicted in figure \ref{fig:compiler_layout}. It shows how source code is taken and transformed into the intermediate representation by the frontend, and how it is converted into executable machine code.
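To illustrate what a generated scanner does, the following hand-written tokeniser sketch (a toy example, not produced by Flex) splits an arithmetic expression into tokens while skipping white-space.
\begin{GenericCode}[numbers=none]
# Token patterns, tried in order; all patterns are ASCII-only.
const TOKEN_PATTERNS = [
    :number     => r"^\d+(\.\d+)?",
    :identifier => r"^[A-Za-z_][A-Za-z0-9_]*",
    :operator   => r"^[+\-*/()]",
]

function tokenize(input::AbstractString)
    tokens = Tuple{Symbol,String}[]
    rest = lstrip(input)
    while !isempty(rest)
        matched = false
        for (kind, pattern) in TOKEN_PATTERNS
            m = match(pattern, rest)
            if m !== nothing
                push!(tokens, (kind, m.match))
                rest = lstrip(SubString(rest, 1 + ncodeunits(m.match)))
                matched = true
                break
            end
        end
        matched || error("invalid character: $(first(rest))")
    end
    return tokens
end

tokenize("x1 + 42 * (y - 3.5)")
# [(:identifier, "x1"), (:operator, "+"), (:number, "42"), ...]
\end{GenericCode}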
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{compiler_architecture.png}
\caption{A simplified overview of a compiler architecture using Flex and Bison.}
\label{fig:compiler_layout}
\end{figure}
% More references to JIT: https://dl.acm.org/doi/abs/10.1145/857076.857077
\subsection{Transpilers}
% talk about what transpilers are and how to implement them. If possible also gpu specific transpilation.
With the concepts already mentioned, it is possible to generate executable code from code written in a programming language. However, sometimes it is desired to convert a program from one programming language to another, and therefore the major difference between compilers and transpilers is the backend. A popular transpiler example is TypeScript, which transforms TypeScript source code into JavaScript source code \parencite{microsoft_typescript_2025}. Other examples of transpilers are the C2Rust transpiler \parencite{ling_rust_2022}, which transpiles C code into Rust code, as well as the PyJL transpiler \parencite{marcelino_transpiling_2022}, which transpiles Python code into Julia code. \textcite{chaber_effectiveness_2016} proposed a transpiler that takes MATLAB and C code and transforms it into pure and optimised C code for an STM32 microcontroller. An early example of a transpiler was developed by \textcite{intel_mcs86_1978}, where they built a transpiler for transforming assembly code for their 8080 CPU into assembly code for their 8086 CPU. Transpilers are also used in parallelisation environments, like OpenMP \parencite{wang_automatic_2015}. There also exists a transpiler that transforms CUDA code into highly parallel CPU code. \textcite{moses_high-performance_2023} described this transpiler, and they found that the generated code performs noticeably better than handwritten parallel code. When designing complex processors and accelerators, register-transfer level (RTL) simulations are essential \parencite{wang_electronic_2009}. In a later study, \textcite{zhang_opportunities_2020} have shown how RTL simulations can be performed on GPUs with a speed-up of 20. This led to \textcite{lin_rtl_2023} developing a transpiler to transform RTL into CUDA kernels instead of handwriting them. Using transpilers for software backends and business logic has been proposed by \textcite{bastidas_fuertes_transpiler-based_2023}. Their approach implemented a programming language that can be transpiled into different programming languages, for usage in multi-programming-language environments that share some business logic. In another study, \textcite{bastidas_fuertes_transpilers_2023} reviewed over 600 publications to map the use of transpilers alongside their implementations in different fields of research, demonstrating the versatility of transpiler use.
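As a toy sketch of this idea (not one of the cited systems), the following function walks a Julia AST and emits an equivalent C-style expression string; a real transpiler would additionally handle statements, types and error reporting.
\begin{GenericCode}[numbers=none]
# Transpile a small subset of Julia expressions (numbers, variables
# and binary +, -, *, /) into a C-style expression string.
function to_c(ex)
    if ex isa Number
        return string(ex)
    elseif ex isa Symbol
        return string(ex)
    elseif ex isa Expr && ex.head == :call && length(ex.args) == 3 &&
           ex.args[1] in (:+, :-, :*, :/)
        op, lhs, rhs = ex.args
        return "(" * to_c(lhs) * " $op " * to_c(rhs) * ")"
    else
        error("unsupported expression: $ex")
    end
end

to_c(Meta.parse("x + 2.5 * (y - 1)"))
# "(x + (2.5 * (y - 1)))"
\end{GenericCode}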
\subsection{Interpreters}
% What are interpreters; how they work; should mostly contain/reference gpu interpreters
Interpreters are a different kind of program for executing source code. Rather than compiling the code and running the result, an interpreter executes the source code directly. Languages like Python and JavaScript are prominent examples of interpreted languages, but Java, or more precisely Java bytecode, is also interpreted before it gets compiled \parencite{lindholm_java_2025}. However, interpreters are not limited to interpreting programming languages. It is also possible to use them in GP. \textcite{langdon_simd_2008} have shown how a SIMD interpreter can be efficiently used for evaluating entire GP populations on the GPU directly. In a later work, \textcite{cano_gpu-parallel_2014} further improved this interpreter. They used the fact that a GP individual represents a tree which can be split into independent subtrees. These can be evaluated concurrently and, with the help of communication via shared memory, they could therefore evaluate the entire tree. With this they achieved a significant performance improvement over previous implementations. As shown by \textcite{dietz_mimd_2010}, it is even possible to develop an interpreter that can execute Multiple-Instruction Multiple-Data (MIMD) programs on a SIMD GPU. However, as noted by the authors, any kind of interpretation comes with an overhead. This means that with the additional challenges of executing MIMD programs on SIMD hardware, their interpreter, while achieving reasonable efficiency, still suffers from performance problems. Another field where interpreters can be useful is rule-based simulations. \textcite{koster_massively_2020} have shown how they implemented a GPU interpreter. Together with other novel performance improvements in running programs on a GPU, they were able to gain a speed-up of 4 over non-interpreted implementations. As publications like \textcite{fua_comparing_2020} and \textcite{gherardi_java_2012} have shown, interpreted languages often trail behind compiled languages in terms of performance, yet interpreters per se are not slow. While they come with a performance overhead, as demonstrated by \textcite{dietz_mimd_2010} and \textcite{romer_structure_1996}, they can still be a very fast and easy alternative for certain tasks.
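To make the concept concrete, the following minimal sketch (unrelated to the cited systems) directly evaluates the same small expression subset as the transpiler sketch above, against an environment of variable values and without generating any code.
\begin{GenericCode}[numbers=none]
# Evaluate numbers, variables and binary +, -, *, / directly.
function interpret(ex, env::Dict{Symbol,Float64})
    ex isa Number && return Float64(ex)
    ex isa Symbol && return env[ex]
    if ex isa Expr && ex.head == :call && length(ex.args) == 3
        op, lhs, rhs = ex.args
        l = interpret(lhs, env)
        r = interpret(rhs, env)
        op == :+ && return l + r
        op == :- && return l - r
        op == :* && return l * r
        op == :/ && return l / r
    end
    error("unsupported expression: $ex")
end

interpret(Meta.parse("x + 2.5 * (y - 1)"), Dict(:x => 1.0, :y => 3.0))
# 6.0
\end{GenericCode}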

Binary file not shown.

Binary file not shown.

@ -231,11 +231,8 @@
number = {20},
journaltitle = {Applied Sciences},
author = {Bastidas Fuertes, Andrés and Pérez, María and Meza, Jaime},
urldate = {2025-01-03},
date = {2023-01},
date = {2023-10},
langid = {english},
note = {Number: 20
Publisher: Multidisciplinary Digital Publishing Institute},
keywords = {back-end layers, design model, software architecture, software development, source-to-source transformations, transpiler},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\AD55DPJ4\\Bastidas Fuertes et al. - 2023 - Transpiler-Based Architecture Design Model for Back-End Layers in Software Development.pdf:application/pdf},
}
@ -252,10 +249,8 @@ Publisher: Multidisciplinary Digital Publishing Institute},
publisher = {Springer International Publishing},
author = {Adam, Stavros P. and Alexandropoulos, Stamatios-Aggelos N. and Pardalos, Panos M. and Vrahatis, Michael N.},
editor = {Demetriou, Ioannis C. and Pardalos, Panos M.},
urldate = {2025-02-14},
date = {2019},
langid = {english},
doi = {10.1007/978-3-030-12767-1_5},
}
@inproceedings{michalakes_gpu_2008,
@ -550,7 +545,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
author = {Kyung, Gyutaek and Jung, Changmin and Lee, Kwangyeob},
urldate = {2025-03-08},
date = {2014-10},
note = {{ISSN}: 2159-3450},
keywords = {Graphics processing units, Computer architecture, Graphics, Registers, Educational institutions, {GPGPU}, Instruction sets, Mobile communication, {SIMT} Architecture, Stream Processor},
file = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\9B85REHH\\7022313.html:text/html},
}
@ -576,7 +570,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
pages = {25--36},
booktitle = {2011 {IEEE} 17th International Symposium on High Performance Computer Architecture},
author = {Fung, Wilson W. L. and Aamodt, Tor M.},
urldate = {2025-03-08},
date = {2011-02},
keywords = {Pipelines, Kernel, Graphics processing unit, Hardware, Instruction sets, Compaction, Random access memory},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\TRPWUTI6\\Fung und Aamodt - 2011 - Thread block compaction for efficient SIMT control flow.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\LYPYEA8U\\5749714.html:text/html},
@ -644,7 +637,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
author = {Martius, Georg and Lampert, Christoph H.},
urldate = {2025-03-13},
date = {2016},
note = {Version Number: 1},
keywords = {68T05, 68T30, 68T40, 62J02, 65D15, Artificial Intelligence (cs.{AI}), {FOS}: Computer and information sciences, I.2.6; I.2.8, Machine Learning (cs.{LG})},
}
@ -734,7 +726,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
booktitle = {Proceedings of the Genetic and Evolutionary Computation Conference Companion},
publisher = {{ACM}},
author = {Bomarito, G. F. and Leser, P. E. and Strauss, N. C. M. and Garbrecht, K. M. and Hochhalter, J. D.},
urldate = {2025-03-14},
date = {2022-07-09},
langid = {english},
file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\ZPS5ZYYQ\\Bomarito et al. - 2022 - Bayesian model selection for reducing bloat and overfitting in genetic programming for symbolic regr.pdf:application/pdf},
@ -747,7 +738,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
doi = {10.48550/ARXIV.1211.1119},
abstract = {In the field of empirical modeling using Genetic Programming ({GP}), it is important to evolve solution with good generalization ability. Generalization ability of {GP} solutions get affected by two important issues: bloat and over-fitting. We surveyed and classified existing literature related to different techniques used by {GP} research community to deal with these issues. We also point out limitation of these techniques, if any. Moreover, the classification of different bloat control approaches and measures for bloat and over-fitting are also discussed. We believe that this work will be useful to {GP} practitioners in following ways: (i) to better understand concepts of generalization in {GP} (ii) comparing existing bloat and over-fitting control techniques and (iii) selecting appropriate approach to improve generalization ability of {GP} evolved solutions.},
author = {Dabhi, Vipul K. and Chaudhary, Sanjay},
urldate = {2025-03-14},
date = {2012},
keywords = {{FOS}: Computer and information sciences, Neural and Evolutionary Computing (cs.{NE})},
file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\JCULR888\\Dabhi and Chaudhary - 2012 - A Survey on Techniques of Improving Generalization Ability of Genetic Programming Solutions.pdf:application/pdf},
@ -878,7 +868,6 @@ Publisher: Multidisciplinary Digital Publishing Institute},
url = {http://dx.doi.org/10.1016/C2014-0-01395-0},
publisher = {Elsevier},
author = {Cooper, Keith D. and Torczon, Linda},
urldate = {2025-03-18},
date = {2022},
langid = {english},
}
@ -910,9 +899,8 @@ Publisher: Multidisciplinary Digital Publishing Institute},
journaltitle = {Journal of Parallel and Distributed Computing},
shortjournal = {Journal of Parallel and Distributed Computing},
author = {Khairy, Mahmoud and Wassal, Amr G. and Zahran, Mohamed},
urldate = {2025-03-20},
date = {2019-05-01},
keywords = {Control divergence, {GPGPU}, Heterogeneous architecture, Memory systems},
keywords = {{GPGPU}, Control divergence, Heterogeneous architecture, Memory systems},
file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\FQJC5EUT\\Khairy et al. - 2019 - A survey of architectural approaches for improving GPGPU performance, programmability and heterogene.pdf:application/pdf},
}
@ -947,3 +935,225 @@ Publisher: Multidisciplinary Digital Publishing Institute},
author = {Johnson, Stephen C},
date = {1975},
}
@article{bastidas_fuertes_transpilers_2023,
title = {Transpilers: A Systematic Mapping Review of Their Usage in Research and Industry},
volume = {13},
rights = {http://creativecommons.org/licenses/by/3.0/},
issn = {2076-3417},
url = {https://www.mdpi.com/2076-3417/13/6/3667},
doi = {10.3390/app13063667},
shorttitle = {Transpilers},
abstract = {Transpilers refer to a special type of compilation that takes source code and translates it into target source code. This type of technique has been used for different types of implementations in scientific studies. A review of the research areas related to the use of transpilers allows the understanding of the direction in this branch of knowledge. The objective was to carry out an exhaustive and extended mapping of the usage and implementation of transpilers in research studies in the last 10 years. A systematic mapping review was carried out for answering the 5 research questions proposed. The {PSALSAR} method is used as a guide to the steps needed for the review. In total, from 1181 articles collected, 683 primary studies were selected, reviewed, and analyzed. Proposals from the industry were also analyzed. A new method for automatic data tabulation has been proposed for the mapping objective, using a relational database and {SQL} language. It was identified that the most common uses of transpilers are related to performance optimizations, parallel programming, embedded systems, compilers, testing, {AI}, graphics, and software development. In conclusion, it was possible to determine the extent and identification of research sub-areas and their impact on the usage of the transpilers. Future research could be considered about the usage of transpilers in transactional software, migration strategies for legacy systems, {AI}, math, multiplatform games and apps, automatic source code generation, and networking.},
pages = {3667},
number = {6},
journaltitle = {Applied Sciences},
author = {Bastidas Fuertes, Andrés and Pérez, María and Meza Hormaza, Jaime},
date = {2023-01},
langid = {english},
keywords = {cross compiler, software architecture, source-to-source compiler, systematic literature review, transcompiler, transpiler},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\7IRRHZJG\\Bastidas Fuertes et al. - 2023 - Transpilers A Systematic Mapping Review of Their Usage in Research and Industry.pdf:application/pdf},
}
@online{microsoft_typescript_2025,
title = {{TypeScript}: The starting point for learning {TypeScript}},
url = {https://www.typescriptlang.org/docs/handbook/intro.html},
author = {{Microsoft}},
urldate = {2025-03-21},
date = {2025-03},
}
@inproceedings{ling_rust_2022,
location = {New York, {NY}, {USA}},
title = {In rust we trust: a transpiler from unsafe C to safer rust},
isbn = {978-1-4503-9223-5},
url = {https://dl.acm.org/doi/10.1145/3510454.3528640},
doi = {10.1145/3510454.3528640},
series = {{ICSE} '22},
shorttitle = {In rust we trust},
abstract = {Rust is a type-safe system programming language with a compiler checking memory and concurrency safety. For a smooth transition from existing C projects, a source-to-source transpiler can autotransform C programs into Rust using program transformation. However, existing C-to-Rust transformation tools (e.g. the open-source C2Rust transpiler1 project) have the drawback of preserving the unsafe semantics of C, while rewriting them in Rust's syntax. The work by Emre et el. [2] acknowledged these drawbacks, and used rustc compiler feedback to refactor one certain type of raw pointers to Rust references to improve overall safety and idiomaticness of C2Rust output. Focusing on improving {API}-safeness (i.e. lowering unsafe keyword usage in function signatures), we apply source-to-source transformation technique to auto-refactor C2Rust output using code structure pattern matching and transformation, which does not rely on rustc compiler feedback. And by relaxing the semantics-preserving constraints of transformations, we present {CRustS}2 a fully-automated source-to-source transformation approach that increases the ratio of the transformed code passing the safety checks of the rustc compiler. Our method uses 220 new {TXL} [1] source-to-source transformation rules, of which 198 are strictly semantics-preserving and 22 are semantics-approximating, thus reducing the scope of unsafe expressions and exposing more opportunities for safe Rust refactoring. Our method has been evaluated on both open-source and commercial C projects, and demonstrates significantly higher safe code ratios after the transformations, with function-level safe code ratios comparable to the average level of idiomatic Rust projects.},
pages = {354--355},
booktitle = {Proceedings of the {ACM}/{IEEE} 44th International Conference on Software Engineering: Companion Proceedings},
publisher = {Association for Computing Machinery},
author = {Ling, Michael and Yu, Yijun and Wu, Haitao and Wang, Yuan and Cordy, James R. and Hassan, Ahmed E.},
urldate = {2025-03-21},
date = {2022-10-19},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\RC6EAG9X\\Ling et al. - 2022 - In rust we trust a transpiler from unsafe C to safer rust.pdf:application/pdf},
}
@article{marcelino_transpiling_2022,
title = {Transpiling Python to Julia using {PyJL}},
rights = {Creative Commons Attribution 4.0 International},
url = {https://zenodo.org/record/6332890},
doi = {10.5281/ZENODO.6332890},
author = {Marcelino, Miguel and Leitão, António Menezes},
date = {2022},
keywords = {Automatic Transpilation, Julia, Library Translation, Python, Source-to-Source Compiler},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\MD8RTI6D\\Marcelino and Leitão - Transpiling Python to Julia using PyJL.pdf:application/pdf},
}
@online{intel_mcs86_1978,
title = {{MCS}·86 Assembly Language Converter Operating Instructions For {ISIS}·{II} Users},
url = {http://www.bitsavers.org/pdf/intel/ISIS_II/9800642A_MCS-86_Assembly_Language_Converter_Operating_Instructions_for_ISIS-II_Users_Mar79.pdf},
author = {{Intel}},
urldate = {2025-03-21},
date = {1978},
note = {Technical Report},
file = {http\://www.bitsavers.org/pdf/intel/ISIS_II/9800642A_MCS-86_Assembly_Language_Converter_Operating_Instructions_for_ISIS-II_Users_Mar79.pdf:C\:\\Users\\danwi\\Zotero\\storage\\N63NW3B5\\9800642A_MCS-86_Assembly_Language_Converter_Operating_Instructions_for_ISIS-II_Users_Mar79.pdf:application/pdf},
}
@article{wang_automatic_2015,
title = {Automatic scoping of task clauses for the {OpenMP} tasking model},
volume = {71},
issn = {1573-0484},
url = {https://doi.org/10.1007/s11227-014-1326-3},
doi = {10.1007/s11227-014-1326-3},
abstract = {{OpenMP} provides an easy-to-learn and powerful programming environment for the development of parallel programs. We propose here an algorithm for the automatic correction of the {OpenMP} tasking model. Assuming a compiler or programmer has identified task regions in the source programs, the proposed algorithm will automatically generate correct task clauses and synchronization. The proposed algorithm is implemented here based on the {ROSE} compiler infrastructure; 14 benchmark programs are tested, each of which has had all clauses in the task directives removed for the evaluation. The results of this experimental evaluation show that the proposed technique can successfully generate correct clauses for the tested benchmark programs. The proposed technique can simplify the parallelizing of programs using the {OpenMP} tasking model, making parallel programming more effective and productive.},
pages = {808--823},
number = {3},
journaltitle = {The Journal of Supercomputing},
shortjournal = {J Supercomput},
author = {Wang, Chun-Kun and Chen, Peng-Sheng},
urldate = {2025-03-21},
date = {2015-03-01},
langid = {english},
keywords = {{OpenMP}, Parallelization, Tasking model, Validation},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\65ARV27L\\Wang and Chen - 2015 - Automatic scoping of task clauses for the OpenMP tasking model.pdf:application/pdf},
}
@inproceedings{chaber_effectiveness_2016,
title = {Effectiveness of {PID} and {DMC} control algorithms automatic code generation for microcontrollers: Application to a thermal process},
url = {https://ieeexplore.ieee.org/document/7739817/references#references},
doi = {10.1109/SYSTOL.2016.7739817},
shorttitle = {Effectiveness of {PID} and {DMC} control algorithms automatic code generation for microcontrollers},
abstract = {An effective approach to implement control algorithms using code auto-generation is presented. Using {MATLAB} and C languages as input, an optimised pure C code is generated using a custom transcompiler. The considered solution is focused on microcontrollers from the {STM}32 family but any other can be used due to flexibility of the presented system. Controller development for a laboratory thermal process is thoroughly described, {PID} and {DMC} algorithms are used. Electronic connection between microcontroller and the process is discussed. Results of the experiments are reported.},
eventtitle = {2016 3rd Conference on Control and Fault-Tolerant Systems ({SysTol})},
pages = {618--623},
booktitle = {2016 3rd Conference on Control and Fault-Tolerant Systems ({SysTol})},
author = {Chaber, Patryk and Ławryńczuk, Maciej},
urldate = {2025-03-21},
date = {2016-09},
keywords = {Fans, Hardware, Heating, {MATLAB}, Microcontrollers, Process control, Standards},
}
@inproceedings{moses_high-performance_2023,
location = {New York, {NY}, {USA}},
title = {High-Performance {GPU}-to-{CPU} Transpilation and Optimization via High-Level Parallel Constructs},
isbn = {979-8-4007-0015-6},
url = {https://dl.acm.org/doi/10.1145/3572848.3577475},
doi = {10.1145/3572848.3577475},
series = {{PPoPP} '23},
abstract = {While parallelism remains the main source of performance, architectural implementations and programming models change with each new hardware generation, often leading to costly application re-engineering. Most tools for performance portability require manual and costly application porting to yet another programming model.We propose an alternative approach that automatically translates programs written in one programming model ({CUDA}), into another ({CPU} threads) based on Polygeist/{MLIR}. Our approach includes a representation of parallel constructs that allows conventional compiler transformations to apply transparently and without modification and enables parallelism-specific optimizations. We evaluate our framework by transpiling and optimizing the {CUDA} Rodinia benchmark suite for a multi-core {CPU} and achieve a 58\% geomean speedup over handwritten {OpenMP} code. Further, we show how {CUDA} kernels from {PyTorch} can efficiently run and scale on the {CPU}-only Supercomputer Fugaku without user intervention. Our {PyTorch} compatibility layer making use of transpiled {CUDA} {PyTorch} kernels outperforms the {PyTorch} {CPU} native backend by 2.7×.},
pages = {119--134},
booktitle = {Proceedings of the 28th {ACM} {SIGPLAN} Annual Symposium on Principles and Practice of Parallel Programming},
publisher = {Association for Computing Machinery},
author = {Moses, William S. and Ivanov, Ivan R. and Domke, Jens and Endo, Toshio and Doerfert, Johannes and Zinenko, Oleksandr},
date = {2023-02-21},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\5I8STUQP\\Moses et al. - 2023 - High-Performance GPU-to-CPU Transpilation and Optimization via High-Level Parallel Constructs.pdf:application/pdf},
}
@inproceedings{lin_rtl_2023,
location = {New York, {NY}, {USA}},
title = {From {RTL} to {CUDA}: A {GPU} Acceleration Flow for {RTL} Simulation with Batch Stimulus},
isbn = {978-1-4503-9733-9},
url = {https://dl.acm.org/doi/10.1145/3545008.3545091},
doi = {10.1145/3545008.3545091},
series = {{ICPP} '22},
shorttitle = {From {RTL} to {CUDA}},
abstract = {High-throughput {RTL} simulation is critical for verifying today's highly complex {SoCs}. Recent research has explored accelerating {RTL} simulation by leveraging event-driven approaches or partitioning heuristics to speed up simulation on a single stimulus. To further accelerate throughput performance, industry-quality functional verification signoff must explore running multiple stimulus (i.e., batch stimulus) simultaneously, either with directed tests or random inputs. In this paper, we propose {RTLflow}, a {GPU}-accelerated {RTL} simulation flow with batch stimulus. {RTLflow} first transpiles {RTL} into {CUDA} kernels that each simulates a partition of the {RTL} simultaneously across multiple stimulus. It also leverages {CUDA} Graph and pipeline scheduling for efficient runtime execution. Measuring experimental results on a large industrial design ({NVDLA}) with 65536 stimulus, we show that {RTLflow} running on a single A6000 {GPU} can achieve a 40× runtime speed-up when compared to an 80-thread multi-core {CPU} baseline.},
pages = {1--12},
booktitle = {Proceedings of the 51st International Conference on Parallel Processing},
publisher = {Association for Computing Machinery},
author = {Lin, Dian-Lun and Ren, Haoxing and Zhang, Yanqing and Khailany, Brucek and Huang, Tsung-Wei},
date = {2023-01-13},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\U36JRJA8\\Lin et al. - 2023 - From RTL to CUDA A GPU Acceleration Flow for RTL Simulation with Batch Stimulus.pdf:application/pdf},
}
@book{wang_electronic_2009,
title = {Electronic Design Automation: Synthesis, Verification, and Test},
isbn = {978-0-08-092200-3},
shorttitle = {Electronic Design Automation},
abstract = {This book provides broad and comprehensive coverage of the entire {EDA} flow. {EDA}/{VLSI} practitioners and researchers in need of fluency in an "adjacent" field will find this an invaluable reference to the basic {EDA} concepts, principles, data structures, algorithms, and architectures for the design, verification, and test of {VLSI} circuits. Anyone who needs to learn the concepts, principles, data structures, algorithms, and architectures of the {EDA} flow will benefit from this book. - Covers complete spectrum of the {EDA} flow, from {ESL} design modeling to logic/test synthesis, verification, physical design, and test - helps {EDA} newcomers to get "up-and-running" quickly - Includes comprehensive coverage of {EDA} concepts, principles, data structures, algorithms, and architectures - helps all readers improve their {VLSI} design competence - Contains latest advancements not yet available in other books, including Test compression, {ESL} design modeling, large-scale floorplanning, placement, routing, synthesis of clock and power/ground networks - helps readers to design/develop testable chips or products - Includes industry best-practices wherever appropriate in most chapters - helps readers avoid costly mistakes},
pagetotal = {971},
publisher = {Morgan Kaufmann},
author = {Wang, Laung-Terng and Chang, Yao-Wen and Cheng, Kwang-Ting},
date = {2009-03-11},
langid = {english},
keywords = {Computers / Computer Science, Technology \& Engineering / Industrial Design / Product},
}
@inproceedings{zhang_opportunities_2020,
location = {New York, {NY}, {USA}},
title = {Opportunities for {RTL} and gate level simulation using {GPUs}},
isbn = {978-1-4503-8026-3},
url = {https://dl.acm.org/doi/10.1145/3400302.3415773},
doi = {10.1145/3400302.3415773},
series = {{ICCAD} '20},
abstract = {This paper summarizes the opportunities in accelerating simulation on parallel processing hardware platforms such as {GPUs}. First, we give a summary of prior art. Then, we propose the idea that coding frameworks usually used for popular machine learning ({ML}) topics, such as {PyTorch}/{DGL}.ai, can also be used for exploring simulation purposes. We demo a crude oblivious two-value cycle gate-level simulator using the higher level {ML} framework {APIs} that exhibits >20X speedup, despite its simplistic construction. Next, we summarize recent advances in {GPU} features that may provide additional opportunities to further state-of-the-art results. Finally, we conclude and touch upon some potential areas for furthering research into the topic of {GPU} accelerated simulation.},
pages = {1--5},
booktitle = {Proceedings of the 39th International Conference on Computer-Aided Design},
publisher = {Association for Computing Machinery},
author = {Zhang, Yanqing and Ren, Haoxing and Khailany, Brucek},
urldate = {2025-03-21},
date = {2020-12-17},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\6JZSGT83\\Zhang et al. - 2020 - Opportunities for RTL and gate level simulation using GPUs.pdf:application/pdf},
}
@article{romer_structure_1996,
title = {The structure and performance of interpreters},
volume = {31},
issn = {0362-1340},
url = {https://dl.acm.org/doi/10.1145/248209.237175},
doi = {10.1145/248209.237175},
abstract = {Interpreted languages have become increasingly popular due to demands for rapid program development, ease of use, portability, and safety. Beyond the general impression that they are "slow," however, little has been documented about the performance of interpreters as a class of applications. This paper examines interpreter performance by measuring and analyzing interpreters from both software and hardware perspectives. As examples, we measure the {MIPSI}, Java, Perl, and Tcl interpreters running an array of micro and macro benchmarks on a {DEC} Alpha platform. Our measurements of these interpreters relate performance to the complexity of the interpreter's virtual machine and demonstrate that native runtime libraries can play a key role in providing good performance. From an architectural perspective, we show that interpreter performance is primarily a function of the interpreter itself and is relatively independent of the application being interpreted. We also demonstrate that high-level interpreters' demands on processor resources are comparable to those of other complex compiled programs, such as gcc. We conclude that interpreters, as a class of applications, do not currently motivate special hardware support for increased performance.},
pages = {150--159},
number = {9},
journaltitle = {{SIGPLAN} Not.},
author = {Romer, Theodore H. and Lee, Dennis and Voelker, Geoffrey M. and Wolman, Alec and Wong, Wayne A. and Baer, Jean-Loup and Bershad, Brian N. and Levy, Henry M.},
urldate = {2025-03-21},
date = {1996-09-01},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\76EU5U2P\\Romer et al. - 1996 - The structure and performance of interpreters.pdf:application/pdf},
}
@misc{fua_comparing_2020,
title = {Comparing Python, Go, and C++ on the N-Queens Problem},
url = {http://arxiv.org/abs/2001.02491},
doi = {10.48550/arXiv.2001.02491},
abstract = {Python currently is the dominant language in the field of Machine Learning but is often criticized for being slow to perform certain tasks. In this report, we use the well-known \$N\$-queens puzzle as a benchmark to show that once compiled using the Numba compiler it becomes competitive with C++ and Go in terms of execution speed while still allowing for very fast prototyping. This is true of both sequential and parallel programs. In most cases that arise in an academic environment, it therefore makes sense to develop in ordinary Python, identify computational bottlenecks, and use Numba to remove them.},
number = {{arXiv}:2001.02491},
publisher = {{arXiv}},
author = {Fua, Pascal and Lis, Krzysztof},
date = {2020-01-08},
keywords = {Computer Science - Mathematical Software},
file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\WZRCTXMG\\Fua and Lis - 2020 - Comparing Python, Go, and C++ on the N-Queens Problem.pdf:application/pdf},
}
@inproceedings{gherardi_java_2012,
location = {Berlin, Heidelberg},
title = {A Java vs. C++ Performance Evaluation: A 3D Modeling Benchmark},
isbn = {978-3-642-34327-8},
doi = {10.1007/978-3-642-34327-8_17},
shorttitle = {A Java vs. C++ Performance Evaluation},
abstract = {Along the years robotics software and applications have been typically implemented in compiled languages, such as C and C++, rather than interpreted languages, like Java. This choice has been due to their well-known faster behaviors, which meet the high performance requirements of robotics. Nevertheless, several projects that implement robotics functionality in Java can be found in literature and different experiments conducted by computer scientists have proved that the difference between Java and C++ is not so evident.},
pages = {161--172},
booktitle = {Simulation, Modeling, and Programming for Autonomous Robots},
publisher = {Springer},
author = {Gherardi, Luca and Brugali, Davide and Comotti, Daniele},
editor = {Noda, Itsuki and Ando, Noriaki and Brugali, Davide and Kuffner, James J.},
date = {2012},
langid = {english},
}
@inproceedings{eltantawy_mimd_2016,
title = {{MIMD} synchronization on {SIMT} architectures},
url = {https://ieeexplore.ieee.org/abstract/document/7783714},
doi = {10.1109/MICRO.2016.7783714},
abstract = {In the single-instruction multiple-threads ({SIMT}) execution model, small groups of scalar threads operate in lockstep. Within each group, current {SIMT} hardware implementations serialize the execution of threads that follow different paths, and to ensure efficiency, revert to lockstep execution as soon as possible. These constraints must be considered when adapting algorithms that employ synchronization. A deadlock-free program on a multiple-instruction multiple-data ({MIMD}) architecture may deadlock on a {SIMT} machine. To avoid this, programmers need to restructure control flow with {SIMT} scheduling constraints in mind. This requires programmers to be familiar with the underlying {SIMT} hardware. In this paper, we propose a static analysis technique that detects {SIMT} deadlocks by inspecting the application control flow graph ({CFG}). We further propose a {CFG} transformation that avoids {SIMT} deadlocks when synchronization is local to a function. Both the analysis and the transformation algorithms are implemented as {LLVM} compiler passes. Finally, we propose an adaptive hardware reconvergence mechanism that supports {MIMD} synchronization without changing the application {CFG}, but which can leverage our compiler analysis to gain efficiency. The static detection has a false detection rate of only 4\%-5\%. The automated transformation has an average performance overhead of 8.2\%-10.9\% compared to manual transformation. Our hardware approach performs on par with the compiler transformation, however, it avoids synchronization scope limitations, static instruction and register overheads, and debuggability challenges that are present in the compiler only solution.},
eventtitle = {2016 49th Annual {IEEE}/{ACM} International Symposium on Microarchitecture ({MICRO})},
pages = {1--14},
booktitle = {2016 49th Annual {IEEE}/{ACM} International Symposium on Microarchitecture ({MICRO})},
author = {{ElTantawy}, Ahmed and Aamodt, Tor M.},
date = {2016-10},
keywords = {Graphics processing units, Hardware, Instruction sets, Manuals, Programming, Synchronization, System recovery},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\EKKWUQQM\\ElTantawy and Aamodt - 2016 - MIMD synchronization on SIMT architectures.pdf:application/pdf},
}