relwork: continued with 'programming GPUs'
Some checks are pending
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run

This commit is contained in:
2025-03-08 12:28:46 +01:00
parent 203e157f11
commit b683f3ae96
5 changed files with 100 additions and 5 deletions

View File

@@ -490,7 +490,7 @@ Publisher: Multidisciplinary Digital Publishing Institute},
urldate = {2025-03-02},
date = {2021-02},
note = {Conference Name: {IEEE} Transactions on Visualization and Computer Graphics},
keywords = {Algorithms, Cameras, Computer Graphics Techniques, Distortion, Engineering, Mathematics, Observers, Physical \& Environmental Sciences, Ray tracing, Real-time systems, Rendering (computer graphics), Visualization},
keywords = {Rendering (computer graphics), Algorithms, Cameras, Computer Graphics Techniques, Distortion, Engineering, Mathematics, Observers, Physical \& Environmental Sciences, Ray tracing, Real-time systems, Visualization},
file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\HDASRGYN\\Verbraeck und Eisemann - 2021 - Interactive Black-Hole Visualization.pdf:application/pdf},
}
@@ -506,3 +506,71 @@ Publisher: Multidisciplinary Digital Publishing Institute},
langid = {english},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\TBBLEZ5N\\Hissbach et al. - 2022 - An Overview of Techniques for Egocentric Black Hole Visualization and Their Suitability for Planetar.pdf:application/pdf},
}
@inproceedings{schuurman_step-by-step_2013,
  author    = {Schuurman, Derek C.},
  title     = {Step-by-step design and simulation of a simple {CPU} architecture},
  booktitle = {Proceeding of the 44th {ACM} technical symposium on Computer science education},
  series    = {{SIGCSE} '13},
  location  = {New York, {NY}, {USA}},
  publisher = {Association for Computing Machinery},
  date      = {2013-03-06},
  pages     = {335--340},
  isbn      = {978-1-4503-1868-6},
  doi       = {10.1145/2445196.2445296},
  url       = {https://dl.acm.org/doi/10.1145/2445196.2445296},
  urldate   = {2025-03-08},
  abstract  = {This paper describes a sequence of assignments, each building upon the next, leading students to a working simulation of a simple 8-bit {CPU} (Central Processing Unit). The design features a classic Von Neumann architecture comprising a simple data path with a few registers, a simple {ALU} (Arithmetic Logic Unit), and a microprogram to direct all the control signals. The first step involves the design of the {ALU} which is capable of eight basic operations. The second step guides students to construct a datapath complete with several 8-bit registers. The third step involves the design and implementation of a control unit which uses a microprogram to implement machine code instructions. The microprogram implements nine basic machine language instructions which are sufficient for writing many simple programs. The final step involves adding program memory and an input and output device to form a simple working simulation of a computer. At this point, students may hand-assemble code for their {CPU} and simulate its execution. All simulations are performed using a free and open source simulator called Logisim which performs digital logic simulations with the ability to build larger circuits from smaller subcircuits. Students can set an adjustable clock rate and observe the internal {CPU} state and registers as it retrieves instructions and steps through the microcode. The basic {CPU} architecture provides many opportunities for more advanced exercises, such as adding an instruction fetch unit, adding pipelining, or adding more machine language instructions. The assignments were introduced in a second year course on computer organization, providing an effective hands-on approach to understanding how a {CPU} actually operates.},
  file      = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\KM664H87\\Schuurman - 2013 - Step-by-step design and simulation of a simple CPU architecture.pdf:application/pdf},
}
@article{franchetti_efficient_2005,
  author       = {Franchetti, F. and Kral, S. and Lorenz, J. and Ueberhuber, C. W.},
  title        = {Efficient Utilization of {SIMD} Extensions},
  journaltitle = {Proceedings of the {IEEE}},
  volume       = {93},
  number       = {2},
  pages        = {409--425},
  date         = {2005-02},
  issn         = {1558-2256},
  doi          = {10.1109/JPROC.2004.840491},
  url          = {https://ieeexplore.ieee.org/abstract/document/1386659},
  urldate      = {2025-03-08},
  abstract     = {This paper targets automatic performance tuning of numerical kernels in the presence of multilayered memory hierarchies and single-instruction, multiple-data ({SIMD}) parallelism. The studied {SIMD} instruction set extensions include Intel's {SSE} family, {AMD}'s 3DNow!, Motorola's {AltiVec}, and {IBM}'s {BlueGene}/L {SIMD} instructions. {FFTW}, {ATLAS}, and {SPIRAL} demonstrate that near-optimal performance of numerical kernels across a variety of modern computers featuring deep memory hierarchies can be achieved only by means of automatic performance tuning. These software packages generate and optimize {ANSI} C code and feed it into the target machine's general-purpose C compiler to maintain portability. The scalar C code produced by performance tuning systems poses a severe challenge for vectorizing compilers. The particular code structure hampers automatic vectorization and, thus, inhibits satisfactory performance on processors featuring short vector extensions. This paper describes special-purpose compiler technology that supports automatic performance tuning on machines with vector instructions. The work described includes: 1) symbolic vectorization of digital signal processing transforms; 2) straight-line code vectorization for numerical kernels; and 3) compiler back ends for straight-line code with vector instructions. Methods from all three areas were combined with {FFTW}, {SPIRAL}, and {ATLAS} to optimize both for memory hierarchy and vector instructions. Experiments show that the presented methods lead to substantial speedups (up to 1.8 for two-way and 3.3 for four-way vector extensions) over the best scalar C codes generated by the original systems as well as roughly matching the performance of hand-tuned vendor libraries.},
  keywords     = {Automatic vectorization, Boosting, Computer aided instruction, Computer applications, Concurrent computing, Digital signal processing, digital signal processing ({DSP}), fast Fourier transform ({FFT}), Kernel, Parallel processing, Registers, short vector single instruction, multiple data ({SIMD}), Signal processing algorithms, Spirals, symbolic vectorization},
  file         = {Eingereichte Version:C\:\\Users\\danwi\\Zotero\\storage\\J48HM9VD\\Franchetti et al. - 2005 - Efficient Utilization of SIMD Extensions.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\W6PT75CV\\1386659.html:text/html},
}
@inproceedings{tian_compiling_2012,
  author     = {Tian, Xinmin and Saito, Hideki and Girkar, Milind and Preis, Serguei V. and Kozhukhov, Sergey S. and Cherkasov, Aleksei G. and Nelson, Clark and Panchenko, Nikolay and Geva, Robert},
  title      = {Compiling C/C++ {SIMD} Extensions for Function and Loop Vectorizaion on Multicore-{SIMD} Processors},
  booktitle  = {2012 {IEEE} 26th International Parallel and Distributed Processing Symposium Workshops \& {PhD} Forum},
  eventtitle = {2012 {IEEE} 26th International Parallel and Distributed Processing Symposium Workshops \& {PhD} Forum},
  date       = {2012-05},
  pages      = {2349--2358},
  doi        = {10.1109/IPDPSW.2012.292},
  url        = {https://ieeexplore.ieee.org/abstract/document/6270606},
  urldate    = {2025-03-08},
  abstract   = {{SIMD} vectorization has received significant attention in the past decade as an important method to accelerate scientific applications, media and embedded applications on {SIMD} architectures such as Intel® {SSE}, {AVX}, and {IBM}* {AltiVec}. However, most of the focus has been directed at loops, effectively executing their iterations on multiple {SIMD} lanes concurrently relying upon program hints and compiler analysis. This paper presents a set of new C/C++ high-level vector extensions for {SIMD} programming, and the Intel® C++ product compiler that is extended to translate these vector extensions and produce optimized {SIMD} instruction sequences of vectorized functions and loops. For a function, our main idea is to vectorize the entire function for callers instead of just vectorizing loops (if any) inside the function. It poses the challenge of dealing with complicated control-flow in the function body, and matching caller and callee for {SIMD} vector calls while vectorizing caller functions (or loops) and callee functions. Our compilation methods for automatically compiling vector extensions are described. We present performance results of several non-trivial visual computing, computational, and simulation workloads, utilizing {SIMD} units through the vector extensions on Intel® Multicore 128-bit {SIMD} processors, and we show that significant {SIMD} speedups (3.07x to 4.69x) are achieved over the serial execution.},
  keywords   = {Cloning, Compiler, {GPU}, Graphics processing unit, Hardware, Multicore, Parallel processing, Programming, {SIMD}, Vectorization, Vectors},
  file       = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\HBSGBKT2\\6270606.html:text/html},
}
@inproceedings{lee_debunking_2010,
  author     = {Lee, Victor W. and Kim, Changkyu and Chhugani, Jatin and Deisher, Michael and Kim, Daehyun and Nguyen, Anthony D. and Satish, Nadathur and Smelyanskiy, Mikhail and Chennupaty, Srinivas and Hammarlund, Per and Singhal, Ronak and Dubey, Pradeep},
  title      = {Debunking the 100X {GPU} vs. {CPU} myth: an evaluation of throughput computing on {CPU} and {GPU}},
  shorttitle = {Debunking the 100X {GPU} vs. {CPU} myth},
  booktitle  = {Proceedings of the 37th annual international symposium on Computer architecture},
  series     = {{ISCA} '10},
  location   = {New York, {NY}, {USA}},
  publisher  = {Association for Computing Machinery},
  date       = {2010-06-19},
  pages      = {451--460},
  isbn       = {978-1-4503-0053-7},
  doi        = {10.1145/1815961.1816021},
  url        = {https://dl.acm.org/doi/10.1145/1815961.1816021},
  urldate    = {2025-03-08},
  abstract   = {Recent advances in computing have led to an explosion in the amount of data being generated. Processing the ever-growing data in a timely manner has made throughput computing an important aspect for emerging applications. Our analysis of a set of important throughput computing kernels shows that there is an ample amount of parallelism in these kernels which makes them suitable for today's multi-core {CPUs} and {GPUs}. In the past few years there have been many studies claiming {GPUs} deliver substantial speedups (between 10X and 1000X) over multi-core {CPUs} on these kernels. To understand where such large performance difference comes from, we perform a rigorous performance analysis and find that after applying optimizations appropriate for both {CPUs} and {GPUs} the performance gap between an Nvidia {GTX}280 processor and the Intel Core i7-960 processor narrows to only 2.5x on average. In this paper, we discuss optimization techniques for both {CPU} and {GPU}, analyze what architecture features contributed to performance differences between the two architectures, and recommend a set of architectural features which provide significant improvement in architectural efficiency for throughput kernels.},
  file       = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\D64U9R8Q\\Lee et al. - 2010 - Debunking the 100X GPU vs. CPU myth an evaluation of throughput computing on CPU and GPU.pdf:application/pdf},
}