related work: small continuation of the SIMT explanation
@@ -574,3 +574,48 @@ Publisher: Multidisciplinary Digital Publishing Institute},
date = {2010-06-19},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\D64U9R8Q\\Lee et al. - 2010 - Debunking the 100X GPU vs. CPU myth an evaluation of throughput computing on CPU and GPU.pdf:application/pdf},
}

@inproceedings{kyung_implementation_2014,
title = {An implementation of a {SIMT} architecture-based stream processor},
url = {https://ieeexplore.ieee.org/abstract/document/7022313},
doi = {10.1109/TENCON.2014.7022313},
abstract = {In this paper, we designed a {SIMT} architecture-based stream processor for parallel processing in the mobile environment. The designed processor is a superscalar architecture and can issue up to four instructions. Considering the limited resources of the mobile environment, this processor was consisted of 16 stream processors ({SPs}). To verify the operation of the designed processor, a functional level simulation was conducted with the Modelsim {SE} 10.0b simulator. We synthesized on Virtex-7 {FPGA} as the target with the Xilinx {ISE} 14.7 tool and the results analyzed. The performance of the designed processor was 150M Triangles/Sec, 4.8 {GFLOPS} at 100 {MHz}. When the performance was compared with that of conventional processors, the proposed architecture of the processor attested to be effective in processing 3D graphics and parallel general-purpose computing in the mobile environment.},
eventtitle = {{TENCON} 2014 - 2014 {IEEE} Region 10 Conference},
pages = {1--5},
booktitle = {{TENCON} 2014 - 2014 {IEEE} Region 10 Conference},
author = {Kyung, Gyutaek and Jung, Changmin and Lee, Kwangyeob},
urldate = {2025-03-08},
date = {2014-10},
note = {{ISSN}: 2159-3450},
keywords = {Computer architecture, Educational institutions, {GPGPU}, Graphics, Graphics processing units, Instruction sets, Mobile communication, Registers, {SIMT} Architecture, Stream Processor},
file = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\9B85REHH\\7022313.html:text/html},
}

@report{collange_stack-less_2011,
title = {Stack-less {SIMT} reconvergence at low cost},
url = {https://hal.science/hal-00622654},
abstract = {Parallel architectures following the {SIMT} model such as {GPUs} benefit from application regularity by issuing concurrent threads running in lockstep on {SIMD} units. As threads take different paths across the control-flow graph, lockstep execution is partially lost, and must be regained whenever possible in order to maximize the occupancy of {SIMD} units. In this paper, we propose a technique to handle {SIMT} control divergence that operates in constant space and handles indirect jumps and recursion. We describe a possible implementation which leverage the existing memory divergence management unit, ensuring a low hardware cost. In terms of performance, this solution is at least as efficient as existing techniques.},
institution = {{ENS} Lyon},
type = {Research Report},
author = {Collange, Caroline},
urldate = {2025-03-08},
date = {2011-09},
keywords = {Control-flow reconvergence, {GPU}, {SIMD}, {SIMT}},
file = {HAL PDF Full Text:C\:\\Users\\danwi\\Zotero\\storage\\M2WPWNXF\\Collange - 2011 - Stack-less SIMT reconvergence at low cost.pdf:application/pdf},
}

@inproceedings{fung_thread_2011,
title = {Thread block compaction for efficient {SIMT} control flow},
url = {https://ieeexplore.ieee.org/abstract/document/5749714},
doi = {10.1109/HPCA.2011.5749714},
abstract = {Manycore accelerators such as graphics processor units ({GPUs}) organize processing units into single-instruction, multiple data “cores” to improve throughput per unit hardware cost. Programming models for these accelerators encourage applications to run kernels with large groups of parallel scalar threads. The hardware groups these threads into warps/wavefronts and executes them in lockstep-dubbed single-instruction, multiple-thread ({SIMT}) by {NVIDIA}. While current {GPUs} employ a per-warp (or per-wavefront) stack to manage divergent control flow, it incurs decreased efficiency for applications with nested, data-dependent control flow. In this paper, we propose and evaluate the benefits of extending the sharing of resources in a block of warps, already used for scratchpad memory, to exploit control flow locality among threads (where such sharing may at first seem detrimental). In our proposal, warps within a thread block share a common block-wide stack for divergence handling. At a divergent branch, threads are compacted into new warps in hardware. Our simulation results show that this compaction mechanism provides an average speedup of 22\% over a baseline per-warp, stack-based reconvergence mechanism, and 17\% versus dynamic warp formation on a set of {CUDA} applications that suffer significantly from control flow divergence.},
eventtitle = {2011 {IEEE} 17th International Symposium on High Performance Computer Architecture},
pages = {25--36},
booktitle = {2011 {IEEE} 17th International Symposium on High Performance Computer Architecture},
author = {Fung, Wilson W. L. and Aamodt, Tor M.},
urldate = {2025-03-08},
date = {2011-02},
note = {{ISSN}: 2378-203X},
keywords = {Compaction, Graphics processing unit, Hardware, Instruction sets, Kernel, Pipelines, Random access memory},
file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\TRPWUTI6\\Fung und Aamodt - 2011 - Thread block compaction for efficient SIMT control flow.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\LYPYEA8U\\5749714.html:text/html},
}