master-thesis/thesis/references.bib


@article{besard_rapid_2019,
	title = {Rapid software prototyping for heterogeneous and distributed platforms},
	volume = {132},
	issn = {09659978},
	url = {https://linkinghub.elsevier.com/retrieve/pii/S0965997818310123},
	doi = {10.1016/j.advengsoft.2019.02.002},
	pages = {29--46},
	journaltitle = {Advances in Engineering Software},
	shortjournal = {Advances in Engineering Software},
	author = {Besard, Tim and Churavy, Valentin and Edelman, Alan and Sutter, Bjorn De},
	urldate = {2024-11-22},
	date = {2019-06},
	langid = {english},
	file = {Volltext:C\:\\Users\\danwi\\Zotero\\storage\\VNWQAR9Q\\Besard et al. - 2019 - Rapid software prototyping for heterogeneous and distributed platforms.pdf:application/pdf},
}

@article{besard_effective_2019,
	title = {Effective Extensible Programming: Unleashing Julia on {GPUs}},
	volume = {30},
	rights = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/{IEEE}.html},
	issn = {1045-9219, 1558-2183, 2161-9883},
	url = {https://ieeexplore.ieee.org/document/8471188/},
	doi = {10.1109/TPDS.2018.2872064},
	shorttitle = {Effective Extensible Programming},
	pages = {827--841},
	number = {4},
	journaltitle = {{IEEE} Transactions on Parallel and Distributed Systems},
	shortjournal = {{IEEE} Trans. Parallel Distrib. Syst.},
	author = {Besard, Tim and Foket, Christophe and De Sutter, Bjorn},
	urldate = {2024-11-22},
	date = {2019-04-01},
	file = {Eingereichte Version:C\:\\Users\\danwi\\Zotero\\storage\\T34I73BI\\Besard et al. - 2019 - Effective Extensible Programming Unleashing Julia on GPUs.pdf:application/pdf},
}

@inproceedings{lin_comparing_2021,
	location = {St. Louis, {MO}, {USA}},
	title = {Comparing Julia to Performance Portable Parallel Programming Models for {HPC}},
	rights = {https://ieeexplore.ieee.org/Xplorehelp/downloads/license-information/{IEEE}.html},
	isbn = {978-1-6654-1118-9},
	url = {https://ieeexplore.ieee.org/document/9652798/},
	doi = {10.1109/PMBS54543.2021.00016},
	eventtitle = {2021 International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems ({PMBS})},
	pages = {94--105},
	booktitle = {2021 International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems ({PMBS})},
	publisher = {{IEEE}},
	author = {Lin, Wei-Chen and {McIntosh}-Smith, Simon},
	urldate = {2024-11-22},
	date = {2021-11},
	file = {Eingereichte Version:C\:\\Users\\danwi\\Zotero\\storage\\U6EQPD62\\Lin und McIntosh-Smith - 2021 - Comparing Julia to Performance Portable Parallel Programming Models for HPC.pdf:application/pdf},
}

@online{nvidia_cuda_2024,
	title = {{CUDA} C++ Programming Guide},
	url = {https://docs.nvidia.com/cuda/cuda-c-programming-guide/},
	author = {{Nvidia}},
	urldate = {2024-11-22},
	date = {2024-11},
}

@article{koster_massively_2020,
	title = {Massively Parallel Rule-Based Interpreter Execution on {GPUs} Using Thread Compaction},
	volume = {48},
	issn = {1573-7640},
	url = {https://doi.org/10.1007/s10766-020-00670-2},
	doi = {10.1007/s10766-020-00670-2},
	abstract = {Interpreters are well researched in the field of compiler construction and program generation. They are typically used to realize program execution of different programming languages without a compilation step. However, they can also be used to model complex rule-based simulations: The interpreter applies all rules one after another. These can be iteratively applied on a globally updated state in order to get the final simulation result. Many simulations for domain-specific problems already leverage the parallel processing capabilities of Graphics Processing Units ({GPUs}). They use hardware-specific tuned rule implementations to achieve maximum performance. However, every interpreter-based system requires a high-level algorithm that detects active rules and determines when they are evaluated. A common approach in this context is the use of different interpreter routines for every problem domain. Executing such functions in an efficient way mainly involves dealing with hardware peculiarities like thread divergences, {ALU} computations and memory operations. Furthermore, the interpreter is often executed on multiple states in parallel these days. This is particularly important for heuristic search or what-if analyses, for instance. In this paper, we present a novel and easy-to-implement method based on thread compaction to realize generic rule-based interpreters in an efficient way on {GPUs}. It is optimized for many states using a specially designed memory layout. Benchmarks on our evaluation scenarios show that the performance can be significantly increased in comparison to existing commonly-used implementations.},
	pages = {675--691},
	number = {4},
	journaltitle = {International Journal of Parallel Programming},
	shortjournal = {Int J Parallel Prog},
	author = {Köster, M. and Groß, J. and Krüger, A.},
	urldate = {2024-11-29},
	date = {2020-08-01},
	langid = {english},
	keywords = {{GPU}, Interpreter execution, Memory layout, Thread compaction},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\8ETAIXGL\\Köster et al. - 2020 - Massively Parallel Rule-Based Interpreter Execution on GPUs Using Thread Compaction.pdf:application/pdf},
}

@inproceedings{krolik_r3d3_2021,
	title = {r3d3: Optimized Query Compilation on {GPUs}},
	url = {https://ieeexplore.ieee.org/document/9370323},
	doi = {10.1109/CGO51591.2021.9370323},
	shorttitle = {r3d3},
	abstract = {Query compilation is an effective approach to improve the performance of repeated database queries. {GPU}-based approaches have significant promise, but face difficulties in managing compilation time, data transfer costs, and in addressing a reasonably comprehensive range of {SQL} operations. In this work we describe a hybrid {AoT}/{JIT} approach to {GPU}-based query compilation. We use multiple optimizations to reduce execution, compile, and data transfer times, improving performance over both other {GPU}-based approaches and {CPU}-based query compilers as well. Our design addresses a wide range of {SQL} queries, sufficient to demonstrate the practicality of using {GPUs} for query optimization.},
	eventtitle = {2021 {IEEE}/{ACM} International Symposium on Code Generation and Optimization ({CGO})},
	pages = {277--288},
	booktitle = {2021 {IEEE}/{ACM} International Symposium on Code Generation and Optimization ({CGO})},
	author = {Krolik, Alexander and Verbrugge, Clark and Hendren, Laurie},
	urldate = {2024-11-29},
	date = {2021-02},
	keywords = {Compilers, Data transfer, Databases, {GPUs}, Graphics processing units, Memory management, Optimization, Query processing, Runtime, {SQL} database queries},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\NJM2FK56\\Krolik et al. - 2021 - r3d3 Optimized Query Compilation on GPUs.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\F6KDT83Y\\9370323.html:text/html},
}

@inproceedings{koster_high-performance_2020,
	location = {Cham},
	title = {High-Performance Simulations on {GPUs} Using Adaptive Time Steps},
	isbn = {978-3-030-60245-1},
	doi = {10.1007/978-3-030-60245-1_26},
	abstract = {Graphics Processing Units ({GPUs}) are widely spread nowadays due to their parallel processing capabilities. Leveraging these hardware features is particularly important for computationally expensive tasks and workloads. Prominent use cases are optimization problems and simulations that can be parallelized and tuned for these architectures. In the general domain of simulations (numerical and discrete), the overall logic is split into several components that are executed one after another. These components need step-size information which determines the number of steps (e.g. the elapsed time) they have to perform. Small step sizes are often required to ensure a valid simulation result with respect to precision and constraint correctness. Unfortunately, they are often the main bottleneck of the simulation. In this paper, we introduce a new and generic way of realizing high-performance simulations with multiple components using adaptive time steps on {GPUs}. Our method relies on a code-analysis phase that resolves data dependencies between different components. This knowledge is used to generate specially-tuned execution kernels that encapsulate the underlying component logic. An evaluation on our simulation benchmarks shows that we are able to considerably improve runtime performance compared to prior work.},
	pages = {369--385},
	booktitle = {Algorithms and Architectures for Parallel Processing},
	publisher = {Springer International Publishing},
	author = {Köster, Marcel and Groß, Julian and Krüger, Antonio},
	editor = {Qiu, Meikang},
	date = {2020},
	langid = {english},
	keywords = {Related-Work},
}

@inproceedings{koster_macsq_2022,
	location = {Cham},
	title = {{MACSQ}: Massively Accelerated {DeepQ} Learning on {GPUs} Using On-the-fly State Construction},
	isbn = {978-3-030-96772-7},
	doi = {10.1007/978-3-030-96772-7_35},
	shorttitle = {{MACSQ}},
	abstract = {The current trend of using artificial neural networks to solve computationally intensive problems is omnipresent. In this scope, {DeepQ} learning is a common choice for agent-based problems. {DeepQ} combines the concept of Q-Learning with (deep) neural networks to learn different Q-values/matrices based on environmental conditions. Unfortunately, {DeepQ} learning requires hundreds of thousands of iterations/Q-samples that must be generated and learned for large-scale problems. Gathering data sets for such challenging tasks is extremely time consuming and requires large data-storage containers. Consequently, a common solution is the automatic generation of input samples for agent-based {DeepQ} networks. However, a usual workflow is to create the samples separately from the training process in either a (set of) pre-processing step(s) or interleaved with the training process. This requires the input Q-samples to be materialized in order to be fed into the training step of the attached neural network. In this paper, we propose a new {GPU}-focussed method for on-the-fly generation of training samples tightly coupled with the training process itself. This allows us to skip the materialization process of all samples (e.g. avoid dumping them disk), as they are (re)constructed when needed. Our method significantly outperforms usual workflows that generate the input samples on the {CPU} in terms of runtime performance and memory/storage consumption.},
	pages = {383--395},
	booktitle = {Parallel and Distributed Computing, Applications and Technologies},
	publisher = {Springer International Publishing},
	author = {Köster, Marcel and Groß, Julian and Krüger, Antonio},
	editor = {Shen, Hong and Sang, Yingpeng and Zhang, Yong and Xiao, Nong and Arabnia, Hamid R. and Fox, Geoffrey and Gupta, Ajay and Malek, Manu},
	date = {2022},
	langid = {english},
	keywords = {Related Work},
}

@inproceedings{dietz_mimd_2010,
	location = {Berlin, Heidelberg},
	title = {{MIMD} Interpretation on a {GPU}},
	isbn = {978-3-642-13374-9},
	doi = {10.1007/978-3-642-13374-9_5},
	abstract = {Programming heterogeneous parallel computer systems is notoriously difficult, but {MIMD} models have proven to be portable across multi-core processors, clusters, and massively parallel systems. It would be highly desirable for {GPUs} (Graphics Processing Units) also to be able to leverage algorithms and programming tools designed for {MIMD} targets. Unfortunately, most {GPU} hardware implements a very restrictive multi-threaded {SIMD}-based execution model.},
	pages = {65--79},
	booktitle = {Languages and Compilers for Parallel Computing},
	publisher = {Springer},
	author = {Dietz, Henry G. and Young, B. Dalton},
	editor = {Gao, Guang R. and Pollock, Lori L. and Cavazos, John and Li, Xiaoming},
	date = {2010},
	langid = {english},
	keywords = {Hilfreich},
}

@inproceedings{langdon_simd_2008,
	location = {Berlin, Heidelberg},
	title = {A {SIMD} Interpreter for Genetic Programming on {GPU} Graphics Cards},
	isbn = {978-3-540-78671-9},
	doi = {10.1007/978-3-540-78671-9_7},
	abstract = {Mackey-Glass chaotic time series prediction and nuclear protein classification show the feasibility of evaluating genetic programming populations directly on parallel consumer gaming graphics processing units. Using a Linux {KDE} computer equipped with an {nVidia} {GeForce} 8800 {GTX} graphics processing unit card the C++ {SPMD} interpretter evolves programs at Giga {GP} operations per second (895 million {GPops}). We use the {RapidMind} general processing on {GPU} ({GPGPU}) framework to evaluate an entire population of a quarter of a million individual programs on a non-trivial problem in 4 seconds. An efficient reverse polish notation ({RPN}) tree based {GP} is given.},
	pages = {73--85},
	booktitle = {Genetic Programming},
	publisher = {Springer},
	author = {Langdon, W. B. and Banzhaf, Wolfgang},
	editor = {O’Neill, Michael and Vanneschi, Leonardo and Gustafson, Steven and Esparcia Alcázar, Anna Isabel and De Falco, Ivanoe and Della Cioppa, Antonio and Tarantino, Ernesto},
	date = {2008},
	langid = {english},
	keywords = {Hilfreich},
}

@inproceedings{cano_gpu-parallel_2014,
	location = {New York, {NY}, {USA}},
	title = {{GPU}-parallel subtree interpreter for genetic programming},
	isbn = {978-1-4503-2662-9},
	url = {https://dl.acm.org/doi/10.1145/2576768.2598272},
	doi = {10.1145/2576768.2598272},
	series = {{GECCO} '14},
	abstract = {Genetic Programming ({GP}) is a computationally intensive technique but its nature is embarrassingly parallel. Graphic Processing Units ({GPUs}) are many-core architectures which have been widely employed to speed up the evaluation of {GP}. In recent years, many works have shown the high performance and efficiency of {GPUs} on evaluating both the individuals and the fitness cases in parallel. These approaches are known as population parallel and data parallel. This paper presents a parallel {GP} interpreter which extends these approaches and adds a new parallelization level based on the concurrent evaluation of the individual's subtrees. A {GP} individual defined by a tree structure with nodes and branches comprises different depth levels in which there are independent subtrees which can be evaluated concurrently. Threads can cooperate to evaluate different subtrees and share the results via {GPU}'s shared memory. The experimental results show the better performance of the proposal in terms of the {GP} operations per second ({GPops}/s) that the {GP} interpreter is capable of processing, achieving up to 21 billion {GPops}/s using a {NVIDIA} 480 {GPU}. However, some issues raised due to limitations of currently available hardware are to be overcomed by the dynamic parallelization capabilities of the next generation of {GPUs}.},
	pages = {887--894},
	booktitle = {Proceedings of the 2014 Annual Conference on Genetic and Evolutionary Computation},
	publisher = {Association for Computing Machinery},
	author = {Cano, Alberto and Ventura, Sebastian},
	urldate = {2024-11-28},
	date = {2014-07-12},
	keywords = {Hilfreich},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\NYV739K8\\Cano und Ventura - 2014 - GPU-parallel subtree interpreter for genetic programming.pdf:application/pdf},
}

@inproceedings{pfahler_semantic_2020,
	location = {New York, {NY}, {USA}},
	title = {Semantic Search in Millions of Equations},
	isbn = {978-1-4503-7998-4},
	url = {https://dl.acm.org/doi/10.1145/3394486.3403056},
	doi = {10.1145/3394486.3403056},
	series = {{KDD} '20},
	abstract = {Given the increase of publications, search for relevant papers becomes tedious. In particular, search across disciplines or schools of thinking is not supported. This is mainly due to the retrieval with keyword queries: technical terms differ in different sciences or at different times. Relevant articles might better be identified by their mathematical problem descriptions. Just looking at the equations in a paper already gives a hint to whether the paper is relevant. Hence, we propose a new approach for retrieval of mathematical expressions based on machine learning. We design an unsupervised representation learning task that combines embedding learning with self-supervised learning. Using graph convolutional neural networks we embed mathematical expression into low-dimensional vector spaces that allow efficient nearest neighbor queries. To train our models, we collect a huge dataset with over 29 million mathematical expressions from over 900,000 publications published on {arXiv}.org. The math is converted into an {XML} format, which we view as graph data. Our empirical evaluations involving a new dataset of manually annotated search queries show the benefits of using embedding models for mathematical retrieval.},
	pages = {135--143},
	booktitle = {Proceedings of the 26th {ACM} {SIGKDD} International Conference on Knowledge Discovery \& Data Mining},
	publisher = {Association for Computing Machinery},
	author = {Pfahler, Lukas and Morik, Katharina},
	urldate = {2024-11-30},
	date = {2020-08-20},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\TQBLKG25\\Pfahler und Morik - 2020 - Semantic Search in Millions of Equations.pdf:application/pdf},
}

@misc{werner_informed_2021,
	title = {Informed Equation Learning},
	url = {http://arxiv.org/abs/2105.06331},
	doi = {10.48550/arXiv.2105.06331},
	abstract = {Distilling data into compact and interpretable analytic equations is one of the goals of science. Instead, contemporary supervised machine learning methods mostly produce unstructured and dense maps from input to output. Particularly in deep learning, this property is owed to the generic nature of simple standard link functions. To learn equations rather than maps, standard non-linearities can be replaced with structured building blocks of atomic functions. However, without strong priors on sparsity and structure, representational complexity and numerical conditioning limit this direct approach. To scale to realistic settings in science and engineering, we propose an informed equation learning system. It provides a way to incorporate expert knowledge about what are permitted or prohibited equation components, as well as a domain-dependent structured sparsity prior. Our system then utilizes a robust method to learn equations with atomic functions exhibiting singularities, as e.g. logarithm and division. We demonstrate several artificial and real-world experiments from the engineering domain, in which our system learns interpretable models of high predictive power.},
	number = {{arXiv}:2105.06331},
	publisher = {{arXiv}},
	author = {Werner, Matthias and Junginger, Andrej and Hennig, Philipp and Martius, Georg},
	urldate = {2024-11-30},
	date = {2021-05-13},
	eprinttype = {arxiv},
	eprint = {2105.06331},
	keywords = {Computer Science - Machine Learning},
	file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\HEYBR254\\Werner et al. - 2021 - Informed Equation Learning.pdf:application/pdf},
}

@article{memarzia_-depth_2015,
	title = {An In-depth Study on the Performance Impact of {CUDA}, {OpenCL}, and {PTX} Code},
	volume = {10},
	abstract = {In recent years, the rise of {GPGPU} as a viable solution for high performance computing has been accompanied by fresh challenges for developers. Chief among these challenges is efficiently harnessing the formidable power of the {GPU} and finding performance bottlenecks. Many factors play a role in a {GPU} application’s performance. This creates the need for studies performance comparisons, and ways to analyze programs from a fundamental level. With that in mind, our goal is to present an in-depth performance comparison of the {CUDA} and {OpenCL} platforms, and study how {PTX} code can affect performance. In order to achieve this goal, we explore the subject from three different angles: kernel execution times, data transfers that occur between the host and device, and the {PTX} code that is generated by each platform’s compiler. We carry out our experiments using ten real-world {GPU} kernels from the digital image processing domain, a selection of variable input data sizes, and a pair of {GPUs} based on the Nvidia Fermi and Kepler architectures. We show how {PTX} statistics and analysis can be used to provide further insight on performance discrepancies and bottlenecks. Our results indicate that, in an unbiased comparison such as this one, the {OpenCL} and {CUDA} platforms are essentially similar in terms of performance.},
	author = {Memarzia, Puya and Khunjush, Farshad},
	date = {2015},
	langid = {english},
	file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\GKAYMMNN\\Memarzia und Khunjush - 2015 - An In-depth Study on the Performance Impact of CUDA, OpenCL, and PTX Code.pdf:application/pdf},
}

@article{bastidas_fuertes_transpiler-based_2023,
	title = {Transpiler-Based Architecture Design Model for Back-End Layers in Software Development},
	volume = {13},
	rights = {http://creativecommons.org/licenses/by/3.0/},
	issn = {2076-3417},
	url = {https://www.mdpi.com/2076-3417/13/20/11371},
	doi = {10.3390/app132011371},
	abstract = {The utilization of software architectures and designs is widespread in software development, offering conceptual frameworks to address recurring challenges. A transpiler is a tool that automatically converts source code from one high-level programming language to another, ensuring algorithmic equivalence. This study introduces an innovative software architecture design model that integrates transpilers into the back-end layer, enabling the automatic transformation of business logic and back-end components from a single source code (the coding artifact) into diverse equivalent versions using distinct programming languages (the automatically produced code). This work encompasses both abstract and detailed design aspects, covering the proposal, automated processes, layered design, development environment, nest implementations, and cross-cutting components. In addition, it defines the main target audiences, discusses pros and cons, examines their relationships with prevalent design paradigms, addresses considerations about compatibility and debugging, and emphasizes the pivotal role of the transpiler. An empirical experiment involving the practical application of this model was conducted by implementing a collaborative to-do list application. This paper comprehensively outlines the relevant methodological approach, strategic planning, precise execution, observed outcomes, and insightful reflections while underscoring the the model’s pragmatic viability and highlighting its relevance across various software development contexts. Our contribution aims to enrich the field of software architecture design by introducing a new way of designing multi-programming-language software.},
	pages = {11371},
	number = {20},
	journaltitle = {Applied Sciences},
	author = {Bastidas Fuertes, Andrés and Pérez, María and Meza, Jaime},
	urldate = {2025-01-03},
	date = {2023-01},
	langid = {english},
	note = {Number: 20
Publisher: Multidisciplinary Digital Publishing Institute},
	keywords = {back-end layers, design model, software architecture, software development, source-to-source transformations, transpiler},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\AD55DPJ4\\Bastidas Fuertes et al. - 2023 - Transpiler-Based Architecture Design Model for Back-End Layers in Software Development.pdf:application/pdf},
}

@incollection{adam_no_2019,
	location = {Cham},
	title = {No Free Lunch Theorem: A Review},
	isbn = {978-3-030-12767-1},
	url = {https://doi.org/10.1007/978-3-030-12767-1_5},
	shorttitle = {No Free Lunch Theorem},
	abstract = {The “No Free Lunch” theorem states that, averaged over all optimization problems, without re-sampling, all optimization algorithms perform equally well. Optimization, search, and supervised learning are the areas that have benefited more from this important theoretical concept. Formulation of the initial No Free Lunch theorem, very soon, gave rise to a number of research works which resulted in a suite of theorems that define an entire research field with significant results in other scientific areas where successfully exploring a search space is an essential and critical task. The objective of this paper is to go through the main research efforts that contributed to this research field, reveal the main issues, and disclose those points that are helpful in understanding the hypotheses, the restrictions, or even the inability of applying No Free Lunch theorems.},
	pages = {57--82},
	booktitle = {Approximation and Optimization : Algorithms, Complexity and Applications},
	publisher = {Springer International Publishing},
	author = {Adam, Stavros P. and Alexandropoulos, Stamatios-Aggelos N. and Pardalos, Panos M. and Vrahatis, Michael N.},
	editor = {Demetriou, Ioannis C. and Pardalos, Panos M.},
	urldate = {2025-02-14},
	date = {2019},
	langid = {english},
	doi = {10.1007/978-3-030-12767-1_5},
}

@inproceedings{michalakes_gpu_2008,
	title = {{GPU} acceleration of numerical weather prediction},
	url = {https://ieeexplore.ieee.org/abstract/document/4536351},
	doi = {10.1109/IPDPS.2008.4536351},
	abstract = {Weather and climate prediction software has enjoyed the benefits of exponentially increasing processor power for almost 50 years. Even with the advent of large-scale parallelism in weather models, much of the performance increase has come from increasing processor speed rather than increased parallelism. This free ride is nearly over. Recent results also indicate that simply increasing the use of large- scale parallelism will prove ineffective for many scenarios. We present an alternative method of scaling model performance by exploiting emerging architectures using the fine-grain parallelism once used in vector machines. The paper shows the promise of this approach by demonstrating a 20 times speedup for a computationally intensive portion of the Weather Research and Forecast ({WRF}) model on an {NVIDIA} 8800 {GTX} graphics processing unit ({GPU}). We expect an overall 1.3 times speedup from this change alone.},
	eventtitle = {2008 {IEEE} International Symposium on Parallel and Distributed Processing},
	pages = {1--7},
	booktitle = {2008 {IEEE} International Symposium on Parallel and Distributed Processing},
	author = {Michalakes, John and Vachharajani, Manish},
	urldate = {2025-02-14},
	date = {2008-04},
	note = {{ISSN}: 1530-2075},
	keywords = {Acceleration, Bandwidth, Computer architecture, Concurrent computing, Graphics, Large-scale systems, Parallel processing, Predictive models, Weather forecasting, Yarn},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\ZFEVRLEZ\\Michalakes und Vachharajani - 2008 - GPU acceleration of numerical weather prediction.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\PYY4F7JB\\4536351.html:text/html},
}

@article{han_packetshader_2010,
	title = {{PacketShader}: a {GPU}-accelerated software router},
	volume = {40},
	issn = {0146-4833},
	url = {https://doi.org/10.1145/1851275.1851207},
	doi = {10.1145/1851275.1851207},
	shorttitle = {{PacketShader}},
	abstract = {We present {PacketShader}, a high-performance software router framework for general packet processing with Graphics Processing Unit ({GPU}) acceleration. {PacketShader} exploits the massively-parallel processing power of {GPU} to address the {CPU} bottleneck in current software routers. Combined with our high-performance packet I/O engine, {PacketShader} outperforms existing software routers by more than a factor of four, forwarding 64B {IPv}4 packets at 39 Gbps on a single commodity {PC}. We have implemented {IPv}4 and {IPv}6 forwarding, {OpenFlow} switching, and {IPsec} tunneling to demonstrate the flexibility and performance advantage of {PacketShader}. The evaluation results show that {GPU} brings significantly higher throughput over the {CPU}-only implementation, confirming the effectiveness of {GPU} for computation and memory-intensive operations in packet processing.},
	pages = {195--206},
	number = {4},
	journaltitle = {{SIGCOMM} Comput. Commun. Rev.},
	author = {Han, Sangjin and Jang, Keon and Park, {KyoungSoo} and Moon, Sue},
	urldate = {2025-02-14},
	date = {2010-08-30},
}

@article{georgescu_gpu_2013,
	title = {{GPU} Acceleration for {FEM}-Based Structural Analysis},
	volume = {20},
	issn = {1886-1784},
	url = {https://doi.org/10.1007/s11831-013-9082-8},
	doi = {10.1007/s11831-013-9082-8},
	abstract = {Graphic Processing Units ({GPUs}) have greatly exceeded their initial role of graphics accelerators and have taken a new role of co-processors for computation—heavy tasks. Both hardware and software ecosystems have now matured, with fully {IEEE} compliant double precision and memory correction being supported and a rich set of software tools and libraries being available. This in turn has lead to their increased adoption in a growing number of fields, both in academia and, more recently, in industry. In this review we investigate the adoption of {GPUs} as accelerators in the field of Finite Element Structural Analysis, a design tool that is now essential in many branches of engineering. We survey the work that has been done in accelerating the most time consuming steps of the analysis, indicate the speedup that has been achieved and, where available, highlight software libraries and packages that will enable the reader to take advantage of such acceleration. Overall, we try to draw a high level picture of where the state of the art is currently at.},
	pages = {111--121},
	number = {2},
	journaltitle = {Archives of Computational Methods in Engineering},
	shortjournal = {Arch Computat Methods Eng},
	author = {Georgescu, Serban and Chow, Peter and Okuda, Hiroshi},
	urldate = {2025-02-14},
	date = {2013-06-01},
	langid = {english},
	keywords = {Compute Unify Device Architecture, Element Stiffness Matrice, Global Stiffness Matrix, Iterative Solver, Matrix Solver},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\352VGH3Y\\Georgescu et al. - 2013 - GPU Acceleration for FEM-Based Structural Analysis.pdf:application/pdf},
}

@article{brunton_discovering_2016,
	title = {Discovering governing equations from data by sparse identification of nonlinear dynamical systems},
	volume = {113},
	url = {https://www.pnas.org/doi/abs/10.1073/pnas.1517384113},
	doi = {10.1073/pnas.1517384113},
	abstract = {Extracting governing equations from data is a central challenge in many diverse areas of science and engineering. Data are abundant whereas models often remain elusive, as in climate science, neuroscience, ecology, finance, and epidemiology, to name only a few examples. In this work, we combine sparsity-promoting techniques and machine learning with nonlinear dynamical systems to discover governing equations from noisy measurement data. The only assumption about the structure of the model is that there are only a few important terms that govern the dynamics, so that the equations are sparse in the space of possible functions; this assumption holds for many physical systems in an appropriate basis. In particular, we use sparse regression to determine the fewest terms in the dynamic governing equations required to accurately represent the data. This results in parsimonious models that balance accuracy with model complexity to avoid overfitting. We demonstrate the algorithm on a wide range of problems, from simple canonical systems, including linear and nonlinear oscillators and the chaotic Lorenz system, to the fluid vortex shedding behind an obstacle. The fluid example illustrates the ability of this method to discover the underlying dynamics of a system that took experts in the community nearly 30 years to resolve. We also show that this method generalizes to parameterized systems and systems that are time-varying or have external forcing.},
	pages = {3932--3937},
	number = {15},
	journaltitle = {Proceedings of the National Academy of Sciences},
	author = {Brunton, Steven L. and Proctor, Joshua L. and Kutz, J. Nathan},
	urldate = {2025-02-26},
	date = {2016-04-12},
	note = {Publisher: Proceedings of the National Academy of Sciences},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\6R643NFZ\\Brunton et al. - 2016 - Discovering governing equations from data by sparse identification of nonlinear dynamical systems.pdf:application/pdf},
}

@article{dong_evolving_2024,
	title = {Evolving Equation Learner For Symbolic Regression},
	issn = {1941-0026},
	url = {https://ieeexplore.ieee.org/abstract/document/10538006/metrics#metrics},
	doi = {10.1109/TEVC.2024.3404650},
	abstract = {Symbolic regression, a multifaceted optimization challenge involving the refinement of both structural components and coefficients, has gained significant research interest in recent years. The Equation Learner ({EQL}), a neural network designed to optimize both equation structure and coefficients through gradient-based optimization algorithms, has emerged as an important topic of concern within this field. Thus far, several variations of {EQL} have been introduced. Nevertheless, these existing {EQL} methodologies suffer from a fundamental constraint that they necessitate a predefined network structure. This limitation imposes constraints on the complexity of equations and makes them ill-suited for high-dimensional or high-order problem domains. To tackle the aforementioned shortcomings, we present a novel approach known as the evolving Equation Learner ({eEQL}). {eEQL} introduces a unique network structure characterized by automatically defined functions ({ADFs}). This new architectural design allows for dynamic adaptations of the network structure. Moreover, by engaging in self-learning and self-evolution during the search process, {eEQL} facilitates the generation of intricate, high-order, and constructive sub-functions. This enhancement can improve the accuracy and efficiency of the algorithm. To evaluate its performance, the proposed {eEQL} method has been tested across various datasets, including benchmark datasets, physics datasets, and real-world datasets. The results have demonstrated that our approach outperforms several well-known methods.},
	pages = {1--1},
	journaltitle = {{IEEE} Transactions on Evolutionary Computation},
	author = {Dong, Junlan and Zhong, Jinghui and Liu, Wei-Li and Zhang, Jun},
	urldate = {2025-02-26},
	date = {2024},
	note = {Conference Name: {IEEE} Transactions on Evolutionary Computation},
	keywords = {Optimization, Adaptation models, Complexity theory, Equation Learner, Evolutionary computation, Evolving equation learner, Mathematical models, Neural networks, Progressive Evolutionary Structure Search, Training},
	file = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\8PQADTZP\\metrics.html:text/html},
}

@incollection{korns_accuracy_2011,
	location = {New York, {NY}},
	title = {Accuracy in Symbolic Regression},
	isbn = {978-1-4614-1770-5},
	url = {https://doi.org/10.1007/978-1-4614-1770-5_8},
	abstract = {This chapter asserts that, in current state-of-the-art symbolic regression engines, accuracy is poor. That is to say that state-of-the-art symbolic regression engines return a champion with good fitness; however, obtaining a champion with the correct formula is not forthcoming even in cases of only one basis function with minimally complex grammar depth. Ideally, users expect that for test problems created with no noise, using only functions in the specified grammar, with only one basis function and some minimal grammar depth, that state-of-the-art symbolic regression systems should return the exact formula (or at least an isomorph) used to create the test data. Unfortunately, this expectation cannot currently be achieved using published state-of-the-art symbolic regression techniques. Several classes of test formulas, which prove intractable, are examined and an understanding of why they are intractable is developed. Techniques in Abstract Expression Grammars are employed to render these problems tractable, including manipulation of the epigenome during the evolutionary process, together with breeding of multiple targeted epigenomes in separate population islands. Aselected set of currently intractable problems are shown to be solvable, using these techniques, and a proposal is put forward for a discipline-wide program of improving accuracy in state-of-the-art symbolic regression systems.},
	pages = {129--151},
	booktitle = {Genetic Programming Theory and Practice {IX}},
	publisher = {Springer},
	author = {Korns, Michael F.},
	editor = {Riolo, Rick and Vladislavleva, Ekaterina and Moore, Jason H.},
	urldate = {2025-02-27},
	date = {2011},
	langid = {english},
	doi = {10.1007/978-1-4614-1770-5_8},
}

@article{keijzer_scaled_2004,
	title = {Scaled Symbolic Regression},
	volume = {5},
	issn = {1573-7632},
	url = {https://doi.org/10.1023/B:GENP.0000030195.77571.f9},
	doi = {10.1023/B:GENP.0000030195.77571.f9},
	abstract = {Performing a linear regression on the outputs of arbitrary symbolic expressions has empirically been found to provide great benefits. Here some basic theoretical results of linear regression are reviewed on their applicability for use in symbolic regression. It will be proven that the use of a scaled error measure, in which the error is calculated after scaling, is expected to perform better than its unscaled counterpart on all possible symbolic regression problems. As the method (i) does not introduce additional parameters to a symbolic regression run, (ii) is guaranteed to improve results on most symbolic regression problems (and is not worse on any other problem), and (iii) has a well-defined upper bound on the error, scaled squared error is an ideal candidate to become the standard error measure for practical applications of symbolic regression.},
	pages = {259--269},
	number = {3},
	journaltitle = {Genetic Programming and Evolvable Machines},
	shortjournal = {Genet Program Evolvable Mach},
	author = {Keijzer, Maarten},
	urldate = {2025-02-27},
	date = {2004-09-01},
	langid = {english},
	keywords = {Artificial Intelligence, genetic programming, linear regression, symbolic regression},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\ZH9LAN74\\Keijzer - 2004 - Scaled Symbolic Regression.pdf:application/pdf},
}

@misc{jin_bayesian_2020,
	title = {Bayesian Symbolic Regression},
	url = {http://arxiv.org/abs/1910.08892},
	doi = {10.48550/arXiv.1910.08892},
	abstract = {Interpretability is crucial for machine learning in many scenarios such as quantitative finance, banking, healthcare, etc. Symbolic regression ({SR}) is a classic interpretable machine learning method by bridging X and Y using mathematical expressions composed of some basic functions. However, the search space of all possible expressions grows exponentially with the length of the expression, making it infeasible for enumeration. Genetic programming ({GP}) has been traditionally and commonly used in {SR} to search for the optimal solution, but it suffers from several limitations, e.g. the difficulty in incorporating prior knowledge; overly-complicated output expression and reduced interpretability etc. To address these issues, we propose a new method to fit {SR} under a Bayesian framework. Firstly, Bayesian model can naturally incorporate prior knowledge (e.g., preference of basis functions, operators and raw features) to improve the efficiency of fitting {SR}. Secondly, to improve interpretability of expressions in {SR}, we aim to capture concise but informative signals. To this end, we assume the expected signal has an additive structure, i.e., a linear combination of several concise expressions, whose complexity is controlled by a well-designed prior distribution. In our setup, each expression is characterized by a symbolic tree, and the proposed {SR} model could be solved by sampling symbolic trees from the posterior distribution using an efficient Markov chain Monte Carlo ({MCMC}) algorithm. Finally, compared with {GP}, the proposed {BSR}(Bayesian Symbolic Regression) method saves computer memory with no need to keep an updated 'genome pool'. Numerical experiments show that, compared with {GP}, the solutions of {BSR} are closer to the ground truth and the expressions are more concise. Meanwhile we find the solution of {BSR} is robust to hyper-parameter specifications such as the number of trees.},
	number = {{arXiv}:1910.08892},
	publisher = {{arXiv}},
	author = {Jin, Ying and Fu, Weilin and Kang, Jian and Guo, Jiadong and Guo, Jian},
	urldate = {2025-02-27},
	date = {2020-01-16},
	eprinttype = {arxiv},
	eprint = {1910.08892 [stat]},
	keywords = {Statistics - Methodology},
	file = {Preprint PDF:C\:\\Users\\danwi\\Zotero\\storage\\3MP48UI3\\Jin et al. - 2020 - Bayesian Symbolic Regression.pdf:application/pdf;Snapshot:C\:\\Users\\danwi\\Zotero\\storage\\UNNZKPRJ\\1910.html:text/html},
}

@inproceedings{winter_are_2021,
	location = {New York, {NY}, {USA}},
	title = {Are dynamic memory managers on {GPUs} slow? a survey and benchmarks},
	isbn = {978-1-4503-8294-6},
	url = {https://doi.org/10.1145/3437801.3441612},
	doi = {10.1145/3437801.3441612},
	series = {{PPoPP} '21},
	shorttitle = {Are dynamic memory managers on {GPUs} slow?},
	abstract = {Dynamic memory management on {GPUs} is generally understood to be a challenging topic. On current {GPUs}, hundreds of thousands of threads might concurrently allocate new memory or free previously allocated memory. This leads to problems with thread contention, synchronization overhead and fragmentation. Various approaches have been proposed in the last ten years and we set out to evaluate them on a level playing field on modern hardware to answer the question, if dynamic memory managers are as slow as commonly thought of. In this survey paper, we provide a consistent framework to evaluate all publicly available memory managers in a large set of scenarios. We summarize each approach and thoroughly evaluate allocation performance (thread-based as well as warp-based), and look at performance scaling, fragmentation and real-world performance considering a synthetic workload as well as updating dynamic graphs. We discuss the strengths and weaknesses of each approach and provide guidelines for the respective best usage scenario. We provide a unified interface to integrate any of the tested memory managers into an application and switch between them for benchmarking purposes. Given our results, we can dispel some of the dread associated with dynamic memory managers on the {GPU}.},
	pages = {219--233},
	booktitle = {Proceedings of the 26th {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming},
	publisher = {Association for Computing Machinery},
	author = {Winter, Martin and Parger, Mathias and Mlakar, Daniel and Steinberger, Markus},
	urldate = {2025-02-27},
	date = {2021-02-17},
}

@article{bartlett_exhaustive_2024,
	title = {Exhaustive Symbolic Regression},
	volume = {28},
	issn = {1941-0026},
	url = {https://ieeexplore.ieee.org/abstract/document/10136815},
	doi = {10.1109/TEVC.2023.3280250},
	abstract = {Symbolic regression ({SR}) algorithms attempt to learn analytic expressions which fit data accurately and in a highly interpretable manner. Conventional {SR} suffers from two fundamental issues which we address here. First, these methods search the space stochastically (typically using genetic programming) and hence do not necessarily find the best function. Second, the criteria used to select the equation optimally balancing accuracy with simplicity have been variable and subjective. To address these issues we introduce exhaustive {SR} ({ESR}), which systematically and efficiently considers all possible equations—made with a given basis set of operators and up to a specified maximum complexity—and is therefore guaranteed to find the true optimum (if parameters are perfectly optimized) and a complete function ranking subject to these constraints. We implement the minimum description length principle as a rigorous method for combining these preferences into a single objective. To illustrate the power of {ESR} we apply it to a catalog of cosmic chronometers and the Pantheon+ sample of supernovae to learn the Hubble rate as a function of redshift, finding 40 functions (out of 5.2 million trial functions) that fit the data more economically than the Friedmann equation. These low-redshift data therefore do not uniquely prefer the expansion history of the standard model of cosmology. We make our code and full equation sets publicly available.},
	pages = {950--964},
	number = {4},
	journaltitle = {{IEEE} Transactions on Evolutionary Computation},
	author = {Bartlett, Deaglan J. and Desmond, Harry and Ferreira, Pedro G.},
	urldate = {2025-02-28},
	date = {2024-08},
	note = {Conference Name: {IEEE} Transactions on Evolutionary Computation},
	keywords = {Optimization, Complexity theory, Mathematical models, Biological system modeling, Cosmology data analysis, minimum description length, model selection, Numerical models, Search problems, Standards, symbolic regression ({SR})},
	file = {Eingereichte Version:C\:\\Users\\danwi\\Zotero\\storage\\Y6LFWDH2\\Bartlett et al. - 2024 - Exhaustive Symbolic Regression.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\2HU5A8RL\\10136815.html:text/html},
}

@inproceedings{dokken_gpu_2005,
	location = {New York, {NY}, {USA}},
	title = {The {GPU} as a high performance computational resource},
	isbn = {978-1-59593-204-4},
	url = {https://doi.org/10.1145/1090122.1090126},
	doi = {10.1145/1090122.1090126},
	series = {{SCCG} '05},
	abstract = {With the introduction in 2003 of standard {GPUs} with 32 bit floating point numbers and programmable Vertex and Fragment processors, the processing power of the {GPU} was made available to non-graphics applications. As the {GPU} is aimed at computer graphics, the concepts in {GPU}-programming are based on computer graphics terminology, and the strategies for programming have to be based on the architecture of the graphics pipeline. At {SINTEF} in Norway a 4-year strategic institute project (2004-2007) "Graphics hardware as a high-end computational resource", http://www.math.sintef.no/gpu/ aims at making {GPUs} available as a computational resource both to academia and industry. This paper addresses the challenges of {GPU}-programming and results of the project's first year.},
	pages = {21--26},
	booktitle = {Proceedings of the 21st Spring Conference on Computer Graphics},
	publisher = {Association for Computing Machinery},
	author = {Dokken, Tor and Hagen, Trond R. and Hjelmervik, Jon M.},
	urldate = {2025-03-01},
	date = {2005-05-12},
}

@inproceedings{huang_gpu_2008,
	title = {{GPU} as a General Purpose Computing Resource},
	url = {https://ieeexplore.ieee.org/abstract/document/4710975/references#references},
	doi = {10.1109/PDCAT.2008.38},
	abstract = {In the last few years, {GPUs}(Graphics Processing Units) have made rapid development. Their ever-increasing computing power and decreasing cost have attracted attention from both industry and academia. In addition to graphics applications, researchers are interested in using them for general purpose computing. Recently, {NVIDIA} released a new computing architecture, {CUDA} (compute united device architecture), for its {GeForce} 8 series, Quadro {FX}, and Tesla {GPU} products. This new architecture can change fundamentally the way in which {GPUs} are used. In this paper, we study the programmability of {CUDA} and its {GeForce} 8 {GPU} and compare its performance with general purpose processors, in order to investigate its suitability for general purpose computation.},
	eventtitle = {2008 Ninth International Conference on Parallel and Distributed Computing, Applications and Technologies},
	pages = {151--158},
	booktitle = {2008 Ninth International Conference on Parallel and Distributed Computing, Applications and Technologies},
	author = {Huang, Qihang and Huang, Zhiyi and Werstein, Paul and Purvis, Martin},
	urldate = {2025-03-01},
	date = {2008-12},
	note = {{ISSN}: 2379-5352},
	keywords = {Computer architecture, Application software, Central Processing Unit, Computer graphics, Distributed computing, Grid computing, Multicore processing, Pipelines, Programming profession, Rendering (computer graphics)},
	file = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\2FJP9K25\\references.html:text/html},
}

@article{han_hicuda_2011,
	title = {{hiCUDA}: High-Level {GPGPU} Programming},
	volume = {22},
	url = {https://ieeexplore.ieee.org/abstract/document/5445082},
	shorttitle = {{hiCUDA}},
	abstract = {Graphics Processing Units ({GPUs}) have become a competitive accelerator for applications outside the graphics domain, mainly driven by the improvements in {GPU} programmability. Although the Compute Unified Device Architecture ({CUDA}) is a simple C-like interface for programming {NVIDIA} {GPUs}, porting applications to {CUDA} remains a challenge to average programmers. In particular, {CUDA} places on the programmer the burden of packaging {GPU} code in separate functions, of explicitly managing data transfer between the host and {GPU} memories, and of manually optimizing the utilization of the {GPU} memory. Practical experience shows that the programmer needs to make significant code changes, often tedious and error-prone, before getting an optimized program. We have designed {hiCUDA}},
	pages = {78--90},
	number = {1},
	journaltitle = {{IEEE} Transactions on Parallel and Distributed Systems},
	author = {Han, Tianyi David and Abdelrahman, Tarek S.},
	urldate = {2025-03-01},
	date = {2011},
	note = {Conference Name: {IEEE} Transactions on Parallel and Distributed Systems},
	file = {IEEE Xplore Abstract Record:C\:\\Users\\danwi\\Zotero\\storage\\5K63T7RB\\5445082.html:text/html},
}

@article{verbraeck_interactive_2021,
	title = {Interactive Black-Hole Visualization},
	volume = {27},
	issn = {1941-0506},
	url = {https://ieeexplore.ieee.org/abstract/document/9226126},
	doi = {10.1109/TVCG.2020.3030452},
	abstract = {We present an efficient algorithm for visualizing the effect of black holes on its distant surroundings as seen from an observer nearby in orbit. Our solution is {GPU}-based and builds upon a two-step approach, where we first derive an adaptive grid to map the 360-view around the observer to the distorted celestial sky, which can be directly reused for different camera orientations. Using a grid, we can rapidly trace rays back to the observer through the distorted spacetime, avoiding the heavy workload of standard tracing solutions at real-time rates. By using a novel interpolation technique we can also simulate an observer path by smoothly transitioning between multiple grids. Our approach accepts real star catalogues and environment maps of the celestial sky and generates the resulting black-hole deformations in real time.},
	pages = {796--805},
	number = {2},
	journaltitle = {{IEEE} Transactions on Visualization and Computer Graphics},
	author = {Verbraeck, Annemieke and Eisemann, Elmar},
	urldate = {2025-03-02},
	date = {2021-02},
	note = {Conference Name: {IEEE} Transactions on Visualization and Computer Graphics},
	keywords = {Algorithms, Cameras, Computer Graphics Techniques, Distortion, Engineering, Mathematics, Observers, Physical \& Environmental Sciences, Ray tracing, Real-time systems, Rendering (computer graphics), Visualization},
	file = {PDF:C\:\\Users\\danwi\\Zotero\\storage\\HDASRGYN\\Verbraeck und Eisemann - 2021 - Interactive Black-Hole Visualization.pdf:application/pdf},
}

@book{hissbach_overview_2022,
	title = {An Overview of Techniques for Egocentric Black Hole Visualization and Their Suitability for Planetarium Applications},
	isbn = {978-3-03868-189-2},
	url = {https://doi.org/10.2312/vmv.20221207},
	abstract = {The visualization of black holes is used in science communication to educate people about our universe and concepts of general relativity. Recent visualizations aim to depict black holes in realtime, overcoming the challenge of efficient general relativistic ray tracing. In this state-of-the-art report, we provide the first overview of existing works about egocentric black hole visualization that generate images targeted at general audiences. We focus on Schwarzschild and Kerr black holes and discuss current methods to depict the distortion of background panoramas, point-shaped stars, nearby objects, and accretion disks. Approaches to realtime visualizations are highlighted. Furthermore, we present the implementation of a state-of-the-art black hole visualization in the planetarium software Uniview.},
	publisher = {The Eurographics Association},
	author = {Hissbach, Anny-Marleen and Dick, Christian and Lawonn, Kai},
	urldate = {2025-03-02},
	date = {2022},
	langid = {english},
	file = {Full Text PDF:C\:\\Users\\danwi\\Zotero\\storage\\TBBLEZ5N\\Hissbach et al. - 2022 - An Overview of Techniques for Egocentric Black Hole Visualization and Their Suitability for Planetar.pdf:application/pdf},
}