diff --git a/other/interpreter_sequence_diagram.drawio b/other/interpreter_sequence_diagram.drawio index 9ca2434..bc61ec8 100644 --- a/other/interpreter_sequence_diagram.drawio +++ b/other/interpreter_sequence_diagram.drawio @@ -1,169 +1,172 @@ [draw.io XML diff omitted: diagram markup not recoverable from this extraction]
diff --git a/other/transpiler_sequence_diagram.drawio b/other/transpiler_sequence_diagram.drawio index 9762fc3..f5c9656 100644 --- a/other/transpiler_sequence_diagram.drawio +++ b/other/transpiler_sequence_diagram.drawio @@ -1,165 +1,181 @@ [draw.io XML diff omitted: diagram markup not recoverable from this extraction]
diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index 9834a79..f10a56d 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -56,18 +56,19 @@ function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{V largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into matrix. Will be nr.
of rows in parameter matrix - ptxKernels = Vector{String}(undef, length(expressions)) + compiledKernels = Vector{CuFunction}(undef, length(expressions)) kernelName = "evaluate_gpu" @inbounds Threads.@threads for i in eachindex(expressions) ex = ExpressionProcessing.expr_to_postfix(expressions[i]) - ptxKernels[i] = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing + ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing + compiledKernels[i] = Transpiler.compile_kernel(ptxKernel, kernelName) end results = Matrix{Float32}(undef, numVariableSets, length(expressions)) for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl) # evaluate # results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p) - results = Transpiler.evaluate(ptxKernels, variables, numVariableSets, p, kernelName) + results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p, kernelName) end return results diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 46ab7ad..270b29f 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -25,7 +25,7 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVar kernelName = "evaluate_gpu" @inbounds Threads.@threads for i in eachindex(expressions) kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing - compiledKernel = CompileKernel(kernel, kernelName) + compiledKernel = compile_kernel(kernel, kernelName) cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) end @@ -37,7 +37,7 @@ end " A simplified version of the evaluate function. It takes a list of already transpiled kernels to be executed. This should yield better performance, where the same expressions should be evaluated multiple times i.e. for parameter optimisation. 
" -function evaluate(kernels::Vector{String}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}}, kernelName::String)::Matrix{Float32} +function evaluate(kernels::Vector{CuFunction}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}}, kernelName::String)::Matrix{Float32} cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info) @@ -48,14 +48,13 @@ function evaluate(kernels::Vector{String}, cudaVars::CuArray{Float32}, nrOfVaria blocks = cld(nrOfVariableSets, threads) @inbounds Threads.@threads for i in eachindex(kernels) - compiledKernel = CompileKernel(kernels[i], kernelName) - cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) end return cudaResults end -function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction +function compile_kernel(ptxKernel::String, kernelName::String)::CuFunction linker = CuLink() add_data!(linker, kernelName, ptxKernel) diff --git a/thesis/chapters/conceptdesign.tex b/thesis/chapters/conceptdesign.tex index 39e47ed..f8f7cce 100644 --- a/thesis/chapters/conceptdesign.tex +++ b/thesis/chapters/conceptdesign.tex @@ -54,6 +54,10 @@ Based on the requirements and data structure above, the architecture of both pro A design decision that has been made for both prototypes is to split the evaluation of each expression into a separate kernel or kernel dispatch as seen in Figure \ref{fig:kernel_architecture}. As explained in Section \ref{sec:thread_hierarchy}, it is desirable to reduce the occurrence of thread divergence as much as possible. Although the SIMT programming model tries to mitigate the negative effects of thread divergence, it is still a good idea to avoid it when possible. For this use-case, thread divergence can easily be avoided by not evaluating all expressions in a single kernel or kernel dispatch. GPUs are able to have multiple resident grids, with modern GPUs being able to accommodate 128 grids concurrently \parencite{nvidia_cuda_2025}. One grid corresponds to one kernel dispatch, and therefore allows up-to 128 kernels to be run concurrently. Therefore, dispatching a kernel for each expression, further increases GPU utilisation. In the case of the interpreter, having only one kernel that can be dispatched for each expression, also simplifies the kernel itself. This is because the kernel can focus on evaluating one expression and does not require additional code to handle multiple expressions at once. Similarly, the transpiler can also be simplified, as it can generate many smaller kernels rather than one big kernel. Additionally, the smaller kernels do not need any branching, because the generated code only needs to perform the operations as they occur in the expression itself. This also reduces the overhead on the GPU. One drawback of generating a kernel for each expression, is the generation itself. Especially for smaller variable sets, it is possible, that the time it takes to transpile an expression is greater than the time it takes to evaluate it. However, for larger variable sets this should not be a concern. 
+% +% TODO: Probably include a diagram that shows how the evaluators are integrated in the symbolic regression algorithm (assuming it's a GP variant), to show the bigger picture +% + \subsection{Pre-Processing} \label{sec:pre-processing} The first step in both prototypes is the pre-processing step. It is needed, as it simplifies working with the expressions in the later steps. One of the responsibilities of the pre-processor is to verify that only allowed operators and symbols are present in the given expressions. This is comparable to the work a scanner like Flex\footnote{\url{https://github.com/westes/flex}} performs. Secondly, this step also converts the expression into an intermediate representation. In essence, the pre-processing step can be compared to the frontend of a compiler as described in Section \ref{sec:compilers}. If new operators are required, the pre-processor must be extended as well. Otherwise, expressions containing these operators would be treated as invalid and never reach the evaluator. diff --git a/thesis/chapters/evaluation.tex b/thesis/chapters/evaluation.tex index dc6b3ad..be86d90 100644 --- a/thesis/chapters/evaluation.tex +++ b/thesis/chapters/evaluation.tex @@ -83,7 +83,7 @@ The first benchmark consisted of $250\,000$ expressions and $362$ variable sets For the kernel configuration, a block size of $128$ threads has been used. As will be explained below, this has been found to be the configuration that results in the best performance. During the benchmark, the utilisation of both the CPU and GPU was roughly $100\%$. \subsubsection{Benchmark 2} -With $10\,000$ expressions, $362$ variable sets and $100$ parameter optimisation steps, the total number of evaluations per sample was $362\,\textit{million}$. The median across all samples is $21.3$ seconds with a standard deviation of $0.75$ seconds. Compared to benchmark 1, there were $25$ times fewer evaluations which also resulted in a reduction of the median and standard deviation of roughly $25$ times. Since the number of variable sets did not change, the block size for this benchmark remained at $128$ threads. Again the utilisation of the CPU and GPU during the benchmark was roughly $100\%$. +With $10\,000$ expressions, $362$ variable sets and $100$ parameter optimisation steps, the total number of evaluations per sample was $362\,\textit{million}$. The median across all samples is $21.3$ seconds with a standard deviation of $0.75$ seconds. Compared to benchmark 1, there were $25$ times fewer evaluations, which also resulted in a reduction of the median and standard deviation by roughly a factor of $25$. This indicates a roughly linear correlation between the number of expressions and the runtime. Since the number of variable sets did not change, the block size for this benchmark remained at $128$ threads. Again, the utilisation of the CPU and GPU during the benchmark was roughly $100\%$. \begin{figure} \centering \includegraphics[width=.9\textwidth]{results/gpu-interpreter-final-performance-benchmark2.png} @@ -100,17 +100,71 @@ The third benchmark used the same $10\,000$ expressions and $100$ parameter opti \label{fig:gpu_i_benchmark_3} \end{figure} -Although the number of variable sets has been increased by 30 times, the block size remained at 128 threads. Unlike the previous benchmarks, the hardware utilisation was different. Now only the GPU was utilised to 100\% while the CPU utilisation started at 100\% and slowly dropped to 80\%.
The GPU needs to perform 30 times more evaluations, meaning it takes longer for one kernel dispatch to be finished. At the same time, the CPU tries to dispatch the kernel at the same rate as before. Because only a certain amount of kernels can be dispatched at once, the CPU needs to wait for the GPU to finish a kernel before another one can be dispatched again. Therefore, in this scenario, the evaluator runs into a GPU-bottleneck and using a GPU with more performance, would consequently improve the runtime in this scenario. In the benchmarks before, both the CPU and GPU would need to be upgraded, to achieve better performance. +Although the number of variable sets has been increased by 30 times, the block size remained at 128 threads. Unlike the previous benchmarks, the hardware utilisation was different. Now only the GPU was utilised to 100\% while the CPU utilisation started at 100\% and slowly dropped to 80\%. The GPU needs to perform 30 times more evaluations, meaning it takes longer for one kernel dispatch to be finished. At the same time, the CPU tries to dispatch the kernels at the same rate as before. Because only a certain number of kernels can be dispatched at once, the CPU needs to wait for the GPU to finish a kernel before another one can be dispatched. Therefore, the evaluator runs into a GPU bottleneck in this scenario, and using a more performant GPU would consequently improve the runtime. In the previous benchmarks, both the CPU and the GPU would need to be upgraded to achieve better performance. -blocksize 128: 84.84 blocks fast (prolly because less wasted threads) -bocksize 192: 56.56 blocks very slow -\subsubsection{Performance Tuning} % either subsubSection or change the title to "Performance Tuning Interpreter" -Document the process of performance tuning (mostly GPU, but also talk about CPU. Especially the re-aranging of data transfer and non usage of a cache) +\subsection{Performance Tuning Interpreter} % either subsubSection or change the title to "Performance Tuning Interpreter" +% Document the process of performance tuning (mostly GPU, but also talk about CPU. Especially the re-aranging of data transfer and non usage of a cache) + +% Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded +Optimising and tuning the interpreter is crucial for achieving good performance, especially tuning the kernel, as a wrongly configured kernel can drastically degrade performance. Before any performance tuning and optimisation was performed, the kernel was configured with a block size of 256 threads. Additionally, on the CPU, the frontend was executed for each expression before every kernel dispatch, even in parameter optimisation scenarios, where the expressions did not change from one dispatch to the next. Moreover, the variables were also transmitted to the GPU before every dispatch. However, executing the frontend as well as dispatching the kernels was multithreaded, utilising all 12 threads of the CPU, and a cache for the frontend was used. + +With this implementation, the initial performance measurements were conducted for benchmark 1, which served as the baseline for further performance optimisations. However, as already mentioned, memory limitations were encountered during this benchmark, as too much RAM was being used. Therefore, caching had to be disabled.
Because the evaluator is multithreaded, this change also resulted in significantly better performance: the cache introduced critical sections where race conditions could occur, so locking mechanisms had to be used. While locking ensures that no race conditions occur, it also means that parts of an otherwise entirely parallel implementation are serialised, reducing the effect of parallelisation. + +Without a cache and utilising all 12 threads, the frontend achieved very good performance. Processing $250\,000$ expressions takes roughly $88.5$ milliseconds. On the other hand, using a cache resulted in the frontend running for $6.9$ \textit{seconds}. This equates to a speed-up of roughly 78 times when using no cache. Additionally, compared to the results above, the time it takes to execute the frontend is negligible, meaning that further optimising the frontend would not significantly improve the overall runtime. + +All optimisations have been performed with the same set of expressions and variables as benchmark one. This decision was made because such high data volumes are more likely to expose potential problems. Before conducting benchmarks two and three, additional performance tuning has been performed to ensure that these benchmarks also utilise the hardware as much as possible. + +\subsubsection{Optimisation 1} + +After caching had been disabled, the first performance improvement was to drastically reduce the number of calls to the frontend and the number of data transfers to the GPU. Because the expressions and variables never change during the parameter optimisation process, processing the expressions and transmitting the data to the GPU on every step wastes resources. Therefore, the expressions are sent to the frontend once before the parameter optimisation process. Afterwards, the processed expressions as well as the variables are transferred to the GPU exactly once for this execution of the interpreter. + +Figure \ref{fig:gpu_i_optimisation_1} shows how this optimisation improved the overall performance. However, it can also be seen that the range the individual samples fall within is much greater now. While this optimisation improved the performance in all cases, in some cases the difference between the initial and the optimised version is very small, at roughly a two-second improvement. + +\begin{figure} + \centering + \includegraphics[width=.9\textwidth]{results/interpreter-comparison-initial-optim1.png} + \caption{Comparison of the initial implementation with the first optimisation. Note that while the results of the optimisation have a much wider range, all samples performed better than the initial implementation.} + \label{fig:gpu_i_optimisation_1} +\end{figure} + +\subsubsection{Optimisation 2} + +The second optimisation was concerned with tuning the kernel configuration. Using NSight Compute\footnote{\url{https://developer.nvidia.com/nsight-compute}}, it was possible to profile the kernel with different configurations. During profiling, a large number of metrics was gathered that allowed the kernel executions to be analysed in depth, with the application highlighting aspects that had significant potential for performance improvements. + +Since the evaluator is designed to execute many kernel dispatches in parallel, it was important to reduce the kernel runtime as much as possible. Reducing the runtime per kernel has a knock-on effect, as the following kernel dispatches can begin execution sooner, reducing the overall runtime.
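To make the block-size reasoning in this subsection easier to follow, the snippet below is a small illustrative helper (not part of the package or thesis listings) that counts the excess threads produced by a candidate block size, assuming one thread per variable set. The values correspond to the figures discussed below.

\begin{JuliaCode}
# Illustrative helper (not from the thesis code): excess threads for a given block size,
# assuming one thread is started per variable set.
excess_threads(nsets, blocksize) = cld(nsets, blocksize) * blocksize - nsets

excess_threads(362, 128)     # 3 blocks * 128 threads = 384 -> 22 excess threads
excess_threads(362, 121)     # 3 blocks * 121 threads = 363 -> 1 excess thread
excess_threads(10_860, 128)  # 85 blocks * 128 threads = 10880 -> 20 excess threads
\end{JuliaCode}

Since threads are scheduled in warps of 32, only block sizes that are multiples of 32 are worth profiling; among those, NSight Compute was used to pick the fastest configuration.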
+ +After the evaluator tuning had been concluded, it was found that a block size of $128$ yielded the best results. With this kernel configuration, another performance measurement was conducted, with the results shown in Figure \ref{fig:gpu_i_optimisation_2}. As can be seen, the overall runtime was again noticeably shorter. However, the standard deviation also drastically increased, with the duration from the fastest to the slowest sample differing by roughly 60 seconds. + +\begin{figure} + \centering + \includegraphics[width=.9\textwidth]{results/interpreter-comparison-optim1-optim2.png} + \caption{Comparison of the first optimisation with the second.} + \label{fig:gpu_i_optimisation_2} +\end{figure} + +The block size of $128$ might seem odd. However, it makes sense, as at least $362$ threads need to be started in total to evaluate one expression. If one block contains $128$ threads, a total of $\lceil 362 / 128 \rceil = 3$ blocks need to be started, totalling $384$ threads. As a result, only $384 - 362 = 22$ threads are excess threads. When choosing a block size of $121$, three blocks could also be started, resulting in only one excess thread. However, there is no performance difference between a block size of $121$ and $128$. Since all threads are executed inside a warp, which consists of exactly $32$ threads, a block size that is not divisible by $32$ has no benefit and only hides the true number of excess threads started. +% TODO Include screenshots from nsight compute + +Benchmark three had a total of $10\,860$ variable sets, meaning at least this number of threads must be started. To ensure optimal hardware utilisation, the evaluator had to undergo another tuning process. As seen above, it is beneficial to start as few excess threads as possible. Utilising NSight Compute, a performance measurement with a block size of $128$ served as the initial configuration. This already performed well, as again very few excess threads are started. In total, $10\,860 / 128 \approx 84.84$ blocks are needed, which must be rounded up to $85$ blocks, with the last block being filled to roughly $84\%$, which equates to $20$ excess threads being started. +% TODO: Include nsight compute screenshots +% TODO: also here include that finding the smallest common divisor that is divisible by 32 is a great starting point for performance tuning. then just use nsight compute to experiment with different configurations to find the best solution + + +%Describe the theory behind these two block sizes (more excess threads but much fewer blocks -> more of the evaluations can be performed simultanously [38 or so SMs available], found that less excess threads is much more important) +%blocksize 128: 84.84 blocks fast (prolly because less wasted threads) +%bocksize 192: 56.56 blocks very slow + + +\subsubsection{Optimisation 3} + +\begin{figure} + \centering + \includegraphics[width=.9\textwidth]{results/interpreter-comparison-optim2-optim3.png} + \caption{Comparison of the second optimisation with the third.} + \label{fig:gpu_i_optimisation_3} +\end{figure} -Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded -1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) 2.) tuned blocksize to have as little wasted threads as possible (new blocksize 121 -> 3-blocks -> 363 threads but 362 threads needed per expression) (128 should lead to the same results.
Talk here a bit what to look out for, so block-size should be a multiple of 32 and should divide the nr. of varsets as best as possible to a whole number without going over) 3.) Minor optimisations. Reduced stacksize; reduced memory allocations on the CPU; reduced GC pressure diff --git a/thesis/chapters/implementation.tex b/thesis/chapters/implementation.tex index c67b997..022190f 100644 --- a/thesis/chapters/implementation.tex +++ b/thesis/chapters/implementation.tex @@ -118,11 +118,9 @@ An overview of how these components interact with each other is outlined in Figu The interpreter is given all the expressions it needs to interpret as an input. Additionally, it needs the variable matrix as well as the parameters for each expression. All expressions are passed to the interpreter as an array of Expr objects, as they are needed for the pre-processing step or the frontend. The first loop, as shown in Figure \ref{fig:interpreter-sequence}, is responsible for sending the expressions to the frontend to be converted into the intermediate representation. After this step, the expressions are in the correct format to be sent to the GPU and the interpretation process can continue. \subsubsection{Data Transfer} -Before the GPU can start with the interpretation, the data needs to be sent to the GPU. Because the variables are already in matrix form, transferring the data is fairly straightforward. Memory must be allocated in the global memory of the GPU and then be copied from RAM into the allocated memory. Allocating memory and transferring the data to the GPU is handled implicitly by the CuArray type provided by CUDA.jl. +Before the GPU can start with the interpretation, the data needs to be present on it. Because the variables are already in matrix form, transferring the data is fairly straightforward. Memory must be allocated in the global memory of the GPU and then be copied from RAM into the allocated memory. Allocating memory and transferring the data to the GPU is handled implicitly by the CuArray type provided by CUDA.jl. -To optimise the interpreter for parameter optimisation workloads, this step is actually performed before the interpreter is called. Although, the diagram includes this transmission for completeness, it is important to note that the variables never change, as they represent the observed inputs of the system that being modelled by the symbolic regression algorithm. Therefore, re-transmitting the variables for each step of the parameter optimisation process would be inefficient. By transmitting the variables once and reusing them throughout the parameter optimisation, significant time can be saved. - -Furthermore, transferring the data to the GPU before the symbolic regression algorithm begins, could save even more time. However, this approach would require modification to the symbolic regression algorithm. Therefore, the decision has been made to neglect this optimisation. Nonetheless, it is still possible to modify the implementation at a later stage with minimal effort, if needed. +To optimise the interpreter for parameter optimisation workloads, this step is performed before the interpreter is called. Although the diagram includes this transmission for completeness, it is important to note that the variables never change, as they represent the observed inputs of the system that is being modelled by the symbolic regression algorithm. As a symbolic regression algorithm is usually implemented with GP, there are many generations that need to be evaluated.
Therefore, re-transmitting the variables for each generation is inefficient. By transmitting the variables once before the symbolic regression algorithm begins, additional performance gains are very likely. However, this approach would require modifying the symbolic regression algorithm, which is the reason this optimisation has not been applied. Nonetheless, if needed, it is still possible to modify the implementation at a later stage with minimal effort. Once the variables are transmitted, the parameters also must be transferred to the GPU. Unlike the variables, the parameters are stored as a vector of vectors. In order to transmit the parameters efficiently, they also need to be put in matrix form. The matrix needs to be of the form $k \times N$, where $k$ is equal to the length of the longest inner vector and $N$ is equal to the length of the outer vector. This ensures that all values can be stored in the matrix. It also means that if the inner vectors are of different lengths, some extra unnecessary values will be transmitted, but the overall benefit of treating them as a matrix outweighs this drawback. Program \ref{code:julia_vec-to-mat} shows how this conversion can be implemented. Note that it is required to provide an invalid element. This ensures defined behaviour and helps with finding errors in the code. After the parameters have been brought into matrix form, they can be transferred to the GPU the same way the variables are transferred. @@ -146,11 +144,11 @@ end \end{JuliaCode} \label{code:julia_vec-to-mat} \end{program} -Similar to the parameters, the expressions are also stored as a vector of vectors. The outer vector contains each expression, while the inner vectors hold the expressions in their intermediate representation. Therefore, this vector of vectors also needs to be brought into matrix form the same way the parameters are brought into matrix form. To simplify development, the special opcode \textit{stop} has been introduced, which is used for the invalidElement in Program \ref{code:julia_vec-to-mat}. As seen in Section \ref{sec:interpreter-gpu-side}, this element is used to determine if the end of an expression has been reached during the interpretation process. This removes the need for additional data to be sent which stores the length of each expression to determine if the entire expression has been interpreted or not. Therefore, a lot of overhead can be reduced. +Similar to the parameters, the expressions are also stored as a vector of vectors. The outer vector contains each expression, while the inner vectors hold the expressions in their intermediate representation. Therefore, this vector of vectors also needs to be brought into matrix form following the same concept as the parameters. To simplify development, the special opcode \textit{stop} has been introduced, which is used for the invalidElement in Program \ref{code:julia_vec-to-mat}. As seen in Section \ref{sec:interpreter-gpu-side}, this element is used to determine if the end of an expression has been reached during the interpretation process. This removes the need to send additional data storing the length of each expression, which would otherwise be needed to determine whether the entire expression has been interpreted. Therefore, a lot of overhead can be reduced. -Once the conversion into matrix form has been performed, the expressions are transferred to the GPU. Just like with the variables, the expressions remain the same over the course of the parameter optimisation part.
Therefore, they are transferred to the GPU before the interpreter is called, to reduce the amount of unnecessary data transfer. +Once the conversion into matrix form has been performed, the expressions are transferred to the GPU. Just like with the variables, the expressions remain the same over the course of the parameter optimisation part. This is why they are transferred to the GPU before the interpreter is called, reducing the number of unnecessary data transfers. -In addition to the already described data that needs to be sent, two more steps are required that have not been included in the Sequence Diagram \ref{fig:interpreter-sequence}. The first one is the allocation of global memory for the result matrix. Without this, the kernel would not know where to store the interpretation results and the CPU would not know from which memory location to read the results from. Therefore, enough global memory needs to be allocated beforehand so that the results can be stored and retrieved after all kernel executions have finished. +Only raw data can be sent to the GPU, which means that meta information about the data layout is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to let the kernel know the dimensions of the expressions, variables and parameters. Otherwise, the kernel does not know, for example, at which memory location the second variable set is stored, as it does not know how large a single set is. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data, begins. However, the GPU has no knowledge of this and therefore the meta information must be transferred separately to ensure that the data is accessed correctly. \begin{figure} \centering @@ -159,14 +157,15 @@ In addition to the already described data that needs to be sent, two more steps \label{fig:memory-layout-data} \end{figure} -Only raw data can be sent to the GPU, which means that information about the data is missing. The matrices are represented as flat arrays, which means they have lost their column and row information. This information must be sent separately to let the kernel know the dimensions of the expressions, variables and parameters. Otherwise, the kernel does not know at which memory location the second variable set is stored, as it does not know how large a single set is for example. Figure \ref{fig:memory-layout-data} shows how the data is stored without any information about the rows or columns of the matrices. The thick lines help to identify where a new column, and therefore a new set of data begins. However, the GPU has no knowledge of this and therefore the additional information must be transferred to ensure that the data is accessed correctly. +In addition to the already described data that needs to be sent, one more step is required that has not been included in the Sequence Diagram \ref{fig:interpreter-sequence}. Global memory must be allocated in which the results of the evaluation can be stored. Without this, the kernel would not know where to store the interpretation results and the CPU would not know from which memory location to read the results from. Therefore, enough global memory needs to be allocated beforehand so that the results can be stored and retrieved after all kernel executions have finished.
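The paragraphs above reference Program \ref{code:julia_vec-to-mat} for converting a vector of vectors into a $k \times N$ matrix with an invalid element; that listing is not reproduced in this diff, so the following is only a hedged re-sketch of the idea, with a hypothetical function name (to_padded_matrix) and made-up example data.

\begin{JuliaCode}
using CUDA

# Hypothetical re-sketch of the vector-of-vectors to matrix conversion described above.
# The real listing (Program code:julia_vec-to-mat) may differ in naming and details.
function to_padded_matrix(vecs::Vector{Vector{Float32}}, invalidElement::Float32)
    k = maximum(length, vecs)        # rows: length of the longest inner vector
    N = length(vecs)                 # columns: one per inner vector
    mat = fill(invalidElement, k, N) # unused slots keep the invalid element
    for (col, v) in enumerate(vecs)
        mat[1:length(v), col] .= v
    end
    return mat
end

parameters = [Float32[1.0, 2.0], Float32[3.0, 4.0, 5.0]]  # made-up example data
cudaParams = CuArray(to_padded_matrix(parameters, NaN32)) # implicit allocation and copy to the GPU
\end{JuliaCode}

Using NaN32 as the invalid element matches the call to Utils.create_cuda_array(parameters, NaN32) visible in the Transpiler.jl hunk near the top of this diff.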
+ \subsubsection{Kernel Dispatch} Once all the data is present on the GPU, the CPU can dispatch the kernel for each expression. This dispatch requires parameters that specify the number of threads and their organisation into thread blocks. In total, one thread is required for each variable set and therefore the grouping into thread blocks is the primary variable. Taking into account the constraints explained in Section \ref{sec:occupancy}, this grouping needs to be tuned for optimal performance. The specific values alongside the methodology for determining these values will be explained in Chapter \ref{cha:evaluation}. In addition, the dispatch parameters also include the pointers to the location of the data allocated and transferred above, as well as the index of the expression to be interpreted. Since all expressions and parameters are sent to the GPU at once, this index ensures that the kernel knows where in memory to find the expression it needs to interpret and which parameter set it needs to use. After the kernel has finished, the result matrix needs to be read from the GPU and passed back to the symbolic regression algorithm. -Crucially, dispatching a kernel is an asynchronous operation, which means that the CPU does not wait for the kernel to finish before continuing. This allows the CPU to dispatch all kernels at once, rather than one at a time. As explained in Section \ref{sec:architecture}, a GPU can have multiple resident grids, meaning that the dispatched kernels can run concurrently, drastically reducing evaluation times. Only once the result matrix is read from the GPU does the CPU have to wait for all kernels to finish execution. +Crucially, dispatching a kernel is an asynchronous operation, which means that the CPU does not wait for the kernel to finish before continuing. This allows the CPU to dispatch all kernels at once, rather than one at a time. As explained in Section \ref{sec:architecture}, a GPU can have multiple resident grids, meaning that the dispatched kernels can run concurrently, drastically reducing evaluation time. Only once the result matrix is read from the GPU does the CPU have to wait for all kernels to finish execution. \subsection{GPU Side} \label{sec:interpreter-gpu-side} @@ -209,15 +208,15 @@ An overview of how the transpiler interacts with the frontend and GPU is outline \end{figure} \subsection{CPU Side} -After the transpiler has received the expressions to be transpiled, it first sends them to the frontend for processing. Once they have been processed, the expressions are sent to the transpiler backend which is explained in more detail Section \ref{sec:transpiler-backend}. The backend is responsible for generating the kernels. The output of the backend are the kernels written as PTX code for all expressions. +After the transpiler has received the expressions to be transpiled, it first sends them to the frontend for processing. Once an expression has been processed, it is sent to the transpiler backend, which is explained in more detail in Section \ref{sec:transpiler-backend}. The backend is responsible for generating the kernels. When finished, each expression has been transpiled into its own kernel, written in PTX code. \subsubsection{Data Transfer} -Data is sent to the GPU in the same way as it is sent by the interpreter. The variables are sent as they are, while the parameters are again brought into matrix form. Memory must also be allocated for the result matrix.
Unlike the interpreter however, this is the only data that needs to be sent to the GPU for the transpiler. +Data is sent to the GPU in the same way as in the interpreter. The variables are sent as they are, while the parameters are again brought into matrix form. Memory must also be allocated for the result matrix. Unlike the interpreter, however, only the variables and parameters need to be sent to the GPU. The variables are again sent before the parameter optimisation step to reduce the number of data transfers. Because each expression has its own kernel, there is no need to transfer the expressions themselves. Moreover, there is also no need to send information about the layout of the variables and parameters to the GPU. The reason for this is explained in the transpiler backend section below. \subsubsection{Kernel Dispatch} -Once all the data is present on the GPU, the transpiled kernels can be dispatched. Dispatching the transpiled kernels is more involved than dispatching the interpreter kernel. Program \ref{code:julia_dispatch-comparison} shows the difference between dispatching the interpreter kernel and the transpiled kernels. An important note, is that the transpiled kernels must be manually compiled into machine code. To achieve this, CUDA.jl provides functionality to instruct the drivers to compile the PTX code. The same process of creating PTX code and compiling it must also be done for the interpreter kernel, however, this is done by CUDA.jl automatically when calling the @cuda macro in line 6. +Once all the data is present on the GPU, the transpiled kernels can be dispatched. Dispatching the transpiled kernels is more involved than dispatching the interpreter kernel. Program \ref{code:julia_dispatch-comparison} shows the difference between dispatching the interpreter kernel and the transpiled kernels. An important note is that the transpiled kernels must be manually compiled into machine code. To achieve this, CUDA.jl provides functionality to instruct the driver to compile the PTX code. The same process of creating PTX code and compiling it must also be done for the interpreter kernel; however, this is done by CUDA.jl automatically when calling the @cuda macro in line 6. \begin{program} \begin{JuliaCode} @@ -250,7 +249,9 @@ end \end{JuliaCode} \label{code:julia_dispatch-comparison} \end{program} -After all kernels have been dispatched, the CPU waits for the kernels to complete their execution. When the kernels have finished, the result matrix is read from global memory into system memory. The results can then be returned to the symbolic regression algorithm. +Similar to the interpreter, the frontend and backend are executed before the parameter optimisation step to improve the runtime. Each kernel is compiled into machine code after it has been generated to ensure that as little work as possible needs to be done during the parameter optimisation loop. However, as will be explained in Chapter \ref{cha:evaluation}, storing the compiled kernels is very memory intensive. This means that if many expressions need to be evaluated at once, a lot of memory will be required. + +After all kernels have been dispatched, the CPU waits for the kernels to complete their execution. Once the kernels have finished, the result matrix is read from global memory into system memory. The results can then be returned to the symbolic regression algorithm.
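To tie the transpiler's CPU side together, the sketch below shows how the pre-compiled kernels might be dispatched and the results read back. It mirrors the evaluate function from the Transpiler.jl hunk near the top of this diff, but the function name and the fixed block size of 128 threads are assumptions for illustration only.

\begin{JuliaCode}
using CUDA

# Hypothetical sketch mirroring the evaluate function shown in the Transpiler.jl hunk:
# one pre-compiled kernel per expression, dispatched asynchronously, results read back once.
function evaluate_sketch(kernels::Vector{CuFunction}, cudaVars::CuArray{Float32},
                         cudaParams::CuArray{Float32}, nrOfVariableSets::Integer)
    cudaResults = CUDA.zeros(Float32, nrOfVariableSets, length(kernels))
    threads = 128
    blocks  = cld(nrOfVariableSets, threads)

    for kernel in kernels
        # cudacall is asynchronous: the CPU queues the dispatch and continues immediately
        cudacall(kernel, (CuPtr{Float32}, CuPtr{Float32}, CuPtr{Float32}),
                 cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
    end

    # Copying the result matrix to system memory forces the CPU to wait for all kernels.
    return Array(cudaResults)
end
\end{JuliaCode}

Only the final copy back to system memory synchronises with the GPU, which matches the description above of the CPU waiting for all kernels only when the result matrix is read.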
\subsection{Transpiler Backend} \label{sec:transpiler-backend} diff --git a/thesis/images/interpreter_sequence_diagram.png b/thesis/images/interpreter_sequence_diagram.png index 48b015f..e7347b0 100644 Binary files a/thesis/images/interpreter_sequence_diagram.png and b/thesis/images/interpreter_sequence_diagram.png differ diff --git a/thesis/images/results/interpreter-comparison-initial-optim1.png b/thesis/images/results/interpreter-comparison-initial-optim1.png new file mode 100644 index 0000000..79515d5 Binary files /dev/null and b/thesis/images/results/interpreter-comparison-initial-optim1.png differ diff --git a/thesis/images/results/interpreter-comparison-optim1-optim2.png b/thesis/images/results/interpreter-comparison-optim1-optim2.png new file mode 100644 index 0000000..94abf9c Binary files /dev/null and b/thesis/images/results/interpreter-comparison-optim1-optim2.png differ diff --git a/thesis/images/results/interpreter-comparison-optim2-optim3.png b/thesis/images/results/interpreter-comparison-optim2-optim3.png new file mode 100644 index 0000000..986f78b Binary files /dev/null and b/thesis/images/results/interpreter-comparison-optim2-optim3.png differ diff --git a/thesis/images/transpiler_sequence_diagram.png b/thesis/images/transpiler_sequence_diagram.png index 4b288f4..7decfd3 100644 Binary files a/thesis/images/transpiler_sequence_diagram.png and b/thesis/images/transpiler_sequence_diagram.png differ diff --git a/thesis/main.pdf b/thesis/main.pdf index 979b960..564c5dc 100644 Binary files a/thesis/main.pdf and b/thesis/main.pdf differ