implementation: Finished chapter. Checking for errors and textual improvements is not yet finished
commit 6b7205e026
parent b69a3efe96
@@ -56,6 +56,8 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
	formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i], cacheFrontend)
	kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableCols, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing
+
+	# println(kernel)

	linker = CuLink()
	add_data!(linker, "ExpressionProcessing", kernel)
@@ -93,7 +95,7 @@ end
"
function transpile(expression::ExpressionProcessing.PostfixType, varSetSize::Integer, paramSetSize::Integer,
	nrOfVariableSets::Integer, expressionIndex::Integer)::String
-	exitJumpLocationMarker = "\$L__BB0_2"
+	exitJumpLocationMarker = "L__BB0_2"
	ptxBuffer = IOBuffer()
	regManager = Utils.RegisterManager(Dict(), Dict())
@@ -237,9 +239,6 @@ function generate_calculation_code(expression::ExpressionProcessing.PostfixType,
	end

	tempReg = Utils.get_next_free_register(regManager, "rd")
-	# reg = pop!(operands)
-	# tmp = "abs.f32 $(reg), 16.0;"
-	# push!(operands, reg)
	println(codeBuffer, "
		add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
		mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
@@ -286,7 +285,7 @@ function get_operation(operator::Operator, regManager::Utils.RegisterManager, le
	resultCode = ""

	if is_binary_operator(operator) && isnothing(right)
-		throw(ArgumentError("Given operator '$operator' is a binary operator. However only one operator has been given."))
+		throw(ArgumentError("Given operator '$operator' is a binary operator. However only one operand has been given."))
	end

	if operator == ADD
@@ -79,7 +79,7 @@ function get_register_for_name(manager::RegisterManager, varName::String)
	if haskey(manager.symtable, varName)
		return (manager.symtable[varName], false)
	else
-		reg = get_next_free_register(manager, "var")
+		reg = get_next_free_register(manager, "f")
		manager.symtable[varName] = reg
		return (reg, true)
	end
@@ -41,6 +41,17 @@ parameters[2][1] = 5.0
parameters[2][2] = 0.0
parameters[3][1] = 16.0

+@testset "TEMP" begin
+	return
+	exprs = [:(x1 + p1)]
+	vars = Matrix{Float32}(undef, 1, 1)
+	params = Vector{Vector{Float32}}(undef, 1)
+
+	vars[1, 1] = 1
+	params[1] = [1]
+	Transpiler.evaluate(exprs, vars, params)
+end
+
@testset "Test transpiler evaluation" begin
	results = Transpiler.evaluate(expressions, variables, parameters)
@@ -12,7 +12,7 @@ include(joinpath(baseFolder, "src", "Transpiler.jl"))
@testset "Functionality tests" begin
	# include("ExpressionProcessingTests.jl")
	# include("InterpreterTests.jl")
-	# include("TranspilerTests.jl")
+	include("TranspilerTests.jl")
end
@@ -22,5 +22,5 @@ end

@testset "Performance tests" begin
	# include("PerformanceTuning.jl")
-	include("PerformanceTests.jl")
+	# include("PerformanceTests.jl")
end
@@ -178,7 +178,7 @@ Moreover, the global thread ID ensures that excess threads do not perform any wo

Afterwards, the stack for the interpretation can be created. It is possible to dynamically allocate memory on the GPU, which enables a programming model similar to that of the CPU. \textcite{winter_are_2021} have even compared many dynamic memory managers and found that their performance impact is rather small. However, if static allocation is easily possible, it still offers better performance. In the case of this thesis, static allocation is easily possible, which is why the stack has been chosen to have a static size. Because it is known that expressions do not exceed 50 tokens, including the operators, the stack size has been set to 25, which should be more than enough to hold the values and partial results, even in the worst case. A minimal sketch of such a statically sized stack follows below.
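To illustrate, such a statically sized stack in a CUDA.jl kernel might look as follows; the kernel name and the use of StaticArrays are assumptions for this illustration, not the interpreter's actual implementation:
\begin{JuliaCode}
using CUDA, StaticArrays

# Sketch: a statically sized stack (here 25 slots) inside a GPU kernel.
# No dynamic allocation is required; the MVector can live in registers.
function stack_demo_kernel(results)
    stack = MVector{25, Float32}(undef)
    top = 0
    top += 1; @inbounds stack[top] = 2.0f0   # push
    top += 1; @inbounds stack[top] = 3.0f0   # push
    b = @inbounds stack[top]; top -= 1       # pop
    a = @inbounds stack[top]; top -= 1       # pop
    @inbounds results[1] = a + b             # store the final result
    return
end

results = CUDA.zeros(Float32, 1)
@cuda threads=1 stack_demo_kernel(results)
\end{JuliaCode}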

-\subsubsection{Main Loop} % MAYBE
+\subsubsection{Main Loop}
Once everything is initialised, the main interpreter loop starts interpreting the expression. Because of the intermediate representation, the loop simply iterates through the expression from left to right. On each iteration, the type of the current token is checked to decide which operation to perform.

If the current token type matches the \textit{stop} opcode, the interpreter knows that it is finished. This simplicity is the reason why this opcode was introduced, as explained above.
@@ -197,7 +197,6 @@ Support for ternary operators could also be easily added. An example of a ternar
Once the interpreter loop has finished, the result of the evaluation must be stored in the result matrix. Using the index of the current expression, as well as the index of the current variable set (the global thread ID), it is possible to calculate the index at which the result must be stored. The last value on the stack is the result, which is stored in the result matrix at the calculated location, as illustrated below.
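As a small illustration (with assumed toy values), the index calculation follows the layout index = expressionIndex * nrOfVariableSets + variableSetIndex used by the generated PTX code later in this chapter:
\begin{JuliaCode}
# Toy values; indices are zero-based as in the generated PTX code.
nrOfVariableSets = 2
exprIndex = 1        # second expression
varSetIndex = 0      # first variable set (the global thread ID)

resultIndex = exprIndex * nrOfVariableSets + varSetIndex  # = 2
results = zeros(Float32, 4)
results[resultIndex + 1] = 42.0f0  # +1 because Julia arrays are 1-based
\end{JuliaCode}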
-
\section{Transpiler}
% Talk about how the transpiler has been developed (probably largest section, because it just has more interesting parts); CPU-side part will be much larger than GPU side
Unlike the interpreter, the transpiler primarily operates on the CPU, with only a minor GPU-based component. This is because the transpiler must generate entire PTX kernels from Julia expressions, rather than simply executing a pre-written kernel like the interpreter. Similar to the interpreter, the CPU side of the transpiler manages communication with both the GPU and the symbolic regression algorithm. This section provides a detailed overview of the transpiler's functionality.

An overview of how the transpiler interacts with the frontend and GPU is outlined in Figure \ref{fig:transpiler-sequence}. The parts of this figure are explained in detail in the following sections.
@@ -210,7 +209,6 @@ An overview of how the transpiler interacts with the frontend and GPU is outline
\end{figure}

\subsection{CPU Side}
-% TODO: Finish on Saturday
After the transpiler has received the expressions to be transpiled, it first sends them to the frontend for processing. Once they have been processed, the expressions are sent to the transpiler backend, which is explained in more detail in Section \ref{sec:transpiler-backend}. The backend is responsible for generating the kernels. The output of the backend is the kernels for all expressions, written as PTX code.

\subsubsection{Data Transfer}
@@ -219,8 +217,6 @@ Data is sent to the GPU in the same way as it is sent by the interpreter. The va
Because each expression has its own kernel, there is no need to transfer the expressions themselves. Moreover, there is also no need to send information about the layout of the variables and parameters to the GPU. The reason for this is explained in the transpiler backend section below.

\subsubsection{Kernel Dispatch}
-% similar to interpreter dispatch with tuning etc.
-% mention that CUDA.jl is used to instruct the driver to compile the kernel for the specific hardware
Once all the data is present on the GPU, the transpiled kernels can be dispatched. Dispatching the transpiled kernels is more involved than dispatching the interpreter kernel. Program \ref{code:julia_dispatch-comparison} shows the difference between dispatching the interpreter kernel and the transpiled kernels. An important note is that the transpiled kernels must be manually compiled into machine code. To achieve this, CUDA.jl provides functionality to instruct the driver to compile the PTX code. The same process of creating PTX code and compiling it must also be done for the interpreter kernel; however, this is done automatically by CUDA.jl when calling the @cuda macro in line 6.

\begin{program}
@@ -256,40 +252,37 @@ end \end{JuliaCode}

After all kernels have been dispatched, the CPU waits for the kernels to complete their execution. When the kernels have finished, the result matrix is read from global memory into system memory. The results can then be returned to the symbolic regression algorithm. A minimal sketch of this step is shown below.
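A minimal sketch of this step with CUDA.jl (the name cudaResults is an assumption for this illustration, not the actual variable name):
\begin{JuliaCode}
using CUDA

cudaResults = CUDA.zeros(Float32, 10, 4)  # assumed result matrix on the GPU
# ... all transpiled kernels are dispatched here ...
synchronize()                 # wait until every dispatched kernel has finished
results = Array(cudaResults)  # read the result matrix into system memory
\end{JuliaCode}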

-%
-% TODO: DeepL pass
-%
\subsection{Transpiler Backend}
\label{sec:transpiler-backend}
-The transpiler backend is responsible for creating a kernel from an expression in its intermediate representation. Transpiling an expression is split into several parts, these parts are as follows:
+The transpiler backend is responsible for creating a kernel from an expression in its intermediate representation. Transpiling an expression is divided into several parts, which are as follows:

\begin{itemize}
	\item Register management
	\item Generating the header and kernel entry point
	\item Ensuring that only the requested number of threads is performing work
-	\item Generating the Code for evaluating the expression
+	\item Generating the code for evaluating the expression and storing the result
\end{itemize}

-PTX assumes a register machine, which means that a developer has to work with a limited number of registers. This also means that the transpiler must define a strategy for managing these registers. The second and third parts are rather simple and can be considered overhead. Finally, the last part is the main part of the generated kernel. It contains the code to load variables and parameters, evaluate the expression and store the result in the result matrix. All parts will be explained in the following sections.
+PTX assumes a register machine, which means that a developer has to work with a limited number of registers. This also means that the transpiler has to define a strategy for managing these registers. The second and third parts are rather simple and can be considered overhead code. Finally, the last part is the main part of the generated kernel. It contains the code to load variables and parameters, evaluate the expression and store the result in the result matrix. All parts are explained in the following sections.

\subsubsection{Register Management}
-Register management is a crucial part of the transpiler as it is important to balance register usage with occupancy and performance. \textcite{aho_compilers_2006, cooper_engineering_2022} describe techniques for efficient register management, especially for machines with few registers and register usage by convention on the CPU. However, on the GPU, there are much more registers available, which can all be used as needed.
+Register management is a crucial part of the transpiler, as it is important to balance register usage with occupancy and performance. \textcite{aho_compilers_2006, cooper_engineering_2022} describe techniques for efficient register management, especially for machines with few registers and register usage by convention on the CPU. On the GPU, however, many more registers are available, all of which can be used as needed without restrictions.

-To allow for maximum occupancy and avoid spilling registers into local memory, the transpiler tries to re-use as many registers as possible. Furthermore, in PTX, allocating and using a register is very similar to using variables in code, as they do not represent physical registers. Therefore, much of the complexity of managing registers is managed by the PTX compiler.
+To allow for maximum occupancy and avoid spilling registers into local memory, the transpiler tries to reuse as many registers as possible. Furthermore, allocating and using a register in PTX is very similar to using variables in code, as they represent virtual registers. Therefore, much of the complexity of managing registers is handled by the PTX compiler of the driver.

-Because much of the complexity of managing registers is hidden by the compiler, or does not apply in this scenario, it is implemented very simple. If at any point in the transpilation process, a register is required, it can be requested by the register manager. A register must be given a name and the manager uses this name to determine the type of this register. For example if the name of the register is \verb|f|, it will be assumed that it is a FP32 register. Several naming conventions exist to ensure that the register will have the correct datatype. The manager then returns the identifying name of the register, which is used for accessing it. The identifying name, is the name given as an input and a zero-based number that is incremented by one for each consecutive call.
+Because much of the complexity of managing registers is hidden by the compiler, or does not apply in this scenario, the register manager is implemented very simply. If a register is needed at any point in the transpilation process, it can be requested from the register manager. A register must be given a name, and the manager uses this name to determine the type of the register. For example, if the name of the register is \verb|f|, it is assumed to be an FP32 register. Several naming conventions exist to ensure that the register is of the correct data type. The manager then returns the identifying name of the register, which is used to access it. The identifying name is the name given as input plus a zero-based number that is incremented by one for each successive call.

-PTX requires the registers to be defined before they are used. Therefore, after the transpiler has finished generating the code, the registers must be defined at the top of the kernel. As the manager has kept track of the registers used, it can generate the code to allocate and define the registers. If the kernel only used five FP32 registers, the manager would generate the code \verb|.reg .f32 %f<5>;|. This allocates and defines the registers \verb|%f0| through \verb|%f4|.
+PTX requires that the registers are defined before they are used. Therefore, after the transpiler has finished generating the code, the registers must be defined at the top of the kernel. As the manager has kept track of the registers used, it can generate the code to allocate and define them. If the kernel only uses five FP32 registers, the manager would generate the code \verb|.reg .f32 %f<5>;|. This will allocate and define the registers \verb|%f0| through \verb|%f4|. A sketch of such a register manager follows below.
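The following is a sketch of such a register manager. It mirrors the two-dictionary structure of Utils.RegisterManager visible in the source diff above, but the details, in particular the name-to-type mapping, are assumptions made for this illustration:
\begin{JuliaCode}
struct RegisterManager
    registers::Dict{String, Int}    # prefix -> number of registers handed out
    symtable::Dict{String, String}  # variable name -> register caching it
end

function get_next_free_register(manager::RegisterManager, name::String)::String
    count = get(manager.registers, name, 0)
    manager.registers[name] = count + 1
    return "%$name$count"  # zero-based numbering: %f0, %f1, ...
end

# After code generation: one definition per prefix, e.g. ".reg .f32 %f<5>;"
function get_register_definitions(manager::RegisterManager)::String
    buffer = IOBuffer()
    for (name, count) in manager.registers
        type = name == "f" ? ".f32" : ".u64"  # assumed naming convention
        println(buffer, ".reg $type %$name<$count>;")
    end
    return String(take!(buffer))
end

manager = RegisterManager(Dict(), Dict())
get_next_free_register(manager, "f")  # returns "%f0"
\end{JuliaCode}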
\subsubsection{Header and Entry Point}
-Each PTX program must begin with certain instructions to correctly compile and use this program. The first directive must be the \verb|.version| directive. It indicates for which PTX version the code has been written, to ensure that it is compiled with the correct tools in the correct version. Following the \verb|.version| directive, is the \verb|.target| directive, which specifies the target hardware architecture.
+Each PTX program must begin with certain directives in order to compile and use that program correctly. The first directive must be the \verb|.version| directive. It indicates which PTX version the code was written for, ensuring that it is compiled with the correct tools in the correct version. Following the \verb|.version| directive is the \verb|.target| directive, which specifies the target hardware architecture.

-Once these directives are added to the generated code, the entry point to the kernel can be generated. It contains the name of the kernel, as well as all parameters that are passed to it, like the pointer to the variable, parameter and result matrix. The kernel name is important, as it is required by the CPU to dispatch it.
+Once these directives have been added to the generated code, the entry point to the kernel can be generated. It contains the name of the kernel, as well as all parameters that are passed to it, such as the pointers to the variable, parameter and result matrices. The kernel name is important, as it is required by the CPU to dispatch the kernel.

-While generating the entry point, the PTX code for loading the parameters into the kernel is also generated. This removes the need to iterate over the kernel parameters a second time. Loading the parameters into the kernel is required, as it is not possible to address these values directly. \textcite{nvidia_parallel_2025} states that addresses in the parameter state space, can only be accessed via the \verb|ld.param| instruction. Furthermore, since all three matrices are stored in global memory, the parameter address needs to be converted from parameter state space to global state space using the \verb|cvta.to.global.datatype| instruction.
+When the entry point is generated, the PTX code for loading the parameters into the kernel is also generated. This removes the need to iterate over the kernel parameters a second time. Loading the parameters into the kernel is necessary because it is not possible to address these values directly. \textcite{nvidia_parallel_2025} states that addresses in the parameter state space can only be accessed using the \verb|ld.param| instruction. Furthermore, since all three matrices are stored in global memory, the parameter addresses must be converted from the parameter state space to the global state space using the \verb|cvta.to.global.datatype| instruction. A sketch of generating this header and entry point is shown below.
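A sketch of this step; the PTX version and target architecture below are placeholders, not the values used by the actual transpiler:
\begin{JuliaCode}
# Emits the directives and the kernel entry point described above.
function generate_header(kernelName::String)::String
    buffer = IOBuffer()
    println(buffer, ".version 7.1")   # placeholder PTX ISA version
    println(buffer, ".target sm_52")  # placeholder target architecture
    println(buffer, ".address_size 64")
    println(buffer, ".visible .entry $kernelName(")
    println(buffer, "\t.param .u64 param_1, .param .u64 param_2, .param .u64 param_3)")
    println(buffer, "{")
    return String(take!(buffer))
end

print(generate_header("Evaluator"))
\end{JuliaCode}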
\subsubsection{Guard Clause}
-As previously explained in Section \ref{sec:interpreter-gpu-side}, the guard clause ensures that all excessive threads do not take part in the evaluation. The following code shows how this guard clause looks if the kernel is written with Julia and CUDA.jl:
+As explained in Section \ref{sec:interpreter-gpu-side}, the guard clause ensures that any excess threads do not participate in the evaluation. The following code shows what this guard clause looks like when the kernel is written with Julia and CUDA.jl:
\begin{JuliaCode}
function my_kernel(nrOfVarSets::Int32)
	threadId = (blockIdx().x - 1) * blockDim().x + threadIdx().x
@@ -317,54 +310,91 @@ End:
	ret;
\end{PTXCode}

-It needs to be noted, that the register \verb|r2| is not needed. Because the transpiler already knows the number of variable sets, it would be wasteful to transmit it to the kernel. Therefore, instead the transpiler inserts the number directly to save resources.
+It needs to be noted that the register \verb|%r2| is not needed. Since the transpiler already knows the number of variable sets, it would be wasteful to transmit this information to the kernel. Instead, the transpiler inserts the number directly as a constant to save resources.

-\subsubsection{Evaluation}
-The equivalent of the interpreter's main loop, is the loop for generating the code for evaluating the expression. Because the transpiler uses the same intermediate representation as the interpreter, the loop also works very similar. It also uses a stack for storing the values and the intermediate results. Unlike the interpreter, the special opcode \textit{stop} is not needed any more. As only a single expression needs to be transpiled, it is stored in an unpadded vector and therefore all tokens inside the vector are valid. This opcode has only been introduced for the interpreter, as the expressions might get padded to fully fit in the matrix.
+\subsubsection{Main Loop}
+The main loop of the transpiler, which generates the kernel for evaluating a single expression, is analogous to the interpreter's main loop. Since the transpiler uses the same intermediate representation as the interpreter, both loops behave similarly. The transpiler loop also uses a stack to store the values and intermediate results. However, the transpiler does not require the special opcode \textit{stop}, which was necessary in the interpreter to handle expressions padded to fit into a matrix. The transpiler only needs to process a single expression, which is stored in an unpadded vector of known length. This means that all tokens within the vector are valid, so this opcode is not required. A toy version of this loop is sketched below.

-% MAYBE : activity diagram for this loop (also add to interpreter main loop section)
+% MAYBE : activity diagram for this loop (also add to interpreter main loop section (would maybe fit better in concept and design so basically move the algorithms of C&D here and add activity diagram to C&D ))
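The following self-contained toy version illustrates the structure of this loop; the token type is heavily simplified compared to the intermediate representation actually used by the thesis:
\begin{JuliaCode}
@enum TokenType VARIABLE OPERATOR

struct Token
    type::TokenType
    value::String  # register holding the value, or the PTX instruction name
end

function generate_calculation(tokens::Vector{Token})::String
    buffer = IOBuffer()
    operands = String[]  # stack of register names instead of values
    regCount = 0
    for token in tokens
        if token.type == VARIABLE
            push!(operands, token.value)
        else  # OPERATOR: pop the operands, emit the instruction, push the result
            right = pop!(operands)
            left = pop!(operands)
            result = "%f$regCount"; regCount += 1
            println(buffer, "$(token.value) $result, $left, $right;")
            push!(operands, result)  # this register now holds a partial result
        end
    end
    return String(take!(buffer))  # the last pushed register holds the result
end

# x1 + p1 in postfix notation: x1 p1 +
tokens = [Token(VARIABLE, "%var0"), Token(VARIABLE, "%var1"), Token(OPERATOR, "add.f32")]
print(generate_calculation(tokens))  # add.f32 %f0, %var0, %var1;
\end{JuliaCode}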

-When the loop encounters a token that represents an index to either the variable or the parameter matrix, the transpiler needs to generate code for loading these values. In the general case, this works exactly the same as with the interpreter.
+When the loop encounters a token that represents an index into either the variable or the parameter matrix, the transpiler needs to generate code to load these values. In the general case, this works in exactly the same way as in the interpreter, calculating the index and accessing the matrices at that location.

-However, on the first time a variable or parameter is accessed, it needs to be loaded from global memory. While a register already exists that holds a pointer to the address of the matrices in global memory, the data is still not accessible. To make it accessible, first the index to the value must be calculated the same way it is calculated in the interpreter. Afterwards the value must be loaded into a register with the instruction \verb|ld.global.f32 %reg1, %reg2|. Using the first register of the instruction, the data can be accessed. If for example the variable $x_1$ is accessed multiple times, all subsequent calls only need to reference this register and do not need to load the data from global memory again.
+However, the first time a variable or parameter is accessed, it must be loaded from global memory. Although registers already exist that hold pointers to the addresses of the matrices in global memory, the data itself is not yet accessible. To make it accessible, the index of the value must first be calculated in the same way as in the interpreter. Afterwards, the value must be loaded into a register with the instruction \verb|ld.global.f32 %reg1, [%reg2]|. Using the first register of the instruction, the data can then be accessed. For example, if the variable $x_1$ is accessed several times, all subsequent accesses only need to reference this register and do not need to load the data from global memory again, as sketched below.
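A sketch of this caching behaviour, reusing the RegisterManager sketch from above together with the get_register_for_name function from Utils.jl shown in the source diff (codeBuffer, operands and the address register are assumed names for this illustration):
\begin{JuliaCode}
function get_register_for_name(manager::RegisterManager, varName::String)
    if haskey(manager.symtable, varName)
        return (manager.symtable[varName], false)
    else
        reg = get_next_free_register(manager, "f")
        manager.symtable[varName] = reg
        return (reg, true)
    end
end

codeBuffer = IOBuffer()
operands = String[]
addressReg = "%rd5"  # assumed: already holds the address of x1 in global memory

(reg, isNew) = get_register_for_name(manager, "x1")
if isNew  # only the first access loads the value from global memory
    println(codeBuffer, "ld.global.f32 $reg, [$addressReg];")
end
push!(operands, reg)  # subsequent accesses simply reuse this register
\end{JuliaCode}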

% token -> operator -> same as interpreter but instead of result, add register that holds result to stack
-%
-% TODO: Deepl pass
-%
In the case where the current token represents an operation, the code for this operation needs to be generated. Many operators have an equivalent on the GPU. For example, addition has the \verb|add.f32 %reg1, %reg2, %reg3;| instruction. The instructions for division and square root have an equivalent instruction, but only support approximate calculation. While the approximation can be controlled with different options, the fastest option \verb|.approx| has been chosen. Although a slightly slower but more accurate option \verb|.full| exists, it is not fully IEEE 754 compliant and has therefore not been used. A sketch of this mapping is shown below.
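As a sketch, the mapping from an operator to its instruction could look as follows; the operator symbols are assumptions, as only the ADD member of the actual Operator enum is visible in the source diff above:
\begin{JuliaCode}
# div and sqrt use the fast approximate variants, as described above.
function operator_instruction(op::Symbol)::String
    op == :add  && return "add.f32"
    op == :div  && return "div.approx.f32"
    op == :sqrt && return "sqrt.approx.f32"
    throw(ArgumentError("operator '$op' is not supported"))
end

operator_instruction(:div)  # returns "div.approx.f32"
\end{JuliaCode}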

% after loop storing result is also interesting
However, not all supported operators have a single-instruction GPU equivalent. For example, the operation $x^y$ has no equivalent and therefore needs to be generated differently. When compiling a kernel with this operation using the Nvidia compiler and the compiler flag \textit{-\,-use\_fast\_math}, the following code is generated:
\begin{PTXCode}[numbers=none]
lg2.approx.f32 %reg1, %reg2;
mul.f32 %reg4, %reg3, %reg1;
ex2.approx.f32 %reg5, %reg4;
\end{PTXCode}
While this compiler flag trades accuracy for performance, the more accurate version of this operation contains about 100 instructions instead of the three above. Because of this, the transpiler generates the more performant version. Similarly, the operations $\log(x)$ and $e^x$ also have no equivalent instruction and are therefore generated following the same principle.

The final register of the generated code stores the result of the operation once it has been executed. As with the interpreter, this result is either the final value or an input to another operation. Therefore, this register must be stored on the stack for later use.

Once the main loop has finished, the last element on the stack holds the register with the result of the evaluation. The value of this register must be stored in the result matrix. As the result matrix is stored in global memory, the code for storing the data is similar to the code responsible for loading the data from global memory. First, the location at which to store the result must be calculated. Storing the result at this location is performed with the instruction \verb|st.global.f32 [%reg1], %reg2;|, as sketched below.
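This emission step is visible in generate_calculation_code in the source diff above; the following condensed sketch shows the same pattern with assumed values for the interpolated variables:
\begin{JuliaCode}
codeBuffer = IOBuffer()
# Assumed values for illustration:
tempReg = "%rd7"; threadId64Reg = "%rd3"; resultRegister = "%f0"
expressionIndex = 0; nrOfVarSets = 1; BYTES = 4; resultsLocation = "%parameter2"

# Same pattern as in generate_calculation_code:
println(codeBuffer, "
	add.u64 $tempReg, $((expressionIndex)*nrOfVarSets), $threadId64Reg;
	mad.lo.u64 $tempReg, $tempReg, $BYTES, $resultsLocation;
	st.global.f32 [$tempReg], $resultRegister;")
print(String(take!(codeBuffer)))
\end{JuliaCode}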

\subsection{GPU Side}
% I am not really happy with this. The length of the paragraph is fine, but the content not so much
% Maybe show a kernel for the expression "x1+p1" or so to show the complexity or something?
-On the GPU, the transpiled kernels are simply executed. Because the kernels themselves are very simple, as they contain almost no branching and other overhead work, the GPU does not need to perform a lot of operations. As can be seen in Program TODO, the kernel for the expression $x_1 + p_1$ is very straightforward, with only two load operations, the addition and then the storing of the result in the result matrix. In fact, the kernel is a one-to-one mapping of the expression, with the overhead of ensuring only the one thread is executing and loading the variable and parameter.
+On the GPU, the transpiled kernels are simply executed. Because the kernels themselves are very simple, containing almost no branching or other overhead work, the GPU does not need to perform many operations. As can be seen in Program \ref{code:ptx_kernel}, the kernel for the expression $x_1 + p_1$ is very straightforward, with only two load operations, the addition, and the storing of the result in the result matrix. In fact, the kernel is a one-to-one mapping of the expression, with the overhead of ensuring that only one thread is executing and of loading the variable and parameter.

\begin{program}
-\begin{GenericCode}
-TODO
-TODO
-TODO
-TODO
-.func loop(.param .u32 N)
-{
-	.reg .u32 %n;
-	.reg .pred %p;
-
-	ld.param.u32 %n, [N];
-Loop:
-	setp.eq.u32 %p, %n, 0;
-	@%p bra Done;
-	sub.u32 %n, %n, 1;
-	bra Loop;
-Done:
-}
-\end{GenericCode}
-\caption{The PTX kernel for the expression $x_1 + p_1$}
-\label{code:ptx_loop}
+\begin{PTXCode}
+.visible .entry Evaluator(
+	.param .u64 param_1, .param .u64 param_2, .param .u64 param_3)
+{
+	// Make parameters stored in global memory accessible
+	ld.param.u64 %rd0, [param_1];
+	cvta.to.global.u64 %parameter0, %rd0;
+	ld.param.u64 %rd1, [param_2];
+	cvta.to.global.u64 %parameter1, %rd1;
+	ld.param.u64 %rd2, [param_3];
+	cvta.to.global.u64 %parameter2, %rd2;
+
+	mov.u32 %r0, %ntid.x;
+	mov.u32 %r1, %ctaid.x;
+	mov.u32 %r2, %tid.x;
+	mad.lo.s32 %r3, %r0, %r1, %r2;
+	setp.gt.s32 %p0, %r3, 1;
+	@%p0 bra L__BB0_2; // Jump to end of kernel if too many threads are started
+	cvt.u64.u32 %rd3, %r3;
+	mov.u64 %rd4, 0;
+
+	// Load variable and parameter from global memory and add them together
+	mad.lo.u64 %rd5, %rd3, 4, 0;
+	add.u64 %rd5, %parameter0, %rd5;
+	ld.global.f32 %var0, [%rd5];
+	mad.lo.u64 %rd6, %rd4, 4, 0;
+	add.u64 %rd6, %parameter1, %rd6;
+	ld.global.f32 %var1, [%rd6];
+	add.f32 %f0, %var0, %var1;
+
+	// Store the result in the result matrix
+	add.u64 %rd7, 0, %rd3;
+	mad.lo.u64 %rd7, %rd7, 4, %parameter2;
+	st.global.f32 [%rd7], %f0;
+
+L__BB0_2: ret;
+}\end{PTXCode}
+\caption{The slightly simplified PTX kernel for the expression $x_1 + p_1$. For simplicity, the allocation of registers and the required directives \texttt{.version} and \texttt{.target} have been removed.}
+\label{code:ptx_kernel}
\end{program}
-% TODO: Probably explain the code a bit too?

-% Front-End and Back-End
-% Caching of back-end results
-%\verb|.version| and \verb|.target|
-
-% PTX code generated and compiled using CUDA.jl (so basically using the driver) and then executed
+Note that Program \ref{code:ptx_kernel} has been slightly simplified and does not include the allocation of the registers. In lines five to ten, the addresses stored in the parameters are converted from the parameter state space into the global state space, so that they reference the correct portion of the GPU's memory. It should be noted that this kernel uses 64-bit addresses, which is why some 64-bit instructions appear throughout the kernel. Evaluating the expression itself is, however, performed entirely using the faster 32-bit instructions.

-% Memory access (global memory and register management especially register management)
+Lines 12 through 17 are responsible for calculating the global thread ID and ensuring that excess threads are terminated early. Note that in line 16, if the global thread ID stored in register \verb|%r3| is greater than one, the kernel terminates early. This is because in this example, only one variable set needs to be evaluated.

+The PTX code from line 22 to line 28 is the actual evaluation of the expression, with line 28 performing the calculation $x_1 + p_1$. All other lines are responsible for loading the values from global memory. The instructions in lines 22, 23 and 25, 26 calculate the offset in bytes of the memory location where the value is stored, relative to the location of the first element.

+The constants $4$ and $0$ are introduced for performance reasons. The number $4$ is the size in bytes of one variable set. As one variable set only stores a single FP32 value in this case, each variable set has a size of four bytes. Similarly, the number $0$ represents the index of the value inside the variable set. More precisely, this is the offset in bytes of the value from the start of the variable set, which is zero for the first element, four for the second, and so on. These two constants are calculated during the transpilation process to minimise the amount of data that needs to be transferred to the GPU, as illustrated below.
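In Julia, these constants amount to the following computation at transpile time (toy values matching this example):
\begin{JuliaCode}
varSetSize = 1  # one FP32 value per variable set in this example
valueIndex = 1  # the first (and only) value within the set

bytesPerVarSet = varSetSize * sizeof(Float32)     # 4 bytes
offsetInSet = (valueIndex - 1) * sizeof(Float32)  # 0 bytes
\end{JuliaCode}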

+Storing the result in the result matrix is performed in lines 31 to 33. The location at which to store the value is calculated in lines 31 and 32. Line 31 calculates the index inside the result matrix according to the current variable set stored in register \verb|%rd3|. The constant $0$ is the product of the index of the evaluated expression and the number of variable sets, and represents the column of the result matrix. Converting this index into bytes and adding it as the offset to the first element of the result matrix results in the correct memory location to store the result at.

+This kernel consists mostly of overhead code, as only lines 22 through 33 contribute to calculating the result of the expression with the designated variable and parameter set. However, for larger expressions, the percentage of overhead code shrinks drastically.
@@ -158,8 +158,10 @@ keepspaces=true,%
% Language Definition and Code Environment for Julia
\lstdefinelanguage{PTX}{
	alsoletter={.},
-	morekeywords={mov.u32, mad.lo.s32, setp.ge.s32, bra,
-		ret, .func, .entry},
+	morekeywords={mov.u32, mad.lo.s32, setp.ge.s32, bra, mov.u64,
+		mad.lo.u64, add.u64, ld.global.f32, add.f32, st.global.f32,
+		cvta.to.global.u64, ld.param.u64, setp.gt.s32, cvt.u64.u32,
+		ret, .func, .entry, .visible, .param, .u64},
	keywordstyle=\color{blue},
	sensitive=true,
	morestring=[b]",
BIN thesis/main.pdf (binary file not shown)