benchmarking: tested int32 also on uni pc

benchmarking: used int32 wherever possible; resulted in noticeable performance drop
evaluation: added introduction text and made plan for additional text
2025-04-13 11:43:17 +02:00 · 2025-04-13 11:32:54 +02:00 · 2025-04-12 16:22:14 +02:00 · 2025-04-12 13:20:50 +02:00 · 2025-04-12 12:13:28 +02:00
16 changed files with 46 additions and 30 deletions
--- a/package/src/Interpreter.jl
+++ b/package/src/Interpreter.jl
@ -1,5 +1,6 @@
 module Interpreter
 using CUDA
+using CUDA: i32
 using StaticArrays
 using ..ExpressionProcessing
 using ..Utils
@ -24,16 +25,16 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame
 	cudaParams = Utils.create_cuda_array(parameters, NaN32) # column corresponds to data for one expression
 	cudaExprs = Utils.create_cuda_array(exprs, ExpressionElement(EMPTY, 0)) # column corresponds to data for one expression
 	# put into seperate cuArray, as this is static and would be inefficient to send seperatly to every kernel
-	cudaStepsize = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression
+	cudaStepsize::CuArray{Int32} = CuArray([Utils.get_max_inner_length(parameters), size(variables, 1)]) # max num of values per expression; max nam of parameters per expression; number of variables per expression

 	# each expression has nr. of variable sets (nr. of columns of the variables) results and there are n expressions
 	cudaResults = CuArray{Float32}(undef, variableCols, length(exprs))

 	# Start kernel for each expression to ensure that no warp is working on different expressions
 	@inbounds for i in eachindex(exprs)
-		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i)
+		kernel = @cuda launch=false interpret_expression(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, convert(Int32, i))
 		# config = launch_configuration(kernel.fun)
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 128)
 		blocks = cld(variableCols, threads)

 		kernel(cudaExprs, cudaVars, cudaParams, cudaResults, cudaStepsize, i; threads, blocks)
@ -44,8 +45,8 @@ end

 #TODO: Add @inbounds to all indexing after it is verified that all works https://cuda.juliagpu.org/stable/development/kernel/#Bounds-checking
 const MAX_STACK_SIZE = 25 # The depth of the stack to store the values and intermediate results
-function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int}, exprIndex::Int)
-	varSetIndex = (blockIdx().x - 1) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
+function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, variables::CuDeviceArray{Float32}, parameters::CuDeviceArray{Float32}, results::CuDeviceArray{Float32}, stepsize::CuDeviceArray{Int32}, exprIndex::Int32)
+	varSetIndex = (blockIdx().x - 1i32) * blockDim().x + threadIdx().x # ctaid.x * ntid.x + tid.x (1-based)
 	@inbounds variableCols = length(variables) / stepsize[2]

 	if varSetIndex > variableCols
@ -54,19 +55,19 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var

 	# firstExprIndex = ((exprIndex - 1) * stepsize[1]) + 1 # Inclusive
 	# lastExprIndex = firstExprIndex + stepsize[1] - 1 # Inclusive
-	@inbounds firstParamIndex = ((exprIndex - 1) * stepsize[1]) # Exclusive
+	@inbounds firstParamIndex = ((exprIndex - 1i32) * stepsize[1]) # Exclusive

 	operationStack = MVector{MAX_STACK_SIZE, Float32}(undef) # Try to get this to function with variable size too, to allow better memory usage
-	operationStackTop = 0 # stores index of the last defined/valid value
+	operationStackTop = 0i32 # stores index of the last defined/valid value
 	
-	@inbounds firstVariableIndex = ((varSetIndex-1) * stepsize[2]) # Exclusive
+	@inbounds firstVariableIndex = ((varSetIndex - 1i32) * stepsize[2]) # Exclusive
 	
 	@inbounds for expr in expressions
 		if expr.Type == EMPTY
 			break
 		elseif expr.Type == INDEX
 			val = expr.Value
-			operationStackTop += 1
+			operationStackTop += 1i32

 			if val > 0
 				operationStack[operationStackTop] = variables[firstVariableIndex + val]
@ -75,25 +76,25 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
 				operationStack[operationStackTop] = parameters[firstParamIndex + val]
 			end
 		elseif expr.Type == FLOAT32
-			operationStackTop += 1
+			operationStackTop += 1i32
 			operationStack[operationStackTop] = reinterpret(Float32, expr.Value)
 		elseif expr.Type == OPERATOR
 			type = reinterpret(Operator, expr.Value)
 			if type == ADD
-				operationStackTop -= 1
-				operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1]
+				operationStackTop -= 1i32
+				operationStack[operationStackTop] = operationStack[operationStackTop] + operationStack[operationStackTop + 1i32]
 			elseif type == SUBTRACT
-				operationStackTop -= 1
-				operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1]
+				operationStackTop -= 1i32
+				operationStack[operationStackTop] = operationStack[operationStackTop] - operationStack[operationStackTop + 1i32]
 			elseif type == MULTIPLY
-				operationStackTop -= 1
-				operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1]
+				operationStackTop -= 1i32
+				operationStack[operationStackTop] = operationStack[operationStackTop] * operationStack[operationStackTop + 1i32]
 			elseif type == DIVIDE
-				operationStackTop -= 1
-				operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1]
+				operationStackTop -= 1i32
+				operationStack[operationStackTop] = operationStack[operationStackTop] / operationStack[operationStackTop + 1i32]
 			elseif type == POWER
-				operationStackTop -= 1
-				operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1]
+				operationStackTop -= 1i32
+				operationStack[operationStackTop] = operationStack[operationStackTop] ^ operationStack[operationStackTop + 1i32]
 			elseif type == ABS
 				operationStack[operationStackTop] = abs(operationStack[operationStackTop])
 			elseif type == LOG
@ -104,14 +105,14 @@ function interpret_expression(expressions::CuDeviceArray{ExpressionElement}, var
 				operationStack[operationStackTop] = sqrt(operationStack[operationStackTop])
 			end
 		else
-			operationStack[operationStackTop] = NaN
+			operationStack[operationStackTop] = NaN32
 			break
 		end
 	end

 	# "(exprIndex - 1) * variableCols" -> calculates the column in which to insert the result (expression = column)
 	# "+ varSetIndex" -> to get the row inside the column at which to insert the result of the variable set (variable set = row)
-	resultIndex = convert(Int, (exprIndex - 1) * variableCols + varSetIndex) # Inclusive
+	resultIndex = convert(Int, (exprIndex - 1i32) * variableCols + varSetIndex) # Inclusive
 	@inbounds results[resultIndex] = operationStack[operationStackTop]

 	return
--- a/package/src/Transpiler.jl
+++ b/package/src/Transpiler.jl
@ -73,7 +73,7 @@ function evaluate(expressions::Vector{Expr}, variables::Matrix{Float32}, paramet
 	# execute each kernel (also try doing this with Threads.@threads. Since we can have multiple grids, this might improve performance)
 	for kernel in kernels
 		# config = launch_configuration(kernels[i])
-		threads = min(variableCols, 256)
+		threads = min(variableCols, 96)
 		blocks = cld(variableCols, threads)

 		cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks)
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@ -4,7 +4,7 @@ using BenchmarkTools
 using .Transpiler
 using .Interpreter

-const BENCHMARKS_RESULTS_PATH = "./results"
+const BENCHMARKS_RESULTS_PATH = "./results-fh"
 exprsCPU = [
 	# CPU interpreter requires an anonymous function and array ref s
 	:(p[1] * x[1] + p[2]), # 5 op
@ -69,7 +69,7 @@ end
 # Add /usr/local/cuda/bin to PATH in .bashrc to access ncu and nsys (depending on how well this works with my 1080, do it on my machine; otherwise redo the tests and perform them on the FH PCs)
 # University setup at 10.20.1.7 if needed

-compareWithCPU = false
+compareWithCPU = true


 suite = BenchmarkGroup()
@ -143,9 +143,9 @@ if compareWithCPU
 	println(gpuiVsGPUT_median)
 	println(gpuiVsGPUT_std)
 	
-	# BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/using_inbounds.json", results)
+	BenchmarkTools.save("$BENCHMARKS_RESULTS_PATH/4-interpreter_using_int32.json", results)
 else
-	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/using_inbounds.json")[1]
+	resultsOld = BenchmarkTools.load("$BENCHMARKS_RESULTS_PATH/2-using_inbounds.json")[1]
 	
 	medianGPUI_old = median(resultsOld["GPUI"])
 	stdGPUI_old = std(resultsOld["GPUI"])
--- a/package/test/PerformanceTuning.jl
+++ b/package/test/PerformanceTuning.jl
@ -26,5 +26,5 @@ end


@testset "Transpiler Tuning" begin
-    CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
+    # CUDA.@profile evaluate_gpu(exprsGPU, X, p; repetitions=expr_reps)
 end
--- a/package/test/results-fh/0-initial_results.json
+++ b/package/test/results-fh/0-initial_results.json
--- a/package/test/results-fh/1-256_blocksize.json
+++ b/package/test/results-fh/1-256_blocksize.json
--- a/package/test/results-fh/2-using_inbounds.json
+++ b/package/test/results-fh/2-using_inbounds.json
--- a/package/test/results-fh/3-tuned-blocksize_I128_T96.json
+++ b/package/test/results-fh/3-tuned-blocksize_I128_T96.json
--- a/package/test/results-fh/4-interpreter_using_int32.json
+++ b/package/test/results-fh/4-interpreter_using_int32.json
--- a/package/test/results/0-initial_results.json
+++ b/package/test/results/0-initial_results.json
--- a/package/test/results/1-256_blocksize.json
+++ b/package/test/results/1-256_blocksize.json
--- a/package/test/results/2-using_inbounds.json
+++ b/package/test/results/2-using_inbounds.json
--- a/package/test/results/4-interpreter_using_int32.json
+++ b/package/test/results/4-interpreter_using_int32.json
--- a/package/test/runtests.jl
+++ b/package/test/runtests.jl
@ -19,6 +19,6 @@ end
 # end

@testset "Performance tests" begin
-	include("PerformanceTuning.jl")
-	# include("PerformanceTests.jl")
+	# include("PerformanceTuning.jl")
+	include("PerformanceTests.jl")
 end
--- a/thesis/chapters/evaluation.tex
+++ b/thesis/chapters/evaluation.tex
@ -1,9 +1,14 @@
 \chapter{Evaluation}
 \label{cha:evaluation}

+The aim of this thesis is to determine whether at least one of the GPU evaluators is faster than the current CPU evaluator. This chapter describes the performance evaluation. First, the environment in which the performance tests are performed is explained. Then the individual results for the GPU interpreter and the transpiler are presented. This part also includes the performance tuning steps taken to achieve these results. Finally, the results of the GPU evaluators are compared to the CPU evaluator in order to answer the research questions of this thesis.
+
 \section{Test environment}
 Explain the hardware used, as well as the actual data (how many expressions, variables etc.)

+Three scenarios -> few, normal and many variable sets; expression repetitions to simulate parameter optimisation
+BenchmarkTools.jl -> 1000 samples per scenario
+
 \section{Results}
 talk about what we will see now (results only for interpreter, then transpiler and then compared with each other and a CPU interpreter)

@ -16,6 +21,8 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking

 1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
 2.) Using @inbounds -> noticeable improvement in 2 out of 3
+3.) Tuned blocksize with Nsight Compute -> slight improvement
+4.) Used Int32 everywhere to reduce register usage -> significant performance drop (probably due to much more waiting time, or more type conversions happening on the GPU? Would need to look at the PTX)

 \subsection{Transpiler}
 Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in the Implementation section)
@ -26,6 +33,8 @@ Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking

 1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large
 2.) Using @inbounds -> small improvement only on CPU side code
+3.) Tuned blocksize with Nsight Compute -> slight improvement
+4.) Only changed things on the interpreter side

 \subsection{Comparison}
 Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter
--- a/thesis/main.pdf
+++ b/thesis/main.pdf
Author	SHA1	Message	Date
Daniel	278a493595	benchmarking: tested int32 also on uni pc	2025-04-13 11:43:17 +02:00
Daniel	af3b72f196	benchmarking: used int32 wherever possible; resulted in noticeable performance drop Some checks are pending CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run Details	2025-04-13 11:32:54 +02:00
Daniel	4c60331288	evaluation: added introduction text and made plan for additional text Some checks are pending CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run Details	2025-04-12 16:22:14 +02:00
Wiplinger Daniel - s2310454043	101ccef67b	benchmarking: tuned blocksizes; slightly improved performance; mostly improved standard deviation Some checks are pending CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run Details	2025-04-12 13:20:50 +02:00
Wiplinger Daniel - s2310454043	c6e2ce47aa	benchmarking: redid performance tests on uni pc Some checks are pending CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Waiting to run Details CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Waiting to run Details	2025-04-12 12:13:28 +02:00