diff --git a/LICENSE b/LICENSE
index f8d3825..043dc29 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Daniel Wiplinger
+Copyright (c) 2024 Daniel Roth
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl
index 6670167..8dd54a2 100644
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
 
-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p is changed in small steps, and the steps must be performed sequentially)
         results = Interpreter.interpret(exprs, X, p)
     end
 
@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
 
-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p is changed in small steps, and the steps must be performed sequentially)
         results = Transpiler.evaluate(exprs, X, p)
     end
 
diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl
index ec2718a..39f1e4b 100644
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@@ -5,6 +5,10 @@ using .Transpiler
 using .Interpreter
 
 const BENCHMARKS_RESULTS_PATH = "./results"
+
+# TODO: Expressions can get much, much bigger (into the millions) (will be provided by Mr. Kronberger)
+# TODO: Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
+
 exprsCPU = [ # CPU interpreter requires an anonymous function and array refs
     :(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [
 
 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-expr_reps = 100 # 100 parameter optimisation steps basically
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, not X)
 
 @testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
     suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 end
 
-X_small_GPU = randn(Float32, 5, varsets_small)
+X_small_GPU = randn(Float32, 5, varsets_small) # column-major
 suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 
-X_medium_GPU = randn(Float32, 5, varsets_medium)
+X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
 suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 
-X_large_GPU = randn(Float32, 5, varsets_large)
+X_large_GPU = randn(Float32, 5, varsets_large) # column-major
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 
diff --git a/package/test/runtests.jl b/package/test/runtests.jl
index 8c6f5e3..5ec29a4 100644
--- a/package/test/runtests.jl
+++ b/package/test/runtests.jl
@@ -1,6 +1,8 @@
 using ExpressionExecutorCuda
 using Test
 
+using BenchmarkTools
+
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
 include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
@@ -20,5 +22,5 @@ end
 
 @testset "Performance tests" begin
     # include("PerformanceTuning.jl")
-    include("PerformanceTests.jl")
+    # include("PerformanceTests.jl")
 end
\ No newline at end of file
diff --git a/thesis/chapters/conceptdesign.tex b/thesis/chapters/conceptdesign.tex
index 3038b07..f180ec7 100644
--- a/thesis/chapters/conceptdesign.tex
+++ b/thesis/chapters/conceptdesign.tex
@@ -1,3 +1,5 @@
+RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of this should be in the implementation chapter though)
+
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here
diff --git a/thesis/chapters/implementation.tex b/thesis/chapters/implementation.tex
index c16f58e..4ebfa80 100644
--- a/thesis/chapters/implementation.tex
+++ b/thesis/chapters/implementation.tex
@@ -3,6 +3,8 @@
 
 somewhere in here explain why one kernel per expression and not one kernel for all expressions
 
+Go into detail about why this implementation is tuned towards performance and why it should be the optimum in that regard
+
 \section{Technologies}
 Short section; CUDA, PTX, Julia, CUDA.jl
diff --git a/thesis/main.pdf b/thesis/main.pdf
index 72208b9..e459b42 100644
Binary files a/thesis/main.pdf and b/thesis/main.pdf differ
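Note on the `repetitions` loops changed above: they stand in for the local-search phase of parameter optimisation, where X stays constant and only p is nudged between strictly sequential evaluations. Below is a minimal Julia sketch of that pattern, assuming the package's Interpreter module is in scope; the function name, the perturbation step, and its size are illustrative assumptions, not part of the repository.

# Sketch only (illustrative, not part of the package): the local-search pattern
# that the repetitions loop simulates. X is fixed; p is nudged slightly each
# step, and each evaluation depends on the previous one, so the loop is sequential.
function simulated_local_search(exprs::Vector{Expr}, X::Matrix{Float32},
                                p::Vector{Vector{Float32}}; steps::Int=100)
    results = Matrix{Float32}(undef, size(X, 2), length(exprs))
    for _ in 1:steps
        results = Interpreter.interpret(exprs, X, p)  # same call the benchmark measures
        # hypothetical small step in parameter space; only p changes, never X
        p = [pv .+ 0.01f0 .* randn(Float32, length(pv)) for pv in p]
    end
    return results
end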