small updates and notes for further writing
Commit c62aff806a (parent ef721b13e0)
LICENSE (2 changed lines)
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2024 Daniel Wiplinger
+Copyright (c) 2024 Daniel Roth

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector

     results = Matrix{Float32}(undef, ncols, length(exprs))

-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
         results = Interpreter.interpret(exprs, X, p)
     end

@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{

     results = Matrix{Float32}(undef, ncols, length(exprs))

-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
         results = Transpiler.evaluate(exprs, X, p)
     end

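The comments added above only describe what the benchmark loop stands in for; the loop itself repeats the same call with identical arguments. A minimal sketch of the local search being simulated, assuming the Interpreter.interpret API shown in this diff (the tune_parameters helper and the perturbation rule are illustrative assumptions, not code from the repository):

# Sketch only: X stays fixed, p is nudged slightly each step, and every evaluation
# depends on the previous update, so the steps must run sequentially.
function tune_parameters(exprs::Vector{Expr}, X::Matrix{Float32},
                         p::Vector{Vector{Float32}}; steps::Int = 100,
                         stepsize::Float32 = 0.01f0)
    results = Matrix{Float32}(undef, size(X, 2), length(exprs))
    for _ in 1:steps
        for pk in p
            pk .+= stepsize .* randn(Float32, length(pk))  # placeholder update rule
        end
        results = Interpreter.interpret(exprs, X, p)  # must finish before the next p update
    end
    return results
end
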
@@ -5,6 +5,10 @@ using .Transpiler
 using .Interpreter

 const BENCHMARKS_RESULTS_PATH = "./results"
+
+# TODO: Expressions can get much, much bigger (into the millions) (will be provided by Mr. Kronberger)
+# TODO: Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
+
 exprsCPU = [
     # CPU interpreter requires an anonymous function and array refs
     :(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [

 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-expr_reps = 100 # 100 parameter optimisation steps basically
+expr_reps = 100 # 100 parameter optimisation steps (local search, performed sequentially; only p changes, not X)


 @testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
     suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 end

-X_small_GPU = randn(Float32, 5, varsets_small)
+X_small_GPU = randn(Float32, 5, varsets_small) # column-major
 suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

-X_medium_GPU = randn(Float32, 5, varsets_medium)
+X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
 suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

-X_large_GPU = randn(Float32, 5, varsets_large)
+X_large_GPU = randn(Float32, 5, varsets_large) # column-major
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

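The "# column-major" notes mark a layout decision: Julia matrices are column-major, so with X shaped (variables x variable sets) the five values of one variable set sit in consecutive memory. A small illustration of that property (the demo variable names are not from the repository):

# Column-major layout: within a column, elements are adjacent in memory.
X_demo = randn(Float32, 5, 4)          # 5 variables, 4 variable sets
@assert strides(X_demo) == (1, 5)      # stride 1 down a column, 5 across a row
one_varset = @view X_demo[:, 2]        # one variable set is a contiguous 5-element slice
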
@@ -1,6 +1,8 @@
 using ExpressionExecutorCuda
 using Test

+using BenchmarkTools
+
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
 include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
@@ -20,5 +22,5 @@ end

 @testset "Performance tests" begin
     # include("PerformanceTuning.jl")
-    include("PerformanceTests.jl")
+    # include("PerformanceTests.jl")
 end
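With using BenchmarkTools added here and the performance suite temporarily commented out, the benchmarks are presumably meant to be run on demand. A minimal sketch of how a BenchmarkTools suite like the one in PerformanceTests.jl is typically tuned, run, and saved (the group contents and the output file name are illustrative, not taken from the repository):

using BenchmarkTools

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup()
suite["CPU"]["example"] = @benchmarkable sum(rand(Float32, 1_000))  # stand-in benchmark

tune!(suite)                              # pick evaluation counts per benchmark
results = run(suite; verbose = true)
BenchmarkTools.save(joinpath("./results", "example.json"), median(results))
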
@@ -1,3 +1,5 @@
+RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of that should be in the implementation chapter, though)
+
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here

@@ -3,6 +3,8 @@
 somewhere in here explain why one kernel per expression and not one kernel for all expressions

+Go into the details of why this implementation is tuned towards performance and why it should be the optimum in that regard
+
 \section{Technologies}
 Short section; CUDA, PTX, Julia, CUDA.jl

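The note above refers to the kernel-per-expression design. A minimal CUDA.jl sketch of that launch pattern, assuming a hand-written kernel for a single expression such as p[1] * x[1] + p[2] (the kernel and variable names are illustrative and not taken from the repository):

using CUDA

# Device kernel for one expression; each thread evaluates one variable set (one column of X).
function expr1_kernel!(out, X, p)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= size(X, 2)
        @inbounds out[i] = p[1] * X[1, i] + p[2]
    end
    return nothing
end

X_d = CUDA.randn(Float32, 5, 1_000)   # 5 variables, 1000 variable sets
p_d = cu(randn(Float32, 10))
out_d = CUDA.zeros(Float32, 1_000)

threads = 256
blocks = cld(size(X_d, 2), threads)
# One launch per expression: a second expression would get its own kernel and its own launch.
@cuda threads=threads blocks=blocks expr1_kernel!(out_d, X_d, p_d)
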
BIN thesis/main.pdf (binary file, not shown)