small updates and notes for further writing
LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2024 Daniel Wiplinger
+Copyright (c) 2024 Daniel Roth

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector

 	results = Matrix{Float32}(undef, ncols, length(exprs))

-	for i in 1:repetitions # Simulate parameter tuning
+	for i in 1:repetitions # Simulate parameter tuning -> local search (X stays the same; p changes in small steps, so the iterations must run sequentially)
 		results = Interpreter.interpret(exprs, X, p)
 	end

@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{

 	results = Matrix{Float32}(undef, ncols, length(exprs))

-	for i in 1:repetitions # Simulate parameter tuning
+	for i in 1:repetitions # Simulate parameter tuning -> local search (X stays the same; p changes in small steps, so the iterations must run sequentially)
 		results = Transpiler.evaluate(exprs, X, p)
 	end

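The "local search" these comments refer to could look roughly like the sketch below. This is hypothetical: `local_search` and `loss` are illustrative names, not part of this package. It shows why the loop is inherently sequential: X stays fixed, each candidate p is a small perturbation of the current best p, and step i+1 depends on the outcome of step i.

```julia
# Hypothetical sketch of the local search that the repetitions loop simulates.
# `evaluate` stands in for Interpreter.interpret or Transpiler.evaluate.
loss(results::Matrix{Float32}) = sum(abs2, results) # placeholder objective

function local_search(evaluate, exprs::Vector{Expr}, X::Matrix{Float32},
                      p::Vector{Vector{Float32}}; steps::Int = 100)
    best = loss(evaluate(exprs, X, p))
    for _ in 1:steps
        # take a small random step in p only; X is never touched
        candidate = [pi .+ 0.01f0 .* randn(Float32, length(pi)) for pi in p]
        l = loss(evaluate(exprs, X, candidate))
        if l < best # keep the candidate only if it improves the objective
            best, p = l, candidate
        end
    end
    return p
end

# e.g.: local_search(Interpreter.interpret, exprsGPU, X_small_GPU, p; steps = expr_reps)
```
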
@@ -5,6 +5,10 @@ using .Transpiler
 using .Interpreter

 const BENCHMARKS_RESULTS_PATH = "./results"
+
+# TODO: Expressions can get much, much bigger (into the millions; will be provided by Mr. Kronberger)
+# TODO: Variable sets: 1000 is the minimum and 100,000 the maximum (will be provided by Mr. Kronberger)
+
 exprsCPU = [
 	# CPU interpreter requires an anonymous function and array refs
 	:(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [

 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-expr_reps = 100 # 100 parameter optimisation steps basically
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, X stays fixed)


 @testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
 	suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 end

-X_small_GPU = randn(Float32, 5, varsets_small)
+X_small_GPU = randn(Float32, 5, varsets_small) # column-major
 suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

-X_medium_GPU = randn(Float32, 5, varsets_medium)
+X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
 suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

-X_large_GPU = randn(Float32, 5, varsets_large)
+X_large_GPU = randn(Float32, 5, varsets_large) # column-major
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
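A note on the new `# column-major` annotations (my reading of why they were added, not stated in the commit): Julia stores matrices column-major, so with `randn(Float32, 5, n)` each variable set occupies one contiguous column of 5 floats, which is the layout the GPU side wants for contiguous copies and coalesced access. A quick REPL check using only standard Julia:

```julia
X = randn(Float32, 5, 3) # 5 variables per set, 3 variable sets
X[:, 2]                  # one variable set == one column
stride(X, 2)             # 5: consecutive columns are 5 floats apart
vec(X)[1:5] == X[:, 1]   # true: a column is contiguous in memory
```
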
@@ -1,6 +1,8 @@
 using ExpressionExecutorCuda
 using Test

+using BenchmarkTools
+
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
 include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
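`using BenchmarkTools` is pulled in here because the (currently disabled) PerformanceTests.jl builds its suite with `@benchmarkable`. Running such a suite typically follows the standard BenchmarkTools pattern; a minimal sketch with a stand-in workload, not code from this commit:

```julia
using BenchmarkTools

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup()
# stand-in workload; the real suite benchmarks interpret_cpu/interpret_gpu/evaluate_gpu
suite["CPU"]["small varset"] = @benchmarkable sum(abs2, x) setup = (x = randn(Float32, 5, 100))

tune!(suite)                          # pick evals/samples per benchmark
results = run(suite; verbose = true)  # execute all benchmarks in the group
median(results)                       # aggregate to median timings
```
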
@@ -20,5 +22,5 @@ end

 @testset "Performance tests" begin
 	# include("PerformanceTuning.jl")
-	include("PerformanceTests.jl")
+	# include("PerformanceTests.jl")
 end
@@ -1,3 +1,5 @@
+RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of this should be covered in the Implementation chapter, though)
+
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here
@@ -3,6 +3,8 @@

 somewhere in here explain why one kernel per expression and not one kernel for all expressions

+Go into the details of why this implementation is tuned towards performance and should be optimal in that regard
+
 \section{Technologies}
 Short section; CUDA, PTX, Julia, CUDA.jl
thesis/main.pdf (BIN)
	Binary file not shown.