diff --git a/LICENSE b/LICENSE
index f8d3825..043dc29 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Daniel Wiplinger
+Copyright (c) 2024 Daniel Roth
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl
index 6670167..8dd54a2 100644
--- a/package/src/ExpressionExecutorCuda.jl
+++ b/package/src/ExpressionExecutorCuda.jl
@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
 
-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p is changed in small steps, and the steps must be performed sequentially)
         results = Interpreter.interpret(exprs, X, p)
     end
 
@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
 
     results = Matrix{Float32}(undef, ncols, length(exprs))
 
-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p is changed in small steps, and the steps must be performed sequentially)
         results = Transpiler.evaluate(exprs, X, p)
     end
 
diff --git a/package/test/PerformanceTests.jl b/package/test/PerformanceTests.jl
index ec2718a..39f1e4b 100644
--- a/package/test/PerformanceTests.jl
+++ b/package/test/PerformanceTests.jl
@@ -5,6 +5,10 @@ using .Transpiler
 using .Interpreter
 
 const BENCHMARKS_RESULTS_PATH = "./results"
+
+# TODO: Expressions can get much, much bigger (into the millions) (will be provided by Mr. Kronberger)
+# TODO: Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
+
 exprsCPU = [ # CPU interpreter requires an anonymous function and array refs
     :(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [
 
 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-expr_reps = 100 # 100 parameter optimisation steps basically
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, not X)
 
 @testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
     suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 end
 
-X_small_GPU = randn(Float32, 5, varsets_small)
+X_small_GPU = randn(Float32, 5, varsets_small) # column-major
 suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 
-X_medium_GPU = randn(Float32, 5, varsets_medium)
+X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
 suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 
-X_large_GPU = randn(Float32, 5, varsets_large)
+X_large_GPU = randn(Float32, 5, varsets_large) # column-major
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 
diff --git a/package/test/runtests.jl b/package/test/runtests.jl
index 8c6f5e3..5ec29a4 100644
--- a/package/test/runtests.jl
+++ b/package/test/runtests.jl
@@ -1,6 +1,8 @@
 using ExpressionExecutorCuda
 using Test
 
+using BenchmarkTools
+
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
 include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
@@ -20,5 +22,5 @@ end
 
 @testset "Performance tests" begin
     # include("PerformanceTuning.jl")
-    include("PerformanceTests.jl")
+    # include("PerformanceTests.jl")
 end
\ No newline at end of file
diff --git a/thesis/chapters/conceptdesign.tex b/thesis/chapters/conceptdesign.tex
index 3038b07..f180ec7 100644
--- a/thesis/chapters/conceptdesign.tex
+++ b/thesis/chapters/conceptdesign.tex
@@ -1,3 +1,5 @@
+RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of this should be in the implementation chapter though)
+
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here
diff --git a/thesis/chapters/implementation.tex b/thesis/chapters/implementation.tex
index c16f58e..4ebfa80 100644
--- a/thesis/chapters/implementation.tex
+++ b/thesis/chapters/implementation.tex
@@ -3,6 +3,8 @@
 
 somewhere in here explain why one kernel per expression and not one kernel for all expressions
 
+Go into detail about why this implementation is tuned towards performance and why it should be the optimum in that regard
+
 \section{Technologies}
 Short section; CUDA, PTX, Julia, CUDA.jl
diff --git a/thesis/main.pdf b/thesis/main.pdf
index 72208b9..e459b42 100644
Binary files a/thesis/main.pdf and b/thesis/main.pdf differ
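Note on the `repetitions` loops changed above: they stand in for the local-search phase of parameter optimisation, where X stays constant and only p is nudged between strictly sequential evaluations. Below is a minimal Julia sketch of that pattern, assuming the package's Interpreter module is in scope; the function name, the perturbation step, and its size are illustrative assumptions, not part of the repository.

# Sketch only (illustrative, not part of the package): the local-search pattern
# that the repetitions loop simulates. X is fixed; p is nudged slightly each
# step, and each evaluation depends on the previous one, so the loop is sequential.
function simulated_local_search(exprs::Vector{Expr}, X::Matrix{Float32},
                                p::Vector{Vector{Float32}}; steps::Int=100)
    results = Matrix{Float32}(undef, size(X, 2), length(exprs))
    for _ in 1:steps
        results = Interpreter.interpret(exprs, X, p)  # same call the benchmark measures
        # hypothetical small step in parameter space; only p changes, never X
        p = [pv .+ 0.01f0 .* randn(Float32, length(pv)) for pv in p]
    end
    return results
end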