small updates and notes for further writing

Daniel 2025-04-15 19:32:39 +02:00
parent ef721b13e0
commit c62aff806a
7 changed files with 18 additions and 8 deletions

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2024 Daniel Wiplinger
Copyright (c) 2024 Daniel Roth
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector
results = Matrix{Float32}(undef, ncols, length(exprs))
for i in 1:repetitions # Simulate parameter tuning
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same; p changes in small steps, and the steps must be performed sequentially)
results = Interpreter.interpret(exprs, X, p)
end
@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{
results = Matrix{Float32}(undef, ncols, length(exprs))
for i in 1:repetitions # Simulate parameter tuning
for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same; p changes in small steps, and the steps must be performed sequentially)
results = Transpiler.evaluate(exprs, X, p)
end
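
A minimal sketch of the local-search loop the comments above describe, assuming a generic evaluate(exprs, X, p) callable and a made-up perturbation rule (neither is part of this commit): X stays fixed, only p is nudged a little each step, and each step depends on the previous one, so the repetitions have to run sequentially.

# Sketch only: `evaluate` stands in for Interpreter.interpret or Transpiler.evaluate;
# the perturbation rule is a placeholder for the real parameter-tuning step.
function simulate_local_search(evaluate, exprs, X, p; steps=100, stepsize=0.01f0)
    results = nothing
    for _ in 1:steps
        # nudge every parameter vector slightly; X is untouched
        p = [q .+ stepsize .* randn(Float32, length(q)) for q in p]
        results = evaluate(exprs, X, p)  # needs the p from the previous step -> sequential
    end
    return results
end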

View File

@@ -5,6 +5,10 @@ using .Transpiler
using .Interpreter
const BENCHMARKS_RESULTS_PATH = "./results"
# TODO: Expressions can get much larger (into the millions) (will be provided by Mr. Kronberger)
# TODO: Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
exprsCPU = [
# CPU interpreter requires an anonymous function and array refs
:(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [
# p is the same for CPU and GPU
p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
expr_reps = 100 # 100 parameter optimisation steps basically
expr_reps = 100 # 100 parameter optimisation steps (local search; sequentially; only p changes but not X)
@testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
end
X_small_GPU = randn(Float32, 5, varsets_small)
X_small_GPU = randn(Float32, 5, varsets_small) # column-major
suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
X_medium_GPU = randn(Float32, 5, varsets_medium)
X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
X_large_GPU = randn(Float32, 5, varsets_large)
X_large_GPU = randn(Float32, 5, varsets_large) # column-major
suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

View File

@@ -1,6 +1,8 @@
using ExpressionExecutorCuda
using Test
using BenchmarkTools
const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
include(joinpath(baseFolder, "src", "Utils.jl"))
include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
@@ -20,5 +22,5 @@ end
@testset "Performance tests" begin
# include("PerformanceTuning.jl")
include("PerformanceTests.jl")
# include("PerformanceTests.jl")
end

View File

@@ -1,3 +1,5 @@
RE-READ to ensure the concepts explain why this is done to improve performance and why it should be the "locally best" implementation (most of this should be in the implementation chapter though)
\chapter{Concept and Design}
\label{cha:conceptdesign}
% introduction to what needs to be done. also clarify terms "Host" and "Device" here
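
Regarding the note to clarify "Host" and "Device": a minimal CUDA.jl illustration of what the two terms mean in practice (array names are made up). The host is the CPU side with ordinary Julia Arrays; the device is the GPU with CuArrays, and data must be copied between the two explicitly.

using CUDA

X_host   = randn(Float32, 5, 1000)   # host memory (CPU), a normal Array
X_device = CuArray(X_host)           # explicit copy host -> device (GPU)
Y_device = 2 .* X_device             # computed on the device
Y_host   = Array(Y_device)           # explicit copy device -> host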

View File

@@ -3,6 +3,8 @@
somewhere in here, explain why one kernel is launched per expression rather than one kernel for all expressions
Go into detail about why this implementation is tuned towards performance and should be the optimum in that regard
\section{Technologies}
Short section; CUDA, PTX, Julia, CUDA.jl
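
To accompany the note on one kernel per expression: a hedged CUDA.jl sketch (not this repository's actual kernels) of what that looks like. The expression p[1] * x[1] + p[2] is baked into its own kernel, so the device code contains no per-expression branching or interpretation at runtime; each expression would get its own such kernel, launched over all variable sets.

using CUDA

function expr1_kernel!(out, X, p)           # kernel specialised to a single expression
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= size(X, 2)
        @inbounds out[i] = p[1] * X[1, i] + p[2]   # the expression, hard-coded
    end
    return nothing
end

X = CUDA.randn(Float32, 5, 1000)            # one variable set per column
p = CuArray(randn(Float32, 2))
out = CUDA.zeros(Float32, size(X, 2))
@cuda threads=256 blocks=cld(size(X, 2), 256) expr1_kernel!(out, X, p)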

Binary file not shown.