small updates and notes for further writing
Commit c62aff806a (parent ef721b13e0)
LICENSE (2 changed lines)
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2024 Daniel Wiplinger
+Copyright (c) 2024 Daniel Roth

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector

     results = Matrix{Float32}(undef, ncols, length(exprs))

-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
         results = Interpreter.interpret(exprs, X, p)
     end

@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{

     results = Matrix{Float32}(undef, ncols, length(exprs))

-    for i in 1:repetitions # Simulate parameter tuning
+    for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially)
         results = Transpiler.evaluate(exprs, X, p)
     end

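The comments added above only describe what the benchmark loop stands in for; the loop itself repeats the same call with identical arguments. A minimal sketch of the local search being simulated, assuming the Interpreter.interpret API shown in this diff (the tune_parameters helper and the perturbation rule are illustrative assumptions, not code from the repository):

# Sketch only: X stays fixed, p is nudged slightly each step, and every evaluation
# depends on the previous update, so the steps must run sequentially.
function tune_parameters(exprs::Vector{Expr}, X::Matrix{Float32},
                         p::Vector{Vector{Float32}}; steps::Int = 100,
                         stepsize::Float32 = 0.01f0)
    results = Matrix{Float32}(undef, size(X, 2), length(exprs))
    for _ in 1:steps
        for pk in p
            pk .+= stepsize .* randn(Float32, length(pk))  # placeholder update rule
        end
        results = Interpreter.interpret(exprs, X, p)  # must finish before the next p update
    end
    return results
end
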
@@ -5,6 +5,10 @@ using .Transpiler
 using .Interpreter

 const BENCHMARKS_RESULTS_PATH = "./results"
+
+# TODO: Expressions can get much, much bigger (into the millions) (will be provided by Mr. Kronberger)
+# TODO: Variable sets: 1000 can be considered the minimum; 100,000 can be considered the maximum (will be provided by Mr. Kronberger)
+
 exprsCPU = [
     # CPU interpreter requires an anonymous function and array refs
     :(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [

 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-expr_reps = 100 # 100 parameter optimisation steps basically
+expr_reps = 100 # 100 parameter optimisation steps (local search, performed sequentially; only p changes, not X)


 @testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
     suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 end

-X_small_GPU = randn(Float32, 5, varsets_small)
+X_small_GPU = randn(Float32, 5, varsets_small) # column-major
 suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

-X_medium_GPU = randn(Float32, 5, varsets_medium)
+X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
 suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

-X_large_GPU = randn(Float32, 5, varsets_large)
+X_large_GPU = randn(Float32, 5, varsets_large) # column-major
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)

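The "# column-major" notes mark a layout decision: Julia matrices are column-major, so with X shaped (variables x variable sets) the five values of one variable set sit in consecutive memory. A small illustration of that property (the demo variable names are not from the repository):

# Column-major layout: within a column, elements are adjacent in memory.
X_demo = randn(Float32, 5, 4)          # 5 variables, 4 variable sets
@assert strides(X_demo) == (1, 5)      # stride 1 down a column, 5 across a row
one_varset = @view X_demo[:, 2]        # one variable set is a contiguous 5-element slice
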
@@ -1,6 +1,8 @@
 using ExpressionExecutorCuda
 using Test

+using BenchmarkTools
+
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
 include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
@@ -20,5 +22,5 @@ end

 @testset "Performance tests" begin
     # include("PerformanceTuning.jl")
-    include("PerformanceTests.jl")
+    # include("PerformanceTests.jl")
 end
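With using BenchmarkTools added here and the performance suite temporarily commented out, the benchmarks are presumably meant to be run on demand. A minimal sketch of how a BenchmarkTools suite like the one in PerformanceTests.jl is typically tuned, run, and saved (the group contents and the output file name are illustrative, not taken from the repository):

using BenchmarkTools

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup()
suite["CPU"]["example"] = @benchmarkable sum(rand(Float32, 1_000))  # stand-in benchmark

tune!(suite)                              # pick evaluation counts per benchmark
results = run(suite; verbose = true)
BenchmarkTools.save(joinpath("./results", "example.json"), median(results))
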
@@ -1,3 +1,5 @@
+RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of that should be in the implementation chapter, though)
+
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here

@@ -3,6 +3,8 @@
 somewhere in here explain why one kernel per expression and not one kernel for all expressions

+Go into the details of why this implementation is tuned towards performance and why it should be the optimum in that regard
+
 \section{Technologies}
 Short section; CUDA, PTX, Julia, CUDA.jl

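The note above refers to the kernel-per-expression design. A minimal CUDA.jl sketch of that launch pattern, assuming a hand-written kernel for a single expression such as p[1] * x[1] + p[2] (the kernel and variable names are illustrative and not taken from the repository):

using CUDA

# Device kernel for one expression; each thread evaluates one variable set (one column of X).
function expr1_kernel!(out, X, p)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= size(X, 2)
        @inbounds out[i] = p[1] * X[1, i] + p[2]
    end
    return nothing
end

X_d = CUDA.randn(Float32, 5, 1_000)   # 5 variables, 1000 variable sets
p_d = cu(randn(Float32, 10))
out_d = CUDA.zeros(Float32, 1_000)

threads = 256
blocks = cld(size(X_d, 2), threads)
# One launch per expression: a second expression would get its own kernel and its own launch.
@cuda threads=threads blocks=blocks expr1_kernel!(out_d, X_d, p_d)
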
BIN thesis/main.pdf (binary file, not shown)