small updates and notes for further writing
LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2024 Daniel Wiplinger
+Copyright (c) 2024 Daniel Roth

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

@@ -27,7 +27,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector

 	results = Matrix{Float32}(undef, ncols, length(exprs))

-	for i in 1:repetitions # Simulate parameter tuning
+	for i in 1:repetitions # Simulate parameter tuning -> local search (X stays the same; p changes in small steps, so the iterations must run sequentially)
 		results = Interpreter.interpret(exprs, X, p)
 	end

@@ -41,7 +41,7 @@ function evaluate_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{

 	results = Matrix{Float32}(undef, ncols, length(exprs))

-	for i in 1:repetitions # Simulate parameter tuning
+	for i in 1:repetitions # Simulate parameter tuning -> local search (X stays the same; p changes in small steps, so the iterations must run sequentially)
 		results = Transpiler.evaluate(exprs, X, p)
 	end

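The "local search" these comments refer to could look roughly like the sketch below. This is hypothetical: `local_search` and `loss` are illustrative names, not part of this package. It shows why the loop is inherently sequential: X stays fixed, each candidate p is a small perturbation of the current best p, and step i+1 depends on the outcome of step i.

```julia
# Hypothetical sketch of the local search that the repetitions loop simulates.
# `evaluate` stands in for Interpreter.interpret or Transpiler.evaluate.
loss(results::Matrix{Float32}) = sum(abs2, results) # placeholder objective

function local_search(evaluate, exprs::Vector{Expr}, X::Matrix{Float32},
                      p::Vector{Vector{Float32}}; steps::Int = 100)
    best = loss(evaluate(exprs, X, p))
    for _ in 1:steps
        # take a small random step in p only; X is never touched
        candidate = [pi .+ 0.01f0 .* randn(Float32, length(pi)) for pi in p]
        l = loss(evaluate(exprs, X, candidate))
        if l < best # keep the candidate only if it improves the objective
            best, p = l, candidate
        end
    end
    return p
end

# e.g.: local_search(Interpreter.interpret, exprsGPU, X_small_GPU, p; steps = expr_reps)
```
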
@@ -5,6 +5,10 @@ using .Transpiler
 using .Interpreter

 const BENCHMARKS_RESULTS_PATH = "./results"
+
+# TODO: Expressions can get much, much bigger (into the millions; will be provided by Mr. Kronberger)
+# TODO: Variable sets: 1000 is the minimum and 100,000 the maximum (will be provided by Mr. Kronberger)
+
 exprsCPU = [
 	# CPU interpreter requires an anonymous function and array refs
 	:(p[1] * x[1] + p[2]), # 5 op
@@ -24,7 +28,7 @@ exprsGPU = [

 # p is the same for CPU and GPU
 p = [randn(Float32, 10) for _ in 1:length(exprsCPU)] # generate 10 random parameter values for each expr
-expr_reps = 100 # 100 parameter optimisation steps basically
+expr_reps = 100 # 100 parameter optimisation steps (local search; sequential; only p changes, X stays fixed)


 @testset "CPU performance" begin
@@ -89,15 +93,15 @@ if compareWithCPU
 	suite["CPU"]["large varset"] = @benchmarkable interpret_cpu(exprsCPU, X_large, p; repetitions=expr_reps)
 end

-X_small_GPU = randn(Float32, 5, varsets_small)
+X_small_GPU = randn(Float32, 5, varsets_small) # column-major
 suite["GPUI"]["small varset"] = @benchmarkable interpret_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["small varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_small_GPU, p; repetitions=expr_reps)

-X_medium_GPU = randn(Float32, 5, varsets_medium)
+X_medium_GPU = randn(Float32, 5, varsets_medium) # column-major
 suite["GPUI"]["medium varset"] = @benchmarkable interpret_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["medium varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_medium_GPU, p; repetitions=expr_reps)

-X_large_GPU = randn(Float32, 5, varsets_large)
+X_large_GPU = randn(Float32, 5, varsets_large) # column-major
 suite["GPUI"]["large varset"] = @benchmarkable interpret_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
 suite["GPUT"]["large varset"] = @benchmarkable evaluate_gpu(exprsGPU, X_large_GPU, p; repetitions=expr_reps)
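A note on the new `# column-major` annotations (my reading of why they were added, not stated in the commit): Julia stores matrices column-major, so with `randn(Float32, 5, n)` each variable set occupies one contiguous column of 5 floats, which is the layout the GPU side wants for contiguous copies and coalesced access. A quick REPL check using only standard Julia:

```julia
X = randn(Float32, 5, 3) # 5 variables per set, 3 variable sets
X[:, 2]                  # one variable set == one column
stride(X, 2)             # 5: consecutive columns are 5 floats apart
vec(X)[1:5] == X[:, 1]   # true: a column is contiguous in memory
```
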
@@ -1,6 +1,8 @@
 using ExpressionExecutorCuda
 using Test

+using BenchmarkTools
+
 const baseFolder = dirname(dirname(pathof(ExpressionExecutorCuda)))
 include(joinpath(baseFolder, "src", "Utils.jl"))
 include(joinpath(baseFolder, "src", "ExpressionProcessing.jl"))
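`using BenchmarkTools` is pulled in here because the (currently disabled) PerformanceTests.jl builds its suite with `@benchmarkable`. Running such a suite typically follows the standard BenchmarkTools pattern; a minimal sketch with a stand-in workload, not code from this commit:

```julia
using BenchmarkTools

suite = BenchmarkGroup()
suite["CPU"] = BenchmarkGroup()
# stand-in workload; the real suite benchmarks interpret_cpu/interpret_gpu/evaluate_gpu
suite["CPU"]["small varset"] = @benchmarkable sum(abs2, x) setup = (x = randn(Float32, 5, 100))

tune!(suite)                          # pick evals/samples per benchmark
results = run(suite; verbose = true)  # execute all benchmarks in the group
median(results)                       # aggregate to median timings
```
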
@@ -20,5 +22,5 @@ end

 @testset "Performance tests" begin
 	# include("PerformanceTuning.jl")
-	include("PerformanceTests.jl")
+	# include("PerformanceTests.jl")
 end
@@ -1,3 +1,5 @@
+RE-READ to ensure the concepts explain why this is done to improve performance and why this should be the "locally best" implementation (most of this should be covered in the Implementation chapter, though)
+
 \chapter{Concept and Design}
 \label{cha:conceptdesign}
 % introduction to what needs to be done. also clarify terms "Host" and "Device" here
@@ -3,6 +3,8 @@

 somewhere in here explain why one kernel per expression and not one kernel for all expressions

+Go into the details of why this implementation is tuned towards performance and should be optimal in that regard
+
 \section{Technologies}
 Short section; CUDA, PTX, Julia, CUDA.jl
thesis/main.pdf (BIN)
	Binary file not shown.