Benchmarking: ran further tests. The transpiler appears to take a very long time to run; this needs further investigation.
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
				
			
		
			
				
	
				CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
				
			CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
				
			This commit is contained in:
		| @ -34,6 +34,7 @@ function interpret_gpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector | |||||||
|  |  | ||||||
| 	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl) | 	for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl) | ||||||
| 		results = Interpreter.interpret(exprs, X, p) | 		results = Interpreter.interpret(exprs, X, p) | ||||||
|  | 		println("got results") | ||||||
| 	end | 	end | ||||||
|  |  | ||||||
| 	return results | 	return results | ||||||
|  | |||||||
| @ -27,15 +27,17 @@ NOTE: This function is not thread safe, especially cache access is not thread sa | |||||||
| function expr_to_postfix(expression::Expr)::PostfixType | function expr_to_postfix(expression::Expr)::PostfixType | ||||||
| 	expr = expression | 	expr = expression | ||||||
| 	if expression.head === :-> | 	if expression.head === :-> | ||||||
| 		if typeof(expression.args[2]) == Float64 | 		# if typeof(expression.args[2]) == Float64 | ||||||
| 			println() | 		# 	println() | ||||||
| 			println("Expression: $expression") | 		# 	println("Expression: $expression") | ||||||
| 			println("Expr: $expr") | 		# 	println("Expr: $expr") | ||||||
| 			println() | 		# 	println() | ||||||
| 			dump(expression; maxdepth=10) | 		# 	dump(expression; maxdepth=10) | ||||||
| 		end | 		# end | ||||||
| 		# if the expression equals (x, p) -> (...) then the below statement extracts the expression to evaluate | 		# if the expression equals (x, p) -> (...) then the below statement extracts the expression to evaluate | ||||||
| 		if expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped | 		if typeof(expression.args[2]) == Float64 | ||||||
|  | 			return [convert_to_ExpressionElement(expression.args[2])] | ||||||
|  | 		elseif expression.args[2].head == :block # expressions that are not generated with the parser (./test/parser.jl) contain this extra "block" node, which needs to be skipped | ||||||
| 			expr = expression.args[2].args[2]  | 			expr = expression.args[2].args[2]  | ||||||
| 		else # ... if they are generated with the parser, this node is not present and therefore doesn't need to be skipped | 		else # ... if they are generated with the parser, this node is not present and therefore doesn't need to be skipped | ||||||
| 			expr = expression.args[2] | 			expr = expression.args[2] | ||||||
| @ -48,8 +50,6 @@ function expr_to_postfix(expression::Expr)::PostfixType | |||||||
|  |  | ||||||
| 	postfix = PostfixType()	 | 	postfix = PostfixType()	 | ||||||
|  |  | ||||||
| 	 |  | ||||||
|  |  | ||||||
| 	# Special handling in the case where the expression is an array access | 	# Special handling in the case where the expression is an array access | ||||||
| 	# This can happen if the token is a variable/parameter of the form x[n]/p[n] | 	# This can happen if the token is a variable/parameter of the form x[n]/p[n] | ||||||
| 	if expr.head == :ref | 	if expr.head == :ref | ||||||
|  | |||||||
| @ -30,7 +30,7 @@ function interpret(expressions::Vector{Expr}, variables::Matrix{Float32}, parame | |||||||
| 	cudaResults = CuArray{Float32}(undef, variableCols, length(exprs)) | 	cudaResults = CuArray{Float32}(undef, variableCols, length(exprs)) | ||||||
|  |  | ||||||
| 	# Start kernel for each expression to ensure that no warp is working on different expressions | 	# Start kernel for each expression to ensure that no warp is working on different expressions | ||||||
| 	@inbounds for i in eachindex(exprs) | 	@inbounds Threads.@threads for i in eachindex(exprs) | ||||||
| 		numThreads = min(variableCols, 256) | 		numThreads = min(variableCols, 256) | ||||||
| 		numBlocks = cld(variableCols, numThreads) | 		numBlocks = cld(variableCols, numThreads) | ||||||
|  |  | ||||||
|  | |||||||
| @ -157,7 +157,7 @@ function get_kernel_signature(kernelName::String, parameters::Vector{DataType}, | |||||||
| 	println(signatureBuffer, "(") | 	println(signatureBuffer, "(") | ||||||
| 	 | 	 | ||||||
| 	for i in eachindex(parameters) | 	for i in eachindex(parameters) | ||||||
| 		print(signatureBuffer, "  .param .u64", " ", "param_", i) | 		print(signatureBuffer, "  .param .u64 param_", i) | ||||||
|  |  | ||||||
| 		parametersLocation = Utils.get_next_free_register(regManager, "rd") | 		parametersLocation = Utils.get_next_free_register(regManager, "rd") | ||||||
| 		println(paramLoadingBuffer, "ld.param.u64   $parametersLocation, [param_$i];") | 		println(paramLoadingBuffer, "ld.param.u64   $parametersLocation, [param_$i];") | ||||||
|  | |||||||
| @ -63,11 +63,11 @@ if compareWithCPU | |||||||
| end | end | ||||||
|  |  | ||||||
| # cacheInterpreter = Dict{Expr, PostfixType}() | # cacheInterpreter = Dict{Expr, PostfixType}() | ||||||
| # suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps) | suite["GPUI"]["nikuradse_1"] = @benchmarkable interpret_gpu(exprs, X_t, parameters; repetitions=expr_reps) | ||||||
|  |  | ||||||
| # cacheTranspilerFront = Dict{Expr, PostfixType}() | # cacheTranspilerFront = Dict{Expr, PostfixType}() | ||||||
| # cacheTranspilerRes = Dict{Expr, CuFunction}() | # cacheTranspilerRes = Dict{Expr, CuFunction}() | ||||||
| suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) | suite["GPUT"]["nikuradse_1"] = @benchmarkable evaluate_gpu(exprs, X_t, parameters; repetitions=expr_reps) # Takes forever. Needs more investigation | ||||||
|  |  | ||||||
| tune!(suite) | tune!(suite) | ||||||
| BenchmarkTools.save("params.json", params(suite)) | BenchmarkTools.save("params.json", params(suite)) | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| [{"Julia":"1.11.4","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"medium varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"large varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"small varset":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":1000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]] | 
[{"Julia":"1.11.5","BenchmarkTools":{"major":1,"minor":6,"patch":0,"prerelease":[],"build":[]}},[["BenchmarkGroup",{"data":{"CPU":["BenchmarkGroup",{"data":{},"tags":["CPUInterpreter"]}],"GPUT":["BenchmarkGroup",{"data":{},"tags":["GPUTranspiler"]}],"GPUI":["BenchmarkGroup",{"data":{"nikuradse_1":["Parameters",{"gctrial":true,"time_tolerance":0.05,"evals_set":false,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":["GPUInterpreter"]}]},"tags":[]}]]] | ||||||
		Reference in New Issue
	
	Block a user