From f33551e25fb46e090602942d2b82f46d7b41e7f5 Mon Sep 17 00:00:00 2001 From: Daniel Date: Mon, 19 May 2025 11:39:49 +0200 Subject: [PATCH] benchmarking: updated transpiler to drastically reduce the number of transpilations at the expense of memory usage --- package/src/ExpressionExecutorCuda.jl | 20 ++++--- package/src/Transpiler.jl | 80 ++++++++++---------------- thesis/chapters/evaluation.tex | 17 ++---- thesis/main.pdf | Bin 924411 -> 923864 bytes 4 files changed, 48 insertions(+), 69 deletions(-) diff --git a/package/src/ExpressionExecutorCuda.jl b/package/src/ExpressionExecutorCuda.jl index b361411..81cd408 100644 --- a/package/src/ExpressionExecutorCuda.jl +++ b/package/src/ExpressionExecutorCuda.jl @@ -49,19 +49,26 @@ end # Convert Expressions to PTX Code and execute that instead function evaluate_gpu(expressions::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector{Float32}}; repetitions=1)::Matrix{Float32} @assert axes(expressions) == axes(p) - variableCols = size(X, 2) - variableRows = size(X, 1) + numVariableSets = size(X, 2) # nr. of columns of X + variableSetSize = size(X, 1) # nr. of rows of X variables = CuArray(X) - exprs = Vector{ExpressionProcessing.PostfixType}(undef, length(expressions)) + largestParameterSetSize = Utils.get_max_inner_length(p) # parameters get transformed into matrix. Will be nr. 
of rows in parameter matrix + + compiledKernels = Vector{CuFunction}(undef, length(expressions)) + kernelName = "evaluate_gpu" @inbounds Threads.@threads for i in eachindex(expressions) - exprs[i] = ExpressionProcessing.expr_to_postfix(expressions[i]) + ex = ExpressionProcessing.expr_to_postfix(expressions[i]) + ptxKernel = Transpiler.transpile(ex, variableSetSize, largestParameterSetSize, numVariableSets, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing + compiledKernels[i] = Transpiler.CompileKernel(ptxKernel, kernelName) end - results = Matrix{Float32}(undef, variableCols, length(exprs)) + results = Matrix{Float32}(undef, numVariableSets, length(compiledKernels)) for i in 1:repetitions # Simulate parameter tuning -> local search (X remains the same, p gets changed in small steps and must be performed sequentially, which it is with this impl) - results = Transpiler.evaluate(exprs, variables, variableCols, variableRows, p) + # evaluate the pre-compiled kernels; the third argument is the nr. of variable sets (columns of X), not the variable set size + # results = Transpiler.evaluate(exprs, variables, numVariableSets, variableSetSize, p) + results = Transpiler.evaluate(compiledKernels, variables, numVariableSets, p) end return results @@ -103,7 +110,6 @@ function interpret_cpu(exprs::Vector{Expr}, X::Matrix{Float32}, p::Vector{Vector res end - # Flow # input: Vector expr == expressions contains eg. 4 expressions # Matrix X == |expr| columns, n rows. n == number of variabls x1..xn; n is the same for all expressions --- WRONG diff --git a/package/src/Transpiler.jl b/package/src/Transpiler.jl index 059fca2..33afd7e 100644 --- a/package/src/Transpiler.jl +++ b/package/src/Transpiler.jl @@ -14,37 +14,6 @@ const Operand = Union{Float32, String} # Operand is either fixed value or regist " function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVars::CuArray{Float32}, variableColumns::Integer, variableRows::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32} - # TODO: test this again with multiple threads. 
The first time I tried, I was using only one thread - # Test this parallel version again when doing performance tests. With the simple "functionality" tests this took 0.03 seconds while sequential took "0.00009" seconds - # Threads.@threads for i in eachindex(expressions) - # cacheLock = ReentrantLock() - # cacheHit = false - # lock(cacheLock) do - # if haskey(transpilerCache, expressions[i]) - # kernels[i] = transpilerCache[expressions[i]] - # cacheHit = true - # end - # end - - # if cacheHit - # continue - # end - - # formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i]) - - # kernel = transpile(formattedExpr, varRows, Utils.get_max_inner_length(parameters), variableColumns, i-1) # i-1 because julia is 1-based but PTX needs 0-based indexing - - # linker = CuLink() - # add_data!(linker, "ExpressionProcessing", kernel) - - # image = complete(linker) - - # mod = CuModule(image) - # kernels[i] = CuFunction(mod, "ExpressionProcessing") - - # @lock cacheLock transpilerCache[expressions[i]] = kernels[i] - # end - cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info) # each expression has nr. of variable sets (nr. 
of columns of the variables) results and there are n expressions @@ -54,33 +23,44 @@ function evaluate(expressions::Vector{ExpressionProcessing.PostfixType}, cudaVar blocks = cld(variableColumns, threads) kernelName = "evaluate_gpu" - # TODO: Implement batching as a middleground between "transpile everything and then run" and "tranpile one run one" even though cudacall is async @inbounds Threads.@threads for i in eachindex(expressions) - # if haskey(resultCache, expressions[i]) - # kernels[i] = resultCache[expressions[i]] - # continue - # end - - # formattedExpr = ExpressionProcessing.expr_to_postfix(expressions[i]) kernel = transpile(expressions[i], variableRows, Utils.get_max_inner_length(parameters), variableColumns, i-1, kernelName) # i-1 because julia is 1-based but PTX needs 0-based indexing - - linker = CuLink() - add_data!(linker, kernelName, kernel) - - image = complete(linker) - mod = CuModule(image) - compiledKernel = CuFunction(mod, kernelName) - + compiledKernel = CompileKernel(kernel, kernelName) cudacall(compiledKernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) end - # for kernel in kernels - # cudacall(kernel, (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) - # end - return cudaResults end +" +A simplified version of the evaluate function. It takes a list of already compiled kernels to be executed. This should yield better performance, where the same expressions should be evaluated multiple times i.e. for parameter optimisation. `nrOfVariableSets` is the nr. of variable sets (nr. of columns of the variables); the result has one row per variable set and one column per kernel. +" +function evaluate(kernels::Vector{CuFunction}, cudaVars::CuArray{Float32}, nrOfVariableSets::Integer, parameters::Vector{Vector{Float32}})::Matrix{Float32} + + cudaParams = Utils.create_cuda_array(parameters, NaN32) # maybe make constant (see PerformanceTests.jl for more info) + + # each expression has nr. of variable sets (nr. 
of columns of the variables) results and there are n expressions + cudaResults = CuArray{Float32}(undef, nrOfVariableSets, length(kernels)) + + threads = min(nrOfVariableSets, 256) + blocks = cld(nrOfVariableSets, threads) + + @inbounds Threads.@threads for i in eachindex(kernels) + cudacall(kernels[i], (CuPtr{Float32},CuPtr{Float32},CuPtr{Float32}), cudaVars, cudaParams, cudaResults; threads=threads, blocks=blocks) + end + + return cudaResults +end + +function CompileKernel(ptxKernel::String, kernelName::String)::CuFunction + linker = CuLink() + add_data!(linker, kernelName, ptxKernel) + + image = complete(linker) + mod = CuModule(image) + return CuFunction(mod, kernelName) +end + # To increase performance, it would probably be best for all helper functions to return their IO Buffer and not a string # seekstart(buf1); write(buf2, buf1) " diff --git a/thesis/chapters/evaluation.tex b/thesis/chapters/evaluation.tex index 8e598c7..d0fa1f3 100644 --- a/thesis/chapters/evaluation.tex +++ b/thesis/chapters/evaluation.tex @@ -59,13 +59,10 @@ Results only for Interpreter (also contains final kernel configuration and proba \subsection{Performance Tuning} Document the process of performance tuning -Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking enabled (especially in kernel) +Initial: no cache; 256 blocksize; exprs pre-processed and sent to GPU on every call; vars sent on every call; frontend + dispatch are multithreaded + +1.) Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) -1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large -2.) Using @inbounds -> noticeable improvement in 2 out of 3 -3.) Tuned blocksize with NSight compute -> slight improvement -4.) 
used int32 everywhere to reduce register usage -> significant performance drop (probably because a lot more waiting time "latency hiding not working basically", or more type conversions happening on GPU? look at generated PTX code and use that as an argument to describe why it is slower) -5.) reverted previous; used fastmath instead -> imporvement (large var set is now faster than on transpiler) \subsection{Transpiler} Results only for Transpiler (also contains final kernel configuration and probably quick overview/recap of the implementation used and described in Implementation section @@ -75,13 +72,9 @@ Results only for Transpiler (also contains final kernel configuration and probab \subsection{Performance Tuning} Document the process of performance tuning -Initial: CPU-Side single-threaded; up to 1024 threads per block; bounds-checking enabled +Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded -1.) Blocksize reduced to a maximum of 256 -> moderate improvement in medium and large -2.) Using @inbounds -> small improvement only on CPU side code -3.) Tuned blocksize with NSight compute -> slight improvement -4.) Only changed things on interpreter side -5.) Only changed things on interpreter side +1.) 
Done before parameter optimisation loop: Frontend, transmitting Exprs and Variables (improved runtime) \subsection{Comparison} Comparison of Interpreter and Transpiler as well as Comparing the two with CPU interpreter diff --git a/thesis/main.pdf b/thesis/main.pdf index 48e715cedd82570234c23f67de82da62fa920e95..13f357eca10ac36fe27949aa0c9e015ff5b04a53 100644 GIT binary patch delta 8724 zcmai(WlSAFv#^1KySux)9ExjkcXxNE$ia%cJCx$?4n4TT!JR@WP+W@^_;}xZH}~gF z_Q^;lo0-k*W|Dnr1Fkgz8x!Q9fms910A^73jB6ez;ppWP3osW@FJ=PhFL&rsPSa`2 z@%iH;K0Kj@gF__O_8#nms6Vc&uI+7kOh-=-{PiGuxsSkpBa>=#z;U7+2Yp>me(eH3 zG{5^RS2GElHKugO?w5QR4RxO!rjpEA!srg5d)m9oOaDlUB@4VgLD7273+1o7(xQ=Vb91E@qu#V0L~+Nu08 zpy3oSA@^QjSf4GJ$tI}vMi|9d&Pdtd=@fadrg59ewXfehH$m^0g!%!`h%!p6u{~Sr z`SCk9=sQlp=-7^m3DL504&UWbLD1(R+Q0e{h2P)cz7^kyMLcOIf%z=8ny35!A*jQ3SW3 zw^5fC&YiY3jZ;vsUs=W>*?PR8EybpEkzK5X_m!pb@wCrV4f#yE9((4zrOkD#sw8*a z%@IU5#~{~7cxWDAP7?4`Spdpbu*R3oUEMxh z3V6+K@6r33HfZVTC`UdY_8+l_DPld0ub8&A?0F5?Mf;C$!u4q4SPP%=#4~qmDR|W{ z9ZkCo$s1E#2X^FN29)L=a{-=9Y;#N$d&gP!Ppt6Rf6nWBnZN8g`t}l|1p|I?dZMIZ zVc~>_D@KJYLVp1B0pJ7J58yt4?+8~!IMasaPlIk^1}*C;dT#P#_P^*}Au6-w?n+^& zrCCfheQG~zI^gG#$G}7~jAu%vlM}jVxMasqW{NtYEwJb5LIhj*Hf@b{gd|QX(CXh# z7X*0k3O*zr^@{m*2m?Q%bqhdHqH2fU8UC}Jh}$s~sfGKSX`y1~nh&1n$R*ir3e348zane# zPT&(hSIk$|LIOdY0PR0-}gIj-Hl=Yiv4UnulhLL3Tn2A03xmO7VzeNv4~}d4DX7U#zDKbUH3uu&C@uf@rD~pR%c$SoXHcEESENr?dY96CieRKyw8=SwtgCvZB zHjQpxX8ZZsPhyyuqqoe4h7|%DoJb7Ba_qoLwrM%GU9Aa;S!>X;CDpa1hC48!C2TF%0JnP5!+w~tq_ZjW zQ_po;_mf0QCjAAy@(Yc#N3;hmZwx9ZeZR)Aq_-KXUxgrc=(7uH{GYAp#2^8oLh9S| z8B{BgBICjZKaoB#lI>VlT;9Q7uYSE9(gl2Z(xMrH%NWid-`w51Xo(;`lBEY_M04?P zi<`Hensj82wFEX4nJsX!iO|$8cKIB3uu|Ye60f^dMgzfk+blcCW#qQgY{DbxYzQwl ze~4gO#fL9dU9Pk9Q<3pB{!sroD?(a`GD2E1+ff!HQE1Ivx$e|@n<7^Mj=Ql=H>p+B zNsBaYYeBZ0LLOo|ce;tUy@gIGd(+&dT@CAvZ3(Xq?m*S}l7TIXlQ5m=EZe0O8&VGCc}tk=a*32~wXt)O^bNz@gG-?{jX-OEa$a*V zJo7s}7_~k5;6@aZOQ2^<%+|DWADVR}+r-p$$_DW(OkkfzG&ke^S!V*ek|%YH!0uv> zp&M`_0-)M_ZGK6{ete~77zBJxYUyr-q`Q9Y*-nU#pyvz?ZVd~zjDK#K@$wM98yP4^ 
zI^40OFG_UCiH@bL?Bzk53e#+_^5{yrbG|n-i?}c9hiTQSE}d1XE;W=}RJs*$P#R}j z3JRdOdkL7|5k<7yZD|LML#jqz1tMOC@5WaQ`u4Mv*HJ=TN`(4(>|kV_(8MKx`RG2+ zm|iAU%Jeq^!$yBz!hY1b=OMe=Zzjp|>K}|MkU3?%()~=OTncbzXG=~Ts$RTi;$7QN ztI=na&yi>8u7^Dn@h%Th?-Z|-&$IQM!Sa9e0qOdYO4D@mFp+>x>M5LI?i`jh2UnU# zbxQt8O6685@FHE*`CeEPa^zcx1RZFkrT;Kj z#qo|5P_%aEz9|ODovLW(ZPXZk651RGX`1#uC>hJP!Lx3%^&<$*i$&y{5mGTEdsSSR z^oFcJs}%fvc6kUZ7=~Gk=_Il%ZRtzr{~@aw$n-~P(C`YyU@#Wq@o%yC;DH}}B_=s} zg+OCX_=T8hDESl>D*`Dxw6jbdo1u>}h6J8Yy??=vYd8sXu@RRt;M{!{QQ(-We|my} zWclaht6HRx7(8iRP(iIfTHc{H_67LR3MG#^egXngg^2c_dE#DePE7eWqM#dj4(=asdM-{=nblT)zNb;aJ%(TKksWihN`CKU6yX zk#{3po~|@O#O|-P%hYZ(f1oFHIad4H9N(bCNf_<_U_b?Ra`FVKcRZ#e(iciZH}kKf zZa&YE5{h+9#%g$HwN6rLdmN`e((#c{x_WsSymtMQM3IJK^n}#k^Bdn*VtjTg*j^MO zKP(Tn|7InXR_fI86;%;%`E}UD%1-q@vf+Ib3WolGRaSWIT_mK0MNu{Wtst868-ie{ z0wu1@^xM&Q^qHJTq;1r;eJB4b0i!+@!7QvX1e`4ODg;~@Ztkp?$6^O=!O1rqm65NagDT9gMDhML^Wf3#4bJxATE32nV%Ef)HvC zBD`+Xk_8(vSWMN(z{&WD7h;cF8qoOJ&)ncjM`6N8!1vY5KEI+jEj8DE;^)~d2glGA zRwP7$?zxVcRC0$|+Q-|-{OX(CdmAi;rOoKgZn-61Rd-*U_Cnb-$!czwLbT{>(oRb7 zug$_`vri4JEg1dXjt+ay{Iiz#zBTW>QBh{~ehrcucGe0R*+w@Mf0J~V?vBt;x7H64 zj*Uz_{#$NGRt>@+34mKrFzXr`kqKmA=t?q%7kt!^J$Co4#gpBWB_2wivKlT#g>xA^lbqM?+e@&mprU*_ZW0|&F?s3wx=6rEdXf?a_C^5*Gev=_9 z?cANr$@Hv1pQ;w`MLJOux&$_|(!w^MOOLiuHlKyP5epv-YeXBsvhUI+6e7P*1O82F z3n!_DH42ndy?AG1QW3WY?XYBa;{hGeroBGh(Ps$s)g|MIvyk>$i?VEo;%+*1~7#pTS=IWb$= z9l*b6L#3psM&xjzbfXQC@Qbao(*WT^+;5GL4Q8xbWSrcpRAS4vPF%-A?W^==e~@GyyS7;!75I0f z37xyXSs(HE0pjU0_zK3X4mv4bg^xX)Xr(a^knE2bGriQCp_J12*&Ke~MmX{-RVYiC z?z++EX>_pma3{)PtD8sS;W7k&d43tH_Ru%ShL6@0s3w3-)UaxXJaUU{pe?I&P4JoN zicvctQK-CK;VKeS3B>8g52v$;iAr3WxgJN*vR4+MIgY;VDDdy!)Q>CL03o>CRE9sE z*L$xo`|GQw@1kz;JFLqHwI|A|L15~_ZzrN=HXI8`iofA1YA}GX4OfNiW=zX0=t>lB z>?V7dL13mA#sdD7?7-;){!o%9=>q-%_XnU_0hqNv3(2CW4YWv$J$|%ck6k}NaB+t^ z?CRRog#pBKwQa=!+L(GPlKcc2y}EUN(w`x#`}yed6zuQs=kD>zS9m7}4Pk0zvd?cX z5@3bo_lW3g=3nWtX{G41XfqvlZUE!NX-os3e(Y^jDz}L9JDS^>+oHMl2_bE$X6oR0 zFYsWeb8mSH01_ab5v$*deEx!&tS_hz3|Zo#Zr$)dzAZ?vGLI0o)TZzRMBMkB{6Iq4 
zPio8;);$heCJc-Y@|zj5^iS3PPB##6_VrHWX&!A#Yy(7pP_MjT>BJq_qQ@F~J)<{G zsOV$jwEfF0A`dN!;;oPPL9Bq|O^Br5*rafVW3@CaY0TxZh9o#5)@^E&O-oR8_dZn9 z_N-7+g$GF8=y4ZYQfH%j<&l9ZB-8jfQ#m0e@UKzW;M3Yk&7v>EM`dNJVmYHeBLuNb zev}DMRv^|gZVO(?gk0SSFAiH>zuUmBE69!Vj_mtn(cb=3B8Nwen&ztp)?qn^4U6h; z@-bfqU0kC5dx!mbwX$%bdf9Mo=-b=R;jHR>&_0=%Ci+ETZv6g`?%UcL zgC9sTARd2x)dS?w7qTaOuuUH%YZZA%(ytb2b$3woThWrj^y2oBctZKl=!Qk?~m$vTC=PAy7wz=1vn{#mp&DT$)LGN>ykgo4KSnQ#eocH_|)jE|V=H2lY>w2S` zk96?&Zln80)ClfI1WAJaTJe@bBa(|B{Q&lJcULIP;B+ zlHDj%l{)xH)aj`p)#5{liX{^?$W{17%;#!NjB~tGk@eUC65TtP!E#Nc_i8Ov@(TWn zO`i^BB>1WB{!&j!y;o+(VlWFWIWYnEyQykJd&|9)ABpTTVaoP&K_2&;f!FsAPeq`N z4)gBMOUR+FRXK2w3H0(WFS5lrMuG+*EOl~EmioPw5nq-v#(k-SK0HVc=t}t zU<>!Cn0RYrdH&{F9l4$2m))4}A|6Ny;nMll#pEm-XKiQbCU>tA8UaNHjr z{SmpI5VRv4(%{Pqf|w4?;NykvFljB&yruL8@%LUEs0>Bl{{A{!D&byeq{abFo&SMu z($K4rdppz@MG zvT3)#eL&34-Ui#6D(dP*cmf2Z{cQ<>Wc7PIyPougS|y4O53|GZzV7^Qy`C*V+zDy0 z+Oyg6oWdl&B@~E_sp|V zuJ7&FR@tN9&qbAA>}zv2KPB*N5~ghuYE}f1SbF?txsJk*p@|9n+X7vA4pCG*kdV3f zmGyC)`)hhks8C$BHkE|cUa+MeNuSY_RdkYJ(N)!_%2-R1_R+h?yCEX)pfYMl6bo;jb8tu$ zJcy7LUJ0wWaePkg3j+1ZVM8FL*N(v|ZbW!<>Aq{xSuhl}B!gHIrkMEr5fuOG)gBxn z?vGP@^FKk&221F9OOl3`(tfH+zs+EK)y62A*If-Qo7~sXr|5XUK%9E%^Hc5bbFcan zv3?fxKIm2)-Hse@)xVQUTBx`?hDB7Gi9JVuQrL$r6DTItI!8{?k!^dUz)g#Nhzf-u ze1P}?(g(;NpnQPZ5eh-WVnPt);^O4ybjqoel;4FgAAmS2#ck4K7|Tk4}D zczFc`xp?`cfU=U@61*RCmMotrrSSh5VvNk8WbI<>Wk<=wE0FaK^EH990dZl(BIUFyE=hZ1-H&=@K=-`235kCdxZ zU}wH}Md;gwTJ@mBKrTj9D0Z6P)ph(HpLE=TR=iVEr7-L&5K^aZh2wi{2g6o8UqbV@ z+Zxen&w$4cp@#L-bM>eroby(5f)2wUm>LNIJ-|wsx0%Qq6n_{^L-=k+=%wcssEgm$+LyUn8$(DF=?2E*0VI2NxxqaDVs!ceoa8BcKa=W(|8K=uYbog*a{2o%LByh z<{3Z@6)Eh6Ue~2@495SE3Im9Ye!&rda^gw@WBimXTwqwX!t_Ll4kj2u@X0!Fq~h&w z!e&9&ayxV3EKt@jiuh$>^+EVfF?eg=of;%K0JN8?`gbGf%2{m_im0pE5*U=n3pK5+ zcH4tniB;cp^om;}Q)u5JWlUdcBD?L|T|vVn3Ax5DNu3S%=FL$T3zMNRX2ymfrttBz ze;raqE{vL-l1Pw=3WhL0Cup-#YC+kJ`iMP8;qHgL+wARr^jDx<`z<$ z!%9=NILOFseiE(BID!otiCc4mnq|lewn!H(;7pGB1MXi>^lZDr1MaG6vh%<;jq@I9 zt`IhnHjdNRFSm8#Nn7xGW4i62@ZkS6{R`Nnam2&(JFzv?8BEUrIk})r9LB&TCLufm 
zn2@k_>zCqlUAa%PDJlGC&anYlAS!+6Xx{(tTIhZr7TW(>07&hFe+zFTLG8z94?`ofpwfb06Ur~0BkB%WbJU!!Uib*;6k|SXCZ(9EW?R` z52kBGWDlovgJbl2?!2io+SBYBtnjcV${H7pQoH9j05V9K6CqEj=|v7UYeRjtc7__T zCM(7%UPL{x`Wqj(=-~>|x{Mrci9P7=AgNmVYtZD$IY>eRwkh_mD>4ZhQMXtz>naMA z0(kM%Abn}GvID`bkGFA6=TX9Ubozv3_NTHoEW=H`o{JHeAvnQ0vOU}gc{!7bXRp&w zyTelh4n7Fd|Vbc7oKNpB%?TSC^3;j?tiAnO<;sj#NLLshqJ(ec|p@bi9>v zTlWu{=GMw8Xd4vsseFKOt)C%Wm`XO`cm4VvDBWkrEfqmTJ)Jp_is$)5<5JtA)U=VQ zn`R=7cpcD%yLd6S=OSa>c|<7O;cp0o~Kx26b3{j7VjADc4v?av3cg8N8rYaJ9ImjRa~rcOi%L6$?ClAQso#* z3`}~8rN+<$JW%b})NEiU<3mnhs!{%PQK7U544}DD{xBD}L69J(dT8CyVJKRx&~;z` zngPT$%yGmWq0!)^$uQ-_DZi~wX0dg`S6%#!-_Or-ROb)xu5|2ihE16_F=u!Dt}K(NSi& zrS958X<>f^fXrtnG2WEKq|`;_iMWUzR6;S5nwyp(m>Gh+A>Nbm z!*q#fB}|0{beB8L4i2uHL}=astmr>!@|kT03#6IW`L;?l@YpH0DAL+`C6_m5hp2Ktu%s~4wp$?E=E!TSED2uJ0u;d zwE7WITmec0Cb>Nvy||B!T$O1^bMh*kTbPcKNmThE1iD>x*%l*9!ek+b z6u4n?Aoi})v3r#MEKCP=j0rwxB3Jdl#qFr@6CTY34^!+lfKZm;~cX?>8jQ(9%uG5I7B3#$QBg#G(}5=B5ccIMZD|L-INI|(XA zUkZnZz~)Mz*ugm$1H8hPLxRYTbm3+jkV%d4fL_dQJ6gNFRbYS3P@1a1Zl{ARq1I5V zQ{4(3EG=;wJ-7`Cf{$^=@aclV>sGY51cZnR$T)NYowjByk^7O_byUTAA^+!O34!P~ zU{)ZsN|>MhqgE^aMIO#lP`QFd<@)|cL+!pKN*TlcLjM+1GeN@LZ=2Uo4f zjaGZNvP#Vt-H3K1$5c`nyEg?hJ))TQk&cQHNkSe`K~fDnEOoc$!v$m=F#m8}8T{9U znJ%5@?C>iBCqCkPbSXilNin^OPe?&AMq`$0v~`IkC7u&3FBPa5Nk!?(G|Vz>ocZrV zbv&&sl7j-G-NGaab~Ee&EhB?ES0lNbPO@8lDym-D950l*TzTEX-&m!=G|ajPKiZ1g zg;xx3%*x-qL$0v787h#q>Cu}w^q4~~EVq(Q8lKq{Xd z^Jjt5f4m%$Hpvvcsz-q`UgPt};;DpP(8aF%)&`Y}=L59G=rP?AN*)lZ&m9zLe}q=h zDn7mv)|S&N}l?6SW$P3<6X28qn;?#kpF|&Pm~q&vo!MrY{i&2|A$D zws+&)+PA!)8a3<0&$i1BO_51IOzN~(t{yt7mK_qJc-l5m zYZh}ap1KT-SMH`X+p_%qt0I4H*Eq(Ws27Kjx0JtMsL^)-ni(!&X;pfOlX?Qz@0=EFGjhBu)oSsamTcp3;*QtY5N=XH9ay_~bEJk~rFU!;O}#&0;scev~NF|sF? 
zSxH!~o+jCMD4HHAA?krjj03NMm&6a_6u1<=D0K}c54;Ns%jC4IGtSd5bp(4h6&fZ8 z8MEzD?lQz(0d_ORJg`g>vdC4CgKC|w--6skz0P!=gX_Pnp@+FULoxo&^FyETLu611 z2?CYD(A6K{t0cgUXXELpg>iB}m?bC_-b)FnkFt@BT)n@5$Hudl{1{9-wIE07`{obn zUN@FQvE|-w*_UdSFRuCyxnQ@S%F>@>=Zoe{iyzj!wzh9E`tS&AV$|7CZh$FdozUo-Te;1yNuwl{*b%`2m z?W__5O1>y%iX2D#R5NkUJDO%?E6gpQx7X9ltc5>07^Whxe0^z31%aP_Q*s{AuZ~8m zYU*#ky?xr_U{%zl=xT}rZ@_MvP;|lC+I3CQJ&oHAAn#8ET+$Cvt!T>QJTN3}EIQ_r zq-L30em`Bv4%hojC7Dxe$n$g1JOPQgLdXjaAKc?*lnO1FR3#bkVt@6ijfvd~1lcg2 zxJOc>AM#%}S{pU`Yzw9@70(J+4M#x-9_GQh0%;pjqDXC9H8+?%D$4>fu~sNO@?=7P zMQ347)gt^_#YM~Qu?~FU!GDA?02$NJ&1}`eQsxFD+>4BFU1Wg1rgb?K63t9@TU z8X_MVA`kcn^gm$!0s9ZQf55ke$Rk{8!tnF(WehaJFoJcqoHjw2;5p;HFGNvz0eUOQ z_KNClkMne5`zCN3!W`IUZr^B?QCdMrI9&&_gb}l^ zO1L%F8eq*qNaEmHbm(o}RK8$9!Sn_5y>tqPtCiTWuTrg1VrZ7z<-1l;2%m=@D^KEe z5~W>yr`W+lj`xQ4p`SW$9E^%JYIeuxsVgVKFvr8W>ddLm#2K>RmAd!ky7FYzQh7TE zD}|ucY9-MJ?gtDOf}*{w#4}2V zd4q`Mo1MhUA;S}!3Wfv`ER(r#O}0&VEgW(4Hca?QR1&eP3Ntn6Y*398 z#5vrodLQ$CMflwnDW`DD=G}8A6kgj70-n6z0XMJ`9Ry_6}fBPi8 z%P8D5~RNpFO zn{*e^Zw=|G5bxqZU+01nq^^m@J<%N8eOz#~0vXKF;cE=f61fYn1^hwsNWBpKj#OrCMCZjcQnX>jM|GHJ{Kd)Y3j%$_Aax%YC7iuQLT>J#{Q zkR=~1G)DVt5B7fbg?OET#-9%?^1&gJ_aK4OBs+D>;+`--h9HqgtzH`lnG zWC;#ubveB@^imau9*4Gx3rE%lCs;jVsj60CdJJ zU@$c;_kzBorMos2$t`D|YQG-PVST7{($8t@Lv1%P4r@~<_kNB~>RYpU;V4f7euDmW zC&ueKbdTYxzKkLI*oVRnvr0Z#Efu{Q*NZQDj#R8ZF<^ zm+Ej4#D&kH&Ghw>+a_NI;^<2koR6k5$12)3u^vj_tzV8(AdZND!>l@f>eeWf9+DP* zo8Kqmq1(ql>!q$7@X{3P4*~bv$cKMs)!xCpS}hfCT{#u*lT4_2*AV}0U{wzrcZ%8{ zUeE6&6m=Y{#v_L-(*Fi1QNg6kU>e(mt>k85IV{#)RXYTN2Y}vZH}by;i~ZT7P{Uz) zl&g7!y4a>082F8D?8Yp1KD5mUZ^fA+QS2^S99Xk1cJK zoL|^}*y6)Rc%xHt&u5IDlqRKwqN;x>Uc@DwGpxuOePlin- zN-sV*H`3jRW1IVLby4;FnS=g}Qk&8NeKhAA(ZDHNj)(OqcgMnE6`Ne)7VctkC(rg8 zca4dNOg6tDhYRdXgX zLN`)!@Lz#!N`h8L+vw%W3_W$mslmG2 zy+*lpFlZJtunE4e%6AS!I7#i}DJlMXCztL{iPo6QE&>!byEVAhBty+M2izA+e7B5Y zxEgYJB%EY^zfNOa8c|Qz`(qn1E6y&*9-a$g#p)QMAec9cH6`nAExyIiM{J*`f>cy4 zv+I~-?C^DAb*>}&ehFe|Mn>!BS4M0Vg>^(1tPO-#BLnl2!iAzM{TjW+opriQi&y=D 
z!1i+O0_Pp?x%LHxZ-sAB>)NpE4{yxxU7U1Vcg2|`2gyFQvZ66Vq7{zo$z4Q98X&r$ z=A#rwua1%0q{!-^mfZ~NbgG^nr6RU5JT^;Nkh424HBQIHE01ux9Y_A1CvS}xu90Bk zl-zcx@B_SA-)BFx|HgLNx1$1g_LH+eLSj>KZ_Qnu&wm1K(1Wm84$P}J zW{sFE3VB2#>y(Zu8b!6FbGGpPlwd;4(BXjB7i(RY1}BKO$m{m?reQ-{%4g4a#iY?I zJ{v@OW3T*+t#>12D!wXjTA2VUJS(_5nIC*Ol{AkyJ_Rwf6{*SF!WV%Hy-j9OQfzpd zz-L1+xUOyet!FX{vl^^65K)+0479RHcX|Yt)+g6Lrqc2-(2da576y`(g za_b0<_AW07!fVFzVmb>@+9qftgT2tNc19dDVkzSO&c1AO6n_xD@G4+Uhl6(bcNK+WTa%84SUL- zy%gJoBR*F%n0{etF**5nhCMF`emBH2$QF*~ZtkuYU)!825T-xDaB*;Dxcoq11nX@% zk&K~%`}$1DmLg}-MUI&{plMuk;bl~Z)Iy6R$TXg9nR%{Z&q&%o0>F?Q;WH&Qp>)@Q zGqLPEL$=7*ZxXKF`o*2QG4N$rFdepBo-`o5FC)k2nRsUIxlE&nVfNqiBy$ML#H^&M z)L+vO9r5ONW#SYqND^Fle=|UyufQIt z3|di0`w3e$Qi8i!joqaI4!Nyhi!|BBGjR>lkR|C4DsRuB{#g@jI150%6|Wt*7Htdq z0*k;1FqjYuOSQ7ymQzXinh1U0Zar#7K}F*rHyA==)`1P!U@c$R0>vm?wTg^7F?wM&w)NZ?k>I{FQM%$H1_n^G{2{5H1;Ig-973plqrd{O&a~i96B?6`&C3~ z;MSMaGGI?aEUt}j$gX;+th7RU%PXdjn7~bl zFF-I~>|w;VSI1Cyym8fPsOow7Zs=(J)pk1Upr&tbCpSN^>GjlTGhPQvz_ro*m%Q+R zZCQp%uyZF}AHb&2PNrcDxbHH1Y1T8;=eRidhnkgsPP5Y*D=i=@z<*}Q%qLZ|h;~5m zGCcG1^8(rwSY(avs8w-M!XD_~qr)zO;@}m9Vg=co7`X?8dV_LI?2c}dN`jxP# z0)?r34kLbLjL9SD2e>?4KbZ3LJ^Uq*us3#fq4K7P2S1y$lmJb+#pAQJ!`#cvwY~5C zJ8nduOYMumM{_r~{!;`UlKmPV<%n)e1giQ!WO{};Rb#BZQ_YSy2dG3qI)`(J_|H$J zN-cj=n-lTbl6X89_M!H(x@35$>C4#Erkwa?$yu!u&s$&gVM;d9 zi}Q~}!5t9Sg0&M-j4?CjPN#Yw2KbB1o_MhU8hF|Vr^1}({K`>& z)`u|E$tT_BW%Cj|2T$kLhT5SGZ+oM}j`)&zyOzuGZnBDa$mnPrsF$aeljj{L;6Pu# zOQfgo*lFzWF~G*nmby*#Dfi8#U2<)@C?m`i{Fc8Py~kq^!0KhSjzT!AzU}m6^|+$C zHXiOvgYedDK>Tv(noqyl4jFc20gkqOJfBBC_OWl*Xc1@|qYvdAbr>7bjN2`;yF{BXQ;9gNIYOY)4@MEkEWCrBDff6f8+N_O)%RwABI^1 z%NJkHfu&rUM{2+0r?)rV1O&SBIZ~+&Is3YZ4F<~q9jHI4vQy4iLVXRRT{y%F*N9b4 zdqQM6=bmZQ)sF6=@NmNK#n3@Vh_aVSR@(+>jde4O(}ltG%G~vnFRR~*StO39hX~i^ zg!3e=rq&V!LXWEzaKD}_@4SMHyL5yteJ;F6Y%xy0$?5iX@H z%Cpvm&_8j+ve1PtE$^J^crW1+^(W`=akW~KMNyk92A=qV#p@rq-n1CSxZ*Nk!46U6 zCj~;V5zqm&44rX!b~s;>ZwpJtm0dLWU6DYC<(l(bLpAe6D@zq#GVe-{*VqD}BWzTR z%2QJ%5o;`^(@?T#Oj4YvfTuI^g&oJ&_m6Y2fEyo 
ze<&k%tO`oIKc8v5903Zb*5PRQH9qyyokPOsw(MS<3)?zcrCy?sHw>G>SUPxe0nIL3 zlc$AJhv?y*);vXp>7RY@B1r_>B?zs3$v<8sFubqLc)ah>cL@TL68$hL}ur6dl5?y0` zbk4}!jQyf#o@ilAnZM_f6FxONs9bQ_l$~v+#J@*DW;ak_Sq6Sd?s@b6Mzbr*6-a&R z7BW@tUD`bs>dkmF(5cHub&4`LXu0PH)-I1+KIQv+boz%}M32=rN74329*C-&p3ap6 z+Nk`1z~PDKE6WP{rddZ=C|OecDoUC8xDmU5xx=tm*{`?d^=E$FBU^@~Gp!2>#c8q7~J8;h{V4^`PXr%2;jgnuCZ1L+^g|3LW%YMb#a8WtlHA15~_ zh?}3814z%oLC=WHuIOYg`PJQmf+uTeko2aUI`9v5T^w1X9}VJZ-^l>yMl$ImAf?s7dLMPR^Y!sINB^bdm~w& zvAO&0fcUx&foL2-UP0cm2GTL5%i-u0KCiKc--q#L^gj&UD;<8O8%BFFYibxa)C^0> z2&(-3F`w2h8+or)Hx_Q*r5HnE; zK>*!=EC*ca0HZT8csfYe89o=zZI^@(<9Gnh8Os0~Wf$0vJj&YfC(gh%JY#i0pnXRa zf>w){Y#M;GhT0mGX99&9B5cCGhVT>$v&%pTI6o5TC{TgGuCjEze(re{_@Of7P;Whv zmhsaWr8-E{8KXMb!2n*Q{e;@|ne{a75Z=$sTnxT$Pjth&$K zA}G>!B7}fb?o@^!ODdMY`0w)W#t&`%fTpIdhfTWq@$RxxixsFW0zH5({&>+m%y^b4 zS-7;(`%q)xf)bdQXoGWmAP(9FPo_##4{nhuNh`P+xt#AmL(FD_$8GSRCWKt9dD?im z4*ueslm5-csQJ?+bl8fWV{8HXHSi1JykUDlUK!B3>M81Vc{=J!K$D0how%5j8~s(k20d-I0pLD6F9QasH(^ z3+qak1ix2K%P!dy|8GN{!Vvllo9za?ZBaiA>%@gG>l5vrI|@Ns@qulC2Qnq8?jcju z4Dtlo4|(+e#ZL0qfmMb|B~53!b>jYv(mIGtqT;DiHU%{iDWBfGR<+j%f{Wf`qdrBT z$?KDlja`A6?#Vp2&4ZJ(Bb3Fy*1a3qsi=Cfq%K0@%I46&mi4Z^{n~g!uFeDw!ZtC~ zuL~Kz{M+_mK%`N9b58%sV=h43;Ana-2u9wVX4aA0j-6*A=~$}U>QKRJX>)+Bz{4R* zD_}&`blWui7g(Lt-ckd+)NDI3kK7pADLef_0f$j8s3dDFo4)#&mHG8r`MiYu5=y`3 z=D}JTGoP9;&gABjXcU3^PBf` z`qF&x3r_quVnloj1d4O#bk3fvEU1oNh4{=$I)}5HS|;ao&zw%osUApFPwusR@A@<& zukYcQ-;VQcDcfHKL@oI%r|R_1KAx%`;ki~VQO}NK*o~VPH_6qQv!}7XCLrGP)SDg+ z=GgW~#66&Zq!Ph2K5Eq9C|v+jIW!EM3M?&!N3DLxbNctHVElRHjq1CWoW%{YY4mxU zqXkKDA}4PM+dK?Nrvt2I=n=|ll9HAmAmy`PX$b6LwKi4C_tP z62TuS=hwTkD^d2i_un;*-pPr#vyNcuZPSzuzja8%h{sq^@CY|g$}3+GST5gxZaD8# zo*;putzz?6rpt~S=PYa)=cIO>jw)v>>|BH27=RBtH>DS~#4?m>#`gpKjm_z|4BDll zPmfwLC0|2N(tse%>q$yrseXaQ#*!gvJ_j#;M#H9*D#nfk1jis$ceWO5;o@Te4sjBp zQgOb?9Twxdm`cjHpvJ*)<9<`lg)L{K%8s(8DvU0tDNK%%qhgAdp_tX>5iHH1PK(l@ zQj32@LZip1n(i9ld$36`-BPlfZk2yx2%=&Dkg75jAlnuoafkkIBA^fw{=&}# z1jGnVThFUte8;rr?Upg>CLxGxb<5A9D=!0}sz7(68o_?dHH&5C;>t}l^x&^mfyWW& 
z?regJXM}wA%X2*i5!AbUgfIuYVAxDp`howoG{`sW|F&{A_&%s6;EhTLDq{ezAO#H} zp)>ujYE|G~Lxc;SLJ&U(23s&sKUP5?dHq!_4o00vuD7@N9#S|x?so?7!rkZ)dgSj= z-q5=q`_xeXIbC8H^V|4|ehGWu&Du{0uaRQL%HBB$EeSoU0>MQRLu(NgirgDFQ3E*M zI~L0y%do-nMdjr#W;n_fqD_Ru9Jfpr?q!7xtwY2_f`xCs7Jq)Vr#2O^;s_GC)mMEtklVt z0Lz25<^i1p()Dl;x--3{kzaz)--=97PpxO4+GO=*uY${hJaYuT%?ACuhcjVL`hETt zE-2OnU;wWT^*aU9m)Z`;%mZc((f2QMwu%ts>QHBMEWRf1;!#T&QWU7L;}Qh$bve^& z?pId)bVq~Gj1U=13jw>oVWq|unE#=rW*`!qK$MeK0fr>+RsECrScbm-lWz3>ldw`1 zavWxdiFgT#w6WGH=6!;>Tw4G4tb+1;j?=O1gkKDf53+;>a}Pxmy-zvYr|Z-_z6nTU z7oqZ4z;a)hhlH_6@Doi=xD%ejK=8~VRpvulpU6}YBq;^$T3CgFNXpX9!2NHAjT^0= zJecK|z^z|U#A%N6YtyB7v24-!oLJ7K?W|uXJ#r-*EfhLZ{IcU->NVJo`H~xGWsjZ& z+G8thYC`l$UzWizfr3CXadCUbg~l5I88ClMRyh733B61D<9i%Gg5j68=pDhQp;=PxZ2w{f9k z32bc~ct&F9DVag21P)K#DP`jz9P@21S=i_9k(@~75{1_cbo{4DMS>Bs52P0nOEaLX zOC2!?ADx;@afM*8Y@;APt`|0rA-@(LZ@lcez~Xw|sCm3IO~Vo`w19_s_&;M8O>GF_ z2P>Fg$dK?RPN?nb1mt?EMD0{K0@TsYcaiBjb{|^pGS8(XmY(BjT=Z1N10t^@aR>BM zaalVg9g{B(l{$1BwHa{9!ahr83yi4U8Wt&GClS(MMjun6MYp_7o`I=4KS{4kNEO)a zH7;8kYub(2=D#$`X;$^THp}r;^=ucZfm3*T{0vGg?R<~;)AXu4etD$n)jS&_XxcU% zJG~i{c-ry*aZCg0Ord=IP>XDNeLrCG(7Uj&lQUfT8%)aNp?|UYqH*5PdZ*9ysJCg{ zSF=XdW!obr6;_vz2bi)LB*cL4HfZp+(IGO*)QlCTXsh0CI1P_Aqn)FSN`Ja{}c z^@Fj5Ms`vcNGq~v81z$yU%u2_suLjz^X#)ITd~1xS+UrpVSpTbaU|5faz~5uLcnC2 zf6Sr!yKq|JlRGNKgFAG=tvfcwFL&gE%LiQ1pJ%8h$BbOcSv~>?DlWcc6uS@5qFraS zLH6oeN4O)U+5l6