evaluation: continued performance tuning of the interpreter; conducted some additional tests
Some checks failed
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.10) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, 1.6) (push) Has been cancelled
CI / Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} (x64, ubuntu-latest, pre) (push) Has been cancelled
@@ -1,80 +1,80 @@ (BenchmarkTools.jl results file: Julia 1.11.5, BenchmarkTools 1.6.0)
Updated benchmark results for the "GPUT" and "GPUI" groups (trial "nikuradse_1", 50 samples each, "evals": 1):
"GPUT": "allocs" 27549909 -> 27549794; "memory" 67507887480 -> 67507887608; all 50 "gctimes" samples replaced (around 4.1e8 -> 4.5e8 ns) and all 50 "times" samples replaced (around 2.66e10 -> 2.53e10 ns).
"GPUI": "allocs" 32241320 -> 32243751; "memory" 45874227656 -> 45874268472; all 50 "gctimes" samples replaced (around 3.5e8 -> 5.3e8 ns); the updated "times" samples start around 3.07e10 ns.
Trial "params" unchanged ("memory_tolerance": 0.01).
@@ -0,0 +1,292 @@ (new file: snapshot of the previous BenchmarkTools.jl results)
A new 292-line JSON file preserving the pre-tuning results (Julia 1.11.5, BenchmarkTools 1.6.0):
"GPUT" (tags: ["GPUTranspiler"]): trial "nikuradse_1" with "allocs": 27549909, "memory": 67507887480, 50 "gctimes" samples around 4.1e8 ns and 50 "times" samples around 2.66e10 ns.
"GPUI" (tags: ["GPUInterpreter"]): trial "nikuradse_1" with "allocs": 32241320, "memory": 45874227656, 50 "gctimes" samples around 3.5e8 ns and 50 "times" samples around 7.4e10 ns.
Shared trial "params": "gctrial": true, "time_tolerance": 0.05, "evals_set": false, "samples": 50, "evals": 1, "gcsample": false, "seconds": 43200.0, "overhead": 0.0, "memory_tolerance": 0.01.
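The files above are BenchmarkTools.jl serialisations. As a hedged sketch of how such results are plausibly produced and compared (the suite layout, file names, and the `evaluate_gpu` stand-in are illustrative, not the repository's actual benchmark script):

    using BenchmarkTools

    evaluate_gpu(exprs, vars) = sum(vars)  # stand-in for the real evaluator entry point
    exprs, vars = [:(x1 + 1.0)], rand(Float32, 362)

    suite = BenchmarkGroup()
    suite["GPUI"] = BenchmarkGroup(["GPUInterpreter"])
    suite["GPUT"] = BenchmarkGroup(["GPUTranspiler"])
    # Parameters matching the JSON above: 50 samples, 1 eval, a 12-hour budget.
    suite["GPUI"]["nikuradse_1"] =
        @benchmarkable evaluate_gpu($exprs, $vars) samples=50 evals=1 seconds=43200.0

    results = run(suite; verbose=true)
    BenchmarkTools.save("results.json", results)        # writes JSON like the diff above
    previous = BenchmarkTools.load("previous.json")[1]  # load returns a vector
    judge(median(results["GPUI"]), median(previous["GPUI"]))  # compare the two runs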
@@ -103,28 +103,25 @@ The third benchmark used the same $10\,000$ expressions and $100$ parameter opti
Although the number of variable sets has been increased 30-fold, the block size remained at 128 threads. Unlike in the previous benchmarks, the hardware was utilised differently: only the GPU was utilised at 100\%, while the CPU utilisation started at 100\% and slowly dropped to 80\%. The GPU needs to perform 30 times more evaluations, meaning one kernel dispatch takes longer to finish. At the same time, the CPU tries to dispatch kernels at the same rate as before. Because only a certain number of kernels can be queued at once, the CPU must wait for the GPU to finish a kernel before another one can be dispatched. Therefore, in this scenario, the evaluator runs into a GPU bottleneck, and a more performant GPU would consequently improve the runtime. In the benchmarks before, both the CPU and the GPU would need to be upgraded to achieve better performance.

\subsection{Performance Tuning Interpreter}
Optimising and tuning the interpreter is crucial for achieving good performance, especially tuning the kernel, as a wrongly configured kernel can drastically degrade performance. Before any performance tuning and optimisation was performed, the kernel was configured with a block size of 256 threads. Additionally, on the CPU, the frontend was executed for each expression before every kernel dispatch, even in parameter optimisation scenarios where the expressions do not change from one dispatch to the next. Moreover, the variables were also transmitted to the GPU before every dispatch. However, executing the frontend as well as dispatching the kernel was multithreaded, utilising all 12 threads of the CPU, and a cache for the frontend was used.

With this implementation, the initial performance measurements were conducted for benchmark one, which served as the baseline for further optimisations. However, as already mentioned, memory limitations were encountered during this benchmark because too much RAM was being used, so caching had to be disabled. Because the evaluator is multithreaded, this change resulted in significantly better performance: the cache introduced critical sections in which race conditions could occur, so locking mechanisms had to be used. While locking ensures that no race conditions occur, it also means that parts of an otherwise entirely parallel implementation are serialised, reducing the benefit of parallelisation.
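
To illustrate this serialisation effect, the following minimal sketch contrasts a lock-guarded cache with lock-free per-expression processing; the names and the \texttt{process} stand-in are illustrative, not the evaluator's actual code.

\begin{verbatim}
using Base.Threads

# Stand-in: the real frontend lowers an expression to its postfix form.
process(ex) = Float32[rand(Float32)]
const cache = Dict{Expr,Vector{Float32}}()
const cache_lock = ReentrantLock()

function frontend_cached(exprs)    # every thread funnels through one lock
    results = Vector{Vector{Float32}}(undef, length(exprs))
    @threads for i in eachindex(exprs)
        results[i] = lock(cache_lock) do
            get!(() -> process(exprs[i]), cache, exprs[i])
        end
    end
    return results
end

function frontend_uncached(exprs)  # fully parallel, no critical section
    results = Vector{Vector{Float32}}(undef, length(exprs))
    @threads for i in eachindex(exprs)
        results[i] = process(exprs[i])
    end
    return results
end
\end{verbatim}

In the cached variant the lock turns an otherwise embarrassingly parallel loop into a partly serial one, which is consistent with the frontend slowdown reported below.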

Without a cache and utilising all 12 threads, the frontend achieved very good performance: processing $250\,000$ expressions takes roughly $88.5$ milliseconds, whereas with a cache the frontend ran for $6.9$ \textit{seconds}. This equates to a speed-up of roughly 78 times when using no cache. Additionally, as the results above show, the time it takes to execute the frontend is negligible, meaning further optimising the frontend would not significantly improve the overall runtime.

During the tuning process, $362$ variable sets were used, which is the number of variable sets used by benchmarks one and two. Before conducting benchmark three, additional performance tuning was performed to ensure that this benchmark also utilises the hardware as much as possible.

\subsubsection{Optimisation 1}

After caching was disabled, the first performance improvement was to drastically reduce the number of calls to the frontend and the number of data transfers to the GPU. Because the expressions and variables never change during the parameter optimisation process, processing the expressions and transmitting the data to the GPU on every step wastes resources. Therefore, the expressions are sent to the frontend once before the parameter optimisation process begins. Afterwards, the processed expressions as well as the variables are transferred to the GPU exactly once per execution of the interpreter.
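
A minimal sketch of this hoisting (the names \texttt{frontend}, \texttt{next\_parameters} and \texttt{interpret\_kernel!} are illustrative stand-ins, not the interpreter's actual entry points): everything loop-invariant happens once up front, and each optimisation step only uploads the new parameters and dispatches the kernel.

\begin{verbatim}
using CUDA

frontend(ex) = Float32[1.0f0]            # stand-in: lowers one expression
next_parameters() = rand(Float32, 100)   # stand-in: one optimiser step
function interpret_kernel!(expr, vars, params)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    i <= length(vars) && (vars[i] += params[1])
    return nothing
end

function optimise_parameters(exprs, vars, steps)
    postfix = [frontend(ex) for ex in exprs]  # frontend once per expression
    d_exprs = CuArray.(postfix)               # transferred to the GPU once
    d_vars  = CuArray(vars)                   # transferred to the GPU once
    for _ in 1:steps                          # only the parameters move per step
        d_params = CuArray(next_parameters())
        @cuda threads=128 blocks=3 interpret_kernel!(d_exprs[1], d_vars, d_params)
    end
    synchronize()
end

optimise_parameters([:(x1 + 1.0)], rand(Float32, 362), 100)
\end{verbatim}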

Figure \ref{fig:gpu_i_optimisation_1} shows how this optimisation improved the overall performance, as demonstrated with benchmark one. However, it can also be seen that the range the individual samples fall within is much greater now. While this optimisation improved the performance in all cases, in some cases the difference between the initial and the optimised version is small, at roughly a two-second improvement.

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/interpreter-comparison-initial-optim1.png}
\caption{Comparison of the initial implementation with the first optimisation, applied to benchmark one. Note that while the results of the optimisation have a much wider range, all samples performed better than the initial implementation.}
\label{fig:gpu_i_optimisation_1}
\end{figure}

@@ -134,43 +131,41 @@ The second optimisation was concerned with tuning the kernel configuration. Usin

Since the evaluator is designed to execute many kernel dispatches in parallel, it was important to reduce the runtime per kernel as much as possible. Reducing the runtime per kernel has a knock-on effect, as subsequent kernel dispatches can begin execution sooner, reducing the overall runtime.
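
This dispatch pattern can be sketched with CUDA.jl, where every Julia task uses its own CUDA stream, so spawning one task per dispatch queues the kernels concurrently (the kernel below is a dummy stand-in):

\begin{verbatim}
using CUDA

function noop_kernel!(out)   # stand-in for the interpreter kernel
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    i <= length(out) && (out[i] = Float32(i))
    return nothing
end

function dispatch_all(buffers)
    @sync for buf in buffers
        Threads.@spawn begin   # one task (and thus one stream) per dispatch
            @cuda threads=128 blocks=cld(length(buf), 128) noop_kernel!(buf)
            synchronize()      # waits for this task's stream only
        end
    end
end

dispatch_all([CUDA.zeros(Float32, 362) for _ in 1:8])
\end{verbatim}

The shorter each kernel runs, the sooner the queue drains and the next dispatch can start, which is the knock-on effect described above.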

After the evaluator tuning concluded, it was found that a block size of $128$ yielded the best results. With this kernel configuration, another performance measurement was conducted using benchmark one, with the results shown in Figure \ref{fig:gpu_i_optimisation_2}. As can be seen, the overall runtime was again noticeably faster. However, the standard deviation also drastically increased, with the duration from the fastest to the slowest sample differing by roughly 60 seconds.

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/interpreter-comparison-optim1-optim2.png}
\caption{Comparison of the first optimisation with the second, applied to benchmark one.}
\label{fig:gpu_i_optimisation_2}
\end{figure}

The chosen block size of $128$ might seem strange, but it makes sense: in total, at least $362$ threads need to be started to evaluate one expression. If one block contains $128$ threads, a total of $\lceil 362 / 128 \rceil = 3$ blocks need to be started, totalling $384$ threads. As a result, only $384 - 362 = 22$ threads are excess threads. With a block size of $121$, three blocks could be started with only one excess thread. However, there is no performance difference between a block size of $121$ and $128$: since all threads are executed inside warps, which consist of exactly $32$ threads, a block size that is not divisible by $32$ has no benefit and only hides the true number of excess threads started.
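
This arithmetic generalises into a small helper for screening candidate block sizes (a sketch; the function is ours, not part of the evaluator):

\begin{verbatim}
# Blocks, launched threads, and excess threads for a candidate block size.
# Block sizes should stay multiples of the warp size (32).
function launch_stats(threads_needed, blocksize)
    blocks = cld(threads_needed, blocksize)   # ceiling division
    total  = blocks * blocksize
    return (; blocks, total, excess = total - threads_needed)
end

launch_stats(362, 128)   # (blocks = 3, total = 384, excess = 22)
launch_stats(362, 121)   # (blocks = 3, total = 363, excess = 1)
\end{verbatim}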

% TODO Include screenshots from nsight compute

Benchmark three had a total of $10\,860$ variable sets, meaning at least this number of threads must be started. To ensure optimal hardware utilisation, the evaluator had to undergo another tuning process, since, as seen above, it is beneficial to start as few excess threads as possible. Using NSight Compute, a performance measurement with a block size of $128$ was used as the initial configuration. This already performed well, as again very few excess threads are started: in total, $10\,860 / 128 \approx 84.84$ blocks are needed, which must be rounded up to $85$ blocks, with the last block filled to roughly $84\%$, equating to $20$ excess threads.
% TODO: also here include that choosing the smallest block size that is a multiple of 32 and divides the number of variable sets with as little remainder as possible is a great starting point for performance tuning; then use NSight Compute to experiment with different configurations to find the best solution

%Describe the theory behind these two block sizes (more excess threads but much fewer blocks -> more of the evaluations can be performed simultaneously [38 or so SMs available]; found that fewer excess threads is much more important)
%blocksize 128: 84.84 blocks, fast (probably because of fewer wasted threads)
%blocksize 192: 56.56 blocks, very slow
This process was repeated for two more configurations, with block sizes of $160$ and $192$. With a block size of $160$, the total number of blocks was reduced to $68$, which again resulted in $20$ excess threads being started. The hypothesis was that using fewer blocks would result in better utilisation and therefore better performance. The same idea was behind choosing a block size of $192$: while this only requires $57$ blocks, the number of excess threads increases to $84$.
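
Evaluated with the hypothetical helper from Optimisation 2, the three candidate configurations compare as follows:

\begin{verbatim}
launch_stats(10_860, 128)   # (blocks = 85, total = 10880, excess = 20)
launch_stats(10_860, 160)   # (blocks = 68, total = 10880, excess = 20)
launch_stats(10_860, 192)   # (blocks = 57, total = 10944, excess = 84)
\end{verbatim}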

Using NSight Compute, it was found that a block size of $160$ performed best, followed by $192$, while $128$ was the worst-performing configuration. However, this is not representative of how these configurations perform during the benchmarks. As seen in Figure \ref{fig:gpu_i_128-160-192}, a block size of $128$ led to significantly better performance than the other configurations. While a block size of $160$ led to worse results, it needs to be noted that it also improved the standard deviation by 25\% compared to the results with a block size of $128$. These results also show that it is important not only to use NSight Compute but also to conduct performance tests with real data to ensure the best possible configuration is chosen.

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/interpreter-comparison-128-160-192.png}
\caption{Comparison of the execution times of benchmark three with a block size of 128, 160 and 192.}
\label{fig:gpu_i_128-160-192}
\end{figure}

\subsubsection{Optimisation 3}

\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/interpreter-comparison-optim2-optim3.png}
\caption{Comparison of the second optimisation with the third, applied to benchmark one.}
\label{fig:gpu_i_optimisation_3}
\end{figure}

CPU and GPU are almost always at 100\% utilisation (the GPU every now and then drops to 70\%), meaning the load is quite balanced.
Uncached but multithreaded frontend only makes up a small percentage of the total runtime (optimisations there are not really needed, which is good because enabling caching took up too much RAM)
Most of the time is spent doing the parameter optimisation step
3.) Minor optimisations: reduced stack size; reduced memory allocations on the CPU; reduced GC pressure (helped with the standard deviation)

\subsection{Transpiler}
Results only for the transpiler (also contains the final kernel configuration and probably a quick overview/recap of the implementation used and described in the Implementation section)

@@ -192,11 +187,11 @@ GPU: very short bursts to 100\% then down to 0\% with a very high frequency (the

\subsubsection{Benchmark 3}
Even larger variable sets would be ideal; 10k is rather small and the GPU still has barely any work to do.
std: (re-calculate as block size changed to 160 from 128 before)
\begin{figure}
\centering
\includegraphics[width=.9\textwidth]{results/gpu-transpiler-final-performance-benchmark3.png}
\caption{The results of the transpiler for benchmark 3; RE-DO THIS AS BLOCKSIZE CHANGED}
\label{fig:gpu_t_benchmark_3}
\end{figure}

@@ -209,7 +204,8 @@ Document the process of performance tuning
Initial: no cache; 256 blocksize; exprs pre-processed and transpiled on every call; vars sent on every call; frontend + transpilation + dispatch are multithreaded

1.) Done before the parameter optimisation loop: frontend, transmitting expressions and variables (improved runtime)
2.) All expressions to execute are transpiled first (previously they were transpiled for every execution, even in parameter optimisation scenarios). Compilation is done every time in benchmark 1, because too little RAM was available (compilation takes the most time, so this is only a minor boost).
3.) benchmark3 std noticeably improved with blocksize 160 (around 70\% better)

CPU at 100\%, GPU at around 30\%: heavily CPU-bottlenecked, mainly due to PTX compilation taking by far the longest (while the kernels themselves finish more or less instantly)
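
To make this bottleneck concrete, the transpiler's per-dispatch hot path can be sketched with CUDA.jl's driver-level API (a sketch under assumptions: the kernel name, the toy PTX, and the launch geometry are illustrative, not the transpiler's actual output):

\begin{verbatim}
using CUDA

# Toy stand-in for the transpiler's generated PTX (real output is far larger).
const ptx = """
.version 7.0
.target sm_52
.address_size 64
.visible .entry expr_kernel() { ret; }
"""

# Transpilation (producing the PTX) now happens once per expression up front;
# this compile-and-launch step is what still runs on every dispatch:
md = CuModule(ptx)                  # PTX -> module: the dominant cost
fn = CuFunction(md, "expr_kernel")
cudacall(fn, Tuple{}; threads=128, blocks=85)
synchronize()
\end{verbatim}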
BIN thesis/images/results/interpreter-comparison-128-160-192.png (new file, 57 KiB; binary file not shown)
BIN thesis/main.pdf (binary file not shown)