author     Marek Olšák <marek.olsak@amd.com>            2016-08-17 17:49:27 -0400
committer  Alex Deucher <alexander.deucher@amd.com>     2016-08-30 17:54:30 -0400
commit     95844d20ae024b5d553c9923a0d3145c3956bf69 (patch)
tree       a2b95c430ed82db0b2085379c451499c5638b698 /drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
parent     15f441db96adcac1dc86196b301f4204452bfe0c (diff)
drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2)
The old mechanism used a per-submission limit that didn't take previous
submissions within the same time frame into account. It also filled VRAM
slowly when VRAM usage dropped due to a big eviction or buffer deallocation.

This new method establishes a configurable MBps limit that is obeyed when
VRAM usage is very high. When VRAM usage is not very high, it gives the
driver the freedom to fill it quickly. The result is more consistent
performance. It can't keep the BO move rate low if lots of evictions are
happening due to VRAM fragmentation, or if a big buffer is being migrated.

The amdgpu.moverate parameter can be used to set a non-default limit.
Measurements can be done to find out which amdgpu.moverate setting gives
the best results.

Mainly APUs and cards with small VRAM will benefit from this. For F1 2015,
anything with 2 GB VRAM or less will benefit.

Some benchmark results - F1 2015 (Tonga 2GB):

  Limit       MinFPS  AvgFPS
  Old code:   14      32.6
  128 MB/s:   28      41
  64 MB/s:    15.5    43
  32 MB/s:    28.7    43.4
  8 MB/s:     27.8    44.4
  8 MB/s:     21.9    42.8    (different run)

Random drops in Min FPS can still occur (due to fragmented VRAM?), but the
average FPS is much better. 8 MB/s is probably a good limit for this game &
the current VRAM management. The random FPS drops are still to be tackled.

v2: use a spinlock

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
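To make the accounting described above concrete, here is a minimal user-space
sketch of the same idea, assuming a power-of-two MB/s limit. The struct and the
throttle_grant()/throttle_charge() helpers are made-up stand-ins for the
driver's amdgpu_cs_get_threshold_for_moves() and amdgpu_cs_report_moved_bytes(),
which keep this state in adev->mm_stats under a spinlock; the numbers in main()
are invented for illustration.

/* Illustrative sketch only, not driver code: a time budget accumulates in
 * microseconds, is capped at 200 ms, and is spent by buffer moves. Since
 * 1 MB/s is roughly 1 byte per microsecond, a power-of-two rate turns the
 * us<->bytes conversion into shifts by log2(MBps).
 */
#include <stdint.h>
#include <stdio.h>

struct throttle {
        int64_t  accum_us;        /* accumulated budget, in microseconds   */
        int64_t  last_update_us;  /* timestamp of the previous submission  */
        unsigned log2_max_MBps;   /* rate limit, rounded to a power of two */
};

static uint64_t us_to_bytes(const struct throttle *t, int64_t us)
{
        return us <= 0 ? 0 : (uint64_t)us << t->log2_max_MBps;
}

static int64_t bytes_to_us(const struct throttle *t, uint64_t bytes)
{
        return bytes >> t->log2_max_MBps;
}

/* Once per submission: grow the budget by the elapsed time (capped) and
 * return how many bytes may be moved. Zero means the driver is in debt.
 */
static uint64_t throttle_grant(struct throttle *t, int64_t now_us)
{
        const int64_t cap_us = 200000;

        t->accum_us += now_us - t->last_update_us;
        t->last_update_us = now_us;
        if (t->accum_us > cap_us)
                t->accum_us = cap_us;
        return us_to_bytes(t, t->accum_us);
}

/* After validation: pay for what was actually moved. A single buffer may
 * overshoot the grant, which leaves accum_us negative (debt).
 */
static void throttle_charge(struct throttle *t, uint64_t moved_bytes)
{
        t->accum_us -= bytes_to_us(t, moved_bytes);
}

int main(void)
{
        struct throttle t = { 0, 0, 7 };  /* 2^7 = 128 MB/s, made-up numbers */
        int64_t now_us = 0;

        for (int i = 1; i <= 3; i++) {
                now_us += 10000;  /* next IB arrives 10 ms later */
                uint64_t grant = throttle_grant(&t, now_us);
                /* move one 4 MB buffer past the grant, but only if moves are allowed */
                uint64_t moved = grant ? grant + (4 << 20) : 0;

                throttle_charge(&t, moved);
                printf("IB %d: grant %lld bytes, moved %lld, budget now %lld us\n",
                       i, (long long)grant, (long long)moved, (long long)t.accum_us);
        }
        return 0;
}

Running this shows the first IB overshooting its grant and the following IBs
getting a zero grant until the debt is paid off by the passage of time, which
is exactly the "debt" behaviour the patch relies on.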
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  152
1 file changed, 104 insertions(+), 48 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d80e5d3a4add..82927570333a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,56 +235,115 @@ free_chunk:
         return ret;
 }
 
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+        if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+                return 0;
+
+        /* Since accum_us is incremented by a million per second, just
+         * multiply it by the number of MB/s to get the number of bytes.
+         */
+        return us << adev->mm_stats.log2_max_MBps;
+}
+
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+        if (!adev->mm_stats.log2_max_MBps)
+                return 0;
+
+        return bytes >> adev->mm_stats.log2_max_MBps;
+}
+
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
  */
 static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 {
-        u64 real_vram_size = adev->mc.real_vram_size;
-        u64 vram_usage = atomic64_read(&adev->vram_usage);
+        s64 time_us, increment_us;
+        u64 max_bytes;
+        u64 free_vram, total_vram, used_vram;
 
-        /* This function is based on the current VRAM usage.
+        /* Allow a maximum of 200 accumulated ms. This is basically per-IB
+         * throttling.
          *
-         * - If all of VRAM is free, allow relocating the number of bytes that
-         *   is equal to 1/4 of the size of VRAM for this IB.
+         * It means that in order to get full max MBps, at least 5 IBs per
+         * second must be submitted and not more than 200ms apart from each
+         * other.
+         */
+        const s64 us_upper_bound = 200000;
 
-         * - If more than one half of VRAM is occupied, only allow relocating
-         *   1 MB of data for this IB.
-         *
-         * - From 0 to one half of used VRAM, the threshold decreases
-         *   linearly.
-         *         __________________
-         * 1/4 of -|\               |
-         * VRAM    | \              |
-         *         |  \             |
-         *         |   \            |
-         *         |    \           |
-         *         |     \          |
-         *         |      \         |
-         *         |       \________|1 MB
-         *         |----------------|
-         *    VRAM 0 %             100 %
-         *         used            used
-         *
-         * Note: It's a threshold, not a limit. The threshold must be crossed
-         * for buffer relocations to stop, so any buffer of an arbitrary size
-         * can be moved as long as the threshold isn't crossed before
-         * the relocation takes place. We don't want to disable buffer
-         * relocations completely.
+        if (!adev->mm_stats.log2_max_MBps)
+                return 0;
+
+        total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
+        used_vram = atomic64_read(&adev->vram_usage);
+        free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
+
+        spin_lock(&adev->mm_stats.lock);
+
+        /* Increase the amount of accumulated us. */
+        time_us = ktime_to_us(ktime_get());
+        increment_us = time_us - adev->mm_stats.last_update_us;
+        adev->mm_stats.last_update_us = time_us;
+        adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
+                                      us_upper_bound);
+
+        /* This prevents the short period of low performance when the VRAM
+         * usage is low and the driver is in debt or doesn't have enough
+         * accumulated us to fill VRAM quickly.
          *
-         * The idea is that buffers should be placed in VRAM at creation time
-         * and TTM should only do a minimum number of relocations during
-         * command submission. In practice, you need to submit at least
-         * a dozen IBs to move all buffers to VRAM if they are in GTT.
+         * The situation can occur in these cases:
+         * - a lot of VRAM is freed by userspace
+         * - the presence of a big buffer causes a lot of evictions
+         *   (solution: split buffers into smaller ones)
          *
-         * Also, things can get pretty crazy under memory pressure and actual
-         * VRAM usage can change a lot, so playing safe even at 50% does
-         * consistently increase performance.
+         * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
+         * accum_us to a positive number.
          */
+        if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+                s64 min_us;
+
+                /* Be more aggresive on dGPUs. Try to fill a portion of free
+                 * VRAM now.
+                 */
+                if (!(adev->flags & AMD_IS_APU))
+                        min_us = bytes_to_us(adev, free_vram / 4);
+                else
+                        min_us = 0; /* Reset accum_us on APUs. */
+
+                adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+        }
 
-        u64 half_vram = real_vram_size >> 1;
-        u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
-        u64 bytes_moved_threshold = half_free_vram >> 1;
-        return max(bytes_moved_threshold, 1024*1024ull);
+        /* This returns 0 if the driver is in debt to disallow (optional)
+         * buffer moves.
+         */
+        max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+
+        spin_unlock(&adev->mm_stats.lock);
+        return max_bytes;
+}
+
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+                                         u64 num_bytes)
+{
+        spin_lock(&adev->mm_stats.lock);
+        adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+        spin_unlock(&adev->mm_stats.lock);
 }
 
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
         if (bo->pin_count)
                 return 0;
 
-        /* Avoid moving this one if we have moved too many buffers
-         * for this IB already.
-         *
-         * Note that this allows moving at least one buffer of
-         * any size, because it doesn't take the current "bo"
-         * into account. We don't want to disallow buffer moves
-         * completely.
+        /* Don't move this buffer if we have depleted our allowance
+         * to move it. Don't move anything if the threshold is zero.
          */
-        if (p->bytes_moved <= p->bytes_moved_threshold)
+        if (p->bytes_moved < p->bytes_moved_threshold)
                 domain = bo->prefered_domains;
         else
                 domain = bo->allowed_domains;
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                         goto error_validate;
         }
 
+        amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
+
         fpriv->vm.last_eviction_counter =
                 atomic64_read(&p->adev->num_evictions);
 
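For orientation, here is a standalone sketch of how the pieces touched by this
diff cooperate during one submission: the threshold is computed up front, each
buffer validation prefers VRAM only while the running total stays below it, and
the total is reported back afterwards. The fake_* types and the sizes are
hypothetical stand-ins, not driver code; only the ordering and the strict '<'
comparison mirror the patch.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct fake_bo { uint64_t size; int wants_vram; };

struct fake_parser {
        uint64_t bytes_moved_threshold; /* amdgpu_cs_get_threshold_for_moves() result */
        uint64_t bytes_moved;           /* what validation actually migrated          */
};

/* Stand-in for amdgpu_cs_bo_validate(): prefer VRAM only while the moves
 * done so far stay below the threshold; a zero threshold allows nothing.
 */
static void fake_bo_validate(struct fake_parser *p, const struct fake_bo *bo)
{
        if (bo->wants_vram && p->bytes_moved < p->bytes_moved_threshold)
                p->bytes_moved += bo->size;  /* pretend TTM migrated it */
}

int main(void)
{
        struct fake_bo list[] = { { 16 << 20, 1 }, { 64 << 20, 1 }, { 8 << 20, 0 } };
        struct fake_parser p = { .bytes_moved_threshold = 32 << 20, .bytes_moved = 0 };

        for (size_t i = 0; i < sizeof(list) / sizeof(list[0]); i++)
                fake_bo_validate(&p, &list[i]);

        /* here the driver would call amdgpu_cs_report_moved_bytes(adev, p.bytes_moved);
         * 16 + 64 > 32 MB, so the second buffer overshoots the grant and creates debt
         */
        printf("moved %llu bytes against a %llu byte threshold\n",
               (unsigned long long)p.bytes_moved,
               (unsigned long long)p.bytes_moved_threshold);
        return 0;
}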