1 files changed, 104 insertions, 48 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d80e5d3a4add..82927570333a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,56 +235,115 @@ free_chunk:
        return ret;
 }
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+        if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+                return 0;
+        /* Since accum_us is incremented by a million per second, just
+         * multiply it by the number of MB/s to get the number of bytes.
+         */
+        return us << adev->mm_stats.log2_max_MBps;
+}
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+        if (!adev->mm_stats.log2_max_MBps)
+                return 0;
+        return bytes >> adev->mm_stats.log2_max_MBps;
+}
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
 */
 static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 {
-        u64 real_vram_size = adev->mc.real_vram_size;
+        s64 time_us, increment_us;
-        u64 vram_usage = atomic64_read(&adev->vram_usage);
+        u64 max_bytes;
+        u64 free_vram, total_vram, used_vram;
-        /* This function is based on the current VRAM usage.
+        /* Allow a maximum of 200 accumulated ms. This is basically per-IB
+         * throttling.
         *
-         * - If all of VRAM is free, allow relocating the number of bytes that
+         * It means that in order to get full max MBps, at least 5 IBs per
-         *   is equal to 1/4 of the size of VRAM for this IB.
+         * second must be submitted and not more than 200ms apart from each
+         * other.
+         */
+        const s64 us_upper_bound = 200000;
-         * - If more than one half of VRAM is occupied, only allow relocating
+        if (!adev->mm_stats.log2_max_MBps)
-         *   1 MB of data for this IB.
+                return 0;
-         *
-         * - From 0 to one half of used VRAM, the threshold decreases
+        total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
-         *   linearly.
+        used_vram = atomic64_read(&adev->vram_usage);
-         *         __________________
+        free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
-         * 1/4 of -|\               |
-         * VRAM    | \              |
+        spin_lock(&adev->mm_stats.lock);
-         *         |  \             |
-         *         |   \            |
+        /* Increase the amount of accumulated us. */
-         *         |    \           |
+        time_us = ktime_to_us(ktime_get());
-         *         |     \          |
+        increment_us = time_us - adev->mm_stats.last_update_us;
-         *         |      \         |
+        adev->mm_stats.last_update_us = time_us;
-         *         |       \________|1 MB
+        adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
-         *         |----------------|
+                                      us_upper_bound);
-         *    VRAM 0 %             100 %
-         *         used            used
+        /* This prevents the short period of low performance when the VRAM
-         *
+         * usage is low and the driver is in debt or doesn't have enough
-         * Note: It's a threshold, not a limit. The threshold must be crossed
+         * accumulated us to fill VRAM quickly.
-         * for buffer relocations to stop, so any buffer of an arbitrary size
-         * can be moved as long as the threshold isn't crossed before
-         * the relocation takes place. We don't want to disable buffer
-         * relocations completely.
         *
-         * The idea is that buffers should be placed in VRAM at creation time
+         * The situation can occur in these cases:
-         * and TTM should only do a minimum number of relocations during
+         * - a lot of VRAM is freed by userspace
-         * command submission. In practice, you need to submit at least
+         * - the presence of a big buffer causes a lot of evictions
-         * a dozen IBs to move all buffers to VRAM if they are in GTT.
+         *   (solution: split buffers into smaller ones)
         *
-         * Also, things can get pretty crazy under memory pressure and actual
+         * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
-         * VRAM usage can change a lot, so playing safe even at 50% does
+         * accum_us to a positive number.
-         * consistently increase performance.
         */
+        if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+                s64 min_us;
+                /* Be more aggressive on dGPUs. Try to fill a portion of free
+                 * VRAM now.
+                 */
+                if (!(adev->flags & AMD_IS_APU))
+                        min_us = bytes_to_us(adev, free_vram / 4);
+                else
+                        min_us = 0; /* Reset accum_us on APUs. */
+                adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+        }
-        u64 half_vram = real_vram_size >> 1;
+        /* This returns 0 if the driver is in debt to disallow (optional)
-        u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
+         * buffer moves.
-        u64 bytes_moved_threshold = half_free_vram >> 1;
+         */
-        return max(bytes_moved_threshold, 1024*1024ull);
+        max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+        spin_unlock(&adev->mm_stats.lock);
+        return max_bytes;
+}
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+                                         u64 num_bytes)
+{
+        spin_lock(&adev->mm_stats.lock);
+        adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+        spin_unlock(&adev->mm_stats.lock);
 }
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
        if (bo->pin_count)
                return 0;
-        /* Avoid moving this one if we have moved too many buffers
+        /* Don't move this buffer if we have depleted our allowance
-         * for this IB already.
+         * to move it. Don't move anything if the threshold is zero.
-         *
-         * Note that this allows moving at least one buffer of
-         * any size, because it doesn't take the current "bo"
-         * into account. We don't want to disallow buffer moves
-         * completely.
         */
-        if (p->bytes_moved <= p->bytes_moved_threshold)
+        if (p->bytes_moved < p->bytes_moved_threshold)
                domain = bo->prefered_domains;
        else
                domain = bo->allowed_domains;
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                goto error_validate;
        }
+        amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
        fpriv->vm.last_eviction_counter =
                atomic64_read(&p->adev->num_evictions);

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index d80e5d3a4add..82927570333a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,56 +235,115 @@ free_chunk:
235	return ret;	235	return ret;
236	}	236	}
237		237
238	/* Returns how many bytes TTM can move per IB.	238	/* Convert microseconds to bytes. */
		239	static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
		240	{
		241	if (us <= 0 || !adev->mm_stats.log2_max_MBps)
		242	return 0;
		243
		244	/* Since accum_us is incremented by a million per second, just
		245	* multiply it by the number of MB/s to get the number of bytes.
		246	*/
		247	return us << adev->mm_stats.log2_max_MBps;
		248	}
		249
		250	static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
		251	{
		252	if (!adev->mm_stats.log2_max_MBps)
		253	return 0;
		254
		255	return bytes >> adev->mm_stats.log2_max_MBps;
		256	}
		257
		258	/* Returns how many bytes TTM can move right now. If no bytes can be moved,
		259	* it returns 0. If it returns non-zero, it's OK to move at least one buffer,
		260	* which means it can go over the threshold once. If that happens, the driver
		261	* will be in debt and no other buffer migrations can be done until that debt
		262	* is repaid.
		263	*
		264	* This approach allows moving a buffer of any size (it's important to allow
		265	* that).
		266	*
		267	* The currency is simply time in microseconds and it increases as the clock
		268	* ticks. The accumulated microseconds (us) are converted to bytes and
		269	* returned.
239	*/	270	*/
240	static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)	271	static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
241	{	272	{
242	u64 real_vram_size = adev->mc.real_vram_size;	273	s64 time_us, increment_us;
243	u64 vram_usage = atomic64_read(&adev->vram_usage);	274	u64 max_bytes;
		275	u64 free_vram, total_vram, used_vram;
244		276
245	/* This function is based on the current VRAM usage.	277	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
		278	* throttling.
246	*	279	*
247	* - If all of VRAM is free, allow relocating the number of bytes that	280	* It means that in order to get full max MBps, at least 5 IBs per
248	* is equal to 1/4 of the size of VRAM for this IB.	281	* second must be submitted and not more than 200ms apart from each
		282	* other.
		283	*/
		284	const s64 us_upper_bound = 200000;
249		285
250	* - If more than one half of VRAM is occupied, only allow relocating	286	if (!adev->mm_stats.log2_max_MBps)
251	* 1 MB of data for this IB.	287	return 0;
252	*	288
253	* - From 0 to one half of used VRAM, the threshold decreases	289	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
254	* linearly.	290	used_vram = atomic64_read(&adev->vram_usage);
255	*         __________________	291	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
256	* 1/4 of -|\               |	292
257	* VRAM    | \              |	293	spin_lock(&adev->mm_stats.lock);
258	*         |  \             |	294
259	*         |   \            |	295	/* Increase the amount of accumulated us. */
260	*         |    \           |	296	time_us = ktime_to_us(ktime_get());
261	*         |     \          |	297	increment_us = time_us - adev->mm_stats.last_update_us;
262	*         |      \         |	298	adev->mm_stats.last_update_us = time_us;
263	*         |       \________|1 MB	299	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
264	*         |----------------|	300	us_upper_bound);
265	*    VRAM 0 %             100 %	301
266	*         used            used	302	/* This prevents the short period of low performance when the VRAM
267	*	303	* usage is low and the driver is in debt or doesn't have enough
268	* Note: It's a threshold, not a limit. The threshold must be crossed	304	* accumulated us to fill VRAM quickly.
269	* for buffer relocations to stop, so any buffer of an arbitrary size
270	* can be moved as long as the threshold isn't crossed before
271	* the relocation takes place. We don't want to disable buffer
272	* relocations completely.
273	*	305	*
274	* The idea is that buffers should be placed in VRAM at creation time	306	* The situation can occur in these cases:
275	* and TTM should only do a minimum number of relocations during	307	* - a lot of VRAM is freed by userspace
276	* command submission. In practice, you need to submit at least	308	* - the presence of a big buffer causes a lot of evictions
277	* a dozen IBs to move all buffers to VRAM if they are in GTT.	309	* (solution: split buffers into smaller ones)
278	*	310	*
279	* Also, things can get pretty crazy under memory pressure and actual	311	* If 128 MB or 1/8th of VRAM is free, start filling it now by setting
280	* VRAM usage can change a lot, so playing safe even at 50% does	312	* accum_us to a positive number.
281	* consistently increase performance.
282	*/	313	*/
		314	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
		315	s64 min_us;
		316
		317	/* Be more aggressive on dGPUs. Try to fill a portion of free
		318	* VRAM now.
		319	*/
		320	if (!(adev->flags & AMD_IS_APU))
		321	min_us = bytes_to_us(adev, free_vram / 4);
		322	else
		323	min_us = 0; /* Reset accum_us on APUs. */
		324
		325	adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
		326	}
283		327
284	u64 half_vram = real_vram_size >> 1;	328	/* This returns 0 if the driver is in debt to disallow (optional)
285	u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;	329	* buffer moves.
286	u64 bytes_moved_threshold = half_free_vram >> 1;	330	*/
287	return max(bytes_moved_threshold, 1024*1024ull);	331	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
		332
		333	spin_unlock(&adev->mm_stats.lock);
		334	return max_bytes;
		335	}
		336
		337	/* Report how many bytes have really been moved for the last command
		338	* submission. This can result in a debt that can stop buffer migrations
		339	* temporarily.
		340	*/
		341	static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
		342	u64 num_bytes)
		343	{
		344	spin_lock(&adev->mm_stats.lock);
		345	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
		346	spin_unlock(&adev->mm_stats.lock);
288	}	347	}
289		348
290	static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,	349	static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
297	if (bo->pin_count)	356	if (bo->pin_count)
298	return 0;	357	return 0;
299		358
300	/* Avoid moving this one if we have moved too many buffers	359	/* Don't move this buffer if we have depleted our allowance
301	* for this IB already.	360	* to move it. Don't move anything if the threshold is zero.
302	*
303	* Note that this allows moving at least one buffer of
304	* any size, because it doesn't take the current "bo"
305	* into account. We don't want to disallow buffer moves
306	* completely.
307	*/	361	*/
308	if (p->bytes_moved <= p->bytes_moved_threshold)	362	if (p->bytes_moved < p->bytes_moved_threshold)
309	domain = bo->prefered_domains;	363	domain = bo->prefered_domains;
310	else	364	else
311	domain = bo->allowed_domains;	365	domain = bo->allowed_domains;
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
494	goto error_validate;	548	goto error_validate;
495	}	549	}
496		550
		551	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
		552
497	fpriv->vm.last_eviction_counter =	553	fpriv->vm.last_eviction_counter =
498	atomic64_read(&p->adev->num_evictions);	554	atomic64_read(&p->adev->num_evictions);
499		555