about summary refs log tree commit diff stats
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
authorMarek Olšák <marek.olsak@amd.com>2016-08-17 17:49:27 -0400
committerAlex Deucher <alexander.deucher@amd.com>2016-08-30 17:54:30 -0400
commit95844d20ae024b5d553c9923a0d3145c3956bf69 (patch)
treea2b95c430ed82db0b2085379c451499c5638b698 /drivers/gpu/drm/amd/amdgpu
parent15f441db96adcac1dc86196b301f4204452bfe0c (diff)
drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2)
The old mechanism used a per-submission limit that didn't take previous submissions within the same time frame into account. It also filled VRAM slowly when VRAM usage dropped due to a big eviction or buffer deallocation.

This new method establishes a configurable MBps limit that is obeyed when VRAM usage is very high. When VRAM usage is not very high, it gives the driver the freedom to fill it quickly. The result is more consistent performance.

It can't keep the BO move rate low if lots of evictions are happening due to VRAM fragmentation, or if a big buffer is being migrated.

The amdgpu.moverate parameter can be used to set a non-default limit. Measurements can be done to find out which amdgpu.moverate setting gives the best results.

Mainly APUs and cards with small VRAM will benefit from this. For F1 2015, anything with 2 GB VRAM or less will benefit.

Some benchmark results - F1 2015 (Tonga 2GB):

    Limit      MinFPS  AvgFPS
    Old code:  14      32.6
    128 MB/s:  28      41
     64 MB/s:  15.5    43
     32 MB/s:  28.7    43.4
      8 MB/s:  27.8    44.4
      8 MB/s:  21.9    42.8  (different run)

Random drops in Min FPS can still occur (due to fragmented VRAM?), but the average FPS is much better. 8 MB/s is probably a good limit for this game & the current VRAM management. The random FPS drops are still to be tackled.

v2: use a spinlock

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c152
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c4
4 files changed, 127 insertions, 48 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4cfcf9c37800..938ef1cb68cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -64,6 +64,7 @@
64extern int amdgpu_modeset; 64extern int amdgpu_modeset;
65extern int amdgpu_vram_limit; 65extern int amdgpu_vram_limit;
66extern int amdgpu_gart_size; 66extern int amdgpu_gart_size;
67extern int amdgpu_moverate;
67extern int amdgpu_benchmarking; 68extern int amdgpu_benchmarking;
68extern int amdgpu_testing; 69extern int amdgpu_testing;
69extern int amdgpu_audio; 70extern int amdgpu_audio;
@@ -2034,6 +2035,14 @@ struct amdgpu_device {
2034 atomic64_t num_evictions; 2035 atomic64_t num_evictions;
2035 atomic_t gpu_reset_counter; 2036 atomic_t gpu_reset_counter;
2036 2037
2038 /* data for buffer migration throttling */
2039 struct {
2040 spinlock_t lock;
2041 s64 last_update_us;
2042 s64 accum_us; /* accumulated microseconds */
2043 u32 log2_max_MBps;
2044 } mm_stats;
2045
2037 /* display */ 2046 /* display */
2038 bool enable_virtual_display; 2047 bool enable_virtual_display;
2039 struct amdgpu_mode_info mode_info; 2048 struct amdgpu_mode_info mode_info;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d80e5d3a4add..82927570333a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,56 +235,115 @@ free_chunk:
235 return ret; 235 return ret;
236} 236}
237 237
238/* Returns how many bytes TTM can move per IB. 238/* Convert microseconds to bytes. */
239static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
240{
241 if (us <= 0 || !adev->mm_stats.log2_max_MBps)
242 return 0;
243
244 /* Since accum_us is incremented by a million per second, just
245 * multiply it by the number of MB/s to get the number of bytes.
246 */
247 return us << adev->mm_stats.log2_max_MBps;
248}
249
250static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
251{
252 if (!adev->mm_stats.log2_max_MBps)
253 return 0;
254
255 return bytes >> adev->mm_stats.log2_max_MBps;
256}
257
258/* Returns how many bytes TTM can move right now. If no bytes can be moved,
259 * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
260 * which means it can go over the threshold once. If that happens, the driver
261 * will be in debt and no other buffer migrations can be done until that debt
262 * is repaid.
263 *
264 * This approach allows moving a buffer of any size (it's important to allow
265 * that).
266 *
267 * The currency is simply time in microseconds and it increases as the clock
268 * ticks. The accumulated microseconds (us) are converted to bytes and
269 * returned.
239 */ 270 */
240static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev) 271static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
241{ 272{
242 u64 real_vram_size = adev->mc.real_vram_size; 273 s64 time_us, increment_us;
243 u64 vram_usage = atomic64_read(&adev->vram_usage); 274 u64 max_bytes;
275 u64 free_vram, total_vram, used_vram;
244 276
245 /* This function is based on the current VRAM usage. 277 /* Allow a maximum of 200 accumulated ms. This is basically per-IB
278 * throttling.
246 * 279 *
247 * - If all of VRAM is free, allow relocating the number of bytes that 280 * It means that in order to get full max MBps, at least 5 IBs per
248 * is equal to 1/4 of the size of VRAM for this IB. 281 * second must be submitted and not more than 200ms apart from each
282 * other.
283 */
284 const s64 us_upper_bound = 200000;
249 285
250 * - If more than one half of VRAM is occupied, only allow relocating 286 if (!adev->mm_stats.log2_max_MBps)
251 * 1 MB of data for this IB. 287 return 0;
252 * 288
253 * - From 0 to one half of used VRAM, the threshold decreases 289 total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
254 * linearly. 290 used_vram = atomic64_read(&adev->vram_usage);
255 * __________________ 291 free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
256 * 1/4 of -|\ | 292
257 * VRAM | \ | 293 spin_lock(&adev->mm_stats.lock);
258 * | \ | 294
259 * | \ | 295 /* Increase the amount of accumulated us. */
260 * | \ | 296 time_us = ktime_to_us(ktime_get());
261 * | \ | 297 increment_us = time_us - adev->mm_stats.last_update_us;
262 * | \ | 298 adev->mm_stats.last_update_us = time_us;
263 * | \________|1 MB 299 adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
264 * |----------------| 300 us_upper_bound);
265 * VRAM 0 % 100 % 301
266 * used used 302 /* This prevents the short period of low performance when the VRAM
267 * 303 * usage is low and the driver is in debt or doesn't have enough
268 * Note: It's a threshold, not a limit. The threshold must be crossed 304 * accumulated us to fill VRAM quickly.
269 * for buffer relocations to stop, so any buffer of an arbitrary size
270 * can be moved as long as the threshold isn't crossed before
271 * the relocation takes place. We don't want to disable buffer
272 * relocations completely.
273 * 305 *
274 * The idea is that buffers should be placed in VRAM at creation time 306 * The situation can occur in these cases:
275 * and TTM should only do a minimum number of relocations during 307 * - a lot of VRAM is freed by userspace
276 * command submission. In practice, you need to submit at least 308 * - the presence of a big buffer causes a lot of evictions
277 * a dozen IBs to move all buffers to VRAM if they are in GTT. 309 * (solution: split buffers into smaller ones)
278 * 310 *
279 * Also, things can get pretty crazy under memory pressure and actual 311 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
280 * VRAM usage can change a lot, so playing safe even at 50% does 312 * accum_us to a positive number.
281 * consistently increase performance.
282 */ 313 */
314 if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
315 s64 min_us;
316
317 /* Be more aggresive on dGPUs. Try to fill a portion of free
318 * VRAM now.
319 */
320 if (!(adev->flags & AMD_IS_APU))
321 min_us = bytes_to_us(adev, free_vram / 4);
322 else
323 min_us = 0; /* Reset accum_us on APUs. */
324
325 adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
326 }
283 327
284 u64 half_vram = real_vram_size >> 1; 328 /* This returns 0 if the driver is in debt to disallow (optional)
285 u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage; 329 * buffer moves.
286 u64 bytes_moved_threshold = half_free_vram >> 1; 330 */
287 return max(bytes_moved_threshold, 1024*1024ull); 331 max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
332
333 spin_unlock(&adev->mm_stats.lock);
334 return max_bytes;
335}
336
337/* Report how many bytes have really been moved for the last command
338 * submission. This can result in a debt that can stop buffer migrations
339 * temporarily.
340 */
341static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
342 u64 num_bytes)
343{
344 spin_lock(&adev->mm_stats.lock);
345 adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
346 spin_unlock(&adev->mm_stats.lock);
288} 347}
289 348
290static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p, 349static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
297 if (bo->pin_count) 356 if (bo->pin_count)
298 return 0; 357 return 0;
299 358
300 /* Avoid moving this one if we have moved too many buffers 359 /* Don't move this buffer if we have depleted our allowance
301 * for this IB already. 360 * to move it. Don't move anything if the threshold is zero.
302 *
303 * Note that this allows moving at least one buffer of
304 * any size, because it doesn't take the current "bo"
305 * into account. We don't want to disallow buffer moves
306 * completely.
307 */ 361 */
308 if (p->bytes_moved <= p->bytes_moved_threshold) 362 if (p->bytes_moved < p->bytes_moved_threshold)
309 domain = bo->prefered_domains; 363 domain = bo->prefered_domains;
310 else 364 else
311 domain = bo->allowed_domains; 365 domain = bo->allowed_domains;
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
494 goto error_validate; 548 goto error_validate;
495 } 549 }
496 550
551 amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
552
497 fpriv->vm.last_eviction_counter = 553 fpriv->vm.last_eviction_counter =
498 atomic64_read(&p->adev->num_evictions); 554 atomic64_read(&p->adev->num_evictions);
499 555
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1ef4034b3be5..847583d8a3b3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1490,6 +1490,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
1490{ 1490{
1491 int r, i; 1491 int r, i;
1492 bool runtime = false; 1492 bool runtime = false;
1493 u32 max_MBps;
1493 1494
1494 adev->shutdown = false; 1495 adev->shutdown = false;
1495 adev->dev = &pdev->dev; 1496 adev->dev = &pdev->dev;
@@ -1549,6 +1550,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
1549 spin_lock_init(&adev->didt_idx_lock); 1550 spin_lock_init(&adev->didt_idx_lock);
1550 spin_lock_init(&adev->gc_cac_idx_lock); 1551 spin_lock_init(&adev->gc_cac_idx_lock);
1551 spin_lock_init(&adev->audio_endpt_idx_lock); 1552 spin_lock_init(&adev->audio_endpt_idx_lock);
1553 spin_lock_init(&adev->mm_stats.lock);
1552 1554
1553 INIT_LIST_HEAD(&adev->shadow_list); 1555 INIT_LIST_HEAD(&adev->shadow_list);
1554 mutex_init(&adev->shadow_list_lock); 1556 mutex_init(&adev->shadow_list_lock);
@@ -1660,6 +1662,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
1660 1662
1661 adev->accel_working = true; 1663 adev->accel_working = true;
1662 1664
1665 /* Initialize the buffer migration limit. */
1666 if (amdgpu_moverate >= 0)
1667 max_MBps = amdgpu_moverate;
1668 else
1669 max_MBps = 8; /* Allow 8 MB/s. */
1670 /* Get a log2 for easy divisions. */
1671 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
1672
1663 amdgpu_fbdev_init(adev); 1673 amdgpu_fbdev_init(adev);
1664 1674
1665 r = amdgpu_ib_pool_init(adev); 1675 r = amdgpu_ib_pool_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1b787d974515..6fed75454800 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -62,6 +62,7 @@
62 62
63int amdgpu_vram_limit = 0; 63int amdgpu_vram_limit = 0;
64int amdgpu_gart_size = -1; /* auto */ 64int amdgpu_gart_size = -1; /* auto */
65int amdgpu_moverate = -1; /* auto */
65int amdgpu_benchmarking = 0; 66int amdgpu_benchmarking = 0;
66int amdgpu_testing = 0; 67int amdgpu_testing = 0;
67int amdgpu_audio = -1; 68int amdgpu_audio = -1;
@@ -100,6 +101,9 @@ module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
100MODULE_PARM_DESC(gartsize, "Size of PCIE/IGP gart to setup in megabytes (32, 64, etc., -1 = auto)"); 101MODULE_PARM_DESC(gartsize, "Size of PCIE/IGP gart to setup in megabytes (32, 64, etc., -1 = auto)");
101module_param_named(gartsize, amdgpu_gart_size, int, 0600); 102module_param_named(gartsize, amdgpu_gart_size, int, 0600);
102 103
104MODULE_PARM_DESC(moverate, "Maximum buffer migration rate in MB/s. (32, 64, etc., -1=auto, 0=1=disabled)");
105module_param_named(moverate, amdgpu_moverate, int, 0600);
106
103MODULE_PARM_DESC(benchmark, "Run benchmark"); 107MODULE_PARM_DESC(benchmark, "Run benchmark");
104module_param_named(benchmark, amdgpu_benchmarking, int, 0444); 108module_param_named(benchmark, amdgpu_benchmarking, int, 0444);
105 109