author     Marek Olšák <marek.olsak@amd.com>            2016-08-17 17:49:27 -0400
committer  Alex Deucher <alexander.deucher@amd.com>     2016-08-30 17:54:30 -0400
commit     95844d20ae024b5d553c9923a0d3145c3956bf69 (patch)
tree       a2b95c430ed82db0b2085379c451499c5638b698 /drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
parent     15f441db96adcac1dc86196b301f4204452bfe0c (diff)
drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2)
The old mechanism used a per-submission limit that didn't take previous
submissions within the same time frame into account. It also filled VRAM
slowly when VRAM usage dropped due to a big eviction or buffer deallocation.

This new method establishes a configurable MBps limit that is obeyed when
VRAM usage is very high. When VRAM usage is not very high, it gives the
driver the freedom to fill it quickly. The result is more consistent
performance. It can't keep the BO move rate low if lots of evictions are
happening due to VRAM fragmentation, or if a big buffer is being migrated.

The amdgpu.moverate parameter can be used to set a non-default limit.
Measurements can be done to find out which amdgpu.moverate setting gives
the best results.

Mainly APUs and cards with small VRAM will benefit from this. For F1 2015,
anything with 2 GB VRAM or less will benefit.

Some benchmark results - F1 2015 (Tonga 2GB):

  Limit       MinFPS  AvgFPS
  Old code:   14      32.6
  128 MB/s:   28      41
  64 MB/s:    15.5    43
  32 MB/s:    28.7    43.4
  8 MB/s:     27.8    44.4
  8 MB/s:     21.9    42.8    (different run)

Random drops in Min FPS can still occur (due to fragmented VRAM?), but the
average FPS is much better. 8 MB/s is probably a good limit for this game &
the current VRAM management. The random FPS drops are still to be tackled.

v2: use a spinlock

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
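To make the accounting described above concrete, here is a minimal user-space
sketch of the same idea, assuming a power-of-two MB/s limit. The struct and the
throttle_grant()/throttle_charge() helpers are made-up stand-ins for the
driver's amdgpu_cs_get_threshold_for_moves() and amdgpu_cs_report_moved_bytes(),
which keep this state in adev->mm_stats under a spinlock; the numbers in main()
are invented for illustration.

/* Illustrative sketch only, not driver code: a time budget accumulates in
 * microseconds, is capped at 200 ms, and is spent by buffer moves. Since
 * 1 MB/s is roughly 1 byte per microsecond, a power-of-two rate turns the
 * us<->bytes conversion into shifts by log2(MBps).
 */
#include <stdint.h>
#include <stdio.h>

struct throttle {
        int64_t  accum_us;        /* accumulated budget, in microseconds   */
        int64_t  last_update_us;  /* timestamp of the previous submission  */
        unsigned log2_max_MBps;   /* rate limit, rounded to a power of two */
};

static uint64_t us_to_bytes(const struct throttle *t, int64_t us)
{
        return us <= 0 ? 0 : (uint64_t)us << t->log2_max_MBps;
}

static int64_t bytes_to_us(const struct throttle *t, uint64_t bytes)
{
        return bytes >> t->log2_max_MBps;
}

/* Once per submission: grow the budget by the elapsed time (capped) and
 * return how many bytes may be moved. Zero means the driver is in debt.
 */
static uint64_t throttle_grant(struct throttle *t, int64_t now_us)
{
        const int64_t cap_us = 200000;

        t->accum_us += now_us - t->last_update_us;
        t->last_update_us = now_us;
        if (t->accum_us > cap_us)
                t->accum_us = cap_us;
        return us_to_bytes(t, t->accum_us);
}

/* After validation: pay for what was actually moved. A single buffer may
 * overshoot the grant, which leaves accum_us negative (debt).
 */
static void throttle_charge(struct throttle *t, uint64_t moved_bytes)
{
        t->accum_us -= bytes_to_us(t, moved_bytes);
}

int main(void)
{
        struct throttle t = { 0, 0, 7 };  /* 2^7 = 128 MB/s, made-up numbers */
        int64_t now_us = 0;

        for (int i = 1; i <= 3; i++) {
                now_us += 10000;  /* next IB arrives 10 ms later */
                uint64_t grant = throttle_grant(&t, now_us);
                /* move one 4 MB buffer past the grant, but only if moves are allowed */
                uint64_t moved = grant ? grant + (4 << 20) : 0;

                throttle_charge(&t, moved);
                printf("IB %d: grant %lld bytes, moved %lld, budget now %lld us\n",
                       i, (long long)grant, (long long)moved, (long long)t.accum_us);
        }
        return 0;
}

Running this shows the first IB overshooting its grant and the following IBs
getting a zero grant until the debt is paid off by the passage of time, which
is exactly the "debt" behaviour the patch relies on.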
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  152
1 file changed, 104 insertions(+), 48 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d80e5d3a4add..82927570333a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,56 +235,115 @@ free_chunk:
         return ret;
 }
 
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+        if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+                return 0;
+
+        /* Since accum_us is incremented by a million per second, just
+         * multiply it by the number of MB/s to get the number of bytes.
+         */
+        return us << adev->mm_stats.log2_max_MBps;
+}
+
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+        if (!adev->mm_stats.log2_max_MBps)
+                return 0;
+
+        return bytes >> adev->mm_stats.log2_max_MBps;
+}
+
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
  */
 static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 {
-        u64 real_vram_size = adev->mc.real_vram_size;
-        u64 vram_usage = atomic64_read(&adev->vram_usage);
+        s64 time_us, increment_us;
+        u64 max_bytes;
+        u64 free_vram, total_vram, used_vram;
 
-        /* This function is based on the current VRAM usage.
+        /* Allow a maximum of 200 accumulated ms. This is basically per-IB
+         * throttling.
          *
-         * - If all of VRAM is free, allow relocating the number of bytes that
-         *   is equal to 1/4 of the size of VRAM for this IB.
+         * It means that in order to get full max MBps, at least 5 IBs per
+         * second must be submitted and not more than 200ms apart from each
+         * other.
+         */
+        const s64 us_upper_bound = 200000;
 
-         * - If more than one half of VRAM is occupied, only allow relocating
-         *   1 MB of data for this IB.
-         *
-         * - From 0 to one half of used VRAM, the threshold decreases
-         *   linearly.
-         *         __________________
-         * 1/4 of -|\               |
-         * VRAM    | \              |
-         *         |  \             |
-         *         |   \            |
-         *         |    \           |
-         *         |     \          |
-         *         |      \         |
-         *         |       \________|1 MB
-         *         |----------------|
-         *    VRAM 0 %             100 %
-         *         used            used
-         *
-         * Note: It's a threshold, not a limit. The threshold must be crossed
-         * for buffer relocations to stop, so any buffer of an arbitrary size
-         * can be moved as long as the threshold isn't crossed before
-         * the relocation takes place. We don't want to disable buffer
-         * relocations completely.
+        if (!adev->mm_stats.log2_max_MBps)
+                return 0;
+
+        total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
+        used_vram = atomic64_read(&adev->vram_usage);
+        free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
+
+        spin_lock(&adev->mm_stats.lock);
+
+        /* Increase the amount of accumulated us. */
+        time_us = ktime_to_us(ktime_get());
+        increment_us = time_us - adev->mm_stats.last_update_us;
+        adev->mm_stats.last_update_us = time_us;
+        adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
+                                      us_upper_bound);
+
+        /* This prevents the short period of low performance when the VRAM
+         * usage is low and the driver is in debt or doesn't have enough
+         * accumulated us to fill VRAM quickly.
          *
-         * The idea is that buffers should be placed in VRAM at creation time
-         * and TTM should only do a minimum number of relocations during
-         * command submission. In practice, you need to submit at least
-         * a dozen IBs to move all buffers to VRAM if they are in GTT.
+         * The situation can occur in these cases:
+         * - a lot of VRAM is freed by userspace
+         * - the presence of a big buffer causes a lot of evictions
+         *   (solution: split buffers into smaller ones)
          *
-         * Also, things can get pretty crazy under memory pressure and actual
-         * VRAM usage can change a lot, so playing safe even at 50% does
-         * consistently increase performance.
+         * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
+         * accum_us to a positive number.
          */
+        if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+                s64 min_us;
+
+                /* Be more aggresive on dGPUs. Try to fill a portion of free
+                 * VRAM now.
+                 */
+                if (!(adev->flags & AMD_IS_APU))
+                        min_us = bytes_to_us(adev, free_vram / 4);
+                else
+                        min_us = 0; /* Reset accum_us on APUs. */
+
+                adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+        }
 
-        u64 half_vram = real_vram_size >> 1;
-        u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
-        u64 bytes_moved_threshold = half_free_vram >> 1;
-        return max(bytes_moved_threshold, 1024*1024ull);
+        /* This returns 0 if the driver is in debt to disallow (optional)
+         * buffer moves.
+         */
+        max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+
+        spin_unlock(&adev->mm_stats.lock);
+        return max_bytes;
+}
+
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+                                         u64 num_bytes)
+{
+        spin_lock(&adev->mm_stats.lock);
+        adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+        spin_unlock(&adev->mm_stats.lock);
 }
 
 static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
         if (bo->pin_count)
                 return 0;
 
-        /* Avoid moving this one if we have moved too many buffers
-         * for this IB already.
-         *
-         * Note that this allows moving at least one buffer of
-         * any size, because it doesn't take the current "bo"
-         * into account. We don't want to disallow buffer moves
-         * completely.
+        /* Don't move this buffer if we have depleted our allowance
+         * to move it. Don't move anything if the threshold is zero.
          */
-        if (p->bytes_moved <= p->bytes_moved_threshold)
+        if (p->bytes_moved < p->bytes_moved_threshold)
                 domain = bo->prefered_domains;
         else
                 domain = bo->allowed_domains;
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                         goto error_validate;
         }
 
+        amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
+
         fpriv->vm.last_eviction_counter =
                 atomic64_read(&p->adev->num_evictions);
 
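For orientation, here is a standalone sketch of how the pieces touched by this
diff cooperate during one submission: the threshold is computed up front, each
buffer validation prefers VRAM only while the running total stays below it, and
the total is reported back afterwards. The fake_* types and the sizes are
hypothetical stand-ins, not driver code; only the ordering and the strict '<'
comparison mirror the patch.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct fake_bo { uint64_t size; int wants_vram; };

struct fake_parser {
        uint64_t bytes_moved_threshold; /* amdgpu_cs_get_threshold_for_moves() result */
        uint64_t bytes_moved;           /* what validation actually migrated          */
};

/* Stand-in for amdgpu_cs_bo_validate(): prefer VRAM only while the moves
 * done so far stay below the threshold; a zero threshold allows nothing.
 */
static void fake_bo_validate(struct fake_parser *p, const struct fake_bo *bo)
{
        if (bo->wants_vram && p->bytes_moved < p->bytes_moved_threshold)
                p->bytes_moved += bo->size;  /* pretend TTM migrated it */
}

int main(void)
{
        struct fake_bo list[] = { { 16 << 20, 1 }, { 64 << 20, 1 }, { 8 << 20, 0 } };
        struct fake_parser p = { .bytes_moved_threshold = 32 << 20, .bytes_moved = 0 };

        for (size_t i = 0; i < sizeof(list) / sizeof(list[0]); i++)
                fake_bo_validate(&p, &list[i]);

        /* here the driver would call amdgpu_cs_report_moved_bytes(adev, p.bytes_moved);
         * 16 + 64 > 32 MB, so the second buffer overshoots the grant and creates debt
         */
        printf("moved %llu bytes against a %llu byte threshold\n",
               (unsigned long long)p.bytes_moved,
               (unsigned long long)p.bytes_moved_threshold);
        return 0;
}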