aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
authorHuang Rui <ray.huang@amd.com>2018-08-05 22:57:08 -0400
committerAlex Deucher <alexander.deucher@amd.com>2018-08-27 12:11:22 -0400
commitf921661bd4a112f80d57bbfb3e792da63787f4b0 (patch)
tree6f45aed19d115edb5eaa585c6d3a849965a846d3 /drivers/gpu/drm/amd/amdgpu
parent7748e2dcdaad901776c0d78e76e066403e95513c (diff)
drm/amdgpu: use bulk moves for efficient VM LRU handling (v6)
I continue to work for bulk moving that based on the proposal by Christian. Background: amdgpu driver will move all PD/PT and PerVM BOs into idle list. Then move all of them on the end of LRU list one by one. Thus, that cause so many BOs moved to the end of the LRU, and impact performance seriously. Then Christian provided a workaround to not move PD/PT BOs on LRU with below patch: Commit 0bbf32026cf5ba41e9922b30e26e1bed1ecd38ae ("drm/amdgpu: band aid validating VM PTs") However, the final solution should bulk move all PD/PT and PerVM BOs on the LRU instead of one by one. Whenever amdgpu_vm_validate_pt_bos() is called and we have BOs which need to be validated we move all BOs together to the end of the LRU without dropping the lock for the LRU. While doing so we note the beginning and end of this block in the LRU list. Now when amdgpu_vm_validate_pt_bos() is called and we don't have anything to do, we don't move every BO one by one, but instead cut the LRU list into pieces so that we bulk move everything to the end in just one operation. 
Test data: +--------------+-----------------+-----------+---------------------------------------+ | |The Talos |Clpeak(OCL)|BusSpeedReadback(OCL) | | |Principle(Vulkan)| | | +------------------------------------------------------------------------------------+ | | | |0.319 ms(1k) 0.314 ms(2K) 0.308 ms(4K) | | Original | 147.7 FPS | 76.86 us |0.307 ms(8K) 0.310 ms(16K) | +------------------------------------------------------------------------------------+ | Original + WA| | |0.254 ms(1K) 0.241 ms(2K) | |(don't move | 162.1 FPS | 42.15 us |0.230 ms(4K) 0.223 ms(8K) 0.204 ms(16K)| |PT BOs on LRU)| | | | +------------------------------------------------------------------------------------+ | Bulk move | 163.1 FPS | 40.52 us |0.244 ms(1K) 0.252 ms(2K) 0.213 ms(4K) | | | | |0.214 ms(8K) 0.225 ms(16K) | +--------------+-----------------+-----------+---------------------------------------+ After testing with the above three benchmarks, including Vulkan and OpenCL, we can see a visible improvement over the original, and results even better than the original with the workaround. v2: move all BOs, including those on the idle, relocated, and moved lists, to the end of the LRU and put them together. v3: remove an unused parameter and use list_for_each_entry instead of the safe variant. v4: move the amdgpu_vm_move_to_lru_tail call after command submission; at that time, all BOs will be back on the idle list. v5: remove amdgpu_vm_move_to_lru_tail_by_list(), use bulk_moveable instead of validated, and move ttm_bo_bulk_move_lru_tail() also into amdgpu_vm_move_to_lru_tail(). v6: clean up and fix return value. Signed-off-by: Christian König <christian.koenig@amd.com> Signed-off-by: Huang Rui <ray.huang@amd.com> Tested-by: Mike Lothian <mike@fireburn.co.uk> Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de> Acked-by: Chunming Zhou <david1.zhou@amd.com> Reviewed-by: Junwei Zhang <Jerry.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c66
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h11
3 files changed, 57 insertions, 23 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index fd3902983195..b62bbe71662d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1259,6 +1259,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
1259 union drm_amdgpu_cs *cs = data; 1259 union drm_amdgpu_cs *cs = data;
1260 struct amdgpu_cs_parser parser = {}; 1260 struct amdgpu_cs_parser parser = {};
1261 bool reserved_buffers = false; 1261 bool reserved_buffers = false;
1262 struct amdgpu_fpriv *fpriv;
1262 int i, r; 1263 int i, r;
1263 1264
1264 if (!adev->accel_working) 1265 if (!adev->accel_working)
@@ -1303,6 +1304,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
1303 1304
1304 r = amdgpu_cs_submit(&parser, cs); 1305 r = amdgpu_cs_submit(&parser, cs);
1305 1306
1307 fpriv = filp->driver_priv;
1308 amdgpu_vm_move_to_lru_tail(adev, &fpriv->vm);
1306out: 1309out:
1307 amdgpu_cs_parser_fini(&parser, r, reserved_buffers); 1310 amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
1308 return r; 1311 return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d12bffa5f70c..7b0fdf5c79f9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -268,6 +268,47 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
268} 268}
269 269
270/** 270/**
271 * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
272 *
273 * @adev: amdgpu device pointer
274 * @vm: vm providing the BOs
275 *
276 * Move all BOs to the end of LRU and remember their positions to put them
277 * together.
278 */
279void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
280 struct amdgpu_vm *vm)
281{
282 struct ttm_bo_global *glob = adev->mman.bdev.glob;
283 struct amdgpu_vm_bo_base *bo_base;
284
285 if (vm->bulk_moveable) {
286 spin_lock(&glob->lru_lock);
287 ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move);
288 spin_unlock(&glob->lru_lock);
289 return;
290 }
291
292 memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move));
293
294 spin_lock(&glob->lru_lock);
295 list_for_each_entry(bo_base, &vm->idle, vm_status) {
296 struct amdgpu_bo *bo = bo_base->bo;
297
298 if (!bo->parent)
299 continue;
300
301 ttm_bo_move_to_lru_tail(&bo->tbo, &vm->lru_bulk_move);
302 if (bo->shadow)
303 ttm_bo_move_to_lru_tail(&bo->shadow->tbo,
304 &vm->lru_bulk_move);
305 }
306 spin_unlock(&glob->lru_lock);
307
308 vm->bulk_moveable = true;
309}
310
311/**
271 * amdgpu_vm_validate_pt_bos - validate the page table BOs 312 * amdgpu_vm_validate_pt_bos - validate the page table BOs
272 * 313 *
273 * @adev: amdgpu device pointer 314 * @adev: amdgpu device pointer
@@ -284,10 +325,11 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
284 int (*validate)(void *p, struct amdgpu_bo *bo), 325 int (*validate)(void *p, struct amdgpu_bo *bo),
285 void *param) 326 void *param)
286{ 327{
287 struct ttm_bo_global *glob = adev->mman.bdev.glob;
288 struct amdgpu_vm_bo_base *bo_base, *tmp; 328 struct amdgpu_vm_bo_base *bo_base, *tmp;
289 int r = 0; 329 int r = 0;
290 330
331 vm->bulk_moveable &= list_empty(&vm->evicted);
332
291 list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { 333 list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
292 struct amdgpu_bo *bo = bo_base->bo; 334 struct amdgpu_bo *bo = bo_base->bo;
293 335
@@ -295,14 +337,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
295 if (r) 337 if (r)
296 break; 338 break;
297 339
298 if (bo->parent) {
299 spin_lock(&glob->lru_lock);
300 ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
301 if (bo->shadow)
302 ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
303 spin_unlock(&glob->lru_lock);
304 }
305
306 if (bo->tbo.type != ttm_bo_type_kernel) { 340 if (bo->tbo.type != ttm_bo_type_kernel) {
307 spin_lock(&vm->moved_lock); 341 spin_lock(&vm->moved_lock);
308 list_move(&bo_base->vm_status, &vm->moved); 342 list_move(&bo_base->vm_status, &vm->moved);
@@ -312,19 +346,6 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
312 } 346 }
313 } 347 }
314 348
315 spin_lock(&glob->lru_lock);
316 list_for_each_entry(bo_base, &vm->idle, vm_status) {
317 struct amdgpu_bo *bo = bo_base->bo;
318
319 if (!bo->parent)
320 continue;
321
322 ttm_bo_move_to_lru_tail(&bo->tbo, NULL);
323 if (bo->shadow)
324 ttm_bo_move_to_lru_tail(&bo->shadow->tbo, NULL);
325 }
326 spin_unlock(&glob->lru_lock);
327
328 return r; 349 return r;
329} 350}
330 351
@@ -2590,6 +2611,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
2590 return r; 2611 return r;
2591 2612
2592 vm->pte_support_ats = false; 2613 vm->pte_support_ats = false;
2614 vm->bulk_moveable = true;
2593 2615
2594 if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) { 2616 if (vm_context == AMDGPU_VM_CONTEXT_COMPUTE) {
2595 vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode & 2617 vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 1162c2bf3138..14bafe771c9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -29,6 +29,7 @@
29#include <linux/rbtree.h> 29#include <linux/rbtree.h>
30#include <drm/gpu_scheduler.h> 30#include <drm/gpu_scheduler.h>
31#include <drm/drm_file.h> 31#include <drm/drm_file.h>
32#include <drm/ttm/ttm_bo_driver.h>
32 33
33#include "amdgpu_sync.h" 34#include "amdgpu_sync.h"
34#include "amdgpu_ring.h" 35#include "amdgpu_ring.h"
@@ -247,6 +248,11 @@ struct amdgpu_vm {
247 248
248 /* Some basic info about the task */ 249 /* Some basic info about the task */
249 struct amdgpu_task_info task_info; 250 struct amdgpu_task_info task_info;
251
252 /* Store positions of group of BOs */
253 struct ttm_lru_bulk_move lru_bulk_move;
254 /* mark whether can do the bulk move */
255 bool bulk_moveable;
250}; 256};
251 257
252struct amdgpu_vm_manager { 258struct amdgpu_vm_manager {
@@ -354,8 +360,11 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
354void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev); 360void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
355 361
356void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid, 362void amdgpu_vm_get_task_info(struct amdgpu_device *adev, unsigned int pasid,
357 struct amdgpu_task_info *task_info); 363 struct amdgpu_task_info *task_info);
358 364
359void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); 365void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
360 366
367void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
368 struct amdgpu_vm *vm);
369
361#endif 370#endif