gpu: nvgpu: decrease refcount when sync-unmap fails

When nvgpu_vm_unmap_sync_buffer fails, nvgpu_vm_unmap currently bails out without decreasing the buffer refcount. This prevents the buffer from being released if a deferred job completes after the timeout (which was observed twice during overnight stress tests). It also means that the fixed address cannot be reused. Emit a warning when nvgpu_vm_unmap_sync_buffer fails, but proceed with decreasing the refcount. Bug 200578193 Change-Id: Ie0cc7caa7d12ca0a3b42123a5f7a28bda72dabbc Signed-off-by: ddutta <ddutta@nvidia.com> (cherry picked from commit a433f26d5bb1ec3253fc2655998b1ef7fb2847cb in dev-main) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2291352 Tested-by: Naveen Kumar S <nkumars@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: automaticguardword <automaticguardword@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: ddutta <ddutta@nvidia.com> 2020-03-03 04:19:36 -0500
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2020-03-05 07:54:42 -0500
commit: bb2c8ef511763d9f61f7e3b4bbfa7a27b9d5c2b6 (patch)
tree: e3af53634d8280965117038b99754b0ad15c7c30
parent: fbad02d5e045fbc10f44fbecf419235938d4d0bc (diff)
1 files changed, 14 insertions, 13 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index dc928e18..4a7cc828 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -1180,6 +1180,7 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
 {
        struct nvgpu_timeout timeout;
        int ret = 0;
+        bool done = false;
        nvgpu_mutex_release(&vm->update_gmmu_lock);
@@ -1189,16 +1190,18 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
        nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER);
        do {
-                if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) == 1) {
+                if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) <= 1) {
-                        break;
+                        done = true;
-                }
+                } else if (nvgpu_timeout_expired_msg(&timeout,
-                nvgpu_msleep(10);
-        } while (nvgpu_timeout_expired_msg(&timeout,
                            "sync-unmap failed on 0x%llx",
-                            mapped_buffer->addr) == 0);
+                            mapped_buffer->addr) != 0) {
+                        done = true;
+                } else {
+                        nvgpu_msleep(10);
+                }
+        } while (!done);
-        if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) != 1 &&
+        if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) > 1) {
-                        nvgpu_timeout_expired(&timeout)) {
                ret = -ETIMEDOUT;
        }
@@ -1221,11 +1224,9 @@ void nvgpu_vm_unmap(struct vm_gk20a *vm, u64 offset,
        if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) {
                if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) {
-                        /*
+                        nvgpu_warn(vm->mm->g, "%d references remaining on 0x%llx",
-                         * Looks like we have failed... Better not continue in
+                                nvgpu_atomic_read(&mapped_buffer->ref.refcount),
-                         * case the buffer is in use.
+                                mapped_buffer->addr);
-                         */
-                        goto done;
                }
        }
author	ddutta <ddutta@nvidia.com>	2020-03-03 04:19:36 -0500
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2020-03-05 07:54:42 -0500
commit	bb2c8ef511763d9f61f7e3b4bbfa7a27b9d5c2b6 (patch)
tree	e3af53634d8280965117038b99754b0ad15c7c30
parent	fbad02d5e045fbc10f44fbecf419235938d4d0bc (diff)