summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorddutta <ddutta@nvidia.com>2020-03-03 04:19:36 -0500
committermobile promotions <svcmobile_promotions@nvidia.com>2020-03-05 07:54:42 -0500
commitbb2c8ef511763d9f61f7e3b4bbfa7a27b9d5c2b6 (patch)
treee3af53634d8280965117038b99754b0ad15c7c30
parentfbad02d5e045fbc10f44fbecf419235938d4d0bc (diff)
gpu: nvgpu: decrease refcount when sync-unmap fails
When nvgpu_vm_unmap_sync_buffer fails, nvgpu_vm_unmap currently bails out without decreasing the buffer refcount. This prevents the buffer from being released if a deferred job completes after the timeout (which was observed twice during overnight stress tests). It also means that the fixed address is not reusable. Emit a warning when nvgpu_vm_unmap_sync_buffer fails, but proceed with decreasing the refcount. Bug 200578193 Change-Id: Ie0cc7caa7d12ca0a3b42123a5f7a28bda72dabbc Signed-off-by: ddutta <ddutta@nvidia.com> (cherry picked from commit a433f26d5bb1ec3253fc2655998b1ef7fb2847cb in dev-main) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2291352 Tested-by: Naveen Kumar S <nkumars@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com> Reviewed-by: automaticguardword <automaticguardword@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r--drivers/gpu/nvgpu/common/mm/vm.c27
1 files changed, 14 insertions, 13 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c
index dc928e18..4a7cc828 100644
--- a/drivers/gpu/nvgpu/common/mm/vm.c
+++ b/drivers/gpu/nvgpu/common/mm/vm.c
@@ -1180,6 +1180,7 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
1180{ 1180{
1181 struct nvgpu_timeout timeout; 1181 struct nvgpu_timeout timeout;
1182 int ret = 0; 1182 int ret = 0;
1183 bool done = false;
1183 1184
1184 nvgpu_mutex_release(&vm->update_gmmu_lock); 1185 nvgpu_mutex_release(&vm->update_gmmu_lock);
1185 1186
@@ -1189,16 +1190,18 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm,
1189 nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER); 1190 nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER);
1190 1191
1191 do { 1192 do {
1192 if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) == 1) { 1193 if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) <= 1) {
1193 break; 1194 done = true;
1194 } 1195 } else if (nvgpu_timeout_expired_msg(&timeout,
1195 nvgpu_msleep(10);
1196 } while (nvgpu_timeout_expired_msg(&timeout,
1197 "sync-unmap failed on 0x%llx", 1196 "sync-unmap failed on 0x%llx",
1198 mapped_buffer->addr) == 0); 1197 mapped_buffer->addr) != 0) {
1198 done = true;
1199 } else {
1200 nvgpu_msleep(10);
1201 }
1202 } while (!done);
1199 1203
1200 if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) != 1 && 1204 if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) > 1) {
1201 nvgpu_timeout_expired(&timeout)) {
1202 ret = -ETIMEDOUT; 1205 ret = -ETIMEDOUT;
1203 } 1206 }
1204 1207
@@ -1221,11 +1224,9 @@ void nvgpu_vm_unmap(struct vm_gk20a *vm, u64 offset,
1221 1224
1222 if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) { 1225 if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) {
1223 if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) { 1226 if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) {
1224 /* 1227 nvgpu_warn(vm->mm->g, "%d references remaining on 0x%llx",
1225 * Looks like we have failed... Better not continue in 1228 nvgpu_atomic_read(&mapped_buffer->ref.refcount),
1226 * case the buffer is in use. 1229 mapped_buffer->addr);
1227 */
1228 goto done;
1229 } 1230 }
1230 } 1231 }
1231 1232