diff options
author | ddutta <ddutta@nvidia.com> | 2020-03-03 04:19:36 -0500 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2020-03-05 07:54:42 -0500 |
commit | bb2c8ef511763d9f61f7e3b4bbfa7a27b9d5c2b6 (patch) | |
tree | e3af53634d8280965117038b99754b0ad15c7c30 /drivers/gpu | |
parent | fbad02d5e045fbc10f44fbecf419235938d4d0bc (diff) |
gpu: nvgpu: decrease refcount when sync-unmap fails
When nvgpu_vm_unmap_sync_buffer fails, nvgpu_vm_unmap currently bails
out without decreasing the buffer refcount. This prevents the buffer
from being released if a deferred job completes after the
timeout (which was observed twice during overnight
stress tests). It also means that the fixed address is not
reusable.
Emit a warning when nvgpu_vm_unmap_sync_buffer fails, but proceed
to decrease the refcount.
Bug 200578193
Change-Id: Ie0cc7caa7d12ca0a3b42123a5f7a28bda72dabbc
Signed-off-by: ddutta <ddutta@nvidia.com>
(cherry picked from commit a433f26d5bb1ec3253fc2655998b1ef7fb2847cb
in dev-main)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2291352
Tested-by: Naveen Kumar S <nkumars@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Reviewed-by: automaticguardword <automaticguardword@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/nvgpu/common/mm/vm.c | 27 |
1 files changed, 14 insertions, 13 deletions
diff --git a/drivers/gpu/nvgpu/common/mm/vm.c b/drivers/gpu/nvgpu/common/mm/vm.c index dc928e18..4a7cc828 100644 --- a/drivers/gpu/nvgpu/common/mm/vm.c +++ b/drivers/gpu/nvgpu/common/mm/vm.c | |||
@@ -1180,6 +1180,7 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm, | |||
1180 | { | 1180 | { |
1181 | struct nvgpu_timeout timeout; | 1181 | struct nvgpu_timeout timeout; |
1182 | int ret = 0; | 1182 | int ret = 0; |
1183 | bool done = false; | ||
1183 | 1184 | ||
1184 | nvgpu_mutex_release(&vm->update_gmmu_lock); | 1185 | nvgpu_mutex_release(&vm->update_gmmu_lock); |
1185 | 1186 | ||
@@ -1189,16 +1190,18 @@ static int nvgpu_vm_unmap_sync_buffer(struct vm_gk20a *vm, | |||
1189 | nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER); | 1190 | nvgpu_timeout_init(vm->mm->g, &timeout, 100, NVGPU_TIMER_CPU_TIMER); |
1190 | 1191 | ||
1191 | do { | 1192 | do { |
1192 | if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) == 1) { | 1193 | if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) <= 1) { |
1193 | break; | 1194 | done = true; |
1194 | } | 1195 | } else if (nvgpu_timeout_expired_msg(&timeout, |
1195 | nvgpu_msleep(10); | ||
1196 | } while (nvgpu_timeout_expired_msg(&timeout, | ||
1197 | "sync-unmap failed on 0x%llx", | 1196 | "sync-unmap failed on 0x%llx", |
1198 | mapped_buffer->addr) == 0); | 1197 | mapped_buffer->addr) != 0) { |
1198 | done = true; | ||
1199 | } else { | ||
1200 | nvgpu_msleep(10); | ||
1201 | } | ||
1202 | } while (!done); | ||
1199 | 1203 | ||
1200 | if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) != 1 && | 1204 | if (nvgpu_atomic_read(&mapped_buffer->ref.refcount) > 1) { |
1201 | nvgpu_timeout_expired(&timeout)) { | ||
1202 | ret = -ETIMEDOUT; | 1205 | ret = -ETIMEDOUT; |
1203 | } | 1206 | } |
1204 | 1207 | ||
@@ -1221,11 +1224,9 @@ void nvgpu_vm_unmap(struct vm_gk20a *vm, u64 offset, | |||
1221 | 1224 | ||
1222 | if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) { | 1225 | if (mapped_buffer->flags & NVGPU_VM_MAP_FIXED_OFFSET) { |
1223 | if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) { | 1226 | if (nvgpu_vm_unmap_sync_buffer(vm, mapped_buffer)) { |
1224 | /* | 1227 | nvgpu_warn(vm->mm->g, "%d references remaining on 0x%llx", |
1225 | * Looks like we have failed... Better not continue in | 1228 | nvgpu_atomic_read(&mapped_buffer->ref.refcount), |
1226 | * case the buffer is in use. | 1229 | mapped_buffer->addr); |
1227 | */ | ||
1228 | goto done; | ||
1229 | } | 1230 | } |
1230 | } | 1231 | } |
1231 | 1232 | ||