summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Sagar Kamble <skamble@nvidia.com> 2021-05-24 05:06:01 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2021-06-07 09:40:08 -0400
commit: 12e89c21dec793269a0c0f1f3f2160bb7cdf5ebd (patch)
tree: 4becca378192113c3c71ee2edbc016180ed474d3
parent: cbad9503a78c23336a32172dd701b73760969ff0 (diff)
gpu: nvgpu: fix the usermode mappings deadlock during railgate and munmap
The following locking sequence leads to a deadlock: 1. gk20a_pm_prepare_poweroff (alter_usermode_mappings): ctrl_privs_lock -> mmap_lock 2. __do_munmap (usermode_vma_close): mmap_lock -> ctrl_privs_lock This lock inversion can be resolved by only trying to take mmap_lock while holding ctrl_privs_lock: if mmap_lock is contended, ctrl_privs_lock is released so that munmap can proceed, and the usermode mapping alteration is retried after a short delay. Below is the kernel panic log showing the deadlock. [] INFO: task kworker/1:1:116 blocked for more than 120 seconds. [] Tainted: G W 5.10.17-tegra #1 [] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [] task:kworker/1:1 state:D stack: 0 pid: 116 ppid: 2 flags:0x00000028 [] Workqueue: pm pm_runtime_work [] Call trace: [] __switch_to+0x104/0x160 [] __schedule+0x3d4/0x900 [] schedule+0x74/0x100 [] rwsem_down_write_slowpath+0x250/0x4b0 [] down_write+0x6c/0x80 [] alter_usermode_mappings+0xb4/0x160 [nvgpu] [] nvgpu_hide_usermode_for_poweroff+0x24/0x30 [nvgpu] [] gk20a_pm_prepare_poweroff+0xe8/0x140 [nvgpu] [] gk20a_pm_runtime_suspend+0x78/0xf0 [nvgpu] [] pm_generic_runtime_suspend+0x3c/0x60 [] genpd_runtime_suspend+0xb0/0x2c0 [] __rpm_callback+0x90/0x150 [] rpm_callback+0x34/0xa0 [] rpm_suspend+0xe0/0x5e0 [] pm_runtime_work+0xbc/0xc0 [] process_one_work+0x1c0/0x4a0 [] worker_thread+0x11c/0x430 [] kthread+0x148/0x170 [] ret_from_fork+0x10/0x18 [] INFO: task nvrm_gpu_tests:1273 blocked for more than 121 seconds. [] Tainted: G W 5.10.17-tegra #1 [] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
[] task:nvrm_gpu_tests state:D stack: 0 pid: 1273 ppid: 1245 flags:0x00000000 [] Call trace: [] __switch_to+0x104/0x160 [] __schedule+0x3d4/0x900 [] schedule+0x74/0x100 [] schedule_preempt_disabled+0x28/0x40 [] __mutex_lock.isra.0+0x184/0x5c0 [] __mutex_lock_slowpath+0x24/0x30 [] mutex_lock+0x5c/0x70 [] usermode_vma_close+0x30/0x50 [nvgpu] [] remove_vma+0x34/0x60 [] __do_munmap+0x1f4/0x4a0 [] __vm_munmap+0x74/0xd0 [] __arm64_sys_munmap+0x3c/0x50 [] el0_svc_common.constprop.0+0x7c/0x1a0 [] do_el0_svc+0x34/0xa0 [] el0_svc+0x1c/0x30 [] el0_sync_handler+0xa8/0xb0 [] el0_sync+0x160/0x180 [] ---[ end Kernel panic - not syncing: hung_task: blocked tasks ]--- Bug 200703921 Change-Id: Ie7f017c92f20061d3bf891079f7fc7fe390f7cf7 Signed-off-by: Sagar Kamble <skamble@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2533853 (cherry picked from commit 1dd3e0761c1995c88e9f8e1a26cf5eaf197510be) Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2540111 Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com> Reviewed-by: Bibek Basu <bbasu@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> GVS: Gerrit_Virtual_Submit Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c 52
1 files changed, 39 insertions, 13 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
index f71921cb..b6cdffcb 100644
--- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
+++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c
@@ -2051,7 +2051,7 @@ int gk20a_ctrl_dev_mmap(struct file *filp, struct vm_area_struct *vma)
2051 return err; 2051 return err;
2052} 2052}
2053 2053
2054static void alter_usermode_mapping(struct gk20a *g, 2054static int alter_usermode_mapping(struct gk20a *g,
2055 struct gk20a_ctrl_priv *priv, 2055 struct gk20a_ctrl_priv *priv,
2056 bool poweroff) 2056 bool poweroff)
2057{ 2057{
@@ -2059,23 +2059,34 @@ static void alter_usermode_mapping(struct gk20a *g,
2059 struct vm_area_struct *vma = priv->usermode_vma.vma; 2059 struct vm_area_struct *vma = priv->usermode_vma.vma;
2060 bool vma_mapped = priv->usermode_vma.vma_mapped; 2060 bool vma_mapped = priv->usermode_vma.vma_mapped;
2061 u64 addr; 2061 u64 addr;
2062 int err; 2062 int err = 0;
2063 2063
2064 if (!vma) { 2064 if (!vma) {
2065 /* Nothing to do - no mmap called */ 2065 /* Nothing to do - no mmap called */
2066 return; 2066 return 0;
2067 } 2067 }
2068 2068
2069 addr = l->regs_bus_addr + g->ops.fifo.usermode_base(g); 2069 addr = l->regs_bus_addr + g->ops.fifo.usermode_base(g);
2070 2070
2071 down_write(&vma->vm_mm->mmap_sem);
2072
2073 /* 2071 /*
2074 * This is a no-op for the below cases 2072 * This is a no-op for the below cases
2075 * a) poweroff and !vma_mapped - > do nothing as no map exists 2073 * a) poweroff and !vma_mapped - > do nothing as no map exists
2076 * b) !poweroff and vmap_mapped -> do nothing as already mapped 2074 * b) !poweroff and vmap_mapped -> do nothing as already mapped
2077 */ 2075 */
2078 if (poweroff && vma_mapped) { 2076 if (poweroff != vma_mapped) {
2077 return 0;
2078 }
2079
2080 /*
2081 * We use trylock due to lock inversion: we need to acquire
2082 * mmap_lock while holding ctrl_privs_lock. usermode_vma_close
2083 * does it in reverse order. Trylock is a way to avoid deadlock.
2084 */
2085 if (!down_write_trylock(&vma->vm_mm->mmap_sem)) {
2086 return -EBUSY;
2087 }
2088
2089 if (poweroff) {
2079 err = zap_vma_ptes(vma, vma->vm_start, SZ_4K); 2090 err = zap_vma_ptes(vma, vma->vm_start, SZ_4K);
2080 if (err == 0) { 2091 if (err == 0) {
2081 vma->vm_flags = VM_NONE; 2092 vma->vm_flags = VM_NONE;
@@ -2083,7 +2094,7 @@ static void alter_usermode_mapping(struct gk20a *g,
2083 } else { 2094 } else {
2084 nvgpu_err(g, "can't remove usermode mapping"); 2095 nvgpu_err(g, "can't remove usermode mapping");
2085 } 2096 }
2086 } else if (!poweroff && !vma_mapped) { 2097 } else {
2087 vma->vm_flags = priv->usermode_vma.flags; 2098 vma->vm_flags = priv->usermode_vma.flags;
2088 err = io_remap_pfn_range(vma, vma->vm_start, 2099 err = io_remap_pfn_range(vma, vma->vm_start,
2089 addr >> PAGE_SHIFT, 2100 addr >> PAGE_SHIFT,
@@ -2097,19 +2108,34 @@ static void alter_usermode_mapping(struct gk20a *g,
2097 } 2108 }
2098 2109
2099 up_write(&vma->vm_mm->mmap_sem); 2110 up_write(&vma->vm_mm->mmap_sem);
2111
2112 return err;
2100} 2113}
2101 2114
2102static void alter_usermode_mappings(struct gk20a *g, bool poweroff) 2115static void alter_usermode_mappings(struct gk20a *g, bool poweroff)
2103{ 2116{
2104 struct gk20a_ctrl_priv *priv; 2117 struct gk20a_ctrl_priv *priv;
2105 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); 2118 struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g);
2119 int err = 0;
2106 2120
2107 nvgpu_mutex_acquire(&l->ctrl.privs_lock); 2121 do {
2108 nvgpu_list_for_each_entry(priv, &l->ctrl.privs, 2122 nvgpu_mutex_acquire(&l->ctrl.privs_lock);
2109 gk20a_ctrl_priv, list) { 2123 nvgpu_list_for_each_entry(priv, &l->ctrl.privs,
2110 alter_usermode_mapping(g, priv, poweroff); 2124 gk20a_ctrl_priv, list) {
2111 } 2125 err = alter_usermode_mapping(g, priv, poweroff);
2112 nvgpu_mutex_release(&l->ctrl.privs_lock); 2126 if (err != 0) {
2127 break;
2128 }
2129 }
2130 nvgpu_mutex_release(&l->ctrl.privs_lock);
2131
2132 if (err == -EBUSY) {
2133 nvgpu_log_info(g, "ctrl_privs_lock lock contended. retry altering usermode mappings");
2134 nvgpu_udelay(10);
2135 } else if (err != 0) {
2136 nvgpu_err(g, "can't alter usermode mapping. err = %d", err);
2137 }
2138 } while (err == -EBUSY);
2113} 2139}
2114 2140
2115void nvgpu_hide_usermode_for_poweroff(struct gk20a *g) 2141void nvgpu_hide_usermode_for_poweroff(struct gk20a *g)