diff options
author | Sagar Kamble <skamble@nvidia.com> | 2021-05-24 05:06:01 -0400 |
---|---|---|
committer | mobile promotions <svcmobile_promotions@nvidia.com> | 2021-06-07 09:40:08 -0400 |
commit | 12e89c21dec793269a0c0f1f3f2160bb7cdf5ebd (patch) | |
tree | 4becca378192113c3c71ee2edbc016180ed474d3 | |
parent | cbad9503a78c23336a32172dd701b73760969ff0 (diff) |
gpu: nvgpu: fix the usermode mappings deadlock during railgate and munmap
The following locking sequence leads to a deadlock:
1. gk20a_pm_prepare_poweroff (alter_usermode_mappings):
ctrl_privs_lock -> mmap_lock
2. __do_munmap (usermode_vma_close):
mmap_lock -> ctrl_privs_lock
This lock-order inversion is resolved by only try-locking mmap_lock: if it
cannot be taken, ctrl_privs_lock is released so that munmap can proceed, and
the usermode mapping alteration is retried after a short delay.
Below is the kernel panic log showing the deadlock.
[] INFO: task kworker/1:1:116 blocked for more than 120 seconds.
[] Tainted: G W 5.10.17-tegra #1
[] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[] task:kworker/1:1 state:D stack: 0 pid: 116 ppid: 2 flags:0x00000028
[] Workqueue: pm pm_runtime_work
[] Call trace:
[] __switch_to+0x104/0x160
[] __schedule+0x3d4/0x900
[] schedule+0x74/0x100
[] rwsem_down_write_slowpath+0x250/0x4b0
[] down_write+0x6c/0x80
[] alter_usermode_mappings+0xb4/0x160 [nvgpu]
[] nvgpu_hide_usermode_for_poweroff+0x24/0x30 [nvgpu]
[] gk20a_pm_prepare_poweroff+0xe8/0x140 [nvgpu]
[] gk20a_pm_runtime_suspend+0x78/0xf0 [nvgpu]
[] pm_generic_runtime_suspend+0x3c/0x60
[] genpd_runtime_suspend+0xb0/0x2c0
[] __rpm_callback+0x90/0x150
[] rpm_callback+0x34/0xa0
[] rpm_suspend+0xe0/0x5e0
[] pm_runtime_work+0xbc/0xc0
[] process_one_work+0x1c0/0x4a0
[] worker_thread+0x11c/0x430
[] kthread+0x148/0x170
[] ret_from_fork+0x10/0x18
[] INFO: task nvrm_gpu_tests:1273 blocked for more than 121 seconds.
[] Tainted: G W 5.10.17-tegra #1
[] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[] task:nvrm_gpu_tests state:D stack: 0 pid: 1273 ppid: 1245 flags:0x00000000
[] Call trace:
[] __switch_to+0x104/0x160
[] __schedule+0x3d4/0x900
[] schedule+0x74/0x100
[] schedule_preempt_disabled+0x28/0x40
[] __mutex_lock.isra.0+0x184/0x5c0
[] __mutex_lock_slowpath+0x24/0x30
[] mutex_lock+0x5c/0x70
[] usermode_vma_close+0x30/0x50 [nvgpu]
[] remove_vma+0x34/0x60
[] __do_munmap+0x1f4/0x4a0
[] __vm_munmap+0x74/0xd0
[] __arm64_sys_munmap+0x3c/0x50
[] el0_svc_common.constprop.0+0x7c/0x1a0
[] do_el0_svc+0x34/0xa0
[] el0_svc+0x1c/0x30
[] el0_sync_handler+0xa8/0xb0
[] el0_sync+0x160/0x180
[] ---[ end Kernel panic - not syncing: hung_task: blocked tasks ]---
Bug 200703921
Change-Id: Ie7f017c92f20061d3bf891079f7fc7fe390f7cf7
Signed-off-by: Sagar Kamble <skamble@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2533853
(cherry picked from commit 1dd3e0761c1995c88e9f8e1a26cf5eaf197510be)
Reviewed-on: https://git-master.nvidia.com/r/c/linux-nvgpu/+/2540111
Reviewed-by: svc-mobile-coverity <svc-mobile-coverity@nvidia.com>
Reviewed-by: Bibek Basu <bbasu@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
GVS: Gerrit_Virtual_Submit
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
-rw-r--r-- | drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | 52 |
1 files changed, 39 insertions, 13 deletions
diff --git a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c index f71921cb..b6cdffcb 100644 --- a/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c +++ b/drivers/gpu/nvgpu/os/linux/ioctl_ctrl.c | |||
@@ -2051,7 +2051,7 @@ int gk20a_ctrl_dev_mmap(struct file *filp, struct vm_area_struct *vma) | |||
2051 | return err; | 2051 | return err; |
2052 | } | 2052 | } |
2053 | 2053 | ||
2054 | static void alter_usermode_mapping(struct gk20a *g, | 2054 | static int alter_usermode_mapping(struct gk20a *g, |
2055 | struct gk20a_ctrl_priv *priv, | 2055 | struct gk20a_ctrl_priv *priv, |
2056 | bool poweroff) | 2056 | bool poweroff) |
2057 | { | 2057 | { |
@@ -2059,23 +2059,34 @@ static void alter_usermode_mapping(struct gk20a *g, | |||
2059 | struct vm_area_struct *vma = priv->usermode_vma.vma; | 2059 | struct vm_area_struct *vma = priv->usermode_vma.vma; |
2060 | bool vma_mapped = priv->usermode_vma.vma_mapped; | 2060 | bool vma_mapped = priv->usermode_vma.vma_mapped; |
2061 | u64 addr; | 2061 | u64 addr; |
2062 | int err; | 2062 | int err = 0; |
2063 | 2063 | ||
2064 | if (!vma) { | 2064 | if (!vma) { |
2065 | /* Nothing to do - no mmap called */ | 2065 | /* Nothing to do - no mmap called */ |
2066 | return; | 2066 | return 0; |
2067 | } | 2067 | } |
2068 | 2068 | ||
2069 | addr = l->regs_bus_addr + g->ops.fifo.usermode_base(g); | 2069 | addr = l->regs_bus_addr + g->ops.fifo.usermode_base(g); |
2070 | 2070 | ||
2071 | down_write(&vma->vm_mm->mmap_sem); | ||
2072 | |||
2073 | /* | 2071 | /* |
2074 | * This is a no-op for the below cases | 2072 | * This is a no-op for the below cases |
2075 | * a) poweroff and !vma_mapped - > do nothing as no map exists | 2073 | * a) poweroff and !vma_mapped - > do nothing as no map exists |
2076 | * b) !poweroff and vmap_mapped -> do nothing as already mapped | 2074 | * b) !poweroff and vmap_mapped -> do nothing as already mapped |
2077 | */ | 2075 | */ |
2078 | if (poweroff && vma_mapped) { | 2076 | if (poweroff != vma_mapped) { |
2077 | return 0; | ||
2078 | } | ||
2079 | |||
2080 | /* | ||
2081 | * We use trylock due to lock inversion: we need to acquire | ||
2082 | * mmap_lock while holding ctrl_privs_lock. usermode_vma_close | ||
2083 | * does it in reverse order. Trylock is a way to avoid deadlock. | ||
2084 | */ | ||
2085 | if (!down_write_trylock(&vma->vm_mm->mmap_sem)) { | ||
2086 | return -EBUSY; | ||
2087 | } | ||
2088 | |||
2089 | if (poweroff) { | ||
2079 | err = zap_vma_ptes(vma, vma->vm_start, SZ_4K); | 2090 | err = zap_vma_ptes(vma, vma->vm_start, SZ_4K); |
2080 | if (err == 0) { | 2091 | if (err == 0) { |
2081 | vma->vm_flags = VM_NONE; | 2092 | vma->vm_flags = VM_NONE; |
@@ -2083,7 +2094,7 @@ static void alter_usermode_mapping(struct gk20a *g, | |||
2083 | } else { | 2094 | } else { |
2084 | nvgpu_err(g, "can't remove usermode mapping"); | 2095 | nvgpu_err(g, "can't remove usermode mapping"); |
2085 | } | 2096 | } |
2086 | } else if (!poweroff && !vma_mapped) { | 2097 | } else { |
2087 | vma->vm_flags = priv->usermode_vma.flags; | 2098 | vma->vm_flags = priv->usermode_vma.flags; |
2088 | err = io_remap_pfn_range(vma, vma->vm_start, | 2099 | err = io_remap_pfn_range(vma, vma->vm_start, |
2089 | addr >> PAGE_SHIFT, | 2100 | addr >> PAGE_SHIFT, |
@@ -2097,19 +2108,34 @@ static void alter_usermode_mapping(struct gk20a *g, | |||
2097 | } | 2108 | } |
2098 | 2109 | ||
2099 | up_write(&vma->vm_mm->mmap_sem); | 2110 | up_write(&vma->vm_mm->mmap_sem); |
2111 | |||
2112 | return err; | ||
2100 | } | 2113 | } |
2101 | 2114 | ||
2102 | static void alter_usermode_mappings(struct gk20a *g, bool poweroff) | 2115 | static void alter_usermode_mappings(struct gk20a *g, bool poweroff) |
2103 | { | 2116 | { |
2104 | struct gk20a_ctrl_priv *priv; | 2117 | struct gk20a_ctrl_priv *priv; |
2105 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); | 2118 | struct nvgpu_os_linux *l = nvgpu_os_linux_from_gk20a(g); |
2119 | int err = 0; | ||
2106 | 2120 | ||
2107 | nvgpu_mutex_acquire(&l->ctrl.privs_lock); | 2121 | do { |
2108 | nvgpu_list_for_each_entry(priv, &l->ctrl.privs, | 2122 | nvgpu_mutex_acquire(&l->ctrl.privs_lock); |
2109 | gk20a_ctrl_priv, list) { | 2123 | nvgpu_list_for_each_entry(priv, &l->ctrl.privs, |
2110 | alter_usermode_mapping(g, priv, poweroff); | 2124 | gk20a_ctrl_priv, list) { |
2111 | } | 2125 | err = alter_usermode_mapping(g, priv, poweroff); |
2112 | nvgpu_mutex_release(&l->ctrl.privs_lock); | 2126 | if (err != 0) { |
2127 | break; | ||
2128 | } | ||
2129 | } | ||
2130 | nvgpu_mutex_release(&l->ctrl.privs_lock); | ||
2131 | |||
2132 | if (err == -EBUSY) { | ||
2133 | nvgpu_log_info(g, "ctrl_privs_lock lock contended. retry altering usermode mappings"); | ||
2134 | nvgpu_udelay(10); | ||
2135 | } else if (err != 0) { | ||
2136 | nvgpu_err(g, "can't alter usermode mapping. err = %d", err); | ||
2137 | } | ||
2138 | } while (err == -EBUSY); | ||
2113 | } | 2139 | } |
2114 | 2140 | ||
2115 | void nvgpu_hide_usermode_for_poweroff(struct gk20a *g) | 2141 | void nvgpu_hide_usermode_for_poweroff(struct gk20a *g) |