author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>    2016-01-15 19:57:31 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>          2016-01-15 20:56:32 -0500
commit     88f306b68cbb36e500da4b9601b2e3d13dd683c4
tree       e0373c2ee59446b3f8f9c2bfae9f75ff05e73f6c /mm
parent     d645fc0eabbe783d34a14fbb31768ad8571f0ad4
mm: fix locking order in mm_take_all_locks()
Dmitry Vyukov has reported[1] a possible deadlock (triggered by his
syzkaller fuzzer):
 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&hugetlbfs_i_mmap_rwsem_key);
                               lock(&mapping->i_mmap_rwsem);
                               lock(&hugetlbfs_i_mmap_rwsem_key);
  lock(&mapping->i_mmap_rwsem);
Both traces point to mm_take_all_locks() as the source of the problem.
It doesn't take care of the ordering of hugetlbfs_i_mmap_rwsem_key (aka
mapping->i_mmap_rwsem for hugetlb mappings) vs. i_mmap_rwsem.

huge_pmd_share() does memory allocation under hugetlbfs_i_mmap_rwsem_key,
and the allocator can take i_mmap_rwsem if it hits reclaim. So we need to
take i_mmap_rwsem from all hugetlb VMAs before taking i_mmap_rwsem from
the rest of the VMAs.

The patch also documents the locking order for hugetlbfs_i_mmap_rwsem_key.
[1] http://lkml.kernel.org/r/CACT4Y+Zu95tBs-0EvdiAKzUOsb4tczRRfCRTpLr4bg_OP9HuVg@mail.gmail.com
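
For readers less used to lockdep reports, below is a minimal userspace
illustration (pthreads, not kernel code) of the same ABBA pattern. Both
threads here follow the fixed order "hugetlb class lock, then regular
lock", which is what this patch makes mm_take_all_locks() honour; the
deadlock above arises when one path takes the regular i_mmap_rwsem first
(the old mm_take_all_locks(), which locked in arbitrary order) while
another takes the hugetlb-class lock first (huge_pmd_share() entering
reclaim). Lock and thread names are hypothetical stand-ins.

/*
 * Illustration only: userspace stand-ins for the two lock classes in the
 * report above (hugetlbfs_i_mmap_rwsem_key and mapping->i_mmap_rwsem).
 * Both threads honour the fixed order A -> B, so no deadlock occurs.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hugetlb_class_lock = PTHREAD_MUTEX_INITIALIZER; /* "A" */
static pthread_mutex_t regular_class_lock = PTHREAD_MUTEX_INITIALIZER; /* "B" */

static void *locker(void *arg)
{
	long id = (long)arg;

	/* Fixed order: hugetlb-class lock first, regular lock second. */
	pthread_mutex_lock(&hugetlb_class_lock);
	pthread_mutex_lock(&regular_class_lock);
	printf("thread %ld: holds both locks in order A -> B\n", id);
	pthread_mutex_unlock(&regular_class_lock);
	pthread_mutex_unlock(&hugetlb_class_lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, locker, (void *)1L);
	pthread_create(&t2, NULL, locker, (void *)2L);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}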
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
 -rw-r--r--  mm/mmap.c  25
 -rw-r--r--  mm/rmap.c  31
 2 files changed, 36 insertions(+), 20 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
- * We can take all the locks in random order because the VM code
- * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
- * takes more than one of them in a row. Secondly we're protected
- * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ * We take locks in following order, accordingly to comment at beginning
+ * of mm/rmap.c:
+ *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ *     hugetlb mapping);
+ *   - all i_mmap_rwsem locks;
+ *   - all anon_vma->rwseml
+ *
+ * We can take all locks within these types randomly because the VM code
+ * doesn't nest them and we protected from parallel mm_take_all_locks() by
+ * mm_all_locks_mutex.
  *
  * mm_take_all_locks() and mm_drop_all_locks are expensive operations
  * that may have to take thousand of locks.
@@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm)
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		if (signal_pending(current))
 			goto out_unlock;
-		if (vma->vm_file && vma->vm_file->f_mapping)
+		if (vma->vm_file && vma->vm_file->f_mapping &&
+				is_vm_hugetlb_page(vma))
+			vm_lock_mapping(mm, vma->vm_file->f_mapping);
+	}
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (signal_pending(current))
+			goto out_unlock;
+		if (vma->vm_file && vma->vm_file->f_mapping &&
+				!is_vm_hugetlb_page(vma))
 			vm_lock_mapping(mm, vma->vm_file->f_mapping);
 	}
 
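
The "take the same lock twice" guard mentioned in the comment above is
implemented in vm_lock_mapping() via a bit in mapping->flags. The sketch
below reconstructs roughly what that helper does in this era of the tree;
it is not part of this patch, so treat the details as approximate.

/*
 * Rough reconstruction of vm_lock_mapping() (not part of this patch):
 * mark the mapping with AS_MM_ALL_LOCKS so a second VMA backed by the
 * same address_space doesn't take i_mmap_rwsem again, then take the
 * rwsem nested under mmap_sem for lockdep's benefit.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
	if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
		/*
		 * AS_MM_ALL_LOCKS can't change from under us because we
		 * hold mm_all_locks_mutex, but ->flags still needs atomic
		 * ops since other bits may change concurrently.
		 */
		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
			BUG();
		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
	}
}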
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,21 +23,22 @@
  * inode->i_mutex (while writing or truncating, not reading or faulting)
  *   mm->mmap_sem
  *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_rwsem
- *         anon_vma->rwsem
- *           mm->page_table_lock or pte_lock
- *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- *             swap_lock (in swap_duplicate, swap_info_get)
- *               mmlist_lock (in mmput, drain_mmlist and others)
- *               mapping->private_lock (in __set_page_dirty_buffers)
- *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- *                   mapping->tree_lock (widely used)
- *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- *                       sb_lock (within inode_lock in fs/fs-writeback.c)
- *                       mapping->tree_lock (widely used, in set_page_dirty,
- *                                 in arch-dependent flush_dcache_mmap_lock,
- *                                 within bdi.wb->list_lock in __sync_single_inode)
+ *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ *         mapping->i_mmap_rwsem
+ *           anon_vma->rwsem
+ *             mm->page_table_lock or pte_lock
+ *               zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ *               swap_lock (in swap_duplicate, swap_info_get)
+ *                 mmlist_lock (in mmput, drain_mmlist and others)
+ *                 mapping->private_lock (in __set_page_dirty_buffers)
+ *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                     mapping->tree_lock (widely used)
+ *                       inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ *                       bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ *                         sb_lock (within inode_lock in fs/fs-writeback.c)
+ *                         mapping->tree_lock (widely used, in set_page_dirty,
+ *                                   in arch-dependent flush_dcache_mmap_lock,
+ *                                   within bdi.wb->list_lock in __sync_single_inode)
  *
  * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
  *   ->tasklist_lock
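
For context on why lockdep sees two distinct classes at all: hugetlbfs
assigns its inodes' i_mmap_rwsem a dedicated lock class key. A hedged
sketch of that annotation follows; it lives in fs/hugetlbfs/inode.c, not
in this patch, and the wrapper function name below is hypothetical (in
the real tree the annotation is done inline during inode setup).

/*
 * Sketch (not part of this patch): hugetlbfs gives i_mmap_rwsem its own
 * lockdep class, which is why the report distinguishes
 * hugetlbfs_i_mmap_rwsem_key from the generic mapping->i_mmap_rwsem.
 * The helper name is hypothetical, for illustration only.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static void hugetlbfs_annotate_inode(struct inode *inode)
{
	lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
			  &hugetlbfs_i_mmap_rwsem_key);
}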