author     Davidlohr Bueso <dave@stgolabs.net>             2014-12-12 19:54:24 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-13 15:42:45 -0500
commit     c8c06efa8b552608493b7066c234cfa82c47fcea (patch)
tree       7e206c669149766fb5a77a3ef85cdd4fac63be78
parent     83cde9e8ba95d180eaefefe834958fbf7008cf39 (diff)
mm: convert i_mmap_mutex to rwsem
The i_mmap_mutex is a close cousin of the anon_vma lock: both protect
similar data, one for file-backed pages and the other for anonymous memory.
To this end, this lock can also be an rwsem. In addition, there are
important opportunities to share the lock when there are no tree
modifications.

This conversion is straightforward: for now, all users take the write
lock.
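As a rough illustration of the pattern this conversion enables (not part of
the patch itself): the write-side wrappers simply become down_write()/up_write()
on the new rwsem, and read-side helpers along the following lines could be
added later so rmap walkers share the lock when the interval tree is not being
modified. i_mmap_lock_read()/i_mmap_unlock_read() are a plausible follow-up
sketch, not something introduced by this commit.

/*
 * Illustrative sketch only: shared (reader) access to the i_mmap tree,
 * safe when the walk does not modify the interval tree.
 */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
	down_read(&mapping->i_mmap_rwsem);	/* shared: no tree modification */
}

static inline void i_mmap_unlock_read(struct address_space *mapping)
{
	up_read(&mapping->i_mmap_rwsem);
}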
[sfr@canb.auug.org.au: update fremap.c]
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 fs/hugetlbfs/inode.c         | 10
 fs/inode.c                   |  2
 include/linux/fs.h           |  7
 include/linux/mmu_notifier.h |  2
 kernel/events/uprobes.c      |  2
 mm/filemap.c                 | 10
 mm/hugetlb.c                 | 10
 mm/mmap.c                    |  8
 mm/mremap.c                  |  2
 mm/rmap.c                    |  6
 10 files changed, 30 insertions(+), 29 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a082709aa427..5eba47f593f8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -472,12 +472,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 }
 
 /*
- * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never
+ * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
  * be taken from reclaim -- unlike regular filesystems. This needs an
  * annotation because huge_pmd_share() does an allocation under
- * i_mmap_mutex.
+ * i_mmap_rwsem.
  */
-static struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 					struct inode *dir,
@@ -495,8 +495,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		struct hugetlbfs_inode_info *info;
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
-		lockdep_set_class(&inode->i_mapping->i_mmap_mutex,
-				&hugetlbfs_i_mmap_mutex_key);
+		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
+				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 2ed95f7caa4f..ad60555b4768 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -346,7 +346,7 @@ void address_space_init_once(struct address_space *mapping)
 	memset(mapping, 0, sizeof(*mapping));
 	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
 	spin_lock_init(&mapping->tree_lock);
-	mutex_init(&mapping->i_mmap_mutex);
+	init_rwsem(&mapping->i_mmap_rwsem);
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bd0a1b2f3c02..6abcd0b72ae0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -18,6 +18,7 @@
 #include <linux/pid.h>
 #include <linux/bug.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/capability.h>
 #include <linux/semaphore.h>
 #include <linux/fiemap.h>
@@ -401,7 +402,7 @@ struct address_space {
 	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
 	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
-	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
+	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
 	unsigned long		nrshadows;	/* number of shadow entries */
@@ -469,12 +470,12 @@ int mapping_tagged(struct address_space *mapping, int tag);
 
 static inline void i_mmap_lock_write(struct address_space *mapping)
 {
-	mutex_lock(&mapping->i_mmap_mutex);
+	down_write(&mapping->i_mmap_rwsem);
 }
 
 static inline void i_mmap_unlock_write(struct address_space *mapping)
 {
-	mutex_unlock(&mapping->i_mmap_mutex);
+	up_write(&mapping->i_mmap_rwsem);
 }
 
 /*
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 88787bb4b3b9..ab8564b03468 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -154,7 +154,7 @@ struct mmu_notifier_ops {
  * Therefore notifier chains can only be traversed when either
  *
  * 1. mmap_sem is held.
- * 2. One of the reverse map locks is held (i_mmap_mutex or anon_vma->rwsem).
+ * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
  * 3. No other concurrent thread can access the list (release)
  */
 struct mmu_notifier {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index aac81bf9df09..1901dbfa7ce0 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -731,7 +731,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
 
 		if (!prev && !more) {
 			/*
-			 * Needs GFP_NOWAIT to avoid i_mmap_mutex recursion through
+			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
 			 * reclaim. This is optimistic, no harm done if it fails.
 			 */
 			prev = kmalloc(sizeof(struct map_info),
diff --git a/mm/filemap.c b/mm/filemap.c
index 14b4642279f1..e8905bc3cbd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -62,16 +62,16 @@
 /*
  * Lock ordering:
  *
- *  ->i_mmap_mutex		(truncate_pagecache)
+ *  ->i_mmap_rwsem		(truncate_pagecache)
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_lock		(exclusive_swap_page, others)
  *        ->mapping->tree_lock
  *
  *  ->i_mutex
- *    ->i_mmap_mutex		(truncate->unmap_mapping_range)
+ *    ->i_mmap_rwsem		(truncate->unmap_mapping_range)
  *
  *  ->mmap_sem
- *    ->i_mmap_mutex
+ *    ->i_mmap_rwsem
  *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
  *        ->mapping->tree_lock	(arch-dependent flush_dcache_mmap_lock)
 *
@@ -85,7 +85,7 @@
 *    sb_lock			(fs/fs-writeback.c)
 *    ->mapping->tree_lock	(__sync_single_inode)
 *
- *  ->i_mmap_mutex
+ *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_adjust)
 *
 *  ->anon_vma.lock
@@ -105,7 +105,7 @@
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
 *
- * ->i_mmap_mutex
+ * ->i_mmap_rwsem
 *   ->tasklist_lock		(memory_failure, collect_procs_ao)
 */
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ffe19304cc09..989cb032eaf5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2726,9 +2726,9 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 	 * on its way out. We're lucky that the flag has such an appropriate
 	 * name, and can in fact be safely cleared here. We could clear it
 	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * is to clear it before releasing the i_mmap_rwsem. This works
 	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_mutex is held.
+	 * destroyed and the i_mmap_rwsem is held.
 	 */
 	vma->vm_flags &= ~VM_MAYSHARE;
 }
@@ -3370,9 +3370,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	/*
-	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
 	 * may have cleared our pud entry and done put_page on the page table:
-	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * once we release i_mmap_rwsem, another task can do the final put_page
 	 * and that page table be reused and filled with junk.
 	 */
 	flush_tlb_range(vma, start, end);
@@ -3525,7 +3525,7 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
  * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
  * racing tasks could either miss the sharing (see huge_pte_offset) or select a
  * bad pmd for sharing.
  */
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ error:
 }
 
 /*
- * Requires inode->i_mapping->i_mmap_mutex
+ * Requires inode->i_mapping->i_mmap_rwsem
  */
 static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		struct file *file, struct address_space *mapping)
@@ -2791,7 +2791,7 @@ void exit_mmap(struct mm_struct *mm)
 
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
- * then i_mmap_mutex is taken here.
+ * then i_mmap_rwsem is taken here.
  */
 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
 {
@@ -3086,7 +3086,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 		 */
 		if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
 			BUG();
-		mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
+		down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
 	}
 }
 
@@ -3113,7 +3113,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We can take all the locks in random order because the VM code
- * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never
+ * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
 * takes more than one of them in a row. Secondly we're protected
 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
 *
diff --git a/mm/mremap.c b/mm/mremap.c
index 426b448d6447..84aa36f9f308 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -99,7 +99,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	spinlock_t *old_ptl, *new_ptl;
 
 	/*
-	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
 	 * locks to ensure that rmap will always observe either the old or the
 	 * new ptes. This is the easiest way to avoid races with
 	 * truncate_pagecache(), page migration, etc...
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,7 +23,7 @@
 * inode->i_mutex	(while writing or truncating, not reading or faulting)
 *   mm->mmap_sem
 *     page->flags PG_locked (lock_page)
- *       mapping->i_mmap_mutex
+ *       mapping->i_mmap_rwsem
 *         anon_vma->rwsem
 *           mm->page_table_lock or pte_lock
 *             zone->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1260,7 +1260,7 @@ out_mlock:
 	/*
 	 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
 	 * unstable result and race. Plus, We can't wait here because
-	 * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
+	 * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
 	 * if trylock failed, the page remain in evictable lru and later
 	 * vmscan could retry to move the page to unevictable lru if the
 	 * page is actually mlocked.
@@ -1684,7 +1684,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 	 * The page lock not only makes sure that page->mapping cannot
 	 * suddenly be NULLified by truncation, it makes sure that the
 	 * structure at mapping cannot be freed and reused yet,
-	 * so we can safely take mapping->i_mmap_mutex.
+	 * so we can safely take mapping->i_mmap_rwsem.
 	 */
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
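For context, a hedged sketch of how a file-backed rmap-style walk takes the
converted lock using the i_mmap_lock_write()/i_mmap_unlock_write() helpers
from include/linux/fs.h above. The surrounding function name is hypothetical
and purely illustrative; vma_interval_tree_foreach() and the i_mmap interval
tree are existing interfaces in this tree.

/*
 * Hypothetical caller, for illustration only: visit every VMA that maps
 * a given file page, holding i_mmap_rwsem for write (as all users do
 * after this patch).
 */
static void example_walk_file_vmas(struct page *page,
				   struct address_space *mapping)
{
	pgoff_t pgoff = page->index;
	struct vm_area_struct *vma;

	i_mmap_lock_write(mapping);	/* was mutex_lock(&mapping->i_mmap_mutex) */
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
		/* act on each mapping of this file offset */
	}
	i_mmap_unlock_write(mapping);
}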