summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMike Kravetz <mike.kravetz@oracle.com>2019-05-13 20:19:41 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-14 12:47:48 -0400
commit1b426bac66e6cc83c9f2d92b96e4e72acf43419a (patch)
treeef27457ffb6793b09523ed400506ea0c61c67e8b
parent0919e1b69ab459e06df45d3ba6658d281962db80 (diff)
hugetlb: use same fault hash key for shared and private mappings
hugetlb uses a fault mutex hash table to prevent page faults of the same pages concurrently. The key for shared and private mappings is different. Shared keys off address_space and file index. Private keys off mm and virtual address. Consider a private mapping of a populated hugetlbfs file. A fault will map the page from the file and if needed do a COW to map a writable page. Hugetlbfs hole punch uses the fault mutex to prevent mappings of file pages. It uses the address_space file index key. However, private mappings will use a different key and could race with this code to map the file page. This causes problems (BUG) for the page cache remove code as it expects the page to be unmapped. A sample stack is: page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) kernel BUG at mm/filemap.c:169! ... RIP: 0010:unaccount_page_cache_page+0x1b8/0x200 ... Call Trace: __delete_from_page_cache+0x39/0x220 delete_from_page_cache+0x45/0x70 remove_inode_hugepages+0x13c/0x380 ? __add_to_page_cache_locked+0x162/0x380 hugetlbfs_fallocate+0x403/0x540 ? _cond_resched+0x15/0x30 ? __inode_security_revalidate+0x5d/0x70 ? selinux_file_permission+0x100/0x130 vfs_fallocate+0x13f/0x270 ksys_fallocate+0x3c/0x80 __x64_sys_fallocate+0x1a/0x20 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 There seems to be another potential COW issue/race with this approach of different private and shared keys as noted in commit 8382d914ebf7 ("mm, hugetlb: improve page-fault scalability"). Since every hugetlb mapping (even anon and private) is actually a file mapping, just use the address_space index key for all mappings. This results in potentially more hash collisions. However, this should not be the common case. 
Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5 Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reviewed-by: Davidlohr Bueso <dbueso@suse.de> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/hugetlbfs/inode.c7
-rw-r--r--include/linux/hugetlb.h4
-rw-r--r--mm/hugetlb.c22
-rw-r--r--mm/userfaultfd.c3
4 files changed, 10 insertions, 26 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c74ef4426282..f23237135163 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
440 u32 hash; 440 u32 hash;
441 441
442 index = page->index; 442 index = page->index;
443 hash = hugetlb_fault_mutex_hash(h, current->mm, 443 hash = hugetlb_fault_mutex_hash(h, mapping, index, 0);
444 &pseudo_vma,
445 mapping, index, 0);
446 mutex_lock(&hugetlb_fault_mutex_table[hash]); 444 mutex_lock(&hugetlb_fault_mutex_table[hash]);
447 445
448 /* 446 /*
@@ -639,8 +637,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
639 addr = index * hpage_size; 637 addr = index * hpage_size;
640 638
641 /* mutex taken here, fault path and hole punch */ 639 /* mutex taken here, fault path and hole punch */
642 hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, 640 hash = hugetlb_fault_mutex_hash(h, mapping, index, addr);
643 index, addr);
644 mutex_lock(&hugetlb_fault_mutex_table[hash]); 641 mutex_lock(&hugetlb_fault_mutex_table[hash]);
645 642
646 /* See if already present in mapping to avoid alloc/free */ 643 /* See if already present in mapping to avoid alloc/free */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 11943b60f208..edf476c8cfb9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
123void free_huge_page(struct page *page); 123void free_huge_page(struct page *page);
124void hugetlb_fix_reserve_counts(struct inode *inode); 124void hugetlb_fix_reserve_counts(struct inode *inode);
125extern struct mutex *hugetlb_fault_mutex_table; 125extern struct mutex *hugetlb_fault_mutex_table;
126u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 126u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
127 struct vm_area_struct *vma,
128 struct address_space *mapping,
129 pgoff_t idx, unsigned long address); 127 pgoff_t idx, unsigned long address);
130 128
131pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); 129pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c33c5cbb67ff..98a3c7c224cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3824,8 +3824,7 @@ retry:
3824 * handling userfault. Reacquire after handling 3824 * handling userfault. Reacquire after handling
3825 * fault to make calling code simpler. 3825 * fault to make calling code simpler.
3826 */ 3826 */
3827 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, 3827 hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
3828 idx, haddr);
3829 mutex_unlock(&hugetlb_fault_mutex_table[hash]); 3828 mutex_unlock(&hugetlb_fault_mutex_table[hash]);
3830 ret = handle_userfault(&vmf, VM_UFFD_MISSING); 3829 ret = handle_userfault(&vmf, VM_UFFD_MISSING);
3831 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3830 mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3933,21 +3932,14 @@ backout_unlocked:
3933} 3932}
3934 3933
3935#ifdef CONFIG_SMP 3934#ifdef CONFIG_SMP
3936u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3935u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
3937 struct vm_area_struct *vma,
3938 struct address_space *mapping,
3939 pgoff_t idx, unsigned long address) 3936 pgoff_t idx, unsigned long address)
3940{ 3937{
3941 unsigned long key[2]; 3938 unsigned long key[2];
3942 u32 hash; 3939 u32 hash;
3943 3940
3944 if (vma->vm_flags & VM_SHARED) { 3941 key[0] = (unsigned long) mapping;
3945 key[0] = (unsigned long) mapping; 3942 key[1] = idx;
3946 key[1] = idx;
3947 } else {
3948 key[0] = (unsigned long) mm;
3949 key[1] = address >> huge_page_shift(h);
3950 }
3951 3943
3952 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); 3944 hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
3953 3945
@@ -3958,9 +3950,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
3958 * For uniprocesor systems we always use a single mutex, so just 3950 * For uniprocesor systems we always use a single mutex, so just
3959 * return 0 and avoid the hashing overhead. 3951 * return 0 and avoid the hashing overhead.
3960 */ 3952 */
3961u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, 3953u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
3962 struct vm_area_struct *vma,
3963 struct address_space *mapping,
3964 pgoff_t idx, unsigned long address) 3954 pgoff_t idx, unsigned long address)
3965{ 3955{
3966 return 0; 3956 return 0;
@@ -4005,7 +3995,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
4005 * get spurious allocation failures if two CPUs race to instantiate 3995 * get spurious allocation failures if two CPUs race to instantiate
4006 * the same page in the page cache. 3996 * the same page in the page cache.
4007 */ 3997 */
4008 hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); 3998 hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
4009 mutex_lock(&hugetlb_fault_mutex_table[hash]); 3999 mutex_lock(&hugetlb_fault_mutex_table[hash]);
4010 4000
4011 entry = huge_ptep_get(ptep); 4001 entry = huge_ptep_get(ptep);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d59b5a73dfb3..9932d5755e4c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -271,8 +271,7 @@ retry:
271 */ 271 */
272 idx = linear_page_index(dst_vma, dst_addr); 272 idx = linear_page_index(dst_vma, dst_addr);
273 mapping = dst_vma->vm_file->f_mapping; 273 mapping = dst_vma->vm_file->f_mapping;
274 hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, 274 hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
275 idx, dst_addr);
276 mutex_lock(&hugetlb_fault_mutex_table[hash]); 275 mutex_lock(&hugetlb_fault_mutex_table[hash]);
277 276
278 err = -ENOMEM; 277 err = -ENOMEM;