From caff3a2c333e11a794308bd9a875a09b94fee24a Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Tue, 12 Aug 2008 15:08:38 -0700 Subject: hugetlb: call arch_prepare_hugepage() for surplus pages The s390 software large page emulation implements shared page tables by using page->index of the first tail page from a compound large page to store page table information. This is set up in arch_prepare_hugepage(), which is called from alloc_fresh_huge_page_node(). A similar call to arch_prepare_hugepage() is missing for surplus large pages that are allocated in alloc_buddy_huge_page(), which breaks the software emulation mode for (surplus) large pages on s390. This patch adds the missing call to arch_prepare_hugepage(). It will have no effect on other architectures where arch_prepare_hugepage() is a nop. Also, use the correct order in the error path in alloc_fresh_huge_page_node(). Acked-by: Martin Schwidefsky Signed-off-by: Gerald Schaefer Acked-by: Nick Piggin Acked-by: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 757ca983fd99..92155db888b9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* -- cgit v1.2.2 From 57303d80175e10056bf51206f9961d586f02f967 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:47 -0700 Subject: hugetlbfs: allocate structures for reservation tracking outside of spinlocks In the normal case, hugetlbfs reserves hugepages at map time so that the pages exist for future faults. A struct file_region is used to track when reservations have been consumed and where. These file_regions are allocated as necessary with kmalloc() which can sleep with the mm->page_table_lock held. This is wrong and triggers may-sleep warning when PREEMPT is enabled. Updates to the underlying file_region are done in two phases. The first phase prepares the region for the change, allocating any necessary memory, without actually making the change. The second phase actually commits the change. This patch makes use of this by checking the reservations before the page_table_lock is taken; triggering any necessary allocations. This may then be safely repeated within the locks without any allocations being required. Credit to Mel Gorman for diagnosing this failure and initial versions of the patch. Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 92155db888b9..4c97c174e2e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1942,6 +1942,15 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + vma_needs_reservation(h, vma, address); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1978,6 +1987,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2000,19 +2010,35 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + vma_needs_reservation(h, vma, address); + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } - } + if (write_access && !pte_write(entry)) + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.2 From 2b26736c88db85c038e04c2306d0745553e69602 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 12 Aug 2008 15:08:49 -0700 Subject: allocate structures for reservation tracking in hugetlbfs outside of spinlocks v2 [Andrew this should replace the previous version which did not check the returns from the region prepare for errors. This has been tested by us and Gerald and it looks good. Bah, while reviewing the locking based on your previous email I spotted that we need to check the return from the vma_needs_reservation call for allocation errors. Here is an updated patch to correct this. This passes testing here.] Signed-off-by: Andy Whitcroft Tested-by: Gerald Schaefer Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c97c174e2e1..67a71191136e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1949,7 +1949,10 @@ retry: * the spinlock. */ if (write_access && !(vma->vm_flags & VM_SHARED)) - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); @@ -1976,6 +1979,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -2004,8 +2008,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_unlock; } ret = 0; @@ -2019,7 +2022,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if (write_access && !pte_write(entry)) { - vma_needs_reservation(h, vma, address); + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_unlock; + } if (!(vma->vm_flags & VM_SHARED)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -2039,6 +2045,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } +out_unlock: mutex_unlock(&hugetlb_instantiation_mutex); return ret; -- cgit v1.2.2