aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Gibson <david@gibson.dropbear.id.au>2006-03-22 03:08:53 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-22 10:54:03 -0500
commit3935baa9bcda3ccaee4f7849f5157d316e34412e (patch)
tree45f6d064693a91171c57159acac43822cae6e129
parent79ac6ba40eb8d70f0d204e98ae9b63280ad1018c (diff)
[PATCH] hugepage: serialize hugepage allocation and instantiation
Currently, no lock or mutex is held between allocating a hugepage and inserting it into the pagetables / page cache. When we do go to insert the page into pagetables or page cache, we recheck and may free the newly allocated hugepage. However, since the number of hugepages in the system is strictly limited, and it's usualy to want to use all of them, this can still lead to spurious allocation failures. For example, suppose two processes are both mapping (MAP_SHARED) the same hugepage file, large enough to consume the entire available hugepage pool. If they race instantiating the last page in the mapping, they will both attempt to allocate the last available hugepage. One will fail, of course, returning OOM from the fault and thus causing the process to be killed, despite the fact that the entire mapping can, in fact, be instantiated. The patch fixes this race by the simple method of adding a (sleeping) mutex to serialize the hugepage fault path between allocation and insertion into pagetables and/or page cache. It would be possible to avoid the serialization by catching the allocation failures, waiting on some condition, then rechecking to see if someone else has instantiated the page for us. Given the likely frequency of hugepage instantiations, it seems very doubtful it's worth the extra complexity. This patch causes no regression on the libhugetlbfs testsuite, and one test, which can trigger this race now passes where it previously failed. Actually, the test still sometimes fails, though less often and only as a shmat() failure, rather processes getting OOM killed by the VM. The dodgy heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage space aren't protected by the new mutex, and would be ugly to do so, so there's still a race there. Another patch to replace those tests with something saner for this reason as well as others coming... Signed-off-by: David Gibson <dwg@au1.ibm.com> Cc: William Lee Irwin III <wli@holomorphy.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--mm/hugetlb.c25
1 files changed, 18 insertions, 7 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 41b1038f76da..d5987a87bbe5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 15#include <linux/cpuset.h>
16#include <linux/mutex.h>
16 17
17#include <asm/page.h> 18#include <asm/page.h>
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
@@ -26,6 +27,10 @@ unsigned long max_huge_pages;
26static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
27static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
28static unsigned int free_huge_pages_node[MAX_NUMNODES]; 29static unsigned int free_huge_pages_node[MAX_NUMNODES];
30/*
31 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
32 */
33static DEFINE_SPINLOCK(hugetlb_lock);
29 34
30static void clear_huge_page(struct page *page, unsigned long addr) 35static void clear_huge_page(struct page *page, unsigned long addr)
31{ 36{
@@ -50,11 +55,6 @@ static void copy_huge_page(struct page *dst, struct page *src,
50 } 55 }
51} 56}
52 57
53/*
54 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
55 */
56static DEFINE_SPINLOCK(hugetlb_lock);
57
58static void enqueue_huge_page(struct page *page) 58static void enqueue_huge_page(struct page *page)
59{ 59{
60 int nid = page_to_nid(page); 60 int nid = page_to_nid(page);
@@ -508,14 +508,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
508 pte_t *ptep; 508 pte_t *ptep;
509 pte_t entry; 509 pte_t entry;
510 int ret; 510 int ret;
511 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
511 512
512 ptep = huge_pte_alloc(mm, address); 513 ptep = huge_pte_alloc(mm, address);
513 if (!ptep) 514 if (!ptep)
514 return VM_FAULT_OOM; 515 return VM_FAULT_OOM;
515 516
517 /*
518 * Serialize hugepage allocation and instantiation, so that we don't
519 * get spurious allocation failures if two CPUs race to instantiate
520 * the same page in the page cache.
521 */
522 mutex_lock(&hugetlb_instantiation_mutex);
516 entry = *ptep; 523 entry = *ptep;
517 if (pte_none(entry)) 524 if (pte_none(entry)) {
518 return hugetlb_no_page(mm, vma, address, ptep, write_access); 525 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
526 mutex_unlock(&hugetlb_instantiation_mutex);
527 return ret;
528 }
519 529
520 ret = VM_FAULT_MINOR; 530 ret = VM_FAULT_MINOR;
521 531
@@ -525,6 +535,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
525 if (write_access && !pte_write(entry)) 535 if (write_access && !pte_write(entry))
526 ret = hugetlb_cow(mm, vma, address, ptep, entry); 536 ret = hugetlb_cow(mm, vma, address, ptep, entry);
527 spin_unlock(&mm->page_table_lock); 537 spin_unlock(&mm->page_table_lock);
538 mutex_unlock(&hugetlb_instantiation_mutex);
528 539
529 return ret; 540 return ret;
530} 541}