author		Mel Gorman <mel@csn.ul.ie>	2008-07-24 00:27:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-24 13:47:16 -0400
commit		04f2cbe35699d22dbf428373682ead85ca1240f5 (patch)
tree		1987a2c704cc97d8adf603054c9d89d18b9b30e0 /mm/hugetlb.c
parent		a1e78772d72b2616ed20e54896e68e0e7044854e (diff)
hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed
After patch 2 in this series, a process that successfully calls mmap() for a MAP_PRIVATE mapping will be guaranteed to successfully fault until a process calls fork(). At that point, the next write fault from the parent could fail due to COW if the child still has a reference.

We only reserve pages for the parent but a copy must be made to avoid leaking data from the parent to the child after fork(). Reserves could be taken for both parent and child at fork time to guarantee faults but if the mapping is large it is highly likely we will not have sufficient pages for the reservation, and it is common to fork only to exec() immediately after. A failure here would be very undesirable.

Note that the current behaviour of mainline with MAP_PRIVATE pages is pretty bad. The following situation is allowed to occur today.

1. Process calls mmap(MAP_PRIVATE)
2. Process calls mlock() to fault all pages and makes sure it succeeds
3. Process forks()
4. Process writes to MAP_PRIVATE mapping while child still exists
5. If the COW fails at this point, the process gets SIGKILLed even though it had taken care to ensure the pages existed

This patch improves the situation by guaranteeing the reliability of the process that successfully calls mmap(). When the parent performs COW, it will try to satisfy the allocation without using reserves. If that fails, the parent will steal the page, leaving any children without a page. Faults from the child after that point will result in failure. If the child COW happens first, an attempt will be made to allocate the page without reserves and the child will get SIGKILLed on failure.

To summarise the new behaviour:

1. If the original mapper performs COW on a private mapping with multiple references, it will attempt to allocate a hugepage from the pool or the buddy allocator without using the existing reserves. On failure, VMAs mapping the same area are traversed and the page being COW'd is unmapped where found. It will then steal the original page as the last mapper in the normal way.

2. The VMAs the pages were unmapped from are flagged to note that pages with data no longer exist. Future no-page faults on those VMAs will terminate the process as otherwise it would appear that data was corrupted. A warning is printed to the console that this situation occurred.

3. If the child performs COW first, it will attempt to satisfy the COW from the pool if there are enough pages, or via the buddy allocator if overcommit is allowed and the buddy allocator can satisfy the request. If it fails, the child will be killed.

If the pool is large enough, existing applications will not notice that the reserves were a factor. Existing applications depending on no reserves being set are unlikely to exist, as for much of the history of hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that point or failing the mmap().

[npiggin@suse.de: fix CONFIG_HUGETLB=n build]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
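The failure mode in steps 1-5 above can be reproduced from userspace with a short test program. The sketch below is illustrative only: the hugetlbfs mount point /mnt/huge and the 2MB hugepage size are assumptions, not part of the patch, and error handling is kept minimal.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

#define LENGTH (2UL * 1024 * 1024)	/* assumes a 2MB hugepage size */

int main(void)
{
	/* Assumed hugetlbfs mount point; adjust to the local setup. */
	int fd = open("/mnt/huge/testfile", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Step 1: MAP_PRIVATE mapping of a hugetlbfs file */
	char *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Step 2: fault in all pages up front and make sure that succeeds */
	if (mlock(addr, LENGTH) != 0) {
		perror("mlock");
		return 1;
	}

	/* Step 3: fork while the private pages are still shared with the child */
	pid_t pid = fork();
	if (pid == 0) {
		sleep(5);	/* keep the child's reference alive */
		_exit(0);
	}

	/*
	 * Step 4: the parent writes, forcing COW while the child still holds a
	 * reference. Step 5: before this patch, an exhausted hugepage pool here
	 * meant the parent was SIGKILLed; with the patch the parent is
	 * guaranteed to succeed and any stolen page is taken from the child.
	 */
	memset(addr, 0x55, LENGTH);

	waitpid(pid, NULL, 0);
	munmap(addr, LENGTH);
	close(fd);
	return 0;
}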
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	201
1 files changed, 183 insertions, 18 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0af500db3632..a2d29b84501f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,9 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
+#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
 /*
  * These helpers are used to track how many pages are reserved for
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	if (!(vma->vm_flags & VM_SHARED))
-		return (unsigned long)vma->vm_private_data;
+		return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
 	return 0;
 }
 
 static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
 							unsigned long reserve)
 {
+	unsigned long flags;
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	VM_BUG_ON(vma->vm_flags & VM_SHARED);
 
-	vma->vm_private_data = (void *)reserve;
+	flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
+	vma->vm_private_data = (void *)(reserve | flags);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+	unsigned long reserveflags = (unsigned long)vma->vm_private_data;
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	vma->vm_private_data = (void *)(reserveflags | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	return ((unsigned long)vma->vm_private_data & flag) != 0;
 }
 
 /* Decrement the reserved pages in the hugepage pool by one */
@@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
 		 * Only the process that called mmap() has reserves for
 		 * private mappings.
 		 */
-		if (vma_resv_huge_pages(vma)) {
+		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+			unsigned long flags, reserve;
 			resv_huge_pages--;
+			flags = (unsigned long)vma->vm_private_data &
+							HPAGE_RESV_MASK;
 			reserve = (unsigned long)vma->vm_private_data - 1;
-			vma->vm_private_data = (void *)reserve;
+			vma->vm_private_data = (void *)(reserve | flags);
 		}
 	}
 }
 
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
@@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void)
 }
 
 static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
-				unsigned long address)
+				unsigned long address, int avoid_reserve)
 {
 	int nid;
 	struct page *page = NULL;
@@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			free_huge_pages - resv_huge_pages == 0)
 		return NULL;
 
+	/* If reserves cannot be used, ensure enough pages are in the pool */
+	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+		return NULL;
+
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
@@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
-			decrement_hugepage_resv_vma(vma);
+
+			if (!avoid_reserve)
+				decrement_hugepage_resv_vma(vma);
 
 			break;
 		}
@@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
-				    unsigned long addr)
+				    unsigned long addr, int avoid_reserve)
 {
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * will not have accounted against quota. Check that the quota can be
 	 * made before satisfying the allocation
 	 */
-	if (!vma_has_private_reserves(vma)) {
+	if (!(vma->vm_flags & VM_SHARED) &&
+			!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		chg = 1;
 		if (hugetlb_get_quota(inode->i_mapping, chg))
 			return ERR_PTR(-ENOSPC);
 	}
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page_vma(vma, addr);
+	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
@@ -909,7 +938,7 @@ nomem:
 }
 
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end)
+			    unsigned long end, struct page *ref_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (huge_pmd_unshare(mm, &address, ptep))
 			continue;
 
+		/*
+		 * If a reference page is supplied, it is because a specific
+		 * page is being unmapped, not a range. Ensure the page we
+		 * are about to unmap is the actual page of interest.
+		 */
+		if (ref_page) {
+			pte = huge_ptep_get(ptep);
+			if (huge_pte_none(pte))
+				continue;
+			page = pte_page(pte);
+			if (page != ref_page)
+				continue;
+
+			/*
+			 * Mark the VMA as having unmapped its page so that
+			 * future faults in this VMA will fail rather than
+			 * looking like data was lost
+			 */
+			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+		}
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (huge_pte_none(pte))
 			continue;
@@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end)
+			  unsigned long end, struct page *ref_page)
 {
 	/*
 	 * It is undesirable to test vma->vm_file as it should be non-null
@@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	 */
 	if (vma->vm_file) {
 		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-		__unmap_hugepage_range(vma, start, end);
+		__unmap_hugepage_range(vma, start, end, ref_page);
 		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 	}
 }
 
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mappping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+					struct vm_area_struct *vma,
+					struct page *page,
+					unsigned long address)
+{
+	struct vm_area_struct *iter_vma;
+	struct address_space *mapping;
+	struct prio_tree_iter iter;
+	pgoff_t pgoff;
+
+	/*
+	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+	 * from page cache lookup which is in HPAGE_SIZE units.
+	 */
+	address = address & huge_page_mask(hstate_vma(vma));
+	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+		+ (vma->vm_pgoff >> PAGE_SHIFT);
+	mapping = (struct address_space *)page_private(page);
+
+	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		/* Do not unmap the current VMA */
+		if (iter_vma == vma)
+			continue;
+
+		/*
+		 * Unmap the page from other VMAs without their own reserves.
+		 * They get marked to be SIGKILLed if they fault in these
+		 * areas. This is because a future no-page fault on this VMA
+		 * could insert a zeroed page instead of the data existing
+		 * from the time of fork. This would look like data corruption
+		 */
+		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+			unmap_hugepage_range(iter_vma,
+				address, address + HPAGE_SIZE,
+				page);
+	}
+
+	return 1;
+}
+
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, pte_t pte)
+			unsigned long address, pte_t *ptep, pte_t pte,
+			struct page *pagecache_page)
 {
 	struct page *old_page, *new_page;
 	int avoidcopy;
+	int outside_reserve = 0;
 
 	old_page = pte_page(pte);
 
+retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
 	avoidcopy = (page_count(old_page) == 1);
@@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	}
 
+	/*
+	 * If the process that created a MAP_PRIVATE mapping is about to
+	 * perform a COW due to a shared page count, attempt to satisfy
+	 * the allocation without using the existing reserves. The pagecache
+	 * page is used to determine if the reserve at this address was
+	 * consumed or not. If reserves were used, a partial faulted mapping
+	 * at the time of fork() could consume its reserves on COW instead
+	 * of the full address range.
+	 */
+	if (!(vma->vm_flags & VM_SHARED) &&
+			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+			old_page != pagecache_page)
+		outside_reserve = 1;
+
 	page_cache_get(old_page);
-	new_page = alloc_huge_page(vma, address);
+	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
 		page_cache_release(old_page);
+
+		/*
+		 * If a process owning a MAP_PRIVATE mapping fails to COW,
+		 * it is due to references held by a child and an insufficient
+		 * huge page pool. To guarantee the original mappers
+		 * reliability, unmap the page from child processes. The child
+		 * may get SIGKILLed if it later faults.
+		 */
+		if (outside_reserve) {
+			BUG_ON(huge_pte_none(pte));
+			if (unmap_ref_private(mm, vma, old_page, address)) {
+				BUG_ON(page_count(old_page) != 1);
+				BUG_ON(huge_pte_none(pte));
+				goto retry_avoidcopy;
+			}
+			WARN_ON_ONCE(1);
+		}
+
 		return -PTR_ERR(new_page);
 	}
 
@@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 0;
 }
 
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
+				unsigned long address)
+{
+	struct address_space *mapping;
+	unsigned long idx;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	return find_lock_page(mapping, idx);
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, int write_access)
 {
@@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct address_space *mapping;
 	pte_t new_pte;
 
+	/*
+	 * Currently, we are forced to kill the process in the event the
+	 * original mapper has unmapped pages from the child due to a failed
+	 * COW. Warn that such a situation has occured as it may not be obvious
+	 */
+	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+		printk(KERN_WARNING
+			"PID %d killed due to inadequate hugepage pool\n",
+			current->pid);
+		return ret;
+	}
+
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
 		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
@@ -1039,7 +1196,7 @@ retry:
 		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 		if (idx >= size)
 			goto out;
-		page = alloc_huge_page(vma, address);
+		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
 			ret = -PTR_ERR(page);
 			goto out;
@@ -1081,7 +1238,7 @@ retry:
 
 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 	}
 
 	spin_unlock(&mm->page_table_lock);
@@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry))
-			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+		if (write_access && !pte_write(entry)) {
+			struct page *page;
+			page = hugetlbfs_pagecache_page(vma, address);
+			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
+			if (page) {
+				unlock_page(page);
+				put_page(page);
+			}
+		}
 	spin_unlock(&mm->page_table_lock);
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
@@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	else {
 		chg = to - from;
 		set_vma_resv_huge_pages(vma, chg);
+		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
 	if (chg < 0)
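
The reserve bookkeeping introduced above packs both a page count and the two HPAGE_RESV_* flags into the single vma->vm_private_data word. The following standalone user-space C sketch restates that encoding with the constants copied from the first hunk; the helper names in the comments refer to the kernel functions in the hunks above, the 512-page figure is arbitrary, and plain unsigned long stands in for the kernel pointer type.

#include <assert.h>
#include <limits.h>
#include <stdio.h>

/* Constants copied from the first hunk above. */
#define BITS_PER_LONG		(sizeof(unsigned long) * CHAR_BIT)
#define HPAGE_RESV_OWNER	(1UL << (BITS_PER_LONG - 1))
#define HPAGE_RESV_UNMAPPED	(1UL << (BITS_PER_LONG - 2))
#define HPAGE_RESV_MASK		(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

int main(void)
{
	/* Stand-in for vma->vm_private_data. */
	unsigned long priv = 0;

	/* hugetlb_reserve_pages(): store the reserve count, then mark the owner. */
	unsigned long chg = 512;
	priv = (priv & HPAGE_RESV_MASK) | chg;	/* set_vma_resv_huge_pages() */
	priv |= HPAGE_RESV_OWNER;		/* set_vma_resv_flags() */

	/* vma_resv_huge_pages(): the count is everything below the flag bits. */
	assert((priv & ~HPAGE_RESV_MASK) == 512);

	/* Same net effect as decrement_hugepage_resv_vma(): drop one page, keep flags. */
	unsigned long flags = priv & HPAGE_RESV_MASK;
	priv = ((priv & ~HPAGE_RESV_MASK) - 1) | flags;
	assert((priv & ~HPAGE_RESV_MASK) == 511);

	/* is_vma_resv_set(): flag queries are simple bit tests. */
	assert(priv & HPAGE_RESV_OWNER);
	assert(!(priv & HPAGE_RESV_UNMAPPED));

	printf("reserves remaining: %lu\n", priv & ~HPAGE_RESV_MASK);
	return 0;
}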