author		Mel Gorman <mel@csn.ul.ie>	2008-07-24 00:27:25 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-24 13:47:16 -0400
commit		04f2cbe35699d22dbf428373682ead85ca1240f5 (patch)
tree		1987a2c704cc97d8adf603054c9d89d18b9b30e0 /mm/hugetlb.c
parent		a1e78772d72b2616ed20e54896e68e0e7044854e (diff)
hugetlb: guarantee that COW faults for a process that called mmap(MAP_PRIVATE) on hugetlbfs will succeed
After patch 2 in this series, a process that successfully calls mmap() for a MAP_PRIVATE mapping will be guaranteed to successfully fault until a process calls fork(). At that point, the next write fault from the parent could fail due to COW if the child still has a reference.

We only reserve pages for the parent but a copy must be made to avoid leaking data from the parent to the child after fork(). Reserves could be taken for both parent and child at fork time to guarantee faults but if the mapping is large it is highly likely we will not have sufficient pages for the reservation, and it is common to fork only to exec() immediately after. A failure here would be very undesirable.

Note that the current behaviour of mainline with MAP_PRIVATE pages is pretty bad. The following situation is allowed to occur today.

1. Process calls mmap(MAP_PRIVATE)
2. Process calls mlock() to fault all pages and makes sure it succeeds
3. Process forks()
4. Process writes to MAP_PRIVATE mapping while child still exists
5. If the COW fails at this point, the process gets SIGKILLed even though it had taken care to ensure the pages existed

This patch improves the situation by guaranteeing the reliability of the process that successfully calls mmap(). When the parent performs COW, it will try to satisfy the allocation without using reserves. If that fails, the parent will steal the page, leaving any children without a page. Faults from the child after that point will result in failure. If the child COW happens first, an attempt will be made to allocate the page without reserves and the child will get SIGKILLed on failure.

To summarise the new behaviour:

1. If the original mapper performs COW on a private mapping with multiple references, it will attempt to allocate a hugepage from the pool or the buddy allocator without using the existing reserves. On failure, VMAs mapping the same area are traversed and the page being COW'd is unmapped where found. It will then steal the original page as the last mapper in the normal way.

2. The VMAs the pages were unmapped from are flagged to note that pages with data no longer exist. Future no-page faults on those VMAs will terminate the process as otherwise it would appear that data was corrupted. A warning is printed to the console that this situation occurred.

3. If the child performs COW first, it will attempt to satisfy the COW from the pool if there are enough pages, or via the buddy allocator if overcommit is allowed and the buddy allocator can satisfy the request. If it fails, the child will be killed.

If the pool is large enough, existing applications will not notice that the reserves were a factor. Existing applications depending on no reserves being set are unlikely to exist, as for much of the history of hugetlbfs, pages were prefaulted at mmap(), allocating the pages at that point or failing the mmap().

[npiggin@suse.de: fix CONFIG_HUGETLB=n build]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
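The failure mode in steps 1-5 above can be reproduced from userspace with a short test program. The sketch below is illustrative only: the hugetlbfs mount point /mnt/huge and the 2MB hugepage size are assumptions, not part of the patch, and error handling is kept minimal.

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

#define LENGTH (2UL * 1024 * 1024)	/* assumes a 2MB hugepage size */

int main(void)
{
	/* Assumed hugetlbfs mount point; adjust to the local setup. */
	int fd = open("/mnt/huge/testfile", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Step 1: MAP_PRIVATE mapping of a hugetlbfs file */
	char *addr = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Step 2: fault in all pages up front and make sure that succeeds */
	if (mlock(addr, LENGTH) != 0) {
		perror("mlock");
		return 1;
	}

	/* Step 3: fork while the private pages are still shared with the child */
	pid_t pid = fork();
	if (pid == 0) {
		sleep(5);	/* keep the child's reference alive */
		_exit(0);
	}

	/*
	 * Step 4: the parent writes, forcing COW while the child still holds a
	 * reference. Step 5: before this patch, an exhausted hugepage pool here
	 * meant the parent was SIGKILLed; with the patch the parent is
	 * guaranteed to succeed and any stolen page is taken from the child.
	 */
	memset(addr, 0x55, LENGTH);

	waitpid(pid, NULL, 0);
	munmap(addr, LENGTH);
	close(fd);
	return 0;
}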
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	201
1 files changed, 183 insertions, 18 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0af500db3632..a2d29b84501f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,9 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
+#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
 /*
  * These helpers are used to track how many pages are reserved for
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -54,17 +57,32 @@ static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	if (!(vma->vm_flags & VM_SHARED))
-		return (unsigned long)vma->vm_private_data;
+		return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
 	return 0;
 }
 
 static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
 							unsigned long reserve)
 {
+	unsigned long flags;
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	VM_BUG_ON(vma->vm_flags & VM_SHARED);
 
-	vma->vm_private_data = (void *)reserve;
+	flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
+	vma->vm_private_data = (void *)(reserve | flags);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+	unsigned long reserveflags = (unsigned long)vma->vm_private_data;
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	vma->vm_private_data = (void *)(reserveflags | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	return ((unsigned long)vma->vm_private_data & flag) != 0;
 }
 
 /* Decrement the reserved pages in the hugepage pool by one */
@@ -78,14 +96,18 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
 		 * Only the process that called mmap() has reserves for
 		 * private mappings.
 		 */
-		if (vma_resv_huge_pages(vma)) {
+		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+			unsigned long flags, reserve;
 			resv_huge_pages--;
+			flags = (unsigned long)vma->vm_private_data &
+							HPAGE_RESV_MASK;
 			reserve = (unsigned long)vma->vm_private_data - 1;
-			vma->vm_private_data = (void *)reserve;
+			vma->vm_private_data = (void *)(reserve | flags);
 		}
 	}
 }
 
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
@@ -153,7 +175,7 @@ static struct page *dequeue_huge_page(void)
 }
 
 static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
-				unsigned long address)
+				unsigned long address, int avoid_reserve)
 {
 	int nid;
 	struct page *page = NULL;
@@ -173,6 +195,10 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			free_huge_pages - resv_huge_pages == 0)
 		return NULL;
 
+	/* If reserves cannot be used, ensure enough pages are in the pool */
+	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+		return NULL;
+
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
@@ -183,7 +209,9 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
-			decrement_hugepage_resv_vma(vma);
+
+			if (!avoid_reserve)
+				decrement_hugepage_resv_vma(vma);
 
 			break;
 		}
@@ -534,7 +562,7 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
-				    unsigned long addr)
+				    unsigned long addr, int avoid_reserve)
 {
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
@@ -546,14 +574,15 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * will not have accounted against quota. Check that the quota can be
 	 * made before satisfying the allocation
 	 */
-	if (!vma_has_private_reserves(vma)) {
+	if (!(vma->vm_flags & VM_SHARED) &&
+			!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		chg = 1;
 		if (hugetlb_get_quota(inode->i_mapping, chg))
 			return ERR_PTR(-ENOSPC);
 	}
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page_vma(vma, addr);
+	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
@@ -909,7 +938,7 @@ nomem:
 }
 
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end)
+			    unsigned long end, struct page *ref_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
@@ -937,6 +966,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		if (huge_pmd_unshare(mm, &address, ptep))
 			continue;
 
+		/*
+		 * If a reference page is supplied, it is because a specific
+		 * page is being unmapped, not a range. Ensure the page we
+		 * are about to unmap is the actual page of interest.
+		 */
+		if (ref_page) {
+			pte = huge_ptep_get(ptep);
+			if (huge_pte_none(pte))
+				continue;
+			page = pte_page(pte);
+			if (page != ref_page)
+				continue;
+
+			/*
+			 * Mark the VMA as having unmapped its page so that
+			 * future faults in this VMA will fail rather than
+			 * looking like data was lost
+			 */
+			set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+		}
+
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		if (huge_pte_none(pte))
 			continue;
@@ -955,7 +1005,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end)
+			  unsigned long end, struct page *ref_page)
 {
 	/*
 	 * It is undesirable to test vma->vm_file as it should be non-null
@@ -967,19 +1017,68 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	 */
 	if (vma->vm_file) {
 		spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-		__unmap_hugepage_range(vma, start, end);
+		__unmap_hugepage_range(vma, start, end, ref_page);
 		spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 	}
 }
 
+/*
+ * This is called when the original mapper is failing to COW a MAP_PRIVATE
+ * mappping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+					struct vm_area_struct *vma,
+					struct page *page,
+					unsigned long address)
+{
+	struct vm_area_struct *iter_vma;
+	struct address_space *mapping;
+	struct prio_tree_iter iter;
+	pgoff_t pgoff;
+
+	/*
+	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+	 * from page cache lookup which is in HPAGE_SIZE units.
+	 */
+	address = address & huge_page_mask(hstate_vma(vma));
+	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+		+ (vma->vm_pgoff >> PAGE_SHIFT);
+	mapping = (struct address_space *)page_private(page);
+
+	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		/* Do not unmap the current VMA */
+		if (iter_vma == vma)
+			continue;
+
+		/*
+		 * Unmap the page from other VMAs without their own reserves.
+		 * They get marked to be SIGKILLed if they fault in these
+		 * areas. This is because a future no-page fault on this VMA
+		 * could insert a zeroed page instead of the data existing
+		 * from the time of fork. This would look like data corruption
+		 */
+		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+			unmap_hugepage_range(iter_vma,
+				address, address + HPAGE_SIZE,
+				page);
+	}
+
+	return 1;
+}
+
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long address, pte_t *ptep, pte_t pte)
+			unsigned long address, pte_t *ptep, pte_t pte,
+			struct page *pagecache_page)
 {
 	struct page *old_page, *new_page;
 	int avoidcopy;
+	int outside_reserve = 0;
 
 	old_page = pte_page(pte);
 
+retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
 	avoidcopy = (page_count(old_page) == 1);
@@ -988,11 +1087,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	}
 
+	/*
+	 * If the process that created a MAP_PRIVATE mapping is about to
+	 * perform a COW due to a shared page count, attempt to satisfy
+	 * the allocation without using the existing reserves. The pagecache
+	 * page is used to determine if the reserve at this address was
+	 * consumed or not. If reserves were used, a partial faulted mapping
+	 * at the time of fork() could consume its reserves on COW instead
+	 * of the full address range.
+	 */
+	if (!(vma->vm_flags & VM_SHARED) &&
+			is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+			old_page != pagecache_page)
+		outside_reserve = 1;
+
 	page_cache_get(old_page);
-	new_page = alloc_huge_page(vma, address);
+	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
 		page_cache_release(old_page);
+
+		/*
+		 * If a process owning a MAP_PRIVATE mapping fails to COW,
+		 * it is due to references held by a child and an insufficient
+		 * huge page pool. To guarantee the original mappers
+		 * reliability, unmap the page from child processes. The child
+		 * may get SIGKILLed if it later faults.
+		 */
+		if (outside_reserve) {
+			BUG_ON(huge_pte_none(pte));
+			if (unmap_ref_private(mm, vma, old_page, address)) {
+				BUG_ON(page_count(old_page) != 1);
+				BUG_ON(huge_pte_none(pte));
+				goto retry_avoidcopy;
+			}
+			WARN_ON_ONCE(1);
+		}
+
 		return -PTR_ERR(new_page);
 	}
 
@@ -1015,6 +1146,20 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 0;
 }
 
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
+				unsigned long address)
+{
+	struct address_space *mapping;
+	unsigned long idx;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	return find_lock_page(mapping, idx);
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, int write_access)
 {
@@ -1025,6 +1170,18 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct address_space *mapping;
 	pte_t new_pte;
 
+	/*
+	 * Currently, we are forced to kill the process in the event the
+	 * original mapper has unmapped pages from the child due to a failed
+	 * COW. Warn that such a situation has occured as it may not be obvious
+	 */
+	if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+		printk(KERN_WARNING
+			"PID %d killed due to inadequate hugepage pool\n",
+			current->pid);
+		return ret;
+	}
+
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
 		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
@@ -1039,7 +1196,7 @@ retry:
 		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 		if (idx >= size)
 			goto out;
-		page = alloc_huge_page(vma, address);
+		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
 			ret = -PTR_ERR(page);
 			goto out;
@@ -1081,7 +1238,7 @@ retry:
 
 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+		ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
 	}
 
 	spin_unlock(&mm->page_table_lock);
@@ -1126,8 +1283,15 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry))
-			ret = hugetlb_cow(mm, vma, address, ptep, entry);
+		if (write_access && !pte_write(entry)) {
+			struct page *page;
+			page = hugetlbfs_pagecache_page(vma, address);
+			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
+			if (page) {
+				unlock_page(page);
+				put_page(page);
+			}
+		}
 	spin_unlock(&mm->page_table_lock);
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
@@ -1371,6 +1535,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 	else {
 		chg = to - from;
 		set_vma_resv_huge_pages(vma, chg);
+		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
 	if (chg < 0)
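
The reserve bookkeeping introduced above packs both a page count and the two HPAGE_RESV_* flags into the single vma->vm_private_data word. The following standalone user-space C sketch restates that encoding with the constants copied from the first hunk; the helper names in the comments refer to the kernel functions in the hunks above, the 512-page figure is arbitrary, and plain unsigned long stands in for the kernel pointer type.

#include <assert.h>
#include <limits.h>
#include <stdio.h>

/* Constants copied from the first hunk above. */
#define BITS_PER_LONG		(sizeof(unsigned long) * CHAR_BIT)
#define HPAGE_RESV_OWNER	(1UL << (BITS_PER_LONG - 1))
#define HPAGE_RESV_UNMAPPED	(1UL << (BITS_PER_LONG - 2))
#define HPAGE_RESV_MASK		(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

int main(void)
{
	/* Stand-in for vma->vm_private_data. */
	unsigned long priv = 0;

	/* hugetlb_reserve_pages(): store the reserve count, then mark the owner. */
	unsigned long chg = 512;
	priv = (priv & HPAGE_RESV_MASK) | chg;	/* set_vma_resv_huge_pages() */
	priv |= HPAGE_RESV_OWNER;		/* set_vma_resv_flags() */

	/* vma_resv_huge_pages(): the count is everything below the flag bits. */
	assert((priv & ~HPAGE_RESV_MASK) == 512);

	/* Same net effect as decrement_hugepage_resv_vma(): drop one page, keep flags. */
	unsigned long flags = priv & HPAGE_RESV_MASK;
	priv = ((priv & ~HPAGE_RESV_MASK) - 1) | flags;
	assert((priv & ~HPAGE_RESV_MASK) == 511);

	/* is_vma_resv_set(): flag queries are simple bit tests. */
	assert(priv & HPAGE_RESV_OWNER);
	assert(!(priv & HPAGE_RESV_UNMAPPED));

	printf("reserves remaining: %lu\n", priv & ~HPAGE_RESV_MASK);
	return 0;
}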