author		Andy Whitcroft <apw@shadowen.org>	2008-07-24 00:27:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-07-24 13:47:16 -0400
commit		84afd99b8398c9d73af8238aa3cd835858e3097a (patch)
tree		11908ca096ef8e95e578b766d1077e7d12eb2a62
parent		c37f9fb11c976ffc08200d631dada6dcbfd07ea4 (diff)
hugetlb reservations: fix hugetlb MAP_PRIVATE reservations across vma splits
When a hugetlb mapping with a reservation is split, a new VMA is cloned
from the original.  This new VMA is a direct copy of the original,
including the reservation count.  When this pair of VMAs are unmapped we
will incorrectly double-account the unused reservation and the overall
reservation count will be wrong; in extreme cases it will wrap.

The problem occurs when we split an existing VMA, say to unmap a page in
the middle.  split_vma() will create a new VMA, copying all fields from
the original.  As we are storing our reservation count in vm_private_data
this is also copied, endowing the new VMA with a duplicate of the original
VMA's reservation.  Neither of the new VMAs can exhaust these reservations
as they are too small, but when we unmap and close these VMAs we will
incorrectly credit the remainder twice and resv_huge_pages will become out
of sync.  This can lead to allocation failures on mappings with
reservations, and even to resv_huge_pages wrapping, which prevents all
subsequent hugepage allocations.

The simple fix would be to correctly apportion the remaining reservation
count when the split is made.  However the only hook we have, vm_ops->open,
only has the new VMA; we do not know the identity of the preceding VMA.
Also, even if we did have that VMA to hand, we do not know how much of the
reservation was consumed on each side of the split.

This patch therefore takes a different tack.  We know that the whole of
any private mapping (which has a reservation) has a reservation over its
whole size.  Any present pages represent consumed reservation, so if we
track the instantiated pages we can calculate the remaining reservation.

This patch reuses the existing regions code to track the regions for which
we have consumed reservation (ie. the instantiated pages); as each page is
faulted in we record the consumption of reservation for the new page.
When we need to return unused reservations at unmap time we simply count
the consumed reservation region and subtract that from the whole of the
map.  During a VMA split the newly opened VMA will point to the same
region map; as this map is offset oriented it remains valid for both of
the split VMAs.  This map is reference counted so that it is removed when
all VMAs which are part of the mmap are gone.

Thanks to Adam Litke and Mel Gorman for their review feedback.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Johannes Weiner <hannes@saeurebad.de>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/hugetlb.c	172
1 file changed, 145 insertions(+), 27 deletions(-)
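The accounting idea the commit message describes can be illustrated outside the
kernel.  What follows is an editorial userspace sketch, not part of the patch:
it models a private mapping of ten huge pages whose region map records only the
instantiated offsets, then splits the range and shows each half returning only
its own unused reservation.  The names consumed_region, region_record and
region_overlap are invented for the illustration; the kernel code uses the
file_region / region_chg / region_count helpers visible in the diff below.

/*
 * Editorial sketch of the reservation tracking idea (not kernel code).
 * The map records only offsets that have been instantiated; any VMA that
 * shares the map computes its unused reservation as its own size minus the
 * overlap it has with the consumed regions.
 */
#include <stdio.h>
#include <stdlib.h>

struct consumed_region {
	long from, to;			/* [from, to) in huge-page units */
	struct consumed_region *next;
};

/* Record that offset idx has been instantiated (reservation consumed). */
static void region_record(struct consumed_region **head, long idx)
{
	struct consumed_region *rg = malloc(sizeof(*rg));
	if (!rg)
		abort();
	rg->from = idx;
	rg->to = idx + 1;
	rg->next = *head;
	*head = rg;
}

/* Count how much of [f, t) overlaps the regions already consumed. */
static long region_overlap(struct consumed_region *head, long f, long t)
{
	long chg = 0;
	for (struct consumed_region *rg = head; rg; rg = rg->next) {
		long seg_from = rg->from > f ? rg->from : f;
		long seg_to = rg->to < t ? rg->to : t;
		if (seg_to > seg_from)
			chg += seg_to - seg_from;
	}
	return chg;
}

int main(void)
{
	struct consumed_region *map = NULL;

	/* A private mapping of 10 huge pages: pages 2 and 7 are faulted in. */
	region_record(&map, 2);
	region_record(&map, 7);

	/* Split at offset 5: each half returns only what *it* left unused. */
	long left  = (5 - 0)  - region_overlap(map, 0, 5);	/* 5 - 1 = 4 */
	long right = (10 - 5) - region_overlap(map, 5, 10);	/* 5 - 1 = 4 */
	printf("unused: left=%ld right=%ld total=%ld (10 were reserved)\n",
	       left, right, left + right);
	return 0;
}

Built with any C99 compiler this prints "unused: left=4 right=4 total=8";
together with the two instantiated pages that accounts for exactly the ten
reserved pages, with no double crediting across the split.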
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 72acbb29d2cc..65616941a383 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -43,6 +43,16 @@ static DEFINE_SPINLOCK(hugetlb_lock);
 /*
  * Region tracking -- allows tracking of reservations and instantiated pages
  * across the pages in a mapping.
+ *
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantiation_mutex.  To access or modify a region the
+ * caller must either hold the mmap_sem for write, or the mmap_sem for read
+ * and the hugetlb_instantiation mutex:
+ *
+ *	down_write(&mm->mmap_sem);
+ * or
+ *	down_read(&mm->mmap_sem);
+ *	mutex_lock(&hugetlb_instantiation_mutex);
  */
 struct file_region {
 	struct list_head link;
@@ -165,6 +175,30 @@ static long region_truncate(struct list_head *head, long end)
 	return chg;
 }
 
+static long region_count(struct list_head *head, long f, long t)
+{
+	struct file_region *rg;
+	long chg = 0;
+
+	/* Locate each segment we overlap with, and count that overlap. */
+	list_for_each_entry(rg, head, link) {
+		int seg_from;
+		int seg_to;
+
+		if (rg->to <= f)
+			continue;
+		if (rg->from >= t)
+			break;
+
+		seg_from = max(rg->from, f);
+		seg_to = min(rg->to, t);
+
+		chg += seg_to - seg_from;
+	}
+
+	return chg;
+}
+
 /*
  * Convert the address within this vma to the page offset within
  * the mapping, in base page units.
@@ -187,9 +221,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
 			(vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
 }
 
-#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
-#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+/*
+ * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER    (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
 #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
 /*
  * These helpers are used to track how many pages are reserved for
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -199,6 +239,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
  * the reserve counters are updated with the hugetlb_lock held. It is safe
  * to reset the VMA at fork() time as it is not in use yet and there is no
  * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping.  A shared mapping has a region map associated
+ * with the underlying file; this region map represents the backing file
+ * pages which have ever had a reservation assigned, and this persists even
+ * after the page is instantiated.  A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs that
+ * reference it; this region map represents those offsets which have
+ * consumed reservation, ie. where pages have been instantiated.
  */
 static unsigned long get_vma_private_data(struct vm_area_struct *vma)
 {
@@ -211,22 +260,48 @@ static void set_vma_private_data(struct vm_area_struct *vma,
 	vma->vm_private_data = (void *)value;
 }
 
-static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+struct resv_map {
+	struct kref refs;
+	struct list_head regions;
+};
+
+struct resv_map *resv_map_alloc(void)
+{
+	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+	if (!resv_map)
+		return NULL;
+
+	kref_init(&resv_map->refs);
+	INIT_LIST_HEAD(&resv_map->regions);
+
+	return resv_map;
+}
+
+void resv_map_release(struct kref *ref)
+{
+	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+
+	/* Clear out any active regions before we release the map. */
+	region_truncate(&resv_map->regions, 0);
+	kfree(resv_map);
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	if (!(vma->vm_flags & VM_SHARED))
-		return get_vma_private_data(vma) & ~HPAGE_RESV_MASK;
+		return (struct resv_map *)(get_vma_private_data(vma) &
+							~HPAGE_RESV_MASK);
 	return 0;
 }
 
-static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
-	unsigned long reserve)
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
 {
 	VM_BUG_ON(!is_vm_hugetlb_page(vma));
 	VM_BUG_ON(vma->vm_flags & VM_SHARED);
 
-	set_vma_private_data(vma,
-		(get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve);
+	set_vma_private_data(vma, (get_vma_private_data(vma) &
+				HPAGE_RESV_MASK) | (unsigned long)map);
 }
 
 static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
@@ -253,19 +328,12 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
 	if (vma->vm_flags & VM_SHARED) {
 		/* Shared mappings always use reserves */
 		resv_huge_pages--;
-	} else {
+	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		/*
 		 * Only the process that called mmap() has reserves for
 		 * private mappings.
 		 */
-		if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
-			unsigned long flags, reserve;
-			resv_huge_pages--;
-			flags = (unsigned long)vma->vm_private_data &
-							HPAGE_RESV_MASK;
-			reserve = (unsigned long)vma->vm_private_data - 1;
-			vma->vm_private_data = (void *)(reserve | flags);
-		}
+		resv_huge_pages--;
 	}
 }
 
@@ -282,7 +350,7 @@ static int vma_has_private_reserves(struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & VM_SHARED)
 		return 0;
-	if (!vma_resv_huge_pages(vma))
+	if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
 		return 0;
 	return 1;
 }
@@ -742,12 +810,19 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
 		return region_chg(&inode->i_mapping->private_list,
 							idx, idx + 1);
 
-	} else {
-		if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-			return 1;
-	}
+	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+		return 1;
 
-	return 0;
+	} else {
+		int err;
+		pgoff_t idx = vma_pagecache_offset(vma, addr);
+		struct resv_map *reservations = vma_resv_map(vma);
+
+		err = region_chg(&reservations->regions, idx, idx + 1);
+		if (err < 0)
+			return err;
+		return 0;
+	}
 }
 static void vma_commit_reservation(struct vm_area_struct *vma,
 						unsigned long addr)
@@ -758,6 +833,13 @@ static void vma_commit_reservation(struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_SHARED) {
 		pgoff_t idx = vma_pagecache_offset(vma, addr);
 		region_add(&inode->i_mapping->private_list, idx, idx + 1);
+
+	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+		pgoff_t idx = vma_pagecache_offset(vma, addr);
+		struct resv_map *reservations = vma_resv_map(vma);
+
+		/* Mark this page used in the map. */
+		region_add(&reservations->regions, idx, idx + 1);
 	}
 }
 
@@ -1047,11 +1129,41 @@ out:
 	return ret;
 }
 
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+	struct resv_map *reservations = vma_resv_map(vma);
+
+	/*
+	 * This new VMA should share its sibling's reservation map if present.
+	 * The VMA will only ever have a valid reservation map pointer where
+	 * it is being copied for another still existing VMA.  As that VMA
+	 * has a reference to the reservation map it cannot disappear until
+	 * after this open call completes.  It is therefore safe to take a
+	 * new reference here without additional locking.
+	 */
+	if (reservations)
+		kref_get(&reservations->refs);
+}
+
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
-	unsigned long reserve = vma_resv_huge_pages(vma);
-	if (reserve)
-		hugetlb_acct_memory(-reserve);
+	struct resv_map *reservations = vma_resv_map(vma);
+	unsigned long reserve;
+	unsigned long start;
+	unsigned long end;
+
+	if (reservations) {
+		start = vma_pagecache_offset(vma, vma->vm_start);
+		end = vma_pagecache_offset(vma, vma->vm_end);
+
+		reserve = (end - start) -
+			region_count(&reservations->regions, start, end);
+
+		kref_put(&reservations->refs, resv_map_release);
+
+		if (reserve)
+			hugetlb_acct_memory(-reserve);
+	}
 }
 
 /*
@@ -1068,6 +1180,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 struct vm_operations_struct hugetlb_vm_ops = {
 	.fault = hugetlb_vm_op_fault,
+	.open = hugetlb_vm_op_open,
 	.close = hugetlb_vm_op_close,
 };
 
@@ -1617,8 +1730,13 @@ int hugetlb_reserve_pages(struct inode *inode,
 	if (!vma || vma->vm_flags & VM_SHARED)
 		chg = region_chg(&inode->i_mapping->private_list, from, to);
 	else {
+		struct resv_map *resv_map = resv_map_alloc();
+		if (!resv_map)
+			return -ENOMEM;
+
 		chg = to - from;
-		set_vma_resv_huge_pages(vma, chg);
+
+		set_vma_resv_map(vma, resv_map);
 		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
 	}
 
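One detail the patch relies on, per the comment added above the HPAGE_RESV_*
definitions, is that the OWNER and UNMAPPED flags live in the bottom bits of
the kmalloc()ed reservation map pointer stored in vm_private_data, bits that
allocator alignment guarantees are clear.  The userspace sketch below is an
editorial illustration of that pointer/flag packing; RESV_OWNER,
RESV_UNMAPPED and struct resv_map_demo are invented names, not kernel
identifiers.

/*
 * Editorial sketch (not from the patch): flag bits stored in the low bits
 * of an aligned pointer.  malloc() returns storage aligned to well over
 * four bytes, so the bottom two bits of the pointer are free to carry flags.
 */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define RESV_OWNER    (1UL << 0)
#define RESV_UNMAPPED (1UL << 1)
#define RESV_MASK     (RESV_OWNER | RESV_UNMAPPED)

struct resv_map_demo { long dummy; };

int main(void)
{
	struct resv_map_demo *map = malloc(sizeof(*map));
	assert(map && ((uintptr_t)map & RESV_MASK) == 0);	/* low bits free */

	/* Pack pointer and flag together, as vm_private_data does. */
	uintptr_t private_data = (uintptr_t)map | RESV_OWNER;

	/* Unpack: mask off the flags to recover the pointer, and vice versa. */
	struct resv_map_demo *again =
		(struct resv_map_demo *)(private_data & ~RESV_MASK);
	unsigned long flags = private_data & RESV_MASK;

	assert(again == map && (flags & RESV_OWNER));
	free(map);
	return 0;
}

This mirrors what get_vma_private_data()/set_vma_private_data() together with
HPAGE_RESV_MASK do in the diff: masking with ~HPAGE_RESV_MASK recovers the
resv_map pointer, masking with HPAGE_RESV_MASK recovers the flags.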