author		Mike Kravetz <mike.kravetz@oracle.com>	2015-06-24 19:57:52 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-06-24 20:49:44 -0400
commit		1dd308a7b49d4bdbc17bfa570675ecc8cf7bedb3 (patch)
tree		01084f854cf648b9a37ea27e9a2147c41e600f3a
parent		9b012a29a300ea780d919205906d00d15cc6286e (diff)
mm/hugetlb: document the reserve map/region tracking routines
While working on hugetlbfs fallocate support, I noticed the following race in the existing code. It is unlikely that this race is hit very often in the current code. However, if more functionality to add and remove pages to hugetlbfs mappings (such as fallocate) is added, the likelihood of hitting this race will increase.

alloc_huge_page and hugetlb_reserve_pages use information from the reserve map to determine if there are enough available huge pages to complete the operation, as well as adjust global reserve and subpool usage counts. The order of operations is as follows:

- call region_chg() to determine the expected change based on the reserve map
- determine if enough resources are available for this operation
- adjust global counts based on the expected change
- call region_add() to update the reserve map

(A simplified sketch of this sequence appears below, after the message.)

The issue is that the reserve map could change between the call to region_chg and region_add. In this case, the counters which were adjusted based on the output of region_chg will not be correct.

In order to hit this race today, there must be an existing shared hugetlb mmap created with the MAP_NORESERVE flag. A page fault to allocate a huge page via this mapping must occur at the same time another task is mapping the same region without the MAP_NORESERVE flag.

The patch set does not prevent the race from happening. Rather, it adds simple functionality to detect when the race has occurred. If a race is detected, the incorrect counts are adjusted.

Review comments pointed out the need for documentation of the existing region/reserve map routines. This patch set also adds documentation in this area.

This patch (of 3):

This is a documentation-only patch and does not modify any code. Descriptions of the routines used for reserve map/region tracking are added.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
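To make the ordering concrete, here is a minimal sketch of the sequence the message describes. region_chg/region_add are the routines documented by this patch; check_availability() and adjust_global_counts() are hypothetical placeholders for the surrounding bookkeeping in alloc_huge_page/hugetlb_reserve_pages, and all locking and error handling is elided:

/*
 * Hedged sketch only -- not kernel code.  check_availability() and
 * adjust_global_counts() stand in for the real availability checks
 * and counter updates.
 */
static long reserve_range_sketch(struct resv_map *resv, long f, long t)
{
	long chg;

	/* 1. expected change based on the current reserve map */
	chg = region_chg(resv, f, t);
	if (chg < 0)
		return chg;		/* -ENOMEM from region_chg */

	/* 2. are enough huge pages available for this change? */
	if (!check_availability(chg))
		return -ENOSPC;

	/* 3. adjust global reserve/subpool counts by the expected change */
	adjust_global_counts(chg);

	/*
	 * 4. update the reserve map.  The map is not held locked across
	 * steps 1-4, so another task may have changed it since
	 * region_chg(); the counts adjusted in step 3 would then be
	 * wrong -- the race this series detects.
	 */
	region_add(resv, f, t);
	return 0;
}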
-rw-r--r--	mm/hugetlb.c	52
1 file changed, 50 insertions(+), 2 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 10de25cf1f99..4a1d7021efaf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -217,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
  * Region tracking -- allows tracking of reservations and instantiated pages
  *                    across the pages in a mapping.
  *
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock.  The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map.  The from and to elements are huge page
+ * indices into the associated mapping.  from indicates the starting index
+ * of the region.  to represents the first index past the end of the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping.  It is important to note that the to element
+ * represents the first element past the end of the region.  This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
  */
 struct file_region {
 	struct list_head link;
@@ -226,6 +238,14 @@ struct file_region {
 	long to;
 };
 
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map.  Existing regions will be expanded to accommodate the
+ * specified range.  We know only existing regions need to be
+ * expanded, because region_add is only called after region_chg
+ * with the same range.  If a new file_region structure must
+ * be allocated, it is done in region_chg.
+ */
 static long region_add(struct resv_map *resv, long f, long t)
 {
 	struct list_head *head = &resv->regions;
@@ -265,6 +285,25 @@ static long region_add(struct resv_map *resv, long f, long t)
 	return 0;
 }
 
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented.  This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t).  region_chg does
+ * not change the number of huge pages represented by the
+ * map.  However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder.  This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Returns the number of huge pages that need to be added
+ * to the existing reservation map for the range [f, t).
+ * This number is greater or equal to zero.  -ENOMEM is
+ * returned if a new file_region structure is needed and can
+ * not be allocated.
+ */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
 	struct list_head *head = &resv->regions;
@@ -331,6 +370,11 @@ out_nrg:
 	return chg;
 }
 
+/*
+ * Truncate the reserve map at index 'end'.  Modify/truncate any
+ * region which contains end.  Delete any regions past end.
+ * Return the number of huge pages removed from the map.
+ */
 static long region_truncate(struct resv_map *resv, long end)
 {
 	struct list_head *head = &resv->regions;
@@ -366,6 +410,10 @@ out:
 	return chg;
 }
 
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
 static long region_count(struct resv_map *resv, long f, long t)
 {
 	struct list_head *head = &resv->regions;
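As an aside, the [from, to) convention documented above is easy to check with a tiny user-space analogue (hypothetical illustration, not part of the patch):

#include <stdio.h>

/* user-space stand-in for the kernel's file_region: covers [from, to) */
struct file_region {
	long from;	/* first huge page index in the region */
	long to;	/* first index past the end of the region */
};

int main(void)
{
	struct file_region r = { .from = 0, .to = 4 };

	/* to - from gives the page count: 4(to) - 0(from) = 4 pages */
	printf("huge pages in region: %ld\n", r.to - r.from);
	return 0;
}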