diff options
author | Andi Kleen <ak@suse.de> | 2008-07-24 00:27:41 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-24 13:47:17 -0400 |
commit | a5516438959d90b071ff0a484ce4f3f523dc3152 (patch) | |
tree | e356ba9364c76b93c176b4d4a262b7aca3ee8f91 /mm/hugetlb.c | |
parent | b7ba30c679ed1eb7ed3ed8f281f6493282042bd4 (diff) |
hugetlb: modular state for hugetlb page size
The goal of this patchset is to support multiple hugetlb page sizes. This
is achieved by introducing a new struct hstate structure, which
encapsulates the important hugetlb state and constants (e.g. huge page
size, number of huge pages currently allocated, etc).
The hstate structure is then passed around the code which requires these
fields, so that the code does the right thing regardless of the exact
hstate it is operating on.
This patch adds the hstate structure, with a single global instance of it
(default_hstate), and does the basic work of converting hugetlb to use the
hstate.
Future patches will add more hstate structures to allow for different
hugetlbfs mounts to have different page sizes.
[akpm@linux-foundation.org: coding-style fixes]
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 368 |
1 files changed, 201 insertions, 167 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 32dff4290c6..0d8153e25f0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -22,18 +22,12 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | ||
26 | static unsigned long surplus_huge_pages; | ||
27 | static unsigned long nr_overcommit_huge_pages; | ||
28 | unsigned long max_huge_pages; | 25 | unsigned long max_huge_pages; |
29 | unsigned long sysctl_overcommit_huge_pages; | 26 | unsigned long sysctl_overcommit_huge_pages; |
30 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | ||
31 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | ||
32 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | ||
33 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 27 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 28 | unsigned long hugepages_treat_as_movable; |
36 | static int hugetlb_next_nid; | 29 | |
30 | struct hstate default_hstate; | ||
37 | 31 | ||
38 | /* | 32 | /* |
39 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 33 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
@@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t) | |||
203 | * Convert the address within this vma to the page offset within | 197 | * Convert the address within this vma to the page offset within |
204 | * the mapping, in pagecache page units; huge pages here. | 198 | * the mapping, in pagecache page units; huge pages here. |
205 | */ | 199 | */ |
206 | static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, | 200 | static pgoff_t vma_hugecache_offset(struct hstate *h, |
207 | unsigned long address) | 201 | struct vm_area_struct *vma, unsigned long address) |
208 | { | 202 | { |
209 | return ((address - vma->vm_start) >> HPAGE_SHIFT) + | 203 | return ((address - vma->vm_start) >> huge_page_shift(h)) + |
210 | (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | 204 | (vma->vm_pgoff >> huge_page_order(h)); |
211 | } | 205 | } |
212 | 206 | ||
213 | /* | 207 | /* |
@@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | |||
309 | } | 303 | } |
310 | 304 | ||
311 | /* Decrement the reserved pages in the hugepage pool by one */ | 305 | /* Decrement the reserved pages in the hugepage pool by one */ |
312 | static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) | 306 | static void decrement_hugepage_resv_vma(struct hstate *h, |
307 | struct vm_area_struct *vma) | ||
313 | { | 308 | { |
314 | if (vma->vm_flags & VM_NORESERVE) | 309 | if (vma->vm_flags & VM_NORESERVE) |
315 | return; | 310 | return; |
316 | 311 | ||
317 | if (vma->vm_flags & VM_SHARED) { | 312 | if (vma->vm_flags & VM_SHARED) { |
318 | /* Shared mappings always use reserves */ | 313 | /* Shared mappings always use reserves */ |
319 | resv_huge_pages--; | 314 | h->resv_huge_pages--; |
320 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 315 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
321 | /* | 316 | /* |
322 | * Only the process that called mmap() has reserves for | 317 | * Only the process that called mmap() has reserves for |
323 | * private mappings. | 318 | * private mappings. |
324 | */ | 319 | */ |
325 | resv_huge_pages--; | 320 | h->resv_huge_pages--; |
326 | } | 321 | } |
327 | } | 322 | } |
328 | 323 | ||
@@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) | |||
344 | return 1; | 339 | return 1; |
345 | } | 340 | } |
346 | 341 | ||
347 | static void clear_huge_page(struct page *page, unsigned long addr) | 342 | static void clear_huge_page(struct page *page, |
343 | unsigned long addr, unsigned long sz) | ||
348 | { | 344 | { |
349 | int i; | 345 | int i; |
350 | 346 | ||
351 | might_sleep(); | 347 | might_sleep(); |
352 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | 348 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
353 | cond_resched(); | 349 | cond_resched(); |
354 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | 350 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); |
355 | } | 351 | } |
@@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
359 | unsigned long addr, struct vm_area_struct *vma) | 355 | unsigned long addr, struct vm_area_struct *vma) |
360 | { | 356 | { |
361 | int i; | 357 | int i; |
358 | struct hstate *h = hstate_vma(vma); | ||
362 | 359 | ||
363 | might_sleep(); | 360 | might_sleep(); |
364 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | 361 | for (i = 0; i < pages_per_huge_page(h); i++) { |
365 | cond_resched(); | 362 | cond_resched(); |
366 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | 363 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
367 | } | 364 | } |
368 | } | 365 | } |
369 | 366 | ||
370 | static void enqueue_huge_page(struct page *page) | 367 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
371 | { | 368 | { |
372 | int nid = page_to_nid(page); | 369 | int nid = page_to_nid(page); |
373 | list_add(&page->lru, &hugepage_freelists[nid]); | 370 | list_add(&page->lru, &h->hugepage_freelists[nid]); |
374 | free_huge_pages++; | 371 | h->free_huge_pages++; |
375 | free_huge_pages_node[nid]++; | 372 | h->free_huge_pages_node[nid]++; |
376 | } | 373 | } |
377 | 374 | ||
378 | static struct page *dequeue_huge_page(void) | 375 | static struct page *dequeue_huge_page(struct hstate *h) |
379 | { | 376 | { |
380 | int nid; | 377 | int nid; |
381 | struct page *page = NULL; | 378 | struct page *page = NULL; |
382 | 379 | ||
383 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | 380 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { |
384 | if (!list_empty(&hugepage_freelists[nid])) { | 381 | if (!list_empty(&h->hugepage_freelists[nid])) { |
385 | page = list_entry(hugepage_freelists[nid].next, | 382 | page = list_entry(h->hugepage_freelists[nid].next, |
386 | struct page, lru); | 383 | struct page, lru); |
387 | list_del(&page->lru); | 384 | list_del(&page->lru); |
388 | free_huge_pages--; | 385 | h->free_huge_pages--; |
389 | free_huge_pages_node[nid]--; | 386 | h->free_huge_pages_node[nid]--; |
390 | break; | 387 | break; |
391 | } | 388 | } |
392 | } | 389 | } |
393 | return page; | 390 | return page; |
394 | } | 391 | } |
395 | 392 | ||
396 | static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | 393 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
394 | struct vm_area_struct *vma, | ||
397 | unsigned long address, int avoid_reserve) | 395 | unsigned long address, int avoid_reserve) |
398 | { | 396 | { |
399 | int nid; | 397 | int nid; |
@@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
411 | * not "stolen". The child may still get SIGKILLed | 409 | * not "stolen". The child may still get SIGKILLed |
412 | */ | 410 | */ |
413 | if (!vma_has_private_reserves(vma) && | 411 | if (!vma_has_private_reserves(vma) && |
414 | free_huge_pages - resv_huge_pages == 0) | 412 | h->free_huge_pages - h->resv_huge_pages == 0) |
415 | return NULL; | 413 | return NULL; |
416 | 414 | ||
417 | /* If reserves cannot be used, ensure enough pages are in the pool */ | 415 | /* If reserves cannot be used, ensure enough pages are in the pool */ |
418 | if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) | 416 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) |
419 | return NULL; | 417 | return NULL; |
420 | 418 | ||
421 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 419 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
422 | MAX_NR_ZONES - 1, nodemask) { | 420 | MAX_NR_ZONES - 1, nodemask) { |
423 | nid = zone_to_nid(zone); | 421 | nid = zone_to_nid(zone); |
424 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 422 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && |
425 | !list_empty(&hugepage_freelists[nid])) { | 423 | !list_empty(&h->hugepage_freelists[nid])) { |
426 | page = list_entry(hugepage_freelists[nid].next, | 424 | page = list_entry(h->hugepage_freelists[nid].next, |
427 | struct page, lru); | 425 | struct page, lru); |
428 | list_del(&page->lru); | 426 | list_del(&page->lru); |
429 | free_huge_pages--; | 427 | h->free_huge_pages--; |
430 | free_huge_pages_node[nid]--; | 428 | h->free_huge_pages_node[nid]--; |
431 | 429 | ||
432 | if (!avoid_reserve) | 430 | if (!avoid_reserve) |
433 | decrement_hugepage_resv_vma(vma); | 431 | decrement_hugepage_resv_vma(h, vma); |
434 | 432 | ||
435 | break; | 433 | break; |
436 | } | 434 | } |
@@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
439 | return page; | 437 | return page; |
440 | } | 438 | } |
441 | 439 | ||
442 | static void update_and_free_page(struct page *page) | 440 | static void update_and_free_page(struct hstate *h, struct page *page) |
443 | { | 441 | { |
444 | int i; | 442 | int i; |
445 | nr_huge_pages--; | 443 | |
446 | nr_huge_pages_node[page_to_nid(page)]--; | 444 | h->nr_huge_pages--; |
447 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 445 | h->nr_huge_pages_node[page_to_nid(page)]--; |
446 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
448 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 447 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
449 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 448 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
450 | 1 << PG_private | 1<< PG_writeback); | 449 | 1 << PG_private | 1<< PG_writeback); |
@@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page) | |||
452 | set_compound_page_dtor(page, NULL); | 451 | set_compound_page_dtor(page, NULL); |
453 | set_page_refcounted(page); | 452 | set_page_refcounted(page); |
454 | arch_release_hugepage(page); | 453 | arch_release_hugepage(page); |
455 | __free_pages(page, HUGETLB_PAGE_ORDER); | 454 | __free_pages(page, huge_page_order(h)); |
456 | } | 455 | } |
457 | 456 | ||
458 | static void free_huge_page(struct page *page) | 457 | static void free_huge_page(struct page *page) |
459 | { | 458 | { |
459 | /* | ||
460 | * Can't pass hstate in here because it is called from the | ||
461 | * compound page destructor. | ||
462 | */ | ||
463 | struct hstate *h = &default_hstate; | ||
460 | int nid = page_to_nid(page); | 464 | int nid = page_to_nid(page); |
461 | struct address_space *mapping; | 465 | struct address_space *mapping; |
462 | 466 | ||
@@ -466,12 +470,12 @@ static void free_huge_page(struct page *page) | |||
466 | INIT_LIST_HEAD(&page->lru); | 470 | INIT_LIST_HEAD(&page->lru); |
467 | 471 | ||
468 | spin_lock(&hugetlb_lock); | 472 | spin_lock(&hugetlb_lock); |
469 | if (surplus_huge_pages_node[nid]) { | 473 | if (h->surplus_huge_pages_node[nid]) { |
470 | update_and_free_page(page); | 474 | update_and_free_page(h, page); |
471 | surplus_huge_pages--; | 475 | h->surplus_huge_pages--; |
472 | surplus_huge_pages_node[nid]--; | 476 | h->surplus_huge_pages_node[nid]--; |
473 | } else { | 477 | } else { |
474 | enqueue_huge_page(page); | 478 | enqueue_huge_page(h, page); |
475 | } | 479 | } |
476 | spin_unlock(&hugetlb_lock); | 480 | spin_unlock(&hugetlb_lock); |
477 | if (mapping) | 481 | if (mapping) |
@@ -483,7 +487,7 @@ static void free_huge_page(struct page *page) | |||
483 | * balanced by operating on them in a round-robin fashion. | 487 | * balanced by operating on them in a round-robin fashion. |
484 | * Returns 1 if an adjustment was made. | 488 | * Returns 1 if an adjustment was made. |
485 | */ | 489 | */ |
486 | static int adjust_pool_surplus(int delta) | 490 | static int adjust_pool_surplus(struct hstate *h, int delta) |
487 | { | 491 | { |
488 | static int prev_nid; | 492 | static int prev_nid; |
489 | int nid = prev_nid; | 493 | int nid = prev_nid; |
@@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta) | |||
496 | nid = first_node(node_online_map); | 500 | nid = first_node(node_online_map); |
497 | 501 | ||
498 | /* To shrink on this node, there must be a surplus page */ | 502 | /* To shrink on this node, there must be a surplus page */ |
499 | if (delta < 0 && !surplus_huge_pages_node[nid]) | 503 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) |
500 | continue; | 504 | continue; |
501 | /* Surplus cannot exceed the total number of pages */ | 505 | /* Surplus cannot exceed the total number of pages */ |
502 | if (delta > 0 && surplus_huge_pages_node[nid] >= | 506 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= |
503 | nr_huge_pages_node[nid]) | 507 | h->nr_huge_pages_node[nid]) |
504 | continue; | 508 | continue; |
505 | 509 | ||
506 | surplus_huge_pages += delta; | 510 | h->surplus_huge_pages += delta; |
507 | surplus_huge_pages_node[nid] += delta; | 511 | h->surplus_huge_pages_node[nid] += delta; |
508 | ret = 1; | 512 | ret = 1; |
509 | break; | 513 | break; |
510 | } while (nid != prev_nid); | 514 | } while (nid != prev_nid); |
@@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta) | |||
513 | return ret; | 517 | return ret; |
514 | } | 518 | } |
515 | 519 | ||
516 | static void prep_new_huge_page(struct page *page, int nid) | 520 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
517 | { | 521 | { |
518 | set_compound_page_dtor(page, free_huge_page); | 522 | set_compound_page_dtor(page, free_huge_page); |
519 | spin_lock(&hugetlb_lock); | 523 | spin_lock(&hugetlb_lock); |
520 | nr_huge_pages++; | 524 | h->nr_huge_pages++; |
521 | nr_huge_pages_node[nid]++; | 525 | h->nr_huge_pages_node[nid]++; |
522 | spin_unlock(&hugetlb_lock); | 526 | spin_unlock(&hugetlb_lock); |
523 | put_page(page); /* free it into the hugepage allocator */ | 527 | put_page(page); /* free it into the hugepage allocator */ |
524 | } | 528 | } |
525 | 529 | ||
526 | static struct page *alloc_fresh_huge_page_node(int nid) | 530 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
527 | { | 531 | { |
528 | struct page *page; | 532 | struct page *page; |
529 | 533 | ||
530 | page = alloc_pages_node(nid, | 534 | page = alloc_pages_node(nid, |
531 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 535 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
532 | __GFP_REPEAT|__GFP_NOWARN, | 536 | __GFP_REPEAT|__GFP_NOWARN, |
533 | HUGETLB_PAGE_ORDER); | 537 | huge_page_order(h)); |
534 | if (page) { | 538 | if (page) { |
535 | if (arch_prepare_hugepage(page)) { | 539 | if (arch_prepare_hugepage(page)) { |
536 | __free_pages(page, HUGETLB_PAGE_ORDER); | 540 | __free_pages(page, HUGETLB_PAGE_ORDER); |
537 | return NULL; | 541 | return NULL; |
538 | } | 542 | } |
539 | prep_new_huge_page(page, nid); | 543 | prep_new_huge_page(h, page, nid); |
540 | } | 544 | } |
541 | 545 | ||
542 | return page; | 546 | return page; |
543 | } | 547 | } |
544 | 548 | ||
545 | static int alloc_fresh_huge_page(void) | 549 | static int alloc_fresh_huge_page(struct hstate *h) |
546 | { | 550 | { |
547 | struct page *page; | 551 | struct page *page; |
548 | int start_nid; | 552 | int start_nid; |
549 | int next_nid; | 553 | int next_nid; |
550 | int ret = 0; | 554 | int ret = 0; |
551 | 555 | ||
552 | start_nid = hugetlb_next_nid; | 556 | start_nid = h->hugetlb_next_nid; |
553 | 557 | ||
554 | do { | 558 | do { |
555 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | 559 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); |
556 | if (page) | 560 | if (page) |
557 | ret = 1; | 561 | ret = 1; |
558 | /* | 562 | /* |
@@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void) | |||
566 | * if we just successfully allocated a hugepage so that | 570 | * if we just successfully allocated a hugepage so that |
567 | * the next caller gets hugepages on the next node. | 571 | * the next caller gets hugepages on the next node. |
568 | */ | 572 | */ |
569 | next_nid = next_node(hugetlb_next_nid, node_online_map); | 573 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); |
570 | if (next_nid == MAX_NUMNODES) | 574 | if (next_nid == MAX_NUMNODES) |
571 | next_nid = first_node(node_online_map); | 575 | next_nid = first_node(node_online_map); |
572 | hugetlb_next_nid = next_nid; | 576 | h->hugetlb_next_nid = next_nid; |
573 | } while (!page && hugetlb_next_nid != start_nid); | 577 | } while (!page && h->hugetlb_next_nid != start_nid); |
574 | 578 | ||
575 | if (ret) | 579 | if (ret) |
576 | count_vm_event(HTLB_BUDDY_PGALLOC); | 580 | count_vm_event(HTLB_BUDDY_PGALLOC); |
@@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void) | |||
580 | return ret; | 584 | return ret; |
581 | } | 585 | } |
582 | 586 | ||
583 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | 587 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
584 | unsigned long address) | 588 | struct vm_area_struct *vma, unsigned long address) |
585 | { | 589 | { |
586 | struct page *page; | 590 | struct page *page; |
587 | unsigned int nid; | 591 | unsigned int nid; |
@@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
610 | * per-node value is checked there. | 614 | * per-node value is checked there. |
611 | */ | 615 | */ |
612 | spin_lock(&hugetlb_lock); | 616 | spin_lock(&hugetlb_lock); |
613 | if (surplus_huge_pages >= nr_overcommit_huge_pages) { | 617 | if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { |
614 | spin_unlock(&hugetlb_lock); | 618 | spin_unlock(&hugetlb_lock); |
615 | return NULL; | 619 | return NULL; |
616 | } else { | 620 | } else { |
617 | nr_huge_pages++; | 621 | h->nr_huge_pages++; |
618 | surplus_huge_pages++; | 622 | h->surplus_huge_pages++; |
619 | } | 623 | } |
620 | spin_unlock(&hugetlb_lock); | 624 | spin_unlock(&hugetlb_lock); |
621 | 625 | ||
622 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 626 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
623 | __GFP_REPEAT|__GFP_NOWARN, | 627 | __GFP_REPEAT|__GFP_NOWARN, |
624 | HUGETLB_PAGE_ORDER); | 628 | huge_page_order(h)); |
625 | 629 | ||
626 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
627 | if (page) { | 631 | if (page) { |
@@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
636 | /* | 640 | /* |
637 | * We incremented the global counters already | 641 | * We incremented the global counters already |
638 | */ | 642 | */ |
639 | nr_huge_pages_node[nid]++; | 643 | h->nr_huge_pages_node[nid]++; |
640 | surplus_huge_pages_node[nid]++; | 644 | h->surplus_huge_pages_node[nid]++; |
641 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 645 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
642 | } else { | 646 | } else { |
643 | nr_huge_pages--; | 647 | h->nr_huge_pages--; |
644 | surplus_huge_pages--; | 648 | h->surplus_huge_pages--; |
645 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | 649 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); |
646 | } | 650 | } |
647 | spin_unlock(&hugetlb_lock); | 651 | spin_unlock(&hugetlb_lock); |
@@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
653 | * Increase the hugetlb pool such that it can accomodate a reservation | 657 | * Increase the hugetlb pool such that it can accomodate a reservation |
654 | * of size 'delta'. | 658 | * of size 'delta'. |
655 | */ | 659 | */ |
656 | static int gather_surplus_pages(int delta) | 660 | static int gather_surplus_pages(struct hstate *h, int delta) |
657 | { | 661 | { |
658 | struct list_head surplus_list; | 662 | struct list_head surplus_list; |
659 | struct page *page, *tmp; | 663 | struct page *page, *tmp; |
660 | int ret, i; | 664 | int ret, i; |
661 | int needed, allocated; | 665 | int needed, allocated; |
662 | 666 | ||
663 | needed = (resv_huge_pages + delta) - free_huge_pages; | 667 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
664 | if (needed <= 0) { | 668 | if (needed <= 0) { |
665 | resv_huge_pages += delta; | 669 | h->resv_huge_pages += delta; |
666 | return 0; | 670 | return 0; |
667 | } | 671 | } |
668 | 672 | ||
@@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta) | |||
673 | retry: | 677 | retry: |
674 | spin_unlock(&hugetlb_lock); | 678 | spin_unlock(&hugetlb_lock); |
675 | for (i = 0; i < needed; i++) { | 679 | for (i = 0; i < needed; i++) { |
676 | page = alloc_buddy_huge_page(NULL, 0); | 680 | page = alloc_buddy_huge_page(h, NULL, 0); |
677 | if (!page) { | 681 | if (!page) { |
678 | /* | 682 | /* |
679 | * We were not able to allocate enough pages to | 683 | * We were not able to allocate enough pages to |
@@ -694,7 +698,8 @@ retry: | |||
694 | * because either resv_huge_pages or free_huge_pages may have changed. | 698 | * because either resv_huge_pages or free_huge_pages may have changed. |
695 | */ | 699 | */ |
696 | spin_lock(&hugetlb_lock); | 700 | spin_lock(&hugetlb_lock); |
697 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | 701 | needed = (h->resv_huge_pages + delta) - |
702 | (h->free_huge_pages + allocated); | ||
698 | if (needed > 0) | 703 | if (needed > 0) |
699 | goto retry; | 704 | goto retry; |
700 | 705 | ||
@@ -707,7 +712,7 @@ retry: | |||
707 | * before they are reserved. | 712 | * before they are reserved. |
708 | */ | 713 | */ |
709 | needed += allocated; | 714 | needed += allocated; |
710 | resv_huge_pages += delta; | 715 | h->resv_huge_pages += delta; |
711 | ret = 0; | 716 | ret = 0; |
712 | free: | 717 | free: |
713 | /* Free the needed pages to the hugetlb pool */ | 718 | /* Free the needed pages to the hugetlb pool */ |
@@ -715,7 +720,7 @@ free: | |||
715 | if ((--needed) < 0) | 720 | if ((--needed) < 0) |
716 | break; | 721 | break; |
717 | list_del(&page->lru); | 722 | list_del(&page->lru); |
718 | enqueue_huge_page(page); | 723 | enqueue_huge_page(h, page); |
719 | } | 724 | } |
720 | 725 | ||
721 | /* Free unnecessary surplus pages to the buddy allocator */ | 726 | /* Free unnecessary surplus pages to the buddy allocator */ |
@@ -743,7 +748,8 @@ free: | |||
743 | * allocated to satisfy the reservation must be explicitly freed if they were | 748 | * allocated to satisfy the reservation must be explicitly freed if they were |
744 | * never used. | 749 | * never used. |
745 | */ | 750 | */ |
746 | static void return_unused_surplus_pages(unsigned long unused_resv_pages) | 751 | static void return_unused_surplus_pages(struct hstate *h, |
752 | unsigned long unused_resv_pages) | ||
747 | { | 753 | { |
748 | static int nid = -1; | 754 | static int nid = -1; |
749 | struct page *page; | 755 | struct page *page; |
@@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) | |||
758 | unsigned long remaining_iterations = num_online_nodes(); | 764 | unsigned long remaining_iterations = num_online_nodes(); |
759 | 765 | ||
760 | /* Uncommit the reservation */ | 766 | /* Uncommit the reservation */ |
761 | resv_huge_pages -= unused_resv_pages; | 767 | h->resv_huge_pages -= unused_resv_pages; |
762 | 768 | ||
763 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | 769 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
764 | 770 | ||
765 | while (remaining_iterations-- && nr_pages) { | 771 | while (remaining_iterations-- && nr_pages) { |
766 | nid = next_node(nid, node_online_map); | 772 | nid = next_node(nid, node_online_map); |
767 | if (nid == MAX_NUMNODES) | 773 | if (nid == MAX_NUMNODES) |
768 | nid = first_node(node_online_map); | 774 | nid = first_node(node_online_map); |
769 | 775 | ||
770 | if (!surplus_huge_pages_node[nid]) | 776 | if (!h->surplus_huge_pages_node[nid]) |
771 | continue; | 777 | continue; |
772 | 778 | ||
773 | if (!list_empty(&hugepage_freelists[nid])) { | 779 | if (!list_empty(&h->hugepage_freelists[nid])) { |
774 | page = list_entry(hugepage_freelists[nid].next, | 780 | page = list_entry(h->hugepage_freelists[nid].next, |
775 | struct page, lru); | 781 | struct page, lru); |
776 | list_del(&page->lru); | 782 | list_del(&page->lru); |
777 | update_and_free_page(page); | 783 | update_and_free_page(h, page); |
778 | free_huge_pages--; | 784 | h->free_huge_pages--; |
779 | free_huge_pages_node[nid]--; | 785 | h->free_huge_pages_node[nid]--; |
780 | surplus_huge_pages--; | 786 | h->surplus_huge_pages--; |
781 | surplus_huge_pages_node[nid]--; | 787 | h->surplus_huge_pages_node[nid]--; |
782 | nr_pages--; | 788 | nr_pages--; |
783 | remaining_iterations = num_online_nodes(); | 789 | remaining_iterations = num_online_nodes(); |
784 | } | 790 | } |
@@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) | |||
794 | * an instantiated the change should be committed via vma_commit_reservation. | 800 | * an instantiated the change should be committed via vma_commit_reservation. |
795 | * No action is required on failure. | 801 | * No action is required on failure. |
796 | */ | 802 | */ |
797 | static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) | 803 | static int vma_needs_reservation(struct hstate *h, |
804 | struct vm_area_struct *vma, unsigned long addr) | ||
798 | { | 805 | { |
799 | struct address_space *mapping = vma->vm_file->f_mapping; | 806 | struct address_space *mapping = vma->vm_file->f_mapping; |
800 | struct inode *inode = mapping->host; | 807 | struct inode *inode = mapping->host; |
801 | 808 | ||
802 | if (vma->vm_flags & VM_SHARED) { | 809 | if (vma->vm_flags & VM_SHARED) { |
803 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 810 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
804 | return region_chg(&inode->i_mapping->private_list, | 811 | return region_chg(&inode->i_mapping->private_list, |
805 | idx, idx + 1); | 812 | idx, idx + 1); |
806 | 813 | ||
@@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) | |||
809 | 816 | ||
810 | } else { | 817 | } else { |
811 | int err; | 818 | int err; |
812 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 819 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
813 | struct resv_map *reservations = vma_resv_map(vma); | 820 | struct resv_map *reservations = vma_resv_map(vma); |
814 | 821 | ||
815 | err = region_chg(&reservations->regions, idx, idx + 1); | 822 | err = region_chg(&reservations->regions, idx, idx + 1); |
@@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) | |||
818 | return 0; | 825 | return 0; |
819 | } | 826 | } |
820 | } | 827 | } |
821 | static void vma_commit_reservation(struct vm_area_struct *vma, | 828 | static void vma_commit_reservation(struct hstate *h, |
822 | unsigned long addr) | 829 | struct vm_area_struct *vma, unsigned long addr) |
823 | { | 830 | { |
824 | struct address_space *mapping = vma->vm_file->f_mapping; | 831 | struct address_space *mapping = vma->vm_file->f_mapping; |
825 | struct inode *inode = mapping->host; | 832 | struct inode *inode = mapping->host; |
826 | 833 | ||
827 | if (vma->vm_flags & VM_SHARED) { | 834 | if (vma->vm_flags & VM_SHARED) { |
828 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 835 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
829 | region_add(&inode->i_mapping->private_list, idx, idx + 1); | 836 | region_add(&inode->i_mapping->private_list, idx, idx + 1); |
830 | 837 | ||
831 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 838 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
832 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 839 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
833 | struct resv_map *reservations = vma_resv_map(vma); | 840 | struct resv_map *reservations = vma_resv_map(vma); |
834 | 841 | ||
835 | /* Mark this page used in the map. */ | 842 | /* Mark this page used in the map. */ |
@@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma, | |||
840 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 847 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
841 | unsigned long addr, int avoid_reserve) | 848 | unsigned long addr, int avoid_reserve) |
842 | { | 849 | { |
850 | struct hstate *h = hstate_vma(vma); | ||
843 | struct page *page; | 851 | struct page *page; |
844 | struct address_space *mapping = vma->vm_file->f_mapping; | 852 | struct address_space *mapping = vma->vm_file->f_mapping; |
845 | struct inode *inode = mapping->host; | 853 | struct inode *inode = mapping->host; |
@@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
852 | * MAP_NORESERVE mappings may also need pages and quota allocated | 860 | * MAP_NORESERVE mappings may also need pages and quota allocated |
853 | * if no reserve mapping overlaps. | 861 | * if no reserve mapping overlaps. |
854 | */ | 862 | */ |
855 | chg = vma_needs_reservation(vma, addr); | 863 | chg = vma_needs_reservation(h, vma, addr); |
856 | if (chg < 0) | 864 | if (chg < 0) |
857 | return ERR_PTR(chg); | 865 | return ERR_PTR(chg); |
858 | if (chg) | 866 | if (chg) |
@@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
860 | return ERR_PTR(-ENOSPC); | 868 | return ERR_PTR(-ENOSPC); |
861 | 869 | ||
862 | spin_lock(&hugetlb_lock); | 870 | spin_lock(&hugetlb_lock); |
863 | page = dequeue_huge_page_vma(vma, addr, avoid_reserve); | 871 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
864 | spin_unlock(&hugetlb_lock); | 872 | spin_unlock(&hugetlb_lock); |
865 | 873 | ||
866 | if (!page) { | 874 | if (!page) { |
867 | page = alloc_buddy_huge_page(vma, addr); | 875 | page = alloc_buddy_huge_page(h, vma, addr); |
868 | if (!page) { | 876 | if (!page) { |
869 | hugetlb_put_quota(inode->i_mapping, chg); | 877 | hugetlb_put_quota(inode->i_mapping, chg); |
870 | return ERR_PTR(-VM_FAULT_OOM); | 878 | return ERR_PTR(-VM_FAULT_OOM); |
@@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
874 | set_page_refcounted(page); | 882 | set_page_refcounted(page); |
875 | set_page_private(page, (unsigned long) mapping); | 883 | set_page_private(page, (unsigned long) mapping); |
876 | 884 | ||
877 | vma_commit_reservation(vma, addr); | 885 | vma_commit_reservation(h, vma, addr); |
878 | 886 | ||
879 | return page; | 887 | return page; |
880 | } | 888 | } |
@@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
882 | static int __init hugetlb_init(void) | 890 | static int __init hugetlb_init(void) |
883 | { | 891 | { |
884 | unsigned long i; | 892 | unsigned long i; |
893 | struct hstate *h = &default_hstate; | ||
885 | 894 | ||
886 | if (HPAGE_SHIFT == 0) | 895 | if (HPAGE_SHIFT == 0) |
887 | return 0; | 896 | return 0; |
888 | 897 | ||
898 | if (!h->order) { | ||
899 | h->order = HPAGE_SHIFT - PAGE_SHIFT; | ||
900 | h->mask = HPAGE_MASK; | ||
901 | } | ||
902 | |||
889 | for (i = 0; i < MAX_NUMNODES; ++i) | 903 | for (i = 0; i < MAX_NUMNODES; ++i) |
890 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 904 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
891 | 905 | ||
892 | hugetlb_next_nid = first_node(node_online_map); | 906 | h->hugetlb_next_nid = first_node(node_online_map); |
893 | 907 | ||
894 | for (i = 0; i < max_huge_pages; ++i) { | 908 | for (i = 0; i < max_huge_pages; ++i) { |
895 | if (!alloc_fresh_huge_page()) | 909 | if (!alloc_fresh_huge_page(h)) |
896 | break; | 910 | break; |
897 | } | 911 | } |
898 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 912 | max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; |
899 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | 913 | printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", |
914 | h->free_huge_pages); | ||
900 | return 0; | 915 | return 0; |
901 | } | 916 | } |
902 | module_init(hugetlb_init); | 917 | module_init(hugetlb_init); |
@@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
922 | 937 | ||
923 | #ifdef CONFIG_SYSCTL | 938 | #ifdef CONFIG_SYSCTL |
924 | #ifdef CONFIG_HIGHMEM | 939 | #ifdef CONFIG_HIGHMEM |
925 | static void try_to_free_low(unsigned long count) | 940 | static void try_to_free_low(struct hstate *h, unsigned long count) |
926 | { | 941 | { |
927 | int i; | 942 | int i; |
928 | 943 | ||
929 | for (i = 0; i < MAX_NUMNODES; ++i) { | 944 | for (i = 0; i < MAX_NUMNODES; ++i) { |
930 | struct page *page, *next; | 945 | struct page *page, *next; |
931 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 946 | struct list_head *freel = &h->hugepage_freelists[i]; |
932 | if (count >= nr_huge_pages) | 947 | list_for_each_entry_safe(page, next, freel, lru) { |
948 | if (count >= h->nr_huge_pages) | ||
933 | return; | 949 | return; |
934 | if (PageHighMem(page)) | 950 | if (PageHighMem(page)) |
935 | continue; | 951 | continue; |
936 | list_del(&page->lru); | 952 | list_del(&page->lru); |
937 | update_and_free_page(page); | 953 | update_and_free_page(page); |
938 | free_huge_pages--; | 954 | h->free_huge_pages--; |
939 | free_huge_pages_node[page_to_nid(page)]--; | 955 | h->free_huge_pages_node[page_to_nid(page)]--; |
940 | } | 956 | } |
941 | } | 957 | } |
942 | } | 958 | } |
943 | #else | 959 | #else |
944 | static inline void try_to_free_low(unsigned long count) | 960 | static inline void try_to_free_low(struct hstate *h, unsigned long count) |
945 | { | 961 | { |
946 | } | 962 | } |
947 | #endif | 963 | #endif |
948 | 964 | ||
949 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | 965 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
950 | static unsigned long set_max_huge_pages(unsigned long count) | 966 | static unsigned long set_max_huge_pages(unsigned long count) |
951 | { | 967 | { |
952 | unsigned long min_count, ret; | 968 | unsigned long min_count, ret; |
969 | struct hstate *h = &default_hstate; | ||
953 | 970 | ||
954 | /* | 971 | /* |
955 | * Increase the pool size | 972 | * Increase the pool size |
@@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
963 | * within all the constraints specified by the sysctls. | 980 | * within all the constraints specified by the sysctls. |
964 | */ | 981 | */ |
965 | spin_lock(&hugetlb_lock); | 982 | spin_lock(&hugetlb_lock); |
966 | while (surplus_huge_pages && count > persistent_huge_pages) { | 983 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
967 | if (!adjust_pool_surplus(-1)) | 984 | if (!adjust_pool_surplus(h, -1)) |
968 | break; | 985 | break; |
969 | } | 986 | } |
970 | 987 | ||
971 | while (count > persistent_huge_pages) { | 988 | while (count > persistent_huge_pages(h)) { |
972 | /* | 989 | /* |
973 | * If this allocation races such that we no longer need the | 990 | * If this allocation races such that we no longer need the |
974 | * page, free_huge_page will handle it by freeing the page | 991 | * page, free_huge_page will handle it by freeing the page |
975 | * and reducing the surplus. | 992 | * and reducing the surplus. |
976 | */ | 993 | */ |
977 | spin_unlock(&hugetlb_lock); | 994 | spin_unlock(&hugetlb_lock); |
978 | ret = alloc_fresh_huge_page(); | 995 | ret = alloc_fresh_huge_page(h); |
979 | spin_lock(&hugetlb_lock); | 996 | spin_lock(&hugetlb_lock); |
980 | if (!ret) | 997 | if (!ret) |
981 | goto out; | 998 | goto out; |
@@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
997 | * and won't grow the pool anywhere else. Not until one of the | 1014 | * and won't grow the pool anywhere else. Not until one of the |
998 | * sysctls are changed, or the surplus pages go out of use. | 1015 | * sysctls are changed, or the surplus pages go out of use. |
999 | */ | 1016 | */ |
1000 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | 1017 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1001 | min_count = max(count, min_count); | 1018 | min_count = max(count, min_count); |
1002 | try_to_free_low(min_count); | 1019 | try_to_free_low(h, min_count); |
1003 | while (min_count < persistent_huge_pages) { | 1020 | while (min_count < persistent_huge_pages(h)) { |
1004 | struct page *page = dequeue_huge_page(); | 1021 | struct page *page = dequeue_huge_page(h); |
1005 | if (!page) | 1022 | if (!page) |
1006 | break; | 1023 | break; |
1007 | update_and_free_page(page); | 1024 | update_and_free_page(h, page); |
1008 | } | 1025 | } |
1009 | while (count < persistent_huge_pages) { | 1026 | while (count < persistent_huge_pages(h)) { |
1010 | if (!adjust_pool_surplus(1)) | 1027 | if (!adjust_pool_surplus(h, 1)) |
1011 | break; | 1028 | break; |
1012 | } | 1029 | } |
1013 | out: | 1030 | out: |
1014 | ret = persistent_huge_pages; | 1031 | ret = persistent_huge_pages(h); |
1015 | spin_unlock(&hugetlb_lock); | 1032 | spin_unlock(&hugetlb_lock); |
1016 | return ret; | 1033 | return ret; |
1017 | } | 1034 | } |
@@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1041 | struct file *file, void __user *buffer, | 1058 | struct file *file, void __user *buffer, |
1042 | size_t *length, loff_t *ppos) | 1059 | size_t *length, loff_t *ppos) |
1043 | { | 1060 | { |
1061 | struct hstate *h = &default_hstate; | ||
1044 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1062 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); |
1045 | spin_lock(&hugetlb_lock); | 1063 | spin_lock(&hugetlb_lock); |
1046 | nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; | 1064 | h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; |
1047 | spin_unlock(&hugetlb_lock); | 1065 | spin_unlock(&hugetlb_lock); |
1048 | return 0; | 1066 | return 0; |
1049 | } | 1067 | } |
@@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1052 | 1070 | ||
1053 | int hugetlb_report_meminfo(char *buf) | 1071 | int hugetlb_report_meminfo(char *buf) |
1054 | { | 1072 | { |
1073 | struct hstate *h = &default_hstate; | ||
1055 | return sprintf(buf, | 1074 | return sprintf(buf, |
1056 | "HugePages_Total: %5lu\n" | 1075 | "HugePages_Total: %5lu\n" |
1057 | "HugePages_Free: %5lu\n" | 1076 | "HugePages_Free: %5lu\n" |
1058 | "HugePages_Rsvd: %5lu\n" | 1077 | "HugePages_Rsvd: %5lu\n" |
1059 | "HugePages_Surp: %5lu\n" | 1078 | "HugePages_Surp: %5lu\n" |
1060 | "Hugepagesize: %5lu kB\n", | 1079 | "Hugepagesize: %5lu kB\n", |
1061 | nr_huge_pages, | 1080 | h->nr_huge_pages, |
1062 | free_huge_pages, | 1081 | h->free_huge_pages, |
1063 | resv_huge_pages, | 1082 | h->resv_huge_pages, |
1064 | surplus_huge_pages, | 1083 | h->surplus_huge_pages, |
1065 | HPAGE_SIZE/1024); | 1084 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); |
1066 | } | 1085 | } |
1067 | 1086 | ||
1068 | int hugetlb_report_node_meminfo(int nid, char *buf) | 1087 | int hugetlb_report_node_meminfo(int nid, char *buf) |
1069 | { | 1088 | { |
1089 | struct hstate *h = &default_hstate; | ||
1070 | return sprintf(buf, | 1090 | return sprintf(buf, |
1071 | "Node %d HugePages_Total: %5u\n" | 1091 | "Node %d HugePages_Total: %5u\n" |
1072 | "Node %d HugePages_Free: %5u\n" | 1092 | "Node %d HugePages_Free: %5u\n" |
1073 | "Node %d HugePages_Surp: %5u\n", | 1093 | "Node %d HugePages_Surp: %5u\n", |
1074 | nid, nr_huge_pages_node[nid], | 1094 | nid, h->nr_huge_pages_node[nid], |
1075 | nid, free_huge_pages_node[nid], | 1095 | nid, h->free_huge_pages_node[nid], |
1076 | nid, surplus_huge_pages_node[nid]); | 1096 | nid, h->surplus_huge_pages_node[nid]); |
1077 | } | 1097 | } |
1078 | 1098 | ||
1079 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ | 1099 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ |
1080 | unsigned long hugetlb_total_pages(void) | 1100 | unsigned long hugetlb_total_pages(void) |
1081 | { | 1101 | { |
1082 | return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); | 1102 | struct hstate *h = &default_hstate; |
1103 | return h->nr_huge_pages * pages_per_huge_page(h); | ||
1083 | } | 1104 | } |
1084 | 1105 | ||
1085 | static int hugetlb_acct_memory(long delta) | 1106 | static int hugetlb_acct_memory(struct hstate *h, long delta) |
1086 | { | 1107 | { |
1087 | int ret = -ENOMEM; | 1108 | int ret = -ENOMEM; |
1088 | 1109 | ||
@@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta) | |||
1105 | * semantics that cpuset has. | 1126 | * semantics that cpuset has. |
1106 | */ | 1127 | */ |
1107 | if (delta > 0) { | 1128 | if (delta > 0) { |
1108 | if (gather_surplus_pages(delta) < 0) | 1129 | if (gather_surplus_pages(h, delta) < 0) |
1109 | goto out; | 1130 | goto out; |
1110 | 1131 | ||
1111 | if (delta > cpuset_mems_nr(free_huge_pages_node)) { | 1132 | if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { |
1112 | return_unused_surplus_pages(delta); | 1133 | return_unused_surplus_pages(h, delta); |
1113 | goto out; | 1134 | goto out; |
1114 | } | 1135 | } |
1115 | } | 1136 | } |
1116 | 1137 | ||
1117 | ret = 0; | 1138 | ret = 0; |
1118 | if (delta < 0) | 1139 | if (delta < 0) |
1119 | return_unused_surplus_pages((unsigned long) -delta); | 1140 | return_unused_surplus_pages(h, (unsigned long) -delta); |
1120 | 1141 | ||
1121 | out: | 1142 | out: |
1122 | spin_unlock(&hugetlb_lock); | 1143 | spin_unlock(&hugetlb_lock); |
@@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
1141 | 1162 | ||
1142 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 1163 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
1143 | { | 1164 | { |
1165 | struct hstate *h = hstate_vma(vma); | ||
1144 | struct resv_map *reservations = vma_resv_map(vma); | 1166 | struct resv_map *reservations = vma_resv_map(vma); |
1145 | unsigned long reserve; | 1167 | unsigned long reserve; |
1146 | unsigned long start; | 1168 | unsigned long start; |
1147 | unsigned long end; | 1169 | unsigned long end; |
1148 | 1170 | ||
1149 | if (reservations) { | 1171 | if (reservations) { |
1150 | start = vma_hugecache_offset(vma, vma->vm_start); | 1172 | start = vma_hugecache_offset(h, vma, vma->vm_start); |
1151 | end = vma_hugecache_offset(vma, vma->vm_end); | 1173 | end = vma_hugecache_offset(h, vma, vma->vm_end); |
1152 | 1174 | ||
1153 | reserve = (end - start) - | 1175 | reserve = (end - start) - |
1154 | region_count(&reservations->regions, start, end); | 1176 | region_count(&reservations->regions, start, end); |
@@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
1156 | kref_put(&reservations->refs, resv_map_release); | 1178 | kref_put(&reservations->refs, resv_map_release); |
1157 | 1179 | ||
1158 | if (reserve) | 1180 | if (reserve) |
1159 | hugetlb_acct_memory(-reserve); | 1181 | hugetlb_acct_memory(h, -reserve); |
1160 | } | 1182 | } |
1161 | } | 1183 | } |
1162 | 1184 | ||
@@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
1214 | struct page *ptepage; | 1236 | struct page *ptepage; |
1215 | unsigned long addr; | 1237 | unsigned long addr; |
1216 | int cow; | 1238 | int cow; |
1239 | struct hstate *h = hstate_vma(vma); | ||
1240 | unsigned long sz = huge_page_size(h); | ||
1217 | 1241 | ||
1218 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 1242 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
1219 | 1243 | ||
1220 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 1244 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
1221 | src_pte = huge_pte_offset(src, addr); | 1245 | src_pte = huge_pte_offset(src, addr); |
1222 | if (!src_pte) | 1246 | if (!src_pte) |
1223 | continue; | 1247 | continue; |
1224 | dst_pte = huge_pte_alloc(dst, addr); | 1248 | dst_pte = huge_pte_alloc(dst, addr, sz); |
1225 | if (!dst_pte) | 1249 | if (!dst_pte) |
1226 | goto nomem; | 1250 | goto nomem; |
1227 | 1251 | ||
@@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1257 | pte_t pte; | 1281 | pte_t pte; |
1258 | struct page *page; | 1282 | struct page *page; |
1259 | struct page *tmp; | 1283 | struct page *tmp; |
1284 | struct hstate *h = hstate_vma(vma); | ||
1285 | unsigned long sz = huge_page_size(h); | ||
1286 | |||
1260 | /* | 1287 | /* |
1261 | * A page gathering list, protected by per file i_mmap_lock. The | 1288 | * A page gathering list, protected by per file i_mmap_lock. The |
1262 | * lock is used to avoid list corruption from multiple unmapping | 1289 | * lock is used to avoid list corruption from multiple unmapping |
@@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1265 | LIST_HEAD(page_list); | 1292 | LIST_HEAD(page_list); |
1266 | 1293 | ||
1267 | WARN_ON(!is_vm_hugetlb_page(vma)); | 1294 | WARN_ON(!is_vm_hugetlb_page(vma)); |
1268 | BUG_ON(start & ~HPAGE_MASK); | 1295 | BUG_ON(start & ~huge_page_mask(h)); |
1269 | BUG_ON(end & ~HPAGE_MASK); | 1296 | BUG_ON(end & ~huge_page_mask(h)); |
1270 | 1297 | ||
1271 | spin_lock(&mm->page_table_lock); | 1298 | spin_lock(&mm->page_table_lock); |
1272 | for (address = start; address < end; address += HPAGE_SIZE) { | 1299 | for (address = start; address < end; address += sz) { |
1273 | ptep = huge_pte_offset(mm, address); | 1300 | ptep = huge_pte_offset(mm, address); |
1274 | if (!ptep) | 1301 | if (!ptep) |
1275 | continue; | 1302 | continue; |
@@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1383 | unsigned long address, pte_t *ptep, pte_t pte, | 1410 | unsigned long address, pte_t *ptep, pte_t pte, |
1384 | struct page *pagecache_page) | 1411 | struct page *pagecache_page) |
1385 | { | 1412 | { |
1413 | struct hstate *h = hstate_vma(vma); | ||
1386 | struct page *old_page, *new_page; | 1414 | struct page *old_page, *new_page; |
1387 | int avoidcopy; | 1415 | int avoidcopy; |
1388 | int outside_reserve = 0; | 1416 | int outside_reserve = 0; |
@@ -1443,7 +1471,7 @@ retry_avoidcopy: | |||
1443 | __SetPageUptodate(new_page); | 1471 | __SetPageUptodate(new_page); |
1444 | spin_lock(&mm->page_table_lock); | 1472 | spin_lock(&mm->page_table_lock); |
1445 | 1473 | ||
1446 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 1474 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1447 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 1475 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1448 | /* Break COW */ | 1476 | /* Break COW */ |
1449 | huge_ptep_clear_flush(vma, address, ptep); | 1477 | huge_ptep_clear_flush(vma, address, ptep); |
@@ -1458,14 +1486,14 @@ retry_avoidcopy: | |||
1458 | } | 1486 | } |
1459 | 1487 | ||
1460 | /* Return the pagecache page at a given address within a VMA */ | 1488 | /* Return the pagecache page at a given address within a VMA */ |
1461 | static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, | 1489 | static struct page *hugetlbfs_pagecache_page(struct hstate *h, |
1462 | unsigned long address) | 1490 | struct vm_area_struct *vma, unsigned long address) |
1463 | { | 1491 | { |
1464 | struct address_space *mapping; | 1492 | struct address_space *mapping; |
1465 | pgoff_t idx; | 1493 | pgoff_t idx; |
1466 | 1494 | ||
1467 | mapping = vma->vm_file->f_mapping; | 1495 | mapping = vma->vm_file->f_mapping; |
1468 | idx = vma_hugecache_offset(vma, address); | 1496 | idx = vma_hugecache_offset(h, vma, address); |
1469 | 1497 | ||
1470 | return find_lock_page(mapping, idx); | 1498 | return find_lock_page(mapping, idx); |
1471 | } | 1499 | } |
@@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, | |||
1473 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1501 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1474 | unsigned long address, pte_t *ptep, int write_access) | 1502 | unsigned long address, pte_t *ptep, int write_access) |
1475 | { | 1503 | { |
1504 | struct hstate *h = hstate_vma(vma); | ||
1476 | int ret = VM_FAULT_SIGBUS; | 1505 | int ret = VM_FAULT_SIGBUS; |
1477 | pgoff_t idx; | 1506 | pgoff_t idx; |
1478 | unsigned long size; | 1507 | unsigned long size; |
@@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1493 | } | 1522 | } |
1494 | 1523 | ||
1495 | mapping = vma->vm_file->f_mapping; | 1524 | mapping = vma->vm_file->f_mapping; |
1496 | idx = vma_hugecache_offset(vma, address); | 1525 | idx = vma_hugecache_offset(h, vma, address); |
1497 | 1526 | ||
1498 | /* | 1527 | /* |
1499 | * Use page lock to guard against racing truncation | 1528 | * Use page lock to guard against racing truncation |
@@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1502 | retry: | 1531 | retry: |
1503 | page = find_lock_page(mapping, idx); | 1532 | page = find_lock_page(mapping, idx); |
1504 | if (!page) { | 1533 | if (!page) { |
1505 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1534 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
1506 | if (idx >= size) | 1535 | if (idx >= size) |
1507 | goto out; | 1536 | goto out; |
1508 | page = alloc_huge_page(vma, address, 0); | 1537 | page = alloc_huge_page(vma, address, 0); |
@@ -1510,7 +1539,7 @@ retry: | |||
1510 | ret = -PTR_ERR(page); | 1539 | ret = -PTR_ERR(page); |
1511 | goto out; | 1540 | goto out; |
1512 | } | 1541 | } |
1513 | clear_huge_page(page, address); | 1542 | clear_huge_page(page, address, huge_page_size(h)); |
1514 | __SetPageUptodate(page); | 1543 | __SetPageUptodate(page); |
1515 | 1544 | ||
1516 | if (vma->vm_flags & VM_SHARED) { | 1545 | if (vma->vm_flags & VM_SHARED) { |
@@ -1526,14 +1555,14 @@ retry: | |||
1526 | } | 1555 | } |
1527 | 1556 | ||
1528 | spin_lock(&inode->i_lock); | 1557 | spin_lock(&inode->i_lock); |
1529 | inode->i_blocks += BLOCKS_PER_HUGEPAGE; | 1558 | inode->i_blocks += blocks_per_huge_page(h); |
1530 | spin_unlock(&inode->i_lock); | 1559 | spin_unlock(&inode->i_lock); |
1531 | } else | 1560 | } else |
1532 | lock_page(page); | 1561 | lock_page(page); |
1533 | } | 1562 | } |
1534 | 1563 | ||
1535 | spin_lock(&mm->page_table_lock); | 1564 | spin_lock(&mm->page_table_lock); |
1536 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1565 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
1537 | if (idx >= size) | 1566 | if (idx >= size) |
1538 | goto backout; | 1567 | goto backout; |
1539 | 1568 | ||
@@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1569 | pte_t entry; | 1598 | pte_t entry; |
1570 | int ret; | 1599 | int ret; |
1571 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 1600 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
1601 | struct hstate *h = hstate_vma(vma); | ||
1572 | 1602 | ||
1573 | ptep = huge_pte_alloc(mm, address); | 1603 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
1574 | if (!ptep) | 1604 | if (!ptep) |
1575 | return VM_FAULT_OOM; | 1605 | return VM_FAULT_OOM; |
1576 | 1606 | ||
@@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1594 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 1624 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) |
1595 | if (write_access && !pte_write(entry)) { | 1625 | if (write_access && !pte_write(entry)) { |
1596 | struct page *page; | 1626 | struct page *page; |
1597 | page = hugetlbfs_pagecache_page(vma, address); | 1627 | page = hugetlbfs_pagecache_page(h, vma, address); |
1598 | ret = hugetlb_cow(mm, vma, address, ptep, entry, page); | 1628 | ret = hugetlb_cow(mm, vma, address, ptep, entry, page); |
1599 | if (page) { | 1629 | if (page) { |
1600 | unlock_page(page); | 1630 | unlock_page(page); |
@@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1615 | unsigned long pfn_offset; | 1645 | unsigned long pfn_offset; |
1616 | unsigned long vaddr = *position; | 1646 | unsigned long vaddr = *position; |
1617 | int remainder = *length; | 1647 | int remainder = *length; |
1648 | struct hstate *h = hstate_vma(vma); | ||
1618 | 1649 | ||
1619 | spin_lock(&mm->page_table_lock); | 1650 | spin_lock(&mm->page_table_lock); |
1620 | while (vaddr < vma->vm_end && remainder) { | 1651 | while (vaddr < vma->vm_end && remainder) { |
@@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1626 | * each hugepage. We have to make * sure we get the | 1657 | * each hugepage. We have to make * sure we get the |
1627 | * first, for the page indexing below to work. | 1658 | * first, for the page indexing below to work. |
1628 | */ | 1659 | */ |
1629 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | 1660 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
1630 | 1661 | ||
1631 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || | 1662 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || |
1632 | (write && !pte_write(huge_ptep_get(pte)))) { | 1663 | (write && !pte_write(huge_ptep_get(pte)))) { |
@@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1644 | break; | 1675 | break; |
1645 | } | 1676 | } |
1646 | 1677 | ||
1647 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; | 1678 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; |
1648 | page = pte_page(huge_ptep_get(pte)); | 1679 | page = pte_page(huge_ptep_get(pte)); |
1649 | same_page: | 1680 | same_page: |
1650 | if (pages) { | 1681 | if (pages) { |
@@ -1660,7 +1691,7 @@ same_page: | |||
1660 | --remainder; | 1691 | --remainder; |
1661 | ++i; | 1692 | ++i; |
1662 | if (vaddr < vma->vm_end && remainder && | 1693 | if (vaddr < vma->vm_end && remainder && |
1663 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | 1694 | pfn_offset < pages_per_huge_page(h)) { |
1664 | /* | 1695 | /* |
1665 | * We use pfn_offset to avoid touching the pageframes | 1696 | * We use pfn_offset to avoid touching the pageframes |
1666 | * of this compound page. | 1697 | * of this compound page. |
@@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
1682 | unsigned long start = address; | 1713 | unsigned long start = address; |
1683 | pte_t *ptep; | 1714 | pte_t *ptep; |
1684 | pte_t pte; | 1715 | pte_t pte; |
1716 | struct hstate *h = hstate_vma(vma); | ||
1685 | 1717 | ||
1686 | BUG_ON(address >= end); | 1718 | BUG_ON(address >= end); |
1687 | flush_cache_range(vma, address, end); | 1719 | flush_cache_range(vma, address, end); |
1688 | 1720 | ||
1689 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 1721 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); |
1690 | spin_lock(&mm->page_table_lock); | 1722 | spin_lock(&mm->page_table_lock); |
1691 | for (; address < end; address += HPAGE_SIZE) { | 1723 | for (; address < end; address += huge_page_size(h)) { |
1692 | ptep = huge_pte_offset(mm, address); | 1724 | ptep = huge_pte_offset(mm, address); |
1693 | if (!ptep) | 1725 | if (!ptep) |
1694 | continue; | 1726 | continue; |
@@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
1711 | struct vm_area_struct *vma) | 1743 | struct vm_area_struct *vma) |
1712 | { | 1744 | { |
1713 | long ret, chg; | 1745 | long ret, chg; |
1746 | struct hstate *h = hstate_inode(inode); | ||
1714 | 1747 | ||
1715 | if (vma && vma->vm_flags & VM_NORESERVE) | 1748 | if (vma && vma->vm_flags & VM_NORESERVE) |
1716 | return 0; | 1749 | return 0; |
@@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
1739 | 1772 | ||
1740 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 1773 | if (hugetlb_get_quota(inode->i_mapping, chg)) |
1741 | return -ENOSPC; | 1774 | return -ENOSPC; |
1742 | ret = hugetlb_acct_memory(chg); | 1775 | ret = hugetlb_acct_memory(h, chg); |
1743 | if (ret < 0) { | 1776 | if (ret < 0) { |
1744 | hugetlb_put_quota(inode->i_mapping, chg); | 1777 | hugetlb_put_quota(inode->i_mapping, chg); |
1745 | return ret; | 1778 | return ret; |
@@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
1751 | 1784 | ||
1752 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 1785 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
1753 | { | 1786 | { |
1787 | struct hstate *h = hstate_inode(inode); | ||
1754 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 1788 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
1755 | 1789 | ||
1756 | spin_lock(&inode->i_lock); | 1790 | spin_lock(&inode->i_lock); |
1757 | inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; | 1791 | inode->i_blocks -= blocks_per_huge_page(h); |
1758 | spin_unlock(&inode->i_lock); | 1792 | spin_unlock(&inode->i_lock); |
1759 | 1793 | ||
1760 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 1794 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); |
1761 | hugetlb_acct_memory(-(chg - freed)); | 1795 | hugetlb_acct_memory(h, -(chg - freed)); |
1762 | } | 1796 | } |