author     Andi Kleen <ak@suse.de>                         2008-07-24 00:27:41 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-07-24 13:47:17 -0400
commit     a5516438959d90b071ff0a484ce4f3f523dc3152 (patch)
tree       e356ba9364c76b93c176b4d4a262b7aca3ee8f91 /mm
parent     b7ba30c679ed1eb7ed3ed8f281f6493282042bd4 (diff)
hugetlb: modular state for hugetlb page size
The goal of this patchset is to support multiple hugetlb page sizes. This
is achieved by introducing a new struct hstate structure, which
encapsulates the important hugetlb state and constants (e.g. huge page
size, number of huge pages currently allocated, etc).

The hstate structure is then passed around the code which requires these
fields; callers will do the right thing regardless of the exact hstate
they are operating on.

This patch adds the hstate structure, with a single global instance of it
(default_hstate), and does the basic work of converting hugetlb to use
the hstate. Future patches will add more hstate structures to allow for
different hugetlbfs mounts to have different page sizes.

[akpm@linux-foundation.org: coding-style fixes]
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c     368
-rw-r--r--  mm/memory.c        2
-rw-r--r--  mm/mempolicy.c     9
-rw-r--r--  mm/mmap.c          3
4 files changed, 210 insertions, 172 deletions
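
For readability, here is a sketch of the hstate interface that the converted
code below leans on. The field names are taken directly from the h-> accesses
in this diff, and the helpers match the calls that appear in it
(huge_page_order(), huge_page_shift(), huge_page_size(), huge_page_mask(),
pages_per_huge_page(), blocks_per_huge_page(), hstate_vma(), hstate_inode()).
The authoritative definitions live in include/linux/hugetlb.h, which is
outside this mm-only diffstat, so the exact layout and inline bodies shown
here are an approximation, not the patch's literal header changes.

/*
 * Approximate shape of the per-size hugetlb state added by this patch.
 * Field names come from the h-> members used in mm/hugetlb.c below;
 * ordering and types are inferred, not copied from the real header.
 */
#include <linux/list.h>		/* struct list_head */
#include <linux/numa.h>		/* MAX_NUMNODES */

struct hstate {
	int hugetlb_next_nid;
	unsigned int order;			/* huge page order, in base pages */
	unsigned long mask;			/* huge page address mask */
	unsigned long nr_huge_pages;
	unsigned long free_huge_pages;
	unsigned long resv_huge_pages;
	unsigned long surplus_huge_pages;
	unsigned long nr_overcommit_huge_pages;
	struct list_head hugepage_freelists[MAX_NUMNODES];
	unsigned int nr_huge_pages_node[MAX_NUMNODES];
	unsigned int free_huge_pages_node[MAX_NUMNODES];
	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
};

/*
 * Everything that used to be a compile-time HPAGE_* constant is now
 * derived from h->order, so different hstates can describe different
 * huge page sizes.
 */
static inline unsigned int huge_page_order(struct hstate *h)
{
	return h->order;
}

static inline unsigned huge_page_shift(struct hstate *h)
{
	return h->order + PAGE_SHIFT;
}

static inline unsigned long huge_page_size(struct hstate *h)
{
	return (unsigned long)PAGE_SIZE << h->order;
}

static inline unsigned long huge_page_mask(struct hstate *h)
{
	return h->mask;
}

static inline unsigned int pages_per_huge_page(struct hstate *h)
{
	return 1 << h->order;
}

static inline unsigned int blocks_per_huge_page(struct hstate *h)
{
	/* replaces BLOCKS_PER_HUGEPAGE: huge page size in 512-byte sectors */
	return huge_page_size(h) / 512;
}

/*
 * With only the single global default_hstate at this point in the
 * series, the per-VMA and per-inode lookups presumably just return it.
 */
extern struct hstate default_hstate;

static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
	return &default_hstate;
}

static inline struct hstate *hstate_inode(struct inode *inode)
{
	return &default_hstate;
}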
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32dff4290c66..0d8153e25f09 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,18 +22,12 @@
22#include "internal.h" 22#include "internal.h"
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages; 25unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages; 26unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 27static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 28unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 29
30struct hstate default_hstate;
37 31
38/* 32/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 33 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t)
203 * Convert the address within this vma to the page offset within 197 * Convert the address within this vma to the page offset within
204 * the mapping, in pagecache page units; huge pages here. 198 * the mapping, in pagecache page units; huge pages here.
205 */ 199 */
206static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, 200static pgoff_t vma_hugecache_offset(struct hstate *h,
207 unsigned long address) 201 struct vm_area_struct *vma, unsigned long address)
208{ 202{
209 return ((address - vma->vm_start) >> HPAGE_SHIFT) + 203 return ((address - vma->vm_start) >> huge_page_shift(h)) +
210 (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); 204 (vma->vm_pgoff >> huge_page_order(h));
211} 205}
212 206
213/* 207/*
@@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
309} 303}
310 304
311/* Decrement the reserved pages in the hugepage pool by one */ 305/* Decrement the reserved pages in the hugepage pool by one */
312static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) 306static void decrement_hugepage_resv_vma(struct hstate *h,
307 struct vm_area_struct *vma)
313{ 308{
314 if (vma->vm_flags & VM_NORESERVE) 309 if (vma->vm_flags & VM_NORESERVE)
315 return; 310 return;
316 311
317 if (vma->vm_flags & VM_SHARED) { 312 if (vma->vm_flags & VM_SHARED) {
318 /* Shared mappings always use reserves */ 313 /* Shared mappings always use reserves */
319 resv_huge_pages--; 314 h->resv_huge_pages--;
320 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 315 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
321 /* 316 /*
322 * Only the process that called mmap() has reserves for 317 * Only the process that called mmap() has reserves for
323 * private mappings. 318 * private mappings.
324 */ 319 */
325 resv_huge_pages--; 320 h->resv_huge_pages--;
326 } 321 }
327} 322}
328 323
@@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma)
344 return 1; 339 return 1;
345} 340}
346 341
347static void clear_huge_page(struct page *page, unsigned long addr) 342static void clear_huge_page(struct page *page,
343 unsigned long addr, unsigned long sz)
348{ 344{
349 int i; 345 int i;
350 346
351 might_sleep(); 347 might_sleep();
352 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 348 for (i = 0; i < sz/PAGE_SIZE; i++) {
353 cond_resched(); 349 cond_resched();
354 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 350 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
355 } 351 }
@@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src,
359 unsigned long addr, struct vm_area_struct *vma) 355 unsigned long addr, struct vm_area_struct *vma)
360{ 356{
361 int i; 357 int i;
358 struct hstate *h = hstate_vma(vma);
362 359
363 might_sleep(); 360 might_sleep();
364 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 361 for (i = 0; i < pages_per_huge_page(h); i++) {
365 cond_resched(); 362 cond_resched();
366 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 363 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
367 } 364 }
368} 365}
369 366
370static void enqueue_huge_page(struct page *page) 367static void enqueue_huge_page(struct hstate *h, struct page *page)
371{ 368{
372 int nid = page_to_nid(page); 369 int nid = page_to_nid(page);
373 list_add(&page->lru, &hugepage_freelists[nid]); 370 list_add(&page->lru, &h->hugepage_freelists[nid]);
374 free_huge_pages++; 371 h->free_huge_pages++;
375 free_huge_pages_node[nid]++; 372 h->free_huge_pages_node[nid]++;
376} 373}
377 374
378static struct page *dequeue_huge_page(void) 375static struct page *dequeue_huge_page(struct hstate *h)
379{ 376{
380 int nid; 377 int nid;
381 struct page *page = NULL; 378 struct page *page = NULL;
382 379
383 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 380 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
384 if (!list_empty(&hugepage_freelists[nid])) { 381 if (!list_empty(&h->hugepage_freelists[nid])) {
385 page = list_entry(hugepage_freelists[nid].next, 382 page = list_entry(h->hugepage_freelists[nid].next,
386 struct page, lru); 383 struct page, lru);
387 list_del(&page->lru); 384 list_del(&page->lru);
388 free_huge_pages--; 385 h->free_huge_pages--;
389 free_huge_pages_node[nid]--; 386 h->free_huge_pages_node[nid]--;
390 break; 387 break;
391 } 388 }
392 } 389 }
393 return page; 390 return page;
394} 391}
395 392
396static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 393static struct page *dequeue_huge_page_vma(struct hstate *h,
394 struct vm_area_struct *vma,
397 unsigned long address, int avoid_reserve) 395 unsigned long address, int avoid_reserve)
398{ 396{
399 int nid; 397 int nid;
@@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
411 * not "stolen". The child may still get SIGKILLed 409 * not "stolen". The child may still get SIGKILLed
412 */ 410 */
413 if (!vma_has_private_reserves(vma) && 411 if (!vma_has_private_reserves(vma) &&
414 free_huge_pages - resv_huge_pages == 0) 412 h->free_huge_pages - h->resv_huge_pages == 0)
415 return NULL; 413 return NULL;
416 414
417 /* If reserves cannot be used, ensure enough pages are in the pool */ 415 /* If reserves cannot be used, ensure enough pages are in the pool */
418 if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) 416 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
419 return NULL; 417 return NULL;
420 418
421 for_each_zone_zonelist_nodemask(zone, z, zonelist, 419 for_each_zone_zonelist_nodemask(zone, z, zonelist,
422 MAX_NR_ZONES - 1, nodemask) { 420 MAX_NR_ZONES - 1, nodemask) {
423 nid = zone_to_nid(zone); 421 nid = zone_to_nid(zone);
424 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 422 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
425 !list_empty(&hugepage_freelists[nid])) { 423 !list_empty(&h->hugepage_freelists[nid])) {
426 page = list_entry(hugepage_freelists[nid].next, 424 page = list_entry(h->hugepage_freelists[nid].next,
427 struct page, lru); 425 struct page, lru);
428 list_del(&page->lru); 426 list_del(&page->lru);
429 free_huge_pages--; 427 h->free_huge_pages--;
430 free_huge_pages_node[nid]--; 428 h->free_huge_pages_node[nid]--;
431 429
432 if (!avoid_reserve) 430 if (!avoid_reserve)
433 decrement_hugepage_resv_vma(vma); 431 decrement_hugepage_resv_vma(h, vma);
434 432
435 break; 433 break;
436 } 434 }
@@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
439 return page; 437 return page;
440} 438}
441 439
442static void update_and_free_page(struct page *page) 440static void update_and_free_page(struct hstate *h, struct page *page)
443{ 441{
444 int i; 442 int i;
445 nr_huge_pages--; 443
446 nr_huge_pages_node[page_to_nid(page)]--; 444 h->nr_huge_pages--;
447 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 445 h->nr_huge_pages_node[page_to_nid(page)]--;
446 for (i = 0; i < pages_per_huge_page(h); i++) {
448 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 447 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
449 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 448 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
450 1 << PG_private | 1<< PG_writeback); 449 1 << PG_private | 1<< PG_writeback);
@@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page)
452 set_compound_page_dtor(page, NULL); 451 set_compound_page_dtor(page, NULL);
453 set_page_refcounted(page); 452 set_page_refcounted(page);
454 arch_release_hugepage(page); 453 arch_release_hugepage(page);
455 __free_pages(page, HUGETLB_PAGE_ORDER); 454 __free_pages(page, huge_page_order(h));
456} 455}
457 456
458static void free_huge_page(struct page *page) 457static void free_huge_page(struct page *page)
459{ 458{
459 /*
460 * Can't pass hstate in here because it is called from the
461 * compound page destructor.
462 */
463 struct hstate *h = &default_hstate;
460 int nid = page_to_nid(page); 464 int nid = page_to_nid(page);
461 struct address_space *mapping; 465 struct address_space *mapping;
462 466
@@ -466,12 +470,12 @@ static void free_huge_page(struct page *page)
466 INIT_LIST_HEAD(&page->lru); 470 INIT_LIST_HEAD(&page->lru);
467 471
468 spin_lock(&hugetlb_lock); 472 spin_lock(&hugetlb_lock);
469 if (surplus_huge_pages_node[nid]) { 473 if (h->surplus_huge_pages_node[nid]) {
470 update_and_free_page(page); 474 update_and_free_page(h, page);
471 surplus_huge_pages--; 475 h->surplus_huge_pages--;
472 surplus_huge_pages_node[nid]--; 476 h->surplus_huge_pages_node[nid]--;
473 } else { 477 } else {
474 enqueue_huge_page(page); 478 enqueue_huge_page(h, page);
475 } 479 }
476 spin_unlock(&hugetlb_lock); 480 spin_unlock(&hugetlb_lock);
477 if (mapping) 481 if (mapping)
@@ -483,7 +487,7 @@ static void free_huge_page(struct page *page)
483 * balanced by operating on them in a round-robin fashion. 487 * balanced by operating on them in a round-robin fashion.
484 * Returns 1 if an adjustment was made. 488 * Returns 1 if an adjustment was made.
485 */ 489 */
486static int adjust_pool_surplus(int delta) 490static int adjust_pool_surplus(struct hstate *h, int delta)
487{ 491{
488 static int prev_nid; 492 static int prev_nid;
489 int nid = prev_nid; 493 int nid = prev_nid;
@@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta)
496 nid = first_node(node_online_map); 500 nid = first_node(node_online_map);
497 501
498 /* To shrink on this node, there must be a surplus page */ 502 /* To shrink on this node, there must be a surplus page */
499 if (delta < 0 && !surplus_huge_pages_node[nid]) 503 if (delta < 0 && !h->surplus_huge_pages_node[nid])
500 continue; 504 continue;
501 /* Surplus cannot exceed the total number of pages */ 505 /* Surplus cannot exceed the total number of pages */
502 if (delta > 0 && surplus_huge_pages_node[nid] >= 506 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
503 nr_huge_pages_node[nid]) 507 h->nr_huge_pages_node[nid])
504 continue; 508 continue;
505 509
506 surplus_huge_pages += delta; 510 h->surplus_huge_pages += delta;
507 surplus_huge_pages_node[nid] += delta; 511 h->surplus_huge_pages_node[nid] += delta;
508 ret = 1; 512 ret = 1;
509 break; 513 break;
510 } while (nid != prev_nid); 514 } while (nid != prev_nid);
@@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta)
513 return ret; 517 return ret;
514} 518}
515 519
516static void prep_new_huge_page(struct page *page, int nid) 520static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
517{ 521{
518 set_compound_page_dtor(page, free_huge_page); 522 set_compound_page_dtor(page, free_huge_page);
519 spin_lock(&hugetlb_lock); 523 spin_lock(&hugetlb_lock);
520 nr_huge_pages++; 524 h->nr_huge_pages++;
521 nr_huge_pages_node[nid]++; 525 h->nr_huge_pages_node[nid]++;
522 spin_unlock(&hugetlb_lock); 526 spin_unlock(&hugetlb_lock);
523 put_page(page); /* free it into the hugepage allocator */ 527 put_page(page); /* free it into the hugepage allocator */
524} 528}
525 529
526static struct page *alloc_fresh_huge_page_node(int nid) 530static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
527{ 531{
528 struct page *page; 532 struct page *page;
529 533
530 page = alloc_pages_node(nid, 534 page = alloc_pages_node(nid,
531 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 535 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
532 __GFP_REPEAT|__GFP_NOWARN, 536 __GFP_REPEAT|__GFP_NOWARN,
533 HUGETLB_PAGE_ORDER); 537 huge_page_order(h));
534 if (page) { 538 if (page) {
535 if (arch_prepare_hugepage(page)) { 539 if (arch_prepare_hugepage(page)) {
536 __free_pages(page, HUGETLB_PAGE_ORDER); 540 __free_pages(page, HUGETLB_PAGE_ORDER);
537 return NULL; 541 return NULL;
538 } 542 }
539 prep_new_huge_page(page, nid); 543 prep_new_huge_page(h, page, nid);
540 } 544 }
541 545
542 return page; 546 return page;
543} 547}
544 548
545static int alloc_fresh_huge_page(void) 549static int alloc_fresh_huge_page(struct hstate *h)
546{ 550{
547 struct page *page; 551 struct page *page;
548 int start_nid; 552 int start_nid;
549 int next_nid; 553 int next_nid;
550 int ret = 0; 554 int ret = 0;
551 555
552 start_nid = hugetlb_next_nid; 556 start_nid = h->hugetlb_next_nid;
553 557
554 do { 558 do {
555 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 559 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
556 if (page) 560 if (page)
557 ret = 1; 561 ret = 1;
558 /* 562 /*
@@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void)
566 * if we just successfully allocated a hugepage so that 570 * if we just successfully allocated a hugepage so that
567 * the next caller gets hugepages on the next node. 571 * the next caller gets hugepages on the next node.
568 */ 572 */
569 next_nid = next_node(hugetlb_next_nid, node_online_map); 573 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
570 if (next_nid == MAX_NUMNODES) 574 if (next_nid == MAX_NUMNODES)
571 next_nid = first_node(node_online_map); 575 next_nid = first_node(node_online_map);
572 hugetlb_next_nid = next_nid; 576 h->hugetlb_next_nid = next_nid;
573 } while (!page && hugetlb_next_nid != start_nid); 577 } while (!page && h->hugetlb_next_nid != start_nid);
574 578
575 if (ret) 579 if (ret)
576 count_vm_event(HTLB_BUDDY_PGALLOC); 580 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void)
580 return ret; 584 return ret;
581} 585}
582 586
583static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 587static struct page *alloc_buddy_huge_page(struct hstate *h,
584 unsigned long address) 588 struct vm_area_struct *vma, unsigned long address)
585{ 589{
586 struct page *page; 590 struct page *page;
587 unsigned int nid; 591 unsigned int nid;
@@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
610 * per-node value is checked there. 614 * per-node value is checked there.
611 */ 615 */
612 spin_lock(&hugetlb_lock); 616 spin_lock(&hugetlb_lock);
613 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 617 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
614 spin_unlock(&hugetlb_lock); 618 spin_unlock(&hugetlb_lock);
615 return NULL; 619 return NULL;
616 } else { 620 } else {
617 nr_huge_pages++; 621 h->nr_huge_pages++;
618 surplus_huge_pages++; 622 h->surplus_huge_pages++;
619 } 623 }
620 spin_unlock(&hugetlb_lock); 624 spin_unlock(&hugetlb_lock);
621 625
622 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 626 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
623 __GFP_REPEAT|__GFP_NOWARN, 627 __GFP_REPEAT|__GFP_NOWARN,
624 HUGETLB_PAGE_ORDER); 628 huge_page_order(h));
625 629
626 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
627 if (page) { 631 if (page) {
@@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
636 /* 640 /*
637 * We incremented the global counters already 641 * We incremented the global counters already
638 */ 642 */
639 nr_huge_pages_node[nid]++; 643 h->nr_huge_pages_node[nid]++;
640 surplus_huge_pages_node[nid]++; 644 h->surplus_huge_pages_node[nid]++;
641 __count_vm_event(HTLB_BUDDY_PGALLOC); 645 __count_vm_event(HTLB_BUDDY_PGALLOC);
642 } else { 646 } else {
643 nr_huge_pages--; 647 h->nr_huge_pages--;
644 surplus_huge_pages--; 648 h->surplus_huge_pages--;
645 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 649 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
646 } 650 }
647 spin_unlock(&hugetlb_lock); 651 spin_unlock(&hugetlb_lock);
@@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
653 * Increase the hugetlb pool such that it can accomodate a reservation 657 * Increase the hugetlb pool such that it can accomodate a reservation
654 * of size 'delta'. 658 * of size 'delta'.
655 */ 659 */
656static int gather_surplus_pages(int delta) 660static int gather_surplus_pages(struct hstate *h, int delta)
657{ 661{
658 struct list_head surplus_list; 662 struct list_head surplus_list;
659 struct page *page, *tmp; 663 struct page *page, *tmp;
660 int ret, i; 664 int ret, i;
661 int needed, allocated; 665 int needed, allocated;
662 666
663 needed = (resv_huge_pages + delta) - free_huge_pages; 667 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
664 if (needed <= 0) { 668 if (needed <= 0) {
665 resv_huge_pages += delta; 669 h->resv_huge_pages += delta;
666 return 0; 670 return 0;
667 } 671 }
668 672
@@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta)
673retry: 677retry:
674 spin_unlock(&hugetlb_lock); 678 spin_unlock(&hugetlb_lock);
675 for (i = 0; i < needed; i++) { 679 for (i = 0; i < needed; i++) {
676 page = alloc_buddy_huge_page(NULL, 0); 680 page = alloc_buddy_huge_page(h, NULL, 0);
677 if (!page) { 681 if (!page) {
678 /* 682 /*
679 * We were not able to allocate enough pages to 683 * We were not able to allocate enough pages to
@@ -694,7 +698,8 @@ retry:
694 * because either resv_huge_pages or free_huge_pages may have changed. 698 * because either resv_huge_pages or free_huge_pages may have changed.
695 */ 699 */
696 spin_lock(&hugetlb_lock); 700 spin_lock(&hugetlb_lock);
697 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 701 needed = (h->resv_huge_pages + delta) -
702 (h->free_huge_pages + allocated);
698 if (needed > 0) 703 if (needed > 0)
699 goto retry; 704 goto retry;
700 705
@@ -707,7 +712,7 @@ retry:
707 * before they are reserved. 712 * before they are reserved.
708 */ 713 */
709 needed += allocated; 714 needed += allocated;
710 resv_huge_pages += delta; 715 h->resv_huge_pages += delta;
711 ret = 0; 716 ret = 0;
712free: 717free:
713 /* Free the needed pages to the hugetlb pool */ 718 /* Free the needed pages to the hugetlb pool */
@@ -715,7 +720,7 @@ free:
715 if ((--needed) < 0) 720 if ((--needed) < 0)
716 break; 721 break;
717 list_del(&page->lru); 722 list_del(&page->lru);
718 enqueue_huge_page(page); 723 enqueue_huge_page(h, page);
719 } 724 }
720 725
721 /* Free unnecessary surplus pages to the buddy allocator */ 726 /* Free unnecessary surplus pages to the buddy allocator */
@@ -743,7 +748,8 @@ free:
743 * allocated to satisfy the reservation must be explicitly freed if they were 748 * allocated to satisfy the reservation must be explicitly freed if they were
744 * never used. 749 * never used.
745 */ 750 */
746static void return_unused_surplus_pages(unsigned long unused_resv_pages) 751static void return_unused_surplus_pages(struct hstate *h,
752 unsigned long unused_resv_pages)
747{ 753{
748 static int nid = -1; 754 static int nid = -1;
749 struct page *page; 755 struct page *page;
@@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
758 unsigned long remaining_iterations = num_online_nodes(); 764 unsigned long remaining_iterations = num_online_nodes();
759 765
760 /* Uncommit the reservation */ 766 /* Uncommit the reservation */
761 resv_huge_pages -= unused_resv_pages; 767 h->resv_huge_pages -= unused_resv_pages;
762 768
763 nr_pages = min(unused_resv_pages, surplus_huge_pages); 769 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
764 770
765 while (remaining_iterations-- && nr_pages) { 771 while (remaining_iterations-- && nr_pages) {
766 nid = next_node(nid, node_online_map); 772 nid = next_node(nid, node_online_map);
767 if (nid == MAX_NUMNODES) 773 if (nid == MAX_NUMNODES)
768 nid = first_node(node_online_map); 774 nid = first_node(node_online_map);
769 775
770 if (!surplus_huge_pages_node[nid]) 776 if (!h->surplus_huge_pages_node[nid])
771 continue; 777 continue;
772 778
773 if (!list_empty(&hugepage_freelists[nid])) { 779 if (!list_empty(&h->hugepage_freelists[nid])) {
774 page = list_entry(hugepage_freelists[nid].next, 780 page = list_entry(h->hugepage_freelists[nid].next,
775 struct page, lru); 781 struct page, lru);
776 list_del(&page->lru); 782 list_del(&page->lru);
777 update_and_free_page(page); 783 update_and_free_page(h, page);
778 free_huge_pages--; 784 h->free_huge_pages--;
779 free_huge_pages_node[nid]--; 785 h->free_huge_pages_node[nid]--;
780 surplus_huge_pages--; 786 h->surplus_huge_pages--;
781 surplus_huge_pages_node[nid]--; 787 h->surplus_huge_pages_node[nid]--;
782 nr_pages--; 788 nr_pages--;
783 remaining_iterations = num_online_nodes(); 789 remaining_iterations = num_online_nodes();
784 } 790 }
@@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
794 * an instantiated the change should be committed via vma_commit_reservation. 800 * an instantiated the change should be committed via vma_commit_reservation.
795 * No action is required on failure. 801 * No action is required on failure.
796 */ 802 */
797static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) 803static int vma_needs_reservation(struct hstate *h,
804 struct vm_area_struct *vma, unsigned long addr)
798{ 805{
799 struct address_space *mapping = vma->vm_file->f_mapping; 806 struct address_space *mapping = vma->vm_file->f_mapping;
800 struct inode *inode = mapping->host; 807 struct inode *inode = mapping->host;
801 808
802 if (vma->vm_flags & VM_SHARED) { 809 if (vma->vm_flags & VM_SHARED) {
803 pgoff_t idx = vma_hugecache_offset(vma, addr); 810 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
804 return region_chg(&inode->i_mapping->private_list, 811 return region_chg(&inode->i_mapping->private_list,
805 idx, idx + 1); 812 idx, idx + 1);
806 813
@@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
809 816
810 } else { 817 } else {
811 int err; 818 int err;
812 pgoff_t idx = vma_hugecache_offset(vma, addr); 819 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
813 struct resv_map *reservations = vma_resv_map(vma); 820 struct resv_map *reservations = vma_resv_map(vma);
814 821
815 err = region_chg(&reservations->regions, idx, idx + 1); 822 err = region_chg(&reservations->regions, idx, idx + 1);
@@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
818 return 0; 825 return 0;
819 } 826 }
820} 827}
821static void vma_commit_reservation(struct vm_area_struct *vma, 828static void vma_commit_reservation(struct hstate *h,
822 unsigned long addr) 829 struct vm_area_struct *vma, unsigned long addr)
823{ 830{
824 struct address_space *mapping = vma->vm_file->f_mapping; 831 struct address_space *mapping = vma->vm_file->f_mapping;
825 struct inode *inode = mapping->host; 832 struct inode *inode = mapping->host;
826 833
827 if (vma->vm_flags & VM_SHARED) { 834 if (vma->vm_flags & VM_SHARED) {
828 pgoff_t idx = vma_hugecache_offset(vma, addr); 835 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
829 region_add(&inode->i_mapping->private_list, idx, idx + 1); 836 region_add(&inode->i_mapping->private_list, idx, idx + 1);
830 837
831 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 838 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
832 pgoff_t idx = vma_hugecache_offset(vma, addr); 839 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
833 struct resv_map *reservations = vma_resv_map(vma); 840 struct resv_map *reservations = vma_resv_map(vma);
834 841
835 /* Mark this page used in the map. */ 842 /* Mark this page used in the map. */
@@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma,
840static struct page *alloc_huge_page(struct vm_area_struct *vma, 847static struct page *alloc_huge_page(struct vm_area_struct *vma,
841 unsigned long addr, int avoid_reserve) 848 unsigned long addr, int avoid_reserve)
842{ 849{
850 struct hstate *h = hstate_vma(vma);
843 struct page *page; 851 struct page *page;
844 struct address_space *mapping = vma->vm_file->f_mapping; 852 struct address_space *mapping = vma->vm_file->f_mapping;
845 struct inode *inode = mapping->host; 853 struct inode *inode = mapping->host;
@@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
852 * MAP_NORESERVE mappings may also need pages and quota allocated 860 * MAP_NORESERVE mappings may also need pages and quota allocated
853 * if no reserve mapping overlaps. 861 * if no reserve mapping overlaps.
854 */ 862 */
855 chg = vma_needs_reservation(vma, addr); 863 chg = vma_needs_reservation(h, vma, addr);
856 if (chg < 0) 864 if (chg < 0)
857 return ERR_PTR(chg); 865 return ERR_PTR(chg);
858 if (chg) 866 if (chg)
@@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
860 return ERR_PTR(-ENOSPC); 868 return ERR_PTR(-ENOSPC);
861 869
862 spin_lock(&hugetlb_lock); 870 spin_lock(&hugetlb_lock);
863 page = dequeue_huge_page_vma(vma, addr, avoid_reserve); 871 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
864 spin_unlock(&hugetlb_lock); 872 spin_unlock(&hugetlb_lock);
865 873
866 if (!page) { 874 if (!page) {
867 page = alloc_buddy_huge_page(vma, addr); 875 page = alloc_buddy_huge_page(h, vma, addr);
868 if (!page) { 876 if (!page) {
869 hugetlb_put_quota(inode->i_mapping, chg); 877 hugetlb_put_quota(inode->i_mapping, chg);
870 return ERR_PTR(-VM_FAULT_OOM); 878 return ERR_PTR(-VM_FAULT_OOM);
@@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
874 set_page_refcounted(page); 882 set_page_refcounted(page);
875 set_page_private(page, (unsigned long) mapping); 883 set_page_private(page, (unsigned long) mapping);
876 884
877 vma_commit_reservation(vma, addr); 885 vma_commit_reservation(h, vma, addr);
878 886
879 return page; 887 return page;
880} 888}
@@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
882static int __init hugetlb_init(void) 890static int __init hugetlb_init(void)
883{ 891{
884 unsigned long i; 892 unsigned long i;
893 struct hstate *h = &default_hstate;
885 894
886 if (HPAGE_SHIFT == 0) 895 if (HPAGE_SHIFT == 0)
887 return 0; 896 return 0;
888 897
898 if (!h->order) {
899 h->order = HPAGE_SHIFT - PAGE_SHIFT;
900 h->mask = HPAGE_MASK;
901 }
902
889 for (i = 0; i < MAX_NUMNODES; ++i) 903 for (i = 0; i < MAX_NUMNODES; ++i)
890 INIT_LIST_HEAD(&hugepage_freelists[i]); 904 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
891 905
892 hugetlb_next_nid = first_node(node_online_map); 906 h->hugetlb_next_nid = first_node(node_online_map);
893 907
894 for (i = 0; i < max_huge_pages; ++i) { 908 for (i = 0; i < max_huge_pages; ++i) {
895 if (!alloc_fresh_huge_page()) 909 if (!alloc_fresh_huge_page(h))
896 break; 910 break;
897 } 911 }
898 max_huge_pages = free_huge_pages = nr_huge_pages = i; 912 max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
899 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 913 printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n",
914 h->free_huge_pages);
900 return 0; 915 return 0;
901} 916}
902module_init(hugetlb_init); 917module_init(hugetlb_init);
@@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
922 937
923#ifdef CONFIG_SYSCTL 938#ifdef CONFIG_SYSCTL
924#ifdef CONFIG_HIGHMEM 939#ifdef CONFIG_HIGHMEM
925static void try_to_free_low(unsigned long count) 940static void try_to_free_low(struct hstate *h, unsigned long count)
926{ 941{
927 int i; 942 int i;
928 943
929 for (i = 0; i < MAX_NUMNODES; ++i) { 944 for (i = 0; i < MAX_NUMNODES; ++i) {
930 struct page *page, *next; 945 struct page *page, *next;
931 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 946 struct list_head *freel = &h->hugepage_freelists[i];
932 if (count >= nr_huge_pages) 947 list_for_each_entry_safe(page, next, freel, lru) {
948 if (count >= h->nr_huge_pages)
933 return; 949 return;
934 if (PageHighMem(page)) 950 if (PageHighMem(page))
935 continue; 951 continue;
936 list_del(&page->lru); 952 list_del(&page->lru);
937 update_and_free_page(page); 953 update_and_free_page(page);
938 free_huge_pages--; 954 h->free_huge_pages--;
939 free_huge_pages_node[page_to_nid(page)]--; 955 h->free_huge_pages_node[page_to_nid(page)]--;
940 } 956 }
941 } 957 }
942} 958}
943#else 959#else
944static inline void try_to_free_low(unsigned long count) 960static inline void try_to_free_low(struct hstate *h, unsigned long count)
945{ 961{
946} 962}
947#endif 963#endif
948 964
949#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 965#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
950static unsigned long set_max_huge_pages(unsigned long count) 966static unsigned long set_max_huge_pages(unsigned long count)
951{ 967{
952 unsigned long min_count, ret; 968 unsigned long min_count, ret;
969 struct hstate *h = &default_hstate;
953 970
954 /* 971 /*
955 * Increase the pool size 972 * Increase the pool size
@@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
963 * within all the constraints specified by the sysctls. 980 * within all the constraints specified by the sysctls.
964 */ 981 */
965 spin_lock(&hugetlb_lock); 982 spin_lock(&hugetlb_lock);
966 while (surplus_huge_pages && count > persistent_huge_pages) { 983 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
967 if (!adjust_pool_surplus(-1)) 984 if (!adjust_pool_surplus(h, -1))
968 break; 985 break;
969 } 986 }
970 987
971 while (count > persistent_huge_pages) { 988 while (count > persistent_huge_pages(h)) {
972 /* 989 /*
973 * If this allocation races such that we no longer need the 990 * If this allocation races such that we no longer need the
974 * page, free_huge_page will handle it by freeing the page 991 * page, free_huge_page will handle it by freeing the page
975 * and reducing the surplus. 992 * and reducing the surplus.
976 */ 993 */
977 spin_unlock(&hugetlb_lock); 994 spin_unlock(&hugetlb_lock);
978 ret = alloc_fresh_huge_page(); 995 ret = alloc_fresh_huge_page(h);
979 spin_lock(&hugetlb_lock); 996 spin_lock(&hugetlb_lock);
980 if (!ret) 997 if (!ret)
981 goto out; 998 goto out;
@@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count)
997 * and won't grow the pool anywhere else. Not until one of the 1014 * and won't grow the pool anywhere else. Not until one of the
998 * sysctls are changed, or the surplus pages go out of use. 1015 * sysctls are changed, or the surplus pages go out of use.
999 */ 1016 */
1000 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1017 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
1001 min_count = max(count, min_count); 1018 min_count = max(count, min_count);
1002 try_to_free_low(min_count); 1019 try_to_free_low(h, min_count);
1003 while (min_count < persistent_huge_pages) { 1020 while (min_count < persistent_huge_pages(h)) {
1004 struct page *page = dequeue_huge_page(); 1021 struct page *page = dequeue_huge_page(h);
1005 if (!page) 1022 if (!page)
1006 break; 1023 break;
1007 update_and_free_page(page); 1024 update_and_free_page(h, page);
1008 } 1025 }
1009 while (count < persistent_huge_pages) { 1026 while (count < persistent_huge_pages(h)) {
1010 if (!adjust_pool_surplus(1)) 1027 if (!adjust_pool_surplus(h, 1))
1011 break; 1028 break;
1012 } 1029 }
1013out: 1030out:
1014 ret = persistent_huge_pages; 1031 ret = persistent_huge_pages(h);
1015 spin_unlock(&hugetlb_lock); 1032 spin_unlock(&hugetlb_lock);
1016 return ret; 1033 return ret;
1017} 1034}
@@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1041 struct file *file, void __user *buffer, 1058 struct file *file, void __user *buffer,
1042 size_t *length, loff_t *ppos) 1059 size_t *length, loff_t *ppos)
1043{ 1060{
1061 struct hstate *h = &default_hstate;
1044 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1062 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
1045 spin_lock(&hugetlb_lock); 1063 spin_lock(&hugetlb_lock);
1046 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1064 h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
1047 spin_unlock(&hugetlb_lock); 1065 spin_unlock(&hugetlb_lock);
1048 return 0; 1066 return 0;
1049} 1067}
@@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1052 1070
1053int hugetlb_report_meminfo(char *buf) 1071int hugetlb_report_meminfo(char *buf)
1054{ 1072{
1073 struct hstate *h = &default_hstate;
1055 return sprintf(buf, 1074 return sprintf(buf,
1056 "HugePages_Total: %5lu\n" 1075 "HugePages_Total: %5lu\n"
1057 "HugePages_Free: %5lu\n" 1076 "HugePages_Free: %5lu\n"
1058 "HugePages_Rsvd: %5lu\n" 1077 "HugePages_Rsvd: %5lu\n"
1059 "HugePages_Surp: %5lu\n" 1078 "HugePages_Surp: %5lu\n"
1060 "Hugepagesize: %5lu kB\n", 1079 "Hugepagesize: %5lu kB\n",
1061 nr_huge_pages, 1080 h->nr_huge_pages,
1062 free_huge_pages, 1081 h->free_huge_pages,
1063 resv_huge_pages, 1082 h->resv_huge_pages,
1064 surplus_huge_pages, 1083 h->surplus_huge_pages,
1065 HPAGE_SIZE/1024); 1084 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
1066} 1085}
1067 1086
1068int hugetlb_report_node_meminfo(int nid, char *buf) 1087int hugetlb_report_node_meminfo(int nid, char *buf)
1069{ 1088{
1089 struct hstate *h = &default_hstate;
1070 return sprintf(buf, 1090 return sprintf(buf,
1071 "Node %d HugePages_Total: %5u\n" 1091 "Node %d HugePages_Total: %5u\n"
1072 "Node %d HugePages_Free: %5u\n" 1092 "Node %d HugePages_Free: %5u\n"
1073 "Node %d HugePages_Surp: %5u\n", 1093 "Node %d HugePages_Surp: %5u\n",
1074 nid, nr_huge_pages_node[nid], 1094 nid, h->nr_huge_pages_node[nid],
1075 nid, free_huge_pages_node[nid], 1095 nid, h->free_huge_pages_node[nid],
1076 nid, surplus_huge_pages_node[nid]); 1096 nid, h->surplus_huge_pages_node[nid]);
1077} 1097}
1078 1098
1079/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1099/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
1080unsigned long hugetlb_total_pages(void) 1100unsigned long hugetlb_total_pages(void)
1081{ 1101{
1082 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1102 struct hstate *h = &default_hstate;
1103 return h->nr_huge_pages * pages_per_huge_page(h);
1083} 1104}
1084 1105
1085static int hugetlb_acct_memory(long delta) 1106static int hugetlb_acct_memory(struct hstate *h, long delta)
1086{ 1107{
1087 int ret = -ENOMEM; 1108 int ret = -ENOMEM;
1088 1109
@@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta)
1105 * semantics that cpuset has. 1126 * semantics that cpuset has.
1106 */ 1127 */
1107 if (delta > 0) { 1128 if (delta > 0) {
1108 if (gather_surplus_pages(delta) < 0) 1129 if (gather_surplus_pages(h, delta) < 0)
1109 goto out; 1130 goto out;
1110 1131
1111 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 1132 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1112 return_unused_surplus_pages(delta); 1133 return_unused_surplus_pages(h, delta);
1113 goto out; 1134 goto out;
1114 } 1135 }
1115 } 1136 }
1116 1137
1117 ret = 0; 1138 ret = 0;
1118 if (delta < 0) 1139 if (delta < 0)
1119 return_unused_surplus_pages((unsigned long) -delta); 1140 return_unused_surplus_pages(h, (unsigned long) -delta);
1120 1141
1121out: 1142out:
1122 spin_unlock(&hugetlb_lock); 1143 spin_unlock(&hugetlb_lock);
@@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1141 1162
1142static void hugetlb_vm_op_close(struct vm_area_struct *vma) 1163static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1143{ 1164{
1165 struct hstate *h = hstate_vma(vma);
1144 struct resv_map *reservations = vma_resv_map(vma); 1166 struct resv_map *reservations = vma_resv_map(vma);
1145 unsigned long reserve; 1167 unsigned long reserve;
1146 unsigned long start; 1168 unsigned long start;
1147 unsigned long end; 1169 unsigned long end;
1148 1170
1149 if (reservations) { 1171 if (reservations) {
1150 start = vma_hugecache_offset(vma, vma->vm_start); 1172 start = vma_hugecache_offset(h, vma, vma->vm_start);
1151 end = vma_hugecache_offset(vma, vma->vm_end); 1173 end = vma_hugecache_offset(h, vma, vma->vm_end);
1152 1174
1153 reserve = (end - start) - 1175 reserve = (end - start) -
1154 region_count(&reservations->regions, start, end); 1176 region_count(&reservations->regions, start, end);
@@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1156 kref_put(&reservations->refs, resv_map_release); 1178 kref_put(&reservations->refs, resv_map_release);
1157 1179
1158 if (reserve) 1180 if (reserve)
1159 hugetlb_acct_memory(-reserve); 1181 hugetlb_acct_memory(h, -reserve);
1160 } 1182 }
1161} 1183}
1162 1184
@@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
1214 struct page *ptepage; 1236 struct page *ptepage;
1215 unsigned long addr; 1237 unsigned long addr;
1216 int cow; 1238 int cow;
1239 struct hstate *h = hstate_vma(vma);
1240 unsigned long sz = huge_page_size(h);
1217 1241
1218 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1242 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
1219 1243
1220 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1244 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
1221 src_pte = huge_pte_offset(src, addr); 1245 src_pte = huge_pte_offset(src, addr);
1222 if (!src_pte) 1246 if (!src_pte)
1223 continue; 1247 continue;
1224 dst_pte = huge_pte_alloc(dst, addr); 1248 dst_pte = huge_pte_alloc(dst, addr, sz);
1225 if (!dst_pte) 1249 if (!dst_pte)
1226 goto nomem; 1250 goto nomem;
1227 1251
@@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1257 pte_t pte; 1281 pte_t pte;
1258 struct page *page; 1282 struct page *page;
1259 struct page *tmp; 1283 struct page *tmp;
1284 struct hstate *h = hstate_vma(vma);
1285 unsigned long sz = huge_page_size(h);
1286
1260 /* 1287 /*
1261 * A page gathering list, protected by per file i_mmap_lock. The 1288 * A page gathering list, protected by per file i_mmap_lock. The
1262 * lock is used to avoid list corruption from multiple unmapping 1289 * lock is used to avoid list corruption from multiple unmapping
@@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1265 LIST_HEAD(page_list); 1292 LIST_HEAD(page_list);
1266 1293
1267 WARN_ON(!is_vm_hugetlb_page(vma)); 1294 WARN_ON(!is_vm_hugetlb_page(vma));
1268 BUG_ON(start & ~HPAGE_MASK); 1295 BUG_ON(start & ~huge_page_mask(h));
1269 BUG_ON(end & ~HPAGE_MASK); 1296 BUG_ON(end & ~huge_page_mask(h));
1270 1297
1271 spin_lock(&mm->page_table_lock); 1298 spin_lock(&mm->page_table_lock);
1272 for (address = start; address < end; address += HPAGE_SIZE) { 1299 for (address = start; address < end; address += sz) {
1273 ptep = huge_pte_offset(mm, address); 1300 ptep = huge_pte_offset(mm, address);
1274 if (!ptep) 1301 if (!ptep)
1275 continue; 1302 continue;
@@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
1383 unsigned long address, pte_t *ptep, pte_t pte, 1410 unsigned long address, pte_t *ptep, pte_t pte,
1384 struct page *pagecache_page) 1411 struct page *pagecache_page)
1385{ 1412{
1413 struct hstate *h = hstate_vma(vma);
1386 struct page *old_page, *new_page; 1414 struct page *old_page, *new_page;
1387 int avoidcopy; 1415 int avoidcopy;
1388 int outside_reserve = 0; 1416 int outside_reserve = 0;
@@ -1443,7 +1471,7 @@ retry_avoidcopy:
1443 __SetPageUptodate(new_page); 1471 __SetPageUptodate(new_page);
1444 spin_lock(&mm->page_table_lock); 1472 spin_lock(&mm->page_table_lock);
1445 1473
1446 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1474 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
1447 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1475 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
1448 /* Break COW */ 1476 /* Break COW */
1449 huge_ptep_clear_flush(vma, address, ptep); 1477 huge_ptep_clear_flush(vma, address, ptep);
@@ -1458,14 +1486,14 @@ retry_avoidcopy:
1458} 1486}
1459 1487
1460/* Return the pagecache page at a given address within a VMA */ 1488/* Return the pagecache page at a given address within a VMA */
1461static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, 1489static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1462 unsigned long address) 1490 struct vm_area_struct *vma, unsigned long address)
1463{ 1491{
1464 struct address_space *mapping; 1492 struct address_space *mapping;
1465 pgoff_t idx; 1493 pgoff_t idx;
1466 1494
1467 mapping = vma->vm_file->f_mapping; 1495 mapping = vma->vm_file->f_mapping;
1468 idx = vma_hugecache_offset(vma, address); 1496 idx = vma_hugecache_offset(h, vma, address);
1469 1497
1470 return find_lock_page(mapping, idx); 1498 return find_lock_page(mapping, idx);
1471} 1499}
@@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
1473static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1501static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1474 unsigned long address, pte_t *ptep, int write_access) 1502 unsigned long address, pte_t *ptep, int write_access)
1475{ 1503{
1504 struct hstate *h = hstate_vma(vma);
1476 int ret = VM_FAULT_SIGBUS; 1505 int ret = VM_FAULT_SIGBUS;
1477 pgoff_t idx; 1506 pgoff_t idx;
1478 unsigned long size; 1507 unsigned long size;
@@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1493 } 1522 }
1494 1523
1495 mapping = vma->vm_file->f_mapping; 1524 mapping = vma->vm_file->f_mapping;
1496 idx = vma_hugecache_offset(vma, address); 1525 idx = vma_hugecache_offset(h, vma, address);
1497 1526
1498 /* 1527 /*
1499 * Use page lock to guard against racing truncation 1528 * Use page lock to guard against racing truncation
@@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1502retry: 1531retry:
1503 page = find_lock_page(mapping, idx); 1532 page = find_lock_page(mapping, idx);
1504 if (!page) { 1533 if (!page) {
1505 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1534 size = i_size_read(mapping->host) >> huge_page_shift(h);
1506 if (idx >= size) 1535 if (idx >= size)
1507 goto out; 1536 goto out;
1508 page = alloc_huge_page(vma, address, 0); 1537 page = alloc_huge_page(vma, address, 0);
@@ -1510,7 +1539,7 @@ retry:
1510 ret = -PTR_ERR(page); 1539 ret = -PTR_ERR(page);
1511 goto out; 1540 goto out;
1512 } 1541 }
1513 clear_huge_page(page, address); 1542 clear_huge_page(page, address, huge_page_size(h));
1514 __SetPageUptodate(page); 1543 __SetPageUptodate(page);
1515 1544
1516 if (vma->vm_flags & VM_SHARED) { 1545 if (vma->vm_flags & VM_SHARED) {
@@ -1526,14 +1555,14 @@ retry:
1526 } 1555 }
1527 1556
1528 spin_lock(&inode->i_lock); 1557 spin_lock(&inode->i_lock);
1529 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1558 inode->i_blocks += blocks_per_huge_page(h);
1530 spin_unlock(&inode->i_lock); 1559 spin_unlock(&inode->i_lock);
1531 } else 1560 } else
1532 lock_page(page); 1561 lock_page(page);
1533 } 1562 }
1534 1563
1535 spin_lock(&mm->page_table_lock); 1564 spin_lock(&mm->page_table_lock);
1536 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1565 size = i_size_read(mapping->host) >> huge_page_shift(h);
1537 if (idx >= size) 1566 if (idx >= size)
1538 goto backout; 1567 goto backout;
1539 1568
@@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1569 pte_t entry; 1598 pte_t entry;
1570 int ret; 1599 int ret;
1571 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1600 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1601 struct hstate *h = hstate_vma(vma);
1572 1602
1573 ptep = huge_pte_alloc(mm, address); 1603 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1574 if (!ptep) 1604 if (!ptep)
1575 return VM_FAULT_OOM; 1605 return VM_FAULT_OOM;
1576 1606
@@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1594 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 1624 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1595 if (write_access && !pte_write(entry)) { 1625 if (write_access && !pte_write(entry)) {
1596 struct page *page; 1626 struct page *page;
1597 page = hugetlbfs_pagecache_page(vma, address); 1627 page = hugetlbfs_pagecache_page(h, vma, address);
1598 ret = hugetlb_cow(mm, vma, address, ptep, entry, page); 1628 ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
1599 if (page) { 1629 if (page) {
1600 unlock_page(page); 1630 unlock_page(page);
@@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1615 unsigned long pfn_offset; 1645 unsigned long pfn_offset;
1616 unsigned long vaddr = *position; 1646 unsigned long vaddr = *position;
1617 int remainder = *length; 1647 int remainder = *length;
1648 struct hstate *h = hstate_vma(vma);
1618 1649
1619 spin_lock(&mm->page_table_lock); 1650 spin_lock(&mm->page_table_lock);
1620 while (vaddr < vma->vm_end && remainder) { 1651 while (vaddr < vma->vm_end && remainder) {
@@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1626 * each hugepage. We have to make * sure we get the 1657 * each hugepage. We have to make * sure we get the
1627 * first, for the page indexing below to work. 1658 * first, for the page indexing below to work.
1628 */ 1659 */
1629 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 1660 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1630 1661
1631 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 1662 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1632 (write && !pte_write(huge_ptep_get(pte)))) { 1663 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1644 break; 1675 break;
1645 } 1676 }
1646 1677
1647 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 1678 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1648 page = pte_page(huge_ptep_get(pte)); 1679 page = pte_page(huge_ptep_get(pte));
1649same_page: 1680same_page:
1650 if (pages) { 1681 if (pages) {
@@ -1660,7 +1691,7 @@ same_page:
1660 --remainder; 1691 --remainder;
1661 ++i; 1692 ++i;
1662 if (vaddr < vma->vm_end && remainder && 1693 if (vaddr < vma->vm_end && remainder &&
1663 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 1694 pfn_offset < pages_per_huge_page(h)) {
1664 /* 1695 /*
1665 * We use pfn_offset to avoid touching the pageframes 1696 * We use pfn_offset to avoid touching the pageframes
1666 * of this compound page. 1697 * of this compound page.
@@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1682 unsigned long start = address; 1713 unsigned long start = address;
1683 pte_t *ptep; 1714 pte_t *ptep;
1684 pte_t pte; 1715 pte_t pte;
1716 struct hstate *h = hstate_vma(vma);
1685 1717
1686 BUG_ON(address >= end); 1718 BUG_ON(address >= end);
1687 flush_cache_range(vma, address, end); 1719 flush_cache_range(vma, address, end);
1688 1720
1689 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1721 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1690 spin_lock(&mm->page_table_lock); 1722 spin_lock(&mm->page_table_lock);
1691 for (; address < end; address += HPAGE_SIZE) { 1723 for (; address < end; address += huge_page_size(h)) {
1692 ptep = huge_pte_offset(mm, address); 1724 ptep = huge_pte_offset(mm, address);
1693 if (!ptep) 1725 if (!ptep)
1694 continue; 1726 continue;
@@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode,
1711 struct vm_area_struct *vma) 1743 struct vm_area_struct *vma)
1712{ 1744{
1713 long ret, chg; 1745 long ret, chg;
1746 struct hstate *h = hstate_inode(inode);
1714 1747
1715 if (vma && vma->vm_flags & VM_NORESERVE) 1748 if (vma && vma->vm_flags & VM_NORESERVE)
1716 return 0; 1749 return 0;
@@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode,
1739 1772
1740 if (hugetlb_get_quota(inode->i_mapping, chg)) 1773 if (hugetlb_get_quota(inode->i_mapping, chg))
1741 return -ENOSPC; 1774 return -ENOSPC;
1742 ret = hugetlb_acct_memory(chg); 1775 ret = hugetlb_acct_memory(h, chg);
1743 if (ret < 0) { 1776 if (ret < 0) {
1744 hugetlb_put_quota(inode->i_mapping, chg); 1777 hugetlb_put_quota(inode->i_mapping, chg);
1745 return ret; 1778 return ret;
@@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode,
1751 1784
1752void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 1785void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1753{ 1786{
1787 struct hstate *h = hstate_inode(inode);
1754 long chg = region_truncate(&inode->i_mapping->private_list, offset); 1788 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1755 1789
1756 spin_lock(&inode->i_lock); 1790 spin_lock(&inode->i_lock);
1757 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 1791 inode->i_blocks -= blocks_per_huge_page(h);
1758 spin_unlock(&inode->i_lock); 1792 spin_unlock(&inode->i_lock);
1759 1793
1760 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 1794 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1761 hugetlb_acct_memory(-(chg - freed)); 1795 hugetlb_acct_memory(h, -(chg - freed));
1762} 1796}
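
The remaining hunks, in mm/memory.c, mm/mempolicy.c and mm/mmap.c, all apply
the same conversion: look up the hstate for the mapping and replace the fixed
HPAGE_SIZE/HPAGE_MASK/HPAGE_SHIFT constants with the per-hstate accessors. A
minimal, hypothetical illustration of that calling pattern (walk_huge_vma()
is not part of the patch; it just mirrors the loops converted below):

/*
 * Hypothetical example of the caller-side pattern: step through a
 * hugetlb VMA in huge-page-sized units derived from its hstate
 * instead of from the global HPAGE_* constants.
 */
static void walk_huge_vma(struct vm_area_struct *vma)
{
	struct hstate *h = hstate_vma(vma);	/* per-mapping huge page state */
	unsigned long sz = huge_page_size(h);	/* was HPAGE_SIZE */
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
		/* was: huge_pte_offset(mm, addr & HPAGE_MASK) */
		pte_t *ptep = huge_pte_offset(vma->vm_mm,
					      addr & huge_page_mask(h));

		if (!ptep)
			continue;
		/* ... examine or update the huge PTE here ... */
	}
}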
diff --git a/mm/memory.c b/mm/memory.c
index 72932489a082..c1c1d6d8c22b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
903 if (unlikely(is_vm_hugetlb_page(vma))) { 903 if (unlikely(is_vm_hugetlb_page(vma))) {
904 unmap_hugepage_range(vma, start, end, NULL); 904 unmap_hugepage_range(vma, start, end, NULL);
905 zap_work -= (end - start) / 905 zap_work -= (end - start) /
906 (HPAGE_SIZE / PAGE_SIZE); 906 pages_per_huge_page(hstate_vma(vma));
907 start = end; 907 start = end;
908 } else 908 } else
909 start = unmap_page_range(*tlbp, vma, 909 start = unmap_page_range(*tlbp, vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..e550bec20582 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1481
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1484 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1485 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1486 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1487 if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2220{
2221 unsigned long addr; 2221 unsigned long addr;
2222 struct page *page; 2222 struct page *page;
2223 struct hstate *h = hstate_vma(vma);
2224 unsigned long sz = huge_page_size(h);
2223 2225
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2226 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2227 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2228 addr & huge_page_mask(h));
2226 pte_t pte; 2229 pte_t pte;
2227 2230
2228 if (!ptep) 2231 if (!ptep)
diff --git a/mm/mmap.c b/mm/mmap.c
index 57d3b6097deb..5e0cc99e9cd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1812,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1812 struct mempolicy *pol; 1812 struct mempolicy *pol;
1813 struct vm_area_struct *new; 1813 struct vm_area_struct *new;
1814 1814
1815 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1815 if (is_vm_hugetlb_page(vma) && (addr &
1816 ~(huge_page_mask(hstate_vma(vma)))))
1816 return -EINVAL; 1817 return -EINVAL;
1817 1818
1818 if (mm->map_count >= sysctl_max_map_count) 1819 if (mm->map_count >= sysctl_max_map_count)