Diffstat (limited to 'mm')
-rw-r--r--	mm/hugetlb.c	| 368
-rw-r--r--	mm/memory.c	|   2
-rw-r--r--	mm/mempolicy.c	|   9
-rw-r--r--	mm/mmap.c	|   3
4 files changed, 210 insertions(+), 172 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32dff4290c66..0d8153e25f09 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,18 +22,12 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
-static unsigned long surplus_huge_pages;
-static unsigned long nr_overcommit_huge_pages;
 unsigned long max_huge_pages;
 unsigned long sysctl_overcommit_huge_pages;
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static unsigned int nr_huge_pages_node[MAX_NUMNODES];
-static unsigned int free_huge_pages_node[MAX_NUMNODES];
-static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
-static int hugetlb_next_nid;
+
+struct hstate default_hstate;
 
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t)
  * Convert the address within this vma to the page offset within
  * the mapping, in pagecache page units; huge pages here.
  */
-static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma,
-			unsigned long address)
+static pgoff_t vma_hugecache_offset(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
-	return ((address - vma->vm_start) >> HPAGE_SHIFT) +
-			(vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+	return ((address - vma->vm_start) >> huge_page_shift(h)) +
+			(vma->vm_pgoff >> huge_page_order(h));
 }
 
 /*
@@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 }
 
 /* Decrement the reserved pages in the hugepage pool by one */
-static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+static void decrement_hugepage_resv_vma(struct hstate *h,
+			struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & VM_NORESERVE)
 		return;
 
 	if (vma->vm_flags & VM_SHARED) {
 		/* Shared mappings always use reserves */
-		resv_huge_pages--;
+		h->resv_huge_pages--;
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
 		/*
 		 * Only the process that called mmap() has reserves for
 		 * private mappings.
 		 */
-		resv_huge_pages--;
+		h->resv_huge_pages--;
 	}
 }
 
@@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma)
 	return 1;
 }
 
-static void clear_huge_page(struct page *page, unsigned long addr)
+static void clear_huge_page(struct page *page,
+			unsigned long addr, unsigned long sz)
 {
 	int i;
 
 	might_sleep();
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
 		clear_user_highpage(page + i, addr + i * PAGE_SIZE);
 	}
@@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src,
 		unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
+	struct hstate *h = hstate_vma(vma);
 
 	might_sleep();
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
 		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
 	}
 }
 
-static void enqueue_huge_page(struct page *page)
+static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &hugepage_freelists[nid]);
-	free_huge_pages++;
-	free_huge_pages_node[nid]++;
+	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	h->free_huge_pages++;
+	h->free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct hstate *h)
 {
 	int nid;
 	struct page *page = NULL;
 
 	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
+		if (!list_empty(&h->hugepage_freelists[nid])) {
+			page = list_entry(h->hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			h->free_huge_pages--;
+			h->free_huge_pages_node[nid]--;
 			break;
 		}
 	}
 	return page;
 }
 
-static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
+static struct page *dequeue_huge_page_vma(struct hstate *h,
+				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
 	int nid;
@@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	 * not "stolen". The child may still get SIGKILLed
 	 */
 	if (!vma_has_private_reserves(vma) &&
-			free_huge_pages - resv_huge_pages == 0)
+			h->free_huge_pages - h->resv_huge_pages == 0)
 		return NULL;
 
 	/* If reserves cannot be used, ensure enough pages are in the pool */
-	if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 		return NULL;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
+		    !list_empty(&h->hugepage_freelists[nid])) {
+			page = list_entry(h->hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
+			h->free_huge_pages--;
+			h->free_huge_pages_node[nid]--;
 
 			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(vma);
+				decrement_hugepage_resv_vma(h, vma);
 
 			break;
 		}
@@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	return page;
 }
 
-static void update_and_free_page(struct page *page)
+static void update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
-	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
-	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+
+	h->nr_huge_pages--;
+	h->nr_huge_pages_node[page_to_nid(page)]--;
+	for (i = 0; i < pages_per_huge_page(h); i++) {
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
@@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page)
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
+	__free_pages(page, huge_page_order(h));
 }
 
 static void free_huge_page(struct page *page)
 {
+	/*
+	 * Can't pass hstate in here because it is called from the
+	 * compound page destructor.
+	 */
+	struct hstate *h = &default_hstate;
 	int nid = page_to_nid(page);
 	struct address_space *mapping;
 
@@ -466,12 +470,12 @@ static void free_huge_page(struct page *page)
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
-	if (surplus_huge_pages_node[nid]) {
-		update_and_free_page(page);
-		surplus_huge_pages--;
-		surplus_huge_pages_node[nid]--;
+	if (h->surplus_huge_pages_node[nid]) {
+		update_and_free_page(h, page);
+		h->surplus_huge_pages--;
+		h->surplus_huge_pages_node[nid]--;
 	} else {
-		enqueue_huge_page(page);
+		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
 	if (mapping)
@@ -483,7 +487,7 @@ static void free_huge_page(struct page *page)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(int delta)
+static int adjust_pool_surplus(struct hstate *h, int delta)
 {
 	static int prev_nid;
 	int nid = prev_nid;
@@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta)
 			nid = first_node(node_online_map);
 
 		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !surplus_huge_pages_node[nid])
+		if (delta < 0 && !h->surplus_huge_pages_node[nid])
 			continue;
 		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && surplus_huge_pages_node[nid] >=
-						nr_huge_pages_node[nid])
+		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+						h->nr_huge_pages_node[nid])
 			continue;
 
-		surplus_huge_pages += delta;
-		surplus_huge_pages_node[nid] += delta;
+		h->surplus_huge_pages += delta;
+		h->surplus_huge_pages_node[nid] += delta;
 		ret = 1;
 		break;
 	} while (nid != prev_nid);
@@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta)
 	return ret;
 }
 
-static void prep_new_huge_page(struct page *page, int nid)
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
-	nr_huge_pages++;
-	nr_huge_pages_node[nid]++;
+	h->nr_huge_pages++;
+	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
 	put_page(page); /* free it into the hugepage allocator */
 }
 
-static struct page *alloc_fresh_huge_page_node(int nid)
+static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
 
 	page = alloc_pages_node(nid,
 		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
 						__GFP_REPEAT|__GFP_NOWARN,
-		HUGETLB_PAGE_ORDER);
+		huge_page_order(h));
 	if (page) {
 		if (arch_prepare_hugepage(page)) {
 			__free_pages(page, HUGETLB_PAGE_ORDER);
 			return NULL;
 		}
-		prep_new_huge_page(page, nid);
+		prep_new_huge_page(h, page, nid);
 	}
 
 	return page;
 }
 
-static int alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(struct hstate *h)
 {
 	struct page *page;
 	int start_nid;
 	int next_nid;
 	int ret = 0;
 
-	start_nid = hugetlb_next_nid;
+	start_nid = h->hugetlb_next_nid;
 
 	do {
-		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
+		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
 		if (page)
 			ret = 1;
 		/*
@@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void)
 		 * if we just successfully allocated a hugepage so that
 		 * the next caller gets hugepages on the next node.
 		 */
-		next_nid = next_node(hugetlb_next_nid, node_online_map);
+		next_nid = next_node(h->hugetlb_next_nid, node_online_map);
 		if (next_nid == MAX_NUMNODES)
 			next_nid = first_node(node_online_map);
-		hugetlb_next_nid = next_nid;
-	} while (!page && hugetlb_next_nid != start_nid);
+		h->hugetlb_next_nid = next_nid;
+	} while (!page && h->hugetlb_next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void)
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
-						unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
 	struct page *page;
 	unsigned int nid;
@@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 	 * per-node value is checked there.
 	 */
 	spin_lock(&hugetlb_lock);
-	if (surplus_huge_pages >= nr_overcommit_huge_pages) {
+	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
 	} else {
-		nr_huge_pages++;
-		surplus_huge_pages++;
+		h->nr_huge_pages++;
+		h->surplus_huge_pages++;
 	}
 	spin_unlock(&hugetlb_lock);
 
 	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
 					__GFP_REPEAT|__GFP_NOWARN,
-					HUGETLB_PAGE_ORDER);
+					huge_page_order(h));
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
@@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 		/*
 		 * We incremented the global counters already
 		 */
-		nr_huge_pages_node[nid]++;
-		surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[nid]++;
+		h->surplus_huge_pages_node[nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
-		nr_huge_pages--;
-		surplus_huge_pages--;
+		h->nr_huge_pages--;
+		h->surplus_huge_pages--;
 		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
 	}
 	spin_unlock(&hugetlb_lock);
@@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
  */
-static int gather_surplus_pages(int delta)
+static int gather_surplus_pages(struct hstate *h, int delta)
 {
 	struct list_head surplus_list;
 	struct page *page, *tmp;
 	int ret, i;
 	int needed, allocated;
 
-	needed = (resv_huge_pages + delta) - free_huge_pages;
+	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
 	if (needed <= 0) {
-		resv_huge_pages += delta;
+		h->resv_huge_pages += delta;
 		return 0;
 	}
 
@@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(NULL, 0);
+		page = alloc_buddy_huge_page(h, NULL, 0);
 		if (!page) {
 			/*
 			 * We were not able to allocate enough pages to
@@ -694,7 +698,8 @@ retry:
 	 * because either resv_huge_pages or free_huge_pages may have changed.
 	 */
 	spin_lock(&hugetlb_lock);
-	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+	needed = (h->resv_huge_pages + delta) -
+			(h->free_huge_pages + allocated);
 	if (needed > 0)
 		goto retry;
 
@@ -707,7 +712,7 @@ retry:
 	 * before they are reserved.
 	 */
 	needed += allocated;
-	resv_huge_pages += delta;
+	h->resv_huge_pages += delta;
 	ret = 0;
 free:
 	/* Free the needed pages to the hugetlb pool */
@@ -715,7 +720,7 @@ free:
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
-		enqueue_huge_page(page);
+		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
@@ -743,7 +748,8 @@ free:
  * allocated to satisfy the reservation must be explicitly freed if they were
  * never used.
  */
-static void return_unused_surplus_pages(unsigned long unused_resv_pages)
+static void return_unused_surplus_pages(struct hstate *h,
+					unsigned long unused_resv_pages)
 {
 	static int nid = -1;
 	struct page *page;
@@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 	unsigned long remaining_iterations = num_online_nodes();
 
 	/* Uncommit the reservation */
-	resv_huge_pages -= unused_resv_pages;
+	h->resv_huge_pages -= unused_resv_pages;
 
-	nr_pages = min(unused_resv_pages, surplus_huge_pages);
+	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
 	while (remaining_iterations-- && nr_pages) {
 		nid = next_node(nid, node_online_map);
 		if (nid == MAX_NUMNODES)
 			nid = first_node(node_online_map);
 
-		if (!surplus_huge_pages_node[nid])
+		if (!h->surplus_huge_pages_node[nid])
 			continue;
 
-		if (!list_empty(&hugepage_freelists[nid])) {
-			page = list_entry(hugepage_freelists[nid].next,
+		if (!list_empty(&h->hugepage_freelists[nid])) {
+			page = list_entry(h->hugepage_freelists[nid].next,
 					  struct page, lru);
 			list_del(&page->lru);
-			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[nid]--;
-			surplus_huge_pages--;
-			surplus_huge_pages_node[nid]--;
+			update_and_free_page(h, page);
+			h->free_huge_pages--;
+			h->free_huge_pages_node[nid]--;
+			h->surplus_huge_pages--;
+			h->surplus_huge_pages_node[nid]--;
 			nr_pages--;
 			remaining_iterations = num_online_nodes();
 		}
@@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 * an instantiated the change should be committed via vma_commit_reservation.
 * No action is required on failure.
 */
-static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
+static int vma_needs_reservation(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 
 	if (vma->vm_flags & VM_SHARED) {
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		return region_chg(&inode->i_mapping->private_list,
 							idx, idx + 1);
 
@@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
 
 	} else {
 		int err;
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
 		err = region_chg(&reservations->regions, idx, idx + 1);
@@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
 		return 0;
 	}
 }
-static void vma_commit_reservation(struct vm_area_struct *vma,
-			unsigned long addr)
+static void vma_commit_reservation(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
 
 	if (vma->vm_flags & VM_SHARED) {
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		region_add(&inode->i_mapping->private_list, idx, idx + 1);
 
 	} else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
-		pgoff_t idx = vma_hugecache_offset(vma, addr);
+		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
 		struct resv_map *reservations = vma_resv_map(vma);
 
 		/* Mark this page used in the map. */
@@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma,
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr, int avoid_reserve)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
@@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 * MAP_NORESERVE mappings may also need pages and quota allocated
 	 * if no reserve mapping overlaps.
 	 */
-	chg = vma_needs_reservation(vma, addr);
+	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
 		return ERR_PTR(chg);
 	if (chg)
@@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 			return ERR_PTR(-ENOSPC);
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
+	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(vma, addr);
+		page = alloc_buddy_huge_page(h, vma, addr);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_OOM);
@@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
-	vma_commit_reservation(vma, addr);
+	vma_commit_reservation(h, vma, addr);
 
 	return page;
 }
@@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
+	struct hstate *h = &default_hstate;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
 
+	if (!h->order) {
+		h->order = HPAGE_SHIFT - PAGE_SHIFT;
+		h->mask = HPAGE_MASK;
+	}
+
 	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
+		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
 
-	hugetlb_next_nid = first_node(node_online_map);
+	h->hugetlb_next_nid = first_node(node_online_map);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		if (!alloc_fresh_huge_page())
+		if (!alloc_fresh_huge_page(h))
 			break;
 	}
-	max_huge_pages = free_huge_pages = nr_huge_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
+	max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i;
+	printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n",
+			h->free_huge_pages);
 	return 0;
 }
 module_init(hugetlb_init);
@@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 
 #ifdef CONFIG_SYSCTL
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count)
 {
 	int i;
 
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
-		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
-			if (count >= nr_huge_pages)
+		struct list_head *freel = &h->hugepage_freelists[i];
+		list_for_each_entry_safe(page, next, freel, lru) {
+			if (count >= h->nr_huge_pages)
 				return;
 			if (PageHighMem(page))
 				continue;
 			list_del(&page->lru);
 			update_and_free_page(page);
-			free_huge_pages--;
-			free_huge_pages_node[page_to_nid(page)]--;
+			h->free_huge_pages--;
+			h->free_huge_pages_node[page_to_nid(page)]--;
 		}
 	}
 }
 #else
-static inline void try_to_free_low(unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count)
 {
 }
 #endif
 
-#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
+#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	unsigned long min_count, ret;
+	struct hstate *h = &default_hstate;
 
 	/*
 	 * Increase the pool size
@@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * within all the constraints specified by the sysctls.
 	 */
 	spin_lock(&hugetlb_lock);
-	while (surplus_huge_pages && count > persistent_huge_pages) {
-		if (!adjust_pool_surplus(-1))
+	while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
+		if (!adjust_pool_surplus(h, -1))
 			break;
 	}
 
-	while (count > persistent_huge_pages) {
+	while (count > persistent_huge_pages(h)) {
 		/*
 		 * If this allocation races such that we no longer need the
 		 * page, free_huge_page will handle it by freeing the page
 		 * and reducing the surplus.
 		 */
 		spin_unlock(&hugetlb_lock);
-		ret = alloc_fresh_huge_page();
+		ret = alloc_fresh_huge_page(h);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
 	 */
-	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
+	min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
 	min_count = max(count, min_count);
-	try_to_free_low(min_count);
-	while (min_count < persistent_huge_pages) {
-		struct page *page = dequeue_huge_page();
+	try_to_free_low(h, min_count);
+	while (min_count < persistent_huge_pages(h)) {
+		struct page *page = dequeue_huge_page(h);
 		if (!page)
 			break;
-		update_and_free_page(page);
+		update_and_free_page(h, page);
 	}
-	while (count < persistent_huge_pages) {
-		if (!adjust_pool_surplus(1))
+	while (count < persistent_huge_pages(h)) {
+		if (!adjust_pool_surplus(h, 1))
 			break;
 	}
 out:
-	ret = persistent_huge_pages;
+	ret = persistent_huge_pages(h);
 	spin_unlock(&hugetlb_lock);
 	return ret;
 }
@@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 			struct file *file, void __user *buffer,
 			size_t *length, loff_t *ppos)
 {
+	struct hstate *h = &default_hstate;
 	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
 	spin_lock(&hugetlb_lock);
-	nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
+	h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
 	spin_unlock(&hugetlb_lock);
 	return 0;
 }
@@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 int hugetlb_report_meminfo(char *buf)
 {
+	struct hstate *h = &default_hstate;
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free: %5lu\n"
 			"HugePages_Rsvd: %5lu\n"
 			"HugePages_Surp: %5lu\n"
 			"Hugepagesize: %5lu kB\n",
-			nr_huge_pages,
-			free_huge_pages,
-			resv_huge_pages,
-			surplus_huge_pages,
-			HPAGE_SIZE/1024);
+			h->nr_huge_pages,
+			h->free_huge_pages,
+			h->resv_huge_pages,
+			h->surplus_huge_pages,
+			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
 }
 
 int hugetlb_report_node_meminfo(int nid, char *buf)
 {
+	struct hstate *h = &default_hstate;
 	return sprintf(buf,
 		"Node %d HugePages_Total: %5u\n"
 		"Node %d HugePages_Free: %5u\n"
 		"Node %d HugePages_Surp: %5u\n",
-		nid, nr_huge_pages_node[nid],
-		nid, free_huge_pages_node[nid],
-		nid, surplus_huge_pages_node[nid]);
+		nid, h->nr_huge_pages_node[nid],
+		nid, h->free_huge_pages_node[nid],
+		nid, h->surplus_huge_pages_node[nid]);
 }
 
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+	struct hstate *h = &default_hstate;
+	return h->nr_huge_pages * pages_per_huge_page(h);
 }
 
-static int hugetlb_acct_memory(long delta)
+static int hugetlb_acct_memory(struct hstate *h, long delta)
 {
 	int ret = -ENOMEM;
 
@@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta)
 	 * semantics that cpuset has.
 	 */
 	if (delta > 0) {
-		if (gather_surplus_pages(delta) < 0)
+		if (gather_surplus_pages(h, delta) < 0)
 			goto out;
 
-		if (delta > cpuset_mems_nr(free_huge_pages_node)) {
-			return_unused_surplus_pages(delta);
+		if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+			return_unused_surplus_pages(h, delta);
 			goto out;
 		}
 	}
 
 	ret = 0;
 	if (delta < 0)
-		return_unused_surplus_pages((unsigned long) -delta);
+		return_unused_surplus_pages(h, (unsigned long) -delta);
 
 out:
 	spin_unlock(&hugetlb_lock);
@@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct resv_map *reservations = vma_resv_map(vma);
 	unsigned long reserve;
 	unsigned long start;
 	unsigned long end;
 
 	if (reservations) {
-		start = vma_hugecache_offset(vma, vma->vm_start);
-		end = vma_hugecache_offset(vma, vma->vm_end);
+		start = vma_hugecache_offset(h, vma, vma->vm_start);
+		end = vma_hugecache_offset(h, vma, vma->vm_end);
 
 		reserve = (end - start) -
 			region_count(&reservations->regions, start, end);
@@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 		kref_put(&reservations->refs, resv_map_release);
 
 		if (reserve)
-			hugetlb_acct_memory(-reserve);
+			hugetlb_acct_memory(h, -reserve);
 	}
 }
 
@@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	struct page *ptepage;
 	unsigned long addr;
 	int cow;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
 
 	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
 			continue;
-		dst_pte = huge_pte_alloc(dst, addr);
+		dst_pte = huge_pte_alloc(dst, addr, sz);
 		if (!dst_pte)
 			goto nomem;
 
@@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	pte_t pte;
 	struct page *page;
 	struct page *tmp;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
+
 	/*
 	 * A page gathering list, protected by per file i_mmap_lock. The
 	 * lock is used to avoid list corruption from multiple unmapping
@@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	LIST_HEAD(page_list);
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(start & ~HPAGE_MASK);
-	BUG_ON(end & ~HPAGE_MASK);
+	BUG_ON(start & ~huge_page_mask(h));
+	BUG_ON(end & ~huge_page_mask(h));
 
 	spin_lock(&mm->page_table_lock);
-	for (address = start; address < end; address += HPAGE_SIZE) {
+	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
@@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
 			struct page *pagecache_page)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct page *old_page, *new_page;
 	int avoidcopy;
 	int outside_reserve = 0;
@@ -1443,7 +1471,7 @@ retry_avoidcopy:
 	__SetPageUptodate(new_page);
 	spin_lock(&mm->page_table_lock);
 
-	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+	ptep = huge_pte_offset(mm, address & huge_page_mask(h));
 	if (likely(pte_same(huge_ptep_get(ptep), pte))) {
 		/* Break COW */
 		huge_ptep_clear_flush(vma, address, ptep);
@@ -1458,14 +1486,14 @@ retry_avoidcopy:
 }
 
 /* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
-			unsigned long address)
+static struct page *hugetlbfs_pagecache_page(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
 {
 	struct address_space *mapping;
 	pgoff_t idx;
 
 	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(vma, address);
+	idx = vma_hugecache_offset(h, vma, address);
 
 	return find_lock_page(mapping, idx);
 }
@@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, int write_access)
 {
+	struct hstate *h = hstate_vma(vma);
 	int ret = VM_FAULT_SIGBUS;
 	pgoff_t idx;
 	unsigned long size;
@@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(vma, address);
+	idx = vma_hugecache_offset(h, vma, address);
 
 	/*
 	 * Use page lock to guard against racing truncation
@@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 retry:
 	page = find_lock_page(mapping, idx);
 	if (!page) {
-		size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+		size = i_size_read(mapping->host) >> huge_page_shift(h);
 		if (idx >= size)
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
@@ -1510,7 +1539,7 @@ retry:
 			ret = -PTR_ERR(page);
 			goto out;
 		}
-		clear_huge_page(page, address);
+		clear_huge_page(page, address, huge_page_size(h));
 		__SetPageUptodate(page);
 
 		if (vma->vm_flags & VM_SHARED) {
@@ -1526,14 +1555,14 @@ retry:
 			}
 
 			spin_lock(&inode->i_lock);
-			inode->i_blocks += BLOCKS_PER_HUGEPAGE;
+			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
 		} else
 			lock_page(page);
 	}
 
 	spin_lock(&mm->page_table_lock);
-	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	size = i_size_read(mapping->host) >> huge_page_shift(h);
 	if (idx >= size)
 		goto backout;
 
@@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
+	struct hstate *h = hstate_vma(vma);
 
-	ptep = huge_pte_alloc(mm, address);
+	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
 	if (!ptep)
 		return VM_FAULT_OOM;
 
@@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (likely(pte_same(entry, huge_ptep_get(ptep))))
 		if (write_access && !pte_write(entry)) {
 			struct page *page;
-			page = hugetlbfs_pagecache_page(vma, address);
+			page = hugetlbfs_pagecache_page(h, vma, address);
 			ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
 			if (page) {
 				unlock_page(page);
@@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
+	struct hstate *h = hstate_vma(vma);
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * each hugepage. We have to make * sure we get the
 		 * first, for the page indexing below to work.
 		 */
-		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
 
 		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
@@ -1660,7 +1691,7 @@ same_page:
 		--remainder;
 		++i;
 		if (vaddr < vma->vm_end && remainder &&
-		    pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+		    pfn_offset < pages_per_huge_page(h)) {
 			/*
 			 * We use pfn_offset to avoid touching the pageframes
 			 * of this compound page.
@@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 	unsigned long start = address;
 	pte_t *ptep;
 	pte_t pte;
+	struct hstate *h = hstate_vma(vma);
 
 	BUG_ON(address >= end);
 	flush_cache_range(vma, address, end);
 
 	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
 	spin_lock(&mm->page_table_lock);
-	for (; address < end; address += HPAGE_SIZE) {
+	for (; address < end; address += huge_page_size(h)) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
 			continue;
@@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode,
 					struct vm_area_struct *vma)
 {
 	long ret, chg;
+	struct hstate *h = hstate_inode(inode);
 
 	if (vma && vma->vm_flags & VM_NORESERVE)
 		return 0;
@@ -1739,7 +1772,7 @@
 
 	if (hugetlb_get_quota(inode->i_mapping, chg))
 		return -ENOSPC;
-	ret = hugetlb_acct_memory(chg);
+	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
 		return ret;
@@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode,
 
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
+	struct hstate *h = hstate_inode(inode);
 	long chg = region_truncate(&inode->i_mapping->private_list, offset);
 
 	spin_lock(&inode->i_lock);
-	inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed;
+	inode->i_blocks -= blocks_per_huge_page(h);
 	spin_unlock(&inode->i_lock);
 
 	hugetlb_put_quota(inode->i_mapping, (chg - freed));
-	hugetlb_acct_memory(-(chg - freed));
+	hugetlb_acct_memory(h, -(chg - freed));
 }
diff --git a/mm/memory.c b/mm/memory.c
index 72932489a082..c1c1d6d8c22b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		if (unlikely(is_vm_hugetlb_page(vma))) {
 			unmap_hugepage_range(vma, start, end, NULL);
 			zap_work -= (end - start) /
-					(HPAGE_SIZE / PAGE_SIZE);
+				pages_per_huge_page(hstate_vma(vma));
 			start = end;
 		} else
 			start = unmap_page_range(*tlbp, vma,
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..e550bec20582 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 
 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
 		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
-				HPAGE_SHIFT), gfp_flags);
+				huge_page_shift(hstate_vma(vma))), gfp_flags);
 	} else {
 		zl = policy_zonelist(gfp_flags, *mpol);
 		if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma,
 {
 	unsigned long addr;
 	struct page *page;
+	struct hstate *h = hstate_vma(vma);
+	unsigned long sz = huge_page_size(h);
 
-	for (addr = start; addr < end; addr += HPAGE_SIZE) {
-		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+	for (addr = start; addr < end; addr += sz) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm,
+				addr & huge_page_mask(h));
 		pte_t pte;
 
 		if (!ptep)
diff --git a/mm/mmap.c b/mm/mmap.c
index 57d3b6097deb..5e0cc99e9cd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1812,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	struct mempolicy *pol;
 	struct vm_area_struct *new;
 
-	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
+	if (is_vm_hugetlb_page(vma) && (addr &
+				~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
 
 	if (mm->map_count >= sysctl_max_map_count)