Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 368 |
1 file changed, 201 insertions, 167 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 32dff4290c66..0d8153e25f09 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,18 +22,12 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | ||
26 | static unsigned long surplus_huge_pages; | ||
27 | static unsigned long nr_overcommit_huge_pages; | ||
28 | unsigned long max_huge_pages; | 25 | unsigned long max_huge_pages; |
29 | unsigned long sysctl_overcommit_huge_pages; | 26 | unsigned long sysctl_overcommit_huge_pages; |
30 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | ||
31 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | ||
32 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | ||
33 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 27 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 28 | unsigned long hugepages_treat_as_movable; |
36 | static int hugetlb_next_nid; | 29 | |
30 | struct hstate default_hstate; | ||
37 | 31 | ||
38 | /* | 32 | /* |
39 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 33 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
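The hunk above removes the file-scope counters and free lists and replaces them with a single default_hstate. The diff only shows the structure through the fields it dereferences (h->nr_huge_pages, h->hugepage_freelists[nid], h->hugetlb_next_nid, and so on); the sketch below is reconstructed from those uses and is only an approximation of the real definition, which is expected to live in include/linux/hugetlb.h — field order and exact types are assumptions.

	/* Sketch reconstructed from the fields this patch uses; the
	 * authoritative definition belongs in include/linux/hugetlb.h. */
	struct hstate {
		int hugetlb_next_nid;			/* round-robin cursor for fresh allocations */
		unsigned int order;			/* huge page is PAGE_SIZE << order */
		unsigned long mask;			/* mask of the huge-page boundary */
		unsigned long nr_huge_pages;		/* pages owned by the pool */
		unsigned long free_huge_pages;		/* pages sitting on the per-node free lists */
		unsigned long resv_huge_pages;		/* pages promised to reservations */
		unsigned long surplus_huge_pages;	/* overcommitted pages taken from the buddy allocator */
		unsigned long nr_overcommit_huge_pages;	/* sysctl limit on surplus pages */
		struct list_head hugepage_freelists[MAX_NUMNODES];
		unsigned int nr_huge_pages_node[MAX_NUMNODES];
		unsigned int free_huge_pages_node[MAX_NUMNODES];
		unsigned int surplus_huge_pages_node[MAX_NUMNODES];
	};

Note that max_huge_pages stays a plain global in this patch; only the pool bookkeeping moves into the hstate.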
@@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t) | |||
203 | * Convert the address within this vma to the page offset within | 197 | * Convert the address within this vma to the page offset within |
204 | * the mapping, in pagecache page units; huge pages here. | 198 | * the mapping, in pagecache page units; huge pages here. |
205 | */ | 199 | */ |
206 | static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, | 200 | static pgoff_t vma_hugecache_offset(struct hstate *h, |
207 | unsigned long address) | 201 | struct vm_area_struct *vma, unsigned long address) |
208 | { | 202 | { |
209 | return ((address - vma->vm_start) >> HPAGE_SHIFT) + | 203 | return ((address - vma->vm_start) >> huge_page_shift(h)) + |
210 | (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | 204 | (vma->vm_pgoff >> huge_page_order(h)); |
211 | } | 205 | } |
212 | 206 | ||
213 | /* | 207 | /* |
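vma_hugecache_offset() now derives the huge-page shift and order from the hstate instead of the compile-time HPAGE_* constants: shifting vm_pgoff (which is in PAGE_SIZE units) right by huge_page_order(h) matches the old >> (HPAGE_SHIFT - PAGE_SHIFT), because order = shift - PAGE_SHIFT. A minimal sketch of the accessors this hunk assumes, written here as the obvious one-liners (the real inline helpers are expected to come from include/linux/hugetlb.h):

	static inline unsigned int huge_page_order(struct hstate *h)
	{
		return h->order;
	}

	static inline unsigned int huge_page_shift(struct hstate *h)
	{
		return h->order + PAGE_SHIFT;
	}

	static inline unsigned long huge_page_size(struct hstate *h)
	{
		return (unsigned long)PAGE_SIZE << h->order;
	}

	static inline unsigned long huge_page_mask(struct hstate *h)
	{
		return h->mask;
	}

	static inline unsigned long pages_per_huge_page(struct hstate *h)
	{
		return 1UL << h->order;
	}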
@@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | |||
309 | } | 303 | } |
310 | 304 | ||
311 | /* Decrement the reserved pages in the hugepage pool by one */ | 305 | /* Decrement the reserved pages in the hugepage pool by one */ |
312 | static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) | 306 | static void decrement_hugepage_resv_vma(struct hstate *h, |
307 | struct vm_area_struct *vma) | ||
313 | { | 308 | { |
314 | if (vma->vm_flags & VM_NORESERVE) | 309 | if (vma->vm_flags & VM_NORESERVE) |
315 | return; | 310 | return; |
316 | 311 | ||
317 | if (vma->vm_flags & VM_SHARED) { | 312 | if (vma->vm_flags & VM_SHARED) { |
318 | /* Shared mappings always use reserves */ | 313 | /* Shared mappings always use reserves */ |
319 | resv_huge_pages--; | 314 | h->resv_huge_pages--; |
320 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 315 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
321 | /* | 316 | /* |
322 | * Only the process that called mmap() has reserves for | 317 | * Only the process that called mmap() has reserves for |
323 | * private mappings. | 318 | * private mappings. |
324 | */ | 319 | */ |
325 | resv_huge_pages--; | 320 | h->resv_huge_pages--; |
326 | } | 321 | } |
327 | } | 322 | } |
328 | 323 | ||
@@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) | |||
344 | return 1; | 339 | return 1; |
345 | } | 340 | } |
346 | 341 | ||
347 | static void clear_huge_page(struct page *page, unsigned long addr) | 342 | static void clear_huge_page(struct page *page, |
343 | unsigned long addr, unsigned long sz) | ||
348 | { | 344 | { |
349 | int i; | 345 | int i; |
350 | 346 | ||
351 | might_sleep(); | 347 | might_sleep(); |
352 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | 348 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
353 | cond_resched(); | 349 | cond_resched(); |
354 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | 350 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); |
355 | } | 351 | } |
@@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
359 | unsigned long addr, struct vm_area_struct *vma) | 355 | unsigned long addr, struct vm_area_struct *vma) |
360 | { | 356 | { |
361 | int i; | 357 | int i; |
358 | struct hstate *h = hstate_vma(vma); | ||
362 | 359 | ||
363 | might_sleep(); | 360 | might_sleep(); |
364 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | 361 | for (i = 0; i < pages_per_huge_page(h); i++) { |
365 | cond_resched(); | 362 | cond_resched(); |
366 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | 363 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
367 | } | 364 | } |
368 | } | 365 | } |
369 | 366 | ||
370 | static void enqueue_huge_page(struct page *page) | 367 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
371 | { | 368 | { |
372 | int nid = page_to_nid(page); | 369 | int nid = page_to_nid(page); |
373 | list_add(&page->lru, &hugepage_freelists[nid]); | 370 | list_add(&page->lru, &h->hugepage_freelists[nid]); |
374 | free_huge_pages++; | 371 | h->free_huge_pages++; |
375 | free_huge_pages_node[nid]++; | 372 | h->free_huge_pages_node[nid]++; |
376 | } | 373 | } |
377 | 374 | ||
378 | static struct page *dequeue_huge_page(void) | 375 | static struct page *dequeue_huge_page(struct hstate *h) |
379 | { | 376 | { |
380 | int nid; | 377 | int nid; |
381 | struct page *page = NULL; | 378 | struct page *page = NULL; |
382 | 379 | ||
383 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | 380 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { |
384 | if (!list_empty(&hugepage_freelists[nid])) { | 381 | if (!list_empty(&h->hugepage_freelists[nid])) { |
385 | page = list_entry(hugepage_freelists[nid].next, | 382 | page = list_entry(h->hugepage_freelists[nid].next, |
386 | struct page, lru); | 383 | struct page, lru); |
387 | list_del(&page->lru); | 384 | list_del(&page->lru); |
388 | free_huge_pages--; | 385 | h->free_huge_pages--; |
389 | free_huge_pages_node[nid]--; | 386 | h->free_huge_pages_node[nid]--; |
390 | break; | 387 | break; |
391 | } | 388 | } |
392 | } | 389 | } |
393 | return page; | 390 | return page; |
394 | } | 391 | } |
395 | 392 | ||
396 | static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | 393 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
394 | struct vm_area_struct *vma, | ||
397 | unsigned long address, int avoid_reserve) | 395 | unsigned long address, int avoid_reserve) |
398 | { | 396 | { |
399 | int nid; | 397 | int nid; |
@@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
411 | * not "stolen". The child may still get SIGKILLed | 409 | * not "stolen". The child may still get SIGKILLed |
412 | */ | 410 | */ |
413 | if (!vma_has_private_reserves(vma) && | 411 | if (!vma_has_private_reserves(vma) && |
414 | free_huge_pages - resv_huge_pages == 0) | 412 | h->free_huge_pages - h->resv_huge_pages == 0) |
415 | return NULL; | 413 | return NULL; |
416 | 414 | ||
417 | /* If reserves cannot be used, ensure enough pages are in the pool */ | 415 | /* If reserves cannot be used, ensure enough pages are in the pool */ |
418 | if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) | 416 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) |
419 | return NULL; | 417 | return NULL; |
420 | 418 | ||
421 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 419 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
422 | MAX_NR_ZONES - 1, nodemask) { | 420 | MAX_NR_ZONES - 1, nodemask) { |
423 | nid = zone_to_nid(zone); | 421 | nid = zone_to_nid(zone); |
424 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 422 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && |
425 | !list_empty(&hugepage_freelists[nid])) { | 423 | !list_empty(&h->hugepage_freelists[nid])) { |
426 | page = list_entry(hugepage_freelists[nid].next, | 424 | page = list_entry(h->hugepage_freelists[nid].next, |
427 | struct page, lru); | 425 | struct page, lru); |
428 | list_del(&page->lru); | 426 | list_del(&page->lru); |
429 | free_huge_pages--; | 427 | h->free_huge_pages--; |
430 | free_huge_pages_node[nid]--; | 428 | h->free_huge_pages_node[nid]--; |
431 | 429 | ||
432 | if (!avoid_reserve) | 430 | if (!avoid_reserve) |
433 | decrement_hugepage_resv_vma(vma); | 431 | decrement_hugepage_resv_vma(h, vma); |
434 | 432 | ||
435 | break; | 433 | break; |
436 | } | 434 | } |
@@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
439 | return page; | 437 | return page; |
440 | } | 438 | } |
441 | 439 | ||
442 | static void update_and_free_page(struct page *page) | 440 | static void update_and_free_page(struct hstate *h, struct page *page) |
443 | { | 441 | { |
444 | int i; | 442 | int i; |
445 | nr_huge_pages--; | 443 | |
446 | nr_huge_pages_node[page_to_nid(page)]--; | 444 | h->nr_huge_pages--; |
447 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 445 | h->nr_huge_pages_node[page_to_nid(page)]--; |
446 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
448 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 447 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
449 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 448 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
450 | 1 << PG_private | 1<< PG_writeback); | 449 | 1 << PG_private | 1<< PG_writeback); |
@@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page) | |||
452 | set_compound_page_dtor(page, NULL); | 451 | set_compound_page_dtor(page, NULL); |
453 | set_page_refcounted(page); | 452 | set_page_refcounted(page); |
454 | arch_release_hugepage(page); | 453 | arch_release_hugepage(page); |
455 | __free_pages(page, HUGETLB_PAGE_ORDER); | 454 | __free_pages(page, huge_page_order(h)); |
456 | } | 455 | } |
457 | 456 | ||
458 | static void free_huge_page(struct page *page) | 457 | static void free_huge_page(struct page *page) |
459 | { | 458 | { |
459 | /* | ||
460 | * Can't pass hstate in here because it is called from the | ||
461 | * compound page destructor. | ||
462 | */ | ||
463 | struct hstate *h = &default_hstate; | ||
460 | int nid = page_to_nid(page); | 464 | int nid = page_to_nid(page); |
461 | struct address_space *mapping; | 465 | struct address_space *mapping; |
462 | 466 | ||
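The comment added above is worth unpacking. set_compound_page_dtor() stores a destructor of this shape in the compound page (quoted from memory from include/linux/mm.h, so treat it as approximate):

	typedef void compound_page_dtor(struct page *);

free_huge_page() therefore receives nothing but the page itself and has no argument slot for an hstate. With only default_hstate defined at this point the hard-coded lookup is sufficient; supporting several page sizes would presumably require deriving the hstate from the compound page (for example from its order) inside the destructor.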
@@ -466,12 +470,12 @@ static void free_huge_page(struct page *page) | |||
466 | INIT_LIST_HEAD(&page->lru); | 470 | INIT_LIST_HEAD(&page->lru); |
467 | 471 | ||
468 | spin_lock(&hugetlb_lock); | 472 | spin_lock(&hugetlb_lock); |
469 | if (surplus_huge_pages_node[nid]) { | 473 | if (h->surplus_huge_pages_node[nid]) { |
470 | update_and_free_page(page); | 474 | update_and_free_page(h, page); |
471 | surplus_huge_pages--; | 475 | h->surplus_huge_pages--; |
472 | surplus_huge_pages_node[nid]--; | 476 | h->surplus_huge_pages_node[nid]--; |
473 | } else { | 477 | } else { |
474 | enqueue_huge_page(page); | 478 | enqueue_huge_page(h, page); |
475 | } | 479 | } |
476 | spin_unlock(&hugetlb_lock); | 480 | spin_unlock(&hugetlb_lock); |
477 | if (mapping) | 481 | if (mapping) |
@@ -483,7 +487,7 @@ static void free_huge_page(struct page *page) | |||
483 | * balanced by operating on them in a round-robin fashion. | 487 | * balanced by operating on them in a round-robin fashion. |
484 | * Returns 1 if an adjustment was made. | 488 | * Returns 1 if an adjustment was made. |
485 | */ | 489 | */ |
486 | static int adjust_pool_surplus(int delta) | 490 | static int adjust_pool_surplus(struct hstate *h, int delta) |
487 | { | 491 | { |
488 | static int prev_nid; | 492 | static int prev_nid; |
489 | int nid = prev_nid; | 493 | int nid = prev_nid; |
@@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta) | |||
496 | nid = first_node(node_online_map); | 500 | nid = first_node(node_online_map); |
497 | 501 | ||
498 | /* To shrink on this node, there must be a surplus page */ | 502 | /* To shrink on this node, there must be a surplus page */ |
499 | if (delta < 0 && !surplus_huge_pages_node[nid]) | 503 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) |
500 | continue; | 504 | continue; |
501 | /* Surplus cannot exceed the total number of pages */ | 505 | /* Surplus cannot exceed the total number of pages */ |
502 | if (delta > 0 && surplus_huge_pages_node[nid] >= | 506 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= |
503 | nr_huge_pages_node[nid]) | 507 | h->nr_huge_pages_node[nid]) |
504 | continue; | 508 | continue; |
505 | 509 | ||
506 | surplus_huge_pages += delta; | 510 | h->surplus_huge_pages += delta; |
507 | surplus_huge_pages_node[nid] += delta; | 511 | h->surplus_huge_pages_node[nid] += delta; |
508 | ret = 1; | 512 | ret = 1; |
509 | break; | 513 | break; |
510 | } while (nid != prev_nid); | 514 | } while (nid != prev_nid); |
@@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta) | |||
513 | return ret; | 517 | return ret; |
514 | } | 518 | } |
515 | 519 | ||
516 | static void prep_new_huge_page(struct page *page, int nid) | 520 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
517 | { | 521 | { |
518 | set_compound_page_dtor(page, free_huge_page); | 522 | set_compound_page_dtor(page, free_huge_page); |
519 | spin_lock(&hugetlb_lock); | 523 | spin_lock(&hugetlb_lock); |
520 | nr_huge_pages++; | 524 | h->nr_huge_pages++; |
521 | nr_huge_pages_node[nid]++; | 525 | h->nr_huge_pages_node[nid]++; |
522 | spin_unlock(&hugetlb_lock); | 526 | spin_unlock(&hugetlb_lock); |
523 | put_page(page); /* free it into the hugepage allocator */ | 527 | put_page(page); /* free it into the hugepage allocator */ |
524 | } | 528 | } |
525 | 529 | ||
526 | static struct page *alloc_fresh_huge_page_node(int nid) | 530 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
527 | { | 531 | { |
528 | struct page *page; | 532 | struct page *page; |
529 | 533 | ||
530 | page = alloc_pages_node(nid, | 534 | page = alloc_pages_node(nid, |
531 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 535 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
532 | __GFP_REPEAT|__GFP_NOWARN, | 536 | __GFP_REPEAT|__GFP_NOWARN, |
533 | HUGETLB_PAGE_ORDER); | 537 | huge_page_order(h)); |
534 | if (page) { | 538 | if (page) { |
535 | if (arch_prepare_hugepage(page)) { | 539 | if (arch_prepare_hugepage(page)) { |
536 | __free_pages(page, HUGETLB_PAGE_ORDER); | 540 | __free_pages(page, HUGETLB_PAGE_ORDER); |
537 | return NULL; | 541 | return NULL; |
538 | } | 542 | } |
539 | prep_new_huge_page(page, nid); | 543 | prep_new_huge_page(h, page, nid); |
540 | } | 544 | } |
541 | 545 | ||
542 | return page; | 546 | return page; |
543 | } | 547 | } |
544 | 548 | ||
545 | static int alloc_fresh_huge_page(void) | 549 | static int alloc_fresh_huge_page(struct hstate *h) |
546 | { | 550 | { |
547 | struct page *page; | 551 | struct page *page; |
548 | int start_nid; | 552 | int start_nid; |
549 | int next_nid; | 553 | int next_nid; |
550 | int ret = 0; | 554 | int ret = 0; |
551 | 555 | ||
552 | start_nid = hugetlb_next_nid; | 556 | start_nid = h->hugetlb_next_nid; |
553 | 557 | ||
554 | do { | 558 | do { |
555 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | 559 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); |
556 | if (page) | 560 | if (page) |
557 | ret = 1; | 561 | ret = 1; |
558 | /* | 562 | /* |
@@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void) | |||
566 | * if we just successfully allocated a hugepage so that | 570 | * if we just successfully allocated a hugepage so that |
567 | * the next caller gets hugepages on the next node. | 571 | * the next caller gets hugepages on the next node. |
568 | */ | 572 | */ |
569 | next_nid = next_node(hugetlb_next_nid, node_online_map); | 573 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); |
570 | if (next_nid == MAX_NUMNODES) | 574 | if (next_nid == MAX_NUMNODES) |
571 | next_nid = first_node(node_online_map); | 575 | next_nid = first_node(node_online_map); |
572 | hugetlb_next_nid = next_nid; | 576 | h->hugetlb_next_nid = next_nid; |
573 | } while (!page && hugetlb_next_nid != start_nid); | 577 | } while (!page && h->hugetlb_next_nid != start_nid); |
574 | 578 | ||
575 | if (ret) | 579 | if (ret) |
576 | count_vm_event(HTLB_BUDDY_PGALLOC); | 580 | count_vm_event(HTLB_BUDDY_PGALLOC); |
@@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void) | |||
580 | return ret; | 584 | return ret; |
581 | } | 585 | } |
582 | 586 | ||
583 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | 587 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
584 | unsigned long address) | 588 | struct vm_area_struct *vma, unsigned long address) |
585 | { | 589 | { |
586 | struct page *page; | 590 | struct page *page; |
587 | unsigned int nid; | 591 | unsigned int nid; |
@@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
610 | * per-node value is checked there. | 614 | * per-node value is checked there. |
611 | */ | 615 | */ |
612 | spin_lock(&hugetlb_lock); | 616 | spin_lock(&hugetlb_lock); |
613 | if (surplus_huge_pages >= nr_overcommit_huge_pages) { | 617 | if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { |
614 | spin_unlock(&hugetlb_lock); | 618 | spin_unlock(&hugetlb_lock); |
615 | return NULL; | 619 | return NULL; |
616 | } else { | 620 | } else { |
617 | nr_huge_pages++; | 621 | h->nr_huge_pages++; |
618 | surplus_huge_pages++; | 622 | h->surplus_huge_pages++; |
619 | } | 623 | } |
620 | spin_unlock(&hugetlb_lock); | 624 | spin_unlock(&hugetlb_lock); |
621 | 625 | ||
622 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 626 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
623 | __GFP_REPEAT|__GFP_NOWARN, | 627 | __GFP_REPEAT|__GFP_NOWARN, |
624 | HUGETLB_PAGE_ORDER); | 628 | huge_page_order(h)); |
625 | 629 | ||
626 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
627 | if (page) { | 631 | if (page) { |
@@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
636 | /* | 640 | /* |
637 | * We incremented the global counters already | 641 | * We incremented the global counters already |
638 | */ | 642 | */ |
639 | nr_huge_pages_node[nid]++; | 643 | h->nr_huge_pages_node[nid]++; |
640 | surplus_huge_pages_node[nid]++; | 644 | h->surplus_huge_pages_node[nid]++; |
641 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 645 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
642 | } else { | 646 | } else { |
643 | nr_huge_pages--; | 647 | h->nr_huge_pages--; |
644 | surplus_huge_pages--; | 648 | h->surplus_huge_pages--; |
645 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | 649 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); |
646 | } | 650 | } |
647 | spin_unlock(&hugetlb_lock); | 651 | spin_unlock(&hugetlb_lock); |
@@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
653 | * Increase the hugetlb pool such that it can accommodate a reservation | 657 | * Increase the hugetlb pool such that it can accommodate a reservation |
654 | * of size 'delta'. | 658 | * of size 'delta'. |
655 | */ | 659 | */ |
656 | static int gather_surplus_pages(int delta) | 660 | static int gather_surplus_pages(struct hstate *h, int delta) |
657 | { | 661 | { |
658 | struct list_head surplus_list; | 662 | struct list_head surplus_list; |
659 | struct page *page, *tmp; | 663 | struct page *page, *tmp; |
660 | int ret, i; | 664 | int ret, i; |
661 | int needed, allocated; | 665 | int needed, allocated; |
662 | 666 | ||
663 | needed = (resv_huge_pages + delta) - free_huge_pages; | 667 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
664 | if (needed <= 0) { | 668 | if (needed <= 0) { |
665 | resv_huge_pages += delta; | 669 | h->resv_huge_pages += delta; |
666 | return 0; | 670 | return 0; |
667 | } | 671 | } |
668 | 672 | ||
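A quick worked example of the arithmetic above: with h->free_huge_pages = 10, h->resv_huge_pages = 8 and a new reservation of delta = 5, needed = (8 + 5) - 10 = 3, so three surplus pages must be pulled from the buddy allocator before the reservation can be honoured. If needed had come out zero or negative, the free pages already on hand cover the request and only resv_huge_pages is bumped.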
@@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta) | |||
673 | retry: | 677 | retry: |
674 | spin_unlock(&hugetlb_lock); | 678 | spin_unlock(&hugetlb_lock); |
675 | for (i = 0; i < needed; i++) { | 679 | for (i = 0; i < needed; i++) { |
676 | page = alloc_buddy_huge_page(NULL, 0); | 680 | page = alloc_buddy_huge_page(h, NULL, 0); |
677 | if (!page) { | 681 | if (!page) { |
678 | /* | 682 | /* |
679 | * We were not able to allocate enough pages to | 683 | * We were not able to allocate enough pages to |
@@ -694,7 +698,8 @@ retry: | |||
694 | * because either resv_huge_pages or free_huge_pages may have changed. | 698 | * because either resv_huge_pages or free_huge_pages may have changed. |
695 | */ | 699 | */ |
696 | spin_lock(&hugetlb_lock); | 700 | spin_lock(&hugetlb_lock); |
697 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | 701 | needed = (h->resv_huge_pages + delta) - |
702 | (h->free_huge_pages + allocated); | ||
698 | if (needed > 0) | 703 | if (needed > 0) |
699 | goto retry; | 704 | goto retry; |
700 | 705 | ||
@@ -707,7 +712,7 @@ retry: | |||
707 | * before they are reserved. | 712 | * before they are reserved. |
708 | */ | 713 | */ |
709 | needed += allocated; | 714 | needed += allocated; |
710 | resv_huge_pages += delta; | 715 | h->resv_huge_pages += delta; |
711 | ret = 0; | 716 | ret = 0; |
712 | free: | 717 | free: |
713 | /* Free the needed pages to the hugetlb pool */ | 718 | /* Free the needed pages to the hugetlb pool */ |
@@ -715,7 +720,7 @@ free: | |||
715 | if ((--needed) < 0) | 720 | if ((--needed) < 0) |
716 | break; | 721 | break; |
717 | list_del(&page->lru); | 722 | list_del(&page->lru); |
718 | enqueue_huge_page(page); | 723 | enqueue_huge_page(h, page); |
719 | } | 724 | } |
720 | 725 | ||
721 | /* Free unnecessary surplus pages to the buddy allocator */ | 726 | /* Free unnecessary surplus pages to the buddy allocator */ |
@@ -743,7 +748,8 @@ free: | |||
743 | * allocated to satisfy the reservation must be explicitly freed if they were | 748 | * allocated to satisfy the reservation must be explicitly freed if they were |
744 | * never used. | 749 | * never used. |
745 | */ | 750 | */ |
746 | static void return_unused_surplus_pages(unsigned long unused_resv_pages) | 751 | static void return_unused_surplus_pages(struct hstate *h, |
752 | unsigned long unused_resv_pages) | ||
747 | { | 753 | { |
748 | static int nid = -1; | 754 | static int nid = -1; |
749 | struct page *page; | 755 | struct page *page; |
@@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) | |||
758 | unsigned long remaining_iterations = num_online_nodes(); | 764 | unsigned long remaining_iterations = num_online_nodes(); |
759 | 765 | ||
760 | /* Uncommit the reservation */ | 766 | /* Uncommit the reservation */ |
761 | resv_huge_pages -= unused_resv_pages; | 767 | h->resv_huge_pages -= unused_resv_pages; |
762 | 768 | ||
763 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | 769 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
764 | 770 | ||
765 | while (remaining_iterations-- && nr_pages) { | 771 | while (remaining_iterations-- && nr_pages) { |
766 | nid = next_node(nid, node_online_map); | 772 | nid = next_node(nid, node_online_map); |
767 | if (nid == MAX_NUMNODES) | 773 | if (nid == MAX_NUMNODES) |
768 | nid = first_node(node_online_map); | 774 | nid = first_node(node_online_map); |
769 | 775 | ||
770 | if (!surplus_huge_pages_node[nid]) | 776 | if (!h->surplus_huge_pages_node[nid]) |
771 | continue; | 777 | continue; |
772 | 778 | ||
773 | if (!list_empty(&hugepage_freelists[nid])) { | 779 | if (!list_empty(&h->hugepage_freelists[nid])) { |
774 | page = list_entry(hugepage_freelists[nid].next, | 780 | page = list_entry(h->hugepage_freelists[nid].next, |
775 | struct page, lru); | 781 | struct page, lru); |
776 | list_del(&page->lru); | 782 | list_del(&page->lru); |
777 | update_and_free_page(page); | 783 | update_and_free_page(h, page); |
778 | free_huge_pages--; | 784 | h->free_huge_pages--; |
779 | free_huge_pages_node[nid]--; | 785 | h->free_huge_pages_node[nid]--; |
780 | surplus_huge_pages--; | 786 | h->surplus_huge_pages--; |
781 | surplus_huge_pages_node[nid]--; | 787 | h->surplus_huge_pages_node[nid]--; |
782 | nr_pages--; | 788 | nr_pages--; |
783 | remaining_iterations = num_online_nodes(); | 789 | remaining_iterations = num_online_nodes(); |
784 | } | 790 | } |
@@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) | |||
794 | * and instantiated, the change should be committed via vma_commit_reservation. | 800 | * and instantiated, the change should be committed via vma_commit_reservation. |
795 | * No action is required on failure. | 801 | * No action is required on failure. |
796 | */ | 802 | */ |
797 | static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) | 803 | static int vma_needs_reservation(struct hstate *h, |
804 | struct vm_area_struct *vma, unsigned long addr) | ||
798 | { | 805 | { |
799 | struct address_space *mapping = vma->vm_file->f_mapping; | 806 | struct address_space *mapping = vma->vm_file->f_mapping; |
800 | struct inode *inode = mapping->host; | 807 | struct inode *inode = mapping->host; |
801 | 808 | ||
802 | if (vma->vm_flags & VM_SHARED) { | 809 | if (vma->vm_flags & VM_SHARED) { |
803 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 810 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
804 | return region_chg(&inode->i_mapping->private_list, | 811 | return region_chg(&inode->i_mapping->private_list, |
805 | idx, idx + 1); | 812 | idx, idx + 1); |
806 | 813 | ||
@@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) | |||
809 | 816 | ||
810 | } else { | 817 | } else { |
811 | int err; | 818 | int err; |
812 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 819 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
813 | struct resv_map *reservations = vma_resv_map(vma); | 820 | struct resv_map *reservations = vma_resv_map(vma); |
814 | 821 | ||
815 | err = region_chg(&reservations->regions, idx, idx + 1); | 822 | err = region_chg(&reservations->regions, idx, idx + 1); |
@@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) | |||
818 | return 0; | 825 | return 0; |
819 | } | 826 | } |
820 | } | 827 | } |
821 | static void vma_commit_reservation(struct vm_area_struct *vma, | 828 | static void vma_commit_reservation(struct hstate *h, |
822 | unsigned long addr) | 829 | struct vm_area_struct *vma, unsigned long addr) |
823 | { | 830 | { |
824 | struct address_space *mapping = vma->vm_file->f_mapping; | 831 | struct address_space *mapping = vma->vm_file->f_mapping; |
825 | struct inode *inode = mapping->host; | 832 | struct inode *inode = mapping->host; |
826 | 833 | ||
827 | if (vma->vm_flags & VM_SHARED) { | 834 | if (vma->vm_flags & VM_SHARED) { |
828 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 835 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
829 | region_add(&inode->i_mapping->private_list, idx, idx + 1); | 836 | region_add(&inode->i_mapping->private_list, idx, idx + 1); |
830 | 837 | ||
831 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 838 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
832 | pgoff_t idx = vma_hugecache_offset(vma, addr); | 839 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
833 | struct resv_map *reservations = vma_resv_map(vma); | 840 | struct resv_map *reservations = vma_resv_map(vma); |
834 | 841 | ||
835 | /* Mark this page used in the map. */ | 842 | /* Mark this page used in the map. */ |
@@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma, | |||
840 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 847 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
841 | unsigned long addr, int avoid_reserve) | 848 | unsigned long addr, int avoid_reserve) |
842 | { | 849 | { |
850 | struct hstate *h = hstate_vma(vma); | ||
843 | struct page *page; | 851 | struct page *page; |
844 | struct address_space *mapping = vma->vm_file->f_mapping; | 852 | struct address_space *mapping = vma->vm_file->f_mapping; |
845 | struct inode *inode = mapping->host; | 853 | struct inode *inode = mapping->host; |
@@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
852 | * MAP_NORESERVE mappings may also need pages and quota allocated | 860 | * MAP_NORESERVE mappings may also need pages and quota allocated |
853 | * if no reserve mapping overlaps. | 861 | * if no reserve mapping overlaps. |
854 | */ | 862 | */ |
855 | chg = vma_needs_reservation(vma, addr); | 863 | chg = vma_needs_reservation(h, vma, addr); |
856 | if (chg < 0) | 864 | if (chg < 0) |
857 | return ERR_PTR(chg); | 865 | return ERR_PTR(chg); |
858 | if (chg) | 866 | if (chg) |
@@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
860 | return ERR_PTR(-ENOSPC); | 868 | return ERR_PTR(-ENOSPC); |
861 | 869 | ||
862 | spin_lock(&hugetlb_lock); | 870 | spin_lock(&hugetlb_lock); |
863 | page = dequeue_huge_page_vma(vma, addr, avoid_reserve); | 871 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
864 | spin_unlock(&hugetlb_lock); | 872 | spin_unlock(&hugetlb_lock); |
865 | 873 | ||
866 | if (!page) { | 874 | if (!page) { |
867 | page = alloc_buddy_huge_page(vma, addr); | 875 | page = alloc_buddy_huge_page(h, vma, addr); |
868 | if (!page) { | 876 | if (!page) { |
869 | hugetlb_put_quota(inode->i_mapping, chg); | 877 | hugetlb_put_quota(inode->i_mapping, chg); |
870 | return ERR_PTR(-VM_FAULT_OOM); | 878 | return ERR_PTR(-VM_FAULT_OOM); |
@@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
874 | set_page_refcounted(page); | 882 | set_page_refcounted(page); |
875 | set_page_private(page, (unsigned long) mapping); | 883 | set_page_private(page, (unsigned long) mapping); |
876 | 884 | ||
877 | vma_commit_reservation(vma, addr); | 885 | vma_commit_reservation(h, vma, addr); |
878 | 886 | ||
879 | return page; | 887 | return page; |
880 | } | 888 | } |
@@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
882 | static int __init hugetlb_init(void) | 890 | static int __init hugetlb_init(void) |
883 | { | 891 | { |
884 | unsigned long i; | 892 | unsigned long i; |
893 | struct hstate *h = &default_hstate; | ||
885 | 894 | ||
886 | if (HPAGE_SHIFT == 0) | 895 | if (HPAGE_SHIFT == 0) |
887 | return 0; | 896 | return 0; |
888 | 897 | ||
898 | if (!h->order) { | ||
899 | h->order = HPAGE_SHIFT - PAGE_SHIFT; | ||
900 | h->mask = HPAGE_MASK; | ||
901 | } | ||
902 | |||
889 | for (i = 0; i < MAX_NUMNODES; ++i) | 903 | for (i = 0; i < MAX_NUMNODES; ++i) |
890 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 904 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
891 | 905 | ||
892 | hugetlb_next_nid = first_node(node_online_map); | 906 | h->hugetlb_next_nid = first_node(node_online_map); |
893 | 907 | ||
894 | for (i = 0; i < max_huge_pages; ++i) { | 908 | for (i = 0; i < max_huge_pages; ++i) { |
895 | if (!alloc_fresh_huge_page()) | 909 | if (!alloc_fresh_huge_page(h)) |
896 | break; | 910 | break; |
897 | } | 911 | } |
898 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 912 | max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; |
899 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | 913 | printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", |
914 | h->free_huge_pages); | ||
900 | return 0; | 915 | return 0; |
901 | } | 916 | } |
902 | module_init(hugetlb_init); | 917 | module_init(hugetlb_init); |
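For a concrete sense of the numbers: on x86-64 with 2 MB huge pages, HPAGE_SHIFT is 21 and PAGE_SHIFT is 12, so the default hstate is initialised with order = 9, i.e. pages_per_huge_page(h) = 512 base pages per huge page.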
@@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
922 | 937 | ||
923 | #ifdef CONFIG_SYSCTL | 938 | #ifdef CONFIG_SYSCTL |
924 | #ifdef CONFIG_HIGHMEM | 939 | #ifdef CONFIG_HIGHMEM |
925 | static void try_to_free_low(unsigned long count) | 940 | static void try_to_free_low(struct hstate *h, unsigned long count) |
926 | { | 941 | { |
927 | int i; | 942 | int i; |
928 | 943 | ||
929 | for (i = 0; i < MAX_NUMNODES; ++i) { | 944 | for (i = 0; i < MAX_NUMNODES; ++i) { |
930 | struct page *page, *next; | 945 | struct page *page, *next; |
931 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 946 | struct list_head *freel = &h->hugepage_freelists[i]; |
932 | if (count >= nr_huge_pages) | 947 | list_for_each_entry_safe(page, next, freel, lru) { |
948 | if (count >= h->nr_huge_pages) | ||
933 | return; | 949 | return; |
934 | if (PageHighMem(page)) | 950 | if (PageHighMem(page)) |
935 | continue; | 951 | continue; |
936 | list_del(&page->lru); | 952 | list_del(&page->lru); |
937 | update_and_free_page(page); | 953 | update_and_free_page(h, page); |
938 | free_huge_pages--; | 954 | h->free_huge_pages--; |
939 | free_huge_pages_node[page_to_nid(page)]--; | 955 | h->free_huge_pages_node[page_to_nid(page)]--; |
940 | } | 956 | } |
941 | } | 957 | } |
942 | } | 958 | } |
943 | #else | 959 | #else |
944 | static inline void try_to_free_low(unsigned long count) | 960 | static inline void try_to_free_low(struct hstate *h, unsigned long count) |
945 | { | 961 | { |
946 | } | 962 | } |
947 | #endif | 963 | #endif |
948 | 964 | ||
949 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | 965 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
950 | static unsigned long set_max_huge_pages(unsigned long count) | 966 | static unsigned long set_max_huge_pages(unsigned long count) |
951 | { | 967 | { |
952 | unsigned long min_count, ret; | 968 | unsigned long min_count, ret; |
969 | struct hstate *h = &default_hstate; | ||
953 | 970 | ||
954 | /* | 971 | /* |
955 | * Increase the pool size | 972 | * Increase the pool size |
@@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
963 | * within all the constraints specified by the sysctls. | 980 | * within all the constraints specified by the sysctls. |
964 | */ | 981 | */ |
965 | spin_lock(&hugetlb_lock); | 982 | spin_lock(&hugetlb_lock); |
966 | while (surplus_huge_pages && count > persistent_huge_pages) { | 983 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
967 | if (!adjust_pool_surplus(-1)) | 984 | if (!adjust_pool_surplus(h, -1)) |
968 | break; | 985 | break; |
969 | } | 986 | } |
970 | 987 | ||
971 | while (count > persistent_huge_pages) { | 988 | while (count > persistent_huge_pages(h)) { |
972 | /* | 989 | /* |
973 | * If this allocation races such that we no longer need the | 990 | * If this allocation races such that we no longer need the |
974 | * page, free_huge_page will handle it by freeing the page | 991 | * page, free_huge_page will handle it by freeing the page |
975 | * and reducing the surplus. | 992 | * and reducing the surplus. |
976 | */ | 993 | */ |
977 | spin_unlock(&hugetlb_lock); | 994 | spin_unlock(&hugetlb_lock); |
978 | ret = alloc_fresh_huge_page(); | 995 | ret = alloc_fresh_huge_page(h); |
979 | spin_lock(&hugetlb_lock); | 996 | spin_lock(&hugetlb_lock); |
980 | if (!ret) | 997 | if (!ret) |
981 | goto out; | 998 | goto out; |
@@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
997 | * and won't grow the pool anywhere else. Not until one of the | 1014 | * and won't grow the pool anywhere else. Not until one of the |
998 | * sysctls are changed, or the surplus pages go out of use. | 1015 | * sysctls are changed, or the surplus pages go out of use. |
999 | */ | 1016 | */ |
1000 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | 1017 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
1001 | min_count = max(count, min_count); | 1018 | min_count = max(count, min_count); |
1002 | try_to_free_low(min_count); | 1019 | try_to_free_low(h, min_count); |
1003 | while (min_count < persistent_huge_pages) { | 1020 | while (min_count < persistent_huge_pages(h)) { |
1004 | struct page *page = dequeue_huge_page(); | 1021 | struct page *page = dequeue_huge_page(h); |
1005 | if (!page) | 1022 | if (!page) |
1006 | break; | 1023 | break; |
1007 | update_and_free_page(page); | 1024 | update_and_free_page(h, page); |
1008 | } | 1025 | } |
1009 | while (count < persistent_huge_pages) { | 1026 | while (count < persistent_huge_pages(h)) { |
1010 | if (!adjust_pool_surplus(1)) | 1027 | if (!adjust_pool_surplus(h, 1)) |
1011 | break; | 1028 | break; |
1012 | } | 1029 | } |
1013 | out: | 1030 | out: |
1014 | ret = persistent_huge_pages; | 1031 | ret = persistent_huge_pages(h); |
1015 | spin_unlock(&hugetlb_lock); | 1032 | spin_unlock(&hugetlb_lock); |
1016 | return ret; | 1033 | return ret; |
1017 | } | 1034 | } |
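The min_count computation above deserves a worked example: resv_huge_pages + nr_huge_pages - free_huge_pages is the number of pages that are either in use or promised to a reservation, so the pool is never shrunk below it. With nr = 10, free = 4 and resv = 3, min_count = 3 + 10 - 4 = 9; asking for count = 5 therefore only shrinks the pool to 9 persistent pages, and the remaining excess is handed back later by converting pages to surplus as they are freed.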
@@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1041 | struct file *file, void __user *buffer, | 1058 | struct file *file, void __user *buffer, |
1042 | size_t *length, loff_t *ppos) | 1059 | size_t *length, loff_t *ppos) |
1043 | { | 1060 | { |
1061 | struct hstate *h = &default_hstate; | ||
1044 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1062 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); |
1045 | spin_lock(&hugetlb_lock); | 1063 | spin_lock(&hugetlb_lock); |
1046 | nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; | 1064 | h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; |
1047 | spin_unlock(&hugetlb_lock); | 1065 | spin_unlock(&hugetlb_lock); |
1048 | return 0; | 1066 | return 0; |
1049 | } | 1067 | } |
@@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1052 | 1070 | ||
1053 | int hugetlb_report_meminfo(char *buf) | 1071 | int hugetlb_report_meminfo(char *buf) |
1054 | { | 1072 | { |
1073 | struct hstate *h = &default_hstate; | ||
1055 | return sprintf(buf, | 1074 | return sprintf(buf, |
1056 | "HugePages_Total: %5lu\n" | 1075 | "HugePages_Total: %5lu\n" |
1057 | "HugePages_Free: %5lu\n" | 1076 | "HugePages_Free: %5lu\n" |
1058 | "HugePages_Rsvd: %5lu\n" | 1077 | "HugePages_Rsvd: %5lu\n" |
1059 | "HugePages_Surp: %5lu\n" | 1078 | "HugePages_Surp: %5lu\n" |
1060 | "Hugepagesize: %5lu kB\n", | 1079 | "Hugepagesize: %5lu kB\n", |
1061 | nr_huge_pages, | 1080 | h->nr_huge_pages, |
1062 | free_huge_pages, | 1081 | h->free_huge_pages, |
1063 | resv_huge_pages, | 1082 | h->resv_huge_pages, |
1064 | surplus_huge_pages, | 1083 | h->surplus_huge_pages, |
1065 | HPAGE_SIZE/1024); | 1084 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); |
1066 | } | 1085 | } |
1067 | 1086 | ||
1068 | int hugetlb_report_node_meminfo(int nid, char *buf) | 1087 | int hugetlb_report_node_meminfo(int nid, char *buf) |
1069 | { | 1088 | { |
1089 | struct hstate *h = &default_hstate; | ||
1070 | return sprintf(buf, | 1090 | return sprintf(buf, |
1071 | "Node %d HugePages_Total: %5u\n" | 1091 | "Node %d HugePages_Total: %5u\n" |
1072 | "Node %d HugePages_Free: %5u\n" | 1092 | "Node %d HugePages_Free: %5u\n" |
1073 | "Node %d HugePages_Surp: %5u\n", | 1093 | "Node %d HugePages_Surp: %5u\n", |
1074 | nid, nr_huge_pages_node[nid], | 1094 | nid, h->nr_huge_pages_node[nid], |
1075 | nid, free_huge_pages_node[nid], | 1095 | nid, h->free_huge_pages_node[nid], |
1076 | nid, surplus_huge_pages_node[nid]); | 1096 | nid, h->surplus_huge_pages_node[nid]); |
1077 | } | 1097 | } |
1078 | 1098 | ||
1079 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ | 1099 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ |
1080 | unsigned long hugetlb_total_pages(void) | 1100 | unsigned long hugetlb_total_pages(void) |
1081 | { | 1101 | { |
1082 | return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); | 1102 | struct hstate *h = &default_hstate; |
1103 | return h->nr_huge_pages * pages_per_huge_page(h); | ||
1083 | } | 1104 | } |
1084 | 1105 | ||
1085 | static int hugetlb_acct_memory(long delta) | 1106 | static int hugetlb_acct_memory(struct hstate *h, long delta) |
1086 | { | 1107 | { |
1087 | int ret = -ENOMEM; | 1108 | int ret = -ENOMEM; |
1088 | 1109 | ||
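Sanity-checking the new Hugepagesize computation: 1UL << (huge_page_order(h) + PAGE_SHIFT - 10) is the huge page size expressed in kB, since dividing by 1024 is a right shift by 10. For order = 9 and PAGE_SHIFT = 12 that is 1UL << 11 = 2048 kB, matching what the old HPAGE_SIZE/1024 printed for 2 MB pages.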
@@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta) | |||
1105 | * semantics that cpuset has. | 1126 | * semantics that cpuset has. |
1106 | */ | 1127 | */ |
1107 | if (delta > 0) { | 1128 | if (delta > 0) { |
1108 | if (gather_surplus_pages(delta) < 0) | 1129 | if (gather_surplus_pages(h, delta) < 0) |
1109 | goto out; | 1130 | goto out; |
1110 | 1131 | ||
1111 | if (delta > cpuset_mems_nr(free_huge_pages_node)) { | 1132 | if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { |
1112 | return_unused_surplus_pages(delta); | 1133 | return_unused_surplus_pages(h, delta); |
1113 | goto out; | 1134 | goto out; |
1114 | } | 1135 | } |
1115 | } | 1136 | } |
1116 | 1137 | ||
1117 | ret = 0; | 1138 | ret = 0; |
1118 | if (delta < 0) | 1139 | if (delta < 0) |
1119 | return_unused_surplus_pages((unsigned long) -delta); | 1140 | return_unused_surplus_pages(h, (unsigned long) -delta); |
1120 | 1141 | ||
1121 | out: | 1142 | out: |
1122 | spin_unlock(&hugetlb_lock); | 1143 | spin_unlock(&hugetlb_lock); |
@@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
1141 | 1162 | ||
1142 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 1163 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
1143 | { | 1164 | { |
1165 | struct hstate *h = hstate_vma(vma); | ||
1144 | struct resv_map *reservations = vma_resv_map(vma); | 1166 | struct resv_map *reservations = vma_resv_map(vma); |
1145 | unsigned long reserve; | 1167 | unsigned long reserve; |
1146 | unsigned long start; | 1168 | unsigned long start; |
1147 | unsigned long end; | 1169 | unsigned long end; |
1148 | 1170 | ||
1149 | if (reservations) { | 1171 | if (reservations) { |
1150 | start = vma_hugecache_offset(vma, vma->vm_start); | 1172 | start = vma_hugecache_offset(h, vma, vma->vm_start); |
1151 | end = vma_hugecache_offset(vma, vma->vm_end); | 1173 | end = vma_hugecache_offset(h, vma, vma->vm_end); |
1152 | 1174 | ||
1153 | reserve = (end - start) - | 1175 | reserve = (end - start) - |
1154 | region_count(&reservations->regions, start, end); | 1176 | region_count(&reservations->regions, start, end); |
@@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
1156 | kref_put(&reservations->refs, resv_map_release); | 1178 | kref_put(&reservations->refs, resv_map_release); |
1157 | 1179 | ||
1158 | if (reserve) | 1180 | if (reserve) |
1159 | hugetlb_acct_memory(-reserve); | 1181 | hugetlb_acct_memory(h, -reserve); |
1160 | } | 1182 | } |
1161 | } | 1183 | } |
1162 | 1184 | ||
@@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
1214 | struct page *ptepage; | 1236 | struct page *ptepage; |
1215 | unsigned long addr; | 1237 | unsigned long addr; |
1216 | int cow; | 1238 | int cow; |
1239 | struct hstate *h = hstate_vma(vma); | ||
1240 | unsigned long sz = huge_page_size(h); | ||
1217 | 1241 | ||
1218 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 1242 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
1219 | 1243 | ||
1220 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 1244 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
1221 | src_pte = huge_pte_offset(src, addr); | 1245 | src_pte = huge_pte_offset(src, addr); |
1222 | if (!src_pte) | 1246 | if (!src_pte) |
1223 | continue; | 1247 | continue; |
1224 | dst_pte = huge_pte_alloc(dst, addr); | 1248 | dst_pte = huge_pte_alloc(dst, addr, sz); |
1225 | if (!dst_pte) | 1249 | if (!dst_pte) |
1226 | goto nomem; | 1250 | goto nomem; |
1227 | 1251 | ||
@@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1257 | pte_t pte; | 1281 | pte_t pte; |
1258 | struct page *page; | 1282 | struct page *page; |
1259 | struct page *tmp; | 1283 | struct page *tmp; |
1284 | struct hstate *h = hstate_vma(vma); | ||
1285 | unsigned long sz = huge_page_size(h); | ||
1286 | |||
1260 | /* | 1287 | /* |
1261 | * A page gathering list, protected by per file i_mmap_lock. The | 1288 | * A page gathering list, protected by per file i_mmap_lock. The |
1262 | * lock is used to avoid list corruption from multiple unmapping | 1289 | * lock is used to avoid list corruption from multiple unmapping |
@@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
1265 | LIST_HEAD(page_list); | 1292 | LIST_HEAD(page_list); |
1266 | 1293 | ||
1267 | WARN_ON(!is_vm_hugetlb_page(vma)); | 1294 | WARN_ON(!is_vm_hugetlb_page(vma)); |
1268 | BUG_ON(start & ~HPAGE_MASK); | 1295 | BUG_ON(start & ~huge_page_mask(h)); |
1269 | BUG_ON(end & ~HPAGE_MASK); | 1296 | BUG_ON(end & ~huge_page_mask(h)); |
1270 | 1297 | ||
1271 | spin_lock(&mm->page_table_lock); | 1298 | spin_lock(&mm->page_table_lock); |
1272 | for (address = start; address < end; address += HPAGE_SIZE) { | 1299 | for (address = start; address < end; address += sz) { |
1273 | ptep = huge_pte_offset(mm, address); | 1300 | ptep = huge_pte_offset(mm, address); |
1274 | if (!ptep) | 1301 | if (!ptep) |
1275 | continue; | 1302 | continue; |
@@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1383 | unsigned long address, pte_t *ptep, pte_t pte, | 1410 | unsigned long address, pte_t *ptep, pte_t pte, |
1384 | struct page *pagecache_page) | 1411 | struct page *pagecache_page) |
1385 | { | 1412 | { |
1413 | struct hstate *h = hstate_vma(vma); | ||
1386 | struct page *old_page, *new_page; | 1414 | struct page *old_page, *new_page; |
1387 | int avoidcopy; | 1415 | int avoidcopy; |
1388 | int outside_reserve = 0; | 1416 | int outside_reserve = 0; |
@@ -1443,7 +1471,7 @@ retry_avoidcopy: | |||
1443 | __SetPageUptodate(new_page); | 1471 | __SetPageUptodate(new_page); |
1444 | spin_lock(&mm->page_table_lock); | 1472 | spin_lock(&mm->page_table_lock); |
1445 | 1473 | ||
1446 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 1474 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
1447 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 1475 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
1448 | /* Break COW */ | 1476 | /* Break COW */ |
1449 | huge_ptep_clear_flush(vma, address, ptep); | 1477 | huge_ptep_clear_flush(vma, address, ptep); |
@@ -1458,14 +1486,14 @@ retry_avoidcopy: | |||
1458 | } | 1486 | } |
1459 | 1487 | ||
1460 | /* Return the pagecache page at a given address within a VMA */ | 1488 | /* Return the pagecache page at a given address within a VMA */ |
1461 | static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, | 1489 | static struct page *hugetlbfs_pagecache_page(struct hstate *h, |
1462 | unsigned long address) | 1490 | struct vm_area_struct *vma, unsigned long address) |
1463 | { | 1491 | { |
1464 | struct address_space *mapping; | 1492 | struct address_space *mapping; |
1465 | pgoff_t idx; | 1493 | pgoff_t idx; |
1466 | 1494 | ||
1467 | mapping = vma->vm_file->f_mapping; | 1495 | mapping = vma->vm_file->f_mapping; |
1468 | idx = vma_hugecache_offset(vma, address); | 1496 | idx = vma_hugecache_offset(h, vma, address); |
1469 | 1497 | ||
1470 | return find_lock_page(mapping, idx); | 1498 | return find_lock_page(mapping, idx); |
1471 | } | 1499 | } |
@@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, | |||
1473 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1501 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1474 | unsigned long address, pte_t *ptep, int write_access) | 1502 | unsigned long address, pte_t *ptep, int write_access) |
1475 | { | 1503 | { |
1504 | struct hstate *h = hstate_vma(vma); | ||
1476 | int ret = VM_FAULT_SIGBUS; | 1505 | int ret = VM_FAULT_SIGBUS; |
1477 | pgoff_t idx; | 1506 | pgoff_t idx; |
1478 | unsigned long size; | 1507 | unsigned long size; |
@@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1493 | } | 1522 | } |
1494 | 1523 | ||
1495 | mapping = vma->vm_file->f_mapping; | 1524 | mapping = vma->vm_file->f_mapping; |
1496 | idx = vma_hugecache_offset(vma, address); | 1525 | idx = vma_hugecache_offset(h, vma, address); |
1497 | 1526 | ||
1498 | /* | 1527 | /* |
1499 | * Use page lock to guard against racing truncation | 1528 | * Use page lock to guard against racing truncation |
@@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1502 | retry: | 1531 | retry: |
1503 | page = find_lock_page(mapping, idx); | 1532 | page = find_lock_page(mapping, idx); |
1504 | if (!page) { | 1533 | if (!page) { |
1505 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1534 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
1506 | if (idx >= size) | 1535 | if (idx >= size) |
1507 | goto out; | 1536 | goto out; |
1508 | page = alloc_huge_page(vma, address, 0); | 1537 | page = alloc_huge_page(vma, address, 0); |
@@ -1510,7 +1539,7 @@ retry: | |||
1510 | ret = -PTR_ERR(page); | 1539 | ret = -PTR_ERR(page); |
1511 | goto out; | 1540 | goto out; |
1512 | } | 1541 | } |
1513 | clear_huge_page(page, address); | 1542 | clear_huge_page(page, address, huge_page_size(h)); |
1514 | __SetPageUptodate(page); | 1543 | __SetPageUptodate(page); |
1515 | 1544 | ||
1516 | if (vma->vm_flags & VM_SHARED) { | 1545 | if (vma->vm_flags & VM_SHARED) { |
@@ -1526,14 +1555,14 @@ retry: | |||
1526 | } | 1555 | } |
1527 | 1556 | ||
1528 | spin_lock(&inode->i_lock); | 1557 | spin_lock(&inode->i_lock); |
1529 | inode->i_blocks += BLOCKS_PER_HUGEPAGE; | 1558 | inode->i_blocks += blocks_per_huge_page(h); |
1530 | spin_unlock(&inode->i_lock); | 1559 | spin_unlock(&inode->i_lock); |
1531 | } else | 1560 | } else |
1532 | lock_page(page); | 1561 | lock_page(page); |
1533 | } | 1562 | } |
1534 | 1563 | ||
1535 | spin_lock(&mm->page_table_lock); | 1564 | spin_lock(&mm->page_table_lock); |
1536 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1565 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
1537 | if (idx >= size) | 1566 | if (idx >= size) |
1538 | goto backout; | 1567 | goto backout; |
1539 | 1568 | ||
@@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1569 | pte_t entry; | 1598 | pte_t entry; |
1570 | int ret; | 1599 | int ret; |
1571 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 1600 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
1601 | struct hstate *h = hstate_vma(vma); | ||
1572 | 1602 | ||
1573 | ptep = huge_pte_alloc(mm, address); | 1603 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
1574 | if (!ptep) | 1604 | if (!ptep) |
1575 | return VM_FAULT_OOM; | 1605 | return VM_FAULT_OOM; |
1576 | 1606 | ||
@@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1594 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 1624 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) |
1595 | if (write_access && !pte_write(entry)) { | 1625 | if (write_access && !pte_write(entry)) { |
1596 | struct page *page; | 1626 | struct page *page; |
1597 | page = hugetlbfs_pagecache_page(vma, address); | 1627 | page = hugetlbfs_pagecache_page(h, vma, address); |
1598 | ret = hugetlb_cow(mm, vma, address, ptep, entry, page); | 1628 | ret = hugetlb_cow(mm, vma, address, ptep, entry, page); |
1599 | if (page) { | 1629 | if (page) { |
1600 | unlock_page(page); | 1630 | unlock_page(page); |
@@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1615 | unsigned long pfn_offset; | 1645 | unsigned long pfn_offset; |
1616 | unsigned long vaddr = *position; | 1646 | unsigned long vaddr = *position; |
1617 | int remainder = *length; | 1647 | int remainder = *length; |
1648 | struct hstate *h = hstate_vma(vma); | ||
1618 | 1649 | ||
1619 | spin_lock(&mm->page_table_lock); | 1650 | spin_lock(&mm->page_table_lock); |
1620 | while (vaddr < vma->vm_end && remainder) { | 1651 | while (vaddr < vma->vm_end && remainder) { |
@@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1626 | * each hugepage. We have to make sure we get the | 1657 | * each hugepage. We have to make sure we get the |
1627 | * first, for the page indexing below to work. | 1658 | * first, for the page indexing below to work. |
1628 | */ | 1659 | */ |
1629 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | 1660 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
1630 | 1661 | ||
1631 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || | 1662 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || |
1632 | (write && !pte_write(huge_ptep_get(pte)))) { | 1663 | (write && !pte_write(huge_ptep_get(pte)))) { |
@@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1644 | break; | 1675 | break; |
1645 | } | 1676 | } |
1646 | 1677 | ||
1647 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; | 1678 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; |
1648 | page = pte_page(huge_ptep_get(pte)); | 1679 | page = pte_page(huge_ptep_get(pte)); |
1649 | same_page: | 1680 | same_page: |
1650 | if (pages) { | 1681 | if (pages) { |
@@ -1660,7 +1691,7 @@ same_page: | |||
1660 | --remainder; | 1691 | --remainder; |
1661 | ++i; | 1692 | ++i; |
1662 | if (vaddr < vma->vm_end && remainder && | 1693 | if (vaddr < vma->vm_end && remainder && |
1663 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | 1694 | pfn_offset < pages_per_huge_page(h)) { |
1664 | /* | 1695 | /* |
1665 | * We use pfn_offset to avoid touching the pageframes | 1696 | * We use pfn_offset to avoid touching the pageframes |
1666 | * of this compound page. | 1697 | * of this compound page. |
@@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
1682 | unsigned long start = address; | 1713 | unsigned long start = address; |
1683 | pte_t *ptep; | 1714 | pte_t *ptep; |
1684 | pte_t pte; | 1715 | pte_t pte; |
1716 | struct hstate *h = hstate_vma(vma); | ||
1685 | 1717 | ||
1686 | BUG_ON(address >= end); | 1718 | BUG_ON(address >= end); |
1687 | flush_cache_range(vma, address, end); | 1719 | flush_cache_range(vma, address, end); |
1688 | 1720 | ||
1689 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 1721 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); |
1690 | spin_lock(&mm->page_table_lock); | 1722 | spin_lock(&mm->page_table_lock); |
1691 | for (; address < end; address += HPAGE_SIZE) { | 1723 | for (; address < end; address += huge_page_size(h)) { |
1692 | ptep = huge_pte_offset(mm, address); | 1724 | ptep = huge_pte_offset(mm, address); |
1693 | if (!ptep) | 1725 | if (!ptep) |
1694 | continue; | 1726 | continue; |
@@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
1711 | struct vm_area_struct *vma) | 1743 | struct vm_area_struct *vma) |
1712 | { | 1744 | { |
1713 | long ret, chg; | 1745 | long ret, chg; |
1746 | struct hstate *h = hstate_inode(inode); | ||
1714 | 1747 | ||
1715 | if (vma && vma->vm_flags & VM_NORESERVE) | 1748 | if (vma && vma->vm_flags & VM_NORESERVE) |
1716 | return 0; | 1749 | return 0; |
@@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
1739 | 1772 | ||
1740 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 1773 | if (hugetlb_get_quota(inode->i_mapping, chg)) |
1741 | return -ENOSPC; | 1774 | return -ENOSPC; |
1742 | ret = hugetlb_acct_memory(chg); | 1775 | ret = hugetlb_acct_memory(h, chg); |
1743 | if (ret < 0) { | 1776 | if (ret < 0) { |
1744 | hugetlb_put_quota(inode->i_mapping, chg); | 1777 | hugetlb_put_quota(inode->i_mapping, chg); |
1745 | return ret; | 1778 | return ret; |
@@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
1751 | 1784 | ||
1752 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 1785 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
1753 | { | 1786 | { |
1787 | struct hstate *h = hstate_inode(inode); | ||
1754 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 1788 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
1755 | 1789 | ||
1756 | spin_lock(&inode->i_lock); | 1790 | spin_lock(&inode->i_lock); |
1757 | inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; | 1791 | inode->i_blocks -= blocks_per_huge_page(h) * freed; |
1758 | spin_unlock(&inode->i_lock); | 1792 | spin_unlock(&inode->i_lock); |
1759 | 1793 | ||
1760 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 1794 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); |
1761 | hugetlb_acct_memory(-(chg - freed)); | 1795 | hugetlb_acct_memory(h, -(chg - freed)); |
1762 | } | 1796 | } |
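Several helpers used throughout the patch but not defined in this file — hstate_vma(), hstate_inode() and blocks_per_huge_page() — are expected to live in include/linux/hugetlb.h. A minimal sketch of what they presumably reduce to while only default_hstate exists; the real lookup path (for instance via the hugetlbfs superblock) may differ, so treat this as an assumption:

	/* Sketch only: assumes a single default_hstate. */
	static inline struct hstate *hstate_inode(struct inode *inode)
	{
		return &default_hstate;
	}

	static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
	{
		return hstate_inode(vma->vm_file->f_mapping->host);
	}

	/* i_blocks is kept in 512-byte units, hence the divide. */
	static inline unsigned long blocks_per_huge_page(struct hstate *h)
	{
		return huge_page_size(h) / 512;
	}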