Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      |    2
-rw-r--r--  mm/compaction.c       |    3
-rw-r--r--  mm/filemap.c          |    2
-rw-r--r--  mm/huge_memory.c      |   10
-rw-r--r--  mm/hugetlb.c          |  447
-rw-r--r--  mm/hwpoison-inject.c  |    4
-rw-r--r--  mm/internal.h         |    2
-rw-r--r--  mm/kmemleak.c         |    2
-rw-r--r--  mm/ksm.c              |    6
-rw-r--r--  mm/madvise.c          |   33
-rw-r--r--  mm/memblock.c         |   18
-rw-r--r--  mm/memcontrol.c       |   17
-rw-r--r--  mm/memory-failure.c   |  174
-rw-r--r--  mm/memory.c           |   41
-rw-r--r--  mm/memory_hotplug.c   |  116
-rw-r--r--  mm/mempolicy.c        |  116
-rw-r--r--  mm/mempool.c          |    2
-rw-r--r--  mm/migrate.c          |   63
-rw-r--r--  mm/mlock.c            |  316
-rw-r--r--  mm/mmap.c             |   59
-rw-r--r--  mm/mremap.c           |    5
-rw-r--r--  mm/page-writeback.c   |  269
-rw-r--r--  mm/page_alloc.c       |  308
-rw-r--r--  mm/page_isolation.c   |   14
-rw-r--r--  mm/pgtable-generic.c  |   24
-rw-r--r--  mm/readahead.c        |    8
-rw-r--r--  mm/shmem.c            |    6
-rw-r--r--  mm/slub.c             |    8
-rw-r--r--  mm/sparse.c           |  133
-rw-r--r--  mm/swap.c             |   77
-rw-r--r--  mm/swap_state.c       |    4
-rw-r--r--  mm/swapfile.c         |  596
-rw-r--r--  mm/util.c             |    5
-rw-r--r--  mm/vmalloc.c          |   29
-rw-r--r--  mm/vmscan.c           |   80
-rw-r--r--  mm/vmstat.c           |   95
-rw-r--r--  mm/zbud.c             |    4
-rw-r--r--  mm/zswap.c            |   18
38 files changed, 2116 insertions, 1000 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 37d9edcd14cf..ce682f7a4f29 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, | |||
652 | { | 652 | { |
653 | char kbuf[] = "0\n"; | 653 | char kbuf[] = "0\n"; |
654 | 654 | ||
655 | if (*ppos) { | 655 | if (*ppos || *lenp < sizeof(kbuf)) { |
656 | *lenp = 0; | 656 | *lenp = 0; |
657 | return 0; | 657 | return 0; |
658 | } | 658 | } |
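The hunk above only tightens the early-return test. A minimal standalone sketch of the pattern being guarded (illustrative function name; not the kernel handler itself): a fixed "0\n" reply must not be copied into a user buffer that is too small to hold it.

/* Illustrative only -- assumes <linux/uaccess.h> for copy_to_user() */
static int read_fixed_reply(char __user *buffer, size_t *lenp, loff_t *ppos)
{
	char kbuf[] = "0\n";

	if (*ppos || *lenp < sizeof(kbuf)) {
		/* nothing left to read, or the caller's buffer is too small */
		*lenp = 0;
		return 0;
	}
	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
		return -EFAULT;
	*lenp = sizeof(kbuf);
	*ppos += *lenp;
	return 0;
}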
diff --git a/mm/compaction.c b/mm/compaction.c
index 05ccb4cc0bdb..c43789388cd8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order) | |||
1131 | .sync = false, | 1131 | .sync = false, |
1132 | }; | 1132 | }; |
1133 | 1133 | ||
1134 | if (!order) | ||
1135 | return; | ||
1136 | |||
1134 | __compact_pgdat(pgdat, &cc); | 1137 | __compact_pgdat(pgdat, &cc); |
1135 | } | 1138 | } |
1136 | 1139 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 731a2c24532d..e607728db4a8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
469 | if (error) | 469 | if (error) |
470 | goto out; | 470 | goto out; |
471 | 471 | ||
472 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); |
473 | if (error == 0) { | 473 | if (error == 0) { |
474 | page_cache_get(page); | 474 | page_cache_get(page); |
475 | page->mapping = mapping; | 475 | page->mapping = mapping; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d94f7dee3997..d66010e0049d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -422,7 +422,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | |||
422 | unsigned long msecs; | 422 | unsigned long msecs; |
423 | int err; | 423 | int err; |
424 | 424 | ||
425 | err = strict_strtoul(buf, 10, &msecs); | 425 | err = kstrtoul(buf, 10, &msecs); |
426 | if (err || msecs > UINT_MAX) | 426 | if (err || msecs > UINT_MAX) |
427 | return -EINVAL; | 427 | return -EINVAL; |
428 | 428 | ||
@@ -449,7 +449,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | |||
449 | unsigned long msecs; | 449 | unsigned long msecs; |
450 | int err; | 450 | int err; |
451 | 451 | ||
452 | err = strict_strtoul(buf, 10, &msecs); | 452 | err = kstrtoul(buf, 10, &msecs); |
453 | if (err || msecs > UINT_MAX) | 453 | if (err || msecs > UINT_MAX) |
454 | return -EINVAL; | 454 | return -EINVAL; |
455 | 455 | ||
@@ -475,7 +475,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, | |||
475 | int err; | 475 | int err; |
476 | unsigned long pages; | 476 | unsigned long pages; |
477 | 477 | ||
478 | err = strict_strtoul(buf, 10, &pages); | 478 | err = kstrtoul(buf, 10, &pages); |
479 | if (err || !pages || pages > UINT_MAX) | 479 | if (err || !pages || pages > UINT_MAX) |
480 | return -EINVAL; | 480 | return -EINVAL; |
481 | 481 | ||
@@ -543,7 +543,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | |||
543 | int err; | 543 | int err; |
544 | unsigned long max_ptes_none; | 544 | unsigned long max_ptes_none; |
545 | 545 | ||
546 | err = strict_strtoul(buf, 10, &max_ptes_none); | 546 | err = kstrtoul(buf, 10, &max_ptes_none); |
547 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | 547 | if (err || max_ptes_none > HPAGE_PMD_NR-1) |
548 | return -EINVAL; | 548 | return -EINVAL; |
549 | 549 | ||
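The strict_strtoul() → kstrtoul() conversions in the four store handlers above (mirrored later in this diff in mm/kmemleak.c, mm/ksm.c and mm/hugetlb.c) all follow the same sysfs pattern. A minimal sketch with an illustrative attribute name -- only the kstrtoul() call itself reflects the patch:

static ssize_t example_millisecs_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	/* kstrtoul() returns 0 on success, -EINVAL/-ERANGE on bad input */
	err = kstrtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	/* ... store msecs in the relevant tunable ... */
	return count;
}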
@@ -2301,6 +2301,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2301 | goto out; | 2301 | goto out; |
2302 | 2302 | ||
2303 | vma = find_vma(mm, address); | 2303 | vma = find_vma(mm, address); |
2304 | if (!vma) | ||
2305 | goto out; | ||
2304 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2306 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2305 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2307 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2306 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | 2308 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b60f33080a28..b49579c7f2a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/rmap.h> | 21 | #include <linux/rmap.h> |
22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
24 | #include <linux/page-isolation.h> | ||
24 | 25 | ||
25 | #include <asm/page.h> | 26 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 27 | #include <asm/pgtable.h> |
@@ -33,7 +34,6 @@ | |||
33 | #include "internal.h" | 34 | #include "internal.h" |
34 | 35 | ||
35 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
36 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | ||
37 | unsigned long hugepages_treat_as_movable; | 37 | unsigned long hugepages_treat_as_movable; |
38 | 38 | ||
39 | int hugetlb_max_hstate __read_mostly; | 39 | int hugetlb_max_hstate __read_mostly; |
@@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages; | |||
48 | static unsigned long __initdata default_hstate_size; | 48 | static unsigned long __initdata default_hstate_size; |
49 | 49 | ||
50 | /* | 50 | /* |
51 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 51 | * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, |
52 | * free_huge_pages, and surplus_huge_pages. | ||
52 | */ | 53 | */ |
53 | DEFINE_SPINLOCK(hugetlb_lock); | 54 | DEFINE_SPINLOCK(hugetlb_lock); |
54 | 55 | ||
@@ -135,9 +136,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | |||
135 | * across the pages in a mapping. | 136 | * across the pages in a mapping. |
136 | * | 137 | * |
137 | * The region data structures are protected by a combination of the mmap_sem | 138 | * The region data structures are protected by a combination of the mmap_sem |
138 | * and the hugetlb_instantion_mutex. To access or modify a region the caller | 139 | * and the hugetlb_instantiation_mutex. To access or modify a region the caller |
139 | * must either hold the mmap_sem for write, or the mmap_sem for read and | 140 | * must either hold the mmap_sem for write, or the mmap_sem for read and |
140 | * the hugetlb_instantiation mutex: | 141 | * the hugetlb_instantiation_mutex: |
141 | * | 142 | * |
142 | * down_write(&mm->mmap_sem); | 143 | * down_write(&mm->mmap_sem); |
143 | * or | 144 | * or |
@@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | |||
434 | return (get_vma_private_data(vma) & flag) != 0; | 435 | return (get_vma_private_data(vma) & flag) != 0; |
435 | } | 436 | } |
436 | 437 | ||
437 | /* Decrement the reserved pages in the hugepage pool by one */ | ||
438 | static void decrement_hugepage_resv_vma(struct hstate *h, | ||
439 | struct vm_area_struct *vma) | ||
440 | { | ||
441 | if (vma->vm_flags & VM_NORESERVE) | ||
442 | return; | ||
443 | |||
444 | if (vma->vm_flags & VM_MAYSHARE) { | ||
445 | /* Shared mappings always use reserves */ | ||
446 | h->resv_huge_pages--; | ||
447 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | ||
448 | /* | ||
449 | * Only the process that called mmap() has reserves for | ||
450 | * private mappings. | ||
451 | */ | ||
452 | h->resv_huge_pages--; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ | 438 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ |
457 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 439 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
458 | { | 440 | { |
@@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | |||
462 | } | 444 | } |
463 | 445 | ||
464 | /* Returns true if the VMA has associated reserve pages */ | 446 | /* Returns true if the VMA has associated reserve pages */ |
465 | static int vma_has_reserves(struct vm_area_struct *vma) | 447 | static int vma_has_reserves(struct vm_area_struct *vma, long chg) |
466 | { | 448 | { |
449 | if (vma->vm_flags & VM_NORESERVE) { | ||
450 | /* | ||
451 | * This address is already reserved by other process(chg == 0), | ||
452 | * so, we should decrement reserved count. Without decrementing, | ||
453 | * reserve count remains after releasing inode, because this | ||
454 | * allocated page will go into page cache and is regarded as | ||
455 | * coming from reserved pool in releasing step. Currently, we | ||
456 | * don't have any other solution to deal with this situation | ||
457 | * properly, so add work-around here. | ||
458 | */ | ||
459 | if (vma->vm_flags & VM_MAYSHARE && chg == 0) | ||
460 | return 1; | ||
461 | else | ||
462 | return 0; | ||
463 | } | ||
464 | |||
465 | /* Shared mappings always use reserves */ | ||
467 | if (vma->vm_flags & VM_MAYSHARE) | 466 | if (vma->vm_flags & VM_MAYSHARE) |
468 | return 1; | 467 | return 1; |
468 | |||
469 | /* | ||
470 | * Only the process that called mmap() has reserves for | ||
471 | * private mappings. | ||
472 | */ | ||
469 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 473 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
470 | return 1; | 474 | return 1; |
475 | |||
471 | return 0; | 476 | return 0; |
472 | } | 477 | } |
473 | 478 | ||
@@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
517 | { | 522 | { |
518 | struct page *page; | 523 | struct page *page; |
519 | 524 | ||
520 | if (list_empty(&h->hugepage_freelists[nid])) | 525 | list_for_each_entry(page, &h->hugepage_freelists[nid], lru) |
526 | if (!is_migrate_isolate_page(page)) | ||
527 | break; | ||
528 | /* | ||
529 | * if 'non-isolated free hugepage' not found on the list, | ||
530 | * the allocation fails. | ||
531 | */ | ||
532 | if (&h->hugepage_freelists[nid] == &page->lru) | ||
521 | return NULL; | 533 | return NULL; |
522 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | ||
523 | list_move(&page->lru, &h->hugepage_activelist); | 534 | list_move(&page->lru, &h->hugepage_activelist); |
524 | set_page_refcounted(page); | 535 | set_page_refcounted(page); |
525 | h->free_huge_pages--; | 536 | h->free_huge_pages--; |
@@ -527,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
527 | return page; | 538 | return page; |
528 | } | 539 | } |
529 | 540 | ||
541 | /* Movability of hugepages depends on migration support. */ | ||
542 | static inline gfp_t htlb_alloc_mask(struct hstate *h) | ||
543 | { | ||
544 | if (hugepages_treat_as_movable || hugepage_migration_support(h)) | ||
545 | return GFP_HIGHUSER_MOVABLE; | ||
546 | else | ||
547 | return GFP_HIGHUSER; | ||
548 | } | ||
549 | |||
530 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 550 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
531 | struct vm_area_struct *vma, | 551 | struct vm_area_struct *vma, |
532 | unsigned long address, int avoid_reserve) | 552 | unsigned long address, int avoid_reserve, |
553 | long chg) | ||
533 | { | 554 | { |
534 | struct page *page = NULL; | 555 | struct page *page = NULL; |
535 | struct mempolicy *mpol; | 556 | struct mempolicy *mpol; |
@@ -539,16 +560,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
539 | struct zoneref *z; | 560 | struct zoneref *z; |
540 | unsigned int cpuset_mems_cookie; | 561 | unsigned int cpuset_mems_cookie; |
541 | 562 | ||
542 | retry_cpuset: | ||
543 | cpuset_mems_cookie = get_mems_allowed(); | ||
544 | zonelist = huge_zonelist(vma, address, | ||
545 | htlb_alloc_mask, &mpol, &nodemask); | ||
546 | /* | 563 | /* |
547 | * A child process with MAP_PRIVATE mappings created by their parent | 564 | * A child process with MAP_PRIVATE mappings created by their parent |
548 | * have no page reserves. This check ensures that reservations are | 565 | * have no page reserves. This check ensures that reservations are |
549 | * not "stolen". The child may still get SIGKILLed | 566 | * not "stolen". The child may still get SIGKILLed |
550 | */ | 567 | */ |
551 | if (!vma_has_reserves(vma) && | 568 | if (!vma_has_reserves(vma, chg) && |
552 | h->free_huge_pages - h->resv_huge_pages == 0) | 569 | h->free_huge_pages - h->resv_huge_pages == 0) |
553 | goto err; | 570 | goto err; |
554 | 571 | ||
@@ -556,13 +573,23 @@ retry_cpuset: | |||
556 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) | 573 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) |
557 | goto err; | 574 | goto err; |
558 | 575 | ||
576 | retry_cpuset: | ||
577 | cpuset_mems_cookie = get_mems_allowed(); | ||
578 | zonelist = huge_zonelist(vma, address, | ||
579 | htlb_alloc_mask(h), &mpol, &nodemask); | ||
580 | |||
559 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 581 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
560 | MAX_NR_ZONES - 1, nodemask) { | 582 | MAX_NR_ZONES - 1, nodemask) { |
561 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { | 583 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { |
562 | page = dequeue_huge_page_node(h, zone_to_nid(zone)); | 584 | page = dequeue_huge_page_node(h, zone_to_nid(zone)); |
563 | if (page) { | 585 | if (page) { |
564 | if (!avoid_reserve) | 586 | if (avoid_reserve) |
565 | decrement_hugepage_resv_vma(h, vma); | 587 | break; |
588 | if (!vma_has_reserves(vma, chg)) | ||
589 | break; | ||
590 | |||
591 | SetPagePrivate(page); | ||
592 | h->resv_huge_pages--; | ||
566 | break; | 593 | break; |
567 | } | 594 | } |
568 | } | 595 | } |
@@ -574,7 +601,6 @@ retry_cpuset: | |||
574 | return page; | 601 | return page; |
575 | 602 | ||
576 | err: | 603 | err: |
577 | mpol_cond_put(mpol); | ||
578 | return NULL; | 604 | return NULL; |
579 | } | 605 | } |
580 | 606 | ||
@@ -620,15 +646,20 @@ static void free_huge_page(struct page *page) | |||
620 | int nid = page_to_nid(page); | 646 | int nid = page_to_nid(page); |
621 | struct hugepage_subpool *spool = | 647 | struct hugepage_subpool *spool = |
622 | (struct hugepage_subpool *)page_private(page); | 648 | (struct hugepage_subpool *)page_private(page); |
649 | bool restore_reserve; | ||
623 | 650 | ||
624 | set_page_private(page, 0); | 651 | set_page_private(page, 0); |
625 | page->mapping = NULL; | 652 | page->mapping = NULL; |
626 | BUG_ON(page_count(page)); | 653 | BUG_ON(page_count(page)); |
627 | BUG_ON(page_mapcount(page)); | 654 | BUG_ON(page_mapcount(page)); |
655 | restore_reserve = PagePrivate(page); | ||
628 | 656 | ||
629 | spin_lock(&hugetlb_lock); | 657 | spin_lock(&hugetlb_lock); |
630 | hugetlb_cgroup_uncharge_page(hstate_index(h), | 658 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
631 | pages_per_huge_page(h), page); | 659 | pages_per_huge_page(h), page); |
660 | if (restore_reserve) | ||
661 | h->resv_huge_pages++; | ||
662 | |||
632 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 663 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
633 | /* remove the page from active list */ | 664 | /* remove the page from active list */ |
634 | list_del(&page->lru); | 665 | list_del(&page->lru); |
@@ -715,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
715 | return NULL; | 746 | return NULL; |
716 | 747 | ||
717 | page = alloc_pages_exact_node(nid, | 748 | page = alloc_pages_exact_node(nid, |
718 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 749 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
719 | __GFP_REPEAT|__GFP_NOWARN, | 750 | __GFP_REPEAT|__GFP_NOWARN, |
720 | huge_page_order(h)); | 751 | huge_page_order(h)); |
721 | if (page) { | 752 | if (page) { |
@@ -772,33 +803,6 @@ static int hstate_next_node_to_alloc(struct hstate *h, | |||
772 | return nid; | 803 | return nid; |
773 | } | 804 | } |
774 | 805 | ||
775 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | ||
776 | { | ||
777 | struct page *page; | ||
778 | int start_nid; | ||
779 | int next_nid; | ||
780 | int ret = 0; | ||
781 | |||
782 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
783 | next_nid = start_nid; | ||
784 | |||
785 | do { | ||
786 | page = alloc_fresh_huge_page_node(h, next_nid); | ||
787 | if (page) { | ||
788 | ret = 1; | ||
789 | break; | ||
790 | } | ||
791 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
792 | } while (next_nid != start_nid); | ||
793 | |||
794 | if (ret) | ||
795 | count_vm_event(HTLB_BUDDY_PGALLOC); | ||
796 | else | ||
797 | count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | ||
798 | |||
799 | return ret; | ||
800 | } | ||
801 | |||
802 | /* | 806 | /* |
803 | * helper for free_pool_huge_page() - return the previously saved | 807 | * helper for free_pool_huge_page() - return the previously saved |
804 | * node ["this node"] from which to free a huge page. Advance the | 808 | * node ["this node"] from which to free a huge page. Advance the |
@@ -817,6 +821,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | |||
817 | return nid; | 821 | return nid; |
818 | } | 822 | } |
819 | 823 | ||
824 | #define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ | ||
825 | for (nr_nodes = nodes_weight(*mask); \ | ||
826 | nr_nodes > 0 && \ | ||
827 | ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ | ||
828 | nr_nodes--) | ||
829 | |||
830 | #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ | ||
831 | for (nr_nodes = nodes_weight(*mask); \ | ||
832 | nr_nodes > 0 && \ | ||
833 | ((node = hstate_next_node_to_free(hs, mask)) || 1); \ | ||
834 | nr_nodes--) | ||
835 | |||
836 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | ||
837 | { | ||
838 | struct page *page; | ||
839 | int nr_nodes, node; | ||
840 | int ret = 0; | ||
841 | |||
842 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | ||
843 | page = alloc_fresh_huge_page_node(h, node); | ||
844 | if (page) { | ||
845 | ret = 1; | ||
846 | break; | ||
847 | } | ||
848 | } | ||
849 | |||
850 | if (ret) | ||
851 | count_vm_event(HTLB_BUDDY_PGALLOC); | ||
852 | else | ||
853 | count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | ||
854 | |||
855 | return ret; | ||
856 | } | ||
857 | |||
820 | /* | 858 | /* |
821 | * Free huge page from pool from next node to free. | 859 | * Free huge page from pool from next node to free. |
822 | * Attempt to keep persistent huge pages more or less | 860 | * Attempt to keep persistent huge pages more or less |
@@ -826,40 +864,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) | |||
826 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, | 864 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
827 | bool acct_surplus) | 865 | bool acct_surplus) |
828 | { | 866 | { |
829 | int start_nid; | 867 | int nr_nodes, node; |
830 | int next_nid; | ||
831 | int ret = 0; | 868 | int ret = 0; |
832 | 869 | ||
833 | start_nid = hstate_next_node_to_free(h, nodes_allowed); | 870 | for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { |
834 | next_nid = start_nid; | ||
835 | |||
836 | do { | ||
837 | /* | 871 | /* |
838 | * If we're returning unused surplus pages, only examine | 872 | * If we're returning unused surplus pages, only examine |
839 | * nodes with surplus pages. | 873 | * nodes with surplus pages. |
840 | */ | 874 | */ |
841 | if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && | 875 | if ((!acct_surplus || h->surplus_huge_pages_node[node]) && |
842 | !list_empty(&h->hugepage_freelists[next_nid])) { | 876 | !list_empty(&h->hugepage_freelists[node])) { |
843 | struct page *page = | 877 | struct page *page = |
844 | list_entry(h->hugepage_freelists[next_nid].next, | 878 | list_entry(h->hugepage_freelists[node].next, |
845 | struct page, lru); | 879 | struct page, lru); |
846 | list_del(&page->lru); | 880 | list_del(&page->lru); |
847 | h->free_huge_pages--; | 881 | h->free_huge_pages--; |
848 | h->free_huge_pages_node[next_nid]--; | 882 | h->free_huge_pages_node[node]--; |
849 | if (acct_surplus) { | 883 | if (acct_surplus) { |
850 | h->surplus_huge_pages--; | 884 | h->surplus_huge_pages--; |
851 | h->surplus_huge_pages_node[next_nid]--; | 885 | h->surplus_huge_pages_node[node]--; |
852 | } | 886 | } |
853 | update_and_free_page(h, page); | 887 | update_and_free_page(h, page); |
854 | ret = 1; | 888 | ret = 1; |
855 | break; | 889 | break; |
856 | } | 890 | } |
857 | next_nid = hstate_next_node_to_free(h, nodes_allowed); | 891 | } |
858 | } while (next_nid != start_nid); | ||
859 | 892 | ||
860 | return ret; | 893 | return ret; |
861 | } | 894 | } |
862 | 895 | ||
896 | /* | ||
897 | * Dissolve a given free hugepage into free buddy pages. This function does | ||
898 | * nothing for in-use (including surplus) hugepages. | ||
899 | */ | ||
900 | static void dissolve_free_huge_page(struct page *page) | ||
901 | { | ||
902 | spin_lock(&hugetlb_lock); | ||
903 | if (PageHuge(page) && !page_count(page)) { | ||
904 | struct hstate *h = page_hstate(page); | ||
905 | int nid = page_to_nid(page); | ||
906 | list_del(&page->lru); | ||
907 | h->free_huge_pages--; | ||
908 | h->free_huge_pages_node[nid]--; | ||
909 | update_and_free_page(h, page); | ||
910 | } | ||
911 | spin_unlock(&hugetlb_lock); | ||
912 | } | ||
913 | |||
914 | /* | ||
915 | * Dissolve free hugepages in a given pfn range. Used by memory hotplug to | ||
916 | * make specified memory blocks removable from the system. | ||
917 | * Note that start_pfn should be aligned with (minimum) hugepage size. | ||
918 | */ | ||
919 | void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
920 | { | ||
921 | unsigned int order = 8 * sizeof(void *); | ||
922 | unsigned long pfn; | ||
923 | struct hstate *h; | ||
924 | |||
925 | /* Set scan step to minimum hugepage size */ | ||
926 | for_each_hstate(h) | ||
927 | if (order > huge_page_order(h)) | ||
928 | order = huge_page_order(h); | ||
929 | VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); | ||
930 | for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) | ||
931 | dissolve_free_huge_page(pfn_to_page(pfn)); | ||
932 | } | ||
933 | |||
863 | static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | 934 | static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) |
864 | { | 935 | { |
865 | struct page *page; | 936 | struct page *page; |
@@ -902,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
902 | spin_unlock(&hugetlb_lock); | 973 | spin_unlock(&hugetlb_lock); |
903 | 974 | ||
904 | if (nid == NUMA_NO_NODE) | 975 | if (nid == NUMA_NO_NODE) |
905 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 976 | page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| |
906 | __GFP_REPEAT|__GFP_NOWARN, | 977 | __GFP_REPEAT|__GFP_NOWARN, |
907 | huge_page_order(h)); | 978 | huge_page_order(h)); |
908 | else | 979 | else |
909 | page = alloc_pages_exact_node(nid, | 980 | page = alloc_pages_exact_node(nid, |
910 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 981 | htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| |
911 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); | 982 | __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); |
912 | 983 | ||
913 | if (page && arch_prepare_hugepage(page)) { | 984 | if (page && arch_prepare_hugepage(page)) { |
@@ -944,10 +1015,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
944 | */ | 1015 | */ |
945 | struct page *alloc_huge_page_node(struct hstate *h, int nid) | 1016 | struct page *alloc_huge_page_node(struct hstate *h, int nid) |
946 | { | 1017 | { |
947 | struct page *page; | 1018 | struct page *page = NULL; |
948 | 1019 | ||
949 | spin_lock(&hugetlb_lock); | 1020 | spin_lock(&hugetlb_lock); |
950 | page = dequeue_huge_page_node(h, nid); | 1021 | if (h->free_huge_pages - h->resv_huge_pages > 0) |
1022 | page = dequeue_huge_page_node(h, nid); | ||
951 | spin_unlock(&hugetlb_lock); | 1023 | spin_unlock(&hugetlb_lock); |
952 | 1024 | ||
953 | if (!page) | 1025 | if (!page) |
@@ -1035,11 +1107,8 @@ free: | |||
1035 | spin_unlock(&hugetlb_lock); | 1107 | spin_unlock(&hugetlb_lock); |
1036 | 1108 | ||
1037 | /* Free unnecessary surplus pages to the buddy allocator */ | 1109 | /* Free unnecessary surplus pages to the buddy allocator */ |
1038 | if (!list_empty(&surplus_list)) { | 1110 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) |
1039 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1111 | put_page(page); |
1040 | put_page(page); | ||
1041 | } | ||
1042 | } | ||
1043 | spin_lock(&hugetlb_lock); | 1112 | spin_lock(&hugetlb_lock); |
1044 | 1113 | ||
1045 | return ret; | 1114 | return ret; |
@@ -1106,9 +1175,9 @@ static long vma_needs_reservation(struct hstate *h, | |||
1106 | } else { | 1175 | } else { |
1107 | long err; | 1176 | long err; |
1108 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | 1177 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
1109 | struct resv_map *reservations = vma_resv_map(vma); | 1178 | struct resv_map *resv = vma_resv_map(vma); |
1110 | 1179 | ||
1111 | err = region_chg(&reservations->regions, idx, idx + 1); | 1180 | err = region_chg(&resv->regions, idx, idx + 1); |
1112 | if (err < 0) | 1181 | if (err < 0) |
1113 | return err; | 1182 | return err; |
1114 | return 0; | 1183 | return 0; |
@@ -1126,10 +1195,10 @@ static void vma_commit_reservation(struct hstate *h, | |||
1126 | 1195 | ||
1127 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 1196 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
1128 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | 1197 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
1129 | struct resv_map *reservations = vma_resv_map(vma); | 1198 | struct resv_map *resv = vma_resv_map(vma); |
1130 | 1199 | ||
1131 | /* Mark this page used in the map. */ | 1200 | /* Mark this page used in the map. */ |
1132 | region_add(&reservations->regions, idx, idx + 1); | 1201 | region_add(&resv->regions, idx, idx + 1); |
1133 | } | 1202 | } |
1134 | } | 1203 | } |
1135 | 1204 | ||
@@ -1155,38 +1224,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1155 | chg = vma_needs_reservation(h, vma, addr); | 1224 | chg = vma_needs_reservation(h, vma, addr); |
1156 | if (chg < 0) | 1225 | if (chg < 0) |
1157 | return ERR_PTR(-ENOMEM); | 1226 | return ERR_PTR(-ENOMEM); |
1158 | if (chg) | 1227 | if (chg || avoid_reserve) |
1159 | if (hugepage_subpool_get_pages(spool, chg)) | 1228 | if (hugepage_subpool_get_pages(spool, 1)) |
1160 | return ERR_PTR(-ENOSPC); | 1229 | return ERR_PTR(-ENOSPC); |
1161 | 1230 | ||
1162 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1231 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
1163 | if (ret) { | 1232 | if (ret) { |
1164 | hugepage_subpool_put_pages(spool, chg); | 1233 | if (chg || avoid_reserve) |
1234 | hugepage_subpool_put_pages(spool, 1); | ||
1165 | return ERR_PTR(-ENOSPC); | 1235 | return ERR_PTR(-ENOSPC); |
1166 | } | 1236 | } |
1167 | spin_lock(&hugetlb_lock); | 1237 | spin_lock(&hugetlb_lock); |
1168 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); | 1238 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); |
1169 | if (page) { | 1239 | if (!page) { |
1170 | /* update page cgroup details */ | ||
1171 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1172 | h_cg, page); | ||
1173 | spin_unlock(&hugetlb_lock); | ||
1174 | } else { | ||
1175 | spin_unlock(&hugetlb_lock); | 1240 | spin_unlock(&hugetlb_lock); |
1176 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1241 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1177 | if (!page) { | 1242 | if (!page) { |
1178 | hugetlb_cgroup_uncharge_cgroup(idx, | 1243 | hugetlb_cgroup_uncharge_cgroup(idx, |
1179 | pages_per_huge_page(h), | 1244 | pages_per_huge_page(h), |
1180 | h_cg); | 1245 | h_cg); |
1181 | hugepage_subpool_put_pages(spool, chg); | 1246 | if (chg || avoid_reserve) |
1247 | hugepage_subpool_put_pages(spool, 1); | ||
1182 | return ERR_PTR(-ENOSPC); | 1248 | return ERR_PTR(-ENOSPC); |
1183 | } | 1249 | } |
1184 | spin_lock(&hugetlb_lock); | 1250 | spin_lock(&hugetlb_lock); |
1185 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1186 | h_cg, page); | ||
1187 | list_move(&page->lru, &h->hugepage_activelist); | 1251 | list_move(&page->lru, &h->hugepage_activelist); |
1188 | spin_unlock(&hugetlb_lock); | 1252 | /* Fall through */ |
1189 | } | 1253 | } |
1254 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); | ||
1255 | spin_unlock(&hugetlb_lock); | ||
1190 | 1256 | ||
1191 | set_page_private(page, (unsigned long)spool); | 1257 | set_page_private(page, (unsigned long)spool); |
1192 | 1258 | ||
@@ -1194,17 +1260,29 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1194 | return page; | 1260 | return page; |
1195 | } | 1261 | } |
1196 | 1262 | ||
1263 | /* | ||
1264 | * alloc_huge_page()'s wrapper which simply returns the page if allocation | ||
1265 | * succeeds, otherwise NULL. This function is called from new_vma_page(), | ||
1266 | * where no ERR_VALUE is expected to be returned. | ||
1267 | */ | ||
1268 | struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, | ||
1269 | unsigned long addr, int avoid_reserve) | ||
1270 | { | ||
1271 | struct page *page = alloc_huge_page(vma, addr, avoid_reserve); | ||
1272 | if (IS_ERR(page)) | ||
1273 | page = NULL; | ||
1274 | return page; | ||
1275 | } | ||
1276 | |||
1197 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1277 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
1198 | { | 1278 | { |
1199 | struct huge_bootmem_page *m; | 1279 | struct huge_bootmem_page *m; |
1200 | int nr_nodes = nodes_weight(node_states[N_MEMORY]); | 1280 | int nr_nodes, node; |
1201 | 1281 | ||
1202 | while (nr_nodes) { | 1282 | for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { |
1203 | void *addr; | 1283 | void *addr; |
1204 | 1284 | ||
1205 | addr = __alloc_bootmem_node_nopanic( | 1285 | addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), |
1206 | NODE_DATA(hstate_next_node_to_alloc(h, | ||
1207 | &node_states[N_MEMORY])), | ||
1208 | huge_page_size(h), huge_page_size(h), 0); | 1286 | huge_page_size(h), huge_page_size(h), 0); |
1209 | 1287 | ||
1210 | if (addr) { | 1288 | if (addr) { |
@@ -1216,7 +1294,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1216 | m = addr; | 1294 | m = addr; |
1217 | goto found; | 1295 | goto found; |
1218 | } | 1296 | } |
1219 | nr_nodes--; | ||
1220 | } | 1297 | } |
1221 | return 0; | 1298 | return 0; |
1222 | 1299 | ||
@@ -1355,48 +1432,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count, | |||
1355 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, | 1432 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
1356 | int delta) | 1433 | int delta) |
1357 | { | 1434 | { |
1358 | int start_nid, next_nid; | 1435 | int nr_nodes, node; |
1359 | int ret = 0; | ||
1360 | 1436 | ||
1361 | VM_BUG_ON(delta != -1 && delta != 1); | 1437 | VM_BUG_ON(delta != -1 && delta != 1); |
1362 | 1438 | ||
1363 | if (delta < 0) | 1439 | if (delta < 0) { |
1364 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); | 1440 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { |
1365 | else | 1441 | if (h->surplus_huge_pages_node[node]) |
1366 | start_nid = hstate_next_node_to_free(h, nodes_allowed); | 1442 | goto found; |
1367 | next_nid = start_nid; | ||
1368 | |||
1369 | do { | ||
1370 | int nid = next_nid; | ||
1371 | if (delta < 0) { | ||
1372 | /* | ||
1373 | * To shrink on this node, there must be a surplus page | ||
1374 | */ | ||
1375 | if (!h->surplus_huge_pages_node[nid]) { | ||
1376 | next_nid = hstate_next_node_to_alloc(h, | ||
1377 | nodes_allowed); | ||
1378 | continue; | ||
1379 | } | ||
1380 | } | 1443 | } |
1381 | if (delta > 0) { | 1444 | } else { |
1382 | /* | 1445 | for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { |
1383 | * Surplus cannot exceed the total number of pages | 1446 | if (h->surplus_huge_pages_node[node] < |
1384 | */ | 1447 | h->nr_huge_pages_node[node]) |
1385 | if (h->surplus_huge_pages_node[nid] >= | 1448 | goto found; |
1386 | h->nr_huge_pages_node[nid]) { | ||
1387 | next_nid = hstate_next_node_to_free(h, | ||
1388 | nodes_allowed); | ||
1389 | continue; | ||
1390 | } | ||
1391 | } | 1449 | } |
1450 | } | ||
1451 | return 0; | ||
1392 | 1452 | ||
1393 | h->surplus_huge_pages += delta; | 1453 | found: |
1394 | h->surplus_huge_pages_node[nid] += delta; | 1454 | h->surplus_huge_pages += delta; |
1395 | ret = 1; | 1455 | h->surplus_huge_pages_node[node] += delta; |
1396 | break; | 1456 | return 1; |
1397 | } while (next_nid != start_nid); | ||
1398 | |||
1399 | return ret; | ||
1400 | } | 1457 | } |
1401 | 1458 | ||
1402 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1459 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
@@ -1526,7 +1583,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
1526 | struct hstate *h; | 1583 | struct hstate *h; |
1527 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | 1584 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); |
1528 | 1585 | ||
1529 | err = strict_strtoul(buf, 10, &count); | 1586 | err = kstrtoul(buf, 10, &count); |
1530 | if (err) | 1587 | if (err) |
1531 | goto out; | 1588 | goto out; |
1532 | 1589 | ||
@@ -1617,7 +1674,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
1617 | if (h->order >= MAX_ORDER) | 1674 | if (h->order >= MAX_ORDER) |
1618 | return -EINVAL; | 1675 | return -EINVAL; |
1619 | 1676 | ||
1620 | err = strict_strtoul(buf, 10, &input); | 1677 | err = kstrtoul(buf, 10, &input); |
1621 | if (err) | 1678 | if (err) |
1622 | return err; | 1679 | return err; |
1623 | 1680 | ||
@@ -2068,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | |||
2068 | } | 2125 | } |
2069 | #endif /* CONFIG_NUMA */ | 2126 | #endif /* CONFIG_NUMA */ |
2070 | 2127 | ||
2071 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | ||
2072 | void __user *buffer, | ||
2073 | size_t *length, loff_t *ppos) | ||
2074 | { | ||
2075 | proc_dointvec(table, write, buffer, length, ppos); | ||
2076 | if (hugepages_treat_as_movable) | ||
2077 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; | ||
2078 | else | ||
2079 | htlb_alloc_mask = GFP_HIGHUSER; | ||
2080 | return 0; | ||
2081 | } | ||
2082 | |||
2083 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, | 2128 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
2084 | void __user *buffer, | 2129 | void __user *buffer, |
2085 | size_t *length, loff_t *ppos) | 2130 | size_t *length, loff_t *ppos) |
@@ -2207,7 +2252,7 @@ out: | |||
2207 | 2252 | ||
2208 | static void hugetlb_vm_op_open(struct vm_area_struct *vma) | 2253 | static void hugetlb_vm_op_open(struct vm_area_struct *vma) |
2209 | { | 2254 | { |
2210 | struct resv_map *reservations = vma_resv_map(vma); | 2255 | struct resv_map *resv = vma_resv_map(vma); |
2211 | 2256 | ||
2212 | /* | 2257 | /* |
2213 | * This new VMA should share its siblings reservation map if present. | 2258 | * This new VMA should share its siblings reservation map if present. |
@@ -2217,34 +2262,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
2217 | * after this open call completes. It is therefore safe to take a | 2262 | * after this open call completes. It is therefore safe to take a |
2218 | * new reference here without additional locking. | 2263 | * new reference here without additional locking. |
2219 | */ | 2264 | */ |
2220 | if (reservations) | 2265 | if (resv) |
2221 | kref_get(&reservations->refs); | 2266 | kref_get(&resv->refs); |
2222 | } | 2267 | } |
2223 | 2268 | ||
2224 | static void resv_map_put(struct vm_area_struct *vma) | 2269 | static void resv_map_put(struct vm_area_struct *vma) |
2225 | { | 2270 | { |
2226 | struct resv_map *reservations = vma_resv_map(vma); | 2271 | struct resv_map *resv = vma_resv_map(vma); |
2227 | 2272 | ||
2228 | if (!reservations) | 2273 | if (!resv) |
2229 | return; | 2274 | return; |
2230 | kref_put(&reservations->refs, resv_map_release); | 2275 | kref_put(&resv->refs, resv_map_release); |
2231 | } | 2276 | } |
2232 | 2277 | ||
2233 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 2278 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
2234 | { | 2279 | { |
2235 | struct hstate *h = hstate_vma(vma); | 2280 | struct hstate *h = hstate_vma(vma); |
2236 | struct resv_map *reservations = vma_resv_map(vma); | 2281 | struct resv_map *resv = vma_resv_map(vma); |
2237 | struct hugepage_subpool *spool = subpool_vma(vma); | 2282 | struct hugepage_subpool *spool = subpool_vma(vma); |
2238 | unsigned long reserve; | 2283 | unsigned long reserve; |
2239 | unsigned long start; | 2284 | unsigned long start; |
2240 | unsigned long end; | 2285 | unsigned long end; |
2241 | 2286 | ||
2242 | if (reservations) { | 2287 | if (resv) { |
2243 | start = vma_hugecache_offset(h, vma, vma->vm_start); | 2288 | start = vma_hugecache_offset(h, vma, vma->vm_start); |
2244 | end = vma_hugecache_offset(h, vma, vma->vm_end); | 2289 | end = vma_hugecache_offset(h, vma, vma->vm_end); |
2245 | 2290 | ||
2246 | reserve = (end - start) - | 2291 | reserve = (end - start) - |
2247 | region_count(&reservations->regions, start, end); | 2292 | region_count(&resv->regions, start, end); |
2248 | 2293 | ||
2249 | resv_map_put(vma); | 2294 | resv_map_put(vma); |
2250 | 2295 | ||
@@ -2557,7 +2602,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2557 | { | 2602 | { |
2558 | struct hstate *h = hstate_vma(vma); | 2603 | struct hstate *h = hstate_vma(vma); |
2559 | struct page *old_page, *new_page; | 2604 | struct page *old_page, *new_page; |
2560 | int avoidcopy; | ||
2561 | int outside_reserve = 0; | 2605 | int outside_reserve = 0; |
2562 | unsigned long mmun_start; /* For mmu_notifiers */ | 2606 | unsigned long mmun_start; /* For mmu_notifiers */ |
2563 | unsigned long mmun_end; /* For mmu_notifiers */ | 2607 | unsigned long mmun_end; /* For mmu_notifiers */ |
@@ -2567,10 +2611,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2567 | retry_avoidcopy: | 2611 | retry_avoidcopy: |
2568 | /* If no-one else is actually using this page, avoid the copy | 2612 | /* If no-one else is actually using this page, avoid the copy |
2569 | * and just make the page writable */ | 2613 | * and just make the page writable */ |
2570 | avoidcopy = (page_mapcount(old_page) == 1); | 2614 | if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { |
2571 | if (avoidcopy) { | 2615 | page_move_anon_rmap(old_page, vma, address); |
2572 | if (PageAnon(old_page)) | ||
2573 | page_move_anon_rmap(old_page, vma, address); | ||
2574 | set_huge_ptep_writable(vma, address, ptep); | 2616 | set_huge_ptep_writable(vma, address, ptep); |
2575 | return 0; | 2617 | return 0; |
2576 | } | 2618 | } |
@@ -2584,8 +2626,7 @@ retry_avoidcopy: | |||
2584 | * at the time of fork() could consume its reserves on COW instead | 2626 | * at the time of fork() could consume its reserves on COW instead |
2585 | * of the full address range. | 2627 | * of the full address range. |
2586 | */ | 2628 | */ |
2587 | if (!(vma->vm_flags & VM_MAYSHARE) && | 2629 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && |
2588 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && | ||
2589 | old_page != pagecache_page) | 2630 | old_page != pagecache_page) |
2590 | outside_reserve = 1; | 2631 | outside_reserve = 1; |
2591 | 2632 | ||
@@ -2657,6 +2698,8 @@ retry_avoidcopy: | |||
2657 | spin_lock(&mm->page_table_lock); | 2698 | spin_lock(&mm->page_table_lock); |
2658 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2699 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2659 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2700 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
2701 | ClearPagePrivate(new_page); | ||
2702 | |||
2660 | /* Break COW */ | 2703 | /* Break COW */ |
2661 | huge_ptep_clear_flush(vma, address, ptep); | 2704 | huge_ptep_clear_flush(vma, address, ptep); |
2662 | set_huge_pte_at(mm, address, ptep, | 2705 | set_huge_pte_at(mm, address, ptep, |
@@ -2668,10 +2711,11 @@ retry_avoidcopy: | |||
2668 | } | 2711 | } |
2669 | spin_unlock(&mm->page_table_lock); | 2712 | spin_unlock(&mm->page_table_lock); |
2670 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2713 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2671 | /* Caller expects lock to be held */ | ||
2672 | spin_lock(&mm->page_table_lock); | ||
2673 | page_cache_release(new_page); | 2714 | page_cache_release(new_page); |
2674 | page_cache_release(old_page); | 2715 | page_cache_release(old_page); |
2716 | |||
2717 | /* Caller expects lock to be held */ | ||
2718 | spin_lock(&mm->page_table_lock); | ||
2675 | return 0; | 2719 | return 0; |
2676 | } | 2720 | } |
2677 | 2721 | ||
@@ -2767,6 +2811,7 @@ retry: | |||
2767 | goto retry; | 2811 | goto retry; |
2768 | goto out; | 2812 | goto out; |
2769 | } | 2813 | } |
2814 | ClearPagePrivate(page); | ||
2770 | 2815 | ||
2771 | spin_lock(&inode->i_lock); | 2816 | spin_lock(&inode->i_lock); |
2772 | inode->i_blocks += blocks_per_huge_page(h); | 2817 | inode->i_blocks += blocks_per_huge_page(h); |
@@ -2813,8 +2858,10 @@ retry: | |||
2813 | if (!huge_pte_none(huge_ptep_get(ptep))) | 2858 | if (!huge_pte_none(huge_ptep_get(ptep))) |
2814 | goto backout; | 2859 | goto backout; |
2815 | 2860 | ||
2816 | if (anon_rmap) | 2861 | if (anon_rmap) { |
2862 | ClearPagePrivate(page); | ||
2817 | hugepage_add_new_anon_rmap(page, vma, address); | 2863 | hugepage_add_new_anon_rmap(page, vma, address); |
2864 | } | ||
2818 | else | 2865 | else |
2819 | page_dup_rmap(page); | 2866 | page_dup_rmap(page); |
2820 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 2867 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
@@ -3431,3 +3478,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
3431 | return ret; | 3478 | return ret; |
3432 | } | 3479 | } |
3433 | #endif | 3480 | #endif |
3481 | |||
3482 | bool isolate_huge_page(struct page *page, struct list_head *list) | ||
3483 | { | ||
3484 | VM_BUG_ON(!PageHead(page)); | ||
3485 | if (!get_page_unless_zero(page)) | ||
3486 | return false; | ||
3487 | spin_lock(&hugetlb_lock); | ||
3488 | list_move_tail(&page->lru, list); | ||
3489 | spin_unlock(&hugetlb_lock); | ||
3490 | return true; | ||
3491 | } | ||
3492 | |||
3493 | void putback_active_hugepage(struct page *page) | ||
3494 | { | ||
3495 | VM_BUG_ON(!PageHead(page)); | ||
3496 | spin_lock(&hugetlb_lock); | ||
3497 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); | ||
3498 | spin_unlock(&hugetlb_lock); | ||
3499 | put_page(page); | ||
3500 | } | ||
3501 | |||
3502 | bool is_hugepage_active(struct page *page) | ||
3503 | { | ||
3504 | VM_BUG_ON(!PageHuge(page)); | ||
3505 | /* | ||
3506 | * This function can be called for a tail page because the caller, | ||
3507 | * scan_movable_pages, scans through a given pfn-range which typically | ||
3508 | * covers one memory block. In systems using gigantic hugepage (1GB | ||
3509 | * for x86_64,) a hugepage is larger than a memory block, and we don't | ||
3510 | * support migrating such large hugepages for now, so return false | ||
3511 | * when called for tail pages. | ||
3512 | */ | ||
3513 | if (PageTail(page)) | ||
3514 | return false; | ||
3515 | /* | ||
3516 | * Refcount of a hwpoisoned hugepages is 1, but they are not active, | ||
3517 | * so we should return false for them. | ||
3518 | */ | ||
3519 | if (unlikely(PageHWPoison(page))) | ||
3520 | return false; | ||
3521 | return page_count(page) > 0; | ||
3522 | } | ||
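isolate_huge_page(), putback_active_hugepage() and is_hugepage_active(), added above, are the interface the rest of this series (hugepage migration, memory hotplug) builds on. A rough usage sketch -- the surrounding caller is an assumption, not part of this hunk:

LIST_HEAD(pagelist);
struct page *head = compound_head(page);

if (PageHuge(head) && isolate_huge_page(head, &pagelist)) {
	/*
	 * The hugepage now sits on the private 'pagelist' (moved off the
	 * hstate active list) and we hold the extra reference taken by
	 * get_page_unless_zero() inside isolate_huge_page().  A migration
	 * caller would hand 'pagelist' to migrate_pages() at this point;
	 * if it decides not to, it simply gives the page back:
	 */
	putback_active_hugepage(head);	/* re-links the page and drops the ref */
}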
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 3a61efc518d5..afc2daa91c60 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -88,12 +88,12 @@ static int pfn_inject_init(void) | |||
88 | * hardware status change, hence do not require hardware support. | 88 | * hardware status change, hence do not require hardware support. |
89 | * They are mainly for testing hwpoison in software level. | 89 | * They are mainly for testing hwpoison in software level. |
90 | */ | 90 | */ |
91 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 91 | dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, |
92 | NULL, &hwpoison_fops); | 92 | NULL, &hwpoison_fops); |
93 | if (!dentry) | 93 | if (!dentry) |
94 | goto fail; | 94 | goto fail; |
95 | 95 | ||
96 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, | 96 | dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, |
97 | NULL, &unpoison_fops); | 97 | NULL, &unpoison_fops); |
98 | if (!dentry) | 98 | if (!dentry) |
99 | goto fail; | 99 | goto fail; |
diff --git a/mm/internal.h b/mm/internal.h
index 4390ac6c106e..684f7aa9692a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn; | |||
85 | */ | 85 | */ |
86 | extern int isolate_lru_page(struct page *page); | 86 | extern int isolate_lru_page(struct page *page); |
87 | extern void putback_lru_page(struct page *page); | 87 | extern void putback_lru_page(struct page *page); |
88 | extern unsigned long zone_reclaimable_pages(struct zone *zone); | ||
89 | extern bool zone_reclaimable(struct zone *zone); | ||
88 | 90 | ||
89 | /* | 91 | /* |
90 | * in mm/rmap.c: | 92 | * in mm/rmap.c: |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c8d7f3110fd0..e126b0ef9ad2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | |||
1639 | else if (strncmp(buf, "scan=", 5) == 0) { | 1639 | else if (strncmp(buf, "scan=", 5) == 0) { |
1640 | unsigned long secs; | 1640 | unsigned long secs; |
1641 | 1641 | ||
1642 | ret = strict_strtoul(buf + 5, 0, &secs); | 1642 | ret = kstrtoul(buf + 5, 0, &secs); |
1643 | if (ret < 0) | 1643 | if (ret < 0) |
1644 | goto out; | 1644 | goto out; |
1645 | stop_scan_thread(); | 1645 | stop_scan_thread(); |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
2194 | unsigned long msecs; | 2194 | unsigned long msecs; |
2195 | int err; | 2195 | int err; |
2196 | 2196 | ||
2197 | err = strict_strtoul(buf, 10, &msecs); | 2197 | err = kstrtoul(buf, 10, &msecs); |
2198 | if (err || msecs > UINT_MAX) | 2198 | if (err || msecs > UINT_MAX) |
2199 | return -EINVAL; | 2199 | return -EINVAL; |
2200 | 2200 | ||
@@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, | |||
2217 | int err; | 2217 | int err; |
2218 | unsigned long nr_pages; | 2218 | unsigned long nr_pages; |
2219 | 2219 | ||
2220 | err = strict_strtoul(buf, 10, &nr_pages); | 2220 | err = kstrtoul(buf, 10, &nr_pages); |
2221 | if (err || nr_pages > UINT_MAX) | 2221 | if (err || nr_pages > UINT_MAX) |
2222 | return -EINVAL; | 2222 | return -EINVAL; |
2223 | 2223 | ||
@@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
2239 | int err; | 2239 | int err; |
2240 | unsigned long flags; | 2240 | unsigned long flags; |
2241 | 2241 | ||
2242 | err = strict_strtoul(buf, 10, &flags); | 2242 | err = kstrtoul(buf, 10, &flags); |
2243 | if (err || flags > UINT_MAX) | 2243 | if (err || flags > UINT_MAX) |
2244 | return -EINVAL; | 2244 | return -EINVAL; |
2245 | if (flags > KSM_RUN_UNMERGE) | 2245 | if (flags > KSM_RUN_UNMERGE) |
diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883e6e25..6975bc812542 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior) | |||
42 | * We can potentially split a vm area into separate | 42 | * We can potentially split a vm area into separate |
43 | * areas, each area with its own behavior. | 43 | * areas, each area with its own behavior. |
44 | */ | 44 | */ |
45 | static long madvise_behavior(struct vm_area_struct * vma, | 45 | static long madvise_behavior(struct vm_area_struct *vma, |
46 | struct vm_area_struct **prev, | 46 | struct vm_area_struct **prev, |
47 | unsigned long start, unsigned long end, int behavior) | 47 | unsigned long start, unsigned long end, int behavior) |
48 | { | 48 | { |
49 | struct mm_struct * mm = vma->vm_mm; | 49 | struct mm_struct *mm = vma->vm_mm; |
50 | int error = 0; | 50 | int error = 0; |
51 | pgoff_t pgoff; | 51 | pgoff_t pgoff; |
52 | unsigned long new_flags = vma->vm_flags; | 52 | unsigned long new_flags = vma->vm_flags; |
@@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, | |||
215 | /* | 215 | /* |
216 | * Schedule all required I/O operations. Do not wait for completion. | 216 | * Schedule all required I/O operations. Do not wait for completion. |
217 | */ | 217 | */ |
218 | static long madvise_willneed(struct vm_area_struct * vma, | 218 | static long madvise_willneed(struct vm_area_struct *vma, |
219 | struct vm_area_struct ** prev, | 219 | struct vm_area_struct **prev, |
220 | unsigned long start, unsigned long end) | 220 | unsigned long start, unsigned long end) |
221 | { | 221 | { |
222 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
@@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
270 | * An interface that causes the system to free clean pages and flush | 270 | * An interface that causes the system to free clean pages and flush |
271 | * dirty pages is already available as msync(MS_INVALIDATE). | 271 | * dirty pages is already available as msync(MS_INVALIDATE). |
272 | */ | 272 | */ |
273 | static long madvise_dontneed(struct vm_area_struct * vma, | 273 | static long madvise_dontneed(struct vm_area_struct *vma, |
274 | struct vm_area_struct ** prev, | 274 | struct vm_area_struct **prev, |
275 | unsigned long start, unsigned long end) | 275 | unsigned long start, unsigned long end) |
276 | { | 276 | { |
277 | *prev = vma; | 277 | *prev = vma; |
@@ -343,29 +343,34 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
343 | */ | 343 | */ |
344 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) | 344 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
345 | { | 345 | { |
346 | int ret = 0; | ||
347 | |||
348 | if (!capable(CAP_SYS_ADMIN)) | 346 | if (!capable(CAP_SYS_ADMIN)) |
349 | return -EPERM; | 347 | return -EPERM; |
350 | for (; start < end; start += PAGE_SIZE) { | 348 | for (; start < end; start += PAGE_SIZE) { |
351 | struct page *p; | 349 | struct page *p; |
352 | int ret = get_user_pages_fast(start, 1, 0, &p); | 350 | int ret; |
351 | |||
352 | ret = get_user_pages_fast(start, 1, 0, &p); | ||
353 | if (ret != 1) | 353 | if (ret != 1) |
354 | return ret; | 354 | return ret; |
355 | |||
356 | if (PageHWPoison(p)) { | ||
357 | put_page(p); | ||
358 | continue; | ||
359 | } | ||
355 | if (bhv == MADV_SOFT_OFFLINE) { | 360 | if (bhv == MADV_SOFT_OFFLINE) { |
356 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | 361 | pr_info("Soft offlining page %#lx at %#lx\n", |
357 | page_to_pfn(p), start); | 362 | page_to_pfn(p), start); |
358 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | 363 | ret = soft_offline_page(p, MF_COUNT_INCREASED); |
359 | if (ret) | 364 | if (ret) |
360 | break; | 365 | return ret; |
361 | continue; | 366 | continue; |
362 | } | 367 | } |
363 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 368 | pr_info("Injecting memory failure for page %#lx at %#lx\n", |
364 | page_to_pfn(p), start); | 369 | page_to_pfn(p), start); |
365 | /* Ignore return value for now */ | 370 | /* Ignore return value for now */ |
366 | memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); | 371 | memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
367 | } | 372 | } |
368 | return ret; | 373 | return 0; |
369 | } | 374 | } |
370 | #endif | 375 | #endif |
371 | 376 | ||
@@ -459,7 +464,7 @@ madvise_behavior_valid(int behavior) | |||
459 | SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | 464 | SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) |
460 | { | 465 | { |
461 | unsigned long end, tmp; | 466 | unsigned long end, tmp; |
462 | struct vm_area_struct * vma, *prev; | 467 | struct vm_area_struct *vma, *prev; |
463 | int unmapped_error = 0; | 468 | int unmapped_error = 0; |
464 | int error = -EINVAL; | 469 | int error = -EINVAL; |
465 | int write; | 470 | int write; |
diff --git a/mm/memblock.c b/mm/memblock.c
index a847bfe6f3ba..0ac412a0a7ee 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
914 | return memblock_search(&memblock.memory, addr) != -1; | 914 | return memblock_search(&memblock.memory, addr) != -1; |
915 | } | 915 | } |
916 | 916 | ||
917 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
918 | int __init_memblock memblock_search_pfn_nid(unsigned long pfn, | ||
919 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
920 | { | ||
921 | struct memblock_type *type = &memblock.memory; | ||
922 | int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); | ||
923 | |||
924 | if (mid == -1) | ||
925 | return -1; | ||
926 | |||
927 | *start_pfn = type->regions[mid].base >> PAGE_SHIFT; | ||
928 | *end_pfn = (type->regions[mid].base + type->regions[mid].size) | ||
929 | >> PAGE_SHIFT; | ||
930 | |||
931 | return type->regions[mid].nid; | ||
932 | } | ||
933 | #endif | ||
934 | |||
917 | /** | 935 | /** |
918 | * memblock_is_region_memory - check if a region is a subset of memory | 936 | * memblock_is_region_memory - check if a region is a subset of memory |
919 | * @base: base of region to check | 937 | * @base: base of region to check |
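A short illustration of how the new memblock_search_pfn_nid() helper is meant to be consumed; the caller and variable names below are assumptions, not part of the patch:

/* Illustrative caller */
unsigned long start_pfn, end_pfn;
int nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);

if (nid != -1) {
	/*
	 * pfn lies inside the memblock.memory region [start_pfn, end_pfn),
	 * registered for node 'nid'; a caller can cache this range so that
	 * lookups for neighbouring pfns skip the binary search.
	 */
}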
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3b83957b6439..c6bd28edd533 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3121 | ssize_t size = memcg_caches_array_size(num_groups); | 3121 | ssize_t size = memcg_caches_array_size(num_groups); |
3122 | 3122 | ||
3123 | size *= sizeof(void *); | 3123 | size *= sizeof(void *); |
3124 | size += sizeof(struct memcg_cache_params); | 3124 | size += offsetof(struct memcg_cache_params, memcg_caches); |
3125 | 3125 | ||
3126 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 3126 | s->memcg_params = kzalloc(size, GFP_KERNEL); |
3127 | if (!s->memcg_params) { | 3127 | if (!s->memcg_params) { |
@@ -3164,13 +3164,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | |||
3164 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | 3164 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, |
3165 | struct kmem_cache *root_cache) | 3165 | struct kmem_cache *root_cache) |
3166 | { | 3166 | { |
3167 | size_t size = sizeof(struct memcg_cache_params); | 3167 | size_t size; |
3168 | 3168 | ||
3169 | if (!memcg_kmem_enabled()) | 3169 | if (!memcg_kmem_enabled()) |
3170 | return 0; | 3170 | return 0; |
3171 | 3171 | ||
3172 | if (!memcg) | 3172 | if (!memcg) { |
3173 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
3173 | size += memcg_limited_groups_array_size * sizeof(void *); | 3174 | size += memcg_limited_groups_array_size * sizeof(void *); |
3175 | } else | ||
3176 | size = sizeof(struct memcg_cache_params); | ||
3174 | 3177 | ||
3175 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 3178 | s->memcg_params = kzalloc(size, GFP_KERNEL); |
3176 | if (!s->memcg_params) | 3179 | if (!s->memcg_params) |
@@ -5588,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b) | |||
5588 | const struct mem_cgroup_threshold *_a = a; | 5591 | const struct mem_cgroup_threshold *_a = a; |
5589 | const struct mem_cgroup_threshold *_b = b; | 5592 | const struct mem_cgroup_threshold *_b = b; |
5590 | 5593 | ||
5591 | return _a->threshold - _b->threshold; | 5594 | if (_a->threshold > _b->threshold) |
5595 | return 1; | ||
5596 | |||
5597 | if (_a->threshold < _b->threshold) | ||
5598 | return -1; | ||
5599 | |||
5600 | return 0; | ||
5592 | } | 5601 | } |
5593 | 5602 | ||
5594 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) | 5603 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) |
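
Two of the memcontrol hunks above share a theme: size a structure that ends in a flexible array from offsetof() plus the element count rather than sizeof(), and compare 64-bit thresholds explicitly instead of returning a truncated subtraction (which can flip sign for large values). A minimal userspace sketch of both ideas, with stand-in struct names rather than the kernel's, looks like:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>

/* Stand-in for a params struct: fixed header plus a trailing array whose
 * length is only known at run time. */
struct params {
        int is_root;
        void *caches[];         /* flexible array member */
};

struct threshold {
        uint64_t threshold;
};

/* Subtracting two u64 values and truncating to int can report the wrong
 * order, so compare explicitly. */
static int compare_thresholds(const void *a, const void *b)
{
        const struct threshold *_a = a;
        const struct threshold *_b = b;

        if (_a->threshold > _b->threshold)
                return 1;
        if (_a->threshold < _b->threshold)
                return -1;
        return 0;
}

int main(void)
{
        size_t nr_caches = 8;
        /* Size the allocation from the start of the flexible array plus
         * the elements actually needed. */
        struct params *p = calloc(1, offsetof(struct params, caches)
                                     + nr_caches * sizeof(void *));
        struct threshold t[] = { { 1ULL << 40 }, { 100 }, { 1ULL << 33 } };
        size_t i;

        if (!p)
                return 1;
        qsort(t, 3, sizeof(t[0]), compare_thresholds);
        for (i = 0; i < 3; i++)
                printf("%llu\n", (unsigned long long)t[i].threshold);
        free(p);
        return 0;
}
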
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index baa4e0a45dec..947ed5413279 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -206,7 +206,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, | |||
206 | #ifdef __ARCH_SI_TRAPNO | 206 | #ifdef __ARCH_SI_TRAPNO |
207 | si.si_trapno = trapno; | 207 | si.si_trapno = trapno; |
208 | #endif | 208 | #endif |
209 | si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; | 209 | si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; |
210 | 210 | ||
211 | if ((flags & MF_ACTION_REQUIRED) && t == current) { | 211 | if ((flags & MF_ACTION_REQUIRED) && t == current) { |
212 | si.si_code = BUS_MCEERR_AR; | 212 | si.si_code = BUS_MCEERR_AR; |
@@ -985,7 +985,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
985 | static void set_page_hwpoison_huge_page(struct page *hpage) | 985 | static void set_page_hwpoison_huge_page(struct page *hpage) |
986 | { | 986 | { |
987 | int i; | 987 | int i; |
988 | int nr_pages = 1 << compound_trans_order(hpage); | 988 | int nr_pages = 1 << compound_order(hpage); |
989 | for (i = 0; i < nr_pages; i++) | 989 | for (i = 0; i < nr_pages; i++) |
990 | SetPageHWPoison(hpage + i); | 990 | SetPageHWPoison(hpage + i); |
991 | } | 991 | } |
@@ -993,7 +993,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) | |||
993 | static void clear_page_hwpoison_huge_page(struct page *hpage) | 993 | static void clear_page_hwpoison_huge_page(struct page *hpage) |
994 | { | 994 | { |
995 | int i; | 995 | int i; |
996 | int nr_pages = 1 << compound_trans_order(hpage); | 996 | int nr_pages = 1 << compound_order(hpage); |
997 | for (i = 0; i < nr_pages; i++) | 997 | for (i = 0; i < nr_pages; i++) |
998 | ClearPageHWPoison(hpage + i); | 998 | ClearPageHWPoison(hpage + i); |
999 | } | 999 | } |
@@ -1206,6 +1206,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1206 | for (ps = error_states;; ps++) | 1206 | for (ps = error_states;; ps++) |
1207 | if ((p->flags & ps->mask) == ps->res) | 1207 | if ((p->flags & ps->mask) == ps->res) |
1208 | break; | 1208 | break; |
1209 | |||
1210 | page_flags |= (p->flags & (1UL << PG_dirty)); | ||
1211 | |||
1209 | if (!ps->mask) | 1212 | if (!ps->mask) |
1210 | for (ps = error_states;; ps++) | 1213 | for (ps = error_states;; ps++) |
1211 | if ((page_flags & ps->mask) == ps->res) | 1214 | if ((page_flags & ps->mask) == ps->res) |
@@ -1341,7 +1344,17 @@ int unpoison_memory(unsigned long pfn) | |||
1341 | return 0; | 1344 | return 0; |
1342 | } | 1345 | } |
1343 | 1346 | ||
1344 | nr_pages = 1 << compound_trans_order(page); | 1347 | /* |
1348 | * unpoison_memory() can encounter thp only when the thp is being | ||
1349 | * handled by memory_failure() and the page lock is not held yet. | ||
1350 | * In such a case, we yield to memory_failure() and make unpoison fail. | ||
1351 | */ | ||
1352 | if (PageTransHuge(page)) { | ||
1353 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); | ||
1354 | return 0; | ||
1355 | } | ||
1356 | |||
1357 | nr_pages = 1 << compound_order(page); | ||
1345 | 1358 | ||
1346 | if (!get_page_unless_zero(page)) { | 1359 | if (!get_page_unless_zero(page)) { |
1347 | /* | 1360 | /* |
@@ -1355,7 +1368,7 @@ int unpoison_memory(unsigned long pfn) | |||
1355 | return 0; | 1368 | return 0; |
1356 | } | 1369 | } |
1357 | if (TestClearPageHWPoison(p)) | 1370 | if (TestClearPageHWPoison(p)) |
1358 | atomic_long_sub(nr_pages, &num_poisoned_pages); | 1371 | atomic_long_dec(&num_poisoned_pages); |
1359 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1372 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1360 | return 0; | 1373 | return 0; |
1361 | } | 1374 | } |
@@ -1377,7 +1390,7 @@ int unpoison_memory(unsigned long pfn) | |||
1377 | unlock_page(page); | 1390 | unlock_page(page); |
1378 | 1391 | ||
1379 | put_page(page); | 1392 | put_page(page); |
1380 | if (freeit) | 1393 | if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) |
1381 | put_page(page); | 1394 | put_page(page); |
1382 | 1395 | ||
1383 | return 0; | 1396 | return 0; |
@@ -1418,7 +1431,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1418 | * was free. This flag should be kept set until the source page | 1431 | * was free. This flag should be kept set until the source page |
1419 | * is freed and PG_hwpoison on it is set. | 1432 | * is freed and PG_hwpoison on it is set. |
1420 | */ | 1433 | */ |
1421 | set_migratetype_isolate(p, true); | 1434 | if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) |
1435 | set_migratetype_isolate(p, true); | ||
1422 | /* | 1436 | /* |
1423 | * When the target page is a free hugepage, just remove it | 1437 | * When the target page is a free hugepage, just remove it |
1424 | * from free hugepage list. | 1438 | * from free hugepage list. |
@@ -1472,6 +1486,7 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1472 | int ret; | 1486 | int ret; |
1473 | unsigned long pfn = page_to_pfn(page); | 1487 | unsigned long pfn = page_to_pfn(page); |
1474 | struct page *hpage = compound_head(page); | 1488 | struct page *hpage = compound_head(page); |
1489 | LIST_HEAD(pagelist); | ||
1475 | 1490 | ||
1476 | /* | 1491 | /* |
1477 | * This double-check of PageHWPoison is to avoid the race with | 1492 | * This double-check of PageHWPoison is to avoid the race with |
@@ -1487,86 +1502,29 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1487 | unlock_page(hpage); | 1502 | unlock_page(hpage); |
1488 | 1503 | ||
1489 | /* Keep page count to indicate a given hugepage is isolated. */ | 1504 | /* Keep page count to indicate a given hugepage is isolated. */ |
1490 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, | 1505 | list_move(&hpage->lru, &pagelist); |
1491 | MIGRATE_SYNC); | 1506 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1492 | put_page(hpage); | 1507 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1493 | if (ret) { | 1508 | if (ret) { |
1494 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1509 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1495 | pfn, ret, page->flags); | 1510 | pfn, ret, page->flags); |
1511 | /* | ||
1512 | * We know that soft_offline_huge_page() tries to migrate | ||
1513 | * only one hugepage pointed to by hpage, so we need not | ||
1514 | * run through the pagelist here. | ||
1515 | */ | ||
1516 | putback_active_hugepage(hpage); | ||
1517 | if (ret > 0) | ||
1518 | ret = -EIO; | ||
1496 | } else { | 1519 | } else { |
1497 | set_page_hwpoison_huge_page(hpage); | 1520 | set_page_hwpoison_huge_page(hpage); |
1498 | dequeue_hwpoisoned_huge_page(hpage); | 1521 | dequeue_hwpoisoned_huge_page(hpage); |
1499 | atomic_long_add(1 << compound_trans_order(hpage), | 1522 | atomic_long_add(1 << compound_order(hpage), |
1500 | &num_poisoned_pages); | 1523 | &num_poisoned_pages); |
1501 | } | 1524 | } |
1502 | return ret; | 1525 | return ret; |
1503 | } | 1526 | } |
1504 | 1527 | ||
1505 | static int __soft_offline_page(struct page *page, int flags); | ||
1506 | |||
1507 | /** | ||
1508 | * soft_offline_page - Soft offline a page. | ||
1509 | * @page: page to offline | ||
1510 | * @flags: flags. Same as memory_failure(). | ||
1511 | * | ||
1512 | * Returns 0 on success, otherwise negated errno. | ||
1513 | * | ||
1514 | * Soft offline a page, by migration or invalidation, | ||
1515 | * without killing anything. This is for the case when | ||
1516 | * a page is not corrupted yet (so it's still valid to access), | ||
1517 | * but has had a number of corrected errors and is better taken | ||
1518 | * out. | ||
1519 | * | ||
1520 | * The actual policy on when to do that is maintained by | ||
1521 | * user space. | ||
1522 | * | ||
1523 | * This should never impact any application or cause data loss, | ||
1524 | * however it might take some time. | ||
1525 | * | ||
1526 | * This is not a 100% solution for all memory, but tries to be | ||
1527 | * ``good enough'' for the majority of memory. | ||
1528 | */ | ||
1529 | int soft_offline_page(struct page *page, int flags) | ||
1530 | { | ||
1531 | int ret; | ||
1532 | unsigned long pfn = page_to_pfn(page); | ||
1533 | struct page *hpage = compound_trans_head(page); | ||
1534 | |||
1535 | if (PageHWPoison(page)) { | ||
1536 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | ||
1537 | return -EBUSY; | ||
1538 | } | ||
1539 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
1540 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | ||
1541 | pr_info("soft offline: %#lx: failed to split THP\n", | ||
1542 | pfn); | ||
1543 | return -EBUSY; | ||
1544 | } | ||
1545 | } | ||
1546 | |||
1547 | ret = get_any_page(page, pfn, flags); | ||
1548 | if (ret < 0) | ||
1549 | return ret; | ||
1550 | if (ret) { /* for in-use pages */ | ||
1551 | if (PageHuge(page)) | ||
1552 | ret = soft_offline_huge_page(page, flags); | ||
1553 | else | ||
1554 | ret = __soft_offline_page(page, flags); | ||
1555 | } else { /* for free pages */ | ||
1556 | if (PageHuge(page)) { | ||
1557 | set_page_hwpoison_huge_page(hpage); | ||
1558 | dequeue_hwpoisoned_huge_page(hpage); | ||
1559 | atomic_long_add(1 << compound_trans_order(hpage), | ||
1560 | &num_poisoned_pages); | ||
1561 | } else { | ||
1562 | SetPageHWPoison(page); | ||
1563 | atomic_long_inc(&num_poisoned_pages); | ||
1564 | } | ||
1565 | } | ||
1566 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); | ||
1567 | return ret; | ||
1568 | } | ||
1569 | |||
1570 | static int __soft_offline_page(struct page *page, int flags) | 1528 | static int __soft_offline_page(struct page *page, int flags) |
1571 | { | 1529 | { |
1572 | int ret; | 1530 | int ret; |
@@ -1653,3 +1611,67 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1653 | } | 1611 | } |
1654 | return ret; | 1612 | return ret; |
1655 | } | 1613 | } |
1614 | |||
1615 | /** | ||
1616 | * soft_offline_page - Soft offline a page. | ||
1617 | * @page: page to offline | ||
1618 | * @flags: flags. Same as memory_failure(). | ||
1619 | * | ||
1620 | * Returns 0 on success, otherwise negated errno. | ||
1621 | * | ||
1622 | * Soft offline a page, by migration or invalidation, | ||
1623 | * without killing anything. This is for the case when | ||
1624 | * a page is not corrupted yet (so it's still valid to access), | ||
1625 | * but has had a number of corrected errors and is better taken | ||
1626 | * out. | ||
1627 | * | ||
1628 | * The actual policy on when to do that is maintained by | ||
1629 | * user space. | ||
1630 | * | ||
1631 | * This should never impact any application or cause data loss, | ||
1632 | * however it might take some time. | ||
1633 | * | ||
1634 | * This is not a 100% solution for all memory, but tries to be | ||
1635 | * ``good enough'' for the majority of memory. | ||
1636 | */ | ||
1637 | int soft_offline_page(struct page *page, int flags) | ||
1638 | { | ||
1639 | int ret; | ||
1640 | unsigned long pfn = page_to_pfn(page); | ||
1641 | struct page *hpage = compound_trans_head(page); | ||
1642 | |||
1643 | if (PageHWPoison(page)) { | ||
1644 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | ||
1645 | return -EBUSY; | ||
1646 | } | ||
1647 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
1648 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | ||
1649 | pr_info("soft offline: %#lx: failed to split THP\n", | ||
1650 | pfn); | ||
1651 | return -EBUSY; | ||
1652 | } | ||
1653 | } | ||
1654 | |||
1655 | ret = get_any_page(page, pfn, flags); | ||
1656 | if (ret < 0) | ||
1657 | goto unset; | ||
1658 | if (ret) { /* for in-use pages */ | ||
1659 | if (PageHuge(page)) | ||
1660 | ret = soft_offline_huge_page(page, flags); | ||
1661 | else | ||
1662 | ret = __soft_offline_page(page, flags); | ||
1663 | } else { /* for free pages */ | ||
1664 | if (PageHuge(page)) { | ||
1665 | set_page_hwpoison_huge_page(hpage); | ||
1666 | dequeue_hwpoisoned_huge_page(hpage); | ||
1667 | atomic_long_add(1 << compound_order(hpage), | ||
1668 | &num_poisoned_pages); | ||
1669 | } else { | ||
1670 | SetPageHWPoison(page); | ||
1671 | atomic_long_inc(&num_poisoned_pages); | ||
1672 | } | ||
1673 | } | ||
1674 | unset: | ||
1675 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); | ||
1676 | return ret; | ||
1677 | } | ||
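
soft_offline_huge_page() now follows the common migration calling convention: move the hugepage onto a private list, hand the whole list to migrate_pages(), and on failure put the entry back where it came from. The shape of that isolate/process/putback pattern, reduced to a plain singly linked list in userspace C (nothing here is kernel API), is:

#include <stdio.h>

struct item {
        int id;
        struct item *next;
};

/* Detach and return the first item of *list, or NULL if the list is empty. */
static struct item *isolate(struct item **list)
{
        struct item *it = *list;

        if (it)
                *list = it->next;
        return it;
}

/* Put an item back on the front of *list. */
static void putback(struct item **list, struct item *it)
{
        it->next = *list;
        *list = it;
}

/* Stand-in for the migration step: succeed for even ids, fail for odd ones. */
static int process(struct item *it)
{
        return (it->id % 2) ? -1 : 0;
}

int main(void)
{
        struct item c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct item *active = &a;       /* items waiting to be processed */
        struct item *failed = NULL;     /* items put back after a failure */
        struct item *it;

        while ((it = isolate(&active)) != NULL) {
                if (process(it)) {
                        /* like putback_active_hugepage() after a failed migration */
                        putback(&failed, it);
                        printf("item %d failed, put back\n", it->id);
                } else {
                        printf("item %d migrated\n", it->id);
                }
        }
        for (it = failed; it; it = it->next)
                printf("still pending: item %d\n", it->id);
        return 0;
}

Keeping failures on a list of their own mirrors why the hunk adds putback_active_hugepage() on the error path: the page must end up somewhere accounted for, never silently dropped.
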
diff --git a/mm/memory.c b/mm/memory.c index b3c6bf9a398e..2b73dbde2274 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -373,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) | |||
373 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ | 373 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ |
374 | 374 | ||
375 | /* | 375 | /* |
376 | * If a p?d_bad entry is found while walking page tables, report | ||
377 | * the error, before resetting entry to p?d_none. Usually (but | ||
378 | * very seldom) called out from the p?d_none_or_clear_bad macros. | ||
379 | */ | ||
380 | |||
381 | void pgd_clear_bad(pgd_t *pgd) | ||
382 | { | ||
383 | pgd_ERROR(*pgd); | ||
384 | pgd_clear(pgd); | ||
385 | } | ||
386 | |||
387 | void pud_clear_bad(pud_t *pud) | ||
388 | { | ||
389 | pud_ERROR(*pud); | ||
390 | pud_clear(pud); | ||
391 | } | ||
392 | |||
393 | void pmd_clear_bad(pmd_t *pmd) | ||
394 | { | ||
395 | pmd_ERROR(*pmd); | ||
396 | pmd_clear(pmd); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Note: this doesn't free the actual pages themselves. That | 376 | * Note: this doesn't free the actual pages themselves. That |
401 | * has been handled earlier when unmapping all the memory regions. | 377 | * has been handled earlier when unmapping all the memory regions. |
402 | */ | 378 | */ |
@@ -1505,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
1505 | if (pud_none(*pud)) | 1481 | if (pud_none(*pud)) |
1506 | goto no_page_table; | 1482 | goto no_page_table; |
1507 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | 1483 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
1508 | BUG_ON(flags & FOLL_GET); | 1484 | if (flags & FOLL_GET) |
1485 | goto out; | ||
1509 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 1486 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); |
1510 | goto out; | 1487 | goto out; |
1511 | } | 1488 | } |
@@ -1516,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
1516 | if (pmd_none(*pmd)) | 1493 | if (pmd_none(*pmd)) |
1517 | goto no_page_table; | 1494 | goto no_page_table; |
1518 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | 1495 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
1519 | BUG_ON(flags & FOLL_GET); | ||
1520 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1496 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1497 | if (flags & FOLL_GET) { | ||
1498 | /* | ||
1499 | * Refcount on tail pages are not well-defined and | ||
1500 | * shouldn't be taken. The caller should handle a NULL | ||
1501 | * return when trying to follow tail pages. | ||
1502 | */ | ||
1503 | if (PageHead(page)) | ||
1504 | get_page(page); | ||
1505 | else { | ||
1506 | page = NULL; | ||
1507 | goto out; | ||
1508 | } | ||
1509 | } | ||
1521 | goto out; | 1510 | goto out; |
1522 | } | 1511 | } |
1523 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 1512 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) |
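
The follow_page_mask() hunks above replace BUG_ON(flags & FOLL_GET) with a graceful failure: a reference may only be taken on a head page, and asking for one on a tail page now returns NULL for the caller to handle. A toy model of that decision, with a made-up page struct and refcount rather than the kernel's, is:

#include <stdio.h>
#include <stdbool.h>

struct page {
        bool head;      /* true for the head page of a compound page */
        int refcount;
};

/*
 * Return the page, taking a reference only when one was requested and the
 * page is a head page; a tail page has no well-defined refcount here, so a
 * get request on it returns NULL and the caller must cope with that.
 */
static struct page *follow_huge(struct page *page, bool get)
{
        if (!get)
                return page;
        if (!page->head)
                return NULL;
        page->refcount++;
        return page;
}

int main(void)
{
        struct page head = { true, 1 }, tail = { false, 0 };

        printf("head with get: %s\n", follow_huge(&head, true) ? "pinned" : "NULL");
        printf("tail with get: %s\n", follow_huge(&tail, true) ? "pinned" : "NULL");
        return 0;
}
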
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ca1dd3aa5eee..ed85fe3870e2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
31 | #include <linux/firmware-map.h> | 31 | #include <linux/firmware-map.h> |
32 | #include <linux/stop_machine.h> | 32 | #include <linux/stop_machine.h> |
33 | #include <linux/hugetlb.h> | ||
33 | 34 | ||
34 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
35 | 36 | ||
@@ -51,14 +52,10 @@ DEFINE_MUTEX(mem_hotplug_mutex); | |||
51 | void lock_memory_hotplug(void) | 52 | void lock_memory_hotplug(void) |
52 | { | 53 | { |
53 | mutex_lock(&mem_hotplug_mutex); | 54 | mutex_lock(&mem_hotplug_mutex); |
54 | |||
55 | /* for exclusive hibernation if CONFIG_HIBERNATION=y */ | ||
56 | lock_system_sleep(); | ||
57 | } | 55 | } |
58 | 56 | ||
59 | void unlock_memory_hotplug(void) | 57 | void unlock_memory_hotplug(void) |
60 | { | 58 | { |
61 | unlock_system_sleep(); | ||
62 | mutex_unlock(&mem_hotplug_mutex); | 59 | mutex_unlock(&mem_hotplug_mutex); |
63 | } | 60 | } |
64 | 61 | ||
@@ -194,7 +191,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
194 | 191 | ||
195 | zone = &pgdat->node_zones[0]; | 192 | zone = &pgdat->node_zones[0]; |
196 | for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { | 193 | for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { |
197 | if (zone->wait_table) { | 194 | if (zone_is_initialized(zone)) { |
198 | nr_pages = zone->wait_table_hash_nr_entries | 195 | nr_pages = zone->wait_table_hash_nr_entries |
199 | * sizeof(wait_queue_head_t); | 196 | * sizeof(wait_queue_head_t); |
200 | nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; | 197 | nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; |
@@ -229,8 +226,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | |||
229 | 226 | ||
230 | zone_span_writelock(zone); | 227 | zone_span_writelock(zone); |
231 | 228 | ||
232 | old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 229 | old_zone_end_pfn = zone_end_pfn(zone); |
233 | if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) | 230 | if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) |
234 | zone->zone_start_pfn = start_pfn; | 231 | zone->zone_start_pfn = start_pfn; |
235 | 232 | ||
236 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - | 233 | zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - |
@@ -305,7 +302,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
305 | goto out_fail; | 302 | goto out_fail; |
306 | 303 | ||
307 | /* use start_pfn for z1's start_pfn if z1 is empty */ | 304 | /* use start_pfn for z1's start_pfn if z1 is empty */ |
308 | if (z1->spanned_pages) | 305 | if (!zone_is_empty(z1)) |
309 | z1_start_pfn = z1->zone_start_pfn; | 306 | z1_start_pfn = z1->zone_start_pfn; |
310 | else | 307 | else |
311 | z1_start_pfn = start_pfn; | 308 | z1_start_pfn = start_pfn; |
@@ -347,7 +344,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
347 | goto out_fail; | 344 | goto out_fail; |
348 | 345 | ||
349 | /* use end_pfn for z2's end_pfn if z2 is empty */ | 346 | /* use end_pfn for z2's end_pfn if z2 is empty */ |
350 | if (z2->spanned_pages) | 347 | if (!zone_is_empty(z2)) |
351 | z2_end_pfn = zone_end_pfn(z2); | 348 | z2_end_pfn = zone_end_pfn(z2); |
352 | else | 349 | else |
353 | z2_end_pfn = end_pfn; | 350 | z2_end_pfn = end_pfn; |
@@ -514,8 +511,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone, | |||
514 | static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, | 511 | static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, |
515 | unsigned long end_pfn) | 512 | unsigned long end_pfn) |
516 | { | 513 | { |
517 | unsigned long zone_start_pfn = zone->zone_start_pfn; | 514 | unsigned long zone_start_pfn = zone->zone_start_pfn; |
518 | unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 515 | unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ |
516 | unsigned long zone_end_pfn = z; | ||
519 | unsigned long pfn; | 517 | unsigned long pfn; |
520 | struct mem_section *ms; | 518 | struct mem_section *ms; |
521 | int nid = zone_to_nid(zone); | 519 | int nid = zone_to_nid(zone); |
@@ -1069,6 +1067,23 @@ out: | |||
1069 | return ret; | 1067 | return ret; |
1070 | } | 1068 | } |
1071 | 1069 | ||
1070 | static int check_hotplug_memory_range(u64 start, u64 size) | ||
1071 | { | ||
1072 | u64 start_pfn = start >> PAGE_SHIFT; | ||
1073 | u64 nr_pages = size >> PAGE_SHIFT; | ||
1074 | |||
1075 | /* Memory range must be aligned with section */ | ||
1076 | if ((start_pfn & ~PAGE_SECTION_MASK) || | ||
1077 | (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { | ||
1078 | pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", | ||
1079 | (unsigned long long)start, | ||
1080 | (unsigned long long)size); | ||
1081 | return -EINVAL; | ||
1082 | } | ||
1083 | |||
1084 | return 0; | ||
1085 | } | ||
1086 | |||
1072 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ | 1087 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ |
1073 | int __ref add_memory(int nid, u64 start, u64 size) | 1088 | int __ref add_memory(int nid, u64 start, u64 size) |
1074 | { | 1089 | { |
@@ -1078,6 +1093,10 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
1078 | struct resource *res; | 1093 | struct resource *res; |
1079 | int ret; | 1094 | int ret; |
1080 | 1095 | ||
1096 | ret = check_hotplug_memory_range(start, size); | ||
1097 | if (ret) | ||
1098 | return ret; | ||
1099 | |||
1081 | lock_memory_hotplug(); | 1100 | lock_memory_hotplug(); |
1082 | 1101 | ||
1083 | res = register_memory_resource(start, size); | 1102 | res = register_memory_resource(start, size); |
@@ -1208,10 +1227,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | |||
1208 | } | 1227 | } |
1209 | 1228 | ||
1210 | /* | 1229 | /* |
1211 | * Scanning pfn is much easier than scanning lru list. | 1230 | * Scan pfn range [start,end) to find movable/migratable pages (LRU pages |
1212 | * Scan pfn from start to end and Find LRU page. | 1231 | * and hugepages). We scan pfn because it's much easier than scanning over |
1232 | * a linked list. This function returns the pfn of the first movable |||
1233 | * page it finds, or 0 if none is found. |||
1213 | */ | 1234 | */ |
1214 | static unsigned long scan_lru_pages(unsigned long start, unsigned long end) | 1235 | static unsigned long scan_movable_pages(unsigned long start, unsigned long end) |
1215 | { | 1236 | { |
1216 | unsigned long pfn; | 1237 | unsigned long pfn; |
1217 | struct page *page; | 1238 | struct page *page; |
@@ -1220,6 +1241,13 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) | |||
1220 | page = pfn_to_page(pfn); | 1241 | page = pfn_to_page(pfn); |
1221 | if (PageLRU(page)) | 1242 | if (PageLRU(page)) |
1222 | return pfn; | 1243 | return pfn; |
1244 | if (PageHuge(page)) { | ||
1245 | if (is_hugepage_active(page)) | ||
1246 | return pfn; | ||
1247 | else | ||
1248 | pfn = round_up(pfn + 1, | ||
1249 | 1 << compound_order(page)) - 1; | ||
1250 | } | ||
1223 | } | 1251 | } |
1224 | } | 1252 | } |
1225 | return 0; | 1253 | return 0; |
@@ -1240,6 +1268,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1240 | if (!pfn_valid(pfn)) | 1268 | if (!pfn_valid(pfn)) |
1241 | continue; | 1269 | continue; |
1242 | page = pfn_to_page(pfn); | 1270 | page = pfn_to_page(pfn); |
1271 | |||
1272 | if (PageHuge(page)) { | ||
1273 | struct page *head = compound_head(page); | ||
1274 | pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; | ||
1275 | if (compound_order(head) > PFN_SECTION_SHIFT) { | ||
1276 | ret = -EBUSY; | ||
1277 | break; | ||
1278 | } | ||
1279 | if (isolate_huge_page(page, &source)) | ||
1280 | move_pages -= 1 << compound_order(head); | ||
1281 | continue; | ||
1282 | } | ||
1283 | |||
1243 | if (!get_page_unless_zero(page)) | 1284 | if (!get_page_unless_zero(page)) |
1244 | continue; | 1285 | continue; |
1245 | /* | 1286 | /* |
@@ -1272,7 +1313,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1272 | } | 1313 | } |
1273 | if (!list_empty(&source)) { | 1314 | if (!list_empty(&source)) { |
1274 | if (not_managed) { | 1315 | if (not_managed) { |
1275 | putback_lru_pages(&source); | 1316 | putback_movable_pages(&source); |
1276 | goto out; | 1317 | goto out; |
1277 | } | 1318 | } |
1278 | 1319 | ||
@@ -1283,7 +1324,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1283 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1324 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1284 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); | 1325 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1285 | if (ret) | 1326 | if (ret) |
1286 | putback_lru_pages(&source); | 1327 | putback_movable_pages(&source); |
1287 | } | 1328 | } |
1288 | out: | 1329 | out: |
1289 | return ret; | 1330 | return ret; |
@@ -1472,7 +1513,6 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1472 | struct zone *zone; | 1513 | struct zone *zone; |
1473 | struct memory_notify arg; | 1514 | struct memory_notify arg; |
1474 | 1515 | ||
1475 | BUG_ON(start_pfn >= end_pfn); | ||
1476 | /* at least, alignment against pageblock is necessary */ | 1516 | /* at least, alignment against pageblock is necessary */ |
1477 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) | 1517 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) |
1478 | return -EINVAL; | 1518 | return -EINVAL; |
@@ -1527,8 +1567,8 @@ repeat: | |||
1527 | drain_all_pages(); | 1567 | drain_all_pages(); |
1528 | } | 1568 | } |
1529 | 1569 | ||
1530 | pfn = scan_lru_pages(start_pfn, end_pfn); | 1570 | pfn = scan_movable_pages(start_pfn, end_pfn); |
1531 | if (pfn) { /* We have page on LRU */ | 1571 | if (pfn) { /* We have movable pages */ |
1532 | ret = do_migrate_range(pfn, end_pfn); | 1572 | ret = do_migrate_range(pfn, end_pfn); |
1533 | if (!ret) { | 1573 | if (!ret) { |
1534 | drain = 1; | 1574 | drain = 1; |
@@ -1547,6 +1587,11 @@ repeat: | |||
1547 | yield(); | 1587 | yield(); |
1548 | /* drain pcp pages, this is synchronous. */ | 1588 | /* drain pcp pages, this is synchronous. */ |
1549 | drain_all_pages(); | 1589 | drain_all_pages(); |
1590 | /* | ||
1591 | * dissolve free hugepages in the memory block before doing offlining | ||
1592 | * actually in order to make hugetlbfs's object counting consistent. | ||
1593 | */ | ||
1594 | dissolve_free_huge_pages(start_pfn, end_pfn); | ||
1550 | /* check again */ | 1595 | /* check again */ |
1551 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | 1596 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); |
1552 | if (offlined_pages < 0) { | 1597 | if (offlined_pages < 0) { |
@@ -1674,9 +1719,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | |||
1674 | return ret; | 1719 | return ret; |
1675 | } | 1720 | } |
1676 | 1721 | ||
1677 | static int check_cpu_on_node(void *data) | 1722 | static int check_cpu_on_node(pg_data_t *pgdat) |
1678 | { | 1723 | { |
1679 | struct pglist_data *pgdat = data; | ||
1680 | int cpu; | 1724 | int cpu; |
1681 | 1725 | ||
1682 | for_each_present_cpu(cpu) { | 1726 | for_each_present_cpu(cpu) { |
@@ -1691,10 +1735,9 @@ static int check_cpu_on_node(void *data) | |||
1691 | return 0; | 1735 | return 0; |
1692 | } | 1736 | } |
1693 | 1737 | ||
1694 | static void unmap_cpu_on_node(void *data) | 1738 | static void unmap_cpu_on_node(pg_data_t *pgdat) |
1695 | { | 1739 | { |
1696 | #ifdef CONFIG_ACPI_NUMA | 1740 | #ifdef CONFIG_ACPI_NUMA |
1697 | struct pglist_data *pgdat = data; | ||
1698 | int cpu; | 1741 | int cpu; |
1699 | 1742 | ||
1700 | for_each_possible_cpu(cpu) | 1743 | for_each_possible_cpu(cpu) |
@@ -1703,10 +1746,11 @@ static void unmap_cpu_on_node(void *data) | |||
1703 | #endif | 1746 | #endif |
1704 | } | 1747 | } |
1705 | 1748 | ||
1706 | static int check_and_unmap_cpu_on_node(void *data) | 1749 | static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) |
1707 | { | 1750 | { |
1708 | int ret = check_cpu_on_node(data); | 1751 | int ret; |
1709 | 1752 | ||
1753 | ret = check_cpu_on_node(pgdat); | ||
1710 | if (ret) | 1754 | if (ret) |
1711 | return ret; | 1755 | return ret; |
1712 | 1756 | ||
@@ -1715,11 +1759,18 @@ static int check_and_unmap_cpu_on_node(void *data) | |||
1715 | * the cpu_to_node() now. | 1759 | * the cpu_to_node() now. |
1716 | */ | 1760 | */ |
1717 | 1761 | ||
1718 | unmap_cpu_on_node(data); | 1762 | unmap_cpu_on_node(pgdat); |
1719 | return 0; | 1763 | return 0; |
1720 | } | 1764 | } |
1721 | 1765 | ||
1722 | /* offline the node if all memory sections of this node are removed */ | 1766 | /** |
1767 | * try_offline_node | ||
1768 | * | ||
1769 | * Offline a node if all memory sections and cpus of the node are removed. | ||
1770 | * | ||
1771 | * NOTE: The caller must call lock_device_hotplug() to serialize hotplug | ||
1772 | * and online/offline operations before this call. | ||
1773 | */ | ||
1723 | void try_offline_node(int nid) | 1774 | void try_offline_node(int nid) |
1724 | { | 1775 | { |
1725 | pg_data_t *pgdat = NODE_DATA(nid); | 1776 | pg_data_t *pgdat = NODE_DATA(nid); |
@@ -1745,7 +1796,7 @@ void try_offline_node(int nid) | |||
1745 | return; | 1796 | return; |
1746 | } | 1797 | } |
1747 | 1798 | ||
1748 | if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) | 1799 | if (check_and_unmap_cpu_on_node(pgdat)) |
1749 | return; | 1800 | return; |
1750 | 1801 | ||
1751 | /* | 1802 | /* |
@@ -1782,10 +1833,19 @@ void try_offline_node(int nid) | |||
1782 | } | 1833 | } |
1783 | EXPORT_SYMBOL(try_offline_node); | 1834 | EXPORT_SYMBOL(try_offline_node); |
1784 | 1835 | ||
1836 | /** | ||
1837 | * remove_memory | ||
1838 | * | ||
1839 | * NOTE: The caller must call lock_device_hotplug() to serialize hotplug | ||
1840 | * and online/offline operations before this call, as required by | ||
1841 | * try_offline_node(). | ||
1842 | */ | ||
1785 | void __ref remove_memory(int nid, u64 start, u64 size) | 1843 | void __ref remove_memory(int nid, u64 start, u64 size) |
1786 | { | 1844 | { |
1787 | int ret; | 1845 | int ret; |
1788 | 1846 | ||
1847 | BUG_ON(check_hotplug_memory_range(start, size)); | ||
1848 | |||
1789 | lock_memory_hotplug(); | 1849 | lock_memory_hotplug(); |
1790 | 1850 | ||
1791 | /* | 1851 | /* |
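
check_hotplug_memory_range() above rejects ranges that are not section-aligned, and scan_movable_pages() skips over a whole hugepage with a round_up() on its compound order. Both are plain power-of-two arithmetic; the sketch below uses assumed values for PAGE_SHIFT and the pages-per-section constant (the real values are architecture-specific):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT              12              /* assumed 4K pages */
#define PAGES_PER_SECTION       (1UL << 15)     /* assumed 128M sections */

/* Round v up to the next multiple of the power of two a. */
static unsigned long round_up_pow2(unsigned long v, unsigned long a)
{
        return (v + a - 1) & ~(a - 1);
}

/*
 * Mirror of the section-alignment test: the start pfn and the page count
 * must both be multiples of PAGES_PER_SECTION and the range non-empty.
 */
static int check_range(uint64_t start, uint64_t size)
{
        uint64_t start_pfn = start >> PAGE_SHIFT;
        uint64_t nr_pages = size >> PAGE_SHIFT;

        if ((start_pfn & (PAGES_PER_SECTION - 1)) ||
            (nr_pages % PAGES_PER_SECTION) || !nr_pages)
                return -1;
        return 0;
}

int main(void)
{
        unsigned long pfn = 700;

        /* 128M-aligned 256M range passes; a 1M-offset range is rejected. */
        printf("aligned range:   %d\n", check_range(0x10000000ULL, 0x10000000ULL));
        printf("unaligned range: %d\n", check_range(0x00100000ULL, 0x10000000ULL));

        /* Skip past an order-9 compound page that starts at pfn 512. */
        pfn = round_up_pow2(pfn + 1, 1UL << 9) - 1;
        printf("scan resumes at pfn %lu\n", pfn + 1);
        return 0;
}

The "- 1" on the skip matches the kernel loop, which increments pfn once more before looking at the next page, so the scan resumes exactly at the first pfn after the compound page.
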
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4baf12e534d1..04729647f359 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | 123 | static struct mempolicy *get_task_policy(struct task_struct *p) |
124 | { | 124 | { |
125 | struct mempolicy *pol = p->mempolicy; | 125 | struct mempolicy *pol = p->mempolicy; |
126 | int node; | ||
127 | 126 | ||
128 | if (!pol) { | 127 | if (!pol) { |
129 | node = numa_node_id(); | 128 | int node = numa_node_id(); |
130 | if (node != NUMA_NO_NODE) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | 129 | ||
133 | /* preferred_node_policy is not initialised early in boot */ | 130 | if (node != NUMA_NO_NODE) { |
134 | if (!pol->mode) | 131 | pol = &preferred_node_policy[node]; |
135 | pol = NULL; | 132 | /* |
133 | * preferred_node_policy is not initialised early in | ||
134 | * boot | ||
135 | */ | ||
136 | if (!pol->mode) | ||
137 | pol = NULL; | ||
138 | } | ||
136 | } | 139 | } |
137 | 140 | ||
138 | return pol; | 141 | return pol; |
@@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | |||
473 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 476 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
474 | unsigned long flags); | 477 | unsigned long flags); |
475 | 478 | ||
476 | /* Scan through pages checking if pages follow certain conditions. */ | 479 | /* |
477 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 480 | * Scan through pages checking if pages follow certain conditions, |
481 | * and move them to the pagelist if they do. | ||
482 | */ | ||
483 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | ||
478 | unsigned long addr, unsigned long end, | 484 | unsigned long addr, unsigned long end, |
479 | const nodemask_t *nodes, unsigned long flags, | 485 | const nodemask_t *nodes, unsigned long flags, |
480 | void *private) | 486 | void *private) |
@@ -512,7 +518,31 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
512 | return addr != end; | 518 | return addr != end; |
513 | } | 519 | } |
514 | 520 | ||
515 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 521 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, |
522 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, | ||
523 | void *private) | ||
524 | { | ||
525 | #ifdef CONFIG_HUGETLB_PAGE | ||
526 | int nid; | ||
527 | struct page *page; | ||
528 | |||
529 | spin_lock(&vma->vm_mm->page_table_lock); | ||
530 | page = pte_page(huge_ptep_get((pte_t *)pmd)); | ||
531 | nid = page_to_nid(page); | ||
532 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | ||
533 | goto unlock; | ||
534 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | ||
535 | if (flags & (MPOL_MF_MOVE_ALL) || | ||
536 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | ||
537 | isolate_huge_page(page, private); | ||
538 | unlock: | ||
539 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
540 | #else | ||
541 | BUG(); | ||
542 | #endif | ||
543 | } | ||
544 | |||
545 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
516 | unsigned long addr, unsigned long end, | 546 | unsigned long addr, unsigned long end, |
517 | const nodemask_t *nodes, unsigned long flags, | 547 | const nodemask_t *nodes, unsigned long flags, |
518 | void *private) | 548 | void *private) |
@@ -523,17 +553,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
523 | pmd = pmd_offset(pud, addr); | 553 | pmd = pmd_offset(pud, addr); |
524 | do { | 554 | do { |
525 | next = pmd_addr_end(addr, end); | 555 | next = pmd_addr_end(addr, end); |
556 | if (!pmd_present(*pmd)) | ||
557 | continue; | ||
558 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { | ||
559 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, | ||
560 | flags, private); | ||
561 | continue; | ||
562 | } | ||
526 | split_huge_page_pmd(vma, addr, pmd); | 563 | split_huge_page_pmd(vma, addr, pmd); |
527 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 564 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
528 | continue; | 565 | continue; |
529 | if (check_pte_range(vma, pmd, addr, next, nodes, | 566 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, |
530 | flags, private)) | 567 | flags, private)) |
531 | return -EIO; | 568 | return -EIO; |
532 | } while (pmd++, addr = next, addr != end); | 569 | } while (pmd++, addr = next, addr != end); |
533 | return 0; | 570 | return 0; |
534 | } | 571 | } |
535 | 572 | ||
536 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 573 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
537 | unsigned long addr, unsigned long end, | 574 | unsigned long addr, unsigned long end, |
538 | const nodemask_t *nodes, unsigned long flags, | 575 | const nodemask_t *nodes, unsigned long flags, |
539 | void *private) | 576 | void *private) |
@@ -544,16 +581,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
544 | pud = pud_offset(pgd, addr); | 581 | pud = pud_offset(pgd, addr); |
545 | do { | 582 | do { |
546 | next = pud_addr_end(addr, end); | 583 | next = pud_addr_end(addr, end); |
584 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) | ||
585 | continue; | ||
547 | if (pud_none_or_clear_bad(pud)) | 586 | if (pud_none_or_clear_bad(pud)) |
548 | continue; | 587 | continue; |
549 | if (check_pmd_range(vma, pud, addr, next, nodes, | 588 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, |
550 | flags, private)) | 589 | flags, private)) |
551 | return -EIO; | 590 | return -EIO; |
552 | } while (pud++, addr = next, addr != end); | 591 | } while (pud++, addr = next, addr != end); |
553 | return 0; | 592 | return 0; |
554 | } | 593 | } |
555 | 594 | ||
556 | static inline int check_pgd_range(struct vm_area_struct *vma, | 595 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, |
557 | unsigned long addr, unsigned long end, | 596 | unsigned long addr, unsigned long end, |
558 | const nodemask_t *nodes, unsigned long flags, | 597 | const nodemask_t *nodes, unsigned long flags, |
559 | void *private) | 598 | void *private) |
@@ -566,7 +605,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
566 | next = pgd_addr_end(addr, end); | 605 | next = pgd_addr_end(addr, end); |
567 | if (pgd_none_or_clear_bad(pgd)) | 606 | if (pgd_none_or_clear_bad(pgd)) |
568 | continue; | 607 | continue; |
569 | if (check_pud_range(vma, pgd, addr, next, nodes, | 608 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, |
570 | flags, private)) | 609 | flags, private)) |
571 | return -EIO; | 610 | return -EIO; |
572 | } while (pgd++, addr = next, addr != end); | 611 | } while (pgd++, addr = next, addr != end); |
@@ -604,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
604 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | 643 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ |
605 | 644 | ||
606 | /* | 645 | /* |
607 | * Check if all pages in a range are on a set of nodes. | 646 | * Walk through page tables and collect pages to be migrated. |
608 | * If pagelist != NULL then isolate pages from the LRU and | 647 | * |
609 | * put them on the pagelist. | 648 | * If pages found in a given range are on a set of nodes (determined by |
649 | * @nodes and @flags), they are isolated and queued to the pagelist |||
650 | * which is passed via @private. |||
610 | */ | 651 | */ |
611 | static struct vm_area_struct * | 652 | static struct vm_area_struct * |
612 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 653 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
613 | const nodemask_t *nodes, unsigned long flags, void *private) | 654 | const nodemask_t *nodes, unsigned long flags, void *private) |
614 | { | 655 | { |
615 | int err; | 656 | int err; |
@@ -635,9 +676,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
635 | return ERR_PTR(-EFAULT); | 676 | return ERR_PTR(-EFAULT); |
636 | } | 677 | } |
637 | 678 | ||
638 | if (is_vm_hugetlb_page(vma)) | ||
639 | goto next; | ||
640 | |||
641 | if (flags & MPOL_MF_LAZY) { | 679 | if (flags & MPOL_MF_LAZY) { |
642 | change_prot_numa(vma, start, endvma); | 680 | change_prot_numa(vma, start, endvma); |
643 | goto next; | 681 | goto next; |
@@ -647,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
647 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 685 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
648 | vma_migratable(vma))) { | 686 | vma_migratable(vma))) { |
649 | 687 | ||
650 | err = check_pgd_range(vma, start, endvma, nodes, | 688 | err = queue_pages_pgd_range(vma, start, endvma, nodes, |
651 | flags, private); | 689 | flags, private); |
652 | if (err) { | 690 | if (err) { |
653 | first = ERR_PTR(err); | 691 | first = ERR_PTR(err); |
@@ -990,7 +1028,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
990 | 1028 | ||
991 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 1029 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
992 | { | 1030 | { |
993 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); | 1031 | if (PageHuge(page)) |
1032 | return alloc_huge_page_node(page_hstate(compound_head(page)), | ||
1033 | node); | ||
1034 | else | ||
1035 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); | ||
994 | } | 1036 | } |
995 | 1037 | ||
996 | /* | 1038 | /* |
@@ -1013,14 +1055,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1013 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. | 1055 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. |
1014 | */ | 1056 | */ |
1015 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); | 1057 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); |
1016 | check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | 1058 | queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, |
1017 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 1059 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
1018 | 1060 | ||
1019 | if (!list_empty(&pagelist)) { | 1061 | if (!list_empty(&pagelist)) { |
1020 | err = migrate_pages(&pagelist, new_node_page, dest, | 1062 | err = migrate_pages(&pagelist, new_node_page, dest, |
1021 | MIGRATE_SYNC, MR_SYSCALL); | 1063 | MIGRATE_SYNC, MR_SYSCALL); |
1022 | if (err) | 1064 | if (err) |
1023 | putback_lru_pages(&pagelist); | 1065 | putback_movable_pages(&pagelist); |
1024 | } | 1066 | } |
1025 | 1067 | ||
1026 | return err; | 1068 | return err; |
@@ -1154,10 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
1154 | break; | 1196 | break; |
1155 | vma = vma->vm_next; | 1197 | vma = vma->vm_next; |
1156 | } | 1198 | } |
1157 | |||
1158 | /* | 1199 | /* |
1159 | * if !vma, alloc_page_vma() will use task or system default policy | 1200 | * queue_pages_range() confirms that @page belongs to some vma, |
1201 | * so vma shouldn't be NULL. | ||
1160 | */ | 1202 | */ |
1203 | BUG_ON(!vma); | ||
1204 | |||
1205 | if (PageHuge(page)) | ||
1206 | return alloc_huge_page_noerr(vma, address, 1); | ||
1161 | return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1207 | return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1162 | } | 1208 | } |
1163 | #else | 1209 | #else |
@@ -1249,7 +1295,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1249 | if (err) | 1295 | if (err) |
1250 | goto mpol_out; | 1296 | goto mpol_out; |
1251 | 1297 | ||
1252 | vma = check_range(mm, start, end, nmask, | 1298 | vma = queue_pages_range(mm, start, end, nmask, |
1253 | flags | MPOL_MF_INVERT, &pagelist); | 1299 | flags | MPOL_MF_INVERT, &pagelist); |
1254 | 1300 | ||
1255 | err = PTR_ERR(vma); /* maybe ... */ | 1301 | err = PTR_ERR(vma); /* maybe ... */ |
@@ -1265,7 +1311,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1265 | (unsigned long)vma, | 1311 | (unsigned long)vma, |
1266 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); | 1312 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1267 | if (nr_failed) | 1313 | if (nr_failed) |
1268 | putback_lru_pages(&pagelist); | 1314 | putback_movable_pages(&pagelist); |
1269 | } | 1315 | } |
1270 | 1316 | ||
1271 | if (nr_failed && (flags & MPOL_MF_STRICT)) | 1317 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
@@ -2065,6 +2111,16 @@ retry_cpuset: | |||
2065 | } | 2111 | } |
2066 | EXPORT_SYMBOL(alloc_pages_current); | 2112 | EXPORT_SYMBOL(alloc_pages_current); |
2067 | 2113 | ||
2114 | int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) | ||
2115 | { | ||
2116 | struct mempolicy *pol = mpol_dup(vma_policy(src)); | ||
2117 | |||
2118 | if (IS_ERR(pol)) | ||
2119 | return PTR_ERR(pol); | ||
2120 | dst->vm_policy = pol; | ||
2121 | return 0; | ||
2122 | } | ||
2123 | |||
2068 | /* | 2124 | /* |
2069 | * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it | 2125 | * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it |
2070 | * rebinds the mempolicy its copying by calling mpol_rebind_policy() | 2126 | * rebinds the mempolicy its copying by calling mpol_rebind_policy() |
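
queue_pages_hugetlb_pmd_range() above reuses the same test the pte walker applies: a page is skipped when its node membership equals the MPOL_MF_INVERT bit, which means it is queued exactly when membership XOR invert is true. A tiny standalone check of that identity (the flag value below is chosen for the example only):

#include <stdio.h>
#include <stdbool.h>

#define MPOL_MF_INVERT  0x4     /* value chosen for this example only */

/*
 * Skip the page when its membership in the node set equals the invert
 * flag, i.e. queue it exactly when (membership XOR invert) is true.
 */
static bool should_skip(bool in_set, unsigned long flags)
{
        return in_set == !!(flags & MPOL_MF_INVERT);
}

int main(void)
{
        printf("in set, no invert  -> skip=%d\n", should_skip(true, 0));
        printf("not in set, invert -> skip=%d\n", should_skip(false, MPOL_MF_INVERT));
        printf("in set, invert     -> skip=%d\n", should_skip(true, MPOL_MF_INVERT));
        return 0;
}

This is why do_mbind() passes MPOL_MF_INVERT: pages already on the requested nodes match the inverted test and are left alone, while everything else is queued for migration.
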
diff --git a/mm/mempool.c b/mm/mempool.c index 54990476c049..659aa42bad16 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | |||
73 | gfp_t gfp_mask, int node_id) | 73 | gfp_t gfp_mask, int node_id) |
74 | { | 74 | { |
75 | mempool_t *pool; | 75 | mempool_t *pool; |
76 | pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); | 76 | pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); |
77 | if (!pool) | 77 | if (!pool) |
78 | return NULL; | 78 | return NULL; |
79 | pool->elements = kmalloc_node(min_nr * sizeof(void *), | 79 | pool->elements = kmalloc_node(min_nr * sizeof(void *), |
diff --git a/mm/migrate.c b/mm/migrate.c index 6f0c24438bba..b7ded7eafe3a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l) | |||
100 | struct page *page2; | 100 | struct page *page2; |
101 | 101 | ||
102 | list_for_each_entry_safe(page, page2, l, lru) { | 102 | list_for_each_entry_safe(page, page2, l, lru) { |
103 | if (unlikely(PageHuge(page))) { | ||
104 | putback_active_hugepage(page); | ||
105 | continue; | ||
106 | } | ||
103 | list_del(&page->lru); | 107 | list_del(&page->lru); |
104 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 108 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
105 | page_is_file_cache(page)); | 109 | page_is_file_cache(page)); |
@@ -945,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
945 | struct page *new_hpage = get_new_page(hpage, private, &result); | 949 | struct page *new_hpage = get_new_page(hpage, private, &result); |
946 | struct anon_vma *anon_vma = NULL; | 950 | struct anon_vma *anon_vma = NULL; |
947 | 951 | ||
952 | /* | ||
953 | * Movability of hugepages depends on the architecture and hugepage size. |||
954 | * This check is necessary because some callers of hugepage migration | ||
955 | * like soft offline and memory hotremove don't walk through page | ||
956 | * tables or check whether the hugepage is pmd-based or not before | ||
957 | * kicking migration. | ||
958 | */ | ||
959 | if (!hugepage_migration_support(page_hstate(hpage))) | ||
960 | return -ENOSYS; | ||
961 | |||
948 | if (!new_hpage) | 962 | if (!new_hpage) |
949 | return -ENOMEM; | 963 | return -ENOMEM; |
950 | 964 | ||
@@ -975,6 +989,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
975 | 989 | ||
976 | unlock_page(hpage); | 990 | unlock_page(hpage); |
977 | out: | 991 | out: |
992 | if (rc != -EAGAIN) | ||
993 | putback_active_hugepage(hpage); | ||
978 | put_page(new_hpage); | 994 | put_page(new_hpage); |
979 | if (result) { | 995 | if (result) { |
980 | if (rc) | 996 | if (rc) |
@@ -1025,7 +1041,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, | |||
1025 | list_for_each_entry_safe(page, page2, from, lru) { | 1041 | list_for_each_entry_safe(page, page2, from, lru) { |
1026 | cond_resched(); | 1042 | cond_resched(); |
1027 | 1043 | ||
1028 | rc = unmap_and_move(get_new_page, private, | 1044 | if (PageHuge(page)) |
1045 | rc = unmap_and_move_huge_page(get_new_page, | ||
1046 | private, page, pass > 2, mode); | ||
1047 | else | ||
1048 | rc = unmap_and_move(get_new_page, private, | ||
1029 | page, pass > 2, mode); | 1049 | page, pass > 2, mode); |
1030 | 1050 | ||
1031 | switch(rc) { | 1051 | switch(rc) { |
@@ -1058,32 +1078,6 @@ out: | |||
1058 | return rc; | 1078 | return rc; |
1059 | } | 1079 | } |
1060 | 1080 | ||
1061 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | ||
1062 | unsigned long private, enum migrate_mode mode) | ||
1063 | { | ||
1064 | int pass, rc; | ||
1065 | |||
1066 | for (pass = 0; pass < 10; pass++) { | ||
1067 | rc = unmap_and_move_huge_page(get_new_page, private, | ||
1068 | hpage, pass > 2, mode); | ||
1069 | switch (rc) { | ||
1070 | case -ENOMEM: | ||
1071 | goto out; | ||
1072 | case -EAGAIN: | ||
1073 | /* try again */ | ||
1074 | cond_resched(); | ||
1075 | break; | ||
1076 | case MIGRATEPAGE_SUCCESS: | ||
1077 | goto out; | ||
1078 | default: | ||
1079 | rc = -EIO; | ||
1080 | goto out; | ||
1081 | } | ||
1082 | } | ||
1083 | out: | ||
1084 | return rc; | ||
1085 | } | ||
1086 | |||
1087 | #ifdef CONFIG_NUMA | 1081 | #ifdef CONFIG_NUMA |
1088 | /* | 1082 | /* |
1089 | * Move a list of individual pages | 1083 | * Move a list of individual pages |
@@ -1108,7 +1102,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
1108 | 1102 | ||
1109 | *result = &pm->status; | 1103 | *result = &pm->status; |
1110 | 1104 | ||
1111 | return alloc_pages_exact_node(pm->node, | 1105 | if (PageHuge(p)) |
1106 | return alloc_huge_page_node(page_hstate(compound_head(p)), | ||
1107 | pm->node); | ||
1108 | else | ||
1109 | return alloc_pages_exact_node(pm->node, | ||
1112 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 1110 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); |
1113 | } | 1111 | } |
1114 | 1112 | ||
@@ -1168,6 +1166,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1168 | !migrate_all) | 1166 | !migrate_all) |
1169 | goto put_and_set; | 1167 | goto put_and_set; |
1170 | 1168 | ||
1169 | if (PageHuge(page)) { | ||
1170 | isolate_huge_page(page, &pagelist); | ||
1171 | goto put_and_set; | ||
1172 | } | ||
1173 | |||
1171 | err = isolate_lru_page(page); | 1174 | err = isolate_lru_page(page); |
1172 | if (!err) { | 1175 | if (!err) { |
1173 | list_add_tail(&page->lru, &pagelist); | 1176 | list_add_tail(&page->lru, &pagelist); |
@@ -1190,7 +1193,7 @@ set_status: | |||
1190 | err = migrate_pages(&pagelist, new_page_node, | 1193 | err = migrate_pages(&pagelist, new_page_node, |
1191 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); | 1194 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1192 | if (err) | 1195 | if (err) |
1193 | putback_lru_pages(&pagelist); | 1196 | putback_movable_pages(&pagelist); |
1194 | } | 1197 | } |
1195 | 1198 | ||
1196 | up_read(&mm->mmap_sem); | 1199 | up_read(&mm->mmap_sem); |
@@ -1468,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | |||
1468 | if (!populated_zone(zone)) | 1471 | if (!populated_zone(zone)) |
1469 | continue; | 1472 | continue; |
1470 | 1473 | ||
1471 | if (zone->all_unreclaimable) | 1474 | if (!zone_reclaimable(zone)) |
1472 | continue; | 1475 | continue; |
1473 | 1476 | ||
1474 | /* Avoid waking kswapd by allocating pages_to_migrate pages. */ | 1477 | /* Avoid waking kswapd by allocating pages_to_migrate pages. */ |
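
With migrate_huge_page() removed, every caller goes through the migrate_pages() retry loop, which now dispatches per page on PageHuge(). The loop structure itself, a bounded number of passes where -EAGAIN defers an item to a later pass and -ENOMEM aborts the whole batch, is generic; a compact userspace sketch with a stubbed per-item operation (only the errno values are real, and the kernel's escalation to synchronous mode after a few passes is omitted) is:

#include <stdio.h>
#include <errno.h>

#define NR_ITEMS        3
#define MAX_PASSES      10

/* Stub for the per-page operation: item 1 needs a few extra passes. */
static int try_one(int id, int pass)
{
        if (id == 1 && pass < 2)
                return -EAGAIN;
        return 0;
}

int main(void)
{
        int done[NR_ITEMS] = { 0 };
        int pass, i, retry = 1;

        for (pass = 0; pass < MAX_PASSES && retry; pass++) {
                retry = 0;
                for (i = 0; i < NR_ITEMS; i++) {
                        int rc;

                        if (done[i])
                                continue;
                        rc = try_one(i, pass);
                        switch (rc) {
                        case -ENOMEM:
                                return 1;       /* abort the whole batch */
                        case -EAGAIN:
                                retry = 1;      /* leave it for a later pass */
                                break;
                        case 0:
                                done[i] = 1;
                                printf("item %d done on pass %d\n", i, pass);
                                break;
                        default:
                                done[i] = 1;    /* permanent failure; move on */
                                break;
                        }
                }
        }
        return 0;
}
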
diff --git a/mm/mlock.c b/mm/mlock.c index 79b7cf7d1bca..d63802663242 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
12 | #include <linux/swapops.h> | 12 | #include <linux/swapops.h> |
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/pagevec.h> | ||
14 | #include <linux/mempolicy.h> | 15 | #include <linux/mempolicy.h> |
15 | #include <linux/syscalls.h> | 16 | #include <linux/syscalls.h> |
16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
@@ -18,6 +19,8 @@ | |||
18 | #include <linux/rmap.h> | 19 | #include <linux/rmap.h> |
19 | #include <linux/mmzone.h> | 20 | #include <linux/mmzone.h> |
20 | #include <linux/hugetlb.h> | 21 | #include <linux/hugetlb.h> |
22 | #include <linux/memcontrol.h> | ||
23 | #include <linux/mm_inline.h> | ||
21 | 24 | ||
22 | #include "internal.h" | 25 | #include "internal.h" |
23 | 26 | ||
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page) | |||
87 | } | 90 | } |
88 | } | 91 | } |
89 | 92 | ||
93 | /* | ||
94 | * Finish munlock after successful page isolation | ||
95 | * | ||
96 | * Page must be locked. This is a wrapper for try_to_munlock() | ||
97 | * and putback_lru_page() with munlock accounting. | ||
98 | */ | ||
99 | static void __munlock_isolated_page(struct page *page) | ||
100 | { | ||
101 | int ret = SWAP_AGAIN; | ||
102 | |||
103 | /* | ||
104 | * Optimization: if the page was mapped just once, that's our mapping | ||
105 | * and we don't need to check all the other vmas. | ||
106 | */ | ||
107 | if (page_mapcount(page) > 1) | ||
108 | ret = try_to_munlock(page); | ||
109 | |||
110 | /* Did try_to_munlock() succeed or punt? */ |||
111 | if (ret != SWAP_MLOCK) | ||
112 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
113 | |||
114 | putback_lru_page(page); | ||
115 | } | ||
116 | |||
117 | /* | ||
118 | * Accounting for page isolation fail during munlock | ||
119 | * | ||
120 | * Performs accounting when page isolation fails in munlock. There is nothing | ||
121 | * else to do because it means some other task has already removed the page | ||
122 | * from the LRU. putback_lru_page() will take care of removing the page from | ||
123 | * the unevictable list, if necessary. vmscan [page_referenced()] will move | ||
124 | * the page back to the unevictable list if some other vma has it mlocked. | ||
125 | */ | ||
126 | static void __munlock_isolation_failed(struct page *page) | ||
127 | { | ||
128 | if (PageUnevictable(page)) | ||
129 | count_vm_event(UNEVICTABLE_PGSTRANDED); | ||
130 | else | ||
131 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
132 | } | ||
133 | |||
90 | /** | 134 | /** |
91 | * munlock_vma_page - munlock a vma page | 135 | * munlock_vma_page - munlock a vma page |
92 | * @page - page to be unlocked | 136 | * @page - page to be unlocked |
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page) | |||
112 | unsigned int nr_pages = hpage_nr_pages(page); | 156 | unsigned int nr_pages = hpage_nr_pages(page); |
113 | mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); | 157 | mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
114 | page_mask = nr_pages - 1; | 158 | page_mask = nr_pages - 1; |
115 | if (!isolate_lru_page(page)) { | 159 | if (!isolate_lru_page(page)) |
116 | int ret = SWAP_AGAIN; | 160 | __munlock_isolated_page(page); |
117 | 161 | else | |
118 | /* | 162 | __munlock_isolation_failed(page); |
119 | * Optimization: if the page was mapped just once, | ||
120 | * that's our mapping and we don't need to check all the | ||
121 | * other vmas. | ||
122 | */ | ||
123 | if (page_mapcount(page) > 1) | ||
124 | ret = try_to_munlock(page); | ||
125 | /* | ||
126 | * did try_to_unlock() succeed or punt? | ||
127 | */ | ||
128 | if (ret != SWAP_MLOCK) | ||
129 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
130 | |||
131 | putback_lru_page(page); | ||
132 | } else { | ||
133 | /* | ||
134 | * Some other task has removed the page from the LRU. | ||
135 | * putback_lru_page() will take care of removing the | ||
136 | * page from the unevictable list, if necessary. | ||
137 | * vmscan [page_referenced()] will move the page back | ||
138 | * to the unevictable list if some other vma has it | ||
139 | * mlocked. | ||
140 | */ | ||
141 | if (PageUnevictable(page)) | ||
142 | count_vm_event(UNEVICTABLE_PGSTRANDED); | ||
143 | else | ||
144 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | ||
145 | } | ||
146 | } | 163 | } |
147 | 164 | ||
148 | return page_mask; | 165 | return page_mask; |
@@ -210,6 +227,191 @@ static int __mlock_posix_error_return(long retval) | |||
210 | } | 227 | } |
211 | 228 | ||
212 | /* | 229 | /* |
230 | * Prepare page for fast batched LRU putback via __putback_lru_fast() |||
231 | * | ||
232 | * The fast path is available only for evictable pages with a single mapping; |||
233 | * then we can bypass the per-cpu pvec and get better performance. |||
234 | * When mapcount > 1 we need try_to_munlock(), which can fail. |||
235 | * When !page_evictable(), we need the full redo logic of putback_lru_page to |||
236 | * avoid leaving an evictable page on the unevictable list. |||
237 | * | ||
238 | * In case of success, @page is added to @pvec and @pgrescued is incremented | ||
239 | * if the page was previously unevictable. @page is also unlocked. | ||
240 | */ | ||
241 | static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, | ||
242 | int *pgrescued) | ||
243 | { | ||
244 | VM_BUG_ON(PageLRU(page)); | ||
245 | VM_BUG_ON(!PageLocked(page)); | ||
246 | |||
247 | if (page_mapcount(page) <= 1 && page_evictable(page)) { | ||
248 | pagevec_add(pvec, page); | ||
249 | if (TestClearPageUnevictable(page)) | ||
250 | (*pgrescued)++; | ||
251 | unlock_page(page); | ||
252 | return true; | ||
253 | } | ||
254 | |||
255 | return false; | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * Putback multiple evictable pages to the LRU | ||
260 | * | ||
261 | * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of | ||
262 | * the pages might have meanwhile become unevictable but that is OK. | ||
263 | */ | ||
264 | static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) | ||
265 | { | ||
266 | count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); | ||
267 | /* | ||
268 | * __pagevec_lru_add() calls release_pages() so we don't call | ||
269 | * put_page() explicitly | ||
270 | */ | ||
271 | __pagevec_lru_add(pvec); | ||
272 | count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * Munlock a batch of pages from the same zone | ||
277 | * | ||
278 | * The work is split into two main phases. The first phase clears the Mlocked flag | ||
279 | * and attempts to isolate the pages, all under a single zone lru lock. | ||
280 | * The second phase finishes the munlock only for pages where isolation | ||
281 | * succeeded. | ||
282 | * | ||
283 | * Note that the pagevec may be modified during the process. | ||
284 | */ | ||
285 | static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | ||
286 | { | ||
287 | int i; | ||
288 | int nr = pagevec_count(pvec); | ||
289 | int delta_munlocked = -nr; | ||
290 | struct pagevec pvec_putback; | ||
291 | int pgrescued = 0; | ||
292 | |||
293 | /* Phase 1: page isolation */ | ||
294 | spin_lock_irq(&zone->lru_lock); | ||
295 | for (i = 0; i < nr; i++) { | ||
296 | struct page *page = pvec->pages[i]; | ||
297 | |||
298 | if (TestClearPageMlocked(page)) { | ||
299 | struct lruvec *lruvec; | ||
300 | int lru; | ||
301 | |||
302 | if (PageLRU(page)) { | ||
303 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
304 | lru = page_lru(page); | ||
305 | /* | ||
306 | * We already have pin from follow_page_mask() | ||
307 | * so we can spare the get_page() here. | ||
308 | */ | ||
309 | ClearPageLRU(page); | ||
310 | del_page_from_lru_list(page, lruvec, lru); | ||
311 | } else { | ||
312 | __munlock_isolation_failed(page); | ||
313 | goto skip_munlock; | ||
314 | } | ||
315 | |||
316 | } else { | ||
317 | skip_munlock: | ||
318 | /* | ||
319 | * We won't be munlocking this page in the next phase | ||
320 | * but we still need to release the follow_page_mask() | ||
321 | * pin. | ||
322 | */ | ||
323 | pvec->pages[i] = NULL; | ||
324 | put_page(page); | ||
325 | delta_munlocked++; | ||
326 | } | ||
327 | } | ||
328 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); | ||
329 | spin_unlock_irq(&zone->lru_lock); | ||
330 | |||
331 | /* Phase 2: page munlock */ | ||
332 | pagevec_init(&pvec_putback, 0); | ||
333 | for (i = 0; i < nr; i++) { | ||
334 | struct page *page = pvec->pages[i]; | ||
335 | |||
336 | if (page) { | ||
337 | lock_page(page); | ||
338 | if (!__putback_lru_fast_prepare(page, &pvec_putback, | ||
339 | &pgrescued)) { | ||
340 | /* | ||
341 | * Slow path. We don't want to lose the last | ||
342 | * pin before unlock_page() | ||
343 | */ | ||
344 | get_page(page); /* for putback_lru_page() */ | ||
345 | __munlock_isolated_page(page); | ||
346 | unlock_page(page); | ||
347 | put_page(page); /* from follow_page_mask() */ | ||
348 | } | ||
349 | } | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * Phase 3: page putback for pages that qualified for the fast path | ||
354 | * This will also call put_page() to return pin from follow_page_mask() | ||
355 | */ | ||
356 | if (pagevec_count(&pvec_putback)) | ||
357 | __putback_lru_fast(&pvec_putback, pgrescued); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Fill up pagevec for __munlock_pagevec using pte walk | ||
362 | * | ||
363 | * The function expects that the struct page corresponding to @start address is | ||
364 | * a non-THP page already pinned and in the @pvec, and that it belongs to @zone. | ||
365 | * | ||
366 | * The rest of @pvec is filled by subsequent pages within the same pmd and same | ||
367 | * zone, as long as the pte's are present and vm_normal_page() succeeds. These | ||
368 | * pages also get pinned. | ||
369 | * | ||
370 | * Returns the address of the next page that should be scanned. This equals | ||
371 | * @start + PAGE_SIZE when no page could be added by the pte walk. | ||
372 | */ | ||
373 | static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, | ||
374 | struct vm_area_struct *vma, int zoneid, unsigned long start, | ||
375 | unsigned long end) | ||
376 | { | ||
377 | pte_t *pte; | ||
378 | spinlock_t *ptl; | ||
379 | |||
380 | /* | ||
381 | * Initialize pte walk starting at the already pinned page where we | ||
382 | * are sure that there is a pte. | ||
383 | */ | ||
384 | pte = get_locked_pte(vma->vm_mm, start, &ptl); | ||
385 | end = min(end, pmd_addr_end(start, end)); | ||
386 | |||
387 | /* The page next to the pinned page is the first we will try to get */ | ||
388 | start += PAGE_SIZE; | ||
389 | while (start < end) { | ||
390 | struct page *page = NULL; | ||
391 | pte++; | ||
392 | if (pte_present(*pte)) | ||
393 | page = vm_normal_page(vma, start, *pte); | ||
394 | /* | ||
395 | * Break if the page could not be obtained or its node+zone does | ||
396 | * not match. | ||
397 | */ | ||
398 | if (!page || page_zone_id(page) != zoneid) | ||
399 | break; | ||
400 | |||
401 | get_page(page); | ||
402 | /* | ||
403 | * Increase the address that will be returned *before* the | ||
404 | * eventual break due to the pvec becoming full when adding the page. | ||
405 | */ | ||
406 | start += PAGE_SIZE; | ||
407 | if (pagevec_add(pvec, page) == 0) | ||
408 | break; | ||
409 | } | ||
410 | pte_unmap_unlock(pte, ptl); | ||
411 | return start; | ||
412 | } | ||
413 | |||
414 | /* | ||
213 | * munlock_vma_pages_range() - munlock all pages in the vma range. | 415 | * munlock_vma_pages_range() - munlock all pages in the vma range.
214 | * @vma - vma containing range to be munlock()ed. | 416 | * @vma - vma containing range to be munlock()ed. |
215 | * @start - start address in @vma of the range | 417 | * @start - start address in @vma of the range |
@@ -233,9 +435,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
233 | vma->vm_flags &= ~VM_LOCKED; | 435 | vma->vm_flags &= ~VM_LOCKED; |
234 | 436 | ||
235 | while (start < end) { | 437 | while (start < end) { |
236 | struct page *page; | 438 | struct page *page = NULL; |
237 | unsigned int page_mask, page_increm; | 439 | unsigned int page_mask, page_increm; |
440 | struct pagevec pvec; | ||
441 | struct zone *zone; | ||
442 | int zoneid; | ||
238 | 443 | ||
444 | pagevec_init(&pvec, 0); | ||
239 | /* | 445 | /* |
240 | * Although FOLL_DUMP is intended for get_dump_page(), | 446 | * Although FOLL_DUMP is intended for get_dump_page(), |
241 | * it just so happens that its special treatment of the | 447 | * it just so happens that its special treatment of the |
@@ -244,21 +450,45 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
244 | * has sneaked into the range, we won't oops here: great). | 450 | * has sneaked into the range, we won't oops here: great). |
245 | */ | 451 | */ |
246 | page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, | 452 | page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, |
247 | &page_mask); | 453 | &page_mask); |
454 | |||
248 | if (page && !IS_ERR(page)) { | 455 | if (page && !IS_ERR(page)) { |
249 | lock_page(page); | 456 | if (PageTransHuge(page)) { |
250 | lru_add_drain(); | 457 | lock_page(page); |
251 | /* | 458 | /* |
252 | * Any THP page found by follow_page_mask() may have | 459 | * Any THP page found by follow_page_mask() may |
253 | * gotten split before reaching munlock_vma_page(), | 460 | * have gotten split before reaching |
254 | * so we need to recompute the page_mask here. | 461 | * munlock_vma_page(), so we need to recompute |
255 | */ | 462 | * the page_mask here. |
256 | page_mask = munlock_vma_page(page); | 463 | */ |
257 | unlock_page(page); | 464 | page_mask = munlock_vma_page(page); |
258 | put_page(page); | 465 | unlock_page(page); |
466 | put_page(page); /* follow_page_mask() */ | ||
467 | } else { | ||
468 | /* | ||
469 | * Non-huge pages are handled in batches via | ||
470 | * pagevec. The pin from follow_page_mask() | ||
471 | * prevents them from collapsing by THP. | ||
472 | */ | ||
473 | pagevec_add(&pvec, page); | ||
474 | zone = page_zone(page); | ||
475 | zoneid = page_zone_id(page); | ||
476 | |||
477 | /* | ||
478 | * Try to fill the rest of the pagevec using a | ||
479 | * fast pte walk. This will also update start to | ||
480 | * the next page to process. Then munlock the | ||
481 | * pagevec. | ||
482 | */ | ||
483 | start = __munlock_pagevec_fill(&pvec, vma, | ||
484 | zoneid, start, end); | ||
485 | __munlock_pagevec(&pvec, zone); | ||
486 | goto next; | ||
487 | } | ||
259 | } | 488 | } |
260 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); | 489 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); |
261 | start += page_increm * PAGE_SIZE; | 490 | start += page_increm * PAGE_SIZE; |
491 | next: | ||
262 | cond_resched(); | 492 | cond_resched(); |
263 | } | 493 | } |
264 | } | 494 | } |
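As a quick illustration of the page_increm step above, 1 + (~(start >> PAGE_SHIFT) & page_mask) advances start past the current (possibly huge) page. The following is a minimal userspace check of that arithmetic, not kernel code, assuming the common 4KB base page / 512-page THP geometry (an x86-64 assumption):

#include <assert.h>

int main(void)
{
	unsigned long page_shift = 12;		/* 4KB base pages (assumed) */
	unsigned long page_mask = 511;		/* hpage_nr_pages(page) - 1 for a 2MB THP */
	/* start three base pages into some 2MB-aligned huge page */
	unsigned long start = ((1000UL * 512) + 3) << page_shift;
	unsigned long page_increm = 1 + (~(start >> page_shift) & page_mask);

	/* 509 pages remain in the huge page, so start lands on the next 2MB boundary */
	assert(page_increm == 509);
	assert((((start >> page_shift) + page_increm) & page_mask) == 0);
	return 0;
}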
@@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1202 | unsigned long *populate) | 1202 | unsigned long *populate) |
1203 | { | 1203 | { |
1204 | struct mm_struct * mm = current->mm; | 1204 | struct mm_struct * mm = current->mm; |
1205 | struct inode *inode; | ||
1206 | vm_flags_t vm_flags; | 1205 | vm_flags_t vm_flags; |
1207 | 1206 | ||
1208 | *populate = 0; | 1207 | *populate = 0; |
@@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1265 | return -EAGAIN; | 1264 | return -EAGAIN; |
1266 | } | 1265 | } |
1267 | 1266 | ||
1268 | inode = file ? file_inode(file) : NULL; | ||
1269 | |||
1270 | if (file) { | 1267 | if (file) { |
1268 | struct inode *inode = file_inode(file); | ||
1269 | |||
1271 | switch (flags & MAP_TYPE) { | 1270 | switch (flags & MAP_TYPE) { |
1272 | case MAP_SHARED: | 1271 | case MAP_SHARED: |
1273 | if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) | 1272 | if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) |
@@ -1302,6 +1301,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1302 | 1301 | ||
1303 | if (!file->f_op || !file->f_op->mmap) | 1302 | if (!file->f_op || !file->f_op->mmap) |
1304 | return -ENODEV; | 1303 | return -ENODEV; |
1304 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) | ||
1305 | return -EINVAL; | ||
1305 | break; | 1306 | break; |
1306 | 1307 | ||
1307 | default: | 1308 | default: |
@@ -1310,6 +1311,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1310 | } else { | 1311 | } else { |
1311 | switch (flags & MAP_TYPE) { | 1312 | switch (flags & MAP_TYPE) { |
1312 | case MAP_SHARED: | 1313 | case MAP_SHARED: |
1314 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) | ||
1315 | return -EINVAL; | ||
1313 | /* | 1316 | /* |
1314 | * Ignore pgoff. | 1317 | * Ignore pgoff. |
1315 | */ | 1318 | */ |
@@ -1476,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1476 | { | 1479 | { |
1477 | struct mm_struct *mm = current->mm; | 1480 | struct mm_struct *mm = current->mm; |
1478 | struct vm_area_struct *vma, *prev; | 1481 | struct vm_area_struct *vma, *prev; |
1479 | int correct_wcount = 0; | ||
1480 | int error; | 1482 | int error; |
1481 | struct rb_node **rb_link, *rb_parent; | 1483 | struct rb_node **rb_link, *rb_parent; |
1482 | unsigned long charged = 0; | 1484 | unsigned long charged = 0; |
1483 | struct inode *inode = file ? file_inode(file) : NULL; | ||
1484 | 1485 | ||
1485 | /* Check against address space limit. */ | 1486 | /* Check against address space limit. */ |
1486 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { | 1487 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { |
@@ -1544,16 +1545,11 @@ munmap_back: | |||
1544 | vma->vm_pgoff = pgoff; | 1545 | vma->vm_pgoff = pgoff; |
1545 | INIT_LIST_HEAD(&vma->anon_vma_chain); | 1546 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1546 | 1547 | ||
1547 | error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ | ||
1548 | |||
1549 | if (file) { | 1548 | if (file) { |
1550 | if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) | ||
1551 | goto free_vma; | ||
1552 | if (vm_flags & VM_DENYWRITE) { | 1549 | if (vm_flags & VM_DENYWRITE) { |
1553 | error = deny_write_access(file); | 1550 | error = deny_write_access(file); |
1554 | if (error) | 1551 | if (error) |
1555 | goto free_vma; | 1552 | goto free_vma; |
1556 | correct_wcount = 1; | ||
1557 | } | 1553 | } |
1558 | vma->vm_file = get_file(file); | 1554 | vma->vm_file = get_file(file); |
1559 | error = file->f_op->mmap(file, vma); | 1555 | error = file->f_op->mmap(file, vma); |
@@ -1570,11 +1566,8 @@ munmap_back: | |||
1570 | WARN_ON_ONCE(addr != vma->vm_start); | 1566 | WARN_ON_ONCE(addr != vma->vm_start); |
1571 | 1567 | ||
1572 | addr = vma->vm_start; | 1568 | addr = vma->vm_start; |
1573 | pgoff = vma->vm_pgoff; | ||
1574 | vm_flags = vma->vm_flags; | 1569 | vm_flags = vma->vm_flags; |
1575 | } else if (vm_flags & VM_SHARED) { | 1570 | } else if (vm_flags & VM_SHARED) { |
1576 | if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) | ||
1577 | goto free_vma; | ||
1578 | error = shmem_zero_setup(vma); | 1571 | error = shmem_zero_setup(vma); |
1579 | if (error) | 1572 | if (error) |
1580 | goto free_vma; | 1573 | goto free_vma; |
@@ -1596,11 +1589,10 @@ munmap_back: | |||
1596 | } | 1589 | } |
1597 | 1590 | ||
1598 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1591 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1599 | file = vma->vm_file; | ||
1600 | |||
1601 | /* Once vma denies write, undo our temporary denial count */ | 1592 | /* Once vma denies write, undo our temporary denial count */ |
1602 | if (correct_wcount) | 1593 | if (vm_flags & VM_DENYWRITE) |
1603 | atomic_inc(&inode->i_writecount); | 1594 | allow_write_access(file); |
1595 | file = vma->vm_file; | ||
1604 | out: | 1596 | out: |
1605 | perf_event_mmap(vma); | 1597 | perf_event_mmap(vma); |
1606 | 1598 | ||
@@ -1616,11 +1608,20 @@ out: | |||
1616 | if (file) | 1608 | if (file) |
1617 | uprobe_mmap(vma); | 1609 | uprobe_mmap(vma); |
1618 | 1610 | ||
1611 | /* | ||
1612 | * A new (or expanded) vma always gets the soft-dirty status. | ||
1613 | * Otherwise the user-space soft-dirty page tracker would not | ||
1614 | * be able to distinguish the situation where a vma is unmapped | ||
1615 | * and then a new one is mapped in place (which must be treated | ||
1616 | * as a completely new data area). | ||
1617 | */ | ||
1618 | vma->vm_flags |= VM_SOFTDIRTY; | ||
1619 | |||
1619 | return addr; | 1620 | return addr; |
1620 | 1621 | ||
1621 | unmap_and_free_vma: | 1622 | unmap_and_free_vma: |
1622 | if (correct_wcount) | 1623 | if (vm_flags & VM_DENYWRITE) |
1623 | atomic_inc(&inode->i_writecount); | 1624 | allow_write_access(file); |
1624 | vma->vm_file = NULL; | 1625 | vma->vm_file = NULL; |
1625 | fput(file); | 1626 | fput(file); |
1626 | 1627 | ||
@@ -2380,7 +2381,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2380 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 2381 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
2381 | unsigned long addr, int new_below) | 2382 | unsigned long addr, int new_below) |
2382 | { | 2383 | { |
2383 | struct mempolicy *pol; | ||
2384 | struct vm_area_struct *new; | 2384 | struct vm_area_struct *new; |
2385 | int err = -ENOMEM; | 2385 | int err = -ENOMEM; |
2386 | 2386 | ||
@@ -2404,12 +2404,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2404 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); | 2404 | new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); |
2405 | } | 2405 | } |
2406 | 2406 | ||
2407 | pol = mpol_dup(vma_policy(vma)); | 2407 | err = vma_dup_policy(vma, new); |
2408 | if (IS_ERR(pol)) { | 2408 | if (err) |
2409 | err = PTR_ERR(pol); | ||
2410 | goto out_free_vma; | 2409 | goto out_free_vma; |
2411 | } | ||
2412 | vma_set_policy(new, pol); | ||
2413 | 2410 | ||
2414 | if (anon_vma_clone(new, vma)) | 2411 | if (anon_vma_clone(new, vma)) |
2415 | goto out_free_mpol; | 2412 | goto out_free_mpol; |
@@ -2437,7 +2434,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2437 | fput(new->vm_file); | 2434 | fput(new->vm_file); |
2438 | unlink_anon_vmas(new); | 2435 | unlink_anon_vmas(new); |
2439 | out_free_mpol: | 2436 | out_free_mpol: |
2440 | mpol_put(pol); | 2437 | mpol_put(vma_policy(new)); |
2441 | out_free_vma: | 2438 | out_free_vma: |
2442 | kmem_cache_free(vm_area_cachep, new); | 2439 | kmem_cache_free(vm_area_cachep, new); |
2443 | out_err: | 2440 | out_err: |
@@ -2663,6 +2660,7 @@ out: | |||
2663 | mm->total_vm += len >> PAGE_SHIFT; | 2660 | mm->total_vm += len >> PAGE_SHIFT; |
2664 | if (flags & VM_LOCKED) | 2661 | if (flags & VM_LOCKED) |
2665 | mm->locked_vm += (len >> PAGE_SHIFT); | 2662 | mm->locked_vm += (len >> PAGE_SHIFT); |
2663 | vma->vm_flags |= VM_SOFTDIRTY; | ||
2666 | return addr; | 2664 | return addr; |
2667 | } | 2665 | } |
2668 | 2666 | ||
@@ -2780,7 +2778,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2780 | struct mm_struct *mm = vma->vm_mm; | 2778 | struct mm_struct *mm = vma->vm_mm; |
2781 | struct vm_area_struct *new_vma, *prev; | 2779 | struct vm_area_struct *new_vma, *prev; |
2782 | struct rb_node **rb_link, *rb_parent; | 2780 | struct rb_node **rb_link, *rb_parent; |
2783 | struct mempolicy *pol; | ||
2784 | bool faulted_in_anon_vma = true; | 2781 | bool faulted_in_anon_vma = true; |
2785 | 2782 | ||
2786 | /* | 2783 | /* |
@@ -2825,10 +2822,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2825 | new_vma->vm_start = addr; | 2822 | new_vma->vm_start = addr; |
2826 | new_vma->vm_end = addr + len; | 2823 | new_vma->vm_end = addr + len; |
2827 | new_vma->vm_pgoff = pgoff; | 2824 | new_vma->vm_pgoff = pgoff; |
2828 | pol = mpol_dup(vma_policy(vma)); | 2825 | if (vma_dup_policy(vma, new_vma)) |
2829 | if (IS_ERR(pol)) | ||
2830 | goto out_free_vma; | 2826 | goto out_free_vma; |
2831 | vma_set_policy(new_vma, pol); | ||
2832 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2827 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2833 | if (anon_vma_clone(new_vma, vma)) | 2828 | if (anon_vma_clone(new_vma, vma)) |
2834 | goto out_free_mempol; | 2829 | goto out_free_mempol; |
@@ -2843,7 +2838,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2843 | return new_vma; | 2838 | return new_vma; |
2844 | 2839 | ||
2845 | out_free_mempol: | 2840 | out_free_mempol: |
2846 | mpol_put(pol); | 2841 | mpol_put(vma_policy(new_vma)); |
2847 | out_free_vma: | 2842 | out_free_vma: |
2848 | kmem_cache_free(vm_area_cachep, new_vma); | 2843 | kmem_cache_free(vm_area_cachep, new_vma); |
2849 | return NULL; | 2844 | return NULL; |
@@ -2930,7 +2925,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2930 | vma->vm_start = addr; | 2925 | vma->vm_start = addr; |
2931 | vma->vm_end = addr + len; | 2926 | vma->vm_end = addr + len; |
2932 | 2927 | ||
2933 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; | 2928 | vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; |
2934 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); | 2929 | vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); |
2935 | 2930 | ||
2936 | vma->vm_ops = &special_mapping_vmops; | 2931 | vma->vm_ops = &special_mapping_vmops; |
diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb66f3d..91b13d6a16d4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | #include <asm/pgalloc.h> | ||
28 | 29 | ||
29 | #include "internal.h" | 30 | #include "internal.h" |
30 | 31 | ||
@@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
62 | return NULL; | 63 | return NULL; |
63 | 64 | ||
64 | pmd = pmd_alloc(mm, pud, addr); | 65 | pmd = pmd_alloc(mm, pud, addr); |
65 | if (!pmd) | 66 | if (!pmd) { |
67 | pud_free(mm, pud); | ||
66 | return NULL; | 68 | return NULL; |
69 | } | ||
67 | 70 | ||
68 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 71 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
69 | 72 | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3f0c895c71fe..6c7b0187be8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -36,8 +36,11 @@ | |||
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | #include <linux/timer.h> | 37 | #include <linux/timer.h> |
38 | #include <linux/sched/rt.h> | 38 | #include <linux/sched/rt.h> |
39 | #include <linux/mm_inline.h> | ||
39 | #include <trace/events/writeback.h> | 40 | #include <trace/events/writeback.h> |
40 | 41 | ||
42 | #include "internal.h" | ||
43 | |||
41 | /* | 44 | /* |
42 | * Sleep at most 200ms at a time in balance_dirty_pages(). | 45 | * Sleep at most 200ms at a time in balance_dirty_pages(). |
43 | */ | 46 | */ |
@@ -241,9 +244,6 @@ static unsigned long global_dirtyable_memory(void) | |||
241 | if (!vm_highmem_is_dirtyable) | 244 | if (!vm_highmem_is_dirtyable) |
242 | x -= highmem_dirtyable_memory(x); | 245 | x -= highmem_dirtyable_memory(x); |
243 | 246 | ||
244 | /* Subtract min_free_kbytes */ | ||
245 | x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); | ||
246 | |||
247 | return x + 1; /* Ensure that we never return 0 */ | 247 | return x + 1; /* Ensure that we never return 0 */ |
248 | } | 248 | } |
249 | 249 | ||
@@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
585 | } | 585 | } |
586 | 586 | ||
587 | /* | 587 | /* |
588 | * | ||
589 | * f(dirty) := 1.0 + ((setpoint - dirty) / (limit - setpoint))^3 | ||
590 | * | ||
591 | * | ||
592 | * it's a 3rd order polynomial subject to | ||
593 | * | ||
594 | * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast | ||
595 | * (2) f(setpoint) = 1.0 => the balance point | ||
596 | * (3) f(limit) = 0 => the hard limit | ||
597 | * (4) df/dx <= 0 => negative feedback control | ||
598 | * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) | ||
599 | * => fast response on large errors; small oscillation near setpoint | ||
600 | */ | ||
601 | static inline long long pos_ratio_polynom(unsigned long setpoint, | ||
602 | unsigned long dirty, | ||
603 | unsigned long limit) | ||
604 | { | ||
605 | long long pos_ratio; | ||
606 | long x; | ||
607 | |||
608 | x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, | ||
609 | limit - setpoint + 1); | ||
610 | pos_ratio = x; | ||
611 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
612 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | ||
613 | pos_ratio += 1 << RATELIMIT_CALC_SHIFT; | ||
614 | |||
615 | return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); | ||
616 | } | ||
617 | |||
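The fixed-point code above implements the cubic curve from the comment. A small floating-point userspace sketch, for illustration only and with made-up freerun/limit values, verifies the three anchor points f(freerun) = 2.0, f(setpoint) = 1.0 and f(limit) = 0.0 listed as properties (1)-(3):

#include <stdio.h>

static double pos_ratio(double setpoint, double dirty, double limit)
{
	double x = (setpoint - dirty) / (limit - setpoint);
	double f = 1.0 + x * x * x;			/* the 3rd order polynomial */

	return f < 0.0 ? 0.0 : (f > 2.0 ? 2.0 : f);	/* clamp to [0, 2] */
}

int main(void)
{
	double freerun = 100.0, limit = 300.0;
	double setpoint = (freerun + limit) / 2;	/* 200, as in bdi_position_ratio() */

	/* prints 2.0 1.0 0.0 */
	printf("%.1f %.1f %.1f\n",
	       pos_ratio(setpoint, freerun, limit),
	       pos_ratio(setpoint, setpoint, limit),
	       pos_ratio(setpoint, limit, limit));
	return 0;
}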
618 | /* | ||
588 | * Dirty position control. | 619 | * Dirty position control. |
589 | * | 620 | * |
590 | * (o) global/bdi setpoints | 621 | * (o) global/bdi setpoints |
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, | |||
682 | /* | 713 | /* |
683 | * global setpoint | 714 | * global setpoint |
684 | * | 715 | * |
685 | * setpoint - dirty 3 | 716 | * See comment for pos_ratio_polynom(). |
686 | * f(dirty) := 1.0 + (----------------) | 717 | */ |
687 | * limit - setpoint | 718 | setpoint = (freerun + limit) / 2; |
719 | pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); | ||
720 | |||
721 | /* | ||
722 | * The strictlimit feature is a tool preventing mistrusted filesystems | ||
723 | * from growing a large number of dirty pages before throttling. For | ||
724 | * such filesystems balance_dirty_pages always checks bdi counters | ||
725 | * against bdi limits, even if global "nr_dirty" is under "freerun". | ||
726 | * This is especially important for fuse which sets bdi->max_ratio to | ||
727 | * 1% by default. Without strictlimit feature, fuse writeback may | ||
728 | * consume arbitrary amount of RAM because it is accounted in | ||
729 | * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". | ||
688 | * | 730 | * |
689 | * it's a 3rd order polynomial that subjects to | 731 | * Here, in bdi_position_ratio(), we calculate pos_ratio based on |
732 | * two values: bdi_dirty and bdi_thresh. Let's consider an example: | ||
733 | * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global | ||
734 | * limits are set by default to 10% and 20% (background and throttle). | ||
735 | * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. | ||
736 | * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is | ||
737 | * about ~6K pages (as the average of background and throttle bdi | ||
738 | * limits). The 3rd order polynomial will provide positive feedback if | ||
739 | * bdi_dirty is under bdi_setpoint and vice versa. | ||
690 | * | 740 | * |
691 | * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast | 741 | * Note, that we cannot use global counters in these calculations |
692 | * (2) f(setpoint) = 1.0 => the balance point | 742 | * because we want to throttle process writing to a strictlimit BDI |
693 | * (3) f(limit) = 0 => the hard limit | 743 | * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB |
694 | * (4) df/dx <= 0 => negative feedback control | 744 | * in the example above). |
695 | * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) | ||
696 | * => fast response on large errors; small oscillation near setpoint | ||
697 | */ | 745 | */ |
698 | setpoint = (freerun + limit) / 2; | 746 | if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { |
699 | x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, | 747 | long long bdi_pos_ratio; |
700 | limit - setpoint + 1); | 748 | unsigned long bdi_bg_thresh; |
701 | pos_ratio = x; | 749 | |
702 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | 750 | if (bdi_dirty < 8) |
703 | pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; | 751 | return min_t(long long, pos_ratio * 2, |
704 | pos_ratio += 1 << RATELIMIT_CALC_SHIFT; | 752 | 2 << RATELIMIT_CALC_SHIFT); |
753 | |||
754 | if (bdi_dirty >= bdi_thresh) | ||
755 | return 0; | ||
756 | |||
757 | bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); | ||
758 | bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, | ||
759 | bdi_bg_thresh); | ||
760 | |||
761 | if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) | ||
762 | return 0; | ||
763 | |||
764 | bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, | ||
765 | bdi_thresh); | ||
766 | |||
767 | /* | ||
768 | * Typically, for strictlimit case, bdi_setpoint << setpoint | ||
769 | * and pos_ratio >> bdi_pos_ratio. In other words, the global | ||
770 | * state ("dirty") is not the limiting factor and we have to | ||
771 | * make decision based on bdi counters. But there is an | ||
772 | * important case when global pos_ratio should get precedence: | ||
773 | * global limits are exceeded (e.g. due to activities on other | ||
774 | * BDIs) while given strictlimit BDI is below limit. | ||
775 | * | ||
776 | * "pos_ratio * bdi_pos_ratio" would work for the case above, | ||
777 | * but it would look unnatural in the case where all | ||
778 | * activity in the system comes from a single strictlimit BDI | ||
779 | * with bdi->max_ratio == 100%. | ||
780 | * | ||
781 | * Note that min() below somewhat changes the dynamics of the | ||
782 | * control system. Normally, pos_ratio value can be well over 3 | ||
783 | * (when globally we are at freerun and bdi is well below bdi | ||
784 | * setpoint). Now the maximum pos_ratio in the same situation | ||
785 | * is 2. We might want to tweak this if we observe the control | ||
786 | * system is too slow to adapt. | ||
787 | */ | ||
788 | return min(pos_ratio, bdi_pos_ratio); | ||
789 | } | ||
705 | 790 | ||
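The 16GB example in the strictlimit comment above can be checked with straightforward arithmetic. A userspace sketch, for illustration only (the 4KB page size and the 1%/10%/20% ratios are assumptions taken directly from that comment):

#include <stdio.h>

int main(void)
{
	unsigned long long total_pages = (16ULL << 30) >> 12;	/* 16GB of 4KB pages */
	unsigned long long thresh = total_pages * 20 / 100;	/* global throttle limit, 20% */
	unsigned long long bg_thresh = total_pages * 10 / 100;	/* global background limit, 10% */
	unsigned long long bdi_thresh = thresh / 100;		/* bdi->max_ratio = 1% */
	unsigned long long bdi_bg_thresh = bg_thresh / 100;
	unsigned long long bdi_setpoint = (bdi_thresh + bdi_bg_thresh) / 2;

	/* prints roughly 8K, 4K and 6K pages, matching the comment */
	printf("bdi_thresh=%llu bdi_bg_thresh=%llu bdi_setpoint=%llu\n",
	       bdi_thresh, bdi_bg_thresh, bdi_setpoint);
	return 0;
}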
706 | /* | 791 | /* |
707 | * We have computed basic pos_ratio above based on global situation. If | 792 | * We have computed basic pos_ratio above based on global situation. If |
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
994 | * keep that period small to reduce time lags). | 1079 | * keep that period small to reduce time lags). |
995 | */ | 1080 | */ |
996 | step = 0; | 1081 | step = 0; |
1082 | |||
1083 | /* | ||
1084 | * For the strictlimit case, the calculations above were based on bdi counters | ||
1085 | * and limits (starting from pos_ratio = bdi_position_ratio() and up to | ||
1086 | * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). | ||
1087 | * Hence, to calculate "step" properly, we have to use bdi_dirty as | ||
1088 | * "dirty" and bdi_setpoint as "setpoint". | ||
1089 | * | ||
1090 | * We forcibly ramp up dirty_ratelimit if bdi_dirty is low because | ||
1091 | * it's possible that bdi_thresh is close to zero due to inactivity | ||
1092 | * of the backing device (see the implementation of bdi_dirty_limit()). | ||
1093 | */ | ||
1094 | if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { | ||
1095 | dirty = bdi_dirty; | ||
1096 | if (bdi_dirty < 8) | ||
1097 | setpoint = bdi_dirty + 1; | ||
1098 | else | ||
1099 | setpoint = (bdi_thresh + | ||
1100 | bdi_dirty_limit(bdi, bg_thresh)) / 2; | ||
1101 | } | ||
1102 | |||
997 | if (dirty < setpoint) { | 1103 | if (dirty < setpoint) { |
998 | x = min(bdi->balanced_dirty_ratelimit, | 1104 | x = min(bdi->balanced_dirty_ratelimit, |
999 | min(balanced_dirty_ratelimit, task_ratelimit)); | 1105 | min(balanced_dirty_ratelimit, task_ratelimit)); |
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, | |||
1198 | return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; | 1304 | return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; |
1199 | } | 1305 | } |
1200 | 1306 | ||
1307 | static inline void bdi_dirty_limits(struct backing_dev_info *bdi, | ||
1308 | unsigned long dirty_thresh, | ||
1309 | unsigned long background_thresh, | ||
1310 | unsigned long *bdi_dirty, | ||
1311 | unsigned long *bdi_thresh, | ||
1312 | unsigned long *bdi_bg_thresh) | ||
1313 | { | ||
1314 | unsigned long bdi_reclaimable; | ||
1315 | |||
1316 | /* | ||
1317 | * bdi_thresh is not treated as a hard limiting factor the way | ||
1318 | * dirty_thresh is, for these reasons: | ||
1319 | * - in JBOD setup, bdi_thresh can fluctuate a lot | ||
1320 | * - in a system with HDD and USB key, the USB key may somehow | ||
1321 | * go into state (bdi_dirty >> bdi_thresh) either because | ||
1322 | * bdi_dirty starts high, or because bdi_thresh drops low. | ||
1323 | * In this case we don't want to hard throttle the USB key | ||
1324 | * dirtiers for 100 seconds until bdi_dirty drops under | ||
1325 | * bdi_thresh. Instead the auxiliary bdi control line in | ||
1326 | * bdi_position_ratio() will let the dirtier task progress | ||
1327 | * at some rate <= (write_bw / 2) for bringing down bdi_dirty. | ||
1328 | */ | ||
1329 | *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | ||
1330 | |||
1331 | if (bdi_bg_thresh) | ||
1332 | *bdi_bg_thresh = div_u64((u64)*bdi_thresh * | ||
1333 | background_thresh, | ||
1334 | dirty_thresh); | ||
1335 | |||
1336 | /* | ||
1337 | * In order to avoid the stacked BDI deadlock we need | ||
1338 | * to ensure we accurately count the 'dirty' pages when | ||
1339 | * the threshold is low. | ||
1340 | * | ||
1341 | * Otherwise it would be possible to get thresh+n pages | ||
1342 | * reported dirty, even though there are thresh-m pages | ||
1343 | * actually dirty; with m+n sitting in the percpu | ||
1344 | * deltas. | ||
1345 | */ | ||
1346 | if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { | ||
1347 | bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | ||
1348 | *bdi_dirty = bdi_reclaimable + | ||
1349 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
1350 | } else { | ||
1351 | bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | ||
1352 | *bdi_dirty = bdi_reclaimable + | ||
1353 | bdi_stat(bdi, BDI_WRITEBACK); | ||
1354 | } | ||
1355 | } | ||
1356 | |||
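The percpu-delta problem described in the comment above is easy to see with numbers. In this userspace sketch (made-up values, not kernel code), the cheap approximate counter (what bdi_stat() would read) sits above a small threshold while the exact sum (what bdi_stat_sum() would read) is far below it, because the not-yet-folded per-cpu deltas are negative:

#include <stdio.h>

int main(void)
{
	long bdi_thresh = 100;				/* small bdi threshold */
	long global_count = 110;			/* approximate counter value */
	long percpu_deltas[4] = { -20, -15, -10, -5 };	/* decrements not yet folded in */
	long exact = global_count;

	for (int i = 0; i < 4; i++)
		exact += percpu_deltas[i];		/* exact sum: 60 */

	/* approx 110 > thresh 100, yet exact 60 is well below it */
	printf("approx=%ld exact=%ld thresh=%ld\n", global_count, exact, bdi_thresh);
	return 0;
}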
1201 | /* | 1357 | /* |
1202 | * balance_dirty_pages() must be called by processes which are generating dirty | 1358 | * balance_dirty_pages() must be called by processes which are generating dirty |
1203 | * data. It looks at the number of dirty pages in the machine and will force | 1359 | * data. It looks at the number of dirty pages in the machine and will force |
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1209 | unsigned long pages_dirtied) | 1365 | unsigned long pages_dirtied) |
1210 | { | 1366 | { |
1211 | unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ | 1367 | unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ |
1212 | unsigned long bdi_reclaimable; | ||
1213 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ | 1368 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
1214 | unsigned long bdi_dirty; | ||
1215 | unsigned long freerun; | ||
1216 | unsigned long background_thresh; | 1369 | unsigned long background_thresh; |
1217 | unsigned long dirty_thresh; | 1370 | unsigned long dirty_thresh; |
1218 | unsigned long bdi_thresh; | ||
1219 | long period; | 1371 | long period; |
1220 | long pause; | 1372 | long pause; |
1221 | long max_pause; | 1373 | long max_pause; |
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1226 | unsigned long dirty_ratelimit; | 1378 | unsigned long dirty_ratelimit; |
1227 | unsigned long pos_ratio; | 1379 | unsigned long pos_ratio; |
1228 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1380 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
1381 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; | ||
1229 | unsigned long start_time = jiffies; | 1382 | unsigned long start_time = jiffies; |
1230 | 1383 | ||
1231 | for (;;) { | 1384 | for (;;) { |
1232 | unsigned long now = jiffies; | 1385 | unsigned long now = jiffies; |
1386 | unsigned long uninitialized_var(bdi_thresh); | ||
1387 | unsigned long thresh; | ||
1388 | unsigned long uninitialized_var(bdi_dirty); | ||
1389 | unsigned long dirty; | ||
1390 | unsigned long bg_thresh; | ||
1233 | 1391 | ||
1234 | /* | 1392 | /* |
1235 | * Unstable writes are a feature of certain networked | 1393 | * Unstable writes are a feature of certain networked |
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1243 | 1401 | ||
1244 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1402 | global_dirty_limits(&background_thresh, &dirty_thresh); |
1245 | 1403 | ||
1404 | if (unlikely(strictlimit)) { | ||
1405 | bdi_dirty_limits(bdi, dirty_thresh, background_thresh, | ||
1406 | &bdi_dirty, &bdi_thresh, &bg_thresh); | ||
1407 | |||
1408 | dirty = bdi_dirty; | ||
1409 | thresh = bdi_thresh; | ||
1410 | } else { | ||
1411 | dirty = nr_dirty; | ||
1412 | thresh = dirty_thresh; | ||
1413 | bg_thresh = background_thresh; | ||
1414 | } | ||
1415 | |||
1246 | /* | 1416 | /* |
1247 | * Throttle it only when the background writeback cannot | 1417 | * Throttle it only when the background writeback cannot |
1248 | * catch-up. This avoids (excessively) small writeouts | 1418 | * catch-up. This avoids (excessively) small writeouts |
1249 | * when the bdi limits are ramping up. | 1419 | * when the bdi limits are ramping up in case of !strictlimit. |
1420 | * | ||
1421 | * In the strictlimit case, make the decision based on the bdi counters | ||
1422 | * and limits. Small writeouts when the bdi limits are ramping | ||
1423 | * up are the price we consciously pay for strictlimit-ing. | ||
1250 | */ | 1424 | */ |
1251 | freerun = dirty_freerun_ceiling(dirty_thresh, | 1425 | if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { |
1252 | background_thresh); | ||
1253 | if (nr_dirty <= freerun) { | ||
1254 | current->dirty_paused_when = now; | 1426 | current->dirty_paused_when = now; |
1255 | current->nr_dirtied = 0; | 1427 | current->nr_dirtied = 0; |
1256 | current->nr_dirtied_pause = | 1428 | current->nr_dirtied_pause = |
1257 | dirty_poll_interval(nr_dirty, dirty_thresh); | 1429 | dirty_poll_interval(dirty, thresh); |
1258 | break; | 1430 | break; |
1259 | } | 1431 | } |
1260 | 1432 | ||
1261 | if (unlikely(!writeback_in_progress(bdi))) | 1433 | if (unlikely(!writeback_in_progress(bdi))) |
1262 | bdi_start_background_writeback(bdi); | 1434 | bdi_start_background_writeback(bdi); |
1263 | 1435 | ||
1264 | /* | 1436 | if (!strictlimit) |
1265 | * bdi_thresh is not treated as some limiting factor as | 1437 | bdi_dirty_limits(bdi, dirty_thresh, background_thresh, |
1266 | * dirty_thresh, due to reasons | 1438 | &bdi_dirty, &bdi_thresh, NULL); |
1267 | * - in JBOD setup, bdi_thresh can fluctuate a lot | ||
1268 | * - in a system with HDD and USB key, the USB key may somehow | ||
1269 | * go into state (bdi_dirty >> bdi_thresh) either because | ||
1270 | * bdi_dirty starts high, or because bdi_thresh drops low. | ||
1271 | * In this case we don't want to hard throttle the USB key | ||
1272 | * dirtiers for 100 seconds until bdi_dirty drops under | ||
1273 | * bdi_thresh. Instead the auxiliary bdi control line in | ||
1274 | * bdi_position_ratio() will let the dirtier task progress | ||
1275 | * at some rate <= (write_bw / 2) for bringing down bdi_dirty. | ||
1276 | */ | ||
1277 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | ||
1278 | |||
1279 | /* | ||
1280 | * In order to avoid the stacked BDI deadlock we need | ||
1281 | * to ensure we accurately count the 'dirty' pages when | ||
1282 | * the threshold is low. | ||
1283 | * | ||
1284 | * Otherwise it would be possible to get thresh+n pages | ||
1285 | * reported dirty, even though there are thresh-m pages | ||
1286 | * actually dirty; with m+n sitting in the percpu | ||
1287 | * deltas. | ||
1288 | */ | ||
1289 | if (bdi_thresh < 2 * bdi_stat_error(bdi)) { | ||
1290 | bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | ||
1291 | bdi_dirty = bdi_reclaimable + | ||
1292 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
1293 | } else { | ||
1294 | bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | ||
1295 | bdi_dirty = bdi_reclaimable + | ||
1296 | bdi_stat(bdi, BDI_WRITEBACK); | ||
1297 | } | ||
1298 | 1439 | ||
1299 | dirty_exceeded = (bdi_dirty > bdi_thresh) && | 1440 | dirty_exceeded = (bdi_dirty > bdi_thresh) && |
1300 | (nr_dirty > dirty_thresh); | 1441 | ((nr_dirty > dirty_thresh) || strictlimit); |
1301 | if (dirty_exceeded && !bdi->dirty_exceeded) | 1442 | if (dirty_exceeded && !bdi->dirty_exceeded) |
1302 | bdi->dirty_exceeded = 1; | 1443 | bdi->dirty_exceeded = 1; |
1303 | 1444 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2b59dbda196..0ee638f76ebe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
57 | #include <linux/memcontrol.h> | 57 | #include <linux/memcontrol.h> |
58 | #include <linux/prefetch.h> | 58 | #include <linux/prefetch.h> |
59 | #include <linux/mm_inline.h> | ||
59 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
60 | #include <linux/page-debug-flags.h> | 61 | #include <linux/page-debug-flags.h> |
61 | #include <linux/hugetlb.h> | 62 | #include <linux/hugetlb.h> |
@@ -488,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) | |||
488 | * (c) a page and its buddy have the same order && | 489 | * (c) a page and its buddy have the same order && |
489 | * (d) a page and its buddy are in the same zone. | 490 | * (d) a page and its buddy are in the same zone. |
490 | * | 491 | * |
491 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. | 492 | * For recording whether a page is in the buddy system, we set ->_mapcount |
492 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. | 493 | * PAGE_BUDDY_MAPCOUNT_VALUE. |
494 | * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is | ||
495 | * serialized by zone->lock. | ||
493 | * | 496 | * |
494 | * For recording page's order, we use page_private(page). | 497 | * For recording page's order, we use page_private(page). |
495 | */ | 498 | */ |
@@ -527,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
527 | * as necessary, plus some accounting needed to play nicely with other | 530 | * as necessary, plus some accounting needed to play nicely with other |
528 | * parts of the VM system. | 531 | * parts of the VM system. |
529 | * At each level, we keep a list of pages, which are heads of continuous | 532 | * At each level, we keep a list of pages, which are heads of continuous |
530 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's | 533 | * free pages of length of (1 << order) and marked with _mapcount |
531 | * order is recorded in page_private(page) field. | 534 | * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) |
535 | * field. | ||
532 | * So when we are allocating or freeing one, we can derive the state of the | 536 | * So when we are allocating or freeing one, we can derive the state of the |
533 | * other. That is, if we allocate a small block, and both were | 537 | * other. That is, if we allocate a small block, and both were |
534 | * free, the remainder of the region must be split into blocks. | 538 | * free, the remainder of the region must be split into blocks. |
@@ -647,7 +651,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
647 | int to_free = count; | 651 | int to_free = count; |
648 | 652 | ||
649 | spin_lock(&zone->lock); | 653 | spin_lock(&zone->lock); |
650 | zone->all_unreclaimable = 0; | ||
651 | zone->pages_scanned = 0; | 654 | zone->pages_scanned = 0; |
652 | 655 | ||
653 | while (to_free) { | 656 | while (to_free) { |
@@ -696,7 +699,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
696 | int migratetype) | 699 | int migratetype) |
697 | { | 700 | { |
698 | spin_lock(&zone->lock); | 701 | spin_lock(&zone->lock); |
699 | zone->all_unreclaimable = 0; | ||
700 | zone->pages_scanned = 0; | 702 | zone->pages_scanned = 0; |
701 | 703 | ||
702 | __free_one_page(page, zone, order, migratetype); | 704 | __free_one_page(page, zone, order, migratetype); |
@@ -721,7 +723,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
721 | return false; | 723 | return false; |
722 | 724 | ||
723 | if (!PageHighMem(page)) { | 725 | if (!PageHighMem(page)) { |
724 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 726 | debug_check_no_locks_freed(page_address(page), |
727 | PAGE_SIZE << order); | ||
725 | debug_check_no_obj_freed(page_address(page), | 728 | debug_check_no_obj_freed(page_address(page), |
726 | PAGE_SIZE << order); | 729 | PAGE_SIZE << order); |
727 | } | 730 | } |
@@ -750,19 +753,19 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
750 | void __init __free_pages_bootmem(struct page *page, unsigned int order) | 753 | void __init __free_pages_bootmem(struct page *page, unsigned int order) |
751 | { | 754 | { |
752 | unsigned int nr_pages = 1 << order; | 755 | unsigned int nr_pages = 1 << order; |
756 | struct page *p = page; | ||
753 | unsigned int loop; | 757 | unsigned int loop; |
754 | 758 | ||
755 | prefetchw(page); | 759 | prefetchw(p); |
756 | for (loop = 0; loop < nr_pages; loop++) { | 760 | for (loop = 0; loop < (nr_pages - 1); loop++, p++) { |
757 | struct page *p = &page[loop]; | 761 | prefetchw(p + 1); |
758 | |||
759 | if (loop + 1 < nr_pages) | ||
760 | prefetchw(p + 1); | ||
761 | __ClearPageReserved(p); | 762 | __ClearPageReserved(p); |
762 | set_page_count(p, 0); | 763 | set_page_count(p, 0); |
763 | } | 764 | } |
765 | __ClearPageReserved(p); | ||
766 | set_page_count(p, 0); | ||
764 | 767 | ||
765 | page_zone(page)->managed_pages += 1 << order; | 768 | page_zone(page)->managed_pages += nr_pages; |
766 | set_page_refcounted(page); | 769 | set_page_refcounted(page); |
767 | __free_pages(page, order); | 770 | __free_pages(page, order); |
768 | } | 771 | } |
@@ -885,7 +888,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
885 | int migratetype) | 888 | int migratetype) |
886 | { | 889 | { |
887 | unsigned int current_order; | 890 | unsigned int current_order; |
888 | struct free_area * area; | 891 | struct free_area *area; |
889 | struct page *page; | 892 | struct page *page; |
890 | 893 | ||
891 | /* Find a page of the appropriate size in the preferred list */ | 894 | /* Find a page of the appropriate size in the preferred list */ |
@@ -1007,14 +1010,60 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
1007 | } | 1010 | } |
1008 | } | 1011 | } |
1009 | 1012 | ||
1013 | /* | ||
1014 | * If breaking a large block of pages, move all free pages to the preferred | ||
1015 | * allocation list. If falling back for a reclaimable kernel allocation, be | ||
1016 | * more aggressive about taking ownership of free pages. | ||
1017 | * | ||
1018 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | ||
1019 | * nor move CMA pages to different free lists. We don't want unmovable pages | ||
1020 | * to be allocated from MIGRATE_CMA areas. | ||
1021 | * | ||
1022 | * Returns the new migratetype of the pageblock (or the same old migratetype | ||
1023 | * if it was unchanged). | ||
1024 | */ | ||
1025 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | ||
1026 | int start_type, int fallback_type) | ||
1027 | { | ||
1028 | int current_order = page_order(page); | ||
1029 | |||
1030 | if (is_migrate_cma(fallback_type)) | ||
1031 | return fallback_type; | ||
1032 | |||
1033 | /* Take ownership for orders >= pageblock_order */ | ||
1034 | if (current_order >= pageblock_order) { | ||
1035 | change_pageblock_range(page, current_order, start_type); | ||
1036 | return start_type; | ||
1037 | } | ||
1038 | |||
1039 | if (current_order >= pageblock_order / 2 || | ||
1040 | start_type == MIGRATE_RECLAIMABLE || | ||
1041 | page_group_by_mobility_disabled) { | ||
1042 | int pages; | ||
1043 | |||
1044 | pages = move_freepages_block(zone, page, start_type); | ||
1045 | |||
1046 | /* Claim the whole block if over half of it is free */ | ||
1047 | if (pages >= (1 << (pageblock_order-1)) || | ||
1048 | page_group_by_mobility_disabled) { | ||
1049 | |||
1050 | set_pageblock_migratetype(page, start_type); | ||
1051 | return start_type; | ||
1052 | } | ||
1053 | |||
1054 | } | ||
1055 | |||
1056 | return fallback_type; | ||
1057 | } | ||
1058 | |||
1010 | /* Remove an element from the buddy allocator from the fallback list */ | 1059 | /* Remove an element from the buddy allocator from the fallback list */ |
1011 | static inline struct page * | 1060 | static inline struct page * |
1012 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 1061 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
1013 | { | 1062 | { |
1014 | struct free_area * area; | 1063 | struct free_area *area; |
1015 | int current_order; | 1064 | int current_order; |
1016 | struct page *page; | 1065 | struct page *page; |
1017 | int migratetype, i; | 1066 | int migratetype, new_type, i; |
1018 | 1067 | ||
1019 | /* Find the largest possible block of pages in the other list */ | 1068 | /* Find the largest possible block of pages in the other list */ |
1020 | for (current_order = MAX_ORDER-1; current_order >= order; | 1069 | for (current_order = MAX_ORDER-1; current_order >= order; |
@@ -1034,51 +1083,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1034 | struct page, lru); | 1083 | struct page, lru); |
1035 | area->nr_free--; | 1084 | area->nr_free--; |
1036 | 1085 | ||
1037 | /* | 1086 | new_type = try_to_steal_freepages(zone, page, |
1038 | * If breaking a large block of pages, move all free | 1087 | start_migratetype, |
1039 | * pages to the preferred allocation list. If falling | 1088 | migratetype); |
1040 | * back for a reclaimable kernel allocation, be more | ||
1041 | * aggressive about taking ownership of free pages | ||
1042 | * | ||
1043 | * On the other hand, never change migration | ||
1044 | * type of MIGRATE_CMA pageblocks nor move CMA | ||
1045 | * pages on different free lists. We don't | ||
1046 | * want unmovable pages to be allocated from | ||
1047 | * MIGRATE_CMA areas. | ||
1048 | */ | ||
1049 | if (!is_migrate_cma(migratetype) && | ||
1050 | (current_order >= pageblock_order / 2 || | ||
1051 | start_migratetype == MIGRATE_RECLAIMABLE || | ||
1052 | page_group_by_mobility_disabled)) { | ||
1053 | int pages; | ||
1054 | pages = move_freepages_block(zone, page, | ||
1055 | start_migratetype); | ||
1056 | |||
1057 | /* Claim the whole block if over half of it is free */ | ||
1058 | if (pages >= (1 << (pageblock_order-1)) || | ||
1059 | page_group_by_mobility_disabled) | ||
1060 | set_pageblock_migratetype(page, | ||
1061 | start_migratetype); | ||
1062 | |||
1063 | migratetype = start_migratetype; | ||
1064 | } | ||
1065 | 1089 | ||
1066 | /* Remove the page from the freelists */ | 1090 | /* Remove the page from the freelists */ |
1067 | list_del(&page->lru); | 1091 | list_del(&page->lru); |
1068 | rmv_page_order(page); | 1092 | rmv_page_order(page); |
1069 | 1093 | ||
1070 | /* Take ownership for orders >= pageblock_order */ | 1094 | /* |
1071 | if (current_order >= pageblock_order && | 1095 | * Borrow the excess buddy pages as well, irrespective |
1072 | !is_migrate_cma(migratetype)) | 1096 | * of whether we stole freepages, or took ownership of |
1073 | change_pageblock_range(page, current_order, | 1097 | * the pageblock or not. |
1074 | start_migratetype); | 1098 | * |
1075 | 1099 | * Exception: When borrowing from MIGRATE_CMA, release | |
1100 | * the excess buddy pages to CMA itself. | ||
1101 | */ | ||
1076 | expand(zone, page, order, current_order, area, | 1102 | expand(zone, page, order, current_order, area, |
1077 | is_migrate_cma(migratetype) | 1103 | is_migrate_cma(migratetype) |
1078 | ? migratetype : start_migratetype); | 1104 | ? migratetype : start_migratetype); |
1079 | 1105 | ||
1080 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1106 | trace_mm_page_alloc_extfrag(page, order, |
1081 | start_migratetype, migratetype); | 1107 | current_order, start_migratetype, migratetype, |
1108 | new_type == start_migratetype); | ||
1082 | 1109 | ||
1083 | return page; | 1110 | return page; |
1084 | } | 1111 | } |
@@ -1281,7 +1308,7 @@ void mark_free_pages(struct zone *zone) | |||
1281 | int order, t; | 1308 | int order, t; |
1282 | struct list_head *curr; | 1309 | struct list_head *curr; |
1283 | 1310 | ||
1284 | if (!zone->spanned_pages) | 1311 | if (zone_is_empty(zone)) |
1285 | return; | 1312 | return; |
1286 | 1313 | ||
1287 | spin_lock_irqsave(&zone->lock, flags); | 1314 | spin_lock_irqsave(&zone->lock, flags); |
@@ -1526,6 +1553,7 @@ again: | |||
1526 | get_pageblock_migratetype(page)); | 1553 | get_pageblock_migratetype(page)); |
1527 | } | 1554 | } |
1528 | 1555 | ||
1556 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | ||
1529 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1557 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1530 | zone_statistics(preferred_zone, zone, gfp_flags); | 1558 | zone_statistics(preferred_zone, zone, gfp_flags); |
1531 | local_irq_restore(flags); | 1559 | local_irq_restore(flags); |
@@ -1792,6 +1820,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) | |||
1792 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1820 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1793 | } | 1821 | } |
1794 | 1822 | ||
1823 | static bool zone_local(struct zone *local_zone, struct zone *zone) | ||
1824 | { | ||
1825 | return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; | ||
1826 | } | ||
1827 | |||
1795 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 1828 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
1796 | { | 1829 | { |
1797 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | 1830 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); |
@@ -1829,6 +1862,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) | |||
1829 | { | 1862 | { |
1830 | } | 1863 | } |
1831 | 1864 | ||
1865 | static bool zone_local(struct zone *local_zone, struct zone *zone) | ||
1866 | { | ||
1867 | return true; | ||
1868 | } | ||
1869 | |||
1832 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | 1870 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) |
1833 | { | 1871 | { |
1834 | return true; | 1872 | return true; |
@@ -1860,16 +1898,41 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | |||
1860 | zonelist_scan: | 1898 | zonelist_scan: |
1861 | /* | 1899 | /* |
1862 | * Scan zonelist, looking for a zone with enough free. | 1900 | * Scan zonelist, looking for a zone with enough free. |
1863 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1901 | * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. |
1864 | */ | 1902 | */ |
1865 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1903 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1866 | high_zoneidx, nodemask) { | 1904 | high_zoneidx, nodemask) { |
1905 | unsigned long mark; | ||
1906 | |||
1867 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 1907 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
1868 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1908 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1869 | continue; | 1909 | continue; |
1870 | if ((alloc_flags & ALLOC_CPUSET) && | 1910 | if ((alloc_flags & ALLOC_CPUSET) && |
1871 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1911 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1872 | continue; | 1912 | continue; |
1913 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
1914 | if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) | ||
1915 | goto try_this_zone; | ||
1916 | /* | ||
1917 | * Distribute pages in proportion to the individual | ||
1918 | * zone size to ensure fair page aging. The zone a | ||
1919 | * page was allocated in should have no effect on the | ||
1920 | * time the page spends in memory before being reclaimed. | ||
1921 | * | ||
1922 | * When zone_reclaim_mode is enabled, try to stay in | ||
1923 | * local zones in the fastpath. If that fails, the | ||
1924 | * slowpath is entered, which will do another pass | ||
1925 | * starting with the local zones, but ultimately fall | ||
1926 | * back to remote zones that do not partake in the | ||
1927 | * fairness round-robin cycle of this zonelist. | ||
1928 | */ | ||
1929 | if (alloc_flags & ALLOC_WMARK_LOW) { | ||
1930 | if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) | ||
1931 | continue; | ||
1932 | if (zone_reclaim_mode && | ||
1933 | !zone_local(preferred_zone, zone)) | ||
1934 | continue; | ||
1935 | } | ||
1873 | /* | 1936 | /* |
1874 | * When allocating a page cache page for writing, we | 1937 | * When allocating a page cache page for writing, we |
1875 | * want to get it from a zone that is within its dirty | 1938 | * want to get it from a zone that is within its dirty |
@@ -1900,16 +1963,11 @@ zonelist_scan: | |||
1900 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 1963 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) |
1901 | goto this_zone_full; | 1964 | goto this_zone_full; |
1902 | 1965 | ||
1903 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1966 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1904 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1967 | if (!zone_watermark_ok(zone, order, mark, |
1905 | unsigned long mark; | 1968 | classzone_idx, alloc_flags)) { |
1906 | int ret; | 1969 | int ret; |
1907 | 1970 | ||
1908 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | ||
1909 | if (zone_watermark_ok(zone, order, mark, | ||
1910 | classzone_idx, alloc_flags)) | ||
1911 | goto try_this_zone; | ||
1912 | |||
1913 | if (IS_ENABLED(CONFIG_NUMA) && | 1971 | if (IS_ENABLED(CONFIG_NUMA) && |
1914 | !did_zlc_setup && nr_online_nodes > 1) { | 1972 | !did_zlc_setup && nr_online_nodes > 1) { |
1915 | /* | 1973 | /* |
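A note for readers following the fairness change in the hunk above: the added NR_ALLOC_BATCH check gives each zone an allocation budget proportional to its size and makes the fast path skip zones whose budget is exhausted; once every eligible zone is exhausted, the slow path resets the budgets (see prepare_slowpath further down). The following is a stand-alone userspace sketch of that round-robin idea, not kernel code; all names and numbers are invented for illustration.

/*
 * Toy model of the NR_ALLOC_BATCH fairness scheme: each zone gets a
 * budget proportional to its size, the fast path skips zones whose
 * budget is exhausted, and the "slow path" resets every budget.
 */
#include <stdio.h>

struct toy_zone {
	const char *name;
	long managed_pages;	/* stands in for zone->managed_pages */
	long alloc_batch;	/* stands in for the NR_ALLOC_BATCH counter */
};

static long batch_for(const struct toy_zone *z)
{
	/* the kernel uses high_wmark - low_wmark; only the scaling matters here */
	return z->managed_pages / 100;
}

static struct toy_zone *fast_path(struct toy_zone *zones, int n)
{
	for (int i = 0; i < n; i++)
		if (zones[i].alloc_batch > 0)
			return &zones[i];
	return NULL;	/* all budgets exhausted: caller enters the slow path */
}

int main(void)
{
	struct toy_zone zones[] = {
		{ "Normal", 100000, 0 },
		{ "DMA32",   25000, 0 },
	};
	struct toy_zone *z;
	long allocs = 0;

	for (int i = 0; i < 2; i++)
		zones[i].alloc_batch = batch_for(&zones[i]);

	while ((z = fast_path(zones, 2)) != NULL) {
		z->alloc_batch--;	/* one page allocated from this zone */
		allocs++;
	}
	/* prints 1250, 1000, 250: pages were taken in proportion to zone size */
	printf("%ld total, Normal %ld, DMA32 %ld\n", allocs,
	       batch_for(&zones[0]), batch_for(&zones[1]));
	return 0;
}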
@@ -2321,16 +2379,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2321 | return page; | 2379 | return page; |
2322 | } | 2380 | } |
2323 | 2381 | ||
2324 | static inline | 2382 | static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, |
2325 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 2383 | struct zonelist *zonelist, |
2326 | enum zone_type high_zoneidx, | 2384 | enum zone_type high_zoneidx, |
2327 | enum zone_type classzone_idx) | 2385 | struct zone *preferred_zone) |
2328 | { | 2386 | { |
2329 | struct zoneref *z; | 2387 | struct zoneref *z; |
2330 | struct zone *zone; | 2388 | struct zone *zone; |
2331 | 2389 | ||
2332 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 2390 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
2333 | wakeup_kswapd(zone, order, classzone_idx); | 2391 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2392 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | ||
2393 | /* | ||
2394 | * Only reset the batches of zones that were actually | ||
2395 | * considered in the fast path; we don't want to | ||
2396 | * thrash fairness information for zones that are not | ||
2397 | * actually part of this zonelist's round-robin cycle. | ||
2398 | */ | ||
2399 | if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) | ||
2400 | continue; | ||
2401 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
2402 | high_wmark_pages(zone) - | ||
2403 | low_wmark_pages(zone) - | ||
2404 | zone_page_state(zone, NR_ALLOC_BATCH)); | ||
2405 | } | ||
2334 | } | 2406 | } |
2335 | 2407 | ||
2336 | static inline int | 2408 | static inline int |
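A note on the reset arithmetic in prepare_slowpath() above: mod_zone_page_state() adjusts a counter by a signed delta, so adding (high_wmark - low_wmark - current) leaves NR_ALLOC_BATCH at exactly high_wmark - low_wmark whatever its previous value. A minimal check of that identity in plain C, with arbitrary made-up numbers:

#include <assert.h>

/* stand-in for mod_zone_page_state(): add a signed delta to a counter */
static long mod_counter(long counter, long delta)
{
	return counter + delta;
}

int main(void)
{
	long high = 120, low = 90;	/* hypothetical watermarks */
	long batch = -37;		/* the counter may even have gone negative */

	batch = mod_counter(batch, high - low - batch);
	assert(batch == high - low);	/* always resets to 30 here */
	return 0;
}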
@@ -2426,9 +2498,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2426 | goto nopage; | 2498 | goto nopage; |
2427 | 2499 | ||
2428 | restart: | 2500 | restart: |
2429 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2501 | prepare_slowpath(gfp_mask, order, zonelist, |
2430 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2502 | high_zoneidx, preferred_zone); |
2431 | zone_idx(preferred_zone)); | ||
2432 | 2503 | ||
2433 | /* | 2504 | /* |
2434 | * OK, we're below the kswapd watermark and have kicked background | 2505 | * OK, we're below the kswapd watermark and have kicked background |
@@ -3095,7 +3166,7 @@ void show_free_areas(unsigned int filter) | |||
3095 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | 3166 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), |
3096 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 3167 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
3097 | zone->pages_scanned, | 3168 | zone->pages_scanned, |
3098 | (zone->all_unreclaimable ? "yes" : "no") | 3169 | (!zone_reclaimable(zone) ? "yes" : "no") |
3099 | ); | 3170 | ); |
3100 | printk("lowmem_reserve[]:"); | 3171 | printk("lowmem_reserve[]:"); |
3101 | for (i = 0; i < MAX_NR_ZONES; i++) | 3172 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -3104,7 +3175,7 @@ void show_free_areas(unsigned int filter) | |||
3104 | } | 3175 | } |
3105 | 3176 | ||
3106 | for_each_populated_zone(zone) { | 3177 | for_each_populated_zone(zone) { |
3107 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 3178 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
3108 | unsigned char types[MAX_ORDER]; | 3179 | unsigned char types[MAX_ORDER]; |
3109 | 3180 | ||
3110 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3181 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
@@ -3416,11 +3487,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | |||
3416 | static int default_zonelist_order(void) | 3487 | static int default_zonelist_order(void) |
3417 | { | 3488 | { |
3418 | int nid, zone_type; | 3489 | int nid, zone_type; |
3419 | unsigned long low_kmem_size,total_size; | 3490 | unsigned long low_kmem_size, total_size; |
3420 | struct zone *z; | 3491 | struct zone *z; |
3421 | int average_size; | 3492 | int average_size; |
3422 | /* | 3493 | /* |
3423 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. | 3494 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. |
3424 | * If they are really small and used heavily, the system can fall | 3495 | * If they are really small and used heavily, the system can fall |
3425 | * into OOM very easily. | 3496 | * into OOM very easily. |
3426 | * This function detect ZONE_DMA/DMA32 size and configures zone order. | 3497 | * This function detect ZONE_DMA/DMA32 size and configures zone order. |
@@ -3452,9 +3523,9 @@ static int default_zonelist_order(void) | |||
3452 | return ZONELIST_ORDER_NODE; | 3523 | return ZONELIST_ORDER_NODE; |
3453 | /* | 3524 | /* |
3454 | * look into each node's config. | 3525 | * look into each node's config. |
3455 | * If there is a node whose DMA/DMA32 memory is very big area on | 3526 | * If there is a node whose DMA/DMA32 memory is very big area on |
3456 | * local memory, NODE_ORDER may be suitable. | 3527 | * local memory, NODE_ORDER may be suitable. |
3457 | */ | 3528 | */ |
3458 | average_size = total_size / | 3529 | average_size = total_size / |
3459 | (nodes_weight(node_states[N_MEMORY]) + 1); | 3530 | (nodes_weight(node_states[N_MEMORY]) + 1); |
3460 | for_each_online_node(nid) { | 3531 | for_each_online_node(nid) { |
@@ -4180,7 +4251,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
4180 | if (!zone->wait_table) | 4251 | if (!zone->wait_table) |
4181 | return -ENOMEM; | 4252 | return -ENOMEM; |
4182 | 4253 | ||
4183 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) | 4254 | for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
4184 | init_waitqueue_head(zone->wait_table + i); | 4255 | init_waitqueue_head(zone->wait_table + i); |
4185 | 4256 | ||
4186 | return 0; | 4257 | return 0; |
@@ -4237,7 +4308,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, | |||
4237 | int __meminit __early_pfn_to_nid(unsigned long pfn) | 4308 | int __meminit __early_pfn_to_nid(unsigned long pfn) |
4238 | { | 4309 | { |
4239 | unsigned long start_pfn, end_pfn; | 4310 | unsigned long start_pfn, end_pfn; |
4240 | int i, nid; | 4311 | int nid; |
4241 | /* | 4312 | /* |
4242 | * NOTE: The following SMP-unsafe globals are only used early in boot | 4313 | * NOTE: The following SMP-unsafe globals are only used early in boot |
4243 | * when the kernel is running single-threaded. | 4314 | * when the kernel is running single-threaded. |
@@ -4248,15 +4319,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) | |||
4248 | if (last_start_pfn <= pfn && pfn < last_end_pfn) | 4319 | if (last_start_pfn <= pfn && pfn < last_end_pfn) |
4249 | return last_nid; | 4320 | return last_nid; |
4250 | 4321 | ||
4251 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4322 | nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); |
4252 | if (start_pfn <= pfn && pfn < end_pfn) { | 4323 | if (nid != -1) { |
4253 | last_start_pfn = start_pfn; | 4324 | last_start_pfn = start_pfn; |
4254 | last_end_pfn = end_pfn; | 4325 | last_end_pfn = end_pfn; |
4255 | last_nid = nid; | 4326 | last_nid = nid; |
4256 | return nid; | 4327 | } |
4257 | } | 4328 | |
4258 | /* This is a memory hole */ | 4329 | return nid; |
4259 | return -1; | ||
4260 | } | 4330 | } |
4261 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | 4331 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
4262 | 4332 | ||
@@ -4586,7 +4656,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | |||
4586 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4656 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4587 | 4657 | ||
4588 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4658 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4589 | void __init set_pageblock_order(void) | 4659 | void __paginginit set_pageblock_order(void) |
4590 | { | 4660 | { |
4591 | unsigned int order; | 4661 | unsigned int order; |
4592 | 4662 | ||
@@ -4614,7 +4684,7 @@ void __init set_pageblock_order(void) | |||
4614 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4684 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4615 | * the kernel config | 4685 | * the kernel config |
4616 | */ | 4686 | */ |
4617 | void __init set_pageblock_order(void) | 4687 | void __paginginit set_pageblock_order(void) |
4618 | { | 4688 | { |
4619 | } | 4689 | } |
4620 | 4690 | ||
@@ -4728,8 +4798,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4728 | spin_lock_init(&zone->lru_lock); | 4798 | spin_lock_init(&zone->lru_lock); |
4729 | zone_seqlock_init(zone); | 4799 | zone_seqlock_init(zone); |
4730 | zone->zone_pgdat = pgdat; | 4800 | zone->zone_pgdat = pgdat; |
4731 | |||
4732 | zone_pcp_init(zone); | 4801 | zone_pcp_init(zone); |
4802 | |||
4803 | /* For bootup, initialized properly in watermark setup */ | ||
4804 | mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); | ||
4805 | |||
4733 | lruvec_init(&zone->lruvec); | 4806 | lruvec_init(&zone->lruvec); |
4734 | if (!size) | 4807 | if (!size) |
4735 | continue; | 4808 | continue; |
@@ -4930,7 +5003,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4930 | if (pages) | 5003 | if (pages) |
4931 | node_set_state(nid, N_MEMORY); | 5004 | node_set_state(nid, N_MEMORY); |
4932 | } | 5005 | } |
4933 | return totalpages; | 5006 | return totalpages; |
4934 | } | 5007 | } |
4935 | 5008 | ||
4936 | /* | 5009 | /* |
@@ -5047,7 +5120,7 @@ restart: | |||
5047 | /* | 5120 | /* |
5048 | * Some kernelcore has been met, update counts and | 5121 | * Some kernelcore has been met, update counts and |
5049 | * break if the kernelcore for this node has been | 5122 | * break if the kernelcore for this node has been |
5050 | * satisified | 5123 | * satisfied |
5051 | */ | 5124 | */ |
5052 | required_kernelcore -= min(required_kernelcore, | 5125 | required_kernelcore -= min(required_kernelcore, |
5053 | size_pages); | 5126 | size_pages); |
@@ -5061,7 +5134,7 @@ restart: | |||
5061 | * If there is still required_kernelcore, we do another pass with one | 5134 | * If there is still required_kernelcore, we do another pass with one |
5062 | * less node in the count. This will push zone_movable_pfn[nid] further | 5135 | * less node in the count. This will push zone_movable_pfn[nid] further |
5063 | * along on the nodes that still have memory until kernelcore is | 5136 | * along on the nodes that still have memory until kernelcore is |
5064 | * satisified | 5137 | * satisfied |
5065 | */ | 5138 | */ |
5066 | usable_nodes--; | 5139 | usable_nodes--; |
5067 | if (usable_nodes && required_kernelcore > usable_nodes) | 5140 | if (usable_nodes && required_kernelcore > usable_nodes) |
@@ -5286,8 +5359,10 @@ void __init mem_init_print_info(const char *str) | |||
5286 | * 3) .rodata.* may be embedded into .text or .data sections. | 5359 | * 3) .rodata.* may be embedded into .text or .data sections. |
5287 | */ | 5360 | */ |
5288 | #define adj_init_size(start, end, size, pos, adj) \ | 5361 | #define adj_init_size(start, end, size, pos, adj) \ |
5289 | if (start <= pos && pos < end && size > adj) \ | 5362 | do { \ |
5290 | size -= adj; | 5363 | if (start <= pos && pos < end && size > adj) \ |
5364 | size -= adj; \ | ||
5365 | } while (0) | ||
5291 | 5366 | ||
5292 | adj_init_size(__init_begin, __init_end, init_data_size, | 5367 | adj_init_size(__init_begin, __init_end, init_data_size, |
5293 | _sinittext, init_code_size); | 5368 | _sinittext, init_code_size); |
@@ -5361,7 +5436,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
5361 | * This is only okay since the processor is dead and cannot | 5436 | * This is only okay since the processor is dead and cannot |
5362 | * race with what we are doing. | 5437 | * race with what we are doing. |
5363 | */ | 5438 | */ |
5364 | refresh_cpu_vm_stats(cpu); | 5439 | cpu_vm_stats_fold(cpu); |
5365 | } | 5440 | } |
5366 | return NOTIFY_OK; | 5441 | return NOTIFY_OK; |
5367 | } | 5442 | } |
@@ -5498,6 +5573,11 @@ static void __setup_per_zone_wmarks(void) | |||
5498 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5573 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5499 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5574 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5500 | 5575 | ||
5576 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, | ||
5577 | high_wmark_pages(zone) - | ||
5578 | low_wmark_pages(zone) - | ||
5579 | zone_page_state(zone, NR_ALLOC_BATCH)); | ||
5580 | |||
5501 | setup_zone_migrate_reserve(zone); | 5581 | setup_zone_migrate_reserve(zone); |
5502 | spin_unlock_irqrestore(&zone->lock, flags); | 5582 | spin_unlock_irqrestore(&zone->lock, flags); |
5503 | } | 5583 | } |
@@ -5570,7 +5650,7 @@ static void __meminit setup_per_zone_inactive_ratio(void) | |||
5570 | * we want it large (64MB max). But it is not linear, because network | 5650 | * we want it large (64MB max). But it is not linear, because network |
5571 | * bandwidth does not increase linearly with machine size. We use | 5651 | * bandwidth does not increase linearly with machine size. We use |
5572 | * | 5652 | * |
5573 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | 5653 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
5574 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | 5654 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
5575 | * | 5655 | * |
5576 | * which yields | 5656 | * which yields |
@@ -5614,11 +5694,11 @@ int __meminit init_per_zone_wmark_min(void) | |||
5614 | module_init(init_per_zone_wmark_min) | 5694 | module_init(init_per_zone_wmark_min) |
5615 | 5695 | ||
5616 | /* | 5696 | /* |
5617 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 5697 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
5618 | * that we can call two helper functions whenever min_free_kbytes | 5698 | * that we can call two helper functions whenever min_free_kbytes |
5619 | * changes. | 5699 | * changes. |
5620 | */ | 5700 | */ |
5621 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5701 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
5622 | void __user *buffer, size_t *length, loff_t *ppos) | 5702 | void __user *buffer, size_t *length, loff_t *ppos) |
5623 | { | 5703 | { |
5624 | proc_dointvec(table, write, buffer, length, ppos); | 5704 | proc_dointvec(table, write, buffer, length, ppos); |
@@ -5682,8 +5762,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
5682 | 5762 | ||
5683 | /* | 5763 | /* |
5684 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | 5764 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each |
5685 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 5765 | * cpu. It is the fraction of total pages in each zone that a hot per cpu |
5686 | * can have before it gets flushed back to buddy allocator. | 5766 | * pagelist can have before it gets flushed back to buddy allocator. |
5687 | */ | 5767 | */ |
5688 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 5768 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
5689 | void __user *buffer, size_t *length, loff_t *ppos) | 5769 | void __user *buffer, size_t *length, loff_t *ppos) |
@@ -5745,9 +5825,10 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
5745 | if (!numentries) { | 5825 | if (!numentries) { |
5746 | /* round applicable memory size up to nearest megabyte */ | 5826 | /* round applicable memory size up to nearest megabyte */ |
5747 | numentries = nr_kernel_pages; | 5827 | numentries = nr_kernel_pages; |
5748 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 5828 | |
5749 | numentries >>= 20 - PAGE_SHIFT; | 5829 | /* It isn't necessary when PAGE_SIZE >= 1MB */ |
5750 | numentries <<= 20 - PAGE_SHIFT; | 5830 | if (PAGE_SHIFT < 20) |
5831 | numentries = round_up(numentries, (1<<20)/PAGE_SIZE); | ||
5751 | 5832 | ||
5752 | /* limit to 1 bucket per 2^scale bytes of low memory */ | 5833 | /* limit to 1 bucket per 2^scale bytes of low memory */ |
5753 | if (scale > PAGE_SHIFT) | 5834 | if (scale > PAGE_SHIFT) |
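To see what the new rounding in alloc_large_system_hash() does: with 4 KiB pages, (1 << 20) / PAGE_SIZE is 256, so nr_kernel_pages is rounded up to the next multiple of 256 pages, i.e. the next whole megabyte, which is what the old shift-down/shift-up pair achieved. A quick stand-alone check with made-up numbers:

#include <stdio.h>

#define PAGE_SIZE	4096UL
/* generic round-up; the kernel macro requires a power-of-two alignment */
#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long numentries = 130000;	/* ~507.8 MiB worth of 4 KiB pages */

	numentries = round_up(numentries, (1UL << 20) / PAGE_SIZE);
	printf("%lu\n", numentries);		/* 130048, i.e. exactly 508 MiB */
	return 0;
}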
@@ -5900,7 +5981,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5900 | * This function checks whether pageblock includes unmovable pages or not. | 5981 | * This function checks whether pageblock includes unmovable pages or not. |
5901 | * If @count is not zero, it is okay to include less @count unmovable pages | 5982 | * If @count is not zero, it is okay to include less @count unmovable pages |
5902 | * | 5983 | * |
5903 | * PageLRU check wihtout isolation or lru_lock could race so that | 5984 | * PageLRU check without isolation or lru_lock could race so that |
5904 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | 5985 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't |
5905 | * expect this function should be exact. | 5986 | * expect this function should be exact. |
5906 | */ | 5987 | */ |
@@ -5928,6 +6009,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
5928 | continue; | 6009 | continue; |
5929 | 6010 | ||
5930 | page = pfn_to_page(check); | 6011 | page = pfn_to_page(check); |
6012 | |||
6013 | /* | ||
6014 | * Hugepages are not in LRU lists, but they're movable. | ||
6015 | * We need not scan over tail pages because we don't | ||
6016 | * handle each tail page individually in migration. | ||
6017 | */ | ||
6018 | if (PageHuge(page)) { | ||
6019 | iter = round_up(iter + 1, 1<<compound_order(page)) - 1; | ||
6020 | continue; | ||
6021 | } | ||
6022 | |||
5931 | /* | 6023 | /* |
5932 | * We can't use page_count without pin a page | 6024 | * We can't use page_count without pin a page |
5933 | * because another CPU can free compound page. | 6025 | * because another CPU can free compound page. |
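The iterator arithmetic added to has_unmovable_pages() above skips all remaining tail pages of a hugepage in one step: compound pages are naturally aligned to their order, so rounding iter + 1 up to the next 2^order boundary and subtracting one leaves iter on the last tail page, and the loop's own increment then lands on the first page past the hugepage. A small stand-alone illustration (order 9, i.e. a 512-page hugepage, chosen arbitrarily):

#include <stdio.h>

#define round_up(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long iter = 5;		/* somewhere inside the aligned hugepage */
	unsigned int order = 9;		/* 512 base pages */

	iter = round_up(iter + 1, 1UL << order) - 1;
	printf("%lu\n", iter);		/* 511; the loop's iter++ then gives 512 */
	return 0;
}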
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 0cee10ffb98d..d1473b2e9481 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include <linux/hugetlb.h> | ||
9 | #include "internal.h" | 10 | #include "internal.h" |
10 | 11 | ||
11 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) | 12 | int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) |
@@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, | |||
252 | { | 253 | { |
253 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | 254 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; |
254 | 255 | ||
256 | /* | ||
257 | * TODO: allocate a destination hugepage from the nearest neighbor node, | ||
258 | * in accordance with the memory policy of the user process if possible. | ||
259 | * For now, as a simple workaround, we use the next node as the destination. | ||
260 | */ | ||
261 | if (PageHuge(page)) { | ||
262 | nodemask_t src = nodemask_of_node(page_to_nid(page)); | ||
263 | nodemask_t dst; | ||
264 | nodes_complement(dst, src); | ||
265 | return alloc_huge_page_node(page_hstate(compound_head(page)), | ||
266 | next_node(page_to_nid(page), dst)); | ||
267 | } | ||
268 | |||
255 | if (PageHighMem(page)) | 269 | if (PageHighMem(page)) |
256 | gfp_mask |= __GFP_HIGHMEM; | 270 | gfp_mask |= __GFP_HIGHMEM; |
257 | 271 | ||
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e1a6e4fab016..3929a40bd6c0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -10,6 +10,30 @@ | |||
10 | #include <asm/tlb.h> | 10 | #include <asm/tlb.h> |
11 | #include <asm-generic/pgtable.h> | 11 | #include <asm-generic/pgtable.h> |
12 | 12 | ||
13 | /* | ||
14 | * If a p?d_bad entry is found while walking page tables, report | ||
15 | * the error, before resetting entry to p?d_none. Usually (but | ||
16 | * very seldom) called out from the p?d_none_or_clear_bad macros. | ||
17 | */ | ||
18 | |||
19 | void pgd_clear_bad(pgd_t *pgd) | ||
20 | { | ||
21 | pgd_ERROR(*pgd); | ||
22 | pgd_clear(pgd); | ||
23 | } | ||
24 | |||
25 | void pud_clear_bad(pud_t *pud) | ||
26 | { | ||
27 | pud_ERROR(*pud); | ||
28 | pud_clear(pud); | ||
29 | } | ||
30 | |||
31 | void pmd_clear_bad(pmd_t *pmd) | ||
32 | { | ||
33 | pmd_ERROR(*pmd); | ||
34 | pmd_clear(pmd); | ||
35 | } | ||
36 | |||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 37 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 38 | /* |
15 | * Only sets the access flags (dirty, accessed), as well as write | 39 | * Only sets the access flags (dirty, accessed), as well as write |
diff --git a/mm/readahead.c b/mm/readahead.c index 829a77c62834..e4ed04149785 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping, | |||
371 | size = count_history_pages(mapping, ra, offset, max); | 371 | size = count_history_pages(mapping, ra, offset, max); |
372 | 372 | ||
373 | /* | 373 | /* |
374 | * no history pages: | 374 | * not enough history pages: |
375 | * it could be a random read | 375 | * it could be a random read |
376 | */ | 376 | */ |
377 | if (!size) | 377 | if (size <= req_size) |
378 | return 0; | 378 | return 0; |
379 | 379 | ||
380 | /* | 380 | /* |
@@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping, | |||
385 | size *= 2; | 385 | size *= 2; |
386 | 386 | ||
387 | ra->start = offset; | 387 | ra->start = offset; |
388 | ra->size = get_init_ra_size(size + req_size, max); | 388 | ra->size = min(size + req_size, max); |
389 | ra->async_size = ra->size; | 389 | ra->async_size = 1; |
390 | 390 | ||
391 | return 1; | 391 | return 1; |
392 | } | 392 | } |
diff --git a/mm/shmem.c b/mm/shmem.c index 526149846d0a..8297623fcaed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1205,7 +1205,7 @@ repeat: | |||
1205 | gfp & GFP_RECLAIM_MASK); | 1205 | gfp & GFP_RECLAIM_MASK); |
1206 | if (error) | 1206 | if (error) |
1207 | goto decused; | 1207 | goto decused; |
1208 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 1208 | error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); |
1209 | if (!error) { | 1209 | if (!error) { |
1210 | error = shmem_add_to_page_cache(page, mapping, index, | 1210 | error = shmem_add_to_page_cache(page, mapping, index, |
1211 | gfp, NULL); | 1211 | gfp, NULL); |
@@ -2819,6 +2819,10 @@ int __init shmem_init(void) | |||
2819 | { | 2819 | { |
2820 | int error; | 2820 | int error; |
2821 | 2821 | ||
2822 | /* If rootfs called this, don't re-init */ | ||
2823 | if (shmem_inode_cachep) | ||
2824 | return 0; | ||
2825 | |||
2822 | error = bdi_init(&shmem_backing_dev_info); | 2826 | error = bdi_init(&shmem_backing_dev_info); |
2823 | if (error) | 2827 | if (error) |
2824 | goto out4; | 2828 | goto out4; |
@@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s, | |||
4420 | unsigned long order; | 4420 | unsigned long order; |
4421 | int err; | 4421 | int err; |
4422 | 4422 | ||
4423 | err = strict_strtoul(buf, 10, &order); | 4423 | err = kstrtoul(buf, 10, &order); |
4424 | if (err) | 4424 | if (err) |
4425 | return err; | 4425 | return err; |
4426 | 4426 | ||
@@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, | |||
4448 | unsigned long min; | 4448 | unsigned long min; |
4449 | int err; | 4449 | int err; |
4450 | 4450 | ||
4451 | err = strict_strtoul(buf, 10, &min); | 4451 | err = kstrtoul(buf, 10, &min); |
4452 | if (err) | 4452 | if (err) |
4453 | return err; | 4453 | return err; |
4454 | 4454 | ||
@@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, | |||
4468 | unsigned long objects; | 4468 | unsigned long objects; |
4469 | int err; | 4469 | int err; |
4470 | 4470 | ||
4471 | err = strict_strtoul(buf, 10, &objects); | 4471 | err = kstrtoul(buf, 10, &objects); |
4472 | if (err) | 4472 | if (err) |
4473 | return err; | 4473 | return err; |
4474 | if (objects && !kmem_cache_has_cpu_partial(s)) | 4474 | if (objects && !kmem_cache_has_cpu_partial(s)) |
@@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, | |||
4784 | unsigned long ratio; | 4784 | unsigned long ratio; |
4785 | int err; | 4785 | int err; |
4786 | 4786 | ||
4787 | err = strict_strtoul(buf, 10, &ratio); | 4787 | err = kstrtoul(buf, 10, &ratio); |
4788 | if (err) | 4788 | if (err) |
4789 | return err; | 4789 | return err; |
4790 | 4790 | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 308d50331bc3..4ac1d7ef548f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
339 | } | 339 | } |
340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
341 | 341 | ||
342 | static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | 342 | static void __init sparse_early_usemaps_alloc_node(void *data, |
343 | unsigned long pnum_begin, | 343 | unsigned long pnum_begin, |
344 | unsigned long pnum_end, | 344 | unsigned long pnum_end, |
345 | unsigned long usemap_count, int nodeid) | 345 | unsigned long usemap_count, int nodeid) |
346 | { | 346 | { |
347 | void *usemap; | 347 | void *usemap; |
348 | unsigned long pnum; | 348 | unsigned long pnum; |
349 | unsigned long **usemap_map = (unsigned long **)data; | ||
349 | int size = usemap_size(); | 350 | int size = usemap_size(); |
350 | 351 | ||
351 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 352 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
@@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, | |||
430 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 431 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
431 | 432 | ||
432 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 433 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
433 | static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, | 434 | static void __init sparse_early_mem_maps_alloc_node(void *data, |
434 | unsigned long pnum_begin, | 435 | unsigned long pnum_begin, |
435 | unsigned long pnum_end, | 436 | unsigned long pnum_end, |
436 | unsigned long map_count, int nodeid) | 437 | unsigned long map_count, int nodeid) |
437 | { | 438 | { |
439 | struct page **map_map = (struct page **)data; | ||
438 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, | 440 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, |
439 | map_count, nodeid); | 441 | map_count, nodeid); |
440 | } | 442 | } |
@@ -460,6 +462,55 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | |||
460 | { | 462 | { |
461 | } | 463 | } |
462 | 464 | ||
465 | /** | ||
466 | * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap | ||
467 | * @data: usemap_map for pageblock flags or map_map for vmemmap | ||
468 | */ | ||
469 | static void __init alloc_usemap_and_memmap(void (*alloc_func) | ||
470 | (void *, unsigned long, unsigned long, | ||
471 | unsigned long, int), void *data) | ||
472 | { | ||
473 | unsigned long pnum; | ||
474 | unsigned long map_count; | ||
475 | int nodeid_begin = 0; | ||
476 | unsigned long pnum_begin = 0; | ||
477 | |||
478 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
479 | struct mem_section *ms; | ||
480 | |||
481 | if (!present_section_nr(pnum)) | ||
482 | continue; | ||
483 | ms = __nr_to_section(pnum); | ||
484 | nodeid_begin = sparse_early_nid(ms); | ||
485 | pnum_begin = pnum; | ||
486 | break; | ||
487 | } | ||
488 | map_count = 1; | ||
489 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
490 | struct mem_section *ms; | ||
491 | int nodeid; | ||
492 | |||
493 | if (!present_section_nr(pnum)) | ||
494 | continue; | ||
495 | ms = __nr_to_section(pnum); | ||
496 | nodeid = sparse_early_nid(ms); | ||
497 | if (nodeid == nodeid_begin) { | ||
498 | map_count++; | ||
499 | continue; | ||
500 | } | ||
501 | /* ok, we need to take care of sections from pnum_begin to pnum - 1 */ | ||
502 | alloc_func(data, pnum_begin, pnum, | ||
503 | map_count, nodeid_begin); | ||
504 | /* new start, update count etc*/ | ||
505 | nodeid_begin = nodeid; | ||
506 | pnum_begin = pnum; | ||
507 | map_count = 1; | ||
508 | } | ||
509 | /* ok, last chunk */ | ||
510 | alloc_func(data, pnum_begin, NR_MEM_SECTIONS, | ||
511 | map_count, nodeid_begin); | ||
512 | } | ||
513 | |||
463 | /* | 514 | /* |
464 | * Allocate the accumulated non-linear sections, allocate a mem_map | 515 | * Allocate the accumulated non-linear sections, allocate a mem_map |
465 | * for each and record the physical to section mapping. | 516 | * for each and record the physical to section mapping. |
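The new alloc_usemap_and_memmap() helper factors out a single pattern that sparse_init() previously open-coded twice: walk the section numbers, group consecutive present sections that share a node id, and hand each group to a callback. A compact userspace rendering of the same grouping loop, with invented data and names:

#include <stdio.h>

#define NR_SECTIONS 10
static const int present[NR_SECTIONS] = { 1, 1, 0, 1, 1, 1, 0, 0, 1, 1 };
static const int nid_of[NR_SECTIONS]  = { 0, 0, 0, 0, 1, 1, 0, 0, 1, 1 };

/* stands in for the alloc_func callback (usemap or memmap allocation) */
static void alloc_for_node(int begin, int end, int count, int nid)
{
	printf("node %d: sections [%d, %d), %d present\n", nid, begin, end, count);
}

int main(void)
{
	int begin = -1, nid = -1, count = 0;

	for (int i = 0; i < NR_SECTIONS; i++) {
		if (!present[i])
			continue;
		if (begin < 0) {		/* first present section overall */
			begin = i;
			nid = nid_of[i];
			count = 1;
		} else if (nid_of[i] == nid) {
			count++;		/* same node: extend the group */
		} else {			/* node changed: flush the group */
			alloc_for_node(begin, i, count, nid);
			begin = i;
			nid = nid_of[i];
			count = 1;
		}
	}
	if (begin >= 0)
		alloc_for_node(begin, NR_SECTIONS, count, nid);	/* last group */
	return 0;
}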
@@ -471,11 +522,7 @@ void __init sparse_init(void) | |||
471 | unsigned long *usemap; | 522 | unsigned long *usemap; |
472 | unsigned long **usemap_map; | 523 | unsigned long **usemap_map; |
473 | int size; | 524 | int size; |
474 | int nodeid_begin = 0; | ||
475 | unsigned long pnum_begin = 0; | ||
476 | unsigned long usemap_count; | ||
477 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 525 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
478 | unsigned long map_count; | ||
479 | int size2; | 526 | int size2; |
480 | struct page **map_map; | 527 | struct page **map_map; |
481 | #endif | 528 | #endif |
@@ -501,82 +548,16 @@ void __init sparse_init(void) | |||
501 | usemap_map = alloc_bootmem(size); | 548 | usemap_map = alloc_bootmem(size); |
502 | if (!usemap_map) | 549 | if (!usemap_map) |
503 | panic("can not allocate usemap_map\n"); | 550 | panic("can not allocate usemap_map\n"); |
504 | 551 | alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, | |
505 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 552 | (void *)usemap_map); |
506 | struct mem_section *ms; | ||
507 | |||
508 | if (!present_section_nr(pnum)) | ||
509 | continue; | ||
510 | ms = __nr_to_section(pnum); | ||
511 | nodeid_begin = sparse_early_nid(ms); | ||
512 | pnum_begin = pnum; | ||
513 | break; | ||
514 | } | ||
515 | usemap_count = 1; | ||
516 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
517 | struct mem_section *ms; | ||
518 | int nodeid; | ||
519 | |||
520 | if (!present_section_nr(pnum)) | ||
521 | continue; | ||
522 | ms = __nr_to_section(pnum); | ||
523 | nodeid = sparse_early_nid(ms); | ||
524 | if (nodeid == nodeid_begin) { | ||
525 | usemap_count++; | ||
526 | continue; | ||
527 | } | ||
528 | /* ok, we need to take cake of from pnum_begin to pnum - 1*/ | ||
529 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, | ||
530 | usemap_count, nodeid_begin); | ||
531 | /* new start, update count etc*/ | ||
532 | nodeid_begin = nodeid; | ||
533 | pnum_begin = pnum; | ||
534 | usemap_count = 1; | ||
535 | } | ||
536 | /* ok, last chunk */ | ||
537 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, | ||
538 | usemap_count, nodeid_begin); | ||
539 | 553 | ||
540 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | 554 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER |
541 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | 555 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; |
542 | map_map = alloc_bootmem(size2); | 556 | map_map = alloc_bootmem(size2); |
543 | if (!map_map) | 557 | if (!map_map) |
544 | panic("can not allocate map_map\n"); | 558 | panic("can not allocate map_map\n"); |
545 | 559 | alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, | |
546 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 560 | (void *)map_map); |
547 | struct mem_section *ms; | ||
548 | |||
549 | if (!present_section_nr(pnum)) | ||
550 | continue; | ||
551 | ms = __nr_to_section(pnum); | ||
552 | nodeid_begin = sparse_early_nid(ms); | ||
553 | pnum_begin = pnum; | ||
554 | break; | ||
555 | } | ||
556 | map_count = 1; | ||
557 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
558 | struct mem_section *ms; | ||
559 | int nodeid; | ||
560 | |||
561 | if (!present_section_nr(pnum)) | ||
562 | continue; | ||
563 | ms = __nr_to_section(pnum); | ||
564 | nodeid = sparse_early_nid(ms); | ||
565 | if (nodeid == nodeid_begin) { | ||
566 | map_count++; | ||
567 | continue; | ||
568 | } | ||
569 | /* ok, we need to take cake of from pnum_begin to pnum - 1*/ | ||
570 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, | ||
571 | map_count, nodeid_begin); | ||
572 | /* new start, update count etc*/ | ||
573 | nodeid_begin = nodeid; | ||
574 | pnum_begin = pnum; | ||
575 | map_count = 1; | ||
576 | } | ||
577 | /* ok, last chunk */ | ||
578 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, | ||
579 | map_count, nodeid_begin); | ||
580 | #endif | 561 | #endif |
581 | 562 | ||
582 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 563 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
34 | #include <linux/hugetlb.h> | ||
34 | 35 | ||
35 | #include "internal.h" | 36 | #include "internal.h" |
36 | 37 | ||
@@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page) | |||
81 | 82 | ||
82 | static void put_compound_page(struct page *page) | 83 | static void put_compound_page(struct page *page) |
83 | { | 84 | { |
85 | /* | ||
86 | * hugetlbfs pages cannot be split from under us. If this is a | ||
87 | * hugetlbfs page, check refcount on head page and release the page if | ||
88 | * the refcount becomes zero. | ||
89 | */ | ||
90 | if (PageHuge(page)) { | ||
91 | page = compound_head(page); | ||
92 | if (put_page_testzero(page)) | ||
93 | __put_compound_page(page); | ||
94 | |||
95 | return; | ||
96 | } | ||
97 | |||
84 | if (unlikely(PageTail(page))) { | 98 | if (unlikely(PageTail(page))) { |
85 | /* __split_huge_page_refcount can run under us */ | 99 | /* __split_huge_page_refcount can run under us */ |
86 | struct page *page_head = compound_trans_head(page); | 100 | struct page *page_head = compound_trans_head(page); |
@@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page) | |||
184 | * proper PT lock that already serializes against | 198 | * proper PT lock that already serializes against |
185 | * split_huge_page(). | 199 | * split_huge_page(). |
186 | */ | 200 | */ |
187 | unsigned long flags; | ||
188 | bool got = false; | 201 | bool got = false; |
189 | struct page *page_head = compound_trans_head(page); | 202 | struct page *page_head; |
190 | 203 | ||
191 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 204 | /* |
205 | * If this is a hugetlbfs page it cannot be split under us. Simply | ||
206 | * increment refcount for the head page. | ||
207 | */ | ||
208 | if (PageHuge(page)) { | ||
209 | page_head = compound_head(page); | ||
210 | atomic_inc(&page_head->_count); | ||
211 | got = true; | ||
212 | } else { | ||
213 | unsigned long flags; | ||
214 | |||
215 | page_head = compound_trans_head(page); | ||
216 | if (likely(page != page_head && | ||
217 | get_page_unless_zero(page_head))) { | ||
218 | |||
219 | /* Ref to put_compound_page() comment. */ | ||
220 | if (PageSlab(page_head)) { | ||
221 | if (likely(PageTail(page))) { | ||
222 | __get_page_tail_foll(page, false); | ||
223 | return true; | ||
224 | } else { | ||
225 | put_page(page_head); | ||
226 | return false; | ||
227 | } | ||
228 | } | ||
192 | 229 | ||
193 | /* Ref to put_compound_page() comment. */ | 230 | /* |
194 | if (PageSlab(page_head)) { | 231 | * page_head wasn't a dangling pointer but it |
232 | * may not be a head page anymore by the time | ||
233 | * we obtain the lock. That is ok as long as it | ||
234 | * can't be freed from under us. | ||
235 | */ | ||
236 | flags = compound_lock_irqsave(page_head); | ||
237 | /* here __split_huge_page_refcount won't run anymore */ | ||
195 | if (likely(PageTail(page))) { | 238 | if (likely(PageTail(page))) { |
196 | __get_page_tail_foll(page, false); | 239 | __get_page_tail_foll(page, false); |
197 | return true; | 240 | got = true; |
198 | } else { | ||
199 | put_page(page_head); | ||
200 | return false; | ||
201 | } | 241 | } |
242 | compound_unlock_irqrestore(page_head, flags); | ||
243 | if (unlikely(!got)) | ||
244 | put_page(page_head); | ||
202 | } | 245 | } |
203 | |||
204 | /* | ||
205 | * page_head wasn't a dangling pointer but it | ||
206 | * may not be a head page anymore by the time | ||
207 | * we obtain the lock. That is ok as long as it | ||
208 | * can't be freed from under us. | ||
209 | */ | ||
210 | flags = compound_lock_irqsave(page_head); | ||
211 | /* here __split_huge_page_refcount won't run anymore */ | ||
212 | if (likely(PageTail(page))) { | ||
213 | __get_page_tail_foll(page, false); | ||
214 | got = true; | ||
215 | } | ||
216 | compound_unlock_irqrestore(page_head, flags); | ||
217 | if (unlikely(!got)) | ||
218 | put_page(page_head); | ||
219 | } | 246 | } |
220 | return got; | 247 | return got; |
221 | } | 248 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index f24ab0dff554..e6f15f8ca2af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
122 | { | 122 | { |
123 | int error; | 123 | int error; |
124 | 124 | ||
125 | error = radix_tree_preload(gfp_mask); | 125 | error = radix_tree_maybe_preload(gfp_mask); |
126 | if (!error) { | 126 | if (!error) { |
127 | error = __add_to_swap_cache(page, entry); | 127 | error = __add_to_swap_cache(page, entry); |
128 | radix_tree_preload_end(); | 128 | radix_tree_preload_end(); |
@@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
328 | /* | 328 | /* |
329 | * call radix_tree_preload() while we can wait. | 329 | * call radix_tree_preload() while we can wait. |
330 | */ | 330 | */ |
331 | err = radix_tree_preload(gfp_mask & GFP_KERNEL); | 331 | err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); |
332 | if (err) | 332 | if (err) |
333 | break; | 333 | break; |
334 | 334 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cf2e60983b7..3963fc24fcc1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
175 | } | 175 | } |
176 | } | 176 | } |
177 | 177 | ||
178 | static int wait_for_discard(void *word) | 178 | #define SWAPFILE_CLUSTER 256 |
179 | #define LATENCY_LIMIT 256 | ||
180 | |||
181 | static inline void cluster_set_flag(struct swap_cluster_info *info, | ||
182 | unsigned int flag) | ||
179 | { | 183 | { |
180 | schedule(); | 184 | info->flags = flag; |
181 | return 0; | ||
182 | } | 185 | } |
183 | 186 | ||
184 | #define SWAPFILE_CLUSTER 256 | 187 | static inline unsigned int cluster_count(struct swap_cluster_info *info) |
185 | #define LATENCY_LIMIT 256 | 188 | { |
189 | return info->data; | ||
190 | } | ||
191 | |||
192 | static inline void cluster_set_count(struct swap_cluster_info *info, | ||
193 | unsigned int c) | ||
194 | { | ||
195 | info->data = c; | ||
196 | } | ||
197 | |||
198 | static inline void cluster_set_count_flag(struct swap_cluster_info *info, | ||
199 | unsigned int c, unsigned int f) | ||
200 | { | ||
201 | info->flags = f; | ||
202 | info->data = c; | ||
203 | } | ||
204 | |||
205 | static inline unsigned int cluster_next(struct swap_cluster_info *info) | ||
206 | { | ||
207 | return info->data; | ||
208 | } | ||
209 | |||
210 | static inline void cluster_set_next(struct swap_cluster_info *info, | ||
211 | unsigned int n) | ||
212 | { | ||
213 | info->data = n; | ||
214 | } | ||
215 | |||
216 | static inline void cluster_set_next_flag(struct swap_cluster_info *info, | ||
217 | unsigned int n, unsigned int f) | ||
218 | { | ||
219 | info->flags = f; | ||
220 | info->data = n; | ||
221 | } | ||
222 | |||
223 | static inline bool cluster_is_free(struct swap_cluster_info *info) | ||
224 | { | ||
225 | return info->flags & CLUSTER_FLAG_FREE; | ||
226 | } | ||
227 | |||
228 | static inline bool cluster_is_null(struct swap_cluster_info *info) | ||
229 | { | ||
230 | return info->flags & CLUSTER_FLAG_NEXT_NULL; | ||
231 | } | ||
232 | |||
233 | static inline void cluster_set_null(struct swap_cluster_info *info) | ||
234 | { | ||
235 | info->flags = CLUSTER_FLAG_NEXT_NULL; | ||
236 | info->data = 0; | ||
237 | } | ||
238 | |||
239 | /* Add a cluster to discard list and schedule it to do discard */ | ||
240 | static void swap_cluster_schedule_discard(struct swap_info_struct *si, | ||
241 | unsigned int idx) | ||
242 | { | ||
243 | /* | ||
244 | * If scan_swap_map() can't find a free cluster, it will check | ||
245 | * si->swap_map directly. To make sure the discarding cluster isn't | ||
246 | * taken by scan_swap_map(), mark the swap entries bad (occupied). They | ||
247 | * will be cleared after the discard. | ||
248 | */ | ||
249 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, | ||
250 | SWAP_MAP_BAD, SWAPFILE_CLUSTER); | ||
251 | |||
252 | if (cluster_is_null(&si->discard_cluster_head)) { | ||
253 | cluster_set_next_flag(&si->discard_cluster_head, | ||
254 | idx, 0); | ||
255 | cluster_set_next_flag(&si->discard_cluster_tail, | ||
256 | idx, 0); | ||
257 | } else { | ||
258 | unsigned int tail = cluster_next(&si->discard_cluster_tail); | ||
259 | cluster_set_next(&si->cluster_info[tail], idx); | ||
260 | cluster_set_next_flag(&si->discard_cluster_tail, | ||
261 | idx, 0); | ||
262 | } | ||
263 | |||
264 | schedule_work(&si->discard_work); | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Do the scheduled discards. After a cluster discard is finished, the cluster | ||
269 | * will be added to the free cluster list. The caller should hold si->lock. | ||
270 | */ | ||
271 | static void swap_do_scheduled_discard(struct swap_info_struct *si) | ||
272 | { | ||
273 | struct swap_cluster_info *info; | ||
274 | unsigned int idx; | ||
275 | |||
276 | info = si->cluster_info; | ||
277 | |||
278 | while (!cluster_is_null(&si->discard_cluster_head)) { | ||
279 | idx = cluster_next(&si->discard_cluster_head); | ||
280 | |||
281 | cluster_set_next_flag(&si->discard_cluster_head, | ||
282 | cluster_next(&info[idx]), 0); | ||
283 | if (cluster_next(&si->discard_cluster_tail) == idx) { | ||
284 | cluster_set_null(&si->discard_cluster_head); | ||
285 | cluster_set_null(&si->discard_cluster_tail); | ||
286 | } | ||
287 | spin_unlock(&si->lock); | ||
288 | |||
289 | discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, | ||
290 | SWAPFILE_CLUSTER); | ||
291 | |||
292 | spin_lock(&si->lock); | ||
293 | cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); | ||
294 | if (cluster_is_null(&si->free_cluster_head)) { | ||
295 | cluster_set_next_flag(&si->free_cluster_head, | ||
296 | idx, 0); | ||
297 | cluster_set_next_flag(&si->free_cluster_tail, | ||
298 | idx, 0); | ||
299 | } else { | ||
300 | unsigned int tail; | ||
301 | |||
302 | tail = cluster_next(&si->free_cluster_tail); | ||
303 | cluster_set_next(&info[tail], idx); | ||
304 | cluster_set_next_flag(&si->free_cluster_tail, | ||
305 | idx, 0); | ||
306 | } | ||
307 | memset(si->swap_map + idx * SWAPFILE_CLUSTER, | ||
308 | 0, SWAPFILE_CLUSTER); | ||
309 | } | ||
310 | } | ||
311 | |||
312 | static void swap_discard_work(struct work_struct *work) | ||
313 | { | ||
314 | struct swap_info_struct *si; | ||
315 | |||
316 | si = container_of(work, struct swap_info_struct, discard_work); | ||
317 | |||
318 | spin_lock(&si->lock); | ||
319 | swap_do_scheduled_discard(si); | ||
320 | spin_unlock(&si->lock); | ||
321 | } | ||
322 | |||
323 | /* | ||
324 | * The cluster corresponding to page_nr will be used. The cluster will be | ||
325 | * removed from the free cluster list and its usage counter will be increased. | ||
326 | */ | ||
327 | static void inc_cluster_info_page(struct swap_info_struct *p, | ||
328 | struct swap_cluster_info *cluster_info, unsigned long page_nr) | ||
329 | { | ||
330 | unsigned long idx = page_nr / SWAPFILE_CLUSTER; | ||
331 | |||
332 | if (!cluster_info) | ||
333 | return; | ||
334 | if (cluster_is_free(&cluster_info[idx])) { | ||
335 | VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); | ||
336 | cluster_set_next_flag(&p->free_cluster_head, | ||
337 | cluster_next(&cluster_info[idx]), 0); | ||
338 | if (cluster_next(&p->free_cluster_tail) == idx) { | ||
339 | cluster_set_null(&p->free_cluster_tail); | ||
340 | cluster_set_null(&p->free_cluster_head); | ||
341 | } | ||
342 | cluster_set_count_flag(&cluster_info[idx], 0, 0); | ||
343 | } | ||
344 | |||
345 | VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); | ||
346 | cluster_set_count(&cluster_info[idx], | ||
347 | cluster_count(&cluster_info[idx]) + 1); | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * The cluster corresponding to page_nr decreases its usage count by one. If | ||
352 | * the usage counter becomes 0, which means no page in the cluster is in use, | ||
353 | * we can optionally discard the cluster and add it to the free cluster list. | ||
354 | */ | ||
355 | static void dec_cluster_info_page(struct swap_info_struct *p, | ||
356 | struct swap_cluster_info *cluster_info, unsigned long page_nr) | ||
357 | { | ||
358 | unsigned long idx = page_nr / SWAPFILE_CLUSTER; | ||
359 | |||
360 | if (!cluster_info) | ||
361 | return; | ||
362 | |||
363 | VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); | ||
364 | cluster_set_count(&cluster_info[idx], | ||
365 | cluster_count(&cluster_info[idx]) - 1); | ||
366 | |||
367 | if (cluster_count(&cluster_info[idx]) == 0) { | ||
368 | /* | ||
369 | * If the swap is discardable, schedule a discard of the cluster | ||
370 | * instead of freeing it immediately. The cluster will be freed | ||
371 | * after the discard. | ||
372 | */ | ||
373 | if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == | ||
374 | (SWP_WRITEOK | SWP_PAGE_DISCARD)) { | ||
375 | swap_cluster_schedule_discard(p, idx); | ||
376 | return; | ||
377 | } | ||
378 | |||
379 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); | ||
380 | if (cluster_is_null(&p->free_cluster_head)) { | ||
381 | cluster_set_next_flag(&p->free_cluster_head, idx, 0); | ||
382 | cluster_set_next_flag(&p->free_cluster_tail, idx, 0); | ||
383 | } else { | ||
384 | unsigned int tail = cluster_next(&p->free_cluster_tail); | ||
385 | cluster_set_next(&cluster_info[tail], idx); | ||
386 | cluster_set_next_flag(&p->free_cluster_tail, idx, 0); | ||
387 | } | ||
388 | } | ||
389 | } | ||
390 | |||
391 | /* | ||
392 | * It's possible for scan_swap_map() to use a free cluster in the middle of the | ||
393 | * free cluster list. Avoid such abuse to prevent list corruption. | ||
394 | */ | ||
395 | static bool | ||
396 | scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, | ||
397 | unsigned long offset) | ||
398 | { | ||
399 | struct percpu_cluster *percpu_cluster; | ||
400 | bool conflict; | ||
401 | |||
402 | offset /= SWAPFILE_CLUSTER; | ||
403 | conflict = !cluster_is_null(&si->free_cluster_head) && | ||
404 | offset != cluster_next(&si->free_cluster_head) && | ||
405 | cluster_is_free(&si->cluster_info[offset]); | ||
406 | |||
407 | if (!conflict) | ||
408 | return false; | ||
409 | |||
410 | percpu_cluster = this_cpu_ptr(si->percpu_cluster); | ||
411 | cluster_set_null(&percpu_cluster->index); | ||
412 | return true; | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * Try to get a swap entry from current cpu's swap entry pool (a cluster). This | ||
417 | * might involve allocating a new cluster for current CPU too. | ||
418 | */ | ||
419 | static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, | ||
420 | unsigned long *offset, unsigned long *scan_base) | ||
421 | { | ||
422 | struct percpu_cluster *cluster; | ||
423 | bool found_free; | ||
424 | unsigned long tmp; | ||
425 | |||
426 | new_cluster: | ||
427 | cluster = this_cpu_ptr(si->percpu_cluster); | ||
428 | if (cluster_is_null(&cluster->index)) { | ||
429 | if (!cluster_is_null(&si->free_cluster_head)) { | ||
430 | cluster->index = si->free_cluster_head; | ||
431 | cluster->next = cluster_next(&cluster->index) * | ||
432 | SWAPFILE_CLUSTER; | ||
433 | } else if (!cluster_is_null(&si->discard_cluster_head)) { | ||
434 | /* | ||
435 | * we don't have a free cluster but have some clusters | ||
436 | * being discarded; do the discard now and reclaim them | ||
437 | */ | ||
438 | swap_do_scheduled_discard(si); | ||
439 | *scan_base = *offset = si->cluster_next; | ||
440 | goto new_cluster; | ||
441 | } else | ||
442 | return; | ||
443 | } | ||
444 | |||
445 | found_free = false; | ||
446 | |||
447 | /* | ||
448 | * Other CPUs can use our cluster if they can't find a free cluster; | ||
449 | * check whether there is still a free entry in the cluster | ||
450 | */ | ||
451 | tmp = cluster->next; | ||
452 | while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * | ||
453 | SWAPFILE_CLUSTER) { | ||
454 | if (!si->swap_map[tmp]) { | ||
455 | found_free = true; | ||
456 | break; | ||
457 | } | ||
458 | tmp++; | ||
459 | } | ||
460 | if (!found_free) { | ||
461 | cluster_set_null(&cluster->index); | ||
462 | goto new_cluster; | ||
463 | } | ||
464 | cluster->next = tmp + 1; | ||
465 | *offset = tmp; | ||
466 | *scan_base = tmp; | ||
467 | } | ||
186 | 468 | ||
187 | static unsigned long scan_swap_map(struct swap_info_struct *si, | 469 | static unsigned long scan_swap_map(struct swap_info_struct *si, |
188 | unsigned char usage) | 470 | unsigned char usage) |
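The cluster helpers added above implement a small linked list threaded through the per-device cluster_info array: a cluster's data field holds either its usage count or, for clusters on the free/discard lists, the index of the next cluster, with head and tail pseudo-entries and a NEXT_NULL flag marking the empty list. The following is an ordinary C sketch of that index-threaded list, not kernel code; field and function names are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

struct toy_cluster {
	unsigned int next;	/* next free cluster index (plays the "data" role) */
	bool is_null;		/* plays the CLUSTER_FLAG_NEXT_NULL role */
};

/* append cluster idx to the tail of the list rooted at head/tail */
static void list_add_tail(struct toy_cluster *info, struct toy_cluster *head,
			  struct toy_cluster *tail, unsigned int idx)
{
	if (head->is_null) {			/* list was empty */
		head->next = tail->next = idx;
		head->is_null = tail->is_null = false;
	} else {
		info[tail->next].next = idx;	/* link the old tail to idx */
		tail->next = idx;
	}
}

int main(void)
{
	struct toy_cluster info[8];
	struct toy_cluster head = { 0, true }, tail = { 0, true };

	list_add_tail(info, &head, &tail, 3);
	list_add_tail(info, &head, &tail, 5);
	list_add_tail(info, &head, &tail, 1);

	for (unsigned int i = head.next; ; i = info[i].next) {
		printf("%u ", i);		/* prints: 3 5 1 */
		if (i == tail.next)
			break;
	}
	printf("\n");
	return 0;
}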
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
191 | unsigned long scan_base; | 473 | unsigned long scan_base; |
192 | unsigned long last_in_cluster = 0; | 474 | unsigned long last_in_cluster = 0; |
193 | int latency_ration = LATENCY_LIMIT; | 475 | int latency_ration = LATENCY_LIMIT; |
194 | int found_free_cluster = 0; | ||
195 | 476 | ||
196 | /* | 477 | /* |
197 | * We try to cluster swap pages by allocating them sequentially | 478 | * We try to cluster swap pages by allocating them sequentially |
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
207 | si->flags += SWP_SCANNING; | 488 | si->flags += SWP_SCANNING; |
208 | scan_base = offset = si->cluster_next; | 489 | scan_base = offset = si->cluster_next; |
209 | 490 | ||
491 | /* SSD algorithm */ | ||
492 | if (si->cluster_info) { | ||
493 | scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); | ||
494 | goto checks; | ||
495 | } | ||
496 | |||
210 | if (unlikely(!si->cluster_nr--)) { | 497 | if (unlikely(!si->cluster_nr--)) { |
211 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { | 498 | if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { |
212 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 499 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
213 | goto checks; | 500 | goto checks; |
214 | } | 501 | } |
215 | if (si->flags & SWP_PAGE_DISCARD) { | 502 | |
216 | /* | ||
217 | * Start range check on racing allocations, in case | ||
218 | * they overlap the cluster we eventually decide on | ||
219 | * (we scan without swap_lock to allow preemption). | ||
220 | * It's hardly conceivable that cluster_nr could be | ||
221 | * wrapped during our scan, but don't depend on it. | ||
222 | */ | ||
223 | if (si->lowest_alloc) | ||
224 | goto checks; | ||
225 | si->lowest_alloc = si->max; | ||
226 | si->highest_alloc = 0; | ||
227 | } | ||
228 | spin_unlock(&si->lock); | 503 | spin_unlock(&si->lock); |
229 | 504 | ||
230 | /* | 505 | /* |
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
248 | offset -= SWAPFILE_CLUSTER - 1; | 523 | offset -= SWAPFILE_CLUSTER - 1; |
249 | si->cluster_next = offset; | 524 | si->cluster_next = offset; |
250 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 525 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
251 | found_free_cluster = 1; | ||
252 | goto checks; | 526 | goto checks; |
253 | } | 527 | } |
254 | if (unlikely(--latency_ration < 0)) { | 528 | if (unlikely(--latency_ration < 0)) { |
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
269 | offset -= SWAPFILE_CLUSTER - 1; | 543 | offset -= SWAPFILE_CLUSTER - 1; |
270 | si->cluster_next = offset; | 544 | si->cluster_next = offset; |
271 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 545 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
272 | found_free_cluster = 1; | ||
273 | goto checks; | 546 | goto checks; |
274 | } | 547 | } |
275 | if (unlikely(--latency_ration < 0)) { | 548 | if (unlikely(--latency_ration < 0)) { |
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
281 | offset = scan_base; | 554 | offset = scan_base; |
282 | spin_lock(&si->lock); | 555 | spin_lock(&si->lock); |
283 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 556 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
284 | si->lowest_alloc = 0; | ||
285 | } | 557 | } |
286 | 558 | ||
287 | checks: | 559 | checks: |
560 | if (si->cluster_info) { | ||
561 | while (scan_swap_map_ssd_cluster_conflict(si, offset)) | ||
562 | scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); | ||
563 | } | ||
288 | if (!(si->flags & SWP_WRITEOK)) | 564 | if (!(si->flags & SWP_WRITEOK)) |
289 | goto no_page; | 565 | goto no_page; |
290 | if (!si->highest_bit) | 566 | if (!si->highest_bit) |
@@ -317,62 +593,10 @@ checks: | |||
317 | si->highest_bit = 0; | 593 | si->highest_bit = 0; |
318 | } | 594 | } |
319 | si->swap_map[offset] = usage; | 595 | si->swap_map[offset] = usage; |
596 | inc_cluster_info_page(si, si->cluster_info, offset); | ||
320 | si->cluster_next = offset + 1; | 597 | si->cluster_next = offset + 1; |
321 | si->flags -= SWP_SCANNING; | 598 | si->flags -= SWP_SCANNING; |
322 | 599 | ||
323 | if (si->lowest_alloc) { | ||
324 | /* | ||
325 | * Only set when SWP_PAGE_DISCARD, and there's a scan | ||
326 | * for a free cluster in progress or just completed. | ||
327 | */ | ||
328 | if (found_free_cluster) { | ||
329 | /* | ||
330 | * To optimize wear-levelling, discard the | ||
331 | * old data of the cluster, taking care not to | ||
332 | * discard any of its pages that have already | ||
333 | * been allocated by racing tasks (offset has | ||
334 | * already stepped over any at the beginning). | ||
335 | */ | ||
336 | if (offset < si->highest_alloc && | ||
337 | si->lowest_alloc <= last_in_cluster) | ||
338 | last_in_cluster = si->lowest_alloc - 1; | ||
339 | si->flags |= SWP_DISCARDING; | ||
340 | spin_unlock(&si->lock); | ||
341 | |||
342 | if (offset < last_in_cluster) | ||
343 | discard_swap_cluster(si, offset, | ||
344 | last_in_cluster - offset + 1); | ||
345 | |||
346 | spin_lock(&si->lock); | ||
347 | si->lowest_alloc = 0; | ||
348 | si->flags &= ~SWP_DISCARDING; | ||
349 | |||
350 | smp_mb(); /* wake_up_bit advises this */ | ||
351 | wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); | ||
352 | |||
353 | } else if (si->flags & SWP_DISCARDING) { | ||
354 | /* | ||
355 | * Delay using pages allocated by racing tasks | ||
356 | * until the whole discard has been issued. We | ||
357 | * could defer that delay until swap_writepage, | ||
358 | * but it's easier to keep this self-contained. | ||
359 | */ | ||
360 | spin_unlock(&si->lock); | ||
361 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | ||
362 | wait_for_discard, TASK_UNINTERRUPTIBLE); | ||
363 | spin_lock(&si->lock); | ||
364 | } else { | ||
365 | /* | ||
366 | * Note pages allocated by racing tasks while | ||
367 | * scan for a free cluster is in progress, so | ||
368 | * that its final discard can exclude them. | ||
369 | */ | ||
370 | if (offset < si->lowest_alloc) | ||
371 | si->lowest_alloc = offset; | ||
372 | if (offset > si->highest_alloc) | ||
373 | si->highest_alloc = offset; | ||
374 | } | ||
375 | } | ||
376 | return offset; | 600 | return offset; |
377 | 601 | ||
378 | scan: | 602 | scan: |
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) | |||
527 | return p; | 751 | return p; |
528 | 752 | ||
529 | bad_free: | 753 | bad_free: |
530 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); | 754 | pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); |
531 | goto out; | 755 | goto out; |
532 | bad_offset: | 756 | bad_offset: |
533 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); | 757 | pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); |
534 | goto out; | 758 | goto out; |
535 | bad_device: | 759 | bad_device: |
536 | printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); | 760 | pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); |
537 | goto out; | 761 | goto out; |
538 | bad_nofile: | 762 | bad_nofile: |
539 | printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); | 763 | pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); |
540 | out: | 764 | out: |
541 | return NULL; | 765 | return NULL; |
542 | } | 766 | } |
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
600 | 824 | ||
601 | /* free if no reference */ | 825 | /* free if no reference */ |
602 | if (!usage) { | 826 | if (!usage) { |
827 | dec_cluster_info_page(p, p->cluster_info, offset); | ||
603 | if (offset < p->lowest_bit) | 828 | if (offset < p->lowest_bit) |
604 | p->lowest_bit = offset; | 829 | p->lowest_bit = offset; |
605 | if (offset > p->highest_bit) | 830 | if (offset > p->highest_bit) |
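The inc_cluster_info_page()/dec_cluster_info_page() calls added in these hunks maintain a per-cluster count of allocated swap pages; once a cluster's count drops back to zero, the whole cluster can be returned to the free list (or queued for discard). A minimal user-space sketch of that accounting, using a plain counter instead of the kernel's packed struct swap_cluster_info:

#define SWAPFILE_CLUSTER 256u   /* pages per cluster, as in the kernel */

struct cluster_count { unsigned int inuse; };   /* stand-in for swap_cluster_info */

/* A swap page at @offset was allocated: bump its cluster's usage count. */
static void cluster_page_alloc(struct cluster_count *ci, unsigned long offset)
{
        ci[offset / SWAPFILE_CLUSTER].inuse++;
}

/* A swap page at @offset was freed: returns 1 when the whole cluster has
 * become empty and may go back on the free (or discard) list. */
static int cluster_page_free(struct cluster_count *ci, unsigned long offset)
{
        return --ci[offset / SWAPFILE_CLUSTER].inuse == 0;
}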
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1107 | else | 1332 | else |
1108 | continue; | 1333 | continue; |
1109 | } | 1334 | } |
1110 | count = si->swap_map[i]; | 1335 | count = ACCESS_ONCE(si->swap_map[i]); |
1111 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1336 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1112 | break; | 1337 | break; |
1113 | } | 1338 | } |
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
1127 | { | 1352 | { |
1128 | struct swap_info_struct *si = swap_info[type]; | 1353 | struct swap_info_struct *si = swap_info[type]; |
1129 | struct mm_struct *start_mm; | 1354 | struct mm_struct *start_mm; |
1130 | unsigned char *swap_map; | 1355 | volatile unsigned char *swap_map; /* swap_map is accessed without |
1356 | * locking. Mark it as volatile | ||
1357 | * to prevent the compiler from doing | ||
1358 | * something odd. | ||
1359 | */ | ||
1131 | unsigned char swcount; | 1360 | unsigned char swcount; |
1132 | struct page *page; | 1361 | struct page *page; |
1133 | swp_entry_t entry; | 1362 | swp_entry_t entry; |
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap, | |||
1178 | * reused since sys_swapoff() already disabled | 1407 | * reused since sys_swapoff() already disabled |
1179 | * allocation from here, or alloc_page() failed. | 1408 | * allocation from here, or alloc_page() failed. |
1180 | */ | 1409 | */ |
1181 | if (!*swap_map) | 1410 | swcount = *swap_map; |
1411 | /* | ||
1412 | * We don't hold the lock here, so the swap entry could be | ||
1413 | * SWAP_MAP_BAD (while the cluster is being discarded). | ||
1414 | * Instead of failing, we can just skip the swap | ||
1415 | * entry, because swapoff will wait for the discard | ||
1416 | * to finish anyway. | ||
1417 | */ | ||
1418 | if (!swcount || swcount == SWAP_MAP_BAD) | ||
1182 | continue; | 1419 | continue; |
1183 | retval = -ENOMEM; | 1420 | retval = -ENOMEM; |
1184 | break; | 1421 | break; |
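try_to_unuse() now samples *swap_map once and tolerates SWAP_MAP_BAD, because a cluster that is being discarded is marked bad without the lock held. A stand-alone sketch of that read-once-and-skip pattern, with READ_ONCE_U8 as an illustrative stand-in for ACCESS_ONCE() and the count/flag masking omitted:

#include <stddef.h>

#define READ_ONCE_U8(x)  (*(volatile unsigned char *)&(x))
#define SWAP_MAP_BAD     0x3f           /* same value the kernel uses */

/* Return the next offset in [start, len) whose use count is live; each slot
 * is read exactly once, and free (0) or bad/discarding slots are skipped. */
static size_t next_in_use(unsigned char *map, size_t start, size_t len)
{
        for (size_t i = start; i < len; i++) {
                unsigned char count = READ_ONCE_U8(map[i]);
                if (count && count != SWAP_MAP_BAD)
                        return i;
        }
        return len;
}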
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
1524 | } | 1761 | } |
1525 | 1762 | ||
1526 | static void _enable_swap_info(struct swap_info_struct *p, int prio, | 1763 | static void _enable_swap_info(struct swap_info_struct *p, int prio, |
1527 | unsigned char *swap_map) | 1764 | unsigned char *swap_map, |
1765 | struct swap_cluster_info *cluster_info) | ||
1528 | { | 1766 | { |
1529 | int i, prev; | 1767 | int i, prev; |
1530 | 1768 | ||
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1533 | else | 1771 | else |
1534 | p->prio = --least_priority; | 1772 | p->prio = --least_priority; |
1535 | p->swap_map = swap_map; | 1773 | p->swap_map = swap_map; |
1774 | p->cluster_info = cluster_info; | ||
1536 | p->flags |= SWP_WRITEOK; | 1775 | p->flags |= SWP_WRITEOK; |
1537 | atomic_long_add(p->pages, &nr_swap_pages); | 1776 | atomic_long_add(p->pages, &nr_swap_pages); |
1538 | total_swap_pages += p->pages; | 1777 | total_swap_pages += p->pages; |
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1553 | 1792 | ||
1554 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1793 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
1555 | unsigned char *swap_map, | 1794 | unsigned char *swap_map, |
1795 | struct swap_cluster_info *cluster_info, | ||
1556 | unsigned long *frontswap_map) | 1796 | unsigned long *frontswap_map) |
1557 | { | 1797 | { |
1558 | frontswap_init(p->type, frontswap_map); | 1798 | frontswap_init(p->type, frontswap_map); |
1559 | spin_lock(&swap_lock); | 1799 | spin_lock(&swap_lock); |
1560 | spin_lock(&p->lock); | 1800 | spin_lock(&p->lock); |
1561 | _enable_swap_info(p, prio, swap_map); | 1801 | _enable_swap_info(p, prio, swap_map, cluster_info); |
1562 | spin_unlock(&p->lock); | 1802 | spin_unlock(&p->lock); |
1563 | spin_unlock(&swap_lock); | 1803 | spin_unlock(&swap_lock); |
1564 | } | 1804 | } |
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p) | |||
1567 | { | 1807 | { |
1568 | spin_lock(&swap_lock); | 1808 | spin_lock(&swap_lock); |
1569 | spin_lock(&p->lock); | 1809 | spin_lock(&p->lock); |
1570 | _enable_swap_info(p, p->prio, p->swap_map); | 1810 | _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); |
1571 | spin_unlock(&p->lock); | 1811 | spin_unlock(&p->lock); |
1572 | spin_unlock(&swap_lock); | 1812 | spin_unlock(&swap_lock); |
1573 | } | 1813 | } |
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1576 | { | 1816 | { |
1577 | struct swap_info_struct *p = NULL; | 1817 | struct swap_info_struct *p = NULL; |
1578 | unsigned char *swap_map; | 1818 | unsigned char *swap_map; |
1819 | struct swap_cluster_info *cluster_info; | ||
1579 | unsigned long *frontswap_map; | 1820 | unsigned long *frontswap_map; |
1580 | struct file *swap_file, *victim; | 1821 | struct file *swap_file, *victim; |
1581 | struct address_space *mapping; | 1822 | struct address_space *mapping; |
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1651 | goto out_dput; | 1892 | goto out_dput; |
1652 | } | 1893 | } |
1653 | 1894 | ||
1895 | flush_work(&p->discard_work); | ||
1896 | |||
1654 | destroy_swap_extents(p); | 1897 | destroy_swap_extents(p); |
1655 | if (p->flags & SWP_CONTINUED) | 1898 | if (p->flags & SWP_CONTINUED) |
1656 | free_swap_count_continuations(p); | 1899 | free_swap_count_continuations(p); |
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1675 | p->max = 0; | 1918 | p->max = 0; |
1676 | swap_map = p->swap_map; | 1919 | swap_map = p->swap_map; |
1677 | p->swap_map = NULL; | 1920 | p->swap_map = NULL; |
1921 | cluster_info = p->cluster_info; | ||
1922 | p->cluster_info = NULL; | ||
1678 | p->flags = 0; | 1923 | p->flags = 0; |
1679 | frontswap_map = frontswap_map_get(p); | 1924 | frontswap_map = frontswap_map_get(p); |
1680 | frontswap_map_set(p, NULL); | 1925 | frontswap_map_set(p, NULL); |
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1682 | spin_unlock(&swap_lock); | 1927 | spin_unlock(&swap_lock); |
1683 | frontswap_invalidate_area(type); | 1928 | frontswap_invalidate_area(type); |
1684 | mutex_unlock(&swapon_mutex); | 1929 | mutex_unlock(&swapon_mutex); |
1930 | free_percpu(p->percpu_cluster); | ||
1931 | p->percpu_cluster = NULL; | ||
1685 | vfree(swap_map); | 1932 | vfree(swap_map); |
1933 | vfree(cluster_info); | ||
1686 | vfree(frontswap_map); | 1934 | vfree(frontswap_map); |
1687 | /* Destroy swap account information */ | 1935 | /* Destroy swap account information */ |
1688 | swap_cgroup_swapoff(type); | 1936 | swap_cgroup_swapoff(type); |
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1926 | int i; | 2174 | int i; |
1927 | unsigned long maxpages; | 2175 | unsigned long maxpages; |
1928 | unsigned long swapfilepages; | 2176 | unsigned long swapfilepages; |
2177 | unsigned long last_page; | ||
1929 | 2178 | ||
1930 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { | 2179 | if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { |
1931 | printk(KERN_ERR "Unable to find swap-space signature\n"); | 2180 | pr_err("Unable to find swap-space signature\n"); |
1932 | return 0; | 2181 | return 0; |
1933 | } | 2182 | } |
1934 | 2183 | ||
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1942 | } | 2191 | } |
1943 | /* Check the swap header's sub-version */ | 2192 | /* Check the swap header's sub-version */ |
1944 | if (swap_header->info.version != 1) { | 2193 | if (swap_header->info.version != 1) { |
1945 | printk(KERN_WARNING | 2194 | pr_warn("Unable to handle swap header version %d\n", |
1946 | "Unable to handle swap header version %d\n", | 2195 | swap_header->info.version); |
1947 | swap_header->info.version); | ||
1948 | return 0; | 2196 | return 0; |
1949 | } | 2197 | } |
1950 | 2198 | ||
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1968 | */ | 2216 | */ |
1969 | maxpages = swp_offset(pte_to_swp_entry( | 2217 | maxpages = swp_offset(pte_to_swp_entry( |
1970 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; | 2218 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1971 | if (maxpages > swap_header->info.last_page) { | 2219 | last_page = swap_header->info.last_page; |
1972 | maxpages = swap_header->info.last_page + 1; | 2220 | if (last_page > maxpages) { |
2221 | pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", | ||
2222 | maxpages << (PAGE_SHIFT - 10), | ||
2223 | last_page << (PAGE_SHIFT - 10)); | ||
2224 | } | ||
2225 | if (maxpages > last_page) { | ||
2226 | maxpages = last_page + 1; | ||
1973 | /* p->max is an unsigned int: don't overflow it */ | 2227 | /* p->max is an unsigned int: don't overflow it */ |
1974 | if ((unsigned int)maxpages == 0) | 2228 | if ((unsigned int)maxpages == 0) |
1975 | maxpages = UINT_MAX; | 2229 | maxpages = UINT_MAX; |
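The new truncation warning reports sizes in kilobytes by shifting a page count left by PAGE_SHIFT - 10. A one-line sketch of that conversion, assuming the common 4 KiB page size (PAGE_SHIFT == 12), where 1024 pages print as 4096k:

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

static unsigned long pages_to_kilobytes(unsigned long pages)
{
        return pages << (PAGE_SHIFT - 10);      /* pages * 4096 / 1024 */
}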
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1980 | return 0; | 2234 | return 0; |
1981 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; | 2235 | swapfilepages = i_size_read(inode) >> PAGE_SHIFT; |
1982 | if (swapfilepages && maxpages > swapfilepages) { | 2236 | if (swapfilepages && maxpages > swapfilepages) { |
1983 | printk(KERN_WARNING | 2237 | pr_warn("Swap area shorter than signature indicates\n"); |
1984 | "Swap area shorter than signature indicates\n"); | ||
1985 | return 0; | 2238 | return 0; |
1986 | } | 2239 | } |
1987 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) | 2240 | if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) |
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1995 | static int setup_swap_map_and_extents(struct swap_info_struct *p, | 2248 | static int setup_swap_map_and_extents(struct swap_info_struct *p, |
1996 | union swap_header *swap_header, | 2249 | union swap_header *swap_header, |
1997 | unsigned char *swap_map, | 2250 | unsigned char *swap_map, |
2251 | struct swap_cluster_info *cluster_info, | ||
1998 | unsigned long maxpages, | 2252 | unsigned long maxpages, |
1999 | sector_t *span) | 2253 | sector_t *span) |
2000 | { | 2254 | { |
2001 | int i; | 2255 | int i; |
2002 | unsigned int nr_good_pages; | 2256 | unsigned int nr_good_pages; |
2003 | int nr_extents; | 2257 | int nr_extents; |
2258 | unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); | ||
2259 | unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; | ||
2004 | 2260 | ||
2005 | nr_good_pages = maxpages - 1; /* omit header page */ | 2261 | nr_good_pages = maxpages - 1; /* omit header page */ |
2006 | 2262 | ||
2263 | cluster_set_null(&p->free_cluster_head); | ||
2264 | cluster_set_null(&p->free_cluster_tail); | ||
2265 | cluster_set_null(&p->discard_cluster_head); | ||
2266 | cluster_set_null(&p->discard_cluster_tail); | ||
2267 | |||
2007 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 2268 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
2008 | unsigned int page_nr = swap_header->info.badpages[i]; | 2269 | unsigned int page_nr = swap_header->info.badpages[i]; |
2009 | if (page_nr == 0 || page_nr > swap_header->info.last_page) | 2270 | if (page_nr == 0 || page_nr > swap_header->info.last_page) |
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2011 | if (page_nr < maxpages) { | 2272 | if (page_nr < maxpages) { |
2012 | swap_map[page_nr] = SWAP_MAP_BAD; | 2273 | swap_map[page_nr] = SWAP_MAP_BAD; |
2013 | nr_good_pages--; | 2274 | nr_good_pages--; |
2275 | /* | ||
2276 | * Haven't marked the cluster free yet, no list | ||
2277 | * operation involved | ||
2278 | */ | ||
2279 | inc_cluster_info_page(p, cluster_info, page_nr); | ||
2014 | } | 2280 | } |
2015 | } | 2281 | } |
2016 | 2282 | ||
2283 | /* Haven't marked the cluster free yet, no list operation involved */ | ||
2284 | for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) | ||
2285 | inc_cluster_info_page(p, cluster_info, i); | ||
2286 | |||
2017 | if (nr_good_pages) { | 2287 | if (nr_good_pages) { |
2018 | swap_map[0] = SWAP_MAP_BAD; | 2288 | swap_map[0] = SWAP_MAP_BAD; |
2289 | /* | ||
2290 | * Haven't marked the cluster free yet, no list | ||
2291 | * operation involved | ||
2292 | */ | ||
2293 | inc_cluster_info_page(p, cluster_info, 0); | ||
2019 | p->max = maxpages; | 2294 | p->max = maxpages; |
2020 | p->pages = nr_good_pages; | 2295 | p->pages = nr_good_pages; |
2021 | nr_extents = setup_swap_extents(p, span); | 2296 | nr_extents = setup_swap_extents(p, span); |
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2024 | nr_good_pages = p->pages; | 2299 | nr_good_pages = p->pages; |
2025 | } | 2300 | } |
2026 | if (!nr_good_pages) { | 2301 | if (!nr_good_pages) { |
2027 | printk(KERN_WARNING "Empty swap-file\n"); | 2302 | pr_warn("Empty swap-file\n"); |
2028 | return -EINVAL; | 2303 | return -EINVAL; |
2029 | } | 2304 | } |
2030 | 2305 | ||
2306 | if (!cluster_info) | ||
2307 | return nr_extents; | ||
2308 | |||
2309 | for (i = 0; i < nr_clusters; i++) { | ||
2310 | if (!cluster_count(&cluster_info[idx])) { | ||
2311 | cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); | ||
2312 | if (cluster_is_null(&p->free_cluster_head)) { | ||
2313 | cluster_set_next_flag(&p->free_cluster_head, | ||
2314 | idx, 0); | ||
2315 | cluster_set_next_flag(&p->free_cluster_tail, | ||
2316 | idx, 0); | ||
2317 | } else { | ||
2318 | unsigned int tail; | ||
2319 | |||
2320 | tail = cluster_next(&p->free_cluster_tail); | ||
2321 | cluster_set_next(&cluster_info[tail], idx); | ||
2322 | cluster_set_next_flag(&p->free_cluster_tail, | ||
2323 | idx, 0); | ||
2324 | } | ||
2325 | } | ||
2326 | idx++; | ||
2327 | if (idx == nr_clusters) | ||
2328 | idx = 0; | ||
2329 | } | ||
2031 | return nr_extents; | 2330 | return nr_extents; |
2032 | } | 2331 | } |
2033 | 2332 | ||
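setup_swap_map_and_extents() now threads every all-free cluster onto a list stored inside the cluster_info array itself: the head and tail hold cluster indices, and each free cluster records the index of the next one. A simplified sketch of that index-linked append, ignoring the kernel's flag packing and the detail that the real loop starts at the cluster containing cluster_next and wraps around:

#include <stdbool.h>

#define NIL ((unsigned int)-1)

struct cluster { unsigned int count; unsigned int next; bool free; };

static void build_free_list(struct cluster *ci, unsigned int nr,
                            unsigned int *head, unsigned int *tail)
{
        *head = *tail = NIL;
        for (unsigned int idx = 0; idx < nr; idx++) {
                if (ci[idx].count)              /* cluster has used/bad pages */
                        continue;
                ci[idx].free = true;
                if (*head == NIL)
                        *head = idx;            /* first free cluster found */
                else
                        ci[*tail].next = idx;   /* append after the old tail */
                *tail = idx;
        }
}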
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2059 | sector_t span; | 2358 | sector_t span; |
2060 | unsigned long maxpages; | 2359 | unsigned long maxpages; |
2061 | unsigned char *swap_map = NULL; | 2360 | unsigned char *swap_map = NULL; |
2361 | struct swap_cluster_info *cluster_info = NULL; | ||
2062 | unsigned long *frontswap_map = NULL; | 2362 | unsigned long *frontswap_map = NULL; |
2063 | struct page *page = NULL; | 2363 | struct page *page = NULL; |
2064 | struct inode *inode = NULL; | 2364 | struct inode *inode = NULL; |
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2073 | if (IS_ERR(p)) | 2373 | if (IS_ERR(p)) |
2074 | return PTR_ERR(p); | 2374 | return PTR_ERR(p); |
2075 | 2375 | ||
2376 | INIT_WORK(&p->discard_work, swap_discard_work); | ||
2377 | |||
2076 | name = getname(specialfile); | 2378 | name = getname(specialfile); |
2077 | if (IS_ERR(name)) { | 2379 | if (IS_ERR(name)) { |
2078 | error = PTR_ERR(name); | 2380 | error = PTR_ERR(name); |
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2132 | error = -ENOMEM; | 2434 | error = -ENOMEM; |
2133 | goto bad_swap; | 2435 | goto bad_swap; |
2134 | } | 2436 | } |
2437 | if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { | ||
2438 | p->flags |= SWP_SOLIDSTATE; | ||
2439 | /* | ||
2440 | * Select a random position to start with to help | ||
2441 | * wear-levelling on the SSD | ||
2442 | */ | ||
2443 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | ||
2444 | |||
2445 | cluster_info = vzalloc(DIV_ROUND_UP(maxpages, | ||
2446 | SWAPFILE_CLUSTER) * sizeof(*cluster_info)); | ||
2447 | if (!cluster_info) { | ||
2448 | error = -ENOMEM; | ||
2449 | goto bad_swap; | ||
2450 | } | ||
2451 | p->percpu_cluster = alloc_percpu(struct percpu_cluster); | ||
2452 | if (!p->percpu_cluster) { | ||
2453 | error = -ENOMEM; | ||
2454 | goto bad_swap; | ||
2455 | } | ||
2456 | for_each_possible_cpu(i) { | ||
2457 | struct percpu_cluster *cluster; | ||
2458 | cluster = per_cpu_ptr(p->percpu_cluster, i); | ||
2459 | cluster_set_null(&cluster->index); | ||
2460 | } | ||
2461 | } | ||
2135 | 2462 | ||
2136 | error = swap_cgroup_swapon(p->type, maxpages); | 2463 | error = swap_cgroup_swapon(p->type, maxpages); |
2137 | if (error) | 2464 | if (error) |
2138 | goto bad_swap; | 2465 | goto bad_swap; |
2139 | 2466 | ||
2140 | nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, | 2467 | nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, |
2141 | maxpages, &span); | 2468 | cluster_info, maxpages, &span); |
2142 | if (unlikely(nr_extents < 0)) { | 2469 | if (unlikely(nr_extents < 0)) { |
2143 | error = nr_extents; | 2470 | error = nr_extents; |
2144 | goto bad_swap; | 2471 | goto bad_swap; |
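For non-rotational devices the setup path now flags the area SWP_SOLIDSTATE, starts allocation at a random page to spread wear, and gives each CPU its own current cluster so concurrent allocators do not contend. A trivial sketch of the randomised starting offset, with rand() standing in for prandom_u32():

#include <stdlib.h>

/* Pick a starting swap offset in [1, highest_bit] so repeated swapons do not
 * always begin writing at the front of the SSD. */
static unsigned long pick_cluster_next(unsigned long highest_bit)
{
        return 1 + ((unsigned long)rand() % highest_bit);
}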
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2147 | if (frontswap_enabled) | 2474 | if (frontswap_enabled) |
2148 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); | 2475 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); |
2149 | 2476 | ||
2150 | if (p->bdev) { | 2477 | if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
2151 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2478 | /* |
2152 | p->flags |= SWP_SOLIDSTATE; | 2479 | * When discard is enabled for swap with no particular |
2153 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | 2480 | * policy flagged, we set all swap discard flags here in |
2154 | } | 2481 | * order to sustain backward compatibility with older |
2155 | 2482 | * swapon(8) releases. | |
2156 | if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { | 2483 | */ |
2157 | /* | 2484 | p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | |
2158 | * When discard is enabled for swap with no particular | 2485 | SWP_PAGE_DISCARD); |
2159 | * policy flagged, we set all swap discard flags here in | ||
2160 | * order to sustain backward compatibility with older | ||
2161 | * swapon(8) releases. | ||
2162 | */ | ||
2163 | p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | | ||
2164 | SWP_PAGE_DISCARD); | ||
2165 | 2486 | ||
2166 | /* | 2487 | /* |
2167 | * By flagging sys_swapon, a sysadmin can tell us to | 2488 | * By flagging sys_swapon, a sysadmin can tell us to |
2168 | * either do single-time area discards only, or to just | 2489 | * either do single-time area discards only, or to just |
2169 | * perform discards for released swap page-clusters. | 2490 | * perform discards for released swap page-clusters. |
2170 | * Now it's time to adjust the p->flags accordingly. | 2491 | * Now it's time to adjust the p->flags accordingly. |
2171 | */ | 2492 | */ |
2172 | if (swap_flags & SWAP_FLAG_DISCARD_ONCE) | 2493 | if (swap_flags & SWAP_FLAG_DISCARD_ONCE) |
2173 | p->flags &= ~SWP_PAGE_DISCARD; | 2494 | p->flags &= ~SWP_PAGE_DISCARD; |
2174 | else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) | 2495 | else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) |
2175 | p->flags &= ~SWP_AREA_DISCARD; | 2496 | p->flags &= ~SWP_AREA_DISCARD; |
2176 | 2497 | ||
2177 | /* issue a swapon-time discard if it's still required */ | 2498 | /* issue a swapon-time discard if it's still required */ |
2178 | if (p->flags & SWP_AREA_DISCARD) { | 2499 | if (p->flags & SWP_AREA_DISCARD) { |
2179 | int err = discard_swap(p); | 2500 | int err = discard_swap(p); |
2180 | if (unlikely(err)) | 2501 | if (unlikely(err)) |
2181 | printk(KERN_ERR | 2502 | pr_err("swapon: discard_swap(%p): %d\n", |
2182 | "swapon: discard_swap(%p): %d\n", | 2503 | p, err); |
2183 | p, err); | ||
2184 | } | ||
2185 | } | 2504 | } |
2186 | } | 2505 | } |
2187 | 2506 | ||
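The relocated discard block keeps the old semantics: a bare SWAP_FLAG_DISCARD still enables both the one-shot area discard and per-cluster page discards, and the _ONCE/_PAGES modifiers then drop one of the two. A compact sketch of that policy using illustrative flag values rather than the kernel's constants:

enum { AREA_DISCARD = 0x1, PAGE_DISCARD = 0x2 };   /* illustrative values */

static int swap_discard_policy(int discard, int once, int pages)
{
        int flags = 0;

        if (discard)
                flags = AREA_DISCARD | PAGE_DISCARD;    /* backward compatible */
        if (once)
                flags &= ~PAGE_DISCARD;                 /* swapon-time discard only */
        else if (pages)
                flags &= ~AREA_DISCARD;                 /* per-cluster discards only */
        return flags;
}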
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2190 | if (swap_flags & SWAP_FLAG_PREFER) | 2509 | if (swap_flags & SWAP_FLAG_PREFER) |
2191 | prio = | 2510 | prio = |
2192 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2511 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2193 | enable_swap_info(p, prio, swap_map, frontswap_map); | 2512 | enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); |
2194 | 2513 | ||
2195 | printk(KERN_INFO "Adding %uk swap on %s. " | 2514 | pr_info("Adding %uk swap on %s. " |
2196 | "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", | 2515 | "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", |
2197 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, | 2516 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
2198 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2517 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2211 | error = 0; | 2530 | error = 0; |
2212 | goto out; | 2531 | goto out; |
2213 | bad_swap: | 2532 | bad_swap: |
2533 | free_percpu(p->percpu_cluster); | ||
2534 | p->percpu_cluster = NULL; | ||
2214 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { | 2535 | if (inode && S_ISBLK(inode->i_mode) && p->bdev) { |
2215 | set_blocksize(p->bdev, p->old_block_size); | 2536 | set_blocksize(p->bdev, p->old_block_size); |
2216 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 2537 | blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
@@ -2222,6 +2543,7 @@ bad_swap: | |||
2222 | p->flags = 0; | 2543 | p->flags = 0; |
2223 | spin_unlock(&swap_lock); | 2544 | spin_unlock(&swap_lock); |
2224 | vfree(swap_map); | 2545 | vfree(swap_map); |
2546 | vfree(cluster_info); | ||
2225 | if (swap_file) { | 2547 | if (swap_file) { |
2226 | if (inode && S_ISREG(inode->i_mode)) { | 2548 | if (inode && S_ISREG(inode->i_mode)) { |
2227 | mutex_unlock(&inode->i_mutex); | 2549 | mutex_unlock(&inode->i_mutex); |
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2291 | goto unlock_out; | 2613 | goto unlock_out; |
2292 | 2614 | ||
2293 | count = p->swap_map[offset]; | 2615 | count = p->swap_map[offset]; |
2616 | |||
2617 | /* | ||
2618 | * swapin_readahead() doesn't check if a swap entry is valid, so the | ||
2619 | * swap entry could be SWAP_MAP_BAD. Check here with lock held. | ||
2620 | */ | ||
2621 | if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { | ||
2622 | err = -ENOENT; | ||
2623 | goto unlock_out; | ||
2624 | } | ||
2625 | |||
2294 | has_cache = count & SWAP_HAS_CACHE; | 2626 | has_cache = count & SWAP_HAS_CACHE; |
2295 | count &= ~SWAP_HAS_CACHE; | 2627 | count &= ~SWAP_HAS_CACHE; |
2296 | err = 0; | 2628 | err = 0; |
@@ -2326,7 +2658,7 @@ out: | |||
2326 | return err; | 2658 | return err; |
2327 | 2659 | ||
2328 | bad_file: | 2660 | bad_file: |
2329 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2661 | pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); |
2330 | goto out; | 2662 | goto out; |
2331 | } | 2663 | } |
2332 | 2664 | ||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page) | |||
388 | struct address_space *mapping = page->mapping; | 388 | struct address_space *mapping = page->mapping; |
389 | 389 | ||
390 | VM_BUG_ON(PageSlab(page)); | 390 | VM_BUG_ON(PageSlab(page)); |
391 | #ifdef CONFIG_SWAP | ||
392 | if (unlikely(PageSwapCache(page))) { | 391 | if (unlikely(PageSwapCache(page))) { |
393 | swp_entry_t entry; | 392 | swp_entry_t entry; |
394 | 393 | ||
395 | entry.val = page_private(page); | 394 | entry.val = page_private(page); |
396 | mapping = swap_address_space(entry); | 395 | mapping = swap_address_space(entry); |
397 | } else | 396 | } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) |
398 | #endif | ||
399 | if ((unsigned long)mapping & PAGE_MAPPING_ANON) | ||
400 | mapping = NULL; | 397 | mapping = NULL; |
401 | return mapping; | 398 | return mapping; |
402 | } | 399 | } |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 13a54953a273..107454312d5e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -752,7 +752,6 @@ struct vmap_block_queue { | |||
752 | struct vmap_block { | 752 | struct vmap_block { |
753 | spinlock_t lock; | 753 | spinlock_t lock; |
754 | struct vmap_area *va; | 754 | struct vmap_area *va; |
755 | struct vmap_block_queue *vbq; | ||
756 | unsigned long free, dirty; | 755 | unsigned long free, dirty; |
757 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 756 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); |
758 | struct list_head free_list; | 757 | struct list_head free_list; |
@@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
830 | radix_tree_preload_end(); | 829 | radix_tree_preload_end(); |
831 | 830 | ||
832 | vbq = &get_cpu_var(vmap_block_queue); | 831 | vbq = &get_cpu_var(vmap_block_queue); |
833 | vb->vbq = vbq; | ||
834 | spin_lock(&vbq->lock); | 832 | spin_lock(&vbq->lock); |
835 | list_add_rcu(&vb->free_list, &vbq->free); | 833 | list_add_rcu(&vb->free_list, &vbq->free); |
836 | spin_unlock(&vbq->lock); | 834 | spin_unlock(&vbq->lock); |
@@ -1018,15 +1016,16 @@ void vm_unmap_aliases(void) | |||
1018 | 1016 | ||
1019 | rcu_read_lock(); | 1017 | rcu_read_lock(); |
1020 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 1018 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
1021 | int i; | 1019 | int i, j; |
1022 | 1020 | ||
1023 | spin_lock(&vb->lock); | 1021 | spin_lock(&vb->lock); |
1024 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | 1022 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); |
1025 | while (i < VMAP_BBMAP_BITS) { | 1023 | if (i < VMAP_BBMAP_BITS) { |
1026 | unsigned long s, e; | 1024 | unsigned long s, e; |
1027 | int j; | 1025 | |
1028 | j = find_next_zero_bit(vb->dirty_map, | 1026 | j = find_last_bit(vb->dirty_map, |
1029 | VMAP_BBMAP_BITS, i); | 1027 | VMAP_BBMAP_BITS); |
1028 | j = j + 1; /* need exclusive index */ | ||
1030 | 1029 | ||
1031 | s = vb->va->va_start + (i << PAGE_SHIFT); | 1030 | s = vb->va->va_start + (i << PAGE_SHIFT); |
1032 | e = vb->va->va_start + (j << PAGE_SHIFT); | 1031 | e = vb->va->va_start + (j << PAGE_SHIFT); |
@@ -1036,10 +1035,6 @@ void vm_unmap_aliases(void) | |||
1036 | start = s; | 1035 | start = s; |
1037 | if (e > end) | 1036 | if (e > end) |
1038 | end = e; | 1037 | end = e; |
1039 | |||
1040 | i = j; | ||
1041 | i = find_next_bit(vb->dirty_map, | ||
1042 | VMAP_BBMAP_BITS, i); | ||
1043 | } | 1038 | } |
1044 | spin_unlock(&vb->lock); | 1039 | spin_unlock(&vb->lock); |
1045 | } | 1040 | } |
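vm_unmap_aliases() previously walked each run of dirty bits separately; the hunks above replace that with one conservative range from the first dirty bit to one past the last. A user-space sketch of that range computation, with a plain loop standing in for find_first_bit()/find_last_bit() and one byte per page instead of a real bitmap:

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

/* Compute a single flush range [s, e) covering every dirty page; returns 0
 * when nothing is dirty. */
static int dirty_flush_range(const unsigned char *dirty, int nbits,
                             unsigned long va_start,
                             unsigned long *s, unsigned long *e)
{
        int first = -1, last = -1;

        for (int i = 0; i < nbits; i++) {
                if (!dirty[i])
                        continue;
                if (first < 0)
                        first = i;       /* find_first_bit() */
                last = i;                /* find_last_bit() */
        }
        if (first < 0)
                return 0;

        *s = va_start + ((unsigned long)first << PAGE_SHIFT);
        *e = va_start + ((unsigned long)(last + 1) << PAGE_SHIFT); /* exclusive */
        return 1;
}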
@@ -1263,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
1263 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 1258 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) |
1264 | { | 1259 | { |
1265 | unsigned long addr = (unsigned long)area->addr; | 1260 | unsigned long addr = (unsigned long)area->addr; |
1266 | unsigned long end = addr + area->size - PAGE_SIZE; | 1261 | unsigned long end = addr + get_vm_area_size(area); |
1267 | int err; | 1262 | int err; |
1268 | 1263 | ||
1269 | err = vmap_page_range(addr, end, prot, *pages); | 1264 | err = vmap_page_range(addr, end, prot, *pages); |
@@ -1558,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1558 | unsigned int nr_pages, array_size, i; | 1553 | unsigned int nr_pages, array_size, i; |
1559 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | 1554 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; |
1560 | 1555 | ||
1561 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; | 1556 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; |
1562 | array_size = (nr_pages * sizeof(struct page *)); | 1557 | array_size = (nr_pages * sizeof(struct page *)); |
1563 | 1558 | ||
1564 | area->nr_pages = nr_pages; | 1559 | area->nr_pages = nr_pages; |
@@ -1990,7 +1985,7 @@ long vread(char *buf, char *addr, unsigned long count) | |||
1990 | 1985 | ||
1991 | vm = va->vm; | 1986 | vm = va->vm; |
1992 | vaddr = (char *) vm->addr; | 1987 | vaddr = (char *) vm->addr; |
1993 | if (addr >= vaddr + vm->size - PAGE_SIZE) | 1988 | if (addr >= vaddr + get_vm_area_size(vm)) |
1994 | continue; | 1989 | continue; |
1995 | while (addr < vaddr) { | 1990 | while (addr < vaddr) { |
1996 | if (count == 0) | 1991 | if (count == 0) |
@@ -2000,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count) | |||
2000 | addr++; | 1995 | addr++; |
2001 | count--; | 1996 | count--; |
2002 | } | 1997 | } |
2003 | n = vaddr + vm->size - PAGE_SIZE - addr; | 1998 | n = vaddr + get_vm_area_size(vm) - addr; |
2004 | if (n > count) | 1999 | if (n > count) |
2005 | n = count; | 2000 | n = count; |
2006 | if (!(vm->flags & VM_IOREMAP)) | 2001 | if (!(vm->flags & VM_IOREMAP)) |
@@ -2072,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
2072 | 2067 | ||
2073 | vm = va->vm; | 2068 | vm = va->vm; |
2074 | vaddr = (char *) vm->addr; | 2069 | vaddr = (char *) vm->addr; |
2075 | if (addr >= vaddr + vm->size - PAGE_SIZE) | 2070 | if (addr >= vaddr + get_vm_area_size(vm)) |
2076 | continue; | 2071 | continue; |
2077 | while (addr < vaddr) { | 2072 | while (addr < vaddr) { |
2078 | if (count == 0) | 2073 | if (count == 0) |
@@ -2081,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
2081 | addr++; | 2076 | addr++; |
2082 | count--; | 2077 | count--; |
2083 | } | 2078 | } |
2084 | n = vaddr + vm->size - PAGE_SIZE - addr; | 2079 | n = vaddr + get_vm_area_size(vm) - addr; |
2085 | if (n > count) | 2080 | if (n > count) |
2086 | n = count; | 2081 | n = count; |
2087 | if (!(vm->flags & VM_IOREMAP)) { | 2082 | if (!(vm->flags & VM_IOREMAP)) { |
diff --git a/mm/vmscan.c b/mm/vmscan.c index e36454220614..beb35778c69f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc) | |||
146 | } | 146 | } |
147 | #endif | 147 | #endif |
148 | 148 | ||
149 | unsigned long zone_reclaimable_pages(struct zone *zone) | ||
150 | { | ||
151 | int nr; | ||
152 | |||
153 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
154 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
155 | |||
156 | if (get_nr_swap_pages() > 0) | ||
157 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | ||
158 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
159 | |||
160 | return nr; | ||
161 | } | ||
162 | |||
163 | bool zone_reclaimable(struct zone *zone) | ||
164 | { | ||
165 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | ||
166 | } | ||
167 | |||
149 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 168 | static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
150 | { | 169 | { |
151 | if (!mem_cgroup_disabled()) | 170 | if (!mem_cgroup_disabled()) |
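With zone->all_unreclaimable removed, reclaimability is now computed on demand: a zone still counts as reclaimable while its scan counter stays below six times the pages that could plausibly be reclaimed (file LRU pages, plus anon only when swap space is available). A stand-alone sketch of that check with plain parameters in place of struct zone:

#include <stdbool.h>

static bool zone_reclaimable_sketch(unsigned long pages_scanned,
                                    unsigned long file_lru_pages,
                                    unsigned long anon_lru_pages,
                                    bool swap_available)
{
        unsigned long reclaimable = file_lru_pages;

        if (swap_available)                     /* anon is only reclaimable with swap */
                reclaimable += anon_lru_pages;

        return pages_scanned < reclaimable * 6; /* the "* 6" scan budget */
}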
@@ -579,7 +598,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
579 | */ | 598 | */ |
580 | void putback_lru_page(struct page *page) | 599 | void putback_lru_page(struct page *page) |
581 | { | 600 | { |
582 | int lru; | 601 | bool is_unevictable; |
583 | int was_unevictable = PageUnevictable(page); | 602 | int was_unevictable = PageUnevictable(page); |
584 | 603 | ||
585 | VM_BUG_ON(PageLRU(page)); | 604 | VM_BUG_ON(PageLRU(page)); |
@@ -594,14 +613,14 @@ redo: | |||
594 | * unevictable page on [in]active list. | 613 | * unevictable page on [in]active list. |
595 | * We know how to handle that. | 614 | * We know how to handle that. |
596 | */ | 615 | */ |
597 | lru = page_lru_base_type(page); | 616 | is_unevictable = false; |
598 | lru_cache_add(page); | 617 | lru_cache_add(page); |
599 | } else { | 618 | } else { |
600 | /* | 619 | /* |
601 | * Put unevictable pages directly on zone's unevictable | 620 | * Put unevictable pages directly on zone's unevictable |
602 | * list. | 621 | * list. |
603 | */ | 622 | */ |
604 | lru = LRU_UNEVICTABLE; | 623 | is_unevictable = true; |
605 | add_page_to_unevictable_list(page); | 624 | add_page_to_unevictable_list(page); |
606 | /* | 625 | /* |
607 | * When racing with an mlock or AS_UNEVICTABLE clearing | 626 | * When racing with an mlock or AS_UNEVICTABLE clearing |
@@ -621,7 +640,7 @@ redo: | |||
621 | * page is on unevictable list, it never be freed. To avoid that, | 640 | * page is on unevictable list, it never be freed. To avoid that, |
622 | * check after we added it to the list, again. | 641 | * check after we added it to the list, again. |
623 | */ | 642 | */ |
624 | if (lru == LRU_UNEVICTABLE && page_evictable(page)) { | 643 | if (is_unevictable && page_evictable(page)) { |
625 | if (!isolate_lru_page(page)) { | 644 | if (!isolate_lru_page(page)) { |
626 | put_page(page); | 645 | put_page(page); |
627 | goto redo; | 646 | goto redo; |
@@ -632,9 +651,9 @@ redo: | |||
632 | */ | 651 | */ |
633 | } | 652 | } |
634 | 653 | ||
635 | if (was_unevictable && lru != LRU_UNEVICTABLE) | 654 | if (was_unevictable && !is_unevictable) |
636 | count_vm_event(UNEVICTABLE_PGRESCUED); | 655 | count_vm_event(UNEVICTABLE_PGRESCUED); |
637 | else if (!was_unevictable && lru == LRU_UNEVICTABLE) | 656 | else if (!was_unevictable && is_unevictable) |
638 | count_vm_event(UNEVICTABLE_PGCULLED); | 657 | count_vm_event(UNEVICTABLE_PGCULLED); |
639 | 658 | ||
640 | put_page(page); /* drop ref from isolate */ | 659 | put_page(page); /* drop ref from isolate */ |
@@ -1823,7 +1842,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1823 | * latencies, so it's better to scan a minimum amount there as | 1842 | * latencies, so it's better to scan a minimum amount there as |
1824 | * well. | 1843 | * well. |
1825 | */ | 1844 | */ |
1826 | if (current_is_kswapd() && zone->all_unreclaimable) | 1845 | if (current_is_kswapd() && !zone_reclaimable(zone)) |
1827 | force_scan = true; | 1846 | force_scan = true; |
1828 | if (!global_reclaim(sc)) | 1847 | if (!global_reclaim(sc)) |
1829 | force_scan = true; | 1848 | force_scan = true; |
@@ -2278,8 +2297,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2278 | if (global_reclaim(sc)) { | 2297 | if (global_reclaim(sc)) { |
2279 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2298 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2280 | continue; | 2299 | continue; |
2281 | if (zone->all_unreclaimable && | 2300 | if (sc->priority != DEF_PRIORITY && |
2282 | sc->priority != DEF_PRIORITY) | 2301 | !zone_reclaimable(zone)) |
2283 | continue; /* Let kswapd poll it */ | 2302 | continue; /* Let kswapd poll it */ |
2284 | if (IS_ENABLED(CONFIG_COMPACTION)) { | 2303 | if (IS_ENABLED(CONFIG_COMPACTION)) { |
2285 | /* | 2304 | /* |
@@ -2317,11 +2336,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2317 | return aborted_reclaim; | 2336 | return aborted_reclaim; |
2318 | } | 2337 | } |
2319 | 2338 | ||
2320 | static bool zone_reclaimable(struct zone *zone) | ||
2321 | { | ||
2322 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | ||
2323 | } | ||
2324 | |||
2325 | /* All zones in zonelist are unreclaimable? */ | 2339 | /* All zones in zonelist are unreclaimable? */ |
2326 | static bool all_unreclaimable(struct zonelist *zonelist, | 2340 | static bool all_unreclaimable(struct zonelist *zonelist, |
2327 | struct scan_control *sc) | 2341 | struct scan_control *sc) |
@@ -2335,7 +2349,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, | |||
2335 | continue; | 2349 | continue; |
2336 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2350 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2337 | continue; | 2351 | continue; |
2338 | if (!zone->all_unreclaimable) | 2352 | if (zone_reclaimable(zone)) |
2339 | return false; | 2353 | return false; |
2340 | } | 2354 | } |
2341 | 2355 | ||
@@ -2750,7 +2764,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
2750 | * DEF_PRIORITY. Effectively, it considers them balanced so | 2764 | * DEF_PRIORITY. Effectively, it considers them balanced so |
2751 | * they must be considered balanced here as well! | 2765 | * they must be considered balanced here as well! |
2752 | */ | 2766 | */ |
2753 | if (zone->all_unreclaimable) { | 2767 | if (!zone_reclaimable(zone)) { |
2754 | balanced_pages += zone->managed_pages; | 2768 | balanced_pages += zone->managed_pages; |
2755 | continue; | 2769 | continue; |
2756 | } | 2770 | } |
@@ -2811,7 +2825,6 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2811 | unsigned long lru_pages, | 2825 | unsigned long lru_pages, |
2812 | unsigned long *nr_attempted) | 2826 | unsigned long *nr_attempted) |
2813 | { | 2827 | { |
2814 | unsigned long nr_slab; | ||
2815 | int testorder = sc->order; | 2828 | int testorder = sc->order; |
2816 | unsigned long balance_gap; | 2829 | unsigned long balance_gap; |
2817 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2830 | struct reclaim_state *reclaim_state = current->reclaim_state; |
@@ -2858,15 +2871,12 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2858 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); | 2871 | node_set(zone_to_nid(zone), shrink.nodes_to_scan); |
2859 | 2872 | ||
2860 | reclaim_state->reclaimed_slab = 0; | 2873 | reclaim_state->reclaimed_slab = 0; |
2861 | nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); | 2874 | shrink_slab(&shrink, sc->nr_scanned, lru_pages); |
2862 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | 2875 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
2863 | 2876 | ||
2864 | /* Account for the number of pages attempted to reclaim */ | 2877 | /* Account for the number of pages attempted to reclaim */ |
2865 | *nr_attempted += sc->nr_to_reclaim; | 2878 | *nr_attempted += sc->nr_to_reclaim; |
2866 | 2879 | ||
2867 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2868 | zone->all_unreclaimable = 1; | ||
2869 | |||
2870 | zone_clear_flag(zone, ZONE_WRITEBACK); | 2880 | zone_clear_flag(zone, ZONE_WRITEBACK); |
2871 | 2881 | ||
2872 | /* | 2882 | /* |
@@ -2875,7 +2885,7 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2875 | * BDIs but as pressure is relieved, speculatively avoid congestion | 2885 | * BDIs but as pressure is relieved, speculatively avoid congestion |
2876 | * waits. | 2886 | * waits. |
2877 | */ | 2887 | */ |
2878 | if (!zone->all_unreclaimable && | 2888 | if (zone_reclaimable(zone) && |
2879 | zone_balanced(zone, testorder, 0, classzone_idx)) { | 2889 | zone_balanced(zone, testorder, 0, classzone_idx)) { |
2880 | zone_clear_flag(zone, ZONE_CONGESTED); | 2890 | zone_clear_flag(zone, ZONE_CONGESTED); |
2881 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | 2891 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); |
@@ -2941,8 +2951,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2941 | if (!populated_zone(zone)) | 2951 | if (!populated_zone(zone)) |
2942 | continue; | 2952 | continue; |
2943 | 2953 | ||
2944 | if (zone->all_unreclaimable && | 2954 | if (sc.priority != DEF_PRIORITY && |
2945 | sc.priority != DEF_PRIORITY) | 2955 | !zone_reclaimable(zone)) |
2946 | continue; | 2956 | continue; |
2947 | 2957 | ||
2948 | /* | 2958 | /* |
@@ -3020,8 +3030,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3020 | if (!populated_zone(zone)) | 3030 | if (!populated_zone(zone)) |
3021 | continue; | 3031 | continue; |
3022 | 3032 | ||
3023 | if (zone->all_unreclaimable && | 3033 | if (sc.priority != DEF_PRIORITY && |
3024 | sc.priority != DEF_PRIORITY) | 3034 | !zone_reclaimable(zone)) |
3025 | continue; | 3035 | continue; |
3026 | 3036 | ||
3027 | sc.nr_scanned = 0; | 3037 | sc.nr_scanned = 0; |
@@ -3277,7 +3287,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | |||
3277 | } | 3287 | } |
3278 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 3288 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
3279 | return; | 3289 | return; |
3280 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | 3290 | if (zone_balanced(zone, order, 0, 0)) |
3281 | return; | 3291 | return; |
3282 | 3292 | ||
3283 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | 3293 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); |
@@ -3305,20 +3315,6 @@ unsigned long global_reclaimable_pages(void) | |||
3305 | return nr; | 3315 | return nr; |
3306 | } | 3316 | } |
3307 | 3317 | ||
3308 | unsigned long zone_reclaimable_pages(struct zone *zone) | ||
3309 | { | ||
3310 | int nr; | ||
3311 | |||
3312 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
3313 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
3314 | |||
3315 | if (get_nr_swap_pages() > 0) | ||
3316 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | ||
3317 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
3318 | |||
3319 | return nr; | ||
3320 | } | ||
3321 | |||
3322 | #ifdef CONFIG_HIBERNATION | 3318 | #ifdef CONFIG_HIBERNATION |
3323 | /* | 3319 | /* |
3324 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of | 3320 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
@@ -3615,7 +3611,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3615 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 3611 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
3616 | return ZONE_RECLAIM_FULL; | 3612 | return ZONE_RECLAIM_FULL; |
3617 | 3613 | ||
3618 | if (zone->all_unreclaimable) | 3614 | if (!zone_reclaimable(zone)) |
3619 | return ZONE_RECLAIM_FULL; | 3615 | return ZONE_RECLAIM_FULL; |
3620 | 3616 | ||
3621 | /* | 3617 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c2ef4458fa..9bb314577911 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -19,6 +19,9 @@ | |||
19 | #include <linux/math64.h> | 19 | #include <linux/math64.h> |
20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
21 | #include <linux/compaction.h> | 21 | #include <linux/compaction.h> |
22 | #include <linux/mm_inline.h> | ||
23 | |||
24 | #include "internal.h" | ||
22 | 25 | ||
23 | #ifdef CONFIG_VM_EVENT_COUNTERS | 26 | #ifdef CONFIG_VM_EVENT_COUNTERS |
24 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | 27 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; |
@@ -414,12 +417,17 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
414 | EXPORT_SYMBOL(dec_zone_page_state); | 417 | EXPORT_SYMBOL(dec_zone_page_state); |
415 | #endif | 418 | #endif |
416 | 419 | ||
420 | static inline void fold_diff(int *diff) | ||
421 | { | ||
422 | int i; | ||
423 | |||
424 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
425 | if (diff[i]) | ||
426 | atomic_long_add(diff[i], &vm_stat[i]); | ||
427 | } | ||
428 | |||
417 | /* | 429 | /* |
418 | * Update the zone counters for one cpu. | 430 | * Update the zone counters for the current cpu. |
419 | * | ||
420 | * The cpu specified must be either the current cpu or a processor that | ||
421 | * is not online. If it is the current cpu then the execution thread must | ||
422 | * be pinned to the current cpu. | ||
423 | * | 431 | * |
424 | * Note that refresh_cpu_vm_stats strives to only access | 432 | * Note that refresh_cpu_vm_stats strives to only access |
425 | * node local memory. The per cpu pagesets on remote zones are placed | 433 | * node local memory. The per cpu pagesets on remote zones are placed |
@@ -432,33 +440,29 @@ EXPORT_SYMBOL(dec_zone_page_state); | |||
432 | * with the global counters. These could cause remote node cache line | 440 | * with the global counters. These could cause remote node cache line |
433 | * bouncing and will have to be only done when necessary. | 441 | * bouncing and will have to be only done when necessary. |
434 | */ | 442 | */ |
435 | void refresh_cpu_vm_stats(int cpu) | 443 | static void refresh_cpu_vm_stats(void) |
436 | { | 444 | { |
437 | struct zone *zone; | 445 | struct zone *zone; |
438 | int i; | 446 | int i; |
439 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; | 447 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
440 | 448 | ||
441 | for_each_populated_zone(zone) { | 449 | for_each_populated_zone(zone) { |
442 | struct per_cpu_pageset *p; | 450 | struct per_cpu_pageset __percpu *p = zone->pageset; |
443 | 451 | ||
444 | p = per_cpu_ptr(zone->pageset, cpu); | 452 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { |
453 | int v; | ||
445 | 454 | ||
446 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 455 | v = this_cpu_xchg(p->vm_stat_diff[i], 0); |
447 | if (p->vm_stat_diff[i]) { | 456 | if (v) { |
448 | unsigned long flags; | ||
449 | int v; | ||
450 | 457 | ||
451 | local_irq_save(flags); | ||
452 | v = p->vm_stat_diff[i]; | ||
453 | p->vm_stat_diff[i] = 0; | ||
454 | local_irq_restore(flags); | ||
455 | atomic_long_add(v, &zone->vm_stat[i]); | 458 | atomic_long_add(v, &zone->vm_stat[i]); |
456 | global_diff[i] += v; | 459 | global_diff[i] += v; |
457 | #ifdef CONFIG_NUMA | 460 | #ifdef CONFIG_NUMA |
458 | /* 3 seconds idle till flush */ | 461 | /* 3 seconds idle till flush */ |
459 | p->expire = 3; | 462 | __this_cpu_write(p->expire, 3); |
460 | #endif | 463 | #endif |
461 | } | 464 | } |
465 | } | ||
462 | cond_resched(); | 466 | cond_resched(); |
463 | #ifdef CONFIG_NUMA | 467 | #ifdef CONFIG_NUMA |
464 | /* | 468 | /* |
@@ -468,29 +472,57 @@ void refresh_cpu_vm_stats(int cpu) | |||
468 | * Check if there are pages remaining in this pageset | 472 | * Check if there are pages remaining in this pageset |
469 | * if not then there is nothing to expire. | 473 | * if not then there is nothing to expire. |
470 | */ | 474 | */ |
471 | if (!p->expire || !p->pcp.count) | 475 | if (!__this_cpu_read(p->expire) || |
476 | !__this_cpu_read(p->pcp.count)) | ||
472 | continue; | 477 | continue; |
473 | 478 | ||
474 | /* | 479 | /* |
475 | * We never drain zones local to this processor. | 480 | * We never drain zones local to this processor. |
476 | */ | 481 | */ |
477 | if (zone_to_nid(zone) == numa_node_id()) { | 482 | if (zone_to_nid(zone) == numa_node_id()) { |
478 | p->expire = 0; | 483 | __this_cpu_write(p->expire, 0); |
479 | continue; | 484 | continue; |
480 | } | 485 | } |
481 | 486 | ||
482 | p->expire--; | 487 | |
483 | if (p->expire) | 488 | if (__this_cpu_dec_return(p->expire)) |
484 | continue; | 489 | continue; |
485 | 490 | ||
486 | if (p->pcp.count) | 491 | if (__this_cpu_read(p->pcp.count)) |
487 | drain_zone_pages(zone, &p->pcp); | 492 | drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); |
488 | #endif | 493 | #endif |
489 | } | 494 | } |
495 | fold_diff(global_diff); | ||
496 | } | ||
490 | 497 | ||
491 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 498 | /* |
492 | if (global_diff[i]) | 499 | * Fold the data for an offline cpu into the global array. |
493 | atomic_long_add(global_diff[i], &vm_stat[i]); | 500 | * There cannot be any access by the offline cpu and therefore |
501 | * synchronization is simplified. | ||
502 | */ | ||
503 | void cpu_vm_stats_fold(int cpu) | ||
504 | { | ||
505 | struct zone *zone; | ||
506 | int i; | ||
507 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; | ||
508 | |||
509 | for_each_populated_zone(zone) { | ||
510 | struct per_cpu_pageset *p; | ||
511 | |||
512 | p = per_cpu_ptr(zone->pageset, cpu); | ||
513 | |||
514 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
515 | if (p->vm_stat_diff[i]) { | ||
516 | int v; | ||
517 | |||
518 | v = p->vm_stat_diff[i]; | ||
519 | p->vm_stat_diff[i] = 0; | ||
520 | atomic_long_add(v, &zone->vm_stat[i]); | ||
521 | global_diff[i] += v; | ||
522 | } | ||
523 | } | ||
524 | |||
525 | fold_diff(global_diff); | ||
494 | } | 526 | } |
495 | 527 | ||
496 | /* | 528 | /* |
@@ -703,6 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
703 | const char * const vmstat_text[] = { | 735 | const char * const vmstat_text[] = { |
704 | /* Zoned VM counters */ | 736 | /* Zoned VM counters */ |
705 | "nr_free_pages", | 737 | "nr_free_pages", |
738 | "nr_alloc_batch", | ||
706 | "nr_inactive_anon", | 739 | "nr_inactive_anon", |
707 | "nr_active_anon", | 740 | "nr_active_anon", |
708 | "nr_inactive_file", | 741 | "nr_inactive_file", |
@@ -817,6 +850,12 @@ const char * const vmstat_text[] = { | |||
817 | "thp_zero_page_alloc", | 850 | "thp_zero_page_alloc", |
818 | "thp_zero_page_alloc_failed", | 851 | "thp_zero_page_alloc_failed", |
819 | #endif | 852 | #endif |
853 | #ifdef CONFIG_SMP | ||
854 | "nr_tlb_remote_flush", | ||
855 | "nr_tlb_remote_flush_received", | ||
856 | #endif | ||
857 | "nr_tlb_local_flush_all", | ||
858 | "nr_tlb_local_flush_one", | ||
820 | 859 | ||
821 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 860 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
822 | }; | 861 | }; |
@@ -1052,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1052 | "\n all_unreclaimable: %u" | 1091 | "\n all_unreclaimable: %u" |
1053 | "\n start_pfn: %lu" | 1092 | "\n start_pfn: %lu" |
1054 | "\n inactive_ratio: %u", | 1093 | "\n inactive_ratio: %u", |
1055 | zone->all_unreclaimable, | 1094 | !zone_reclaimable(zone), |
1056 | zone->zone_start_pfn, | 1095 | zone->zone_start_pfn, |
1057 | zone->inactive_ratio); | 1096 | zone->inactive_ratio); |
1058 | seq_putc(m, '\n'); | 1097 | seq_putc(m, '\n'); |
@@ -1177,7 +1216,7 @@ int sysctl_stat_interval __read_mostly = HZ; | |||
1177 | 1216 | ||
1178 | static void vmstat_update(struct work_struct *w) | 1217 | static void vmstat_update(struct work_struct *w) |
1179 | { | 1218 | { |
1180 | refresh_cpu_vm_stats(smp_processor_id()); | 1219 | refresh_cpu_vm_stats(); |
1181 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | 1220 | schedule_delayed_work(&__get_cpu_var(vmstat_work), |
1182 | round_jiffies_relative(sysctl_stat_interval)); | 1221 | round_jiffies_relative(sysctl_stat_interval)); |
1183 | } | 1222 | } |
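The vmstat rework replaces the irq-disable/read/clear sequence with a single this_cpu_xchg() per counter and shares the final global fold through fold_diff(). A user-space approximation of that take-and-clear-then-fold pattern, with C11 atomics standing in for the per-CPU primitives:

#include <stdatomic.h>

#define NR_ITEMS 4

static _Atomic long vm_stat[NR_ITEMS];          /* global counters */

static void fold_cpu_diffs(_Atomic int *per_cpu_diff)
{
        long global_diff[NR_ITEMS] = { 0 };

        for (int i = 0; i < NR_ITEMS; i++) {
                int v = atomic_exchange(&per_cpu_diff[i], 0);   /* take & clear */
                if (v)
                        global_diff[i] += v;
        }
        for (int i = 0; i < NR_ITEMS; i++)      /* the fold_diff() step */
                if (global_diff[i])
                        atomic_fetch_add(&vm_stat[i], global_diff[i]);
}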
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -16,7 +16,7 @@ | |||
16 | * | 16 | * |
17 | * zbud works by storing compressed pages, or "zpages", together in pairs in a | 17 | * zbud works by storing compressed pages, or "zpages", together in pairs in a |
18 | * single memory page called a "zbud page". The first buddy is "left | 18 | * single memory page called a "zbud page". The first buddy is "left |
19 | * justifed" at the beginning of the zbud page, and the last buddy is "right | 19 | * justified" at the beginning of the zbud page, and the last buddy is "right |
20 | * justified" at the end of the zbud page. The benefit is that if either | 20 | * justified" at the end of the zbud page. The benefit is that if either |
21 | * buddy is freed, the freed buddy space, coalesced with whatever slack space | 21 | * buddy is freed, the freed buddy space, coalesced with whatever slack space |
22 | * that existed between the buddies, results in the largest possible free region | 22 | * that existed between the buddies, results in the largest possible free region |
@@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) | |||
243 | * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used | 243 | * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used |
244 | * as zbud pool pages. | 244 | * as zbud pool pages. |
245 | * | 245 | * |
246 | * Return: 0 if success and handle is set, otherwise -EINVAL is the size or | 246 | * Return: 0 if success and handle is set, otherwise -EINVAL if the size or |
247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate | 247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate |
248 | * a new page. | 248 | * a new page. |
249 | */ | 249 | */ |
diff --git a/mm/zswap.c b/mm/zswap.c index deda2b671e12..841e35f1db22 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, | |||
409 | struct page **retpage) | 409 | struct page **retpage) |
410 | { | 410 | { |
411 | struct page *found_page, *new_page = NULL; | 411 | struct page *found_page, *new_page = NULL; |
412 | struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; | 412 | struct address_space *swapper_space = swap_address_space(entry); |
413 | int err; | 413 | int err; |
414 | 414 | ||
415 | *retpage = NULL; | 415 | *retpage = NULL; |
@@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) | |||
790 | static void zswap_frontswap_invalidate_area(unsigned type) | 790 | static void zswap_frontswap_invalidate_area(unsigned type) |
791 | { | 791 | { |
792 | struct zswap_tree *tree = zswap_trees[type]; | 792 | struct zswap_tree *tree = zswap_trees[type]; |
793 | struct rb_node *node; | 793 | struct zswap_entry *entry, *n; |
794 | struct zswap_entry *entry; | ||
795 | 794 | ||
796 | if (!tree) | 795 | if (!tree) |
797 | return; | 796 | return; |
798 | 797 | ||
799 | /* walk the tree and free everything */ | 798 | /* walk the tree and free everything */ |
800 | spin_lock(&tree->lock); | 799 | spin_lock(&tree->lock); |
801 | /* | 800 | rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { |
802 | * TODO: Even though this code should not be executed because | ||
803 | * the try_to_unuse() in swapoff should have emptied the tree, | ||
804 | * it is very wasteful to rebalance the tree after every | ||
805 | * removal when we are freeing the whole tree. | ||
806 | * | ||
807 | * If post-order traversal code is ever added to the rbtree | ||
808 | * implementation, it should be used here. | ||
809 | */ | ||
810 | while ((node = rb_first(&tree->rbroot))) { | ||
811 | entry = rb_entry(node, struct zswap_entry, rbnode); | ||
812 | rb_erase(&entry->rbnode, &tree->rbroot); | ||
813 | zbud_free(tree->pool, entry->handle); | 801 | zbud_free(tree->pool, entry->handle); |
814 | zswap_entry_cache_free(entry); | 802 | zswap_entry_cache_free(entry); |
815 | atomic_dec(&zswap_stored_pages); | 803 | atomic_dec(&zswap_stored_pages); |