Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c        | 29
-rw-r--r--  mm/hugetlb.c        |  8
-rw-r--r--  mm/internal.h       |  2
-rw-r--r--  mm/memcontrol.c     | 63
-rw-r--r--  mm/memory_hotplug.c |  2
-rw-r--r--  mm/mempolicy.c      |  2
-rw-r--r--  mm/migrate.c        | 48
-rw-r--r--  mm/mmap.c           | 16
-rw-r--r--  mm/mmzone.c         | 21
-rw-r--r--  mm/mprotect.c       |  2
-rw-r--r--  mm/nommu.c          |  1
-rw-r--r--  mm/page_alloc.c     | 56
-rw-r--r--  mm/vmalloc.c        |  9
-rw-r--r--  mm/vmscan.c         | 23
-rw-r--r--  mm/vmstat.c         | 68
15 files changed, 220 insertions, 130 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 3d4df44e4221..9701a501f769 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -631,7 +631,9 @@ repeat: | |||
631 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); | 631 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
632 | if (pagep) { | 632 | if (pagep) { |
633 | page = radix_tree_deref_slot(pagep); | 633 | page = radix_tree_deref_slot(pagep); |
634 | if (unlikely(!page || page == RADIX_TREE_RETRY)) | 634 | if (unlikely(!page)) |
635 | goto out; | ||
636 | if (radix_tree_deref_retry(page)) | ||
635 | goto repeat; | 637 | goto repeat; |
636 | 638 | ||
637 | if (!page_cache_get_speculative(page)) | 639 | if (!page_cache_get_speculative(page)) |
@@ -647,6 +649,7 @@ repeat: | |||
647 | goto repeat; | 649 | goto repeat; |
648 | } | 650 | } |
649 | } | 651 | } |
652 | out: | ||
650 | rcu_read_unlock(); | 653 | rcu_read_unlock(); |
651 | 654 | ||
652 | return page; | 655 | return page; |
@@ -764,12 +767,11 @@ repeat: | |||
764 | page = radix_tree_deref_slot((void **)pages[i]); | 767 | page = radix_tree_deref_slot((void **)pages[i]); |
765 | if (unlikely(!page)) | 768 | if (unlikely(!page)) |
766 | continue; | 769 | continue; |
767 | /* | 770 | if (radix_tree_deref_retry(page)) { |
768 | * this can only trigger if nr_found == 1, making livelock | 771 | if (ret) |
769 | * a non issue. | 772 | start = pages[ret-1]->index; |
770 | */ | ||
771 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
772 | goto restart; | 773 | goto restart; |
774 | } | ||
773 | 775 | ||
774 | if (!page_cache_get_speculative(page)) | 776 | if (!page_cache_get_speculative(page)) |
775 | goto repeat; | 777 | goto repeat; |
@@ -817,11 +819,7 @@ repeat: | |||
817 | page = radix_tree_deref_slot((void **)pages[i]); | 819 | page = radix_tree_deref_slot((void **)pages[i]); |
818 | if (unlikely(!page)) | 820 | if (unlikely(!page)) |
819 | continue; | 821 | continue; |
820 | /* | 822 | if (radix_tree_deref_retry(page)) |
821 | * this can only trigger if nr_found == 1, making livelock | ||
822 | * a non issue. | ||
823 | */ | ||
824 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
825 | goto restart; | 823 | goto restart; |
826 | 824 | ||
827 | if (page->mapping == NULL || page->index != index) | 825 | if (page->mapping == NULL || page->index != index) |
@@ -874,11 +872,7 @@ repeat: | |||
874 | page = radix_tree_deref_slot((void **)pages[i]); | 872 | page = radix_tree_deref_slot((void **)pages[i]); |
875 | if (unlikely(!page)) | 873 | if (unlikely(!page)) |
876 | continue; | 874 | continue; |
877 | /* | 875 | if (radix_tree_deref_retry(page)) |
878 | * this can only trigger if nr_found == 1, making livelock | ||
879 | * a non issue. | ||
880 | */ | ||
881 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
882 | goto restart; | 876 | goto restart; |
883 | 877 | ||
884 | if (!page_cache_get_speculative(page)) | 878 | if (!page_cache_get_speculative(page)) |
@@ -1016,6 +1010,9 @@ find_page: | |||
1016 | goto page_not_up_to_date; | 1010 | goto page_not_up_to_date; |
1017 | if (!trylock_page(page)) | 1011 | if (!trylock_page(page)) |
1018 | goto page_not_up_to_date; | 1012 | goto page_not_up_to_date; |
1013 | /* Did it get truncated before we got the lock? */ | ||
1014 | if (!page->mapping) | ||
1015 | goto page_not_up_to_date_locked; | ||
1019 | if (!mapping->a_ops->is_partially_uptodate(page, | 1016 | if (!mapping->a_ops->is_partially_uptodate(page, |
1020 | desc, offset)) | 1017 | desc, offset)) |
1021 | goto page_not_up_to_date_locked; | 1018 | goto page_not_up_to_date_locked; |
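The filemap.c hunks replace open-coded RADIX_TREE_RETRY comparisons with the radix_tree_deref_retry() helper. Below is a minimal sketch of the RCU lookup pattern the new helper supports, simplified from find_get_page() in this diff; it assumes the kernel-internal radix-tree and page-cache APIs of this era and is not a drop-in copy of the function.

/*
 * Sketch of an RCU-safe page-cache lookup using radix_tree_deref_retry().
 * Simplified from the find_get_page() hunk above.
 */
static struct page *lookup_page_sketch(struct address_space *mapping,
				       pgoff_t offset)
{
	void **slot;
	struct page *page;

	rcu_read_lock();
repeat:
	page = NULL;
	slot = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (slot) {
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			goto out;
		/* Slot was moved by a concurrent tree restructure: retry. */
		if (radix_tree_deref_retry(page))
			goto repeat;
		if (!page_cache_get_speculative(page))
			goto repeat;
		/* Re-check: the page may have been freed and reused. */
		if (unlikely(page != *slot)) {
			page_cache_release(page);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();
	return page;
}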
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c03273807182..2697806746d0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -2380,8 +2380,11 @@ retry_avoidcopy: | |||
2380 | * When the original hugepage is shared one, it does not have | 2380 | * When the original hugepage is shared one, it does not have |
2381 | * anon_vma prepared. | 2381 | * anon_vma prepared. |
2382 | */ | 2382 | */ |
2383 | if (unlikely(anon_vma_prepare(vma))) | 2383 | if (unlikely(anon_vma_prepare(vma))) { |
2384 | /* Caller expects lock to be held */ | ||
2385 | spin_lock(&mm->page_table_lock); | ||
2384 | return VM_FAULT_OOM; | 2386 | return VM_FAULT_OOM; |
2387 | } | ||
2385 | 2388 | ||
2386 | copy_huge_page(new_page, old_page, address, vma); | 2389 | copy_huge_page(new_page, old_page, address, vma); |
2387 | __SetPageUptodate(new_page); | 2390 | __SetPageUptodate(new_page); |
@@ -2665,7 +2668,8 @@ out_page_table_lock: | |||
2665 | unlock_page(pagecache_page); | 2668 | unlock_page(pagecache_page); |
2666 | put_page(pagecache_page); | 2669 | put_page(pagecache_page); |
2667 | } | 2670 | } |
2668 | unlock_page(page); | 2671 | if (page != pagecache_page) |
2672 | unlock_page(page); | ||
2669 | 2673 | ||
2670 | out_mutex: | 2674 | out_mutex: |
2671 | mutex_unlock(&hugetlb_instantiation_mutex); | 2675 | mutex_unlock(&hugetlb_instantiation_mutex); |
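The hugetlb.c fix re-takes page_table_lock on the early OOM return because hugetlb_cow() is entered with that lock held and every return path must hold it again. A hedged outline of the convention follows; the signature and surrounding steps are simplified from the era's hugetlb_cow() and are illustrative only.

/*
 * Sketch of the locking convention the fix preserves: entered with
 * mm->page_table_lock held, the lock is dropped around the sleeping
 * allocation/copy and must be held again on every return.
 */
static int hugetlb_cow_sketch(struct mm_struct *mm, struct vm_area_struct *vma)
{
	/* entered with page_table_lock held */
	spin_unlock(&mm->page_table_lock);	/* may sleep below */

	if (unlikely(anon_vma_prepare(vma))) {
		/* Caller expects the lock to be held on return. */
		spin_lock(&mm->page_table_lock);
		return VM_FAULT_OOM;
	}

	/* ... allocate the new huge page and copy the old one ... */

	spin_lock(&mm->page_table_lock);	/* normal path re-takes it too */
	return 0;
}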
diff --git a/mm/internal.h b/mm/internal.h index 6a697bb97fc5..dedb0aff673f 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page); | |||
62 | */ | 62 | */ |
63 | static inline unsigned long page_order(struct page *page) | 63 | static inline unsigned long page_order(struct page *page) |
64 | { | 64 | { |
65 | VM_BUG_ON(!PageBuddy(page)); | 65 | /* PageBuddy() must be checked by the caller */ |
66 | return page_private(page); | 66 | return page_private(page); |
67 | } | 67 | } |
68 | 68 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9be3cf8a5da4..a9a534a38ac0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -269,13 +269,14 @@ enum move_type { | |||
269 | 269 | ||
270 | /* "mc" and its members are protected by cgroup_mutex */ | 270 | /* "mc" and its members are protected by cgroup_mutex */ |
271 | static struct move_charge_struct { | 271 | static struct move_charge_struct { |
272 | spinlock_t lock; /* for from, to, moving_task */ | 272 | spinlock_t lock; /* for from, to */ |
273 | struct mem_cgroup *from; | 273 | struct mem_cgroup *from; |
274 | struct mem_cgroup *to; | 274 | struct mem_cgroup *to; |
275 | unsigned long precharge; | 275 | unsigned long precharge; |
276 | unsigned long moved_charge; | 276 | unsigned long moved_charge; |
277 | unsigned long moved_swap; | 277 | unsigned long moved_swap; |
278 | struct task_struct *moving_task; /* a task moving charges */ | 278 | struct task_struct *moving_task; /* a task moving charges */ |
279 | struct mm_struct *mm; | ||
279 | wait_queue_head_t waitq; /* a waitq for other context */ | 280 | wait_queue_head_t waitq; /* a waitq for other context */ |
280 | } mc = { | 281 | } mc = { |
281 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | 282 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
@@ -1646,6 +1647,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1646 | if (likely(!ret)) | 1647 | if (likely(!ret)) |
1647 | return CHARGE_OK; | 1648 | return CHARGE_OK; |
1648 | 1649 | ||
1650 | res_counter_uncharge(&mem->res, csize); | ||
1649 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 1651 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); |
1650 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1652 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1651 | } else | 1653 | } else |
@@ -1729,19 +1731,18 @@ again: | |||
1729 | 1731 | ||
1730 | rcu_read_lock(); | 1732 | rcu_read_lock(); |
1731 | p = rcu_dereference(mm->owner); | 1733 | p = rcu_dereference(mm->owner); |
1732 | VM_BUG_ON(!p); | ||
1733 | /* | 1734 | /* |
1734 | * because we don't have task_lock(), "p" can exit while | 1735 | * Because we don't have task_lock(), "p" can exit. |
1735 | * we're here. In that case, "mem" can point to root | 1736 | * In that case, "mem" can point to root or p can be NULL with |
1736 | * cgroup but never be NULL. (and task_struct itself is freed | 1737 | * race with swapoff. Then, we have small risk of mis-accounting. |
1737 | * by RCU, cgroup itself is RCU safe.) Then, we have small | 1738 | * But such kind of mis-account by race always happens because |
1738 | * risk here to get wrong cgroup. But such kind of mis-account | 1739 | * we don't have cgroup_mutex(). It's overkill and we allo that |
1739 | * by race always happens because we don't have cgroup_mutex(). | 1740 | * small race, here. |
1740 | * It's overkill and we allow that small race, here. | 1742 | * (*) swapoff et al. will charge against mm-struct not against |
1742 | * task-struct. So, mm->owner can be NULL. | ||
1741 | */ | 1743 | */ |
1742 | mem = mem_cgroup_from_task(p); | 1744 | mem = mem_cgroup_from_task(p); |
1743 | VM_BUG_ON(!mem); | 1745 | if (!mem || mem_cgroup_is_root(mem)) { |
1744 | if (mem_cgroup_is_root(mem)) { | ||
1745 | rcu_read_unlock(); | 1746 | rcu_read_unlock(); |
1746 | goto done; | 1747 | goto done; |
1747 | } | 1748 | } |
@@ -4445,7 +4446,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4445 | unsigned long precharge; | 4446 | unsigned long precharge; |
4446 | struct vm_area_struct *vma; | 4447 | struct vm_area_struct *vma; |
4447 | 4448 | ||
4448 | down_read(&mm->mmap_sem); | 4449 | /* We've already held the mmap_sem */ |
4449 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4450 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4450 | struct mm_walk mem_cgroup_count_precharge_walk = { | 4451 | struct mm_walk mem_cgroup_count_precharge_walk = { |
4451 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | 4452 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
@@ -4457,7 +4458,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4457 | walk_page_range(vma->vm_start, vma->vm_end, | 4458 | walk_page_range(vma->vm_start, vma->vm_end, |
4458 | &mem_cgroup_count_precharge_walk); | 4459 | &mem_cgroup_count_precharge_walk); |
4459 | } | 4460 | } |
4460 | up_read(&mm->mmap_sem); | ||
4461 | 4461 | ||
4462 | precharge = mc.precharge; | 4462 | precharge = mc.precharge; |
4463 | mc.precharge = 0; | 4463 | mc.precharge = 0; |
@@ -4508,11 +4508,16 @@ static void mem_cgroup_clear_mc(void) | |||
4508 | 4508 | ||
4509 | mc.moved_swap = 0; | 4509 | mc.moved_swap = 0; |
4510 | } | 4510 | } |
4511 | if (mc.mm) { | ||
4512 | up_read(&mc.mm->mmap_sem); | ||
4513 | mmput(mc.mm); | ||
4514 | } | ||
4511 | spin_lock(&mc.lock); | 4515 | spin_lock(&mc.lock); |
4512 | mc.from = NULL; | 4516 | mc.from = NULL; |
4513 | mc.to = NULL; | 4517 | mc.to = NULL; |
4514 | mc.moving_task = NULL; | ||
4515 | spin_unlock(&mc.lock); | 4518 | spin_unlock(&mc.lock); |
4519 | mc.moving_task = NULL; | ||
4520 | mc.mm = NULL; | ||
4516 | memcg_oom_recover(from); | 4521 | memcg_oom_recover(from); |
4517 | memcg_oom_recover(to); | 4522 | memcg_oom_recover(to); |
4518 | wake_up_all(&mc.waitq); | 4523 | wake_up_all(&mc.waitq); |
@@ -4537,26 +4542,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
4537 | return 0; | 4542 | return 0; |
4538 | /* We move charges only when we move a owner of the mm */ | 4543 | /* We move charges only when we move a owner of the mm */ |
4539 | if (mm->owner == p) { | 4544 | if (mm->owner == p) { |
4545 | /* | ||
4546 | * We do all the move charge works under one mmap_sem to | ||
4547 | * avoid deadlock with down_write(&mmap_sem) | ||
4548 | * -> try_charge() -> if (mc.moving_task) -> sleep. | ||
4549 | */ | ||
4550 | down_read(&mm->mmap_sem); | ||
4551 | |||
4540 | VM_BUG_ON(mc.from); | 4552 | VM_BUG_ON(mc.from); |
4541 | VM_BUG_ON(mc.to); | 4553 | VM_BUG_ON(mc.to); |
4542 | VM_BUG_ON(mc.precharge); | 4554 | VM_BUG_ON(mc.precharge); |
4543 | VM_BUG_ON(mc.moved_charge); | 4555 | VM_BUG_ON(mc.moved_charge); |
4544 | VM_BUG_ON(mc.moved_swap); | 4556 | VM_BUG_ON(mc.moved_swap); |
4545 | VM_BUG_ON(mc.moving_task); | 4557 | VM_BUG_ON(mc.moving_task); |
4558 | VM_BUG_ON(mc.mm); | ||
4559 | |||
4546 | spin_lock(&mc.lock); | 4560 | spin_lock(&mc.lock); |
4547 | mc.from = from; | 4561 | mc.from = from; |
4548 | mc.to = mem; | 4562 | mc.to = mem; |
4549 | mc.precharge = 0; | 4563 | mc.precharge = 0; |
4550 | mc.moved_charge = 0; | 4564 | mc.moved_charge = 0; |
4551 | mc.moved_swap = 0; | 4565 | mc.moved_swap = 0; |
4552 | mc.moving_task = current; | ||
4553 | spin_unlock(&mc.lock); | 4566 | spin_unlock(&mc.lock); |
4567 | mc.moving_task = current; | ||
4568 | mc.mm = mm; | ||
4554 | 4569 | ||
4555 | ret = mem_cgroup_precharge_mc(mm); | 4570 | ret = mem_cgroup_precharge_mc(mm); |
4556 | if (ret) | 4571 | if (ret) |
4557 | mem_cgroup_clear_mc(); | 4572 | mem_cgroup_clear_mc(); |
4558 | } | 4573 | /* We call up_read() and mmput() in clear_mc(). */ |
4559 | mmput(mm); | 4574 | } else |
4575 | mmput(mm); | ||
4560 | } | 4576 | } |
4561 | return ret; | 4577 | return ret; |
4562 | } | 4578 | } |
@@ -4644,7 +4660,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4644 | struct vm_area_struct *vma; | 4660 | struct vm_area_struct *vma; |
4645 | 4661 | ||
4646 | lru_add_drain_all(); | 4662 | lru_add_drain_all(); |
4647 | down_read(&mm->mmap_sem); | 4663 | /* We've already held the mmap_sem */ |
4648 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4664 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
4649 | int ret; | 4665 | int ret; |
4650 | struct mm_walk mem_cgroup_move_charge_walk = { | 4666 | struct mm_walk mem_cgroup_move_charge_walk = { |
@@ -4663,7 +4679,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4663 | */ | 4679 | */ |
4664 | break; | 4680 | break; |
4665 | } | 4681 | } |
4666 | up_read(&mm->mmap_sem); | ||
4667 | } | 4682 | } |
4668 | 4683 | ||
4669 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4684 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
@@ -4672,17 +4687,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
4672 | struct task_struct *p, | 4687 | struct task_struct *p, |
4673 | bool threadgroup) | 4688 | bool threadgroup) |
4674 | { | 4689 | { |
4675 | struct mm_struct *mm; | 4690 | if (!mc.mm) |
4676 | |||
4677 | if (!mc.to) | ||
4678 | /* no need to move charge */ | 4691 | /* no need to move charge */ |
4679 | return; | 4692 | return; |
4680 | 4693 | ||
4681 | mm = get_task_mm(p); | 4694 | mem_cgroup_move_charge(mc.mm); |
4682 | if (mm) { | ||
4683 | mem_cgroup_move_charge(mm); | ||
4684 | mmput(mm); | ||
4685 | } | ||
4686 | mem_cgroup_clear_mc(); | 4695 | mem_cgroup_clear_mc(); |
4687 | } | 4696 | } |
4688 | #else /* !CONFIG_MMU */ | 4697 | #else /* !CONFIG_MMU */ |
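The memcontrol.c change takes mmap_sem once in can_attach() and keeps it, via the new mc.mm field, until mem_cgroup_clear_mc(), instead of re-taking it in the precharge and move phases. The following hedged outline restates that lifecycle; the function names and error handling are abbreviated and the mc struct is the one declared earlier in this file.

/*
 * Sketch of the move-charge lifecycle after this change: mmap_sem is
 * read-locked exactly once and released in mem_cgroup_clear_mc(), which
 * avoids the down_write(mmap_sem) -> try_charge() -> "wait for
 * mc.moving_task" deadlock described in the hunk's comment.
 */
static int can_attach_sketch(struct mm_struct *mm, struct mem_cgroup *from,
			     struct mem_cgroup *to)
{
	down_read(&mm->mmap_sem);	/* held across the whole move */
	mc.from = from;
	mc.to = to;
	mc.moving_task = current;
	mc.mm = mm;			/* clear_mc() will up_read() + mmput() */
	return mem_cgroup_precharge_mc(mm);	/* on failure, caller clears mc */
}

static void move_task_sketch(void)
{
	if (!mc.mm)
		return;			/* nothing was precharged */
	mem_cgroup_move_charge(mc.mm);	/* walks VMAs; mmap_sem already held */
	mem_cgroup_clear_mc();		/* up_read(), mmput(), reset mc */
}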
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dd186c1a5d53..6345dfe78d2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -659,7 +659,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | |||
659 | * Scanning pfn is much easier than scanning lru list. | 659 | * Scanning pfn is much easier than scanning lru list. |
660 | * Scan pfn from start to end and Find LRU page. | 660 | * Scan pfn from start to end and Find LRU page. |
661 | */ | 661 | */ |
662 | int scan_lru_pages(unsigned long start, unsigned long end) | 662 | unsigned long scan_lru_pages(unsigned long start, unsigned long end) |
663 | { | 663 | { |
664 | unsigned long pfn; | 664 | unsigned long pfn; |
665 | struct page *page; | 665 | struct page *page; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f969da5dd8a2..c1002c68d617 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1588,7 +1588,7 @@ unsigned slab_node(struct mempolicy *policy) | |||
1588 | (void)first_zones_zonelist(zonelist, highest_zoneidx, | 1588 | (void)first_zones_zonelist(zonelist, highest_zoneidx, |
1589 | &policy->v.nodes, | 1589 | &policy->v.nodes, |
1590 | &zone); | 1590 | &zone); |
1591 | return zone->node; | 1591 | return zone ? zone->node : numa_node_id(); |
1592 | } | 1592 | } |
1593 | 1593 | ||
1594 | default: | 1594 | default: |
diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad782f4..2cfa9bf1f0d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -553,7 +553,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
553 | int *result = NULL; | 553 | int *result = NULL; |
554 | struct page *newpage = get_new_page(page, private, &result); | 554 | struct page *newpage = get_new_page(page, private, &result); |
555 | int remap_swapcache = 1; | 555 | int remap_swapcache = 1; |
556 | int rcu_locked = 0; | ||
557 | int charge = 0; | 556 | int charge = 0; |
558 | struct mem_cgroup *mem = NULL; | 557 | struct mem_cgroup *mem = NULL; |
559 | struct anon_vma *anon_vma = NULL; | 558 | struct anon_vma *anon_vma = NULL; |
@@ -605,20 +604,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
605 | /* | 604 | /* |
606 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, | 605 | * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, |
607 | * we cannot notice that anon_vma is freed while we migrates a page. | 606 | * we cannot notice that anon_vma is freed while we migrates a page. |
608 | * This rcu_read_lock() delays freeing anon_vma pointer until the end | 607 | * This get_anon_vma() delays freeing anon_vma pointer until the end |
609 | * of migration. File cache pages are no problem because of page_lock() | 608 | * of migration. File cache pages are no problem because of page_lock() |
610 | * File Caches may use write_page() or lock_page() in migration, then, | 609 | * File Caches may use write_page() or lock_page() in migration, then, |
611 | * just care Anon page here. | 610 | * just care Anon page here. |
612 | */ | 611 | */ |
613 | if (PageAnon(page)) { | 612 | if (PageAnon(page)) { |
614 | rcu_read_lock(); | 613 | /* |
615 | rcu_locked = 1; | 614 | * Only page_lock_anon_vma() understands the subtleties of |
616 | 615 | * getting a hold on an anon_vma from outside one of its mms. | |
617 | /* Determine how to safely use anon_vma */ | 616 | */ |
618 | if (!page_mapped(page)) { | 617 | anon_vma = page_lock_anon_vma(page); |
619 | if (!PageSwapCache(page)) | 618 | if (anon_vma) { |
620 | goto rcu_unlock; | 619 | /* |
621 | 620 | * Take a reference count on the anon_vma if the | |
621 | * page is mapped so that it is guaranteed to | ||
622 | * exist when the page is remapped later | ||
623 | */ | ||
624 | get_anon_vma(anon_vma); | ||
625 | page_unlock_anon_vma(anon_vma); | ||
626 | } else if (PageSwapCache(page)) { | ||
622 | /* | 627 | /* |
623 | * We cannot be sure that the anon_vma of an unmapped | 628 | * We cannot be sure that the anon_vma of an unmapped |
624 | * swapcache page is safe to use because we don't | 629 | * swapcache page is safe to use because we don't |
@@ -633,13 +638,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
633 | */ | 638 | */ |
634 | remap_swapcache = 0; | 639 | remap_swapcache = 0; |
635 | } else { | 640 | } else { |
636 | /* | 641 | goto uncharge; |
637 | * Take a reference count on the anon_vma if the | ||
638 | * page is mapped so that it is guaranteed to | ||
639 | * exist when the page is remapped later | ||
640 | */ | ||
641 | anon_vma = page_anon_vma(page); | ||
642 | get_anon_vma(anon_vma); | ||
643 | } | 642 | } |
644 | } | 643 | } |
645 | 644 | ||
@@ -656,16 +655,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
656 | * free the metadata, so the page can be freed. | 655 | * free the metadata, so the page can be freed. |
657 | */ | 656 | */ |
658 | if (!page->mapping) { | 657 | if (!page->mapping) { |
659 | if (!PageAnon(page) && page_has_private(page)) { | 658 | VM_BUG_ON(PageAnon(page)); |
660 | /* | 659 | if (page_has_private(page)) { |
661 | * Go direct to try_to_free_buffers() here because | ||
662 | * a) that's what try_to_release_page() would do anyway | ||
663 | * b) we may be under rcu_read_lock() here, so we can't | ||
664 | * use GFP_KERNEL which is what try_to_release_page() | ||
665 | * needs to be effective. | ||
666 | */ | ||
667 | try_to_free_buffers(page); | 660 | try_to_free_buffers(page); |
668 | goto rcu_unlock; | 661 | goto uncharge; |
669 | } | 662 | } |
670 | goto skip_unmap; | 663 | goto skip_unmap; |
671 | } | 664 | } |
@@ -679,14 +672,11 @@ skip_unmap: | |||
679 | 672 | ||
680 | if (rc && remap_swapcache) | 673 | if (rc && remap_swapcache) |
681 | remove_migration_ptes(page, page); | 674 | remove_migration_ptes(page, page); |
682 | rcu_unlock: | ||
683 | 675 | ||
684 | /* Drop an anon_vma reference if we took one */ | 676 | /* Drop an anon_vma reference if we took one */ |
685 | if (anon_vma) | 677 | if (anon_vma) |
686 | drop_anon_vma(anon_vma); | 678 | drop_anon_vma(anon_vma); |
687 | 679 | ||
688 | if (rcu_locked) | ||
689 | rcu_read_unlock(); | ||
690 | uncharge: | 680 | uncharge: |
691 | if (!charge) | 681 | if (!charge) |
692 | mem_cgroup_end_migration(mem, page, newpage); | 682 | mem_cgroup_end_migration(mem, page, newpage); |
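The migrate.c hunks drop the long-lived rcu_read_lock() and instead pin the anon_vma with a reference taken under page_lock_anon_vma(). A hedged sketch of the new pattern for anonymous pages follows; it is simplified from unmap_and_move() and the helper name and return convention are illustrative.

/*
 * Sketch: for an anonymous page, pin the anon_vma with a refcount taken
 * under the anon_vma lock instead of holding rcu_read_lock() across the
 * whole migration.  Returns 0 on success, -EAGAIN if the caller should
 * abort (unmapped, non-swapcache anon page whose anon_vma may be gone).
 */
static int pin_anon_vma_sketch(struct page *page, struct anon_vma **avp,
			       int *remap_swapcache)
{
	struct anon_vma *anon_vma;

	*avp = NULL;
	if (!PageAnon(page))
		return 0;

	/* page_lock_anon_vma() handles the RCU/refcount subtleties. */
	anon_vma = page_lock_anon_vma(page);
	if (anon_vma) {
		get_anon_vma(anon_vma);		/* keep it alive until remap */
		page_unlock_anon_vma(anon_vma);
		*avp = anon_vma;		/* drop_anon_vma() when done */
	} else if (PageSwapCache(page)) {
		/* unmapped swapcache: migrate, but do not remap ptes later */
		*remap_swapcache = 0;
	} else {
		return -EAGAIN;			/* "goto uncharge" in the hunk above */
	}
	return 0;
}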
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -2460,6 +2460,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2460 | unsigned long addr, unsigned long len, | 2460 | unsigned long addr, unsigned long len, |
2461 | unsigned long vm_flags, struct page **pages) | 2461 | unsigned long vm_flags, struct page **pages) |
2462 | { | 2462 | { |
2463 | int ret; | ||
2463 | struct vm_area_struct *vma; | 2464 | struct vm_area_struct *vma; |
2464 | 2465 | ||
2465 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); | 2466 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
@@ -2477,16 +2478,23 @@ int install_special_mapping(struct mm_struct *mm, | |||
2477 | vma->vm_ops = &special_mapping_vmops; | 2478 | vma->vm_ops = &special_mapping_vmops; |
2478 | vma->vm_private_data = pages; | 2479 | vma->vm_private_data = pages; |
2479 | 2480 | ||
2480 | if (unlikely(insert_vm_struct(mm, vma))) { | 2481 | ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); |
2481 | kmem_cache_free(vm_area_cachep, vma); | 2482 | if (ret) |
2482 | return -ENOMEM; | 2483 | goto out; |
2483 | } | 2484 | |
2485 | ret = insert_vm_struct(mm, vma); | ||
2486 | if (ret) | ||
2487 | goto out; | ||
2484 | 2488 | ||
2485 | mm->total_vm += len >> PAGE_SHIFT; | 2489 | mm->total_vm += len >> PAGE_SHIFT; |
2486 | 2490 | ||
2487 | perf_event_mmap(vma); | 2491 | perf_event_mmap(vma); |
2488 | 2492 | ||
2489 | return 0; | 2493 | return 0; |
2494 | |||
2495 | out: | ||
2496 | kmem_cache_free(vm_area_cachep, vma); | ||
2497 | return ret; | ||
2490 | } | 2498 | } |
2491 | 2499 | ||
2492 | static DEFINE_MUTEX(mm_all_locks_mutex); | 2500 | static DEFINE_MUTEX(mm_all_locks_mutex); |
diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, | |||
87 | return 1; | 87 | return 1; |
88 | } | 88 | } |
89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
90 | |||
91 | #ifdef CONFIG_SMP | ||
92 | /* Called when a more accurate view of NR_FREE_PAGES is needed */ | ||
93 | unsigned long zone_nr_free_pages(struct zone *zone) | ||
94 | { | ||
95 | unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); | ||
96 | |||
97 | /* | ||
98 | * While kswapd is awake, it is considered the zone is under some | ||
99 | * memory pressure. Under pressure, there is a risk that | ||
100 | * per-cpu-counter-drift will allow the min watermark to be breached | ||
101 | * potentially causing a live-lock. While kswapd is awake and | ||
102 | * free pages are low, get a better estimate for free pages | ||
103 | */ | ||
104 | if (nr_free_pages < zone->percpu_drift_mark && | ||
105 | !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) | ||
106 | return zone_page_state_snapshot(zone, NR_FREE_PAGES); | ||
107 | |||
108 | return nr_free_pages; | ||
109 | } | ||
110 | #endif /* CONFIG_SMP */ | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 2d1bf7cf8851..4c5133873097 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -211,6 +211,7 @@ success: | |||
211 | mmu_notifier_invalidate_range_end(mm, start, end); | 211 | mmu_notifier_invalidate_range_end(mm, start, end); |
212 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 212 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
213 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 213 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
214 | perf_event_mmap(vma); | ||
214 | return 0; | 215 | return 0; |
215 | 216 | ||
216 | fail: | 217 | fail: |
@@ -299,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
299 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); | 300 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); |
300 | if (error) | 301 | if (error) |
301 | goto out; | 302 | goto out; |
302 | perf_event_mmap(vma); | ||
303 | nstart = tmp; | 303 | nstart = tmp; |
304 | 304 | ||
305 | if (nstart < prev->vm_end) | 305 | if (nstart < prev->vm_end) |
diff --git a/mm/nommu.c b/mm/nommu.c index 88ff091eb07a..acb3bd3c1cb9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1668,6 +1668,7 @@ void exit_mmap(struct mm_struct *mm) | |||
1668 | mm->mmap = vma->vm_next; | 1668 | mm->mmap = vma->vm_next; |
1669 | delete_vma_from_mm(vma); | 1669 | delete_vma_from_mm(vma); |
1670 | delete_vma(mm, vma); | 1670 | delete_vma(mm, vma); |
1671 | cond_resched(); | ||
1671 | } | 1672 | } |
1672 | 1673 | ||
1673 | kleave(""); | 1674 | kleave(""); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f12ad1836abe..985e072a3dd9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -103,19 +103,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | |||
103 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 103 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
104 | * guaranteed not to run in parallel with that modification). | 104 | * guaranteed not to run in parallel with that modification). |
105 | */ | 105 | */ |
106 | void set_gfp_allowed_mask(gfp_t mask) | 106 | |
107 | static gfp_t saved_gfp_mask; | ||
108 | |||
109 | void pm_restore_gfp_mask(void) | ||
107 | { | 110 | { |
108 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 111 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
109 | gfp_allowed_mask = mask; | 112 | if (saved_gfp_mask) { |
113 | gfp_allowed_mask = saved_gfp_mask; | ||
114 | saved_gfp_mask = 0; | ||
115 | } | ||
110 | } | 116 | } |
111 | 117 | ||
112 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | 118 | void pm_restrict_gfp_mask(void) |
113 | { | 119 | { |
114 | gfp_t ret = gfp_allowed_mask; | ||
115 | |||
116 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 120 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
117 | gfp_allowed_mask &= ~mask; | 121 | WARN_ON(saved_gfp_mask); |
118 | return ret; | 122 | saved_gfp_mask = gfp_allowed_mask; |
123 | gfp_allowed_mask &= ~GFP_IOFS; | ||
119 | } | 124 | } |
120 | #endif /* CONFIG_PM_SLEEP */ | 125 | #endif /* CONFIG_PM_SLEEP */ |
121 | 126 | ||
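The set_gfp_allowed_mask()/clear_gfp_allowed_mask() pair becomes pm_restrict_gfp_mask()/pm_restore_gfp_mask(), which save and restore the mask internally. A hedged sketch of how a suspend path would use the new pair; the call site and do_the_suspend() are hypothetical, the real callers live in the suspend/hibernation core, which holds pm_mutex as the WARN_ONs require.

/*
 * Sketch of the intended calling pattern: block GFP_IOFS allocations
 * while devices are quiesced, then restore the saved mask exactly once.
 */
static int suspend_sketch(void)
{
	int error;

	pm_restrict_gfp_mask();		/* saves gfp_allowed_mask, masks out GFP_IOFS */
	error = do_the_suspend();	/* hypothetical: devices suspended here */
	pm_restore_gfp_mask();		/* puts the saved mask back */
	return error;
}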
@@ -530,7 +535,7 @@ static inline void __free_one_page(struct page *page, | |||
530 | * so it's less likely to be used soon and more likely to be merged | 535 | * so it's less likely to be used soon and more likely to be merged |
531 | * as a higher order page | 536 | * as a higher order page |
532 | */ | 537 | */ |
533 | if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { | 538 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
534 | struct page *higher_page, *higher_buddy; | 539 | struct page *higher_page, *higher_buddy; |
535 | combined_idx = __find_combined_index(page_idx, order); | 540 | combined_idx = __find_combined_index(page_idx, order); |
536 | higher_page = page + combined_idx - page_idx; | 541 | higher_page = page + combined_idx - page_idx; |
@@ -1454,24 +1459,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1454 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1459 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1455 | 1460 | ||
1456 | /* | 1461 | /* |
1457 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1462 | * Return true if free pages are above 'mark'. This takes into account the order |
1458 | * of the allocation. | 1463 | * of the allocation. |
1459 | */ | 1464 | */ |
1460 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1465 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1461 | int classzone_idx, int alloc_flags) | 1466 | int classzone_idx, int alloc_flags, long free_pages) |
1462 | { | 1467 | { |
1463 | /* free_pages may go negative - that's OK */ | 1468 | /* free_pages may go negative - that's OK */ |
1464 | long min = mark; | 1469 | long min = mark; |
1465 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; | ||
1466 | int o; | 1470 | int o; |
1467 | 1471 | ||
1472 | free_pages -= (1 << order) + 1; | ||
1468 | if (alloc_flags & ALLOC_HIGH) | 1473 | if (alloc_flags & ALLOC_HIGH) |
1469 | min -= min / 2; | 1474 | min -= min / 2; |
1470 | if (alloc_flags & ALLOC_HARDER) | 1475 | if (alloc_flags & ALLOC_HARDER) |
1471 | min -= min / 4; | 1476 | min -= min / 4; |
1472 | 1477 | ||
1473 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1478 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1474 | return 0; | 1479 | return false; |
1475 | for (o = 0; o < order; o++) { | 1480 | for (o = 0; o < order; o++) { |
1476 | /* At the next order, this order's pages become unavailable */ | 1481 | /* At the next order, this order's pages become unavailable */ |
1477 | free_pages -= z->free_area[o].nr_free << o; | 1482 | free_pages -= z->free_area[o].nr_free << o; |
@@ -1480,9 +1485,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1480 | min >>= 1; | 1485 | min >>= 1; |
1481 | 1486 | ||
1482 | if (free_pages <= min) | 1487 | if (free_pages <= min) |
1483 | return 0; | 1488 | return false; |
1484 | } | 1489 | } |
1485 | return 1; | 1490 | return true; |
1491 | } | ||
1492 | |||
1493 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
1494 | int classzone_idx, int alloc_flags) | ||
1495 | { | ||
1496 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1497 | zone_page_state(z, NR_FREE_PAGES)); | ||
1498 | } | ||
1499 | |||
1500 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | ||
1501 | int classzone_idx, int alloc_flags) | ||
1502 | { | ||
1503 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | ||
1504 | |||
1505 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | ||
1506 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | ||
1507 | |||
1508 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1509 | free_pages); | ||
1486 | } | 1510 | } |
1487 | 1511 | ||
1488 | #ifdef CONFIG_NUMA | 1512 | #ifdef CONFIG_NUMA |
@@ -2436,7 +2460,7 @@ void show_free_areas(void) | |||
2436 | " all_unreclaimable? %s" | 2460 | " all_unreclaimable? %s" |
2437 | "\n", | 2461 | "\n", |
2438 | zone->name, | 2462 | zone->name, |
2439 | K(zone_nr_free_pages(zone)), | 2463 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2440 | K(min_wmark_pages(zone)), | 2464 | K(min_wmark_pages(zone)), |
2441 | K(low_wmark_pages(zone)), | 2465 | K(low_wmark_pages(zone)), |
2442 | K(high_wmark_pages(zone)), | 2466 | K(high_wmark_pages(zone)), |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b8889da69a6..d8087f0db507 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | |||
517 | static void purge_fragmented_blocks_allcpus(void); | 517 | static void purge_fragmented_blocks_allcpus(void); |
518 | 518 | ||
519 | /* | 519 | /* |
520 | * called before a call to iounmap() if the caller wants vm_area_struct's | ||
521 | * immediately freed. | ||
522 | */ | ||
523 | void set_iounmap_nonlazy(void) | ||
524 | { | ||
525 | atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); | ||
526 | } | ||
527 | |||
528 | /* | ||
520 | * Purges all lazily-freed vmap areas. | 529 | * Purges all lazily-freed vmap areas. |
521 | * | 530 | * |
522 | * If sync is 0 then don't purge if there is already a purge in progress. | 531 | * If sync is 0 then don't purge if there is already a purge in progress. |
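vmalloc.c gains set_iounmap_nonlazy() so a caller can force the next iounmap() to purge lazily-freed vmap areas immediately. A hedged usage sketch follows; the intended consumer is a crash-dump copy path, but the function name and exact flow here are illustrative, not taken from this diff.

/*
 * Sketch: read a page of old-kernel memory through a temporary ioremap
 * mapping and make sure the vmap area is really torn down before the
 * next mapping is created.
 */
static ssize_t copy_oldmem_page_sketch(unsigned long pfn, char *buf,
				       size_t csize, unsigned long offset)
{
	void *vaddr;

	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
	if (!vaddr)
		return -ENOMEM;

	memcpy(buf, vaddr + offset, csize);

	set_iounmap_nonlazy();	/* next iounmap() purges immediately */
	iounmap(vaddr);
	return csize;
}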
diff --git a/mm/vmscan.c b/mm/vmscan.c index c5dfabf25f11..3e71cb1ee28c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2082 | if (zone->all_unreclaimable) | 2082 | if (zone->all_unreclaimable) |
2083 | continue; | 2083 | continue; |
2084 | 2084 | ||
2085 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2085 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2086 | 0, 0)) | 2086 | 0, 0)) |
2087 | return 1; | 2087 | return 1; |
2088 | } | 2088 | } |
@@ -2169,7 +2169,7 @@ loop_again: | |||
2169 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2169 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2170 | &sc, priority, 0); | 2170 | &sc, priority, 0); |
2171 | 2171 | ||
2172 | if (!zone_watermark_ok(zone, order, | 2172 | if (!zone_watermark_ok_safe(zone, order, |
2173 | high_wmark_pages(zone), 0, 0)) { | 2173 | high_wmark_pages(zone), 0, 0)) { |
2174 | end_zone = i; | 2174 | end_zone = i; |
2175 | break; | 2175 | break; |
@@ -2215,7 +2215,7 @@ loop_again: | |||
2215 | * We put equal pressure on every zone, unless one | 2215 | * We put equal pressure on every zone, unless one |
2216 | * zone has way too many pages free already. | 2216 | * zone has way too many pages free already. |
2217 | */ | 2217 | */ |
2218 | if (!zone_watermark_ok(zone, order, | 2218 | if (!zone_watermark_ok_safe(zone, order, |
2219 | 8*high_wmark_pages(zone), end_zone, 0)) | 2219 | 8*high_wmark_pages(zone), end_zone, 0)) |
2220 | shrink_zone(priority, zone, &sc); | 2220 | shrink_zone(priority, zone, &sc); |
2221 | reclaim_state->reclaimed_slab = 0; | 2221 | reclaim_state->reclaimed_slab = 0; |
@@ -2236,7 +2236,7 @@ loop_again: | |||
2236 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2236 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2237 | sc.may_writepage = 1; | 2237 | sc.may_writepage = 1; |
2238 | 2238 | ||
2239 | if (!zone_watermark_ok(zone, order, | 2239 | if (!zone_watermark_ok_safe(zone, order, |
2240 | high_wmark_pages(zone), end_zone, 0)) { | 2240 | high_wmark_pages(zone), end_zone, 0)) { |
2241 | all_zones_ok = 0; | 2241 | all_zones_ok = 0; |
2242 | /* | 2242 | /* |
@@ -2244,7 +2244,7 @@ loop_again: | |||
2244 | * means that we have a GFP_ATOMIC allocation | 2244 | * means that we have a GFP_ATOMIC allocation |
2245 | * failure risk. Hurry up! | 2245 | * failure risk. Hurry up! |
2246 | */ | 2246 | */ |
2247 | if (!zone_watermark_ok(zone, order, | 2247 | if (!zone_watermark_ok_safe(zone, order, |
2248 | min_wmark_pages(zone), end_zone, 0)) | 2248 | min_wmark_pages(zone), end_zone, 0)) |
2249 | has_under_min_watermark_zone = 1; | 2249 | has_under_min_watermark_zone = 1; |
2250 | } | 2250 | } |
@@ -2378,7 +2378,9 @@ static int kswapd(void *p) | |||
2378 | */ | 2378 | */ |
2379 | if (!sleeping_prematurely(pgdat, order, remaining)) { | 2379 | if (!sleeping_prematurely(pgdat, order, remaining)) { |
2380 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2380 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2381 | restore_pgdat_percpu_threshold(pgdat); | ||
2381 | schedule(); | 2382 | schedule(); |
2383 | reduce_pgdat_percpu_threshold(pgdat); | ||
2382 | } else { | 2384 | } else { |
2383 | if (remaining) | 2385 | if (remaining) |
2384 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | 2386 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); |
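The kswapd sleep path now widens the per-cpu stat thresholds only for the duration of the sleep, so counter updates stay cheap while kswapd is idle but the NR_FREE_PAGES estimate is tight while it is reclaiming. A hedged sketch of that window; the helper name is invented and the surrounding kswapd loop is heavily simplified.

/*
 * Sketch of the sleep window: thresholds are restored (raised) just
 * before kswapd sleeps and reduced again as soon as it wakes, so the
 * zone_watermark_ok_safe() checks run against accurate counters.
 */
static void kswapd_try_to_sleep_sketch(pg_data_t *pgdat, int order)
{
	if (!sleeping_prematurely(pgdat, order, 0)) {
		restore_pgdat_percpu_threshold(pgdat);	/* cheap, drifty counters */
		schedule();
		reduce_pgdat_percpu_threshold(pgdat);	/* tight counters for reclaim */
	}
}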
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
2417 | if (!populated_zone(zone)) | 2419 | if (!populated_zone(zone)) |
2418 | return; | 2420 | return; |
2419 | 2421 | ||
2420 | pgdat = zone->zone_pgdat; | 2422 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2421 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2422 | return; | 2423 | return; |
2424 | pgdat = zone->zone_pgdat; | ||
2423 | if (pgdat->kswapd_max_order < order) | 2425 | if (pgdat->kswapd_max_order < order) |
2424 | pgdat->kswapd_max_order = order; | 2426 | pgdat->kswapd_max_order = order; |
2425 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2426 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
2427 | return; | ||
2428 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2427 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2429 | return; | 2428 | return; |
2429 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2430 | return; | ||
2431 | |||
2432 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2430 | wake_up_interruptible(&pgdat->kswapd_wait); | 2433 | wake_up_interruptible(&pgdat->kswapd_wait); |
2431 | } | 2434 | } |
2432 | 2435 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 355a9e669aaa..4d7faebb9b70 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat); | |||
81 | 81 | ||
82 | #ifdef CONFIG_SMP | 82 | #ifdef CONFIG_SMP |
83 | 83 | ||
84 | static int calculate_pressure_threshold(struct zone *zone) | ||
85 | { | ||
86 | int threshold; | ||
87 | int watermark_distance; | ||
88 | |||
89 | /* | ||
90 | * As vmstats are not up to date, there is drift between the estimated | ||
91 | * and real values. For high thresholds and a high number of CPUs, it | ||
92 | * is possible for the min watermark to be breached while the estimated | ||
93 | * value looks fine. The pressure threshold is a reduced value such | ||
94 | * that even the maximum amount of drift will not accidentally breach | ||
95 | * the min watermark | ||
96 | */ | ||
97 | watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); | ||
98 | threshold = max(1, (int)(watermark_distance / num_online_cpus())); | ||
99 | |||
100 | /* | ||
101 | * Maximum threshold is 125 | ||
102 | */ | ||
103 | threshold = min(125, threshold); | ||
104 | |||
105 | return threshold; | ||
106 | } | ||
107 | |||
84 | static int calculate_threshold(struct zone *zone) | 108 | static int calculate_threshold(struct zone *zone) |
85 | { | 109 | { |
86 | int threshold; | 110 | int threshold; |
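calculate_pressure_threshold() bounds total per-cpu counter drift by the gap between the low and min watermarks divided across CPUs. The standalone snippet below illustrates the formula with made-up watermark values; it is plain userspace C, not kernel code.

#include <stdio.h>

/* threshold = clamp((low_wmark - min_wmark) / nr_cpus, 1, 125) */
static int pressure_threshold(long low_wmark, long min_wmark, int nr_cpus)
{
	int threshold = (int)((low_wmark - min_wmark) / nr_cpus);

	if (threshold < 1)
		threshold = 1;
	if (threshold > 125)
		threshold = 125;
	return threshold;
}

int main(void)
{
	/* hypothetical zone: low watermark 1536 pages, min watermark 1024 pages */
	printf("4 cpus:  %d\n", pressure_threshold(1536, 1024, 4));	/* 512/4 = 128, capped to 125 */
	printf("64 cpus: %d\n", pressure_threshold(1536, 1024, 64));	/* 512/64 = 8 */
	return 0;
}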
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void) | |||
159 | } | 183 | } |
160 | } | 184 | } |
161 | 185 | ||
186 | void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) | ||
187 | { | ||
188 | struct zone *zone; | ||
189 | int cpu; | ||
190 | int threshold; | ||
191 | int i; | ||
192 | |||
193 | get_online_cpus(); | ||
194 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
195 | zone = &pgdat->node_zones[i]; | ||
196 | if (!zone->percpu_drift_mark) | ||
197 | continue; | ||
198 | |||
199 | threshold = calculate_pressure_threshold(zone); | ||
200 | for_each_online_cpu(cpu) | ||
201 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
202 | = threshold; | ||
203 | } | ||
204 | put_online_cpus(); | ||
205 | } | ||
206 | |||
207 | void restore_pgdat_percpu_threshold(pg_data_t *pgdat) | ||
208 | { | ||
209 | struct zone *zone; | ||
210 | int cpu; | ||
211 | int threshold; | ||
212 | int i; | ||
213 | |||
214 | get_online_cpus(); | ||
215 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
216 | zone = &pgdat->node_zones[i]; | ||
217 | if (!zone->percpu_drift_mark) | ||
218 | continue; | ||
219 | |||
220 | threshold = calculate_threshold(zone); | ||
221 | for_each_online_cpu(cpu) | ||
222 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold | ||
223 | = threshold; | ||
224 | } | ||
225 | put_online_cpus(); | ||
226 | } | ||
227 | |||
162 | /* | 228 | /* |
163 | * For use when we know that interrupts are disabled. | 229 | * For use when we know that interrupts are disabled. |
164 | */ | 230 | */ |
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
826 | "\n scanned %lu" | 892 | "\n scanned %lu" |
827 | "\n spanned %lu" | 893 | "\n spanned %lu" |
828 | "\n present %lu", | 894 | "\n present %lu", |
829 | zone_nr_free_pages(zone), | 895 | zone_page_state(zone, NR_FREE_PAGES), |
830 | min_wmark_pages(zone), | 896 | min_wmark_pages(zone), |
831 | low_wmark_pages(zone), | 897 | low_wmark_pages(zone), |
832 | high_wmark_pages(zone), | 898 | high_wmark_pages(zone), |