Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c        | 29
-rw-r--r--  mm/hugetlb.c        |  8
-rw-r--r--  mm/internal.h       |  2
-rw-r--r--  mm/memcontrol.c     | 63
-rw-r--r--  mm/memory_hotplug.c |  2
-rw-r--r--  mm/mempolicy.c      |  2
-rw-r--r--  mm/migrate.c        | 48
-rw-r--r--  mm/mmap.c           | 16
-rw-r--r--  mm/mmzone.c         | 21
-rw-r--r--  mm/mprotect.c       |  2
-rw-r--r--  mm/nommu.c          |  1
-rw-r--r--  mm/page_alloc.c     | 56
-rw-r--r--  mm/vmalloc.c        |  9
-rw-r--r--  mm/vmscan.c         | 23
-rw-r--r--  mm/vmstat.c         | 68
15 files changed, 220 insertions, 130 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..9701a501f769 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -631,7 +631,9 @@ repeat:
 	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
 	if (pagep) {
 		page = radix_tree_deref_slot(pagep);
-		if (unlikely(!page || page == RADIX_TREE_RETRY))
+		if (unlikely(!page))
+			goto out;
+		if (radix_tree_deref_retry(page))
 			goto repeat;
 
 		if (!page_cache_get_speculative(page))
@@ -647,6 +649,7 @@ repeat:
 			goto repeat;
 		}
 	}
+out:
 	rcu_read_unlock();
 
 	return page;
@@ -764,12 +767,11 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
-		/*
-		 * this can only trigger if nr_found == 1, making livelock
-		 * a non issue.
-		 */
-		if (unlikely(page == RADIX_TREE_RETRY))
+		if (radix_tree_deref_retry(page)) {
+			if (ret)
+				start = pages[ret-1]->index;
 			goto restart;
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -817,11 +819,7 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
-		/*
-		 * this can only trigger if nr_found == 1, making livelock
-		 * a non issue.
-		 */
-		if (unlikely(page == RADIX_TREE_RETRY))
+		if (radix_tree_deref_retry(page))
 			goto restart;
 
 		if (page->mapping == NULL || page->index != index)
@@ -874,11 +872,7 @@ repeat:
 		page = radix_tree_deref_slot((void **)pages[i]);
 		if (unlikely(!page))
 			continue;
-		/*
-		 * this can only trigger if nr_found == 1, making livelock
-		 * a non issue.
-		 */
-		if (unlikely(page == RADIX_TREE_RETRY))
+		if (radix_tree_deref_retry(page))
 			goto restart;
 
 		if (!page_cache_get_speculative(page))
@@ -1016,6 +1010,9 @@ find_page:
 				goto page_not_up_to_date;
 			if (!trylock_page(page))
 				goto page_not_up_to_date;
+			/* Did it get truncated before we got the lock? */
+			if (!page->mapping)
+				goto page_not_up_to_date_locked;
 			if (!mapping->a_ops->is_partially_uptodate(page,
 								desc, offset))
 				goto page_not_up_to_date_locked;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..2697806746d0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2380,8 +2380,11 @@ retry_avoidcopy:
 	 * When the original hugepage is shared one, it does not have
 	 * anon_vma prepared.
 	 */
-	if (unlikely(anon_vma_prepare(vma)))
+	if (unlikely(anon_vma_prepare(vma))) {
+		/* Caller expects lock to be held */
+		spin_lock(&mm->page_table_lock);
 		return VM_FAULT_OOM;
+	}
 
 	copy_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
@@ -2665,7 +2668,8 @@ out_page_table_lock:
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
 	}
-	unlock_page(page);
+	if (page != pagecache_page)
+		unlock_page(page);
 
 out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..dedb0aff673f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
  */
 static inline unsigned long page_order(struct page *page)
 {
-	VM_BUG_ON(!PageBuddy(page));
+	/* PageBuddy() must be checked by the caller */
 	return page_private(page);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..a9a534a38ac0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -269,13 +269,14 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t lock; /* for from, to, moving_task */
+	spinlock_t lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task; /* a task moving charges */
+	struct mm_struct *mm;
 	wait_queue_head_t waitq; /* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -1646,6 +1647,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 		if (likely(!ret))
 			return CHARGE_OK;
 
+		res_counter_uncharge(&mem->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
@@ -1729,19 +1731,18 @@ again:
 
 	rcu_read_lock();
 	p = rcu_dereference(mm->owner);
-	VM_BUG_ON(!p);
 	/*
-	 * because we don't have task_lock(), "p" can exit while
-	 * we're here. In that case, "mem" can point to root
-	 * cgroup but never be NULL. (and task_struct itself is freed
-	 * by RCU, cgroup itself is RCU safe.) Then, we have small
-	 * risk here to get wrong cgroup. But such kind of mis-account
-	 * by race always happens because we don't have cgroup_mutex().
-	 * It's overkill and we allow that small race, here.
+	 * Because we don't have task_lock(), "p" can exit.
+	 * In that case, "mem" can point to root or p can be NULL with
+	 * race with swapoff. Then, we have small risk of mis-accouning.
+	 * But such kind of mis-account by race always happens because
+	 * we don't have cgroup_mutex(). It's overkill and we allo that
+	 * small race, here.
+	 * (*) swapoff at el will charge against mm-struct not against
+	 * task-struct. So, mm->owner can be NULL.
 	 */
 	mem = mem_cgroup_from_task(p);
-	VM_BUG_ON(!mem);
-	if (mem_cgroup_is_root(mem)) {
+	if (!mem || mem_cgroup_is_root(mem)) {
 		rcu_read_unlock();
 		goto done;
 	}
@@ -4445,7 +4446,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4457,7 +4458,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
-	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4508,11 +4508,16 @@ static void mem_cgroup_clear_mc(void)
 
 		mc.moved_swap = 0;
 	}
+	if (mc.mm) {
+		up_read(&mc.mm->mmap_sem);
+		mmput(mc.mm);
+	}
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mc.moving_task = NULL;
+	mc.mm = NULL;
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4537,26 +4542,37 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
+			/*
+			 * We do all the move charge works under one mmap_sem to
+			 * avoid deadlock with down_write(&mmap_sem)
+			 * -> try_charge() -> if (mc.moving_task) -> sleep.
+			 */
+			down_read(&mm->mmap_sem);
+
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			VM_BUG_ON(mc.mm);
+
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			mc.moving_task = current;
+			mc.mm = mm;
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+			/* We call up_read() and mmput() in clear_mc(). */
+		} else
+			mmput(mm);
 	}
 	return ret;
 }
@@ -4644,7 +4660,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4663,7 +4679,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 			 */
 			break;
 	}
-	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4672,17 +4687,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	struct mm_struct *mm;
-
-	if (!mc.to)
+	if (!mc.mm)
 		/* no need to move charge */
 		return;
 
-	mm = get_task_mm(p);
-	if (mm) {
-		mem_cgroup_move_charge(mm);
-		mmput(mm);
-	}
+	mem_cgroup_move_charge(mc.mm);
 	mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dd186c1a5d53..6345dfe78d2c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -659,7 +659,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
  * Scanning pfn is much easier than scanning lru list.
  * Scan pfn from start to end and Find LRU page.
  */
-int scan_lru_pages(unsigned long start, unsigned long end)
+unsigned long scan_lru_pages(unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
 	struct page *page;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..c1002c68d617 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1588,7 +1588,7 @@ unsigned slab_node(struct mempolicy *policy)
 		(void)first_zones_zonelist(zonelist, highest_zoneidx,
 							&policy->v.nodes,
 							&zone);
-		return zone->node;
+		return zone ? zone->node : numa_node_id();
 	}
 
 	default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..2cfa9bf1f0d4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -553,7 +553,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	int *result = NULL;
 	struct page *newpage = get_new_page(page, private, &result);
 	int remap_swapcache = 1;
-	int rcu_locked = 0;
 	int charge = 0;
 	struct mem_cgroup *mem = NULL;
 	struct anon_vma *anon_vma = NULL;
@@ -605,20 +604,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	/*
 	 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 	 * we cannot notice that anon_vma is freed while we migrates a page.
-	 * This rcu_read_lock() delays freeing anon_vma pointer until the end
+	 * This get_anon_vma() delays freeing anon_vma pointer until the end
 	 * of migration. File cache pages are no problem because of page_lock()
 	 * File Caches may use write_page() or lock_page() in migration, then,
 	 * just care Anon page here.
 	 */
 	if (PageAnon(page)) {
-		rcu_read_lock();
-		rcu_locked = 1;
-
-		/* Determine how to safely use anon_vma */
-		if (!page_mapped(page)) {
-			if (!PageSwapCache(page))
-				goto rcu_unlock;
-
+		/*
+		 * Only page_lock_anon_vma() understands the subtleties of
+		 * getting a hold on an anon_vma from outside one of its mms.
+		 */
+		anon_vma = page_lock_anon_vma(page);
+		if (anon_vma) {
+			/*
+			 * Take a reference count on the anon_vma if the
+			 * page is mapped so that it is guaranteed to
+			 * exist when the page is remapped later
+			 */
+			get_anon_vma(anon_vma);
+			page_unlock_anon_vma(anon_vma);
+		} else if (PageSwapCache(page)) {
 			/*
 			 * We cannot be sure that the anon_vma of an unmapped
 			 * swapcache page is safe to use because we don't
@@ -633,13 +638,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 			 */
 			remap_swapcache = 0;
 		} else {
-			/*
-			 * Take a reference count on the anon_vma if the
-			 * page is mapped so that it is guaranteed to
-			 * exist when the page is remapped later
-			 */
-			anon_vma = page_anon_vma(page);
-			get_anon_vma(anon_vma);
+			goto uncharge;
 		}
 	}
 
@@ -656,16 +655,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	 * free the metadata, so the page can be freed.
 	 */
 	if (!page->mapping) {
-		if (!PageAnon(page) && page_has_private(page)) {
-			/*
-			 * Go direct to try_to_free_buffers() here because
-			 * a) that's what try_to_release_page() would do anyway
-			 * b) we may be under rcu_read_lock() here, so we can't
-			 * use GFP_KERNEL which is what try_to_release_page()
-			 * needs to be effective.
-			 */
+		VM_BUG_ON(PageAnon(page));
+		if (page_has_private(page)) {
 			try_to_free_buffers(page);
-			goto rcu_unlock;
+			goto uncharge;
 		}
 		goto skip_unmap;
 	}
@@ -679,14 +672,11 @@ skip_unmap:
 
 	if (rc && remap_swapcache)
 		remove_migration_ptes(page, page);
-rcu_unlock:
 
 	/* Drop an anon_vma reference if we took one */
 	if (anon_vma)
 		drop_anon_vma(anon_vma);
 
-	if (rcu_locked)
-		rcu_read_unlock();
 uncharge:
 	if (!charge)
 		mem_cgroup_end_migration(mem, page, newpage);
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..283a0a84ea2c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2460,6 +2460,7 @@ int install_special_mapping(struct mm_struct *mm,
 			    unsigned long addr, unsigned long len,
 			    unsigned long vm_flags, struct page **pages)
 {
+	int ret;
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2477,16 +2478,23 @@ int install_special_mapping(struct mm_struct *mm,
 	vma->vm_ops = &special_mapping_vmops;
 	vma->vm_private_data = pages;
 
-	if (unlikely(insert_vm_struct(mm, vma))) {
-		kmem_cache_free(vm_area_cachep, vma);
-		return -ENOMEM;
-	}
+	ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+	if (ret)
+		goto out;
+
+	ret = insert_vm_struct(mm, vma);
+	if (ret)
+		goto out;
 
 	mm->total_vm += len >> PAGE_SHIFT;
 
 	perf_event_mmap(vma);
 
 	return 0;
+
+out:
+	kmem_cache_free(vm_area_cachep, vma);
+	return ret;
 }
 
 static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
-	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
-	/*
-	 * While kswapd is awake, it is considered the zone is under some
-	 * memory pressure. Under pressure, there is a risk that
-	 * per-cpu-counter-drift will allow the min watermark to be breached
-	 * potentially causing a live-lock. While kswapd is awake and
-	 * free pages are low, get a better estimate for free pages
-	 */
-	if (nr_free_pages < zone->percpu_drift_mark &&
-			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
-		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
-	return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf8851..4c5133873097 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -211,6 +211,7 @@ success:
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+	perf_event_mmap(vma);
 	return 0;
 
 fail:
@@ -299,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			goto out;
-		perf_event_mmap(vma);
 		nstart = tmp;
 
 		if (nstart < prev->vm_end)
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..acb3bd3c1cb9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1668,6 +1668,7 @@ void exit_mmap(struct mm_struct *mm)
 		mm->mmap = vma->vm_next;
 		delete_vma_from_mm(vma);
 		delete_vma(mm, vma);
+		cond_resched();
 	}
 
 	kleave("");
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..985e072a3dd9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -103,19 +103,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  * only be modified with pm_mutex held, unless the suspend/hibernate code is
  * guaranteed not to run in parallel with that modification).
  */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
 {
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask = mask;
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
 }
 
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
 {
-	gfp_t ret = gfp_allowed_mask;
-
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask &= ~mask;
-	return ret;
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~GFP_IOFS;
 }
 #endif /* CONFIG_PM_SLEEP */
 
@@ -530,7 +535,7 @@ static inline void __free_one_page(struct page *page,
 	 * so it's less likely to be used soon and more likely to be merged
 	 * as a higher order page
 	 */
-	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
 		combined_idx = __find_combined_index(page_idx, order);
 		higher_page = page + combined_idx - page_idx;
@@ -1454,24 +1459,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1485,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
@@ -2436,7 +2460,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..d8087f0db507 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
517static void purge_fragmented_blocks_allcpus(void); 517static void purge_fragmented_blocks_allcpus(void);
518 518
519/* 519/*
520 * called before a call to iounmap() if the caller wants vm_area_struct's
521 * immediately freed.
522 */
523void set_iounmap_nonlazy(void)
524{
525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
526}
527
528/*
520 * Purges all lazily-freed vmap areas. 529 * Purges all lazily-freed vmap areas.
521 * 530 *
522 * If sync is 0 then don't purge if there is already a purge in progress. 531 * If sync is 0 then don't purge if there is already a purge in progress.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..3e71cb1ee28c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2082,7 +2082,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
 		if (zone->all_unreclaimable)
 			continue;
 
-		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
 								0, 0))
 			return 1;
 	}
@@ -2169,7 +2169,7 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
@@ -2215,7 +2215,7 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
@@ -2236,7 +2236,7 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
-			if (!zone_watermark_ok(zone, order,
+			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
 				/*
@@ -2244,7 +2244,7 @@ loop_again:
 				 * means that we have a GFP_ATOMIC allocation
 				 * failure risk. Hurry up!
 				 */
-				if (!zone_watermark_ok(zone, order,
+				if (!zone_watermark_ok_safe(zone, order,
 					    min_wmark_pages(zone), end_zone, 0))
 					has_under_min_watermark_zone = 1;
 			}
@@ -2378,7 +2378,9 @@ static int kswapd(void *p)
 		 */
 		if (!sleeping_prematurely(pgdat, order, remaining)) {
 			trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+			restore_pgdat_percpu_threshold(pgdat);
 			schedule();
+			reduce_pgdat_percpu_threshold(pgdat);
 		} else {
 			if (remaining)
 				count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
@@ -2417,16 +2419,17 @@ void wakeup_kswapd(struct zone *zone, int order)
 	if (!populated_zone(zone))
 		return;
 
-	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
+	pgdat = zone->zone_pgdat;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
+	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+		return;
+
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..4d7faebb9b70 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
 
 #ifdef CONFIG_SMP
 
+static int calculate_pressure_threshold(struct zone *zone)
+{
+	int threshold;
+	int watermark_distance;
+
+	/*
+	 * As vmstats are not up to date, there is drift between the estimated
+	 * and real values. For high thresholds and a high number of CPUs, it
+	 * is possible for the min watermark to be breached while the estimated
+	 * value looks fine. The pressure threshold is a reduced value such
+	 * that even the maximum amount of drift will not accidentally breach
+	 * the min watermark
+	 */
+	watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+	threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
+
 static int calculate_threshold(struct zone *zone)
 {
 	int threshold;
@@ -159,6 +183,48 @@ static void refresh_zone_stat_thresholds(void)
 	}
 }
 
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_pressure_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
+	int i;
+
+	get_online_cpus();
+	for (i = 0; i < pgdat->nr_zones; i++) {
+		zone = &pgdat->node_zones[i];
+		if (!zone->percpu_drift_mark)
+			continue;
+
+		threshold = calculate_threshold(zone);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+							= threshold;
+	}
+	put_online_cpus();
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
@@ -826,7 +892,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        scanned  %lu"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
-		   zone_nr_free_pages(zone),
+		   zone_page_state(zone, NR_FREE_PAGES),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),