Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   2
-rw-r--r--  mm/bounce.c           |   2
-rw-r--r--  mm/compaction.c       |   7
-rw-r--r--  mm/filemap.c          |  11
-rw-r--r--  mm/huge_memory.c      |  10
-rw-r--r--  mm/hugetlb.c          |  17
-rw-r--r--  mm/hwpoison-inject.c  |   5
-rw-r--r--  mm/madvise.c          |   5
-rw-r--r--  mm/memcontrol.c       | 703
-rw-r--r--  mm/memory-failure.c   |   8
-rw-r--r--  mm/memory.c           |  20
-rw-r--r--  mm/migrate.c          |   4
-rw-r--r--  mm/mlock.c            |   9
-rw-r--r--  mm/mprotect.c         |   7
-rw-r--r--  mm/mremap.c           |   5
-rw-r--r--  mm/oom_kill.c         |   2
-rw-r--r--  mm/page-writeback.c   |  10
-rw-r--r--  mm/page_alloc.c       |   4
-rw-r--r--  mm/slab_common.c      |   2
-rw-r--r--  mm/swapfile.c         |   4
-rw-r--r--  mm/vmscan.c           |  88
-rw-r--r--  mm/zswap.c            |   4
22 files changed, 589 insertions(+), 340 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 026771a9b097..394838f489eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -183,7 +183,7 @@ config MEMORY_HOTPLUG_SPARSE
 config MEMORY_HOTREMOVE
 	bool "Allow for memory hot remove"
 	select MEMORY_ISOLATION
-	select HAVE_BOOTMEM_INFO_NODE if X86_64
+	select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
 	depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
 	depends on MIGRATION
 
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7d..5a7d58fb883b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 	struct bio_vec *to, *from;
 	unsigned i;
 
+	if (force)
+		goto bounce;
 	bio_for_each_segment(from, *bio_orig, i)
 		if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
 			goto bounce;
diff --git a/mm/compaction.c b/mm/compaction.c
index c43789388cd8..b5326b141a25 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -677,6 +677,13 @@ static void isolate_freepages(struct zone *zone,
 			pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
 
+		/*
+		 * This can iterate a massively long zone without finding any
+		 * suitable migration targets, so periodically check if we need
+		 * to schedule.
+		 */
+		cond_resched();
+
 		if (!pfn_valid(pfn))
 			continue;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
-	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already? Either
-	 * way, try readahead, but disable the memcg OOM killer for it
-	 * as readahead is optional and no errors are propagated up
-	 * the fault stack. The OOM killer is enabled while trying to
-	 * instantiate the faulting page individually below.
+	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
-		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
-		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..610e3df2768a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 
 	mmun_start = haddr;
 	mmun_end = haddr + HPAGE_PMD_SIZE;
+again:
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 	split_huge_page(page);
 
 	put_page(page);
-	BUG_ON(pmd_trans_huge(*pmd));
+
+	/*
+	 * We don't always have down_write of mmap_sem here: a racing
+	 * do_huge_pmd_wp_page() might have copied-on-write to another
+	 * huge page before our split_huge_page() got the anon_vma lock.
+	 */
+	if (unlikely(pmd_trans_huge(*pmd)))
+		goto again;
 }
 
 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b49579c7f2a5..0b7656e804d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
 	restore_reserve = PagePrivate(page);
+	ClearPagePrivate(page);
 
 	spin_lock(&hugetlb_lock);
 	hugetlb_cgroup_uncharge_page(hstate_index(h),
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 	/* we rely on prep_new_huge_page to set the destructor */
 	set_compound_order(page, order);
 	__SetPageHead(page);
+	__ClearPageReserved(page);
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
 		__SetPageTail(p);
+		/*
+		 * For gigantic hugepages allocated through bootmem at
+		 * boot, it's safer to be consistent with the not-gigantic
+		 * hugepages and clear the PG_reserved bit from all tail pages
+		 * too. Otherwse drivers using get_user_pages() to access tail
+		 * pages may get the reference counting wrong if they see
+		 * PG_reserved set on a tail page (despite the head page not
+		 * having PG_reserved set). Enforcing this consistency between
+		 * head and tail pages allows drivers to optimize away a check
+		 * on the head page when they need know if put_page() is needed
+		 * after get_user_pages().
+		 */
+		__ClearPageReserved(p);
 		set_page_count(p, 0);
 		p->first_page = page;
 	}
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void)
 #else
 		page = virt_to_page(m);
 #endif
-		__ClearPageReserved(page);
 		WARN_ON(page_count(page) != 1);
 		prep_compound_huge_page(page, h->order);
+		WARN_ON(PageReserved(page));
 		prep_new_huge_page(h, page, page_to_nid(page));
 		/*
 		 * If we had gigantic hugepages allocated at boot time, we need
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index afc2daa91c60..4c84678371eb 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -20,8 +20,6 @@ static int hwpoison_inject(void *data, u64 val)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!hwpoison_filter_enable)
-		goto inject;
 	if (!pfn_valid(pfn))
 		return -ENXIO;
 
@@ -33,6 +31,9 @@ static int hwpoison_inject(void *data, u64 val)
 	if (!get_page_unless_zero(hpage))
 		return 0;
 
+	if (!hwpoison_filter_enable)
+		goto inject;
+
 	if (!PageLRU(p) && !PageHuge(p))
 		shake_page(p, 0);
 	/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 6975bc812542..539eeb96b323 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -343,10 +343,11 @@ static long madvise_remove(struct vm_area_struct *vma,
  */
 static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 {
+	struct page *p;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
-	for (; start < end; start += PAGE_SIZE) {
-		struct page *p;
+	for (; start < end; start += PAGE_SIZE <<
+				compound_order(compound_head(p))) {
 		int ret;
 
 		ret = get_user_pages_fast(start, 1, 0, &p);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..34d3ca9572d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone {
 
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of */
 };
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
+/*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
@@ -303,22 +328,6 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
-	/*
-	 * Protects soft_contributed transitions.
-	 * See mem_cgroup_update_soft_limit
-	 */
-	spinlock_t soft_lock;
-
-	/*
-	 * If true then this group has increased parents' children_in_excess
-	 * when it got over the soft limit.
-	 * When a group falls bellow the soft limit, parents' children_in_excess
-	 * is decreased and soft_contributed changed to false.
-	 */
-	bool soft_contributed;
-
-	/* Number of children that are in soft limit excess */
-	atomic_t children_in_excess;
 
 	struct mem_cgroup_per_node *nodeinfo[0];
 	/* WARNING: nodeinfo must be the last member here */
@@ -422,6 +431,7 @@ static bool move_file(void)
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 	return mem_cgroup_zoneinfo(memcg, nid, zid);
 }
 
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
+	 */
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
+		excess = res_counter_soft_limit_excess(&memcg->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node(node) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(memcg, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(memcg, mz, mctz);
+		}
+	}
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will to add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
+		!css_tryget(&mz->memcg->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -698,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	unsigned long val = 0;
 	int cpu;
 
+	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	val += memcg->nocpu_base.events[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
+	put_online_cpus();
 	return val;
 }
 
@@ -822,48 +992,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 }
 
 /*
- * Called from rate-limited memcg_check_events when enough
- * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
- * that all the parents up the hierarchy will be notified that this group
- * is in excess or that it is not in excess anymore. mmecg->soft_contributed
- * makes the transition a single action whenever the state flips from one to
- * the other.
- */
-static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
-{
-	unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
-	struct mem_cgroup *parent = memcg;
-	int delta = 0;
-
-	spin_lock(&memcg->soft_lock);
-	if (excess) {
-		if (!memcg->soft_contributed) {
-			delta = 1;
-			memcg->soft_contributed = true;
-		}
-	} else {
-		if (memcg->soft_contributed) {
-			delta = -1;
-			memcg->soft_contributed = false;
-		}
-	}
-
-	/*
-	 * Necessary to update all ancestors when hierarchy is used
-	 * because their event counter is not touched.
-	 * We track children even outside the hierarchy for the root
-	 * cgroup because tree walk starting at root should visit
-	 * all cgroups and we want to prevent from pointless tree
-	 * walk if no children is below the limit.
-	 */
-	while (delta && (parent = parent_mem_cgroup(parent)))
-		atomic_add(delta, &parent->children_in_excess);
-	if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-		atomic_add(delta, &root_mem_cgroup->children_in_excess);
-	spin_unlock(&memcg->soft_lock);
-}
-
-/*
  * Check events in order.
  *
  */
@@ -886,7 +1014,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 	mem_cgroup_threshold(memcg);
 	if (unlikely(do_softlimit))
-		mem_cgroup_update_soft_limit(memcg);
+		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1057,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return memcg;
 }
 
-static enum mem_cgroup_filter_t
-mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
-		mem_cgroup_iter_filter cond)
-{
-	if (!cond)
-		return VISIT;
-	return cond(memcg, root);
-}
-
 /*
  * Returns a next (in a pre-order walk) alive memcg (with elevated css
  * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1064,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
  * helper function to be used by mem_cgroup_iter
  */
 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
-		struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
+		struct mem_cgroup *last_visited)
 {
 	struct cgroup_subsys_state *prev_css, *next_css;
 
@@ -963,31 +1082,11 @@ skip_node:
 	if (next_css) {
 		struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 
-		switch (mem_cgroup_filter(mem, root, cond)) {
-		case SKIP:
+		if (css_tryget(&mem->css))
+			return mem;
+		else {
 			prev_css = next_css;
 			goto skip_node;
-		case SKIP_TREE:
-			if (mem == root)
-				return NULL;
-			/*
-			 * css_rightmost_descendant is not an optimal way to
-			 * skip through a subtree (especially for imbalanced
-			 * trees leaning to right) but that's what we have right
-			 * now. More effective solution would be traversing
-			 * right-up for first non-NULL without calling
-			 * css_next_descendant_pre afterwards.
-			 */
-			prev_css = css_rightmost_descendant(next_css);
-			goto skip_node;
-		case VISIT:
-			if (css_tryget(&mem->css))
-				return mem;
-			else {
-				prev_css = next_css;
-				goto skip_node;
-			}
-			break;
 		}
 	}
 
@@ -1051,7 +1150,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
- * @cond: filter for visited nodes, NULL for no filter
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1162,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
-struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
-				   struct mem_cgroup_reclaim_cookie *reclaim,
-				   mem_cgroup_iter_filter cond)
+				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
 
-	if (mem_cgroup_disabled()) {
-		/* first call must return non-NULL, second return NULL */
-		return (struct mem_cgroup *)(unsigned long)!prev;
-	}
+	if (mem_cgroup_disabled())
+		return NULL;
 
 	if (!root)
 		root = root_mem_cgroup;
@@ -1086,9 +1181,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			goto out_css_put;
-		if (mem_cgroup_filter(root, root, cond) == VISIT)
-			return root;
-		return NULL;
+		return root;
 	}
 
 	rcu_read_lock();
@@ -1111,7 +1204,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
-		memcg = __mem_cgroup_iter_next(root, last_visited, cond);
+		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
 			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1215,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
 			reclaim->generation = iter->generation;
 		}
 
-		/*
-		 * We have finished the whole tree walk or no group has been
-		 * visited because filter told us to skip the root node.
-		 */
-		if (!memcg && (prev || (cond && !last_visited)))
+		if (prev && !memcg)
 			goto out_unlock;
 	}
 out_unlock:
@@ -1767,7 +1856,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	return total;
 }
 
-#if MAX_NUMNODES > 1
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
@@ -1790,6 +1878,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 	return false;
 
 }
+#if MAX_NUMNODES > 1
 
 /*
  * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,50 +1946,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	return node;
 }
 
+/*
+ * Check all nodes whether it contains reclaimable pages or not.
+ * For quick scan, we make use of scan_nodes. This will allow us to skip
+ * unused nodes. But scan_nodes is lazily updated and may not cotain
+ * enough new information. We need to do double check.
+ */
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
+{
+	int nid;
+
+	/*
+	 * quick check...making use of scan_node.
+	 * We can skip unused nodes.
+	 */
+	if (!nodes_empty(memcg->scan_nodes)) {
+		for (nid = first_node(memcg->scan_nodes);
+		     nid < MAX_NUMNODES;
+		     nid = next_node(nid, memcg->scan_nodes)) {
+
+			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+				return true;
+		}
+	}
+	/*
+	 * Check rest of nodes.
+	 */
+	for_each_node_state(nid, N_MEMORY) {
+		if (node_isset(nid, memcg->scan_nodes))
+			continue;
+		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
+			return true;
+	}
+	return false;
+}
+
 #else
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 
-#endif
-
-/*
- * A group is eligible for the soft limit reclaim under the given root
- * hierarchy if
- *	a) it is over its soft limit
- *	b) any parent up the hierarchy is over its soft limit
- *
- * If the given group doesn't have any children over the limit then it
- * doesn't make any sense to iterate its subtree.
- */
-enum mem_cgroup_filter_t
-mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
-		struct mem_cgroup *root)
+static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
-	struct mem_cgroup *parent;
-
-	if (!memcg)
-		memcg = root_mem_cgroup;
-	parent = memcg;
-
-	if (res_counter_soft_limit_excess(&memcg->res))
-		return VISIT;
+	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
+}
+#endif
 
-	/*
-	 * If any parent up to the root in the hierarchy is over its soft limit
-	 * then we have to obey and reclaim from this group as well.
-	 */
-	while ((parent = parent_mem_cgroup(parent))) {
-		if (res_counter_soft_limit_excess(&parent->res))
-			return VISIT;
-		if (parent == root)
+static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
+				   struct zone *zone,
+				   gfp_t gfp_mask,
+				   unsigned long *total_scanned)
+{
+	struct mem_cgroup *victim = NULL;
+	int total = 0;
+	int loop = 0;
+	unsigned long excess;
+	unsigned long nr_scanned;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.zone = zone,
+		.priority = 0,
+	};
+
+	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
+
+	while (1) {
+		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
+		if (!victim) {
+			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!total)
+					break;
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not to excessive so as to
+				 * reclaim too much, nor too less that we keep
+				 * coming back to reclaim from this cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+					break;
+			}
+			continue;
+		}
+		if (!mem_cgroup_reclaimable(victim, false))
+			continue;
+		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+						     zone, &nr_scanned);
+		*total_scanned += nr_scanned;
+		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
-
-	if (!atomic_read(&memcg->children_in_excess))
-		return SKIP_TREE;
-	return SKIP;
+	mem_cgroup_iter_break(root_memcg, victim);
+	return total;
 }
 
 static DEFINE_SPINLOCK(memcg_oom_lock);
@@ -2018,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 	memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	bool locked;
-	int wakeups;
-
 	if (!current->memcg_oom.may_oom)
 		return;
-
-	current->memcg_oom.in_memcg_oom = 1;
-
 	/*
-	 * As with any blocking lock, a contender needs to start
-	 * listening for wakeups before attempting the trylock,
-	 * otherwise it can miss the wakeup from the unlock and sleep
-	 * indefinitely. This is just open-coded because our locking
-	 * is so particular to memcg hierarchies.
+	 * We are in the middle of the charge context here, so we
+	 * don't want to block when potentially sitting on a callstack
+	 * that holds all kinds of filesystem and mm locks.
+	 *
+	 * Also, the caller may handle a failed allocation gracefully
+	 * (like optional page cache readahead) and so an OOM killer
+	 * invocation might not even be necessary.
+	 *
+	 * That's why we don't do anything here except remember the
+	 * OOM context and then deal with it at the end of the page
+	 * fault when the stack is unwound, the locks are released,
+	 * and when we know whether the fault was overall successful.
 	 */
-	wakeups = atomic_read(&memcg->oom_wakeups);
-	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
-		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		mem_cgroup_out_of_memory(memcg, mask, order);
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges. Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	} else {
-		/*
-		 * A system call can just return -ENOMEM, but if this
-		 * is a page fault and somebody else is handling the
-		 * OOM already, we need to sleep on the OOM waitqueue
-		 * for this memcg until the situation is resolved.
-		 * Which can take some time because it might be
-		 * handled by a userspace task.
-		 *
-		 * However, this is the charge context, which means
-		 * that we may sit on a large call stack and hold
-		 * various filesystem locks, the mmap_sem etc. and we
-		 * don't want the OOM handler to deadlock on them
-		 * while we sit here and wait. Store the current OOM
-		 * context in the task_struct, then return -ENOMEM.
-		 * At the end of the page fault handler, with the
-		 * stack unwound, pagefault_out_of_memory() will check
-		 * back with us by calling
-		 * mem_cgroup_oom_synchronize(), possibly putting the
-		 * task to sleep.
-		 */
-		current->memcg_oom.oom_locked = locked;
-		current->memcg_oom.wakeups = wakeups;
-		css_get(&memcg->css);
-		current->memcg_oom.wait_on_memcg = memcg;
-	}
+	css_get(&memcg->css);
+	current->memcg_oom.memcg = memcg;
+	current->memcg_oom.gfp_mask = mask;
+	current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
 * situation. Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
 */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+	struct mem_cgroup *memcg = current->memcg_oom.memcg;
 	struct oom_wait_info owait;
-	struct mem_cgroup *memcg;
+	bool locked;
 
 	/* OOM is global, do not handle */
-	if (!current->memcg_oom.in_memcg_oom)
-		return false;
-
-	/*
-	 * We invoked the OOM killer but there is a chance that a kill
-	 * did not free up any charges. Everybody else might already
-	 * be sleeping, so restart the fault and keep the rampage
-	 * going until some charges are released.
-	 */
-	memcg = current->memcg_oom.wait_on_memcg;
 	if (!memcg)
-		goto out;
+		return false;
 
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		goto out_memcg;
+	if (!handle)
+		goto cleanup;
 
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
@@ -2130,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
 	INIT_LIST_HEAD(&owait.wait.task_list);
 
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-	/* Only sleep if we didn't miss any wakeups since OOM */
-	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+	mem_cgroup_mark_under_oom(memcg);
+
+	locked = mem_cgroup_oom_trylock(memcg);
+
+	if (locked)
+		mem_cgroup_oom_notify(memcg);
+
+	if (locked && !memcg->oom_kill_disable) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+					 current->memcg_oom.order);
+	} else {
 		schedule();
-	finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-	mem_cgroup_unmark_under_oom(memcg);
-	if (current->memcg_oom.oom_locked) {
+		mem_cgroup_unmark_under_oom(memcg);
+		finish_wait(&memcg_oom_waitq, &owait.wait);
+	}
+
+	if (locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2145,10 +2249,9 @@ out_memcg:
 		 */
 		memcg_oom_recover(memcg);
 	}
+cleanup:
+	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
-	current->memcg_oom.wait_on_memcg = NULL;
-out:
-	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2562,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	    || fatal_signal_pending(current)))
 		goto bypass;
 
+	if (unlikely(task_in_memcg_oom(current)))
+		goto bypass;
+
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
@@ -2660,6 +2766,8 @@ done:
 	return 0;
 nomem:
 	*ptr = NULL;
+	if (gfp_mask & __GFP_NOFAIL)
+		return 0;
 	return -ENOMEM;
 bypass:
 	*ptr = root_mem_cgroup;
@@ -2812,7 +2920,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	unlock_page_cgroup(pc);
 
 	/*
-	 * "charge_statistics" updated event counter.
+	 * "charge_statistics" updated event counter. Then, check it.
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
@@ -4647,6 +4757,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+					    gfp_t gfp_mask,
+					    unsigned long *total_scanned)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * This loop can run a while, specially if mem_cgroup's continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
+						    gfp_mask, &nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have aded the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz)
+					css_put(&next_mz->memcg->css);
+				else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->memcg->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because due
+		 * to priority we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->memcg->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->memcg->css);
+	return nr_reclaimed;
+}
+
 /**
  * mem_cgroup_force_empty_list - clears LRU of a group
  * @memcg: group to clear
@@ -5911,6 +6113,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->nodeinfo[node] = pn;
@@ -5966,6 +6170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	int node;
 	size_t size = memcg_size();
 
+	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 
 	for_each_node(node)
@@ -6002,6 +6207,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void __init mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node(node) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		BUG_ON(!rtpn);
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -6031,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
-	spin_lock_init(&memcg->soft_lock);
 
 	return &memcg->css;
 
@@ -6109,13 +6336,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
-	if (memcg->soft_contributed) {
-		while ((memcg = parent_mem_cgroup(memcg)))
-			atomic_dec(&memcg->children_in_excess);
-
-		if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
-			atomic_dec(&root_mem_cgroup->children_in_excess);
-	}
 	mem_cgroup_destroy_all_caches(memcg);
 	vmpressure_cleanup(&memcg->vmpressure);
 }
@@ -6790,6 +7010,7 @@ static int __init mem_cgroup_init(void)
 {
 	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	enable_swap_cgroup();
+	mem_cgroup_soft_limit_tree_init();
 	memcg_stock_init();
 	return 0;
 }
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 947ed5413279..bf3351b5115e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1114,8 +1114,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		 * shake_page could have turned it free.
 		 */
 		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy, 2nd try",
-				DELAYED);
+			if (flags & MF_COUNT_INCREASED)
+				action_result(pfn, "free buddy", DELAYED);
+			else
+				action_result(pfn, "free buddy, 2nd try", DELAYED);
 			return 0;
 		}
 		action_result(pfn, "non LRU", IGNORED);
@@ -1349,7 +1351,7 @@ int unpoison_memory(unsigned long pfn)
 	 * worked by memory_failure() and the page lock is not held yet.
 	 * In such case, we yield to memory_failure() and make unpoison fail.
 	 */
-	if (PageTransHuge(page)) {
+	if (!PageHuge(page) && PageTransHuge(page)) {
 		pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
 		return 0;
 	}
diff --git a/mm/memory.c b/mm/memory.c
index ca0003947115..1311f26497e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 				 */
 				make_migration_entry_read(&entry);
 				pte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(*src_pte))
+					pte = pte_swp_mksoft_dirty(pte);
 				set_pte_at(src_mm, addr, src_pte, pte);
 			}
 		}
@@ -3863,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * space. Kernel faults are handled more gracefully.
 	 */
 	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_enable_oom();
+		mem_cgroup_oom_enable();
 
 	ret = __handle_mm_fault(mm, vma, address, flags);
 
-	if (flags & FAULT_FLAG_USER)
-		mem_cgroup_disable_oom();
-
-	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
-		mem_cgroup_oom_synchronize();
+	if (flags & FAULT_FLAG_USER) {
+		mem_cgroup_oom_disable();
+		/*
+		 * The task may have entered a memcg OOM situation but
+		 * if the allocation error was handled gracefully (no
+		 * VM_FAULT_OOM), there is no need to kill anything.
+		 * Just clean up the OOM state peacefully.
+		 */
+		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+			mem_cgroup_oom_synchronize(false);
+	}
 
 	return ret;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 9c8d5f59d30b..7a7325ee1d08 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l)
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
 				page_is_file_cache(page));
-		if (unlikely(balloon_page_movable(page)))
+		if (unlikely(isolated_balloon_page(page)))
 			balloon_page_putback(page);
 		else
 			putback_lru_page(page);
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 
 	get_page(new);
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+	if (pte_swp_soft_dirty(*ptep))
+		pte = pte_mksoft_dirty(pte);
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/mlock.c b/mm/mlock.c
index d63802663242..d480cd6fc475 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -379,10 +379,14 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 
 	/*
 	 * Initialize pte walk starting at the already pinned page where we
-	 * are sure that there is a pte.
+	 * are sure that there is a pte, as it was pinned under the same
+	 * mmap_sem write op.
 	 */
 	pte = get_locked_pte(vma->vm_mm, start, &ptl);
-	end = min(end, pmd_addr_end(start, end));
+	/* Make sure we do not cross the page table boundary */
+	end = pgd_addr_end(start, end);
+	end = pud_addr_end(start, end);
+	end = pmd_addr_end(start, end);
 
 	/* The page next to the pinned page is the first we will try to get */
 	start += PAGE_SIZE;
@@ -736,6 +740,7 @@ static int do_mlockall(int flags)
 
 		/* Ignore errors */
 		mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+		cond_resched();
 	}
 out:
 	return 0;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b43..a3af058f68e4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
 			if (is_write_migration_entry(entry)) {
+				pte_t newpte;
 				/*
 				 * A protection check is difficult so
 				 * just be safe and disable write
 				 */
 				make_migration_entry_read(&entry);
-				set_pte_at(mm, addr, pte,
-					swp_entry_to_pte(entry));
+				newpte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(oldpte))
+					newpte = pte_swp_mksoft_dirty(newpte);
+				set_pte_at(mm, addr, pte, newpte);
 			}
 			pages++;
 		}
diff --git a/mm/mremap.c b/mm/mremap.c
index 91b13d6a16d4..0843feb66f3d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,6 @@
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 
 #include "internal.h"
 
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 		return NULL;
 
 	pmd = pmd_alloc(mm, pud, addr);
-	if (!pmd) {
-		pud_free(mm, pud);
+	if (!pmd)
 		return NULL;
-	}
 
 	VM_BUG_ON(pmd_trans_huge(*pmd));
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..6738c47f1f72 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
 	struct zonelist *zonelist;
 
-	if (mem_cgroup_oom_synchronize())
+	if (mem_cgroup_oom_synchronize(true))
 		return;
 
 	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f5236f804aa6..63807583d8e8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
 	return 1;
 }
 
-static long bdi_max_pause(struct backing_dev_info *bdi,
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
 			   unsigned long bdi_dirty)
 {
-	long bw = bdi->avg_write_bandwidth;
-	long t;
+	unsigned long bw = bdi->avg_write_bandwidth;
+	unsigned long t;
 
 	/*
 	 * Limit pause time for small memory systems. If sleeping for too long
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
 	t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
 	t++;
 
-	return min_t(long, t, MAX_PAUSE);
+	return min_t(unsigned long, t, MAX_PAUSE);
 }
 
 static long bdi_min_pause(struct backing_dev_info *bdi,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0ee638f76ebe..dd886fac451a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6366,10 +6366,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6366 list_del(&page->lru); 6366 list_del(&page->lru);
6367 rmv_page_order(page); 6367 rmv_page_order(page);
6368 zone->free_area[order].nr_free--; 6368 zone->free_area[order].nr_free--;
6369#ifdef CONFIG_HIGHMEM
6370 if (PageHighMem(page))
6371 totalhigh_pages -= 1 << order;
6372#endif
6373 for (i = 0; i < (1 << order); i++) 6369 for (i = 0; i < (1 << order); i++)
6374 SetPageReserved((page+i)); 6370 SetPageReserved((page+i));
6375 pfn += (1 << order); 6371 pfn += (1 << order);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a3443278ce3a..e2e98af703ea 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
56 continue; 56 continue;
57 } 57 }
58 58
59#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
59 /* 60 /*
60 * For simplicity, we won't check this in the list of memcg 61 * For simplicity, we won't check this in the list of memcg
61 * caches. We have control over memcg naming, and if there 62 * caches. We have control over memcg naming, and if there
@@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
69 s = NULL; 70 s = NULL;
70 return -EINVAL; 71 return -EINVAL;
71 } 72 }
73#endif
72 } 74 }
73 75
74 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 76 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
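
The mm/slab_common.c hunk compiles the duplicate-cache-name check out when SLUB is built with CONFIG_SLUB_DEBUG_ON, presumably because that configuration legitimately ends up with same-named caches (cache merging/aliasing is disabled there), so the warning would misfire; the cheap checks below it stay unconditional. A generic sketch of gating a sanity check behind build configuration, with an invented SKIP_NAME_CHECK macro standing in for the real config symbols:

#include <stdio.h>
#include <string.h>

/* Build with -DSKIP_NAME_CHECK to compile the duplicate check out,
 * mirroring the !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
 * guard in the hunk above. */
static int sanity_check_name(const char *name)
{
#ifndef SKIP_NAME_CHECK
        static const char *existing[] = { "dentry", "inode_cache", "kmalloc-64" };
        size_t i;

        for (i = 0; i < sizeof(existing) / sizeof(existing[0]); i++) {
                if (strcmp(existing[i], name) == 0) {
                        fprintf(stderr, "duplicate cache name: %s\n", name);
                        return -1;
                }
        }
#endif
        /* Cheap check that stays enabled in every configuration. */
        if (strchr(name, ' '))
                return -1;
        return 0;
}

int main(void)
{
        printf("%d\n", sanity_check_name("dentry"));
        printf("%d\n", sanity_check_name("my_cache"));
        return 0;
}
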
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3963fc24fcc1..de7c904e52e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1824 struct filename *pathname; 1824 struct filename *pathname;
1825 int i, type, prev; 1825 int i, type, prev;
1826 int err; 1826 int err;
1827 unsigned int old_block_size;
1827 1828
1828 if (!capable(CAP_SYS_ADMIN)) 1829 if (!capable(CAP_SYS_ADMIN))
1829 return -EPERM; 1830 return -EPERM;
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1914 } 1915 }
1915 1916
1916 swap_file = p->swap_file; 1917 swap_file = p->swap_file;
1918 old_block_size = p->old_block_size;
1917 p->swap_file = NULL; 1919 p->swap_file = NULL;
1918 p->max = 0; 1920 p->max = 0;
1919 swap_map = p->swap_map; 1921 swap_map = p->swap_map;
@@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1938 inode = mapping->host; 1940 inode = mapping->host;
1939 if (S_ISBLK(inode->i_mode)) { 1941 if (S_ISBLK(inode->i_mode)) {
1940 struct block_device *bdev = I_BDEV(inode); 1942 struct block_device *bdev = I_BDEV(inode);
1941 set_blocksize(bdev, p->old_block_size); 1943 set_blocksize(bdev, old_block_size);
1942 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 1944 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1943 } else { 1945 } else {
1944 mutex_lock(&inode->i_mutex); 1946 mutex_lock(&inode->i_mutex);
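
The mm/swapfile.c hunk copies p->old_block_size into a local before the swap_info fields are cleared, apparently so that the set_blocksize() call later in swapoff cannot read a value that a concurrent swapon has meanwhile stored into the reused descriptor. A minimal sketch of the pattern, snapshotting what is still needed before a shared descriptor is reset (the structure and names below are invented):

#include <stdio.h>
#include <string.h>

struct swap_slot {
        char dev[32];             /* backing device path              */
        unsigned int block_size;  /* value to restore on teardown     */
};

static struct swap_slot slot;     /* shared descriptor, may be reused */

static void restore_blocksize(const char *dev, unsigned int bs)
{
        printf("restoring %s to block size %u\n", dev, bs);
}

static void teardown_slot(void)
{
        char dev[32];
        unsigned int old_bs;

        /* Snapshot everything still needed... */
        snprintf(dev, sizeof(dev), "%s", slot.dev);
        old_bs = slot.block_size;

        /* ...before the descriptor is cleared and can be handed out again. */
        memset(&slot, 0, sizeof(slot));

        /* Safe: uses the snapshots, not the possibly-reused descriptor. */
        restore_blocksize(dev, old_bs);
}

int main(void)
{
        snprintf(slot.dev, sizeof(slot.dev), "/dev/sda2");
        slot.block_size = 4096;

        teardown_slot();
        return 0;
}
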
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc9..eea668d9cff6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
48#include <asm/div64.h> 48#include <asm/div64.h>
49 49
50#include <linux/swapops.h> 50#include <linux/swapops.h>
51#include <linux/balloon_compaction.h>
51 52
52#include "internal.h" 53#include "internal.h"
53 54
@@ -139,23 +140,11 @@ static bool global_reclaim(struct scan_control *sc)
139{ 140{
140 return !sc->target_mem_cgroup; 141 return !sc->target_mem_cgroup;
141} 142}
142
143static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
144{
145 struct mem_cgroup *root = sc->target_mem_cgroup;
146 return !mem_cgroup_disabled() &&
147 mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
148}
149#else 143#else
150static bool global_reclaim(struct scan_control *sc) 144static bool global_reclaim(struct scan_control *sc)
151{ 145{
152 return true; 146 return true;
153} 147}
154
155static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
156{
157 return false;
158}
159#endif 148#endif
160 149
161unsigned long zone_reclaimable_pages(struct zone *zone) 150unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -222,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker)
222 down_write(&shrinker_rwsem); 211 down_write(&shrinker_rwsem);
223 list_del(&shrinker->list); 212 list_del(&shrinker->list);
224 up_write(&shrinker_rwsem); 213 up_write(&shrinker_rwsem);
214 kfree(shrinker->nr_deferred);
225} 215}
226EXPORT_SYMBOL(unregister_shrinker); 216EXPORT_SYMBOL(unregister_shrinker);
227 217
@@ -1125,7 +1115,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1125 LIST_HEAD(clean_pages); 1115 LIST_HEAD(clean_pages);
1126 1116
1127 list_for_each_entry_safe(page, next, page_list, lru) { 1117 list_for_each_entry_safe(page, next, page_list, lru) {
1128 if (page_is_file_cache(page) && !PageDirty(page)) { 1118 if (page_is_file_cache(page) && !PageDirty(page) &&
1119 !isolated_balloon_page(page)) {
1129 ClearPageActive(page); 1120 ClearPageActive(page);
1130 list_move(&page->lru, &clean_pages); 1121 list_move(&page->lru, &clean_pages);
1131 } 1122 }
@@ -2176,11 +2167,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
2176 } 2167 }
2177} 2168}
2178 2169
2179static int 2170static void shrink_zone(struct zone *zone, struct scan_control *sc)
2180__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2181{ 2171{
2182 unsigned long nr_reclaimed, nr_scanned; 2172 unsigned long nr_reclaimed, nr_scanned;
2183 int groups_scanned = 0;
2184 2173
2185 do { 2174 do {
2186 struct mem_cgroup *root = sc->target_mem_cgroup; 2175 struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2188,17 +2177,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2188 .zone = zone, 2177 .zone = zone,
2189 .priority = sc->priority, 2178 .priority = sc->priority,
2190 }; 2179 };
2191 struct mem_cgroup *memcg = NULL; 2180 struct mem_cgroup *memcg;
2192 mem_cgroup_iter_filter filter = (soft_reclaim) ?
2193 mem_cgroup_soft_reclaim_eligible : NULL;
2194 2181
2195 nr_reclaimed = sc->nr_reclaimed; 2182 nr_reclaimed = sc->nr_reclaimed;
2196 nr_scanned = sc->nr_scanned; 2183 nr_scanned = sc->nr_scanned;
2197 2184
2198 while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { 2185 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2186 do {
2199 struct lruvec *lruvec; 2187 struct lruvec *lruvec;
2200 2188
2201 groups_scanned++;
2202 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2189 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2203 2190
2204 shrink_lruvec(lruvec, sc); 2191 shrink_lruvec(lruvec, sc);
@@ -2218,7 +2205,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2218 mem_cgroup_iter_break(root, memcg); 2205 mem_cgroup_iter_break(root, memcg);
2219 break; 2206 break;
2220 } 2207 }
2221 } 2208 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2209 } while (memcg);
2222 2210
2223 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2211 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2224 sc->nr_scanned - nr_scanned, 2212 sc->nr_scanned - nr_scanned,
@@ -2226,37 +2214,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2226 2214
2227 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2215 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2228 sc->nr_scanned - nr_scanned, sc)); 2216 sc->nr_scanned - nr_scanned, sc));
2229
2230 return groups_scanned;
2231}
2232
2233
2234static void shrink_zone(struct zone *zone, struct scan_control *sc)
2235{
2236 bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
2237 unsigned long nr_scanned = sc->nr_scanned;
2238 int scanned_groups;
2239
2240 scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
2241 /*
2242 * memcg iterator might race with other reclaimer or start from
2243 * a incomplete tree walk so the tree walk in __shrink_zone
2244 * might have missed groups that are above the soft limit. Try
2245 * another loop to catch up with others. Do it just once to
2246 * prevent from reclaim latencies when other reclaimers always
2247 * preempt this one.
2248 */
2249 if (do_soft_reclaim && !scanned_groups)
2250 __shrink_zone(zone, sc, do_soft_reclaim);
2251
2252 /*
2253 * No group is over the soft limit or those that are do not have
2254 * pages in the zone we are reclaiming so we have to reclaim everybody
2255 */
2256 if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
2257 __shrink_zone(zone, sc, false);
2258 return;
2259 }
2260} 2217}
2261 2218
2262/* Returns true if compaction should go ahead for a high-order request */ 2219/* Returns true if compaction should go ahead for a high-order request */
@@ -2320,6 +2277,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2320{ 2277{
2321 struct zoneref *z; 2278 struct zoneref *z;
2322 struct zone *zone; 2279 struct zone *zone;
2280 unsigned long nr_soft_reclaimed;
2281 unsigned long nr_soft_scanned;
2323 bool aborted_reclaim = false; 2282 bool aborted_reclaim = false;
2324 2283
2325 /* 2284 /*
@@ -2359,6 +2318,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2359 continue; 2318 continue;
2360 } 2319 }
2361 } 2320 }
2321 /*
2322 * This steals pages from memory cgroups over softlimit
2323 * and returns the number of reclaimed pages and
2324 * scanned pages. This works for global memory pressure
2325 * and balancing, not for a memcg's limit.
2326 */
2327 nr_soft_scanned = 0;
2328 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2329 sc->order, sc->gfp_mask,
2330 &nr_soft_scanned);
2331 sc->nr_reclaimed += nr_soft_reclaimed;
2332 sc->nr_scanned += nr_soft_scanned;
2362 /* need some check for avoid more shrink_zone() */ 2333 /* need some check for avoid more shrink_zone() */
2363 } 2334 }
2364 2335
@@ -2952,6 +2923,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2952{ 2923{
2953 int i; 2924 int i;
2954 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2925 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2926 unsigned long nr_soft_reclaimed;
2927 unsigned long nr_soft_scanned;
2955 struct scan_control sc = { 2928 struct scan_control sc = {
2956 .gfp_mask = GFP_KERNEL, 2929 .gfp_mask = GFP_KERNEL,
2957 .priority = DEF_PRIORITY, 2930 .priority = DEF_PRIORITY,
@@ -3066,6 +3039,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3066 3039
3067 sc.nr_scanned = 0; 3040 sc.nr_scanned = 0;
3068 3041
3042 nr_soft_scanned = 0;
3043 /*
3044 * Call soft limit reclaim before calling shrink_zone.
3045 */
3046 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3047 order, sc.gfp_mask,
3048 &nr_soft_scanned);
3049 sc.nr_reclaimed += nr_soft_reclaimed;
3050
3069 /* 3051 /*
3070 * There should be no need to raise the scanning 3052 * There should be no need to raise the scanning
3071 * priority if enough pages are already being scanned 3053 * priority if enough pages are already being scanned
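
Most of the mm/vmscan.c hunks restore the previous reclaim flow: shrink_zone() again walks the memcg hierarchy with mem_cgroup_iter() in a plain do/while loop, breaking out early via mem_cgroup_iter_break() once enough has been reclaimed, and soft-limit reclaim is again performed up front through mem_cgroup_soft_limit_reclaim() in shrink_zones() and balance_pgdat(). A small sketch of that iterator-consumption shape, using an invented group_iter() over a toy hierarchy rather than the real memcg API:

#include <stdio.h>
#include <stddef.h>

struct group {
        const char *name;
        struct group *next;        /* flattened "hierarchy" for the sketch */
};

/* group_iter(root, prev): return the first group when prev == NULL,
 * otherwise the next one, or NULL when the walk is finished. */
static struct group *group_iter(struct group *root, struct group *prev)
{
        return prev ? prev->next : root;
}

int main(void)
{
        struct group c = { "child-b", NULL };
        struct group b = { "child-a", &c };
        struct group root = { "root", &b };
        unsigned long reclaimed = 0, target = 2;
        struct group *g;

        /* Same shape as the restored shrink_zone() loop:
         * prime the iterator, then do { work } while (next). */
        g = group_iter(&root, NULL);
        do {
                printf("scanning %s\n", g->name);
                reclaimed++;

                if (reclaimed >= target)   /* early exit, like iter_break() */
                        break;

                g = group_iter(&root, g);
        } while (g);

        return 0;
}
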
diff --git a/mm/zswap.c b/mm/zswap.c
index 841e35f1db22..d93510c6aa2d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type)
804 } 804 }
805 tree->rbroot = RB_ROOT; 805 tree->rbroot = RB_ROOT;
806 spin_unlock(&tree->lock); 806 spin_unlock(&tree->lock);
807
808 zbud_destroy_pool(tree->pool);
809 kfree(tree);
810 zswap_trees[type] = NULL;
807} 811}
808 812
809static struct zbud_ops zswap_zbud_ops = { 813static struct zbud_ops zswap_zbud_ops = {
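
The mm/zswap.c hunk makes zswap_frontswap_invalidate_area() tear the area down completely: after the rbtree is emptied under the tree lock, the zbud pool is destroyed, the tree itself is freed, and zswap_trees[type] is cleared, so nothing is leaked or left dangling across a swapoff/swapon cycle. A rough sketch of that teardown order, with invented stand-ins for the tree and pool:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pool { size_t pages; };          /* stand-in for the zbud pool */

struct tree {
        pthread_mutex_t lock;
        int nr_entries;                 /* stand-in for the rbtree    */
        struct pool *pool;
};

#define MAX_TYPES 32
static struct tree *trees[MAX_TYPES];   /* per-swap-type trees        */

static void invalidate_area(unsigned int type)
{
        struct tree *t = trees[type];

        if (!t)
                return;

        /* 1. Empty the tree under its lock. */
        pthread_mutex_lock(&t->lock);
        t->nr_entries = 0;
        pthread_mutex_unlock(&t->lock);

        /* 2. Tear down the backing pool and the tree itself, then
         *    clear the per-type slot so nothing dangles after swapoff. */
        free(t->pool);
        pthread_mutex_destroy(&t->lock);
        free(t);
        trees[type] = NULL;
}

int main(void)
{
        struct tree *t = calloc(1, sizeof(*t));

        pthread_mutex_init(&t->lock, NULL);
        t->pool = calloc(1, sizeof(*t->pool));
        trees[0] = t;

        invalidate_area(0);
        printf("trees[0] = %p\n", (void *)trees[0]);
        return 0;
}
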