author     Linus Torvalds <torvalds@linux-foundation.org>   2013-09-12 18:44:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-09-12 18:44:27 -0400
commit     ac4de9543aca59f2b763746647577302fbedd57e (patch)
tree       40407750569ee030de56233c41c9a97f7e89cf67 /mm
parent     26935fb06ee88f1188789807687c03041f3c70d9 (diff)
parent     de32a8177f64bc62e1b19c685dd391af664ab13f (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge more patches from Andrew Morton:
 "The rest of MM. Plus one misc cleanup"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
  mm/Kconfig: add MMU dependency for MIGRATION.
  kernel: replace strict_strto*() with kstrto*()
  mm, thp: count thp_fault_fallback anytime thp fault fails
  thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
  thp: do_huge_pmd_anonymous_page() cleanup
  thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
  mm: cleanup add_to_page_cache_locked()
  thp: account anon transparent huge pages into NR_ANON_PAGES
  truncate: drop 'oldsize' truncate_pagecache() parameter
  mm: make lru_add_drain_all() selective
  memcg: document cgroup dirty/writeback memory statistics
  memcg: add per cgroup writeback pages accounting
  memcg: check for proper lock held in mem_cgroup_update_page_stat
  memcg: remove MEMCG_NR_FILE_MAPPED
  memcg: reduce function dereference
  memcg: avoid overflow caused by PAGE_ALIGN
  memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
  memcg: correct RESOURCE_MAX to ULLONG_MAX
  mm: memcg: do not trap chargers with full callstack on OOM
  mm: memcg: rework and document OOM waiting and wakeup
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            4
-rw-r--r--  mm/filemap.c         59
-rw-r--r--  mm/huge_memory.c    129
-rw-r--r--  mm/memcontrol.c     871
-rw-r--r--  mm/memory.c          52
-rw-r--r--  mm/oom_kill.c         7
-rw-r--r--  mm/page-writeback.c  15
-rw-r--r--  mm/rmap.c            22
-rw-r--r--  mm/swap.c            44
-rw-r--r--  mm/truncate.c         9
-rw-r--r--  mm/vmscan.c          83
11 files changed, 593 insertions, 702 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 6cdd27043303..026771a9b097 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,7 +245,7 @@ config COMPACTION
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
+	depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful in
@@ -480,7 +480,7 @@ config FRONTSWAP
 
 config CMA
 	bool "Contiguous Memory Allocator"
-	depends on HAVE_MEMBLOCK
+	depends on HAVE_MEMBLOCK && MMU
 	select MIGRATION
 	select MEMORY_ISOLATION
 	help
diff --git a/mm/filemap.c b/mm/filemap.c
index e607728db4a8..1e6aec4a2d2e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
-		goto out;
+		return error;
 
 	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
-	if (error == 0) {
-		page_cache_get(page);
-		page->mapping = mapping;
-		page->index = offset;
-
-		spin_lock_irq(&mapping->tree_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
-		if (likely(!error)) {
-			mapping->nrpages++;
-			__inc_zone_page_state(page, NR_FILE_PAGES);
-			spin_unlock_irq(&mapping->tree_lock);
-			trace_mm_filemap_add_to_page_cache(page);
-		} else {
-			page->mapping = NULL;
-			/* Leave page->index set: truncation relies upon it */
-			spin_unlock_irq(&mapping->tree_lock);
-			mem_cgroup_uncharge_cache_page(page);
-			page_cache_release(page);
-		}
-		radix_tree_preload_end();
-	} else
+	if (error) {
 		mem_cgroup_uncharge_cache_page(page);
-out:
+		return error;
+	}
+
+	page_cache_get(page);
+	page->mapping = mapping;
+	page->index = offset;
+
+	spin_lock_irq(&mapping->tree_lock);
+	error = radix_tree_insert(&mapping->page_tree, offset, page);
+	radix_tree_preload_end();
+	if (unlikely(error))
+		goto err_insert;
+	mapping->nrpages++;
+	__inc_zone_page_state(page, NR_FILE_PAGES);
+	spin_unlock_irq(&mapping->tree_lock);
+	trace_mm_filemap_add_to_page_cache(page);
+	return 0;
+err_insert:
+	page->mapping = NULL;
+	/* Leave page->index set: truncation relies upon it */
+	spin_unlock_irq(&mapping->tree_lock);
+	mem_cgroup_uncharge_cache_page(page);
+	page_cache_release(page);
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache_locked);
@@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = mapping->host;
 	pgoff_t offset = vmf->pgoff;
 	struct page *page;
+	bool memcg_oom;
 	pgoff_t size;
 	int ret = 0;
 
@@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	/*
-	 * Do we have something in the page cache already?
+	 * Do we have something in the page cache already? Either
+	 * way, try readahead, but disable the memcg OOM killer for it
+	 * as readahead is optional and no errors are propagated up
+	 * the fault stack. The OOM killer is enabled while trying to
+	 * instantiate the faulting page individually below.
 	 */
 	page = find_get_page(mapping, offset);
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
+		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_async_mmap_readahead(vma, ra, file, page, offset);
+		mem_cgroup_toggle_oom(memcg_oom);
 	} else if (!page) {
 		/* No page in the page cache at all */
+		memcg_oom = mem_cgroup_toggle_oom(false);
 		do_sync_mmap_readahead(vma, ra, file, offset);
+		mem_cgroup_toggle_oom(memcg_oom);
 		count_vm_event(PGMAJFAULT);
 		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 		ret = VM_FAULT_MAJOR;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d66010e0049d..7489884682d8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 	return pmd;
 }
 
-static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
+static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
 	pmd_t entry;
-	entry = mk_pmd(page, vma->vm_page_prot);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	entry = mk_pmd(page, prot);
 	entry = pmd_mkhuge(entry);
 	return entry;
 }
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pte_free(mm, pgtable);
 	} else {
 		pmd_t entry;
-		entry = mk_huge_pmd(page, vma);
+		entry = mk_huge_pmd(page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		page_add_new_anon_rmap(page, vma, haddr);
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct page *page;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
-	pte_t *pte;
 
-	if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
-		if (unlikely(anon_vma_prepare(vma)))
-			return VM_FAULT_OOM;
-		if (unlikely(khugepaged_enter(vma)))
+	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+		return VM_FAULT_FALLBACK;
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+	if (unlikely(khugepaged_enter(vma)))
+		return VM_FAULT_OOM;
+	if (!(flags & FAULT_FLAG_WRITE) &&
+			transparent_hugepage_use_zero_page()) {
+		pgtable_t pgtable;
+		struct page *zero_page;
+		bool set;
+		pgtable = pte_alloc_one(mm, haddr);
+		if (unlikely(!pgtable))
 			return VM_FAULT_OOM;
-		if (!(flags & FAULT_FLAG_WRITE) &&
-				transparent_hugepage_use_zero_page()) {
-			pgtable_t pgtable;
-			struct page *zero_page;
-			bool set;
-			pgtable = pte_alloc_one(mm, haddr);
-			if (unlikely(!pgtable))
-				return VM_FAULT_OOM;
-			zero_page = get_huge_zero_page();
-			if (unlikely(!zero_page)) {
-				pte_free(mm, pgtable);
-				count_vm_event(THP_FAULT_FALLBACK);
-				goto out;
-			}
-			spin_lock(&mm->page_table_lock);
-			set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
-					zero_page);
-			spin_unlock(&mm->page_table_lock);
-			if (!set) {
-				pte_free(mm, pgtable);
-				put_huge_zero_page();
-			}
-			return 0;
-		}
-		page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
-					  vma, haddr, numa_node_id(), 0);
-		if (unlikely(!page)) {
+		zero_page = get_huge_zero_page();
+		if (unlikely(!zero_page)) {
+			pte_free(mm, pgtable);
 			count_vm_event(THP_FAULT_FALLBACK);
-			goto out;
-		}
-		count_vm_event(THP_FAULT_ALLOC);
-		if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
-			put_page(page);
-			goto out;
+			return VM_FAULT_FALLBACK;
 		}
-		if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
-							  page))) {
-			mem_cgroup_uncharge_page(page);
-			put_page(page);
-			goto out;
+		spin_lock(&mm->page_table_lock);
+		set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
+				zero_page);
+		spin_unlock(&mm->page_table_lock);
+		if (!set) {
+			pte_free(mm, pgtable);
+			put_huge_zero_page();
 		}
-
 		return 0;
 	}
-out:
-	/*
-	 * Use __pte_alloc instead of pte_alloc_map, because we can't
-	 * run pte_offset_map on the pmd, if an huge pmd could
-	 * materialize from under us from a different thread.
-	 */
-	if (unlikely(pmd_none(*pmd)) &&
-	    unlikely(__pte_alloc(mm, vma, pmd, address)))
-		return VM_FAULT_OOM;
-	/* if an huge pmd materialized from under us just retry later */
-	if (unlikely(pmd_trans_huge(*pmd)))
-		return 0;
-	/*
-	 * A regular pmd is established and it can't morph into a huge pmd
-	 * from under us anymore at this point because we hold the mmap_sem
-	 * read mode and khugepaged takes it in write mode. So now it's
-	 * safe to run pte_offset_map().
-	 */
-	pte = pte_offset_map(pmd, address);
-	return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+			vma, haddr, numa_node_id(), 0);
+	if (unlikely(!page)) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+	if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+		mem_cgroup_uncharge_page(page);
+		put_page(page);
+		count_vm_event(THP_FAULT_FALLBACK);
+		return VM_FAULT_FALLBACK;
+	}
+
+	count_vm_event(THP_FAULT_ALLOC);
+	return 0;
 }
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1170,7 +1150,6 @@ alloc:
 		new_page = NULL;
 
 	if (unlikely(!new_page)) {
-		count_vm_event(THP_FAULT_FALLBACK);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			ret = do_huge_pmd_wp_zero_page_fallback(mm, vma,
 					address, pmd, orig_pmd, haddr);
@@ -1181,9 +1160,9 @@ alloc:
 			split_huge_page(page);
 			put_page(page);
 		}
+		count_vm_event(THP_FAULT_FALLBACK);
 		goto out;
 	}
-	count_vm_event(THP_FAULT_ALLOC);
 
 	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
 		put_page(new_page);
@@ -1191,10 +1170,13 @@ alloc:
 			split_huge_page(page);
 			put_page(page);
 		}
+		count_vm_event(THP_FAULT_FALLBACK);
 		ret |= VM_FAULT_OOM;
 		goto out;
 	}
 
+	count_vm_event(THP_FAULT_ALLOC);
+
 	if (is_huge_zero_pmd(orig_pmd))
 		clear_huge_page(new_page, haddr, HPAGE_PMD_NR);
 	else
@@ -1215,7 +1197,8 @@ alloc:
 		goto out_mn;
 	} else {
 		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma);
+		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
+		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		pmdp_clear_flush(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
 		set_pmd_at(mm, haddr, pmd, entry);
@@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page,
 	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
 
 	ClearPageCompound(page);
 	compound_unlock(page);
@@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	__SetPageUptodate(new_page);
 	pgtable = pmd_pgtable(_pmd);
 
-	_pmd = mk_huge_pmd(new_page, vma);
+	_pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
 
 	/*
 	 * spin_lock() below is not the equivalent of smp_wmb(), so
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c6bd28edd533..d5ff3ce13029 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,7 +39,6 @@
39#include <linux/limits.h> 39#include <linux/limits.h>
40#include <linux/export.h> 40#include <linux/export.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/rbtree.h>
43#include <linux/slab.h> 42#include <linux/slab.h>
44#include <linux/swap.h> 43#include <linux/swap.h>
45#include <linux/swapops.h> 44#include <linux/swapops.h>
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0;
85#endif 84#endif
86 85
87 86
88/*
89 * Statistics for memory cgroup.
90 */
91enum mem_cgroup_stat_index {
92 /*
93 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
94 */
95 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
96 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
97 MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
98 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
99 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
100 MEM_CGROUP_STAT_NSTATS,
101};
102
103static const char * const mem_cgroup_stat_names[] = { 87static const char * const mem_cgroup_stat_names[] = {
104 "cache", 88 "cache",
105 "rss", 89 "rss",
106 "rss_huge", 90 "rss_huge",
107 "mapped_file", 91 "mapped_file",
92 "writeback",
108 "swap", 93 "swap",
109}; 94};
110 95
@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone {
175 160
176 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 161 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
177 162
178 struct rb_node tree_node; /* RB tree node */
179 unsigned long long usage_in_excess;/* Set to the value by which */
180 /* the soft limit is exceeded*/
181 bool on_tree;
182 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 163 struct mem_cgroup *memcg; /* Back pointer, we cannot */
183 /* use container_of */ 164 /* use container_of */
184}; 165};
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node {
187 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 168 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
188}; 169};
189 170
190/*
191 * Cgroups above their limits are maintained in a RB-Tree, independent of
192 * their hierarchy representation
193 */
194
195struct mem_cgroup_tree_per_zone {
196 struct rb_root rb_root;
197 spinlock_t lock;
198};
199
200struct mem_cgroup_tree_per_node {
201 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
202};
203
204struct mem_cgroup_tree {
205 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
206};
207
208static struct mem_cgroup_tree soft_limit_tree __read_mostly;
209
210struct mem_cgroup_threshold { 171struct mem_cgroup_threshold {
211 struct eventfd_ctx *eventfd; 172 struct eventfd_ctx *eventfd;
212 u64 threshold; 173 u64 threshold;
@@ -280,6 +241,7 @@ struct mem_cgroup {
280 241
281 bool oom_lock; 242 bool oom_lock;
282 atomic_t under_oom; 243 atomic_t under_oom;
244 atomic_t oom_wakeups;
283 245
284 int swappiness; 246 int swappiness;
285 /* OOM-Killer disable */ 247 /* OOM-Killer disable */
@@ -304,7 +266,7 @@ struct mem_cgroup {
304 * Should we move charges of a task when a task is moved into this 266 * Should we move charges of a task when a task is moved into this
305 * mem_cgroup ? And what type of charges should we move ? 267 * mem_cgroup ? And what type of charges should we move ?
306 */ 268 */
307 unsigned long move_charge_at_immigrate; 269 unsigned long move_charge_at_immigrate;
308 /* 270 /*
309 * set > 0 if pages under this cgroup are moving to other cgroup. 271 * set > 0 if pages under this cgroup are moving to other cgroup.
310 */ 272 */
@@ -341,6 +303,22 @@ struct mem_cgroup {
341 atomic_t numainfo_events; 303 atomic_t numainfo_events;
342 atomic_t numainfo_updating; 304 atomic_t numainfo_updating;
343#endif 305#endif
306 /*
307 * Protects soft_contributed transitions.
308 * See mem_cgroup_update_soft_limit
309 */
310 spinlock_t soft_lock;
311
312 /*
313 * If true then this group has increased parents' children_in_excess
314 * when it got over the soft limit.
315 * When a group falls bellow the soft limit, parents' children_in_excess
316 * is decreased and soft_contributed changed to false.
317 */
318 bool soft_contributed;
319
320 /* Number of children that are in soft limit excess */
321 atomic_t children_in_excess;
344 322
345 struct mem_cgroup_per_node *nodeinfo[0]; 323 struct mem_cgroup_per_node *nodeinfo[0];
346 /* WARNING: nodeinfo must be the last member here */ 324 /* WARNING: nodeinfo must be the last member here */
@@ -444,7 +422,6 @@ static bool move_file(void)
444 * limit reclaim to prevent infinite loops, if they ever occur. 422 * limit reclaim to prevent infinite loops, if they ever occur.
445 */ 423 */
446#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 424#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
447#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
448 425
449enum charge_type { 426enum charge_type {
450 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 427 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
671 return mem_cgroup_zoneinfo(memcg, nid, zid); 648 return mem_cgroup_zoneinfo(memcg, nid, zid);
672} 649}
673 650
674static struct mem_cgroup_tree_per_zone *
675soft_limit_tree_node_zone(int nid, int zid)
676{
677 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
678}
679
680static struct mem_cgroup_tree_per_zone *
681soft_limit_tree_from_page(struct page *page)
682{
683 int nid = page_to_nid(page);
684 int zid = page_zonenum(page);
685
686 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
687}
688
689static void
690__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
691 struct mem_cgroup_per_zone *mz,
692 struct mem_cgroup_tree_per_zone *mctz,
693 unsigned long long new_usage_in_excess)
694{
695 struct rb_node **p = &mctz->rb_root.rb_node;
696 struct rb_node *parent = NULL;
697 struct mem_cgroup_per_zone *mz_node;
698
699 if (mz->on_tree)
700 return;
701
702 mz->usage_in_excess = new_usage_in_excess;
703 if (!mz->usage_in_excess)
704 return;
705 while (*p) {
706 parent = *p;
707 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
708 tree_node);
709 if (mz->usage_in_excess < mz_node->usage_in_excess)
710 p = &(*p)->rb_left;
711 /*
712 * We can't avoid mem cgroups that are over their soft
713 * limit by the same amount
714 */
715 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
716 p = &(*p)->rb_right;
717 }
718 rb_link_node(&mz->tree_node, parent, p);
719 rb_insert_color(&mz->tree_node, &mctz->rb_root);
720 mz->on_tree = true;
721}
722
723static void
724__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
725 struct mem_cgroup_per_zone *mz,
726 struct mem_cgroup_tree_per_zone *mctz)
727{
728 if (!mz->on_tree)
729 return;
730 rb_erase(&mz->tree_node, &mctz->rb_root);
731 mz->on_tree = false;
732}
733
734static void
735mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
736 struct mem_cgroup_per_zone *mz,
737 struct mem_cgroup_tree_per_zone *mctz)
738{
739 spin_lock(&mctz->lock);
740 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
741 spin_unlock(&mctz->lock);
742}
743
744
745static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
746{
747 unsigned long long excess;
748 struct mem_cgroup_per_zone *mz;
749 struct mem_cgroup_tree_per_zone *mctz;
750 int nid = page_to_nid(page);
751 int zid = page_zonenum(page);
752 mctz = soft_limit_tree_from_page(page);
753
754 /*
755 * Necessary to update all ancestors when hierarchy is used.
756 * because their event counter is not touched.
757 */
758 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
759 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
760 excess = res_counter_soft_limit_excess(&memcg->res);
761 /*
762 * We have to update the tree if mz is on RB-tree or
763 * mem is over its softlimit.
764 */
765 if (excess || mz->on_tree) {
766 spin_lock(&mctz->lock);
767 /* if on-tree, remove it */
768 if (mz->on_tree)
769 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
770 /*
771 * Insert again. mz->usage_in_excess will be updated.
772 * If excess is 0, no tree ops.
773 */
774 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
775 spin_unlock(&mctz->lock);
776 }
777 }
778}
779
780static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
781{
782 int node, zone;
783 struct mem_cgroup_per_zone *mz;
784 struct mem_cgroup_tree_per_zone *mctz;
785
786 for_each_node(node) {
787 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
788 mz = mem_cgroup_zoneinfo(memcg, node, zone);
789 mctz = soft_limit_tree_node_zone(node, zone);
790 mem_cgroup_remove_exceeded(memcg, mz, mctz);
791 }
792 }
793}
794
795static struct mem_cgroup_per_zone *
796__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
797{
798 struct rb_node *rightmost = NULL;
799 struct mem_cgroup_per_zone *mz;
800
801retry:
802 mz = NULL;
803 rightmost = rb_last(&mctz->rb_root);
804 if (!rightmost)
805 goto done; /* Nothing to reclaim from */
806
807 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
808 /*
809 * Remove the node now but someone else can add it back,
810 * we will to add it back at the end of reclaim to its correct
811 * position in the tree.
812 */
813 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
814 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
815 !css_tryget(&mz->memcg->css))
816 goto retry;
817done:
818 return mz;
819}
820
821static struct mem_cgroup_per_zone *
822mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
823{
824 struct mem_cgroup_per_zone *mz;
825
826 spin_lock(&mctz->lock);
827 mz = __mem_cgroup_largest_soft_limit_node(mctz);
828 spin_unlock(&mctz->lock);
829 return mz;
830}
831
832/* 651/*
833 * Implementation Note: reading percpu statistics for memcg. 652 * Implementation Note: reading percpu statistics for memcg.
834 * 653 *
@@ -1003,6 +822,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1003} 822}
1004 823
1005/* 824/*
825 * Called from rate-limited memcg_check_events when enough
826 * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
827 * that all the parents up the hierarchy will be notified that this group
828 * is in excess or that it is not in excess anymore. mmecg->soft_contributed
829 * makes the transition a single action whenever the state flips from one to
830 * the other.
831 */
832static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
833{
834 unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
835 struct mem_cgroup *parent = memcg;
836 int delta = 0;
837
838 spin_lock(&memcg->soft_lock);
839 if (excess) {
840 if (!memcg->soft_contributed) {
841 delta = 1;
842 memcg->soft_contributed = true;
843 }
844 } else {
845 if (memcg->soft_contributed) {
846 delta = -1;
847 memcg->soft_contributed = false;
848 }
849 }
850
851 /*
852 * Necessary to update all ancestors when hierarchy is used
853 * because their event counter is not touched.
854 * We track children even outside the hierarchy for the root
855 * cgroup because tree walk starting at root should visit
856 * all cgroups and we want to prevent from pointless tree
857 * walk if no children is below the limit.
858 */
859 while (delta && (parent = parent_mem_cgroup(parent)))
860 atomic_add(delta, &parent->children_in_excess);
861 if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
862 atomic_add(delta, &root_mem_cgroup->children_in_excess);
863 spin_unlock(&memcg->soft_lock);
864}
865
866/*
1006 * Check events in order. 867 * Check events in order.
1007 * 868 *
1008 */ 869 */
@@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1025 886
1026 mem_cgroup_threshold(memcg); 887 mem_cgroup_threshold(memcg);
1027 if (unlikely(do_softlimit)) 888 if (unlikely(do_softlimit))
1028 mem_cgroup_update_tree(memcg, page); 889 mem_cgroup_update_soft_limit(memcg);
1029#if MAX_NUMNODES > 1 890#if MAX_NUMNODES > 1
1030 if (unlikely(do_numainfo)) 891 if (unlikely(do_numainfo))
1031 atomic_inc(&memcg->numainfo_events); 892 atomic_inc(&memcg->numainfo_events);
@@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1068 return memcg; 929 return memcg;
1069} 930}
1070 931
932static enum mem_cgroup_filter_t
933mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
934 mem_cgroup_iter_filter cond)
935{
936 if (!cond)
937 return VISIT;
938 return cond(memcg, root);
939}
940
1071/* 941/*
1072 * Returns a next (in a pre-order walk) alive memcg (with elevated css 942 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1073 * ref. count) or NULL if the whole root's subtree has been visited. 943 * ref. count) or NULL if the whole root's subtree has been visited.
@@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
1075 * helper function to be used by mem_cgroup_iter 945 * helper function to be used by mem_cgroup_iter
1076 */ 946 */
1077static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 947static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1078 struct mem_cgroup *last_visited) 948 struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
1079{ 949{
1080 struct cgroup_subsys_state *prev_css, *next_css; 950 struct cgroup_subsys_state *prev_css, *next_css;
1081 951
@@ -1093,11 +963,31 @@ skip_node:
1093 if (next_css) { 963 if (next_css) {
1094 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 964 struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
1095 965
1096 if (css_tryget(&mem->css)) 966 switch (mem_cgroup_filter(mem, root, cond)) {
1097 return mem; 967 case SKIP:
1098 else {
1099 prev_css = next_css; 968 prev_css = next_css;
1100 goto skip_node; 969 goto skip_node;
970 case SKIP_TREE:
971 if (mem == root)
972 return NULL;
973 /*
974 * css_rightmost_descendant is not an optimal way to
975 * skip through a subtree (especially for imbalanced
976 * trees leaning to right) but that's what we have right
977 * now. More effective solution would be traversing
978 * right-up for first non-NULL without calling
979 * css_next_descendant_pre afterwards.
980 */
981 prev_css = css_rightmost_descendant(next_css);
982 goto skip_node;
983 case VISIT:
984 if (css_tryget(&mem->css))
985 return mem;
986 else {
987 prev_css = next_css;
988 goto skip_node;
989 }
990 break;
1101 } 991 }
1102 } 992 }
1103 993
@@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1161 * @root: hierarchy root 1051 * @root: hierarchy root
1162 * @prev: previously returned memcg, NULL on first invocation 1052 * @prev: previously returned memcg, NULL on first invocation
1163 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1053 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1054 * @cond: filter for visited nodes, NULL for no filter
1164 * 1055 *
1165 * Returns references to children of the hierarchy below @root, or 1056 * Returns references to children of the hierarchy below @root, or
1166 * @root itself, or %NULL after a full round-trip. 1057 * @root itself, or %NULL after a full round-trip.
@@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1173 * divide up the memcgs in the hierarchy among all concurrent 1064 * divide up the memcgs in the hierarchy among all concurrent
1174 * reclaimers operating on the same zone and priority. 1065 * reclaimers operating on the same zone and priority.
1175 */ 1066 */
1176struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1067struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
1177 struct mem_cgroup *prev, 1068 struct mem_cgroup *prev,
1178 struct mem_cgroup_reclaim_cookie *reclaim) 1069 struct mem_cgroup_reclaim_cookie *reclaim,
1070 mem_cgroup_iter_filter cond)
1179{ 1071{
1180 struct mem_cgroup *memcg = NULL; 1072 struct mem_cgroup *memcg = NULL;
1181 struct mem_cgroup *last_visited = NULL; 1073 struct mem_cgroup *last_visited = NULL;
1182 1074
1183 if (mem_cgroup_disabled()) 1075 if (mem_cgroup_disabled()) {
1184 return NULL; 1076 /* first call must return non-NULL, second return NULL */
1077 return (struct mem_cgroup *)(unsigned long)!prev;
1078 }
1185 1079
1186 if (!root) 1080 if (!root)
1187 root = root_mem_cgroup; 1081 root = root_mem_cgroup;
@@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1192 if (!root->use_hierarchy && root != root_mem_cgroup) { 1086 if (!root->use_hierarchy && root != root_mem_cgroup) {
1193 if (prev) 1087 if (prev)
1194 goto out_css_put; 1088 goto out_css_put;
1195 return root; 1089 if (mem_cgroup_filter(root, root, cond) == VISIT)
1090 return root;
1091 return NULL;
1196 } 1092 }
1197 1093
1198 rcu_read_lock(); 1094 rcu_read_lock();
@@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1215 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1111 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1216 } 1112 }
1217 1113
1218 memcg = __mem_cgroup_iter_next(root, last_visited); 1114 memcg = __mem_cgroup_iter_next(root, last_visited, cond);
1219 1115
1220 if (reclaim) { 1116 if (reclaim) {
1221 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1117 mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1226 reclaim->generation = iter->generation; 1122 reclaim->generation = iter->generation;
1227 } 1123 }
1228 1124
1229 if (prev && !memcg) 1125 /*
1126 * We have finished the whole tree walk or no group has been
1127 * visited because filter told us to skip the root node.
1128 */
1129 if (!memcg && (prev || (cond && !last_visited)))
1230 goto out_unlock; 1130 goto out_unlock;
1231 } 1131 }
1232out_unlock: 1132out_unlock:
@@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1867 return total; 1767 return total;
1868} 1768}
1869 1769
1770#if MAX_NUMNODES > 1
1870/** 1771/**
1871 * test_mem_cgroup_node_reclaimable 1772 * test_mem_cgroup_node_reclaimable
1872 * @memcg: the target memcg 1773 * @memcg: the target memcg
@@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1889 return false; 1790 return false;
1890 1791
1891} 1792}
1892#if MAX_NUMNODES > 1
1893 1793
1894/* 1794/*
1895 * Always updating the nodemask is not very good - even if we have an empty 1795 * Always updating the nodemask is not very good - even if we have an empty
@@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1957 return node; 1857 return node;
1958} 1858}
1959 1859
1960/*
1961 * Check all nodes whether it contains reclaimable pages or not.
1962 * For quick scan, we make use of scan_nodes. This will allow us to skip
1963 * unused nodes. But scan_nodes is lazily updated and may not cotain
1964 * enough new information. We need to do double check.
1965 */
1966static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1967{
1968 int nid;
1969
1970 /*
1971 * quick check...making use of scan_node.
1972 * We can skip unused nodes.
1973 */
1974 if (!nodes_empty(memcg->scan_nodes)) {
1975 for (nid = first_node(memcg->scan_nodes);
1976 nid < MAX_NUMNODES;
1977 nid = next_node(nid, memcg->scan_nodes)) {
1978
1979 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1980 return true;
1981 }
1982 }
1983 /*
1984 * Check rest of nodes.
1985 */
1986 for_each_node_state(nid, N_MEMORY) {
1987 if (node_isset(nid, memcg->scan_nodes))
1988 continue;
1989 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1990 return true;
1991 }
1992 return false;
1993}
1994
1995#else 1860#else
1996int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1861int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1997{ 1862{
1998 return 0; 1863 return 0;
1999} 1864}
2000 1865
2001static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2002{
2003 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2004}
2005#endif 1866#endif
2006 1867
2007static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1868/*
2008 struct zone *zone, 1869 * A group is eligible for the soft limit reclaim under the given root
2009 gfp_t gfp_mask, 1870 * hierarchy if
2010 unsigned long *total_scanned) 1871 * a) it is over its soft limit
2011{ 1872 * b) any parent up the hierarchy is over its soft limit
2012 struct mem_cgroup *victim = NULL; 1873 *
2013 int total = 0; 1874 * If the given group doesn't have any children over the limit then it
2014 int loop = 0; 1875 * doesn't make any sense to iterate its subtree.
2015 unsigned long excess; 1876 */
2016 unsigned long nr_scanned; 1877enum mem_cgroup_filter_t
2017 struct mem_cgroup_reclaim_cookie reclaim = { 1878mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
2018 .zone = zone, 1879 struct mem_cgroup *root)
2019 .priority = 0, 1880{
2020 }; 1881 struct mem_cgroup *parent;
2021 1882
2022 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1883 if (!memcg)
2023 1884 memcg = root_mem_cgroup;
2024 while (1) { 1885 parent = memcg;
2025 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1886
2026 if (!victim) { 1887 if (res_counter_soft_limit_excess(&memcg->res))
2027 loop++; 1888 return VISIT;
2028 if (loop >= 2) { 1889
2029 /* 1890 /*
2030 * If we have not been able to reclaim 1891 * If any parent up to the root in the hierarchy is over its soft limit
2031 * anything, it might because there are 1892 * then we have to obey and reclaim from this group as well.
2032 * no reclaimable pages under this hierarchy 1893 */
2033 */ 1894 while ((parent = parent_mem_cgroup(parent))) {
2034 if (!total) 1895 if (res_counter_soft_limit_excess(&parent->res))
2035 break; 1896 return VISIT;
2036 /* 1897 if (parent == root)
2037 * We want to do more targeted reclaim.
2038 * excess >> 2 is not to excessive so as to
2039 * reclaim too much, nor too less that we keep
2040 * coming back to reclaim from this cgroup
2041 */
2042 if (total >= (excess >> 2) ||
2043 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2044 break;
2045 }
2046 continue;
2047 }
2048 if (!mem_cgroup_reclaimable(victim, false))
2049 continue;
2050 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2051 zone, &nr_scanned);
2052 *total_scanned += nr_scanned;
2053 if (!res_counter_soft_limit_excess(&root_memcg->res))
2054 break; 1898 break;
2055 } 1899 }
2056 mem_cgroup_iter_break(root_memcg, victim); 1900
2057 return total; 1901 if (!atomic_read(&memcg->children_in_excess))
1902 return SKIP_TREE;
1903 return SKIP;
2058} 1904}
2059 1905
1906static DEFINE_SPINLOCK(memcg_oom_lock);
1907
2060/* 1908/*
2061 * Check OOM-Killer is already running under our hierarchy. 1909 * Check OOM-Killer is already running under our hierarchy.
2062 * If someone is running, return false. 1910 * If someone is running, return false.
2063 * Has to be called with memcg_oom_lock
2064 */ 1911 */
2065static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1912static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2066{ 1913{
2067 struct mem_cgroup *iter, *failed = NULL; 1914 struct mem_cgroup *iter, *failed = NULL;
2068 1915
1916 spin_lock(&memcg_oom_lock);
1917
2069 for_each_mem_cgroup_tree(iter, memcg) { 1918 for_each_mem_cgroup_tree(iter, memcg) {
2070 if (iter->oom_lock) { 1919 if (iter->oom_lock) {
2071 /* 1920 /*
@@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
2079 iter->oom_lock = true; 1928 iter->oom_lock = true;
2080 } 1929 }
2081 1930
2082 if (!failed) 1931 if (failed) {
2083 return true; 1932 /*
2084 1933 * OK, we failed to lock the whole subtree so we have
2085 /* 1934 * to clean up what we set up to the failing subtree
2086 * OK, we failed to lock the whole subtree so we have to clean up 1935 */
2087 * what we set up to the failing subtree 1936 for_each_mem_cgroup_tree(iter, memcg) {
2088 */ 1937 if (iter == failed) {
2089 for_each_mem_cgroup_tree(iter, memcg) { 1938 mem_cgroup_iter_break(memcg, iter);
2090 if (iter == failed) { 1939 break;
2091 mem_cgroup_iter_break(memcg, iter); 1940 }
2092 break; 1941 iter->oom_lock = false;
2093 } 1942 }
2094 iter->oom_lock = false;
2095 } 1943 }
2096 return false; 1944
1945 spin_unlock(&memcg_oom_lock);
1946
1947 return !failed;
2097} 1948}
2098 1949
2099/* 1950static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2100 * Has to be called with memcg_oom_lock
2101 */
2102static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2103{ 1951{
2104 struct mem_cgroup *iter; 1952 struct mem_cgroup *iter;
2105 1953
1954 spin_lock(&memcg_oom_lock);
2106 for_each_mem_cgroup_tree(iter, memcg) 1955 for_each_mem_cgroup_tree(iter, memcg)
2107 iter->oom_lock = false; 1956 iter->oom_lock = false;
2108 return 0; 1957 spin_unlock(&memcg_oom_lock);
2109} 1958}
2110 1959
2111static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1960static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
@@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2129 atomic_add_unless(&iter->under_oom, -1, 0); 1978 atomic_add_unless(&iter->under_oom, -1, 0);
2130} 1979}
2131 1980
2132static DEFINE_SPINLOCK(memcg_oom_lock);
2133static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1981static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2134 1982
2135struct oom_wait_info { 1983struct oom_wait_info {
@@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
2159 2007
2160static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2008static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2161{ 2009{
2010 atomic_inc(&memcg->oom_wakeups);
2162 /* for filtering, pass "memcg" as argument. */ 2011 /* for filtering, pass "memcg" as argument. */
2163 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2012 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2164} 2013}
@@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
2170} 2019}
2171 2020
2172/* 2021/*
2173 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 2022 * try to call OOM killer
2174 */ 2023 */
2175static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, 2024static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2176 int order)
2177{ 2025{
2178 struct oom_wait_info owait; 2026 bool locked;
2179 bool locked, need_to_kill; 2027 int wakeups;
2180 2028
2181 owait.memcg = memcg; 2029 if (!current->memcg_oom.may_oom)
2182 owait.wait.flags = 0; 2030 return;
2183 owait.wait.func = memcg_oom_wake_function; 2031
2184 owait.wait.private = current; 2032 current->memcg_oom.in_memcg_oom = 1;
2185 INIT_LIST_HEAD(&owait.wait.task_list);
2186 need_to_kill = true;
2187 mem_cgroup_mark_under_oom(memcg);
2188 2033
2189 /* At first, try to OOM lock hierarchy under memcg.*/
2190 spin_lock(&memcg_oom_lock);
2191 locked = mem_cgroup_oom_lock(memcg);
2192 /* 2034 /*
2193 * Even if signal_pending(), we can't quit charge() loop without 2035 * As with any blocking lock, a contender needs to start
2194 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 2036 * listening for wakeups before attempting the trylock,
2195 * under OOM is always welcomed, use TASK_KILLABLE here. 2037 * otherwise it can miss the wakeup from the unlock and sleep
2038 * indefinitely. This is just open-coded because our locking
2039 * is so particular to memcg hierarchies.
2196 */ 2040 */
2197 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2041 wakeups = atomic_read(&memcg->oom_wakeups);
2198 if (!locked || memcg->oom_kill_disable) 2042 mem_cgroup_mark_under_oom(memcg);
2199 need_to_kill = false; 2043
2044 locked = mem_cgroup_oom_trylock(memcg);
2045
2200 if (locked) 2046 if (locked)
2201 mem_cgroup_oom_notify(memcg); 2047 mem_cgroup_oom_notify(memcg);
2202 spin_unlock(&memcg_oom_lock);
2203 2048
2204 if (need_to_kill) { 2049 if (locked && !memcg->oom_kill_disable) {
2205 finish_wait(&memcg_oom_waitq, &owait.wait); 2050 mem_cgroup_unmark_under_oom(memcg);
2206 mem_cgroup_out_of_memory(memcg, mask, order); 2051 mem_cgroup_out_of_memory(memcg, mask, order);
2052 mem_cgroup_oom_unlock(memcg);
2053 /*
2054 * There is no guarantee that an OOM-lock contender
2055 * sees the wakeups triggered by the OOM kill
2056 * uncharges. Wake any sleepers explicitely.
2057 */
2058 memcg_oom_recover(memcg);
2207 } else { 2059 } else {
2208 schedule(); 2060 /*
2209 finish_wait(&memcg_oom_waitq, &owait.wait); 2061 * A system call can just return -ENOMEM, but if this
2062 * is a page fault and somebody else is handling the
2063 * OOM already, we need to sleep on the OOM waitqueue
2064 * for this memcg until the situation is resolved.
2065 * Which can take some time because it might be
2066 * handled by a userspace task.
2067 *
2068 * However, this is the charge context, which means
2069 * that we may sit on a large call stack and hold
2070 * various filesystem locks, the mmap_sem etc. and we
2071 * don't want the OOM handler to deadlock on them
2072 * while we sit here and wait. Store the current OOM
2073 * context in the task_struct, then return -ENOMEM.
2074 * At the end of the page fault handler, with the
2075 * stack unwound, pagefault_out_of_memory() will check
2076 * back with us by calling
2077 * mem_cgroup_oom_synchronize(), possibly putting the
2078 * task to sleep.
2079 */
2080 current->memcg_oom.oom_locked = locked;
2081 current->memcg_oom.wakeups = wakeups;
2082 css_get(&memcg->css);
2083 current->memcg_oom.wait_on_memcg = memcg;
2210 } 2084 }
2211 spin_lock(&memcg_oom_lock); 2085}
2212 if (locked)
2213 mem_cgroup_oom_unlock(memcg);
2214 memcg_wakeup_oom(memcg);
2215 spin_unlock(&memcg_oom_lock);
2216 2086
2217 mem_cgroup_unmark_under_oom(memcg); 2087/**
2088 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2089 *
2090 * This has to be called at the end of a page fault if the the memcg
2091 * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
2092 *
2093 * Memcg supports userspace OOM handling, so failed allocations must
2094 * sleep on a waitqueue until the userspace task resolves the
2095 * situation. Sleeping directly in the charge context with all kinds
2096 * of locks held is not a good idea, instead we remember an OOM state
2097 * in the task and mem_cgroup_oom_synchronize() has to be called at
2098 * the end of the page fault to put the task to sleep and clean up the
2099 * OOM state.
2100 *
2101 * Returns %true if an ongoing memcg OOM situation was detected and
2102 * finalized, %false otherwise.
2103 */
2104bool mem_cgroup_oom_synchronize(void)
2105{
2106 struct oom_wait_info owait;
2107 struct mem_cgroup *memcg;
2218 2108
2219 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2109 /* OOM is global, do not handle */
2110 if (!current->memcg_oom.in_memcg_oom)
2220 return false; 2111 return false;
2221 /* Give chance to dying process */ 2112
2222 schedule_timeout_uninterruptible(1); 2113 /*
2114 * We invoked the OOM killer but there is a chance that a kill
2115 * did not free up any charges. Everybody else might already
2116 * be sleeping, so restart the fault and keep the rampage
2117 * going until some charges are released.
2118 */
2119 memcg = current->memcg_oom.wait_on_memcg;
2120 if (!memcg)
2121 goto out;
2122
2123 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2124 goto out_memcg;
2125
2126 owait.memcg = memcg;
2127 owait.wait.flags = 0;
2128 owait.wait.func = memcg_oom_wake_function;
2129 owait.wait.private = current;
2130 INIT_LIST_HEAD(&owait.wait.task_list);
2131
2132 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2133 /* Only sleep if we didn't miss any wakeups since OOM */
2134 if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
2135 schedule();
2136 finish_wait(&memcg_oom_waitq, &owait.wait);
2137out_memcg:
2138 mem_cgroup_unmark_under_oom(memcg);
2139 if (current->memcg_oom.oom_locked) {
2140 mem_cgroup_oom_unlock(memcg);
2141 /*
2142 * There is no guarantee that an OOM-lock contender
2143 * sees the wakeups triggered by the OOM kill
2144 * uncharges. Wake any sleepers explicitely.
2145 */
2146 memcg_oom_recover(memcg);
2147 }
2148 css_put(&memcg->css);
2149 current->memcg_oom.wait_on_memcg = NULL;
2150out:
2151 current->memcg_oom.in_memcg_oom = 0;
2223 return true; 2152 return true;
2224} 2153}
2225 2154
@@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2288} 2217}
2289 2218
2290void mem_cgroup_update_page_stat(struct page *page, 2219void mem_cgroup_update_page_stat(struct page *page,
2291 enum mem_cgroup_page_stat_item idx, int val) 2220 enum mem_cgroup_stat_index idx, int val)
2292{ 2221{
2293 struct mem_cgroup *memcg; 2222 struct mem_cgroup *memcg;
2294 struct page_cgroup *pc = lookup_page_cgroup(page); 2223 struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page,
2297 if (mem_cgroup_disabled()) 2226 if (mem_cgroup_disabled())
2298 return; 2227 return;
2299 2228
2229 VM_BUG_ON(!rcu_read_lock_held());
2300 memcg = pc->mem_cgroup; 2230 memcg = pc->mem_cgroup;
2301 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2231 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2302 return; 2232 return;
2303 2233
2304 switch (idx) {
2305 case MEMCG_NR_FILE_MAPPED:
2306 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2307 break;
2308 default:
2309 BUG();
2310 }
2311
2312 this_cpu_add(memcg->stat->count[idx], val); 2234 this_cpu_add(memcg->stat->count[idx], val);
2313} 2235}
2314 2236
@@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2450 flush_work(&stock->work); 2372 flush_work(&stock->work);
2451 } 2373 }
2452out: 2374out:
2453 put_online_cpus(); 2375 put_online_cpus();
2454} 2376}
2455 2377
2456/* 2378/*
@@ -2532,12 +2454,11 @@ enum {
2532 CHARGE_RETRY, /* need to retry but retry is not bad */ 2454 CHARGE_RETRY, /* need to retry but retry is not bad */
2533 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2455 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2534 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2456 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
2535 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2536}; 2457};
2537 2458
2538static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2459static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2539 unsigned int nr_pages, unsigned int min_pages, 2460 unsigned int nr_pages, unsigned int min_pages,
2540 bool oom_check) 2461 bool invoke_oom)
2541{ 2462{
2542 unsigned long csize = nr_pages * PAGE_SIZE; 2463 unsigned long csize = nr_pages * PAGE_SIZE;
2543 struct mem_cgroup *mem_over_limit; 2464 struct mem_cgroup *mem_over_limit;
@@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2594 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2515 if (mem_cgroup_wait_acct_move(mem_over_limit))
2595 return CHARGE_RETRY; 2516 return CHARGE_RETRY;
2596 2517
2597 /* If we don't need to call oom-killer at el, return immediately */ 2518 if (invoke_oom)
2598 if (!oom_check) 2519 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2599 return CHARGE_NOMEM;
2600 /* check OOM */
2601 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2602 return CHARGE_OOM_DIE;
2603 2520
2604 return CHARGE_RETRY; 2521 return CHARGE_NOMEM;
2605} 2522}
2606 2523
2607/* 2524/*
@@ -2704,7 +2621,7 @@ again:
2704 } 2621 }
2705 2622
2706 do { 2623 do {
2707 bool oom_check; 2624 bool invoke_oom = oom && !nr_oom_retries;
2708 2625
2709 /* If killed, bypass charge */ 2626 /* If killed, bypass charge */
2710 if (fatal_signal_pending(current)) { 2627 if (fatal_signal_pending(current)) {
@@ -2712,14 +2629,8 @@ again:
2712 goto bypass; 2629 goto bypass;
2713 } 2630 }
2714 2631
2715 oom_check = false; 2632 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2716 if (oom && !nr_oom_retries) { 2633 nr_pages, invoke_oom);
2717 oom_check = true;
2718 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2719 }
2720
2721 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
2722 oom_check);
2723 switch (ret) { 2634 switch (ret) {
2724 case CHARGE_OK: 2635 case CHARGE_OK:
2725 break; 2636 break;
@@ -2732,16 +2643,12 @@ again:
2732 css_put(&memcg->css); 2643 css_put(&memcg->css);
2733 goto nomem; 2644 goto nomem;
2734 case CHARGE_NOMEM: /* OOM routine works */ 2645 case CHARGE_NOMEM: /* OOM routine works */
2735 if (!oom) { 2646 if (!oom || invoke_oom) {
2736 css_put(&memcg->css); 2647 css_put(&memcg->css);
2737 goto nomem; 2648 goto nomem;
2738 } 2649 }
2739 /* If oom, we never return -ENOMEM */
2740 nr_oom_retries--; 2650 nr_oom_retries--;
2741 break; 2651 break;
2742 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2743 css_put(&memcg->css);
2744 goto bypass;
2745 } 2652 }
2746 } while (ret != CHARGE_OK); 2653 } while (ret != CHARGE_OK);
2747 2654
@@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2882 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2789 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2883 * before USED bit, we need memory barrier here. 2790 * before USED bit, we need memory barrier here.
2884 * See mem_cgroup_add_lru_list(), etc. 2791 * See mem_cgroup_add_lru_list(), etc.
2885 */ 2792 */
2886 smp_wmb(); 2793 smp_wmb();
2887 SetPageCgroupUsed(pc); 2794 SetPageCgroupUsed(pc);
2888 2795
@@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2905 unlock_page_cgroup(pc); 2812 unlock_page_cgroup(pc);
2906 2813
2907 /* 2814 /*
2908 * "charge_statistics" updated event counter. Then, check it. 2815 * "charge_statistics" updated event counter.
2909 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2910 * if they exceeds softlimit.
2911 */ 2816 */
2912 memcg_check_events(memcg, page); 2817 memcg_check_events(memcg, page);
2913} 2818}
@@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3626 * the page allocator. Therefore, the following sequence when backed by 3531 * the page allocator. Therefore, the following sequence when backed by
3627 * the SLUB allocator: 3532 * the SLUB allocator:
3628 * 3533 *
3629 * memcg_stop_kmem_account(); 3534 * memcg_stop_kmem_account();
3630 * kmalloc(<large_number>) 3535 * kmalloc(<large_number>)
3631 * memcg_resume_kmem_account(); 3536 * memcg_resume_kmem_account();
3632 * 3537 *
3633 * would effectively ignore the fact that we should skip accounting, 3538 * would effectively ignore the fact that we should skip accounting,
3634 * since it will drive us directly to this function without passing 3539 * since it will drive us directly to this function without passing
@@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3750} 3655}
3751#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3656#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3752 3657
3658static inline
3659void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
3660 struct mem_cgroup *to,
3661 unsigned int nr_pages,
3662 enum mem_cgroup_stat_index idx)
3663{
3664 /* Update stat data for mem_cgroup */
3665 preempt_disable();
3666 WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
3667 __this_cpu_add(from->stat->count[idx], -nr_pages);
3668 __this_cpu_add(to->stat->count[idx], nr_pages);
3669 preempt_enable();
3670}
3671
3753/** 3672/**
3754 * mem_cgroup_move_account - move account of the page 3673 * mem_cgroup_move_account - move account of the page
3755 * @page: the page 3674 * @page: the page
@@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page,
3795 3714
3796 move_lock_mem_cgroup(from, &flags); 3715 move_lock_mem_cgroup(from, &flags);
3797 3716
3798 if (!anon && page_mapped(page)) { 3717 if (!anon && page_mapped(page))
3799 /* Update mapped_file data for mem_cgroup */ 3718 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3800 preempt_disable(); 3719 MEM_CGROUP_STAT_FILE_MAPPED);
3801 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3720
3802 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 3721 if (PageWriteback(page))
3803 preempt_enable(); 3722 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3804 } 3723 MEM_CGROUP_STAT_WRITEBACK);
3724
3805 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3725 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3806 3726
3807 /* caller should have done css_get */ 3727 /* caller should have done css_get */
@@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4657 MEM_CGROUP_RECLAIM_SHRINK); 4577 MEM_CGROUP_RECLAIM_SHRINK);
4658 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4578 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4659 /* Usage is reduced ? */ 4579 /* Usage is reduced ? */
4660 if (curusage >= oldusage) 4580 if (curusage >= oldusage)
4661 retry_count--; 4581 retry_count--;
4662 else 4582 else
4663 oldusage = curusage; 4583 oldusage = curusage;
@@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4678 int enlarge = 0; 4598 int enlarge = 0;
4679 4599
4680 /* see mem_cgroup_resize_res_limit */ 4600 /* see mem_cgroup_resize_res_limit */
4681 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4601 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4682 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4602 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4683 while (retry_count) { 4603 while (retry_count) {
4684 if (signal_pending(current)) { 4604 if (signal_pending(current)) {
@@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4727 return ret; 4647 return ret;
4728} 4648}
4729 4649
4730unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4731 gfp_t gfp_mask,
4732 unsigned long *total_scanned)
4733{
4734 unsigned long nr_reclaimed = 0;
4735 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4736 unsigned long reclaimed;
4737 int loop = 0;
4738 struct mem_cgroup_tree_per_zone *mctz;
4739 unsigned long long excess;
4740 unsigned long nr_scanned;
4741
4742 if (order > 0)
4743 return 0;
4744
4745 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4746 /*
4747	 * This loop can run a while, especially if mem_cgroups continuously
4748	 * keep exceeding their soft limit and putting the system under
4749	 * pressure.
4750 */
4751 do {
4752 if (next_mz)
4753 mz = next_mz;
4754 else
4755 mz = mem_cgroup_largest_soft_limit_node(mctz);
4756 if (!mz)
4757 break;
4758
4759 nr_scanned = 0;
4760 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4761 gfp_mask, &nr_scanned);
4762 nr_reclaimed += reclaimed;
4763 *total_scanned += nr_scanned;
4764 spin_lock(&mctz->lock);
4765
4766 /*
4767 * If we failed to reclaim anything from this memory cgroup
4768 * it is time to move on to the next cgroup
4769 */
4770 next_mz = NULL;
4771 if (!reclaimed) {
4772 do {
4773 /*
4774 * Loop until we find yet another one.
4775 *
4776 * By the time we get the soft_limit lock
4777	 * again, someone might have added the
4778	 * group back on the RB tree. Iterate to
4779	 * make sure we get a different memcg.
4780 * mem_cgroup_largest_soft_limit_node returns
4781 * NULL if no other cgroup is present on
4782 * the tree
4783 */
4784 next_mz =
4785 __mem_cgroup_largest_soft_limit_node(mctz);
4786 if (next_mz == mz)
4787 css_put(&next_mz->memcg->css);
4788 else /* next_mz == NULL or other memcg */
4789 break;
4790 } while (1);
4791 }
4792 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4793 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4794 /*
4795 * One school of thought says that we should not add
4796 * back the node to the tree if reclaim returns 0.
4797	 * But our reclaim could return 0 simply because, due
4798	 * to the priority, we are exposing a smaller subset of
4799	 * memory to reclaim from. Consider this a longer-term
4800	 * TODO.
4801 */
4802 /* If excess == 0, no tree ops */
4803 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4804 spin_unlock(&mctz->lock);
4805 css_put(&mz->memcg->css);
4806 loop++;
4807 /*
4808 * Could not reclaim anything and there are no more
4809 * mem cgroups to try or we seem to be looping without
4810 * reclaiming anything.
4811 */
4812 if (!nr_reclaimed &&
4813 (next_mz == NULL ||
4814 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4815 break;
4816 } while (!nr_reclaimed);
4817 if (next_mz)
4818 css_put(&next_mz->memcg->css);
4819 return nr_reclaimed;
4820}
4821
4822/** 4650/**
4823 * mem_cgroup_force_empty_list - clears LRU of a group 4651 * mem_cgroup_force_empty_list - clears LRU of a group
4824 * @memcg: group to clear 4652 * @memcg: group to clear
@@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4990 unsigned int event) 4818 unsigned int event)
4991{ 4819{
4992 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4820 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4993 int ret;
4994 4821
4995 if (mem_cgroup_is_root(memcg)) 4822 if (mem_cgroup_is_root(memcg))
4996 return -EINVAL; 4823 return -EINVAL;
4997 css_get(&memcg->css); 4824 return mem_cgroup_force_empty(memcg);
4998 ret = mem_cgroup_force_empty(memcg);
4999 css_put(&memcg->css);
5000
5001 return ret;
5002} 4825}
5003 4826
5004
5005static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 4827static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
5006 struct cftype *cft) 4828 struct cftype *cft)
5007{ 4829{
@@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5139 */ 4961 */
5140 mutex_lock(&memcg_create_mutex); 4962 mutex_lock(&memcg_create_mutex);
5141 mutex_lock(&set_limit_mutex); 4963 mutex_lock(&set_limit_mutex);
5142 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 4964 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
5143 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 4965 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
5144 ret = -EBUSY; 4966 ret = -EBUSY;
5145 goto out; 4967 goto out;
@@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5149 4971
5150 ret = memcg_update_cache_sizes(memcg); 4972 ret = memcg_update_cache_sizes(memcg);
5151 if (ret) { 4973 if (ret) {
5152 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 4974 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
5153 goto out; 4975 goto out;
5154 } 4976 }
5155 static_key_slow_inc(&memcg_kmem_enabled_key); 4977 static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6089 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5911 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6090 mz = &pn->zoneinfo[zone]; 5912 mz = &pn->zoneinfo[zone];
6091 lruvec_init(&mz->lruvec); 5913 lruvec_init(&mz->lruvec);
6092 mz->usage_in_excess = 0;
6093 mz->on_tree = false;
6094 mz->memcg = memcg; 5914 mz->memcg = memcg;
6095 } 5915 }
6096 memcg->nodeinfo[node] = pn; 5916 memcg->nodeinfo[node] = pn;
@@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
6146 int node; 5966 int node;
6147 size_t size = memcg_size(); 5967 size_t size = memcg_size();
6148 5968
6149 mem_cgroup_remove_from_trees(memcg);
6150 free_css_id(&mem_cgroup_subsys, &memcg->css); 5969 free_css_id(&mem_cgroup_subsys, &memcg->css);
6151 5970
6152 for_each_node(node) 5971 for_each_node(node)
@@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6183} 6002}
6184EXPORT_SYMBOL(parent_mem_cgroup); 6003EXPORT_SYMBOL(parent_mem_cgroup);
6185 6004
6186static void __init mem_cgroup_soft_limit_tree_init(void)
6187{
6188 struct mem_cgroup_tree_per_node *rtpn;
6189 struct mem_cgroup_tree_per_zone *rtpz;
6190 int tmp, node, zone;
6191
6192 for_each_node(node) {
6193 tmp = node;
6194 if (!node_state(node, N_NORMAL_MEMORY))
6195 tmp = -1;
6196 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6197 BUG_ON(!rtpn);
6198
6199 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6200
6201 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6202 rtpz = &rtpn->rb_tree_per_zone[zone];
6203 rtpz->rb_root = RB_ROOT;
6204 spin_lock_init(&rtpz->lock);
6205 }
6206 }
6207}
6208
6209static struct cgroup_subsys_state * __ref 6005static struct cgroup_subsys_state * __ref
6210mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6006mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6211{ 6007{
@@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6235 mutex_init(&memcg->thresholds_lock); 6031 mutex_init(&memcg->thresholds_lock);
6236 spin_lock_init(&memcg->move_lock); 6032 spin_lock_init(&memcg->move_lock);
6237 vmpressure_init(&memcg->vmpressure); 6033 vmpressure_init(&memcg->vmpressure);
6034 spin_lock_init(&memcg->soft_lock);
6238 6035
6239 return &memcg->css; 6036 return &memcg->css;
6240 6037
@@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6312 6109
6313 mem_cgroup_invalidate_reclaim_iterators(memcg); 6110 mem_cgroup_invalidate_reclaim_iterators(memcg);
6314 mem_cgroup_reparent_charges(memcg); 6111 mem_cgroup_reparent_charges(memcg);
6112 if (memcg->soft_contributed) {
6113 while ((memcg = parent_mem_cgroup(memcg)))
6114 atomic_dec(&memcg->children_in_excess);
6115
6116 if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
6117 atomic_dec(&root_mem_cgroup->children_in_excess);
6118 }
6315 mem_cgroup_destroy_all_caches(memcg); 6119 mem_cgroup_destroy_all_caches(memcg);
6316 vmpressure_cleanup(&memcg->vmpressure); 6120 vmpressure_cleanup(&memcg->vmpressure);
6317} 6121}
@@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void)
6986{ 6790{
6987 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 6791 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6988 enable_swap_cgroup(); 6792 enable_swap_cgroup();
6989 mem_cgroup_soft_limit_tree_init();
6990 memcg_stock_init(); 6793 memcg_stock_init();
6991 return 0; 6794 return 0;
6992} 6795}
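With mem_cgroup_soft_limit_reclaim() and the per-zone soft-limit RB-tree gone, soft-limit pressure is tracked through the children_in_excess counters that css_offline unwinds above. Deciding whether a subtree is worth visiting then reduces to a counter test; a rough sketch of such a predicate, assuming the soft_contributed/children_in_excess fields added by this series (the helper name is illustrative, not from the patch):

/* Illustrative helper: does this memcg or any descendant exceed its soft limit? */
static bool memcg_tree_in_soft_excess(struct mem_cgroup *memcg)
{
	return res_counter_soft_limit_excess(&memcg->res) > 0 ||
	       atomic_read(&memcg->children_in_excess) > 0;
}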
diff --git a/mm/memory.c b/mm/memory.c
index 2b73dbde2274..ca0003947115 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3695 * but allow concurrent faults), and pte mapped but not yet locked. 3695 * but allow concurrent faults), and pte mapped but not yet locked.
3696 * We return with mmap_sem still held, but pte unmapped and unlocked. 3696 * We return with mmap_sem still held, but pte unmapped and unlocked.
3697 */ 3697 */
3698int handle_pte_fault(struct mm_struct *mm, 3698static int handle_pte_fault(struct mm_struct *mm,
3699 struct vm_area_struct *vma, unsigned long address, 3699 struct vm_area_struct *vma, unsigned long address,
3700 pte_t *pte, pmd_t *pmd, unsigned int flags) 3700 pte_t *pte, pmd_t *pmd, unsigned int flags)
3701{ 3701{
@@ -3754,22 +3754,14 @@ unlock:
3754/* 3754/*
3755 * By the time we get here, we already hold the mm semaphore 3755 * By the time we get here, we already hold the mm semaphore
3756 */ 3756 */
3757int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3757static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3758 unsigned long address, unsigned int flags) 3758 unsigned long address, unsigned int flags)
3759{ 3759{
3760 pgd_t *pgd; 3760 pgd_t *pgd;
3761 pud_t *pud; 3761 pud_t *pud;
3762 pmd_t *pmd; 3762 pmd_t *pmd;
3763 pte_t *pte; 3763 pte_t *pte;
3764 3764
3765 __set_current_state(TASK_RUNNING);
3766
3767 count_vm_event(PGFAULT);
3768 mem_cgroup_count_vm_event(mm, PGFAULT);
3769
3770 /* do counter updates before entering really critical section. */
3771 check_sync_rss_stat(current);
3772
3773 if (unlikely(is_vm_hugetlb_page(vma))) 3765 if (unlikely(is_vm_hugetlb_page(vma)))
3774 return hugetlb_fault(mm, vma, address, flags); 3766 return hugetlb_fault(mm, vma, address, flags);
3775 3767
@@ -3782,9 +3774,12 @@ retry:
3782 if (!pmd) 3774 if (!pmd)
3783 return VM_FAULT_OOM; 3775 return VM_FAULT_OOM;
3784 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3776 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3777 int ret = VM_FAULT_FALLBACK;
3785 if (!vma->vm_ops) 3778 if (!vma->vm_ops)
3786 return do_huge_pmd_anonymous_page(mm, vma, address, 3779 ret = do_huge_pmd_anonymous_page(mm, vma, address,
3787 pmd, flags); 3780 pmd, flags);
3781 if (!(ret & VM_FAULT_FALLBACK))
3782 return ret;
3788 } else { 3783 } else {
3789 pmd_t orig_pmd = *pmd; 3784 pmd_t orig_pmd = *pmd;
3790 int ret; 3785 int ret;
@@ -3850,6 +3845,37 @@ retry:
3850 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3845 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3851} 3846}
3852 3847
3848int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3849 unsigned long address, unsigned int flags)
3850{
3851 int ret;
3852
3853 __set_current_state(TASK_RUNNING);
3854
3855 count_vm_event(PGFAULT);
3856 mem_cgroup_count_vm_event(mm, PGFAULT);
3857
3858 /* do counter updates before entering really critical section. */
3859 check_sync_rss_stat(current);
3860
3861 /*
3862 * Enable the memcg OOM handling for faults triggered in user
3863 * space. Kernel faults are handled more gracefully.
3864 */
3865 if (flags & FAULT_FLAG_USER)
3866 mem_cgroup_enable_oom();
3867
3868 ret = __handle_mm_fault(mm, vma, address, flags);
3869
3870 if (flags & FAULT_FLAG_USER)
3871 mem_cgroup_disable_oom();
3872
3873 if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
3874 mem_cgroup_oom_synchronize();
3875
3876 return ret;
3877}
3878
3853#ifndef __PAGETABLE_PUD_FOLDED 3879#ifndef __PAGETABLE_PUD_FOLDED
3854/* 3880/*
3855 * Allocate page upper directory. 3881 * Allocate page upper directory.
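handle_mm_fault() is now a thin wrapper that arms the memcg OOM handling only for user-space faults, keyed off FAULT_FLAG_USER. Arch page-fault handlers opt in simply by setting that flag; a minimal caller-side sketch, assuming the FAULT_FLAG_USER flag introduced earlier in this series (the surrounding arch code is omitted and illustrative):

	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;	/* let memcg handle OOM for this fault */

	fault = handle_mm_fault(mm, vma, address, flags);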
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98e75f2ac7bc..314e9d274381 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -678,9 +678,12 @@ out:
678 */ 678 */
679void pagefault_out_of_memory(void) 679void pagefault_out_of_memory(void)
680{ 680{
681 struct zonelist *zonelist = node_zonelist(first_online_node, 681 struct zonelist *zonelist;
682 GFP_KERNEL);
683 682
683 if (mem_cgroup_oom_synchronize())
684 return;
685
686 zonelist = node_zonelist(first_online_node, GFP_KERNEL);
684 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { 687 if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
685 out_of_memory(NULL, 0, 0, NULL, false); 688 out_of_memory(NULL, 0, 0, NULL, false);
686 clear_zonelist_oom(zonelist, GFP_KERNEL); 689 clear_zonelist_oom(zonelist, GFP_KERNEL);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6c7b0187be8e..f5236f804aa6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied);
2143 2143
2144/* 2144/*
2145 * Helper function for set_page_writeback family. 2145 * Helper function for set_page_writeback family.
2146 *
2147 * The caller must hold mem_cgroup_begin/end_update_page_stat() lock
2148 * while calling this function.
2149 * See test_set_page_writeback for example.
2150 *
2146 * NOTE: Unlike account_page_dirtied this does not rely on being atomic 2151 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
2147 * wrt interrupts. 2152 * wrt interrupts.
2148 */ 2153 */
2149void account_page_writeback(struct page *page) 2154void account_page_writeback(struct page *page)
2150{ 2155{
2156 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2151 inc_zone_page_state(page, NR_WRITEBACK); 2157 inc_zone_page_state(page, NR_WRITEBACK);
2152} 2158}
2153EXPORT_SYMBOL(account_page_writeback); 2159EXPORT_SYMBOL(account_page_writeback);
@@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page)
2364{ 2370{
2365 struct address_space *mapping = page_mapping(page); 2371 struct address_space *mapping = page_mapping(page);
2366 int ret; 2372 int ret;
2373 bool locked;
2374 unsigned long memcg_flags;
2367 2375
2376 mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
2368 if (mapping) { 2377 if (mapping) {
2369 struct backing_dev_info *bdi = mapping->backing_dev_info; 2378 struct backing_dev_info *bdi = mapping->backing_dev_info;
2370 unsigned long flags; 2379 unsigned long flags;
@@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page)
2385 ret = TestClearPageWriteback(page); 2394 ret = TestClearPageWriteback(page);
2386 } 2395 }
2387 if (ret) { 2396 if (ret) {
2397 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
2388 dec_zone_page_state(page, NR_WRITEBACK); 2398 dec_zone_page_state(page, NR_WRITEBACK);
2389 inc_zone_page_state(page, NR_WRITTEN); 2399 inc_zone_page_state(page, NR_WRITTEN);
2390 } 2400 }
2401 mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
2391 return ret; 2402 return ret;
2392} 2403}
2393 2404
@@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page)
2395{ 2406{
2396 struct address_space *mapping = page_mapping(page); 2407 struct address_space *mapping = page_mapping(page);
2397 int ret; 2408 int ret;
2409 bool locked;
2410 unsigned long memcg_flags;
2398 2411
2412 mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
2399 if (mapping) { 2413 if (mapping) {
2400 struct backing_dev_info *bdi = mapping->backing_dev_info; 2414 struct backing_dev_info *bdi = mapping->backing_dev_info;
2401 unsigned long flags; 2415 unsigned long flags;
@@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page)
2422 } 2436 }
2423 if (!ret) 2437 if (!ret)
2424 account_page_writeback(page); 2438 account_page_writeback(page);
2439 mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
2425 return ret; 2440 return ret;
2426 2441
2427} 2442}
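account_page_writeback() now bumps a memcg counter as well, which is why both test_set_page_writeback() and test_clear_page_writeback() bracket their work with mem_cgroup_begin/end_update_page_stat(). The same protocol applies to any other site that updates a memcg page-stat counter; a simplified sketch of the pattern, stripped of the radix-tree and bdi bookkeeping shown in the hunks above:

	bool locked;
	unsigned long memcg_flags;

	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
	if (!TestSetPageWriteback(page))
		account_page_writeback(page);	/* NR_WRITEBACK + MEM_CGROUP_STAT_WRITEBACK */
	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);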
diff --git a/mm/rmap.c b/mm/rmap.c
index 07748e68b729..fd3ee7a54a13 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page,
1052{ 1052{
1053 int first = atomic_inc_and_test(&page->_mapcount); 1053 int first = atomic_inc_and_test(&page->_mapcount);
1054 if (first) { 1054 if (first) {
1055 if (!PageTransHuge(page)) 1055 if (PageTransHuge(page))
1056 __inc_zone_page_state(page, NR_ANON_PAGES);
1057 else
1058 __inc_zone_page_state(page, 1056 __inc_zone_page_state(page,
1059 NR_ANON_TRANSPARENT_HUGEPAGES); 1057 NR_ANON_TRANSPARENT_HUGEPAGES);
1058 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1059 hpage_nr_pages(page));
1060 } 1060 }
1061 if (unlikely(PageKsm(page))) 1061 if (unlikely(PageKsm(page)))
1062 return; 1062 return;
@@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page,
1085 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1085 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1086 SetPageSwapBacked(page); 1086 SetPageSwapBacked(page);
1087 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1087 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
1088 if (!PageTransHuge(page)) 1088 if (PageTransHuge(page))
1089 __inc_zone_page_state(page, NR_ANON_PAGES);
1090 else
1091 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1089 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1090 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1091 hpage_nr_pages(page));
1092 __page_set_anon_rmap(page, vma, address, 1); 1092 __page_set_anon_rmap(page, vma, address, 1);
1093 if (!mlocked_vma_newpage(vma, page)) { 1093 if (!mlocked_vma_newpage(vma, page)) {
1094 SetPageActive(page); 1094 SetPageActive(page);
@@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page)
1111 mem_cgroup_begin_update_page_stat(page, &locked, &flags); 1111 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1112 if (atomic_inc_and_test(&page->_mapcount)) { 1112 if (atomic_inc_and_test(&page->_mapcount)) {
1113 __inc_zone_page_state(page, NR_FILE_MAPPED); 1113 __inc_zone_page_state(page, NR_FILE_MAPPED);
1114 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1114 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1115 } 1115 }
1116 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1116 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1117} 1117}
@@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page)
1148 goto out; 1148 goto out;
1149 if (anon) { 1149 if (anon) {
1150 mem_cgroup_uncharge_page(page); 1150 mem_cgroup_uncharge_page(page);
1151 if (!PageTransHuge(page)) 1151 if (PageTransHuge(page))
1152 __dec_zone_page_state(page, NR_ANON_PAGES);
1153 else
1154 __dec_zone_page_state(page, 1152 __dec_zone_page_state(page,
1155 NR_ANON_TRANSPARENT_HUGEPAGES); 1153 NR_ANON_TRANSPARENT_HUGEPAGES);
1154 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
1155 -hpage_nr_pages(page));
1156 } else { 1156 } else {
1157 __dec_zone_page_state(page, NR_FILE_MAPPED); 1157 __dec_zone_page_state(page, NR_FILE_MAPPED);
1158 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1158 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1159 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1159 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1160 } 1160 }
1161 if (unlikely(PageMlocked(page))) 1161 if (unlikely(PageMlocked(page)))
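Anonymous pages are now accounted in NR_ANON_PAGES in units of base pages for THP as well, via hpage_nr_pages(). For reference, that helper roughly reduces to the following (simplified from include/linux/huge_mm.h of this era):

static inline int hpage_nr_pages(struct page *page)
{
	if (unlikely(PageTransHuge(page)))
		return HPAGE_PMD_NR;	/* e.g. 512 base pages for a 2MB THP on x86-64 */
	return 1;
}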
diff --git a/mm/swap.c b/mm/swap.c
index c899502d3e36..759c3caf44bd 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -432,6 +432,11 @@ static void activate_page_drain(int cpu)
432 pagevec_lru_move_fn(pvec, __activate_page, NULL); 432 pagevec_lru_move_fn(pvec, __activate_page, NULL);
433} 433}
434 434
435static bool need_activate_page_drain(int cpu)
436{
437 return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
438}
439
435void activate_page(struct page *page) 440void activate_page(struct page *page)
436{ 441{
437 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 442 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu)
449{ 454{
450} 455}
451 456
457static bool need_activate_page_drain(int cpu)
458{
459 return false;
460}
461
452void activate_page(struct page *page) 462void activate_page(struct page *page)
453{ 463{
454 struct zone *zone = page_zone(page); 464 struct zone *zone = page_zone(page);
@@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
701 lru_add_drain(); 711 lru_add_drain();
702} 712}
703 713
704/* 714static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
705 * Returns 0 for success 715
706 */ 716void lru_add_drain_all(void)
707int lru_add_drain_all(void)
708{ 717{
709 return schedule_on_each_cpu(lru_add_drain_per_cpu); 718 static DEFINE_MUTEX(lock);
719 static struct cpumask has_work;
720 int cpu;
721
722 mutex_lock(&lock);
723 get_online_cpus();
724 cpumask_clear(&has_work);
725
726 for_each_online_cpu(cpu) {
727 struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
728
729 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
730 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
731 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
732 need_activate_page_drain(cpu)) {
733 INIT_WORK(work, lru_add_drain_per_cpu);
734 schedule_work_on(cpu, work);
735 cpumask_set_cpu(cpu, &has_work);
736 }
737 }
738
739 for_each_cpu(cpu, &has_work)
740 flush_work(&per_cpu(lru_add_drain_work, cpu));
741
742 put_online_cpus();
743 mutex_unlock(&lock);
710} 744}
711 745
712/* 746/*
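lru_add_drain_all() now schedules the drain work only on CPUs that actually have pending pagevecs and can no longer fail, so it returns void. Call sites that used to propagate an error simplify accordingly; an illustrative caller-side sketch (not a hunk from this series):

	/* Before: err = lru_add_drain_all(); if (err) return err; */
	lru_add_drain_all();	/* drains only CPUs with pending LRU pagevecs */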
diff --git a/mm/truncate.c b/mm/truncate.c
index e2e8a8a7eb9d..353b683afd6e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
567/** 567/**
568 * truncate_pagecache - unmap and remove pagecache that has been truncated 568 * truncate_pagecache - unmap and remove pagecache that has been truncated
569 * @inode: inode 569 * @inode: inode
570 * @oldsize: old file size
571 * @newsize: new file size 570 * @newsize: new file size
572 * 571 *
573 * inode's new i_size must already be written before truncate_pagecache 572 * inode's new i_size must already be written before truncate_pagecache
@@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
580 * situations such as writepage being called for a page that has already 579 * situations such as writepage being called for a page that has already
581 * had its underlying blocks deallocated. 580 * had its underlying blocks deallocated.
582 */ 581 */
583void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) 582void truncate_pagecache(struct inode *inode, loff_t newsize)
584{ 583{
585 struct address_space *mapping = inode->i_mapping; 584 struct address_space *mapping = inode->i_mapping;
586 loff_t holebegin = round_up(newsize, PAGE_SIZE); 585 loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache);
614 */ 613 */
615void truncate_setsize(struct inode *inode, loff_t newsize) 614void truncate_setsize(struct inode *inode, loff_t newsize)
616{ 615{
617 loff_t oldsize;
618
619 oldsize = inode->i_size;
620 i_size_write(inode, newsize); 616 i_size_write(inode, newsize);
621 617 truncate_pagecache(inode, newsize);
622 truncate_pagecache(inode, oldsize, newsize);
623} 618}
624EXPORT_SYMBOL(truncate_setsize); 619EXPORT_SYMBOL(truncate_setsize);
625 620
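truncate_pagecache() loses its unused oldsize argument, and truncate_setsize() shrinks to i_size_write() plus the pagecache truncation. Filesystems that call these directly just drop the old size; an illustrative setattr-path sketch (the surrounding filesystem code is assumed, not taken from this series):

	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size != i_size_read(inode)) {
		truncate_setsize(inode, attr->ia_size);
		/* equivalent to i_size_write() followed by
		 * truncate_pagecache(inode, attr->ia_size) */
	}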
diff --git a/mm/vmscan.c b/mm/vmscan.c
index beb35778c69f..8ed1b775bdc9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc)
139{ 139{
140 return !sc->target_mem_cgroup; 140 return !sc->target_mem_cgroup;
141} 141}
142
143static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
144{
145 struct mem_cgroup *root = sc->target_mem_cgroup;
146 return !mem_cgroup_disabled() &&
147 mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
148}
142#else 149#else
143static bool global_reclaim(struct scan_control *sc) 150static bool global_reclaim(struct scan_control *sc)
144{ 151{
145 return true; 152 return true;
146} 153}
154
155static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
156{
157 return false;
158}
147#endif 159#endif
148 160
149unsigned long zone_reclaimable_pages(struct zone *zone) 161unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone,
2164 } 2176 }
2165} 2177}
2166 2178
2167static void shrink_zone(struct zone *zone, struct scan_control *sc) 2179static int
2180__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2168{ 2181{
2169 unsigned long nr_reclaimed, nr_scanned; 2182 unsigned long nr_reclaimed, nr_scanned;
2183 int groups_scanned = 0;
2170 2184
2171 do { 2185 do {
2172 struct mem_cgroup *root = sc->target_mem_cgroup; 2186 struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2174 .zone = zone, 2188 .zone = zone,
2175 .priority = sc->priority, 2189 .priority = sc->priority,
2176 }; 2190 };
2177 struct mem_cgroup *memcg; 2191 struct mem_cgroup *memcg = NULL;
2192 mem_cgroup_iter_filter filter = (soft_reclaim) ?
2193 mem_cgroup_soft_reclaim_eligible : NULL;
2178 2194
2179 nr_reclaimed = sc->nr_reclaimed; 2195 nr_reclaimed = sc->nr_reclaimed;
2180 nr_scanned = sc->nr_scanned; 2196 nr_scanned = sc->nr_scanned;
2181 2197
2182 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2198 while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) {
2183 do {
2184 struct lruvec *lruvec; 2199 struct lruvec *lruvec;
2185 2200
2201 groups_scanned++;
2186 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2202 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2187 2203
2188 shrink_lruvec(lruvec, sc); 2204 shrink_lruvec(lruvec, sc);
@@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2202 mem_cgroup_iter_break(root, memcg); 2218 mem_cgroup_iter_break(root, memcg);
2203 break; 2219 break;
2204 } 2220 }
2205 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2221 }
2206 } while (memcg);
2207 2222
2208 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2223 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2209 sc->nr_scanned - nr_scanned, 2224 sc->nr_scanned - nr_scanned,
@@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
2211 2226
2212 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2227 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2213 sc->nr_scanned - nr_scanned, sc)); 2228 sc->nr_scanned - nr_scanned, sc));
2229
2230 return groups_scanned;
2231}
2232
2233
2234static void shrink_zone(struct zone *zone, struct scan_control *sc)
2235{
2236 bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
2237 unsigned long nr_scanned = sc->nr_scanned;
2238 int scanned_groups;
2239
2240 scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
2241 /*
2243	 * The memcg iterator might race with another reclaimer or start
2244	 * from an incomplete tree walk, so the walk in __shrink_zone might
2245	 * have missed groups that are above the soft limit. Try another
2246	 * loop to catch up with others. Do it just once to avoid extra
2247	 * reclaim latency when other reclaimers keep preempting this
2248	 * one.
2248 */
2249 if (do_soft_reclaim && !scanned_groups)
2250 __shrink_zone(zone, sc, do_soft_reclaim);
2251
2252 /*
2253 * No group is over the soft limit or those that are do not have
2254 * pages in the zone we are reclaiming so we have to reclaim everybody
2255 */
2256 if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
2257 __shrink_zone(zone, sc, false);
2258 return;
2259 }
2214} 2260}
2215 2261
2216/* Returns true if compaction should go ahead for a high-order request */ 2262/* Returns true if compaction should go ahead for a high-order request */
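The filter passed to mem_cgroup_iter_cond() above is a callback of type mem_cgroup_iter_filter that receives the candidate group and the walk root and tells the iterator whether to visit the group or prune its subtree. A rough sketch of how another caller might drive the same walk; the NULL reclaim cookie, the VISIT-style return handling, and the exact callback semantics are assumptions based on the SKIP_TREE usage shown above, not code from this series:

	struct mem_cgroup *memcg = NULL;

	/* Visit only the groups the soft-limit filter does not prune. */
	while ((memcg = mem_cgroup_iter_cond(root, memcg, NULL,
					     mem_cgroup_soft_reclaim_eligible))) {
		/* per-group work would go here */
	}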
@@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2274{ 2320{
2275 struct zoneref *z; 2321 struct zoneref *z;
2276 struct zone *zone; 2322 struct zone *zone;
2277 unsigned long nr_soft_reclaimed;
2278 unsigned long nr_soft_scanned;
2279 bool aborted_reclaim = false; 2323 bool aborted_reclaim = false;
2280 2324
2281 /* 2325 /*
@@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2315 continue; 2359 continue;
2316 } 2360 }
2317 } 2361 }
2318 /*
2319 * This steals pages from memory cgroups over softlimit
2320 * and returns the number of reclaimed pages and
2321 * scanned pages. This works for global memory pressure
2322 * and balancing, not for a memcg's limit.
2323 */
2324 nr_soft_scanned = 0;
2325 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2326 sc->order, sc->gfp_mask,
2327 &nr_soft_scanned);
2328 sc->nr_reclaimed += nr_soft_reclaimed;
2329 sc->nr_scanned += nr_soft_scanned;
 2330 /* need some check to avoid more shrink_zone() calls */ 2362 /* need some check to avoid more shrink_zone() calls */
2331 } 2363 }
2332 2364
@@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2920{ 2952{
2921 int i; 2953 int i;
2922 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2954 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2923 unsigned long nr_soft_reclaimed;
2924 unsigned long nr_soft_scanned;
2925 struct scan_control sc = { 2955 struct scan_control sc = {
2926 .gfp_mask = GFP_KERNEL, 2956 .gfp_mask = GFP_KERNEL,
2927 .priority = DEF_PRIORITY, 2957 .priority = DEF_PRIORITY,
@@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3036 3066
3037 sc.nr_scanned = 0; 3067 sc.nr_scanned = 0;
3038 3068
3039 nr_soft_scanned = 0;
3040 /*
3041 * Call soft limit reclaim before calling shrink_zone.
3042 */
3043 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3044 order, sc.gfp_mask,
3045 &nr_soft_scanned);
3046 sc.nr_reclaimed += nr_soft_reclaimed;
3047
3048 /* 3069 /*
3049 * There should be no need to raise the scanning 3070 * There should be no need to raise the scanning
3050 * priority if enough pages are already being scanned 3071 * priority if enough pages are already being scanned