path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      |   3
-rw-r--r--  mm/hugetlb.c          | 104
-rw-r--r--  mm/hwpoison-inject.c  |  15
-rw-r--r--  mm/memcontrol.c       | 407
-rw-r--r--  mm/memory-failure.c   | 120
-rw-r--r--  mm/memory.c           |  25
-rw-r--r--  mm/oom_kill.c         |   2
-rw-r--r--  mm/page-writeback.c   | 185
-rw-r--r--  mm/rmap.c             |  59
-rw-r--r--  mm/vmalloc.c          |   4
-rw-r--r--  mm/vmscan.c           |  15
11 files changed, 630 insertions, 309 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 08d357522e78..eaa4a5bbe063 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -81,7 +81,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
81 nr_more_io++; 81 nr_more_io++;
82 spin_unlock(&inode_lock); 82 spin_unlock(&inode_lock);
83 83
84 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 84 global_dirty_limits(&background_thresh, &dirty_thresh);
85 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
85 86
86#define K(x) ((x) << (PAGE_SHIFT - 10)) 87#define K(x) ((x) << (PAGE_SHIFT - 10))
87 seq_printf(m, 88 seq_printf(m,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b61d2db9f34e..cc5be788a39f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
21 24
22#include <asm/page.h> 25#include <asm/page.h>
23#include <asm/pgtable.h> 26#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
220 (vma->vm_pgoff >> huge_page_order(h)); 223 (vma->vm_pgoff >> huge_page_order(h));
221} 224}
222 225
226pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227 unsigned long address)
228{
229 return vma_hugecache_offset(hstate_vma(vma), vma, address);
230}
231
223/* 232/*
224 * Return the size of the pages allocated when backing a VMA. In the majority 233 * Return the size of the pages allocated when backing a VMA. In the majority
225 * cases this will be same size as used by the page table entries. 234 * cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
552 set_page_private(page, 0); 561 set_page_private(page, 0);
553 page->mapping = NULL; 562 page->mapping = NULL;
554 BUG_ON(page_count(page)); 563 BUG_ON(page_count(page));
564 BUG_ON(page_mapcount(page));
555 INIT_LIST_HEAD(&page->lru); 565 INIT_LIST_HEAD(&page->lru);
556 566
557 spin_lock(&hugetlb_lock); 567 spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
605 return dtor == free_huge_page; 615 return dtor == free_huge_page;
606} 616}
607 617
618EXPORT_SYMBOL_GPL(PageHuge);
619
608static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 620static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
609{ 621{
610 struct page *page; 622 struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2129 entry = huge_ptep_get(src_pte); 2141 entry = huge_ptep_get(src_pte);
2130 ptepage = pte_page(entry); 2142 ptepage = pte_page(entry);
2131 get_page(ptepage); 2143 get_page(ptepage);
2144 page_dup_rmap(ptepage);
2132 set_huge_pte_at(dst, addr, dst_pte, entry); 2145 set_huge_pte_at(dst, addr, dst_pte, entry);
2133 } 2146 }
2134 spin_unlock(&src->page_table_lock); 2147 spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
2140 return -ENOMEM; 2153 return -ENOMEM;
2141} 2154}
2142 2155
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{
2158 swp_entry_t swp;
2159
2160 if (huge_pte_none(pte) || pte_present(pte))
2161 return 0;
2162 swp = pte_to_swp_entry(pte);
2163 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2164 return 1;
2165 } else
2166 return 0;
2167}
2168
2143void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2169void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2144 unsigned long end, struct page *ref_page) 2170 unsigned long end, struct page *ref_page)
2145{ 2171{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2198 if (huge_pte_none(pte)) 2224 if (huge_pte_none(pte))
2199 continue; 2225 continue;
2200 2226
2227 /*
2228 * HWPoisoned hugepage is already unmapped and dropped reference
2229 */
2230 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2231 continue;
2232
2201 page = pte_page(pte); 2233 page = pte_page(pte);
2202 if (pte_dirty(pte)) 2234 if (pte_dirty(pte))
2203 set_page_dirty(page); 2235 set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2207 flush_tlb_range(vma, start, end); 2239 flush_tlb_range(vma, start, end);
2208 mmu_notifier_invalidate_range_end(mm, start, end); 2240 mmu_notifier_invalidate_range_end(mm, start, end);
2209 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2241 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2242 page_remove_rmap(page);
2210 list_del(&page->lru); 2243 list_del(&page->lru);
2211 put_page(page); 2244 put_page(page);
2212 } 2245 }
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2272 return 1; 2305 return 1;
2273} 2306}
2274 2307
2308/*
2309 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2310 */
2275static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2311static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2276 unsigned long address, pte_t *ptep, pte_t pte, 2312 unsigned long address, pte_t *ptep, pte_t pte,
2277 struct page *pagecache_page) 2313 struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2286retry_avoidcopy: 2322retry_avoidcopy:
2287 /* If no-one else is actually using this page, avoid the copy 2323 /* If no-one else is actually using this page, avoid the copy
2288 * and just make the page writable */ 2324 * and just make the page writable */
2289 avoidcopy = (page_count(old_page) == 1); 2325 avoidcopy = (page_mapcount(old_page) == 1);
2290 if (avoidcopy) { 2326 if (avoidcopy) {
2327 if (!trylock_page(old_page)) {
2328 if (PageAnon(old_page))
2329 page_move_anon_rmap(old_page, vma, address);
2330 } else
2331 unlock_page(old_page);
2291 set_huge_ptep_writable(vma, address, ptep); 2332 set_huge_ptep_writable(vma, address, ptep);
2292 return 0; 2333 return 0;
2293 } 2334 }
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
2338 return -PTR_ERR(new_page); 2379 return -PTR_ERR(new_page);
2339 } 2380 }
2340 2381
2382 /*
2383 * When the original hugepage is shared one, it does not have
2384 * anon_vma prepared.
2385 */
2386 if (unlikely(anon_vma_prepare(vma)))
2387 return VM_FAULT_OOM;
2388
2341 copy_huge_page(new_page, old_page, address, vma); 2389 copy_huge_page(new_page, old_page, address, vma);
2342 __SetPageUptodate(new_page); 2390 __SetPageUptodate(new_page);
2343 2391
@@ -2355,6 +2403,8 @@ retry_avoidcopy:
2355 huge_ptep_clear_flush(vma, address, ptep); 2403 huge_ptep_clear_flush(vma, address, ptep);
2356 set_huge_pte_at(mm, address, ptep, 2404 set_huge_pte_at(mm, address, ptep,
2357 make_huge_pte(vma, new_page, 1)); 2405 make_huge_pte(vma, new_page, 1));
2406 page_remove_rmap(old_page);
2407 hugepage_add_anon_rmap(new_page, vma, address);
2358 /* Make the old page be freed below */ 2408 /* Make the old page be freed below */
2359 new_page = old_page; 2409 new_page = old_page;
2360 mmu_notifier_invalidate_range_end(mm, 2410 mmu_notifier_invalidate_range_end(mm,
@@ -2458,10 +2508,29 @@ retry:
2458 spin_lock(&inode->i_lock); 2508 spin_lock(&inode->i_lock);
2459 inode->i_blocks += blocks_per_huge_page(h); 2509 inode->i_blocks += blocks_per_huge_page(h);
2460 spin_unlock(&inode->i_lock); 2510 spin_unlock(&inode->i_lock);
2511 page_dup_rmap(page);
2461 } else { 2512 } else {
2462 lock_page(page); 2513 lock_page(page);
2463 page->mapping = HUGETLB_POISON; 2514 if (unlikely(anon_vma_prepare(vma))) {
2515 ret = VM_FAULT_OOM;
2516 goto backout_unlocked;
2517 }
2518 hugepage_add_new_anon_rmap(page, vma, address);
2464 } 2519 }
2520 } else {
2521 page_dup_rmap(page);
2522 }
2523
2524 /*
2525 * Since memory error handler replaces pte into hwpoison swap entry
2526 * at the time of error handling, a process which reserved but not have
2527 * the mapping to the error hugepage does not have hwpoison swap entry.
2528 * So we need to block accesses from such a process by checking
2529 * PG_hwpoison bit here.
2530 */
2531 if (unlikely(PageHWPoison(page))) {
2532 ret = VM_FAULT_HWPOISON;
2533 goto backout_unlocked;
2465 } 2534 }
2466 2535
2467 /* 2536 /*
@@ -2513,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2513 pte_t *ptep; 2582 pte_t *ptep;
2514 pte_t entry; 2583 pte_t entry;
2515 int ret; 2584 int ret;
2585 struct page *page = NULL;
2516 struct page *pagecache_page = NULL; 2586 struct page *pagecache_page = NULL;
2517 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2587 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2518 struct hstate *h = hstate_vma(vma); 2588 struct hstate *h = hstate_vma(vma);
2519 2589
2590 ptep = huge_pte_offset(mm, address);
2591 if (ptep) {
2592 entry = huge_ptep_get(ptep);
2593 if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2594 return VM_FAULT_HWPOISON;
2595 }
2596
2520 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2597 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2521 if (!ptep) 2598 if (!ptep)
2522 return VM_FAULT_OOM; 2599 return VM_FAULT_OOM;
@@ -2554,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2554 vma, address); 2631 vma, address);
2555 } 2632 }
2556 2633
2634 if (!pagecache_page) {
2635 page = pte_page(entry);
2636 lock_page(page);
2637 }
2638
2557 spin_lock(&mm->page_table_lock); 2639 spin_lock(&mm->page_table_lock);
2558 /* Check for a racing update before calling hugetlb_cow */ 2640 /* Check for a racing update before calling hugetlb_cow */
2559 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2641 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2579,6 +2661,8 @@ out_page_table_lock:
2579 if (pagecache_page) { 2661 if (pagecache_page) {
2580 unlock_page(pagecache_page); 2662 unlock_page(pagecache_page);
2581 put_page(pagecache_page); 2663 put_page(pagecache_page);
2664 } else {
2665 unlock_page(page);
2582 } 2666 }
2583 2667
2584out_mutex: 2668out_mutex:
@@ -2791,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2791 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2875 hugetlb_put_quota(inode->i_mapping, (chg - freed));
2792 hugetlb_acct_memory(h, -(chg - freed)); 2876 hugetlb_acct_memory(h, -(chg - freed));
2793} 2877}
2878
2879/*
2880 * This function is called from memory failure code.
2881 * Assume the caller holds page lock of the head page.
2882 */
2883void __isolate_hwpoisoned_huge_page(struct page *hpage)
2884{
2885 struct hstate *h = page_hstate(hpage);
2886 int nid = page_to_nid(hpage);
2887
2888 spin_lock(&hugetlb_lock);
2889 list_del(&hpage->lru);
2890 h->free_huge_pages--;
2891 h->free_huge_pages_node[nid]--;
2892 spin_unlock(&hugetlb_lock);
2893}
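
The hugetlb.c hunks above hook huge pages into rmap and the hwpoison machinery (hwpoison swap entries in huge PTEs, VM_FAULT_HWPOISON from hugetlb_fault()). For readers who want to exercise that path, here is a minimal user-space sketch, not part of the commit; it assumes an x86 kernel with CONFIG_MEMORY_FAILURE, 2MB huge pages reserved via /proc/sys/vm/nr_hugepages, CAP_SYS_ADMIN, and the usual x86 values for MAP_HUGETLB and MADV_HWPOISON.

/*
 * Illustrative sketch only (not part of this commit): back a mapping with a
 * huge page, fault it in, then poison it with MADV_HWPOISON so the hwpoison
 * swap entry / VM_FAULT_HWPOISON handling added above comes into play.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB	0x40000		/* x86 value; arch-specific */
#endif
#ifndef MADV_HWPOISON
#define MADV_HWPOISON	100		/* poison a page for testing */
#endif

int main(void)
{
	size_t len = 2UL << 20;		/* assumes 2MB huge pages */
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0, len);		/* fault the huge page in */

	if (madvise(p, getpagesize(), MADV_HWPOISON))
		perror("madvise(MADV_HWPOISON)");
	/* after this, touching the poisoned huge page should raise SIGBUS */
	return 0;
}
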
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1f..0948f1072d6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h> 6#include <linux/swap.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/hugetlb.h>
8#include "internal.h" 9#include "internal.h"
9 10
10static struct dentry *hwpoison_dir; 11static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
13{ 14{
14 unsigned long pfn = val; 15 unsigned long pfn = val;
15 struct page *p; 16 struct page *p;
17 struct page *hpage;
16 int err; 18 int err;
17 19
18 if (!capable(CAP_SYS_ADMIN)) 20 if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
24 return -ENXIO; 26 return -ENXIO;
25 27
26 p = pfn_to_page(pfn); 28 p = pfn_to_page(pfn);
29 hpage = compound_head(p);
27 /* 30 /*
28 * This implies unable to support free buddy pages. 31 * This implies unable to support free buddy pages.
29 */ 32 */
30 if (!get_page_unless_zero(p)) 33 if (!get_page_unless_zero(hpage))
31 return 0; 34 return 0;
32 35
33 if (!PageLRU(p)) 36 if (!PageLRU(p) && !PageHuge(p))
34 shake_page(p, 0); 37 shake_page(p, 0);
35 /* 38 /*
36 * This implies unable to support non-LRU pages. 39 * This implies unable to support non-LRU pages.
37 */ 40 */
38 if (!PageLRU(p)) 41 if (!PageLRU(p) && !PageHuge(p))
39 return 0; 42 return 0;
40 43
41 /* 44 /*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
44 * We temporarily take page lock for try_get_mem_cgroup_from_page(). 47 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock. 48 * __memory_failure() will redo the check reliably inside page lock.
46 */ 49 */
47 lock_page(p); 50 lock_page(hpage);
48 err = hwpoison_filter(p); 51 err = hwpoison_filter(hpage);
49 unlock_page(p); 52 unlock_page(hpage);
50 if (err) 53 if (err)
51 return 0; 54 return 0;
52 55
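
The hwpoison-inject.c hunk above makes the debugfs injector operate on the compound head before locking and filtering, so huge pages can be injected as well. As a usage illustration (not part of the commit), a pfn can be fed to the injector through debugfs; the path below assumes debugfs is mounted at /sys/kernel/debug and that the module registers its control file as hwpoison/corrupt-pfn, and it needs CAP_SYS_ADMIN.

/*
 * Illustrative sketch only: write a pfn to the hwpoison injector's debugfs
 * file. The "hwpoison/corrupt-pfn" name is an assumption about how this
 * module registers its debugfs entries; adjust to match your tree.
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	unsigned long pfn;
	FILE *f;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoul(argv[1], NULL, 0);

	f = fopen("/sys/kernel/debug/hwpoison/corrupt-pfn", "w");
	if (!f) {
		perror("corrupt-pfn");
		return 1;
	}
	fprintf(f, "%lu\n", pfn);	/* the injector handler runs on this write */
	if (fclose(f)) {
		perror("write");
		return 1;
	}
	return 0;
}
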
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e64586..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
47#include <linux/mm_inline.h> 47#include <linux/mm_inline.h>
48#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
49#include <linux/cpu.h> 49#include <linux/cpu.h>
50#include <linux/oom.h>
50#include "internal.h" 51#include "internal.h"
51 52
52#include <asm/uaccess.h> 53#include <asm/uaccess.h>
@@ -268,6 +269,7 @@ enum move_type {
268 269
269/* "mc" and its members are protected by cgroup_mutex */ 270/* "mc" and its members are protected by cgroup_mutex */
270static struct move_charge_struct { 271static struct move_charge_struct {
272 spinlock_t lock; /* for from, to, moving_task */
271 struct mem_cgroup *from; 273 struct mem_cgroup *from;
272 struct mem_cgroup *to; 274 struct mem_cgroup *to;
273 unsigned long precharge; 275 unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
276 struct task_struct *moving_task; /* a task moving charges */ 278 struct task_struct *moving_task; /* a task moving charges */
277 wait_queue_head_t waitq; /* a waitq for other context */ 279 wait_queue_head_t waitq; /* a waitq for other context */
278} mc = { 280} mc = {
281 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
279 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 282 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
280}; 283};
281 284
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
836{ 839{
837 int ret; 840 int ret;
838 struct mem_cgroup *curr = NULL; 841 struct mem_cgroup *curr = NULL;
842 struct task_struct *p;
839 843
840 task_lock(task); 844 p = find_lock_task_mm(task);
841 rcu_read_lock(); 845 if (!p)
842 curr = try_get_mem_cgroup_from_mm(task->mm); 846 return 0;
843 rcu_read_unlock(); 847 curr = try_get_mem_cgroup_from_mm(p->mm);
844 task_unlock(task); 848 task_unlock(p);
845 if (!curr) 849 if (!curr)
846 return 0; 850 return 0;
847 /* 851 /*
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
915 struct zone *zone, 919 struct zone *zone,
916 enum lru_list lru) 920 enum lru_list lru)
917{ 921{
918 int nid = zone->zone_pgdat->node_id; 922 int nid = zone_to_nid(zone);
919 int zid = zone_idx(zone); 923 int zid = zone_idx(zone);
920 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 924 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
921 925
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
925struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 929struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
926 struct zone *zone) 930 struct zone *zone)
927{ 931{
928 int nid = zone->zone_pgdat->node_id; 932 int nid = zone_to_nid(zone);
929 int zid = zone_idx(zone); 933 int zid = zone_idx(zone);
930 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 934 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
931 935
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
970 LIST_HEAD(pc_list); 974 LIST_HEAD(pc_list);
971 struct list_head *src; 975 struct list_head *src;
972 struct page_cgroup *pc, *tmp; 976 struct page_cgroup *pc, *tmp;
973 int nid = z->zone_pgdat->node_id; 977 int nid = zone_to_nid(z);
974 int zid = zone_idx(z); 978 int zid = zone_idx(z);
975 struct mem_cgroup_per_zone *mz; 979 struct mem_cgroup_per_zone *mz;
976 int lru = LRU_FILE * file + active; 980 int lru = LRU_FILE * file + active;
@@ -1047,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1047 return swappiness; 1051 return swappiness;
1048} 1052}
1049 1053
1054/* A routine for testing mem is not under move_account */
1055
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{
1058 struct mem_cgroup *from;
1059 struct mem_cgroup *to;
1060 bool ret = false;
1061 /*
1062 * Unlike task_move routines, we access mc.to, mc.from not under
1063 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1064 */
1065 spin_lock(&mc.lock);
1066 from = mc.from;
1067 to = mc.to;
1068 if (!from)
1069 goto unlock;
1070 if (from == mem || to == mem
1071 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1072 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1073 ret = true;
1074unlock:
1075 spin_unlock(&mc.lock);
1076 return ret;
1077}
1078
1079static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1080{
1081 if (mc.moving_task && current != mc.moving_task) {
1082 if (mem_cgroup_under_move(mem)) {
1083 DEFINE_WAIT(wait);
1084 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1085 /* moving charge context might have finished. */
1086 if (mc.moving_task)
1087 schedule();
1088 finish_wait(&mc.waitq, &wait);
1089 return true;
1090 }
1091 }
1092 return false;
1093}
1094
1050static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1051{ 1096{
1052 int *val = data; 1097 int *val = data;
@@ -1255,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1255 /* we use swappiness of local cgroup */ 1300 /* we use swappiness of local cgroup */
1256 if (check_soft) 1301 if (check_soft)
1257 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1302 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1258 noswap, get_swappiness(victim), zone, 1303 noswap, get_swappiness(victim), zone);
1259 zone->zone_pgdat->node_id);
1260 else 1304 else
1261 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1305 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1262 noswap, get_swappiness(victim)); 1306 noswap, get_swappiness(victim));
@@ -1363,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1363 1407
1364static void memcg_oom_recover(struct mem_cgroup *mem) 1408static void memcg_oom_recover(struct mem_cgroup *mem)
1365{ 1409{
1366 if (atomic_read(&mem->oom_lock)) 1410 if (mem && atomic_read(&mem->oom_lock))
1367 memcg_wakeup_oom(mem); 1411 memcg_wakeup_oom(mem);
1368} 1412}
1369 1413
@@ -1575,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1575 return NOTIFY_OK; 1619 return NOTIFY_OK;
1576} 1620}
1577 1621
1622
1623/* See __mem_cgroup_try_charge() for details */
1624enum {
1625 CHARGE_OK, /* success */
1626 CHARGE_RETRY, /* need to retry but retry is not bad */
1627 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
 1628 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */
1629 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1630};
1631
1632static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1633 int csize, bool oom_check)
1634{
1635 struct mem_cgroup *mem_over_limit;
1636 struct res_counter *fail_res;
1637 unsigned long flags = 0;
1638 int ret;
1639
1640 ret = res_counter_charge(&mem->res, csize, &fail_res);
1641
1642 if (likely(!ret)) {
1643 if (!do_swap_account)
1644 return CHARGE_OK;
1645 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1646 if (likely(!ret))
1647 return CHARGE_OK;
1648
1649 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1650 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1651 } else
1652 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1653
1654 if (csize > PAGE_SIZE) /* change csize and retry */
1655 return CHARGE_RETRY;
1656
1657 if (!(gfp_mask & __GFP_WAIT))
1658 return CHARGE_WOULDBLOCK;
1659
1660 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1661 gfp_mask, flags);
1662 /*
1663 * try_to_free_mem_cgroup_pages() might not give us a full
1664 * picture of reclaim. Some pages are reclaimed and might be
1665 * moved to swap cache or just unmapped from the cgroup.
1666 * Check the limit again to see if the reclaim reduced the
1667 * current usage of the cgroup before giving up
1668 */
1669 if (ret || mem_cgroup_check_under_limit(mem_over_limit))
1670 return CHARGE_RETRY;
1671
1672 /*
1673 * At task move, charge accounts can be doubly counted. So, it's
1674 * better to wait until the end of task_move if something is going on.
1675 */
1676 if (mem_cgroup_wait_acct_move(mem_over_limit))
1677 return CHARGE_RETRY;
1678
 1679 /* If we don't need to call oom-killer at all, return immediately */
1680 if (!oom_check)
1681 return CHARGE_NOMEM;
1682 /* check OOM */
1683 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1684 return CHARGE_OOM_DIE;
1685
1686 return CHARGE_RETRY;
1687}
1688
1578/* 1689/*
1579 * Unlike exported interface, "oom" parameter is added. if oom==true, 1690 * Unlike exported interface, "oom" parameter is added. if oom==true,
1580 * oom-killer can be invoked. 1691 * oom-killer can be invoked.
1581 */ 1692 */
1582static int __mem_cgroup_try_charge(struct mm_struct *mm, 1693static int __mem_cgroup_try_charge(struct mm_struct *mm,
1583 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1694 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1584{ 1695{
1585 struct mem_cgroup *mem, *mem_over_limit; 1696 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1586 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1697 struct mem_cgroup *mem = NULL;
1587 struct res_counter *fail_res; 1698 int ret;
1588 int csize = CHARGE_SIZE; 1699 int csize = CHARGE_SIZE;
1589 1700
1590 /* 1701 /*
@@ -1602,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1602 * thread group leader migrates. It's possible that mm is not 1713 * thread group leader migrates. It's possible that mm is not
1603 * set, if so charge the init_mm (happens for pagecache usage). 1714 * set, if so charge the init_mm (happens for pagecache usage).
1604 */ 1715 */
1605 mem = *memcg; 1716 if (!*memcg && !mm)
1606 if (likely(!mem)) { 1717 goto bypass;
1607 mem = try_get_mem_cgroup_from_mm(mm); 1718again:
1608 *memcg = mem; 1719 if (*memcg) { /* css should be a valid one */
1609 } else { 1720 mem = *memcg;
1610 css_get(&mem->css); 1721 VM_BUG_ON(css_is_removed(&mem->css));
1611 } 1722 if (mem_cgroup_is_root(mem))
1612 if (unlikely(!mem)) 1723 goto done;
1613 return 0;
1614
1615 VM_BUG_ON(css_is_removed(&mem->css));
1616 if (mem_cgroup_is_root(mem))
1617 goto done;
1618
1619 while (1) {
1620 int ret = 0;
1621 unsigned long flags = 0;
1622
1623 if (consume_stock(mem)) 1724 if (consume_stock(mem))
1624 goto done; 1725 goto done;
1726 css_get(&mem->css);
1727 } else {
1728 struct task_struct *p;
1625 1729
1626 ret = res_counter_charge(&mem->res, csize, &fail_res); 1730 rcu_read_lock();
1627 if (likely(!ret)) { 1731 p = rcu_dereference(mm->owner);
1628 if (!do_swap_account) 1732 VM_BUG_ON(!p);
1629 break;
1630 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1631 if (likely(!ret))
1632 break;
1633 /* mem+swap counter fails */
1634 res_counter_uncharge(&mem->res, csize);
1635 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1636 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1637 memsw);
1638 } else
1639 /* mem counter fails */
1640 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1641 res);
1642
1643 /* reduce request size and retry */
1644 if (csize > PAGE_SIZE) {
1645 csize = PAGE_SIZE;
1646 continue;
1647 }
1648 if (!(gfp_mask & __GFP_WAIT))
1649 goto nomem;
1650
1651 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1652 gfp_mask, flags);
1653 if (ret)
1654 continue;
1655
1656 /* 1733 /*
1657 * try_to_free_mem_cgroup_pages() might not give us a full 1734 * because we don't have task_lock(), "p" can exit while
1658 * picture of reclaim. Some pages are reclaimed and might be 1735 * we're here. In that case, "mem" can point to root
1659 * moved to swap cache or just unmapped from the cgroup. 1736 * cgroup but never be NULL. (and task_struct itself is freed
1660 * Check the limit again to see if the reclaim reduced the 1737 * by RCU, cgroup itself is RCU safe.) Then, we have small
1661 * current usage of the cgroup before giving up 1738 * risk here to get wrong cgroup. But such kind of mis-account
1662 * 1739 * by race always happens because we don't have cgroup_mutex().
1740 * It's overkill and we allow that small race, here.
1663 */ 1741 */
1664 if (mem_cgroup_check_under_limit(mem_over_limit)) 1742 mem = mem_cgroup_from_task(p);
1665 continue; 1743 VM_BUG_ON(!mem);
1666 1744 if (mem_cgroup_is_root(mem)) {
1667 /* try to avoid oom while someone is moving charge */ 1745 rcu_read_unlock();
1668 if (mc.moving_task && current != mc.moving_task) { 1746 goto done;
1669 struct mem_cgroup *from, *to; 1747 }
1670 bool do_continue = false; 1748 if (consume_stock(mem)) {
1671 /* 1749 /*
1672 * There is a small race that "from" or "to" can be 1750 * It seems dagerous to access memcg without css_get().
1673 * freed by rmdir, so we use css_tryget(). 1751 * But considering how consume_stok works, it's not
1752 * necessary. If consume_stock success, some charges
1753 * from this memcg are cached on this cpu. So, we
1754 * don't need to call css_get()/css_tryget() before
1755 * calling consume_stock().
1674 */ 1756 */
1675 from = mc.from; 1757 rcu_read_unlock();
1676 to = mc.to; 1758 goto done;
1677 if (from && css_tryget(&from->css)) { 1759 }
1678 if (mem_over_limit->use_hierarchy) 1760 /* after here, we may be blocked. we need to get refcnt */
1679 do_continue = css_is_ancestor( 1761 if (!css_tryget(&mem->css)) {
1680 &from->css, 1762 rcu_read_unlock();
1681 &mem_over_limit->css); 1763 goto again;
1682 else 1764 }
1683 do_continue = (from == mem_over_limit); 1765 rcu_read_unlock();
1684 css_put(&from->css); 1766 }
1685 } 1767
1686 if (!do_continue && to && css_tryget(&to->css)) { 1768 do {
1687 if (mem_over_limit->use_hierarchy) 1769 bool oom_check;
1688 do_continue = css_is_ancestor( 1770
1689 &to->css, 1771 /* If killed, bypass charge */
1690 &mem_over_limit->css); 1772 if (fatal_signal_pending(current)) {
1691 else 1773 css_put(&mem->css);
1692 do_continue = (to == mem_over_limit); 1774 goto bypass;
1693 css_put(&to->css);
1694 }
1695 if (do_continue) {
1696 DEFINE_WAIT(wait);
1697 prepare_to_wait(&mc.waitq, &wait,
1698 TASK_INTERRUPTIBLE);
1699 /* moving charge context might have finished. */
1700 if (mc.moving_task)
1701 schedule();
1702 finish_wait(&mc.waitq, &wait);
1703 continue;
1704 }
1705 } 1775 }
1706 1776
1707 if (!nr_retries--) { 1777 oom_check = false;
1708 if (!oom) 1778 if (oom && !nr_oom_retries) {
1779 oom_check = true;
1780 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1781 }
1782
1783 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
1784
1785 switch (ret) {
1786 case CHARGE_OK:
1787 break;
1788 case CHARGE_RETRY: /* not in OOM situation but retry */
1789 csize = PAGE_SIZE;
1790 css_put(&mem->css);
1791 mem = NULL;
1792 goto again;
1793 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
1794 css_put(&mem->css);
1795 goto nomem;
1796 case CHARGE_NOMEM: /* OOM routine works */
1797 if (!oom) {
1798 css_put(&mem->css);
1709 goto nomem; 1799 goto nomem;
1710 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1711 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1712 continue;
1713 } 1800 }
1714 /* When we reach here, current task is dying .*/ 1801 /* If oom, we never return -ENOMEM */
1802 nr_oom_retries--;
1803 break;
1804 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
1715 css_put(&mem->css); 1805 css_put(&mem->css);
1716 goto bypass; 1806 goto bypass;
1717 } 1807 }
1718 } 1808 } while (ret != CHARGE_OK);
1809
1719 if (csize > PAGE_SIZE) 1810 if (csize > PAGE_SIZE)
1720 refill_stock(mem, csize - PAGE_SIZE); 1811 refill_stock(mem, csize - PAGE_SIZE);
1812 css_put(&mem->css);
1721done: 1813done:
1814 *memcg = mem;
1722 return 0; 1815 return 0;
1723nomem: 1816nomem:
1724 css_put(&mem->css); 1817 *memcg = NULL;
1725 return -ENOMEM; 1818 return -ENOMEM;
1726bypass: 1819bypass:
1727 *memcg = NULL; 1820 *memcg = NULL;
@@ -1740,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1740 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 1833 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1741 if (do_swap_account) 1834 if (do_swap_account)
1742 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 1835 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1743 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1744 WARN_ON_ONCE(count > INT_MAX);
1745 __css_put(&mem->css, (int)count);
1746 } 1836 }
1747 /* we don't need css_put for root */
1748} 1837}
1749 1838
1750static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1839static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1972,10 +2061,9 @@ out:
1972 * < 0 if the cgroup is over its limit 2061 * < 0 if the cgroup is over its limit
1973 */ 2062 */
1974static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2063static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1975 gfp_t gfp_mask, enum charge_type ctype, 2064 gfp_t gfp_mask, enum charge_type ctype)
1976 struct mem_cgroup *memcg)
1977{ 2065{
1978 struct mem_cgroup *mem; 2066 struct mem_cgroup *mem = NULL;
1979 struct page_cgroup *pc; 2067 struct page_cgroup *pc;
1980 int ret; 2068 int ret;
1981 2069
@@ -1985,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1985 return 0; 2073 return 0;
1986 prefetchw(pc); 2074 prefetchw(pc);
1987 2075
1988 mem = memcg;
1989 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2076 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1990 if (ret || !mem) 2077 if (ret || !mem)
1991 return ret; 2078 return ret;
@@ -2013,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2013 if (unlikely(!mm)) 2100 if (unlikely(!mm))
2014 mm = &init_mm; 2101 mm = &init_mm;
2015 return mem_cgroup_charge_common(page, mm, gfp_mask, 2102 return mem_cgroup_charge_common(page, mm, gfp_mask,
2016 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 2103 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2017} 2104}
2018 2105
2019static void 2106static void
@@ -2023,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2023int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2110int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2024 gfp_t gfp_mask) 2111 gfp_t gfp_mask)
2025{ 2112{
2026 struct mem_cgroup *mem = NULL;
2027 int ret; 2113 int ret;
2028 2114
2029 if (mem_cgroup_disabled()) 2115 if (mem_cgroup_disabled())
@@ -2044,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2044 if (!(gfp_mask & __GFP_WAIT)) { 2130 if (!(gfp_mask & __GFP_WAIT)) {
2045 struct page_cgroup *pc; 2131 struct page_cgroup *pc;
2046 2132
2047
2048 pc = lookup_page_cgroup(page); 2133 pc = lookup_page_cgroup(page);
2049 if (!pc) 2134 if (!pc)
2050 return 0; 2135 return 0;
@@ -2056,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2056 unlock_page_cgroup(pc); 2141 unlock_page_cgroup(pc);
2057 } 2142 }
2058 2143
2059 if (unlikely(!mm && !mem)) 2144 if (unlikely(!mm))
2060 mm = &init_mm; 2145 mm = &init_mm;
2061 2146
2062 if (page_is_file_cache(page)) 2147 if (page_is_file_cache(page))
2063 return mem_cgroup_charge_common(page, mm, gfp_mask, 2148 return mem_cgroup_charge_common(page, mm, gfp_mask,
2064 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 2149 MEM_CGROUP_CHARGE_TYPE_CACHE);
2065 2150
2066 /* shmem */ 2151 /* shmem */
2067 if (PageSwapCache(page)) { 2152 if (PageSwapCache(page)) {
2153 struct mem_cgroup *mem = NULL;
2154
2068 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2155 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2069 if (!ret) 2156 if (!ret)
2070 __mem_cgroup_commit_charge_swapin(page, mem, 2157 __mem_cgroup_commit_charge_swapin(page, mem,
2071 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2158 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2072 } else 2159 } else
2073 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2160 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2074 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 2161 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2075 2162
2076 return ret; 2163 return ret;
2077} 2164}
@@ -2107,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2107 goto charge_cur_mm; 2194 goto charge_cur_mm;
2108 *ptr = mem; 2195 *ptr = mem;
2109 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2196 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2110 /* drop extra refcnt from tryget */
2111 css_put(&mem->css); 2197 css_put(&mem->css);
2112 return ret; 2198 return ret;
2113charge_cur_mm: 2199charge_cur_mm:
@@ -2238,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2238{ 2324{
2239 struct page_cgroup *pc; 2325 struct page_cgroup *pc;
2240 struct mem_cgroup *mem = NULL; 2326 struct mem_cgroup *mem = NULL;
2241 struct mem_cgroup_per_zone *mz;
2242 2327
2243 if (mem_cgroup_disabled()) 2328 if (mem_cgroup_disabled())
2244 return NULL; 2329 return NULL;
@@ -2278,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2278 break; 2363 break;
2279 } 2364 }
2280 2365
2281 if (!mem_cgroup_is_root(mem))
2282 __do_uncharge(mem, ctype);
2283 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2284 mem_cgroup_swap_statistics(mem, true);
2285 mem_cgroup_charge_statistics(mem, pc, false); 2366 mem_cgroup_charge_statistics(mem, pc, false);
2286 2367
2287 ClearPageCgroupUsed(pc); 2368 ClearPageCgroupUsed(pc);
@@ -2292,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2292 * special functions. 2373 * special functions.
2293 */ 2374 */
2294 2375
2295 mz = page_cgroup_zoneinfo(pc);
2296 unlock_page_cgroup(pc); 2376 unlock_page_cgroup(pc);
2297 2377 /*
2378 * even after unlock, we have mem->res.usage here and this memcg
2379 * will never be freed.
2380 */
2298 memcg_check_events(mem, page); 2381 memcg_check_events(mem, page);
2299 /* at swapout, this memcg will be accessed to record to swap */ 2382 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2300 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2383 mem_cgroup_swap_statistics(mem, true);
2301 css_put(&mem->css); 2384 mem_cgroup_get(mem);
2385 }
2386 if (!mem_cgroup_is_root(mem))
2387 __do_uncharge(mem, ctype);
2302 2388
2303 return mem; 2389 return mem;
2304 2390
@@ -2385,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2385 2471
2386 memcg = __mem_cgroup_uncharge_common(page, ctype); 2472 memcg = __mem_cgroup_uncharge_common(page, ctype);
2387 2473
2388 /* record memcg information */ 2474 /*
2389 if (do_swap_account && swapout && memcg) { 2475 * record memcg information, if swapout && memcg != NULL,
2476 * mem_cgroup_get() was called in uncharge().
2477 */
2478 if (do_swap_account && swapout && memcg)
2390 swap_cgroup_record(ent, css_id(&memcg->css)); 2479 swap_cgroup_record(ent, css_id(&memcg->css));
2391 mem_cgroup_get(memcg);
2392 }
2393 if (swapout && memcg)
2394 css_put(&memcg->css);
2395} 2480}
2396#endif 2481#endif
2397 2482
@@ -2469,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
2469 */ 2554 */
2470 if (!mem_cgroup_is_root(to)) 2555 if (!mem_cgroup_is_root(to))
2471 res_counter_uncharge(&to->res, PAGE_SIZE); 2556 res_counter_uncharge(&to->res, PAGE_SIZE);
2472 css_put(&to->css);
2473 } 2557 }
2474 return 0; 2558 return 0;
2475 } 2559 }
@@ -2604,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2604 ClearPageCgroupMigration(pc); 2688 ClearPageCgroupMigration(pc);
2605 unlock_page_cgroup(pc); 2689 unlock_page_cgroup(pc);
2606 2690
2607 if (unused != oldpage)
2608 pc = lookup_page_cgroup(unused);
2609 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 2691 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2610 2692
2611 pc = lookup_page_cgroup(used);
2612 /* 2693 /*
2613 * If a page is a file cache, radix-tree replacement is very atomic 2694 * If a page is a file cache, radix-tree replacement is very atomic
2614 * and we can skip this check. When it was an Anon page, its mapcount 2695 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2784,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2784} 2865}
2785 2866
2786unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2867unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2787 gfp_t gfp_mask, int nid, 2868 gfp_t gfp_mask)
2788 int zid)
2789{ 2869{
2790 unsigned long nr_reclaimed = 0; 2870 unsigned long nr_reclaimed = 0;
2791 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2871 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2797,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2797 if (order > 0) 2877 if (order > 0)
2798 return 0; 2878 return 0;
2799 2879
2800 mctz = soft_limit_tree_node_zone(nid, zid); 2880 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2801 /* 2881 /*
2802 * This loop can run a while, specially if mem_cgroup's continuously 2882 * This loop can run a while, specially if mem_cgroup's continuously
2803 * keep exceeding their soft limit and putting the system under 2883 * keep exceeding their soft limit and putting the system under
@@ -3752,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3752 return 0; 3832 return 0;
3753} 3833}
3754 3834
3755/*
3756 */
3757static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 3835static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3758 struct cftype *cft, u64 val) 3836 struct cftype *cft, u64 val)
3759{ 3837{
@@ -4173,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
4173 goto one_by_one; 4251 goto one_by_one;
4174 } 4252 }
4175 mc.precharge += count; 4253 mc.precharge += count;
4176 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4177 WARN_ON_ONCE(count > INT_MAX);
4178 __css_get(&mem->css, (int)count);
4179 return ret; 4254 return ret;
4180 } 4255 }
4181one_by_one: 4256one_by_one:
@@ -4393,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4393 4468
4394static void mem_cgroup_clear_mc(void) 4469static void mem_cgroup_clear_mc(void)
4395{ 4470{
4471 struct mem_cgroup *from = mc.from;
4472 struct mem_cgroup *to = mc.to;
4473
4396 /* we must uncharge all the leftover precharges from mc.to */ 4474 /* we must uncharge all the leftover precharges from mc.to */
4397 if (mc.precharge) { 4475 if (mc.precharge) {
4398 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4476 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4399 mc.precharge = 0; 4477 mc.precharge = 0;
4400 memcg_oom_recover(mc.to);
4401 } 4478 }
4402 /* 4479 /*
4403 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4480 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4406,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
4406 if (mc.moved_charge) { 4483 if (mc.moved_charge) {
4407 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4484 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4408 mc.moved_charge = 0; 4485 mc.moved_charge = 0;
4409 memcg_oom_recover(mc.from);
4410 } 4486 }
4411 /* we must fixup refcnts and charges */ 4487 /* we must fixup refcnts and charges */
4412 if (mc.moved_swap) { 4488 if (mc.moved_swap) {
4413 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4414 /* uncharge swap account from the old cgroup */ 4489 /* uncharge swap account from the old cgroup */
4415 if (!mem_cgroup_is_root(mc.from)) 4490 if (!mem_cgroup_is_root(mc.from))
4416 res_counter_uncharge(&mc.from->memsw, 4491 res_counter_uncharge(&mc.from->memsw,
@@ -4424,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
4424 */ 4499 */
4425 res_counter_uncharge(&mc.to->res, 4500 res_counter_uncharge(&mc.to->res,
4426 PAGE_SIZE * mc.moved_swap); 4501 PAGE_SIZE * mc.moved_swap);
4427 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4428 __css_put(&mc.to->css, mc.moved_swap);
4429 } 4502 }
4430 /* we've already done mem_cgroup_get(mc.to) */ 4503 /* we've already done mem_cgroup_get(mc.to) */
4431 4504
4432 mc.moved_swap = 0; 4505 mc.moved_swap = 0;
4433 } 4506 }
4507 spin_lock(&mc.lock);
4434 mc.from = NULL; 4508 mc.from = NULL;
4435 mc.to = NULL; 4509 mc.to = NULL;
4436 mc.moving_task = NULL; 4510 mc.moving_task = NULL;
4511 spin_unlock(&mc.lock);
4512 memcg_oom_recover(from);
4513 memcg_oom_recover(to);
4437 wake_up_all(&mc.waitq); 4514 wake_up_all(&mc.waitq);
4438} 4515}
4439 4516
@@ -4462,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4462 VM_BUG_ON(mc.moved_charge); 4539 VM_BUG_ON(mc.moved_charge);
4463 VM_BUG_ON(mc.moved_swap); 4540 VM_BUG_ON(mc.moved_swap);
4464 VM_BUG_ON(mc.moving_task); 4541 VM_BUG_ON(mc.moving_task);
4542 spin_lock(&mc.lock);
4465 mc.from = from; 4543 mc.from = from;
4466 mc.to = mem; 4544 mc.to = mem;
4467 mc.precharge = 0; 4545 mc.precharge = 0;
4468 mc.moved_charge = 0; 4546 mc.moved_charge = 0;
4469 mc.moved_swap = 0; 4547 mc.moved_swap = 0;
4470 mc.moving_task = current; 4548 mc.moving_task = current;
4549 spin_unlock(&mc.lock);
4471 4550
4472 ret = mem_cgroup_precharge_mc(mm); 4551 ret = mem_cgroup_precharge_mc(mm);
4473 if (ret) 4552 if (ret)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52cacaa..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/swapops.h> 48#include <linux/swapops.h>
49#include <linux/hugetlb.h>
49#include "internal.h" 50#include "internal.h"
50 51
51int sysctl_memory_failure_early_kill __read_mostly = 0; 52int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
690/* 691/*
691 * Huge pages. Needs work. 692 * Huge pages. Needs work.
692 * Issues: 693 * Issues:
693 * No rmap support so we cannot find the original mapper. In theory could walk 694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
694 * all MMs and look for the mappings, but that would be non atomic and racy. 695 * To narrow down kill region to one page, we need to break up pmd.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic, 696 * - To support soft-offlining for hugepage, we need to support hugepage
696 * like just walking the current process and hoping it has it mapped (that 697 * migration.
697 * should be usually true for the common "shared database cache" case)
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */ 698 */
701static int me_huge_page(struct page *p, unsigned long pfn) 699static int me_huge_page(struct page *p, unsigned long pfn)
702{ 700{
703 return FAILED; 701 struct page *hpage = compound_head(p);
702 /*
703 * We can safely recover from error on free or reserved (i.e.
704 * not in-use) hugepage by dequeuing it from freelist.
705 * To check whether a hugepage is in-use or not, we can't use
706 * page->lru because it can be used in other hugepage operations,
707 * such as __unmap_hugepage_range() and gather_surplus_pages().
708 * So instead we use page_mapping() and PageAnon().
709 * We assume that this function is called with page lock held,
710 * so there is no race between isolation and mapping/unmapping.
711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage);
714 return RECOVERED;
715 }
716 return DELAYED;
704} 717}
705 718
706/* 719/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
838 int ret; 851 int ret;
839 int i; 852 int i;
840 int kill = 1; 853 int kill = 1;
854 struct page *hpage = compound_head(p);
841 855
842 if (PageReserved(p) || PageSlab(p)) 856 if (PageReserved(p) || PageSlab(p))
843 return SWAP_SUCCESS; 857 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
846 * This check implies we don't kill processes if their pages 860 * This check implies we don't kill processes if their pages
847 * are in the swap cache early. Those are always late kills. 861 * are in the swap cache early. Those are always late kills.
848 */ 862 */
849 if (!page_mapped(p)) 863 if (!page_mapped(hpage))
850 return SWAP_SUCCESS; 864 return SWAP_SUCCESS;
851 865
852 if (PageCompound(p) || PageKsm(p)) 866 if (PageKsm(p))
853 return SWAP_FAIL; 867 return SWAP_FAIL;
854 868
855 if (PageSwapCache(p)) { 869 if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
864 * XXX: the dirty test could be racy: set_page_dirty() may not always 878 * XXX: the dirty test could be racy: set_page_dirty() may not always
865 * be called inside page lock (it's recommended but not enforced). 879 * be called inside page lock (it's recommended but not enforced).
866 */ 880 */
867 mapping = page_mapping(p); 881 mapping = page_mapping(hpage);
868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 882 if (!PageDirty(hpage) && mapping &&
869 if (page_mkclean(p)) { 883 mapping_cap_writeback_dirty(mapping)) {
870 SetPageDirty(p); 884 if (page_mkclean(hpage)) {
885 SetPageDirty(hpage);
871 } else { 886 } else {
872 kill = 0; 887 kill = 0;
873 ttu |= TTU_IGNORE_HWPOISON; 888 ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
886 * there's nothing that can be done. 901 * there's nothing that can be done.
887 */ 902 */
888 if (kill) 903 if (kill)
889 collect_procs(p, &tokill); 904 collect_procs(hpage, &tokill);
890 905
891 /* 906 /*
892 * try_to_unmap can fail temporarily due to races. 907 * try_to_unmap can fail temporarily due to races.
893 * Try a few times (RED-PEN better strategy?) 908 * Try a few times (RED-PEN better strategy?)
894 */ 909 */
895 for (i = 0; i < N_UNMAP_TRIES; i++) { 910 for (i = 0; i < N_UNMAP_TRIES; i++) {
896 ret = try_to_unmap(p, ttu); 911 ret = try_to_unmap(hpage, ttu);
897 if (ret == SWAP_SUCCESS) 912 if (ret == SWAP_SUCCESS)
898 break; 913 break;
899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 916
902 if (ret != SWAP_SUCCESS) 917 if (ret != SWAP_SUCCESS)
903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
904 pfn, page_mapcount(p)); 919 pfn, page_mapcount(hpage));
905 920
906 /* 921 /*
907 * Now that the dirty bit has been propagated to the 922 * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
912 * use a more force-full uncatchable kill to prevent 927 * use a more force-full uncatchable kill to prevent
913 * any accesses to the poisoned memory. 928 * any accesses to the poisoned memory.
914 */ 929 */
915 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
916 ret != SWAP_SUCCESS, pfn); 931 ret != SWAP_SUCCESS, pfn);
917 932
918 return ret; 933 return ret;
919} 934}
920 935
936static void set_page_hwpoison_huge_page(struct page *hpage)
937{
938 int i;
939 int nr_pages = 1 << compound_order(hpage);
940 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i);
942}
943
944static void clear_page_hwpoison_huge_page(struct page *hpage)
945{
946 int i;
947 int nr_pages = 1 << compound_order(hpage);
948 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i);
950}
951
921int __memory_failure(unsigned long pfn, int trapno, int flags) 952int __memory_failure(unsigned long pfn, int trapno, int flags)
922{ 953{
923 struct page_state *ps; 954 struct page_state *ps;
924 struct page *p; 955 struct page *p;
956 struct page *hpage;
925 int res; 957 int res;
958 unsigned int nr_pages;
926 959
927 if (!sysctl_memory_failure_recovery) 960 if (!sysctl_memory_failure_recovery)
928 panic("Memory failure from trap %d on page %lx", trapno, pfn); 961 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
935 } 968 }
936 969
937 p = pfn_to_page(pfn); 970 p = pfn_to_page(pfn);
971 hpage = compound_head(p);
938 if (TestSetPageHWPoison(p)) { 972 if (TestSetPageHWPoison(p)) {
939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 973 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
940 return 0; 974 return 0;
941 } 975 }
942 976
943 atomic_long_add(1, &mce_bad_pages); 977 nr_pages = 1 << compound_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages);
944 979
945 /* 980 /*
946 * We need/can do nothing about count=0 pages. 981 * We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 989 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
955 */ 990 */
956 if (!(flags & MF_COUNT_INCREASED) && 991 if (!(flags & MF_COUNT_INCREASED) &&
957 !get_page_unless_zero(compound_head(p))) { 992 !get_page_unless_zero(hpage)) {
958 if (is_free_buddy_page(p)) { 993 if (is_free_buddy_page(p)) {
959 action_result(pfn, "free buddy", DELAYED); 994 action_result(pfn, "free buddy", DELAYED);
960 return 0; 995 return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
972 * The check (unnecessarily) ignores LRU pages being isolated and 1007 * The check (unnecessarily) ignores LRU pages being isolated and
973 * walked by the page reclaim code, however that's not a big loss. 1008 * walked by the page reclaim code, however that's not a big loss.
974 */ 1009 */
975 if (!PageLRU(p)) 1010 if (!PageLRU(p) && !PageHuge(p))
976 shake_page(p, 0); 1011 shake_page(p, 0);
977 if (!PageLRU(p)) { 1012 if (!PageLRU(p) && !PageHuge(p)) {
978 /* 1013 /*
979 * shake_page could have turned it free. 1014 * shake_page could have turned it free.
980 */ 1015 */
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
992 * It's very difficult to mess with pages currently under IO 1027 * It's very difficult to mess with pages currently under IO
993 * and in many cases impossible, so we just avoid it here. 1028 * and in many cases impossible, so we just avoid it here.
994 */ 1029 */
995 lock_page_nosync(p); 1030 lock_page_nosync(hpage);
996 1031
997 /* 1032 /*
998 * unpoison always clear PG_hwpoison inside page lock 1033 * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1004 } 1039 }
1005 if (hwpoison_filter(p)) { 1040 if (hwpoison_filter(p)) {
1006 if (TestClearPageHWPoison(p)) 1041 if (TestClearPageHWPoison(p))
1007 atomic_long_dec(&mce_bad_pages); 1042 atomic_long_sub(nr_pages, &mce_bad_pages);
1008 unlock_page(p); 1043 unlock_page(hpage);
1009 put_page(p); 1044 put_page(hpage);
1045 return 0;
1046 }
1047
1048 /*
1049 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned
1051 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED);
1055 unlock_page(hpage);
1056 put_page(hpage);
1010 return 0; 1057 return 0;
1011 } 1058 }
1059 /*
1060 * Set PG_hwpoison on all pages in an error hugepage,
1061 * because containment is done in hugepage unit for now.
1062 * Since we have done TestSetPageHWPoison() for the head page with
1063 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1064 */
1065 if (PageHuge(p))
1066 set_page_hwpoison_huge_page(hpage);
1012 1067
1013 wait_on_page_writeback(p); 1068 wait_on_page_writeback(p);
1014 1069
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1039 } 1094 }
1040 } 1095 }
1041out: 1096out:
1042 unlock_page(p); 1097 unlock_page(hpage);
1043 return res; 1098 return res;
1044} 1099}
1045EXPORT_SYMBOL_GPL(__memory_failure); 1100EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
1083 struct page *page; 1138 struct page *page;
1084 struct page *p; 1139 struct page *p;
1085 int freeit = 0; 1140 int freeit = 0;
1141 unsigned int nr_pages;
1086 1142
1087 if (!pfn_valid(pfn)) 1143 if (!pfn_valid(pfn))
1088 return -ENXIO; 1144 return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
1095 return 0; 1151 return 0;
1096 } 1152 }
1097 1153
1154 nr_pages = 1 << compound_order(page);
1155
1098 if (!get_page_unless_zero(page)) { 1156 if (!get_page_unless_zero(page)) {
1099 if (TestClearPageHWPoison(p)) 1157 if (TestClearPageHWPoison(p))
1100 atomic_long_dec(&mce_bad_pages); 1158 atomic_long_sub(nr_pages, &mce_bad_pages);
1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1102 return 0; 1160 return 0;
1103 } 1161 }
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
1109 * the PG_hwpoison page will be caught and isolated on the entrance to 1167 * the PG_hwpoison page will be caught and isolated on the entrance to
1110 * the free buddy page pool. 1168 * the free buddy page pool.
1111 */ 1169 */
1112 if (TestClearPageHWPoison(p)) { 1170 if (TestClearPageHWPoison(page)) {
1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1114 atomic_long_dec(&mce_bad_pages); 1172 atomic_long_sub(nr_pages, &mce_bad_pages);
1115 freeit = 1; 1173 freeit = 1;
1116 } 1174 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1117 unlock_page(page); 1177 unlock_page(page);
1118 1178
1119 put_page(page); 1179 put_page(page);
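
One way to observe the memory-failure accounting change above from user space: mce_bad_pages is now adjusted by the whole compound order of a hugepage, and, assuming it still backs the HardwareCorrupted field of /proc/meminfo as in mainline kernels of this era, a poisoned 2MB hugepage should show up as 2048 kB rather than 4 kB. A small sketch, not part of the commit:

/* Illustrative sketch only: print the HardwareCorrupted line of /proc/meminfo. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HardwareCorrupted:", 18))
			fputs(line, stdout);	/* e.g. "HardwareCorrupted:  2048 kB" */
	fclose(f);
	return 0;
}
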
diff --git a/mm/memory.c b/mm/memory.c
index 858829d06a92..9b3b73f4ae9c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2760,6 +2760,26 @@ out_release:
2760} 2760}
2761 2761
2762/* 2762/*
2763 * This is like a special single-page "expand_downwards()",
2764 * except we must first make sure that 'address-PAGE_SIZE'
2765 * doesn't hit another vma.
2766 *
2767 * The "find_vma()" will do the right thing even if we wrap
2768 */
2769static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2770{
2771 address &= PAGE_MASK;
2772 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2773 address -= PAGE_SIZE;
2774 if (find_vma(vma->vm_mm, address) != vma)
2775 return -ENOMEM;
2776
2777 expand_stack(vma, address);
2778 }
2779 return 0;
2780}
2781
2782/*
2763 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2783 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2764 * but allow concurrent faults), and pte mapped but not yet locked. 2784 * but allow concurrent faults), and pte mapped but not yet locked.
2765 * We return with mmap_sem still held, but pte unmapped and unlocked. 2785 * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2772,6 +2792,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2772 spinlock_t *ptl; 2792 spinlock_t *ptl;
2773 pte_t entry; 2793 pte_t entry;
2774 2794
2795 if (check_stack_guard_page(vma, address) < 0) {
2796 pte_unmap(page_table);
2797 return VM_FAULT_SIGBUS;
2798 }
2799
2775 if (!(flags & FAULT_FLAG_WRITE)) { 2800 if (!(flags & FAULT_FLAG_WRITE)) {
2776 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2801 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2777 vma->vm_page_prot)); 2802 vma->vm_page_prot));
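The new check_stack_guard_page() keeps an anonymous fault on the lowest page of a VM_GROWSDOWN vma from silently landing in the guard gap: the fault is only allowed if the page below can be claimed via expand_stack() without running into another vma. A rough userspace model of that decision, using a sorted array in place of the mm's vma tree and hypothetical model_* helpers:

#include <stddef.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

struct model_vma { unsigned long start, end; int growsdown; };

/* Simplified find_vma(): first vma whose end lies above the address. */
static struct model_vma *model_find_vma(struct model_vma *v, size_t n, unsigned long addr)
{
        for (size_t i = 0; i < n; i++)
                if (addr < v[i].end)
                        return &v[i];
        return NULL;
}

/* Mirror of the guard-page check: a fault on the lowest page of a
 * growsdown vma is refused if the page below belongs to someone else. */
static int model_check_stack_guard_page(struct model_vma *vmas, size_t n,
                                        struct model_vma *vma, unsigned long address)
{
        address &= PAGE_MASK;
        if (vma->growsdown && address == vma->start) {
                unsigned long below = address - PAGE_SIZE;

                if (model_find_vma(vmas, n, below) != vma)
                        return -1;          /* would collide with a neighbouring vma */
                vma->start = below;         /* stands in for expand_stack() */
        }
        return 0;
}

int main(void)
{
        struct model_vma vmas[] = {
                { 0x1000 * 16, 0x1000 * 17, 0 },   /* some other mapping */
                { 0x1000 * 18, 0x1000 * 32, 1 },   /* the stack */
        };

        /* One free page below the stack: the fault grows it. */
        printf("grow: %d\n", model_check_stack_guard_page(vmas, 2, &vmas[1], 0x1000 * 18));
        /* Now the page below touches the other mapping: refuse (SIGBUS path). */
        printf("grow: %d\n", model_check_stack_guard_page(vmas, 2, &vmas[1], 0x1000 * 17));
        return 0;
}

The trick the kernel comment mentions also holds in the model: if the page below is unmapped, find_vma() returns the stack vma itself, so the expansion is permitted.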
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3def05a33d9..5014e50644d1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -106,7 +106,7 @@ static void boost_dying_task_prio(struct task_struct *p,
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with 106 * pointer. Return p, or any of its subthreads with a valid ->mm, with
107 * task_lock() held. 107 * task_lock() held.
108 */ 108 */
109static struct task_struct *find_lock_task_mm(struct task_struct *p) 109struct task_struct *find_lock_task_mm(struct task_struct *p)
110{ 110{
111 struct task_struct *t = p; 111 struct task_struct *t = p;
112 112
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0c6258bd1ba3..20890d80c7ef 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -253,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
253 } 253 }
254} 254}
255 255
256/*
257 * Clip the earned share of dirty pages to that which is actually available.
258 * This avoids exceeding the total dirty_limit when the floating averages
259 * fluctuate too quickly.
260 */
261static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
262 unsigned long dirty, unsigned long *pbdi_dirty)
263{
264 unsigned long avail_dirty;
265
266 avail_dirty = global_page_state(NR_FILE_DIRTY) +
267 global_page_state(NR_WRITEBACK) +
268 global_page_state(NR_UNSTABLE_NFS) +
269 global_page_state(NR_WRITEBACK_TEMP);
270
271 if (avail_dirty < dirty)
272 avail_dirty = dirty - avail_dirty;
273 else
274 avail_dirty = 0;
275
276 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
277 bdi_stat(bdi, BDI_WRITEBACK);
278
279 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
280}
281
282static inline void task_dirties_fraction(struct task_struct *tsk, 256static inline void task_dirties_fraction(struct task_struct *tsk,
283 long *numerator, long *denominator) 257 long *numerator, long *denominator)
284{ 258{
@@ -287,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
287} 261}
288 262
289/* 263/*
290 * scale the dirty limit 264 * task_dirty_limit - scale down dirty throttling threshold for one task
291 * 265 *
292 * task specific dirty limit: 266 * task specific dirty limit:
293 * 267 *
294 * dirty -= (dirty/8) * p_{t} 268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
295 */ 276 */
296static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) 277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
297{ 279{
298 long numerator, denominator; 280 long numerator, denominator;
299 unsigned long dirty = *pdirty; 281 unsigned long dirty = bdi_dirty;
300 u64 inv = dirty >> 3; 282 u64 inv = dirty >> 3;
301 283
302 task_dirties_fraction(tsk, &numerator, &denominator); 284 task_dirties_fraction(tsk, &numerator, &denominator);
@@ -304,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
304 do_div(inv, denominator); 286 do_div(inv, denominator);
305 287
306 dirty -= inv; 288 dirty -= inv;
307 if (dirty < *pdirty/2)
308 dirty = *pdirty/2;
309 289
310 *pdirty = dirty; 290 return max(dirty, bdi_dirty/2);
311} 291}
312 292
313/* 293/*
@@ -417,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
417 return x + 1; /* Ensure that we never return 0 */ 397 return x + 1; /* Ensure that we never return 0 */
418} 398}
419 399
420void 400/**
421get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, 401 * global_dirty_limits - background-writeback and dirty-throttling thresholds
422 unsigned long *pbdi_dirty, struct backing_dev_info *bdi) 402 *
403 * Calculate the dirty thresholds based on sysctl parameters
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks.
408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
423{ 410{
424 unsigned long background; 411 unsigned long background;
425 unsigned long dirty; 412 unsigned long dirty;
@@ -451,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
451 } 438 }
452 *pbackground = background; 439 *pbackground = background;
453 *pdirty = dirty; 440 *pdirty = dirty;
441}
442
443/**
444 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
445 *
446 * Allocate high/low dirty limits to fast/slow devices, in order to prevent
447 * - starving fast devices
448 * - piling up dirty pages (that will take long time to sync) on slow devices
449 *
450 * The bdi's share of dirty limit will be adapting to its throughput and
451 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
452 */
453unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
454{
455 u64 bdi_dirty;
456 long numerator, denominator;
457
458 /*
459 * Calculate this BDI's share of the dirty ratio.
460 */
461 bdi_writeout_fraction(bdi, &numerator, &denominator);
454 462
455 if (bdi) { 463 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
456 u64 bdi_dirty; 464 bdi_dirty *= numerator;
457 long numerator, denominator; 465 do_div(bdi_dirty, denominator);
458 466
459 /* 467 bdi_dirty += (dirty * bdi->min_ratio) / 100;
460 * Calculate this BDI's share of the dirty ratio. 468 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
461 */ 469 bdi_dirty = dirty * bdi->max_ratio / 100;
462 bdi_writeout_fraction(bdi, &numerator, &denominator); 470
463 471 return bdi_dirty;
464 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
465 bdi_dirty *= numerator;
466 do_div(bdi_dirty, denominator);
467 bdi_dirty += (dirty * bdi->min_ratio) / 100;
468 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
469 bdi_dirty = dirty * bdi->max_ratio / 100;
470
471 *pbdi_dirty = bdi_dirty;
472 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
473 task_dirty_limit(current, pbdi_dirty);
474 }
475} 472}
476 473
477/* 474/*
@@ -491,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
491 unsigned long bdi_thresh; 488 unsigned long bdi_thresh;
492 unsigned long pages_written = 0; 489 unsigned long pages_written = 0;
493 unsigned long pause = 1; 490 unsigned long pause = 1;
494 491 bool dirty_exceeded = false;
495 struct backing_dev_info *bdi = mapping->backing_dev_info; 492 struct backing_dev_info *bdi = mapping->backing_dev_info;
496 493
497 for (;;) { 494 for (;;) {
@@ -502,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
502 .range_cyclic = 1, 499 .range_cyclic = 1,
503 }; 500 };
504 501
505 get_dirty_limits(&background_thresh, &dirty_thresh,
506 &bdi_thresh, bdi);
507
508 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 502 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
509 global_page_state(NR_UNSTABLE_NFS); 503 global_page_state(NR_UNSTABLE_NFS);
510 nr_writeback = global_page_state(NR_WRITEBACK); 504 nr_writeback = global_page_state(NR_WRITEBACK);
511 505
512 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 506 global_dirty_limits(&background_thresh, &dirty_thresh);
513 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
514
515 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
516 break;
517 507
518 /* 508 /*
519 * Throttle it only when the background writeback cannot 509 * Throttle it only when the background writeback cannot
@@ -524,26 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
524 (background_thresh + dirty_thresh) / 2) 514 (background_thresh + dirty_thresh) / 2)
525 break; 515 break;
526 516
527 if (!bdi->dirty_exceeded) 517 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
528 bdi->dirty_exceeded = 1; 518 bdi_thresh = task_dirty_limit(current, bdi_thresh);
529
530 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
531 * Unstable writes are a feature of certain networked
532 * filesystems (i.e. NFS) in which data may have been
533 * written to the server's write cache, but has not yet
534 * been flushed to permanent storage.
535 * Only move pages to writeback if this bdi is over its
536 * threshold otherwise wait until the disk writes catch
537 * up.
538 */
539 trace_wbc_balance_dirty_start(&wbc, bdi);
540 if (bdi_nr_reclaimable > bdi_thresh) {
541 writeback_inodes_wb(&bdi->wb, &wbc);
542 pages_written += write_chunk - wbc.nr_to_write;
543 get_dirty_limits(&background_thresh, &dirty_thresh,
544 &bdi_thresh, bdi);
545 trace_wbc_balance_dirty_written(&wbc, bdi);
546 }
547 519
548 /* 520 /*
549 * In order to avoid the stacked BDI deadlock we need 521 * In order to avoid the stacked BDI deadlock we need
@@ -558,16 +530,44 @@ static void balance_dirty_pages(struct address_space *mapping,
558 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 530 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
559 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 531 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
560 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 532 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
561 } else if (bdi_nr_reclaimable) { 533 } else {
562 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 534 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
563 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 535 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
564 } 536 }
565 537
566 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) 538 /*
539 * The bdi thresh is somehow "soft" limit derived from the
540 * global "hard" limit. The former helps to prevent heavy IO
541 * bdi or process from holding back light ones; The latter is
542 * the last resort safeguard.
543 */
544 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh);
547
548 if (!dirty_exceeded)
567 break; 549 break;
568 if (pages_written >= write_chunk)
569 break; /* We've done our duty */
570 550
551 if (!bdi->dirty_exceeded)
552 bdi->dirty_exceeded = 1;
553
554 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
555 * Unstable writes are a feature of certain networked
556 * filesystems (i.e. NFS) in which data may have been
557 * written to the server's write cache, but has not yet
558 * been flushed to permanent storage.
559 * Only move pages to writeback if this bdi is over its
560 * threshold otherwise wait until the disk writes catch
561 * up.
562 */
563 trace_wbc_balance_dirty_start(&wbc, bdi);
564 if (bdi_nr_reclaimable > bdi_thresh) {
565 writeback_inodes_wb(&bdi->wb, &wbc);
566 pages_written += write_chunk - wbc.nr_to_write;
567 trace_wbc_balance_dirty_written(&wbc, bdi);
568 if (pages_written >= write_chunk)
569 break; /* We've done our duty */
570 }
571 trace_wbc_balance_dirty_wait(&wbc, bdi); 571 trace_wbc_balance_dirty_wait(&wbc, bdi);
572 __set_current_state(TASK_INTERRUPTIBLE); 572 __set_current_state(TASK_INTERRUPTIBLE);
573 io_schedule_timeout(pause); 573 io_schedule_timeout(pause);
@@ -581,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
581 pause = HZ / 10; 581 pause = HZ / 10;
582 } 582 }
583 583
584 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 584 if (!dirty_exceeded && bdi->dirty_exceeded)
585 bdi->dirty_exceeded)
586 bdi->dirty_exceeded = 0; 585 bdi->dirty_exceeded = 0;
587 586
588 if (writeback_in_progress(bdi)) 587 if (writeback_in_progress(bdi))
@@ -597,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
597 * background_thresh, to keep the amount of dirty memory low. 596 * background_thresh, to keep the amount of dirty memory low.
598 */ 597 */
599 if ((laptop_mode && pages_written) || 598 if ((laptop_mode && pages_written) ||
600 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 599 (!laptop_mode && (nr_reclaimable > background_thresh)))
601 + global_page_state(NR_UNSTABLE_NFS))
602 > background_thresh)))
603 bdi_start_background_writeback(bdi); 600 bdi_start_background_writeback(bdi);
604} 601}
605 602
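The rewritten balance_dirty_pages() loop checks the cheap global counters first and only computes the per-bdi and per-task thresholds once the globals suggest throttling may be needed; dirty_exceeded then combines the "soft" bdi limit with the "hard" global one. A compressed model of that decision, with all page counts passed in as plain numbers and the actual writeback/sleep left out:

#include <stdio.h>

struct model_counts {
        unsigned long nr_reclaimable, nr_writeback;   /* global        */
        unsigned long bdi_reclaimable, bdi_writeback; /* this device   */
};

/* One pass of the (simplified) balance_dirty_pages() decision:
 * returns 1 if the caller should write back and sleep, 0 if it may proceed. */
static int model_should_throttle(const struct model_counts *c,
                                 unsigned long background_thresh,
                                 unsigned long dirty_thresh,
                                 unsigned long bdi_thresh)
{
        /* Cheap global check first: below the midpoint, never throttle. */
        if (c->nr_reclaimable + c->nr_writeback <=
            (background_thresh + dirty_thresh) / 2)
                return 0;

        /* Soft per-bdi limit or hard global limit exceeded? */
        return (c->bdi_reclaimable + c->bdi_writeback >= bdi_thresh) ||
               (c->nr_reclaimable + c->nr_writeback >= dirty_thresh);
}

int main(void)
{
        struct model_counts light = { 3000, 500, 200, 100 };
        struct model_counts heavy = { 9000, 2000, 6000, 1500 };

        /* background=4000, dirty=10000, this bdi's share=5000 */
        printf("light writer throttled: %d\n",
               model_should_throttle(&light, 4000, 10000, 5000));
        printf("heavy writer throttled: %d\n",
               model_should_throttle(&heavy, 4000, 10000, 5000));
        return 0;
}

This is only the decision logic; in the real loop a throttled task also kicks writeback on its bdi, sleeps with an increasing pause, and re-evaluates.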
@@ -663,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
663 unsigned long dirty_thresh; 660 unsigned long dirty_thresh;
664 661
665 for ( ; ; ) { 662 for ( ; ; ) {
666 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 663 global_dirty_limits(&background_thresh, &dirty_thresh);
667 664
668 /* 665 /*
669 * Boost the allowable dirty threshold a bit for page 666 * Boost the allowable dirty threshold a bit for page
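throttle_vm_writeout() now re-reads the thresholds through global_dirty_limits() on every iteration; per the comment, page allocators give themselves a little extra headroom above the normal dirty threshold before they agree to wait. A rough model of that wait condition follows; the 10% boost used here is illustrative only, since the exact factor is not part of this hunk:

#include <stdio.h>

/* Model of the throttle_vm_writeout() check: proceed once writeback plus
 * unstable pages fall under a slightly boosted dirty threshold. */
static int model_may_proceed(unsigned long nr_unstable, unsigned long nr_writeback,
                             unsigned long dirty_thresh)
{
        dirty_thresh += dirty_thresh / 10;      /* headroom for the allocator (illustrative) */
        return nr_unstable + nr_writeback <= dirty_thresh;
}

int main(void)
{
        printf("%d\n", model_may_proceed(400, 500, 1000));  /* 900 <= 1100: go   */
        printf("%d\n", model_may_proceed(800, 600, 1000));  /* 1400 > 1100: wait */
        return 0;
}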
@@ -825,10 +822,10 @@ void __init page_writeback_init(void)
825/* 822/*
826 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. 823 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
827 */ 824 */
828#define WRITEBACK_TAG_BATCH 4096
829void tag_pages_for_writeback(struct address_space *mapping, 825void tag_pages_for_writeback(struct address_space *mapping,
830 pgoff_t start, pgoff_t end) 826 pgoff_t start, pgoff_t end)
831{ 827{
828#define WRITEBACK_TAG_BATCH 4096
832 unsigned long tagged; 829 unsigned long tagged;
833 830
834 do { 831 do {
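Moving WRITEBACK_TAG_BATCH inside tag_pages_for_writeback() is cosmetic, but the constant is what bounds tree_lock latency: the range is tagged in chunks of 4096 entries with a chance to reschedule in between. A generic userspace model of that batching pattern (the kernel tags radix-tree slots under mapping->tree_lock; here tagging is just counting):

#include <stdio.h>

#define WRITEBACK_TAG_BATCH 4096

/* Walk a large index range in chunks of WRITEBACK_TAG_BATCH so no single
 * pass holds the lock (or the CPU) for too long. */
static unsigned long model_tag_range(unsigned long start, unsigned long end)
{
        unsigned long tagged_total = 0;

        while (start <= end) {
                unsigned long batch_end = start + WRITEBACK_TAG_BATCH - 1;

                if (batch_end > end)
                        batch_end = end;
                tagged_total += batch_end - start + 1;   /* "tag" this chunk */
                start = batch_end + 1;
                /* the kernel would drop tree_lock and cond_resched() here */
        }
        return tagged_total;
}

int main(void)
{
        printf("tagged %lu pages in batches of %d\n",
               model_tag_range(0, 10000), WRITEBACK_TAG_BATCH);
        return 0;
}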
diff --git a/mm/rmap.c b/mm/rmap.c
index a7d0f5482634..87b9e8ad4509 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/memcontrol.h> 56#include <linux/memcontrol.h>
57#include <linux/mmu_notifier.h> 57#include <linux/mmu_notifier.h>
58#include <linux/migrate.h> 58#include <linux/migrate.h>
59#include <linux/hugetlb.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -350,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
350 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 351 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
351 unsigned long address; 352 unsigned long address;
352 353
354 if (unlikely(is_vm_hugetlb_page(vma)))
355 pgoff = page->index << huge_page_order(page_hstate(page));
353 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 356 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
354 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 357 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
355 /* page should be within @vma mapping range */ 358 /* page should be within @vma mapping range */
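For hugetlb pages, page->index counts huge-page-sized units, so vma_address() now rescales it by huge_page_order() before the usual pgoff arithmetic. A small sketch of the two index conventions, with the order hard-coded to 9 (2MB huge pages over 4KB base pages) purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Model of vma_address(): a hugetlb page's index must be rescaled into
 * base-page units before being combined with vm_pgoff. */
static unsigned long model_vma_address(unsigned long page_index, int is_huge,
                                       unsigned long vm_start, unsigned long vm_pgoff)
{
        unsigned long pgoff = page_index;

        if (is_huge)
                pgoff = page_index << 9;        /* huge_page_order() for 2MB pages */
        return vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
        /* The third huge page of a mapping starting at 0x40000000, pgoff 0. */
        printf("huge page address : %#lx\n", model_vma_address(2, 1, 0x40000000UL, 0));
        /* A regular page with index 2 in the same kind of mapping. */
        printf("small page address: %#lx\n", model_vma_address(2, 0, 0x40000000UL, 0));
        return 0;
}

Without the rescaling, rmap walks on anonymous huge pages would compute addresses 512 times too small and miss the mapping entirely.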
@@ -394,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
394 pte_t *pte; 397 pte_t *pte;
395 spinlock_t *ptl; 398 spinlock_t *ptl;
396 399
400 if (unlikely(PageHuge(page))) {
401 pte = huge_pte_offset(mm, address);
402 ptl = &mm->page_table_lock;
403 goto check;
404 }
405
397 pgd = pgd_offset(mm, address); 406 pgd = pgd_offset(mm, address);
398 if (!pgd_present(*pgd)) 407 if (!pgd_present(*pgd))
399 return NULL; 408 return NULL;
@@ -414,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
414 } 423 }
415 424
416 ptl = pte_lockptr(mm, pmd); 425 ptl = pte_lockptr(mm, pmd);
426check:
417 spin_lock(ptl); 427 spin_lock(ptl);
418 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 428 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
419 *ptlp = ptl; 429 *ptlp = ptl;
@@ -916,6 +926,12 @@ void page_remove_rmap(struct page *page)
916 page_clear_dirty(page); 926 page_clear_dirty(page);
917 set_page_dirty(page); 927 set_page_dirty(page);
918 } 928 }
929 /*
930 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
931 * and not charged by memcg for now.
932 */
933 if (unlikely(PageHuge(page)))
934 return;
919 if (PageAnon(page)) { 935 if (PageAnon(page)) {
920 mem_cgroup_uncharge_page(page); 936 mem_cgroup_uncharge_page(page);
921 __dec_zone_page_state(page, NR_ANON_PAGES); 937 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1524,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1524 return rmap_walk_file(page, rmap_one, arg); 1540 return rmap_walk_file(page, rmap_one, arg);
1525} 1541}
1526#endif /* CONFIG_MIGRATION */ 1542#endif /* CONFIG_MIGRATION */
1543
1544#ifdef CONFIG_HUGETLB_PAGE
1545/*
1546 * The following three functions are for anonymous (private mapped) hugepages.
1547 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1548 * and no lru code, because we handle hugepages differently from common pages.
1549 */
1550static void __hugepage_set_anon_rmap(struct page *page,
1551 struct vm_area_struct *vma, unsigned long address, int exclusive)
1552{
1553 struct anon_vma *anon_vma = vma->anon_vma;
1554 BUG_ON(!anon_vma);
1555 if (!exclusive) {
1556 struct anon_vma_chain *avc;
1557 avc = list_entry(vma->anon_vma_chain.prev,
1558 struct anon_vma_chain, same_vma);
1559 anon_vma = avc->anon_vma;
1560 }
1561 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1562 page->mapping = (struct address_space *) anon_vma;
1563 page->index = linear_page_index(vma, address);
1564}
1565
1566void hugepage_add_anon_rmap(struct page *page,
1567 struct vm_area_struct *vma, unsigned long address)
1568{
1569 struct anon_vma *anon_vma = vma->anon_vma;
1570 int first;
1571 BUG_ON(!anon_vma);
1572 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1573 first = atomic_inc_and_test(&page->_mapcount);
1574 if (first)
1575 __hugepage_set_anon_rmap(page, vma, address, 0);
1576}
1577
1578void hugepage_add_new_anon_rmap(struct page *page,
1579 struct vm_area_struct *vma, unsigned long address)
1580{
1581 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1582 atomic_set(&page->_mapcount, 0);
1583 __hugepage_set_anon_rmap(page, vma, address, 1);
1584}
1585#endif /* CONFIG_HUGETLB_PAGE */
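hugepage_add_anon_rmap() reuses the ordinary _mapcount convention: the counter starts at -1, atomic_inc_and_test() identifies the first mapper, and only the first mapper installs the anon_vma pointer in page->mapping. A toy reproduction of that convention with a plain int standing in for the atomic:

#include <stdio.h>

/* Model of the _mapcount convention used by (huge)page rmap:
 * -1 means "not mapped"; the -1 -> 0 transition identifies the first
 * mapper, which alone is allowed to set up page->mapping. */
struct model_page {
        int mapcount;           /* stands in for atomic page->_mapcount */
        const void *mapping;    /* stands in for the anon_vma pointer   */
};

static void model_add_anon_rmap(struct model_page *page, const void *anon_vma)
{
        int first = (++page->mapcount == 0);    /* atomic_inc_and_test() */

        if (first)
                page->mapping = anon_vma;       /* __hugepage_set_anon_rmap() */
}

int main(void)
{
        struct model_page page = { .mapcount = -1, .mapping = NULL };
        const char *anon_vma = "anon_vma";

        model_add_anon_rmap(&page, anon_vma);   /* first mapping: sets ->mapping  */
        model_add_anon_rmap(&page, "other");    /* fork/share: only bumps the count */
        printf("mapcount=%d mapping=%s\n", page.mapcount, (const char *)page.mapping);
        return 0;
}

This also explains the new BUG_ON(page_mapcount(page)) in free_huge_page() earlier in the series: by the time a huge page is freed, every mapper must have dropped its reference and the count must be back at -1.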
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 918c51335d64..6b8889da69a6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
34 35
35/*** Page table manipulation functions ***/ 36/*** Page table manipulation functions ***/
36 37
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
502{ 503{
503 unsigned int log; 504 unsigned int log;
504 505
506 if (!vmap_lazy_unmap)
507 return 0;
508
505 log = fls(num_online_cpus()); 509 log = fls(num_online_cpus());
506 510
507 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 511 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
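The new vmap_lazy_unmap switch makes lazy_max_pages() return 0, so vmap areas are flushed immediately instead of being batched; callers that need unmaps visible right away can clear it. Otherwise the lazy budget is fls(num_online_cpus()) times 32MB worth of pages. A quick recomputation of both cases:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* fls(): position of the highest set bit, 1-based; fls(0) == 0. */
static unsigned int model_fls(unsigned long x)
{
        unsigned int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

/* Model of lazy_max_pages(): how many vmap pages may accumulate before a
 * global TLB flush, or 0 when lazy unmapping is switched off. */
static unsigned long model_lazy_max_pages(int lazy_unmap, unsigned int online_cpus)
{
        if (!lazy_unmap)
                return 0;
        return model_fls(online_cpus) * (32UL * 1024 * 1024 / PAGE_SIZE);
}

int main(void)
{
        printf("4 CPUs, lazy on : %lu pages\n", model_lazy_max_pages(1, 4));
        printf("4 CPUs, lazy off: %lu pages\n", model_lazy_max_pages(0, 4));
        return 0;
}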
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ec5ddccbf82e..c391c320dbaf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1969,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1969unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 1969unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1970 gfp_t gfp_mask, bool noswap, 1970 gfp_t gfp_mask, bool noswap,
1971 unsigned int swappiness, 1971 unsigned int swappiness,
1972 struct zone *zone, int nid) 1972 struct zone *zone)
1973{ 1973{
1974 struct scan_control sc = { 1974 struct scan_control sc = {
1975 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1975 .may_writepage = !laptop_mode, 1976 .may_writepage = !laptop_mode,
1976 .may_unmap = 1, 1977 .may_unmap = 1,
1977 .may_swap = !noswap, 1978 .may_swap = !noswap,
@@ -1979,13 +1980,8 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1979 .order = 0, 1980 .order = 0,
1980 .mem_cgroup = mem, 1981 .mem_cgroup = mem,
1981 }; 1982 };
1982 nodemask_t nm = nodemask_of_node(nid);
1983
1984 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1983 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1985 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1984 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1986 sc.nodemask = &nm;
1987 sc.nr_reclaimed = 0;
1988 sc.nr_scanned = 0;
1989 1985
1990 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 1986 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
1991 sc.may_writepage, 1987 sc.may_writepage,
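mem_cgroup_shrink_node_zone() now takes the target zone directly and drops the nodemask, so the scan_control setup shrinks to the gfp-mask composition: keep the caller's reclaim-behaviour bits and fill the placement bits in from GFP_HIGHUSER_MOVABLE. That mask surgery is plain bit arithmetic, sketched below with made-up flag values (the real GFP_* constants differ):

#include <stdio.h>

/* Illustrative flag values only; the real GFP_* constants are different. */
#define MODEL_GFP_IO            0x01u
#define MODEL_GFP_FS            0x02u
#define MODEL_GFP_WAIT          0x04u
#define MODEL_GFP_HIGHMEM       0x08u
#define MODEL_GFP_MOVABLE       0x10u

#define MODEL_GFP_RECLAIM_MASK  (MODEL_GFP_IO | MODEL_GFP_FS | MODEL_GFP_WAIT)
#define MODEL_GFP_HIGHUSER_MOVABLE \
        (MODEL_GFP_IO | MODEL_GFP_FS | MODEL_GFP_WAIT | \
         MODEL_GFP_HIGHMEM | MODEL_GFP_MOVABLE)

/* Keep only the caller's reclaim-behaviour bits; take everything else
 * (placement bits like HIGHMEM/MOVABLE) from the standard user mask. */
static unsigned int model_scan_gfp(unsigned int caller_gfp)
{
        return (caller_gfp & MODEL_GFP_RECLAIM_MASK) |
               (MODEL_GFP_HIGHUSER_MOVABLE & ~MODEL_GFP_RECLAIM_MASK);
}

int main(void)
{
        /* A caller that may not start filesystem I/O keeps that restriction,
         * but still scans highmem/movable pages. */
        printf("gfp = %#x\n", model_scan_gfp(MODEL_GFP_IO | MODEL_GFP_WAIT));
        return 0;
}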
@@ -2172,7 +2168,6 @@ loop_again:
2172 for (i = 0; i <= end_zone; i++) { 2168 for (i = 0; i <= end_zone; i++) {
2173 struct zone *zone = pgdat->node_zones + i; 2169 struct zone *zone = pgdat->node_zones + i;
2174 int nr_slab; 2170 int nr_slab;
2175 int nid, zid;
2176 2171
2177 if (!populated_zone(zone)) 2172 if (!populated_zone(zone))
2178 continue; 2173 continue;
@@ -2182,14 +2177,12 @@ loop_again:
2182 2177
2183 sc.nr_scanned = 0; 2178 sc.nr_scanned = 0;
2184 2179
2185 nid = pgdat->node_id;
2186 zid = zone_idx(zone);
2187 /* 2180 /*
2188 * Call soft limit reclaim before calling shrink_zone. 2181 * Call soft limit reclaim before calling shrink_zone.
2189 * For now we ignore the return value 2182 * For now we ignore the return value
2190 */ 2183 */
2191 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, 2184 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2192 nid, zid); 2185
2193 /* 2186 /*
2194 * We put equal pressure on every zone, unless one 2187 * We put equal pressure on every zone, unless one
2195 * zone has way too many pages free already. 2188 * zone has way too many pages free already.