Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c           2
-rw-r--r--  mm/hugetlb.c           6
-rw-r--r--  mm/init-mm.c           6
-rw-r--r--  mm/ksm.c              71
-rw-r--r--  mm/memcontrol.c       55
-rw-r--r--  mm/memory.c           12
-rw-r--r--  mm/mempolicy.c        82
-rw-r--r--  mm/migrate.c          10
-rw-r--r--  mm/mmap.c             44
-rw-r--r--  mm/oom_kill.c        683
-rw-r--r--  mm/page-writeback.c   70
-rw-r--r--  mm/page_alloc.c       33
-rw-r--r--  mm/rmap.c            127
-rw-r--r--  mm/shmem.c           110
-rw-r--r--  mm/slab.c              2
-rw-r--r--  mm/swapfile.c        100
-rw-r--r--  mm/util.c             11
-rw-r--r--  mm/vmalloc.c           7
-rw-r--r--  mm/vmscan.c          533
-rw-r--r--  mm/vmstat.c            8
20 files changed, 1165 insertions, 807 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..3d4df44e4221 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
2238 2238
2239 do { 2239 do {
2240 struct page *page; 2240 struct page *page;
2241 pgoff_t index; /* Pagecache index for current page */
2242 unsigned long offset; /* Offset into pagecache page */ 2241 unsigned long offset; /* Offset into pagecache page */
2243 unsigned long bytes; /* Bytes to write to page */ 2242 unsigned long bytes; /* Bytes to write to page */
2244 size_t copied; /* Bytes copied from user */ 2243 size_t copied; /* Bytes copied from user */
2245 void *fsdata; 2244 void *fsdata;
2246 2245
2247 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2246 offset = (pos & (PAGE_CACHE_SIZE - 1));
2248 index = pos >> PAGE_CACHE_SHIFT;
2249 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2247 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2250 iov_iter_count(i)); 2248 iov_iter_count(i));
2251 2249
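
The filemap.c hunk only drops a pagecache index that nothing used anymore; the arithmetic that survives is what actually splits a write into per-page chunks. A small standalone sketch of that chunking, in plain userspace C with assumed values (a 4 KB page standing in for PAGE_CACHE_SIZE, a 10000-byte write starting at offset 1000):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		unsigned long pos = 1000;	/* file position the write starts at */
		unsigned long count = 10000;	/* bytes left in the iov_iter */

		while (count) {
			unsigned long offset = pos & (PAGE_SIZE - 1);	/* offset into this page */
			unsigned long bytes = PAGE_SIZE - offset;	/* room left in the page */
			if (bytes > count)
				bytes = count;				/* the kernel uses min_t() */
			printf("page %lu: copy %lu bytes at offset %lu\n",
			       pos / PAGE_SIZE, bytes, offset);
			pos += bytes;
			count -= bytes;
		}
		return 0;
	}
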
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..b61d2db9f34e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2349,11 +2349,17 @@ retry_avoidcopy:
2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2351 /* Break COW */ 2351 /* Break COW */
2352 mmu_notifier_invalidate_range_start(mm,
2353 address & huge_page_mask(h),
2354 (address & huge_page_mask(h)) + huge_page_size(h));
2352 huge_ptep_clear_flush(vma, address, ptep); 2355 huge_ptep_clear_flush(vma, address, ptep);
2353 set_huge_pte_at(mm, address, ptep, 2356 set_huge_pte_at(mm, address, ptep,
2354 make_huge_pte(vma, new_page, 1)); 2357 make_huge_pte(vma, new_page, 1));
2355 /* Make the old page be freed below */ 2358 /* Make the old page be freed below */
2356 new_page = old_page; 2359 new_page = old_page;
2360 mmu_notifier_invalidate_range_end(mm,
2361 address & huge_page_mask(h),
2362 (address & huge_page_mask(h)) + huge_page_size(h));
2357 } 2363 }
2358 page_cache_release(new_page); 2364 page_cache_release(new_page);
2359 page_cache_release(old_page); 2365 page_cache_release(old_page);
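
The hugetlb COW path now tells secondary MMUs (via mmu_notifiers) that the huge PTE is about to change and when the change is done. A minimal sketch of the bracketing the hunk adds, with the repeated mask arithmetic hoisted into locals; this only restates the pattern, using the same start/end-based notifier calls the hunk itself uses:

	/* Sketch only: the invalidation window must cover the whole huge page. */
	unsigned long start = address & huge_page_mask(h);
	unsigned long end = start + huge_page_size(h);

	mmu_notifier_invalidate_range_start(mm, start, end);
	huge_ptep_clear_flush(vma, address, ptep);		/* old mapping is gone */
	set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1));
	mmu_notifier_invalidate_range_end(mm, start, end);	/* secondary MMUs may refault now */
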
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da9668..1d29cdfe8ebb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
7 7
8#include <asm/atomic.h> 8#include <asm/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h>
11
12#ifndef INIT_MM_CONTEXT
13#define INIT_MM_CONTEXT(name)
14#endif
10 15
11struct mm_struct init_mm = { 16struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT, 17 .mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL, 24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm)
20}; 26};
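
INIT_MM_CONTEXT() lets an architecture splice extra initializers into init_mm at build time; the default added above expands to nothing. A purely hypothetical override, to show the shape only (the field and lock names are illustrative, not any real architecture's mm_context_t):

	/* Hypothetical arch header, e.g. arch/foo/include/asm/mmu.h */
	#define INIT_MM_CONTEXT(name)						\
		.context = {							\
			.lock = __SPIN_LOCK_UNLOCKED(name.context.lock),	\
		},
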
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..e2ae00458320 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include "internal.h" 39#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
153static struct rb_root root_stable_tree = RB_ROOT; 154static struct rb_root root_stable_tree = RB_ROOT;
154static struct rb_root root_unstable_tree = RB_ROOT; 155static struct rb_root root_unstable_tree = RB_ROOT;
155 156
156#define MM_SLOTS_HASH_HEADS 1024 157#define MM_SLOTS_HASH_SHIFT 10
157static struct hlist_head *mm_slots_hash; 158#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
159static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
158 160
159static struct mm_slot ksm_mm_head = { 161static struct mm_slot ksm_mm_head = {
160 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 162 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
269 kmem_cache_free(mm_slot_cache, mm_slot); 271 kmem_cache_free(mm_slot_cache, mm_slot);
270} 272}
271 273
272static int __init mm_slots_hash_init(void)
273{
274 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
275 GFP_KERNEL);
276 if (!mm_slots_hash)
277 return -ENOMEM;
278 return 0;
279}
280
281static void __init mm_slots_hash_free(void)
282{
283 kfree(mm_slots_hash);
284}
285
286static struct mm_slot *get_mm_slot(struct mm_struct *mm) 274static struct mm_slot *get_mm_slot(struct mm_struct *mm)
287{ 275{
288 struct mm_slot *mm_slot; 276 struct mm_slot *mm_slot;
289 struct hlist_head *bucket; 277 struct hlist_head *bucket;
290 struct hlist_node *node; 278 struct hlist_node *node;
291 279
292 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 280 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
293 % MM_SLOTS_HASH_HEADS];
294 hlist_for_each_entry(mm_slot, node, bucket, link) { 281 hlist_for_each_entry(mm_slot, node, bucket, link) {
295 if (mm == mm_slot->mm) 282 if (mm == mm_slot->mm)
296 return mm_slot; 283 return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
303{ 290{
304 struct hlist_head *bucket; 291 struct hlist_head *bucket;
305 292
306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 293 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
307 % MM_SLOTS_HASH_HEADS];
308 mm_slot->mm = mm; 294 mm_slot->mm = mm;
309 hlist_add_head(&mm_slot->link, bucket); 295 hlist_add_head(&mm_slot->link, bucket);
310} 296}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 304 struct anon_vma *anon_vma)
319{ 305{
320 rmap_item->anon_vma = anon_vma; 306 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->external_refcount); 307 get_anon_vma(anon_vma);
322} 308}
323 309
324static void drop_anon_vma(struct rmap_item *rmap_item) 310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
325{ 311{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 312 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 313
328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 314 drop_anon_vma(anon_vma);
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334} 315}
335 316
336/* 317/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
415 * It is not an accident that whenever we want to break COW 396 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma. 397 * to undo, we also need to drop a reference to the anon_vma.
417 */ 398 */
418 drop_anon_vma(rmap_item); 399 ksm_drop_anon_vma(rmap_item);
419 400
420 down_read(&mm->mmap_sem); 401 down_read(&mm->mmap_sem);
421 if (ksm_test_exit(mm)) 402 if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
470 ksm_pages_sharing--; 451 ksm_pages_sharing--;
471 else 452 else
472 ksm_pages_shared--; 453 ksm_pages_shared--;
473 drop_anon_vma(rmap_item); 454 ksm_drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK; 455 rmap_item->address &= PAGE_MASK;
475 cond_resched(); 456 cond_resched();
476 } 457 }
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
558 else 539 else
559 ksm_pages_shared--; 540 ksm_pages_shared--;
560 541
561 drop_anon_vma(rmap_item); 542 ksm_drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK; 543 rmap_item->address &= PAGE_MASK;
563 544
564 } else if (rmap_item->address & UNSTABLE_FLAG) { 545 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
1566 struct anon_vma_chain *vmac; 1547 struct anon_vma_chain *vmac;
1567 struct vm_area_struct *vma; 1548 struct vm_area_struct *vma;
1568 1549
1569 spin_lock(&anon_vma->lock); 1550 anon_vma_lock(anon_vma);
1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1551 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma; 1552 vma = vmac->vma;
1572 if (rmap_item->address < vma->vm_start || 1553 if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
1589 if (!search_new_forks || !mapcount) 1570 if (!search_new_forks || !mapcount)
1590 break; 1571 break;
1591 } 1572 }
1592 spin_unlock(&anon_vma->lock); 1573 anon_vma_unlock(anon_vma);
1593 if (!mapcount) 1574 if (!mapcount)
1594 goto out; 1575 goto out;
1595 } 1576 }
@@ -1619,7 +1600,7 @@ again:
1619 struct anon_vma_chain *vmac; 1600 struct anon_vma_chain *vmac;
1620 struct vm_area_struct *vma; 1601 struct vm_area_struct *vma;
1621 1602
1622 spin_lock(&anon_vma->lock); 1603 anon_vma_lock(anon_vma);
1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1604 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma; 1605 vma = vmac->vma;
1625 if (rmap_item->address < vma->vm_start || 1606 if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
1637 ret = try_to_unmap_one(page, vma, 1618 ret = try_to_unmap_one(page, vma,
1638 rmap_item->address, flags); 1619 rmap_item->address, flags);
1639 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1620 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1640 spin_unlock(&anon_vma->lock); 1621 anon_vma_unlock(anon_vma);
1641 goto out; 1622 goto out;
1642 } 1623 }
1643 } 1624 }
1644 spin_unlock(&anon_vma->lock); 1625 anon_vma_unlock(anon_vma);
1645 } 1626 }
1646 if (!search_new_forks++) 1627 if (!search_new_forks++)
1647 goto again; 1628 goto again;
@@ -1671,7 +1652,7 @@ again:
1671 struct anon_vma_chain *vmac; 1652 struct anon_vma_chain *vmac;
1672 struct vm_area_struct *vma; 1653 struct vm_area_struct *vma;
1673 1654
1674 spin_lock(&anon_vma->lock); 1655 anon_vma_lock(anon_vma);
1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1656 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma; 1657 vma = vmac->vma;
1677 if (rmap_item->address < vma->vm_start || 1658 if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
1688 1669
1689 ret = rmap_one(page, vma, rmap_item->address, arg); 1670 ret = rmap_one(page, vma, rmap_item->address, arg);
1690 if (ret != SWAP_AGAIN) { 1671 if (ret != SWAP_AGAIN) {
1691 spin_unlock(&anon_vma->lock); 1672 anon_vma_unlock(anon_vma);
1692 goto out; 1673 goto out;
1693 } 1674 }
1694 } 1675 }
1695 spin_unlock(&anon_vma->lock); 1676 anon_vma_unlock(anon_vma);
1696 } 1677 }
1697 if (!search_new_forks++) 1678 if (!search_new_forks++)
1698 goto again; 1679 goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
1943 if (err) 1924 if (err)
1944 goto out; 1925 goto out;
1945 1926
1946 err = mm_slots_hash_init();
1947 if (err)
1948 goto out_free1;
1949
1950 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 1927 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1951 if (IS_ERR(ksm_thread)) { 1928 if (IS_ERR(ksm_thread)) {
1952 printk(KERN_ERR "ksm: creating kthread failed\n"); 1929 printk(KERN_ERR "ksm: creating kthread failed\n");
1953 err = PTR_ERR(ksm_thread); 1930 err = PTR_ERR(ksm_thread);
1954 goto out_free2; 1931 goto out_free;
1955 } 1932 }
1956 1933
1957#ifdef CONFIG_SYSFS 1934#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
1959 if (err) { 1936 if (err) {
1960 printk(KERN_ERR "ksm: register sysfs failed\n"); 1937 printk(KERN_ERR "ksm: register sysfs failed\n");
1961 kthread_stop(ksm_thread); 1938 kthread_stop(ksm_thread);
1962 goto out_free2; 1939 goto out_free;
1963 } 1940 }
1964#else 1941#else
1965 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1942 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
1975#endif 1952#endif
1976 return 0; 1953 return 0;
1977 1954
1978out_free2: 1955out_free:
1979 mm_slots_hash_free();
1980out_free1:
1981 ksm_slab_free(); 1956 ksm_slab_free();
1982out: 1957out:
1983 return err; 1958 return err;
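
The ksm.c hunks replace a kzalloc'd mm_slots hash, indexed by a hand-rolled divide-and-modulo of the mm pointer, with a static array indexed through hash_ptr() from linux/hash.h, which is why the init/free helpers and the extra error label disappear. A minimal sketch of the resulting pattern, with generic names:

	#include <linux/hash.h>
	#include <linux/list.h>

	#define SLOTS_HASH_SHIFT 10				/* 1024 buckets */
	static struct hlist_head slots_hash[1 << SLOTS_HASH_SHIFT];

	/* hash_ptr() mixes all the pointer bits; dividing by
	 * sizeof(struct mm_struct) and taking a modulo, as the old code did,
	 * clusters slab-allocated mm_structs into a few buckets. */
	static struct hlist_head *slot_bucket(struct mm_struct *mm)
	{
		return &slots_hash[hash_ptr(mm, SLOTS_HASH_SHIFT)];
	}
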
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..0576e9e64586 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,8 @@
51 51
52#include <asm/uaccess.h> 52#include <asm/uaccess.h>
53 53
54#include <trace/events/vmscan.h>
55
54struct cgroup_subsys mem_cgroup_subsys __read_mostly; 56struct cgroup_subsys mem_cgroup_subsys __read_mostly;
55#define MEM_CGROUP_RECLAIM_RETRIES 5 57#define MEM_CGROUP_RECLAIM_RETRIES 5
56struct mem_cgroup *root_mem_cgroup __read_mostly; 58struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +213,6 @@ struct mem_cgroup {
211 */ 213 */
212 spinlock_t reclaim_param_lock; 214 spinlock_t reclaim_param_lock;
213 215
214 int prev_priority; /* for recording reclaim priority */
215
216 /* 216 /*
217 * While reclaiming in a hierarchy, we cache the last child we 217 * While reclaiming in a hierarchy, we cache the last child we
218 * reclaimed from. 218 * reclaimed from.
@@ -858,35 +858,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
858 return ret; 858 return ret;
859} 859}
860 860
861/*
862 * prev_priority control...this will be used in memory reclaim path.
863 */
864int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
865{
866 int prev_priority;
867
868 spin_lock(&mem->reclaim_param_lock);
869 prev_priority = mem->prev_priority;
870 spin_unlock(&mem->reclaim_param_lock);
871
872 return prev_priority;
873}
874
875void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
876{
877 spin_lock(&mem->reclaim_param_lock);
878 if (priority < mem->prev_priority)
879 mem->prev_priority = priority;
880 spin_unlock(&mem->reclaim_param_lock);
881}
882
883void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
884{
885 spin_lock(&mem->reclaim_param_lock);
886 mem->prev_priority = priority;
887 spin_unlock(&mem->reclaim_param_lock);
888}
889
890static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 861static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
891{ 862{
892 unsigned long active; 863 unsigned long active;
@@ -1038,6 +1009,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1038 } 1009 }
1039 1010
1040 *scanned = scan; 1011 *scanned = scan;
1012
1013 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1014 0, 0, 0, mode);
1015
1041 return nr_taken; 1016 return nr_taken;
1042} 1017}
1043 1018
@@ -1158,6 +1133,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
1158} 1133}
1159 1134
1160/* 1135/*
1136 * Return the memory (and swap, if configured) limit for a memcg.
1137 */
1138u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1139{
1140 u64 limit;
1141 u64 memsw;
1142
1143 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1144 total_swap_pages;
1145 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1146 /*
1147 * If memsw is finite and limits the amount of swap space available
1148 * to this memcg, return that limit.
1149 */
1150 return min(limit, memsw);
1151}
1152
1153/*
1161 * Visit the first child (need not be the first child as per the ordering 1154 * Visit the first child (need not be the first child as per the ordering
1162 * of the cgroup list, since we track last_scanned_child) of @mem and use 1155 * of the cgroup list, since we track last_scanned_child) of @mem and use
1163 * that to reclaim free pages from. 1156 * that to reclaim free pages from.
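
The new mem_cgroup_get_limit() gives the OOM killer a "total allowed" figure: the memcg's memory limit plus swap, capped by the mem+swap limit when one is configured. A rough worked example in plain userspace C, with assumed limits so the min() rule can be checked by hand:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t mem_limit   = 512ULL << 20;	/* assumed memory limit: 512 MB */
		uint64_t total_swap  = 2ULL << 30;	/* assumed swap on the host: 2 GB */
		uint64_t memsw_limit = 1ULL << 30;	/* assumed mem+swap limit: 1 GB */

		uint64_t limit = mem_limit + total_swap;		/* 2.5 GB */
		uint64_t total = limit < memsw_limit ? limit : memsw_limit;

		printf("OOM badness denominator: %llu MB\n",
		       (unsigned long long)(total >> 20));		/* 1024 MB */
		return 0;
	}
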
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d3633..858829d06a92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
307{ 307{
308 pgd_t *pgd; 308 pgd_t *pgd;
309 unsigned long next; 309 unsigned long next;
310 unsigned long start;
311 310
312 /* 311 /*
313 * The next few lines have given us lots of grief... 312 * The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
351 if (addr > end - 1) 350 if (addr > end - 1)
352 return; 351 return;
353 352
354 start = addr;
355 pgd = pgd_offset(tlb->mm, addr); 353 pgd = pgd_offset(tlb->mm, addr);
356 do { 354 do {
357 next = pgd_addr_end(addr, end); 355 next = pgd_addr_end(addr, end);
@@ -2008,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2008{ 2006{
2009 pgd_t *pgd; 2007 pgd_t *pgd;
2010 unsigned long next; 2008 unsigned long next;
2011 unsigned long start = addr, end = addr + size; 2009 unsigned long end = addr + size;
2012 int err; 2010 int err;
2013 2011
2014 BUG_ON(addr >= end); 2012 BUG_ON(addr >= end);
2015 mmu_notifier_invalidate_range_start(mm, start, end);
2016 pgd = pgd_offset(mm, addr); 2013 pgd = pgd_offset(mm, addr);
2017 do { 2014 do {
2018 next = pgd_addr_end(addr, end); 2015 next = pgd_addr_end(addr, end);
@@ -2020,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2020 if (err) 2017 if (err)
2021 break; 2018 break;
2022 } while (pgd++, addr = next, addr != end); 2019 } while (pgd++, addr = next, addr != end);
2023 mmu_notifier_invalidate_range_end(mm, start, end); 2020
2024 return err; 2021 return err;
2025} 2022}
2026EXPORT_SYMBOL_GPL(apply_to_page_range); 2023EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2630,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2630 swp_entry_t entry; 2627 swp_entry_t entry;
2631 pte_t pte; 2628 pte_t pte;
2632 struct mem_cgroup *ptr = NULL; 2629 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0;
2633 int ret = 0; 2631 int ret = 0;
2634 2632
2635 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2633 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2724,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2724 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2722 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2725 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2723 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2726 flags &= ~FAULT_FLAG_WRITE; 2724 flags &= ~FAULT_FLAG_WRITE;
2725 ret |= VM_FAULT_WRITE;
2726 exclusive = 1;
2727 } 2727 }
2728 flush_icache_page(vma, page); 2728 flush_icache_page(vma, page);
2729 set_pte_at(mm, address, page_table, pte); 2729 set_pte_at(mm, address, page_table, pte);
2730 page_add_anon_rmap(page, vma, address); 2730 do_page_add_anon_rmap(page, vma, address, exclusive);
2731 /* It's better to call commit-charge after rmap is established */ 2731 /* It's better to call commit-charge after rmap is established */
2732 mem_cgroup_commit_charge_swapin(page, ptr); 2732 mem_cgroup_commit_charge_swapin(page, ptr);
2733 2733
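
do_swap_page() can now tell the rmap layer when a swapped-in page is known to be exclusively owned (the write-fault case where reuse_swap_page() succeeded). A sketch of the wrapper relationship this presumably relies on in mm/rmap.c of the same series, where the old entry point keeps its behaviour and only swapin asserts exclusivity:

	/* Sketch: existing callers stay unchanged and never claim exclusivity. */
	void page_add_anon_rmap(struct page *page,
				struct vm_area_struct *vma, unsigned long address)
	{
		do_page_add_anon_rmap(page, vma, address, 0);
	}
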
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..f969da5dd8a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1275,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1275 const unsigned long __user *, new_nodes) 1275 const unsigned long __user *, new_nodes)
1276{ 1276{
1277 const struct cred *cred = current_cred(), *tcred; 1277 const struct cred *cred = current_cred(), *tcred;
1278 struct mm_struct *mm; 1278 struct mm_struct *mm = NULL;
1279 struct task_struct *task; 1279 struct task_struct *task;
1280 nodemask_t old;
1281 nodemask_t new;
1282 nodemask_t task_nodes; 1280 nodemask_t task_nodes;
1283 int err; 1281 int err;
1282 nodemask_t *old;
1283 nodemask_t *new;
1284 NODEMASK_SCRATCH(scratch);
1285
1286 if (!scratch)
1287 return -ENOMEM;
1288
1289 old = &scratch->mask1;
1290 new = &scratch->mask2;
1284 1291
1285 err = get_nodes(&old, old_nodes, maxnode); 1292 err = get_nodes(old, old_nodes, maxnode);
1286 if (err) 1293 if (err)
1287 return err; 1294 goto out;
1288 1295
1289 err = get_nodes(&new, new_nodes, maxnode); 1296 err = get_nodes(new, new_nodes, maxnode);
1290 if (err) 1297 if (err)
1291 return err; 1298 goto out;
1292 1299
1293 /* Find the mm_struct */ 1300 /* Find the mm_struct */
1294 read_lock(&tasklist_lock); 1301 read_lock(&tasklist_lock);
1295 task = pid ? find_task_by_vpid(pid) : current; 1302 task = pid ? find_task_by_vpid(pid) : current;
1296 if (!task) { 1303 if (!task) {
1297 read_unlock(&tasklist_lock); 1304 read_unlock(&tasklist_lock);
1298 return -ESRCH; 1305 err = -ESRCH;
1306 goto out;
1299 } 1307 }
1300 mm = get_task_mm(task); 1308 mm = get_task_mm(task);
1301 read_unlock(&tasklist_lock); 1309 read_unlock(&tasklist_lock);
1302 1310
1311 err = -EINVAL;
1303 if (!mm) 1312 if (!mm)
1304 return -EINVAL; 1313 goto out;
1305 1314
1306 /* 1315 /*
1307 * Check if this process has the right to modify the specified 1316 * Check if this process has the right to modify the specified
@@ -1322,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 1331
1323 task_nodes = cpuset_mems_allowed(task); 1332 task_nodes = cpuset_mems_allowed(task);
1324 /* Is the user allowed to access the target nodes? */ 1333 /* Is the user allowed to access the target nodes? */
1325 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { 1334 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1326 err = -EPERM; 1335 err = -EPERM;
1327 goto out; 1336 goto out;
1328 } 1337 }
1329 1338
1330 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { 1339 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1331 err = -EINVAL; 1340 err = -EINVAL;
1332 goto out; 1341 goto out;
1333 } 1342 }
@@ -1336,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 if (err) 1345 if (err)
1337 goto out; 1346 goto out;
1338 1347
1339 err = do_migrate_pages(mm, &old, &new, 1348 err = do_migrate_pages(mm, old, new,
1340 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1349 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1341out: 1350out:
1342 mmput(mm); 1351 if (mm)
1352 mmput(mm);
1353 NODEMASK_SCRATCH_FREE(scratch);
1354
1343 return err; 1355 return err;
1344} 1356}
1345 1357
@@ -1712,6 +1724,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1712} 1724}
1713#endif 1725#endif
1714 1726
1727/*
1728 * mempolicy_nodemask_intersects
1729 *
1730 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1731 * policy. Otherwise, check for intersection between mask and the policy
1732 * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
1733 * policy, always return true since it may allocate elsewhere on fallback.
1734 *
1735 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1736 */
1737bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1738 const nodemask_t *mask)
1739{
1740 struct mempolicy *mempolicy;
1741 bool ret = true;
1742
1743 if (!mask)
1744 return ret;
1745 task_lock(tsk);
1746 mempolicy = tsk->mempolicy;
1747 if (!mempolicy)
1748 goto out;
1749
1750 switch (mempolicy->mode) {
1751 case MPOL_PREFERRED:
1752 /*
1753 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1754 * allocate from, they may fallback to other nodes when oom.
1755 * Thus, it's possible for tsk to have allocated memory from
1756 * nodes in mask.
1757 */
1758 break;
1759 case MPOL_BIND:
1760 case MPOL_INTERLEAVE:
1761 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1762 break;
1763 default:
1764 BUG();
1765 }
1766out:
1767 task_unlock(tsk);
1768 return ret;
1769}
1770
1715/* Allocate a page in interleaved policy. 1771/* Allocate a page in interleaved policy.
1716 Own path because it needs to do special accounting. */ 1772 Own path because it needs to do special accounting. */
1717static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1773static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
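
Two things change in the migrate_pages() syscall hunk: the on-stack nodemask_t copies become heap-backed NODEMASK_SCRATCH masks (nodemask_t can be large on big-NUMA configs), and all error paths funnel through one out label that knows whether mm was taken. A minimal sketch of the scratch pattern with a hypothetical helper name and the syscall-specific logic elided:

	/* Sketch: mask1/mask2 come from struct nodemask_scratch. */
	static int frob_nodemask(const unsigned long __user *unodes,
				 unsigned long maxnode)
	{
		NODEMASK_SCRATCH(scratch);	/* kmalloc'd when nodemasks are big */
		nodemask_t *mask;
		int err;

		if (!scratch)
			return -ENOMEM;
		mask = &scratch->mask1;

		err = get_nodes(mask, unodes, maxnode);	/* fill from userspace */
		/* ... act on *mask ... */

		NODEMASK_SCRATCH_FREE(scratch);
		return err;
	}
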
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..38e7cad782f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 * exist when the page is remapped later 639 * exist when the page is remapped later
640 */ 640 */
641 anon_vma = page_anon_vma(page); 641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount); 642 get_anon_vma(anon_vma);
643 } 643 }
644 } 644 }
645 645
@@ -682,12 +682,8 @@ skip_unmap:
682rcu_unlock: 682rcu_unlock:
683 683
684 /* Drop an anon_vma reference if we took one */ 684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 685 if (anon_vma)
686 int empty = list_empty(&anon_vma->head); 686 drop_anon_vma(anon_vma);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691 687
692 if (rcu_locked) 688 if (rcu_locked)
693 rcu_read_unlock(); 689 rcu_read_unlock();
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb756..31003338b978 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,12 +452,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
452 spin_lock(&mapping->i_mmap_lock); 452 spin_lock(&mapping->i_mmap_lock);
453 vma->vm_truncate_count = mapping->truncate_count; 453 vma->vm_truncate_count = mapping->truncate_count;
454 } 454 }
455 anon_vma_lock(vma);
456 455
457 __vma_link(mm, vma, prev, rb_link, rb_parent); 456 __vma_link(mm, vma, prev, rb_link, rb_parent);
458 __vma_link_file(vma); 457 __vma_link_file(vma);
459 458
460 anon_vma_unlock(vma);
461 if (mapping) 459 if (mapping)
462 spin_unlock(&mapping->i_mmap_lock); 460 spin_unlock(&mapping->i_mmap_lock);
463 461
@@ -506,6 +504,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
506 struct vm_area_struct *importer = NULL; 504 struct vm_area_struct *importer = NULL;
507 struct address_space *mapping = NULL; 505 struct address_space *mapping = NULL;
508 struct prio_tree_root *root = NULL; 506 struct prio_tree_root *root = NULL;
507 struct anon_vma *anon_vma = NULL;
509 struct file *file = vma->vm_file; 508 struct file *file = vma->vm_file;
510 long adjust_next = 0; 509 long adjust_next = 0;
511 int remove_next = 0; 510 int remove_next = 0;
@@ -578,6 +577,17 @@ again: remove_next = 1 + (end > next->vm_end);
578 } 577 }
579 } 578 }
580 579
580 /*
581 * When changing only vma->vm_end, we don't really need anon_vma
582 * lock. This is a fairly rare case by itself, but the anon_vma
583 * lock may be shared between many sibling processes. Skipping
584 * the lock for brk adjustments makes a difference sometimes.
585 */
586 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
587 anon_vma = vma->anon_vma;
588 anon_vma_lock(anon_vma);
589 }
590
581 if (root) { 591 if (root) {
582 flush_dcache_mmap_lock(mapping); 592 flush_dcache_mmap_lock(mapping);
583 vma_prio_tree_remove(vma, root); 593 vma_prio_tree_remove(vma, root);
@@ -617,6 +627,8 @@ again: remove_next = 1 + (end > next->vm_end);
617 __insert_vm_struct(mm, insert); 627 __insert_vm_struct(mm, insert);
618 } 628 }
619 629
630 if (anon_vma)
631 anon_vma_unlock(anon_vma);
620 if (mapping) 632 if (mapping)
621 spin_unlock(&mapping->i_mmap_lock); 633 spin_unlock(&mapping->i_mmap_lock);
622 634
@@ -1710,7 +1722,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1710 */ 1722 */
1711 if (unlikely(anon_vma_prepare(vma))) 1723 if (unlikely(anon_vma_prepare(vma)))
1712 return -ENOMEM; 1724 return -ENOMEM;
1713 anon_vma_lock(vma); 1725 vma_lock_anon_vma(vma);
1714 1726
1715 /* 1727 /*
1716 * vma->vm_start/vm_end cannot change under us because the caller 1728 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1733,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1721 if (address < PAGE_ALIGN(address+4)) 1733 if (address < PAGE_ALIGN(address+4))
1722 address = PAGE_ALIGN(address+4); 1734 address = PAGE_ALIGN(address+4);
1723 else { 1735 else {
1724 anon_vma_unlock(vma); 1736 vma_unlock_anon_vma(vma);
1725 return -ENOMEM; 1737 return -ENOMEM;
1726 } 1738 }
1727 error = 0; 1739 error = 0;
@@ -1739,7 +1751,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1739 perf_event_mmap(vma); 1751 perf_event_mmap(vma);
1740 } 1752 }
1741 } 1753 }
1742 anon_vma_unlock(vma); 1754 vma_unlock_anon_vma(vma);
1743 return error; 1755 return error;
1744} 1756}
1745#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1757#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1776,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1764 if (error) 1776 if (error)
1765 return error; 1777 return error;
1766 1778
1767 anon_vma_lock(vma); 1779 vma_lock_anon_vma(vma);
1768 1780
1769 /* 1781 /*
1770 * vma->vm_start/vm_end cannot change under us because the caller 1782 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1798,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1786 perf_event_mmap(vma); 1798 perf_event_mmap(vma);
1787 } 1799 }
1788 } 1800 }
1789 anon_vma_unlock(vma); 1801 vma_unlock_anon_vma(vma);
1790 return error; 1802 return error;
1791} 1803}
1792 1804
@@ -2470,23 +2482,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2470 2482
2471static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2483static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2472{ 2484{
2473 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2485 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2474 /* 2486 /*
2475 * The LSB of head.next can't change from under us 2487 * The LSB of head.next can't change from under us
2476 * because we hold the mm_all_locks_mutex. 2488 * because we hold the mm_all_locks_mutex.
2477 */ 2489 */
2478 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); 2490 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
2479 /* 2491 /*
2480 * We can safely modify head.next after taking the 2492 * We can safely modify head.next after taking the
2481 * anon_vma->lock. If some other vma in this mm shares 2493 * anon_vma->root->lock. If some other vma in this mm shares
2482 * the same anon_vma we won't take it again. 2494 * the same anon_vma we won't take it again.
2483 * 2495 *
2484 * No need of atomic instructions here, head.next 2496 * No need of atomic instructions here, head.next
2485 * can't change from under us thanks to the 2497 * can't change from under us thanks to the
2486 * anon_vma->lock. 2498 * anon_vma->root->lock.
2487 */ 2499 */
2488 if (__test_and_set_bit(0, (unsigned long *) 2500 if (__test_and_set_bit(0, (unsigned long *)
2489 &anon_vma->head.next)) 2501 &anon_vma->root->head.next))
2490 BUG(); 2502 BUG();
2491 } 2503 }
2492} 2504}
@@ -2577,7 +2589,7 @@ out_unlock:
2577 2589
2578static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2590static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2579{ 2591{
2580 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2592 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2581 /* 2593 /*
2582 * The LSB of head.next can't change to 0 from under 2594 * The LSB of head.next can't change to 0 from under
2583 * us because we hold the mm_all_locks_mutex. 2595 * us because we hold the mm_all_locks_mutex.
@@ -2588,12 +2600,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2588 * 2600 *
2589 * No need of atomic instructions here, head.next 2601 * No need of atomic instructions here, head.next
2590 * can't change from under us until we release the 2602 * can't change from under us until we release the
2591 * anon_vma->lock. 2603 * anon_vma->root->lock.
2592 */ 2604 */
2593 if (!__test_and_clear_bit(0, (unsigned long *) 2605 if (!__test_and_clear_bit(0, (unsigned long *)
2594 &anon_vma->head.next)) 2606 &anon_vma->root->head.next))
2595 BUG(); 2607 BUG();
2596 spin_unlock(&anon_vma->lock); 2608 anon_vma_unlock(anon_vma);
2597 } 2609 }
2598} 2610}
2599 2611
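
The lock helpers in mmap.c change meaning: vma_lock_anon_vma()/vma_unlock_anon_vma() take over the old per-vma names, and locking an anon_vma now goes through its root, so every anon_vma in a fork chain shares one spinlock (which is also why vm_lock_anon_vma() now pokes anon_vma->root->head.next). A sketch of the helpers this diff assumes, roughly as they read in include/linux/rmap.h of this series:

	static inline void anon_vma_lock(struct anon_vma *anon_vma)
	{
		spin_lock(&anon_vma->root->lock);
	}

	static inline void anon_vma_unlock(struct anon_vma *anon_vma)
	{
		spin_unlock(&anon_vma->root->lock);
	}

	static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
	{
		if (vma->anon_vma)
			spin_lock(&vma->anon_vma->root->lock);
	}
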
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 709aedfaa014..d3def05a33d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
4 * Copyright (C) 1998,2000 Rik van Riel 4 * Copyright (C) 1998,2000 Rik van Riel
5 * Thanks go out to Claus Fischer for some serious inspiration and 5 * Thanks go out to Claus Fischer for some serious inspiration and
6 * for goading me into coding this file... 6 * for goading me into coding this file...
7 * Copyright (C) 2010 Google, Inc.
8 * Rewritten by David Rientjes
7 * 9 *
8 * The routines in this file are used to kill a process when 10 * The routines in this file are used to kill a process when
9 * we're seriously out of memory. This gets called from __alloc_pages() 11 * we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,188 @@
27#include <linux/module.h> 29#include <linux/module.h>
28#include <linux/notifier.h> 30#include <linux/notifier.h>
29#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h>
30#include <linux/security.h> 33#include <linux/security.h>
31 34
32int sysctl_panic_on_oom; 35int sysctl_panic_on_oom;
33int sysctl_oom_kill_allocating_task; 36int sysctl_oom_kill_allocating_task;
34int sysctl_oom_dump_tasks; 37int sysctl_oom_dump_tasks = 1;
35static DEFINE_SPINLOCK(zone_scan_lock); 38static DEFINE_SPINLOCK(zone_scan_lock);
36/* #define DEBUG */ 39
40#ifdef CONFIG_NUMA
41/**
42 * has_intersects_mems_allowed() - check task eligiblity for kill
43 * @tsk: task struct of which task to consider
44 * @mask: nodemask passed to page allocator for mempolicy ooms
45 *
46 * Task eligibility is determined by whether or not a candidate task, @tsk,
47 * shares the same mempolicy nodes as current if it is bound by such a policy
48 * and whether or not it has the same set of allowed cpuset nodes.
49 */
50static bool has_intersects_mems_allowed(struct task_struct *tsk,
51 const nodemask_t *mask)
52{
53 struct task_struct *start = tsk;
54
55 do {
56 if (mask) {
57 /*
58 * If this is a mempolicy constrained oom, tsk's
59 * cpuset is irrelevant. Only return true if its
60 * mempolicy intersects current, otherwise it may be
61 * needlessly killed.
62 */
63 if (mempolicy_nodemask_intersects(tsk, mask))
64 return true;
65 } else {
66 /*
67 * This is not a mempolicy constrained oom, so only
68 * check the mems of tsk's cpuset.
69 */
70 if (cpuset_mems_allowed_intersects(current, tsk))
71 return true;
72 }
73 } while_each_thread(start, tsk);
74
75 return false;
76}
77#else
78static bool has_intersects_mems_allowed(struct task_struct *tsk,
79 const nodemask_t *mask)
80{
81 return true;
82}
83#endif /* CONFIG_NUMA */
37 84
38/* 85/*
39 * Is all threads of the target process nodes overlap ours? 86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
40 */ 90 */
41static int has_intersects_mems_allowed(struct task_struct *tsk) 91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
42{ 93{
43 struct task_struct *t; 94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with
107 * task_lock() held.
108 */
109static struct task_struct *find_lock_task_mm(struct task_struct *p)
110{
111 struct task_struct *t = p;
44 112
45 t = tsk;
46 do { 113 do {
47 if (cpuset_mems_allowed_intersects(current, t)) 114 task_lock(t);
48 return 1; 115 if (likely(t->mm))
49 t = next_thread(t); 116 return t;
50 } while (t != tsk); 117 task_unlock(t);
118 } while_each_thread(p, t);
51 119
52 return 0; 120 return NULL;
121}
122
123/* return true if the task is not adequate as candidate victim task. */
124static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
125 const nodemask_t *nodemask)
126{
127 if (is_global_init(p))
128 return true;
129 if (p->flags & PF_KTHREAD)
130 return true;
131
132 /* When mem_cgroup_out_of_memory() and p is not member of the group */
133 if (mem && !task_in_mem_cgroup(p, mem))
134 return true;
135
136 /* p may not have freeable memory in nodemask */
137 if (!has_intersects_mems_allowed(p, nodemask))
138 return true;
139
140 return false;
53} 141}
54 142
55/** 143/**
56 * badness - calculate a numeric value for how bad this task has been 144 * oom_badness - heuristic function to determine which candidate task to kill
57 * @p: task struct of which task we should calculate 145 * @p: task struct of which task we should calculate
58 * @uptime: current uptime in seconds 146 * @totalpages: total present RAM allowed for page allocation
59 *
60 * The formula used is relatively simple and documented inline in the
61 * function. The main rationale is that we want to select a good task
62 * to kill when we run out of memory.
63 * 147 *
64 * Good in this context means that: 148 * The heuristic for determining which task to kill is made to be as simple and
65 * 1) we lose the minimum amount of work done 149 * predictable as possible. The goal is to return the highest value for the
66 * 2) we recover a large amount of memory 150 * task consuming the most memory to avoid subsequent oom failures.
67 * 3) we don't kill anything innocent of eating tons of memory
68 * 4) we want to kill the minimum amount of processes (one)
69 * 5) we try to kill the process the user expects us to kill, this
70 * algorithm has been meticulously tuned to meet the principle
71 * of least surprise ... (be careful when you change it)
72 */ 151 */
73 152unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
74unsigned long badness(struct task_struct *p, unsigned long uptime) 153 const nodemask_t *nodemask, unsigned long totalpages)
75{ 154{
76 unsigned long points, cpu_time, run_time; 155 int points;
77 struct mm_struct *mm;
78 struct task_struct *child;
79 int oom_adj = p->signal->oom_adj;
80 struct task_cputime task_time;
81 unsigned long utime;
82 unsigned long stime;
83 156
84 if (oom_adj == OOM_DISABLE) 157 if (oom_unkillable_task(p, mem, nodemask))
85 return 0; 158 return 0;
86 159
87 task_lock(p); 160 p = find_lock_task_mm(p);
88 mm = p->mm; 161 if (!p)
89 if (!mm) {
90 task_unlock(p);
91 return 0; 162 return 0;
92 }
93
94 /*
95 * The memory size of the process is the basis for the badness.
96 */
97 points = mm->total_vm;
98 163
99 /* 164 /*
100 * After this unlock we can no longer dereference local variable `mm' 165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
166 * need to be executed for something that cannot be killed.
101 */ 167 */
102 task_unlock(p); 168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
103 169 task_unlock(p);
104 /* 170 return 0;
105 * swapoff can easily use up all memory, so kill those first.
106 */
107 if (p->flags & PF_OOM_ORIGIN)
108 return ULONG_MAX;
109
110 /*
111 * Processes which fork a lot of child processes are likely
112 * a good choice. We add half the vmsize of the children if they
113 * have an own mm. This prevents forking servers to flood the
114 * machine with an endless amount of children. In case a single
115 * child is eating the vast majority of memory, adding only half
116 * to the parents will make the child our kill candidate of choice.
117 */
118 list_for_each_entry(child, &p->children, sibling) {
119 task_lock(child);
120 if (child->mm != mm && child->mm)
121 points += child->mm->total_vm/2 + 1;
122 task_unlock(child);
123 } 171 }
124 172
125 /* 173 /*
126 * CPU time is in tens of seconds and run time is in thousands 174 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
127 * of seconds. There is no particular reason for this other than 175 * priority for oom killing.
128 * that it turned out to work very well in practice.
129 */
130 thread_group_cputime(p, &task_time);
131 utime = cputime_to_jiffies(task_time.utime);
132 stime = cputime_to_jiffies(task_time.stime);
133 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
134
135
136 if (uptime >= p->start_time.tv_sec)
137 run_time = (uptime - p->start_time.tv_sec) >> 10;
138 else
139 run_time = 0;
140
141 if (cpu_time)
142 points /= int_sqrt(cpu_time);
143 if (run_time)
144 points /= int_sqrt(int_sqrt(run_time));
145
146 /*
147 * Niced processes are most likely less important, so double
148 * their badness points.
149 */ 176 */
150 if (task_nice(p) > 0) 177 if (p->flags & PF_OOM_ORIGIN) {
151 points *= 2; 178 task_unlock(p);
179 return 1000;
180 }
152 181
153 /* 182 /*
154 * Superuser processes are usually more important, so we make it 183 * The memory controller may have a limit of 0 bytes, so avoid a divide
155 * less likely that we kill those. 184 * by zero, if necessary.
156 */ 185 */
157 if (has_capability_noaudit(p, CAP_SYS_ADMIN) || 186 if (!totalpages)
158 has_capability_noaudit(p, CAP_SYS_RESOURCE)) 187 totalpages = 1;
159 points /= 4;
160 188
161 /* 189 /*
162 * We don't want to kill a process with direct hardware access. 190 * The baseline for the badness score is the proportion of RAM that each
163 * Not only could that mess up the hardware, but usually users 191 * task's rss and swap space use.
164 * tend to only have this flag set on applications they think
165 * of as important.
166 */ 192 */
167 if (has_capability_noaudit(p, CAP_SYS_RAWIO)) 193 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
168 points /= 4; 194 totalpages;
195 task_unlock(p);
169 196
170 /* 197 /*
171 * If p's nodes don't overlap ours, it may still help to kill p 198 * Root processes get 3% bonus, just like the __vm_enough_memory()
172 * because p may have allocated or otherwise mapped memory on 199 * implementation used by LSMs.
173 * this node before. However it will be less likely.
174 */ 200 */
175 if (!has_intersects_mems_allowed(p)) 201 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
176 points /= 8; 202 points -= 30;
177 203
178 /* 204 /*
179 * Adjust the score by oom_adj. 205 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
206 * either completely disable oom killing or always prefer a certain
207 * task.
180 */ 208 */
181 if (oom_adj) { 209 points += p->signal->oom_score_adj;
182 if (oom_adj > 0) {
183 if (!points)
184 points = 1;
185 points <<= oom_adj;
186 } else
187 points >>= -(oom_adj);
188 }
189 210
190#ifdef DEBUG 211 if (points < 0)
191 printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", 212 return 0;
192 p->pid, p->comm, points); 213 return (points < 1000) ? points : 1000;
193#endif
194 return points;
195} 214}
196 215
197/* 216/*
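
The rewritten oom_badness() drops the old vm-size/cpu-time/uptime heuristic for a 0-1000 score: rss plus swap entries as a proportion of totalpages, a 30-point (3%) discount for CAP_SYS_ADMIN tasks, then oom_score_adj added and the result clamped. A worked example with assumed numbers, in plain userspace C so the arithmetic is easy to follow:

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalpages = 1024 * 1024;	/* 4 GB of allowed RAM+swap, in 4 KB pages */
		unsigned long rss        = 300 * 1024;	/* ~1.2 GB resident */
		unsigned long swapents   = 100 * 1024;	/* ~400 MB of swap entries */
		int oom_score_adj = -200;		/* assumed /proc/<pid>/oom_score_adj */
		long points;

		points = (long)((rss + swapents) * 1000 / totalpages);	/* 390 */
		points -= 30;		/* root-owned task: 3% of the scale, now 360 */
		points += oom_score_adj;			/* 160 */

		if (points < 0)
			points = 0;
		if (points > 1000)
			points = 1000;
		printf("badness = %ld / 1000\n", points);
		return 0;
	}
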
@@ -199,12 +218,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
199 */ 218 */
200#ifdef CONFIG_NUMA 219#ifdef CONFIG_NUMA
201static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 220static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
202 gfp_t gfp_mask, nodemask_t *nodemask) 221 gfp_t gfp_mask, nodemask_t *nodemask,
222 unsigned long *totalpages)
203{ 223{
204 struct zone *zone; 224 struct zone *zone;
205 struct zoneref *z; 225 struct zoneref *z;
206 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 226 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
227 bool cpuset_limited = false;
228 int nid;
229
230 /* Default to all available memory */
231 *totalpages = totalram_pages + total_swap_pages;
207 232
233 if (!zonelist)
234 return CONSTRAINT_NONE;
208 /* 235 /*
209 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 236 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
210 * to kill current.We have to random task kill in this case. 237 * to kill current.We have to random task kill in this case.
@@ -214,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
214 return CONSTRAINT_NONE; 241 return CONSTRAINT_NONE;
215 242
216 /* 243 /*
217 * The nodemask here is a nodemask passed to alloc_pages(). Now, 244 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
218 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy 245 * the page allocator means a mempolicy is in effect. Cpuset policy
219 * feature. mempolicy is an only user of nodemask here. 246 * is enforced in get_page_from_freelist().
220 * check mempolicy's nodemask contains all N_HIGH_MEMORY
221 */ 247 */
222 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) 248 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
249 *totalpages = total_swap_pages;
250 for_each_node_mask(nid, *nodemask)
251 *totalpages += node_spanned_pages(nid);
223 return CONSTRAINT_MEMORY_POLICY; 252 return CONSTRAINT_MEMORY_POLICY;
253 }
224 254
225 /* Check this allocation failure is caused by cpuset's wall function */ 255 /* Check this allocation failure is caused by cpuset's wall function */
226 for_each_zone_zonelist_nodemask(zone, z, zonelist, 256 for_each_zone_zonelist_nodemask(zone, z, zonelist,
227 high_zoneidx, nodemask) 257 high_zoneidx, nodemask)
228 if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) 258 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
229 return CONSTRAINT_CPUSET; 259 cpuset_limited = true;
230 260
261 if (cpuset_limited) {
262 *totalpages = total_swap_pages;
263 for_each_node_mask(nid, cpuset_current_mems_allowed)
264 *totalpages += node_spanned_pages(nid);
265 return CONSTRAINT_CPUSET;
266 }
231 return CONSTRAINT_NONE; 267 return CONSTRAINT_NONE;
232} 268}
233#else 269#else
234static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 270static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
235 gfp_t gfp_mask, nodemask_t *nodemask) 271 gfp_t gfp_mask, nodemask_t *nodemask,
272 unsigned long *totalpages)
236{ 273{
274 *totalpages = totalram_pages + total_swap_pages;
237 return CONSTRAINT_NONE; 275 return CONSTRAINT_NONE;
238} 276}
239#endif 277#endif
@@ -244,28 +282,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
244 * 282 *
245 * (not docbooked, we don't want this one cluttering up the manual) 283 * (not docbooked, we don't want this one cluttering up the manual)
246 */ 284 */
247static struct task_struct *select_bad_process(unsigned long *ppoints, 285static struct task_struct *select_bad_process(unsigned int *ppoints,
248 struct mem_cgroup *mem) 286 unsigned long totalpages, struct mem_cgroup *mem,
287 const nodemask_t *nodemask)
249{ 288{
250 struct task_struct *p; 289 struct task_struct *p;
251 struct task_struct *chosen = NULL; 290 struct task_struct *chosen = NULL;
252 struct timespec uptime;
253 *ppoints = 0; 291 *ppoints = 0;
254 292
255 do_posix_clock_monotonic_gettime(&uptime);
256 for_each_process(p) { 293 for_each_process(p) {
257 unsigned long points; 294 unsigned int points;
258 295
259 /* 296 if (oom_unkillable_task(p, mem, nodemask))
260 * skip kernel threads and tasks which have already released
261 * their mm.
262 */
263 if (!p->mm)
264 continue;
265 /* skip the init task */
266 if (is_global_init(p))
267 continue;
268 if (mem && !task_in_mem_cgroup(p, mem))
269 continue; 297 continue;
270 298
271 /* 299 /*
@@ -290,19 +318,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
290 * the process of exiting and releasing its resources. 318 * the process of exiting and releasing its resources.
291 * Otherwise we could get an easy OOM deadlock. 319 * Otherwise we could get an easy OOM deadlock.
292 */ 320 */
293 if (p->flags & PF_EXITING) { 321 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
294 if (p != current) 322 if (p != current)
295 return ERR_PTR(-1UL); 323 return ERR_PTR(-1UL);
296 324
297 chosen = p; 325 chosen = p;
298 *ppoints = ULONG_MAX; 326 *ppoints = 1000;
299 } 327 }
300 328
301 if (p->signal->oom_adj == OOM_DISABLE) 329 points = oom_badness(p, mem, nodemask, totalpages);
302 continue; 330 if (points > *ppoints) {
303
304 points = badness(p, uptime.tv_sec);
305 if (points > *ppoints || !chosen) {
306 chosen = p; 331 chosen = p;
307 *ppoints = points; 332 *ppoints = points;
308 } 333 }
@@ -313,11 +338,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
313 338
314/** 339/**
315 * dump_tasks - dump current memory state of all system tasks 340 * dump_tasks - dump current memory state of all system tasks
316 * @mem: target memory controller 341 * @mem: current's memory controller, if constrained
317 * 342 *
318 * Dumps the current memory state of all system tasks, excluding kernel threads. 343 * Dumps the current memory state of all system tasks, excluding kernel threads.
319 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 344 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
320 * score, and name. 345 * value, oom_score_adj value, and name.
321 * 346 *
322 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are 347 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
323 * shown. 348 * shown.
@@ -326,44 +351,43 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
326 */ 351 */
327static void dump_tasks(const struct mem_cgroup *mem) 352static void dump_tasks(const struct mem_cgroup *mem)
328{ 353{
329 struct task_struct *g, *p; 354 struct task_struct *p;
330 355 struct task_struct *task;
331 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
332 "name\n");
333 do_each_thread(g, p) {
334 struct mm_struct *mm;
335 356
336 if (mem && !task_in_mem_cgroup(p, mem)) 357 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
358 for_each_process(p) {
359 if (p->flags & PF_KTHREAD)
337 continue; 360 continue;
338 if (!thread_group_leader(p)) 361 if (mem && !task_in_mem_cgroup(p, mem))
339 continue; 362 continue;
340 363
341 task_lock(p); 364 task = find_lock_task_mm(p);
342 mm = p->mm; 365 if (!task) {
343 if (!mm) {
344 /* 366 /*
345 * total_vm and rss sizes do not exist for tasks with no 367 * This is a kthread or all of p's threads have already
346 * mm so there's no need to report them; they can't be 368 * detached their mm's. There's no need to report
347 * oom killed anyway. 369 * them; they can't be oom killed anyway.
348 */ 370 */
349 task_unlock(p);
350 continue; 371 continue;
351 } 372 }
352 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 373
353 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 374 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
354 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, 375 task->pid, __task_cred(task)->uid, task->tgid,
355 p->comm); 376 task->mm->total_vm, get_mm_rss(task->mm),
356 task_unlock(p); 377 task_cpu(task), task->signal->oom_adj,
357 } while_each_thread(g, p); 378 task->signal->oom_score_adj, task->comm);
379 task_unlock(task);
380 }
358} 381}
359 382
360static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
361 struct mem_cgroup *mem) 384 struct mem_cgroup *mem)
362{ 385{
363 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
364 "oom_adj=%d\n",
365 current->comm, gfp_mask, order, current->signal->oom_adj);
366 task_lock(current); 386 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
388 "oom_adj=%d, oom_score_adj=%d\n",
389 current->comm, gfp_mask, order, current->signal->oom_adj,
390 current->signal->oom_score_adj);
367 cpuset_print_task_mems_allowed(current); 391 cpuset_print_task_mems_allowed(current);
368 task_unlock(current); 392 task_unlock(current);
369 dump_stack(); 393 dump_stack();
@@ -374,72 +398,43 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
374} 398}
375 399
376#define K(x) ((x) << (PAGE_SHIFT-10)) 400#define K(x) ((x) << (PAGE_SHIFT-10))
377 401static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
378/*
379 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
380 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
381 * set.
382 */
383static void __oom_kill_task(struct task_struct *p, int verbose)
384{ 402{
385 if (is_global_init(p)) { 403 p = find_lock_task_mm(p);
386 WARN_ON(1); 404 if (!p) {
387 printk(KERN_WARNING "tried to kill init!\n");
388 return;
389 }
390
391 task_lock(p);
392 if (!p->mm) {
393 WARN_ON(1);
394 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
395 task_pid_nr(p), p->comm);
396 task_unlock(p); 405 task_unlock(p);
397 return; 406 return 1;
398 } 407 }
399 408 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
400 if (verbose) 409 task_pid_nr(p), p->comm, K(p->mm->total_vm),
401 printk(KERN_ERR "Killed process %d (%s) " 410 K(get_mm_counter(p->mm, MM_ANONPAGES)),
402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 411 K(get_mm_counter(p->mm, MM_FILEPAGES)));
403 task_pid_nr(p), p->comm,
404 K(p->mm->total_vm),
405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
407 task_unlock(p); 412 task_unlock(p);
408 413
414
415 set_tsk_thread_flag(p, TIF_MEMDIE);
416 force_sig(SIGKILL, p);
417
409 /* 418 /*
410 * We give our sacrificial lamb high priority and access to 419 * We give our sacrificial lamb high priority and access to
411 * all the memory it needs. That way it should be able to 420 * all the memory it needs. That way it should be able to
412 * exit() and clear out its resources quickly... 421 * exit() and clear out its resources quickly...
413 */ 422 */
414 p->rt.time_slice = HZ; 423 boost_dying_task_prio(p, mem);
415 set_tsk_thread_flag(p, TIF_MEMDIE);
416
417 force_sig(SIGKILL, p);
418}
419
420static int oom_kill_task(struct task_struct *p)
421{
422 /* WARNING: mm may not be dereferenced since we did not obtain its
423 * value from get_task_mm(p). This is OK since all we need to do is
424 * compare mm to q->mm below.
425 *
426 * Furthermore, even if mm contains a non-NULL value, p->mm may
427 * change to NULL at any time since we do not hold task_lock(p).
428 * However, this is of no concern to us.
429 */
430 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
431 return 1;
432
433 __oom_kill_task(p, 1);
434 424
435 return 0; 425 return 0;
436} 426}
427#undef K
437 428
438static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 429static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
439 unsigned long points, struct mem_cgroup *mem, 430 unsigned int points, unsigned long totalpages,
431 struct mem_cgroup *mem, nodemask_t *nodemask,
440 const char *message) 432 const char *message)
441{ 433{
442 struct task_struct *c; 434 struct task_struct *victim = p;
435 struct task_struct *child;
436 struct task_struct *t = p;
437 unsigned int victim_points = 0;
443 438
444 if (printk_ratelimit()) 439 if (printk_ratelimit())
445 dump_header(p, gfp_mask, order, mem); 440 dump_header(p, gfp_mask, order, mem);
@@ -449,40 +444,81 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 * its children or threads, just set TIF_MEMDIE so it can die quickly 444 * its children or threads, just set TIF_MEMDIE so it can die quickly
450 */ 445 */
451 if (p->flags & PF_EXITING) { 446 if (p->flags & PF_EXITING) {
452 __oom_kill_task(p, 0); 447 set_tsk_thread_flag(p, TIF_MEMDIE);
448 boost_dying_task_prio(p, mem);
453 return 0; 449 return 0;
454 } 450 }
455 451
456 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", 452 task_lock(p);
457 message, task_pid_nr(p), p->comm, points); 453 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
454 message, task_pid_nr(p), p->comm, points);
455 task_unlock(p);
458 456
459 /* Try to kill a child first */ 457 /*
460 list_for_each_entry(c, &p->children, sibling) { 458 * If any of p's children has a different mm and is eligible for kill,
461 if (c->mm == p->mm) 459 * the one with the highest badness() score is sacrificed for its
462 continue; 460 * parent. This attempts to lose the minimal amount of work done while
463 if (mem && !task_in_mem_cgroup(c, mem)) 461 * still freeing memory.
464 continue; 462 */
465 if (!oom_kill_task(c)) 463 do {
466 return 0; 464 list_for_each_entry(child, &t->children, sibling) {
465 unsigned int child_points;
466
467 /*
468 * oom_badness() returns 0 if the thread is unkillable
469 */
470 child_points = oom_badness(child, mem, nodemask,
471 totalpages);
472 if (child_points > victim_points) {
473 victim = child;
474 victim_points = child_points;
475 }
476 }
477 } while_each_thread(p, t);
478
479 return oom_kill_task(victim, mem);
480}
481
482/*
483 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
484 */
485static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
486 int order)
487{
488 if (likely(!sysctl_panic_on_oom))
489 return;
490 if (sysctl_panic_on_oom != 2) {
491 /*
492 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
493 * does not panic for cpuset, mempolicy, or memcg allocation
494 * failures.
495 */
496 if (constraint != CONSTRAINT_NONE)
497 return;
467 } 498 }
468 return oom_kill_task(p); 499 read_lock(&tasklist_lock);
500 dump_header(NULL, gfp_mask, order, NULL);
501 read_unlock(&tasklist_lock);
502 panic("Out of memory: %s panic_on_oom is enabled\n",
503 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
469} 504}
470 505
471#ifdef CONFIG_CGROUP_MEM_RES_CTLR 506#ifdef CONFIG_CGROUP_MEM_RES_CTLR
472void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 507void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473{ 508{
474 unsigned long points = 0; 509 unsigned long limit;
510 unsigned int points = 0;
475 struct task_struct *p; 511 struct task_struct *p;
476 512
477 if (sysctl_panic_on_oom == 2) 513 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
478 panic("out of memory(memcg). panic_on_oom is selected.\n"); 514 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
479 read_lock(&tasklist_lock); 515 read_lock(&tasklist_lock);
480retry: 516retry:
481 p = select_bad_process(&points, mem); 517 p = select_bad_process(&points, limit, mem, NULL);
482 if (!p || PTR_ERR(p) == -1UL) 518 if (!p || PTR_ERR(p) == -1UL)
483 goto out; 519 goto out;
484 520
485 if (oom_kill_process(p, gfp_mask, 0, points, mem, 521 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
486 "Memory cgroup out of memory")) 522 "Memory cgroup out of memory"))
487 goto retry; 523 goto retry;
488out: 524out:
@@ -509,7 +545,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
509 * if a parallel OOM killing is already taking place that includes a zone in 545 * if a parallel OOM killing is already taking place that includes a zone in
510 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 546 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
511 */ 547 */
512int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) 548int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
513{ 549{
514 struct zoneref *z; 550 struct zoneref *z;
515 struct zone *zone; 551 struct zone *zone;
@@ -526,7 +562,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
526 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 562 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
527 /* 563 /*
528 * Lock each zone in the zonelist under zone_scan_lock so a 564 * Lock each zone in the zonelist under zone_scan_lock so a
529 * parallel invocation of try_set_zone_oom() doesn't succeed 565 * parallel invocation of try_set_zonelist_oom() doesn't succeed
530 * when it shouldn't. 566 * when it shouldn't.
531 */ 567 */
532 zone_set_flag(zone, ZONE_OOM_LOCKED); 568 zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -555,65 +591,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
555} 591}
556 592
557/* 593/*
558 * Must be called with tasklist_lock held for read. 594 * Try to acquire the oom killer lock for all system zones. Returns zero if a
595 * parallel oom killing is taking place, otherwise locks all zones and returns
596 * non-zero.
559 */ 597 */
560static void __out_of_memory(gfp_t gfp_mask, int order) 598static int try_set_system_oom(void)
561{ 599{
562 struct task_struct *p; 600 struct zone *zone;
563 unsigned long points; 601 int ret = 1;
564
565 if (sysctl_oom_kill_allocating_task)
566 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
567 "Out of memory (oom_kill_allocating_task)"))
568 return;
569retry:
570 /*
571 * Rambo mode: Shoot down a process and hope it solves whatever
572 * issues we may have.
573 */
574 p = select_bad_process(&points, NULL);
575
576 if (PTR_ERR(p) == -1UL)
577 return;
578
579 /* Found nothing?!?! Either we hang forever, or we panic. */
580 if (!p) {
581 read_unlock(&tasklist_lock);
582 dump_header(NULL, gfp_mask, order, NULL);
583 panic("Out of memory and no killable processes...\n");
584 }
585 602
586 if (oom_kill_process(p, gfp_mask, order, points, NULL, 603 spin_lock(&zone_scan_lock);
587 "Out of memory")) 604 for_each_populated_zone(zone)
588 goto retry; 605 if (zone_is_oom_locked(zone)) {
606 ret = 0;
607 goto out;
608 }
609 for_each_populated_zone(zone)
610 zone_set_flag(zone, ZONE_OOM_LOCKED);
611out:
612 spin_unlock(&zone_scan_lock);
613 return ret;
589} 614}
590 615
591/* 616/*
592 * pagefault handler calls into here because it is out of memory but 617 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
593 * doesn't know exactly how or why. 618 * attempts or page faults may now recall the oom killer, if necessary.
594 */ 619 */
595void pagefault_out_of_memory(void) 620static void clear_system_oom(void)
596{ 621{
597 unsigned long freed = 0; 622 struct zone *zone;
598
599 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
600 if (freed > 0)
601 /* Got some memory back in the last second. */
602 return;
603
604 if (sysctl_panic_on_oom)
605 panic("out of memory from page fault. panic_on_oom is selected.\n");
606
607 read_lock(&tasklist_lock);
608 __out_of_memory(0, 0); /* unknown gfp_mask and order */
609 read_unlock(&tasklist_lock);
610 623
611 /* 624 spin_lock(&zone_scan_lock);
612 * Give "p" a good chance of killing itself before we 625 for_each_populated_zone(zone)
613 * retry to allocate memory. 626 zone_clear_flag(zone, ZONE_OOM_LOCKED);
614 */ 627 spin_unlock(&zone_scan_lock);
615 if (!test_thread_flag(TIF_MEMDIE))
616 schedule_timeout_uninterruptible(1);
617} 628}
618 629
619/** 630/**
@@ -621,6 +632,7 @@ void pagefault_out_of_memory(void)
621 * @zonelist: zonelist pointer 632 * @zonelist: zonelist pointer
622 * @gfp_mask: memory allocation flags 633 * @gfp_mask: memory allocation flags
623 * @order: amount of memory being requested as a power of 2 634 * @order: amount of memory being requested as a power of 2
635 * @nodemask: nodemask passed to page allocator
624 * 636 *
625 * If we run out of memory, we have the choice between either 637 * If we run out of memory, we have the choice between either
626 * killing a random task (bad), letting the system crash (worse) 638 * killing a random task (bad), letting the system crash (worse)
@@ -630,43 +642,68 @@ void pagefault_out_of_memory(void)
630void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 642void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
631 int order, nodemask_t *nodemask) 643 int order, nodemask_t *nodemask)
632{ 644{
645 struct task_struct *p;
646 unsigned long totalpages;
633 unsigned long freed = 0; 647 unsigned long freed = 0;
634 enum oom_constraint constraint; 648 unsigned int points;
649 enum oom_constraint constraint = CONSTRAINT_NONE;
635 650
636 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 651 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
637 if (freed > 0) 652 if (freed > 0)
638 /* Got some memory back in the last second. */ 653 /* Got some memory back in the last second. */
639 return; 654 return;
640 655
641 if (sysctl_panic_on_oom == 2) { 656 /*
642 dump_header(NULL, gfp_mask, order, NULL); 657 * If current has a pending SIGKILL, then automatically select it. The
643 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 658 * goal is to allow it to allocate so that it may quickly exit and free
659 * its memory.
660 */
661 if (fatal_signal_pending(current)) {
662 set_thread_flag(TIF_MEMDIE);
663 boost_dying_task_prio(current, NULL);
664 return;
644 } 665 }
645 666
646 /* 667 /*
647 * Check if there were limitations on the allocation (only relevant for 668 * Check if there were limitations on the allocation (only relevant for
648 * NUMA) that may require different handling. 669 * NUMA) that may require different handling.
649 */ 670 */
650 constraint = constrained_alloc(zonelist, gfp_mask, nodemask); 671 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
672 &totalpages);
673 check_panic_on_oom(constraint, gfp_mask, order);
674
651 read_lock(&tasklist_lock); 675 read_lock(&tasklist_lock);
676 if (sysctl_oom_kill_allocating_task &&
677 !oom_unkillable_task(current, NULL, nodemask) &&
678 (current->signal->oom_adj != OOM_DISABLE)) {
679 /*
680 * oom_kill_process() needs tasklist_lock held. If it returns
681 * non-zero, current could not be killed so we must fallback to
682 * the tasklist scan.
683 */
684 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
685 NULL, nodemask,
686 "Out of memory (oom_kill_allocating_task)"))
687 return;
688 }
652 689
653 switch (constraint) { 690retry:
654 case CONSTRAINT_MEMORY_POLICY: 691 p = select_bad_process(&points, totalpages, NULL,
655 oom_kill_process(current, gfp_mask, order, 0, NULL, 692 constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
656 "No available memory (MPOL_BIND)"); 693 NULL);
657 break; 694 if (PTR_ERR(p) == -1UL)
695 return;
658 696
659 case CONSTRAINT_NONE: 697 /* Found nothing?!?! Either we hang forever, or we panic. */
660 if (sysctl_panic_on_oom) { 698 if (!p) {
661 dump_header(NULL, gfp_mask, order, NULL); 699 dump_header(NULL, gfp_mask, order, NULL);
662 panic("out of memory. panic_on_oom is selected\n"); 700 read_unlock(&tasklist_lock);
663 } 701 panic("Out of memory and no killable processes...\n");
664 /* Fall-through */
665 case CONSTRAINT_CPUSET:
666 __out_of_memory(gfp_mask, order);
667 break;
668 } 702 }
669 703
704 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
705 nodemask, "Out of memory"))
706 goto retry;
670 read_unlock(&tasklist_lock); 707 read_unlock(&tasklist_lock);
671 708
672 /* 709 /*
@@ -676,3 +713,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
676 if (!test_thread_flag(TIF_MEMDIE)) 713 if (!test_thread_flag(TIF_MEMDIE))
677 schedule_timeout_uninterruptible(1); 714 schedule_timeout_uninterruptible(1);
678} 715}
716
717/*
718 * The pagefault handler calls here because it is out of memory, so kill a
719 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
720 * oom killing is already in progress so do nothing. If a task is found with
721 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
722 */
723void pagefault_out_of_memory(void)
724{
725 if (try_set_system_oom()) {
726 out_of_memory(NULL, 0, 0, NULL);
727 clear_system_oom();
728 }
729 if (!test_thread_flag(TIF_MEMDIE))
730 schedule_timeout_uninterruptible(1);
731}
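
Editor's note: the reworked oom_kill_process() above no longer kills the first child with a different mm; it walks every thread's children and sacrifices the one with the highest oom_badness() score, falling back to the parent only when no child scores. A minimal userspace sketch of that selection rule follows (the task and score types are invented for illustration, not the kernel structures):

#include <stdio.h>

struct fake_task { const char *comm; unsigned int points; };

/* Pick the highest-scoring child; fall back to the parent when none scores. */
static const struct fake_task *pick_victim(const struct fake_task *parent,
                                           const struct fake_task *children,
                                           int nr_children)
{
        const struct fake_task *victim = parent;
        unsigned int victim_points = 0;

        for (int i = 0; i < nr_children; i++) {
                if (children[i].points > victim_points) {
                        victim = &children[i];
                        victim_points = children[i].points;
                }
        }
        return victim;
}

int main(void)
{
        struct fake_task parent = { "parent", 0 };
        struct fake_task kids[] = { { "worker", 40 }, { "hog", 900 }, { "idle", 0 } };

        printf("victim: %s\n", pick_victim(&parent, kids, 3)->comm);
        return 0;
}
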
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..df8202ebc7b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -805,6 +805,41 @@ void __init page_writeback_init(void)
805} 805}
806 806
807/** 807/**
808 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
809 * @mapping: address space structure to write
810 * @start: starting page index
811 * @end: ending page index (inclusive)
812 *
813 * This function scans the page range from @start to @end (inclusive) and tags
814 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
815 * that write_cache_pages (or whoever calls this function) will then use
816 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
817 * used to avoid livelocking of writeback by a process steadily creating new
818 * dirty pages in the file (thus it is important for this function to be quick
819 * so that it can tag pages faster than a dirtying process can create them).
820 */
821/*
822 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
823 */
824#define WRITEBACK_TAG_BATCH 4096
825void tag_pages_for_writeback(struct address_space *mapping,
826 pgoff_t start, pgoff_t end)
827{
828 unsigned long tagged;
829
830 do {
831 spin_lock_irq(&mapping->tree_lock);
832 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
833 &start, end, WRITEBACK_TAG_BATCH,
834 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
835 spin_unlock_irq(&mapping->tree_lock);
836 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
837 cond_resched();
838 } while (tagged >= WRITEBACK_TAG_BATCH);
839}
840EXPORT_SYMBOL(tag_pages_for_writeback);
841
842/**
808 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 843 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
809 * @mapping: address space structure to write 844 * @mapping: address space structure to write
810 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 845 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +853,13 @@ void __init page_writeback_init(void)
818 * the call was made get new I/O started against them. If wbc->sync_mode is 853 * the call was made get new I/O started against them. If wbc->sync_mode is
819 * WB_SYNC_ALL then we were called for data integrity and we must wait for 854 * WB_SYNC_ALL then we were called for data integrity and we must wait for
820 * existing IO to complete. 855 * existing IO to complete.
856 *
 857 * To avoid livelocks (when another process dirties new pages), we first tag
858 * pages which should be written back with TOWRITE tag and only then start
859 * writing them. For data-integrity sync we have to be careful so that we do
860 * not miss some pages (e.g., because some other process has cleared TOWRITE
861 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
862 * by the process clearing the DIRTY tag (and submitting the page for IO).
821 */ 863 */
822int write_cache_pages(struct address_space *mapping, 864int write_cache_pages(struct address_space *mapping,
823 struct writeback_control *wbc, writepage_t writepage, 865 struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +875,7 @@ int write_cache_pages(struct address_space *mapping,
833 pgoff_t done_index; 875 pgoff_t done_index;
834 int cycled; 876 int cycled;
835 int range_whole = 0; 877 int range_whole = 0;
878 int tag;
836 879
837 pagevec_init(&pvec, 0); 880 pagevec_init(&pvec, 0);
838 if (wbc->range_cyclic) { 881 if (wbc->range_cyclic) {
@@ -849,29 +892,19 @@ int write_cache_pages(struct address_space *mapping,
849 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 892 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
850 range_whole = 1; 893 range_whole = 1;
851 cycled = 1; /* ignore range_cyclic tests */ 894 cycled = 1; /* ignore range_cyclic tests */
852
853 /*
854 * If this is a data integrity sync, cap the writeback to the
855 * current end of file. Any extension to the file that occurs
856 * after this is a new write and we don't need to write those
857 * pages out to fulfil our data integrity requirements. If we
858 * try to write them out, we can get stuck in this scan until
859 * the concurrent writer stops adding dirty pages and extending
860 * EOF.
861 */
862 if (wbc->sync_mode == WB_SYNC_ALL &&
863 wbc->range_end == LLONG_MAX) {
864 end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
865 }
866 } 895 }
867 896 if (wbc->sync_mode == WB_SYNC_ALL)
897 tag = PAGECACHE_TAG_TOWRITE;
898 else
899 tag = PAGECACHE_TAG_DIRTY;
868retry: 900retry:
901 if (wbc->sync_mode == WB_SYNC_ALL)
902 tag_pages_for_writeback(mapping, index, end);
869 done_index = index; 903 done_index = index;
870 while (!done && (index <= end)) { 904 while (!done && (index <= end)) {
871 int i; 905 int i;
872 906
873 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 907 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
874 PAGECACHE_TAG_DIRTY,
875 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 908 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
876 if (nr_pages == 0) 909 if (nr_pages == 0)
877 break; 910 break;
@@ -1327,6 +1360,9 @@ int test_set_page_writeback(struct page *page)
1327 radix_tree_tag_clear(&mapping->page_tree, 1360 radix_tree_tag_clear(&mapping->page_tree,
1328 page_index(page), 1361 page_index(page),
1329 PAGECACHE_TAG_DIRTY); 1362 PAGECACHE_TAG_DIRTY);
1363 radix_tree_tag_clear(&mapping->page_tree,
1364 page_index(page),
1365 PAGECACHE_TAG_TOWRITE);
1330 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1366 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1331 } else { 1367 } else {
1332 ret = TestSetPageWriteback(page); 1368 ret = TestSetPageWriteback(page);
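
Editor's note: tag_pages_for_writeback() above marks the already-dirty pages once, in batches of WRITEBACK_TAG_BATCH, so a data-integrity sync works against a fixed snapshot and cannot be livelocked by a task that keeps dirtying new pages. A rough userspace sketch of the two-phase tag-then-write idea over a plain bitmap (the radix-tree tags and tree_lock are elided; array and constant names are invented):

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES   16
#define TAG_BATCH  4    /* stand-in for WRITEBACK_TAG_BATCH */

static bool dirty[NR_PAGES];
static bool towrite[NR_PAGES];

/* Phase 1: copy the DIRTY tag into TOWRITE in bounded batches. */
static void tag_for_writeback(void)
{
        int tagged, i = 0;

        do {
                tagged = 0;
                /* the kernel takes tree_lock here ... */
                for (; i < NR_PAGES && tagged < TAG_BATCH; i++) {
                        if (dirty[i]) {
                                towrite[i] = true;
                                tagged++;
                        }
                }
                /* ... and drops it, then cond_resched()s between batches */
        } while (tagged >= TAG_BATCH);
}

int main(void)
{
        dirty[1] = dirty[5] = dirty[9] = true;
        tag_for_writeback();

        /* Phase 2: write only what was tagged; pages dirtied later wait. */
        for (int i = 0; i < NR_PAGES; i++)
                if (towrite[i])
                        printf("writing page %d\n", i);
        return 0;
}
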
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c6..a9649f4b261e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1738 struct page *page; 1738 struct page *page;
1739 1739
1740 /* Acquire the OOM killer lock for the zones in zonelist */ 1740 /* Acquire the OOM killer lock for the zones in zonelist */
1741 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1741 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
1742 schedule_timeout_uninterruptible(1); 1742 schedule_timeout_uninterruptible(1);
1743 return NULL; 1743 return NULL;
1744 } 1744 }
@@ -1759,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1759 /* The OOM killer will not help higher order allocs */ 1759 /* The OOM killer will not help higher order allocs */
1760 if (order > PAGE_ALLOC_COSTLY_ORDER) 1760 if (order > PAGE_ALLOC_COSTLY_ORDER)
1761 goto out; 1761 goto out;
1762 /* The OOM killer does not needlessly kill tasks for lowmem */
1763 if (high_zoneidx < ZONE_NORMAL)
1764 goto out;
1762 /* 1765 /*
1763 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 1766 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1764 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 1767 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2052,15 +2055,23 @@ rebalance:
2052 if (page) 2055 if (page)
2053 goto got_pg; 2056 goto got_pg;
2054 2057
2055 /* 2058 if (!(gfp_mask & __GFP_NOFAIL)) {
2056 * The OOM killer does not trigger for high-order 2059 /*
2057 * ~__GFP_NOFAIL allocations so if no progress is being 2060 * The oom killer is not called for high-order
2058 * made, there are no other options and retrying is 2061 * allocations that may fail, so if no progress
2059 * unlikely to help. 2062 * is being made, there are no other options and
2060 */ 2063 * retrying is unlikely to help.
2061 if (order > PAGE_ALLOC_COSTLY_ORDER && 2064 */
2062 !(gfp_mask & __GFP_NOFAIL)) 2065 if (order > PAGE_ALLOC_COSTLY_ORDER)
2063 goto nopage; 2066 goto nopage;
2067 /*
2068 * The oom killer is not called for lowmem
2069 * allocations to prevent needlessly killing
2070 * innocent tasks.
2071 */
2072 if (high_zoneidx < ZONE_NORMAL)
2073 goto nopage;
2074 }
2064 2075
2065 goto restart; 2076 goto restart;
2066 } 2077 }
@@ -4089,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4089 zone_seqlock_init(zone); 4100 zone_seqlock_init(zone);
4090 zone->zone_pgdat = pgdat; 4101 zone->zone_pgdat = pgdat;
4091 4102
4092 zone->prev_priority = DEF_PRIORITY;
4093
4094 zone_pcp_init(zone); 4103 zone_pcp_init(zone);
4095 for_each_lru(l) { 4104 for_each_lru(l) {
4096 INIT_LIST_HEAD(&zone->lru[l].list); 4105 INIT_LIST_HEAD(&zone->lru[l].list);
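
Editor's note: the page_alloc.c hunks above keep the OOM killer away from costly high-order and lowmem (below ZONE_NORMAL) allocation failures, and in the retry path only give up on them when __GFP_NOFAIL is not set, since killing tasks rarely frees the kind of memory those requests need. A simplified sketch of that gate as a standalone predicate (the constants and names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

enum zone_idx { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

#define PAGE_ALLOC_COSTLY_ORDER 3
#define GFP_NOFAIL (1u << 0)    /* stand-in for __GFP_NOFAIL */

/* Should a failed allocation fall back to the OOM killer? */
static bool may_invoke_oom(unsigned int gfp, int order, enum zone_idx high_zoneidx)
{
        if (gfp & GFP_NOFAIL)
                return true;            /* must not fail: keep trying everything */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                return false;           /* killing tasks won't create contiguity */
        if (high_zoneidx < ZONE_NORMAL)
                return false;           /* don't kill tasks for lowmem */
        return true;
}

int main(void)
{
        printf("order-0 normal: %d\n", may_invoke_oom(0, 0, ZONE_NORMAL));
        printf("order-5:        %d\n", may_invoke_oom(0, 5, ZONE_NORMAL));
        printf("lowmem (DMA):   %d\n", may_invoke_oom(0, 0, ZONE_DMA));
        return 0;
}
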
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..a7d0f5482634 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -132,9 +132,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
132 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
133 goto out_enomem_free_avc; 133 goto out_enomem_free_avc;
134 allocated = anon_vma; 134 allocated = anon_vma;
135 /*
136 * This VMA had no anon_vma yet. This anon_vma is
137 * the root of any anon_vma tree that might form.
138 */
139 anon_vma->root = anon_vma;
135 } 140 }
136 141
137 spin_lock(&anon_vma->lock); 142 anon_vma_lock(anon_vma);
138 /* page_table_lock to protect against threads */ 143 /* page_table_lock to protect against threads */
139 spin_lock(&mm->page_table_lock); 144 spin_lock(&mm->page_table_lock);
140 if (likely(!vma->anon_vma)) { 145 if (likely(!vma->anon_vma)) {
@@ -142,12 +147,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
142 avc->anon_vma = anon_vma; 147 avc->anon_vma = anon_vma;
143 avc->vma = vma; 148 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain); 149 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head); 150 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
146 allocated = NULL; 151 allocated = NULL;
147 avc = NULL; 152 avc = NULL;
148 } 153 }
149 spin_unlock(&mm->page_table_lock); 154 spin_unlock(&mm->page_table_lock);
150 spin_unlock(&anon_vma->lock); 155 anon_vma_unlock(anon_vma);
151 156
152 if (unlikely(allocated)) 157 if (unlikely(allocated))
153 anon_vma_free(allocated); 158 anon_vma_free(allocated);
@@ -170,9 +175,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
170 avc->anon_vma = anon_vma; 175 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain); 176 list_add(&avc->same_vma, &vma->anon_vma_chain);
172 177
173 spin_lock(&anon_vma->lock); 178 anon_vma_lock(anon_vma);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 179 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock); 180 anon_vma_unlock(anon_vma);
176} 181}
177 182
178/* 183/*
@@ -224,9 +229,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
224 avc = anon_vma_chain_alloc(); 229 avc = anon_vma_chain_alloc();
225 if (!avc) 230 if (!avc)
226 goto out_error_free_anon_vma; 231 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma); 232
233 /*
234 * The root anon_vma's spinlock is the lock actually used when we
235 * lock any of the anon_vmas in this anon_vma tree.
236 */
237 anon_vma->root = pvma->anon_vma->root;
238 /*
239 * With KSM refcounts, an anon_vma can stay around longer than the
240 * process it belongs to. The root anon_vma needs to be pinned
241 * until this anon_vma is freed, because the lock lives in the root.
242 */
243 get_anon_vma(anon_vma->root);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 244 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma; 245 vma->anon_vma = anon_vma;
246 anon_vma_chain_link(vma, avc, anon_vma);
230 247
231 return 0; 248 return 0;
232 249
@@ -246,22 +263,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
246 if (!anon_vma) 263 if (!anon_vma)
247 return; 264 return;
248 265
249 spin_lock(&anon_vma->lock); 266 anon_vma_lock(anon_vma);
250 list_del(&anon_vma_chain->same_anon_vma); 267 list_del(&anon_vma_chain->same_anon_vma);
251 268
252 /* We must garbage collect the anon_vma if it's empty */ 269 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 270 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 271 anon_vma_unlock(anon_vma);
255 272
256 if (empty) 273 if (empty) {
274 /* We no longer need the root anon_vma */
275 if (anon_vma->root != anon_vma)
276 drop_anon_vma(anon_vma->root);
257 anon_vma_free(anon_vma); 277 anon_vma_free(anon_vma);
278 }
258} 279}
259 280
260void unlink_anon_vmas(struct vm_area_struct *vma) 281void unlink_anon_vmas(struct vm_area_struct *vma)
261{ 282{
262 struct anon_vma_chain *avc, *next; 283 struct anon_vma_chain *avc, *next;
263 284
264 /* Unlink each anon_vma chained to the VMA. */ 285 /*
286 * Unlink each anon_vma chained to the VMA. This list is ordered
287 * from newest to oldest, ensuring the root anon_vma gets freed last.
288 */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 289 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc); 290 anon_vma_unlink(avc);
267 list_del(&avc->same_vma); 291 list_del(&avc->same_vma);
@@ -302,7 +326,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
302 goto out; 326 goto out;
303 327
304 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 328 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
305 spin_lock(&anon_vma->lock); 329 anon_vma_lock(anon_vma);
306 return anon_vma; 330 return anon_vma;
307out: 331out:
308 rcu_read_unlock(); 332 rcu_read_unlock();
@@ -311,7 +335,7 @@ out:
311 335
312void page_unlock_anon_vma(struct anon_vma *anon_vma) 336void page_unlock_anon_vma(struct anon_vma *anon_vma)
313{ 337{
314 spin_unlock(&anon_vma->lock); 338 anon_vma_unlock(anon_vma);
315 rcu_read_unlock(); 339 rcu_read_unlock();
316} 340}
317 341
@@ -340,9 +364,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
340 */ 364 */
341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 365unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
342{ 366{
343 if (PageAnon(page)) 367 if (PageAnon(page)) {
344 ; 368 if (vma->anon_vma->root != page_anon_vma(page)->root)
345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 369 return -EFAULT;
370 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
346 if (!vma->vm_file || 371 if (!vma->vm_file ||
347 vma->vm_file->f_mapping != page->mapping) 372 vma->vm_file->f_mapping != page->mapping)
348 return -EFAULT; 373 return -EFAULT;
@@ -743,14 +768,20 @@ static void __page_set_anon_rmap(struct page *page,
743 * If the page isn't exclusively mapped into this vma, 768 * If the page isn't exclusively mapped into this vma,
744 * we must use the _oldest_ possible anon_vma for the 769 * we must use the _oldest_ possible anon_vma for the
745 * page mapping! 770 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
749 */ 771 */
750 if (!exclusive) { 772 if (!exclusive) {
751 struct anon_vma_chain *avc; 773 if (PageAnon(page))
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); 774 return;
753 anon_vma = avc->anon_vma; 775 anon_vma = anon_vma->root;
776 } else {
777 /*
778 * In this case, swapped-out-but-not-discarded swap-cache
779 * is remapped. So, no need to update page->mapping here.
 780 * We can be sure the anon_vma pointed to by page->mapping is not
 781 * obsolete, because vma->anon_vma is required to be in its family.
782 */
783 if (PageAnon(page))
784 return;
754 } 785 }
755 786
756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 787 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +811,7 @@ static void __page_check_anon_rmap(struct page *page,
780 * are initially only visible via the pagetables, and the pte is locked 811 * are initially only visible via the pagetables, and the pte is locked
781 * over the call to page_add_new_anon_rmap. 812 * over the call to page_add_new_anon_rmap.
782 */ 813 */
814 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
783 BUG_ON(page->index != linear_page_index(vma, address)); 815 BUG_ON(page->index != linear_page_index(vma, address));
784#endif 816#endif
785} 817}
@@ -798,6 +830,17 @@ static void __page_check_anon_rmap(struct page *page,
798void page_add_anon_rmap(struct page *page, 830void page_add_anon_rmap(struct page *page,
799 struct vm_area_struct *vma, unsigned long address) 831 struct vm_area_struct *vma, unsigned long address)
800{ 832{
833 do_page_add_anon_rmap(page, vma, address, 0);
834}
835
836/*
837 * Special version of the above for do_swap_page, which often runs
838 * into pages that are exclusively owned by the current process.
839 * Everybody else should continue to use page_add_anon_rmap above.
840 */
841void do_page_add_anon_rmap(struct page *page,
842 struct vm_area_struct *vma, unsigned long address, int exclusive)
843{
801 int first = atomic_inc_and_test(&page->_mapcount); 844 int first = atomic_inc_and_test(&page->_mapcount);
802 if (first) 845 if (first)
803 __inc_zone_page_state(page, NR_ANON_PAGES); 846 __inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +850,7 @@ void page_add_anon_rmap(struct page *page,
807 VM_BUG_ON(!PageLocked(page)); 850 VM_BUG_ON(!PageLocked(page));
808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 851 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
809 if (first) 852 if (first)
810 __page_set_anon_rmap(page, vma, address, 0); 853 __page_set_anon_rmap(page, vma, address, exclusive);
811 else 854 else
812 __page_check_anon_rmap(page, vma, address); 855 __page_check_anon_rmap(page, vma, address);
813} 856}
@@ -1368,6 +1411,42 @@ int try_to_munlock(struct page *page)
1368 return try_to_unmap_file(page, TTU_MUNLOCK); 1411 return try_to_unmap_file(page, TTU_MUNLOCK);
1369} 1412}
1370 1413
1414#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
1415/*
1416 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1417 * if necessary. Be careful to do all the tests under the lock. Once
1418 * we know we are the last user, nobody else can get a reference and we
1419 * can do the freeing without the lock.
1420 */
1421void drop_anon_vma(struct anon_vma *anon_vma)
1422{
1423 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
1424 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1425 struct anon_vma *root = anon_vma->root;
1426 int empty = list_empty(&anon_vma->head);
1427 int last_root_user = 0;
1428 int root_empty = 0;
1429
1430 /*
1431 * The refcount on a non-root anon_vma got dropped. Drop
1432 * the refcount on the root and check if we need to free it.
1433 */
1434 if (empty && anon_vma != root) {
1435 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1436 last_root_user = atomic_dec_and_test(&root->external_refcount);
1437 root_empty = list_empty(&root->head);
1438 }
1439 anon_vma_unlock(anon_vma);
1440
1441 if (empty) {
1442 anon_vma_free(anon_vma);
1443 if (root_empty && last_root_user)
1444 anon_vma_free(root);
1445 }
1446 }
1447}
1448#endif
1449
1371#ifdef CONFIG_MIGRATION 1450#ifdef CONFIG_MIGRATION
1372/* 1451/*
1373 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): 1452 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1468,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1389 anon_vma = page_anon_vma(page); 1468 anon_vma = page_anon_vma(page);
1390 if (!anon_vma) 1469 if (!anon_vma)
1391 return ret; 1470 return ret;
1392 spin_lock(&anon_vma->lock); 1471 anon_vma_lock(anon_vma);
1393 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1472 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1394 struct vm_area_struct *vma = avc->vma; 1473 struct vm_area_struct *vma = avc->vma;
1395 unsigned long address = vma_address(page, vma); 1474 unsigned long address = vma_address(page, vma);
@@ -1399,7 +1478,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1399 if (ret != SWAP_AGAIN) 1478 if (ret != SWAP_AGAIN)
1400 break; 1479 break;
1401 } 1480 }
1402 spin_unlock(&anon_vma->lock); 1481 anon_vma_unlock(anon_vma);
1403 return ret; 1482 return ret;
1404} 1483}
1405 1484
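
Editor's note: the rmap.c changes above hang every anon_vma off a root and, per the added comment, use the root anon_vma's spinlock whenever any member of the tree is locked, which is what the anon_vma_lock()/anon_vma_unlock() helpers replacing the open-coded spin_lock calls express. A simplified pthread-based sketch of the lock-through-root idea, not the kernel's anon_vma implementation:

#include <pthread.h>
#include <stdio.h>

/* Illustrative only: every node in a tree shares its root's mutex. */
struct node {
        struct node *root;
        pthread_mutex_t lock;   /* only meaningful on the root */
};

static void node_lock(struct node *n)   { pthread_mutex_lock(&n->root->lock); }
static void node_unlock(struct node *n) { pthread_mutex_unlock(&n->root->lock); }

int main(void)
{
        struct node root = { .root = &root, .lock = PTHREAD_MUTEX_INITIALIZER };
        struct node child = { .root = &root };

        node_lock(&child);      /* actually takes root.lock */
        printf("child locked via its root\n");
        node_unlock(&child);
        return 0;
}
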
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..566f9a481e64 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/file.h> 28#include <linux/file.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu_counter.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
32 33
33static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
233{ 234{
234 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 235 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
235 if (sbinfo->max_blocks) { 236 if (sbinfo->max_blocks) {
236 spin_lock(&sbinfo->stat_lock); 237 percpu_counter_add(&sbinfo->used_blocks, -pages);
237 sbinfo->free_blocks += pages; 238 spin_lock(&inode->i_lock);
238 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 239 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
239 spin_unlock(&sbinfo->stat_lock); 240 spin_unlock(&inode->i_lock);
240 } 241 }
241} 242}
242 243
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
416 if (sgp == SGP_READ) 417 if (sgp == SGP_READ)
417 return shmem_swp_map(ZERO_PAGE(0)); 418 return shmem_swp_map(ZERO_PAGE(0));
418 /* 419 /*
419 * Test free_blocks against 1 not 0, since we have 1 data 420 * Test used_blocks against 1 less max_blocks, since we have 1 data
420 * page (and perhaps indirect index pages) yet to allocate: 421 * page (and perhaps indirect index pages) yet to allocate:
421 * a waste to allocate index if we cannot allocate data. 422 * a waste to allocate index if we cannot allocate data.
422 */ 423 */
423 if (sbinfo->max_blocks) { 424 if (sbinfo->max_blocks) {
424 spin_lock(&sbinfo->stat_lock); 425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
425 if (sbinfo->free_blocks <= 1) {
426 spin_unlock(&sbinfo->stat_lock);
427 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
428 } 427 percpu_counter_inc(&sbinfo->used_blocks);
429 sbinfo->free_blocks--; 428 spin_lock(&inode->i_lock);
430 inode->i_blocks += BLOCKS_PER_PAGE; 429 inode->i_blocks += BLOCKS_PER_PAGE;
431 spin_unlock(&sbinfo->stat_lock); 430 spin_unlock(&inode->i_lock);
432 } 431 }
433 432
434 spin_unlock(&info->lock); 433 spin_unlock(&info->lock);
@@ -1223,6 +1222,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1223 struct shmem_sb_info *sbinfo; 1222 struct shmem_sb_info *sbinfo;
1224 struct page *filepage = *pagep; 1223 struct page *filepage = *pagep;
1225 struct page *swappage; 1224 struct page *swappage;
1225 struct page *prealloc_page = NULL;
1226 swp_entry_t *entry; 1226 swp_entry_t *entry;
1227 swp_entry_t swap; 1227 swp_entry_t swap;
1228 gfp_t gfp; 1228 gfp_t gfp;
@@ -1247,7 +1247,6 @@ repeat:
1247 filepage = find_lock_page(mapping, idx); 1247 filepage = find_lock_page(mapping, idx);
1248 if (filepage && PageUptodate(filepage)) 1248 if (filepage && PageUptodate(filepage))
1249 goto done; 1249 goto done;
1250 error = 0;
1251 gfp = mapping_gfp_mask(mapping); 1250 gfp = mapping_gfp_mask(mapping);
1252 if (!filepage) { 1251 if (!filepage) {
1253 /* 1252 /*
@@ -1258,7 +1257,19 @@ repeat:
1258 if (error) 1257 if (error)
1259 goto failed; 1258 goto failed;
1260 radix_tree_preload_end(); 1259 radix_tree_preload_end();
1260 if (sgp != SGP_READ && !prealloc_page) {
1261 /* We don't care if this fails */
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 if (mem_cgroup_cache_charge(prealloc_page,
1265 current->mm, GFP_KERNEL)) {
1266 page_cache_release(prealloc_page);
1267 prealloc_page = NULL;
1268 }
1269 }
1270 }
1261 } 1271 }
1272 error = 0;
1262 1273
1263 spin_lock(&info->lock); 1274 spin_lock(&info->lock);
1264 shmem_recalc_inode(inode); 1275 shmem_recalc_inode(inode);
@@ -1387,17 +1398,16 @@ repeat:
1387 shmem_swp_unmap(entry); 1398 shmem_swp_unmap(entry);
1388 sbinfo = SHMEM_SB(inode->i_sb); 1399 sbinfo = SHMEM_SB(inode->i_sb);
1389 if (sbinfo->max_blocks) { 1400 if (sbinfo->max_blocks) {
1390 spin_lock(&sbinfo->stat_lock); 1401 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
1391 if (sbinfo->free_blocks == 0 ||
1392 shmem_acct_block(info->flags)) { 1402 shmem_acct_block(info->flags)) {
1393 spin_unlock(&sbinfo->stat_lock);
1394 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1395 error = -ENOSPC; 1404 error = -ENOSPC;
1396 goto failed; 1405 goto failed;
1397 } 1406 }
1398 sbinfo->free_blocks--; 1407 percpu_counter_inc(&sbinfo->used_blocks);
1408 spin_lock(&inode->i_lock);
1399 inode->i_blocks += BLOCKS_PER_PAGE; 1409 inode->i_blocks += BLOCKS_PER_PAGE;
1400 spin_unlock(&sbinfo->stat_lock); 1410 spin_unlock(&inode->i_lock);
1401 } else if (shmem_acct_block(info->flags)) { 1411 } else if (shmem_acct_block(info->flags)) {
1402 spin_unlock(&info->lock); 1412 spin_unlock(&info->lock);
1403 error = -ENOSPC; 1413 error = -ENOSPC;
@@ -1407,28 +1417,38 @@ repeat:
1407 if (!filepage) { 1417 if (!filepage) {
1408 int ret; 1418 int ret;
1409 1419
1410 spin_unlock(&info->lock); 1420 if (!prealloc_page) {
1411 filepage = shmem_alloc_page(gfp, info, idx); 1421 spin_unlock(&info->lock);
1412 if (!filepage) { 1422 filepage = shmem_alloc_page(gfp, info, idx);
1413 shmem_unacct_blocks(info->flags, 1); 1423 if (!filepage) {
1414 shmem_free_blocks(inode, 1); 1424 shmem_unacct_blocks(info->flags, 1);
1415 error = -ENOMEM; 1425 shmem_free_blocks(inode, 1);
1416 goto failed; 1426 error = -ENOMEM;
1417 } 1427 goto failed;
1418 SetPageSwapBacked(filepage); 1428 }
1429 SetPageSwapBacked(filepage);
1419 1430
1420 /* Precharge page while we can wait, compensate after */ 1431 /*
1421 error = mem_cgroup_cache_charge(filepage, current->mm, 1432 * Precharge page while we can wait, compensate
1422 GFP_KERNEL); 1433 * after
1423 if (error) { 1434 */
1424 page_cache_release(filepage); 1435 error = mem_cgroup_cache_charge(filepage,
1425 shmem_unacct_blocks(info->flags, 1); 1436 current->mm, GFP_KERNEL);
1426 shmem_free_blocks(inode, 1); 1437 if (error) {
1427 filepage = NULL; 1438 page_cache_release(filepage);
1428 goto failed; 1439 shmem_unacct_blocks(info->flags, 1);
1440 shmem_free_blocks(inode, 1);
1441 filepage = NULL;
1442 goto failed;
1443 }
1444
1445 spin_lock(&info->lock);
1446 } else {
1447 filepage = prealloc_page;
1448 prealloc_page = NULL;
1449 SetPageSwapBacked(filepage);
1429 } 1450 }
1430 1451
1431 spin_lock(&info->lock);
1432 entry = shmem_swp_alloc(info, idx, sgp); 1452 entry = shmem_swp_alloc(info, idx, sgp);
1433 if (IS_ERR(entry)) 1453 if (IS_ERR(entry))
1434 error = PTR_ERR(entry); 1454 error = PTR_ERR(entry);
@@ -1469,13 +1489,19 @@ repeat:
1469 } 1489 }
1470done: 1490done:
1471 *pagep = filepage; 1491 *pagep = filepage;
1472 return 0; 1492 error = 0;
1493 goto out;
1473 1494
1474failed: 1495failed:
1475 if (*pagep != filepage) { 1496 if (*pagep != filepage) {
1476 unlock_page(filepage); 1497 unlock_page(filepage);
1477 page_cache_release(filepage); 1498 page_cache_release(filepage);
1478 } 1499 }
1500out:
1501 if (prealloc_page) {
1502 mem_cgroup_uncharge_cache_page(prealloc_page);
1503 page_cache_release(prealloc_page);
1504 }
1479 return error; 1505 return error;
1480} 1506}
1481 1507
@@ -1791,17 +1817,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1791 buf->f_type = TMPFS_MAGIC; 1817 buf->f_type = TMPFS_MAGIC;
1792 buf->f_bsize = PAGE_CACHE_SIZE; 1818 buf->f_bsize = PAGE_CACHE_SIZE;
1793 buf->f_namelen = NAME_MAX; 1819 buf->f_namelen = NAME_MAX;
1794 spin_lock(&sbinfo->stat_lock);
1795 if (sbinfo->max_blocks) { 1820 if (sbinfo->max_blocks) {
1796 buf->f_blocks = sbinfo->max_blocks; 1821 buf->f_blocks = sbinfo->max_blocks;
1797 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1822 buf->f_bavail = buf->f_bfree =
1823 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1798 } 1824 }
1799 if (sbinfo->max_inodes) { 1825 if (sbinfo->max_inodes) {
1800 buf->f_files = sbinfo->max_inodes; 1826 buf->f_files = sbinfo->max_inodes;
1801 buf->f_ffree = sbinfo->free_inodes; 1827 buf->f_ffree = sbinfo->free_inodes;
1802 } 1828 }
1803 /* else leave those fields 0 like simple_statfs */ 1829 /* else leave those fields 0 like simple_statfs */
1804 spin_unlock(&sbinfo->stat_lock);
1805 return 0; 1830 return 0;
1806} 1831}
1807 1832
@@ -2242,7 +2267,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2242{ 2267{
2243 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2268 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2244 struct shmem_sb_info config = *sbinfo; 2269 struct shmem_sb_info config = *sbinfo;
2245 unsigned long blocks;
2246 unsigned long inodes; 2270 unsigned long inodes;
2247 int error = -EINVAL; 2271 int error = -EINVAL;
2248 2272
@@ -2250,9 +2274,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2250 return error; 2274 return error;
2251 2275
2252 spin_lock(&sbinfo->stat_lock); 2276 spin_lock(&sbinfo->stat_lock);
2253 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2254 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2277 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2255 if (config.max_blocks < blocks) 2278 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2256 goto out; 2279 goto out;
2257 if (config.max_inodes < inodes) 2280 if (config.max_inodes < inodes)
2258 goto out; 2281 goto out;
@@ -2269,7 +2292,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2269 2292
2270 error = 0; 2293 error = 0;
2271 sbinfo->max_blocks = config.max_blocks; 2294 sbinfo->max_blocks = config.max_blocks;
2272 sbinfo->free_blocks = config.max_blocks - blocks;
2273 sbinfo->max_inodes = config.max_inodes; 2295 sbinfo->max_inodes = config.max_inodes;
2274 sbinfo->free_inodes = config.max_inodes - inodes; 2296 sbinfo->free_inodes = config.max_inodes - inodes;
2275 2297
@@ -2344,7 +2366,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2344#endif 2366#endif
2345 2367
2346 spin_lock_init(&sbinfo->stat_lock); 2368 spin_lock_init(&sbinfo->stat_lock);
2347 sbinfo->free_blocks = sbinfo->max_blocks; 2369 percpu_counter_init(&sbinfo->used_blocks, 0);
2348 sbinfo->free_inodes = sbinfo->max_inodes; 2370 sbinfo->free_inodes = sbinfo->max_inodes;
2349 2371
2350 sb->s_maxbytes = SHMEM_MAX_BYTES; 2372 sb->s_maxbytes = SHMEM_MAX_BYTES;
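
Editor's note: the shmem.c changes above replace the free_blocks field guarded by stat_lock with a percpu_counter of used blocks, so the common allocation path avoids a global spinlock and the exact sum is only computed where it matters (statfs, remount). A toy single-process sketch of the per-CPU-delta idea (fixed CPU count, no batching or locking; names invented):

#include <stdio.h>

#define NR_CPUS 4

struct pcpu_counter {
        long count;             /* global part */
        long pcpu[NR_CPUS];     /* per-CPU deltas, folded in lazily */
};

static void counter_add(struct pcpu_counter *c, int cpu, long delta)
{
        c->pcpu[cpu] += delta;  /* cheap: no shared cacheline in the real thing */
}

static long counter_sum(struct pcpu_counter *c)
{
        long sum = c->count;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += c->pcpu[cpu];
        return sum;             /* exact but expensive: for statfs-like paths */
}

int main(void)
{
        struct pcpu_counter used_blocks = { 0 };
        long max_blocks = 100;

        counter_add(&used_blocks, 0, 10);
        counter_add(&used_blocks, 2, 5);
        printf("used %ld of %ld blocks\n", counter_sum(&used_blocks), max_blocks);
        return 0;
}
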
diff --git a/mm/slab.c b/mm/slab.c
index 736e497733d6..88435fcc8387 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
394#define STATS_DEC_ACTIVE(x) do { } while (0) 394#define STATS_DEC_ACTIVE(x) do { } while (0)
395#define STATS_INC_ALLOCED(x) do { } while (0) 395#define STATS_INC_ALLOCED(x) do { } while (0)
396#define STATS_INC_GROWN(x) do { } while (0) 396#define STATS_INC_GROWN(x) do { } while (0)
397#define STATS_ADD_REAPED(x,y) do { } while (0) 397#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
398#define STATS_SET_HIGH(x) do { } while (0) 398#define STATS_SET_HIGH(x) do { } while (0)
399#define STATS_INC_ERR(x) do { } while (0) 399#define STATS_INC_ERR(x) do { } while (0)
400#define STATS_INC_NODEALLOCS(x) do { } while (0) 400#define STATS_INC_NODEALLOCS(x) do { } while (0)
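
Editor's note: the one-line slab.c change makes the no-op STATS_ADD_REAPED() stub still evaluate its second argument as (void)(y), which avoids "set but not used" warnings when slab statistics are compiled out. The same idiom in isolation:

#include <stdio.h>

/* When stats are disabled, still consume the argument to keep -Wunused quiet. */
#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0)

int main(void)
{
        int freed = 3;                  /* only consumed by the stats macro */

        STATS_ADD_REAPED(NULL, freed);  /* expands to (void)(freed); no real work */
        printf("ok\n");
        return 0;
}
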
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 03aa2d55f1a2..1f3f9c59a73a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
47long total_swap_pages; 47long total_swap_pages;
48static int least_priority; 48static int least_priority;
49 49
50static bool swap_for_hibernation;
51
50static const char Bad_file[] = "Bad swap file entry "; 52static const char Bad_file[] = "Bad swap file entry ";
51static const char Unused_file[] = "Unused swap file entry "; 53static const char Unused_file[] = "Unused swap file entry ";
52static const char Bad_offset[] = "Bad swap offset entry "; 54static const char Bad_offset[] = "Bad swap offset entry ";
@@ -318,8 +320,10 @@ checks:
318 if (offset > si->highest_bit) 320 if (offset > si->highest_bit)
319 scan_base = offset = si->lowest_bit; 321 scan_base = offset = si->lowest_bit;
320 322
321 /* reuse swap entry of cache-only swap if not busy. */ 323 /* reuse swap entry of cache-only swap if not hibernation. */
322 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 324 if (vm_swap_full()
325 && usage == SWAP_HAS_CACHE
326 && si->swap_map[offset] == SWAP_HAS_CACHE) {
323 int swap_was_freed; 327 int swap_was_freed;
324 spin_unlock(&swap_lock); 328 spin_unlock(&swap_lock);
325 swap_was_freed = __try_to_reclaim_swap(si, offset); 329 swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -449,6 +453,8 @@ swp_entry_t get_swap_page(void)
449 spin_lock(&swap_lock); 453 spin_lock(&swap_lock);
450 if (nr_swap_pages <= 0) 454 if (nr_swap_pages <= 0)
451 goto noswap; 455 goto noswap;
456 if (swap_for_hibernation)
457 goto noswap;
452 nr_swap_pages--; 458 nr_swap_pages--;
453 459
454 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 460 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -481,28 +487,6 @@ noswap:
481 return (swp_entry_t) {0}; 487 return (swp_entry_t) {0};
482} 488}
483 489
484/* The only caller of this function is now susupend routine */
485swp_entry_t get_swap_page_of_type(int type)
486{
487 struct swap_info_struct *si;
488 pgoff_t offset;
489
490 spin_lock(&swap_lock);
491 si = swap_info[type];
492 if (si && (si->flags & SWP_WRITEOK)) {
493 nr_swap_pages--;
494 /* This is called for allocating swap entry, not cache */
495 offset = scan_swap_map(si, 1);
496 if (offset) {
497 spin_unlock(&swap_lock);
498 return swp_entry(type, offset);
499 }
500 nr_swap_pages++;
501 }
502 spin_unlock(&swap_lock);
503 return (swp_entry_t) {0};
504}
505
506static struct swap_info_struct *swap_info_get(swp_entry_t entry) 490static struct swap_info_struct *swap_info_get(swp_entry_t entry)
507{ 491{
508 struct swap_info_struct *p; 492 struct swap_info_struct *p;
@@ -762,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
762#endif 746#endif
763 747
764#ifdef CONFIG_HIBERNATION 748#ifdef CONFIG_HIBERNATION
749
750static pgoff_t hibernation_offset[MAX_SWAPFILES];
751/*
752 * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
 753 * the saved swap_map[] image on disk would be incomplete, because it
 754 * would keep changing without synchronization with the hibernation snapshot.
 755 * At resume, we just set swap_for_hibernation=false, so the entries used
 756 * in the meantime can be forgotten easily.
757 */
758void hibernation_freeze_swap(void)
759{
760 int i;
761
762 spin_lock(&swap_lock);
763
764 printk(KERN_INFO "PM: Freeze Swap\n");
765 swap_for_hibernation = true;
766 for (i = 0; i < MAX_SWAPFILES; i++)
767 hibernation_offset[i] = 1;
768 spin_unlock(&swap_lock);
769}
770
771void hibernation_thaw_swap(void)
772{
773 spin_lock(&swap_lock);
774 if (swap_for_hibernation) {
775 printk(KERN_INFO "PM: Thaw Swap\n");
776 swap_for_hibernation = false;
777 }
778 spin_unlock(&swap_lock);
779}
780
781/*
 782 * Because updating swap_map[] would change state that is not saved in
 783 * the snapshot, we use our own simple allocator.
 784 * Please see kernel/power/swap.c; used swap entries are recorded in
 785 * an RB-tree.
786 */
787swp_entry_t get_swap_for_hibernation(int type)
788{
789 pgoff_t off;
790 swp_entry_t val = {0};
791 struct swap_info_struct *si;
792
793 spin_lock(&swap_lock);
794
795 si = swap_info[type];
796 if (!si || !(si->flags & SWP_WRITEOK))
797 goto done;
798
799 for (off = hibernation_offset[type]; off < si->max; ++off) {
800 if (!si->swap_map[off])
801 break;
802 }
803 if (off < si->max) {
804 val = swp_entry(type, off);
805 hibernation_offset[type] = off + 1;
806 }
807done:
808 spin_unlock(&swap_lock);
809 return val;
810}
811
812void swap_free_for_hibernation(swp_entry_t ent)
813{
814 /* Nothing to do */
815}
816
765/* 817/*
766 * Find the swap type that corresponds to given device (if any). 818 * Find the swap type that corresponds to given device (if any).
767 * 819 *
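
Editor's note: get_swap_for_hibernation() above deliberately avoids touching swap_map[]: it scans forward from a remembered per-device offset (hibernation_offset[]) to the next unused slot, so the swap_map[] image written to disk stays consistent with the snapshot. A minimal sketch of that linear allocator over a plain array (no locking; names invented):

#include <stdio.h>

#define MAX_SLOTS 8

static unsigned char swap_map[MAX_SLOTS] = { 1, 1, 0, 1, 0, 0, 1, 0 };
static long next_offset = 1;    /* like hibernation_offset[]: start past slot 0 */

/* Return the next free slot without marking it used in swap_map[]. */
static long get_slot_for_hibernation(void)
{
        long off;

        for (off = next_offset; off < MAX_SLOTS; off++)
                if (!swap_map[off])
                        break;
        if (off >= MAX_SLOTS)
                return -1;      /* nothing free */
        next_offset = off + 1;  /* remember where to resume scanning */
        return off;
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                printf("allocated slot %ld\n", get_slot_for_hibernation());
        return 0;
}
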
diff --git a/mm/util.c b/mm/util.c
index f5712e8964be..4735ea481816 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
225 if (length > n) 225 if (length > n)
226 return ERR_PTR(-EINVAL); 226 return ERR_PTR(-EINVAL);
227 227
228 p = kmalloc(length, GFP_KERNEL); 228 p = memdup_user(s, length);
229 229
230 if (!p) 230 if (IS_ERR(p))
231 return ERR_PTR(-ENOMEM); 231 return p;
232
233 if (copy_from_user(p, s, length)) {
234 kfree(p);
235 return ERR_PTR(-EFAULT);
236 }
237 232
238 p[length - 1] = '\0'; 233 p[length - 1] = '\0';
239 234
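
Editor's note: strndup_user() above drops its open-coded kmalloc()/copy_from_user() pair in favour of memdup_user(), which already handles both the allocation-failure and partial-copy error paths. A userspace analogue of why the helper is preferable (plain malloc/memcpy stand in for the kernel primitives):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Rough stand-in for memdup_user(): allocate and copy, or return NULL. */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

static char *strndup_simple(const char *s, size_t n)
{
        size_t length = strnlen(s, n) + 1;      /* include the terminator */
        char *p = memdup(s, length);

        if (!p)
                return NULL;
        p[length - 1] = '\0';                   /* force termination, as the kernel does */
        return p;
}

int main(void)
{
        char *copy = strndup_simple("hello world", 5);

        printf("%s\n", copy ? copy : "(alloc failed)");
        free(copy);
        return 0;
}
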
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b7e314b1009f..918c51335d64 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -732,7 +732,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
732 node, gfp_mask); 732 node, gfp_mask);
733 if (unlikely(IS_ERR(va))) { 733 if (unlikely(IS_ERR(va))) {
734 kfree(vb); 734 kfree(vb);
735 return ERR_PTR(PTR_ERR(va)); 735 return ERR_CAST(va);
736 } 736 }
737 737
738 err = radix_tree_preload(gfp_mask); 738 err = radix_tree_preload(gfp_mask);
@@ -2437,8 +2437,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2437 unsigned int *ptr = NULL; 2437 unsigned int *ptr = NULL;
2438 int ret; 2438 int ret;
2439 2439
2440 if (NUMA_BUILD) 2440 if (NUMA_BUILD) {
2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2442 if (ptr == NULL)
2443 return -ENOMEM;
2444 }
2442 ret = seq_open(file, &vmalloc_op); 2445 ret = seq_open(file, &vmalloc_op);
2443 if (!ret) { 2446 if (!ret) {
2444 struct seq_file *m = file->private_data; 2447 struct seq_file *m = file->private_data;
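
Editor's note: the vmalloc.c hunks replace ERR_PTR(PTR_ERR(va)) with ERR_CAST(va), which hands the encoded errno through without decoding and re-encoding it, and add the missing -ENOMEM check after kmalloc() in vmalloc_open(). A compact userspace sketch of the error-pointer encoding these macros rely on (simplified re-implementations, not the kernel headers):

#include <stdio.h>

/* Simplified helpers in the spirit of include/linux/err.h. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err)       { return (void *)err; }
static inline long  PTR_ERR(const void *p)  { return (long)p; }
static inline int   IS_ERR(const void *p)   { return (unsigned long)p >= (unsigned long)-MAX_ERRNO; }
static inline void *ERR_CAST(const void *p) { return (void *)p; }  /* just a cast */

struct vmap_area  { int dummy; };
struct vmap_block { int dummy; };

static struct vmap_block *new_block(struct vmap_area *va)
{
        if (IS_ERR(va))
                return ERR_CAST(va);    /* propagate the encoded errno unchanged */
        return NULL;                    /* real allocation elided */
}

int main(void)
{
        struct vmap_block *vb = new_block(ERR_PTR(-12 /* ENOMEM */));

        if (IS_ERR(vb))
                printf("propagated error %ld\n", PTR_ERR(vb));
        return 0;
}
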
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..ec5ddccbf82e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
48 48
49#include "internal.h" 49#include "internal.h"
50 50
51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h>
53
51struct scan_control { 54struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 55 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 56 unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
398 /* synchronous write or broken a_ops? */ 401 /* synchronous write or broken a_ops? */
399 ClearPageReclaim(page); 402 ClearPageReclaim(page);
400 } 403 }
404 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback));
401 inc_zone_page_state(page, NR_VMSCAN_WRITE); 406 inc_zone_page_state(page, NR_VMSCAN_WRITE);
402 return PAGE_SUCCESS; 407 return PAGE_SUCCESS;
403 } 408 }
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
617 return PAGEREF_RECLAIM; 622 return PAGEREF_RECLAIM;
618} 623}
619 624
625static noinline_for_stack void free_page_list(struct list_head *free_pages)
626{
627 struct pagevec freed_pvec;
628 struct page *page, *tmp;
629
630 pagevec_init(&freed_pvec, 1);
631
632 list_for_each_entry_safe(page, tmp, free_pages, lru) {
633 list_del(&page->lru);
634 if (!pagevec_add(&freed_pvec, page)) {
635 __pagevec_free(&freed_pvec);
636 pagevec_reinit(&freed_pvec);
637 }
638 }
639
640 pagevec_free(&freed_pvec);
641}
642
620/* 643/*
621 * shrink_page_list() returns the number of reclaimed pages 644 * shrink_page_list() returns the number of reclaimed pages
622 */ 645 */
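
Editor's note: free_page_list() above lets shrink_page_list() collect reclaimed pages on a local list and release them through a pagevec in batches, instead of interleaving pagevec bookkeeping with the main reclaim loop. A small userspace sketch of the same drain-in-batches pattern (a fixed-size buffer stands in for the pagevec):

#include <stdio.h>

#define BATCH 4         /* stands in for PAGEVEC_SIZE */

static void flush(const int *buf, int n)
{
        /* In the kernel this would be __pagevec_free(). */
        for (int i = 0; i < n; i++)
                printf("freeing page %d\n", buf[i]);
}

static void free_page_list(const int *pages, int nr)
{
        int buf[BATCH];
        int used = 0;

        for (int i = 0; i < nr; i++) {
                buf[used++] = pages[i];
                if (used == BATCH) {    /* batch full: release and start over */
                        flush(buf, used);
                        used = 0;
                }
        }
        flush(buf, used);               /* release the remainder */
}

int main(void)
{
        int pages[] = { 10, 11, 12, 13, 14, 15 };

        free_page_list(pages, 6);
        return 0;
}
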
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 enum pageout_io sync_writeback) 648 enum pageout_io sync_writeback)
626{ 649{
627 LIST_HEAD(ret_pages); 650 LIST_HEAD(ret_pages);
628 struct pagevec freed_pvec; 651 LIST_HEAD(free_pages);
629 int pgactivate = 0; 652 int pgactivate = 0;
630 unsigned long nr_reclaimed = 0; 653 unsigned long nr_reclaimed = 0;
631 654
632 cond_resched(); 655 cond_resched();
633 656
634 pagevec_init(&freed_pvec, 1);
635 while (!list_empty(page_list)) { 657 while (!list_empty(page_list)) {
636 enum page_references references; 658 enum page_references references;
637 struct address_space *mapping; 659 struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 __clear_page_locked(page); 828 __clear_page_locked(page);
807free_it: 829free_it:
808 nr_reclaimed++; 830 nr_reclaimed++;
809 if (!pagevec_add(&freed_pvec, page)) { 831
810 __pagevec_free(&freed_pvec); 832 /*
811 pagevec_reinit(&freed_pvec); 833 * Is there need to periodically free_page_list? It would
812 } 834 * appear not as the counts should be low
835 */
836 list_add(&page->lru, &free_pages);
813 continue; 837 continue;
814 838
815cull_mlocked: 839cull_mlocked:
@@ -832,9 +856,10 @@ keep:
832 list_add(&page->lru, &ret_pages); 856 list_add(&page->lru, &ret_pages);
833 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
834 } 858 }
859
860 free_page_list(&free_pages);
861
835 list_splice(&ret_pages, page_list); 862 list_splice(&ret_pages, page_list);
836 if (pagevec_count(&freed_pvec))
837 __pagevec_free(&freed_pvec);
838 count_vm_events(PGACTIVATE, pgactivate); 863 count_vm_events(PGACTIVATE, pgactivate);
839 return nr_reclaimed; 864 return nr_reclaimed;
840} 865}
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
916 unsigned long *scanned, int order, int mode, int file) 941 unsigned long *scanned, int order, int mode, int file)
917{ 942{
918 unsigned long nr_taken = 0; 943 unsigned long nr_taken = 0;
944 unsigned long nr_lumpy_taken = 0;
945 unsigned long nr_lumpy_dirty = 0;
946 unsigned long nr_lumpy_failed = 0;
919 unsigned long scan; 947 unsigned long scan;
920 948
921 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 949 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
993 list_move(&cursor_page->lru, dst); 1021 list_move(&cursor_page->lru, dst);
994 mem_cgroup_del_lru(cursor_page); 1022 mem_cgroup_del_lru(cursor_page);
995 nr_taken++; 1023 nr_taken++;
1024 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++;
996 scan++; 1027 scan++;
1028 } else {
1029 if (mode == ISOLATE_BOTH &&
1030 page_count(cursor_page))
1031 nr_lumpy_failed++;
997 } 1032 }
998 } 1033 }
999 } 1034 }
1000 1035
1001 *scanned = scan; 1036 *scanned = scan;
1037
1038 trace_mm_vmscan_lru_isolate(order,
1039 nr_to_scan, scan,
1040 nr_taken,
1041 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1042 mode);
1002 return nr_taken; 1043 return nr_taken;
1003} 1044}
1004 1045
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1035 ClearPageActive(page); 1076 ClearPageActive(page);
1036 nr_active++; 1077 nr_active++;
1037 } 1078 }
1038 count[lru]++; 1079 if (count)
1080 count[lru]++;
1039 } 1081 }
1040 1082
1041 return nr_active; 1083 return nr_active;
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
1112} 1154}
1113 1155
1114/* 1156/*
1115 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1157 * TODO: Try merging with migrations version of putback_lru_pages
1116 * of reclaimed pages
1117 */ 1158 */
1118static unsigned long shrink_inactive_list(unsigned long max_scan, 1159static noinline_for_stack void
1119 struct zone *zone, struct scan_control *sc, 1160putback_lru_pages(struct zone *zone, struct scan_control *sc,
1120 int priority, int file) 1161 unsigned long nr_anon, unsigned long nr_file,
1162 struct list_head *page_list)
1121{ 1163{
1122 LIST_HEAD(page_list); 1164 struct page *page;
1123 struct pagevec pvec; 1165 struct pagevec pvec;
1124 unsigned long nr_scanned = 0;
1125 unsigned long nr_reclaimed = 0;
1126 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1166 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1127 1167
1128 while (unlikely(too_many_isolated(zone, file, sc))) { 1168 pagevec_init(&pvec, 1);
1129 congestion_wait(BLK_RW_ASYNC, HZ/10);
1130 1169
1131 /* We are about to die and free our memory. Return now. */ 1170 /*
1132 if (fatal_signal_pending(current)) 1171 * Put back any unfreeable pages.
1133 return SWAP_CLUSTER_MAX; 1172 */
1173 spin_lock(&zone->lru_lock);
1174 while (!list_empty(page_list)) {
1175 int lru;
1176 page = lru_to_page(page_list);
1177 VM_BUG_ON(PageLRU(page));
1178 list_del(&page->lru);
1179 if (unlikely(!page_evictable(page, NULL))) {
1180 spin_unlock_irq(&zone->lru_lock);
1181 putback_lru_page(page);
1182 spin_lock_irq(&zone->lru_lock);
1183 continue;
1184 }
1185 SetPageLRU(page);
1186 lru = page_lru(page);
1187 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++;
1191 }
1192 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock);
1194 __pagevec_release(&pvec);
1195 spin_lock_irq(&zone->lru_lock);
1196 }
1134 } 1197 }
1198 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1199 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1135 1200
1201 spin_unlock_irq(&zone->lru_lock);
1202 pagevec_release(&pvec);
1203}
1136 1204
1137 pagevec_init(&pvec, 1); 1205static noinline_for_stack void update_isolated_counts(struct zone *zone,
1206 struct scan_control *sc,
1207 unsigned long *nr_anon,
1208 unsigned long *nr_file,
1209 struct list_head *isolated_list)
1210{
1211 unsigned long nr_active;
1212 unsigned int count[NR_LRU_LISTS] = { 0, };
1213 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1138 1214
1139 lru_add_drain(); 1215 nr_active = clear_active_flags(isolated_list, count);
1140 spin_lock_irq(&zone->lru_lock); 1216 __count_vm_events(PGDEACTIVATE, nr_active);
1141 do {
1142 struct page *page;
1143 unsigned long nr_taken;
1144 unsigned long nr_scan;
1145 unsigned long nr_freed;
1146 unsigned long nr_active;
1147 unsigned int count[NR_LRU_LISTS] = { 0, };
1148 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1149 unsigned long nr_anon;
1150 unsigned long nr_file;
1151 1217
1152 if (scanning_global_lru(sc)) { 1218 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1153 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, 1219 -count[LRU_ACTIVE_FILE]);
1154 &page_list, &nr_scan, 1220 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1155 sc->order, mode, 1221 -count[LRU_INACTIVE_FILE]);
1156 zone, 0, file); 1222 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1157 zone->pages_scanned += nr_scan; 1223 -count[LRU_ACTIVE_ANON]);
1158 if (current_is_kswapd()) 1224 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1159 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1225 -count[LRU_INACTIVE_ANON]);
1160 nr_scan);
1161 else
1162 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1163 nr_scan);
1164 } else {
1165 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1166 &page_list, &nr_scan,
1167 sc->order, mode,
1168 zone, sc->mem_cgroup,
1169 0, file);
1170 /*
1171 * mem_cgroup_isolate_pages() keeps track of
1172 * scanned pages on its own.
1173 */
1174 }
1175 1226
1176 if (nr_taken == 0) 1227 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1177 goto done; 1228 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1229 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1230 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1231
1232 reclaim_stat->recent_scanned[0] += *nr_anon;
1233 reclaim_stat->recent_scanned[1] += *nr_file;
1234}
1178 1235
1179 nr_active = clear_active_flags(&page_list, count); 1236/*
1180 __count_vm_events(PGDEACTIVATE, nr_active); 1237 * Returns true if the caller should wait to clean dirty/writeback pages.
1238 *
1239 * If we are direct reclaiming for contiguous pages and we do not reclaim
1240 * everything in the list, try again and wait for writeback IO to complete.
1241 * This will stall high-order allocations noticeably. Only do that when we
 1242 * really need to free pages under high memory pressure.
1243 */
1244static inline bool should_reclaim_stall(unsigned long nr_taken,
1245 unsigned long nr_freed,
1246 int priority,
1247 struct scan_control *sc)
1248{
1249 int lumpy_stall_priority;
1181 1250
1182 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1251 /* kswapd should not stall on sync IO */
1183 -count[LRU_ACTIVE_FILE]); 1252 if (current_is_kswapd())
1184 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1253 return false;
1185 -count[LRU_INACTIVE_FILE]);
1186 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1187 -count[LRU_ACTIVE_ANON]);
1188 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1189 -count[LRU_INACTIVE_ANON]);
1190 1254
1191 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1255 /* Only stall on lumpy reclaim */
1192 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1256 if (!sc->lumpy_reclaim_mode)
1193 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1257 return false;
1194 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1195 1258
1196 reclaim_stat->recent_scanned[0] += nr_anon; 1259 /* If we have reclaimed everything on the isolated list, no stall */
1197 reclaim_stat->recent_scanned[1] += nr_file; 1260 if (nr_freed == nr_taken)
1261 return false;
1198 1262
1199 spin_unlock_irq(&zone->lru_lock); 1263 /*
1264 * For high-order allocations, there are two stall thresholds.
1265 * High-cost allocations stall immediately whereas lower
1266 * order allocations such as stacks require the scanning
1267 * priority to be much higher before stalling.
1268 */
1269 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1270 lumpy_stall_priority = DEF_PRIORITY;
1271 else
1272 lumpy_stall_priority = DEF_PRIORITY / 3;
1273
1274 return priority <= lumpy_stall_priority;
1275}
1276
1277/*
1278 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1279 * of reclaimed pages
1280 */
1281static noinline_for_stack unsigned long
1282shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1283 struct scan_control *sc, int priority, int file)
1284{
1285 LIST_HEAD(page_list);
1286 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon;
1291 unsigned long nr_file;
1200 1292
1201 nr_scanned += nr_scan; 1293 while (unlikely(too_many_isolated(zone, file, sc))) {
1202 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1294 congestion_wait(BLK_RW_ASYNC, HZ/10);
1295
1296 /* We are about to die and free our memory. Return now. */
1297 if (fatal_signal_pending(current))
1298 return SWAP_CLUSTER_MAX;
1299 }
1203 1300
1301
1302 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock);
1304
1305 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file);
1311 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd())
1313 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1314 nr_scanned);
1315 else
1316 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1317 nr_scanned);
1318 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup,
1324 0, file);
1204 /* 1325 /*
1205 * If we are direct reclaiming for contiguous pages and we do 1326 * mem_cgroup_isolate_pages() keeps track of
1206 * not reclaim everything in the list, try again and wait 1327 * scanned pages on its own.
1207 * for IO to complete. This will stall high-order allocations
1208 * but that should be acceptable to the caller
1209 */ 1328 */
1210 if (nr_freed < nr_taken && !current_is_kswapd() && 1329 }
1211 sc->lumpy_reclaim_mode) {
1212 congestion_wait(BLK_RW_ASYNC, HZ/10);
1213 1330
1214 /* 1331 if (nr_taken == 0) {
1215 * The attempt at page out may have made some 1332 spin_unlock_irq(&zone->lru_lock);
1216 * of the pages active, mark them inactive again. 1333 return 0;
1217 */ 1334 }
1218 nr_active = clear_active_flags(&page_list, count);
1219 count_vm_events(PGDEACTIVATE, nr_active);
1220 1335
1221 nr_freed += shrink_page_list(&page_list, sc, 1336 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1222 PAGEOUT_IO_SYNC);
1223 }
1224 1337
1225 nr_reclaimed += nr_freed; 1338 spin_unlock_irq(&zone->lru_lock);
1226 1339
1227 local_irq_disable(); 1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
1228 if (current_is_kswapd()) 1341
1229 __count_vm_events(KSWAPD_STEAL, nr_freed); 1342 /* Check if we should synchronously wait for writeback */
1230 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10);
1231 1345
1232 spin_lock(&zone->lru_lock);
1233 /* 1346 /*
1234 * Put back any unfreeable pages. 1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1235 */ 1349 */
1236 while (!list_empty(&page_list)) { 1350 nr_active = clear_active_flags(&page_list, NULL);
1237 int lru; 1351 count_vm_events(PGDEACTIVATE, nr_active);
1238 page = lru_to_page(&page_list);
1239 VM_BUG_ON(PageLRU(page));
1240 list_del(&page->lru);
1241 if (unlikely(!page_evictable(page, NULL))) {
1242 spin_unlock_irq(&zone->lru_lock);
1243 putback_lru_page(page);
1244 spin_lock_irq(&zone->lru_lock);
1245 continue;
1246 }
1247 SetPageLRU(page);
1248 lru = page_lru(page);
1249 add_page_to_lru_list(zone, page, lru);
1250 if (is_active_lru(lru)) {
1251 int file = is_file_lru(lru);
1252 reclaim_stat->recent_rotated[file]++;
1253 }
1254 if (!pagevec_add(&pvec, page)) {
1255 spin_unlock_irq(&zone->lru_lock);
1256 __pagevec_release(&pvec);
1257 spin_lock_irq(&zone->lru_lock);
1258 }
1259 }
1260 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1261 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1262 1352
1263 } while (nr_scanned < max_scan); 1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 }
1264 1355
1265done: 1356 local_irq_disable();
1266 spin_unlock_irq(&zone->lru_lock); 1357 if (current_is_kswapd())
1267 pagevec_release(&pvec); 1358 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1268 return nr_reclaimed; 1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1269}
1270 1360
1271/* 1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1272 * We are about to scan this zone at a certain priority level. If that priority 1362 return nr_reclaimed;
1273 * level is smaller (ie: more urgent) than the previous priority, then note
1274 * that priority level within the zone. This is done so that when the next
1275 * process comes in to scan this zone, it will immediately start out at this
1276 * priority level rather than having to build up its own scanning priority.
1277 * Here, this priority affects only the reclaim-mapped threshold.
1278 */
1279static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1280{
1281 if (priority < zone->prev_priority)
1282 zone->prev_priority = priority;
1283} 1363}
1284 1364
1285/* 1365/*
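To make the new stall heuristic concrete, here is a minimal user-space sketch of the threshold arithmetic in should_reclaim_stall() above. DEF_PRIORITY == 12 and PAGE_ALLOC_COSTLY_ORDER == 3 are assumed to match this kernel; the program is illustrative only and not part of the patch.

    #include <stdio.h>

    #define DEF_PRIORITY            12      /* assumed value for this kernel */
    #define PAGE_ALLOC_COSTLY_ORDER 3       /* assumed value */

    /* Mirrors the threshold selection in should_reclaim_stall() above. */
    static int lumpy_stall_priority(int order)
    {
            return order > PAGE_ALLOC_COSTLY_ORDER ? DEF_PRIORITY : DEF_PRIORITY / 3;
    }

    int main(void)
    {
            /* A costly order-9 request may stall at any scanning priority. */
            printf("order 9: stall when priority <= %d\n", lumpy_stall_priority(9));
            /* A small order-2 request only stalls under real memory pressure. */
            printf("order 2: stall when priority <= %d\n", lumpy_stall_priority(2));
            return 0;
    }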
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1583 } 1663 }
1584 1664
1585 /* 1665 /*
1666 * With swappiness at 100, anonymous and file have the same priority.
1667 * This scanning priority is essentially the inverse of IO cost.
1668 */
1669 anon_prio = sc->swappiness;
1670 file_prio = 200 - sc->swappiness;
1671
1672 /*
1586 * OK, so we have swap space and a fair amount of page cache 1673 * OK, so we have swap space and a fair amount of page cache
1587 * pages. We use the recently rotated / recently scanned 1674 * pages. We use the recently rotated / recently scanned
1588 * ratios to determine how valuable each cache is. 1675 * ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1593 * 1680 *
1594 * anon in [0], file in [1] 1681 * anon in [0], file in [1]
1595 */ 1682 */
1683 spin_lock_irq(&zone->lru_lock);
1596 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1684 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1597 spin_lock_irq(&zone->lru_lock);
1598 reclaim_stat->recent_scanned[0] /= 2; 1685 reclaim_stat->recent_scanned[0] /= 2;
1599 reclaim_stat->recent_rotated[0] /= 2; 1686 reclaim_stat->recent_rotated[0] /= 2;
1600 spin_unlock_irq(&zone->lru_lock);
1601 } 1687 }
1602 1688
1603 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1689 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1604 spin_lock_irq(&zone->lru_lock);
1605 reclaim_stat->recent_scanned[1] /= 2; 1690 reclaim_stat->recent_scanned[1] /= 2;
1606 reclaim_stat->recent_rotated[1] /= 2; 1691 reclaim_stat->recent_rotated[1] /= 2;
1607 spin_unlock_irq(&zone->lru_lock);
1608 } 1692 }
1609 1693
1610 /* 1694 /*
1611 * With swappiness at 100, anonymous and file have the same priority.
1612 * This scanning priority is essentially the inverse of IO cost.
1613 */
1614 anon_prio = sc->swappiness;
1615 file_prio = 200 - sc->swappiness;
1616
1617 /*
1618 * The amount of pressure on anon vs file pages is inversely 1695 * The amount of pressure on anon vs file pages is inversely
1619 * proportional to the fraction of recently scanned pages on 1696 * proportional to the fraction of recently scanned pages on
1620 * each list that were recently referenced and in active use. 1697 * each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1624 1701
1625 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1702 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1626 fp /= reclaim_stat->recent_rotated[1] + 1; 1703 fp /= reclaim_stat->recent_rotated[1] + 1;
1704 spin_unlock_irq(&zone->lru_lock);
1627 1705
1628 fraction[0] = ap; 1706 fraction[0] = ap;
1629 fraction[1] = fp; 1707 fraction[1] = fp;
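The reworked locking above leaves the ap/fp calculation itself unchanged. A stand-alone sketch of that arithmetic, with a made-up swappiness value and invented recent_scanned/recent_rotated statistics (only the formula mirrors get_scan_count()):

    #include <stdio.h>

    /* Stand-alone sketch of the anon/file pressure ratio in get_scan_count().
     * The reclaim statistics below are made up for illustration. */
    int main(void)
    {
            unsigned int swappiness = 60;                   /* default vm.swappiness */
            unsigned long anon_prio = swappiness;           /* 60 */
            unsigned long file_prio = 200 - swappiness;     /* 140 */

            unsigned long recent_scanned[2] = { 4000, 12000 };  /* [0]=anon, [1]=file */
            unsigned long recent_rotated[2] = { 3000, 1000 };

            unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1) /
                               (recent_rotated[0] + 1);
            unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1) /
                               (recent_rotated[1] + 1);

            /* Heavily rotated (recently referenced) anon pages get less pressure. */
            printf("fraction[0] (anon) = %lu, fraction[1] (file) = %lu\n", ap, fp);
            return 0;
    }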
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
1729static bool shrink_zones(int priority, struct zonelist *zonelist, 1807static bool shrink_zones(int priority, struct zonelist *zonelist,
1730 struct scan_control *sc) 1808 struct scan_control *sc)
1731{ 1809{
1732 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1733 struct zoneref *z; 1810 struct zoneref *z;
1734 struct zone *zone; 1811 struct zone *zone;
1735 bool all_unreclaimable = true; 1812 bool all_unreclaimable = true;
1736 1813
1737 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1814 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1738 sc->nodemask) { 1815 gfp_zone(sc->gfp_mask), sc->nodemask) {
1739 if (!populated_zone(zone)) 1816 if (!populated_zone(zone))
1740 continue; 1817 continue;
1741 /* 1818 /*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1745 if (scanning_global_lru(sc)) { 1822 if (scanning_global_lru(sc)) {
1746 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1823 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1747 continue; 1824 continue;
1748 note_zone_scanning_priority(zone, priority);
1749
1750 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1825 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1751 continue; /* Let kswapd poll it */ 1826 continue; /* Let kswapd poll it */
1752 } else {
1753 /*
1754 * Ignore cpuset limitation here. We just want to reduce
1755 * # of used pages by us regardless of memory shortage.
1756 */
1757 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1758 priority);
1759 } 1827 }
1760 1828
1761 shrink_zone(priority, zone, sc); 1829 shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1787 bool all_unreclaimable; 1855 bool all_unreclaimable;
1788 unsigned long total_scanned = 0; 1856 unsigned long total_scanned = 0;
1789 struct reclaim_state *reclaim_state = current->reclaim_state; 1857 struct reclaim_state *reclaim_state = current->reclaim_state;
1790 unsigned long lru_pages = 0;
1791 struct zoneref *z; 1858 struct zoneref *z;
1792 struct zone *zone; 1859 struct zone *zone;
1793 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1794 unsigned long writeback_threshold; 1860 unsigned long writeback_threshold;
1795 1861
1796 get_mems_allowed(); 1862 get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1798 1864
1799 if (scanning_global_lru(sc)) 1865 if (scanning_global_lru(sc))
1800 count_vm_event(ALLOCSTALL); 1866 count_vm_event(ALLOCSTALL);
1801 /*
1802 * mem_cgroup will not do shrink_slab.
1803 */
1804 if (scanning_global_lru(sc)) {
1805 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1806
1807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1808 continue;
1809
1810 lru_pages += zone_reclaimable_pages(zone);
1811 }
1812 }
1813 1867
1814 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1868 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1815 sc->nr_scanned = 0; 1869 sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1821 * over limit cgroups 1875 * over limit cgroups
1822 */ 1876 */
1823 if (scanning_global_lru(sc)) { 1877 if (scanning_global_lru(sc)) {
1878 unsigned long lru_pages = 0;
1879 for_each_zone_zonelist(zone, z, zonelist,
1880 gfp_zone(sc->gfp_mask)) {
1881 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1882 continue;
1883
1884 lru_pages += zone_reclaimable_pages(zone);
1885 }
1886
1824 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1887 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1825 if (reclaim_state) { 1888 if (reclaim_state) {
1826 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 1889 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
1861 if (priority < 0) 1924 if (priority < 0)
1862 priority = 0; 1925 priority = 0;
1863 1926
1864 if (scanning_global_lru(sc)) {
1865 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1866
1867 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1868 continue;
1869
1870 zone->prev_priority = priority;
1871 }
1872 } else
1873 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1874
1875 delayacct_freepages_end(); 1927 delayacct_freepages_end();
1876 put_mems_allowed(); 1928 put_mems_allowed();
1877 1929
@@ -1888,6 +1940,7 @@ out:
1888unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1940unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1889 gfp_t gfp_mask, nodemask_t *nodemask) 1941 gfp_t gfp_mask, nodemask_t *nodemask)
1890{ 1942{
1943 unsigned long nr_reclaimed;
1891 struct scan_control sc = { 1944 struct scan_control sc = {
1892 .gfp_mask = gfp_mask, 1945 .gfp_mask = gfp_mask,
1893 .may_writepage = !laptop_mode, 1946 .may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1900 .nodemask = nodemask, 1953 .nodemask = nodemask,
1901 }; 1954 };
1902 1955
1903 return do_try_to_free_pages(zonelist, &sc); 1956 trace_mm_vmscan_direct_reclaim_begin(order,
1957 sc.may_writepage,
1958 gfp_mask);
1959
1960 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
1961
1962 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1963
1964 return nr_reclaimed;
1904} 1965}
1905 1966
1906#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1967#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1925,6 +1986,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1925 sc.nodemask = &nm; 1986 sc.nodemask = &nm;
1926 sc.nr_reclaimed = 0; 1987 sc.nr_reclaimed = 0;
1927 sc.nr_scanned = 0; 1988 sc.nr_scanned = 0;
1989
1990 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
1991 sc.may_writepage,
1992 sc.gfp_mask);
1993
1928 /* 1994 /*
1929 * NOTE: Although we can get the priority field, using it 1995 * NOTE: Although we can get the priority field, using it
1930 * here is not a good idea, since it limits the pages we can scan. 1996 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1999,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1933 * the priority and make it zero. 1999 * the priority and make it zero.
1934 */ 2000 */
1935 shrink_zone(0, zone, &sc); 2001 shrink_zone(0, zone, &sc);
2002
2003 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2004
1936 return sc.nr_reclaimed; 2005 return sc.nr_reclaimed;
1937} 2006}
1938 2007
@@ -1942,6 +2011,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1942 unsigned int swappiness) 2011 unsigned int swappiness)
1943{ 2012{
1944 struct zonelist *zonelist; 2013 struct zonelist *zonelist;
2014 unsigned long nr_reclaimed;
1945 struct scan_control sc = { 2015 struct scan_control sc = {
1946 .may_writepage = !laptop_mode, 2016 .may_writepage = !laptop_mode,
1947 .may_unmap = 1, 2017 .may_unmap = 1,
@@ -1956,7 +2026,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1956 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2026 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1957 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2027 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1958 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2028 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1959 return do_try_to_free_pages(zonelist, &sc); 2029
2030 trace_mm_vmscan_memcg_reclaim_begin(0,
2031 sc.may_writepage,
2032 sc.gfp_mask);
2033
2034 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2035
2036 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2037
2038 return nr_reclaimed;
1960} 2039}
1961#endif 2040#endif
1962 2041
@@ -2028,22 +2107,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2028 .order = order, 2107 .order = order,
2029 .mem_cgroup = NULL, 2108 .mem_cgroup = NULL,
2030 }; 2109 };
2031 /*
2032 * temp_priority is used to remember the scanning priority at which
2033 * this zone was successfully refilled to
2034 * free_pages == high_wmark_pages(zone).
2035 */
2036 int temp_priority[MAX_NR_ZONES];
2037
2038loop_again: 2110loop_again:
2039 total_scanned = 0; 2111 total_scanned = 0;
2040 sc.nr_reclaimed = 0; 2112 sc.nr_reclaimed = 0;
2041 sc.may_writepage = !laptop_mode; 2113 sc.may_writepage = !laptop_mode;
2042 count_vm_event(PAGEOUTRUN); 2114 count_vm_event(PAGEOUTRUN);
2043 2115
2044 for (i = 0; i < pgdat->nr_zones; i++)
2045 temp_priority[i] = DEF_PRIORITY;
2046
2047 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2116 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2048 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2117 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2049 unsigned long lru_pages = 0; 2118 unsigned long lru_pages = 0;
@@ -2111,9 +2180,7 @@ loop_again:
2111 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2180 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2112 continue; 2181 continue;
2113 2182
2114 temp_priority[i] = priority;
2115 sc.nr_scanned = 0; 2183 sc.nr_scanned = 0;
2116 note_zone_scanning_priority(zone, priority);
2117 2184
2118 nid = pgdat->node_id; 2185 nid = pgdat->node_id;
2119 zid = zone_idx(zone); 2186 zid = zone_idx(zone);
@@ -2186,16 +2253,6 @@ loop_again:
2186 break; 2253 break;
2187 } 2254 }
2188out: 2255out:
2189 /*
2190 * Note within each zone the priority level at which this zone was
2191 * brought into a happy state. So that the next thread which scans this
2192 * zone will start out at that priority level.
2193 */
2194 for (i = 0; i < pgdat->nr_zones; i++) {
2195 struct zone *zone = pgdat->node_zones + i;
2196
2197 zone->prev_priority = temp_priority[i];
2198 }
2199 if (!all_zones_ok) { 2256 if (!all_zones_ok) {
2200 cond_resched(); 2257 cond_resched();
2201 2258
@@ -2299,9 +2356,10 @@ static int kswapd(void *p)
2299 * premature sleep. If not, then go fully 2356 * premature sleep. If not, then go fully
2300 * to sleep until explicitly woken up 2357 * to sleep until explicitly woken up
2301 */ 2358 */
2302 if (!sleeping_prematurely(pgdat, order, remaining)) 2359 if (!sleeping_prematurely(pgdat, order, remaining)) {
2360 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2303 schedule(); 2361 schedule();
2304 else { 2362 } else {
2305 if (remaining) 2363 if (remaining)
2306 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2364 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2307 else 2365 else
@@ -2321,8 +2379,10 @@ static int kswapd(void *p)
2321 * We can speed up thawing tasks if we don't call balance_pgdat 2379 * We can speed up thawing tasks if we don't call balance_pgdat
2322 * after returning from the refrigerator 2380 * after returning from the refrigerator
2323 */ 2381 */
2324 if (!ret) 2382 if (!ret) {
2383 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2325 balance_pgdat(pgdat, order); 2384 balance_pgdat(pgdat, order);
2385 }
2326 } 2386 }
2327 return 0; 2387 return 0;
2328} 2388}
@@ -2342,6 +2402,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2342 return; 2402 return;
2343 if (pgdat->kswapd_max_order < order) 2403 if (pgdat->kswapd_max_order < order)
2344 pgdat->kswapd_max_order = order; 2404 pgdat->kswapd_max_order = order;
2405 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2345 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2406 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2346 return; 2407 return;
2347 if (!waitqueue_active(&pgdat->kswapd_wait)) 2408 if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2651,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2590 .swappiness = vm_swappiness, 2651 .swappiness = vm_swappiness,
2591 .order = order, 2652 .order = order,
2592 }; 2653 };
2593 unsigned long slab_reclaimable; 2654 unsigned long nr_slab_pages0, nr_slab_pages1;
2594 2655
2595 disable_swap_token();
2596 cond_resched(); 2656 cond_resched();
2597 /* 2657 /*
2598 * We need to be able to allocate from the reserves for RECLAIM_SWAP 2658 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2671,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2611 */ 2671 */
2612 priority = ZONE_RECLAIM_PRIORITY; 2672 priority = ZONE_RECLAIM_PRIORITY;
2613 do { 2673 do {
2614 note_zone_scanning_priority(zone, priority);
2615 shrink_zone(priority, zone, &sc); 2674 shrink_zone(priority, zone, &sc);
2616 priority--; 2675 priority--;
2617 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 2676 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2618 } 2677 }
2619 2678
2620 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2679 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2621 if (slab_reclaimable > zone->min_slab_pages) { 2680 if (nr_slab_pages0 > zone->min_slab_pages) {
2622 /* 2681 /*
2623 * shrink_slab() does not currently allow us to determine how 2682 * shrink_slab() does not currently allow us to determine how
2624 * many pages were freed in this zone. So we take the current 2683 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2688,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2629 * Note that shrink_slab will free memory on all zones and may 2688 * Note that shrink_slab will free memory on all zones and may
2630 * take a long time. 2689 * take a long time.
2631 */ 2690 */
2632 while (shrink_slab(sc.nr_scanned, gfp_mask, order) && 2691 for (;;) {
2633 zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 2692 unsigned long lru_pages = zone_reclaimable_pages(zone);
2634 slab_reclaimable - nr_pages) 2693
2635 ; 2694 /* No reclaimable slab or very low memory pressure */
2695 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
2696 break;
2697
2698 /* Freed enough memory */
2699 nr_slab_pages1 = zone_page_state(zone,
2700 NR_SLAB_RECLAIMABLE);
2701 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
2702 break;
2703 }
2636 2704
2637 /* 2705 /*
2638 * Update nr_reclaimed by the number of slab pages we 2706 * Update nr_reclaimed by the number of slab pages we
2639 * reclaimed from this zone. 2707 * reclaimed from this zone.
2640 */ 2708 */
2641 sc.nr_reclaimed += slab_reclaimable - 2709 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2642 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2710 if (nr_slab_pages1 < nr_slab_pages0)
2711 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
2643 } 2712 }
2644 2713
2645 p->reclaim_state = NULL; 2714 p->reclaim_state = NULL;
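The new shrink_slab() loop in __zone_reclaim() above has two exit conditions: no further progress, or at least nr_pages of reclaimable slab freed since the loop started. A purely illustrative model of that control flow, with an invented fake_shrink_slab() helper and made-up page counts:

    #include <stdio.h>

    /* Stands in for NR_SLAB_RECLAIMABLE in this toy model. */
    static unsigned long slab_pages = 1000;

    static unsigned long fake_shrink_slab(void)
    {
            unsigned long freed = slab_pages >= 100 ? 100 : 0;

            slab_pages -= freed;
            return freed;                           /* 0 means "no progress" */
    }

    int main(void)
    {
            const unsigned long nr_pages = 256;     /* size of this reclaim request */
            unsigned long nr_slab_pages0 = slab_pages;

            for (;;) {
                    if (!fake_shrink_slab())
                            break;                  /* nothing more to reclaim */
                    if (slab_pages + nr_pages <= nr_slab_pages0)
                            break;                  /* freed enough for this request */
            }
            printf("freed %lu slab pages\n", nr_slab_pages0 - slab_pages);
            return 0;
    }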
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e77..f389168f9a83 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,14 +22,14 @@
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
23EXPORT_PER_CPU_SYMBOL(vm_event_states); 23EXPORT_PER_CPU_SYMBOL(vm_event_states);
24 24
25static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) 25static void sum_vm_events(unsigned long *ret)
26{ 26{
27 int cpu; 27 int cpu;
28 int i; 28 int i;
29 29
30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
31 31
32 for_each_cpu(cpu, cpumask) { 32 for_each_online_cpu(cpu) {
33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
34 34
35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -45,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
45void all_vm_events(unsigned long *ret) 45void all_vm_events(unsigned long *ret)
46{ 46{
47 get_online_cpus(); 47 get_online_cpus();
48 sum_vm_events(ret, cpu_online_mask); 48 sum_vm_events(ret);
49 put_online_cpus(); 49 put_online_cpus();
50} 50}
51EXPORT_SYMBOL_GPL(all_vm_events); 51EXPORT_SYMBOL_GPL(all_vm_events);
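With the cpumask argument gone, callers of all_vm_events() are unchanged; it still fills an array indexed by the VM event items, now summed over the online CPUs only. A hypothetical in-kernel caller, shown purely as a sketch (PGDEACTIVATE is one of the events bumped by the vmscan changes earlier in this diff):

    #include <linux/vmstat.h>
    #include <linux/kernel.h>

    /* Hypothetical caller: snapshot the global VM event counters. */
    static void report_vm_events(void)
    {
            unsigned long events[NR_VM_EVENT_ITEMS];

            all_vm_events(events);          /* sums per-CPU counters of online CPUs */
            pr_info("pgdeactivate so far: %lu\n", events[PGDEACTIVATE]);
    }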
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
853 } 853 }
854 seq_printf(m, 854 seq_printf(m,
855 "\n all_unreclaimable: %u" 855 "\n all_unreclaimable: %u"
856 "\n prev_priority: %i"
857 "\n start_pfn: %lu" 856 "\n start_pfn: %lu"
858 "\n inactive_ratio: %u", 857 "\n inactive_ratio: %u",
859 zone->all_unreclaimable, 858 zone->all_unreclaimable,
860 zone->prev_priority,
861 zone->zone_start_pfn, 859 zone->zone_start_pfn,
862 zone->inactive_ratio); 860 zone->inactive_ratio);
863 seq_putc(m, '\n'); 861 seq_putc(m, '\n');