Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             23
-rw-r--r--  mm/Makefile             1
-rw-r--r--  mm/backing-dev.c        4
-rw-r--r--  mm/cleancache.c       244
-rw-r--r--  mm/filemap.c           83
-rw-r--r--  mm/filemap_xip.c        4
-rw-r--r--  mm/fremap.c             6
-rw-r--r--  mm/huge_memory.c       25
-rw-r--r--  mm/hugetlb.c           20
-rw-r--r--  mm/init-mm.c            1
-rw-r--r--  mm/internal.h           4
-rw-r--r--  mm/kmemleak.c           7
-rw-r--r--  mm/ksm.c                7
-rw-r--r--  mm/memcontrol.c       377
-rw-r--r--  mm/memory-failure.c    21
-rw-r--r--  mm/memory.c           444
-rw-r--r--  mm/memory_hotplug.c    21
-rw-r--r--  mm/mempolicy.c        164
-rw-r--r--  mm/migrate.c           17
-rw-r--r--  mm/mlock.c              8
-rw-r--r--  mm/mmap.c             129
-rw-r--r--  mm/mremap.c             5
-rw-r--r--  mm/nobootmem.c         23
-rw-r--r--  mm/nommu.c            108
-rw-r--r--  mm/oom_kill.c          36
-rw-r--r--  mm/page_alloc.c       128
-rw-r--r--  mm/page_cgroup.c       28
-rw-r--r--  mm/percpu.c             6
-rw-r--r--  mm/prio_tree.c          1
-rw-r--r--  mm/readahead.c          2
-rw-r--r--  mm/rmap.c             183
-rw-r--r--  mm/shmem.c            334
-rw-r--r--  mm/slab.c               1
-rw-r--r--  mm/slub.c             169
-rw-r--r--  mm/swap.c              52
-rw-r--r--  mm/swapfile.c           6
-rw-r--r--  mm/truncate.c           6
-rw-r--r--  mm/util.c              24
-rw-r--r--  mm/vmalloc.c           15
-rw-r--r--  mm/vmscan.c           185
-rw-r--r--  mm/vmstat.c           264
41 files changed, 2057 insertions, 1129 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e9c0c61f2ddd..8ca47a5ee9c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -347,3 +347,26 @@ config NEED_PER_CPU_KM
347 depends on !SMP 347 depends on !SMP
348 bool 348 bool
349 default y 349 default y
350
351config CLEANCACHE
352 bool "Enable cleancache driver to cache clean pages if tmem is present"
353 default n
354 help
355 Cleancache can be thought of as a page-granularity victim cache
356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled
363 filesystem wishes to access a page in a file on disk, it first
364 checks cleancache to see if it already contains it; if it does,
365 the page is copied into the kernel and a disk access is avoided.
366 When a transcendent memory driver is available (such as zcache or
367 Xen transcendent memory), a significant I/O reduction
368 may be achieved. When none is available, all cleancache calls
369 are reduced to a single pointer-compare-against-NULL resulting
370 in a negligible performance hit.
371
372 If unsure, say Y to enable cleancache
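
The "negligible performance hit" promised in the help text comes from header-side wrappers that guard every hook with the global enable flag (see the cleancache_enabled comment in mm/cleancache.c below). The following is an illustrative sketch of such a wrapper, inferred from the frontend added further down in this patch; it is not a hunk of the diff, and the real declarations belong in include/linux/cleancache.h.

/* Illustrative sketch only -- not part of the diff. */
#include <linux/mm.h>

extern int cleancache_enabled;
extern int __cleancache_get_page(struct page *page);

static inline int cleancache_get_page(struct page *page)
{
	int ret = -1;

	/* when no backend has registered, this is a single flag test */
	if (cleancache_enabled)
		ret = __cleancache_get_page(page);
	return ret;
}
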
diff --git a/mm/Makefile b/mm/Makefile
index 42a8326c3e3d..836e4163c1bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index befc87531e4f..f032e6e1e09a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -63,10 +63,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
63 unsigned long background_thresh; 63 unsigned long background_thresh;
64 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
65 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
66 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 66 unsigned long nr_dirty, nr_io, nr_more_io;
67 struct inode *inode; 67 struct inode *inode;
68 68
69 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 70 spin_lock(&inode_wb_list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 72 nr_dirty++;
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..bcaae4c2a770
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
1/*
2 * Cleancache frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of cleancache. See
6 * Documentation/vm/cleancache.txt for more information.
7 *
8 * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/module.h>
15#include <linux/fs.h>
16#include <linux/exportfs.h>
17#include <linux/mm.h>
18#include <linux/cleancache.h>
19
20/*
21 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global.
26 */
27int cleancache_enabled;
28EXPORT_SYMBOL(cleancache_enabled);
29
30/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions.
33 */
34static struct cleancache_ops cleancache_ops;
35
36/* useful stats available in /sys/kernel/mm/cleancache */
37static unsigned long cleancache_succ_gets;
38static unsigned long cleancache_failed_gets;
39static unsigned long cleancache_puts;
40static unsigned long cleancache_flushes;
41
42/*
43 * register operations for cleancache, returning previous thus allowing
44 * detection of multiple backends and possible nesting
45 */
46struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
47{
48 struct cleancache_ops old = cleancache_ops;
49
50 cleancache_ops = *ops;
51 cleancache_enabled = 1;
52 return old;
53}
54EXPORT_SYMBOL(cleancache_register_ops);
55
56/* Called by a cleancache-enabled filesystem at time of mount */
57void __cleancache_init_fs(struct super_block *sb)
58{
59 sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
60}
61EXPORT_SYMBOL(__cleancache_init_fs);
62
63/* Called by a cleancache-enabled clustered filesystem at time of mount */
64void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
65{
66 sb->cleancache_poolid =
67 (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
68}
69EXPORT_SYMBOL(__cleancache_init_shared_fs);
70
71/*
72 * If the filesystem uses exportable filehandles, use the filehandle as
73 * the key, else use the inode number.
74 */
75static int cleancache_get_key(struct inode *inode,
76 struct cleancache_filekey *key)
77{
78 int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
79 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
80 struct super_block *sb = inode->i_sb;
81
82 key->u.ino = inode->i_ino;
83 if (sb->s_export_op != NULL) {
84 fhfn = sb->s_export_op->encode_fh;
85 if (fhfn) {
86 struct dentry d;
87 d.d_inode = inode;
88 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
89 if (len <= 0 || len == 255)
90 return -1;
91 if (maxlen > CLEANCACHE_KEY_MAX)
92 return -1;
93 }
94 }
95 return 0;
96}
97
98/*
99 * "Get" data from cleancache associated with the poolid/inode/index
100 * that were specified when the data was put to cleancache and, if
101 * successful, use it to fill the specified page with data and return 0.
102 * The pageframe is unchanged and returns -1 if the get fails.
103 * Page must be locked by caller.
104 */
105int __cleancache_get_page(struct page *page)
106{
107 int ret = -1;
108 int pool_id;
109 struct cleancache_filekey key = { .u.key = { 0 } };
110
111 VM_BUG_ON(!PageLocked(page));
112 pool_id = page->mapping->host->i_sb->cleancache_poolid;
113 if (pool_id < 0)
114 goto out;
115
116 if (cleancache_get_key(page->mapping->host, &key) < 0)
117 goto out;
118
119 ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
120 if (ret == 0)
121 cleancache_succ_gets++;
122 else
123 cleancache_failed_gets++;
124out:
125 return ret;
126}
127EXPORT_SYMBOL(__cleancache_get_page);
128
129/*
130 * "Put" data from a page to cleancache and associate it with the
131 * (previously-obtained per-filesystem) poolid and the page's
132 * inode and page index. Page must be locked. Note that a put_page
133 * always "succeeds", though a subsequent get_page may succeed or fail.
134 */
135void __cleancache_put_page(struct page *page)
136{
137 int pool_id;
138 struct cleancache_filekey key = { .u.key = { 0 } };
139
140 VM_BUG_ON(!PageLocked(page));
141 pool_id = page->mapping->host->i_sb->cleancache_poolid;
142 if (pool_id >= 0 &&
143 cleancache_get_key(page->mapping->host, &key) >= 0) {
144 (*cleancache_ops.put_page)(pool_id, key, page->index, page);
145 cleancache_puts++;
146 }
147}
148EXPORT_SYMBOL(__cleancache_put_page);
149
150/*
151 * Flush any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail.
153 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page)
155{
156 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid;
158 struct cleancache_filekey key = { .u.key = { 0 } };
159
160 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index);
164 cleancache_flushes++;
165 }
166 }
167}
168EXPORT_SYMBOL(__cleancache_flush_page);
169
170/*
171 * Flush all data from cleancache associated with the poolid and the
172 * mapping's inode so that all subsequent gets to this poolid/inode
173 * will fail.
174 */
175void __cleancache_flush_inode(struct address_space *mapping)
176{
177 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } };
179
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key);
182}
183EXPORT_SYMBOL(__cleancache_flush_inode);
184
185/*
186 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be returned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs
189 */
190void __cleancache_flush_fs(struct super_block *sb)
191{
192 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid);
196 }
197}
198EXPORT_SYMBOL(__cleancache_flush_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234
235static int __init init_cleancache(void)
236{
237#ifdef CONFIG_SYSFS
238 int err;
239
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
241#endif /* CONFIG_SYSFS */
242 return 0;
243}
244module_init(init_cleancache)
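
For context, a backend claims these hooks by filling a struct cleancache_ops and handing it to cleancache_register_ops(). The skeleton below is a hypothetical sketch ("mybackend" and its callbacks are made-up names); the member names and signatures are inferred from the frontend calls above, and the authoritative definition lives in include/linux/cleancache.h.

/* Hypothetical backend skeleton -- a sketch, not part of the diff. */
#include <linux/module.h>
#include <linux/cleancache.h>

static int mybackend_init_fs(size_t pagesize)
{
	/* create a pool for pages of 'pagesize' bytes, return its pool id */
	return 0;
}

static int mybackend_get_page(int pool, struct cleancache_filekey key,
			      pgoff_t index, struct page *page)
{
	return -1;	/* miss: leave the pageframe untouched */
}

static void mybackend_put_page(int pool, struct cleancache_filekey key,
			       pgoff_t index, struct page *page)
{
	/* copy the page into transcendent memory; it may be dropped later */
}

static void mybackend_flush_page(int pool, struct cleancache_filekey key,
				 pgoff_t index)
{
	/* forget any stored copy so a later get will miss */
}

static struct cleancache_ops mybackend_ops = {
	.init_fs	= mybackend_init_fs,
	.get_page	= mybackend_get_page,
	.put_page	= mybackend_put_page,
	.flush_page	= mybackend_flush_page,
	/* .init_shared_fs, .flush_inode and .flush_fs omitted here; a real
	 * backend must provide them, since the frontend above calls them
	 * unconditionally once a pool id has been handed out */
};

static int __init mybackend_init(void)
{
	struct cleancache_ops old = cleancache_register_ops(&mybackend_ops);

	/* a non-NULL old.init_fs would mean another backend was already loaded */
	WARN_ON(old.init_fs != NULL);
	return 0;
}
module_init(mybackend_init);
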
diff --git a/mm/filemap.c b/mm/filemap.c
index c641edf553a9..bcdc393b6580 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */ 36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* 40/*
@@ -58,16 +59,16 @@
58/* 59/*
59 * Lock ordering: 60 * Lock ordering:
60 * 61 *
61 * ->i_mmap_lock (truncate_pagecache) 62 * ->i_mmap_mutex (truncate_pagecache)
62 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
63 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
64 * ->mapping->tree_lock 65 * ->mapping->tree_lock
65 * 66 *
66 * ->i_mutex 67 * ->i_mutex
67 * ->i_mmap_lock (truncate->unmap_mapping_range) 68 * ->i_mmap_mutex (truncate->unmap_mapping_range)
68 * 69 *
69 * ->mmap_sem 70 * ->mmap_sem
70 * ->i_mmap_lock 71 * ->i_mmap_mutex
71 * ->page_table_lock or pte_lock (various, mainly in memory.c) 72 * ->page_table_lock or pte_lock (various, mainly in memory.c)
72 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
73 * 74 *
@@ -84,7 +85,7 @@
84 * sb_lock (fs/fs-writeback.c) 85 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 86 * ->mapping->tree_lock (__sync_single_inode)
86 * 87 *
87 * ->i_mmap_lock 88 * ->i_mmap_mutex
88 * ->anon_vma.lock (vma_adjust) 89 * ->anon_vma.lock (vma_adjust)
89 * 90 *
90 * ->anon_vma.lock 91 * ->anon_vma.lock
@@ -106,7 +107,7 @@
106 * 107 *
107 * (code doesn't rely on that order, so you could switch it around) 108 * (code doesn't rely on that order, so you could switch it around)
108 * ->tasklist_lock (memory_failure, collect_procs_ao) 109 * ->tasklist_lock (memory_failure, collect_procs_ao)
109 * ->i_mmap_lock 110 * ->i_mmap_mutex
110 */ 111 */
111 112
112/* 113/*
@@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page)
118{ 119{
119 struct address_space *mapping = page->mapping; 120 struct address_space *mapping = page->mapping;
120 121
122 /*
123 * if we're uptodate, flush out into the cleancache, otherwise
124 * invalidate any existing cleancache entries. We can't leave
125 * stale data around in the cleancache once our page is gone
126 */
127 if (PageUptodate(page) && PageMappedToDisk(page))
128 cleancache_put_page(page);
129 else
130 cleancache_flush_page(mapping, page);
131
121 radix_tree_delete(&mapping->page_tree, page->index); 132 radix_tree_delete(&mapping->page_tree, page->index);
122 page->mapping = NULL; 133 page->mapping = NULL;
123 mapping->nrpages--; 134 mapping->nrpages--;
@@ -562,6 +573,17 @@ void wait_on_page_bit(struct page *page, int bit_nr)
562} 573}
563EXPORT_SYMBOL(wait_on_page_bit); 574EXPORT_SYMBOL(wait_on_page_bit);
564 575
576int wait_on_page_bit_killable(struct page *page, int bit_nr)
577{
578 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
579
580 if (!test_bit(bit_nr, &page->flags))
581 return 0;
582
583 return __wait_on_bit(page_waitqueue(page), &wait,
584 sleep_on_page_killable, TASK_KILLABLE);
585}
586
565/** 587/**
566 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 588 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
567 * @page: Page defining the wait queue of interest 589 * @page: Page defining the wait queue of interest
@@ -643,15 +665,32 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
643int __lock_page_or_retry(struct page *page, struct mm_struct *mm, 665int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
644 unsigned int flags) 666 unsigned int flags)
645{ 667{
646 if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { 668 if (flags & FAULT_FLAG_ALLOW_RETRY) {
647 __lock_page(page); 669 /*
648 return 1; 670 * CAUTION! In this case, mmap_sem is not released
649 } else { 671 * even though return 0.
650 if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { 672 */
651 up_read(&mm->mmap_sem); 673 if (flags & FAULT_FLAG_RETRY_NOWAIT)
674 return 0;
675
676 up_read(&mm->mmap_sem);
677 if (flags & FAULT_FLAG_KILLABLE)
678 wait_on_page_locked_killable(page);
679 else
652 wait_on_page_locked(page); 680 wait_on_page_locked(page);
653 }
654 return 0; 681 return 0;
682 } else {
683 if (flags & FAULT_FLAG_KILLABLE) {
684 int ret;
685
686 ret = __lock_page_killable(page);
687 if (ret) {
688 up_read(&mm->mmap_sem);
689 return 0;
690 }
691 } else
692 __lock_page(page);
693 return 1;
655 } 694 }
656} 695}
657 696
@@ -1528,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1528 /* If we don't want any read-ahead, don't bother */ 1567 /* If we don't want any read-ahead, don't bother */
1529 if (VM_RandomReadHint(vma)) 1568 if (VM_RandomReadHint(vma))
1530 return; 1569 return;
1570 if (!ra->ra_pages)
1571 return;
1531 1572
1532 if (VM_SequentialReadHint(vma) || 1573 if (VM_SequentialReadHint(vma)) {
1533 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1534 page_cache_sync_readahead(mapping, ra, file, offset, 1574 page_cache_sync_readahead(mapping, ra, file, offset,
1535 ra->ra_pages); 1575 ra->ra_pages);
1536 return; 1576 return;
1537 } 1577 }
1538 1578
1539 if (ra->mmap_miss < INT_MAX) 1579 /* Avoid banging the cache line if not needed */
1580 if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1540 ra->mmap_miss++; 1581 ra->mmap_miss++;
1541 1582
1542 /* 1583 /*
@@ -1550,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1550 * mmap read-around 1591 * mmap read-around
1551 */ 1592 */
1552 ra_pages = max_sane_readahead(ra->ra_pages); 1593 ra_pages = max_sane_readahead(ra->ra_pages);
1553 if (ra_pages) { 1594 ra->start = max_t(long, 0, offset - ra_pages / 2);
1554 ra->start = max_t(long, 0, offset - ra_pages/2); 1595 ra->size = ra_pages;
1555 ra->size = ra_pages; 1596 ra->async_size = ra_pages / 4;
1556 ra->async_size = 0; 1597 ra_submit(ra, mapping, file);
1557 ra_submit(ra, mapping, file);
1558 }
1559} 1598}
1560 1599
1561/* 1600/*
@@ -1622,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1622 /* No page in the page cache at all */ 1661 /* No page in the page cache at all */
1623 do_sync_mmap_readahead(vma, ra, file, offset); 1662 do_sync_mmap_readahead(vma, ra, file, offset);
1624 count_vm_event(PGMAJFAULT); 1663 count_vm_event(PGMAJFAULT);
1664 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1625 ret = VM_FAULT_MAJOR; 1665 ret = VM_FAULT_MAJOR;
1626retry_find: 1666retry_find:
1627 page = find_get_page(mapping, offset); 1667 page = find_get_page(mapping, offset);
@@ -1660,7 +1700,6 @@ retry_find:
1660 return VM_FAULT_SIGBUS; 1700 return VM_FAULT_SIGBUS;
1661 } 1701 }
1662 1702
1663 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1664 vmf->page = page; 1703 vmf->page = page;
1665 return ret | VM_FAULT_LOCKED; 1704 return ret | VM_FAULT_LOCKED;
1666 1705
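
The killable wait added above is what lets __lock_page_or_retry() honour FAULT_FLAG_KILLABLE: on return 1 the page is locked and mmap_sem is still held; on return 0 mmap_sem has been dropped, except in the FAULT_FLAG_RETRY_NOWAIT case the comment calls out. The wait_on_page_locked_killable() helper it calls is presumably the pagemap.h-side wrapper sketched below, mirroring the existing wait_on_page_locked(); this sketch is an assumption, not a hunk of the diff.

/* Sketch of the header-side helper used by __lock_page_or_retry() above;
 * assumed to sit in include/linux/pagemap.h next to wait_on_page_locked(). */
static inline int wait_on_page_locked_killable(struct page *page)
{
	if (PageLocked(page))
		return wait_on_page_bit_killable(page, PG_locked);
	return 0;
}
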
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 83364df74a33..93356cd12828 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
183 return; 183 return;
184 184
185retry: 185retry:
186 spin_lock(&mapping->i_mmap_lock); 186 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 188 mm = vma->vm_mm;
189 address = vma->vm_start + 189 address = vma->vm_start +
@@ -201,7 +201,7 @@ retry:
201 page_cache_release(page); 201 page_cache_release(page);
202 } 202 }
203 } 203 }
204 spin_unlock(&mapping->i_mmap_lock); 204 mutex_unlock(&mapping->i_mmap_mutex);
205 205
206 if (locked) { 206 if (locked) {
207 mutex_unlock(&xip_sparse_mutex); 207 mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index ec520c7b28df..b8e0e2d468af 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
211 } 211 }
212 goto out; 212 goto out;
213 } 213 }
214 spin_lock(&mapping->i_mmap_lock); 214 mutex_lock(&mapping->i_mmap_mutex);
215 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
216 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
217 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_prio_tree_remove(vma, &mapping->i_mmap);
218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
219 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
220 spin_unlock(&mapping->i_mmap_lock); 220 mutex_unlock(&mapping->i_mmap_mutex);
221 } 221 }
222 222
223 if (vma->vm_flags & VM_LOCKED) { 223 if (vma->vm_flags & VM_LOCKED) {
224 /* 224 /*
225 * drop PG_Mlocked flag for over-mapped range 225 * drop PG_Mlocked flag for over-mapped range
226 */ 226 */
227 unsigned int saved_flags = vma->vm_flags; 227 vm_flags_t saved_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 228 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 229 vma->vm_flags = saved_flags;
230 } 230 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 83326ad66d9b..615d9743a3cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1139,7 +1139,7 @@ static int __split_huge_page_splitting(struct page *page,
1139 * We can't temporarily set the pmd to null in order 1139 * We can't temporarily set the pmd to null in order
1140 * to split it, the pmd must remain marked huge at all 1140 * to split it, the pmd must remain marked huge at all
1141 * times or the VM won't take the pmd_trans_huge paths 1141 * times or the VM won't take the pmd_trans_huge paths
1142 * and it won't wait on the anon_vma->root->lock to 1142 * and it won't wait on the anon_vma->root->mutex to
1143 * serialize against split_huge_page*. 1143 * serialize against split_huge_page*.
1144 */ 1144 */
1145 pmdp_splitting_flush_notify(vma, address, pmd); 1145 pmdp_splitting_flush_notify(vma, address, pmd);
@@ -1333,7 +1333,7 @@ static int __split_huge_page_map(struct page *page,
1333 return ret; 1333 return ret;
1334} 1334}
1335 1335
1336/* must be called with anon_vma->root->lock hold */ 1336/* must be called with anon_vma->root->mutex hold */
1337static void __split_huge_page(struct page *page, 1337static void __split_huge_page(struct page *page,
1338 struct anon_vma *anon_vma) 1338 struct anon_vma *anon_vma)
1339{ 1339{
@@ -1771,12 +1771,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1771 1771
1772 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1772 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1773#ifndef CONFIG_NUMA 1773#ifndef CONFIG_NUMA
1774 up_read(&mm->mmap_sem);
1774 VM_BUG_ON(!*hpage); 1775 VM_BUG_ON(!*hpage);
1775 new_page = *hpage; 1776 new_page = *hpage;
1776 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1777 up_read(&mm->mmap_sem);
1778 return;
1779 }
1780#else 1777#else
1781 VM_BUG_ON(*hpage); 1778 VM_BUG_ON(*hpage);
1782 /* 1779 /*
@@ -1791,22 +1788,26 @@ static void collapse_huge_page(struct mm_struct *mm,
1791 */ 1788 */
1792 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1789 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1793 node, __GFP_OTHER_NODE); 1790 node, __GFP_OTHER_NODE);
1791
1792 /*
1793 * After allocating the hugepage, release the mmap_sem read lock in
1794 * preparation for taking it in write mode.
1795 */
1796 up_read(&mm->mmap_sem);
1794 if (unlikely(!new_page)) { 1797 if (unlikely(!new_page)) {
1795 up_read(&mm->mmap_sem);
1796 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1798 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1797 *hpage = ERR_PTR(-ENOMEM); 1799 *hpage = ERR_PTR(-ENOMEM);
1798 return; 1800 return;
1799 } 1801 }
1802#endif
1803
1800 count_vm_event(THP_COLLAPSE_ALLOC); 1804 count_vm_event(THP_COLLAPSE_ALLOC);
1801 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1805 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1802 up_read(&mm->mmap_sem); 1806#ifdef CONFIG_NUMA
1803 put_page(new_page); 1807 put_page(new_page);
1808#endif
1804 return; 1809 return;
1805 } 1810 }
1806#endif
1807
1808 /* after allocating the hugepage upgrade to mmap_sem write mode */
1809 up_read(&mm->mmap_sem);
1810 1811
1811 /* 1812 /*
1812 * Prevent all access to pagetables with the exception of 1813 * Prevent all access to pagetables with the exception of
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8ee3bd8ec5b5..f33bb319b73f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
475 475
476 /* If reserves cannot be used, ensure enough pages are in the pool */ 476 /* If reserves cannot be used, ensure enough pages are in the pool */
477 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 477 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
478 goto err;; 478 goto err;
479 479
480 for_each_zone_zonelist_nodemask(zone, z, zonelist, 480 for_each_zone_zonelist_nodemask(zone, z, zonelist,
481 MAX_NR_ZONES - 1, nodemask) { 481 MAX_NR_ZONES - 1, nodemask) {
@@ -2205,7 +2205,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2205 unsigned long sz = huge_page_size(h); 2205 unsigned long sz = huge_page_size(h);
2206 2206
2207 /* 2207 /*
2208 * A page gathering list, protected by per file i_mmap_lock. The 2208 * A page gathering list, protected by per file i_mmap_mutex. The
2209 * lock is used to avoid list corruption from multiple unmapping 2209 * lock is used to avoid list corruption from multiple unmapping
2210 * of the same page since we are using page->lru. 2210 * of the same page since we are using page->lru.
2211 */ 2211 */
@@ -2274,9 +2274,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2274void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2274void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2275 unsigned long end, struct page *ref_page) 2275 unsigned long end, struct page *ref_page)
2276{ 2276{
2277 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2277 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2278 __unmap_hugepage_range(vma, start, end, ref_page); 2278 __unmap_hugepage_range(vma, start, end, ref_page);
2279 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2279 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2280} 2280}
2281 2281
2282/* 2282/*
@@ -2308,7 +2308,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2308 * this mapping should be shared between all the VMAs, 2308 * this mapping should be shared between all the VMAs,
2309 * __unmap_hugepage_range() is called as the lock is already held 2309 * __unmap_hugepage_range() is called as the lock is already held
2310 */ 2310 */
2311 spin_lock(&mapping->i_mmap_lock); 2311 mutex_lock(&mapping->i_mmap_mutex);
2312 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2312 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
2313 /* Do not unmap the current VMA */ 2313 /* Do not unmap the current VMA */
2314 if (iter_vma == vma) 2314 if (iter_vma == vma)
@@ -2326,7 +2326,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2326 address, address + huge_page_size(h), 2326 address, address + huge_page_size(h),
2327 page); 2327 page);
2328 } 2328 }
2329 spin_unlock(&mapping->i_mmap_lock); 2329 mutex_unlock(&mapping->i_mmap_mutex);
2330 2330
2331 return 1; 2331 return 1;
2332} 2332}
@@ -2810,7 +2810,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2810 BUG_ON(address >= end); 2810 BUG_ON(address >= end);
2811 flush_cache_range(vma, address, end); 2811 flush_cache_range(vma, address, end);
2812 2812
2813 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2813 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2814 spin_lock(&mm->page_table_lock); 2814 spin_lock(&mm->page_table_lock);
2815 for (; address < end; address += huge_page_size(h)) { 2815 for (; address < end; address += huge_page_size(h)) {
2816 ptep = huge_pte_offset(mm, address); 2816 ptep = huge_pte_offset(mm, address);
@@ -2825,7 +2825,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2825 } 2825 }
2826 } 2826 }
2827 spin_unlock(&mm->page_table_lock); 2827 spin_unlock(&mm->page_table_lock);
2828 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2828 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2829 2829
2830 flush_tlb_range(vma, start, end); 2830 flush_tlb_range(vma, start, end);
2831} 2831}
@@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2833int hugetlb_reserve_pages(struct inode *inode, 2833int hugetlb_reserve_pages(struct inode *inode,
2834 long from, long to, 2834 long from, long to,
2835 struct vm_area_struct *vma, 2835 struct vm_area_struct *vma,
2836 int acctflag) 2836 vm_flags_t vm_flags)
2837{ 2837{
2838 long ret, chg; 2838 long ret, chg;
2839 struct hstate *h = hstate_inode(inode); 2839 struct hstate *h = hstate_inode(inode);
@@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2843 * attempt will be made for VM_NORESERVE to allocate a page 2843 * attempt will be made for VM_NORESERVE to allocate a page
2844 * and filesystem quota without using reserves 2844 * and filesystem quota without using reserves
2845 */ 2845 */
2846 if (acctflag & VM_NORESERVE) 2846 if (vm_flags & VM_NORESERVE)
2847 return 0; 2847 return 0;
2848 2848
2849 /* 2849 /*
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 1d29cdfe8ebb..4019979b2637 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -21,6 +21,5 @@ struct mm_struct init_mm = {
21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm) 24 INIT_MM_CONTEXT(init_mm)
26}; 25};
diff --git a/mm/internal.h b/mm/internal.h
index 9d0ced8e505e..d071d380fb49 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -66,6 +66,10 @@ static inline unsigned long page_order(struct page *page)
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
69/* mm/util.c */
70void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
71 struct vm_area_struct *prev, struct rb_node *rb_parent);
72
69#ifdef CONFIG_MMU 73#ifdef CONFIG_MMU
70extern long mlock_vma_pages_range(struct vm_area_struct *vma, 74extern long mlock_vma_pages_range(struct vm_area_struct *vma,
71 unsigned long start, unsigned long end); 75 unsigned long start, unsigned long end);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c1d5867543e4..aacee45616fc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1414 ++(*pos); 1414 ++(*pos);
1415 1415
1416 list_for_each_continue_rcu(n, &object_list) { 1416 list_for_each_continue_rcu(n, &object_list) {
1417 next_obj = list_entry(n, struct kmemleak_object, object_list); 1417 struct kmemleak_object *obj =
1418 if (get_object(next_obj)) 1418 list_entry(n, struct kmemleak_object, object_list);
1419 if (get_object(obj)) {
1420 next_obj = obj;
1419 break; 1421 break;
1422 }
1420 } 1423 }
1421 1424
1422 put_object(prev_obj); 1425 put_object(prev_obj);
diff --git a/mm/ksm.c b/mm/ksm.c
index 942dfc73a2ff..d708b3ef2260 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -35,6 +35,7 @@
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40#include "internal.h" 41#include "internal.h"
@@ -1894,9 +1895,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1894 if (ksm_run != flags) { 1895 if (ksm_run != flags) {
1895 ksm_run = flags; 1896 ksm_run = flags;
1896 if (flags & KSM_RUN_UNMERGE) { 1897 if (flags & KSM_RUN_UNMERGE) {
1897 current->flags |= PF_OOM_ORIGIN; 1898 int oom_score_adj;
1899
1900 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1898 err = unmerge_and_remove_all_rmap_items(); 1901 err = unmerge_and_remove_all_rmap_items();
1899 current->flags &= ~PF_OOM_ORIGIN; 1902 test_set_oom_score_adj(oom_score_adj);
1900 if (err) { 1903 if (err) {
1901 ksm_run = KSM_RUN_STOP; 1904 ksm_run = KSM_RUN_STOP;
1902 count = err; 1905 count = err;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 010f9166fa6e..bd9052a5d3ad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -94,6 +94,8 @@ enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
98 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
97 MEM_CGROUP_EVENTS_NSTATS, 99 MEM_CGROUP_EVENTS_NSTATS,
98}; 100};
99/* 101/*
@@ -231,6 +233,11 @@ struct mem_cgroup {
231 * reclaimed from. 233 * reclaimed from.
232 */ 234 */
233 int last_scanned_child; 235 int last_scanned_child;
236 int last_scanned_node;
237#if MAX_NUMNODES > 1
238 nodemask_t scan_nodes;
239 unsigned long next_scan_node_update;
240#endif
234 /* 241 /*
235 * Should the accounting and control be hierarchical, per subtree? 242 * Should the accounting and control be hierarchical, per subtree?
236 */ 243 */
@@ -585,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 592 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
586} 593}
587 594
595void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
596{
597 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
598}
599
600void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
601{
602 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
603}
604
588static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 605static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
589 enum mem_cgroup_events_index idx) 606 enum mem_cgroup_events_index idx)
590{ 607{
@@ -624,18 +641,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
624 preempt_enable(); 641 preempt_enable();
625} 642}
626 643
644static unsigned long
645mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
646{
647 struct mem_cgroup_per_zone *mz;
648 u64 total = 0;
649 int zid;
650
651 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
652 mz = mem_cgroup_zoneinfo(mem, nid, zid);
653 total += MEM_CGROUP_ZSTAT(mz, idx);
654 }
655 return total;
656}
627static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 657static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
628 enum lru_list idx) 658 enum lru_list idx)
629{ 659{
630 int nid, zid; 660 int nid;
631 struct mem_cgroup_per_zone *mz;
632 u64 total = 0; 661 u64 total = 0;
633 662
634 for_each_online_node(nid) 663 for_each_online_node(nid)
635 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 664 total += mem_cgroup_get_zonestat_node(mem, nid, idx);
636 mz = mem_cgroup_zoneinfo(mem, nid, zid);
637 total += MEM_CGROUP_ZSTAT(mz, idx);
638 }
639 return total; 665 return total;
640} 666}
641 667
@@ -813,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
813 return (mem == root_mem_cgroup); 839 return (mem == root_mem_cgroup);
814} 840}
815 841
842void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
843{
844 struct mem_cgroup *mem;
845
846 if (!mm)
847 return;
848
849 rcu_read_lock();
850 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
851 if (unlikely(!mem))
852 goto out;
853
854 switch (idx) {
855 case PGMAJFAULT:
856 mem_cgroup_pgmajfault(mem, 1);
857 break;
858 case PGFAULT:
859 mem_cgroup_pgfault(mem, 1);
860 break;
861 default:
862 BUG();
863 }
864out:
865 rcu_read_unlock();
866}
867EXPORT_SYMBOL(mem_cgroup_count_vm_event);
868
816/* 869/*
817 * Following LRU functions are allowed to be used without PCG_LOCK. 870 * Following LRU functions are allowed to be used without PCG_LOCK.
818 * Operations are called by routine of global LRU independently from memcg. 871 * Operations are called by routine of global LRU independently from memcg.
@@ -1064,9 +1117,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1064 return (active > inactive); 1117 return (active > inactive);
1065} 1118}
1066 1119
1067unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1120unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1068 struct zone *zone, 1121 struct zone *zone,
1069 enum lru_list lru) 1122 enum lru_list lru)
1070{ 1123{
1071 int nid = zone_to_nid(zone); 1124 int nid = zone_to_nid(zone);
1072 int zid = zone_idx(zone); 1125 int zid = zone_idx(zone);
@@ -1075,6 +1128,93 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
1075 return MEM_CGROUP_ZSTAT(mz, lru); 1128 return MEM_CGROUP_ZSTAT(mz, lru);
1076} 1129}
1077 1130
1131#ifdef CONFIG_NUMA
1132static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1133 int nid)
1134{
1135 unsigned long ret;
1136
1137 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1138 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1139
1140 return ret;
1141}
1142
1143static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1144{
1145 u64 total = 0;
1146 int nid;
1147
1148 for_each_node_state(nid, N_HIGH_MEMORY)
1149 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1150
1151 return total;
1152}
1153
1154static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1155 int nid)
1156{
1157 unsigned long ret;
1158
1159 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1160 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1161
1162 return ret;
1163}
1164
1165static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1166{
1167 u64 total = 0;
1168 int nid;
1169
1170 for_each_node_state(nid, N_HIGH_MEMORY)
1171 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1172
1173 return total;
1174}
1175
1176static unsigned long
1177mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1178{
1179 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1180}
1181
1182static unsigned long
1183mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1184{
1185 u64 total = 0;
1186 int nid;
1187
1188 for_each_node_state(nid, N_HIGH_MEMORY)
1189 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1190
1191 return total;
1192}
1193
1194static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1195 int nid)
1196{
1197 enum lru_list l;
1198 u64 total = 0;
1199
1200 for_each_lru(l)
1201 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1202
1203 return total;
1204}
1205
1206static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1207{
1208 u64 total = 0;
1209 int nid;
1210
1211 for_each_node_state(nid, N_HIGH_MEMORY)
1212 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1213
1214 return total;
1215}
1216#endif /* CONFIG_NUMA */
1217
1078struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1218struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1079 struct zone *zone) 1219 struct zone *zone)
1080{ 1220{
@@ -1418,6 +1558,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1418 return ret; 1558 return ret;
1419} 1559}
1420 1560
1561#if MAX_NUMNODES > 1
1562
1563/*
1564 * Always updating the nodemask is not very good - even if we have an empty
1565 * list or the wrong list here, we can start from some node and traverse all
1566 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1567 *
1568 */
1569static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1570{
1571 int nid;
1572
1573 if (time_after(mem->next_scan_node_update, jiffies))
1574 return;
1575
1576 mem->next_scan_node_update = jiffies + 10*HZ;
1577 /* make a nodemask where this memcg uses memory from */
1578 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1579
1580 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1581
1582 if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
1583 mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
1584 continue;
1585
1586 if (total_swap_pages &&
1587 (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
1588 mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
1589 continue;
1590 node_clear(nid, mem->scan_nodes);
1591 }
1592}
1593
1594/*
1595 * Selecting a node where we start reclaim from. Because what we need is just
1596 * reducing the usage counter, starting from anywhere is OK. Considering
1597 * memory reclaim from current node, there are pros. and cons.
1598 *
1599 * Freeing memory from current node means freeing memory from a node which
1600 * we'll use or we've used. So, it may make LRU bad. And if several threads
1601 * hit limits, it will see a contention on a node. But freeing from remote
1602 * node means more costs for memory reclaim because of memory latency.
1603 *
1604 * Now, we use round-robin. Better algorithm is welcomed.
1605 */
1606int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1607{
1608 int node;
1609
1610 mem_cgroup_may_update_nodemask(mem);
1611 node = mem->last_scanned_node;
1612
1613 node = next_node(node, mem->scan_nodes);
1614 if (node == MAX_NUMNODES)
1615 node = first_node(mem->scan_nodes);
1616 /*
1617 * We call this when we hit limit, not when pages are added to LRU.
1618 * No LRU may hold pages because all pages are UNEVICTABLE or
1619 * memcg is too small and all pages are not on LRU. In that case,
1620 * we use the current node.
1621 */
1622 if (unlikely(node == MAX_NUMNODES))
1623 node = numa_node_id();
1624
1625 mem->last_scanned_node = node;
1626 return node;
1627}
1628
1629#else
1630int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1631{
1632 return 0;
1633}
1634#endif
1635
1421/* 1636/*
1422 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1637 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1423 * we reclaimed from, so that we don't end up penalizing one child extensively 1638 * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1433,7 +1648,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1433static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1648static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1434 struct zone *zone, 1649 struct zone *zone,
1435 gfp_t gfp_mask, 1650 gfp_t gfp_mask,
1436 unsigned long reclaim_options) 1651 unsigned long reclaim_options,
1652 unsigned long *total_scanned)
1437{ 1653{
1438 struct mem_cgroup *victim; 1654 struct mem_cgroup *victim;
1439 int ret, total = 0; 1655 int ret, total = 0;
@@ -1442,6 +1658,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1442 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1658 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1443 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1659 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1444 unsigned long excess; 1660 unsigned long excess;
1661 unsigned long nr_scanned;
1445 1662
1446 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1447 1664
@@ -1484,10 +1701,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1484 continue; 1701 continue;
1485 } 1702 }
1486 /* we use swappiness of local cgroup */ 1703 /* we use swappiness of local cgroup */
1487 if (check_soft) 1704 if (check_soft) {
1488 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1705 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1489 noswap, get_swappiness(victim), zone); 1706 noswap, get_swappiness(victim), zone,
1490 else 1707 &nr_scanned);
1708 *total_scanned += nr_scanned;
1709 } else
1491 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1710 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1492 noswap, get_swappiness(victim)); 1711 noswap, get_swappiness(victim));
1493 css_put(&victim->css); 1712 css_put(&victim->css);
@@ -1503,7 +1722,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1503 if (!res_counter_soft_limit_excess(&root_mem->res)) 1722 if (!res_counter_soft_limit_excess(&root_mem->res))
1504 return total; 1723 return total;
1505 } else if (mem_cgroup_margin(root_mem)) 1724 } else if (mem_cgroup_margin(root_mem))
1506 return 1 + total; 1725 return total;
1507 } 1726 }
1508 return total; 1727 return total;
1509} 1728}
@@ -1928,7 +2147,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1928 return CHARGE_WOULDBLOCK; 2147 return CHARGE_WOULDBLOCK;
1929 2148
1930 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2149 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1931 gfp_mask, flags); 2150 gfp_mask, flags, NULL);
1932 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2151 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1933 return CHARGE_RETRY; 2152 return CHARGE_RETRY;
1934 /* 2153 /*
@@ -3211,7 +3430,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3211 break; 3430 break;
3212 3431
3213 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3432 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3214 MEM_CGROUP_RECLAIM_SHRINK); 3433 MEM_CGROUP_RECLAIM_SHRINK,
3434 NULL);
3215 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3435 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3216 /* Usage is reduced ? */ 3436 /* Usage is reduced ? */
3217 if (curusage >= oldusage) 3437 if (curusage >= oldusage)
@@ -3271,7 +3491,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3271 3491
3272 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3492 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3273 MEM_CGROUP_RECLAIM_NOSWAP | 3493 MEM_CGROUP_RECLAIM_NOSWAP |
3274 MEM_CGROUP_RECLAIM_SHRINK); 3494 MEM_CGROUP_RECLAIM_SHRINK,
3495 NULL);
3275 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3496 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3276 /* Usage is reduced ? */ 3497 /* Usage is reduced ? */
3277 if (curusage >= oldusage) 3498 if (curusage >= oldusage)
@@ -3285,7 +3506,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3285} 3506}
3286 3507
3287unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3508unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3288 gfp_t gfp_mask) 3509 gfp_t gfp_mask,
3510 unsigned long *total_scanned)
3289{ 3511{
3290 unsigned long nr_reclaimed = 0; 3512 unsigned long nr_reclaimed = 0;
3291 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3513 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -3293,6 +3515,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3293 int loop = 0; 3515 int loop = 0;
3294 struct mem_cgroup_tree_per_zone *mctz; 3516 struct mem_cgroup_tree_per_zone *mctz;
3295 unsigned long long excess; 3517 unsigned long long excess;
3518 unsigned long nr_scanned;
3296 3519
3297 if (order > 0) 3520 if (order > 0)
3298 return 0; 3521 return 0;
@@ -3311,10 +3534,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3311 if (!mz) 3534 if (!mz)
3312 break; 3535 break;
3313 3536
3537 nr_scanned = 0;
3314 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3538 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3315 gfp_mask, 3539 gfp_mask,
3316 MEM_CGROUP_RECLAIM_SOFT); 3540 MEM_CGROUP_RECLAIM_SOFT,
3541 &nr_scanned);
3317 nr_reclaimed += reclaimed; 3542 nr_reclaimed += reclaimed;
3543 *total_scanned += nr_scanned;
3318 spin_lock(&mctz->lock); 3544 spin_lock(&mctz->lock);
3319 3545
3320 /* 3546 /*
@@ -3337,10 +3563,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3337 */ 3563 */
3338 next_mz = 3564 next_mz =
3339 __mem_cgroup_largest_soft_limit_node(mctz); 3565 __mem_cgroup_largest_soft_limit_node(mctz);
3340 if (next_mz == mz) { 3566 if (next_mz == mz)
3341 css_put(&next_mz->mem->css); 3567 css_put(&next_mz->mem->css);
3342 next_mz = NULL; 3568 else /* next_mz == NULL or other memcg */
3343 } else /* next_mz == NULL or other memcg */
3344 break; 3569 break;
3345 } while (1); 3570 } while (1);
3346 } 3571 }
@@ -3772,6 +3997,8 @@ enum {
3772 MCS_PGPGIN, 3997 MCS_PGPGIN,
3773 MCS_PGPGOUT, 3998 MCS_PGPGOUT,
3774 MCS_SWAP, 3999 MCS_SWAP,
4000 MCS_PGFAULT,
4001 MCS_PGMAJFAULT,
3775 MCS_INACTIVE_ANON, 4002 MCS_INACTIVE_ANON,
3776 MCS_ACTIVE_ANON, 4003 MCS_ACTIVE_ANON,
3777 MCS_INACTIVE_FILE, 4004 MCS_INACTIVE_FILE,
@@ -3794,6 +4021,8 @@ struct {
3794 {"pgpgin", "total_pgpgin"}, 4021 {"pgpgin", "total_pgpgin"},
3795 {"pgpgout", "total_pgpgout"}, 4022 {"pgpgout", "total_pgpgout"},
3796 {"swap", "total_swap"}, 4023 {"swap", "total_swap"},
4024 {"pgfault", "total_pgfault"},
4025 {"pgmajfault", "total_pgmajfault"},
3797 {"inactive_anon", "total_inactive_anon"}, 4026 {"inactive_anon", "total_inactive_anon"},
3798 {"active_anon", "total_active_anon"}, 4027 {"active_anon", "total_active_anon"},
3799 {"inactive_file", "total_inactive_file"}, 4028 {"inactive_file", "total_inactive_file"},
@@ -3822,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4051 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3823 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4052 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3824 } 4053 }
4054 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4055 s->stat[MCS_PGFAULT] += val;
4056 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4057 s->stat[MCS_PGMAJFAULT] += val;
3825 4058
3826 /* per zone stat */ 4059 /* per zone stat */
3827 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4060 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3845,6 +4078,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3845 mem_cgroup_get_local_stat(iter, s); 4078 mem_cgroup_get_local_stat(iter, s);
3846} 4079}
3847 4080
4081#ifdef CONFIG_NUMA
4082static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4083{
4084 int nid;
4085 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4086 unsigned long node_nr;
4087 struct cgroup *cont = m->private;
4088 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4089
4090 total_nr = mem_cgroup_nr_lru_pages(mem_cont);
4091 seq_printf(m, "total=%lu", total_nr);
4092 for_each_node_state(nid, N_HIGH_MEMORY) {
4093 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
4094 seq_printf(m, " N%d=%lu", nid, node_nr);
4095 }
4096 seq_putc(m, '\n');
4097
4098 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
4099 seq_printf(m, "file=%lu", file_nr);
4100 for_each_node_state(nid, N_HIGH_MEMORY) {
4101 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
4102 seq_printf(m, " N%d=%lu", nid, node_nr);
4103 }
4104 seq_putc(m, '\n');
4105
4106 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
4107 seq_printf(m, "anon=%lu", anon_nr);
4108 for_each_node_state(nid, N_HIGH_MEMORY) {
4109 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
4110 seq_printf(m, " N%d=%lu", nid, node_nr);
4111 }
4112 seq_putc(m, '\n');
4113
4114 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
4115 seq_printf(m, "unevictable=%lu", unevictable_nr);
4116 for_each_node_state(nid, N_HIGH_MEMORY) {
4117 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
4118 nid);
4119 seq_printf(m, " N%d=%lu", nid, node_nr);
4120 }
4121 seq_putc(m, '\n');
4122 return 0;
4123}
4124#endif /* CONFIG_NUMA */
4125
3848static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4126static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3849 struct cgroup_map_cb *cb) 4127 struct cgroup_map_cb *cb)
3850{ 4128{
@@ -3855,6 +4133,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3855 memset(&mystat, 0, sizeof(mystat)); 4133 memset(&mystat, 0, sizeof(mystat));
3856 mem_cgroup_get_local_stat(mem_cont, &mystat); 4134 mem_cgroup_get_local_stat(mem_cont, &mystat);
3857 4135
4136
3858 for (i = 0; i < NR_MCS_STAT; i++) { 4137 for (i = 0; i < NR_MCS_STAT; i++) {
3859 if (i == MCS_SWAP && !do_swap_account) 4138 if (i == MCS_SWAP && !do_swap_account)
3860 continue; 4139 continue;
@@ -4278,6 +4557,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4278 return 0; 4557 return 0;
4279} 4558}
4280 4559
4560#ifdef CONFIG_NUMA
4561static const struct file_operations mem_control_numa_stat_file_operations = {
4562 .read = seq_read,
4563 .llseek = seq_lseek,
4564 .release = single_release,
4565};
4566
4567static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4568{
4569 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4570
4571 file->f_op = &mem_control_numa_stat_file_operations;
4572 return single_open(file, mem_control_numa_stat_show, cont);
4573}
4574#endif /* CONFIG_NUMA */
4575
4281static struct cftype mem_cgroup_files[] = { 4576static struct cftype mem_cgroup_files[] = {
4282 { 4577 {
4283 .name = "usage_in_bytes", 4578 .name = "usage_in_bytes",
@@ -4341,6 +4636,12 @@ static struct cftype mem_cgroup_files[] = {
4341 .unregister_event = mem_cgroup_oom_unregister_event, 4636 .unregister_event = mem_cgroup_oom_unregister_event,
4342 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4637 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4343 }, 4638 },
4639#ifdef CONFIG_NUMA
4640 {
4641 .name = "numa_stat",
4642 .open = mem_control_numa_stat_open,
4643 },
4644#endif
4344}; 4645};
4345 4646
4346#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4647#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4596,6 +4897,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4596 res_counter_init(&mem->memsw, NULL); 4897 res_counter_init(&mem->memsw, NULL);
4597 } 4898 }
4598 mem->last_scanned_child = 0; 4899 mem->last_scanned_child = 0;
4900 mem->last_scanned_node = MAX_NUMNODES;
4599 INIT_LIST_HEAD(&mem->oom_notify); 4901 INIT_LIST_HEAD(&mem->oom_notify);
4600 4902
4601 if (parent) 4903 if (parent)
@@ -4953,8 +5255,7 @@ static void mem_cgroup_clear_mc(void)
4953 5255
4954static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5256static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4955 struct cgroup *cgroup, 5257 struct cgroup *cgroup,
4956 struct task_struct *p, 5258 struct task_struct *p)
4957 bool threadgroup)
4958{ 5259{
4959 int ret = 0; 5260 int ret = 0;
4960 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5261 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4993,8 +5294,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4993 5294
4994static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5295static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4995 struct cgroup *cgroup, 5296 struct cgroup *cgroup,
4996 struct task_struct *p, 5297 struct task_struct *p)
4997 bool threadgroup)
4998{ 5298{
4999 mem_cgroup_clear_mc(); 5299 mem_cgroup_clear_mc();
5000} 5300}
@@ -5112,8 +5412,7 @@ retry:
5112static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5412static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5113 struct cgroup *cont, 5413 struct cgroup *cont,
5114 struct cgroup *old_cont, 5414 struct cgroup *old_cont,
5115 struct task_struct *p, 5415 struct task_struct *p)
5116 bool threadgroup)
5117{ 5416{
5118 struct mm_struct *mm; 5417 struct mm_struct *mm;
5119 5418
@@ -5131,22 +5430,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5131#else /* !CONFIG_MMU */ 5430#else /* !CONFIG_MMU */
5132static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5431static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5133 struct cgroup *cgroup, 5432 struct cgroup *cgroup,
5134 struct task_struct *p, 5433 struct task_struct *p)
5135 bool threadgroup)
5136{ 5434{
5137 return 0; 5435 return 0;
5138} 5436}
5139static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5437static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5140 struct cgroup *cgroup, 5438 struct cgroup *cgroup,
5141 struct task_struct *p, 5439 struct task_struct *p)
5142 bool threadgroup)
5143{ 5440{
5144} 5441}
5145static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5442static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5146 struct cgroup *cont, 5443 struct cgroup *cont,
5147 struct cgroup *old_cont, 5444 struct cgroup *old_cont,
5148 struct task_struct *p, 5445 struct task_struct *p)
5149 bool threadgroup)
5150{ 5446{
5151} 5447}
5152#endif 5448#endif
@@ -5169,19 +5465,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
5169static int __init enable_swap_account(char *s) 5465static int __init enable_swap_account(char *s)
5170{ 5466{
5171 /* consider enabled if no parameter or 1 is given */ 5467 /* consider enabled if no parameter or 1 is given */
5172 if (!(*s) || !strcmp(s, "=1")) 5468 if (!strcmp(s, "1"))
5173 really_do_swap_account = 1; 5469 really_do_swap_account = 1;
5174 else if (!strcmp(s, "=0")) 5470 else if (!strcmp(s, "0"))
5175 really_do_swap_account = 0; 5471 really_do_swap_account = 0;
5176 return 1; 5472 return 1;
5177} 5473}
5178__setup("swapaccount", enable_swap_account); 5474__setup("swapaccount=", enable_swap_account);
5179 5475
5180static int __init disable_swap_account(char *s)
5181{
5182 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5183 enable_swap_account("=0");
5184 return 1;
5185}
5186__setup("noswapaccount", disable_swap_account);
5187#endif 5476#endif
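The memcontrol.c hunks above also tighten the swapaccount boot-parameter handling: the option is now registered as "swapaccount=" and the deprecated "noswapaccount" alias is removed, so the handler only ever sees the bare value after the '='. A minimal userspace sketch of that value parsing, assuming accounting enabled as the default for the sketch; it mirrors enable_swap_account() from the hunk but is not kernel code:

#include <stdio.h>
#include <string.h>

static int really_do_swap_account = 1;	/* assumed default, for this sketch only */

/* Model of enable_swap_account(): "1" enables, "0" disables, anything else is ignored. */
static void parse_swapaccount(const char *val)
{
	if (!strcmp(val, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(val, "0"))
		really_do_swap_account = 0;
}

int main(void)
{
	parse_swapaccount("0");
	printf("swapaccount=0 -> %d\n", really_do_swap_account);
	parse_swapaccount("1");
	printf("swapaccount=1 -> %d\n", really_do_swap_account);
	return 0;
}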
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2b9a5eef39e0..5c8f7e08928d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -239,7 +239,11 @@ void shake_page(struct page *p, int access)
239 if (access) { 239 if (access) {
240 int nr; 240 int nr;
241 do { 241 do {
242 nr = shrink_slab(1000, GFP_KERNEL, 1000); 242 struct shrink_control shrink = {
243 .gfp_mask = GFP_KERNEL,
244 };
245
246 nr = shrink_slab(&shrink, 1000, 1000);
243 if (page_count(p) == 1) 247 if (page_count(p) == 1)
244 break; 248 break;
245 } while (nr > 10); 249 } while (nr > 10);
@@ -429,7 +433,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
429 */ 433 */
430 434
431 read_lock(&tasklist_lock); 435 read_lock(&tasklist_lock);
432 spin_lock(&mapping->i_mmap_lock); 436 mutex_lock(&mapping->i_mmap_mutex);
433 for_each_process(tsk) { 437 for_each_process(tsk) {
434 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 438 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
435 439
@@ -449,7 +453,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
449 add_to_kill(tsk, page, vma, to_kill, tkc); 453 add_to_kill(tsk, page, vma, to_kill, tkc);
450 } 454 }
451 } 455 }
452 spin_unlock(&mapping->i_mmap_lock); 456 mutex_unlock(&mapping->i_mmap_mutex);
453 read_unlock(&tasklist_lock); 457 read_unlock(&tasklist_lock);
454} 458}
455 459
@@ -1440,16 +1444,12 @@ int soft_offline_page(struct page *page, int flags)
1440 */ 1444 */
1441 ret = invalidate_inode_page(page); 1445 ret = invalidate_inode_page(page);
1442 unlock_page(page); 1446 unlock_page(page);
1443
1444 /* 1447 /*
1445 * Drop count because page migration doesn't like raised
1446 * counts. The page could get re-allocated, but if it becomes
1447 * LRU the isolation will just fail.
1448 * RED-PEN would be better to keep it isolated here, but we 1448 * RED-PEN would be better to keep it isolated here, but we
1449 * would need to fix isolation locking first. 1449 * would need to fix isolation locking first.
1450 */ 1450 */
1451 put_page(page);
1452 if (ret == 1) { 1451 if (ret == 1) {
1452 put_page(page);
1453 ret = 0; 1453 ret = 0;
1454 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1454 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1455 goto done; 1455 goto done;
@@ -1461,6 +1461,11 @@ int soft_offline_page(struct page *page, int flags)
1461 * handles a large number of cases for us. 1461 * handles a large number of cases for us.
1462 */ 1462 */
1463 ret = isolate_lru_page(page); 1463 ret = isolate_lru_page(page);
1464 /*
 1465 * Drop the page reference that came from get_any_page();
 1466 * a successful isolate_lru_page() already took another one.
1467 */
1468 put_page(page);
1464 if (!ret) { 1469 if (!ret) {
1465 LIST_HEAD(pagelist); 1470 LIST_HEAD(pagelist);
1466 1471
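The shake_page() hunk above follows the new shrink_slab() calling convention used throughout this series: the gfp mask travels in a struct shrink_control instead of a bare argument. Assembled from the interleaved old/new lines, the resulting loop reads as below (kernel context only; p is the page being shaken, and 1000/1000 are simply the scan counts shake_page() uses):

	int nr;

	do {
		struct shrink_control shrink = {
			.gfp_mask = GFP_KERNEL,
		};

		/* keep nudging the slab shrinkers while they report progress */
		nr = shrink_slab(&shrink, 1000, 1000);
		if (page_count(p) == 1)
			break;
	} while (nr > 10);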
diff --git a/mm/memory.c b/mm/memory.c
index 61e66f026563..6953d3926e01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{ 182{
183 __sync_task_rss_stat(task, mm); 183 __sync_task_rss_stat(task, mm);
184} 184}
185#else 185#else /* SPLIT_RSS_COUNTING */
186 186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -191,8 +191,205 @@ static void check_sync_rss_stat(struct task_struct *task)
191{ 191{
192} 192}
193 193
194#endif /* SPLIT_RSS_COUNTING */
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}
221
222/* tlb_gather_mmu
223 * Called to initialize an (on-stack) mmu_gather structure for page-table
224 * tear-down from @mm. The @fullmm argument is used when @mm is without
225 * users and we're going to destroy the full address space (exit/execve).
226 */
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
194#endif 254#endif
195 255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266/* tlb_finish_mmu
267 * Called at the end of the shootdown operation to free up any resources
268 * that were required.
269 */
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276 /* keep the page table cache within bounds */
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
285
286/* __tlb_remove_page
287 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
288 * handling the additional races in SMP caused by other CPUs caching valid
289 * mappings in their TLBs. Returns the number of free page slots left.
290 * When out of page slots we must call tlb_flush_mmu().
291 */
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 tlb->need_flush = 1;
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1; /* avoid calling tlb_flush_mmu() */
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 }
309 VM_BUG_ON(batch->nr > batch->max);
310
311 return batch->max - batch->nr;
312}
313
314#endif /* HAVE_GENERIC_MMU_GATHER */
315
316#ifdef CONFIG_HAVE_RCU_TABLE_FREE
317
318/*
319 * See the comment near struct mmu_table_batch.
320 */
321
322static void tlb_remove_table_smp_sync(void *arg)
323{
324 /* Simply deliver the interrupt */
325}
326
327static void tlb_remove_table_one(void *table)
328{
329 /*
330 * This isn't an RCU grace period and hence the page-tables cannot be
331 * assumed to be actually RCU-freed.
332 *
333 * It is however sufficient for software page-table walkers that rely on
334 * IRQ disabling. See the comment near struct mmu_table_batch.
335 */
336 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
337 __tlb_remove_table(table);
338}
339
340static void tlb_remove_table_rcu(struct rcu_head *head)
341{
342 struct mmu_table_batch *batch;
343 int i;
344
345 batch = container_of(head, struct mmu_table_batch, rcu);
346
347 for (i = 0; i < batch->nr; i++)
348 __tlb_remove_table(batch->tables[i]);
349
350 free_page((unsigned long)batch);
351}
352
353void tlb_table_flush(struct mmu_gather *tlb)
354{
355 struct mmu_table_batch **batch = &tlb->batch;
356
357 if (*batch) {
358 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
359 *batch = NULL;
360 }
361}
362
363void tlb_remove_table(struct mmu_gather *tlb, void *table)
364{
365 struct mmu_table_batch **batch = &tlb->batch;
366
367 tlb->need_flush = 1;
368
369 /*
370 * When there's less then two users of this mm there cannot be a
371 * concurrent page-table walk.
372 */
373 if (atomic_read(&tlb->mm->mm_users) < 2) {
374 __tlb_remove_table(table);
375 return;
376 }
377
378 if (*batch == NULL) {
379 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
380 if (*batch == NULL) {
381 tlb_remove_table_one(table);
382 return;
383 }
384 (*batch)->nr = 0;
385 }
386 (*batch)->tables[(*batch)->nr++] = table;
387 if ((*batch)->nr == MAX_TABLE_BATCH)
388 tlb_table_flush(tlb);
389}
390
391#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
392
196/* 393/*
197 * If a p?d_bad entry is found while walking page tables, report 394 * If a p?d_bad entry is found while walking page tables, report
198 * the error, before resetting entry to p?d_none. Usually (but 395 * the error, before resetting entry to p?d_none. Usually (but
@@ -533,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
533 add_taint(TAINT_BAD_PAGE); 730 add_taint(TAINT_BAD_PAGE);
534} 731}
535 732
536static inline int is_cow_mapping(unsigned int flags) 733static inline int is_cow_mapping(vm_flags_t flags)
537{ 734{
538 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 735 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
539} 736}
@@ -909,26 +1106,24 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
909static unsigned long zap_pte_range(struct mmu_gather *tlb, 1106static unsigned long zap_pte_range(struct mmu_gather *tlb,
910 struct vm_area_struct *vma, pmd_t *pmd, 1107 struct vm_area_struct *vma, pmd_t *pmd,
911 unsigned long addr, unsigned long end, 1108 unsigned long addr, unsigned long end,
912 long *zap_work, struct zap_details *details) 1109 struct zap_details *details)
913{ 1110{
914 struct mm_struct *mm = tlb->mm; 1111 struct mm_struct *mm = tlb->mm;
915 pte_t *pte; 1112 int force_flush = 0;
916 spinlock_t *ptl;
917 int rss[NR_MM_COUNTERS]; 1113 int rss[NR_MM_COUNTERS];
1114 spinlock_t *ptl;
1115 pte_t *pte;
918 1116
1117again:
919 init_rss_vec(rss); 1118 init_rss_vec(rss);
920
921 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1119 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
922 arch_enter_lazy_mmu_mode(); 1120 arch_enter_lazy_mmu_mode();
923 do { 1121 do {
924 pte_t ptent = *pte; 1122 pte_t ptent = *pte;
925 if (pte_none(ptent)) { 1123 if (pte_none(ptent)) {
926 (*zap_work)--;
927 continue; 1124 continue;
928 } 1125 }
929 1126
930 (*zap_work) -= PAGE_SIZE;
931
932 if (pte_present(ptent)) { 1127 if (pte_present(ptent)) {
933 struct page *page; 1128 struct page *page;
934 1129
@@ -974,7 +1169,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
974 page_remove_rmap(page); 1169 page_remove_rmap(page);
975 if (unlikely(page_mapcount(page) < 0)) 1170 if (unlikely(page_mapcount(page) < 0))
976 print_bad_pte(vma, addr, ptent, page); 1171 print_bad_pte(vma, addr, ptent, page);
977 tlb_remove_page(tlb, page); 1172 force_flush = !__tlb_remove_page(tlb, page);
1173 if (force_flush)
1174 break;
978 continue; 1175 continue;
979 } 1176 }
980 /* 1177 /*
@@ -995,19 +1192,31 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
995 print_bad_pte(vma, addr, ptent, NULL); 1192 print_bad_pte(vma, addr, ptent, NULL);
996 } 1193 }
997 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1194 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
998 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 1195 } while (pte++, addr += PAGE_SIZE, addr != end);
999 1196
1000 add_mm_rss_vec(mm, rss); 1197 add_mm_rss_vec(mm, rss);
1001 arch_leave_lazy_mmu_mode(); 1198 arch_leave_lazy_mmu_mode();
1002 pte_unmap_unlock(pte - 1, ptl); 1199 pte_unmap_unlock(pte - 1, ptl);
1003 1200
1201 /*
1202 * mmu_gather ran out of room to batch pages, we break out of
 1203 * the PTE lock to avoid doing the potentially expensive TLB invalidate
1204 * and page-free while holding it.
1205 */
1206 if (force_flush) {
1207 force_flush = 0;
1208 tlb_flush_mmu(tlb);
1209 if (addr != end)
1210 goto again;
1211 }
1212
1004 return addr; 1213 return addr;
1005} 1214}
1006 1215
1007static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1216static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1008 struct vm_area_struct *vma, pud_t *pud, 1217 struct vm_area_struct *vma, pud_t *pud,
1009 unsigned long addr, unsigned long end, 1218 unsigned long addr, unsigned long end,
1010 long *zap_work, struct zap_details *details) 1219 struct zap_details *details)
1011{ 1220{
1012 pmd_t *pmd; 1221 pmd_t *pmd;
1013 unsigned long next; 1222 unsigned long next;
@@ -1019,19 +1228,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1019 if (next-addr != HPAGE_PMD_SIZE) { 1228 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1229 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd); 1230 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) { 1231 } else if (zap_huge_pmd(tlb, vma, pmd))
1023 (*zap_work)--;
1024 continue; 1232 continue;
1025 }
1026 /* fall through */ 1233 /* fall through */
1027 } 1234 }
1028 if (pmd_none_or_clear_bad(pmd)) { 1235 if (pmd_none_or_clear_bad(pmd))
1029 (*zap_work)--;
1030 continue; 1236 continue;
1031 } 1237 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1032 next = zap_pte_range(tlb, vma, pmd, addr, next, 1238 cond_resched();
1033 zap_work, details); 1239 } while (pmd++, addr = next, addr != end);
1034 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
1035 1240
1036 return addr; 1241 return addr;
1037} 1242}
@@ -1039,7 +1244,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1039static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1244static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1040 struct vm_area_struct *vma, pgd_t *pgd, 1245 struct vm_area_struct *vma, pgd_t *pgd,
1041 unsigned long addr, unsigned long end, 1246 unsigned long addr, unsigned long end,
1042 long *zap_work, struct zap_details *details) 1247 struct zap_details *details)
1043{ 1248{
1044 pud_t *pud; 1249 pud_t *pud;
1045 unsigned long next; 1250 unsigned long next;
@@ -1047,13 +1252,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1047 pud = pud_offset(pgd, addr); 1252 pud = pud_offset(pgd, addr);
1048 do { 1253 do {
1049 next = pud_addr_end(addr, end); 1254 next = pud_addr_end(addr, end);
1050 if (pud_none_or_clear_bad(pud)) { 1255 if (pud_none_or_clear_bad(pud))
1051 (*zap_work)--;
1052 continue; 1256 continue;
1053 } 1257 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1054 next = zap_pmd_range(tlb, vma, pud, addr, next, 1258 } while (pud++, addr = next, addr != end);
1055 zap_work, details);
1056 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1057 1259
1058 return addr; 1260 return addr;
1059} 1261}
@@ -1061,7 +1263,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1061static unsigned long unmap_page_range(struct mmu_gather *tlb, 1263static unsigned long unmap_page_range(struct mmu_gather *tlb,
1062 struct vm_area_struct *vma, 1264 struct vm_area_struct *vma,
1063 unsigned long addr, unsigned long end, 1265 unsigned long addr, unsigned long end,
1064 long *zap_work, struct zap_details *details) 1266 struct zap_details *details)
1065{ 1267{
1066 pgd_t *pgd; 1268 pgd_t *pgd;
1067 unsigned long next; 1269 unsigned long next;
@@ -1075,13 +1277,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1075 pgd = pgd_offset(vma->vm_mm, addr); 1277 pgd = pgd_offset(vma->vm_mm, addr);
1076 do { 1278 do {
1077 next = pgd_addr_end(addr, end); 1279 next = pgd_addr_end(addr, end);
1078 if (pgd_none_or_clear_bad(pgd)) { 1280 if (pgd_none_or_clear_bad(pgd))
1079 (*zap_work)--;
1080 continue; 1281 continue;
1081 } 1282 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1082 next = zap_pud_range(tlb, vma, pgd, addr, next, 1283 } while (pgd++, addr = next, addr != end);
1083 zap_work, details);
1084 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1085 tlb_end_vma(tlb, vma); 1284 tlb_end_vma(tlb, vma);
1086 mem_cgroup_uncharge_end(); 1285 mem_cgroup_uncharge_end();
1087 1286
@@ -1121,17 +1320,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1121 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1320 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1122 * drops the lock and schedules. 1321 * drops the lock and schedules.
1123 */ 1322 */
1124unsigned long unmap_vmas(struct mmu_gather **tlbp, 1323unsigned long unmap_vmas(struct mmu_gather *tlb,
1125 struct vm_area_struct *vma, unsigned long start_addr, 1324 struct vm_area_struct *vma, unsigned long start_addr,
1126 unsigned long end_addr, unsigned long *nr_accounted, 1325 unsigned long end_addr, unsigned long *nr_accounted,
1127 struct zap_details *details) 1326 struct zap_details *details)
1128{ 1327{
1129 long zap_work = ZAP_BLOCK_SIZE;
1130 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1131 int tlb_start_valid = 0;
1132 unsigned long start = start_addr; 1328 unsigned long start = start_addr;
1133 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1134 int fullmm = (*tlbp)->fullmm;
1135 struct mm_struct *mm = vma->vm_mm; 1329 struct mm_struct *mm = vma->vm_mm;
1136 1330
1137 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1331 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1152,11 +1346,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1152 untrack_pfn_vma(vma, 0, 0); 1346 untrack_pfn_vma(vma, 0, 0);
1153 1347
1154 while (start != end) { 1348 while (start != end) {
1155 if (!tlb_start_valid) {
1156 tlb_start = start;
1157 tlb_start_valid = 1;
1158 }
1159
1160 if (unlikely(is_vm_hugetlb_page(vma))) { 1349 if (unlikely(is_vm_hugetlb_page(vma))) {
1161 /* 1350 /*
1162 * It is undesirable to test vma->vm_file as it 1351 * It is undesirable to test vma->vm_file as it
@@ -1169,39 +1358,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1169 * Since no pte has actually been setup, it is 1358 * Since no pte has actually been setup, it is
1170 * safe to do nothing in this case. 1359 * safe to do nothing in this case.
1171 */ 1360 */
1172 if (vma->vm_file) { 1361 if (vma->vm_file)
1173 unmap_hugepage_range(vma, start, end, NULL); 1362 unmap_hugepage_range(vma, start, end, NULL);
1174 zap_work -= (end - start) /
1175 pages_per_huge_page(hstate_vma(vma));
1176 }
1177 1363
1178 start = end; 1364 start = end;
1179 } else 1365 } else
1180 start = unmap_page_range(*tlbp, vma, 1366 start = unmap_page_range(tlb, vma, start, end, details);
1181 start, end, &zap_work, details);
1182
1183 if (zap_work > 0) {
1184 BUG_ON(start != end);
1185 break;
1186 }
1187
1188 tlb_finish_mmu(*tlbp, tlb_start, start);
1189
1190 if (need_resched() ||
1191 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1192 if (i_mmap_lock) {
1193 *tlbp = NULL;
1194 goto out;
1195 }
1196 cond_resched();
1197 }
1198
1199 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1200 tlb_start_valid = 0;
1201 zap_work = ZAP_BLOCK_SIZE;
1202 } 1367 }
1203 } 1368 }
1204out: 1369
1205 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1370 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1206 return start; /* which is now the end (or restart) address */ 1371 return start; /* which is now the end (or restart) address */
1207} 1372}
@@ -1217,16 +1382,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1217 unsigned long size, struct zap_details *details) 1382 unsigned long size, struct zap_details *details)
1218{ 1383{
1219 struct mm_struct *mm = vma->vm_mm; 1384 struct mm_struct *mm = vma->vm_mm;
1220 struct mmu_gather *tlb; 1385 struct mmu_gather tlb;
1221 unsigned long end = address + size; 1386 unsigned long end = address + size;
1222 unsigned long nr_accounted = 0; 1387 unsigned long nr_accounted = 0;
1223 1388
1224 lru_add_drain(); 1389 lru_add_drain();
1225 tlb = tlb_gather_mmu(mm, 0); 1390 tlb_gather_mmu(&tlb, mm, 0);
1226 update_hiwater_rss(mm); 1391 update_hiwater_rss(mm);
1227 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1392 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1228 if (tlb) 1393 tlb_finish_mmu(&tlb, address, end);
1229 tlb_finish_mmu(tlb, address, end);
1230 return end; 1394 return end;
1231} 1395}
1232 1396
@@ -2535,96 +2699,11 @@ unwritable_page:
2535 return ret; 2699 return ret;
2536} 2700}
2537 2701
2538/* 2702static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2539 * Helper functions for unmap_mapping_range().
2540 *
2541 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2542 *
2543 * We have to restart searching the prio_tree whenever we drop the lock,
2544 * since the iterator is only valid while the lock is held, and anyway
2545 * a later vma might be split and reinserted earlier while lock dropped.
2546 *
2547 * The list of nonlinear vmas could be handled more efficiently, using
2548 * a placeholder, but handle it in the same way until a need is shown.
2549 * It is important to search the prio_tree before nonlinear list: a vma
2550 * may become nonlinear and be shifted from prio_tree to nonlinear list
2551 * while the lock is dropped; but never shifted from list to prio_tree.
2552 *
2553 * In order to make forward progress despite restarting the search,
2554 * vm_truncate_count is used to mark a vma as now dealt with, so we can
2555 * quickly skip it next time around. Since the prio_tree search only
2556 * shows us those vmas affected by unmapping the range in question, we
2557 * can't efficiently keep all vmas in step with mapping->truncate_count:
2558 * so instead reset them all whenever it wraps back to 0 (then go to 1).
2559 * mapping->truncate_count and vma->vm_truncate_count are protected by
2560 * i_mmap_lock.
2561 *
2562 * In order to make forward progress despite repeatedly restarting some
2563 * large vma, note the restart_addr from unmap_vmas when it breaks out:
2564 * and restart from that address when we reach that vma again. It might
2565 * have been split or merged, shrunk or extended, but never shifted: so
2566 * restart_addr remains valid so long as it remains in the vma's range.
2567 * unmap_mapping_range forces truncate_count to leap over page-aligned
2568 * values so we can save vma's restart_addr in its truncate_count field.
2569 */
2570#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2571
2572static void reset_vma_truncate_counts(struct address_space *mapping)
2573{
2574 struct vm_area_struct *vma;
2575 struct prio_tree_iter iter;
2576
2577 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2578 vma->vm_truncate_count = 0;
2579 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2580 vma->vm_truncate_count = 0;
2581}
2582
2583static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2584 unsigned long start_addr, unsigned long end_addr, 2703 unsigned long start_addr, unsigned long end_addr,
2585 struct zap_details *details) 2704 struct zap_details *details)
2586{ 2705{
2587 unsigned long restart_addr; 2706 zap_page_range(vma, start_addr, end_addr - start_addr, details);
2588 int need_break;
2589
2590 /*
2591 * files that support invalidating or truncating portions of the
2592 * file from under mmaped areas must have their ->fault function
2593 * return a locked page (and set VM_FAULT_LOCKED in the return).
2594 * This provides synchronisation against concurrent unmapping here.
2595 */
2596
2597again:
2598 restart_addr = vma->vm_truncate_count;
2599 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2600 start_addr = restart_addr;
2601 if (start_addr >= end_addr) {
2602 /* Top of vma has been split off since last time */
2603 vma->vm_truncate_count = details->truncate_count;
2604 return 0;
2605 }
2606 }
2607
2608 restart_addr = zap_page_range(vma, start_addr,
2609 end_addr - start_addr, details);
2610 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2611
2612 if (restart_addr >= end_addr) {
2613 /* We have now completed this vma: mark it so */
2614 vma->vm_truncate_count = details->truncate_count;
2615 if (!need_break)
2616 return 0;
2617 } else {
2618 /* Note restart_addr in vma's truncate_count field */
2619 vma->vm_truncate_count = restart_addr;
2620 if (!need_break)
2621 goto again;
2622 }
2623
2624 spin_unlock(details->i_mmap_lock);
2625 cond_resched();
2626 spin_lock(details->i_mmap_lock);
2627 return -EINTR;
2628} 2707}
2629 2708
2630static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2709static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2634,12 +2713,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2634 struct prio_tree_iter iter; 2713 struct prio_tree_iter iter;
2635 pgoff_t vba, vea, zba, zea; 2714 pgoff_t vba, vea, zba, zea;
2636 2715
2637restart:
2638 vma_prio_tree_foreach(vma, &iter, root, 2716 vma_prio_tree_foreach(vma, &iter, root,
2639 details->first_index, details->last_index) { 2717 details->first_index, details->last_index) {
2640 /* Skip quickly over those we have already dealt with */
2641 if (vma->vm_truncate_count == details->truncate_count)
2642 continue;
2643 2718
2644 vba = vma->vm_pgoff; 2719 vba = vma->vm_pgoff;
2645 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2720 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2651,11 +2726,10 @@ restart:
2651 if (zea > vea) 2726 if (zea > vea)
2652 zea = vea; 2727 zea = vea;
2653 2728
2654 if (unmap_mapping_range_vma(vma, 2729 unmap_mapping_range_vma(vma,
2655 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2730 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2656 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2731 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2657 details) < 0) 2732 details);
2658 goto restart;
2659 } 2733 }
2660} 2734}
2661 2735
@@ -2670,15 +2744,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2670 * across *all* the pages in each nonlinear VMA, not just the pages 2744 * across *all* the pages in each nonlinear VMA, not just the pages
2671 * whose virtual address lies outside the file truncation point. 2745 * whose virtual address lies outside the file truncation point.
2672 */ 2746 */
2673restart:
2674 list_for_each_entry(vma, head, shared.vm_set.list) { 2747 list_for_each_entry(vma, head, shared.vm_set.list) {
2675 /* Skip quickly over those we have already dealt with */
2676 if (vma->vm_truncate_count == details->truncate_count)
2677 continue;
2678 details->nonlinear_vma = vma; 2748 details->nonlinear_vma = vma;
2679 if (unmap_mapping_range_vma(vma, vma->vm_start, 2749 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2680 vma->vm_end, details) < 0)
2681 goto restart;
2682 } 2750 }
2683} 2751}
2684 2752
@@ -2717,26 +2785,14 @@ void unmap_mapping_range(struct address_space *mapping,
2717 details.last_index = hba + hlen - 1; 2785 details.last_index = hba + hlen - 1;
2718 if (details.last_index < details.first_index) 2786 if (details.last_index < details.first_index)
2719 details.last_index = ULONG_MAX; 2787 details.last_index = ULONG_MAX;
2720 details.i_mmap_lock = &mapping->i_mmap_lock;
2721 2788
2722 mutex_lock(&mapping->unmap_mutex);
2723 spin_lock(&mapping->i_mmap_lock);
2724
2725 /* Protect against endless unmapping loops */
2726 mapping->truncate_count++;
2727 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2728 if (mapping->truncate_count == 0)
2729 reset_vma_truncate_counts(mapping);
2730 mapping->truncate_count++;
2731 }
2732 details.truncate_count = mapping->truncate_count;
2733 2789
2790 mutex_lock(&mapping->i_mmap_mutex);
2734 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2791 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2735 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2792 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2736 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2793 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2737 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2794 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2738 spin_unlock(&mapping->i_mmap_lock); 2795 mutex_unlock(&mapping->i_mmap_mutex);
2739 mutex_unlock(&mapping->unmap_mutex);
2740} 2796}
2741EXPORT_SYMBOL(unmap_mapping_range); 2797EXPORT_SYMBOL(unmap_mapping_range);
2742 2798
@@ -2818,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2818 /* Had to read the page from swap area: Major fault */ 2874 /* Had to read the page from swap area: Major fault */
2819 ret = VM_FAULT_MAJOR; 2875 ret = VM_FAULT_MAJOR;
2820 count_vm_event(PGMAJFAULT); 2876 count_vm_event(PGMAJFAULT);
2877 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2821 } else if (PageHWPoison(page)) { 2878 } else if (PageHWPoison(page)) {
2822 /* 2879 /*
2823 * hwpoisoned dirty swapcache pages are kept for killing 2880 * hwpoisoned dirty swapcache pages are kept for killing
@@ -2966,7 +3023,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2966 if (prev && prev->vm_end == address) 3023 if (prev && prev->vm_end == address)
2967 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 3024 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2968 3025
2969 expand_stack(vma, address - PAGE_SIZE); 3026 expand_downwards(vma, address - PAGE_SIZE);
2970 } 3027 }
2971 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 3028 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2972 struct vm_area_struct *next = vma->vm_next; 3029 struct vm_area_struct *next = vma->vm_next;
@@ -3357,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3357 __set_current_state(TASK_RUNNING); 3414 __set_current_state(TASK_RUNNING);
3358 3415
3359 count_vm_event(PGFAULT); 3416 count_vm_event(PGFAULT);
3417 mem_cgroup_count_vm_event(mm, PGFAULT);
3360 3418
3361 /* do counter updates before entering really critical section. */ 3419 /* do counter updates before entering really critical section. */
3362 check_sync_rss_stat(current); 3420 check_sync_rss_stat(current);
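The memory.c changes above convert mmu_gather from a per-CPU structure handed back by tlb_gather_mmu() into a caller-owned, usually on-stack structure, and drop the ZAP_BLOCK_SIZE/zap_work batching in favour of __tlb_remove_page() telling the caller when the gather is full. The new calling convention, assembled from the rewritten zap_page_range() in the hunks above (kernel context only, with mm, vma, address, end and details as in that function):

	struct mmu_gather tlb;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not tearing down the whole mm */
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	tlb_finish_mmu(&tlb, address, end);	/* flush TLBs, free gathered pages and batches */

Because the structure lives on the caller's stack, there is no longer a per-CPU gather to hand back and re-acquire around reschedule points, which is what lets unmap_vmas() shed its tlb_start/zap_work bookkeeping above.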
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9ca1d604f7cd..9f646374e32f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -374,10 +374,6 @@ void online_page(struct page *page)
374 totalhigh_pages++; 374 totalhigh_pages++;
375#endif 375#endif
376 376
377#ifdef CONFIG_FLATMEM
378 max_mapnr = max(pfn, max_mapnr);
379#endif
380
381 ClearPageReserved(page); 377 ClearPageReserved(page);
382 init_page_count(page); 378 init_page_count(page);
383 __free_page(page); 379 __free_page(page);
@@ -400,7 +396,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
400} 396}
401 397
402 398
403int online_pages(unsigned long pfn, unsigned long nr_pages) 399int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
404{ 400{
405 unsigned long onlined_pages = 0; 401 unsigned long onlined_pages = 0;
406 struct zone *zone; 402 struct zone *zone;
@@ -459,8 +455,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
459 zone_pcp_update(zone); 455 zone_pcp_update(zone);
460 456
461 mutex_unlock(&zonelists_mutex); 457 mutex_unlock(&zonelists_mutex);
462 setup_per_zone_wmarks(); 458
463 calculate_zone_inactive_ratio(zone); 459 init_per_zone_wmark_min();
460
464 if (onlined_pages) { 461 if (onlined_pages) {
465 kswapd_run(zone_to_nid(zone)); 462 kswapd_run(zone_to_nid(zone));
466 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 463 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -705,7 +702,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
705 if (!pfn_valid(pfn)) 702 if (!pfn_valid(pfn))
706 continue; 703 continue;
707 page = pfn_to_page(pfn); 704 page = pfn_to_page(pfn);
708 if (!page_count(page)) 705 if (!get_page_unless_zero(page))
709 continue; 706 continue;
710 /* 707 /*
711 * We can skip free pages. And we can only deal with pages on 708 * We can skip free pages. And we can only deal with pages on
@@ -713,6 +710,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
713 */ 710 */
714 ret = isolate_lru_page(page); 711 ret = isolate_lru_page(page);
715 if (!ret) { /* Success */ 712 if (!ret) { /* Success */
713 put_page(page);
716 list_add_tail(&page->lru, &source); 714 list_add_tail(&page->lru, &source);
717 move_pages--; 715 move_pages--;
718 inc_zone_page_state(page, NR_ISOLATED_ANON + 716 inc_zone_page_state(page, NR_ISOLATED_ANON +
@@ -724,6 +722,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
724 pfn); 722 pfn);
725 dump_page(page); 723 dump_page(page);
726#endif 724#endif
725 put_page(page);
727 /* Because we don't have big zone->lock. we should 726 /* Because we don't have big zone->lock. we should
728 check this again here. */ 727 check this again here. */
729 if (page_count(page)) { 728 if (page_count(page)) {
@@ -795,7 +794,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
795 return offlined; 794 return offlined;
796} 795}
797 796
798static int offline_pages(unsigned long start_pfn, 797static int __ref offline_pages(unsigned long start_pfn,
799 unsigned long end_pfn, unsigned long timeout) 798 unsigned long end_pfn, unsigned long timeout)
800{ 799{
801 unsigned long pfn, nr_pages, expire; 800 unsigned long pfn, nr_pages, expire;
@@ -893,8 +892,8 @@ repeat:
893 zone->zone_pgdat->node_present_pages -= offlined_pages; 892 zone->zone_pgdat->node_present_pages -= offlined_pages;
894 totalram_pages -= offlined_pages; 893 totalram_pages -= offlined_pages;
895 894
896 setup_per_zone_wmarks(); 895 init_per_zone_wmark_min();
897 calculate_zone_inactive_ratio(zone); 896
898 if (!node_present_pages(node)) { 897 if (!node_present_pages(node)) {
899 node_clear_state(node, N_HIGH_MEMORY); 898 node_clear_state(node, N_HIGH_MEMORY);
900 kswapd_stop(node); 899 kswapd_stop(node);
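The do_migrate_range() hunks above change the page pinning protocol: instead of peeking at page_count(), the walker now takes its own reference with get_page_unless_zero() and drops it once isolate_lru_page() has either taken the LRU reference (success) or the page turned out not to be isolatable. Assembled from the new lines, the per-pfn body looks roughly like this (kernel context; the elided parts are unchanged in the hunk):

	if (!pfn_valid(pfn))
		continue;
	page = pfn_to_page(pfn);
	if (!get_page_unless_zero(page))	/* skip free pages, pin everything else */
		continue;
	ret = isolate_lru_page(page);
	if (!ret) {
		put_page(page);			/* LRU isolation now holds its own reference */
		list_add_tail(&page->lru, &source);
		/* ... accounting ... */
	} else {
		put_page(page);			/* drop our pin before the retry checks */
		/* ... not-isolated handling ... */
	}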
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 959a8b8c7350..e7fb9d25c54e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -99,7 +99,6 @@
99/* Internal flags */ 99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
102#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
103 102
104static struct kmem_cache *policy_cache; 103static struct kmem_cache *policy_cache;
105static struct kmem_cache *sn_cache; 104static struct kmem_cache *sn_cache;
@@ -457,7 +456,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
457 }, 456 },
458}; 457};
459 458
460static void gather_stats(struct page *, void *, int pte_dirty);
461static void migrate_page_add(struct page *page, struct list_head *pagelist, 459static void migrate_page_add(struct page *page, struct list_head *pagelist,
462 unsigned long flags); 460 unsigned long flags);
463 461
@@ -492,9 +490,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
492 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 490 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493 continue; 491 continue;
494 492
495 if (flags & MPOL_MF_STATS) 493 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
496 gather_stats(page, private, pte_dirty(*pte));
497 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498 migrate_page_add(page, private, flags); 494 migrate_page_add(page, private, flags);
499 else 495 else
500 break; 496 break;
@@ -1489,7 +1485,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1489 * freeing by another task. It is the caller's responsibility to free the 1485 * freeing by another task. It is the caller's responsibility to free the
1490 * extra reference for shared policies. 1486 * extra reference for shared policies.
1491 */ 1487 */
1492static struct mempolicy *get_vma_policy(struct task_struct *task, 1488struct mempolicy *get_vma_policy(struct task_struct *task,
1493 struct vm_area_struct *vma, unsigned long addr) 1489 struct vm_area_struct *vma, unsigned long addr)
1494{ 1490{
1495 struct mempolicy *pol = task->mempolicy; 1491 struct mempolicy *pol = task->mempolicy;
@@ -2529,159 +2525,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2529 } 2525 }
2530 return p - buffer; 2526 return p - buffer;
2531} 2527}
2532
2533struct numa_maps {
2534 unsigned long pages;
2535 unsigned long anon;
2536 unsigned long active;
2537 unsigned long writeback;
2538 unsigned long mapcount_max;
2539 unsigned long dirty;
2540 unsigned long swapcache;
2541 unsigned long node[MAX_NUMNODES];
2542};
2543
2544static void gather_stats(struct page *page, void *private, int pte_dirty)
2545{
2546 struct numa_maps *md = private;
2547 int count = page_mapcount(page);
2548
2549 md->pages++;
2550 if (pte_dirty || PageDirty(page))
2551 md->dirty++;
2552
2553 if (PageSwapCache(page))
2554 md->swapcache++;
2555
2556 if (PageActive(page) || PageUnevictable(page))
2557 md->active++;
2558
2559 if (PageWriteback(page))
2560 md->writeback++;
2561
2562 if (PageAnon(page))
2563 md->anon++;
2564
2565 if (count > md->mapcount_max)
2566 md->mapcount_max = count;
2567
2568 md->node[page_to_nid(page)]++;
2569}
2570
2571#ifdef CONFIG_HUGETLB_PAGE
2572static void check_huge_range(struct vm_area_struct *vma,
2573 unsigned long start, unsigned long end,
2574 struct numa_maps *md)
2575{
2576 unsigned long addr;
2577 struct page *page;
2578 struct hstate *h = hstate_vma(vma);
2579 unsigned long sz = huge_page_size(h);
2580
2581 for (addr = start; addr < end; addr += sz) {
2582 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2583 addr & huge_page_mask(h));
2584 pte_t pte;
2585
2586 if (!ptep)
2587 continue;
2588
2589 pte = *ptep;
2590 if (pte_none(pte))
2591 continue;
2592
2593 page = pte_page(pte);
2594 if (!page)
2595 continue;
2596
2597 gather_stats(page, md, pte_dirty(*ptep));
2598 }
2599}
2600#else
2601static inline void check_huge_range(struct vm_area_struct *vma,
2602 unsigned long start, unsigned long end,
2603 struct numa_maps *md)
2604{
2605}
2606#endif
2607
2608/*
2609 * Display pages allocated per node and memory policy via /proc.
2610 */
2611int show_numa_map(struct seq_file *m, void *v)
2612{
2613 struct proc_maps_private *priv = m->private;
2614 struct vm_area_struct *vma = v;
2615 struct numa_maps *md;
2616 struct file *file = vma->vm_file;
2617 struct mm_struct *mm = vma->vm_mm;
2618 struct mempolicy *pol;
2619 int n;
2620 char buffer[50];
2621
2622 if (!mm)
2623 return 0;
2624
2625 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2626 if (!md)
2627 return 0;
2628
2629 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2630 mpol_to_str(buffer, sizeof(buffer), pol, 0);
2631 mpol_cond_put(pol);
2632
2633 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2634
2635 if (file) {
2636 seq_printf(m, " file=");
2637 seq_path(m, &file->f_path, "\n\t= ");
2638 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2639 seq_printf(m, " heap");
2640 } else if (vma->vm_start <= mm->start_stack &&
2641 vma->vm_end >= mm->start_stack) {
2642 seq_printf(m, " stack");
2643 }
2644
2645 if (is_vm_hugetlb_page(vma)) {
2646 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2647 seq_printf(m, " huge");
2648 } else {
2649 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2650 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2651 }
2652
2653 if (!md->pages)
2654 goto out;
2655
2656 if (md->anon)
2657 seq_printf(m," anon=%lu",md->anon);
2658
2659 if (md->dirty)
2660 seq_printf(m," dirty=%lu",md->dirty);
2661
2662 if (md->pages != md->anon && md->pages != md->dirty)
2663 seq_printf(m, " mapped=%lu", md->pages);
2664
2665 if (md->mapcount_max > 1)
2666 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2667
2668 if (md->swapcache)
2669 seq_printf(m," swapcache=%lu", md->swapcache);
2670
2671 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2672 seq_printf(m," active=%lu", md->active);
2673
2674 if (md->writeback)
2675 seq_printf(m," writeback=%lu", md->writeback);
2676
2677 for_each_node_state(n, N_HIGH_MEMORY)
2678 if (md->node[n])
2679 seq_printf(m, " N%d=%lu", n, md->node[n]);
2680out:
2681 seq_putc(m, '\n');
2682 kfree(md);
2683
2684 if (m->count < m->size)
2685 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2686 return 0;
2687}
diff --git a/mm/migrate.c b/mm/migrate.c
index 34132f8e9109..e4a5c912983d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -721,15 +721,11 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
721 * Only page_lock_anon_vma() understands the subtleties of 721 * Only page_lock_anon_vma() understands the subtleties of
722 * getting a hold on an anon_vma from outside one of its mms. 722 * getting a hold on an anon_vma from outside one of its mms.
723 */ 723 */
724 anon_vma = page_lock_anon_vma(page); 724 anon_vma = page_get_anon_vma(page);
725 if (anon_vma) { 725 if (anon_vma) {
726 /* 726 /*
727 * Take a reference count on the anon_vma if the 727 * Anon page
728 * page is mapped so that it is guaranteed to
729 * exist when the page is remapped later
730 */ 728 */
731 get_anon_vma(anon_vma);
732 page_unlock_anon_vma(anon_vma);
733 } else if (PageSwapCache(page)) { 729 } else if (PageSwapCache(page)) {
734 /* 730 /*
735 * We cannot be sure that the anon_vma of an unmapped 731 * We cannot be sure that the anon_vma of an unmapped
@@ -857,13 +853,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
857 lock_page(hpage); 853 lock_page(hpage);
858 } 854 }
859 855
860 if (PageAnon(hpage)) { 856 if (PageAnon(hpage))
861 anon_vma = page_lock_anon_vma(hpage); 857 anon_vma = page_get_anon_vma(hpage);
862 if (anon_vma) {
863 get_anon_vma(anon_vma);
864 page_unlock_anon_vma(anon_vma);
865 }
866 }
867 858
868 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); 859 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
869 860
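The migrate.c hunks above replace the lock/get/unlock dance around page_lock_anon_vma() with the new page_get_anon_vma() helper, which returns an anon_vma with a reference already held (or NULL). Side by side, the two patterns as they appear in the diff:

	/* before */
	anon_vma = page_lock_anon_vma(page);
	if (anon_vma) {
		get_anon_vma(anon_vma);
		page_unlock_anon_vma(anon_vma);
	}

	/* after */
	anon_vma = page_get_anon_vma(page);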
diff --git a/mm/mlock.c b/mm/mlock.c
index 516b2c2ddd5a..048260c4e02e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
307 * For vmas that pass the filters, merge/split as appropriate. 307 * For vmas that pass the filters, merge/split as appropriate.
308 */ 308 */
309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
310 unsigned long start, unsigned long end, unsigned int newflags) 310 unsigned long start, unsigned long end, vm_flags_t newflags)
311{ 311{
312 struct mm_struct *mm = vma->vm_mm; 312 struct mm_struct *mm = vma->vm_mm;
313 pgoff_t pgoff; 313 pgoff_t pgoff;
314 int nr_pages; 314 int nr_pages;
315 int ret = 0; 315 int ret = 0;
316 int lock = newflags & VM_LOCKED; 316 int lock = !!(newflags & VM_LOCKED);
317 317
318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || 318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) 319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
@@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
385 prev = vma; 385 prev = vma;
386 386
387 for (nstart = start ; ; ) { 387 for (nstart = start ; ; ) {
388 unsigned int newflags; 388 vm_flags_t newflags;
389 389
390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
391 391
@@ -524,7 +524,7 @@ static int do_mlockall(int flags)
524 goto out; 524 goto out;
525 525
526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
527 unsigned int newflags; 527 vm_flags_t newflags;
528 528
529 newflags = vma->vm_flags | VM_LOCKED; 529 newflags = vma->vm_flags | VM_LOCKED;
530 if (!(flags & MCL_CURRENT)) 530 if (!(flags & MCL_CURRENT))
diff --git a/mm/mmap.c b/mm/mmap.c
index 772140c53ab1..bbdc9af5e117 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -84,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
84} 84}
85EXPORT_SYMBOL(vm_get_page_prot); 85EXPORT_SYMBOL(vm_get_page_prot);
86 86
87int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
88int sysctl_overcommit_ratio = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
90struct percpu_counter vm_committed_as; 90/*
 91 * Make sure vm_committed_as is in its own cacheline and not shared with
92 * other variables. It can be updated by several CPUs frequently.
93 */
94struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
91 95
92/* 96/*
93 * Check that a process has enough memory to allocate a new virtual 97 * Check that a process has enough memory to allocate a new virtual
@@ -190,7 +194,7 @@ error:
190} 194}
191 195
192/* 196/*
193 * Requires inode->i_mapping->i_mmap_lock 197 * Requires inode->i_mapping->i_mmap_mutex
194 */ 198 */
195static void __remove_shared_vm_struct(struct vm_area_struct *vma, 199static void __remove_shared_vm_struct(struct vm_area_struct *vma,
196 struct file *file, struct address_space *mapping) 200 struct file *file, struct address_space *mapping)
@@ -218,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
218 222
219 if (file) { 223 if (file) {
220 struct address_space *mapping = file->f_mapping; 224 struct address_space *mapping = file->f_mapping;
221 spin_lock(&mapping->i_mmap_lock); 225 mutex_lock(&mapping->i_mmap_mutex);
222 __remove_shared_vm_struct(vma, file, mapping); 226 __remove_shared_vm_struct(vma, file, mapping);
223 spin_unlock(&mapping->i_mmap_lock); 227 mutex_unlock(&mapping->i_mmap_mutex);
224 } 228 }
225} 229}
226 230
@@ -394,29 +398,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
394 return vma; 398 return vma;
395} 399}
396 400
397static inline void
398__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
399 struct vm_area_struct *prev, struct rb_node *rb_parent)
400{
401 struct vm_area_struct *next;
402
403 vma->vm_prev = prev;
404 if (prev) {
405 next = prev->vm_next;
406 prev->vm_next = vma;
407 } else {
408 mm->mmap = vma;
409 if (rb_parent)
410 next = rb_entry(rb_parent,
411 struct vm_area_struct, vm_rb);
412 else
413 next = NULL;
414 }
415 vma->vm_next = next;
416 if (next)
417 next->vm_prev = vma;
418}
419
420void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 401void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
421 struct rb_node **rb_link, struct rb_node *rb_parent) 402 struct rb_node **rb_link, struct rb_node *rb_parent)
422{ 403{
@@ -464,16 +445,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
464 if (vma->vm_file) 445 if (vma->vm_file)
465 mapping = vma->vm_file->f_mapping; 446 mapping = vma->vm_file->f_mapping;
466 447
467 if (mapping) { 448 if (mapping)
468 spin_lock(&mapping->i_mmap_lock); 449 mutex_lock(&mapping->i_mmap_mutex);
469 vma->vm_truncate_count = mapping->truncate_count;
470 }
471 450
472 __vma_link(mm, vma, prev, rb_link, rb_parent); 451 __vma_link(mm, vma, prev, rb_link, rb_parent);
473 __vma_link_file(vma); 452 __vma_link_file(vma);
474 453
475 if (mapping) 454 if (mapping)
476 spin_unlock(&mapping->i_mmap_lock); 455 mutex_unlock(&mapping->i_mmap_mutex);
477 456
478 mm->map_count++; 457 mm->map_count++;
479 validate_mm(mm); 458 validate_mm(mm);
@@ -576,17 +555,8 @@ again: remove_next = 1 + (end > next->vm_end);
576 mapping = file->f_mapping; 555 mapping = file->f_mapping;
577 if (!(vma->vm_flags & VM_NONLINEAR)) 556 if (!(vma->vm_flags & VM_NONLINEAR))
578 root = &mapping->i_mmap; 557 root = &mapping->i_mmap;
579 spin_lock(&mapping->i_mmap_lock); 558 mutex_lock(&mapping->i_mmap_mutex);
580 if (importer &&
581 vma->vm_truncate_count != next->vm_truncate_count) {
582 /*
583 * unmap_mapping_range might be in progress:
584 * ensure that the expanding vma is rescanned.
585 */
586 importer->vm_truncate_count = 0;
587 }
588 if (insert) { 559 if (insert) {
589 insert->vm_truncate_count = vma->vm_truncate_count;
590 /* 560 /*
591 * Put into prio_tree now, so instantiated pages 561 * Put into prio_tree now, so instantiated pages
592 * are visible to arm/parisc __flush_dcache_page 562 * are visible to arm/parisc __flush_dcache_page
@@ -605,7 +575,7 @@ again: remove_next = 1 + (end > next->vm_end);
605 * lock may be shared between many sibling processes. Skipping 575 * lock may be shared between many sibling processes. Skipping
606 * the lock for brk adjustments makes a difference sometimes. 576 * the lock for brk adjustments makes a difference sometimes.
607 */ 577 */
608 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { 578 if (vma->anon_vma && (importer || start != vma->vm_start)) {
609 anon_vma = vma->anon_vma; 579 anon_vma = vma->anon_vma;
610 anon_vma_lock(anon_vma); 580 anon_vma_lock(anon_vma);
611 } 581 }
@@ -652,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end);
652 if (anon_vma) 622 if (anon_vma)
653 anon_vma_unlock(anon_vma); 623 anon_vma_unlock(anon_vma);
654 if (mapping) 624 if (mapping)
655 spin_unlock(&mapping->i_mmap_lock); 625 mutex_unlock(&mapping->i_mmap_mutex);
656 626
657 if (remove_next) { 627 if (remove_next) {
658 if (file) { 628 if (file) {
@@ -699,9 +669,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
699} 669}
700 670
701static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 671static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
702 struct anon_vma *anon_vma2) 672 struct anon_vma *anon_vma2,
673 struct vm_area_struct *vma)
703{ 674{
704 return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); 675 /*
 676 * The list_is_singular() test is to avoid merging VMAs cloned from
 677 * parents, which improves scalability by reducing anon_vma lock contention.
678 */
679 if ((!anon_vma1 || !anon_vma2) && (!vma ||
680 list_is_singular(&vma->anon_vma_chain)))
681 return 1;
682 return anon_vma1 == anon_vma2;
705} 683}
706 684
707/* 685/*
@@ -720,7 +698,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
720 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 698 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
721{ 699{
722 if (is_mergeable_vma(vma, file, vm_flags) && 700 if (is_mergeable_vma(vma, file, vm_flags) &&
723 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 701 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
724 if (vma->vm_pgoff == vm_pgoff) 702 if (vma->vm_pgoff == vm_pgoff)
725 return 1; 703 return 1;
726 } 704 }
@@ -739,7 +717,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
739 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 717 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
740{ 718{
741 if (is_mergeable_vma(vma, file, vm_flags) && 719 if (is_mergeable_vma(vma, file, vm_flags) &&
742 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 720 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
743 pgoff_t vm_pglen; 721 pgoff_t vm_pglen;
744 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 722 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
745 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 723 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -817,7 +795,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
817 can_vma_merge_before(next, vm_flags, 795 can_vma_merge_before(next, vm_flags,
818 anon_vma, file, pgoff+pglen) && 796 anon_vma, file, pgoff+pglen) &&
819 is_mergeable_anon_vma(prev->anon_vma, 797 is_mergeable_anon_vma(prev->anon_vma,
820 next->anon_vma)) { 798 next->anon_vma, NULL)) {
821 /* cases 1, 6 */ 799 /* cases 1, 6 */
822 err = vma_adjust(prev, prev->vm_start, 800 err = vma_adjust(prev, prev->vm_start,
823 next->vm_end, prev->vm_pgoff, NULL); 801 next->vm_end, prev->vm_pgoff, NULL);
@@ -982,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
982{ 960{
983 struct mm_struct * mm = current->mm; 961 struct mm_struct * mm = current->mm;
984 struct inode *inode; 962 struct inode *inode;
985 unsigned int vm_flags; 963 vm_flags_t vm_flags;
986 int error; 964 int error;
987 unsigned long reqprot = prot; 965 unsigned long reqprot = prot;
988 966
@@ -1187,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1187 */ 1165 */
1188int vma_wants_writenotify(struct vm_area_struct *vma) 1166int vma_wants_writenotify(struct vm_area_struct *vma)
1189{ 1167{
1190 unsigned int vm_flags = vma->vm_flags; 1168 vm_flags_t vm_flags = vma->vm_flags;
1191 1169
1192 /* If it was private or non-writable, the write bit is already clear */ 1170 /* If it was private or non-writable, the write bit is already clear */
1193 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1171 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1215,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1215 * We account for memory if it's a private writeable mapping, 1193 * We account for memory if it's a private writeable mapping,
1216 * not hugepages and VM_NORESERVE wasn't set. 1194 * not hugepages and VM_NORESERVE wasn't set.
1217 */ 1195 */
1218static inline int accountable_mapping(struct file *file, unsigned int vm_flags) 1196static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1219{ 1197{
1220 /* 1198 /*
1221 * hugetlb has its own accounting separate from the core VM 1199 * hugetlb has its own accounting separate from the core VM
@@ -1229,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1229 1207
1230unsigned long mmap_region(struct file *file, unsigned long addr, 1208unsigned long mmap_region(struct file *file, unsigned long addr,
1231 unsigned long len, unsigned long flags, 1209 unsigned long len, unsigned long flags,
1232 unsigned int vm_flags, unsigned long pgoff) 1210 vm_flags_t vm_flags, unsigned long pgoff)
1233{ 1211{
1234 struct mm_struct *mm = current->mm; 1212 struct mm_struct *mm = current->mm;
1235 struct vm_area_struct *vma, *prev; 1213 struct vm_area_struct *vma, *prev;
@@ -1785,7 +1763,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1785/* 1763/*
1786 * vma is the first one with address < vma->vm_start. Have to extend vma. 1764 * vma is the first one with address < vma->vm_start. Have to extend vma.
1787 */ 1765 */
1788static int expand_downwards(struct vm_area_struct *vma, 1766int expand_downwards(struct vm_area_struct *vma,
1789 unsigned long address) 1767 unsigned long address)
1790{ 1768{
1791 int error; 1769 int error;
@@ -1832,11 +1810,6 @@ static int expand_downwards(struct vm_area_struct *vma,
1832 return error; 1810 return error;
1833} 1811}
1834 1812
1835int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
1836{
1837 return expand_downwards(vma, address);
1838}
1839
1840#ifdef CONFIG_STACK_GROWSUP 1813#ifdef CONFIG_STACK_GROWSUP
1841int expand_stack(struct vm_area_struct *vma, unsigned long address) 1814int expand_stack(struct vm_area_struct *vma, unsigned long address)
1842{ 1815{
@@ -1919,17 +1892,17 @@ static void unmap_region(struct mm_struct *mm,
1919 unsigned long start, unsigned long end) 1892 unsigned long start, unsigned long end)
1920{ 1893{
1921 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1894 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1922 struct mmu_gather *tlb; 1895 struct mmu_gather tlb;
1923 unsigned long nr_accounted = 0; 1896 unsigned long nr_accounted = 0;
1924 1897
1925 lru_add_drain(); 1898 lru_add_drain();
1926 tlb = tlb_gather_mmu(mm, 0); 1899 tlb_gather_mmu(&tlb, mm, 0);
1927 update_hiwater_rss(mm); 1900 update_hiwater_rss(mm);
1928 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1901 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1929 vm_unacct_memory(nr_accounted); 1902 vm_unacct_memory(nr_accounted);
1930 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1903 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1931 next? next->vm_start: 0); 1904 next ? next->vm_start : 0);
1932 tlb_finish_mmu(tlb, start, end); 1905 tlb_finish_mmu(&tlb, start, end);
1933} 1906}
1934 1907
1935/* 1908/*
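The unmap_region() hunk above (and the exit_mmap() hunks that follow) move the mmu_gather from a pointer handed back by tlb_gather_mmu() to a structure on the caller's stack, so every gather/flush call now takes &tlb. A minimal sketch of the new calling convention, using only the calls visible in this hunk; the function name is invented and the real VMA walk and page-table freeing are elided:

/* Sketch of the on-stack mmu_gather pattern introduced above. */
static void teardown_range(struct mm_struct *mm, struct vm_area_struct *vma,
                           unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;            /* lives on the caller's stack now */
        unsigned long nr_accounted = 0;

        tlb_gather_mmu(&tlb, mm, 0);      /* 0: not a full-mm teardown */
        unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
        tlb_finish_mmu(&tlb, start, end); /* flush TLBs, free gathered pages */
}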
@@ -2271,7 +2244,7 @@ EXPORT_SYMBOL(do_brk);
2271/* Release all mmaps. */ 2244/* Release all mmaps. */
2272void exit_mmap(struct mm_struct *mm) 2245void exit_mmap(struct mm_struct *mm)
2273{ 2246{
2274 struct mmu_gather *tlb; 2247 struct mmu_gather tlb;
2275 struct vm_area_struct *vma; 2248 struct vm_area_struct *vma;
2276 unsigned long nr_accounted = 0; 2249 unsigned long nr_accounted = 0;
2277 unsigned long end; 2250 unsigned long end;
@@ -2296,14 +2269,14 @@ void exit_mmap(struct mm_struct *mm)
2296 2269
2297 lru_add_drain(); 2270 lru_add_drain();
2298 flush_cache_mm(mm); 2271 flush_cache_mm(mm);
2299 tlb = tlb_gather_mmu(mm, 1); 2272 tlb_gather_mmu(&tlb, mm, 1);
2300 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2273 /* update_hiwater_rss(mm) here? but nobody should be looking */
2301 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2274 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2302 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2275 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2303 vm_unacct_memory(nr_accounted); 2276 vm_unacct_memory(nr_accounted);
2304 2277
2305 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2278 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2306 tlb_finish_mmu(tlb, 0, end); 2279 tlb_finish_mmu(&tlb, 0, end);
2307 2280
2308 /* 2281 /*
2309 * Walk the list again, actually closing and freeing it, 2282 * Walk the list again, actually closing and freeing it,
@@ -2317,7 +2290,7 @@ void exit_mmap(struct mm_struct *mm)
2317 2290
2318/* Insert vm structure into process list sorted by address 2291/* Insert vm structure into process list sorted by address
2319 * and into the inode's i_mmap tree. If vm_file is non-NULL 2292 * and into the inode's i_mmap tree. If vm_file is non-NULL
2320 * then i_mmap_lock is taken here. 2293 * then i_mmap_mutex is taken here.
2321 */ 2294 */
2322int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2295int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2323{ 2296{
@@ -2529,15 +2502,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2529 * The LSB of head.next can't change from under us 2502 * The LSB of head.next can't change from under us
2530 * because we hold the mm_all_locks_mutex. 2503 * because we hold the mm_all_locks_mutex.
2531 */ 2504 */
2532 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); 2505 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
2533 /* 2506 /*
2534 * We can safely modify head.next after taking the 2507 * We can safely modify head.next after taking the
2535 * anon_vma->root->lock. If some other vma in this mm shares 2508 * anon_vma->root->mutex. If some other vma in this mm shares
2536 * the same anon_vma we won't take it again. 2509 * the same anon_vma we won't take it again.
2537 * 2510 *
2538 * No need of atomic instructions here, head.next 2511 * No need of atomic instructions here, head.next
2539 * can't change from under us thanks to the 2512 * can't change from under us thanks to the
2540 * anon_vma->root->lock. 2513 * anon_vma->root->mutex.
2541 */ 2514 */
2542 if (__test_and_set_bit(0, (unsigned long *) 2515 if (__test_and_set_bit(0, (unsigned long *)
2543 &anon_vma->root->head.next)) 2516 &anon_vma->root->head.next))
@@ -2559,7 +2532,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2559 */ 2532 */
2560 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2533 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2561 BUG(); 2534 BUG();
2562 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); 2535 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
2563 } 2536 }
2564} 2537}
2565 2538
@@ -2586,7 +2559,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2586 * vma in this mm is backed by the same anon_vma or address_space. 2559 * vma in this mm is backed by the same anon_vma or address_space.
2587 * 2560 *
2588 * We can take all the locks in random order because the VM code 2561 * We can take all the locks in random order because the VM code
2589 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never 2562 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
2590 * takes more than one of them in a row. Secondly we're protected 2563 * takes more than one of them in a row. Secondly we're protected
2591 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2564 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2592 * 2565 *
@@ -2642,7 +2615,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2642 * 2615 *
2643 * No need of atomic instructions here, head.next 2616 * No need of atomic instructions here, head.next
2644 * can't change from under us until we release the 2617 * can't change from under us until we release the
2645 * anon_vma->root->lock. 2618 * anon_vma->root->mutex.
2646 */ 2619 */
2647 if (!__test_and_clear_bit(0, (unsigned long *) 2620 if (!__test_and_clear_bit(0, (unsigned long *)
2648 &anon_vma->root->head.next)) 2621 &anon_vma->root->head.next))
@@ -2658,7 +2631,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
2658 * AS_MM_ALL_LOCKS can't change to 0 from under us 2631 * AS_MM_ALL_LOCKS can't change to 0 from under us
2659 * because we hold the mm_all_locks_mutex. 2632 * because we hold the mm_all_locks_mutex.
2660 */ 2633 */
2661 spin_unlock(&mapping->i_mmap_lock); 2634 mutex_unlock(&mapping->i_mmap_mutex);
2662 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2635 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2663 &mapping->flags)) 2636 &mapping->flags))
2664 BUG(); 2637 BUG();
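The remaining mm/mmap.c hunks above track the i_mmap_lock to i_mmap_mutex and anon_vma->lock to anon_vma->root->mutex conversions inside mm_take_all_locks() and mm_drop_all_locks(). The lockdep annotation keeps the same shape: each per-object lock is taken nested under mmap_sem through the *_nest_lock() helper. A reduced sketch of that nesting pattern (helper names invented; the head.next and AS_MM_ALL_LOCKS bookkeeping from the real code is left out):

/* Sketch of the nesting pattern used above: each object mutex is taken
 * as a nested lock under mmap_sem so lockdep treats the batch as one. */
static void lock_one_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
}

static void lock_one_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
}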
diff --git a/mm/mremap.c b/mm/mremap.c
index a7c1f9f9b941..506fa44403df 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -93,8 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
93 * and we propagate stale pages into the dst afterward. 93 * and we propagate stale pages into the dst afterward.
94 */ 94 */
95 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
96 spin_lock(&mapping->i_mmap_lock); 96 mutex_lock(&mapping->i_mmap_mutex);
97 new_vma->vm_truncate_count = 0;
98 } 97 }
99 98
100 /* 99 /*
@@ -123,7 +122,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
123 pte_unmap(new_pte - 1); 122 pte_unmap(new_pte - 1);
124 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
125 if (mapping) 124 if (mapping)
126 spin_unlock(&mapping->i_mmap_lock); 125 mutex_unlock(&mapping->i_mmap_mutex);
127 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); 126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
128} 127}
129 128
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 9109049f0bbc..6e93dc7f2586 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -307,30 +307,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
308 unsigned long align, unsigned long goal) 308 unsigned long align, unsigned long goal)
309{ 309{
310#ifdef MAX_DMA32_PFN
311 unsigned long end_pfn;
312
313 if (WARN_ON_ONCE(slab_is_available()))
314 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
315
316 /* update goal according ...MAX_DMA32_PFN */
317 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
318
319 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
320 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
321 void *ptr;
322 unsigned long new_goal;
323
324 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
325 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
326 new_goal, -1ULL);
327 if (ptr)
328 return ptr;
329 }
330#endif
331
332 return __alloc_bootmem_node(pgdat, size, align, goal); 310 return __alloc_bootmem_node(pgdat, size, align, goal);
333
334} 311}
335 312
336#ifdef CONFIG_SPARSEMEM 313#ifdef CONFIG_SPARSEMEM
diff --git a/mm/nommu.c b/mm/nommu.c
index c4c542c736a9..1fd0c51b10a6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -680,9 +680,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
680 */ 680 */
681static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 681static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
682{ 682{
683 struct vm_area_struct *pvma, **pp, *next; 683 struct vm_area_struct *pvma, *prev;
684 struct address_space *mapping; 684 struct address_space *mapping;
685 struct rb_node **p, *parent; 685 struct rb_node **p, *parent, *rb_prev;
686 686
687 kenter(",%p", vma); 687 kenter(",%p", vma);
688 688
@@ -703,7 +703,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
703 } 703 }
704 704
705 /* add the VMA to the tree */ 705 /* add the VMA to the tree */
706 parent = NULL; 706 parent = rb_prev = NULL;
707 p = &mm->mm_rb.rb_node; 707 p = &mm->mm_rb.rb_node;
708 while (*p) { 708 while (*p) {
709 parent = *p; 709 parent = *p;
@@ -713,17 +713,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
713 * (the latter is necessary as we may get identical VMAs) */ 713 * (the latter is necessary as we may get identical VMAs) */
714 if (vma->vm_start < pvma->vm_start) 714 if (vma->vm_start < pvma->vm_start)
715 p = &(*p)->rb_left; 715 p = &(*p)->rb_left;
716 else if (vma->vm_start > pvma->vm_start) 716 else if (vma->vm_start > pvma->vm_start) {
717 rb_prev = parent;
717 p = &(*p)->rb_right; 718 p = &(*p)->rb_right;
718 else if (vma->vm_end < pvma->vm_end) 719 } else if (vma->vm_end < pvma->vm_end)
719 p = &(*p)->rb_left; 720 p = &(*p)->rb_left;
720 else if (vma->vm_end > pvma->vm_end) 721 else if (vma->vm_end > pvma->vm_end) {
722 rb_prev = parent;
721 p = &(*p)->rb_right; 723 p = &(*p)->rb_right;
722 else if (vma < pvma) 724 } else if (vma < pvma)
723 p = &(*p)->rb_left; 725 p = &(*p)->rb_left;
724 else if (vma > pvma) 726 else if (vma > pvma) {
727 rb_prev = parent;
725 p = &(*p)->rb_right; 728 p = &(*p)->rb_right;
726 else 729 } else
727 BUG(); 730 BUG();
728 } 731 }
729 732
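The add_vma_to_mm() hunk above makes the rbtree descent remember the last node it stepped right from (rb_prev), which is exactly the in-order predecessor of the insertion point; the next hunk then uses it to link the new VMA into mm->mmap via __vma_link_list() instead of rescanning the whole list. A standalone sketch of the "remember the predecessor while descending" idea (plain C, not kernel code; node and key names are invented):

#include <stddef.h>

struct node {
        long key;
        struct node *left, *right;
};

/* Descend a binary search tree looking for where `key` would be
 * inserted, and record the last node we stepped right from: that node
 * is the in-order predecessor of the insertion point, so the caller can
 * splice a new element into a sorted list without a second walk. */
static struct node *find_slot(struct node *root, long key, struct node **prev)
{
        struct node *cur = root;

        *prev = NULL;
        while (cur) {
                if (key < cur->key) {
                        cur = cur->left;
                } else {
                        *prev = cur;     /* everything here sorts before key */
                        cur = cur->right;
                }
        }
        return *prev;                    /* predecessor, or NULL if key sorts first */
}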
@@ -731,20 +734,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
731 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 734 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
732 735
733 /* add VMA to the VMA list also */ 736 /* add VMA to the VMA list also */
734 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { 737 prev = NULL;
735 if (pvma->vm_start > vma->vm_start) 738 if (rb_prev)
736 break; 739 prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
737 if (pvma->vm_start < vma->vm_start)
738 continue;
739 if (pvma->vm_end < vma->vm_end)
740 break;
741 }
742 740
743 next = *pp; 741 __vma_link_list(mm, vma, prev, parent);
744 *pp = vma;
745 vma->vm_next = next;
746 if (next)
747 next->vm_prev = vma;
748} 742}
749 743
750/* 744/*
@@ -752,7 +746,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
752 */ 746 */
753static void delete_vma_from_mm(struct vm_area_struct *vma) 747static void delete_vma_from_mm(struct vm_area_struct *vma)
754{ 748{
755 struct vm_area_struct **pp;
756 struct address_space *mapping; 749 struct address_space *mapping;
757 struct mm_struct *mm = vma->vm_mm; 750 struct mm_struct *mm = vma->vm_mm;
758 751
@@ -775,12 +768,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
775 768
776 /* remove from the MM's tree and list */ 769 /* remove from the MM's tree and list */
777 rb_erase(&vma->vm_rb, &mm->mm_rb); 770 rb_erase(&vma->vm_rb, &mm->mm_rb);
778 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { 771
779 if (*pp == vma) { 772 if (vma->vm_prev)
780 *pp = vma->vm_next; 773 vma->vm_prev->vm_next = vma->vm_next;
781 break; 774 else
782 } 775 mm->mmap = vma->vm_next;
783 } 776
777 if (vma->vm_next)
778 vma->vm_next->vm_prev = vma->vm_prev;
784 779
785 vma->vm_mm = NULL; 780 vma->vm_mm = NULL;
786} 781}
@@ -809,17 +804,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
809struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 804struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
810{ 805{
811 struct vm_area_struct *vma; 806 struct vm_area_struct *vma;
812 struct rb_node *n = mm->mm_rb.rb_node;
813 807
814 /* check the cache first */ 808 /* check the cache first */
815 vma = mm->mmap_cache; 809 vma = mm->mmap_cache;
816 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 810 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
817 return vma; 811 return vma;
818 812
819 /* trawl the tree (there may be multiple mappings in which addr 813 /* trawl the list (there may be multiple mappings in which addr
820 * resides) */ 814 * resides) */
821 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 815 for (vma = mm->mmap; vma; vma = vma->vm_next) {
822 vma = rb_entry(n, struct vm_area_struct, vm_rb);
823 if (vma->vm_start > addr) 816 if (vma->vm_start > addr)
824 return NULL; 817 return NULL;
825 if (vma->vm_end > addr) { 818 if (vma->vm_end > addr) {
@@ -859,7 +852,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
859 unsigned long len) 852 unsigned long len)
860{ 853{
861 struct vm_area_struct *vma; 854 struct vm_area_struct *vma;
862 struct rb_node *n = mm->mm_rb.rb_node;
863 unsigned long end = addr + len; 855 unsigned long end = addr + len;
864 856
865 /* check the cache first */ 857 /* check the cache first */
@@ -867,10 +859,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
867 if (vma && vma->vm_start == addr && vma->vm_end == end) 859 if (vma && vma->vm_start == addr && vma->vm_end == end)
868 return vma; 860 return vma;
869 861
870 /* trawl the tree (there may be multiple mappings in which addr 862 /* trawl the list (there may be multiple mappings in which addr
871 * resides) */ 863 * resides) */
872 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 864 for (vma = mm->mmap; vma; vma = vma->vm_next) {
873 vma = rb_entry(n, struct vm_area_struct, vm_rb);
874 if (vma->vm_start < addr) 865 if (vma->vm_start < addr)
875 continue; 866 continue;
876 if (vma->vm_start > addr) 867 if (vma->vm_start > addr)
@@ -1133,7 +1124,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1133 unsigned long capabilities) 1124 unsigned long capabilities)
1134{ 1125{
1135 struct page *pages; 1126 struct page *pages;
1136 unsigned long total, point, n, rlen; 1127 unsigned long total, point, n;
1137 void *base; 1128 void *base;
1138 int ret, order; 1129 int ret, order;
1139 1130
@@ -1157,13 +1148,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
1157 * make a private copy of the data and map that instead */ 1148 * make a private copy of the data and map that instead */
1158 } 1149 }
1159 1150
1160 rlen = PAGE_ALIGN(len);
1161 1151
1162 /* allocate some memory to hold the mapping 1152 /* allocate some memory to hold the mapping
1163 * - note that this may not return a page-aligned address if the object 1153 * - note that this may not return a page-aligned address if the object
1164 * we're allocating is smaller than a page 1154 * we're allocating is smaller than a page
1165 */ 1155 */
1166 order = get_order(rlen); 1156 order = get_order(len);
1167 kdebug("alloc order %d for %lx", order, len); 1157 kdebug("alloc order %d for %lx", order, len);
1168 1158
1169 pages = alloc_pages(GFP_KERNEL, order); 1159 pages = alloc_pages(GFP_KERNEL, order);
@@ -1173,7 +1163,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1173 total = 1 << order; 1163 total = 1 << order;
1174 atomic_long_add(total, &mmap_pages_allocated); 1164 atomic_long_add(total, &mmap_pages_allocated);
1175 1165
1176 point = rlen >> PAGE_SHIFT; 1166 point = len >> PAGE_SHIFT;
1177 1167
1178 /* we allocated a power-of-2 sized page set, so we may want to trim off 1168 /* we allocated a power-of-2 sized page set, so we may want to trim off
1179 * the excess */ 1169 * the excess */
@@ -1195,7 +1185,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1195 base = page_address(pages); 1185 base = page_address(pages);
1196 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1186 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1197 region->vm_start = (unsigned long) base; 1187 region->vm_start = (unsigned long) base;
1198 region->vm_end = region->vm_start + rlen; 1188 region->vm_end = region->vm_start + len;
1199 region->vm_top = region->vm_start + (total << PAGE_SHIFT); 1189 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1200 1190
1201 vma->vm_start = region->vm_start; 1191 vma->vm_start = region->vm_start;
@@ -1211,22 +1201,22 @@ static int do_mmap_private(struct vm_area_struct *vma,
1211 1201
1212 old_fs = get_fs(); 1202 old_fs = get_fs();
1213 set_fs(KERNEL_DS); 1203 set_fs(KERNEL_DS);
1214 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); 1204 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
1215 set_fs(old_fs); 1205 set_fs(old_fs);
1216 1206
1217 if (ret < 0) 1207 if (ret < 0)
1218 goto error_free; 1208 goto error_free;
1219 1209
1220 /* clear the last little bit */ 1210 /* clear the last little bit */
1221 if (ret < rlen) 1211 if (ret < len)
1222 memset(base + ret, 0, rlen - ret); 1212 memset(base + ret, 0, len - ret);
1223 1213
1224 } 1214 }
1225 1215
1226 return 0; 1216 return 0;
1227 1217
1228error_free: 1218error_free:
1229 free_page_series(region->vm_start, region->vm_end); 1219 free_page_series(region->vm_start, region->vm_top);
1230 region->vm_start = vma->vm_start = 0; 1220 region->vm_start = vma->vm_start = 0;
1231 region->vm_end = vma->vm_end = 0; 1221 region->vm_end = vma->vm_end = 0;
1232 region->vm_top = 0; 1222 region->vm_top = 0;
@@ -1235,7 +1225,7 @@ error_free:
1235enomem: 1225enomem:
1236 printk("Allocation of length %lu from process %d (%s) failed\n", 1226 printk("Allocation of length %lu from process %d (%s) failed\n",
1237 len, current->pid, current->comm); 1227 len, current->pid, current->comm);
1238 show_free_areas(); 1228 show_free_areas(0);
1239 return -ENOMEM; 1229 return -ENOMEM;
1240} 1230}
1241 1231
@@ -1268,6 +1258,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1268 1258
1269 /* we ignore the address hint */ 1259 /* we ignore the address hint */
1270 addr = 0; 1260 addr = 0;
1261 len = PAGE_ALIGN(len);
1271 1262
1272 /* we've determined that we can make the mapping, now translate what we 1263 /* we've determined that we can make the mapping, now translate what we
1273 * now know into VMA flags */ 1264 * now know into VMA flags */
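With the nommu changes above, the mapping length is rounded up once with PAGE_ALIGN() in do_mmap_pgoff() (and, further below, in do_munmap() and do_mremap()) instead of each helper keeping its own rlen copy. PAGE_ALIGN() simply rounds up to the next page boundary; a standalone arithmetic sketch assuming 4 KiB pages (the real macro comes from the kernel headers):

#include <stdio.h>

/* Standalone illustration of the usual round-up-to-page-size macro. */
#define PAGE_SIZE      4096UL
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        printf("%lu\n", PAGE_ALIGN(1UL));     /* 4096 */
        printf("%lu\n", PAGE_ALIGN(4096UL));  /* 4096 */
        printf("%lu\n", PAGE_ALIGN(4097UL));  /* 8192 */
        return 0;
}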
@@ -1385,15 +1376,15 @@ unsigned long do_mmap_pgoff(struct file *file,
1385 if (capabilities & BDI_CAP_MAP_DIRECT) { 1376 if (capabilities & BDI_CAP_MAP_DIRECT) {
1386 addr = file->f_op->get_unmapped_area(file, addr, len, 1377 addr = file->f_op->get_unmapped_area(file, addr, len,
1387 pgoff, flags); 1378 pgoff, flags);
1388 if (IS_ERR((void *) addr)) { 1379 if (IS_ERR_VALUE(addr)) {
1389 ret = addr; 1380 ret = addr;
1390 if (ret != (unsigned long) -ENOSYS) 1381 if (ret != -ENOSYS)
1391 goto error_just_free; 1382 goto error_just_free;
1392 1383
1393 /* the driver refused to tell us where to site 1384 /* the driver refused to tell us where to site
1394 * the mapping so we'll have to attempt to copy 1385 * the mapping so we'll have to attempt to copy
1395 * it */ 1386 * it */
1396 ret = (unsigned long) -ENODEV; 1387 ret = -ENODEV;
1397 if (!(capabilities & BDI_CAP_MAP_COPY)) 1388 if (!(capabilities & BDI_CAP_MAP_COPY))
1398 goto error_just_free; 1389 goto error_just_free;
1399 1390
@@ -1468,14 +1459,14 @@ error_getting_vma:
1468 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1459 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1469 " from process %d failed\n", 1460 " from process %d failed\n",
1470 len, current->pid); 1461 len, current->pid);
1471 show_free_areas(); 1462 show_free_areas(0);
1472 return -ENOMEM; 1463 return -ENOMEM;
1473 1464
1474error_getting_region: 1465error_getting_region:
1475 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1466 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1476 " from process %d failed\n", 1467 " from process %d failed\n",
1477 len, current->pid); 1468 len, current->pid);
1478 show_free_areas(); 1469 show_free_areas(0);
1479 return -ENOMEM; 1470 return -ENOMEM;
1480} 1471}
1481EXPORT_SYMBOL(do_mmap_pgoff); 1472EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1644,15 +1635,17 @@ static int shrink_vma(struct mm_struct *mm,
1644int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1635int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1645{ 1636{
1646 struct vm_area_struct *vma; 1637 struct vm_area_struct *vma;
1647 struct rb_node *rb; 1638 unsigned long end;
1648 unsigned long end = start + len;
1649 int ret; 1639 int ret;
1650 1640
1651 kenter(",%lx,%zx", start, len); 1641 kenter(",%lx,%zx", start, len);
1652 1642
1643 len = PAGE_ALIGN(len);
1653 if (len == 0) 1644 if (len == 0)
1654 return -EINVAL; 1645 return -EINVAL;
1655 1646
1647 end = start + len;
1648
1656 /* find the first potentially overlapping VMA */ 1649 /* find the first potentially overlapping VMA */
1657 vma = find_vma(mm, start); 1650 vma = find_vma(mm, start);
1658 if (!vma) { 1651 if (!vma) {
@@ -1677,9 +1670,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1677 } 1670 }
1678 if (end == vma->vm_end) 1671 if (end == vma->vm_end)
1679 goto erase_whole_vma; 1672 goto erase_whole_vma;
1680 rb = rb_next(&vma->vm_rb); 1673 vma = vma->vm_next;
1681 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1674 } while (vma);
1682 } while (rb);
1683 kleave(" = -EINVAL [split file]"); 1675 kleave(" = -EINVAL [split file]");
1684 return -EINVAL; 1676 return -EINVAL;
1685 } else { 1677 } else {
@@ -1773,6 +1765,8 @@ unsigned long do_mremap(unsigned long addr,
1773 struct vm_area_struct *vma; 1765 struct vm_area_struct *vma;
1774 1766
1775 /* insanity checks first */ 1767 /* insanity checks first */
1768 old_len = PAGE_ALIGN(old_len);
1769 new_len = PAGE_ALIGN(new_len);
1776 if (old_len == 0 || new_len == 0) 1770 if (old_len == 0 || new_len == 0)
1777 return (unsigned long) -EINVAL; 1771 return (unsigned long) -EINVAL;
1778 1772
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f52e85c80e8d..e4b0991ca351 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,6 +38,33 @@ int sysctl_oom_kill_allocating_task;
38int sysctl_oom_dump_tasks = 1; 38int sysctl_oom_dump_tasks = 1;
39static DEFINE_SPINLOCK(zone_scan_lock); 39static DEFINE_SPINLOCK(zone_scan_lock);
40 40
41/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value
44 *
45 * Sets the oom_score_adj value for current to @new_val with proper
46 * synchronization and returns the old value. Usually used to temporarily
47 * set a value, save the old value in the caller, and then reinstate it later.
48 */
49int test_set_oom_score_adj(int new_val)
50{
51 struct sighand_struct *sighand = current->sighand;
52 int old_val;
53
54 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) {
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock);
64
65 return old_val;
66}
67
41#ifdef CONFIG_NUMA 68#ifdef CONFIG_NUMA
42/** 69/**
43 * has_intersects_mems_allowed() - check task eligibility for kill 70
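The kernel-doc for test_set_oom_score_adj() above already spells out the intended pattern: temporarily pin a value, keep the old one, and restore it afterwards. A minimal caller sketch following that description; the surrounding function is hypothetical:

/* Hypothetical caller: make current immune to the OOM killer for a
 * critical section, then put the previous oom_score_adj back. */
static void do_something_unkillable(void)
{
        int old_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MIN);

        /* ... work that must not be OOM-killed ... */

        test_set_oom_score_adj(old_adj);   /* reinstate the saved value */
}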
@@ -155,15 +182,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
155 } 182 }
156 183
157 /* 184 /*
158 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
159 * priority for oom killing.
160 */
161 if (p->flags & PF_OOM_ORIGIN) {
162 task_unlock(p);
163 return 1000;
164 }
165
166 /*
167 * The memory controller may have a limit of 0 bytes, so avoid a divide 185 * The memory controller may have a limit of 0 bytes, so avoid a divide
168 * by zero, if necessary. 186 * by zero, if necessary.
169 */ 187 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3f8bce264df6..a4e1db3f1981 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -30,6 +30,7 @@
30#include <linux/pagevec.h> 30#include <linux/pagevec.h>
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/ratelimit.h>
33#include <linux/oom.h> 34#include <linux/oom.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
35#include <linux/topology.h> 36#include <linux/topology.h>
@@ -39,6 +40,7 @@
39#include <linux/memory_hotplug.h> 40#include <linux/memory_hotplug.h>
40#include <linux/nodemask.h> 41#include <linux/nodemask.h>
41#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
42#include <linux/mempolicy.h> 44#include <linux/mempolicy.h>
43#include <linux/stop_machine.h> 45#include <linux/stop_machine.h>
44#include <linux/sort.h> 46#include <linux/sort.h>
@@ -54,6 +56,7 @@
54#include <trace/events/kmem.h> 56#include <trace/events/kmem.h>
55#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
56#include <linux/memcontrol.h> 58#include <linux/memcontrol.h>
59#include <linux/prefetch.h>
57 60
58#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
59#include <asm/div64.h> 62#include <asm/div64.h>
@@ -1734,6 +1737,45 @@ static inline bool should_suppress_show_mem(void)
1734 return ret; 1737 return ret;
1735} 1738}
1736 1739
1740static DEFINE_RATELIMIT_STATE(nopage_rs,
1741 DEFAULT_RATELIMIT_INTERVAL,
1742 DEFAULT_RATELIMIT_BURST);
1743
1744void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745{
1746 va_list args;
1747 unsigned int filter = SHOW_MEM_FILTER_NODES;
1748
1749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750 return;
1751
1752 /*
1753 * This documents exceptions given to allocations in certain
1754 * contexts that are allowed to allocate outside current's set
1755 * of allowed nodes.
1756 */
1757 if (!(gfp_mask & __GFP_NOMEMALLOC))
1758 if (test_thread_flag(TIF_MEMDIE) ||
1759 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760 filter &= ~SHOW_MEM_FILTER_NODES;
1761 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762 filter &= ~SHOW_MEM_FILTER_NODES;
1763
1764 if (fmt) {
1765 printk(KERN_WARNING);
1766 va_start(args, fmt);
1767 vprintk(fmt, args);
1768 va_end(args);
1769 }
1770
1771 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772 current->comm, order, gfp_mask);
1773
1774 dump_stack();
1775 if (!should_suppress_show_mem())
1776 show_mem(filter);
1777}
1778
1737static inline int 1779static inline int
1738should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1780should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1739 unsigned long pages_reclaimed) 1781 unsigned long pages_reclaimed)
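The new warn_alloc_failed() above centralizes the rate-limited page-allocation-failure report that the nopage: path used to open-code (a later hunk reduces that path to a single call). The optional format string lets a caller prepend its own context line. A hedged sketch of possible call sites; the second line's driver name, message and size variable are illustrative, not taken from this patch:

warn_alloc_failed(gfp_mask, order, NULL);                        /* what the nopage: path does below */
warn_alloc_failed(gfp_mask, 0, "mydrv: %zu bytes failed\n", sz); /* hypothetical caller with context */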
@@ -2064,6 +2106,7 @@ restart:
2064 first_zones_zonelist(zonelist, high_zoneidx, NULL, 2106 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2065 &preferred_zone); 2107 &preferred_zone);
2066 2108
2109rebalance:
2067 /* This is the last chance, in general, before the goto nopage. */ 2110 /* This is the last chance, in general, before the goto nopage. */
2068 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2111 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2069 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2112 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2071,7 +2114,6 @@ restart:
2071 if (page) 2114 if (page)
2072 goto got_pg; 2115 goto got_pg;
2073 2116
2074rebalance:
2075 /* Allocate without watermarks if the context allows */ 2117 /* Allocate without watermarks if the context allows */
2076 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2118 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2077 page = __alloc_pages_high_priority(gfp_mask, order, 2119 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2105,7 +2147,7 @@ rebalance:
2105 sync_migration); 2147 sync_migration);
2106 if (page) 2148 if (page)
2107 goto got_pg; 2149 goto got_pg;
2108 sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); 2150 sync_migration = true;
2109 2151
2110 /* Try direct reclaim and then allocating */ 2152 /* Try direct reclaim and then allocating */
2111 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2153 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2176,27 +2218,7 @@ rebalance:
2176 } 2218 }
2177 2219
2178nopage: 2220nopage:
2179 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2221 warn_alloc_failed(gfp_mask, order, NULL);
2180 unsigned int filter = SHOW_MEM_FILTER_NODES;
2181
2182 /*
2183 * This documents exceptions given to allocations in certain
2184 * contexts that are allowed to allocate outside current's set
2185 * of allowed nodes.
2186 */
2187 if (!(gfp_mask & __GFP_NOMEMALLOC))
2188 if (test_thread_flag(TIF_MEMDIE) ||
2189 (current->flags & (PF_MEMALLOC | PF_EXITING)))
2190 filter &= ~SHOW_MEM_FILTER_NODES;
2191 if (in_interrupt() || !wait)
2192 filter &= ~SHOW_MEM_FILTER_NODES;
2193
2194 pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
2195 current->comm, order, gfp_mask);
2196 dump_stack();
2197 if (!should_suppress_show_mem())
2198 show_mem(filter);
2199 }
2200 return page; 2222 return page;
2201got_pg: 2223got_pg:
2202 if (kmemcheck_enabled) 2224 if (kmemcheck_enabled)
@@ -2225,6 +2247,10 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2225 2247
2226 if (should_fail_alloc_page(gfp_mask, order)) 2248 if (should_fail_alloc_page(gfp_mask, order))
2227 return NULL; 2249 return NULL;
2250#ifndef CONFIG_ZONE_DMA
2251 if (WARN_ON_ONCE(gfp_mask & __GFP_DMA))
2252 return NULL;
2253#endif
2228 2254
2229 /* 2255 /*
2230 * Check the zones suitable for the gfp_mask contain at least one 2256 * Check the zones suitable for the gfp_mask contain at least one
@@ -2472,10 +2498,10 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2472#endif 2498#endif
2473 2499
2474/* 2500/*
2475 * Determine whether the zone's node should be displayed or not, depending on 2501 * Determine whether the node should be displayed or not, depending on whether
2476 * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). 2502 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2477 */ 2503 */
2478static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) 2504bool skip_free_areas_node(unsigned int flags, int nid)
2479{ 2505{
2480 bool ret = false; 2506 bool ret = false;
2481 2507
@@ -2483,8 +2509,7 @@ static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
2483 goto out; 2509 goto out;
2484 2510
2485 get_mems_allowed(); 2511 get_mems_allowed();
2486 ret = !node_isset(zone->zone_pgdat->node_id, 2512 ret = !node_isset(nid, cpuset_current_mems_allowed);
2487 cpuset_current_mems_allowed);
2488 put_mems_allowed(); 2513 put_mems_allowed();
2489out: 2514out:
2490 return ret; 2515 return ret;
@@ -2499,13 +2524,13 @@ out:
2499 * Suppresses nodes that are not allowed by current's cpuset if 2524 * Suppresses nodes that are not allowed by current's cpuset if
2500 * SHOW_MEM_FILTER_NODES is passed. 2525 * SHOW_MEM_FILTER_NODES is passed.
2501 */ 2526 */
2502void __show_free_areas(unsigned int filter) 2527void show_free_areas(unsigned int filter)
2503{ 2528{
2504 int cpu; 2529 int cpu;
2505 struct zone *zone; 2530 struct zone *zone;
2506 2531
2507 for_each_populated_zone(zone) { 2532 for_each_populated_zone(zone) {
2508 if (skip_free_areas_zone(filter, zone)) 2533 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2509 continue; 2534 continue;
2510 show_node(zone); 2535 show_node(zone);
2511 printk("%s per-cpu:\n", zone->name); 2536 printk("%s per-cpu:\n", zone->name);
@@ -2548,7 +2573,7 @@ void __show_free_areas(unsigned int filter)
2548 for_each_populated_zone(zone) { 2573 for_each_populated_zone(zone) {
2549 int i; 2574 int i;
2550 2575
2551 if (skip_free_areas_zone(filter, zone)) 2576 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2552 continue; 2577 continue;
2553 show_node(zone); 2578 show_node(zone);
2554 printk("%s" 2579 printk("%s"
@@ -2617,7 +2642,7 @@ void __show_free_areas(unsigned int filter)
2617 for_each_populated_zone(zone) { 2642 for_each_populated_zone(zone) {
2618 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2643 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2619 2644
2620 if (skip_free_areas_zone(filter, zone)) 2645 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2621 continue; 2646 continue;
2622 show_node(zone); 2647 show_node(zone);
2623 printk("%s: ", zone->name); 2648 printk("%s: ", zone->name);
@@ -2638,11 +2663,6 @@ void __show_free_areas(unsigned int filter)
2638 show_swap_cache_info(); 2663 show_swap_cache_info();
2639} 2664}
2640 2665
2641void show_free_areas(void)
2642{
2643 __show_free_areas(0);
2644}
2645
2646static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 2666static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
2647{ 2667{
2648 zoneref->zone = zone; 2668 zoneref->zone = zone;
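With __show_free_areas() folded into show_free_areas(), every caller now passes a filter word explicitly: 0 dumps every populated node (as the nommu error paths above now do), while SHOW_MEM_FILTER_NODES suppresses nodes that current's cpuset disallows. A two-line usage sketch:

show_free_areas(0);                      /* unconditional dump of every populated node */
show_free_areas(SHOW_MEM_FILTER_NODES);  /* skip nodes outside current's cpuset */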
@@ -3313,6 +3333,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
3313#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3333#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3314 3334
3315/* 3335/*
3336 * Check if a pageblock contains reserved pages
3337 */
3338static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3339{
3340 unsigned long pfn;
3341
3342 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3343 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3344 return 1;
3345 }
3346 return 0;
3347}
3348
3349/*
3316 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3350 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3317 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3351 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3318 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3352 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3321,7 +3355,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
3321 */ 3355 */
3322static void setup_zone_migrate_reserve(struct zone *zone) 3356static void setup_zone_migrate_reserve(struct zone *zone)
3323{ 3357{
3324 unsigned long start_pfn, pfn, end_pfn; 3358 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3325 struct page *page; 3359 struct page *page;
3326 unsigned long block_migratetype; 3360 unsigned long block_migratetype;
3327 int reserve; 3361 int reserve;
@@ -3351,7 +3385,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3351 continue; 3385 continue;
3352 3386
3353 /* Blocks with reserved pages will never free, skip them. */ 3387 /* Blocks with reserved pages will never free, skip them. */
3354 if (PageReserved(page)) 3388 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3389 if (pageblock_is_reserved(pfn, block_end_pfn))
3355 continue; 3390 continue;
3356 3391
3357 block_migratetype = get_pageblock_migratetype(page); 3392 block_migratetype = get_pageblock_migratetype(page);
@@ -3540,7 +3575,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3540 pcp->batch = PAGE_SHIFT * 8; 3575 pcp->batch = PAGE_SHIFT * 8;
3541} 3576}
3542 3577
3543static __meminit void setup_zone_pageset(struct zone *zone) 3578static void setup_zone_pageset(struct zone *zone)
3544{ 3579{
3545 int cpu; 3580 int cpu;
3546 3581
@@ -4288,10 +4323,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4288 zone->zone_pgdat = pgdat; 4323 zone->zone_pgdat = pgdat;
4289 4324
4290 zone_pcp_init(zone); 4325 zone_pcp_init(zone);
4291 for_each_lru(l) { 4326 for_each_lru(l)
4292 INIT_LIST_HEAD(&zone->lru[l].list); 4327 INIT_LIST_HEAD(&zone->lru[l].list);
4293 zone->reclaim_stat.nr_saved_scan[l] = 0;
4294 }
4295 zone->reclaim_stat.recent_rotated[0] = 0; 4328 zone->reclaim_stat.recent_rotated[0] = 0;
4296 zone->reclaim_stat.recent_rotated[1] = 0; 4329 zone->reclaim_stat.recent_rotated[1] = 0;
4297 zone->reclaim_stat.recent_scanned[0] = 0; 4330 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -5099,7 +5132,7 @@ void setup_per_zone_wmarks(void)
5099 * 1TB 101 10GB 5132 * 1TB 101 10GB
5100 * 10TB 320 32GB 5133 * 10TB 320 32GB
5101 */ 5134 */
5102void calculate_zone_inactive_ratio(struct zone *zone) 5135static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5103{ 5136{
5104 unsigned int gb, ratio; 5137 unsigned int gb, ratio;
5105 5138
@@ -5113,7 +5146,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
5113 zone->inactive_ratio = ratio; 5146 zone->inactive_ratio = ratio;
5114} 5147}
5115 5148
5116static void __init setup_per_zone_inactive_ratio(void) 5149static void __meminit setup_per_zone_inactive_ratio(void)
5117{ 5150{
5118 struct zone *zone; 5151 struct zone *zone;
5119 5152
@@ -5145,7 +5178,7 @@ static void __init setup_per_zone_inactive_ratio(void)
5145 * 8192MB: 11584k 5178 * 8192MB: 11584k
5146 * 16384MB: 16384k 5179 * 16384MB: 16384k
5147 */ 5180 */
5148static int __init init_per_zone_wmark_min(void) 5181int __meminit init_per_zone_wmark_min(void)
5149{ 5182{
5150 unsigned long lowmem_kbytes; 5183 unsigned long lowmem_kbytes;
5151 5184
@@ -5157,6 +5190,7 @@ static int __init init_per_zone_wmark_min(void)
5157 if (min_free_kbytes > 65536) 5190 if (min_free_kbytes > 65536)
5158 min_free_kbytes = 65536; 5191 min_free_kbytes = 65536;
5159 setup_per_zone_wmarks(); 5192 setup_per_zone_wmarks();
5193 refresh_zone_stat_thresholds();
5160 setup_per_zone_lowmem_reserve(); 5194 setup_per_zone_lowmem_reserve();
5161 setup_per_zone_inactive_ratio(); 5195 setup_per_zone_inactive_ratio();
5162 return 0; 5196 return 0;
@@ -5507,10 +5541,8 @@ int set_migratetype_isolate(struct page *page)
5507 struct memory_isolate_notify arg; 5541 struct memory_isolate_notify arg;
5508 int notifier_ret; 5542 int notifier_ret;
5509 int ret = -EBUSY; 5543 int ret = -EBUSY;
5510 int zone_idx;
5511 5544
5512 zone = page_zone(page); 5545 zone = page_zone(page);
5513 zone_idx = zone_idx(zone);
5514 5546
5515 spin_lock_irqsave(&zone->lock, flags); 5547 spin_lock_irqsave(&zone->lock, flags);
5516 5548
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 2daadc322ba6..74ccff61d1be 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -130,7 +130,7 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
130 return page; 130 return page;
131} 131}
132 132
133static void *__init_refok alloc_page_cgroup(size_t size, int nid) 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 134{
135 void *addr = NULL; 135 void *addr = NULL;
136 136
@@ -162,7 +162,7 @@ static void free_page_cgroup(void *addr)
162} 162}
163#endif 163#endif
164 164
165static int __init_refok init_section_page_cgroup(unsigned long pfn) 165static int __meminit init_section_page_cgroup(unsigned long pfn)
166{ 166{
167 struct page_cgroup *base, *pc; 167 struct page_cgroup *base, *pc;
168 struct mem_section *section; 168 struct mem_section *section;
@@ -475,7 +475,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
475 if (!do_swap_account) 475 if (!do_swap_account)
476 return 0; 476 return 0;
477 477
478 length = ((max_pages/SC_PER_PAGE) + 1); 478 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
479 array_size = length * sizeof(void *); 479 array_size = length * sizeof(void *);
480 480
481 array = vmalloc(array_size); 481 array = vmalloc(array_size);
@@ -492,8 +492,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
492 /* memory shortage */ 492 /* memory shortage */
493 ctrl->map = NULL; 493 ctrl->map = NULL;
494 ctrl->length = 0; 494 ctrl->length = 0;
495 vfree(array);
496 mutex_unlock(&swap_cgroup_mutex); 495 mutex_unlock(&swap_cgroup_mutex);
496 vfree(array);
497 goto nomem; 497 goto nomem;
498 } 498 }
499 mutex_unlock(&swap_cgroup_mutex); 499 mutex_unlock(&swap_cgroup_mutex);
@@ -508,7 +508,8 @@ nomem:
508 508
509void swap_cgroup_swapoff(int type) 509void swap_cgroup_swapoff(int type)
510{ 510{
511 int i; 511 struct page **map;
512 unsigned long i, length;
512 struct swap_cgroup_ctrl *ctrl; 513 struct swap_cgroup_ctrl *ctrl;
513 514
514 if (!do_swap_account) 515 if (!do_swap_account)
@@ -516,17 +517,20 @@ void swap_cgroup_swapoff(int type)
516 517
517 mutex_lock(&swap_cgroup_mutex); 518 mutex_lock(&swap_cgroup_mutex);
518 ctrl = &swap_cgroup_ctrl[type]; 519 ctrl = &swap_cgroup_ctrl[type];
519 if (ctrl->map) { 520 map = ctrl->map;
520 for (i = 0; i < ctrl->length; i++) { 521 length = ctrl->length;
521 struct page *page = ctrl->map[i]; 522 ctrl->map = NULL;
523 ctrl->length = 0;
524 mutex_unlock(&swap_cgroup_mutex);
525
526 if (map) {
527 for (i = 0; i < length; i++) {
528 struct page *page = map[i];
522 if (page) 529 if (page)
523 __free_page(page); 530 __free_page(page);
524 } 531 }
525 vfree(ctrl->map); 532 vfree(map);
526 ctrl->map = NULL;
527 ctrl->length = 0;
528 } 533 }
529 mutex_unlock(&swap_cgroup_mutex);
530} 534}
531 535
532#endif 536#endif
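The swap_cgroup_swapoff() hunk above switches to the common "detach under the lock, free after dropping it" shape, so vfree() and __free_page() no longer run with swap_cgroup_mutex held. A reduced, standalone sketch of that shape (plain C with pthreads; the types and names are invented for illustration):

#include <pthread.h>
#include <stdlib.h>

struct ctrl {
        pthread_mutex_t lock;
        void **map;
        unsigned long length;
};

/* Snapshot the pointers under the lock, clear the structure, then do the
 * potentially slow freeing with the lock already dropped. */
static void teardown(struct ctrl *c)
{
        void **map;
        unsigned long i, length;

        pthread_mutex_lock(&c->lock);
        map = c->map;
        length = c->length;
        c->map = NULL;
        c->length = 0;
        pthread_mutex_unlock(&c->lock);

        if (map) {
                for (i = 0; i < length; i++)
                        free(map[i]);
                free(map);
        }
}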
diff --git a/mm/percpu.c b/mm/percpu.c
index a160db39b810..bf80e55dbed7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1215,8 +1215,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1215 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1215 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1216#ifdef CONFIG_SMP 1216#ifdef CONFIG_SMP
1217 PCPU_SETUP_BUG_ON(!ai->static_size); 1217 PCPU_SETUP_BUG_ON(!ai->static_size);
1218 PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
1218#endif 1219#endif
1219 PCPU_SETUP_BUG_ON(!base_addr); 1220 PCPU_SETUP_BUG_ON(!base_addr);
1221 PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
1220 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1222 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1221 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1223 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1222 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1224 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
@@ -1645,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1645 /* warn if maximum distance is further than 75% of vmalloc space */ 1647 /* warn if maximum distance is further than 75% of vmalloc space */
1646 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1648 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1647 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1649 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1648 "space 0x%lx\n", 1650 "space 0x%lx\n", max_distance,
1649 max_distance, VMALLOC_END - VMALLOC_START); 1651 (unsigned long)(VMALLOC_END - VMALLOC_START));
1650#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1652#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1651 /* and fail if we have fallback */ 1653 /* and fail if we have fallback */
1652 rc = -EINVAL; 1654 rc = -EINVAL;
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index 603ae98d9694..799dcfd7cd8c 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/prio_tree.h> 15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
16 17
17/* 18/*
18 * See lib/prio_tree.c for details on the general radix priority search tree 19 * See lib/prio_tree.c for details on the general radix priority search tree
diff --git a/mm/readahead.c b/mm/readahead.c
index 2c0cc489e288..867f9dd82dcd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -180,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
180 if (page) 180 if (page)
181 continue; 181 continue;
182 182
183 page = page_cache_alloc_cold(mapping); 183 page = page_cache_alloc_readahead(mapping);
184 if (!page) 184 if (!page)
185 break; 185 break;
186 page->index = page_offset; 186 page->index = page_offset;
diff --git a/mm/rmap.c b/mm/rmap.c
index 8da044a1db0f..3a39b518a653 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,8 +24,8 @@
24 * inode->i_alloc_sem (vmtruncate_range) 24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 25 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 26 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_lock 27 * mapping->i_mmap_mutex
28 * anon_vma->lock 28 * anon_vma->mutex
29 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
@@ -40,7 +40,7 @@
40 * 40 *
41 * (code doesn't rely on that order so it could be switched around) 41 * (code doesn't rely on that order so it could be switched around)
42 * ->tasklist_lock 42 * ->tasklist_lock
43 * anon_vma->lock (memory_failure, collect_procs_anon) 43 * anon_vma->mutex (memory_failure, collect_procs_anon)
44 * pte map lock 44 * pte map lock
45 */ 45 */
46 46
@@ -86,6 +86,29 @@ static inline struct anon_vma *anon_vma_alloc(void)
86static inline void anon_vma_free(struct anon_vma *anon_vma) 86static inline void anon_vma_free(struct anon_vma *anon_vma)
87{ 87{
88 VM_BUG_ON(atomic_read(&anon_vma->refcount)); 88 VM_BUG_ON(atomic_read(&anon_vma->refcount));
89
90 /*
91 * Synchronize against page_lock_anon_vma() such that
92 * we can safely hold the lock without the anon_vma getting
93 * freed.
94 *
95 * Relies on the full mb implied by the atomic_dec_and_test() from
96 * put_anon_vma() against the acquire barrier implied by
97 * mutex_trylock() from page_lock_anon_vma(). This orders:
98 *
99 * page_lock_anon_vma() VS put_anon_vma()
100 * mutex_trylock() atomic_dec_and_test()
101 * LOCK MB
102 * atomic_read() mutex_is_locked()
103 *
104 * LOCK should suffice since the actual taking of the lock must
105 * happen _before_ what follows.
106 */
107 if (mutex_is_locked(&anon_vma->root->mutex)) {
108 anon_vma_lock(anon_vma);
109 anon_vma_unlock(anon_vma);
110 }
111
89 kmem_cache_free(anon_vma_cachep, anon_vma); 112 kmem_cache_free(anon_vma_cachep, anon_vma);
90} 113}
91 114
@@ -307,7 +330,7 @@ static void anon_vma_ctor(void *data)
307{ 330{
308 struct anon_vma *anon_vma = data; 331 struct anon_vma *anon_vma = data;
309 332
310 spin_lock_init(&anon_vma->lock); 333 mutex_init(&anon_vma->mutex);
311 atomic_set(&anon_vma->refcount, 0); 334 atomic_set(&anon_vma->refcount, 0);
312 INIT_LIST_HEAD(&anon_vma->head); 335 INIT_LIST_HEAD(&anon_vma->head);
313} 336}
@@ -320,12 +343,26 @@ void __init anon_vma_init(void)
320} 343}
321 344
322/* 345/*
323 * Getting a lock on a stable anon_vma from a page off the LRU is 346 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
324 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 347 *
348 * Since there is no serialization whatsoever against page_remove_rmap()
349 * the best this function can do is return a locked anon_vma that might
350 * have been relevant to this page.
351 *
352 * The page might have been remapped to a different anon_vma or the anon_vma
353 * returned may already be freed (and even reused).
354 *
355 * All users of this function must be very careful when walking the anon_vma
356 * chain and verify that the page in question is indeed mapped in it
357 * [ something equivalent to page_mapped_in_vma() ].
358 *
359 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
360 * that the anon_vma pointer from page->mapping is valid if there is a
361 * mapcount, we can dereference the anon_vma after observing those.
325 */ 362 */
326struct anon_vma *__page_lock_anon_vma(struct page *page) 363struct anon_vma *page_get_anon_vma(struct page *page)
327{ 364{
328 struct anon_vma *anon_vma, *root_anon_vma; 365 struct anon_vma *anon_vma = NULL;
329 unsigned long anon_mapping; 366 unsigned long anon_mapping;
330 367
331 rcu_read_lock(); 368 rcu_read_lock();
@@ -336,32 +373,97 @@ struct anon_vma *__page_lock_anon_vma(struct page *page)
336 goto out; 373 goto out;
337 374
338 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 375 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
339 root_anon_vma = ACCESS_ONCE(anon_vma->root); 376 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
340 spin_lock(&root_anon_vma->lock); 377 anon_vma = NULL;
378 goto out;
379 }
341 380
342 /* 381 /*
343 * If this page is still mapped, then its anon_vma cannot have been 382 * If this page is still mapped, then its anon_vma cannot have been
344 * freed. But if it has been unmapped, we have no security against 383 * freed. But if it has been unmapped, we have no security against the
345 * the anon_vma structure being freed and reused (for another anon_vma: 384 * anon_vma structure being freed and reused (for another anon_vma:
346 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot 385 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
347 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting 386 * above cannot corrupt).
348 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
349 */ 387 */
350 if (page_mapped(page)) 388 if (!page_mapped(page)) {
351 return anon_vma; 389 put_anon_vma(anon_vma);
390 anon_vma = NULL;
391 }
392out:
393 rcu_read_unlock();
394
395 return anon_vma;
396}
397
398/*
399 * Similar to page_get_anon_vma() except it locks the anon_vma.
400 *
401 * It's a little more complex as it tries to keep the fast path to a single
402 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
403 * reference like with page_get_anon_vma() and then block on the mutex.
404 */
405struct anon_vma *page_lock_anon_vma(struct page *page)
406{
407 struct anon_vma *anon_vma = NULL;
408 unsigned long anon_mapping;
409
410 rcu_read_lock();
411 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
412 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
413 goto out;
414 if (!page_mapped(page))
415 goto out;
416
417 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
418 if (mutex_trylock(&anon_vma->root->mutex)) {
419 /*
420 * If we observe a !0 refcount, then holding the lock ensures
421 * the anon_vma will not go away, see __put_anon_vma().
422 */
423 if (!atomic_read(&anon_vma->refcount)) {
424 anon_vma_unlock(anon_vma);
425 anon_vma = NULL;
426 }
427 goto out;
428 }
429
430 /* trylock failed, we got to sleep */
431 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
432 anon_vma = NULL;
433 goto out;
434 }
435
436 if (!page_mapped(page)) {
437 put_anon_vma(anon_vma);
438 anon_vma = NULL;
439 goto out;
440 }
441
442 /* we pinned the anon_vma, its safe to sleep */
443 rcu_read_unlock();
444 anon_vma_lock(anon_vma);
445
446 if (atomic_dec_and_test(&anon_vma->refcount)) {
447 /*
448 * Oops, we held the last refcount, release the lock
449 * and bail -- can't simply use put_anon_vma() because
450 * we'll deadlock on the anon_vma_lock() recursion.
451 */
452 anon_vma_unlock(anon_vma);
453 __put_anon_vma(anon_vma);
454 anon_vma = NULL;
455 }
456
457 return anon_vma;
352 458
353 spin_unlock(&root_anon_vma->lock);
354out: 459out:
355 rcu_read_unlock(); 460 rcu_read_unlock();
356 return NULL; 461 return anon_vma;
357} 462}
358 463
359void page_unlock_anon_vma(struct anon_vma *anon_vma) 464void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 __releases(&anon_vma->root->lock)
361 __releases(RCU)
362{ 465{
363 anon_vma_unlock(anon_vma); 466 anon_vma_unlock(anon_vma);
364 rcu_read_unlock();
365} 467}
366 468
367/* 469/*
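The rmap hunks above replace the RCU-based __page_lock_anon_vma() with page_get_anon_vma() plus a page_lock_anon_vma() that returns holding only the anon_vma mutex (or NULL), so page_unlock_anon_vma() no longer drops rcu_read_lock(). The caller pairing is unchanged; a minimal sketch of it, modelled on the anon rmap walkers (the function name is invented and the per-VMA walk body is elided):

/* Sketch of the lock/unlock pairing as used by the anon rmap walkers. */
static int walk_anon_mappings(struct page *page)
{
        struct anon_vma *anon_vma;

        anon_vma = page_lock_anon_vma(page);    /* may return NULL */
        if (!anon_vma)
                return 0;

        /* ... walk the anon_vma's VMA list, re-checking that the page is
         *     actually mapped in each VMA before touching it ... */

        page_unlock_anon_vma(anon_vma);
        return 1;
}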
@@ -646,14 +748,14 @@ static int page_referenced_file(struct page *page,
646 * The page lock not only makes sure that page->mapping cannot 748 * The page lock not only makes sure that page->mapping cannot
647 * suddenly be NULLified by truncation, it makes sure that the 749 * suddenly be NULLified by truncation, it makes sure that the
648 * structure at mapping cannot be freed and reused yet, 750 * structure at mapping cannot be freed and reused yet,
649 * so we can safely take mapping->i_mmap_lock. 751 * so we can safely take mapping->i_mmap_mutex.
650 */ 752 */
651 BUG_ON(!PageLocked(page)); 753 BUG_ON(!PageLocked(page));
652 754
653 spin_lock(&mapping->i_mmap_lock); 755 mutex_lock(&mapping->i_mmap_mutex);
654 756
655 /* 757 /*
656 * i_mmap_lock does not stabilize mapcount at all, but mapcount 758 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
657 * is more likely to be accurate if we note it after spinning. 759 * is more likely to be accurate if we note it after spinning.
658 */ 760 */
659 mapcount = page_mapcount(page); 761 mapcount = page_mapcount(page);
@@ -675,7 +777,7 @@ static int page_referenced_file(struct page *page,
675 break; 777 break;
676 } 778 }
677 779
678 spin_unlock(&mapping->i_mmap_lock); 780 mutex_unlock(&mapping->i_mmap_mutex);
679 return referenced; 781 return referenced;
680} 782}
681 783
@@ -719,7 +821,7 @@ int page_referenced(struct page *page,
719 unlock_page(page); 821 unlock_page(page);
720 } 822 }
721out: 823out:
722 if (page_test_and_clear_young(page)) 824 if (page_test_and_clear_young(page_to_pfn(page)))
723 referenced++; 825 referenced++;
724 826
725 return referenced; 827 return referenced;
@@ -762,7 +864,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
762 864
763 BUG_ON(PageAnon(page)); 865 BUG_ON(PageAnon(page));
764 866
765 spin_lock(&mapping->i_mmap_lock); 867 mutex_lock(&mapping->i_mmap_mutex);
766 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 868 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
767 if (vma->vm_flags & VM_SHARED) { 869 if (vma->vm_flags & VM_SHARED) {
768 unsigned long address = vma_address(page, vma); 870 unsigned long address = vma_address(page, vma);
@@ -771,7 +873,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
771 ret += page_mkclean_one(page, vma, address); 873 ret += page_mkclean_one(page, vma, address);
772 } 874 }
773 } 875 }
774 spin_unlock(&mapping->i_mmap_lock); 876 mutex_unlock(&mapping->i_mmap_mutex);
775 return ret; 877 return ret;
776} 878}
777 879
@@ -785,10 +887,8 @@ int page_mkclean(struct page *page)
785 struct address_space *mapping = page_mapping(page); 887 struct address_space *mapping = page_mapping(page);
786 if (mapping) { 888 if (mapping) {
787 ret = page_mkclean_file(mapping, page); 889 ret = page_mkclean_file(mapping, page);
788 if (page_test_dirty(page)) { 890 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
789 page_clear_dirty(page, 1);
790 ret = 1; 891 ret = 1;
791 }
792 } 892 }
793 } 893 }
794 894
@@ -981,10 +1081,9 @@ void page_remove_rmap(struct page *page)
981 * not if it's in swapcache - there might be another pte slot 1081 * not if it's in swapcache - there might be another pte slot
982 * containing the swap entry, but page not yet written to swap. 1082 * containing the swap entry, but page not yet written to swap.
983 */ 1083 */
984 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 1084 if ((!PageAnon(page) || PageSwapCache(page)) &&
985 page_clear_dirty(page, 1); 1085 page_test_and_clear_dirty(page_to_pfn(page), 1))
986 set_page_dirty(page); 1086 set_page_dirty(page);
987 }
988 /* 1087 /*
989 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1088 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
990 * and not charged by memcg for now. 1089 * and not charged by memcg for now.
@@ -1122,7 +1221,7 @@ out_mlock:
1122 /* 1221 /*
1123 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1222 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1124 * unstable result and race. Plus, We can't wait here because 1223 * unstable result and race. Plus, We can't wait here because
1125 * we now hold anon_vma->lock or mapping->i_mmap_lock. 1224 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
1126 * if trylock failed, the page remain in evictable lru and later 1225 * if trylock failed, the page remain in evictable lru and later
1127 * vmscan could retry to move the page to unevictable lru if the 1226 * vmscan could retry to move the page to unevictable lru if the
1128 * page is actually mlocked. 1227 * page is actually mlocked.
@@ -1348,7 +1447,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1348 unsigned long max_nl_size = 0; 1447 unsigned long max_nl_size = 0;
1349 unsigned int mapcount; 1448 unsigned int mapcount;
1350 1449
1351 spin_lock(&mapping->i_mmap_lock); 1450 mutex_lock(&mapping->i_mmap_mutex);
1352 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1451 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1353 unsigned long address = vma_address(page, vma); 1452 unsigned long address = vma_address(page, vma);
1354 if (address == -EFAULT) 1453 if (address == -EFAULT)
@@ -1394,7 +1493,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1394 mapcount = page_mapcount(page); 1493 mapcount = page_mapcount(page);
1395 if (!mapcount) 1494 if (!mapcount)
1396 goto out; 1495 goto out;
1397 cond_resched_lock(&mapping->i_mmap_lock); 1496 cond_resched();
1398 1497
1399 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1498 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1400 if (max_nl_cursor == 0) 1499 if (max_nl_cursor == 0)
@@ -1416,7 +1515,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1416 } 1515 }
1417 vma->vm_private_data = (void *) max_nl_cursor; 1516 vma->vm_private_data = (void *) max_nl_cursor;
1418 } 1517 }
1419 cond_resched_lock(&mapping->i_mmap_lock); 1518 cond_resched();
1420 max_nl_cursor += CLUSTER_SIZE; 1519 max_nl_cursor += CLUSTER_SIZE;
1421 } while (max_nl_cursor <= max_nl_size); 1520 } while (max_nl_cursor <= max_nl_size);
1422 1521
@@ -1428,7 +1527,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1428 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1527 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1429 vma->vm_private_data = NULL; 1528 vma->vm_private_data = NULL;
1430out: 1529out:
1431 spin_unlock(&mapping->i_mmap_lock); 1530 mutex_unlock(&mapping->i_mmap_mutex);
1432 return ret; 1531 return ret;
1433} 1532}
1434 1533
@@ -1547,7 +1646,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1547 1646
1548 if (!mapping) 1647 if (!mapping)
1549 return ret; 1648 return ret;
1550 spin_lock(&mapping->i_mmap_lock); 1649 mutex_lock(&mapping->i_mmap_mutex);
1551 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1650 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1552 unsigned long address = vma_address(page, vma); 1651 unsigned long address = vma_address(page, vma);
1553 if (address == -EFAULT) 1652 if (address == -EFAULT)
@@ -1561,7 +1660,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1561 * never contain migration ptes. Decide what to do about this 1660 * never contain migration ptes. Decide what to do about this
1562 * limitation to linear when we need rmap_walk() on nonlinear. 1661 * limitation to linear when we need rmap_walk() on nonlinear.
1563 */ 1662 */
1564 spin_unlock(&mapping->i_mmap_lock); 1663 mutex_unlock(&mapping->i_mmap_mutex);
1565 return ret; 1664 return ret;
1566} 1665}
1567 1666
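
The rmap hunks above are part of converting mapping->i_mmap_lock from a spinlock into i_mmap_mutex, which is why the cond_resched_lock() calls in try_to_unmap_file() become plain cond_resched(): the lock is now sleepable, so walkers no longer have to drop and retake it to yield. Below is a minimal sketch of the resulting walk pattern, using only types and helpers that appear in this diff except for walk_one_vma(), which is a hypothetical stand-in for whatever per-VMA work a caller does; it is an illustration, not kernel code.

    /* Sketch only: the i_mmap_mutex walk pattern after this series. */
    static int walk_file_vmas(struct address_space *mapping, struct page *page)
    {
        struct vm_area_struct *vma;
        struct prio_tree_iter iter;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        int ret = 0;

        mutex_lock(&mapping->i_mmap_mutex);     /* may sleep; no longer a spinlock */
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
            ret += walk_one_vma(page, vma);     /* hypothetical per-VMA work */
            cond_resched();                     /* fine under a mutex; was cond_resched_lock() */
        }
        mutex_unlock(&mapping->i_mmap_mutex);
        return ret;
    }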
diff --git a/mm/shmem.c b/mm/shmem.c
index dfc7069102ee..1acfb2687bfa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -99,6 +99,13 @@ static struct vfsmount *shm_mnt;
99/* Pretend that each entry is of this size in directory's i_size */ 99/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 100#define BOGO_DIRENT_SIZE 20
101 101
102struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */
105 size_t size;
106 char value[0];
107};
108
102/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
103enum sgp_type { 110enum sgp_type {
104 SGP_READ, /* don't exceed i_size, don't allocate page */ 111 SGP_READ, /* don't exceed i_size, don't allocate page */
@@ -822,6 +829,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
822static void shmem_evict_inode(struct inode *inode) 829static void shmem_evict_inode(struct inode *inode)
823{ 830{
824 struct shmem_inode_info *info = SHMEM_I(inode); 831 struct shmem_inode_info *info = SHMEM_I(inode);
832 struct shmem_xattr *xattr, *nxattr;
825 833
826 if (inode->i_mapping->a_ops == &shmem_aops) { 834 if (inode->i_mapping->a_ops == &shmem_aops) {
827 truncate_inode_pages(inode->i_mapping, 0); 835 truncate_inode_pages(inode->i_mapping, 0);
@@ -834,6 +842,11 @@ static void shmem_evict_inode(struct inode *inode)
834 mutex_unlock(&shmem_swaplist_mutex); 842 mutex_unlock(&shmem_swaplist_mutex);
835 } 843 }
836 } 844 }
845
846 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
847 kfree(xattr->name);
848 kfree(xattr);
849 }
837 BUG_ON(inode->i_blocks); 850 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 851 shmem_free_inode(inode->i_sb);
839 end_writeback(inode); 852 end_writeback(inode);
@@ -916,11 +929,12 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
916 if (size > ENTRIES_PER_PAGE) 929 if (size > ENTRIES_PER_PAGE)
917 size = ENTRIES_PER_PAGE; 930 size = ENTRIES_PER_PAGE;
918 offset = shmem_find_swp(entry, ptr, ptr+size); 931 offset = shmem_find_swp(entry, ptr, ptr+size);
932 shmem_swp_unmap(ptr);
919 if (offset >= 0) { 933 if (offset >= 0) {
920 shmem_dir_unmap(dir); 934 shmem_dir_unmap(dir);
935 ptr = shmem_swp_map(subdir);
921 goto found; 936 goto found;
922 } 937 }
923 shmem_swp_unmap(ptr);
924 } 938 }
925 } 939 }
926lost1: 940lost1:
@@ -1291,12 +1305,10 @@ repeat:
1291 swappage = lookup_swap_cache(swap); 1305 swappage = lookup_swap_cache(swap);
1292 if (!swappage) { 1306 if (!swappage) {
1293 shmem_swp_unmap(entry); 1307 shmem_swp_unmap(entry);
1308 spin_unlock(&info->lock);
1294 /* here we actually do the io */ 1309 /* here we actually do the io */
1295 if (type && !(*type & VM_FAULT_MAJOR)) { 1310 if (type)
1296 __count_vm_event(PGMAJFAULT);
1297 *type |= VM_FAULT_MAJOR; 1311 *type |= VM_FAULT_MAJOR;
1298 }
1299 spin_unlock(&info->lock);
1300 swappage = shmem_swapin(swap, gfp, info, idx); 1312 swappage = shmem_swapin(swap, gfp, info, idx);
1301 if (!swappage) { 1313 if (!swappage) {
1302 spin_lock(&info->lock); 1314 spin_lock(&info->lock);
@@ -1535,7 +1547,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1535 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1547 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1536 if (error) 1548 if (error)
1537 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1549 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1538 1550 if (ret & VM_FAULT_MAJOR) {
1551 count_vm_event(PGMAJFAULT);
1552 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1553 }
1539 return ret | VM_FAULT_LOCKED; 1554 return ret | VM_FAULT_LOCKED;
1540} 1555}
1541 1556
@@ -1614,6 +1629,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1614 spin_lock_init(&info->lock); 1629 spin_lock_init(&info->lock);
1615 info->flags = flags & VM_NORESERVE; 1630 info->flags = flags & VM_NORESERVE;
1616 INIT_LIST_HEAD(&info->swaplist); 1631 INIT_LIST_HEAD(&info->swaplist);
1632 INIT_LIST_HEAD(&info->xattr_list);
1617 cache_no_acl(inode); 1633 cache_no_acl(inode);
1618 1634
1619 switch (mode & S_IFMT) { 1635 switch (mode & S_IFMT) {
@@ -2013,9 +2029,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2013 2029
2014 info = SHMEM_I(inode); 2030 info = SHMEM_I(inode);
2015 inode->i_size = len-1; 2031 inode->i_size = len-1;
2016 if (len <= (char *)inode - (char *)info) { 2032 if (len <= SHMEM_SYMLINK_INLINE_LEN) {
2017 /* do it inline */ 2033 /* do it inline */
2018 memcpy(info, symname, len); 2034 memcpy(info->inline_symlink, symname, len);
2019 inode->i_op = &shmem_symlink_inline_operations; 2035 inode->i_op = &shmem_symlink_inline_operations;
2020 } else { 2036 } else {
2021 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2037 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -2041,7 +2057,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2041 2057
2042static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 2058static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
2043{ 2059{
2044 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); 2060 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
2045 return NULL; 2061 return NULL;
2046} 2062}
2047 2063
@@ -2065,63 +2081,253 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2065 } 2081 }
2066} 2082}
2067 2083
2068static const struct inode_operations shmem_symlink_inline_operations = { 2084#ifdef CONFIG_TMPFS_XATTR
2069 .readlink = generic_readlink,
2070 .follow_link = shmem_follow_link_inline,
2071};
2072
2073static const struct inode_operations shmem_symlink_inode_operations = {
2074 .readlink = generic_readlink,
2075 .follow_link = shmem_follow_link,
2076 .put_link = shmem_put_link,
2077};
2078
2079#ifdef CONFIG_TMPFS_POSIX_ACL
2080/* 2085/*
2081 * Superblocks without xattr inode operations will get security.* xattr 2086 * Superblocks without xattr inode operations may get some security.* xattr
2082 * support from the VFS "for free". As soon as we have any other xattrs 2087 * support from the LSM "for free". As soon as we have any other xattrs
2083 * like ACLs, we also need to implement the security.* handlers at 2088 * like ACLs, we also need to implement the security.* handlers at
2084 * filesystem level, though. 2089 * filesystem level, though.
2085 */ 2090 */
2086 2091
2087static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, 2092static int shmem_xattr_get(struct dentry *dentry, const char *name,
2088 size_t list_len, const char *name, 2093 void *buffer, size_t size)
2089 size_t name_len, int handler_flags)
2090{ 2094{
2091 return security_inode_listsecurity(dentry->d_inode, list, list_len); 2095 struct shmem_inode_info *info;
2092} 2096 struct shmem_xattr *xattr;
2097 int ret = -ENODATA;
2093 2098
2094static int shmem_xattr_security_get(struct dentry *dentry, const char *name, 2099 info = SHMEM_I(dentry->d_inode);
2095 void *buffer, size_t size, int handler_flags) 2100
2096{ 2101 spin_lock(&info->lock);
2097 if (strcmp(name, "") == 0) 2102 list_for_each_entry(xattr, &info->xattr_list, list) {
2098 return -EINVAL; 2103 if (strcmp(name, xattr->name))
2099 return xattr_getsecurity(dentry->d_inode, name, buffer, size); 2104 continue;
2105
2106 ret = xattr->size;
2107 if (buffer) {
2108 if (size < xattr->size)
2109 ret = -ERANGE;
2110 else
2111 memcpy(buffer, xattr->value, xattr->size);
2112 }
2113 break;
2114 }
2115 spin_unlock(&info->lock);
2116 return ret;
2100} 2117}
2101 2118
2102static int shmem_xattr_security_set(struct dentry *dentry, const char *name, 2119static int shmem_xattr_set(struct dentry *dentry, const char *name,
2103 const void *value, size_t size, int flags, int handler_flags) 2120 const void *value, size_t size, int flags)
2104{ 2121{
2105 if (strcmp(name, "") == 0) 2122 struct inode *inode = dentry->d_inode;
2106 return -EINVAL; 2123 struct shmem_inode_info *info = SHMEM_I(inode);
2107 return security_inode_setsecurity(dentry->d_inode, name, value, 2124 struct shmem_xattr *xattr;
2108 size, flags); 2125 struct shmem_xattr *new_xattr = NULL;
2126 size_t len;
2127 int err = 0;
2128
2129 /* value == NULL means remove */
2130 if (value) {
2131 /* wrap around? */
2132 len = sizeof(*new_xattr) + size;
2133 if (len <= sizeof(*new_xattr))
2134 return -ENOMEM;
2135
2136 new_xattr = kmalloc(len, GFP_KERNEL);
2137 if (!new_xattr)
2138 return -ENOMEM;
2139
2140 new_xattr->name = kstrdup(name, GFP_KERNEL);
2141 if (!new_xattr->name) {
2142 kfree(new_xattr);
2143 return -ENOMEM;
2144 }
2145
2146 new_xattr->size = size;
2147 memcpy(new_xattr->value, value, size);
2148 }
2149
2150 spin_lock(&info->lock);
2151 list_for_each_entry(xattr, &info->xattr_list, list) {
2152 if (!strcmp(name, xattr->name)) {
2153 if (flags & XATTR_CREATE) {
2154 xattr = new_xattr;
2155 err = -EEXIST;
2156 } else if (new_xattr) {
2157 list_replace(&xattr->list, &new_xattr->list);
2158 } else {
2159 list_del(&xattr->list);
2160 }
2161 goto out;
2162 }
2163 }
2164 if (flags & XATTR_REPLACE) {
2165 xattr = new_xattr;
2166 err = -ENODATA;
2167 } else {
2168 list_add(&new_xattr->list, &info->xattr_list);
2169 xattr = NULL;
2170 }
2171out:
2172 spin_unlock(&info->lock);
2173 if (xattr)
2174 kfree(xattr->name);
2175 kfree(xattr);
2176 return err;
2109} 2177}
2110 2178
2111static const struct xattr_handler shmem_xattr_security_handler = {
2112 .prefix = XATTR_SECURITY_PREFIX,
2113 .list = shmem_xattr_security_list,
2114 .get = shmem_xattr_security_get,
2115 .set = shmem_xattr_security_set,
2116};
2117 2179
2118static const struct xattr_handler *shmem_xattr_handlers[] = { 2180static const struct xattr_handler *shmem_xattr_handlers[] = {
2181#ifdef CONFIG_TMPFS_POSIX_ACL
2119 &generic_acl_access_handler, 2182 &generic_acl_access_handler,
2120 &generic_acl_default_handler, 2183 &generic_acl_default_handler,
2121 &shmem_xattr_security_handler, 2184#endif
2122 NULL 2185 NULL
2123}; 2186};
2187
2188static int shmem_xattr_validate(const char *name)
2189{
2190 struct { const char *prefix; size_t len; } arr[] = {
2191 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2192 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2193 };
2194 int i;
2195
2196 for (i = 0; i < ARRAY_SIZE(arr); i++) {
2197 size_t preflen = arr[i].len;
2198 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2199 if (!name[preflen])
2200 return -EINVAL;
2201 return 0;
2202 }
2203 }
2204 return -EOPNOTSUPP;
2205}
2206
2207static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2208 void *buffer, size_t size)
2209{
2210 int err;
2211
2212 /*
2213 * If this is a request for a synthetic attribute in the system.*
2214 * namespace use the generic infrastructure to resolve a handler
2215 * for it via sb->s_xattr.
2216 */
2217 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2218 return generic_getxattr(dentry, name, buffer, size);
2219
2220 err = shmem_xattr_validate(name);
2221 if (err)
2222 return err;
2223
2224 return shmem_xattr_get(dentry, name, buffer, size);
2225}
2226
2227static int shmem_setxattr(struct dentry *dentry, const char *name,
2228 const void *value, size_t size, int flags)
2229{
2230 int err;
2231
2232 /*
2233 * If this is a request for a synthetic attribute in the system.*
2234 * namespace use the generic infrastructure to resolve a handler
2235 * for it via sb->s_xattr.
2236 */
2237 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2238 return generic_setxattr(dentry, name, value, size, flags);
2239
2240 err = shmem_xattr_validate(name);
2241 if (err)
2242 return err;
2243
2244 if (size == 0)
2245 value = ""; /* empty EA, do not remove */
2246
2247 return shmem_xattr_set(dentry, name, value, size, flags);
2248
2249}
2250
2251static int shmem_removexattr(struct dentry *dentry, const char *name)
2252{
2253 int err;
2254
2255 /*
2256 * If this is a request for a synthetic attribute in the system.*
2257 * namespace use the generic infrastructure to resolve a handler
2258 * for it via sb->s_xattr.
2259 */
2260 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2261 return generic_removexattr(dentry, name);
2262
2263 err = shmem_xattr_validate(name);
2264 if (err)
2265 return err;
2266
2267 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
2268}
2269
2270static bool xattr_is_trusted(const char *name)
2271{
2272 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2273}
2274
2275static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2276{
2277 bool trusted = capable(CAP_SYS_ADMIN);
2278 struct shmem_xattr *xattr;
2279 struct shmem_inode_info *info;
2280 size_t used = 0;
2281
2282 info = SHMEM_I(dentry->d_inode);
2283
2284 spin_lock(&info->lock);
2285 list_for_each_entry(xattr, &info->xattr_list, list) {
2286 size_t len;
2287
2288 /* skip "trusted." attributes for unprivileged callers */
2289 if (!trusted && xattr_is_trusted(xattr->name))
2290 continue;
2291
2292 len = strlen(xattr->name) + 1;
2293 used += len;
2294 if (buffer) {
2295 if (size < used) {
2296 used = -ERANGE;
2297 break;
2298 }
2299 memcpy(buffer, xattr->name, len);
2300 buffer += len;
2301 }
2302 }
2303 spin_unlock(&info->lock);
2304
2305 return used;
2306}
2307#endif /* CONFIG_TMPFS_XATTR */
2308
2309static const struct inode_operations shmem_symlink_inline_operations = {
2310 .readlink = generic_readlink,
2311 .follow_link = shmem_follow_link_inline,
2312#ifdef CONFIG_TMPFS_XATTR
2313 .setxattr = shmem_setxattr,
2314 .getxattr = shmem_getxattr,
2315 .listxattr = shmem_listxattr,
2316 .removexattr = shmem_removexattr,
2124#endif 2317#endif
2318};
2319
2320static const struct inode_operations shmem_symlink_inode_operations = {
2321 .readlink = generic_readlink,
2322 .follow_link = shmem_follow_link,
2323 .put_link = shmem_put_link,
2324#ifdef CONFIG_TMPFS_XATTR
2325 .setxattr = shmem_setxattr,
2326 .getxattr = shmem_getxattr,
2327 .listxattr = shmem_listxattr,
2328 .removexattr = shmem_removexattr,
2329#endif
2330};
2125 2331
2126static struct dentry *shmem_get_parent(struct dentry *child) 2332static struct dentry *shmem_get_parent(struct dentry *child)
2127{ 2333{
@@ -2401,8 +2607,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2401 sb->s_magic = TMPFS_MAGIC; 2607 sb->s_magic = TMPFS_MAGIC;
2402 sb->s_op = &shmem_ops; 2608 sb->s_op = &shmem_ops;
2403 sb->s_time_gran = 1; 2609 sb->s_time_gran = 1;
2404#ifdef CONFIG_TMPFS_POSIX_ACL 2610#ifdef CONFIG_TMPFS_XATTR
2405 sb->s_xattr = shmem_xattr_handlers; 2611 sb->s_xattr = shmem_xattr_handlers;
2612#endif
2613#ifdef CONFIG_TMPFS_POSIX_ACL
2406 sb->s_flags |= MS_POSIXACL; 2614 sb->s_flags |= MS_POSIXACL;
2407#endif 2615#endif
2408 2616
@@ -2500,11 +2708,13 @@ static const struct file_operations shmem_file_operations = {
2500static const struct inode_operations shmem_inode_operations = { 2708static const struct inode_operations shmem_inode_operations = {
2501 .setattr = shmem_notify_change, 2709 .setattr = shmem_notify_change,
2502 .truncate_range = shmem_truncate_range, 2710 .truncate_range = shmem_truncate_range,
2711#ifdef CONFIG_TMPFS_XATTR
2712 .setxattr = shmem_setxattr,
2713 .getxattr = shmem_getxattr,
2714 .listxattr = shmem_listxattr,
2715 .removexattr = shmem_removexattr,
2716#endif
2503#ifdef CONFIG_TMPFS_POSIX_ACL 2717#ifdef CONFIG_TMPFS_POSIX_ACL
2504 .setxattr = generic_setxattr,
2505 .getxattr = generic_getxattr,
2506 .listxattr = generic_listxattr,
2507 .removexattr = generic_removexattr,
2508 .check_acl = generic_check_acl, 2718 .check_acl = generic_check_acl,
2509#endif 2719#endif
2510 2720
@@ -2522,23 +2732,27 @@ static const struct inode_operations shmem_dir_inode_operations = {
2522 .mknod = shmem_mknod, 2732 .mknod = shmem_mknod,
2523 .rename = shmem_rename, 2733 .rename = shmem_rename,
2524#endif 2734#endif
2735#ifdef CONFIG_TMPFS_XATTR
2736 .setxattr = shmem_setxattr,
2737 .getxattr = shmem_getxattr,
2738 .listxattr = shmem_listxattr,
2739 .removexattr = shmem_removexattr,
2740#endif
2525#ifdef CONFIG_TMPFS_POSIX_ACL 2741#ifdef CONFIG_TMPFS_POSIX_ACL
2526 .setattr = shmem_notify_change, 2742 .setattr = shmem_notify_change,
2527 .setxattr = generic_setxattr,
2528 .getxattr = generic_getxattr,
2529 .listxattr = generic_listxattr,
2530 .removexattr = generic_removexattr,
2531 .check_acl = generic_check_acl, 2743 .check_acl = generic_check_acl,
2532#endif 2744#endif
2533}; 2745};
2534 2746
2535static const struct inode_operations shmem_special_inode_operations = { 2747static const struct inode_operations shmem_special_inode_operations = {
2748#ifdef CONFIG_TMPFS_XATTR
2749 .setxattr = shmem_setxattr,
2750 .getxattr = shmem_getxattr,
2751 .listxattr = shmem_listxattr,
2752 .removexattr = shmem_removexattr,
2753#endif
2536#ifdef CONFIG_TMPFS_POSIX_ACL 2754#ifdef CONFIG_TMPFS_POSIX_ACL
2537 .setattr = shmem_notify_change, 2755 .setattr = shmem_notify_change,
2538 .setxattr = generic_setxattr,
2539 .getxattr = generic_getxattr,
2540 .listxattr = generic_listxattr,
2541 .removexattr = generic_removexattr,
2542 .check_acl = generic_check_acl, 2756 .check_acl = generic_check_acl,
2543#endif 2757#endif
2544}; 2758};
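
The tmpfs xattr support added above keeps each attribute as a kmalloc'ed shmem_xattr node on a per-inode list and answers get/set/list with a linear name scan under info->lock. The standalone C below mirrors the lookup contract of shmem_xattr_get() (return the value size, copy only when a buffer is supplied, -ERANGE when it is too small, -ENODATA when absent); the list type and names here are illustrative, not the kernel's.

    #include <stdio.h>
    #include <string.h>

    #define ENODATA_ERR (-61)   /* same errno values the kernel path returns */
    #define ERANGE_ERR  (-34)

    struct xattr {
        struct xattr *next;
        const char *name;
        const char *value;
        size_t size;
    };

    /* Same contract as shmem_xattr_get(): return the value size, copy it when a
     * buffer is given, -ERANGE if the buffer is too small, -ENODATA if missing. */
    static int xattr_get(const struct xattr *list, const char *name,
                         void *buf, size_t bufsize)
    {
        for (; list; list = list->next) {
            if (strcmp(name, list->name))
                continue;
            if (!buf)
                return (int)list->size;         /* size probe */
            if (bufsize < list->size)
                return ERANGE_ERR;
            memcpy(buf, list->value, list->size);
            return (int)list->size;
        }
        return ENODATA_ERR;
    }

    int main(void)
    {
        struct xattr b = { NULL, "trusted.tag", "blue", 4 };
        struct xattr a = { &b, "security.selinux", "label", 5 };
        char buf[16];

        printf("probe=%d\n", xattr_get(&a, "trusted.tag", NULL, 0));          /* 4 */
        printf("copy=%d\n", xattr_get(&a, "trusted.tag", buf, sizeof(buf)));  /* 4 */
        printf("missing=%d\n", xattr_get(&a, "user.none", buf, sizeof(buf))); /* -61 */
        return 0;
    }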
diff --git a/mm/slab.c b/mm/slab.c
index 46a9c163a92f..bcfa4987c8ae 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
115#include <linux/debugobjects.h> 115#include <linux/debugobjects.h>
116#include <linux/kmemcheck.h> 116#include <linux/kmemcheck.h>
117#include <linux/memory.h> 117#include <linux/memory.h>
118#include <linux/prefetch.h>
118 119
119#include <asm/cacheflush.h> 120#include <asm/cacheflush.h>
120#include <asm/tlbflush.h> 121#include <asm/tlbflush.h>
diff --git a/mm/slub.c b/mm/slub.c
index 9d2e5e46bf09..7be0223531b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -261,6 +261,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
261 return *(void **)(object + s->offset); 261 return *(void **)(object + s->offset);
262} 262}
263 263
264static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
265{
266 void *p;
267
268#ifdef CONFIG_DEBUG_PAGEALLOC
269 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
270#else
271 p = get_freepointer(s, object);
272#endif
273 return p;
274}
275
264static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 276static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
265{ 277{
266 *(void **)(object + s->offset) = fp; 278 *(void **)(object + s->offset) = fp;
@@ -271,10 +283,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
271 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 283 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
272 __p += (__s)->size) 284 __p += (__s)->size)
273 285
274/* Scan freelist */
275#define for_each_free_object(__p, __s, __free) \
276 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
277
278/* Determine object index from a given position */ 286/* Determine object index from a given position */
279static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 287static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
280{ 288{
@@ -332,6 +340,21 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
332 340
333#ifdef CONFIG_SLUB_DEBUG 341#ifdef CONFIG_SLUB_DEBUG
334/* 342/*
343 * Determine a map of object in use on a page.
344 *
345 * Slab lock or node listlock must be held to guarantee that the page does
346 * not vanish from under us.
347 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
349{
350 void *p;
351 void *addr = page_address(page);
352
353 for (p = page->freelist; p; p = get_freepointer(s, p))
354 set_bit(slab_index(p, s, addr), map);
355}
356
357/*
335 * Debug settings: 358 * Debug settings:
336 */ 359 */
337#ifdef CONFIG_SLUB_DEBUG_ON 360#ifdef CONFIG_SLUB_DEBUG_ON
@@ -1487,7 +1510,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1487 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1488 1511
1489 page = get_partial_node(get_node(s, searchnode)); 1512 page = get_partial_node(get_node(s, searchnode));
1490 if (page || node != -1) 1513 if (page || node != NUMA_NO_NODE)
1491 return page; 1514 return page;
1492 1515
1493 return get_any_partial(s, flags); 1516 return get_any_partial(s, flags);
@@ -1540,7 +1563,6 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1540 } 1563 }
1541} 1564}
1542 1565
1543#ifdef CONFIG_CMPXCHG_LOCAL
1544#ifdef CONFIG_PREEMPT 1566#ifdef CONFIG_PREEMPT
1545/* 1567/*
1546 * Calculate the next globally unique transaction for disambiguation 1568
@@ -1600,17 +1622,12 @@ static inline void note_cmpxchg_failure(const char *n,
1600 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1622 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1601} 1623}
1602 1624
1603#endif
1604
1605void init_kmem_cache_cpus(struct kmem_cache *s) 1625void init_kmem_cache_cpus(struct kmem_cache *s)
1606{ 1626{
1607#ifdef CONFIG_CMPXCHG_LOCAL
1608 int cpu; 1627 int cpu;
1609 1628
1610 for_each_possible_cpu(cpu) 1629 for_each_possible_cpu(cpu)
1611 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1630 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1612#endif
1613
1614} 1631}
1615/* 1632/*
1616 * Remove the cpu slab 1633 * Remove the cpu slab
@@ -1643,9 +1660,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1643 page->inuse--; 1660 page->inuse--;
1644 } 1661 }
1645 c->page = NULL; 1662 c->page = NULL;
1646#ifdef CONFIG_CMPXCHG_LOCAL
1647 c->tid = next_tid(c->tid); 1663 c->tid = next_tid(c->tid);
1648#endif
1649 unfreeze_slab(s, page, tail); 1664 unfreeze_slab(s, page, tail);
1650} 1665}
1651 1666
@@ -1779,8 +1794,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1779 unsigned long addr, struct kmem_cache_cpu *c) 1794 unsigned long addr, struct kmem_cache_cpu *c)
1780{ 1795{
1781 void **object; 1796 void **object;
1782 struct page *new; 1797 struct page *page;
1783#ifdef CONFIG_CMPXCHG_LOCAL
1784 unsigned long flags; 1798 unsigned long flags;
1785 1799
1786 local_irq_save(flags); 1800 local_irq_save(flags);
@@ -1792,37 +1806,34 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1792 */ 1806 */
1793 c = this_cpu_ptr(s->cpu_slab); 1807 c = this_cpu_ptr(s->cpu_slab);
1794#endif 1808#endif
1795#endif
1796 1809
1797 /* We handle __GFP_ZERO in the caller */ 1810 /* We handle __GFP_ZERO in the caller */
1798 gfpflags &= ~__GFP_ZERO; 1811 gfpflags &= ~__GFP_ZERO;
1799 1812
1800 if (!c->page) 1813 page = c->page;
1814 if (!page)
1801 goto new_slab; 1815 goto new_slab;
1802 1816
1803 slab_lock(c->page); 1817 slab_lock(page);
1804 if (unlikely(!node_match(c, node))) 1818 if (unlikely(!node_match(c, node)))
1805 goto another_slab; 1819 goto another_slab;
1806 1820
1807 stat(s, ALLOC_REFILL); 1821 stat(s, ALLOC_REFILL);
1808 1822
1809load_freelist: 1823load_freelist:
1810 object = c->page->freelist; 1824 object = page->freelist;
1811 if (unlikely(!object)) 1825 if (unlikely(!object))
1812 goto another_slab; 1826 goto another_slab;
1813 if (kmem_cache_debug(s)) 1827 if (kmem_cache_debug(s))
1814 goto debug; 1828 goto debug;
1815 1829
1816 c->freelist = get_freepointer(s, object); 1830 c->freelist = get_freepointer(s, object);
1817 c->page->inuse = c->page->objects; 1831 page->inuse = page->objects;
1818 c->page->freelist = NULL; 1832 page->freelist = NULL;
1819 c->node = page_to_nid(c->page); 1833
1820unlock_out: 1834 slab_unlock(page);
1821 slab_unlock(c->page);
1822#ifdef CONFIG_CMPXCHG_LOCAL
1823 c->tid = next_tid(c->tid); 1835 c->tid = next_tid(c->tid);
1824 local_irq_restore(flags); 1836 local_irq_restore(flags);
1825#endif
1826 stat(s, ALLOC_SLOWPATH); 1837 stat(s, ALLOC_SLOWPATH);
1827 return object; 1838 return object;
1828 1839
@@ -1830,10 +1841,11 @@ another_slab:
1830 deactivate_slab(s, c); 1841 deactivate_slab(s, c);
1831 1842
1832new_slab: 1843new_slab:
1833 new = get_partial(s, gfpflags, node); 1844 page = get_partial(s, gfpflags, node);
1834 if (new) { 1845 if (page) {
1835 c->page = new;
1836 stat(s, ALLOC_FROM_PARTIAL); 1846 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page);
1848 c->page = page;
1837 goto load_freelist; 1849 goto load_freelist;
1838 } 1850 }
1839 1851
@@ -1841,35 +1853,38 @@ new_slab:
1841 if (gfpflags & __GFP_WAIT) 1853 if (gfpflags & __GFP_WAIT)
1842 local_irq_enable(); 1854 local_irq_enable();
1843 1855
1844 new = new_slab(s, gfpflags, node); 1856 page = new_slab(s, gfpflags, node);
1845 1857
1846 if (gfpflags & __GFP_WAIT) 1858 if (gfpflags & __GFP_WAIT)
1847 local_irq_disable(); 1859 local_irq_disable();
1848 1860
1849 if (new) { 1861 if (page) {
1850 c = __this_cpu_ptr(s->cpu_slab); 1862 c = __this_cpu_ptr(s->cpu_slab);
1851 stat(s, ALLOC_SLAB); 1863 stat(s, ALLOC_SLAB);
1852 if (c->page) 1864 if (c->page)
1853 flush_slab(s, c); 1865 flush_slab(s, c);
1854 slab_lock(new); 1866
1855 __SetPageSlubFrozen(new); 1867 slab_lock(page);
1856 c->page = new; 1868 __SetPageSlubFrozen(page);
1869 c->node = page_to_nid(page);
1870 c->page = page;
1857 goto load_freelist; 1871 goto load_freelist;
1858 } 1872 }
1859 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1860 slab_out_of_memory(s, gfpflags, node); 1874 slab_out_of_memory(s, gfpflags, node);
1861#ifdef CONFIG_CMPXCHG_LOCAL
1862 local_irq_restore(flags); 1875 local_irq_restore(flags);
1863#endif
1864 return NULL; 1876 return NULL;
1865debug: 1877debug:
1866 if (!alloc_debug_processing(s, c->page, object, addr)) 1878 if (!alloc_debug_processing(s, page, object, addr))
1867 goto another_slab; 1879 goto another_slab;
1868 1880
1869 c->page->inuse++; 1881 page->inuse++;
1870 c->page->freelist = get_freepointer(s, object); 1882 page->freelist = get_freepointer(s, object);
1883 deactivate_slab(s, c);
1884 c->page = NULL;
1871 c->node = NUMA_NO_NODE; 1885 c->node = NUMA_NO_NODE;
1872 goto unlock_out; 1886 local_irq_restore(flags);
1887 return object;
1873} 1888}
1874 1889
1875/* 1890/*
@@ -1887,20 +1902,12 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1887{ 1902{
1888 void **object; 1903 void **object;
1889 struct kmem_cache_cpu *c; 1904 struct kmem_cache_cpu *c;
1890#ifdef CONFIG_CMPXCHG_LOCAL
1891 unsigned long tid; 1905 unsigned long tid;
1892#else
1893 unsigned long flags;
1894#endif
1895 1906
1896 if (slab_pre_alloc_hook(s, gfpflags)) 1907 if (slab_pre_alloc_hook(s, gfpflags))
1897 return NULL; 1908 return NULL;
1898 1909
1899#ifndef CONFIG_CMPXCHG_LOCAL
1900 local_irq_save(flags);
1901#else
1902redo: 1910redo:
1903#endif
1904 1911
1905 /* 1912 /*
1906 * Must read kmem_cache cpu data via this cpu ptr. Preemption is 1913 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -1910,7 +1917,6 @@ redo:
1910 */ 1917 */
1911 c = __this_cpu_ptr(s->cpu_slab); 1918 c = __this_cpu_ptr(s->cpu_slab);
1912 1919
1913#ifdef CONFIG_CMPXCHG_LOCAL
1914 /* 1920 /*
1915 * The transaction ids are globally unique per cpu and per operation on 1921 * The transaction ids are globally unique per cpu and per operation on
1916 * a per cpu queue. Thus they can guarantee that the cmpxchg_double 1922
@@ -1919,7 +1925,6 @@ redo:
1919 */ 1925 */
1920 tid = c->tid; 1926 tid = c->tid;
1921 barrier(); 1927 barrier();
1922#endif
1923 1928
1924 object = c->freelist; 1929 object = c->freelist;
1925 if (unlikely(!object || !node_match(c, node))) 1930 if (unlikely(!object || !node_match(c, node)))
@@ -1927,7 +1932,6 @@ redo:
1927 object = __slab_alloc(s, gfpflags, node, addr, c); 1932 object = __slab_alloc(s, gfpflags, node, addr, c);
1928 1933
1929 else { 1934 else {
1930#ifdef CONFIG_CMPXCHG_LOCAL
1931 /* 1935 /*
1932 * The cmpxchg will only match if there was no additional 1936 * The cmpxchg will only match if there was no additional
1933 * operation and if we are on the right processor. 1937 * operation and if we are on the right processor.
@@ -1943,21 +1947,14 @@ redo:
1943 if (unlikely(!irqsafe_cpu_cmpxchg_double( 1947 if (unlikely(!irqsafe_cpu_cmpxchg_double(
1944 s->cpu_slab->freelist, s->cpu_slab->tid, 1948 s->cpu_slab->freelist, s->cpu_slab->tid,
1945 object, tid, 1949 object, tid,
1946 get_freepointer(s, object), next_tid(tid)))) { 1950 get_freepointer_safe(s, object), next_tid(tid)))) {
1947 1951
1948 note_cmpxchg_failure("slab_alloc", s, tid); 1952 note_cmpxchg_failure("slab_alloc", s, tid);
1949 goto redo; 1953 goto redo;
1950 } 1954 }
1951#else
1952 c->freelist = get_freepointer(s, object);
1953#endif
1954 stat(s, ALLOC_FASTPATH); 1955 stat(s, ALLOC_FASTPATH);
1955 } 1956 }
1956 1957
1957#ifndef CONFIG_CMPXCHG_LOCAL
1958 local_irq_restore(flags);
1959#endif
1960
1961 if (unlikely(gfpflags & __GFP_ZERO) && object) 1958 if (unlikely(gfpflags & __GFP_ZERO) && object)
1962 memset(object, 0, s->objsize); 1959 memset(object, 0, s->objsize);
1963 1960
@@ -2034,18 +2031,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2034{ 2031{
2035 void *prior; 2032 void *prior;
2036 void **object = (void *)x; 2033 void **object = (void *)x;
2037#ifdef CONFIG_CMPXCHG_LOCAL
2038 unsigned long flags; 2034 unsigned long flags;
2039 2035
2040 local_irq_save(flags); 2036 local_irq_save(flags);
2041#endif
2042 slab_lock(page); 2037 slab_lock(page);
2043 stat(s, FREE_SLOWPATH); 2038 stat(s, FREE_SLOWPATH);
2044 2039
2045 if (kmem_cache_debug(s)) 2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2046 goto debug; 2041 goto out_unlock;
2047 2042
2048checks_ok:
2049 prior = page->freelist; 2043 prior = page->freelist;
2050 set_freepointer(s, object, prior); 2044 set_freepointer(s, object, prior);
2051 page->freelist = object; 2045 page->freelist = object;
@@ -2070,9 +2064,7 @@ checks_ok:
2070 2064
2071out_unlock: 2065out_unlock:
2072 slab_unlock(page); 2066 slab_unlock(page);
2073#ifdef CONFIG_CMPXCHG_LOCAL
2074 local_irq_restore(flags); 2067 local_irq_restore(flags);
2075#endif
2076 return; 2068 return;
2077 2069
2078slab_empty: 2070slab_empty:
@@ -2084,17 +2076,9 @@ slab_empty:
2084 stat(s, FREE_REMOVE_PARTIAL); 2076 stat(s, FREE_REMOVE_PARTIAL);
2085 } 2077 }
2086 slab_unlock(page); 2078 slab_unlock(page);
2087#ifdef CONFIG_CMPXCHG_LOCAL
2088 local_irq_restore(flags); 2079 local_irq_restore(flags);
2089#endif
2090 stat(s, FREE_SLAB); 2080 stat(s, FREE_SLAB);
2091 discard_slab(s, page); 2081 discard_slab(s, page);
2092 return;
2093
2094debug:
2095 if (!free_debug_processing(s, page, x, addr))
2096 goto out_unlock;
2097 goto checks_ok;
2098} 2082}
2099 2083
2100/* 2084/*
@@ -2113,20 +2097,11 @@ static __always_inline void slab_free(struct kmem_cache *s,
2113{ 2097{
2114 void **object = (void *)x; 2098 void **object = (void *)x;
2115 struct kmem_cache_cpu *c; 2099 struct kmem_cache_cpu *c;
2116#ifdef CONFIG_CMPXCHG_LOCAL
2117 unsigned long tid; 2100 unsigned long tid;
2118#else
2119 unsigned long flags;
2120#endif
2121 2101
2122 slab_free_hook(s, x); 2102 slab_free_hook(s, x);
2123 2103
2124#ifndef CONFIG_CMPXCHG_LOCAL
2125 local_irq_save(flags);
2126
2127#else
2128redo: 2104redo:
2129#endif
2130 2105
2131 /* 2106 /*
2132 * Determine the current cpu's per cpu slab. 2107
@@ -2136,15 +2111,12 @@ redo:
2136 */ 2111 */
2137 c = __this_cpu_ptr(s->cpu_slab); 2112 c = __this_cpu_ptr(s->cpu_slab);
2138 2113
2139#ifdef CONFIG_CMPXCHG_LOCAL
2140 tid = c->tid; 2114 tid = c->tid;
2141 barrier(); 2115 barrier();
2142#endif
2143 2116
2144 if (likely(page == c->page && c->node != NUMA_NO_NODE)) { 2117 if (likely(page == c->page)) {
2145 set_freepointer(s, object, c->freelist); 2118 set_freepointer(s, object, c->freelist);
2146 2119
2147#ifdef CONFIG_CMPXCHG_LOCAL
2148 if (unlikely(!irqsafe_cpu_cmpxchg_double( 2120 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2149 s->cpu_slab->freelist, s->cpu_slab->tid, 2121 s->cpu_slab->freelist, s->cpu_slab->tid,
2150 c->freelist, tid, 2122 c->freelist, tid,
@@ -2153,16 +2125,10 @@ redo:
2153 note_cmpxchg_failure("slab_free", s, tid); 2125 note_cmpxchg_failure("slab_free", s, tid);
2154 goto redo; 2126 goto redo;
2155 } 2127 }
2156#else
2157 c->freelist = object;
2158#endif
2159 stat(s, FREE_FASTPATH); 2128 stat(s, FREE_FASTPATH);
2160 } else 2129 } else
2161 __slab_free(s, page, x, addr); 2130 __slab_free(s, page, x, addr);
2162 2131
2163#ifndef CONFIG_CMPXCHG_LOCAL
2164 local_irq_restore(flags);
2165#endif
2166} 2132}
2167 2133
2168void kmem_cache_free(struct kmem_cache *s, void *x) 2134void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2673,9 +2639,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2673 return; 2639 return;
2674 slab_err(s, page, "%s", text); 2640 slab_err(s, page, "%s", text);
2675 slab_lock(page); 2641 slab_lock(page);
2676 for_each_free_object(p, s, page->freelist)
2677 set_bit(slab_index(p, s, addr), map);
2678 2642
2643 get_map(s, page, map);
2679 for_each_object(p, s, addr, page->objects) { 2644 for_each_object(p, s, addr, page->objects) {
2680 2645
2681 if (!test_bit(slab_index(p, s, addr), map)) { 2646 if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3203,7 +3168,7 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3203 list_for_each_entry(p, &n->partial, lru) 3168 list_for_each_entry(p, &n->partial, lru)
3204 p->slab = s; 3169 p->slab = s;
3205 3170
3206#ifdef CONFIG_SLAB_DEBUG 3171#ifdef CONFIG_SLUB_DEBUG
3207 list_for_each_entry(p, &n->full, lru) 3172 list_for_each_entry(p, &n->full, lru)
3208 p->slab = s; 3173 p->slab = s;
3209#endif 3174#endif
@@ -3610,10 +3575,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3610 /* Now we know that a valid freelist exists */ 3575 /* Now we know that a valid freelist exists */
3611 bitmap_zero(map, page->objects); 3576 bitmap_zero(map, page->objects);
3612 3577
3613 for_each_free_object(p, s, page->freelist) { 3578 get_map(s, page, map);
3614 set_bit(slab_index(p, s, addr), map); 3579 for_each_object(p, s, addr, page->objects) {
3615 if (!check_object(s, page, p, SLUB_RED_INACTIVE)) 3580 if (test_bit(slab_index(p, s, addr), map))
3616 return 0; 3581 if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3582 return 0;
3617 } 3583 }
3618 3584
3619 for_each_object(p, s, addr, page->objects) 3585 for_each_object(p, s, addr, page->objects)
@@ -3821,8 +3787,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
3821 void *p; 3787 void *p;
3822 3788
3823 bitmap_zero(map, page->objects); 3789 bitmap_zero(map, page->objects);
3824 for_each_free_object(p, s, page->freelist) 3790 get_map(s, page, map);
3825 set_bit(slab_index(p, s, addr), map);
3826 3791
3827 for_each_object(p, s, addr, page->objects) 3792 for_each_object(p, s, addr, page->objects)
3828 if (!test_bit(slab_index(p, s, addr), map)) 3793 if (!test_bit(slab_index(p, s, addr), map))
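
Several SLUB hunks above fold the open-coded for_each_free_object() walks into the new get_map() helper: walk the page's freelist once, set one bit per free slot, then iterate every object and treat clear bits as allocated. Here is a userspace model of that idea, with a tiny slab whose free objects chain through their first word; none of these names are the kernel's.

    #include <stdio.h>

    #define OBJECTS 8

    /* Each free object stores the pointer to the next free object, like a SLUB
     * freelist with s->offset == 0; allocated objects hold user data instead. */
    static int slab_index(void **p, void **slab) { return (int)(p - slab); }

    static void get_map(void **freelist, void **slab, unsigned long *map)
    {
        void **p;
        for (p = freelist; p; p = (void **)*p)
            *map |= 1UL << slab_index(p, slab);
    }

    int main(void)
    {
        void *slab[OBJECTS] = { 0 };
        unsigned long map = 0;
        int i;

        /* Chain objects 1 -> 4 -> 6 onto the freelist. */
        slab[1] = &slab[4];
        slab[4] = &slab[6];
        slab[6] = NULL;
        get_map(&slab[1], slab, &map);

        for (i = 0; i < OBJECTS; i++)   /* clear bit => object is in use */
            printf("object %d: %s\n", i, (map & (1UL << i)) ? "free" : "in use");
        return 0;
    }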
diff --git a/mm/swap.c b/mm/swap.c
index 5602f1a1b1e7..3a442f18b0b3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -272,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
272 memcg_reclaim_stat->recent_rotated[file]++; 272 memcg_reclaim_stat->recent_rotated[file]++;
273} 273}
274 274
275/* 275static void __activate_page(struct page *page, void *arg)
276 * FIXME: speed this up?
277 */
278void activate_page(struct page *page)
279{ 276{
280 struct zone *zone = page_zone(page); 277 struct zone *zone = page_zone(page);
281 278
282 spin_lock_irq(&zone->lru_lock);
283 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 279 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
284 int file = page_is_file_cache(page); 280 int file = page_is_file_cache(page);
285 int lru = page_lru_base_type(page); 281 int lru = page_lru_base_type(page);
@@ -292,8 +288,45 @@ void activate_page(struct page *page)
292 288
293 update_page_reclaim_stat(zone, page, file, 1); 289 update_page_reclaim_stat(zone, page, file, 1);
294 } 290 }
291}
292
293#ifdef CONFIG_SMP
294static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
295
296static void activate_page_drain(int cpu)
297{
298 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
299
300 if (pagevec_count(pvec))
301 pagevec_lru_move_fn(pvec, __activate_page, NULL);
302}
303
304void activate_page(struct page *page)
305{
306 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
307 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
308
309 page_cache_get(page);
310 if (!pagevec_add(pvec, page))
311 pagevec_lru_move_fn(pvec, __activate_page, NULL);
312 put_cpu_var(activate_page_pvecs);
313 }
314}
315
316#else
317static inline void activate_page_drain(int cpu)
318{
319}
320
321void activate_page(struct page *page)
322{
323 struct zone *zone = page_zone(page);
324
325 spin_lock_irq(&zone->lru_lock);
326 __activate_page(page, NULL);
295 spin_unlock_irq(&zone->lru_lock); 327 spin_unlock_irq(&zone->lru_lock);
296} 328}
329#endif
297 330
298/* 331/*
299 * Mark a page as having seen activity. 332 * Mark a page as having seen activity.
@@ -464,6 +497,8 @@ static void drain_cpu_pagevecs(int cpu)
464 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 497 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
465 if (pagevec_count(pvec)) 498 if (pagevec_count(pvec))
466 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 499 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
500
501 activate_page_drain(cpu);
467} 502}
468 503
469/** 504/**
@@ -476,6 +511,13 @@ static void drain_cpu_pagevecs(int cpu)
476 */ 511 */
477void deactivate_page(struct page *page) 512void deactivate_page(struct page *page)
478{ 513{
514 /*
515 * In a workload with many unevictable pages (e.g. heavy mprotect use),
516 * deactivating unevictable pages to accelerate reclaim is pointless.
517 */
518 if (PageUnevictable(page))
519 return;
520
479 if (likely(get_page_unless_zero(page))) { 521 if (likely(get_page_unless_zero(page))) {
480 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 522 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
481 523
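
On SMP, activate_page() above now stages pages in a per-CPU pagevec and only takes zone->lru_lock when the batch fills up or drain_cpu_pagevecs() runs, instead of locking once per page. The toy below shows just that batching shape in plain C; the names and the "lock" printout are illustrative, and PAGEVEC_SIZE matching the kernel's 14 is the only borrowed detail.

    #include <stdio.h>

    #define PAGEVEC_SIZE 14     /* the batch size a kernel pagevec uses */

    struct pagevec {
        int nr;
        int pages[PAGEVEC_SIZE];
    };

    static void flush(struct pagevec *pv)
    {
        int i;
        /* In the kernel this is pagevec_lru_move_fn(): take zone->lru_lock once,
         * then run __activate_page() on every buffered page. */
        printf("lock; activating %d pages:", pv->nr);
        for (i = 0; i < pv->nr; i++)
            printf(" %d", pv->pages[i]);
        printf("; unlock\n");
        pv->nr = 0;
    }

    static void activate_page(struct pagevec *pv, int page)
    {
        pv->pages[pv->nr++] = page;
        if (pv->nr == PAGEVEC_SIZE)     /* batch full: drain under the lock */
            flush(pv);
    }

    int main(void)
    {
        struct pagevec pv = { 0 };
        int i;

        for (i = 0; i < 30; i++)
            activate_page(&pv, i);
        flush(&pv);                     /* explicit drain, like activate_page_drain() */
        return 0;
    }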
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8c6b3ce38f09..d537d29e9b7b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,7 @@
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h> 33#include <linux/poll.h>
34#include <linux/oom.h>
34 35
35#include <asm/pgtable.h> 36#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
@@ -1555,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1555 struct address_space *mapping; 1556 struct address_space *mapping;
1556 struct inode *inode; 1557 struct inode *inode;
1557 char *pathname; 1558 char *pathname;
1559 int oom_score_adj;
1558 int i, type, prev; 1560 int i, type, prev;
1559 int err; 1561 int err;
1560 1562
@@ -1613,9 +1615,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1613 p->flags &= ~SWP_WRITEOK; 1615 p->flags &= ~SWP_WRITEOK;
1614 spin_unlock(&swap_lock); 1616 spin_unlock(&swap_lock);
1615 1617
1616 current->flags |= PF_OOM_ORIGIN; 1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1617 err = try_to_unuse(type); 1619 err = try_to_unuse(type);
1618 current->flags &= ~PF_OOM_ORIGIN; 1620 test_set_oom_score_adj(oom_score_adj);
1619 1621
1620 if (err) { 1622 if (err) {
1621 /* 1623 /*
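
swapoff now marks itself as the preferred OOM victim by setting oom_score_adj to OOM_SCORE_ADJ_MAX for the duration of try_to_unuse() and restoring whatever value was there before, rather than toggling the old PF_OOM_ORIGIN flag. The essential pattern is a set-and-return-old helper so a caller-chosen value survives the window; a userspace stand-in for that helper (the global here models current->signal->oom_score_adj):

    #include <stdio.h>

    #define OOM_SCORE_ADJ_MAX 1000

    static int current_oom_score_adj;   /* stand-in for the per-task field */

    /* Set a new value and hand back the old one, like test_set_oom_score_adj(). */
    static int test_set_oom_score_adj(int new_val)
    {
        int old = current_oom_score_adj;
        current_oom_score_adj = new_val;
        return old;
    }

    int main(void)
    {
        int old = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);

        printf("during unuse: %d\n", current_oom_score_adj);   /* 1000: first OOM pick */
        test_set_oom_score_adj(old);                           /* restore caller's value */
        printf("after restore: %d\n", current_oom_score_adj);
        return 0;
    }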
diff --git a/mm/truncate.c b/mm/truncate.c
index a95667529135..3a29a6180212 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
19#include <linux/task_io_accounting_ops.h> 19#include <linux/task_io_accounting_ops.h>
20#include <linux/buffer_head.h> /* grr. try_to_release_page, 20#include <linux/buffer_head.h> /* grr. try_to_release_page,
21 do_invalidatepage */ 21 do_invalidatepage */
22#include <linux/cleancache.h>
22#include "internal.h" 23#include "internal.h"
23 24
24 25
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
51static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
52{ 53{
53 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page);
54 if (page_has_private(page)) 56 if (page_has_private(page))
55 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
56} 58}
@@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
214 pgoff_t next; 216 pgoff_t next;
215 int i; 217 int i;
216 218
219 cleancache_flush_inode(mapping);
217 if (mapping->nrpages == 0) 220 if (mapping->nrpages == 0)
218 return; 221 return;
219 222
@@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
291 pagevec_release(&pvec); 294 pagevec_release(&pvec);
292 mem_cgroup_uncharge_end(); 295 mem_cgroup_uncharge_end();
293 } 296 }
297 cleancache_flush_inode(mapping);
294} 298}
295EXPORT_SYMBOL(truncate_inode_pages_range); 299EXPORT_SYMBOL(truncate_inode_pages_range);
296 300
@@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
440 int did_range_unmap = 0; 444 int did_range_unmap = 0;
441 int wrapped = 0; 445 int wrapped = 0;
442 446
447 cleancache_flush_inode(mapping);
443 pagevec_init(&pvec, 0); 448 pagevec_init(&pvec, 0);
444 next = start; 449 next = start;
445 while (next <= end && !wrapped && 450 while (next <= end && !wrapped &&
@@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
498 mem_cgroup_uncharge_end(); 503 mem_cgroup_uncharge_end();
499 cond_resched(); 504 cond_resched();
500 } 505 }
506 cleancache_flush_inode(mapping);
501 return ret; 507 return ret;
502} 508}
503EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 509EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
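
The cleancache_flush_page()/cleancache_flush_inode() calls added to the truncate and invalidate paths keep transcendent memory coherent: a page has to be dropped from cleancache before its page-cache copy disappears, otherwise a later lookup could resurrect stale data. The self-contained toy below only mimics cleancache semantics with toy_* helpers keyed by a fake inode/index pair; it is meant to show why the flush must precede the truncation, not to model the real API.

    #include <stdio.h>

    /* Toy stand-in for cleancache: one slot per (inode, index) pair. */
    #define SLOTS 8

    struct entry { int inode, index, valid; const char *data; };
    static struct entry cache_slots[SLOTS];

    static int slot(int inode, int index) { return (inode * 31 + index) % SLOTS; }

    static void toy_put(int inode, int index, const char *data)
    {
        struct entry *e = &cache_slots[slot(inode, index)];
        e->inode = inode; e->index = index; e->valid = 1; e->data = data;
    }

    static const char *toy_get(int inode, int index)
    {
        struct entry *e = &cache_slots[slot(inode, index)];
        return (e->valid && e->inode == inode && e->index == index) ? e->data : NULL;
    }

    /* The step truncate_partial_page()/truncate_inode_pages_range() now add. */
    static void toy_flush_page(int inode, int index)
    {
        struct entry *e = &cache_slots[slot(inode, index)];
        if (e->valid && e->inode == inode && e->index == index)
            e->valid = 0;
    }

    int main(void)
    {
        const char *hit;

        toy_put(1, 0, "old contents");  /* clean page evicted into the cache */
        toy_flush_page(1, 0);           /* truncate: flush before freeing the page */
        hit = toy_get(1, 0);
        printf("after truncate: %s\n", hit ? hit : "miss (correct)");
        return 0;
    }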
diff --git a/mm/util.c b/mm/util.c
index e7b103a6fd21..88ea1bd661c0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,8 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8 8
9#include "internal.h"
10
9#define CREATE_TRACE_POINTS 11#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h> 12#include <trace/events/kmem.h>
11 13
@@ -215,6 +217,28 @@ char *strndup_user(const char __user *s, long n)
215} 217}
216EXPORT_SYMBOL(strndup_user); 218EXPORT_SYMBOL(strndup_user);
217 219
220void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
221 struct vm_area_struct *prev, struct rb_node *rb_parent)
222{
223 struct vm_area_struct *next;
224
225 vma->vm_prev = prev;
226 if (prev) {
227 next = prev->vm_next;
228 prev->vm_next = vma;
229 } else {
230 mm->mmap = vma;
231 if (rb_parent)
232 next = rb_entry(rb_parent,
233 struct vm_area_struct, vm_rb);
234 else
235 next = NULL;
236 }
237 vma->vm_next = next;
238 if (next)
239 next->vm_prev = vma;
240}
241
218#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
219void arch_pick_mmap_layout(struct mm_struct *mm) 243void arch_pick_mmap_layout(struct mm_struct *mm)
220{ 244{
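
__vma_link_list() above is plain doubly linked list maintenance: hook the new VMA in after prev (or at mm->mmap when prev is NULL), find the successor either from prev or from the rbtree parent, and fix vm_prev on that successor. Here is a self-contained version of the same insertion, minus the rbtree fallback, with made-up field names:

    #include <stdio.h>

    struct vma {
        unsigned long start;
        struct vma *next, *prev;        /* stand-ins for vm_next / vm_prev */
    };

    /* Insert 'vma' after 'prev' (or at the list head when prev is NULL). */
    static void vma_link_list(struct vma **head, struct vma *vma, struct vma *prev)
    {
        struct vma *next;

        vma->prev = prev;
        if (prev) {
            next = prev->next;
            prev->next = vma;
        } else {
            next = *head;
            *head = vma;
        }
        vma->next = next;
        if (next)
            next->prev = vma;
    }

    int main(void)
    {
        struct vma a = { 0x1000 }, b = { 0x3000 }, c = { 0x2000 };
        struct vma *head = NULL, *p;

        vma_link_list(&head, &a, NULL); /* [a] */
        vma_link_list(&head, &b, &a);   /* [a, b] */
        vma_link_list(&head, &c, &a);   /* [a, c, b] */
        for (p = head; p; p = p->next)
            printf("vma at 0x%lx\n", p->start);
        return 0;
    }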
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fdf4b1e88e53..1d34d75366a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -375,7 +375,7 @@ nocache:
375 /* find starting point for our search */ 375 /* find starting point for our search */
376 if (free_vmap_cache) { 376 if (free_vmap_cache) {
377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); 377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
378 addr = ALIGN(first->va_end + PAGE_SIZE, align); 378 addr = ALIGN(first->va_end, align);
379 if (addr < vstart) 379 if (addr < vstart)
380 goto nocache; 380 goto nocache;
381 if (addr + size - 1 < addr) 381 if (addr + size - 1 < addr)
@@ -406,10 +406,10 @@ nocache:
406 } 406 }
407 407
408 /* from the starting point, walk areas until a suitable hole is found */ 408 /* from the starting point, walk areas until a suitable hole is found */
409 while (addr + size >= first->va_start && addr + size <= vend) { 409 while (addr + size > first->va_start && addr + size <= vend) {
410 if (addr + cached_hole_size < first->va_start) 410 if (addr + cached_hole_size < first->va_start)
411 cached_hole_size = first->va_start - addr; 411 cached_hole_size = first->va_start - addr;
412 addr = ALIGN(first->va_end + PAGE_SIZE, align); 412 addr = ALIGN(first->va_end, align);
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
@@ -1534,6 +1534,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1534static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1534static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1535 pgprot_t prot, int node, void *caller) 1535 pgprot_t prot, int node, void *caller)
1536{ 1536{
1537 const int order = 0;
1537 struct page **pages; 1538 struct page **pages;
1538 unsigned int nr_pages, array_size, i; 1539 unsigned int nr_pages, array_size, i;
1539 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1540 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1560,11 +1561,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1560 1561
1561 for (i = 0; i < area->nr_pages; i++) { 1562 for (i = 0; i < area->nr_pages; i++) {
1562 struct page *page; 1563 struct page *page;
1564 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1563 1565
1564 if (node < 0) 1566 if (node < 0)
1565 page = alloc_page(gfp_mask); 1567 page = alloc_page(tmp_mask);
1566 else 1568 else
1567 page = alloc_pages_node(node, gfp_mask, 0); 1569 page = alloc_pages_node(node, tmp_mask, order);
1568 1570
1569 if (unlikely(!page)) { 1571 if (unlikely(!page)) {
1570 /* Successfully allocated i pages, free them in __vunmap() */ 1572 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1579,6 +1581,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1579 return area->addr; 1581 return area->addr;
1580 1582
1581fail: 1583fail:
1584 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
1585 "allocated %ld of %ld bytes\n",
1586 (area->nr_pages*PAGE_SIZE), area->size);
1582 vfree(area->addr); 1587 vfree(area->addr);
1583 return NULL; 1588 return NULL;
1584} 1589}
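
The alloc_vmap_area() change stops adding an extra PAGE_SIZE before aligning the next candidate address; as far as I can tell each vmalloc area already accounts for its own guard page, so the extra page only widened the gap and could make an exactly fitting hole look too small (hence the >= becoming >). The failure path also gains a warn_alloc_failed() message. A quick arithmetic check of the alignment change, with made-up addresses:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        /* Previous vmap area ends here; its size already covers its guard page. */
        unsigned long va_end = 0x12000UL;
        unsigned long align  = PAGE_SIZE;

        unsigned long old_start = ALIGN(va_end + PAGE_SIZE, align);  /* old code  */
        unsigned long new_start = ALIGN(va_end, align);              /* this diff */

        printf("old candidate start: %#lx\n", old_start);   /* 0x13000 */
        printf("new candidate start: %#lx\n", new_start);   /* 0x12000 */
        printf("pages saved per area: %lu\n", (old_start - new_start) / PAGE_SIZE);
        return 0;
    }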
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8bfd45050a61..faa0a088f9cc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,6 +42,7 @@
42#include <linux/delayacct.h> 42#include <linux/delayacct.h>
43#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h> 44#include <linux/oom.h>
45#include <linux/prefetch.h>
45 46
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
47#include <asm/div64.h> 48#include <asm/div64.h>
@@ -172,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
172 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
173{ 174{
174 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
175 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
176 177
177 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
178} 179}
@@ -201,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
201} 202}
202EXPORT_SYMBOL(unregister_shrinker); 203EXPORT_SYMBOL(unregister_shrinker);
203 204
205static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc,
207 unsigned long nr_to_scan)
208{
209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc);
211}
212
204#define SHRINK_BATCH 128 213#define SHRINK_BATCH 128
205/* 214/*
206 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
@@ -221,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
221 * 230 *
222 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
223 */ 232 */
224unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 233unsigned long shrink_slab(struct shrink_control *shrink,
225 unsigned long lru_pages) 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages)
226{ 236{
227 struct shrinker *shrinker; 237 struct shrinker *shrinker;
228 unsigned long ret = 0; 238 unsigned long ret = 0;
229 239
230 if (scanned == 0) 240 if (nr_pages_scanned == 0)
231 scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
232 242
233 if (!down_read_trylock(&shrinker_rwsem)) 243 if (!down_read_trylock(&shrinker_rwsem)) {
234 return 1; /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1;
246 goto out;
247 }
235 248
236 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
237 unsigned long long delta; 250 unsigned long long delta;
238 unsigned long total_scan; 251 unsigned long total_scan;
239 unsigned long max_pass; 252 unsigned long max_pass;
240 253
241 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); 254 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
242 delta = (4 * scanned) / shrinker->seeks; 255 delta = (4 * nr_pages_scanned) / shrinker->seeks;
243 delta *= max_pass; 256 delta *= max_pass;
244 do_div(delta, lru_pages + 1); 257 do_div(delta, lru_pages + 1);
245 shrinker->nr += delta; 258 shrinker->nr += delta;
@@ -266,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
266 int shrink_ret; 279 int shrink_ret;
267 int nr_before; 280 int nr_before;
268 281
269 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); 282 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
270 shrink_ret = (*shrinker->shrink)(shrinker, this_scan, 283 shrink_ret = do_shrinker_shrink(shrinker, shrink,
271 gfp_mask); 284 this_scan);
272 if (shrink_ret == -1) 285 if (shrink_ret == -1)
273 break; 286 break;
274 if (shrink_ret < nr_before) 287 if (shrink_ret < nr_before)
@@ -282,6 +295,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
282 shrinker->nr += total_scan; 295 shrinker->nr += total_scan;
283 } 296 }
284 up_read(&shrinker_rwsem); 297 up_read(&shrinker_rwsem);
298out:
299 cond_resched();
285 return ret; 300 return ret;
286} 301}
287 302
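
With the hunk above, shrinkers are invoked through do_shrinker_shrink(), which packs nr_to_scan into a struct shrink_control instead of passing it positionally alongside the gfp mask. A shrinker written against this API then looks roughly like the sketch below; my_count_objects() and my_free_objects() are hypothetical placeholders for a real cache's counting and reclaim routines.

    /* Sketch of a shrinker against the shrink_control API introduced above. */
    static int my_shrink(struct shrinker *sh, struct shrink_control *sc)
    {
        if (sc->nr_to_scan)                     /* 0 means "just report the count" */
            my_free_objects(sc->nr_to_scan);    /* honour sc->gfp_mask if sleeping */
        return my_count_objects();              /* remaining cache size, or -1 to abort */
    }

    static struct shrinker my_shrinker = {
        .shrink = my_shrink,
        .seeks  = DEFAULT_SEEKS,
    };
    /* register_shrinker(&my_shrinker) at init, unregister_shrinker() on teardown. */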
@@ -1201,13 +1216,16 @@ int isolate_lru_page(struct page *page)
1201{ 1216{
1202 int ret = -EBUSY; 1217 int ret = -EBUSY;
1203 1218
1219 VM_BUG_ON(!page_count(page));
1220
1204 if (PageLRU(page)) { 1221 if (PageLRU(page)) {
1205 struct zone *zone = page_zone(page); 1222 struct zone *zone = page_zone(page);
1206 1223
1207 spin_lock_irq(&zone->lru_lock); 1224 spin_lock_irq(&zone->lru_lock);
1208 if (PageLRU(page) && get_page_unless_zero(page)) { 1225 if (PageLRU(page)) {
1209 int lru = page_lru(page); 1226 int lru = page_lru(page);
1210 ret = 0; 1227 ret = 0;
1228 get_page(page);
1211 ClearPageLRU(page); 1229 ClearPageLRU(page);
1212 1230
1213 del_page_from_lru_list(zone, page, lru); 1231 del_page_from_lru_list(zone, page, lru);
@@ -1700,26 +1718,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1700} 1718}
1701 1719
1702/* 1720/*
1703 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1704 * until we collected @swap_cluster_max pages to scan.
1705 */
1706static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1707 unsigned long *nr_saved_scan)
1708{
1709 unsigned long nr;
1710
1711 *nr_saved_scan += nr_to_scan;
1712 nr = *nr_saved_scan;
1713
1714 if (nr >= SWAP_CLUSTER_MAX)
1715 *nr_saved_scan = 0;
1716 else
1717 nr = 0;
1718
1719 return nr;
1720}
1721
1722/*
1723 * Determine how aggressively the anon and file LRU lists should be 1721 * Determine how aggressively the anon and file LRU lists should be
1724 * scanned. The relative value of each set of LRU lists is determined 1722 * scanned. The relative value of each set of LRU lists is determined
1725 * by looking at the fraction of the pages scanned we did rotate back 1723 * by looking at the fraction of the pages scanned we did rotate back
@@ -1737,6 +1735,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1737 u64 fraction[2], denominator; 1735 u64 fraction[2], denominator;
1738 enum lru_list l; 1736 enum lru_list l;
1739 int noswap = 0; 1737 int noswap = 0;
1738 int force_scan = 0;
1739
1740
1741 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1742 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1743 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1744 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1745
1746 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1747 /* kswapd does zone balancing and needs to scan this zone */
1748 if (scanning_global_lru(sc) && current_is_kswapd())
1749 force_scan = 1;
1750 /* memcg may have a small limit and needs to avoid a priority drop */
1751 if (!scanning_global_lru(sc))
1752 force_scan = 1;
1753 }
1740 1754
1741 /* If we have no swap space, do not bother scanning anon pages. */ 1755 /* If we have no swap space, do not bother scanning anon pages. */
1742 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1756 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1747,11 +1761,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1747 goto out; 1761 goto out;
1748 } 1762 }
1749 1763
1750 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1751 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1752 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1753 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1754
1755 if (scanning_global_lru(sc)) { 1764 if (scanning_global_lru(sc)) {
1756 free = zone_page_state(zone, NR_FREE_PAGES); 1765 free = zone_page_state(zone, NR_FREE_PAGES);
1757 /* If we have very few page cache pages, 1766 /* If we have very few page cache pages,
@@ -1818,8 +1827,23 @@ out:
1818 scan >>= priority; 1827 scan >>= priority;
1819 scan = div64_u64(scan * fraction[file], denominator); 1828 scan = div64_u64(scan * fraction[file], denominator);
1820 } 1829 }
1821 nr[l] = nr_scan_try_batch(scan, 1830
1822 &reclaim_stat->nr_saved_scan[l]); 1831 /*
1832 * If the zone or the memcg is small, nr[l] can end up 0.
1833 * That means no scanning at this priority and a priority drop.
1834 * For global direct reclaim this is harmless, since the next
1835 * zone can be visited. Global kswapd, however, does zone
1836 * balancing and needs to scan a small amount. With memcg, a
1837 * priority drop can cause large latency, so it is better to
1838 * scan a small amount. See force_scan above.
1839 */
1840 if (!scan && force_scan) {
1841 if (file)
1842 scan = SWAP_CLUSTER_MAX;
1843 else if (!noswap)
1844 scan = SWAP_CLUSTER_MAX;
1845 }
1846 nr[l] = scan;
1823 } 1847 }
1824} 1848}
1825 1849
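The get_scan_count() hunks above add a force_scan path so that tiny zones and small memcgs still make progress instead of repeatedly dropping priority. Distilled into a standalone predicate (illustrative only; the real logic stays inline in get_scan_count()):

	static bool want_force_scan(unsigned long anon, unsigned long file,
				    int priority, bool global_lru, bool is_kswapd)
	{
		if (((anon + file) >> priority) >= SWAP_CLUSTER_MAX)
			return false;	/* enough pages; normal scaling works */
		if (global_lru && is_kswapd)
			return true;	/* kswapd must still balance this zone */
		if (!global_lru)
			return true;	/* small memcg: avoid a priority drop */
		return false;
	}

When the predicate holds and the scaled scan target comes out as 0, nr[l] is bumped to SWAP_CLUSTER_MAX, except for the anon lists when swap is unavailable.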
@@ -1959,11 +1983,14 @@ restart:
1959 * If a zone is deemed to be full of pinned pages then just give it a light 1983 * If a zone is deemed to be full of pinned pages then just give it a light
1960 * scan then give up on it. 1984 * scan then give up on it.
1961 */ 1985 */
1962static void shrink_zones(int priority, struct zonelist *zonelist, 1986static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1963 struct scan_control *sc) 1987 struct scan_control *sc)
1964{ 1988{
1965 struct zoneref *z; 1989 struct zoneref *z;
1966 struct zone *zone; 1990 struct zone *zone;
1991 unsigned long nr_soft_reclaimed;
1992 unsigned long nr_soft_scanned;
1993 unsigned long total_scanned = 0;
1967 1994
1968 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1995 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1969 gfp_zone(sc->gfp_mask), sc->nodemask) { 1996 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1980,8 +2007,17 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1980 continue; /* Let kswapd poll it */ 2007 continue; /* Let kswapd poll it */
1981 } 2008 }
1982 2009
2010 nr_soft_scanned = 0;
2011 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2012 sc->order, sc->gfp_mask,
2013 &nr_soft_scanned);
2014 sc->nr_reclaimed += nr_soft_reclaimed;
2015 total_scanned += nr_soft_scanned;
2016
1983 shrink_zone(priority, zone, sc); 2017 shrink_zone(priority, zone, sc);
1984 } 2018 }
2019
2020 return total_scanned;
1985} 2021}
1986 2022
1987static bool zone_reclaimable(struct zone *zone) 2023static bool zone_reclaimable(struct zone *zone)
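shrink_zones() now performs memcg soft-limit reclaim for every eligible zone and reports how many pages that scanned, so the caller can fold the number into its writeback-throttling arithmetic. The out-parameter form of the memcg hook implied here is presumably declared in include/linux/memcontrol.h (reconstruction, not shown in this diff):

	unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
						    gfp_t gfp_mask,
						    unsigned long *total_scanned);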
@@ -2026,7 +2062,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
2026 * else, the number of pages reclaimed 2062 * else, the number of pages reclaimed
2027 */ 2063 */
2028static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2064static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2029 struct scan_control *sc) 2065 struct scan_control *sc,
2066 struct shrink_control *shrink)
2030{ 2067{
2031 int priority; 2068 int priority;
2032 unsigned long total_scanned = 0; 2069 unsigned long total_scanned = 0;
@@ -2045,7 +2082,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2045 sc->nr_scanned = 0; 2082 sc->nr_scanned = 0;
2046 if (!priority) 2083 if (!priority)
2047 disable_swap_token(); 2084 disable_swap_token();
2048 shrink_zones(priority, zonelist, sc); 2085 total_scanned += shrink_zones(priority, zonelist, sc);
2049 /* 2086 /*
2050 * Don't shrink slabs when reclaiming memory from 2087 * Don't shrink slabs when reclaiming memory from
2051 * over limit cgroups 2088 * over limit cgroups
@@ -2060,7 +2097,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2060 lru_pages += zone_reclaimable_pages(zone); 2097 lru_pages += zone_reclaimable_pages(zone);
2061 } 2098 }
2062 2099
2063 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 2100 shrink_slab(shrink, sc->nr_scanned, lru_pages);
2064 if (reclaim_state) { 2101 if (reclaim_state) {
2065 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2102 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2066 reclaim_state->reclaimed_slab = 0; 2103 reclaim_state->reclaimed_slab = 0;
@@ -2132,12 +2169,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2132 .mem_cgroup = NULL, 2169 .mem_cgroup = NULL,
2133 .nodemask = nodemask, 2170 .nodemask = nodemask,
2134 }; 2171 };
2172 struct shrink_control shrink = {
2173 .gfp_mask = sc.gfp_mask,
2174 };
2135 2175
2136 trace_mm_vmscan_direct_reclaim_begin(order, 2176 trace_mm_vmscan_direct_reclaim_begin(order,
2137 sc.may_writepage, 2177 sc.may_writepage,
2138 gfp_mask); 2178 gfp_mask);
2139 2179
2140 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2180 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2141 2181
2142 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2182 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
2143 2183
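try_to_free_pages() now builds a struct shrink_control alongside the scan_control and hands it to do_try_to_free_pages(), which passes it on to shrink_slab(). Under the shrinker prototype assumed in the sketch earlier (shrink(struct shrinker *, struct shrink_control *)), a client shrinker might look roughly like this (hypothetical example, not part of this diff):

	static int dummy_cache_objects;		/* stand-in for a real cache */

	static int dummy_shrink(struct shrinker *s, struct shrink_control *sc)
	{
		if (!sc->nr_to_scan)
			return dummy_cache_objects;	/* query pass */
		/* free up to sc->nr_to_scan objects, honouring sc->gfp_mask */
		dummy_cache_objects -= min_t(int, dummy_cache_objects,
					     sc->nr_to_scan);
		return dummy_cache_objects;
	}

	static struct shrinker dummy_shrinker = {
		.shrink = dummy_shrink,
		.seeks  = DEFAULT_SEEKS,
	};
	/* register_shrinker(&dummy_shrinker) at init,
	 * unregister_shrinker(&dummy_shrinker) at teardown */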
@@ -2149,9 +2189,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2149unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2189unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2150 gfp_t gfp_mask, bool noswap, 2190 gfp_t gfp_mask, bool noswap,
2151 unsigned int swappiness, 2191 unsigned int swappiness,
2152 struct zone *zone) 2192 struct zone *zone,
2193 unsigned long *nr_scanned)
2153{ 2194{
2154 struct scan_control sc = { 2195 struct scan_control sc = {
2196 .nr_scanned = 0,
2155 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2197 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2156 .may_writepage = !laptop_mode, 2198 .may_writepage = !laptop_mode,
2157 .may_unmap = 1, 2199 .may_unmap = 1,
@@ -2160,6 +2202,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2160 .order = 0, 2202 .order = 0,
2161 .mem_cgroup = mem, 2203 .mem_cgroup = mem,
2162 }; 2204 };
2205
2163 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2206 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2164 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2207 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2165 2208
@@ -2178,6 +2221,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2178 2221
2179 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2222 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2180 2223
2224 *nr_scanned = sc.nr_scanned;
2181 return sc.nr_reclaimed; 2225 return sc.nr_reclaimed;
2182} 2226}
2183 2227
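mem_cgroup_shrink_node_zone() gains an nr_scanned out-parameter so the memcg soft-limit loop can account scanned pages as well as reclaimed ones. A hypothetical wrapper showing how a caller in mm/memcontrol.c (changed elsewhere in this diff) could propagate the count:

	static unsigned long soft_reclaim_one_zone(struct mem_cgroup *victim,
						   struct zone *zone,
						   gfp_t gfp_mask, bool noswap,
						   unsigned long *total_scanned)
	{
		unsigned long scanned = 0;
		unsigned long reclaimed;

		reclaimed = mem_cgroup_shrink_node_zone(victim, gfp_mask, noswap,
							vm_swappiness, zone,
							&scanned);
		*total_scanned += scanned;
		return reclaimed;
	}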
@@ -2188,6 +2232,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2188{ 2232{
2189 struct zonelist *zonelist; 2233 struct zonelist *zonelist;
2190 unsigned long nr_reclaimed; 2234 unsigned long nr_reclaimed;
2235 int nid;
2191 struct scan_control sc = { 2236 struct scan_control sc = {
2192 .may_writepage = !laptop_mode, 2237 .may_writepage = !laptop_mode,
2193 .may_unmap = 1, 2238 .may_unmap = 1,
@@ -2197,17 +2242,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2197 .order = 0, 2242 .order = 0,
2198 .mem_cgroup = mem_cont, 2243 .mem_cgroup = mem_cont,
2199 .nodemask = NULL, /* we don't care the placement */ 2244 .nodemask = NULL, /* we don't care the placement */
2245 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2246 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2247 };
2248 struct shrink_control shrink = {
2249 .gfp_mask = sc.gfp_mask,
2200 }; 2250 };
2201 2251
2202 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2252 /*
2203 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2253 * Unlike direct reclaim via alloc_pages(), memcg reclaim does not
2204 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2254 * care which node the pages come from, so the node where the scan
2255 * starts does not need to be the current node.
2256 */
2257 nid = mem_cgroup_select_victim_node(mem_cont);
2258
2259 zonelist = NODE_DATA(nid)->node_zonelists;
2205 2260
2206 trace_mm_vmscan_memcg_reclaim_begin(0, 2261 trace_mm_vmscan_memcg_reclaim_begin(0,
2207 sc.may_writepage, 2262 sc.may_writepage,
2208 sc.gfp_mask); 2263 sc.gfp_mask);
2209 2264
2210 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2265 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2211 2266
2212 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2267 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2213 2268
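try_to_free_mem_cgroup_pages() no longer anchors the scan on the current node; mem_cgroup_select_victim_node() (implemented in mm/memcontrol.c, changed elsewhere in this diff) picks the starting node instead. The idea is a simple round-robin over nodes with memory, along these lines (hypothetical sketch, not the actual implementation):

	static int pick_victim_node(int last_scanned_node)
	{
		int nid = next_node(last_scanned_node, node_states[N_HIGH_MEMORY]);

		if (nid == MAX_NUMNODES)		/* wrap around */
			nid = first_node(node_states[N_HIGH_MEMORY]);
		return nid;
	}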
@@ -2286,7 +2341,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2286 * must be balanced 2341 * must be balanced
2287 */ 2342 */
2288 if (order) 2343 if (order)
2289 return pgdat_balanced(pgdat, balanced, classzone_idx); 2344 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2290 else 2345 else
2291 return !all_zones_ok; 2346 return !all_zones_ok;
2292} 2347}
@@ -2322,6 +2377,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2322 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2377 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2323 unsigned long total_scanned; 2378 unsigned long total_scanned;
2324 struct reclaim_state *reclaim_state = current->reclaim_state; 2379 struct reclaim_state *reclaim_state = current->reclaim_state;
2380 unsigned long nr_soft_reclaimed;
2381 unsigned long nr_soft_scanned;
2325 struct scan_control sc = { 2382 struct scan_control sc = {
2326 .gfp_mask = GFP_KERNEL, 2383 .gfp_mask = GFP_KERNEL,
2327 .may_unmap = 1, 2384 .may_unmap = 1,
@@ -2335,6 +2392,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2335 .order = order, 2392 .order = order,
2336 .mem_cgroup = NULL, 2393 .mem_cgroup = NULL,
2337 }; 2394 };
2395 struct shrink_control shrink = {
2396 .gfp_mask = sc.gfp_mask,
2397 };
2338loop_again: 2398loop_again:
2339 total_scanned = 0; 2399 total_scanned = 0;
2340 sc.nr_reclaimed = 0; 2400 sc.nr_reclaimed = 0;
@@ -2411,11 +2471,15 @@ loop_again:
2411 2471
2412 sc.nr_scanned = 0; 2472 sc.nr_scanned = 0;
2413 2473
2474 nr_soft_scanned = 0;
2414 /* 2475 /*
2415 * Call soft limit reclaim before calling shrink_zone. 2476 * Call soft limit reclaim before calling shrink_zone.
2416 * For now we ignore the return value
2417 */ 2477 */
2418 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2478 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2479 order, sc.gfp_mask,
2480 &nr_soft_scanned);
2481 sc.nr_reclaimed += nr_soft_reclaimed;
2482 total_scanned += nr_soft_scanned;
2419 2483
2420 /* 2484 /*
2421 * We put equal pressure on every zone, unless 2485 * We put equal pressure on every zone, unless
@@ -2434,8 +2498,7 @@ loop_again:
2434 end_zone, 0)) 2498 end_zone, 0))
2435 shrink_zone(priority, zone, &sc); 2499 shrink_zone(priority, zone, &sc);
2436 reclaim_state->reclaimed_slab = 0; 2500 reclaim_state->reclaimed_slab = 0;
2437 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2501 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2438 lru_pages);
2439 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2502 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2440 total_scanned += sc.nr_scanned; 2503 total_scanned += sc.nr_scanned;
2441 2504
@@ -2787,7 +2850,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2787 .swappiness = vm_swappiness, 2850 .swappiness = vm_swappiness,
2788 .order = 0, 2851 .order = 0,
2789 }; 2852 };
2790 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2853 struct shrink_control shrink = {
2854 .gfp_mask = sc.gfp_mask,
2855 };
2856 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2791 struct task_struct *p = current; 2857 struct task_struct *p = current;
2792 unsigned long nr_reclaimed; 2858 unsigned long nr_reclaimed;
2793 2859
@@ -2796,7 +2862,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2796 reclaim_state.reclaimed_slab = 0; 2862 reclaim_state.reclaimed_slab = 0;
2797 p->reclaim_state = &reclaim_state; 2863 p->reclaim_state = &reclaim_state;
2798 2864
2799 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2865 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2800 2866
2801 p->reclaim_state = NULL; 2867 p->reclaim_state = NULL;
2802 lockdep_clear_current_reclaim_state(); 2868 lockdep_clear_current_reclaim_state();
@@ -2971,6 +3037,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2971 .swappiness = vm_swappiness, 3037 .swappiness = vm_swappiness,
2972 .order = order, 3038 .order = order,
2973 }; 3039 };
3040 struct shrink_control shrink = {
3041 .gfp_mask = sc.gfp_mask,
3042 };
2974 unsigned long nr_slab_pages0, nr_slab_pages1; 3043 unsigned long nr_slab_pages0, nr_slab_pages1;
2975 3044
2976 cond_resched(); 3045 cond_resched();
@@ -3012,7 +3081,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3012 unsigned long lru_pages = zone_reclaimable_pages(zone); 3081 unsigned long lru_pages = zone_reclaimable_pages(zone);
3013 3082
3014 /* No reclaimable slab or very low memory pressure */ 3083 /* No reclaimable slab or very low memory pressure */
3015 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) 3084 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
3016 break; 3085 break;
3017 3086
3018 /* Freed enough memory */ 3087 /* Freed enough memory */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 897ea9e88238..20c18b7694b2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -157,7 +157,7 @@ int calculate_normal_threshold(struct zone *zone)
157/* 157/*
158 * Refresh the thresholds for each zone. 158 * Refresh the thresholds for each zone.
159 */ 159 */
160static void refresh_zone_stat_thresholds(void) 160void refresh_zone_stat_thresholds(void)
161{ 161{
162 struct zone *zone; 162 struct zone *zone;
163 int cpu; 163 int cpu;
@@ -659,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else
666#define TEXT_FOR_DMA(xx)
667#endif
668
669#ifdef CONFIG_ZONE_DMA32
670#define TEXT_FOR_DMA32(xx) xx "_dma32",
671#else
672#define TEXT_FOR_DMA32(xx)
673#endif
674
675#ifdef CONFIG_HIGHMEM
676#define TEXT_FOR_HIGHMEM(xx) xx "_high",
677#else
678#define TEXT_FOR_HIGHMEM(xx)
679#endif
680
681#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
682 TEXT_FOR_HIGHMEM(xx) xx "_movable",
683
684const char * const vmstat_text[] = {
685 /* Zoned VM counters */
686 "nr_free_pages",
687 "nr_inactive_anon",
688 "nr_active_anon",
689 "nr_inactive_file",
690 "nr_active_file",
691 "nr_unevictable",
692 "nr_mlock",
693 "nr_anon_pages",
694 "nr_mapped",
695 "nr_file_pages",
696 "nr_dirty",
697 "nr_writeback",
698 "nr_slab_reclaimable",
699 "nr_slab_unreclaimable",
700 "nr_page_table_pages",
701 "nr_kernel_stack",
702 "nr_unstable",
703 "nr_bounce",
704 "nr_vmscan_write",
705 "nr_writeback_temp",
706 "nr_isolated_anon",
707 "nr_isolated_file",
708 "nr_shmem",
709 "nr_dirtied",
710 "nr_written",
711
712#ifdef CONFIG_NUMA
713 "numa_hit",
714 "numa_miss",
715 "numa_foreign",
716 "numa_interleave",
717 "numa_local",
718 "numa_other",
719#endif
720 "nr_anon_transparent_hugepages",
721 "nr_dirty_threshold",
722 "nr_dirty_background_threshold",
723
724#ifdef CONFIG_VM_EVENT_COUNTERS
725 "pgpgin",
726 "pgpgout",
727 "pswpin",
728 "pswpout",
729
730 TEXTS_FOR_ZONES("pgalloc")
731
732 "pgfree",
733 "pgactivate",
734 "pgdeactivate",
735
736 "pgfault",
737 "pgmajfault",
738
739 TEXTS_FOR_ZONES("pgrefill")
740 TEXTS_FOR_ZONES("pgsteal")
741 TEXTS_FOR_ZONES("pgscan_kswapd")
742 TEXTS_FOR_ZONES("pgscan_direct")
743
744#ifdef CONFIG_NUMA
745 "zone_reclaim_failed",
746#endif
747 "pginodesteal",
748 "slabs_scanned",
749 "kswapd_steal",
750 "kswapd_inodesteal",
751 "kswapd_low_wmark_hit_quickly",
752 "kswapd_high_wmark_hit_quickly",
753 "kswapd_skip_congestion_wait",
754 "pageoutrun",
755 "allocstall",
756
757 "pgrotated",
758
759#ifdef CONFIG_COMPACTION
760 "compact_blocks_moved",
761 "compact_pages_moved",
762 "compact_pagemigrate_failed",
763 "compact_stall",
764 "compact_fail",
765 "compact_success",
766#endif
767
768#ifdef CONFIG_HUGETLB_PAGE
769 "htlb_buddy_alloc_success",
770 "htlb_buddy_alloc_fail",
771#endif
772 "unevictable_pgs_culled",
773 "unevictable_pgs_scanned",
774 "unevictable_pgs_rescued",
775 "unevictable_pgs_mlocked",
776 "unevictable_pgs_munlocked",
777 "unevictable_pgs_cleared",
778 "unevictable_pgs_stranded",
779 "unevictable_pgs_mlockfreed",
780
781#ifdef CONFIG_TRANSPARENT_HUGEPAGE
782 "thp_fault_alloc",
783 "thp_fault_fallback",
784 "thp_collapse_alloc",
785 "thp_collapse_alloc_failed",
786 "thp_split",
787#endif
788
789#endif /* CONFIG_VM_EVENT_COUNTERS */
790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
792
793
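The counter-name table vmstat_text[] moves ahead of the CONFIG_PROC_FS-only code and loses its static qualifier, so it is now built whenever either /proc or sysfs support is configured and can be referenced from other files. The companion declaration presumably lives in include/linux/vmstat.h (assumed, not shown in this diff):

	#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
	extern const char * const vmstat_text[];
	#endif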
662#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS
663static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 795static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
664 struct zone *zone) 796 struct zone *zone)
@@ -831,135 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
831 .release = seq_release, 963 .release = seq_release,
832}; 964};
833 965
834#ifdef CONFIG_ZONE_DMA
835#define TEXT_FOR_DMA(xx) xx "_dma",
836#else
837#define TEXT_FOR_DMA(xx)
838#endif
839
840#ifdef CONFIG_ZONE_DMA32
841#define TEXT_FOR_DMA32(xx) xx "_dma32",
842#else
843#define TEXT_FOR_DMA32(xx)
844#endif
845
846#ifdef CONFIG_HIGHMEM
847#define TEXT_FOR_HIGHMEM(xx) xx "_high",
848#else
849#define TEXT_FOR_HIGHMEM(xx)
850#endif
851
852#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
853 TEXT_FOR_HIGHMEM(xx) xx "_movable",
854
855static const char * const vmstat_text[] = {
856 /* Zoned VM counters */
857 "nr_free_pages",
858 "nr_inactive_anon",
859 "nr_active_anon",
860 "nr_inactive_file",
861 "nr_active_file",
862 "nr_unevictable",
863 "nr_mlock",
864 "nr_anon_pages",
865 "nr_mapped",
866 "nr_file_pages",
867 "nr_dirty",
868 "nr_writeback",
869 "nr_slab_reclaimable",
870 "nr_slab_unreclaimable",
871 "nr_page_table_pages",
872 "nr_kernel_stack",
873 "nr_unstable",
874 "nr_bounce",
875 "nr_vmscan_write",
876 "nr_writeback_temp",
877 "nr_isolated_anon",
878 "nr_isolated_file",
879 "nr_shmem",
880 "nr_dirtied",
881 "nr_written",
882
883#ifdef CONFIG_NUMA
884 "numa_hit",
885 "numa_miss",
886 "numa_foreign",
887 "numa_interleave",
888 "numa_local",
889 "numa_other",
890#endif
891 "nr_anon_transparent_hugepages",
892 "nr_dirty_threshold",
893 "nr_dirty_background_threshold",
894
895#ifdef CONFIG_VM_EVENT_COUNTERS
896 "pgpgin",
897 "pgpgout",
898 "pswpin",
899 "pswpout",
900
901 TEXTS_FOR_ZONES("pgalloc")
902
903 "pgfree",
904 "pgactivate",
905 "pgdeactivate",
906
907 "pgfault",
908 "pgmajfault",
909
910 TEXTS_FOR_ZONES("pgrefill")
911 TEXTS_FOR_ZONES("pgsteal")
912 TEXTS_FOR_ZONES("pgscan_kswapd")
913 TEXTS_FOR_ZONES("pgscan_direct")
914
915#ifdef CONFIG_NUMA
916 "zone_reclaim_failed",
917#endif
918 "pginodesteal",
919 "slabs_scanned",
920 "kswapd_steal",
921 "kswapd_inodesteal",
922 "kswapd_low_wmark_hit_quickly",
923 "kswapd_high_wmark_hit_quickly",
924 "kswapd_skip_congestion_wait",
925 "pageoutrun",
926 "allocstall",
927
928 "pgrotated",
929
930#ifdef CONFIG_COMPACTION
931 "compact_blocks_moved",
932 "compact_pages_moved",
933 "compact_pagemigrate_failed",
934 "compact_stall",
935 "compact_fail",
936 "compact_success",
937#endif
938
939#ifdef CONFIG_HUGETLB_PAGE
940 "htlb_buddy_alloc_success",
941 "htlb_buddy_alloc_fail",
942#endif
943 "unevictable_pgs_culled",
944 "unevictable_pgs_scanned",
945 "unevictable_pgs_rescued",
946 "unevictable_pgs_mlocked",
947 "unevictable_pgs_munlocked",
948 "unevictable_pgs_cleared",
949 "unevictable_pgs_stranded",
950 "unevictable_pgs_mlockfreed",
951
952#ifdef CONFIG_TRANSPARENT_HUGEPAGE
953 "thp_fault_alloc",
954 "thp_fault_fallback",
955 "thp_collapse_alloc",
956 "thp_collapse_alloc_failed",
957 "thp_split",
958#endif
959
960#endif /* CONFIG_VM_EVENTS_COUNTERS */
961};
962
963static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 966static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
964 struct zone *zone) 967 struct zone *zone)
965{ 968{
@@ -1198,7 +1201,6 @@ static int __init setup_vmstat(void)
1198#ifdef CONFIG_SMP 1201#ifdef CONFIG_SMP
1199 int cpu; 1202 int cpu;
1200 1203
1201 refresh_zone_stat_thresholds();
1202 register_cpu_notifier(&vmstat_notifier); 1204 register_cpu_notifier(&vmstat_notifier);
1203 1205
1204 for_each_online_cpu(cpu) 1206 for_each_online_cpu(cpu)
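refresh_zone_stat_thresholds() is likewise made non-static in the earlier vmstat.c hunk, and its explicit call is dropped from setup_vmstat() here, the expectation being that it is now invoked from zone-initialisation code outside vmstat.c (mm/page_alloc.c is touched elsewhere in this diff). The matching declaration would be something like (assumed, not shown here):

	extern void refresh_zone_stat_thresholds(void);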